Merge remote-tracking branch 'gerrit/master' into f/centos76

Change-Id: I1c7d7adf054471ef8a00bdc9ce9193dba83025a6
Signed-off-by: Saul Wold <sgw@linux.intel.com>
Saul Wold 2019-02-12 08:02:31 -08:00
commit ba9900eef8
53 changed files with 6984 additions and 385 deletions


@ -1,2 +1,2 @@
SRC_DIR="files"
TIS_PATCH_VER=0
TIS_PATCH_VER=1


@ -140,8 +140,8 @@ static-file.exclude-extensions = ( ".php", ".pl", ".fcgi" )
######### Options that are good to be but not necessary to be changed #######
## bind to port (default: 80)
#server.port = 81
## bind to port 8080
server.port = 8080
## bind to localhost (default: all interfaces)
#server.bind = "grisu.home.kneschke.de"
@ -220,7 +220,7 @@ $HTTP["url"] !~ "^/(rel-[^/]*|feed|updates|static)/" {
( "localhost" =>
(
"host" => "127.0.0.1",
"port" => 8080
"port" => 8008
)
)
)
@ -244,7 +244,7 @@ $HTTP["url"] !~ "^/(rel-[^/]*|feed|updates|static)/" {
#
#### Listen to IPv6
$SERVER["socket"] == "[::]:80" { }
$SERVER["socket"] == "[::]:8080" { }
#### status module
#status.status-url = "/server-status"
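
Taken together, the hunks above move lighttpd to listen on port 8080 for both IPv4 and IPv6, while the reverse-proxy backend for non-static URLs moves to 127.0.0.1:8008. A quick sanity check, assuming a host where this configuration is active:

# The listener should answer on 8080; port 8008 is only reached through
# the proxy rule matching the $HTTP["url"] pattern above.
curl -sI http://127.0.0.1:8080/ | head -n1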


@ -0,0 +1 @@
database/mariadb


@ -1 +1,2 @@
virt/libvirt
database/mariadb


@ -1,5 +1,6 @@
SRC_DIR="$CGCS_BASE/git/ceph"
COPY_LIST="files/* $DISTRO/patches/*"
TIS_BASE_SRCREV=3f07f7ff1a5c7bfa8d0de12c966594d5fb7cf4ec
TIS_PATCH_VER=GITREVCOUNT
TIS_PATCH_VER=GITREVCOUNT+1
BUILD_IS_BIG=40
BUILD_IS_SLOW=26


@ -1 +0,0 @@
../../../../git/ceph/ceph.spec

ceph/ceph/centos/ceph.spec (new file, 1893 lines)

File diff suppressed because it is too large.

@ -0,0 +1,59 @@
From 03340eaf0004e3cc8e3f8991ea96a46757d92830 Mon Sep 17 00:00:00 2001
From: Don Penney <don.penney@windriver.com>
Date: Sat, 26 Jan 2019 13:34:55 -0500
Subject: [PATCH] Add hooks for orderly shutdown on controller
Hook the ceph init script to add systemd overrides to define
an orderly shutdown for StarlingX controllers.
Signed-off-by: Don Penney <don.penney@windriver.com>
---
src/init-ceph.in | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/src/init-ceph.in b/src/init-ceph.in
index 1fdb4b3..515d818 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -861,6 +861,38 @@ for name in $what; do
fi
fi
+ . /etc/platform/platform.conf
+ if [ "${nodetype}" = "controller" ]; then
+ # StarlingX: Hook the transient services launched by systemd-run
+ # to allow for proper cleanup and orderly shutdown
+
+ # Set nullglob so wildcards will return empty string if no match
+ shopt -s nullglob
+
+ OSD_SERVICES=$(for svc in /run/systemd/system/ceph-osd*.service; do basename $svc; done | xargs echo)
+ for d in /run/systemd/system/ceph-osd*.d; do
+ cat <<EOF > $d/starlingx-overrides.conf
+[Unit]
+Before=docker.service
+After=sm-shutdown.service
+
+EOF
+ done
+
+ for d in /run/systemd/system/ceph-mon*.d; do
+ cat <<EOF > $d/starlingx-overrides.conf
+[Unit]
+Before=docker.service
+After=sm-shutdown.service ${OSD_SERVICES}
+
+EOF
+ done
+
+ shopt -u nullglob
+
+ systemctl daemon-reload
+ fi
+
[ -n "$post_start" ] && do_cmd "$post_start"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && touch $lockfile
;;
--
1.8.3.1
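
For illustration, here is the kind of drop-in the hook above generates for one transient OSD unit (the unit name ceph-osd@0 is hypothetical; the actual names depend on the transient units systemd-run created under /run/systemd/system). After the daemon-reload, the override orders the unit's shutdown relative to docker.service and sm-shutdown.service:

$ cat /run/systemd/system/ceph-osd@0.service.d/starlingx-overrides.conf
[Unit]
Before=docker.service
After=sm-shutdown.service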


@ -0,0 +1,282 @@
#!/bin/bash
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script is a helper wrapper for pmon monitoring of ceph
# processes. The "/etc/init.d/ceph" script does not know if ceph is
# running on the node. For example when the node is locked, ceph
# processes are not running. In that case we do not want pmond to
# monitor these processes.
#
# The script "/etc/services.d/<node>/ceph.sh" will create the file
# "/var/run/.ceph_started" when ceph is running and remove it when
# it is not.
#
# The script also extracts one or more ceph process names that are
# reported as 'not running' or 'dead' or 'failed' by '/etc/init.d/ceph status'
# and writes the names to a text file: /tmp/ceph_status_failure.txt for
# pmond to access. The pmond adds the text to logs and alarms. Examples of
# text written to the file by this script are:
# 'osd.1'
# 'osd.1, osd.2'
# 'mon.storage-0'
# 'mon.storage-0, osd.2'
#
# Moreover, for processes that are reported as 'hung' by '/etc/init.d/ceph status'
# the script will try to increase their logging to 'debug' for a configurable
# interval. With logging increased it will output a few stack traces; then, at
# the end of this interval, it dumps the process core and kills the process.
#
# Return values:
# zero - /etc/init.d/ceph returned success or ceph is not running on the node
# non-zero - /etc/init.d/ceph returned a failure or the syntax was invalid
#
source /usr/bin/tsconfig
source /etc/platform/platform.conf
CEPH_SCRIPT="/etc/init.d/ceph"
CEPH_FILE="$VOLATILE_PATH/.ceph_started"
CEPH_RESTARTING_FILE="$VOLATILE_PATH/.ceph_restarting"
CEPH_GET_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_status"
CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt"
BINDIR=/usr/bin
SBINDIR=/usr/sbin
LIBDIR=/usr/lib64/ceph
ETCDIR=/etc/ceph
source $LIBDIR/ceph_common.sh
LOG_PATH=/var/log/ceph
LOG_FILE=$LOG_PATH/ceph-process-states.log
LOG_LEVEL=NORMAL # DEBUG
verbose=0
DATA_PATH=$VOLATILE_PATH/ceph_hang # folder where we keep state information
mkdir -p $DATA_PATH # make sure folder exists
MONITORING_INTERVAL=15
TRACE_LOOP_INTERVAL=5
GET_STATUS_TIMEOUT=120
CEPH_STATUS_TIMEOUT=20
WAIT_FOR_CMD=1
RC=0
args=("$@")
if [ ! -z $ARGS ]; then
IFS=";" read -r -a new_args <<< "$ARGS"
args+=("${new_args[@]}")
fi
wait_for_status ()
{
timeout=$GET_STATUS_TIMEOUT # wait for status no more than $timeout seconds
while [ -f ${CEPH_GET_STATUS_FILE} ] && [ $timeout -gt 0 ]; do
sleep 1
let timeout-=1
done
if [ $timeout -eq 0 ]; then
wlog "-" "WARN" "Getting status takes more than ${GET_STATUS_TIMEOUT}s, continuing"
rm -f $CEPH_GET_STATUS_FILE
fi
}
start ()
{
if [ -f ${CEPH_FILE} ]; then
wait_for_status
${CEPH_SCRIPT} start $1
RC=$?
else
# Ceph is not running on this node, return success
exit 0
fi
}
stop ()
{
wait_for_status
${CEPH_SCRIPT} stop $1
}
restart ()
{
if [ -f ${CEPH_FILE} ]; then
wait_for_status
touch $CEPH_RESTARTING_FILE
${CEPH_SCRIPT} restart $1
rm -f $CEPH_RESTARTING_FILE
else
# Ceph is not running on this node, return success
exit 0
fi
}
log_and_restart_blocked_osds ()
{
# Log info about the blocked osd daemons and then restart them
local names=$1
for name in $names; do
wlog $name "INFO" "Restarting OSD with blocked operations"
${CEPH_SCRIPT} restart $name
done
}
log_and_kill_hung_procs ()
{
# Log info about the hung processes and then kill them; later on pmon will restart them
local names=$1
for name in $names; do
type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
id=`echo $name | cut -c 4- | sed 's/^\\.//'`
get_conf run_dir "/var/run/ceph" "run dir"
get_conf pid_file "$run_dir/$type.$id.pid" "pid file"
pid=$(cat $pid_file)
wlog $name "INFO" "Dealing with hung process (pid:$pid)"
# monitoring interval
wlog $name "INFO" "Increasing log level"
execute_ceph_cmd ret $name "ceph daemon $name config set debug_$type 20/20"
monitoring=$MONITORING_INTERVAL
while [ $monitoring -gt 0 ]; do
if [ $(($monitoring % $TRACE_LOOP_INTERVAL)) -eq 0 ]; then
date=$(date "+%Y-%m-%d_%H-%M-%S")
log_file="$LOG_PATH/hang_trace_${name}_${pid}_${date}.log"
wlog $name "INFO" "Dumping stack trace to: $log_file"
$(pstack $pid >$log_file) &
fi
let monitoring-=1
sleep 1
done
wlog $name "INFO" "Trigger core dump"
kill -ABRT $pid &>/dev/null
rm -f $pid_file # process is dead, core dump is archiving, preparing for restart
# Wait for pending systemd core dumps
sleep 2 # hope systemd_coredump has started meanwhile
deadline=$(( $(date '+%s') + 300 ))
while [[ $(date '+%s') -lt "${deadline}" ]]; do
systemd_coredump_pid=$(pgrep -f "systemd-coredump.*${pid}.*ceph-${type}")
[[ -z "${systemd_coredump_pid}" ]] && break
wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
sleep 2
done
kill -KILL $pid &>/dev/null
done
}
status ()
{
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then
timeout $CEPH_STATUS_TIMEOUT ceph -s
if [ "$?" -ne 0 ]; then
# Ceph cluster is not accessible. Don't panic, controller swact
# may be in progress.
wlog "-" INFO "Ceph is down, ignoring OSD status."
exit 0
fi
fi
if [ -f ${CEPH_RESTARTING_FILE} ]; then
# Ceph is restarting, we don't report state changes on the first pass
rm -f ${CEPH_RESTARTING_FILE}
exit 0
fi
if [ -f ${CEPH_FILE} ]; then
# Make sure the script does not 'exit' between here and the 'rm -f' below
# or the checkpoint file will be left behind
touch -f ${CEPH_GET_STATUS_FILE}
result=`${CEPH_SCRIPT} status $1`
RC=$?
if [ "$RC" -ne 0 ]; then
erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
invalid=0
host=`hostname`
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
# On 2 node configuration we have a floating monitor
host="controller"
fi
for i in $(echo $erred_procs $hung_procs); do
if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host ]]; then
continue
else
invalid=1
fi
done
log_and_restart_blocked_osds $blocked_ops_procs
log_and_kill_hung_procs $hung_procs
hung_procs_text=""
for i in $(echo $hung_procs); do
hung_procs_text+="$i(process hung) "
done
rm -f $CEPH_STATUS_FAILURE_TEXT_FILE
if [ $invalid -eq 0 ]; then
text=""
for i in $erred_procs; do
text+="$i, "
done
for i in $hung_procs; do
text+="$i (process hang), "
done
echo "$text" | tr -d '\n' > $CEPH_STATUS_FAILURE_TEXT_FILE
else
echo "$host: '${CEPH_SCRIPT} status $1' result contains invalid process names: $erred_procs"
echo "Undetermined osd or monitor id" > $CEPH_STATUS_FAILURE_TEXT_FILE
fi
fi
rm -f ${CEPH_GET_STATUS_FILE}
if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
# SM needs exit code != 0 from 'status mon' argument of the init script on
# standby controller otherwise it thinks that the monitor is running and
# tries to stop it.
# '/etc/init.d/ceph status mon' checks the status of monitors configured in
# /etc/ceph/ceph.conf and if it should be running on current host.
# If it should not be running it just exits with code 0. This is what
# happens on the standby controller.
# When floating monitor is running on active controller /var/lib/ceph/mon of
# standby is not mounted (Ceph monitor partition is DRBD synced).
test -e "/var/lib/ceph/mon/ceph-controller"
if [ "$?" -ne 0 ]; then
exit 3
fi
fi
else
# Ceph is not running on this node, return success
exit 0
fi
}
case "${args[0]}" in
start)
start ${args[1]}
;;
stop)
stop ${args[1]}
;;
restart)
restart ${args[1]}
;;
status)
status ${args[1]}
;;
*)
echo "Usage: $0 {start|stop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]"
exit 1
;;
esac
exit $RC
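
A usage sketch for the wrapper, mirroring what pmond does in status mode (the pmon configuration later in this commit drives it with these same arguments):

# Hypothetical manual run on a node where ceph is running:
/etc/init.d/ceph-init-wrapper status osd
if [ $? -ne 0 ]; then
    # e.g. "osd.1, osd.2", per the samples in the header comment
    cat /tmp/ceph_status_failure.txt
    /etc/init.d/ceph-init-wrapper start osd
fi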


@ -1,6 +1,6 @@
#!/usr/bin/python
#
# Copyright (c) 2016 Wind River Systems, Inc.
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -12,6 +12,7 @@ import re
import subprocess
import sys
DEVICE_NAME_NVME = "nvme"
#########
# Utils #
@ -85,7 +86,11 @@ def is_partitioning_correct(disk_path, partition_sizes):
partition_index = 1
for size in partition_sizes:
# Check that each partition size matches the one in input
partition_node = disk_node + str(partition_index)
if DEVICE_NAME_NVME in disk_node:
partition_node = '{}p{}'.format(disk_node, str(partition_index))
else:
partition_node = '{}{}'.format(disk_node, str(partition_index))
output, _, _ = command(["udevadm", "settle", "-E", partition_node])
cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"]
output, _, _ = command(cmd)
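
The hunk above encodes the kernel's partition-naming rule: NVMe disks insert a "p" between the disk name and the partition number, other disks do not. A minimal shell sketch of the same rule:

# partition_node /dev/nvme0n1 1  ->  /dev/nvme0n1p1
# partition_node /dev/sda 1      ->  /dev/sda1
partition_node() {
    local disk=$1 index=$2
    case "$disk" in
        *nvme*) echo "${disk}p${index}" ;;
        *)      echo "${disk}${index}" ;;
    esac
}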


@ -0,0 +1,30 @@
#!/bin/bash
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
script=$(basename $0)
# Set nullglob so wildcards will return empty string if no match
shopt -s nullglob
for dev in /dev/rbd[0-9]*; do
for mnt in $(mount | awk -v dev=$dev '($1 == dev) {print $3}'); do
logger -t ${script} "Unmounting $mnt"
/usr/bin/umount $mnt
done
logger -t ${script} "Unmounted $dev"
done
for dev in /dev/rbd[0-9]*; do
/usr/bin/rbd unmap -o force $dev
logger -t ${script} "Unmapped $dev"
done
lsmod | grep -q '^rbd\>' && /usr/sbin/modprobe -r rbd
lsmod | grep -q '^libceph\>' && /usr/sbin/modprobe -r libceph
exit 0


@ -0,0 +1,18 @@
[Unit]
Description=radosgw RESTful rados gateway
After=network.target
#After=remote-fs.target nss-lookup.target network-online.target time-sync.target
#Wants=network-online.target
[Service]
Type=forking
Restart=no
KillMode=process
RemainAfterExit=yes
ExecStart=/etc/rc.d/init.d/ceph-radosgw start
ExecStop=/etc/rc.d/init.d/ceph-radosgw stop
ExecReload=/etc/rc.d/init.d/ceph-radosgw reload
[Install]
WantedBy=multi-user.target


@ -0,0 +1,92 @@
#!/bin/sh
### BEGIN INIT INFO
# Provides: ceph-rest-api
# Required-Start: $ceph
# Required-Stop: $ceph
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Ceph REST API daemon
# Description: Ceph REST API daemon
### END INIT INFO
DESC="ceph-rest-api"
DAEMON="/usr/bin/ceph-rest-api"
RUNDIR="/var/run/ceph"
PIDFILE="${RUNDIR}/ceph-rest-api.pid"
start()
{
if [ -e $PIDFILE ]; then
PIDDIR=/proc/$(cat $PIDFILE)
if [ -d ${PIDDIR} ]; then
echo "$DESC already running."
exit 0
else
echo "Removing stale PID file $PIDFILE"
rm -f $PIDFILE
fi
fi
echo -n "Starting $DESC..."
mkdir -p $RUNDIR
start-stop-daemon --start --quiet --background \
--pidfile ${PIDFILE} --make-pidfile --exec ${DAEMON}
if [ $? -eq 0 ]; then
echo "done."
else
echo "failed."
exit 1
fi
}
stop()
{
echo -n "Stopping $DESC..."
start-stop-daemon --stop --quiet --pidfile $PIDFILE
if [ $? -eq 0 ]; then
echo "done."
else
echo "failed."
fi
rm -f $PIDFILE
}
status()
{
pid=`cat $PIDFILE 2>/dev/null`
if [ -n "$pid" ]; then
if ps -p $pid &>/dev/null ; then
echo "$DESC is running"
exit 0
else
echo "$DESC is not running but has pid file"
exit 1
fi
fi
echo "$DESC is not running"
exit 3
}
case "$1" in
start)
start
;;
stop)
stop
;;
restart|force-reload|reload)
stop
start
;;
status)
status
;;
*)
echo "Usage: $0 {start|stop|force-reload|restart|reload|status}"
exit 1
;;
esac
exit 0


@ -0,0 +1,16 @@
[Unit]
Description=Ceph REST API
After=network.target ceph.target
[Service]
Type=forking
Restart=no
KillMode=process
RemainAfterExit=yes
ExecStart=/etc/rc.d/init.d/ceph-rest-api start
ExecStop=/etc/rc.d/init.d/ceph-rest-api stop
ExecReload=/etc/rc.d/init.d/ceph-rest-api reload
[Install]
WantedBy=multi-user.target

ceph/ceph/files/ceph.conf (new file, 50 lines)

@ -0,0 +1,50 @@
[global]
# Unique ID for the cluster.
fsid = %CLUSTER_UUID%
# Public network where the monitor is connected to, e.g., 128.224.0.0/16
#public network = 127.0.0.1/24
# For version 0.55 and beyond, you must explicitly enable
# or disable authentication with "auth" entries in [global].
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
osd_journal_size = 1024
# Uncomment the following line if you are mounting with ext4
# filestore xattr use omap = true
# Number of replicas of objects. Write an object 2 times.
# Cluster cannot reach an active + clean state until there are enough OSDs
# to handle the number of copies of an object. In this case, it requires
# at least 2 OSDs.
osd_pool_default_size = 2
# Allow writing one copy in a degraded state.
osd_pool_default_min_size = 1
# Ensure you have a realistic number of placement groups. We recommend
# approximately 100 per OSD. E.g., total number of OSDs multiplied by 100
# divided by the number of replicas (i.e., osd pool default size). So for
# 2 OSDs and osd pool default size = 2, we'd recommend approximately
# (100 * 2) / 2 = 100.
osd_pool_default_pg_num = 64
osd_pool_default_pgp_num = 64
osd_crush_chooseleaf_type = 1
setuser match path = /var/lib/ceph/$type/$cluster-$id
# Override Jewel default of 2 reporters. StarlingX has replication factor 2
mon_osd_min_down_reporters = 1
# Use Hammer's report interval default value
osd_mon_report_interval_max = 120
[osd]
osd_mkfs_type = xfs
osd_mkfs_options_xfs = "-f"
osd_mount_options_xfs = "rw,noatime,inode64,logbufs=8,logbsize=256k"
[mon]
mon warn on legacy crush tunables = false
# Quiet new warnings on move to Hammer
mon pg warn max per osd = 2048
mon pg warn max object skew = 0
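
Working through the arithmetic in the comments above for this file's defaults: 2 OSDs * 100 PGs / 2 replicas = 100; the configured value of 64 additionally assumes the common Ceph practice (not stated in this file) of rounding down to a power of two:

osds=2; replicas=2
target=$(( osds * 100 / replicas ))                              # = 100
pg=1; while [ $(( pg * 2 )) -le $target ]; do pg=$(( pg * 2 )); done
echo $pg                                                         # = 64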


@ -0,0 +1,26 @@
[process]
process = ceph
script = /etc/init.d/ceph-init-wrapper
style = lsb
severity = major ; minor, major, critical
restarts = 3 ; restart retries before error assertion
interval = 30 ; number of seconds to wait between restarts
mode = status ; Monitoring mode: passive (default) or active
; passive: process death monitoring (default: always)
; active : heartbeat monitoring, i.e. request / response messaging
; status : determine process health by executing the "status" command
; "start" is used to start the process(es) again
; ignore : do not monitor or stop monitoring
; Status and Active Monitoring Options
period = 30 ; monitor period in seconds
timeout = 120 ; for active mode, messaging timeout period in seconds, must be shorter than period
; for status mode, max amount of time for a command to execute
; Status Monitoring Options
start_arg = start ; start argument for the script
status_arg = status ; status argument for the script
status_failure_text = /tmp/ceph_status_failure.txt ; text to be added to alarms or logs, this is optional


@ -0,0 +1,16 @@
[Unit]
Description=StarlingX Ceph Startup
After=network.target
[Service]
Type=forking
Restart=no
KillMode=process
RemainAfterExit=yes
ExecStart=/etc/rc.d/init.d/ceph start
ExecStop=/etc/rc.d/init.d/ceph stop
PIDFile=/var/run/ceph/ceph.pid
[Install]
WantedBy=multi-user.target

ceph/ceph/files/ceph.sh (new executable file, 77 lines)

@ -0,0 +1,77 @@
#!/bin/bash
INITDIR=/etc/init.d
LOGFILE=/var/log/ceph/ceph-init.log
CEPH_FILE=/var/run/.ceph_started
# Get our nodetype
. /etc/platform/platform.conf
# Exit immediately if ceph not configured (i.e. no mon in the config file)
if ! grep -q "mon\." /etc/ceph/ceph.conf
then
exit 0
fi
logecho ()
{
echo $1
date >> ${LOGFILE}
echo $1 >> ${LOGFILE}
}
start ()
{
if [[ "$nodetype" == "controller" ]] || [[ "$nodetype" == "storage" ]]; then
logecho "Starting ceph services..."
${INITDIR}/ceph start >> ${LOGFILE} 2>&1
RC=$?
if [ ! -f ${CEPH_FILE} ]; then
touch ${CEPH_FILE}
fi
else
logecho "No ceph services on ${nodetype} node"
exit 0
fi
}
stop ()
{
if [[ "$nodetype" == "controller" ]] || [[ "$nodetype" == "storage" ]]; then
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" == "simplex" ]]; then
logecho "Ceph services will continue to run on node"
exit 0
fi
logecho "Stopping ceph services..."
if [ -f ${CEPH_FILE} ]; then
rm -f ${CEPH_FILE}
fi
${INITDIR}/ceph stop >> ${LOGFILE} 2>&1
RC=$?
else
logecho "No ceph services on ${nodetype} node"
exit 0
fi
}
RC=0
case "$1" in
start)
start
;;
stop)
stop
;;
*)
echo "Usage: $0 {start|stop}"
exit 1
;;
esac
logecho "RC was: $RC"
exit $RC
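
The flag file written and removed here is the same one the ceph-init-wrapper above keys on, so the two scripts cooperate. A minimal check, assuming a controller or storage node:

# If the flag exists, the wrapper forwards commands to /etc/init.d/ceph;
# otherwise it exits 0 so pmond does not raise alarms on locked nodes.
[ -f /var/run/.ceph_started ] && echo "ceph monitored" || echo "ceph not running"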


@ -0,0 +1,246 @@
#!/usr/bin/python
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# Wait for one or a group of OSDs to match one or a group of statuses
# as reported by "ceph osd tree".
#
# Examples:
# - wait for osd 0 to be up:
# osd-wait-status -o 0 -s up
#
# - wait for osd 0 and osd 1 to be up:
# osd-wait-status -o 0 1 -s up
#
# The amount of time spent waiting for OSDs to match a status can
# be limited by specifying:
#
# - the maximum retry count; the script gives up if the status doesn't
# match the desired one after more than retry-count attempts.
# The interval between attempts is controlled by the "-i" flag.
# Example:
# osd-wait-status -o 0 -s up -c 2 -i 3
# will call "ceph osd tree" once to get the status of osd 0 and if
# it's not "up" then it will try one more time after 3 seconds.
#
# - a deadline as the maximum interval of time the script is looping
# waiting for OSDs to match status. The interval between attempts
# is controlled by the "-i" flag.
# Example:
# osd-wait-status -o 0 -s up -d 10 -i 3
# will call "ceph osd tree" until either osd 0 status is "up" or
# no more than 10 seconds have passed, that's 3-4 attempts depending
# on how much time it takes to run "ceph osd tree"
#
# Status match can be reversed by using "-n" flag.
# Example:
# osd-wait-status -o 0 -n -s up
# waits until osd 0 status is NOT up.
#
# osd-wait-status does not allow matching arbitrary combinations of
# OSDs and statuses. For example: "osd 0 up and osd 1 down" is not
# supported.
#
# Return code is 0 if OSDs match expected status before the
# retry count*interval / deadline limits are reached.
import argparse
import json
import logging
import retrying
import subprocess
import sys
import time
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger('osd-wait-status')
CEPH_BINARY_PATH = '/usr/bin/ceph'
RETRY_INTERVAL_SEC = 1
RETRY_FOREVER = 0
NO_DEADLINE = 0
class OsdException(Exception):
def __init__(self, message, restartable=False):
super(OsdException, self).__init__(message)
self.restartable = restartable
def get_osd_tree():
command = [CEPH_BINARY_PATH,
'osd', 'tree', '--format', 'json']
try:
p = subprocess.Popen(command,
stdout = subprocess.PIPE,
stderr = subprocess.PIPE)
output, error = p.communicate()
if p.returncode != 0:
raise OsdException(
('Command failed: command="{}", '
'returncode={}, output="{}"').format(
' '.join(command),
p.returncode,
output, error),
restartable=True)
except OSError as e:
raise OsdException(
('Command failed: command="{}", '
'reason="{}"').format(command, str(e)))
try:
return json.loads(output)
except ValueError as e:
raise OsdException(
('JSON decode failed: '
'data="{}", error="{}"').format(
output, e))
def osd_match_status(target_osd, target_status,
reverse_logic):
LOG.info(('Match status: '
'target_osd={}, '
'target status={}, '
'reverse_logic={}').format(
target_osd, target_status, reverse_logic))
tree = get_osd_tree()
osd_status = {}
for node in tree.get('nodes'):
name = node.get('name')
if name in target_osd:
osd_status[name] = node.get('status')
if len(osd_status) == len(target_osd):
break
LOG.info('Current OSD(s) status: {}'.format(osd_status))
for name in target_osd:
if name not in osd_status:
raise OsdException(
('Unable to retrieve status '
'for "{}"').format(
name))
if reverse_logic:
if osd_status[name] not in target_status:
del osd_status[name]
else:
if osd_status[name] in target_status:
del osd_status[name]
if len(osd_status) == 0:
LOG.info('OSD(s) status target reached.')
return True
else:
LOG.info('OSD(s) {}matching status {}: {}'.format(
'' if reverse_logic else 'not ',
target_status,
osd_status.keys()))
return False
def osd_wait_status(target_osd, target_status,
reverse_logic,
retry_count, retry_interval,
deadline):
def retry_if_false(result):
return (result is False)
def retry_if_restartable(exception):
return (isinstance(exception, OsdException)
and exception.restartable)
LOG.info(('Wait options: '
'target_osd={}, '
'target_status={}, '
'reverse_logic={}, '
'retry_count={}, '
'retry_interval={}, '
'deadline={}').format(
target_osd, target_status, reverse_logic,
retry_count, retry_interval, deadline))
kwargs = {
'retry_on_result': retry_if_false,
'retry_on_exception': retry_if_restartable}
if retry_count != RETRY_FOREVER:
kwargs['stop_max_attempt_number'] = retry_count
if deadline != NO_DEADLINE:
kwargs['stop_max_delay'] = deadline * 1000
if retry_interval != 0:
kwargs['wait_fixed'] = retry_interval * 1000
if not len(target_osd):
return
retrying.Retrying(**kwargs).call(
osd_match_status,
target_osd, target_status,
reverse_logic)
def non_negative_integer(value):
    value = int(value)
    if value < 0:
        raise argparse.ArgumentTypeError(
            '{} is a negative integer value'.format(value))
    return value
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Wait for OSD status match')
parser.add_argument(
'-o', '--osd',
nargs='*',
help='osd id',
type=non_negative_integer,
required=True)
parser.add_argument(
'-n', '--not',
dest='reverse_logic',
help='reverse logic: wait for status NOT to match',
action='store_true',
default=False)
parser.add_argument(
'-s', '--status',
nargs='+',
help='status',
type=str,
required=True)
parser.add_argument(
'-c', '--retry-count',
help='retry count',
type=non_negative_integer,
default=RETRY_FOREVER)
parser.add_argument(
'-i', '--retry-interval',
help='retry interval (seconds)',
type=non_negative_integer,
default=RETRY_INTERVAL_SEC)
parser.add_argument(
'-d', '--deadline',
help='deadline (seconds)',
type=non_negative_integer,
default=NO_DEADLINE)
args = parser.parse_args()
start = time.time()
try:
osd_wait_status(
['osd.{}'.format(o) for o in args.osd],
args.status,
args.reverse_logic,
args.retry_count,
args.retry_interval,
args.deadline)
LOG.info('Elapsed time: {:.02f} seconds'.format(
time.time() - start))
sys.exit(0)
except retrying.RetryError as e:
LOG.warn(
('Retry error: {}. '
'Elapsed time: {:.02f} seconds'.format(
e, time.time() - start)))
except OsdException as e:
LOG.warn(
('OSD wait error: {}. '
'Elapsed time: {:.02f} seconds').format(
e, time.time() - start))
sys.exit(1)
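
One more invocation sketch combining the documented flags, assuming the script is installed on PATH as osd-wait-status: wait during shutdown until two OSDs stop reporting "up":

# Poll roughly every 3s, give up after 30s; exits 0 as soon as neither
# osd.0 nor osd.1 reports "up" in "ceph osd tree".
osd-wait-status -o 0 1 -n -s up -d 30 -i 3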


@ -0,0 +1,3 @@
[Service]
ExecStopPost=/usr/sbin/ceph-preshutdown.sh


@ -0,0 +1,2 @@
656b5b63ed7c43bd014bcafd81b001959d5f089f
v10.2.6


@ -1,2 +1,2 @@
SRC_DIR="files"
TIS_PATCH_VER=0
TIS_PATCH_VER=1


@ -107,6 +107,7 @@ destination d_sm { file("/var/log/sm.log"); };
destination d_rmon { file("/var/log/rmond.log" template(t_mtc)); };
destination d_rmon_notify { file("/var/log/rmond_notify.log" template(t_mtc)); };
destination d_pmon { file("/var/log/pmond.log" template(t_mtc)); };
destination d_lmon { file("/var/log/lmond.log" template(t_mtc)); };
destination d_hostwd { file("/var/log/hostwd.log" template(t_mtc)); };
destination d_fsmon { file("/var/log/fsmond.log" template(t_mtc)); };
destination d_hwmon { file("/var/log/hwmond.log" template(t_mtc)); };
@ -352,6 +353,7 @@ filter f_local7 { facility(local7); };
filter f_rmon { facility(local5) and program(rmond); };
filter f_rmon_notify { facility(local5) and program(rmon_resource_notify); };
filter f_pmon { facility(local5) and program(pmond); };
filter f_lmon { facility(local5) and program(lmond); };
filter f_hostw { facility(local5) and program(hostwd); };
filter f_fsmon { facility(local5) and program(fsmond); };
filter f_hwmon { facility(local5) and program(hwmond); };
@ -472,6 +474,7 @@ log { source(s_src); filter(f_local3); destination(d_sm); };
log { source(s_src); filter(f_rmon); destination(d_rmon); };
log { source(s_src); filter(f_rmon_notify); destination(d_rmon_notify); };
log { source(s_src); filter(f_pmon); destination(d_pmon); };
log { source(s_src); filter(f_lmon); destination(d_lmon); };
log { source(s_src); filter(f_hostw); destination(d_hostwd); };
log { source(s_src); filter(f_fsmon); destination(d_fsmon); };
log { source(s_src); filter(f_hwmon); destination(d_hwmon); };
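
A quick way to exercise the new lmond plumbing above, assuming syslog-ng has been restarted with this configuration (logger's -t tag is what the program() filter matches):

logger -p local5.info -t lmond "lmond logging test"
tail -n1 /var/log/lmond.log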


@ -0,0 +1,6 @@
FROM openstackhelm/mariadb:10.2.18
RUN apt-get update && apt-get install -y galera-arbitrator-3
CMD ["/usr/bin/garbd"]
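
A local equivalent of what the StarlingX build produces from this Dockerfile (the stx-mariadb tag mirrors the LABEL in the build config below; the tag itself is arbitrary here):

docker build -t stx-mariadb:dev .
docker run --rm stx-mariadb:dev    # runs /usr/bin/garbd, the Galera arbitrator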


@ -0,0 +1,3 @@
BUILDER=docker
LABEL=stx-mariadb


@ -0,0 +1,3 @@
BUILDER=docker
LABEL=stx-mariadb


@ -1,4 +1,4 @@
COPY_LIST="$FILES_BASE/* \
$DISTRO/patches/* \
$CGCS_BASE/downloads/drbd-8.4.3.tar.gz"
TIS_PATCH_VER=6
TIS_PATCH_VER=7


@ -34,7 +34,7 @@ Source: http://oss.linbit.com/%{name}/8.3/%{name}-%{version}.tar.gz
Source1: drbd.service
# WRS
# StarlingX
Patch0001: 0001-skip_wait_con_int_on_simplex.patch
Patch0002: 0002-drbd-conditional-crm-dependency.patch
Patch0003: 0003-drbd_report_condition.patch
@ -43,6 +43,7 @@ Patch0005: 0005-drbd_reconnect_standby_standalone.patch
Patch0006: 0006-avoid-kernel-userspace-version-check.patch
Patch0007: 0007-Update-OCF-to-attempt-connect-in-certain-states.patch
Patch0008: 0008-Increase-short-cmd-timeout-to-15-secs.patch
Patch0009: 0009-Check-for-mounted-device-before-demoting-Primary-DRB.patch
License: GPLv2+
ExclusiveOS: linux
@ -271,6 +272,7 @@ management utility.
%patch0006 -p1
%patch0007 -p1
%patch0008 -p1
%patch0009 -p1
%build
%configure \


@ -0,0 +1,45 @@
From 017157d21a56410811384a43d0b0cbba6444baeb Mon Sep 17 00:00:00 2001
From: Don Penney <don.penney@windriver.com>
Date: Wed, 6 Feb 2019 01:19:59 -0500
Subject: [PATCH] Check for mounted device before demoting Primary DRBD
resource
Update the OCF script to check for a mounted device when demoting
a resource that's in the Primary state; otherwise the state change
will fail if the device is still in use.
Signed-off-by: Don Penney <don.penney@windriver.com>
---
scripts/drbd.ocf | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/scripts/drbd.ocf b/scripts/drbd.ocf
index e03bf6d..95da11a 100644
--- a/scripts/drbd.ocf
+++ b/scripts/drbd.ocf
@@ -720,7 +720,21 @@ drbd_stop() {
;;
$OCF_RUNNING_MASTER)
ocf_log warn "$DRBD_RESOURCE still Primary, demoting."
- do_drbdadm secondary $DRBD_RESOURCE
+ found=no
+ for dev in ${DRBD_DEVICES[@]} ""; do
+ cat /proc/mounts | grep -q "^${dev} "
+ if [ $? -eq 0 ]; then
+ ocf_log warn "${DRBD_RESOURCE} is still mounted via $dev"
+ found=yes
+ break
+ fi
+ done
+ if [ "${found}" = "yes" ]; then
+ ocf_log warn "Waiting to drop $DRBD_RESOURCE"
+ else
+ ocf_log warn "Dropping $DRBD_RESOURCE to Secondary"
+ do_drbdadm secondary $DRBD_RESOURCE
+ fi
esac
$first_try || sleep 1
first_try=false
--
1.8.3.1
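
The guard the patch adds, shown in isolation (the device and resource names here are hypothetical; the OCF script iterates over all devices of $DRBD_RESOURCE):

dev=/dev/drbd0
if grep -q "^${dev} " /proc/mounts; then
    echo "${dev} still mounted; waiting to demote"
else
    drbdadm secondary r0
fi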


@ -1,4 +1,7 @@
The spec file used here was from the kubernetes 1.10 src rpm.
The spec_diff shows the modifications made to that spec file,
to help understand which changes were needed, to assist with
future upversioning.
The spec file used here was from the kubernetes 1.10.0 src rpm.
The orig file is included to help show modifications made to that
spec file, to help understand which changes were needed and to
assist with future upversioning.
The contrib tarball does not have the same versioning as kubernetes and
there is little activity in that repo.


@ -1,7 +1,8 @@
VERSION=1.12.1
VERSION=1.12.3
CON_VERSION=1.12.1
TAR_NAME=kubernetes
TAR="$TAR_NAME-v$VERSION.tar.gz"
CONTRIB="$TAR_NAME-contrib-v$VERSION.tar.gz"
CONTRIB="$TAR_NAME-contrib-v$CON_VERSION.tar.gz"
COPY_LIST="${CGCS_BASE}/downloads/$TAR ${CGCS_BASE}/downloads/$CONTRIB $FILES_BASE/*"


@ -23,7 +23,7 @@
%global provider_prefix %{provider}.%{provider_tld}/%{project}/%{repo}
%global import_path k8s.io/kubernetes
%global commit 1.12.1
%global commit 1.12.3
%global con_provider github
%global con_provider_tld com
@ -32,7 +32,7 @@
# https://github.com/kubernetes/contrib
%global con_commit 1.12.1
%global kube_version 1.12.1
%global kube_version 1.12.3
%global kube_git_version v%{kube_version}
# Needed otherwise "version_ldflags=$(kube::version_ldflags)" doesn't work

File diff suppressed because it is too large.


@ -1,77 +0,0 @@
26,27c26
< %global commit fc32d2f3698e36b93322a3465f63a14e9f0eaead
< %global shortcommit %(c=%{commit}; echo ${c:0:7})
---
> %global commit 1.12.1
32c31
< %global con_repo contrib
---
> %global con_repo kubernetes-contrib
34,36c33
< %global con_provider_prefix %{con_provider}.%{con_provider_tld}/%{con_project}/%{con_repo}
< %global con_commit 5b445f1c53aa8d6457523526340077935f62e691
< %global con_shortcommit %(c=%{con_commit}; echo ${c:0:7})
---
> %global con_commit 1.12.1
38c35
< %global kube_version 1.10.0
---
> %global kube_version 1.12.1
48c45
< Release: 1%{?dist}
---
> Release: 1%{?_tis_dist}.%{tis_patch_ver}
53,54c50,51
< Source0: https://%{provider_prefix}/archive/%{commit}/%{repo}-%{shortcommit}.tar.gz
< Source1: https://%{con_provider_prefix}/archive/%{con_commit}/%{con_repo}-%{con_shortcommit}.tar.gz
---
> Source0: %{project}-v%{kube_version}.tar.gz
> Source1: %{con_repo}-v%{con_commit}.tar.gz
60,68d56
< Patch3: build-with-debug-info.patch
< #Patch4: make-test-cmd-run-over-hyperkube-based-kubectl.patch
< #Patch5: make-e2e_node-run-over-distro-bins.patch
<
< # ppc64le
< Patch16: fix-support-for-ppc64le.patch
<
< Patch20: use_go_build-is-not-fully-propagated-so-make-it-fixe.patch
<
810c798
< Suggests: docker
---
> Suggests: docker-ce
812c800
< Requires: docker
---
> Requires: docker-ce
816c804
< BuildRequires: golang >= 1.2-7
---
> BuildRequires: golang >= 1.10.2
858,863d845
< %if 0%{?with_debug}
< %patch3 -p1
< %endif
<
< %patch20 -p1
<
883,890d864
< # Patch tests to be run over distro bins
< #patch4 -p1
< #patch5 -p1
<
< %ifarch ppc64le
< %patch16 -p1
< %endif
<
893a868
> export PBR_VERSION=%{version}
904c879
< make WHAT="--use_go_build cmd/hyperkube cmd/kube-apiserver cmd/kubeadm"
---
> make WHAT="cmd/hyperkube cmd/kube-apiserver cmd/kubeadm"
917a893
> export PBR_VERSION=%{version}
1072a1049
> %config(noreplace) %{_sysconfdir}/%{name}/kubelet.kubeconfig


@ -5,6 +5,7 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/collectd.service \
$PKG_BASE/src/fm_notifier.py \
$PKG_BASE/src/mtce_notifier.py \
$PKG_BASE/src/plugin_common.py \
$PKG_BASE/src/python_plugins.conf \
$PKG_BASE/src/cpu.py \
$PKG_BASE/src/cpu.conf \
@ -13,7 +14,9 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/df.conf \
$PKG_BASE/src/ntpq.py \
$PKG_BASE/src/ntpq.conf \
$PKG_BASE/src/interface.py \
$PKG_BASE/src/interface.conf \
$PKG_BASE/src/example.py \
$PKG_BASE/src/example.conf"
TIS_PATCH_VER=6
TIS_PATCH_VER=7


@ -15,12 +15,14 @@ Source2: collectd.conf.pmon
# collectd python plugin files - notifiers
Source3: fm_notifier.py
Source4: mtce_notifier.py
Source5: plugin_common.py
# collectd python plugin files - resource plugins
Source11: cpu.py
Source12: memory.py
Source14: example.py
Source15: ntpq.py
Source16: interface.py
# collectd plugin conf files into /etc/collectd.d
Source100: python_plugins.conf
@ -29,6 +31,7 @@ Source102: memory.conf
Source103: df.conf
Source104: example.conf
Source105: ntpq.conf
Source106: interface.conf
BuildRequires: systemd-devel
@ -64,12 +67,15 @@ install -m 600 %{SOURCE2} %{buildroot}%{local_config_extensions_dir}
# collectd python plugin files - notifiers
install -m 700 %{SOURCE3} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE4} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE5} %{buildroot}%{local_python_extensions_dir}
# collectd python plugin files - resource plugins
install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
# collectd plugin conf files into /etc/collectd.d
install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir}
@ -78,6 +84,7 @@ install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir}
%clean
rm -rf $RPM_BUILD_ROOT


@ -13,8 +13,8 @@
Instance "used"
Persist true
PersistOK true
WarningMax 90.00
FailureMax 95.00
WarningMax 89.00
FailureMax 94.00
Hits 2
Invert false
</Type>


@ -13,6 +13,7 @@
MountPoint "/var/lock"
MountPoint "/boot"
MountPoint "/scratch"
MountPoint "/opt/etcd"
MountPoint "/opt/cgcs"
MountPoint "/opt/platform"
MountPoint "/opt/extension"
@ -27,8 +28,8 @@
<Plugin "df">
<Type "percent_bytes">
Instance "used"
WarningMax 80.00
FailureMax 90.00
WarningMax 79.00
FailureMax 89.00
Persist true
PersistOK true
Hits 2


@ -4,8 +4,8 @@
Instance "used"
Persist true
PersistOK true
WarningMax 51.00
FailureMax 75.00
WarningMax 49.00
FailureMax 74.00
Hits 1
Invert false
</Type>


@ -23,17 +23,17 @@
# Collectd provides information about each event as an object passed to the
# notification handler ; the notification object.
#
# object.host - the hostname
# object.host - the hostname.
#
# object.plugin - the name of the plugin aka resource
# object.plugin - the name of the plugin aka resource.
# object.plugin_instance - plugin instance string i.e. say mountpoint
# for df plugin
# object.type, - the unit i.e. percent or absolute
# object.type_instance - the attribute i.e. free, used, etc
# for df plugin or numa? node for memory.
# object.type, - the unit i.e. percent or absolute.
# object.type_instance - the attribute i.e. free, used, etc.
#
# object.severity - a integer value 0=OK , 1=warning, 2=failure
# object.severity - an integer value 0=OK , 1=warning, 2=failure.
# object.message - a log-able message containing the above along
# with the value
# with the value.
#
# This notifier uses the notification object to manage plugin/instance alarms.
#
@ -86,9 +86,11 @@ import os
import re
import uuid
import collectd
from threading import RLock as Lock
from fm_api import constants as fm_constants
from fm_api import fm_api
import tsconfig.tsconfig as tsc
import plugin_common as pc
# only load influxdb on the controller
if tsc.nodetype == 'controller':
@ -116,6 +118,12 @@ PLUGIN = 'alarm notifier'
# Path to the plugin's drop dir
PLUGIN_PATH = '/etc/collectd.d/'
# the name of the collectd samples database
DATABASE_NAME = 'collectd samples'
READING_TYPE__PERCENT_USAGE = '% usage'
# collectd severity definitions ;
# Note: can't seem to pull them in symbolically with a header
NOTIF_FAILURE = 1
@ -145,6 +153,7 @@ mangled_list = {"dev-shm",
"etc-nova-instances",
"opt-platform",
"opt-cgcs",
"opt-etcd",
"opt-extension",
"opt-backups"}
@ -154,10 +163,20 @@ ALARM_ID__MEM = "100.103"
ALARM_ID__DF = "100.104"
ALARM_ID__EXAMPLE = "100.113"
ALARM_ID__VSWITCH_CPU = "100.102"
ALARM_ID__VSWITCH_MEM = "100.115"
ALARM_ID__VSWITCH_PORT = "300.001"
ALARM_ID__VSWITCH_IFACE = "300.002"
# ADD_NEW_PLUGIN: add new alarm id to the list
ALARM_ID_LIST = [ALARM_ID__CPU,
ALARM_ID__MEM,
ALARM_ID__DF,
ALARM_ID__VSWITCH_CPU,
ALARM_ID__VSWITCH_MEM,
ALARM_ID__VSWITCH_PORT,
ALARM_ID__VSWITCH_IFACE,
ALARM_ID__EXAMPLE]
# ADD_NEW_PLUGIN: add plugin name definition
@ -168,38 +187,29 @@ PLUGIN__CPU = "cpu"
PLUGIN__MEM = "memory"
PLUGIN__INTERFACE = "interface"
PLUGIN__NTP_QUERY = "ntpq"
PLUGIN__VSWITCH_PORT = "vswitch-port"
PLUGIN__VSWITCH_CPU = "vswitch-cpu"
PLUGIN__VSWITCH_MEM = "vswitch-memory"
PLUGIN__VSWITCH_OVSDB = "vswitch-ovsdb"
PLUGIN__VSWITCH_OPENFLOW = "vswitch-openflow"
PLUGIN__VSWITCH_LACP_IFACE = "vswitch-lacp-iface"
PLUGIN__VSWITCH_IFACE = "vswitch-iface"
PLUGIN__NOVA_THINPOOL_LVM = "nova-thinpool-lvm"
PLUGIN__CINDER_THINPOOL_LVM = "cinder-thinpool-lvm"
PLUGIN__CINDER_THINPOOL_LVM_META = "cinder-thinpool-lvm-meta"
PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_CPU = "vswitch_cpu"
PLUGIN__VSWITCH_MEM = "vswitch_mem"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"
PLUGIN__EXAMPLE = "example"
# ADD_NEW_PLUGIN: add plugin name to list
PLUGIN_NAME_LIST = [PLUGIN__CPU,
PLUGIN__MEM,
PLUGIN__DF,
PLUGIN__VSWITCH_CPU,
PLUGIN__VSWITCH_MEM,
PLUGIN__VSWITCH_PORT,
PLUGIN__VSWITCH_IFACE,
PLUGIN__EXAMPLE]
# ADD_NEW_PLUGIN: add alarm id and plugin to dictionary
# ALARM_ID_TO_PLUGIN_DICT = {}
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__CPU] = PLUGIN__CPU
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__MEM] = PLUGIN__MEM
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__DF] = PLUGIN__DF
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__EXAMPLE] = PLUGIN__EXAMPLE
# PluginObject Class
class PluginObject:
dbObj = None # shared database connection obj
host = None # saved hostname
lock = None # global lock for mread_func mutex
database_setup = False # state of database setup
database_setup_in_progress = False # connection mutex
@ -213,7 +223,7 @@ class PluginObject:
self.plugin = plugin # name of the plugin ; df, cpu, memory ...
self.plugin_instance = "" # the instance name for the plugin
self.resource_name = "" # The top level name of the resource
self.instance_name = "" # The instanhce name
self.instance_name = "" # The instance name
# Instance specific learned static class members.
self.entity_id = "" # fm entity id host=<hostname>.<instance>
@ -225,12 +235,17 @@ class PluginObject:
self.value = float(0) # float value of reading
# Common static class members.
self.reason_warning = ""
self.reason_failure = ""
self.repair = ""
self.alarm_type = fm_constants.FM_ALARM_TYPE_7
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50
self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS
self.suppression = True
self.service_affecting = False
# default most reading types are usage
self.reading_type = READING_TYPE__PERCENT_USAGE
# Severity tracking lists.
# Maintains severity state between notifications.
# Each is a list of entity ids for severity asserted alarms.
@ -329,7 +344,11 @@ class PluginObject:
# filter out messages to ignore ; notifications that have no value
if "has not been updated for" in nObject.message:
collectd.debug("%s NOT UPDATED: %s" % (PLUGIN, self.entity_id))
collectd.info("%s %s %s (%s)" %
(PLUGIN,
self.entity_id,
nObject.message,
nObject.severity))
return "done"
# Get the value from the notification message.
@ -363,8 +382,8 @@ class PluginObject:
# validate the reading
try:
self.value = float(self.values[0])
# get the threshold if its there
if len(self.values) == 2:
# get the threshold if it's there.
if len(self.values) > 1:
self.threshold = float(self.values[1])
except ValueError as ex:
@ -390,6 +409,9 @@ class PluginObject:
logit = False
if self.count == 0 or LOG_STEP == 0:
logit = True
elif self.reading_type == "connections":
if self.value != last:
logit = True
elif self.value > last:
if (last + LOG_STEP) < self.value:
logit = True
@ -401,7 +423,18 @@ class PluginObject:
#
# Note: only usage type so far
if logit:
reading_type = "% usage"
resource = self.resource_name
# setup resource name for filesystem instance usage log
if self.plugin == PLUGIN__DF:
resource = self.instance
# setup resource name for vswitch process instance name
elif self.plugin == PLUGIN__VSWITCH_MEM:
resource += ' Processor '
resource += self.instance_name
if self.reading_type == READING_TYPE__PERCENT_USAGE:
tmp = str(self.value).split('.')
if len(tmp[0]) == 1:
pre = ': '
@ -411,8 +444,19 @@ class PluginObject:
(PLUGIN,
pre,
self.value,
reading_type,
self.instance_name))
self.reading_type,
resource))
elif self.reading_type == "connections" and \
self.instance_objects and \
self.value != self.last_value:
if self.instance_objects:
collectd.info("%s monitor: %2d %s - %s" %
(PLUGIN,
self.value,
self.reading_type,
resource))
self.last_value = float(self.value)
##########################################################################
@ -599,12 +643,139 @@ class PluginObject:
collectd.info("%s %s no failures" %
(PLUGIN, self.plugin))
##########################################################################
#
# Name : _get_instance_object
#
# Purpose : Safely get an object from the self instance object list
# indexed by eid.
#
##########################################################################
def _get_instance_object(self, eid):
"""
Safely get an object from the self instance object list indexed
by eid while locked.
:param eid:
:return: object or None
"""
try:
collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin))
PluginObject.lock.acquire()
obj = self.instance_objects[eid]
return obj
except:
collectd.error("%s failed to get instance from %s object list" %
(PLUGIN, self.plugin))
return None
finally:
collectd.debug("%s %s Get UnLock ..." % (PLUGIN, self.plugin))
PluginObject.lock.release()
##########################################################################
#
# Name : _add_instance_object
#
# Purpose : Safely add an object to the self instance object list
# indexed by eid while locked. If found locked, the instance
# add will be re-attempted on the next sample.
#
##########################################################################
def _add_instance_object(self, obj, eid):
"""
Update self instance_objects list while locked
:param obj: the object to add
:param eid: indexed by this eid
:return: nothing
"""
try:
collectd.debug("%s %s Add Lock ..." % (PLUGIN, self.plugin))
PluginObject.lock.acquire()
self.instance_objects[eid] = obj
except:
collectd.error("%s failed to add instance to %s object list" %
(PLUGIN, self.plugin))
finally:
collectd.debug("%s %s Add UnLock ..." % (PLUGIN, self.plugin))
PluginObject.lock.release()
##########################################################################
#
# Name : _copy_instance_object
#
# Purpose : Copy select members of self object to target object.
#
##########################################################################
def _copy_instance_object(self, object):
"""
Copy select members of self object to target object
"""
object.resource_name = self.resource_name
object.instance_name = self.instance_name
object.reading_type = self.reading_type
object.reason_warning = self.reason_warning
object.reason_failure = self.reason_failure
object.repair = self.repair
object.alarm_type = self.alarm_type
object.cause = self.cause
object.suppression = self.suppression
object.service_affecting = self.service_affecting
##########################################################################
#
# Name : _create_instance_object
#
# Purpose : Create a new instance object and tack it on the supplied base
# object's instance object dictionary.
#
##########################################################################
def _create_instance_object(self, instance):
try:
# create a new plugin object
inst_obj = PluginObject(self.id, self.plugin)
self._copy_instance_object(inst_obj)
# initialize the object with instance specific data
inst_obj.instance_name = instance
inst_obj.entity_id = _build_entity_id(self.plugin,
instance)
self._add_instance_object(inst_obj, inst_obj.entity_id)
collectd.debug("%s created %s instance (%s) object %s" %
(PLUGIN, inst_obj.resource_name,
inst_obj.entity_id, inst_obj))
collectd.debug("%s monitoring %s %s %s" %
(PLUGIN,
inst_obj.resource_name,
inst_obj.instance_name,
inst_obj.reading_type))
return inst_obj
except:
collectd.error("%s %s:%s inst object create failed" %
(PLUGIN, inst_obj.resource_name, instance))
return None
##########################################################################
#
# Name : _create_instance_objects
#
# Purpose : Create a list of instance objects for 'self' type plugin and
# add those objects to the parnet's instance_objects dictionary.
# add those objects to the parent's instance_objects dictionary.
#
# Note : This is currently only used for the DF (filesystem) plugin.
# All other instance creations/allocations are done on-demand.
#
##########################################################################
def _create_instance_objects(self):
@ -612,11 +783,7 @@ class PluginObject:
Create, initialize and add an instance object to this/self plugin
"""
# ADD_NEW_PLUGIN: for plugins that have instances you need to
# add support for creating those instances and adding
# those instances to the parent instance_objects list.
# Currently only the DF plugin has subordinate instance objects.
# Create the File System subordinate instance objects.
if self.id == ALARM_ID__DF:
# read the df.conf file and return/get a list of mount points
@ -651,6 +818,7 @@ class PluginObject:
# initialize the object with instance specific data
inst_obj.resource_name = self.resource_name
inst_obj.instance_name = mp
inst_obj.instance = mp
# build the plugin instance name from the mount point
if mp == '/':
inst_obj.plugin_instance = 'root'
@ -662,20 +830,29 @@ class PluginObject:
# add this subordinate object to the parent's
# instance object list
self.instance_objects[inst_obj.entity_id] = inst_obj
self._add_instance_object(inst_obj, inst_obj.entity_id)
collectd.info("%s monitoring %s usage" %
(PLUGIN, mp))
(PLUGIN, inst_obj.instance))
PluginObject.host = os.uname()[1]
# ADD_NEW_PLUGIN: add plugin to this table
# This instanciates the plugin objects
PLUGINS = {PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU),
# This instantiates the plugin objects
PLUGINS = {
PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU),
PLUGIN__MEM: PluginObject(ALARM_ID__MEM, PLUGIN__MEM),
PLUGIN__DF: PluginObject(ALARM_ID__DF, PLUGIN__DF),
PLUGIN__VSWITCH_CPU: PluginObject(ALARM_ID__VSWITCH_CPU,
PLUGIN__VSWITCH_CPU),
PLUGIN__VSWITCH_MEM: PluginObject(ALARM_ID__VSWITCH_MEM,
PLUGIN__VSWITCH_MEM),
PLUGIN__VSWITCH_PORT: PluginObject(ALARM_ID__VSWITCH_PORT,
PLUGIN__VSWITCH_PORT),
PLUGIN__VSWITCH_IFACE: PluginObject(ALARM_ID__VSWITCH_IFACE,
PLUGIN__VSWITCH_IFACE),
PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)}
@ -704,27 +881,43 @@ def _get_object(alarm_id, eid):
return base_obj
def is_uuid_like(val):
"""Returns validation of a value as a UUID.
For our purposes, a UUID is a canonical form string:
aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa
"""
try:
return str(uuid.UUID(val)) == val
except (TypeError, ValueError, AttributeError):
return False
def _build_entity_id(plugin, plugin_instance):
"""
Builds an entity id string based on the collectd notification object.
"""
inst_error = False
entity_id = 'host='
entity_id += PluginObject.host
if plugin == PLUGIN__DF:
if plugin == PLUGIN__VSWITCH_MEM:
# host=<hostname>.processor=<socket-id>
if plugin_instance:
entity_id += '.processor=' + plugin_instance
else:
inst_error = True
elif plugin == PLUGIN__VSWITCH_IFACE:
# host=<hostname>.interface=<if-uuid>
if plugin_instance:
entity_id += '.interface=' + plugin_instance
else:
inst_error = True
elif plugin == PLUGIN__VSWITCH_PORT:
# host=<hostname>.port=<port-uuid>
if plugin_instance:
entity_id += '.port=' + plugin_instance
else:
inst_error = True
elif plugin == PLUGIN__DF:
# host=<hostname>.filesystem=<mountpoint>
if plugin_instance:
instance = plugin_instance
@ -740,7 +933,18 @@ def _build_entity_id(plugin, plugin_instance):
instance = instance.replace('-', '/')
entity_id += instance
# collectd.info("%s entity_id : %s" % (PLUGIN, entity_id))
# Will be uncommented when the numa memory monitor is added
# to the platform memory plugin.
#
#elif plugin == PLUGIN__MEM:
# if plugin_instance is not 'platform':
# # host=controller-0.numa=node0
# entity_id += '.numa='
# entity_id += plugin_instance
if inst_error is True:
collectd.error("%s eid build failed ; missing instance" % plugin)
return None
return entity_id
@ -773,37 +977,77 @@ def _get_df_mountpoints():
return(mountpoints)
def _print_obj(obj):
"""
Print a single object
"""
base_object = False
for plugin in PLUGIN_NAME_LIST:
if PLUGINS[plugin] == obj:
base_object = True
break
num = len(obj.instance_objects)
if num > 0 or base_object is True:
prefix = "PLUGIN "
if num:
prefix += str(num)
else:
prefix += " "
else:
prefix = "INSTANCE"
if obj.plugin_instance:
resource = obj.plugin + ":" + obj.plugin_instance
else:
resource = obj.plugin
collectd.info("%s %s res: %s name: %s\n" %
(PLUGIN, prefix, resource, obj.resource_name))
collectd.info("%s eid : %s\n" % (PLUGIN, obj.entity_id))
collectd.info("%s inst: %s name: %s\n" %
(PLUGIN, obj.instance, obj.instance_name))
collectd.info("%s value:%2.1f thld:%2.1f cause:%s (%d) type:%s" %
(PLUGIN,
obj.value,
obj.threshold,
obj.cause,
obj.count,
obj.reading_type))
collectd.info("%s warn:%s fail:%s" %
(PLUGIN, obj.warnings, obj.failures))
collectd.info("%s repair:t: %s" %
(PLUGIN, obj.repair))
if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50:
collectd.info("%s reason:w: %s\n"
"%s reason:f: %s\n" %
(PLUGIN, obj.reason_warning,
PLUGIN, obj.reason_failure))
# collectd.info(" ")
def _print_state(obj=None):
"""
Print the current object state
"""
try:
objs = []
if obj is None:
objs.append(_get_base_object(ALARM_ID__CPU))
objs.append(_get_base_object(ALARM_ID__MEM))
objs.append(_get_base_object(ALARM_ID__DF))
for plugin in PLUGIN_NAME_LIST:
objs.append(PLUGINS[plugin])
else:
objs.append(obj)
collectd.debug("%s _print_state Lock ..." % PLUGIN)
PluginObject.lock.acquire()
for o in objs:
collectd.info("%s PLUGIN %2d [%6s:%2.2f:%s] [w:%s f:%s] %d" %
(PLUGIN,
len(o.instance_objects),
o.plugin,
o.value,
o.entity_id,
o.warnings,
o.failures,
o.count))
_print_obj(o)
if len(o.instance_objects):
for inst_obj in o.instance_objects:
collectd.info("%s INSTANCE [%6s:%2.2f:%s] [w:%s f:%s] %d" %
(PLUGIN,
inst_obj.plugin,
inst_obj.value,
inst_obj.entity_id,
inst_obj.warnings,
inst_obj.failures,
inst_obj.count))
_print_obj(o.instance_objects[inst_obj])
finally:
collectd.debug("%s _print_state UnLock ..." % PLUGIN)
PluginObject.lock.release()
def _database_setup(database):
@ -843,14 +1087,14 @@ def _database_setup(database):
############################################################
PluginObject.dbObj.create_retention_policy(
'collectd samples', '4w', 1, database, True)
DATABASE_NAME, '4w', 1, database, True)
except Exception as ex:
if str(ex) == 'database already exists':
try:
collectd.info("%s influxdb:collectd %s" %
(PLUGIN, str(ex)))
PluginObject.dbObj.create_retention_policy(
'collectd samples', '4w', 1, database, True)
DATABASE_NAME, '4w', 1, database, True)
except Exception as ex:
if str(ex) == 'retention policy already exists':
collectd.info("%s influxdb:collectd %s" %
@ -864,15 +1108,21 @@ def _database_setup(database):
error_str = "failed to connect to influxdb:" + database
if not error_str:
found = False
retention = \
PluginObject.dbObj.get_list_retention_policies(database)
collectd.info("%s influxdb:%s samples retention policy: %s" %
(PLUGIN, database, retention))
for r in range(len(retention)):
if retention[r]["name"] == DATABASE_NAME:
collectd.info("%s influxdb:%s samples retention "
"policy: %s" %
(PLUGIN, database, retention[r]))
found = True
if found is True:
collectd.info("%s influxdb:%s is setup" % (PLUGIN, database))
PluginObject.database_setup = True
else:
collectd.error("%s influxdb:%s setup %s" %
(PLUGIN, database, error_str))
collectd.error("%s influxdb:%s retention policy NOT setup" %
(PLUGIN, database))
def _clear_alarm_for_missing_filesystems():
@ -892,10 +1142,11 @@ def _clear_alarm_for_missing_filesystems():
if len(alarm_list):
for eid in alarm_list:
# search for any of them that might be alarmed.
obj = df_base_obj.instance_objects[eid]
obj = df_base_obj._get_instance_object(eid)
# only care about df (file system plugins)
if obj.plugin == PLUGIN__DF and \
if obj is not None and \
obj.plugin == PLUGIN__DF and \
obj.entity_id == eid and \
obj.plugin_instance != 'root':
@ -912,7 +1163,6 @@ def _clear_alarm_for_missing_filesystems():
else:
collectd.debug("%s maintaining alarm for %s" %
(PLUGIN, path))
return 0
# Collectd calls this function on startup.
@ -921,6 +1171,8 @@ def _clear_alarm_for_missing_filesystems():
def init_func():
""" Collectd FM Notifier Initialization Function """
PluginObject.lock = Lock()
PluginObject.host = os.uname()[1]
collectd.info("%s %s:%s init function" %
(PLUGIN, tsc.nodetype, PluginObject.host))
@ -933,15 +1185,19 @@ def init_func():
obj.repair += "contact next level of support."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
###########################################################################
# Constant Memory Plugin Object settings
obj = PLUGINS[PLUGIN__MEM]
obj.resource_name = "Memory"
obj.resource_name = "Platform Memory"
obj.instance_name = PLUGIN__MEM
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support; "
obj.repair += "may require additional memory on Host."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
###########################################################################
# Constant FileSystem Plugin Object settings
obj = PLUGINS[PLUGIN__DF]
obj.resource_name = "File System"
@ -954,6 +1210,63 @@ def init_func():
# Create one DF instance object per mount point
obj._create_instance_objects()
# ntp query is for controllers only
if tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:
#######################################################################
# Constant vSwitch CPU Usage Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_CPU]
obj.resource_name = "vSwitch CPU"
obj.instance_name = PLUGIN__VSWITCH_CPU
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
#######################################################################
# Constant vSwitch Memory Usage Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_MEM]
obj.resource_name = "vSwitch Memory"
obj.instance_name = PLUGIN__VSWITCH_MEM
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
#######################################################################
# Constant vSwitch Port State Monitor Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_PORT]
obj.resource_name = "vSwitch Port"
obj.instance_name = PLUGIN__VSWITCH_PORT
obj.reading_type = "state"
obj.reason_failure = "'Data' Port failed."
obj.reason_warning = "'Data' Port failed."
obj.repair = "Check cabling and far-end port configuration and "
obj.repair += "status on adjacent equipment."
obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT
obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL
obj.service_affecting = True
collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name))
#######################################################################
# Constant vSwitch Interface State Monitor Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_IFACE]
obj.resource_name = "vSwitch Interface"
obj.instance_name = PLUGIN__VSWITCH_IFACE
obj.reading_type = "state"
obj.reason_failure = "'Data' Interface failed."
obj.reason_warning = "'Data' Interface degraded."
obj.repair = "Check cabling and far-end port configuration and "
obj.repair += "status on adjacent equipment."
obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT
obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL
obj.service_affecting = True
collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name))
###########################################################################
obj = PLUGINS[PLUGIN__EXAMPLE]
obj.resource_name = "Example"
obj.instance_name = PLUGIN__EXAMPLE
@ -981,6 +1294,7 @@ def init_func():
alarms = api.get_faults_by_id(alarm_id)
if alarms:
for alarm in alarms:
want_alarm_clear = False
eid = alarm.entity_instance_id
# ignore alarms not for this host
if PluginObject.host not in eid:
@ -988,28 +1302,31 @@ def init_func():
base_obj = _get_base_object(alarm_id)
if base_obj is None:
# Handle unrecognized alarm by clearing it ;
# should never happen since we are iterating
# over an internal alarm_id list.
# might be a plugin instance - clear it
want_alarm_clear = True
collectd.info('%s found %s %s alarm [%s]' %
(PLUGIN,
alarm.severity,
alarm_id,
eid))
if want_alarm_clear is True:
if api.clear_fault(alarm_id, eid) is False:
collectd.error("%s %s:%s not found ; clear failed" %
collectd.error("%s %s:%s clear failed" %
(PLUGIN,
alarm_id,
eid))
else:
collectd.error("%s %s:%s not found ; cleared" %
collectd.info("%s clear %s %s alarm %s" %
(PLUGIN,
alarm.severity,
alarm_id,
eid))
continue
collectd.info('%s found %s alarm with %s severity [%s:%s:%s]' %
(PLUGIN,
base_obj.id,
alarm.severity,
base_obj.plugin,
alarm_id,
eid))
if alarm.severity == "critical":
sev = "failure"
elif alarm.severity == "major":
@ -1019,6 +1336,7 @@ def init_func():
continue
# Load the alarm severity by doing a plugin/instance lookup.
if base_obj is not None:
base_obj._manage_alarm(eid, sev)
@ -1067,23 +1385,64 @@ def notifier_func(nObject):
base_obj = obj = PLUGINS[nObject.plugin]
# if this notification is for a plugin instance then get that
# instance's object instead. if that object does not yet exist
# then create it
# instance's object instead.
# If that object does not yet exist then create it.
eid = ''
if nObject.plugin_instance:
# DF instances are statically allocated
if nObject.plugin == PLUGIN__DF:
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
# get this instance's object
obj = base_obj._get_instance_object(eid)
if obj is None:
# path should never be hit since all DF instances
# are statically allocated.
return 0
elif nObject.plugin_instance:
need_instance_object_create = False
# Build the entity_id from the parent object if needed
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
try:
# Need lock when reading/writing any obj.instance_objects list
collectd.debug("%s %s lock" % (PLUGIN, nObject.plugin))
PluginObject.lock.acquire()
#collectd.info("%s Object Search eid: %s" %
# (nObject.plugin, eid))
#for o in base_obj.instance_objects:
# collectd.error("%s %s inst object dict item %s : %s" %
# (PLUGIN, nObject.plugin, o,
# base_obj.instance_objects[o]))
# an exception is raised if this object is not yet in the list.
# the exception handler below will create and add this object so
# the success path is taken the next time around.
inst_obj = base_obj.instance_objects[eid]
if inst_obj is None:
collectd.error("%s %s:%s instance object is None" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
return 0
collectd.debug("%s %s instance %s already exists %s" %
(PLUGIN, nObject.plugin, eid, inst_obj))
# _print_state(inst_obj)
except:
# o.k., not in the list yet, let's create one
collectd.error("%s %s:%s instance object not found" %
need_instance_object_create = True
finally:
collectd.debug("%s %s unlock" % (PLUGIN, nObject.plugin))
PluginObject.lock.release()
if need_instance_object_create is True:
base_obj._create_instance_object(nObject.plugin_instance)
inst_obj = base_obj._get_instance_object(eid)
if inst_obj:
collectd.debug("%s %s:%s inst object created" %
(PLUGIN,
inst_obj.plugin,
inst_obj.instance))
else:
collectd.error("%s %s:%s inst object create failed" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
@ -1096,13 +1455,6 @@ def notifier_func(nObject):
# Build the entity_id from the parent object if needed
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
# TODO: Needed ?
if not len(obj.instance):
obj.instance = nObject.plugin
if nObject.plugin_instance:
obj.instance += '_' + nObject.plugin_instance
# TODO: Needed ?
# update the object with the eid if its not already set.
if not len(obj.entity_id):
obj.entity_id = eid
@ -1112,6 +1464,7 @@ def notifier_func(nObject):
(PLUGIN, nObject.plugin, nObject.plugin_instance))
return 0
# if obj.warnings or obj.failures:
# _print_state(obj)
# If want_state_audit is True then run the audit.
@ -1143,21 +1496,32 @@ def notifier_func(nObject):
return 0
if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
if api.clear_fault(base_obj.id, obj.entity_id) is False:
if api.clear_fault(obj.id, obj.entity_id) is False:
collectd.error("%s %s:%s clear_fault failed" %
(PLUGIN, base_obj.id, obj.entity_id))
return 0
else:
# manage addition of the failure reason text
if obj.cause == fm_constants.ALARM_PROBABLE_CAUSE_50:
# if this is a threshold alarm then build the reason text that
# includes the threshold and the reading that caused the assertion.
reason = obj.resource_name
reason += " threshold exceeded"
if obj.threshold:
reason += "; {:2.0f}".format(obj.threshold) + "%"
# reason += "; {:2.2f}".format(obj.threshold) + "%"
reason += "; threshold {:2.0f} ".format(obj.threshold) + "%, "
if obj.value:
reason += ", actual " + "{:2.0f}".format(obj.value) + "%"
reason += "actual {:2.0f}".format(obj.value) + "%"
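                # e.g. with resource_name "Platform Memory", threshold=80 and
                # value=85, this yields (illustration only):
                # "Platform Memory threshold exceeded; threshold 80 %, actual 85%"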
elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
reason = obj.reason_failure
else:
reason = obj.reason_warning
# build the alarm object
fault = fm_api.Fault(
alarm_id=base_obj.id,
alarm_id=obj.id,
alarm_state=_alarm_state,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=obj.entity_id,
@ -1170,7 +1534,7 @@ def notifier_func(nObject):
suppression=base_obj.suppression)
alarm_uuid = api.set_fault(fault)
if is_uuid_like(alarm_uuid) is False:
if pc.is_uuid_like(alarm_uuid) is False:
collectd.error("%s %s:%s set_fault failed:%s" %
(PLUGIN, base_obj.id, obj.entity_id, alarm_uuid))
return 0
@ -1191,5 +1555,8 @@ def notifier_func(nObject):
# Debug only: comment out for production code.
# obj._state_audit("change")
return 0
collectd.register_init(init_func)
collectd.register_notification(notifier_func)

View File

@ -1,11 +1,11 @@
<Plugin "threshold">
<Plugin "interface">
<Type "absolute">
Instance "state"
<Type "percent">
Instance "used"
Persist true
PersistOK true
WarningMin 50
FailureMin 0
WarningMin 51
FailureMin 1
# Hits 2
Invert false
</Type>

File diff suppressed because it is too large

View File

@ -12,8 +12,8 @@
Instance "used"
Persist true
PersistOK true
WarningMax 80.00
FailureMax 90.00
WarningMax 79.00
FailureMax 89.00
Hits 2
Invert false
</Type>

View File

@ -39,6 +39,7 @@
import os
import socket
import collectd
import tsconfig.tsconfig as tsc
# This plugin name
PLUGIN = 'degrade notifier'
@ -65,6 +66,13 @@ ONE_EVERY = 10
PLUGIN__DF = 'df'
PLUGIN__MEM = 'memory'
PLUGIN__CPU = 'cpu'
PLUGIN__VSWITCH_MEM = 'vswitch_mem'
PLUGIN__VSWITCH_CPU = 'vswitch_cpu'
PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"
PLUGIN_INTERFACE = 'interface'
PLUGIN__EXAMPLE = 'example'
@ -89,9 +97,13 @@ class collectdMtceNotifierObject:
self.degrade_list__failure = [PLUGIN__DF,
PLUGIN__MEM,
PLUGIN__CPU,
PLUGIN__VSWITCH_MEM,
PLUGIN__VSWITCH_CPU,
PLUGIN__VSWITCH_PORT,
PLUGIN__VSWITCH_IFACE,
PLUGIN_INTERFACE,
PLUGIN__EXAMPLE]
self.degrade_list__warning = []
self.degrade_list__warning = [PLUGIN_INTERFACE]
# the running list of resources that require degrade.
# a degrade clear message is sent whenever this list is empty.
@ -172,7 +184,7 @@ def config_func(config):
Configure the maintenance degrade notifier plugin.
"""
collectd.info('%s config function' % PLUGIN)
collectd.debug('%s config function' % PLUGIN)
for node in config.children:
key = node.key.lower()
val = node.values[0]
@ -194,6 +206,10 @@ def init_func():
Collectd Mtce Notifier Initialization Function
"""
obj.host = os.uname()[1]
collectd.info("%s %s:%s sending to mtce port %d" %
(PLUGIN, tsc.nodetype, obj.host, obj.port))
collectd.debug("%s init function" % PLUGIN)
@ -241,7 +257,7 @@ def notifier_func(nObject):
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.debug("%s %s added to degrade list" %
collectd.info("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:
@ -264,7 +280,7 @@ def notifier_func(nObject):
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.debug("%s %s added to degrade list" %
collectd.info("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:

View File

@ -0,0 +1,255 @@
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
############################################################################
#
# This file contains common collectd plugin constructs and utilities
#
############################################################################
import collectd
import json
import uuid
import httplib2
import socket
import os
from fm_api import constants as fm_constants
import tsconfig.tsconfig as tsc
# http request constants
PLUGIN_TIMEOUT = 10
PLUGIN_HTTP_HEADERS = {'Accept': 'application/json', 'Connection': 'close'}
MIN_AUDITS_B4_FIRST_QUERY = 2
class PluginObject(object):
def __init__(self, plugin, url):
# static variables set in init_func
self.plugin = plugin # the name of this plugin
self.hostname = '' # the name of this host
self.port = 0 # the port number for this plugin
# dynamic gate variables
self.config_complete = False # set to True once config is complete
self.config_done = False # set true if config_func completed ok
self.init_done = False # set true if init_func completed ok
# dynamic variables set in read_func
self.usage = float(0) # last usage value recorded as float
self.audits = 0 # number of audit since init
# http and json specific variables
self.url = url # target url
self.jresp = None # used to store the json response
self.resp = ''
# Log controls
self.config_logged = False # used to log the plugin config only once
self.error_logged = False # used to prevent log flooding
self.log_throttle_count = 0 # used to count throttle logs
self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold
collectd.debug("%s Common PluginObject constructor [%s]" %
(plugin, url))
###########################################################################
#
# Name : init_ready
#
# Description: Test for init ready condition
#
# Parameters : plugin name
#
# Returns : False if initial config complete is not done
# True if initial config complete is done
#
###########################################################################
def init_ready(self):
""" Test for system init ready state """
if os.path.exists(tsc.INITIAL_CONFIG_COMPLETE_FLAG) is False:
self.log_throttle_count += 1
if self.log_throttle_count > self.INIT_LOG_THROTTLE:
collectd.info("%s initialization needs retry" % self.plugin)
self.log_throttle_count = 0
return False
else:
self.log_throttle_count = 0
return True
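    # Usage sketch (illustration only, not part of this commit): a plugin's
    # read or config callback can gate its startup on init_ready(), e.g.
    #
    #   if obj.init_done is False:
    #       if obj.init_ready() is False:
    #           return 0
    #       obj.init_done = True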
###########################################################################
#
# Name : gethostname
#
# Description: load the hostname
#
# Parameters : plugin name
#
# Returns : Success - hostname
# Failure - None
#
# Updates : obj.hostname
#
###########################################################################
def gethostname(self):
""" Fetch the hostname """
# get current hostname
try:
hostname = socket.gethostname()
if hostname:
return hostname
except:
collectd.error("%s failed to get hostname" % self.plugin)
return None
###########################################################################
#
# Name : check_for_fit
#
# Description: load FIT data if it is present
#
# Fit Format : unit data -> 0 89
# - instance 0 value 89
#
# Parameters : plugin name
# object to update with fit
# name in fit file
# unit
#
# Returns : Did a failure occur ?
# False = no
# True = yes
#
# Updates : self.usage with FIT value if FIT conditions are present
# and apply
#
###########################################################################
def check_for_fit(self, name, unit):
""" Load FIT data into usage if it exists """
fit_file = '/var/run/fit/' + name + '_data'
if os.path.exists(fit_file):
valid = False
with open(fit_file, 'r') as infile:
for line in infile:
try:
inst, val = line.split(' ')
if int(unit) == int(inst):
self.usage = float(val)
valid = True
except:
try:
val = float(line)
self.usage = float(val)
valid = True
except:
collectd.error("%s bad FIT data; ignoring" %
self.plugin)
if valid is True:
collectd.info("%s %.2f usage (unit %d) (FIT)" %
              (self.plugin, self.usage, unit))
return False
return True
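    # Example (assumed values, illustration only): if /var/run/fit/memory_data
    # contains the single line "0 89", then check_for_fit('memory', 0) sets
    # self.usage = 89.0 and returns False; with no FIT file present it
    # returns True and self.usage is left untouched.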
###########################################################################
#
# Name : make_http_request
#
# Description: Issue an http request to the specified URL.
# Load and return the response,
# handling execution errors.
#
# Parameters : self as current context.
#
# Optional:
#
# url - override the default self url with http address to
# issue the get request to.
# to - timeout override
# hdrs - override use of the default header list
#
# Updates : self.jresp with the json string response from the request.
#
# Returns : Error indication (True/False)
# True on error
# False on success
#
###########################################################################
def make_http_request(self, url=None, to=None, hdrs=None):
""" Make a blocking HTTP Request and return result """
try:
# handle timeout override
if to is None:
to = PLUGIN_TIMEOUT
# handle url override
if url is None:
url = self.url
# handle header override
if hdrs is None:
hdrs = PLUGIN_HTTP_HEADERS
http = httplib2.Http(timeout=to)
resp = http.request(url, headers=hdrs)
except Exception as ex:
collectd.info("%s http request failure (%s)" %
(self.plugin, str(ex)))
return True
try:
collectd.debug("%s Resp: %s" %
(self.plugin, resp[1]))
self.resp = resp[1]
self.jresp = json.loads(resp[1])
except Exception as ex:
collectd.info("%s http request parse failure (%s) (%s)" %
(self.plugin, str(ex), resp))
return True
return False
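    # Usage sketch (hypothetical URL, illustration only):
    #
    #   obj = PluginObject('example plugin', 'http://localhost:2122/v1/stats')
    #   if obj.make_http_request() is False:
    #       collectd.info("%s response: %s" % (obj.plugin, obj.jresp))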
def is_uuid_like(val):
"""Returns validation of a value as a UUID.
For our purposes, a UUID is a canonical form string:
aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa
"""
try:
return str(uuid.UUID(val)) == val
except (TypeError, ValueError, AttributeError):
return False
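# For example, is_uuid_like(str(uuid.uuid4())) is True, while
# is_uuid_like('not-a-uuid') and is_uuid_like(None) are False.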
def get_severity_str(severity):
""" get string that represents the specified severity """
if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR:
return "clear"
elif severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
return "critical"
elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR:
return "major"
elif severity == fm_constants.FM_ALARM_SEVERITY_MINOR:
return "minor"
else:
return "unknown"

View File

@ -10,6 +10,10 @@ LoadPlugin python
Path "/proc/meminfo"
</Module>
Import "ntpq"
Import "interface"
<Module "interface">
Port 2122
</Module>
LogTraces = true
Encoding "utf-8"
</Plugin>

View File

@ -46,6 +46,8 @@ source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
project = u'stx-integ'
# Release notes are version independent, no need to set version and release
release = ''
version = ''

View File

@ -1,2 +1,2 @@
COPY_LIST="$PKG_BASE/files/* $CGCS_BASE/downloads/swtpm-0.1.0-253eac5.tar.gz"
TIS_PATCH_VER=0
TIS_PATCH_VER=1

View File

@ -2,12 +2,12 @@
%define name swtpm
%define version 0.1.0
#WRS
#STX
#%define release 1
%define release 2%{?_tis_dist}.%{tis_patch_ver}
# Valid crypto subsystems are 'freebl' and 'openssl'
#WRS
#STX
#%if "%{crypto_subsystem}" == ""
%define crypto_subsystem openssl
#%endif
@ -15,7 +15,7 @@
Summary: TPM Emulator
Name: %{name}
Version: %{version}
#WRS
#STX
#Release: %{release}.dev2%{?dist}
Release: %{release}
License: BSD
@ -23,9 +23,8 @@ Group: Applications/Emulators
Source: %{name}-%{version}-253eac5.tar.gz
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root
#WRS
Source1: qemu
Source2: setup_vtpm
#STX
Source1: setup_vtpm
# due to gnutls backlevel API:
@ -49,11 +48,11 @@ BuildRequires: libtasn1-tools
BuildRequires: kernel-modules-extra
%endif
#WRS
#STX
BuildRequires: openssl-devel
Requires: openssl
#WRS
#STX
Requires: seabios-bin >= 1.10.2-3
Requires: fuse expect libtpms >= 0.6.0
@ -94,7 +93,7 @@ Summary: Tools for the TPM emulator
License: BSD
Group: Applications/Emulators
Requires: swtpm fuse
#WRS
#STX
#Requires: trousers >= 0.3.9 tpm-tools >= 1.3.8-6 expect bash net-tools gnutls-utils
Requires: trousers >= 0.3.9 expect bash net-tools gnutls-utils
@ -106,9 +105,8 @@ Tools for the TPM emulator from the swtpm package
%attr( 755, root, root) %{_bindir}/swtpm
%{_mandir}/man8/swtpm.8*
#WRS
#STX
/etc/libvirt/setup_vtpm
/etc/libvirt/hooks/qemu
%files cuse
@ -158,7 +156,7 @@ Tools for the TPM emulator from the swtpm package
%build
#WRS
#STX
./bootstrap.sh
%configure \
--prefix=/usr \
@ -180,13 +178,12 @@ make %{?_smp_mflags} check
make %{?_smp_mflags} install DESTDIR=${RPM_BUILD_ROOT}
rm -f ${RPM_BUILD_ROOT}%{_libdir}/*.a ${RPM_BUILD_ROOT}%{_libdir}/*.la
#WRS
mkdir -p $RPM_BUILD_ROOT/etc/libvirt/hooks
#STX
mkdir -p $RPM_BUILD_ROOT/etc/libvirt
install -m 0500 %{SOURCE1} $RPM_BUILD_ROOT/etc/libvirt/hooks/qemu
install -m 0500 %{SOURCE2} $RPM_BUILD_ROOT/etc/libvirt/setup_vtpm
install -m 0500 %{SOURCE1} $RPM_BUILD_ROOT/etc/libvirt/setup_vtpm
# WRS: Don't set (or remove on uninstall): SELINUX Policy and contexts
# STX: Don't set (or remove on uninstall): SELINUX Policy and contexts
#%post cuse
#if [ -n "$(type -p semodule)" ]; then
# for pp in /usr/share/swtpm/*.pp ; do

View File

@ -1,4 +1,4 @@
SRC_DIR="platform-util"
COPY_LIST_TO_TAR="scripts"
TIS_PATCH_VER=15
TIS_PATCH_VER=16

View File

@ -131,6 +131,9 @@ do
"mtcalarmd")
pmon_managed_processes=(${pmon_managed_processes[@]} "mtcalarmd:0")
;;
"lmond")
pmon_managed_processes=(${pmon_managed_processes[@]} "lmond:0")
;;
*)
loginfo "Unknown process:${process}"

View File

@ -1,6 +1,7 @@
SRC_DIR="$CGCS_BASE/git/libvirt"
COPY_LIST="\
libvirt/* \
libvirt/hooks/* \
$CGCS_BASE/downloads/gnulib-ffc927e.tar.gz \
$CGCS_BASE/downloads/keycodemapdb-16e5b07.tar.gz"
TIS_BASE_SRCREV=ab58260efaa712650c63bb1917122f270070fa4b

View File

@ -16,7 +16,7 @@
# Always run autoreconf
%{!?enable_autotools:%global enable_autotools 1}
# WRS: Custom build config. Based on the R2/bitbake configure line.
# STX: Custom build config. Based on the R2/bitbake configure line.
%define _without_esx 1
%define _without_hyperv 1
%define _without_libxl 1
@ -258,13 +258,14 @@ URL: https://libvirt.org/
Source0: http://libvirt.org/sources/%{?mainturl}libvirt-%{version}.tar.gz
#Source1: symlinks
# WRS
# STX
Source2: libvirt.logrotate
Source3: libvirt.lxc
Source4: libvirt.qemu
Source5: libvirt.uml
Source6: gnulib-ffc927e.tar.gz
Source7: keycodemapdb-16e5b07.tar.gz
Source8: qemu
Requires: libvirt-daemon = %{version}-%{release}
Requires: libvirt-daemon-config-network = %{version}-%{release}
@ -461,9 +462,9 @@ BuildRequires: wireshark-devel >= 1.12.1
BuildRequires: libssh-devel >= 0.7.0
%endif
# WRS: For generating configure
# STX: For generating configure
BuildRequires: gnulib
# WRS: Needed by bootstrap
# STX: Needed by bootstrap
BuildRequires: perl-XML-XPath
Provides: bundled(gnulib)
@ -1304,7 +1305,7 @@ rm -rf .git
# place macros above and build commands below this comment
# WRS: Generate configure script. Default is to do a "git clone" of gnulib.
# STX: Generate configure script. Default is to do a "git clone" of gnulib.
# Use the tar ball gnulib tarball instead.
tar zxf %{SOURCE6}
./bootstrap --no-git --gnulib-srcdir=gnulib-ffc927e --copy
@ -1379,7 +1380,7 @@ rm -f po/stamp-po
--without-dtrace \
%{arg_init_script}
#WRS: Avoid doing a 'config.status --recheck' (./configure executed twice).
#STX: Avoid doing a 'config.status --recheck' (./configure executed twice).
touch -r config.status configure
make %{?_smp_mflags}
@ -1470,7 +1471,7 @@ rm -rf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/libvirtd.uml
# Copied into libvirt-docs subpackage eventually
mv $RPM_BUILD_ROOT%{_datadir}/doc/libvirt-%{version} libvirt-docs
# WRS: Disable dtrace
# STX: Disable dtrace
# %ifarch %{power64} s390x x86_64 ia64 alpha sparc64
# mv $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/libvirt_probes.stp \
# $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/libvirt_probes-64.stp
@ -1478,7 +1479,7 @@ mv $RPM_BUILD_ROOT%{_datadir}/doc/libvirt-%{version} libvirt-docs
# $RPM_BUILD_ROOT%{_datadir}/systemtap/tapset/libvirt_qemu_probes-64.stp
# %endif
# WRS: Begin custom install
# STX: Begin custom install
## Enable syslog for libvirtd ( /var/log/libvirtd.log )
echo "log_outputs=\"3:syslog:libvirtd\"" >> %{buildroot}/etc/libvirt/libvirtd.conf
@ -1493,12 +1494,15 @@ install -p -D -m 644 %{SOURCE2} %{buildroot}/etc/logrotate.d/libvirtd
install -p -D -m 644 %{SOURCE3} %{buildroot}/etc/logrotate.d/libvirtd.lxc
install -p -D -m 644 %{SOURCE4} %{buildroot}/etc/logrotate.d/libvirtd.qemu
install -p -D -m 644 %{SOURCE5} %{buildroot}/etc/logrotate.d/libvirtd.uml
# WRS: End custom install
## Install hooks
mkdir -p $RPM_BUILD_ROOT/etc/libvirt/hooks
install -m 0500 %{SOURCE8} $RPM_BUILD_ROOT/etc/libvirt/hooks/qemu
# STX: End custom install
%clean
rm -fr %{buildroot}
# WRS: We are not maintaining the unit tests.
# STX: We are not maintaining the unit tests.
# %check
# cd tests
# # These tests don't current work in a mock build root
@ -1631,7 +1635,7 @@ if [ $1 -ge 1 ] ; then
fi
%post daemon-config-network
# WRS: The 'with_network' flag doesn't work properly. There are some packaging
# STX: The 'with_network' flag doesn't work properly. There are some packaging
# errors when using it. Disable default.xml manually ...
# We don't want 'virbr0' and 'virbr0-nic' interfaces created.
@ -1777,11 +1781,11 @@ exit 0
%files
# WRS: Customization
# STX: Customization
%dir /data/images/
%files docs
# TODO(WRS): NEWS is not present in git source repo.
# TODO(STX): NEWS is not present in git source repo.
%doc AUTHORS ChangeLog.gz README
%doc libvirt-docs/*
@ -1874,8 +1878,9 @@ exit 0
%doc examples/polkit/*.rules
# WRS: Customization
# STX: Customization
/etc/logrotate.d/*
/etc/libvirt/hooks/qemu
%files daemon-config-network
%dir %{_datadir}/libvirt/networks/
@ -2061,7 +2066,7 @@ exit 0
%{_bindir}/virt-pki-validate
%{_bindir}/virt-host-validate
# WRS: Disable dtrace
# STX: Disable dtrace
# %{_datadir}/systemtap/tapset/libvirt_probes*.stp
# %{_datadir}/systemtap/tapset/libvirt_qemu_probes*.stp
# %{_datadir}/systemtap/tapset/libvirt_functions.stp

View File

@ -34,6 +34,51 @@ OPERATION=$*
logger -p info -t $0 "hook qemu file guest $GUEST_NAME with operation $OPERATION"
# CPU Low latency setup:
#
# A cpu is set to low latency when:
# 1) host is set to subfunction=lowlatency in platform.conf and
# 2) domain has dedicated pinning
#
# example of <cputune> section when domain has dedicated pinning:
# <cputune>
# <vcpupin vcpu='0' cpuset='5'/>
# <vcpupin vcpu='1' cpuset='6'/>
# <vcpupin vcpu='2' cpuset='7'/>
# <emulatorpin cpuset='5'/>
# </cputune>
#
# example of <cputune> section when domain has shared pinning:
# <cputune>
# <shares>4096</shares>
# <vcpupin vcpu='0' cpuset='5-21'/>
# <vcpupin vcpu='1' cpuset='5-21'/>
# <vcpupin vcpu='2' cpuset='5-21'/>
# <vcpupin vcpu='3' cpuset='5-21'/>
# <emulatorpin cpuset='5-21'/>
# </cputune>
if [ "${OPERATION}" == "prepare begin -" ] || [ "${OPERATION}" == "stopped end -" ]; then
# verify this host is set as lowlatency
lowlat=$(cat /etc/platform/platform.conf 2>/dev/null | grep -E 'subfunction.*lowlatency')
if [ -n "${lowlat}" ]; then
# grab the <cputune> settings and remove single quotes
CPUTUNE=$(echo ${XML_DATA} | grep -oP '(?<=<cputune).*?(?=</cputune>)' | sed "s/'//g")
# grab all cpuset pinned to a unique CPU. Treat them as dedicated
CPUSET=($(echo ${CPUTUNE} | grep -oP '(?<=cpuset=)[^/]+(?=.+emulator)' | grep -vP '[^0-9]'))
if [ ${#CPUSET[@]} -ne 0 ]; then
# convert to a comma separated list
CPUS=$(IFS=, ; echo "${CPUSET[*]}")
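                # e.g. the dedicated-pinning example above yields
                # CPUSET=(5 6 7) and CPUS="5,6,7" (illustration only)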
if [ "${OPERATION}" == "prepare begin -" ]; then
/usr/bin/set-cpu-wakeup-latency.sh "low" "${CPUS}"
else
/usr/bin/set-cpu-wakeup-latency.sh "high" "${CPUS}"
fi
fi
fi
fi
VTPM_OPER=""
if [ "$OPERATION" == "prepare begin -" ]; then