Merge remote-tracking branch 'starlingx/master' into HEAD

Change-Id: Ie11b0475fd0eae5427d303d0c6fadf5f0a1d11f9
Signed-off-by: Scott Little <scott.little@windriver.com>
Commit 52a51cbb15 by Scott Little, 2019-02-06 11:34:44 -05:00
7 changed files with 648 additions and 168 deletions

View File

@@ -1,6 +1,6 @@
SRC_DIR="$CGCS_BASE/git/ceph"
COPY_LIST="files/*"
COPY_LIST="files/* $DISTRO/patches/*"
TIS_BASE_SRCREV=3f07f7ff1a5c7bfa8d0de12c966594d5fb7cf4ec
TIS_PATCH_VER=GITREVCOUNT
TIS_PATCH_VER=GITREVCOUNT+1
BUILD_IS_BIG=40
BUILD_IS_SLOW=26

View File

@@ -241,6 +241,10 @@ Source9: ceph-rest-api.service
Source10: ceph-radosgw.service
Source11: stx_git_version
Source12: ceph-preshutdown.sh
Source13: starlingx-docker-override.conf
Patch0001: 0001-Add-hooks-for-orderly-shutdown-on-controller.patch
%if 0%{?suse_version}
%if 0%{?is_opensuse}
@@ -797,6 +801,7 @@ python-cephfs instead.
#################################################################################
%prep
%setup -q
%patch0001 -p1
# StarlingX: Copy the .git_version file needed by the build
# This commit SHA is from the upstream src rpm which is the base of this repo branch
# TODO: Add a commit hook to update to our latest commit SHA
@@ -976,6 +981,8 @@ install -m 700 %{SOURCE7} %{buildroot}/usr/sbin/osd-wait-status
install -m 644 %{SOURCE8} $RPM_BUILD_ROOT/%{_unitdir}/ceph.service
install -m 644 %{SOURCE9} $RPM_BUILD_ROOT/%{_unitdir}/ceph-rest-api.service
install -m 644 %{SOURCE10} $RPM_BUILD_ROOT/%{_unitdir}/ceph-radosgw.service
install -m 700 %{SOURCE12} %{buildroot}%{_sbindir}/ceph-preshutdown.sh
install -D -m 644 %{SOURCE13} $RPM_BUILD_ROOT/%{_sysconfdir}/systemd/system/docker.service.d/starlingx-docker-override.conf
install -m 750 src/init-ceph %{buildroot}/%{_initrddir}/ceph
install -m 750 src/init-radosgw %{buildroot}/%{_initrddir}/ceph-radosgw
@@ -1016,6 +1023,8 @@ rm -rf %{buildroot}
%config(noreplace) %{_sysconfdir}/ceph/ceph.conf
%{_sysconfdir}/services.d/*
%{_sbindir}/ceph-manage-journal
%{_sbindir}/ceph-preshutdown.sh
%{_sysconfdir}/systemd/system/docker.service.d/starlingx-docker-override.conf
%endif
%if %{without stx}
%{_unitdir}/ceph-create-keys@.service
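The spec changes above add Patch0001 and Source13 and install ceph-preshutdown.sh plus the docker drop-in into the packaged file lists. A quick post-build sanity check, sketched below with illustrative RPM globs (exact package names depend on the build), is to confirm both files actually land in a package:

    # Sketch: verify the new files are packaged (RPM names are illustrative)
    rpm -qpl ceph-*.rpm 2>/dev/null | grep -E 'ceph-preshutdown\.sh|starlingx-docker-override\.conf'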

View File

@@ -0,0 +1,59 @@
From 03340eaf0004e3cc8e3f8991ea96a46757d92830 Mon Sep 17 00:00:00 2001
From: Don Penney <don.penney@windriver.com>
Date: Sat, 26 Jan 2019 13:34:55 -0500
Subject: [PATCH] Add hooks for orderly shutdown on controller
Hook the ceph init script to add systemd overrides to define
an orderly shutdown for StarlingX controllers.
Signed-off-by: Don Penney <don.penney@windriver.com>
---
src/init-ceph.in | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/src/init-ceph.in b/src/init-ceph.in
index 1fdb4b3..515d818 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -861,6 +861,38 @@ for name in $what; do
fi
fi
+ . /etc/platform/platform.conf
+ if [ "${nodetype}" = "controller" ]; then
+ # StarlingX: Hook the transient services launched by systemd-run
+ # to allow for proper cleanup and orderly shutdown
+
+ # Set nullglob so wildcards will return empty string if no match
+ shopt -s nullglob
+
+ OSD_SERVICES=$(for svc in /run/systemd/system/ceph-osd*.service; do basename $svc; done | xargs echo)
+ for d in /run/systemd/system/ceph-osd*.d; do
+ cat <<EOF > $d/starlingx-overrides.conf
+[Unit]
+Before=docker.service
+After=sm-shutdown.service
+
+EOF
+ done
+
+ for d in /run/systemd/system/ceph-mon*.d; do
+ cat <<EOF > $d/starlingx-overrides.conf
+[Unit]
+Before=docker.service
+After=sm-shutdown.service ${OSD_SERVICES}
+
+EOF
+ done
+
+ shopt -u nullglob
+
+ systemctl daemon-reload
+ fi
+
[ -n "$post_start" ] && do_cmd "$post_start"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && touch $lockfile
;;
--
1.8.3.1
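The overrides written by this hook are ordinary systemd drop-ins for the transient ceph-mon and ceph-osd units, so their effect can be checked with standard systemd tooling after the init script runs on a controller. A sketch (paths and unit names mirror the patch; nothing beyond that is assumed):

    # Inspect the generated drop-ins for the transient ceph units
    cat /run/systemd/system/ceph-osd*.d/starlingx-overrides.conf
    cat /run/systemd/system/ceph-mon*.d/starlingx-overrides.conf
    # After the daemon-reload in the hook, the ceph units should appear in
    # docker's effective After= ordering list
    systemctl show -p After docker.service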

View File

@@ -0,0 +1,30 @@
#!/bin/bash
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
script=$(basename $0)
# Set nullglob so wildcards will return empty string if no match
shopt -s nullglob
for dev in /dev/rbd[0-9]*; do
for mnt in $(mount | awk -v dev=$dev '($1 == dev) {print $3}'); do
logger -t ${script} "Unmounting $mnt"
/usr/bin/umount $mnt
done
logger -t ${script} "Unmounted $dev"
done
for dev in /dev/rbd[0-9]*; do
/usr/bin/rbd unmap -o force $dev
logger -t ${script} "Unmapped $dev"
done
lsmod | grep -q '^rbd\>' && /usr/sbin/modprobe -r rbd
lsmod | grep -q '^libceph\>' && /usr/sbin/modprobe -r libceph
exit 0
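Because nullglob is set, the loops above simply do nothing when no rbd devices exist, so the script exits cleanly on an idle node. That makes a manual dry run possible before relying on the systemd hook in the next file; a sketch (run only when rbd-backed workloads are already stopped):

    # Sketch: exercise the hook by hand and confirm rbd state is cleaned up
    sudo /usr/sbin/ceph-preshutdown.sh
    mount | grep rbd || echo "no rbd mounts remain"
    lsmod | grep -E '^(rbd|libceph)' || echo "rbd modules unloaded"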

View File

@@ -0,0 +1,3 @@
[Service]
ExecStopPost=/usr/sbin/ceph-preshutdown.sh
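This three-line drop-in, installed to /etc/systemd/system/docker.service.d/ by the spec change above, is what runs ceph-preshutdown.sh when docker stops. It is picked up like any other systemd drop-in; a sketch of confirming it is active (plain systemd commands, nothing assumed beyond the paths in this commit):

    sudo systemctl daemon-reload
    systemctl cat docker.service                    # the drop-in path should be listed
    systemctl show -p ExecStopPost docker.service   # should include ceph-preshutdown.sh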

View File

@@ -23,17 +23,17 @@
# Collects provides information about each event as an object passed to the
# notification handler ; the notification object.
#
# object.host - the hostname
# object.host - the hostname.
#
# object.plugin - the name of the plugin aka resource
# object.plugin - the name of the plugin aka resource.
# object.plugin_instance - plugin instance string i.e. say mountpoint
# for df plugin
# object.type, - the unit i.e. percent or absolute
# object.type_instance - the attribute i.e. free, used, etc
# for df plugin or numa? node for memory.
# object.type, - the unit i.e. percent or absolute.
# object.type_instance - the attribute i.e. free, used, etc.
#
# object.severity - a integer value 0=OK , 1=warning, 2=failure
# object.severity - a integer value 0=OK , 1=warning, 2=failure.
# object.message - a log-able message containing the above along
# with the value
# with the value.
#
# This notifier uses the notification object to manage plugin/instance alarms.
#
@@ -86,6 +86,7 @@ import os
import re
import uuid
import collectd
from threading import RLock as Lock
from fm_api import constants as fm_constants
from fm_api import fm_api
import tsconfig.tsconfig as tsc
@@ -116,6 +117,12 @@ PLUGIN = 'alarm notifier'
# Path to the plugin's drop dir
PLUGIN_PATH = '/etc/collectd.d/'
# the name of the collectd samples database
DATABASE_NAME = 'collectd samples'
READING_TYPE__PERCENT_USAGE = '% usage'
# collectd severity definitions ;
# Note: can't seem to pull then in symbolically with a header
NOTIF_FAILURE = 1
@@ -145,6 +152,7 @@ mangled_list = {"dev-shm",
"etc-nova-instances",
"opt-platform",
"opt-cgcs",
"opt-etcd",
"opt-extension",
"opt-backups"}
@@ -154,10 +162,20 @@ ALARM_ID__MEM = "100.103"
ALARM_ID__DF = "100.104"
ALARM_ID__EXAMPLE = "100.113"
ALARM_ID__VSWITCH_CPU = "100.102"
ALARM_ID__VSWITCH_MEM = "100.115"
ALARM_ID__VSWITCH_PORT = "300.001"
ALARM_ID__VSWITCH_IFACE = "300.002"
# ADD_NEW_PLUGIN: add new alarm id to the list
ALARM_ID_LIST = [ALARM_ID__CPU,
ALARM_ID__MEM,
ALARM_ID__DF,
ALARM_ID__VSWITCH_CPU,
ALARM_ID__VSWITCH_MEM,
ALARM_ID__VSWITCH_PORT,
ALARM_ID__VSWITCH_IFACE,
ALARM_ID__EXAMPLE]
# ADD_NEW_PLUGIN: add plugin name definition
@@ -168,38 +186,29 @@ PLUGIN__CPU = "cpu"
PLUGIN__MEM = "memory"
PLUGIN__INTERFACE = "interface"
PLUGIN__NTP_QUERY = "ntpq"
PLUGIN__VSWITCH_PORT = "vswitch-port"
PLUGIN__VSWITCH_CPU = "vswitch-cpu"
PLUGIN__VSWITCH_MEM = "vswitch-memory"
PLUGIN__VSWITCH_OVSDB = "vswitch-ovsdb"
PLUGIN__VSWITCH_OPENFLOW = "vswitch-openflow"
PLUGIN__VSWITCH_LACP_IFACE = "vswitch-lacp-iface"
PLUGIN__VSWITCH_IFACE = "vswitch-iface"
PLUGIN__NOVA_THINPOOL_LVM = "nova-thinpool-lvm"
PLUGIN__CINDER_THINPOOL_LVM = "cinder-thinpool-lvm"
PLUGIN__CINDER_THINPOOL_LVM_META = "cinder-thinpool-lvm-meta"
PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_CPU = "vswitch_cpu"
PLUGIN__VSWITCH_MEM = "vswitch_mem"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"
PLUGIN__EXAMPLE = "example"
# ADD_NEW_PLUGIN: add plugin name to list
PLUGIN_NAME_LIST = [PLUGIN__CPU,
PLUGIN__MEM,
PLUGIN__DF,
PLUGIN__VSWITCH_CPU,
PLUGIN__VSWITCH_MEM,
PLUGIN__VSWITCH_PORT,
PLUGIN__VSWITCH_IFACE,
PLUGIN__EXAMPLE]
# ADD_NEW_PLUGIN: add alarm id and plugin to dictionary
# ALARM_ID_TO_PLUGIN_DICT = {}
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__CPU] = PLUGIN__CPU
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__MEM] = PLUGIN__MEM
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__DF] = PLUGIN__DF
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__EXAMPLE] = PLUGIN__EXAMPLE
# PluginObject Class
class PluginObject:
dbObj = None # shared database connection obj
host = None # saved hostname
lock = None # global lock for mread_func mutex
database_setup = False # state of database setup
database_setup_in_progress = False # connection mutex
@@ -213,7 +222,7 @@ class PluginObject:
self.plugin = plugin # name of the plugin ; df, cpu, memory ...
self.plugin_instance = "" # the instance name for the plugin
self.resource_name = "" # The top level name of the resource
self.instance_name = "" # The instanhce name
self.instance_name = "" # The instance name
# Instance specific learned static class members.
self.entity_id = "" # fm entity id host=<hostname>.<instance>
@@ -225,12 +234,17 @@ class PluginObject:
self.value = float(0) # float value of reading
# Common static class members.
self.reason_warning = ""
self.reason_failure = ""
self.repair = ""
self.alarm_type = fm_constants.FM_ALARM_TYPE_7
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50
self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS
self.suppression = True
self.service_affecting = False
# default most reading types are usage
self.reading_type = READING_TYPE__PERCENT_USAGE
# Severity tracking lists.
# Maintains severity state between notifications.
# Each is a list of entity ids for severity asserted alarms.
@@ -329,7 +343,11 @@ class PluginObject:
# filter out messages to ignore ; notifications that have no value
if "has not been updated for" in nObject.message:
collectd.debug("%s NOT UPDATED: %s" % (PLUGIN, self.entity_id))
collectd.info("%s %s %s (%s)" %
(PLUGIN,
self.entity_id,
nObject.message,
nObject.severity))
return "done"
# Get the value from the notification message.
@@ -363,8 +381,8 @@ class PluginObject:
# validate the reading
try:
self.value = float(self.values[0])
# get the threshold if its there
if len(self.values) == 2:
# get the threshold if its there.
if len(self.values) > 1:
self.threshold = float(self.values[1])
except ValueError as ex:
@@ -390,6 +408,9 @@ class PluginObject:
logit = False
if self.count == 0 or LOG_STEP == 0:
logit = True
elif self.reading_type == "connections":
if self.value != last:
logit = True
elif self.value > last:
if (last + LOG_STEP) < self.value:
logit = True
@@ -401,18 +422,40 @@ class PluginObject:
#
# Note: only usage type so far
if logit:
reading_type = "% usage"
tmp = str(self.value).split('.')
if len(tmp[0]) == 1:
pre = ': '
else:
pre = ': '
collectd.info("%s reading%s%2.2f %s - %s" %
(PLUGIN,
pre,
self.value,
reading_type,
self.instance_name))
resource = self.resource_name
# setup resource name for filesystem instance usage log
if self.plugin == PLUGIN__DF:
resource = self.instance
# setup resource name for vswitch process instance name
elif self.plugin == PLUGIN__VSWITCH_MEM:
resource += ' Processor '
resource += self.instance_name
if self.reading_type == READING_TYPE__PERCENT_USAGE:
tmp = str(self.value).split('.')
if len(tmp[0]) == 1:
pre = ': '
else:
pre = ': '
collectd.info("%s reading%s%2.2f %s - %s" %
(PLUGIN,
pre,
self.value,
self.reading_type,
resource))
elif self.reading_type == "connections" and \
self.instance_objects and \
self.value != self.last_value:
if self.instance_objects:
collectd.info("%s monitor: %2d %s - %s" %
(PLUGIN,
self.value,
self.reading_type,
resource))
self.last_value = float(self.value)
##########################################################################
@@ -599,12 +642,139 @@ class PluginObject:
collectd.info("%s %s no failures" %
(PLUGIN, self.plugin))
##########################################################################
#
# Name : _get_instance_object
#
# Purpose : Safely get an object from the self instance object list
# indexed by eid.
#
##########################################################################
def _get_instance_object(self, eid):
"""
Safely get an object from the self instance object list indexed
by eid while locked.
:param eid:
:return: object or None
"""
try:
collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin))
PluginObject.lock.acquire()
obj = self.instance_objects[eid]
return obj
except:
collectd.error("%s failed to get instance from %s object list" %
(PLUGIN, self.plugin))
return None
finally:
collectd.debug("%s %s Get UnLock ..." % (PLUGIN, self.plugin))
PluginObject.lock.release()
##########################################################################
#
# Name : _add_instance_object
#
# Purpose : Safely add an object to the self instance object list
# indexed by eid while locked. if found locked the instance
# add will be re-attempted on next sample.
#
##########################################################################
def _add_instance_object(self, obj, eid):
"""
Update self instance_objects list while locked
:param obj: the object to add
:param eid: indexed by this eid
:return: nothing
"""
try:
collectd.debug("%s %s Add Lock ..." % (PLUGIN, self.plugin))
PluginObject.lock.acquire()
self.instance_objects[eid] = obj
except:
collectd.error("%s failed to add instance to %s object list" %
(PLUGIN, self.plugin))
finally:
collectd.debug("%s %s Add UnLock ..." % (PLUGIN, self.plugin))
PluginObject.lock.release()
##########################################################################
#
# Name : _copy_instance_object
#
# Purpose : Copy select members of self object to target object.
#
##########################################################################
def _copy_instance_object(self, object):
"""
Copy select members of self object to target object
"""
object.resource_name = self.resource_name
object.instance_name = self.instance_name
object.reading_type = self.reading_type
object.reason_warning = self.reason_warning
object.reason_failure = self.reason_failure
object.repair = self.repair
object.alarm_type = self.alarm_type
object.cause = self.cause
object.suppression = self.suppression
object.service_affecting = self.service_affecting
##########################################################################
#
# Name : _create_instance_object
#
# Purpose : Create a new instance object and tack it on the supplied base
# object's instance object dictionary.
#
##########################################################################
def _create_instance_object(self, instance):
try:
# create a new plugin object
inst_obj = PluginObject(self.id, self.plugin)
self._copy_instance_object(inst_obj)
# initialize the object with instance specific data
inst_obj.instance_name = instance
inst_obj.entity_id = _build_entity_id(self.plugin,
instance)
self._add_instance_object(inst_obj, inst_obj.entity_id)
collectd.debug("%s created %s instance (%s) object %s" %
(PLUGIN, inst_obj.resource_name,
inst_obj.entity_id, inst_obj))
collectd.debug("%s monitoring %s %s %s" %
(PLUGIN,
inst_obj.resource_name,
inst_obj.instance_name,
inst_obj.reading_type))
return inst_obj
except:
collectd.error("%s %s:%s inst object create failed" %
(PLUGIN, inst_obj.resource_name, instance))
return None
##########################################################################
#
# Name : _create_instance_objects
#
# Purpose : Create a list of instance objects for 'self' type plugin and
# add those objects to the parnet's instance_objects dictionary.
# add those objects to the parent's instance_objects dictionary.
#
# Note : This is currently only used for the DF (filesystem) plugin.
# All other instance creations/allocations are done on-demand.
#
##########################################################################
def _create_instance_objects(self):
@@ -612,11 +782,7 @@ class PluginObject:
Create, initialize and add an instance object to this/self plugin
"""
# ADD_NEW_PLUGIN: for plugins that have instances you need to
# add support for creating those instances and adding
# those instances to the parent instance_objects list.
# Currently only the DF plugin has subordinate instance objects.
# Create the File System subordinate instance objects.
if self.id == ALARM_ID__DF:
# read the df.conf file and return/get a list of mount points
@@ -651,6 +817,7 @@ class PluginObject:
# initialize the object with instance specific data
inst_obj.resource_name = self.resource_name
inst_obj.instance_name = mp
inst_obj.instance = mp
# build the plugin instance name from the mount point
if mp == '/':
inst_obj.plugin_instance = 'root'
@@ -662,21 +829,30 @@ class PluginObject:
# add this subordinate object to the parent's
# instance object list
self.instance_objects[inst_obj.entity_id] = inst_obj
self._add_instance_object(inst_obj, inst_obj.entity_id)
collectd.info("%s monitoring %s usage" %
(PLUGIN, mp))
(PLUGIN, inst_obj.instance))
PluginObject.host = os.uname()[1]
# ADD_NEW_PLUGIN: add plugin to this table
# This instanciates the plugin objects
PLUGINS = {PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU),
PLUGIN__MEM: PluginObject(ALARM_ID__MEM, PLUGIN__MEM),
PLUGIN__DF: PluginObject(ALARM_ID__DF, PLUGIN__DF),
PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)}
# This instantiates the plugin objects
PLUGINS = {
PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU),
PLUGIN__MEM: PluginObject(ALARM_ID__MEM, PLUGIN__MEM),
PLUGIN__DF: PluginObject(ALARM_ID__DF, PLUGIN__DF),
PLUGIN__VSWITCH_CPU: PluginObject(ALARM_ID__VSWITCH_CPU,
PLUGIN__VSWITCH_CPU),
PLUGIN__VSWITCH_MEM: PluginObject(ALARM_ID__VSWITCH_MEM,
PLUGIN__VSWITCH_MEM),
PLUGIN__VSWITCH_PORT: PluginObject(ALARM_ID__VSWITCH_PORT,
PLUGIN__VSWITCH_PORT),
PLUGIN__VSWITCH_IFACE: PluginObject(ALARM_ID__VSWITCH_IFACE,
PLUGIN__VSWITCH_IFACE),
PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)}
def _get_base_object(alarm_id):
@@ -689,21 +865,6 @@ def _get_base_object(alarm_id):
return None
def _get_object(alarm_id, eid):
"""
Get the plugin object for the specified alarm id and eid
"""
base_obj = _get_base_object(alarm_id)
if len(base_obj.instance_objects):
try:
return(base_obj.instance_objects[eid])
except:
collectd.debug("%s %s has no instance objects" %
(PLUGIN, base_obj.plugin))
return base_obj
def is_uuid_like(val):
"""Returns validation of a value as a UUID.
@@ -721,10 +882,38 @@ def _build_entity_id(plugin, plugin_instance):
Builds an entity id string based on the collectd notification object.
"""
inst_error = False
entity_id = 'host='
entity_id += PluginObject.host
if plugin == PLUGIN__DF:
if plugin == PLUGIN__VSWITCH_MEM:
# host=<hostname>.processor=<socket-id>
if plugin_instance:
entity_id += '.processor=' + plugin_instance
else:
inst_error = True
elif plugin == PLUGIN__VSWITCH_IFACE:
# host=<hostname>.interface=<if-uuid>
if plugin_instance:
entity_id += '.interface=' + plugin_instance
else:
inst_error = True
elif plugin == PLUGIN__VSWITCH_PORT:
# host=<hostname>.port=<port-uuid>
if plugin_instance:
entity_id += '.port=' + plugin_instance
else:
inst_error = True
elif plugin == PLUGIN__DF:
# host=<hostname>.filesystem=<mountpoint>
if plugin_instance:
instance = plugin_instance
@@ -740,7 +929,18 @@ def _build_entity_id(plugin, plugin_instance):
instance = instance.replace('-', '/')
entity_id += instance
# collectd.info("%s entity_id : %s" % (PLUGIN, entity_id))
# Will be uncommented when the numa memory monitor is added
# to the platform memory plugin.
#
#elif plugin == PLUGIN__MEM:
# if plugin_instance is not 'platform':
# # host=controller-0.numa=node0
# entity_id += '.numa='
# entity_id += plugin_instance
if inst_error is True:
collectd.error("%s eid build failed ; missing instance" % plugin)
return None
return entity_id
@@ -773,37 +973,77 @@ def _get_df_mountpoints():
return(mountpoints)
def _print_obj(obj):
"""
Print a single object
"""
base_object = False
for plugin in PLUGIN_NAME_LIST:
if PLUGINS[plugin] == obj:
base_object = True
break
num = len(obj.instance_objects)
if num > 0 or base_object is True:
prefix = "PLUGIN "
if num:
prefix += str(num)
else:
prefix += " "
else:
prefix = "INSTANCE"
if obj.plugin_instance:
resource = obj.plugin + ":" + obj.plugin_instance
else:
resource = obj.plugin
collectd.info("%s %s res: %s name: %s\n" %
(PLUGIN, prefix, resource, obj.resource_name))
collectd.info("%s eid : %s\n" % (PLUGIN, obj.entity_id))
collectd.info("%s inst: %s name: %s\n" %
(PLUGIN, obj.instance, obj.instance_name))
collectd.info("%s value:%2.1f thld:%2.1f cause:%s (%d) type:%s" %
(PLUGIN,
obj.value,
obj.threshold,
obj.cause,
obj.count,
obj.reading_type))
collectd.info("%s warn:%s fail:%s" %
(PLUGIN, obj.warnings, obj.failures))
collectd.info("%s repair:t: %s" %
(PLUGIN, obj.repair))
if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50:
collectd.info("%s reason:w: %s\n"
"%s reason:f: %s\n" %
(PLUGIN, obj.reason_warning,
PLUGIN, obj.reason_failure))
# collectd.info(" ")
def _print_state(obj=None):
"""
Print the current object state
"""
objs = []
if obj is None:
objs.append(_get_base_object(ALARM_ID__CPU))
objs.append(_get_base_object(ALARM_ID__MEM))
objs.append(_get_base_object(ALARM_ID__DF))
else:
objs.append(obj)
for o in objs:
collectd.info("%s PLUGIN %2d [%6s:%2.2f:%s] [w:%s f:%s] %d" %
(PLUGIN,
len(o.instance_objects),
o.plugin,
o.value,
o.entity_id,
o.warnings,
o.failures,
o.count))
if len(o.instance_objects):
for inst_obj in o.instance_objects:
collectd.info("%s INSTANCE [%6s:%2.2f:%s] [w:%s f:%s] %d" %
(PLUGIN,
inst_obj.plugin,
inst_obj.value,
inst_obj.entity_id,
inst_obj.warnings,
inst_obj.failures,
inst_obj.count))
try:
objs = []
if obj is None:
for plugin in PLUGIN_NAME_LIST:
objs.append(PLUGINS[plugin])
else:
objs.append(obj)
collectd.debug("%s _print_state Lock ..." % PLUGIN)
PluginObject.lock.acquire()
for o in objs:
_print_obj(o)
if len(o.instance_objects):
for inst_obj in o.instance_objects:
_print_obj(o.instance_objects[inst_obj])
finally:
collectd.debug("%s _print_state UnLock ..." % PLUGIN)
PluginObject.lock.release()
def _database_setup(database):
@@ -843,14 +1083,14 @@ def _database_setup(database):
############################################################
PluginObject.dbObj.create_retention_policy(
'collectd samples', '4w', 1, database, True)
DATABASE_NAME, '4w', 1, database, True)
except Exception as ex:
if str(ex) == 'database already exists':
try:
collectd.info("%s influxdb:collectd %s" %
(PLUGIN, str(ex)))
PluginObject.dbObj.create_retention_policy(
'collectd samples', '4w', 1, database, True)
DATABASE_NAME, '4w', 1, database, True)
except Exception as ex:
if str(ex) == 'retention policy already exists':
collectd.info("%s influxdb:collectd %s" %
@@ -864,15 +1104,21 @@ def _database_setup(database):
error_str = "failed to connect to influxdb:" + database
if not error_str:
found = False
retention = \
PluginObject.dbObj.get_list_retention_policies(database)
collectd.info("%s influxdb:%s samples retention policy: %s" %
(PLUGIN, database, retention))
collectd.info("%s influxdb:%s is setup" % (PLUGIN, database))
PluginObject.database_setup = True
else:
collectd.error("%s influxdb:%s setup %s" %
(PLUGIN, database, error_str))
for r in range(len(retention)):
if retention[r]["name"] == DATABASE_NAME:
collectd.info("%s influxdb:%s samples retention "
"policy: %s" %
(PLUGIN, database, retention[r]))
found = True
if found is True:
collectd.info("%s influxdb:%s is setup" % (PLUGIN, database))
PluginObject.database_setup = True
else:
collectd.error("%s influxdb:%s retention policy NOT setup" %
(PLUGIN, database))
def _clear_alarm_for_missing_filesystems():
@@ -892,10 +1138,11 @@ def _clear_alarm_for_missing_filesystems():
if len(alarm_list):
for eid in alarm_list:
# search for any of them that might be alarmed.
obj = df_base_obj.instance_objects[eid]
obj = df_base_obj._get_instance_object(eid)
# only care about df (file system plugins)
if obj.plugin == PLUGIN__DF and \
if obj is not None and \
obj.plugin == PLUGIN__DF and \
obj.entity_id == eid and \
obj.plugin_instance != 'root':
@@ -912,7 +1159,6 @@ def _clear_alarm_for_missing_filesystems():
else:
collectd.debug("%s maintaining alarm for %s" %
(PLUGIN, path))
return 0
# Collectd calls this function on startup.
@@ -921,6 +1167,8 @@ def _clear_alarm_for_missing_filesystems():
def init_func():
""" Collectd FM Notifier Initialization Function """
PluginObject.lock = Lock()
PluginObject.host = os.uname()[1]
collectd.info("%s %s:%s init function" %
(PLUGIN, tsc.nodetype, PluginObject.host))
@@ -933,15 +1181,19 @@ def init_func():
obj.repair += "contact next level of support."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
###########################################################################
# Constant Memory Plugin Object settings
obj = PLUGINS[PLUGIN__MEM]
obj.resource_name = "Memory"
obj.resource_name = "Platform Memory"
obj.instance_name = PLUGIN__MEM
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support; "
obj.repair += "may require additional memory on Host."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
###########################################################################
# Constant FileSystem Plugin Object settings
obj = PLUGINS[PLUGIN__DF]
obj.resource_name = "File System"
@@ -954,6 +1206,63 @@ def init_func():
# Create one DF instance object per mount point
obj._create_instance_objects()
# ntp query is for controllers only
if tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:
#######################################################################
# Constant vSwitch CPU Usage Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_CPU]
obj.resource_name = "vSwitch CPU"
obj.instance_name = PLUGIN__VSWITCH_CPU
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
#######################################################################
# Constant vSwitch Memory Usage Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_MEM]
obj.resource_name = "vSwitch Memory"
obj.instance_name = PLUGIN__VSWITCH_MEM
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
#######################################################################
# Constant vSwitch Port State Monitor Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_PORT]
obj.resource_name = "vSwitch Port"
obj.instance_name = PLUGIN__VSWITCH_PORT
obj.reading_type = "state"
obj.reason_failure = "'Data' Port failed."
obj.reason_warning = "'Data' Port failed."
obj.repair = "Check cabling and far-end port configuration and "
obj.repair += "status on adjacent equipment."
obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT
obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL
obj.service_affecting = True
collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name))
#######################################################################
# Constant vSwitch Interface State Monitor Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_IFACE]
obj.resource_name = "vSwitch Interface"
obj.instance_name = PLUGIN__VSWITCH_IFACE
obj.reading_type = "state"
obj.reason_failure = "'Data' Interface failed."
obj.reason_warning = "'Data' Interface degraded."
obj.repair = "Check cabling and far-end port configuration and "
obj.repair += "status on adjacent equipment."
obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT
obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL
obj.service_affecting = True
collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name))
###########################################################################
obj = PLUGINS[PLUGIN__EXAMPLE]
obj.resource_name = "Example"
obj.instance_name = PLUGIN__EXAMPLE
@@ -981,6 +1290,7 @@ def init_func():
alarms = api.get_faults_by_id(alarm_id)
if alarms:
for alarm in alarms:
want_alarm_clear = False
eid = alarm.entity_instance_id
# ignore alarms not for this host
if PluginObject.host not in eid:
@@ -988,28 +1298,31 @@ def init_func():
base_obj = _get_base_object(alarm_id)
if base_obj is None:
# Handle unrecognized alarm by clearing it ;
# should never happen since we are iterating
# over an internal alarm_id list.
# might be a plugin instance - clear it
want_alarm_clear = True
collectd.info('%s found %s %s alarm [%s]' %
(PLUGIN,
alarm.severity,
alarm_id,
eid))
if want_alarm_clear is True:
if api.clear_fault(alarm_id, eid) is False:
collectd.error("%s %s:%s not found ; clear failed" %
collectd.error("%s %s:%s clear failed" %
(PLUGIN,
alarm_id,
eid))
else:
collectd.error("%s %s:%s not found ; cleared" %
(PLUGIN,
alarm_id,
eid))
collectd.info("%s clear %s %s alarm %s" %
(PLUGIN,
alarm.severity,
alarm_id,
eid))
continue
collectd.info('%s found %s alarm with %s severity [%s:%s:%s]' %
(PLUGIN,
base_obj.id,
alarm.severity,
base_obj.plugin,
alarm_id,
eid))
if alarm.severity == "critical":
sev = "failure"
elif alarm.severity == "major":
@@ -1019,7 +1332,8 @@ def init_func():
continue
# Load the alarm severity by doing a plugin/instance lookup.
base_obj._manage_alarm(eid, sev)
if base_obj is not None:
base_obj._manage_alarm(eid, sev)
# The notifier function inspects the collectd notification and determines if
@@ -1067,27 +1381,68 @@ def notifier_func(nObject):
base_obj = obj = PLUGINS[nObject.plugin]
# if this notification is for a plugin instance then get that
# instances's object instead. if that object does not yet exists
# then create it
# instances's object instead.
# If that object does not yet exists then create it.
eid = ''
if nObject.plugin_instance:
# DF instances are statically allocated
if nObject.plugin == PLUGIN__DF:
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
# get this instances object
obj = base_obj._get_instance_object(eid)
if obj is None:
# path should never be hit since all DF instances
# are statically allocated.
return 0
elif nObject.plugin_instance:
need_instance_object_create = False
# Build the entity_id from the parent object if needed
# Build the entity_id from the parent object if needed
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
try:
# Need lock when reading/writing any obj.instance_objects list
collectd.debug("%s %s lock" % (PLUGIN, nObject.plugin))
PluginObject.lock.acquire()
#collectd.info("%s Object Search eid: %s" %
# (nObject.plugin, eid))
#for o in base_obj.instance_objects:
# collectd.error("%s %s inst object dict item %s : %s" %
# (PLUGIN, nObject.plugin, o,
# base_obj.instance_objects[o]))
# we will take an exception if this object is not in the list.
# the exception handling code below will create and add this
# object for success path the next time around.
inst_obj = base_obj.instance_objects[eid]
if inst_obj is None:
collectd.error("%s %s:%s instance object is None" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
return 0
collectd.debug("%s %s instance %s already exists %s" %
(PLUGIN, nObject.plugin, eid, inst_obj))
# _print_state(inst_obj)
except:
# o.k. , not in the list yet, lets create one
collectd.error("%s %s:%s instance object not found" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
return 0
need_instance_object_create = True
finally:
collectd.debug("%s %s unlock" % (PLUGIN, nObject.plugin))
PluginObject.lock.release()
if need_instance_object_create is True:
base_obj._create_instance_object(nObject.plugin_instance)
inst_obj = base_obj._get_instance_object(eid)
if inst_obj:
collectd.debug("%s %s:%s inst object created" %
(PLUGIN,
inst_obj.plugin,
inst_obj.instance))
else:
collectd.error("%s %s:%s inst object create failed" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
return 0
# re-assign the object
obj = inst_obj
@@ -1096,13 +1451,6 @@ def notifier_func(nObject):
# Build the entity_id from the parent object if needed
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
# TODO: Needed ?
if not len(obj.instance):
obj.instance = nObject.plugin
if nObject.plugin_instance:
obj.instance += '_' + nObject.plugin_instance
# TODO: Needed ?
# update the object with the eid if its not already set.
if not len(obj.entity_id):
obj.entity_id = eid
@@ -1112,7 +1460,8 @@ def notifier_func(nObject):
(PLUGIN, nObject.plugin, nObject.plugin_instance))
return 0
# _print_state(obj)
# if obj.warnings or obj.failures:
# _print_state(obj)
# If want_state_audit is True then run the audit.
# Primarily used for debug
@@ -1143,21 +1492,32 @@ def notifier_func(nObject):
return 0
if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
if api.clear_fault(base_obj.id, obj.entity_id) is False:
if api.clear_fault(obj.id, obj.entity_id) is False:
collectd.error("%s %s:%s clear_fault failed" %
(PLUGIN, base_obj.id, obj.entity_id))
return 0
else:
reason = obj.resource_name
reason += " threshold exceeded"
if obj.threshold:
reason += "; {:2.0f}".format(obj.threshold) + "%"
# reason += "; {:2.2f}".format(obj.threshold) + "%"
if obj.value:
reason += ", actual " + "{:2.0f}".format(obj.value) + "%"
# manage addition of the failure reason text
if obj.cause == fm_constants.ALARM_PROBABLE_CAUSE_50:
# if this is a threshold alarm then build the reason text that
# includes the threahold and the reading that caused the assertion.
reason = obj.resource_name
reason += " threshold exceeded"
if obj.threshold:
reason += "; threshold {:2.0f} ".format(obj.threshold) + "%, "
if obj.value:
reason += "actual {:2.0f}".format(obj.value) + "%"
elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
reason = obj.reason_failure
else:
reason = obj.reason_warning
# build the alarm object
fault = fm_api.Fault(
alarm_id=base_obj.id,
alarm_id=obj.id,
alarm_state=_alarm_state,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=obj.entity_id,
@@ -1191,5 +1551,8 @@ def notifier_func(nObject):
# Debug only: comment out for production code.
# obj._state_audit("change")
return 0
collectd.register_init(init_func)
collectd.register_notification(notifier_func)
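The notifier raises and clears FM alarms keyed by the ids in ALARM_ID_LIST and the entity ids built above (for example host=<hostname>.filesystem=<mountpoint> for df, host=<hostname>.port=<uuid> for vswitch ports). A quick way to see the result on a running node, assuming the StarlingX fm CLI is available (a sketch, not part of this change):

    # List collectd-managed alarms; ids correspond to ALARM_ID_LIST above
    fm alarm-list | grep -E '100\.10[1-4]|100\.11[35]|300\.00[12]'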

View File

@@ -39,6 +39,7 @@
import os
import socket
import collectd
import tsconfig.tsconfig as tsc
# This plugin name
PLUGIN = 'degrade notifier'
@@ -65,6 +66,13 @@ ONE_EVERY = 10
PLUGIN__DF = 'df'
PLUGIN__MEM = 'memory'
PLUGIN__CPU = 'cpu'
PLUGIN__VSWITCH_MEM = 'vswitch_mem'
PLUGIN__VSWITCH_CPU = 'vswitch_cpu'
PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"
PLUGIN_INTERFACE = 'interface'
PLUGIN__EXAMPLE = 'example'
@@ -89,6 +97,10 @@ class collectdMtceNotifierObject:
self.degrade_list__failure = [PLUGIN__DF,
PLUGIN__MEM,
PLUGIN__CPU,
PLUGIN__VSWITCH_MEM,
PLUGIN__VSWITCH_CPU,
PLUGIN__VSWITCH_PORT,
PLUGIN__VSWITCH_IFACE,
PLUGIN_INTERFACE,
PLUGIN__EXAMPLE]
self.degrade_list__warning = []
@@ -172,7 +184,7 @@ def config_func(config):
Configure the maintenance degrade notifier plugin.
"""
collectd.info('%s config function' % PLUGIN)
collectd.debug('%s config function' % PLUGIN)
for node in config.children:
key = node.key.lower()
val = node.values[0]
@@ -194,6 +206,10 @@ def init_func():
Collectd Mtce Notifier Initialization Function
"""
obj.host = os.uname()[1]
collectd.info("%s %s:%s sending to mtce port %d" %
(PLUGIN, tsc.nodetype, obj.host, obj.port))
collectd.debug("%s init function" % PLUGIN)
@@ -241,8 +257,8 @@ def notifier_func(nObject):
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.debug("%s %s added to degrade list" %
(PLUGIN, resource))
collectd.info("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:
# If severity is failure and no failures cause degrade
@@ -264,8 +280,8 @@ def notifier_func(nObject):
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.debug("%s %s added to degrade list" %
(PLUGIN, resource))
collectd.info("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:
# If severity is warning and no warnings cause degrade