Add on-demand instance support to collectd alarm manager plugin

Many plugins need support for on-demand instance sampling
and alarming. The filesystem and memory monitoring plugins
are perfect examples. The number of numa nodes or monitored
file systems vary from host to host.

This update adds on-demand instance support. Any plugin
can now support multiple instances. As new plugin
instances are learned ; memory is allocated for them
and linked to that plugins base object and managed as a
separate instance but within the scope of its parent.

The following additional enhancements were made to the common
alarm and degrade plugins.

1. added /opt/etcd as a new monitored filesystem.
2. added common support for vswitch alarm/degrade handling.
3. a few general cleanup changes for code maintainability.

Change-Id: I05b4de78f30fc27362c63b6dbfc97268d6588e4f
Story: 2002823
Task:29297
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2019-02-01 09:41:36 -05:00
parent dc14b89999
commit fab989b5bc
2 changed files with 545 additions and 166 deletions

View File

@ -23,17 +23,17 @@
# Collects provides information about each event as an object passed to the
# notification handler ; the notification object.
#
# object.host - the hostname
# object.host - the hostname.
#
# object.plugin - the name of the plugin aka resource
# object.plugin - the name of the plugin aka resource.
# object.plugin_instance - plugin instance string i.e. say mountpoint
# for df plugin
# object.type, - the unit i.e. percent or absolute
# object.type_instance - the attribute i.e. free, used, etc
# for df plugin or numa? node for memory.
# object.type, - the unit i.e. percent or absolute.
# object.type_instance - the attribute i.e. free, used, etc.
#
# object.severity - a integer value 0=OK , 1=warning, 2=failure
# object.severity - a integer value 0=OK , 1=warning, 2=failure.
# object.message - a log-able message containing the above along
# with the value
# with the value.
#
# This notifier uses the notification object to manage plugin/instance alarms.
#
@ -86,6 +86,7 @@ import os
import re
import uuid
import collectd
from threading import RLock as Lock
from fm_api import constants as fm_constants
from fm_api import fm_api
import tsconfig.tsconfig as tsc
@ -116,6 +117,12 @@ PLUGIN = 'alarm notifier'
# Path to the plugin's drop dir
PLUGIN_PATH = '/etc/collectd.d/'
# the name of the collectd samples database
DATABASE_NAME = 'collectd samples'
READING_TYPE__PERCENT_USAGE = '% usage'
# collectd severity definitions ;
# Note: can't seem to pull then in symbolically with a header
NOTIF_FAILURE = 1
@ -145,6 +152,7 @@ mangled_list = {"dev-shm",
"etc-nova-instances",
"opt-platform",
"opt-cgcs",
"opt-etcd",
"opt-extension",
"opt-backups"}
@ -154,10 +162,20 @@ ALARM_ID__MEM = "100.103"
ALARM_ID__DF = "100.104"
ALARM_ID__EXAMPLE = "100.113"
ALARM_ID__VSWITCH_CPU = "100.102"
ALARM_ID__VSWITCH_MEM = "100.115"
ALARM_ID__VSWITCH_PORT = "300.001"
ALARM_ID__VSWITCH_IFACE = "300.002"
# ADD_NEW_PLUGIN: add new alarm id to the list
ALARM_ID_LIST = [ALARM_ID__CPU,
ALARM_ID__MEM,
ALARM_ID__DF,
ALARM_ID__VSWITCH_CPU,
ALARM_ID__VSWITCH_MEM,
ALARM_ID__VSWITCH_PORT,
ALARM_ID__VSWITCH_IFACE,
ALARM_ID__EXAMPLE]
# ADD_NEW_PLUGIN: add plugin name definition
@ -168,38 +186,29 @@ PLUGIN__CPU = "cpu"
PLUGIN__MEM = "memory"
PLUGIN__INTERFACE = "interface"
PLUGIN__NTP_QUERY = "ntpq"
PLUGIN__VSWITCH_PORT = "vswitch-port"
PLUGIN__VSWITCH_CPU = "vswitch-cpu"
PLUGIN__VSWITCH_MEM = "vswitch-memory"
PLUGIN__VSWITCH_OVSDB = "vswitch-ovsdb"
PLUGIN__VSWITCH_OPENFLOW = "vswitch-openflow"
PLUGIN__VSWITCH_LACP_IFACE = "vswitch-lacp-iface"
PLUGIN__VSWITCH_IFACE = "vswitch-iface"
PLUGIN__NOVA_THINPOOL_LVM = "nova-thinpool-lvm"
PLUGIN__CINDER_THINPOOL_LVM = "cinder-thinpool-lvm"
PLUGIN__CINDER_THINPOOL_LVM_META = "cinder-thinpool-lvm-meta"
PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_CPU = "vswitch_cpu"
PLUGIN__VSWITCH_MEM = "vswitch_mem"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"
PLUGIN__EXAMPLE = "example"
# ADD_NEW_PLUGIN: add plugin name to list
PLUGIN_NAME_LIST = [PLUGIN__CPU,
PLUGIN__MEM,
PLUGIN__DF,
PLUGIN__VSWITCH_CPU,
PLUGIN__VSWITCH_MEM,
PLUGIN__VSWITCH_PORT,
PLUGIN__VSWITCH_IFACE,
PLUGIN__EXAMPLE]
# ADD_NEW_PLUGIN: add alarm id and plugin to dictionary
# ALARM_ID_TO_PLUGIN_DICT = {}
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__CPU] = PLUGIN__CPU
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__MEM] = PLUGIN__MEM
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__DF] = PLUGIN__DF
# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__EXAMPLE] = PLUGIN__EXAMPLE
# PluginObject Class
class PluginObject:
dbObj = None # shared database connection obj
host = None # saved hostname
lock = None # global lock for mread_func mutex
database_setup = False # state of database setup
database_setup_in_progress = False # connection mutex
@ -213,7 +222,7 @@ class PluginObject:
self.plugin = plugin # name of the plugin ; df, cpu, memory ...
self.plugin_instance = "" # the instance name for the plugin
self.resource_name = "" # The top level name of the resource
self.instance_name = "" # The instanhce name
self.instance_name = "" # The instance name
# Instance specific learned static class members.
self.entity_id = "" # fm entity id host=<hostname>.<instance>
@ -225,12 +234,17 @@ class PluginObject:
self.value = float(0) # float value of reading
# Common static class members.
self.reason_warning = ""
self.reason_failure = ""
self.repair = ""
self.alarm_type = fm_constants.FM_ALARM_TYPE_7
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50
self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS
self.suppression = True
self.service_affecting = False
# default most reading types are usage
self.reading_type = READING_TYPE__PERCENT_USAGE
# Severity tracking lists.
# Maintains severity state between notifications.
# Each is a list of entity ids for severity asserted alarms.
@ -329,7 +343,11 @@ class PluginObject:
# filter out messages to ignore ; notifications that have no value
if "has not been updated for" in nObject.message:
collectd.debug("%s NOT UPDATED: %s" % (PLUGIN, self.entity_id))
collectd.info("%s %s %s (%s)" %
(PLUGIN,
self.entity_id,
nObject.message,
nObject.severity))
return "done"
# Get the value from the notification message.
@ -363,8 +381,8 @@ class PluginObject:
# validate the reading
try:
self.value = float(self.values[0])
# get the threshold if its there
if len(self.values) == 2:
# get the threshold if its there.
if len(self.values) > 1:
self.threshold = float(self.values[1])
except ValueError as ex:
@ -390,6 +408,9 @@ class PluginObject:
logit = False
if self.count == 0 or LOG_STEP == 0:
logit = True
elif self.reading_type == "connections":
if self.value != last:
logit = True
elif self.value > last:
if (last + LOG_STEP) < self.value:
logit = True
@ -401,18 +422,40 @@ class PluginObject:
#
# Note: only usage type so far
if logit:
reading_type = "% usage"
tmp = str(self.value).split('.')
if len(tmp[0]) == 1:
pre = ': '
else:
pre = ': '
collectd.info("%s reading%s%2.2f %s - %s" %
(PLUGIN,
pre,
self.value,
reading_type,
self.instance_name))
resource = self.resource_name
# setup resource name for filesystem instance usage log
if self.plugin == PLUGIN__DF:
resource = self.instance
# setup resource name for vswitch process instance name
elif self.plugin == PLUGIN__VSWITCH_MEM:
resource += ' Processor '
resource += self.instance_name
if self.reading_type == READING_TYPE__PERCENT_USAGE:
tmp = str(self.value).split('.')
if len(tmp[0]) == 1:
pre = ': '
else:
pre = ': '
collectd.info("%s reading%s%2.2f %s - %s" %
(PLUGIN,
pre,
self.value,
self.reading_type,
resource))
elif self.reading_type == "connections" and \
self.instance_objects and \
self.value != self.last_value:
if self.instance_objects:
collectd.info("%s monitor: %2d %s - %s" %
(PLUGIN,
self.value,
self.reading_type,
resource))
self.last_value = float(self.value)
##########################################################################
@ -599,12 +642,139 @@ class PluginObject:
collectd.info("%s %s no failures" %
(PLUGIN, self.plugin))
##########################################################################
#
# Name : _get_instance_object
#
# Purpose : Safely get an object from the self instance object list
# indexed by eid.
#
##########################################################################
def _get_instance_object(self, eid):
"""
Safely get an object from the self instance object list indexed
by eid while locked.
:param eid:
:return: object or None
"""
try:
collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin))
PluginObject.lock.acquire()
obj = self.instance_objects[eid]
return obj
except:
collectd.error("%s failed to get instance from %s object list" %
(PLUGIN, self.plugin))
return None
finally:
collectd.debug("%s %s Get UnLock ..." % (PLUGIN, self.plugin))
PluginObject.lock.release()
##########################################################################
#
# Name : _add_instance_object
#
# Purpose : Safely add an object to the self instance object list
# indexed by eid while locked. if found locked the instance
# add will be re-attempted on next sample.
#
##########################################################################
def _add_instance_object(self, obj, eid):
"""
Update self instance_objects list while locked
:param obj: the object to add
:param eid: indexed by this eid
:return: nothing
"""
try:
collectd.debug("%s %s Add Lock ..." % (PLUGIN, self.plugin))
PluginObject.lock.acquire()
self.instance_objects[eid] = obj
except:
collectd.error("%s failed to add instance to %s object list" %
(PLUGIN, self.plugin))
finally:
collectd.debug("%s %s Add UnLock ..." % (PLUGIN, self.plugin))
PluginObject.lock.release()
##########################################################################
#
# Name : _copy_instance_object
#
# Purpose : Copy select members of self object to target object.
#
##########################################################################
def _copy_instance_object(self, object):
"""
Copy select members of self object to target object
"""
object.resource_name = self.resource_name
object.instance_name = self.instance_name
object.reading_type = self.reading_type
object.reason_warning = self.reason_warning
object.reason_failure = self.reason_failure
object.repair = self.repair
object.alarm_type = self.alarm_type
object.cause = self.cause
object.suppression = self.suppression
object.service_affecting = self.service_affecting
##########################################################################
#
# Name : _create_instance_object
#
# Purpose : Create a new instance object and tack it on the supplied base
# object's instance object dictionary.
#
##########################################################################
def _create_instance_object(self, instance):
try:
# create a new plugin object
inst_obj = PluginObject(self.id, self.plugin)
self._copy_instance_object(inst_obj)
# initialize the object with instance specific data
inst_obj.instance_name = instance
inst_obj.entity_id = _build_entity_id(self.plugin,
instance)
self._add_instance_object(inst_obj, inst_obj.entity_id)
collectd.debug("%s created %s instance (%s) object %s" %
(PLUGIN, inst_obj.resource_name,
inst_obj.entity_id, inst_obj))
collectd.debug("%s monitoring %s %s %s" %
(PLUGIN,
inst_obj.resource_name,
inst_obj.instance_name,
inst_obj.reading_type))
return inst_obj
except:
collectd.error("%s %s:%s inst object create failed" %
(PLUGIN, inst_obj.resource_name, instance))
return None
##########################################################################
#
# Name : _create_instance_objects
#
# Purpose : Create a list of instance objects for 'self' type plugin and
# add those objects to the parnet's instance_objects dictionary.
# add those objects to the parent's instance_objects dictionary.
#
# Note : This is currently only used for the DF (filesystem) plugin.
# All other instance creations/allocations are done on-demand.
#
##########################################################################
def _create_instance_objects(self):
@ -612,11 +782,7 @@ class PluginObject:
Create, initialize and add an instance object to this/self plugin
"""
# ADD_NEW_PLUGIN: for plugins that have instances you need to
# add support for creating those instances and adding
# those instances to the parent instance_objects list.
# Currently only the DF plugin has subordinate instance objects.
# Create the File System subordinate instance objects.
if self.id == ALARM_ID__DF:
# read the df.conf file and return/get a list of mount points
@ -651,6 +817,7 @@ class PluginObject:
# initialize the object with instance specific data
inst_obj.resource_name = self.resource_name
inst_obj.instance_name = mp
inst_obj.instance = mp
# build the plugin instance name from the mount point
if mp == '/':
inst_obj.plugin_instance = 'root'
@ -662,21 +829,30 @@ class PluginObject:
# add this subordinate object to the parent's
# instance object list
self.instance_objects[inst_obj.entity_id] = inst_obj
self._add_instance_object(inst_obj, inst_obj.entity_id)
collectd.info("%s monitoring %s usage" %
(PLUGIN, mp))
(PLUGIN, inst_obj.instance))
PluginObject.host = os.uname()[1]
# ADD_NEW_PLUGIN: add plugin to this table
# This instanciates the plugin objects
PLUGINS = {PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU),
PLUGIN__MEM: PluginObject(ALARM_ID__MEM, PLUGIN__MEM),
PLUGIN__DF: PluginObject(ALARM_ID__DF, PLUGIN__DF),
PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)}
# This instantiates the plugin objects
PLUGINS = {
PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU),
PLUGIN__MEM: PluginObject(ALARM_ID__MEM, PLUGIN__MEM),
PLUGIN__DF: PluginObject(ALARM_ID__DF, PLUGIN__DF),
PLUGIN__VSWITCH_CPU: PluginObject(ALARM_ID__VSWITCH_CPU,
PLUGIN__VSWITCH_CPU),
PLUGIN__VSWITCH_MEM: PluginObject(ALARM_ID__VSWITCH_MEM,
PLUGIN__VSWITCH_MEM),
PLUGIN__VSWITCH_PORT: PluginObject(ALARM_ID__VSWITCH_PORT,
PLUGIN__VSWITCH_PORT),
PLUGIN__VSWITCH_IFACE: PluginObject(ALARM_ID__VSWITCH_IFACE,
PLUGIN__VSWITCH_IFACE),
PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)}
def _get_base_object(alarm_id):
@ -689,21 +865,6 @@ def _get_base_object(alarm_id):
return None
def _get_object(alarm_id, eid):
"""
Get the plugin object for the specified alarm id and eid
"""
base_obj = _get_base_object(alarm_id)
if len(base_obj.instance_objects):
try:
return(base_obj.instance_objects[eid])
except:
collectd.debug("%s %s has no instance objects" %
(PLUGIN, base_obj.plugin))
return base_obj
def is_uuid_like(val):
"""Returns validation of a value as a UUID.
@ -721,10 +882,38 @@ def _build_entity_id(plugin, plugin_instance):
Builds an entity id string based on the collectd notification object.
"""
inst_error = False
entity_id = 'host='
entity_id += PluginObject.host
if plugin == PLUGIN__DF:
if plugin == PLUGIN__VSWITCH_MEM:
# host=<hostname>.processor=<socket-id>
if plugin_instance:
entity_id += '.processor=' + plugin_instance
else:
inst_error = True
elif plugin == PLUGIN__VSWITCH_IFACE:
# host=<hostname>.interface=<if-uuid>
if plugin_instance:
entity_id += '.interface=' + plugin_instance
else:
inst_error = True
elif plugin == PLUGIN__VSWITCH_PORT:
# host=<hostname>.port=<port-uuid>
if plugin_instance:
entity_id += '.port=' + plugin_instance
else:
inst_error = True
elif plugin == PLUGIN__DF:
# host=<hostname>.filesystem=<mountpoint>
if plugin_instance:
instance = plugin_instance
@ -740,7 +929,18 @@ def _build_entity_id(plugin, plugin_instance):
instance = instance.replace('-', '/')
entity_id += instance
# collectd.info("%s entity_id : %s" % (PLUGIN, entity_id))
# Will be uncommented when the numa memory monitor is added
# to the platform memory plugin.
#
#elif plugin == PLUGIN__MEM:
# if plugin_instance is not 'platform':
# # host=controller-0.numa=node0
# entity_id += '.numa='
# entity_id += plugin_instance
if inst_error is True:
collectd.error("%s eid build failed ; missing instance" % plugin)
return None
return entity_id
@ -773,37 +973,77 @@ def _get_df_mountpoints():
return(mountpoints)
def _print_obj(obj):
"""
Print a single object
"""
base_object = False
for plugin in PLUGIN_NAME_LIST:
if PLUGINS[plugin] == obj:
base_object = True
break
num = len(obj.instance_objects)
if num > 0 or base_object is True:
prefix = "PLUGIN "
if num:
prefix += str(num)
else:
prefix += " "
else:
prefix = "INSTANCE"
if obj.plugin_instance:
resource = obj.plugin + ":" + obj.plugin_instance
else:
resource = obj.plugin
collectd.info("%s %s res: %s name: %s\n" %
(PLUGIN, prefix, resource, obj.resource_name))
collectd.info("%s eid : %s\n" % (PLUGIN, obj.entity_id))
collectd.info("%s inst: %s name: %s\n" %
(PLUGIN, obj.instance, obj.instance_name))
collectd.info("%s value:%2.1f thld:%2.1f cause:%s (%d) type:%s" %
(PLUGIN,
obj.value,
obj.threshold,
obj.cause,
obj.count,
obj.reading_type))
collectd.info("%s warn:%s fail:%s" %
(PLUGIN, obj.warnings, obj.failures))
collectd.info("%s repair:t: %s" %
(PLUGIN, obj.repair))
if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50:
collectd.info("%s reason:w: %s\n"
"%s reason:f: %s\n" %
(PLUGIN, obj.reason_warning,
PLUGIN, obj.reason_failure))
# collectd.info(" ")
def _print_state(obj=None):
"""
Print the current object state
"""
objs = []
if obj is None:
objs.append(_get_base_object(ALARM_ID__CPU))
objs.append(_get_base_object(ALARM_ID__MEM))
objs.append(_get_base_object(ALARM_ID__DF))
else:
objs.append(obj)
for o in objs:
collectd.info("%s PLUGIN %2d [%6s:%2.2f:%s] [w:%s f:%s] %d" %
(PLUGIN,
len(o.instance_objects),
o.plugin,
o.value,
o.entity_id,
o.warnings,
o.failures,
o.count))
if len(o.instance_objects):
for inst_obj in o.instance_objects:
collectd.info("%s INSTANCE [%6s:%2.2f:%s] [w:%s f:%s] %d" %
(PLUGIN,
inst_obj.plugin,
inst_obj.value,
inst_obj.entity_id,
inst_obj.warnings,
inst_obj.failures,
inst_obj.count))
try:
objs = []
if obj is None:
for plugin in PLUGIN_NAME_LIST:
objs.append(PLUGINS[plugin])
else:
objs.append(obj)
collectd.debug("%s _print_state Lock ..." % PLUGIN)
PluginObject.lock.acquire()
for o in objs:
_print_obj(o)
if len(o.instance_objects):
for inst_obj in o.instance_objects:
_print_obj(o.instance_objects[inst_obj])
finally:
collectd.debug("%s _print_state UnLock ..." % PLUGIN)
PluginObject.lock.release()
def _database_setup(database):
@ -843,14 +1083,14 @@ def _database_setup(database):
############################################################
PluginObject.dbObj.create_retention_policy(
'collectd samples', '4w', 1, database, True)
DATABASE_NAME, '4w', 1, database, True)
except Exception as ex:
if str(ex) == 'database already exists':
try:
collectd.info("%s influxdb:collectd %s" %
(PLUGIN, str(ex)))
PluginObject.dbObj.create_retention_policy(
'collectd samples', '4w', 1, database, True)
DATABASE_NAME, '4w', 1, database, True)
except Exception as ex:
if str(ex) == 'retention policy already exists':
collectd.info("%s influxdb:collectd %s" %
@ -864,15 +1104,21 @@ def _database_setup(database):
error_str = "failed to connect to influxdb:" + database
if not error_str:
found = False
retention = \
PluginObject.dbObj.get_list_retention_policies(database)
collectd.info("%s influxdb:%s samples retention policy: %s" %
(PLUGIN, database, retention))
collectd.info("%s influxdb:%s is setup" % (PLUGIN, database))
PluginObject.database_setup = True
else:
collectd.error("%s influxdb:%s setup %s" %
(PLUGIN, database, error_str))
for r in range(len(retention)):
if retention[r]["name"] == DATABASE_NAME:
collectd.info("%s influxdb:%s samples retention "
"policy: %s" %
(PLUGIN, database, retention[r]))
found = True
if found is True:
collectd.info("%s influxdb:%s is setup" % (PLUGIN, database))
PluginObject.database_setup = True
else:
collectd.error("%s influxdb:%s retention policy NOT setup" %
(PLUGIN, database))
def _clear_alarm_for_missing_filesystems():
@ -892,10 +1138,11 @@ def _clear_alarm_for_missing_filesystems():
if len(alarm_list):
for eid in alarm_list:
# search for any of them that might be alarmed.
obj = df_base_obj.instance_objects[eid]
obj = df_base_obj._get_instance_object(eid)
# only care about df (file system plugins)
if obj.plugin == PLUGIN__DF and \
if obj is not None and \
obj.plugin == PLUGIN__DF and \
obj.entity_id == eid and \
obj.plugin_instance != 'root':
@ -912,7 +1159,6 @@ def _clear_alarm_for_missing_filesystems():
else:
collectd.debug("%s maintaining alarm for %s" %
(PLUGIN, path))
return 0
# Collectd calls this function on startup.
@ -921,6 +1167,8 @@ def _clear_alarm_for_missing_filesystems():
def init_func():
""" Collectd FM Notifier Initialization Function """
PluginObject.lock = Lock()
PluginObject.host = os.uname()[1]
collectd.info("%s %s:%s init function" %
(PLUGIN, tsc.nodetype, PluginObject.host))
@ -933,15 +1181,19 @@ def init_func():
obj.repair += "contact next level of support."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
###########################################################################
# Constant Memory Plugin Object settings
obj = PLUGINS[PLUGIN__MEM]
obj.resource_name = "Memory"
obj.resource_name = "Platform Memory"
obj.instance_name = PLUGIN__MEM
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support; "
obj.repair += "may require additional memory on Host."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
###########################################################################
# Constant FileSystem Plugin Object settings
obj = PLUGINS[PLUGIN__DF]
obj.resource_name = "File System"
@ -954,6 +1206,63 @@ def init_func():
# Create one DF instance object per mount point
obj._create_instance_objects()
# ntp query is for controllers only
if tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:
#######################################################################
# Constant vSwitch CPU Usage Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_CPU]
obj.resource_name = "vSwitch CPU"
obj.instance_name = PLUGIN__VSWITCH_CPU
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
#######################################################################
# Constant vSwitch Memory Usage Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_MEM]
obj.resource_name = "vSwitch Memory"
obj.instance_name = PLUGIN__VSWITCH_MEM
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support."
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
#######################################################################
# Constant vSwitch Port State Monitor Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_PORT]
obj.resource_name = "vSwitch Port"
obj.instance_name = PLUGIN__VSWITCH_PORT
obj.reading_type = "state"
obj.reason_failure = "'Data' Port failed."
obj.reason_warning = "'Data' Port failed."
obj.repair = "Check cabling and far-end port configuration and "
obj.repair += "status on adjacent equipment."
obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT
obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL
obj.service_affecting = True
collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name))
#######################################################################
# Constant vSwitch Interface State Monitor Plugin Object settings
obj = PLUGINS[PLUGIN__VSWITCH_IFACE]
obj.resource_name = "vSwitch Interface"
obj.instance_name = PLUGIN__VSWITCH_IFACE
obj.reading_type = "state"
obj.reason_failure = "'Data' Interface failed."
obj.reason_warning = "'Data' Interface degraded."
obj.repair = "Check cabling and far-end port configuration and "
obj.repair += "status on adjacent equipment."
obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT
obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL
obj.service_affecting = True
collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name))
###########################################################################
obj = PLUGINS[PLUGIN__EXAMPLE]
obj.resource_name = "Example"
obj.instance_name = PLUGIN__EXAMPLE
@ -981,6 +1290,7 @@ def init_func():
alarms = api.get_faults_by_id(alarm_id)
if alarms:
for alarm in alarms:
want_alarm_clear = False
eid = alarm.entity_instance_id
# ignore alarms not for this host
if PluginObject.host not in eid:
@ -988,28 +1298,31 @@ def init_func():
base_obj = _get_base_object(alarm_id)
if base_obj is None:
# Handle unrecognized alarm by clearing it ;
# should never happen since we are iterating
# over an internal alarm_id list.
# might be a plugin instance - clear it
want_alarm_clear = True
collectd.info('%s found %s %s alarm [%s]' %
(PLUGIN,
alarm.severity,
alarm_id,
eid))
if want_alarm_clear is True:
if api.clear_fault(alarm_id, eid) is False:
collectd.error("%s %s:%s not found ; clear failed" %
collectd.error("%s %s:%s clear failed" %
(PLUGIN,
alarm_id,
eid))
else:
collectd.error("%s %s:%s not found ; cleared" %
(PLUGIN,
alarm_id,
eid))
collectd.info("%s clear %s %s alarm %s" %
(PLUGIN,
alarm.severity,
alarm_id,
eid))
continue
collectd.info('%s found %s alarm with %s severity [%s:%s:%s]' %
(PLUGIN,
base_obj.id,
alarm.severity,
base_obj.plugin,
alarm_id,
eid))
if alarm.severity == "critical":
sev = "failure"
elif alarm.severity == "major":
@ -1019,7 +1332,8 @@ def init_func():
continue
# Load the alarm severity by doing a plugin/instance lookup.
base_obj._manage_alarm(eid, sev)
if base_obj is not None:
base_obj._manage_alarm(eid, sev)
# The notifier function inspects the collectd notification and determines if
@ -1067,27 +1381,68 @@ def notifier_func(nObject):
base_obj = obj = PLUGINS[nObject.plugin]
# if this notification is for a plugin instance then get that
# instances's object instead. if that object does not yet exists
# then create it
# instances's object instead.
# If that object does not yet exists then create it.
eid = ''
if nObject.plugin_instance:
# DF instances are statically allocated
if nObject.plugin == PLUGIN__DF:
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
# get this instances object
obj = base_obj._get_instance_object(eid)
if obj is None:
# path should never be hit since all DF instances
# are statically allocated.
return 0
elif nObject.plugin_instance:
need_instance_object_create = False
# Build the entity_id from the parent object if needed
# Build the entity_id from the parent object if needed
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
try:
# Need lock when reading/writing any obj.instance_objects list
collectd.debug("%s %s lock" % (PLUGIN, nObject.plugin))
PluginObject.lock.acquire()
#collectd.info("%s Object Search eid: %s" %
# (nObject.plugin, eid))
#for o in base_obj.instance_objects:
# collectd.error("%s %s inst object dict item %s : %s" %
# (PLUGIN, nObject.plugin, o,
# base_obj.instance_objects[o]))
# we will take an exception if this object is not in the list.
# the exception handling code below will create and add this
# object for success path the next time around.
inst_obj = base_obj.instance_objects[eid]
if inst_obj is None:
collectd.error("%s %s:%s instance object is None" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
return 0
collectd.debug("%s %s instance %s already exists %s" %
(PLUGIN, nObject.plugin, eid, inst_obj))
# _print_state(inst_obj)
except:
# o.k. , not in the list yet, lets create one
collectd.error("%s %s:%s instance object not found" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
return 0
need_instance_object_create = True
finally:
collectd.debug("%s %s unlock" % (PLUGIN, nObject.plugin))
PluginObject.lock.release()
if need_instance_object_create is True:
base_obj._create_instance_object(nObject.plugin_instance)
inst_obj = base_obj._get_instance_object(eid)
if inst_obj:
collectd.debug("%s %s:%s inst object created" %
(PLUGIN,
inst_obj.plugin,
inst_obj.instance))
else:
collectd.error("%s %s:%s inst object create failed" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
return 0
# re-assign the object
obj = inst_obj
@ -1096,13 +1451,6 @@ def notifier_func(nObject):
# Build the entity_id from the parent object if needed
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
# TODO: Needed ?
if not len(obj.instance):
obj.instance = nObject.plugin
if nObject.plugin_instance:
obj.instance += '_' + nObject.plugin_instance
# TODO: Needed ?
# update the object with the eid if its not already set.
if not len(obj.entity_id):
obj.entity_id = eid
@ -1112,7 +1460,8 @@ def notifier_func(nObject):
(PLUGIN, nObject.plugin, nObject.plugin_instance))
return 0
# _print_state(obj)
# if obj.warnings or obj.failures:
# _print_state(obj)
# If want_state_audit is True then run the audit.
# Primarily used for debug
@ -1143,21 +1492,32 @@ def notifier_func(nObject):
return 0
if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
if api.clear_fault(base_obj.id, obj.entity_id) is False:
if api.clear_fault(obj.id, obj.entity_id) is False:
collectd.error("%s %s:%s clear_fault failed" %
(PLUGIN, base_obj.id, obj.entity_id))
return 0
else:
reason = obj.resource_name
reason += " threshold exceeded"
if obj.threshold:
reason += "; {:2.0f}".format(obj.threshold) + "%"
# reason += "; {:2.2f}".format(obj.threshold) + "%"
if obj.value:
reason += ", actual " + "{:2.0f}".format(obj.value) + "%"
# manage addition of the failure reason text
if obj.cause == fm_constants.ALARM_PROBABLE_CAUSE_50:
# if this is a threshold alarm then build the reason text that
# includes the threahold and the reading that caused the assertion.
reason = obj.resource_name
reason += " threshold exceeded"
if obj.threshold:
reason += "; threshold {:2.0f} ".format(obj.threshold) + "%, "
if obj.value:
reason += "actual {:2.0f}".format(obj.value) + "%"
elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
reason = obj.reason_failure
else:
reason = obj.reason_warning
# build the alarm object
fault = fm_api.Fault(
alarm_id=base_obj.id,
alarm_id=obj.id,
alarm_state=_alarm_state,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=obj.entity_id,
@ -1191,5 +1551,8 @@ def notifier_func(nObject):
# Debug only: comment out for production code.
# obj._state_audit("change")
return 0
collectd.register_init(init_func)
collectd.register_notification(notifier_func)

View File

@ -39,6 +39,7 @@
import os
import socket
import collectd
import tsconfig.tsconfig as tsc
# This plugin name
PLUGIN = 'degrade notifier'
@ -65,6 +66,13 @@ ONE_EVERY = 10
PLUGIN__DF = 'df'
PLUGIN__MEM = 'memory'
PLUGIN__CPU = 'cpu'
PLUGIN__VSWITCH_MEM = 'vswitch_mem'
PLUGIN__VSWITCH_CPU = 'vswitch_cpu'
PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"
PLUGIN_INTERFACE = 'interface'
PLUGIN__EXAMPLE = 'example'
@ -89,6 +97,10 @@ class collectdMtceNotifierObject:
self.degrade_list__failure = [PLUGIN__DF,
PLUGIN__MEM,
PLUGIN__CPU,
PLUGIN__VSWITCH_MEM,
PLUGIN__VSWITCH_CPU,
PLUGIN__VSWITCH_PORT,
PLUGIN__VSWITCH_IFACE,
PLUGIN_INTERFACE,
PLUGIN__EXAMPLE]
self.degrade_list__warning = []
@ -172,7 +184,7 @@ def config_func(config):
Configure the maintenance degrade notifier plugin.
"""
collectd.info('%s config function' % PLUGIN)
collectd.debug('%s config function' % PLUGIN)
for node in config.children:
key = node.key.lower()
val = node.values[0]
@ -194,6 +206,10 @@ def init_func():
Collectd Mtce Notifier Initialization Function
"""
obj.host = os.uname()[1]
collectd.info("%s %s:%s sending to mtce port %d" %
(PLUGIN, tsc.nodetype, obj.host, obj.port))
collectd.debug("%s init function" % PLUGIN)
@ -241,8 +257,8 @@ def notifier_func(nObject):
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.debug("%s %s added to degrade list" %
(PLUGIN, resource))
collectd.info("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:
# If severity is failure and no failures cause degrade
@ -264,8 +280,8 @@ def notifier_func(nObject):
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.debug("%s %s added to degrade list" %
(PLUGIN, resource))
collectd.info("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:
# If severity is warning and no warnings cause degrade