diff --git a/monitoring/collectd-extensions/src/fm_notifier.py b/monitoring/collectd-extensions/src/fm_notifier.py index 73b7916bc..ba458dc2e 100755 --- a/monitoring/collectd-extensions/src/fm_notifier.py +++ b/monitoring/collectd-extensions/src/fm_notifier.py @@ -23,17 +23,17 @@ # Collects provides information about each event as an object passed to the # notification handler ; the notification object. # -# object.host - the hostname +# object.host - the hostname. # -# object.plugin - the name of the plugin aka resource +# object.plugin - the name of the plugin aka resource. # object.plugin_instance - plugin instance string i.e. say mountpoint -# for df plugin -# object.type, - the unit i.e. percent or absolute -# object.type_instance - the attribute i.e. free, used, etc +# for df plugin or numa? node for memory. +# object.type, - the unit i.e. percent or absolute. +# object.type_instance - the attribute i.e. free, used, etc. # -# object.severity - a integer value 0=OK , 1=warning, 2=failure +# object.severity - a integer value 0=OK , 1=warning, 2=failure. # object.message - a log-able message containing the above along -# with the value +# with the value. # # This notifier uses the notification object to manage plugin/instance alarms. # @@ -86,6 +86,7 @@ import os import re import uuid import collectd +from threading import RLock as Lock from fm_api import constants as fm_constants from fm_api import fm_api import tsconfig.tsconfig as tsc @@ -116,6 +117,12 @@ PLUGIN = 'alarm notifier' # Path to the plugin's drop dir PLUGIN_PATH = '/etc/collectd.d/' +# the name of the collectd samples database +DATABASE_NAME = 'collectd samples' + +READING_TYPE__PERCENT_USAGE = '% usage' + + # collectd severity definitions ; # Note: can't seem to pull then in symbolically with a header NOTIF_FAILURE = 1 @@ -145,6 +152,7 @@ mangled_list = {"dev-shm", "etc-nova-instances", "opt-platform", "opt-cgcs", + "opt-etcd", "opt-extension", "opt-backups"} @@ -154,10 +162,20 @@ ALARM_ID__MEM = "100.103" ALARM_ID__DF = "100.104" ALARM_ID__EXAMPLE = "100.113" +ALARM_ID__VSWITCH_CPU = "100.102" +ALARM_ID__VSWITCH_MEM = "100.115" +ALARM_ID__VSWITCH_PORT = "300.001" +ALARM_ID__VSWITCH_IFACE = "300.002" + + # ADD_NEW_PLUGIN: add new alarm id to the list ALARM_ID_LIST = [ALARM_ID__CPU, ALARM_ID__MEM, ALARM_ID__DF, + ALARM_ID__VSWITCH_CPU, + ALARM_ID__VSWITCH_MEM, + ALARM_ID__VSWITCH_PORT, + ALARM_ID__VSWITCH_IFACE, ALARM_ID__EXAMPLE] # ADD_NEW_PLUGIN: add plugin name definition @@ -168,38 +186,29 @@ PLUGIN__CPU = "cpu" PLUGIN__MEM = "memory" PLUGIN__INTERFACE = "interface" PLUGIN__NTP_QUERY = "ntpq" -PLUGIN__VSWITCH_PORT = "vswitch-port" -PLUGIN__VSWITCH_CPU = "vswitch-cpu" -PLUGIN__VSWITCH_MEM = "vswitch-memory" -PLUGIN__VSWITCH_OVSDB = "vswitch-ovsdb" -PLUGIN__VSWITCH_OPENFLOW = "vswitch-openflow" -PLUGIN__VSWITCH_LACP_IFACE = "vswitch-lacp-iface" -PLUGIN__VSWITCH_IFACE = "vswitch-iface" -PLUGIN__NOVA_THINPOOL_LVM = "nova-thinpool-lvm" -PLUGIN__CINDER_THINPOOL_LVM = "cinder-thinpool-lvm" -PLUGIN__CINDER_THINPOOL_LVM_META = "cinder-thinpool-lvm-meta" +PLUGIN__VSWITCH_PORT = "vswitch_port" +PLUGIN__VSWITCH_CPU = "vswitch_cpu" +PLUGIN__VSWITCH_MEM = "vswitch_mem" +PLUGIN__VSWITCH_IFACE = "vswitch_iface" PLUGIN__EXAMPLE = "example" # ADD_NEW_PLUGIN: add plugin name to list PLUGIN_NAME_LIST = [PLUGIN__CPU, PLUGIN__MEM, PLUGIN__DF, + PLUGIN__VSWITCH_CPU, + PLUGIN__VSWITCH_MEM, + PLUGIN__VSWITCH_PORT, + PLUGIN__VSWITCH_IFACE, PLUGIN__EXAMPLE] -# ADD_NEW_PLUGIN: add alarm id and plugin to dictionary -# ALARM_ID_TO_PLUGIN_DICT = {} -# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__CPU] = PLUGIN__CPU -# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__MEM] = PLUGIN__MEM -# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__DF] = PLUGIN__DF -# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__EXAMPLE] = PLUGIN__EXAMPLE - - # PluginObject Class class PluginObject: dbObj = None # shared database connection obj host = None # saved hostname + lock = None # global lock for mread_func mutex database_setup = False # state of database setup database_setup_in_progress = False # connection mutex @@ -213,7 +222,7 @@ class PluginObject: self.plugin = plugin # name of the plugin ; df, cpu, memory ... self.plugin_instance = "" # the instance name for the plugin self.resource_name = "" # The top level name of the resource - self.instance_name = "" # The instanhce name + self.instance_name = "" # The instance name # Instance specific learned static class members. self.entity_id = "" # fm entity id host=. @@ -225,12 +234,17 @@ class PluginObject: self.value = float(0) # float value of reading # Common static class members. + self.reason_warning = "" + self.reason_failure = "" self.repair = "" - self.alarm_type = fm_constants.FM_ALARM_TYPE_7 - self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 + self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL + self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS self.suppression = True self.service_affecting = False + # default most reading types are usage + self.reading_type = READING_TYPE__PERCENT_USAGE + # Severity tracking lists. # Maintains severity state between notifications. # Each is a list of entity ids for severity asserted alarms. @@ -329,7 +343,11 @@ class PluginObject: # filter out messages to ignore ; notifications that have no value if "has not been updated for" in nObject.message: - collectd.debug("%s NOT UPDATED: %s" % (PLUGIN, self.entity_id)) + collectd.info("%s %s %s (%s)" % + (PLUGIN, + self.entity_id, + nObject.message, + nObject.severity)) return "done" # Get the value from the notification message. @@ -363,8 +381,8 @@ class PluginObject: # validate the reading try: self.value = float(self.values[0]) - # get the threshold if its there - if len(self.values) == 2: + # get the threshold if its there. + if len(self.values) > 1: self.threshold = float(self.values[1]) except ValueError as ex: @@ -390,6 +408,9 @@ class PluginObject: logit = False if self.count == 0 or LOG_STEP == 0: logit = True + elif self.reading_type == "connections": + if self.value != last: + logit = True elif self.value > last: if (last + LOG_STEP) < self.value: logit = True @@ -401,18 +422,40 @@ class PluginObject: # # Note: only usage type so far if logit: - reading_type = "% usage" - tmp = str(self.value).split('.') - if len(tmp[0]) == 1: - pre = ': ' - else: - pre = ': ' - collectd.info("%s reading%s%2.2f %s - %s" % - (PLUGIN, - pre, - self.value, - reading_type, - self.instance_name)) + resource = self.resource_name + + # setup resource name for filesystem instance usage log + if self.plugin == PLUGIN__DF: + resource = self.instance + + # setup resource name for vswitch process instance name + elif self.plugin == PLUGIN__VSWITCH_MEM: + resource += ' Processor ' + resource += self.instance_name + + if self.reading_type == READING_TYPE__PERCENT_USAGE: + tmp = str(self.value).split('.') + if len(tmp[0]) == 1: + pre = ': ' + else: + pre = ': ' + collectd.info("%s reading%s%2.2f %s - %s" % + (PLUGIN, + pre, + self.value, + self.reading_type, + resource)) + + elif self.reading_type == "connections" and \ + self.instance_objects and \ + self.value != self.last_value: + if self.instance_objects: + collectd.info("%s monitor: %2d %s - %s" % + (PLUGIN, + self.value, + self.reading_type, + resource)) + self.last_value = float(self.value) ########################################################################## @@ -599,12 +642,139 @@ class PluginObject: collectd.info("%s %s no failures" % (PLUGIN, self.plugin)) + ########################################################################## + # + # Name : _get_instance_object + # + # Purpose : Safely get an object from the self instance object list + # indexed by eid. + # + ########################################################################## + def _get_instance_object(self, eid): + """ + Safely get an object from the self instance object list indexed + by eid while locked. + :param eid: + :return: object or None + """ + + try: + collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin)) + PluginObject.lock.acquire() + + obj = self.instance_objects[eid] + return obj + except: + collectd.error("%s failed to get instance from %s object list" % + (PLUGIN, self.plugin)) + return None + + finally: + collectd.debug("%s %s Get UnLock ..." % (PLUGIN, self.plugin)) + PluginObject.lock.release() + + ########################################################################## + # + # Name : _add_instance_object + # + # Purpose : Safely add an object to the self instance object list + # indexed by eid while locked. if found locked the instance + # add will be re-attempted on next sample. + # + ########################################################################## + def _add_instance_object(self, obj, eid): + """ + Update self instance_objects list while locked + :param obj: the object to add + :param eid: indexed by this eid + :return: nothing + """ + try: + collectd.debug("%s %s Add Lock ..." % (PLUGIN, self.plugin)) + PluginObject.lock.acquire() + + self.instance_objects[eid] = obj + except: + collectd.error("%s failed to add instance to %s object list" % + (PLUGIN, self.plugin)) + + finally: + collectd.debug("%s %s Add UnLock ..." % (PLUGIN, self.plugin)) + PluginObject.lock.release() + + ########################################################################## + # + # Name : _copy_instance_object + # + # Purpose : Copy select members of self object to target object. + # + ########################################################################## + def _copy_instance_object(self, object): + """ + Copy select members of self object to target object + """ + + object.resource_name = self.resource_name + object.instance_name = self.instance_name + object.reading_type = self.reading_type + + object.reason_warning = self.reason_warning + object.reason_failure = self.reason_failure + object.repair = self.repair + + object.alarm_type = self.alarm_type + object.cause = self.cause + object.suppression = self.suppression + object.service_affecting = self.service_affecting + + ########################################################################## + # + # Name : _create_instance_object + # + # Purpose : Create a new instance object and tack it on the supplied base + # object's instance object dictionary. + # + ########################################################################## + def _create_instance_object(self, instance): + + try: + # create a new plugin object + inst_obj = PluginObject(self.id, self.plugin) + self._copy_instance_object(inst_obj) + + # initialize the object with instance specific data + inst_obj.instance_name = instance + inst_obj.entity_id = _build_entity_id(self.plugin, + instance) + + self._add_instance_object(inst_obj, inst_obj.entity_id) + + collectd.debug("%s created %s instance (%s) object %s" % + (PLUGIN, inst_obj.resource_name, + inst_obj.entity_id, inst_obj)) + + collectd.debug("%s monitoring %s %s %s" % + (PLUGIN, + inst_obj.resource_name, + inst_obj.instance_name, + inst_obj.reading_type)) + + return inst_obj + + except: + collectd.error("%s %s:%s inst object create failed" % + (PLUGIN, inst_obj.resource_name, instance)) + return None + ########################################################################## # # Name : _create_instance_objects # # Purpose : Create a list of instance objects for 'self' type plugin and - # add those objects to the parnet's instance_objects dictionary. + # add those objects to the parent's instance_objects dictionary. + # + # Note : This is currently only used for the DF (filesystem) plugin. + # All other instance creations/allocations are done on-demand. # ########################################################################## def _create_instance_objects(self): @@ -612,11 +782,7 @@ class PluginObject: Create, initialize and add an instance object to this/self plugin """ - # ADD_NEW_PLUGIN: for plugins that have instances you need to - # add support for creating those instances and adding - # those instances to the parent instance_objects list. - - # Currently only the DF plugin has subordinate instance objects. + # Create the File System subordinate instance objects. if self.id == ALARM_ID__DF: # read the df.conf file and return/get a list of mount points @@ -651,6 +817,7 @@ class PluginObject: # initialize the object with instance specific data inst_obj.resource_name = self.resource_name inst_obj.instance_name = mp + inst_obj.instance = mp # build the plugin instance name from the mount point if mp == '/': inst_obj.plugin_instance = 'root' @@ -662,21 +829,30 @@ class PluginObject: # add this subordinate object to the parent's # instance object list - self.instance_objects[inst_obj.entity_id] = inst_obj + self._add_instance_object(inst_obj, inst_obj.entity_id) collectd.info("%s monitoring %s usage" % - (PLUGIN, mp)) + (PLUGIN, inst_obj.instance)) PluginObject.host = os.uname()[1] # ADD_NEW_PLUGIN: add plugin to this table -# This instanciates the plugin objects -PLUGINS = {PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU), - PLUGIN__MEM: PluginObject(ALARM_ID__MEM, PLUGIN__MEM), - PLUGIN__DF: PluginObject(ALARM_ID__DF, PLUGIN__DF), - PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)} +# This instantiates the plugin objects +PLUGINS = { + PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU), + PLUGIN__MEM: PluginObject(ALARM_ID__MEM, PLUGIN__MEM), + PLUGIN__DF: PluginObject(ALARM_ID__DF, PLUGIN__DF), + PLUGIN__VSWITCH_CPU: PluginObject(ALARM_ID__VSWITCH_CPU, + PLUGIN__VSWITCH_CPU), + PLUGIN__VSWITCH_MEM: PluginObject(ALARM_ID__VSWITCH_MEM, + PLUGIN__VSWITCH_MEM), + PLUGIN__VSWITCH_PORT: PluginObject(ALARM_ID__VSWITCH_PORT, + PLUGIN__VSWITCH_PORT), + PLUGIN__VSWITCH_IFACE: PluginObject(ALARM_ID__VSWITCH_IFACE, + PLUGIN__VSWITCH_IFACE), + PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)} def _get_base_object(alarm_id): @@ -689,21 +865,6 @@ def _get_base_object(alarm_id): return None -def _get_object(alarm_id, eid): - """ - Get the plugin object for the specified alarm id and eid - """ - - base_obj = _get_base_object(alarm_id) - if len(base_obj.instance_objects): - try: - return(base_obj.instance_objects[eid]) - except: - collectd.debug("%s %s has no instance objects" % - (PLUGIN, base_obj.plugin)) - return base_obj - - def is_uuid_like(val): """Returns validation of a value as a UUID. @@ -721,10 +882,38 @@ def _build_entity_id(plugin, plugin_instance): Builds an entity id string based on the collectd notification object. """ + inst_error = False + entity_id = 'host=' entity_id += PluginObject.host - if plugin == PLUGIN__DF: + if plugin == PLUGIN__VSWITCH_MEM: + + # host=.processor= + if plugin_instance: + entity_id += '.processor=' + plugin_instance + else: + inst_error = True + + elif plugin == PLUGIN__VSWITCH_IFACE: + + # host=.interface= + if plugin_instance: + entity_id += '.interface=' + plugin_instance + else: + inst_error = True + + elif plugin == PLUGIN__VSWITCH_PORT: + + # host=.port= + if plugin_instance: + entity_id += '.port=' + plugin_instance + else: + inst_error = True + + elif plugin == PLUGIN__DF: + + # host=.filesystem= if plugin_instance: instance = plugin_instance @@ -740,7 +929,18 @@ def _build_entity_id(plugin, plugin_instance): instance = instance.replace('-', '/') entity_id += instance - # collectd.info("%s entity_id : %s" % (PLUGIN, entity_id)) + # Will be uncommented when the numa memory monitor is added + # to the platform memory plugin. + # + #elif plugin == PLUGIN__MEM: + # if plugin_instance is not 'platform': + # # host=controller-0.numa=node0 + # entity_id += '.numa=' + # entity_id += plugin_instance + + if inst_error is True: + collectd.error("%s eid build failed ; missing instance" % plugin) + return None return entity_id @@ -773,37 +973,77 @@ def _get_df_mountpoints(): return(mountpoints) +def _print_obj(obj): + """ + Print a single object + """ + base_object = False + for plugin in PLUGIN_NAME_LIST: + if PLUGINS[plugin] == obj: + base_object = True + break + + num = len(obj.instance_objects) + if num > 0 or base_object is True: + prefix = "PLUGIN " + if num: + prefix += str(num) + else: + prefix += " " + else: + prefix = "INSTANCE" + + if obj.plugin_instance: + resource = obj.plugin + ":" + obj.plugin_instance + else: + resource = obj.plugin + + collectd.info("%s %s res: %s name: %s\n" % + (PLUGIN, prefix, resource, obj.resource_name)) + collectd.info("%s eid : %s\n" % (PLUGIN, obj.entity_id)) + collectd.info("%s inst: %s name: %s\n" % + (PLUGIN, obj.instance, obj.instance_name)) + collectd.info("%s value:%2.1f thld:%2.1f cause:%s (%d) type:%s" % + (PLUGIN, + obj.value, + obj.threshold, + obj.cause, + obj.count, + obj.reading_type)) + collectd.info("%s warn:%s fail:%s" % + (PLUGIN, obj.warnings, obj.failures)) + collectd.info("%s repair:t: %s" % + (PLUGIN, obj.repair)) + if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50: + collectd.info("%s reason:w: %s\n" + "%s reason:f: %s\n" % + (PLUGIN, obj.reason_warning, + PLUGIN, obj.reason_failure)) + # collectd.info(" ") + + def _print_state(obj=None): """ Print the current object state """ - objs = [] - if obj is None: - objs.append(_get_base_object(ALARM_ID__CPU)) - objs.append(_get_base_object(ALARM_ID__MEM)) - objs.append(_get_base_object(ALARM_ID__DF)) - else: - objs.append(obj) - for o in objs: - collectd.info("%s PLUGIN %2d [%6s:%2.2f:%s] [w:%s f:%s] %d" % - (PLUGIN, - len(o.instance_objects), - o.plugin, - o.value, - o.entity_id, - o.warnings, - o.failures, - o.count)) - if len(o.instance_objects): - for inst_obj in o.instance_objects: - collectd.info("%s INSTANCE [%6s:%2.2f:%s] [w:%s f:%s] %d" % - (PLUGIN, - inst_obj.plugin, - inst_obj.value, - inst_obj.entity_id, - inst_obj.warnings, - inst_obj.failures, - inst_obj.count)) + try: + objs = [] + if obj is None: + for plugin in PLUGIN_NAME_LIST: + objs.append(PLUGINS[plugin]) + else: + objs.append(obj) + + collectd.debug("%s _print_state Lock ..." % PLUGIN) + PluginObject.lock.acquire() + for o in objs: + _print_obj(o) + if len(o.instance_objects): + for inst_obj in o.instance_objects: + _print_obj(o.instance_objects[inst_obj]) + finally: + collectd.debug("%s _print_state UnLock ..." % PLUGIN) + PluginObject.lock.release() def _database_setup(database): @@ -843,14 +1083,14 @@ def _database_setup(database): ############################################################ PluginObject.dbObj.create_retention_policy( - 'collectd samples', '4w', 1, database, True) + DATABASE_NAME, '4w', 1, database, True) except Exception as ex: if str(ex) == 'database already exists': try: collectd.info("%s influxdb:collectd %s" % (PLUGIN, str(ex))) PluginObject.dbObj.create_retention_policy( - 'collectd samples', '4w', 1, database, True) + DATABASE_NAME, '4w', 1, database, True) except Exception as ex: if str(ex) == 'retention policy already exists': collectd.info("%s influxdb:collectd %s" % @@ -864,15 +1104,21 @@ def _database_setup(database): error_str = "failed to connect to influxdb:" + database if not error_str: + found = False retention = \ PluginObject.dbObj.get_list_retention_policies(database) - collectd.info("%s influxdb:%s samples retention policy: %s" % - (PLUGIN, database, retention)) - collectd.info("%s influxdb:%s is setup" % (PLUGIN, database)) - PluginObject.database_setup = True - else: - collectd.error("%s influxdb:%s setup %s" % - (PLUGIN, database, error_str)) + for r in range(len(retention)): + if retention[r]["name"] == DATABASE_NAME: + collectd.info("%s influxdb:%s samples retention " + "policy: %s" % + (PLUGIN, database, retention[r])) + found = True + if found is True: + collectd.info("%s influxdb:%s is setup" % (PLUGIN, database)) + PluginObject.database_setup = True + else: + collectd.error("%s influxdb:%s retention policy NOT setup" % + (PLUGIN, database)) def _clear_alarm_for_missing_filesystems(): @@ -892,10 +1138,11 @@ def _clear_alarm_for_missing_filesystems(): if len(alarm_list): for eid in alarm_list: # search for any of them that might be alarmed. - obj = df_base_obj.instance_objects[eid] + obj = df_base_obj._get_instance_object(eid) # only care about df (file system plugins) - if obj.plugin == PLUGIN__DF and \ + if obj is not None and \ + obj.plugin == PLUGIN__DF and \ obj.entity_id == eid and \ obj.plugin_instance != 'root': @@ -912,7 +1159,6 @@ def _clear_alarm_for_missing_filesystems(): else: collectd.debug("%s maintaining alarm for %s" % (PLUGIN, path)) - return 0 # Collectd calls this function on startup. @@ -921,6 +1167,8 @@ def _clear_alarm_for_missing_filesystems(): def init_func(): """ Collectd FM Notifier Initialization Function """ + PluginObject.lock = Lock() + PluginObject.host = os.uname()[1] collectd.info("%s %s:%s init function" % (PLUGIN, tsc.nodetype, PluginObject.host)) @@ -933,15 +1181,19 @@ def init_func(): obj.repair += "contact next level of support." collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name)) + ########################################################################### + # Constant Memory Plugin Object settings obj = PLUGINS[PLUGIN__MEM] - obj.resource_name = "Memory" + obj.resource_name = "Platform Memory" obj.instance_name = PLUGIN__MEM obj.repair = "Monitor and if condition persists, " obj.repair += "contact next level of support; " obj.repair += "may require additional memory on Host." collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name)) + ########################################################################### + # Constant FileSystem Plugin Object settings obj = PLUGINS[PLUGIN__DF] obj.resource_name = "File System" @@ -954,6 +1206,63 @@ def init_func(): # Create one DF instance object per mount point obj._create_instance_objects() + # ntp query is for controllers only + if tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions: + + ####################################################################### + + # Constant vSwitch CPU Usage Plugin Object settings + obj = PLUGINS[PLUGIN__VSWITCH_CPU] + obj.resource_name = "vSwitch CPU" + obj.instance_name = PLUGIN__VSWITCH_CPU + obj.repair = "Monitor and if condition persists, " + obj.repair += "contact next level of support." + collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name)) + + ####################################################################### + + # Constant vSwitch Memory Usage Plugin Object settings + obj = PLUGINS[PLUGIN__VSWITCH_MEM] + obj.resource_name = "vSwitch Memory" + obj.instance_name = PLUGIN__VSWITCH_MEM + obj.repair = "Monitor and if condition persists, " + obj.repair += "contact next level of support." + collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name)) + + ####################################################################### + + # Constant vSwitch Port State Monitor Plugin Object settings + obj = PLUGINS[PLUGIN__VSWITCH_PORT] + obj.resource_name = "vSwitch Port" + obj.instance_name = PLUGIN__VSWITCH_PORT + obj.reading_type = "state" + obj.reason_failure = "'Data' Port failed." + obj.reason_warning = "'Data' Port failed." + obj.repair = "Check cabling and far-end port configuration and " + obj.repair += "status on adjacent equipment." + obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT + obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL + obj.service_affecting = True + collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name)) + + ####################################################################### + + # Constant vSwitch Interface State Monitor Plugin Object settings + obj = PLUGINS[PLUGIN__VSWITCH_IFACE] + obj.resource_name = "vSwitch Interface" + obj.instance_name = PLUGIN__VSWITCH_IFACE + obj.reading_type = "state" + obj.reason_failure = "'Data' Interface failed." + obj.reason_warning = "'Data' Interface degraded." + obj.repair = "Check cabling and far-end port configuration and " + obj.repair += "status on adjacent equipment." + obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT + obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL + obj.service_affecting = True + collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name)) + + ########################################################################### + obj = PLUGINS[PLUGIN__EXAMPLE] obj.resource_name = "Example" obj.instance_name = PLUGIN__EXAMPLE @@ -981,6 +1290,7 @@ def init_func(): alarms = api.get_faults_by_id(alarm_id) if alarms: for alarm in alarms: + want_alarm_clear = False eid = alarm.entity_instance_id # ignore alarms not for this host if PluginObject.host not in eid: @@ -988,28 +1298,31 @@ def init_func(): base_obj = _get_base_object(alarm_id) if base_obj is None: - # Handle unrecognized alarm by clearing it ; - # should never happen since we are iterating - # over an internal alarm_id list. + + # might be a plugin instance - clear it + want_alarm_clear = True + + collectd.info('%s found %s %s alarm [%s]' % + (PLUGIN, + alarm.severity, + alarm_id, + eid)) + + if want_alarm_clear is True: + if api.clear_fault(alarm_id, eid) is False: - collectd.error("%s %s:%s not found ; clear failed" % + collectd.error("%s %s:%s clear failed" % (PLUGIN, alarm_id, eid)) else: - collectd.error("%s %s:%s not found ; cleared" % - (PLUGIN, - alarm_id, - eid)) + collectd.info("%s clear %s %s alarm %s" % + (PLUGIN, + alarm.severity, + alarm_id, + eid)) continue - collectd.info('%s found %s alarm with %s severity [%s:%s:%s]' % - (PLUGIN, - base_obj.id, - alarm.severity, - base_obj.plugin, - alarm_id, - eid)) if alarm.severity == "critical": sev = "failure" elif alarm.severity == "major": @@ -1019,7 +1332,8 @@ def init_func(): continue # Load the alarm severity by doing a plugin/instance lookup. - base_obj._manage_alarm(eid, sev) + if base_obj is not None: + base_obj._manage_alarm(eid, sev) # The notifier function inspects the collectd notification and determines if @@ -1067,27 +1381,68 @@ def notifier_func(nObject): base_obj = obj = PLUGINS[nObject.plugin] # if this notification is for a plugin instance then get that - # instances's object instead. if that object does not yet exists - # then create it + # instances's object instead. + # If that object does not yet exists then create it. eid = '' - if nObject.plugin_instance: + + # DF instances are statically allocated + if nObject.plugin == PLUGIN__DF: + eid = _build_entity_id(nObject.plugin, nObject.plugin_instance) + + # get this instances object + obj = base_obj._get_instance_object(eid) + if obj is None: + # path should never be hit since all DF instances + # are statically allocated. + return 0 + + elif nObject.plugin_instance: + need_instance_object_create = False + # Build the entity_id from the parent object if needed # Build the entity_id from the parent object if needed eid = _build_entity_id(nObject.plugin, nObject.plugin_instance) try: + # Need lock when reading/writing any obj.instance_objects list + collectd.debug("%s %s lock" % (PLUGIN, nObject.plugin)) + PluginObject.lock.acquire() + + #collectd.info("%s Object Search eid: %s" % + # (nObject.plugin, eid)) + + #for o in base_obj.instance_objects: + # collectd.error("%s %s inst object dict item %s : %s" % + # (PLUGIN, nObject.plugin, o, + # base_obj.instance_objects[o])) + + # we will take an exception if this object is not in the list. + # the exception handling code below will create and add this + # object for success path the next time around. inst_obj = base_obj.instance_objects[eid] - if inst_obj is None: - collectd.error("%s %s:%s instance object is None" % - (PLUGIN, - nObject.plugin, - nObject.plugin_instance)) - return 0 + + collectd.debug("%s %s instance %s already exists %s" % + (PLUGIN, nObject.plugin, eid, inst_obj)) + # _print_state(inst_obj) + except: - # o.k. , not in the list yet, lets create one - collectd.error("%s %s:%s instance object not found" % - (PLUGIN, - nObject.plugin, - nObject.plugin_instance)) - return 0 + need_instance_object_create = True + finally: + collectd.debug("%s %s unlock" % (PLUGIN, nObject.plugin)) + PluginObject.lock.release() + + if need_instance_object_create is True: + base_obj._create_instance_object(nObject.plugin_instance) + inst_obj = base_obj._get_instance_object(eid) + if inst_obj: + collectd.debug("%s %s:%s inst object created" % + (PLUGIN, + inst_obj.plugin, + inst_obj.instance)) + else: + collectd.error("%s %s:%s inst object create failed" % + (PLUGIN, + nObject.plugin, + nObject.plugin_instance)) + return 0 # re-assign the object obj = inst_obj @@ -1096,13 +1451,6 @@ def notifier_func(nObject): # Build the entity_id from the parent object if needed eid = _build_entity_id(nObject.plugin, nObject.plugin_instance) - # TODO: Needed ? - if not len(obj.instance): - obj.instance = nObject.plugin - if nObject.plugin_instance: - obj.instance += '_' + nObject.plugin_instance - - # TODO: Needed ? # update the object with the eid if its not already set. if not len(obj.entity_id): obj.entity_id = eid @@ -1112,7 +1460,8 @@ def notifier_func(nObject): (PLUGIN, nObject.plugin, nObject.plugin_instance)) return 0 - # _print_state(obj) + # if obj.warnings or obj.failures: + # _print_state(obj) # If want_state_audit is True then run the audit. # Primarily used for debug @@ -1143,21 +1492,32 @@ def notifier_func(nObject): return 0 if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR: - if api.clear_fault(base_obj.id, obj.entity_id) is False: + if api.clear_fault(obj.id, obj.entity_id) is False: collectd.error("%s %s:%s clear_fault failed" % (PLUGIN, base_obj.id, obj.entity_id)) return 0 else: - reason = obj.resource_name - reason += " threshold exceeded" - if obj.threshold: - reason += "; {:2.0f}".format(obj.threshold) + "%" - # reason += "; {:2.2f}".format(obj.threshold) + "%" - if obj.value: - reason += ", actual " + "{:2.0f}".format(obj.value) + "%" + # manage addition of the failure reason text + if obj.cause == fm_constants.ALARM_PROBABLE_CAUSE_50: + # if this is a threshold alarm then build the reason text that + # includes the threahold and the reading that caused the assertion. + reason = obj.resource_name + reason += " threshold exceeded" + if obj.threshold: + reason += "; threshold {:2.0f} ".format(obj.threshold) + "%, " + if obj.value: + reason += "actual {:2.0f}".format(obj.value) + "%" + + elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL: + reason = obj.reason_failure + + else: + reason = obj.reason_warning + + # build the alarm object fault = fm_api.Fault( - alarm_id=base_obj.id, + alarm_id=obj.id, alarm_state=_alarm_state, entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, entity_instance_id=obj.entity_id, @@ -1191,5 +1551,8 @@ def notifier_func(nObject): # Debug only: comment out for production code. # obj._state_audit("change") + return 0 + + collectd.register_init(init_func) collectd.register_notification(notifier_func) diff --git a/monitoring/collectd-extensions/src/mtce_notifier.py b/monitoring/collectd-extensions/src/mtce_notifier.py index 1ffa88a2a..98de81cf3 100755 --- a/monitoring/collectd-extensions/src/mtce_notifier.py +++ b/monitoring/collectd-extensions/src/mtce_notifier.py @@ -39,6 +39,7 @@ import os import socket import collectd +import tsconfig.tsconfig as tsc # This plugin name PLUGIN = 'degrade notifier' @@ -65,6 +66,13 @@ ONE_EVERY = 10 PLUGIN__DF = 'df' PLUGIN__MEM = 'memory' PLUGIN__CPU = 'cpu' + +PLUGIN__VSWITCH_MEM = 'vswitch_mem' +PLUGIN__VSWITCH_CPU = 'vswitch_cpu' +PLUGIN__VSWITCH_PORT = "vswitch_port" +PLUGIN__VSWITCH_IFACE = "vswitch_iface" + + PLUGIN_INTERFACE = 'interface' PLUGIN__EXAMPLE = 'example' @@ -89,6 +97,10 @@ class collectdMtceNotifierObject: self.degrade_list__failure = [PLUGIN__DF, PLUGIN__MEM, PLUGIN__CPU, + PLUGIN__VSWITCH_MEM, + PLUGIN__VSWITCH_CPU, + PLUGIN__VSWITCH_PORT, + PLUGIN__VSWITCH_IFACE, PLUGIN_INTERFACE, PLUGIN__EXAMPLE] self.degrade_list__warning = [] @@ -172,7 +184,7 @@ def config_func(config): Configure the maintenance degrade notifier plugin. """ - collectd.info('%s config function' % PLUGIN) + collectd.debug('%s config function' % PLUGIN) for node in config.children: key = node.key.lower() val = node.values[0] @@ -194,6 +206,10 @@ def init_func(): Collectd Mtce Notifier Initialization Function """ + obj.host = os.uname()[1] + collectd.info("%s %s:%s sending to mtce port %d" % + (PLUGIN, tsc.nodetype, obj.host, obj.port)) + collectd.debug("%s init function" % PLUGIN) @@ -241,8 +257,8 @@ def notifier_func(nObject): path = _df_instance_to_path(resource) add = os.path.ismount(path) if add is True: - collectd.debug("%s %s added to degrade list" % - (PLUGIN, resource)) + collectd.info("%s %s added to degrade list" % + (PLUGIN, resource)) obj.degrade_list.append(resource) else: # If severity is failure and no failures cause degrade @@ -264,8 +280,8 @@ def notifier_func(nObject): path = _df_instance_to_path(resource) add = os.path.ismount(path) if add is True: - collectd.debug("%s %s added to degrade list" % - (PLUGIN, resource)) + collectd.info("%s %s added to degrade list" % + (PLUGIN, resource)) obj.degrade_list.append(resource) else: # If severity is warning and no warnings cause degrade