diff --git a/config-files/syslog-ng-config/centos/build_srpm.data b/config-files/syslog-ng-config/centos/build_srpm.data index da1e20bd8..2c3b2cb8b 100644 --- a/config-files/syslog-ng-config/centos/build_srpm.data +++ b/config-files/syslog-ng-config/centos/build_srpm.data @@ -1,2 +1,2 @@ SRC_DIR="files" -TIS_PATCH_VER=0 +TIS_PATCH_VER=1 diff --git a/config-files/syslog-ng-config/files/syslog-ng.conf b/config-files/syslog-ng-config/files/syslog-ng.conf index 32c7a3f6c..e4ebcdb5e 100644 --- a/config-files/syslog-ng-config/files/syslog-ng.conf +++ b/config-files/syslog-ng-config/files/syslog-ng.conf @@ -107,6 +107,7 @@ destination d_sm { file("/var/log/sm.log"); }; destination d_rmon { file("/var/log/rmond.log" template(t_mtc)); }; destination d_rmon_notify { file("/var/log/rmond_notify.log" template(t_mtc)); }; destination d_pmon { file("/var/log/pmond.log" template(t_mtc)); }; +destination d_lmon { file("/var/log/lmond.log" template(t_mtc)); }; destination d_hostwd { file("/var/log/hostwd.log" template(t_mtc)); }; destination d_fsmon { file("/var/log/fsmond.log" template(t_mtc)); }; destination d_hwmon { file("/var/log/hwmond.log" template(t_mtc)); }; @@ -352,6 +353,7 @@ filter f_local7 { facility(local7); }; filter f_rmon { facility(local5) and program(rmond); }; filter f_rmon_notify { facility(local5) and program(rmon_resource_notify); }; filter f_pmon { facility(local5) and program(pmond); }; +filter f_lmon { facility(local5) and program(lmond); }; filter f_hostw { facility(local5) and program(hostwd); }; filter f_fsmon { facility(local5) and program(fsmond); }; filter f_hwmon { facility(local5) and program(hwmond); }; @@ -472,6 +474,7 @@ log { source(s_src); filter(f_local3); destination(d_sm); }; log { source(s_src); filter(f_rmon); destination(d_rmon); }; log { source(s_src); filter(f_rmon_notify); destination(d_rmon_notify); }; log { source(s_src); filter(f_pmon); destination(d_pmon); }; +log { source(s_src); filter(f_lmon); destination(d_lmon); }; log { 
source(s_src); filter(f_hostw); destination(d_hostwd); }; log { source(s_src); filter(f_fsmon); destination(d_fsmon); }; log { source(s_src); filter(f_hwmon); destination(d_hwmon); }; diff --git a/monitoring/collectd-extensions/centos/build_srpm.data b/monitoring/collectd-extensions/centos/build_srpm.data index 82cafe8bb..e7f74e208 100644 --- a/monitoring/collectd-extensions/centos/build_srpm.data +++ b/monitoring/collectd-extensions/centos/build_srpm.data @@ -5,6 +5,7 @@ COPY_LIST="$PKG_BASE/src/LICENSE \ $PKG_BASE/src/collectd.service \ $PKG_BASE/src/fm_notifier.py \ $PKG_BASE/src/mtce_notifier.py \ + $PKG_BASE/src/plugin_common.py \ $PKG_BASE/src/python_plugins.conf \ $PKG_BASE/src/cpu.py \ $PKG_BASE/src/cpu.conf \ @@ -13,7 +14,9 @@ COPY_LIST="$PKG_BASE/src/LICENSE \ $PKG_BASE/src/df.conf \ $PKG_BASE/src/ntpq.py \ $PKG_BASE/src/ntpq.conf \ + $PKG_BASE/src/interface.py \ + $PKG_BASE/src/interface.conf \ $PKG_BASE/src/example.py \ $PKG_BASE/src/example.conf" -TIS_PATCH_VER=6 +TIS_PATCH_VER=7 diff --git a/monitoring/collectd-extensions/centos/collectd-extensions.spec b/monitoring/collectd-extensions/centos/collectd-extensions.spec index 532c06720..0665fb650 100644 --- a/monitoring/collectd-extensions/centos/collectd-extensions.spec +++ b/monitoring/collectd-extensions/centos/collectd-extensions.spec @@ -15,12 +15,14 @@ Source2: collectd.conf.pmon # collectd python plugin files - notifiers Source3: fm_notifier.py Source4: mtce_notifier.py +Source5: plugin_common.py # collectd python plugin files - resource plugins Source11: cpu.py Source12: memory.py Source14: example.py Source15: ntpq.py +Source16: interface.py # collectd plugin conf files into /etc/collectd.d Source100: python_plugins.conf @@ -29,6 +31,7 @@ Source102: memory.conf Source103: df.conf Source104: example.conf Source105: ntpq.conf +Source106: interface.conf BuildRequires: systemd-devel @@ -64,12 +67,15 @@ install -m 600 %{SOURCE2} %{buildroot}%{local_config_extensions_dir} # collectd python plugin 
files - notifiers install -m 700 %{SOURCE3} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE4} %{buildroot}%{local_python_extensions_dir} +install -m 700 %{SOURCE5} %{buildroot}%{local_python_extensions_dir} # collectd python plugin files - resource plugins install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir} +install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir} + # collectd plugin conf files into /etc/collectd.d install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir} @@ -78,6 +84,7 @@ install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir} +install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir} %clean rm -rf $RPM_BUILD_ROOT diff --git a/monitoring/collectd-extensions/src/cpu.conf b/monitoring/collectd-extensions/src/cpu.conf index 75394cdb2..b1d862f18 100644 --- a/monitoring/collectd-extensions/src/cpu.conf +++ b/monitoring/collectd-extensions/src/cpu.conf @@ -13,8 +13,8 @@ Instance "used" Persist true PersistOK true - WarningMax 90.00 - FailureMax 95.00 + WarningMax 89.00 + FailureMax 94.00 Hits 2 Invert false diff --git a/monitoring/collectd-extensions/src/df.conf b/monitoring/collectd-extensions/src/df.conf index 5df943b8b..19eb764c7 100644 --- a/monitoring/collectd-extensions/src/df.conf +++ b/monitoring/collectd-extensions/src/df.conf @@ -13,6 +13,7 @@ MountPoint "/var/lock" MountPoint "/boot" MountPoint "/scratch" + MountPoint "/opt/etcd" MountPoint "/opt/cgcs" MountPoint "/opt/platform" MountPoint "/opt/extension" @@ -27,8 +28,8 @@ Instance "used" - WarningMax 80.00 - FailureMax 90.00 + WarningMax 79.00 + 
FailureMax 89.00 Persist true PersistOK true Hits 2 diff --git a/monitoring/collectd-extensions/src/example.conf b/monitoring/collectd-extensions/src/example.conf index fbcf5d4f9..574306027 100644 --- a/monitoring/collectd-extensions/src/example.conf +++ b/monitoring/collectd-extensions/src/example.conf @@ -4,8 +4,8 @@ Instance "used" Persist true PersistOK true - WarningMax 51.00 - FailureMax 75.00 + WarningMax 49.00 + FailureMax 74.00 Hits 1 Invert false diff --git a/monitoring/collectd-extensions/src/fm_notifier.py b/monitoring/collectd-extensions/src/fm_notifier.py index ba458dc2e..815fb07ac 100755 --- a/monitoring/collectd-extensions/src/fm_notifier.py +++ b/monitoring/collectd-extensions/src/fm_notifier.py @@ -90,6 +90,7 @@ from threading import RLock as Lock from fm_api import constants as fm_constants from fm_api import fm_api import tsconfig.tsconfig as tsc +import plugin_common as pc # only load influxdb on the controller if tsc.nodetype == 'controller': @@ -865,16 +866,19 @@ def _get_base_object(alarm_id): return None -def is_uuid_like(val): - """Returns validation of a value as a UUID. 
- - For our purposes, a UUID is a canonical form string: - aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa +def _get_object(alarm_id, eid): """ - try: - return str(uuid.UUID(val)) == val - except (TypeError, ValueError, AttributeError): - return False + Get the plugin object for the specified alarm id and eid + """ + + base_obj = _get_base_object(alarm_id) + if len(base_obj.instance_objects): + try: + return(base_obj.instance_objects[eid]) + except: + collectd.debug("%s %s has no instance objects" % + (PLUGIN, base_obj.plugin)) + return base_obj def _build_entity_id(plugin, plugin_instance): @@ -1530,7 +1534,7 @@ def notifier_func(nObject): suppression=base_obj.suppression) alarm_uuid = api.set_fault(fault) - if is_uuid_like(alarm_uuid) is False: + if pc.is_uuid_like(alarm_uuid) is False: collectd.error("%s %s:%s set_fault failed:%s" % (PLUGIN, base_obj.id, obj.entity_id, alarm_uuid)) return 0 diff --git a/monitoring/collectd-extensions/src/interface.conf b/monitoring/collectd-extensions/src/interface.conf index c7ef627f6..de3afaf23 100644 --- a/monitoring/collectd-extensions/src/interface.conf +++ b/monitoring/collectd-extensions/src/interface.conf @@ -1,11 +1,11 @@ - - Instance "state" + + Instance "used" Persist true PersistOK true - WarningMin 50 - FailureMin 0 + WarningMin 51 + FailureMin 1 # Hits 2 Invert false diff --git a/monitoring/collectd-extensions/src/interface.py b/monitoring/collectd-extensions/src/interface.py index ae42a47d6..7b44de8e8 100755 --- a/monitoring/collectd-extensions/src/interface.py +++ b/monitoring/collectd-extensions/src/interface.py @@ -1,129 +1,934 @@ # -# Copyright (c) 2018 Wind River Systems, Inc. +# Copyright (c) 2019 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # ############################################################################ # -# This is the Host Interface Monitor plugin for Collectd. +# This is the Host Interface Monitor plugin for collectd. 
# -# Only mgmnt , infra and oam interfaces are supported with the following +# Only mgmt, infra and oam interfaces are supported with the following # mapping specified in /etc/platform/platform.conf # -# mgmnt - management_interface | all hosts | manditory +# oam - oam_interface | controller | mandatory +# mgmt - management_interface | all hosts | mandatory # infa - infrastructure_interface | any host | optional -# oam - oam_interface | controller | manditory # -# This plugin reports link state inb the following way. +# This plugin queries the maintenance Link Monitor daemon 'lmon' +# for a link status summary of that host's configured networks. # -# The plugin init function learns interface names from platform.conf +# This plugin's read_func issues an http GET request to the Link Monitor +# which responds with a json string that represents a complete summary +# of the monitored links, state and the time of the last event or when +# initial status was learned. An example of the Link Monitor response is # +# { +# "status" : "pass", +# "link_info": [ +# { "network":"mgmt", +# "type":"vlan", +# "links": [ +# { "name":"enp0s8.1", "state":"Up", "time":"5674323454567" }, +# { "name":"enp0s8.2", "state":"Up", "time":"5674323454567" }] +# }, +# { "network":"infra", +# "type":"bond", +# "bond":"bond0", +# "links": [ +# { "name":"enp0s9f1", "state":"Down", "time":"5674323454567" }, +# { "name":"enp0s9f0", "state":"Up" , "time":"5674323454567" }] +# }, +# { "network":"oam", +# "type":"single", +# "links": [ +# { "name":"enp0s3", "state":"Up", "time":"5674323454567" }] +# }] +# } +# +# On failure +# +# { +# "status" : "fail ; bad request " +# } +# +# This plugin then uses this information to manage interface alarm +# assertion and clear with appropriate severity. 
+# +# Severity: Interface and Port levels +# +# Alarm Level Minor Major Critical +# ----------- ----- --------------------- ---------------------------- +# Interface N/A One of lag pair is Up All Interface ports are Down +# Port N/A Physical Link is Down N/A +# +# Sample Data: represented as % of total links Up for that network interface +# +# 100 or 100% percent used - all links of interface are up. +# 50 or 50% percent used - one of lag pair is Up and the other is Down +# 0 or 0% percent used - all ports for that network are Down # ############################################################################ -import os -import random -import collectd -import tsconfig.tsconfig as tsc +import os +import time +import datetime +import collectd +import plugin_common as pc +from fm_api import constants as fm_constants +from fm_api import fm_api + +# Fault manager API Object +api = fm_api.FaultAPIs() + +# name of the plugin - all logs produced by this plugin are prefixed with this PLUGIN = 'interface plugin' -# static variables +# Interface Monitoring Interval in seconds +PLUGIN_AUDIT_INTERVAL = 10 -PLATFORM_CONF_MGMNT_LABEL = "management_interface=" -PLATFORM_CONF_INFRA_LABEL = "infrastructure_interface=" -PLATFORM_CONF_OAM_LABEL = "oam_interface=" +# Sample Data 'type' and 'instance' database field values. 
+PLUGIN_TYPE = 'percent' +PLUGIN_TYPE_INSTANCE = 'usage' -NETWORK_MGMNT = 'mgmnt' +# The Link Status Query URL +PLUGIN_HTTP_URL_PREFIX = 'http://localhost:' + +# This plugin's timeout +PLUGIN_HTTP_TIMEOUT = 5 + +# Specify the link monitor as the maintenance destination service +# full path should look like ; http://localhost:2122/mtce/lmon +PLUGIN_HTTP_URL_PATH = '/mtce/lmon' + +# Port and Interface Alarm Identifiers +PLUGIN_OAM_PORT_ALARMID = '100.106' # OAM Network Port +PLUGIN_OAM_IFACE_ALARMID = '100.107' # OAM Network Interface + +PLUGIN_MGMT_PORT_ALARMID = '100.108' # Management Network Port +PLUGIN_MGMT_IFACE_ALARMID = '100.109' # Management Network Interface + +PLUGIN_INFRA_PORT_ALARMID = '100.110' # Infrastructure Network Port +PLUGIN_INFRA_IFACE_ALARMID = '100.111' # Infrastructure Nwk Interface + +# List of all alarm identifiers. +ALARM_ID_LIST = [PLUGIN_OAM_PORT_ALARMID, + PLUGIN_OAM_IFACE_ALARMID, + PLUGIN_MGMT_PORT_ALARMID, + PLUGIN_MGMT_IFACE_ALARMID, + PLUGIN_INFRA_PORT_ALARMID, + PLUGIN_INFRA_IFACE_ALARMID] + +# Monitored Network Name Strings +NETWORK_MGMT = 'mgmt' NETWORK_INFRA = 'infra' NETWORK_OAM = 'oam' +# Port / Interface State strings +LINK_UP = 'Up' +LINK_DOWN = 'Down' -class iface: - def __init__(self, n, m, s): - self.master = {'network': n, 'name': m, 'state': 'down', 'slaves': s} - self.slave1 = {} - self.slave2 = {} - self.state = int(100) +# Alarm control actions +ALARM_ACTION_RAISE = 'raise' +ALARM_ACTION_CLEAR = 'clear' + +# Alarm level. +# Ports are the lowest level and represent a physical link +# Interfaces are port groupings in terms of LAG +LEVEL_PORT = 'port' +LEVEL_IFACE = 'interface' -class object: - hostname = '' +# Link Object (aka Port or Physical interface) Structure +# and member functions. 
class LinkObject:
    """Per-link (port) state and port alarm bookkeeping.

    Attributes set by the constructor:
      name         - link name ; None until learned from a query response
      state        - LINK_UP or LINK_DOWN ; starts as LINK_UP
      timestamp    - float time of the last link event ; starts at 0
      severity     - severity of the port alarm last set through this object
      alarm_id     - port alarm identifier used for this link
      state_change - starts True
    """

    def __init__(self, alarm_id):

        self.name = None
        self.state = LINK_UP
        self.timestamp = float(0)
        self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
        self.alarm_id = alarm_id
        self.state_change = True

        collectd.debug("%s LinkObject constructor: %s" %
                       (PLUGIN, alarm_id))

    def raise_port_alarm(self, network):
        """Raise a major port alarm for this link.

        Nothing is sent if the recorded severity is already major.

        Parameters : network - name of the network the link is part of.

        Returns    : True on failure and False on success.
        """

        major = fm_constants.FM_ALARM_SEVERITY_MAJOR

        # already raised ; nothing to do
        if self.severity == major:
            return False

        failed = manage_alarm(self.name,
                              network,
                              LEVEL_PORT,
                              ALARM_ACTION_RAISE,
                              major,
                              self.alarm_id,
                              self.timestamp)
        if failed is not False:
            return True

        # only record the new severity once the alarm request succeeded
        self.severity = major
        collectd.info("%s %s %s port alarm raised" %
                      (PLUGIN, self.name, self.alarm_id))
        return False

    def clear_port_alarm(self, network):
        """Clear the port alarm for this link.

        Nothing is sent if the recorded severity is already clear.

        Parameters : network - name of the network the link is part of.

        Returns    : True on failure and False on success.
        """

        cleared = fm_constants.FM_ALARM_SEVERITY_CLEAR

        # no alarm recorded ; nothing to do
        if self.severity == cleared:
            return False

        failed = manage_alarm(self.name,
                              network,
                              LEVEL_PORT,
                              ALARM_ACTION_CLEAR,
                              cleared,
                              self.alarm_id,
                              self.timestamp)
        if failed is not False:
            return True

        collectd.info("%s %s %s port alarm cleared" %
                      (PLUGIN, self.name, self.alarm_id))
        self.severity = cleared
        return False


# Interface (aka Network) Level Object Structure and member functions
class NetworkObject:
    """Per-network state, its (up to two) links and interface alarm handling.

    link_one and link_two are only created when the network name is one
    of NETWORK_OAM, NETWORK_MGMT or NETWORK_INFRA.
    """

    def __init__(self, name):

        self.name = name
        self.sample = 0
        self.sample_last = 0
        self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
        self.degraded = False
        self.timestamp = float(0)

        # network name -> (port alarm id, interface alarm id)
        alarm_ids = {
            NETWORK_OAM: (PLUGIN_OAM_PORT_ALARMID,
                          PLUGIN_OAM_IFACE_ALARMID),
            NETWORK_MGMT: (PLUGIN_MGMT_PORT_ALARMID,
                           PLUGIN_MGMT_IFACE_ALARMID),
            NETWORK_INFRA: (PLUGIN_INFRA_PORT_ALARMID,
                            PLUGIN_INFRA_IFACE_ALARMID),
        }

        ids = alarm_ids.get(name)
        if ids is None:
            port_alarm_id = None
            self.alarm_id = ""
            collectd.error("%s unexpected network (%s)" % (PLUGIN, name))
        else:
            port_alarm_id, self.alarm_id = ids

        collectd.debug("%s %s NetworkObject constructor: %s" %
                       (PLUGIN, name, self.alarm_id))

        # link objects carry the port level alarm id of their network
        if port_alarm_id:
            self.link_one = LinkObject(port_alarm_id)
            self.link_two = LinkObject(port_alarm_id)

    def raise_iface_alarm(self, severity):
        """Raise an interface alarm at the specified severity.

        A request with clear severity is rejected as an error.
        Nothing is sent if the recorded severity already matches.

        Parameters : severity - fm severity to raise the alarm with.

        Returns    : True on failure and False on success.
        """

        if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR:
            collectd.error("%s %s raise alarm called with clear severity" %
                           (PLUGIN, self.name))
            return True

        # already at the requested severity ; nothing to do
        if self.severity == severity:
            return False

        failed = manage_alarm(self.name,
                              self.name,
                              LEVEL_IFACE,
                              ALARM_ACTION_RAISE,
                              severity,
                              self.alarm_id,
                              self.timestamp)
        if failed is not False:
            return True

        self.severity = severity
        collectd.info("%s %s %s %s interface alarm raised" %
                      (PLUGIN,
                       self.name,
                       self.alarm_id,
                       pc.get_severity_str(severity)))
        return False

    def clear_iface_alarm(self):
        """Clear the interface alarm.

        Nothing is sent if the recorded severity is already clear.

        Returns    : True on failure and False on success.
        """

        cleared = fm_constants.FM_ALARM_SEVERITY_CLEAR

        # no alarm recorded ; nothing to do
        if self.severity == cleared:
            return False

        failed = manage_alarm(self.name,
                              self.name,
                              LEVEL_IFACE,
                              ALARM_ACTION_CLEAR,
                              cleared,
                              self.alarm_id,
                              self.timestamp)
        if failed is not False:
            return True

        # log the severity being cleared before it is reset
        collectd.info("%s %s %s %s interface alarm cleared" %
                      (PLUGIN,
                       self.name,
                       self.alarm_id,
                       pc.get_severity_str(self.severity)))
        self.severity = cleared
        return False

    def manage_iface_alarm(self):
        """Clear or raise the interface alarm that matches the link states.

        Single link (link two has no name):
            link Down -> critical ; link Up -> clear
        Two links (lag):
            one Down  -> major ; both Down -> critical ; both Up -> clear

        The interface timestamp is taken from the failed link, or from
        the link with the larger timestamp when both are Down, and is
        only updated when the severity is about to change.

        Returns    : None
        """

        one = self.link_one
        two = self.link_two
        clear = fm_constants.FM_ALARM_SEVERITY_CLEAR
        major = fm_constants.FM_ALARM_SEVERITY_MAJOR
        critical = fm_constants.FM_ALARM_SEVERITY_CRITICAL

        # Single Link Config
        if two.name is None:
            if one.state == LINK_DOWN:
                if self.severity != critical:
                    self.timestamp = one.timestamp
                    self.raise_iface_alarm(critical)
            elif one.state == LINK_UP:
                if self.severity != clear:
                    self.clear_iface_alarm()
            return

        # Lagged Link Config
        states = (one.state, two.state)

        if states == (LINK_UP, LINK_DOWN):
            if self.severity != major:
                self.timestamp = two.timestamp
                self.raise_iface_alarm(major)

        elif states == (LINK_DOWN, LINK_UP):
            if self.severity != major:
                self.timestamp = one.timestamp
                self.raise_iface_alarm(major)

        elif states == (LINK_UP, LINK_UP):
            if self.severity != clear:
                self.clear_iface_alarm()

        elif states == (LINK_DOWN, LINK_DOWN):
            if self.severity != critical:
                # use the most recent of the two link failures
                self.timestamp = one.timestamp \
                    if one.timestamp > two.timestamp else two.timestamp
                self.raise_iface_alarm(critical)


# Plugin Control Object
obj = pc.PluginObject(PLUGIN, PLUGIN_HTTP_URL_PREFIX)


# Network Object List - Primary Network/Link Control Object
NETWORKS = [NetworkObject(network_name)
            for network_name in (NETWORK_MGMT, NETWORK_OAM, NETWORK_INFRA)]


##########################################################################
#
# Name       : get_timestamp
#
# Purpose    : Convert the long long int microsecond time as string
#              that accompany link info from the Link Monitor (lmond)
#              and catch exceptions in doing so.
#
# Parameters: lmon_time - long long int as string
#
# Returns   : float time that can be consumed by datetime.fromtimestamp
#
#             Returns same unit of now time if provided lmon_time is
#             invalid.
#
##########################################################################
def get_timestamp(lmon_time):
    """Convert lmon microsecond time to fm timestamp time in seconds.

    Parameters : lmon_time - microseconds as a numeric string or number.

    Returns    : float seconds ; the current time when lmon_time is
                 empty or cannot be converted.
    """

    if lmon_time:
        try:
            return float(lmon_time) / 1000000
        except (TypeError, ValueError, OverflowError):
            # only conversion errors are expected here ; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed
            collectd.error("%s failed to parse timestamp ;"
                           " using current time" % PLUGIN)
    else:
        collectd.error("%s no timestamp ;"
                       " using current time" % PLUGIN)

    return float(time.time())


def _format_event_time(timestamp):
    """Return timestamp (seconds) as a 'YYYY-MM-DD HH:MM:SS' string."""

    return datetime.datetime.fromtimestamp(
        float(timestamp)).strftime('%Y-%m-%d %H:%M:%S')


def dump_network_info(network):
    """Log the specified network's sample and link state summary.

    Parameters : network - object with name, sample, link_one and
                           link_two ; link two is only reported when
                           its name is not None.

    Returns    : None
    """

    link_one_event_time = _format_event_time(network.link_one.timestamp)

    link_two_info = ''
    if network.link_two.name is not None:
        link_two_info = "; link two '%s' went %s at %s" % (
            network.link_two.name,
            network.link_two.state,
            _format_event_time(network.link_two.timestamp))

    # the sample is the percent of links that are Up ; '%c' prints the '%'
    pcnt = '%'

    collectd.info("%s %5s %3d%c ; "
                  "link one '%s' went %s at %s %s" %
                  (PLUGIN,
                   network.name,
                   network.sample,
                   pcnt,
                   network.link_one.name,
                   network.link_one.state,
                   link_one_event_time,
                   link_two_info))


#########################################################################
#
# Name       : this_hosts_alarm
#
# Purpose    : Determine if the supplied eid is for this host.
#
# Description: The eid formats for the alarms managed by this plugin are
#
#              host=<hostname>.port=<port name>
#              host=<hostname>.interface=<network name>
#
# Assumptions: There is no restriction preventing the system
#              administrator from creating hostnames with period's ('.')
#              in them. Because of this the eid cannot simply be split
#              around '='s and '.'s.
#              Instead the trailing '.port' or '.interface' level
#              suffix is stripped from the host field.
#
# Returns    : True if hostname is a match
#              False otherwise
#
##########################################################################
def this_hosts_alarm(hostname, eid):
    """Check if the specified alarm eid is for this host.

    Parameters : hostname - this host's name.
                 eid      - alarm entity instance id, for example
                            'host=controller-0.interface=mgmt'

    Returns    : True if the eid's host matches hostname ; False
                 otherwise, including for empty or unparsable input.
    """

    if not hostname or not eid:
        return False

    try:
        # 'host=controller-0.interface=mgmt' ->
        #     ['host', 'controller-0.interface', 'mgmt']
        fields = eid.split('=')
        if len(fields) != 3:
            return False

        host_field = fields[1]

        # Strip the level suffix from the END of the host field only.
        # Splitting on '.port' would break hostnames that contain
        # that text themselves (e.g. 'lab.portal-1').
        for suffix in ('.port', '.interface'):
            if host_field.endswith(suffix):
                return host_field[:-len(suffix)] == hostname

    except (AttributeError, TypeError) as ex:
        collectd.error("%s failed to parse alarm eid (%s)"
                       " [eid:%s]" % (PLUGIN, str(ex), eid))

    return False


##########################################################################
#
# Name       : clear_alarms
#
# Purpose    : Clear all interface alarms on process startup.
#
# Description: Called after first successful Link Status query.
#
#              Loops over the provided alarm id list querying all alarms
#              for each. Any that are raised are precisely cleared.
#
#              Prevents stuck alarms over port and interface reconfig.
#
#              If the original alarm case still exists the alarm will
#              be re-raised with the original link event timestamp that
#              is part of the Link Status query response.
#
# Parameters : A list of this plugin's alarm ids
#
# Returns    : True on failure and False on success
#
##########################################################################
def clear_alarms(alarm_id_list):
    """Clear alarm state of all plugin alarms raised against this host.

    Parameters : alarm_id_list - list of alarm ids to query and clear.

    Returns    : True as soon as a clear request fails ;
                 False when all matching alarms were cleared.
    """

    # alarm id -> network name ; used both to filter for this plugin's
    # alarm ids and to log the network the cleared alarm belongs to
    alarm_networks = {
        PLUGIN_OAM_PORT_ALARMID: NETWORK_OAM,
        PLUGIN_OAM_IFACE_ALARMID: NETWORK_OAM,
        PLUGIN_MGMT_PORT_ALARMID: NETWORK_MGMT,
        PLUGIN_MGMT_IFACE_ALARMID: NETWORK_MGMT,
        PLUGIN_INFRA_PORT_ALARMID: NETWORK_INFRA,
        PLUGIN_INFRA_IFACE_ALARMID: NETWORK_INFRA,
    }

    found = False
    for alarm_id in alarm_id_list:
        alarms = api.get_faults_by_id(alarm_id)
        if not alarms:
            continue

        for alarm in alarms:
            eid = alarm.entity_instance_id
            if this_hosts_alarm(obj.hostname, eid) is False:
                # ignore other host alarms
                continue

            network = alarm_networks.get(alarm_id)
            if network is None:
                # not one of this plugin's alarm ids
                continue

            if api.clear_fault(alarm_id, eid) is False:
                collectd.error("%s %s:%s clear_fault failed" %
                               (PLUGIN, alarm_id, eid))
                return True

            found = True
            collectd.info("%s %s clearing %s alarm %s:%s" %
                          (PLUGIN,
                           network,
                           alarm.severity,
                           alarm_id,
                           eid))

    if found is False:
        collectd.info("%s found no startup alarms" % PLUGIN)

    return False


##########################################################################
#
# Name       : manage_alarm
#
# Purpose    : Raises or clears port and interface alarms based on
#              calling parameters.
#
# Returns    : True on failure and False on success
#
##########################################################################
def manage_alarm(name, network, level, action, severity, alarm_id, timestamp):
    """Raise or clear a port or interface alarm.

    Parameters : name      - port name ; used in the eid of port alarms
                 network   - network name ; used in the reason text and
                             in the eid of interface alarms
                 level     - LEVEL_PORT or LEVEL_IFACE
                 action    - ALARM_ACTION_CLEAR clears ; anything else raises
                 severity  - fm severity for the raised alarm
                 alarm_id  - alarm identifier
                 timestamp - event time in seconds

    Returns    : True on failure and False on success
    """

    ts = datetime.datetime.fromtimestamp(
        float(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
    collectd.debug("%s %s %s %s alarm for %s:%s [%s] %s" % (PLUGIN,
                   severity, level, alarm_id, network, name, action, ts))

    clearing = action == ALARM_ACTION_CLEAR

    # reason and repair strings are only needed on alarm assertion
    if clearing:
        reason = ''
        repair = ''
    else:
        reason = "'" + network.upper() + "' " + level
        repair = 'Check cabling and far-end port configuration ' \
                 'and status on adjacent equipment.'

    # port alarms are keyed by port name ; interface alarms by network
    if level == LEVEL_PORT:
        target = name
        outcome = " failed"
    elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR:
        target = network
        outcome = " degraded"
    else:
        target = network
        outcome = " failed"

    # build the alarm eid and name string
    eid = 'host=' + obj.hostname + "." + level + '=' + target
    reason += outcome

    if clearing:
        if api.clear_fault(alarm_id, eid) is False:
            collectd.error("%s %s:%s clear_fault failed" %
                           (PLUGIN, alarm_id, eid))
            return True
        return False

    fault = fm_api.Fault(
        uuid="",
        alarm_id=alarm_id,
        alarm_state=fm_constants.FM_ALARM_STATE_SET,
        entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
        entity_instance_id=eid,
        severity=severity,
        reason_text=reason,
        alarm_type=fm_constants.FM_ALARM_TYPE_7,
        probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN,
        proposed_repair_action=repair,
        service_affecting=True,
        timestamp=ts,
        suppression=True)

    # a uuid-like return value indicates the alarm was accepted
    alarm_uuid = api.set_fault(fault)
    if pc.is_uuid_like(alarm_uuid) is False:
        collectd.error("%s %s:%s set_fault failed:%s" %
                       (PLUGIN, alarm_id, eid, alarm_uuid))
        return True
    return False


# The config function - called once on collectd process startup
def config_func(config):
    """ Configure the plugin """

    # Need to update the Link Status Query URL with the port number.
    url_updated = False

    # The Link Monitor port number is first searched for in
    # the /etc/mtc/lmond.conf file.
    # If its not there then its taken from the plugin config.
+ + # /etc/mtc/lmond.conf + fn = '/etc/mtc/lmond.conf' + if (os.path.exists(fn)): + try: + with open(fn, 'r') as infile: + for line in infile: + if 'lmon_query_port' in line: + if isinstance(int(line.split()[2]), int): + + # add the port + obj.url += line.split()[2] + + # add the path /mtce/lmon + obj.url += PLUGIN_HTTP_URL_PATH + + url_updated = "config file" + break + except EnvironmentError as e: + collectd.error(str(e), UserWarning) + + if url_updated is False: + # Try the config as this might be updated by manifest + for node in config.children: + key = node.key.lower() + val = int(node.values[0]) + if key == 'port': + if isinstance(int(val), int): + + # add the port + obj.url += str(val) + + # add the path /mtce/lmon + obj.url += PLUGIN_HTTP_URL_PATH + + url_updated = "manifest" + break + + if url_updated: + collectd.info("%s configured by %s [%s]" % + (PLUGIN, url_updated, obj.url)) + obj.config_done = True + else: + collectd.error("%s config failure ; cannot monitor" % + (PLUGIN)) return 0 # The init function - called once on collectd process startup def init_func(): + """ Init the plugin """ - # get current hostname - obj.hostname = os.uname()[1] + if obj.config_done is False: + collectd.info("%s configuration failed" % PLUGIN) + time.sleep(300) + return False - # get the master interface names from /etc/platform/platform.conf - with open(tsc.PLATFORM_CONF_FILE, 'r') as infile: - for line in infile: + if obj.init_done is False: + if obj.init_ready() is False: + return False - # Management Interface - if PLATFORM_CONF_MGMNT_LABEL in line: - name = line.split('=')[1].replace('\n', '') - obj.NETWORKS[NETWORK_MGMNT] = iface(NETWORK_MGMNT, name, 0) - collectd.info("%s monitoring mgmnt interface : %s" % - (PLUGIN, - obj.NETWORKS[NETWORK_MGMNT].master['name'])) + obj.hostname = obj.gethostname() + obj.init_done = True + collectd.info("%s initialization complete" % PLUGIN) - # Infrastructure Interface - elif PLATFORM_CONF_INFRA_LABEL in line: - name = 
line.split('=')[1].replace('\n', '') - obj.NETWORKS[NETWORK_INFRA] = iface(NETWORK_INFRA, name, 0) - collectd.info("%s monitoring infra interface : %s" % - (PLUGIN, - obj.NETWORKS[NETWORK_INFRA].master['name'])) - - # OAM Interface - elif PLATFORM_CONF_OAM_LABEL in line: - name = line.split('=')[1].replace('\n', '') - obj.NETWORKS[NETWORK_OAM] = iface(NETWORK_OAM, name, 0) - collectd.info("%s monitoring oam interface: %s" % - (PLUGIN, - obj.NETWORKS[NETWORK_OAM].master['name'])) - - return 0 + return True # The sample read function - called on every audit interval def read_func(): + """ collectd interface monitor plugin read function """ - if obj.NETWORKS[NETWORK_MGMNT].state == 0: - obj.NETWORKS[NETWORK_MGMNT].state = 100 - else: - obj.NETWORKS[NETWORK_MGMNT].state -= 25 + if obj.init_done is False: + init_func() + return 0 + + if obj.audits == 0: + + # clear all alarms on first audit + + # block on fm availability + + # If existing raised the alarms are still valid then + # they will be re-raised with the same timestamp the + # original event occurred at once auditing resumes. + if clear_alarms(ALARM_ID_LIST) is True: + collectd.error("%s failed to clear existing alarms ; " + "retry next audit" % PLUGIN) + + # Don't proceed till we can communicate with FM and + # clear all existing interface and port alarms. 
+ return 0 + + try: + # Issue query and construct the monitoring object + error = obj.make_http_request(to=PLUGIN_HTTP_TIMEOUT) + + if len(obj.jresp) == 0: + collectd.error("%s no json response from http request" % PLUGIN) + return 1 + + if error: + return 1 + + # Check query status + try: + if obj.jresp['status'] != 'pass': + collectd.error("%s link monitor query %s" % + (PLUGIN, obj.jresp['status'])) + return 0 + + except Exception as ex: + collectd.error("%s http request get reason failed ; %s" % + (PLUGIN, str(ex))) + collectd.info("%s resp:%d:%s" % + (PLUGIN, len(obj.jresp), obj.jresp)) + return 1 + + # log the first query response + if obj.audits == 0: + collectd.info("%s Link Status Query Response:%d:\n%s" % + (PLUGIN, len(obj.jresp), obj.jresp)) + + # uncomment below for debug purposes + # + # for network in NETWORKS: + # dump_network_info(network) + + try: + link_info = obj.jresp['link_info'] + for network_link_info in link_info: + collectd.debug("%s parse link info:%s" % + (PLUGIN, network_link_info)) + for network in NETWORKS: + if network.name == network_link_info['network']: + links = network_link_info['links'] + nname = network.name + if len(links) > 0: + link_one = links[0] + + # get initial link one name + if network.link_one.name is None: + network.link_one.name = link_one['name'] + + network.link_one.timestamp =\ + float(get_timestamp(link_one['time'])) + + # load link one state + if link_one['state'] == LINK_UP: + collectd.debug("%s %s IS Up [%s]" % + (PLUGIN, network.link_one.name, + network.link_one.state)) + if network.link_one.state != LINK_UP: + network.link_one.state_change = True + network.link_one.clear_port_alarm(nname) + network.link_one.state = LINK_UP + else: + collectd.debug("%s %s IS Down [%s]" % + (PLUGIN, network.link_one.name, + network.link_one.state)) + if network.link_one.state == LINK_UP: + network.link_one.state_change = True + network.link_one.raise_port_alarm(nname) + network.link_one.state = LINK_DOWN + + if len(links) > 
1: + link_two = links[1] + + # get initial link two name + if network.link_two.name is None: + network.link_two.name = link_two['name'] + + network.link_two.timestamp =\ + float(get_timestamp(link_two['time'])) + + # load link two state + if link_two['state'] == LINK_UP: + collectd.debug("%s %s IS Up [%s]" % + (PLUGIN, network.link_two.name, + network.link_two.state)) + if network.link_two.state != LINK_UP: + network.link_two.state_change = True + network.link_two.clear_port_alarm(nname) + network.link_two.state = LINK_UP + else: + collectd.debug("%s %s IS Down [%s]" % + (PLUGIN, network.link_two.name, + network.link_two.state)) + if network.link_two.state == LINK_UP: + network.link_two.state_change = True + network.link_two.raise_port_alarm(nname) + network.link_two.state = LINK_DOWN + + # manage interface alarms + network.manage_iface_alarm() + + except Exception as ex: + collectd.error("%s link monitor query parse error: %s " % + (PLUGIN, obj.resp)) + + # handle state changes + for network in NETWORKS: + if network.link_two.name is not None and \ + network.link_one.state_change is True: + + if network.link_one.state == LINK_UP: + collectd.info("%s %s link one '%s' is Up" % + (PLUGIN, + network.name, + network.link_one.name)) + else: + collectd.info("%s %s link one '%s' is Down" % + (PLUGIN, + network.name, + network.link_one.name)) + + if network.link_two.name is not None and \ + network.link_two.state_change is True: + + if network.link_two.state == LINK_UP: + collectd.info("%s %s link two '%s' is Up" % + (PLUGIN, + network.name, + network.link_two.name)) + else: + collectd.info("%s %s link two %s 'is' Down" % + (PLUGIN, + network.name, + network.link_two.name)) + + # Dispatch usage value to collectd + val = collectd.Values(host=obj.hostname) + val.plugin = 'interface' + val.type = 'percent' + val.type_instance = 'used' + + # For each interface [ mgmt, oam, infra ] + # calculate the percentage used sample + # sample = 100 % when all its links are up + # sample 
= 0 % when all its links are down + # sample = 50 % when one of a lagged group is down + for network in NETWORKS: + + if network.link_one.name is not None: + + val.plugin_instance = network.name + + network.sample = 0 + + if network.link_two.name is not None: + # lagged + + if network.link_one.state == LINK_UP: + network.sample = 50 + if network.link_two.state == LINK_UP: + network.sample += 50 + else: + if network.link_one.state == LINK_UP: + network.sample = 100 + val.dispatch(values=[network.sample]) + + if network.link_one.state_change is True or \ + network.link_two.state_change is True: + + dump_network_info(network) + + network.link_one.state_change = False + network.link_two.state_change = False + + network.sample_last = network.sample + + else: + collectd.debug("%s %s network not provisioned" % + (PLUGIN, network.name)) + obj.audits += 1 + + except Exception as ex: + collectd.info("%s http request failed: %s" % (PLUGIN, str(ex))) - # Dispatch usage value to collectd - val = collectd.Values(host=obj.hostname) - val.plugin = 'interface' - val.plugin_instance = 'mgmnt' - val.type = 'absolute' - val.type_instance = 'used' - val.dispatch(values=[obj.NETWORKS[NETWORK_MGMNT].state]) return 0 # register the config, init and read functions collectd.register_config(config_func) collectd.register_init(init_func) -collectd.register_read(read_func) +collectd.register_read(read_func, interval=PLUGIN_AUDIT_INTERVAL) diff --git a/monitoring/collectd-extensions/src/memory.conf b/monitoring/collectd-extensions/src/memory.conf index 5e5195f09..997bf2d48 100644 --- a/monitoring/collectd-extensions/src/memory.conf +++ b/monitoring/collectd-extensions/src/memory.conf @@ -12,8 +12,8 @@ Instance "used" Persist true PersistOK true - WarningMax 80.00 - FailureMax 90.00 + WarningMax 79.00 + FailureMax 89.00 Hits 2 Invert false diff --git a/monitoring/collectd-extensions/src/mtce_notifier.py b/monitoring/collectd-extensions/src/mtce_notifier.py index 98de81cf3..1f645e0d8 100755 --- 
a/monitoring/collectd-extensions/src/mtce_notifier.py +++ b/monitoring/collectd-extensions/src/mtce_notifier.py @@ -103,7 +103,7 @@ class collectdMtceNotifierObject: PLUGIN__VSWITCH_IFACE, PLUGIN_INTERFACE, PLUGIN__EXAMPLE] - self.degrade_list__warning = [] + self.degrade_list__warning = [PLUGIN_INTERFACE] # the running list of resources that require degrade. # a degrade clear message is sent whenever this list is empty. diff --git a/monitoring/collectd-extensions/src/plugin_common.py b/monitoring/collectd-extensions/src/plugin_common.py new file mode 100644 index 000000000..d6ba89894 --- /dev/null +++ b/monitoring/collectd-extensions/src/plugin_common.py @@ -0,0 +1,255 @@ +# +# Copyright (c) 2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +############################################################################ +# +# This file contains common collectd plugin constructs and utilities +# +############################################################################ + +import collectd +import json +import uuid +import httplib2 +import socket +import os +from fm_api import constants as fm_constants +import tsconfig.tsconfig as tsc + +# http request constants +PLUGIN_TIMEOUT = 10 +PLUGIN_HTTP_HEADERS = {'Accept': 'application/json', 'Connection': 'close'} + +MIN_AUDITS_B4_FIRST_QUERY = 2 + + +class PluginObject(object): + + def __init__(self, plugin, url): + + # static variables set in init_func + self.plugin = plugin # the name of this plugin + self.hostname = '' # the name of this host + self.port = 0 # the port number for this plugin + + # dynamic gate variables + self.config_complete = False # set to True once config is complete + self.config_done = False # set true if config_func completed ok + self.init_done = False # set true if init_func completed ok + + # dynamic variables set in read_func + self.usage = float(0) # last usage value recorded as float + self.audits = 0 # number of audit since init + + # http and json specific 
variables + self.url = url # target url + self.jresp = None # used to store the json response + self.resp = '' + + # Log controls + self.config_logged = False # used to log once the plugin config + self.error_logged = False # used to prevent log flooding + self.log_throttle_count = 0 # used to count throttle logs + self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold + + collectd.debug("%s Common PluginObject constructor [%s]" % + (plugin, url)) + + ########################################################################### + # + # Name : init_ready + # + # Description: Test for init ready condition + # + # Parameters : plugin name + # + # Returns : False if initial config complete is not done + # True if initial config complete is done + # + ########################################################################### + + def init_ready(self): + """ Test for system init ready state """ + + if os.path.exists(tsc.INITIAL_CONFIG_COMPLETE_FLAG) is False: + self.log_throttle_count += 1 + if self.log_throttle_count > self.INIT_LOG_THROTTLE: + collectd.info("%s initialization needs retry" % self.plugin) + self.log_throttle_count = 0 + return False + else: + self.log_throttle_count = 0 + + return True + + ########################################################################### + # + # Name : gethostname + # + # Description: load the hostname + # + # Parameters : plugin name + # + # Returns : Success - hostname + # Failure - None + # + # Updates : obj.hostname + # + ########################################################################### + def gethostname(self): + """ Fetch the hostname """ + + # get current hostname + try: + hostname = socket.gethostname() + if hostname: + return hostname + except: + collectd.error("%s failed to get hostname" % self.plugin) + + return None + + ########################################################################### + # + # Name : check_for_fit + # + # Description: load FIT data if it is present + # + # Fit Format : unit 
data -> 0 89 + # - instance 0 value 89 + # + # Parameters : plugin name + # object to update with fit + # name in fit file + # unit + # + # Returns : Did a failure occur ? + # False = no + # True = yes + # + # Updates : self.usage with FIT value if FIT conditions are present + # and apply + # + ########################################################################### + def check_for_fit(self, name, unit): + """ Load FIT data into usage if it exists """ + + fit_file = '/var/run/fit/' + name + '_data' + + if os.path.exists(fit_file): + valid = False + with open(fit_file, 'r') as infile: + for line in infile: + try: + inst, val = line.split(' ') + if int(unit) == int(inst): + self.usage = float(val) + valid = True + + except: + try: + val = float(line) + self.usage = float(val) + valid = True + + except: + collectd.error("%s bad FIT data; ignoring" % + self.plugin) + + if valid is True: + collectd.info("%s %.2f usage (unit %d) (FIT)" % + (self.plugin, unit, self.usage)) + return False + + return True + + ########################################################################### + # + # Name : make_http_request + # + # Description: Issue an http request to the specified URL. + # Load and return the response + # Handling execution errors + # + # Parameters : self as current context. + # + # Optional: + # + # url - override the default self url with http address to + # issue the get request to. + # to - timeout override + # hdrs - override use of the default header list + # + # Updates : self.jresp with the json string response from the request. 
+ # + # Returns : Error indication (True/False) + # True on error + # False on success + # + ########################################################################### + def make_http_request(self, url=None, to=None, hdrs=None): + """ Make a blocking HTTP Request and return result """ + + try: + + # handle timeout override + if to is None: + to = PLUGIN_TIMEOUT + + # handle url override + if url is None: + url = self.url + + # handle header override + if hdrs is None: + hdrs = PLUGIN_HTTP_HEADERS + + http = httplib2.Http(timeout=to) + resp = http.request(url, headers=hdrs) + + except Exception as ex: + collectd.info("%s http request failure (%s)" % + (self.plugin, str(ex))) + return True + + try: + collectd.debug("%s Resp: %s" % + (self.plugin, resp[1])) + + self.resp = resp[1] + self.jresp = json.loads(resp[1]) + + except Exception as ex: + collectd.info("%s http request parse failure (%s) (%s)" % + (self.plugin, str(ex), resp)) + return True + return False + + +def is_uuid_like(val): + """Returns validation of a value as a UUID. 
+ + For our purposes, a UUID is a canonical form string: + aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa + """ + try: + return str(uuid.UUID(val)) == val + except (TypeError, ValueError, AttributeError): + return False + + +def get_severity_str(severity): + """ get string that represents the specified severity """ + + if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR: + return "clear" + elif severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL: + return "critical" + elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR: + return "major" + elif severity == fm_constants.FM_ALARM_SEVERITY_MINOR: + return "minor" + else: + return "unknown" diff --git a/monitoring/collectd-extensions/src/python_plugins.conf b/monitoring/collectd-extensions/src/python_plugins.conf index 52aa763d0..85ba02377 100644 --- a/monitoring/collectd-extensions/src/python_plugins.conf +++ b/monitoring/collectd-extensions/src/python_plugins.conf @@ -10,6 +10,10 @@ LoadPlugin python Path "/proc/meminfo" Import "ntpq" + Import "interface" + + Port 2122 + LogTraces = true Encoding "utf-8" diff --git a/utilities/platform-util/centos/build_srpm.data b/utilities/platform-util/centos/build_srpm.data index 880171162..260eb2124 100644 --- a/utilities/platform-util/centos/build_srpm.data +++ b/utilities/platform-util/centos/build_srpm.data @@ -1,4 +1,4 @@ SRC_DIR="platform-util" COPY_LIST_TO_TAR="scripts" -TIS_PATCH_VER=15 +TIS_PATCH_VER=16 diff --git a/utilities/platform-util/scripts/patch-restart-mtce b/utilities/platform-util/scripts/patch-restart-mtce index 357369288..0f888374b 100755 --- a/utilities/platform-util/scripts/patch-restart-mtce +++ b/utilities/platform-util/scripts/patch-restart-mtce @@ -131,6 +131,9 @@ do "mtcalarmd") pmon_managed_processes=(${pmon_managed_processes[@]} "mtcalarmd:0") ;; + "lmond") + pmon_managed_processes=(${pmon_managed_processes[@]} "lmond:0") + ;; *) loginfo "Unknown process:${process}"