From e8c9676d9874af52ab9fd7fc0fa02d48539a9e6a Mon Sep 17 00:00:00 2001
From: Eric MacDonald
Date: Fri, 25 Jan 2019 17:31:30 -0500
Subject: [PATCH] Add network interface monitoring plugin to collectd

This update introduces interface monitoring for the oam, mgmt and infra
networks as a collectd plugin.

The interface plugin queries the new maintenance Link Monitor daemon for
link model and status information every 10 seconds. The plugin then
manages port and interface alarms based on the link model, similar to
how rmon did in the past.

Severity: Interface and Port levels

Alarm Level  Minor  Major                  Critical
-----------  -----  ---------------------  ----------------------------
Interface    N/A    One of lag pair is Up  All Interface ports are Down
Port         N/A    Physical Link is Down  N/A

Degrade support for interface monitoring is added to the mtce degrade
notifier. Any link down condition results in a host degrade condition,
as it did with rmon.

Sample Data: represented as % of total links Up for that network interface

 100 or 100% used - all links of the interface are up.
  50 or  50% used - one of the lag pair is Up and the other is Down
   0 or   0% used - all ports for that network are Down

The plugin documents all of this in its header.

This update also:

1. Adds the new lmond process to the syslog-ng config file.
2. Adds the new lmond process to the mtce patch script.
3. Lowers the cpu, df and memory threshold settings by 1.
   rmon thresholds were precise whereas collectd requires that samples
   cross the thresholds, not just meet them. So, for example, to act on
   90% usage the threshold needs to be 89.

Test Plan: (WIP but almost complete)

PASS: Verify interface plugin startup
PASS: Verify interface plugin logging
PASS: Verify interface plugin Link Status Query and response handling
PASS: Verify monitor, sample storage and grafana display
PASS: Verify port and interface alarms match what rmon produced
PASS: Verify lmon port config from manifest configured plugin
PASS: Verify lmon port config from lmon.conf
PASS: Verify single interface failure handling and recovery
PASS: Verify lagged interface failure handling and recovery
PASS: Verify link loss of lagged interface shared between mgmt and oam (hp380)
PASS: Verify network interface failure handling ; single port
PASS: Verify network interface degrade handling ; lagged interface
PEND: Verify network interface degrade handling ; vlan interface
PASS: Verify HTTP request timeout period and handling
PASS: Verify link status query failure handling - invalid uri (timeout)
PASS: Verify link status query failure handling - missing uri (timeout)
PASS: Verify link status query failure handling - status fail
PASS: Verify link status query failure handling - bad json resp

Change-Id: I2e2dfe6ddfa06a46770245540c7153d330bdf196
Story: 2002823
Task: 28635
Depends-On: https://review.openstack.org/#/c/633264
Signed-off-by: Eric MacDonald
---
 .../syslog-ng-config/centos/build_srpm.data  |   2 +-
 .../syslog-ng-config/files/syslog-ng.conf    |   3 +
 .../centos/build_srpm.data                   |   5 +-
 .../centos/collectd-extensions.spec          |   7 +
 monitoring/collectd-extensions/src/cpu.conf  |   4 +-
 monitoring/collectd-extensions/src/df.conf   |   5 +-
 .../collectd-extensions/src/example.conf     |   4 +-
 .../collectd-extensions/src/fm_notifier.py   |  24 +-
 .../collectd-extensions/src/interface.conf   |   8 +-
 .../collectd-extensions/src/interface.py     | 955 ++++++++++++++++--
 .../collectd-extensions/src/memory.conf      |   4 +-
 .../collectd-extensions/src/mtce_notifier.py |   2 +-
.../collectd-extensions/src/plugin_common.py | 255 +++++ .../src/python_plugins.conf | 4 + .../platform-util/centos/build_srpm.data | 2 +- .../platform-util/scripts/patch-restart-mtce | 3 + 16 files changed, 1186 insertions(+), 101 deletions(-) create mode 100644 monitoring/collectd-extensions/src/plugin_common.py diff --git a/config-files/syslog-ng-config/centos/build_srpm.data b/config-files/syslog-ng-config/centos/build_srpm.data index da1e20bd8..2c3b2cb8b 100644 --- a/config-files/syslog-ng-config/centos/build_srpm.data +++ b/config-files/syslog-ng-config/centos/build_srpm.data @@ -1,2 +1,2 @@ SRC_DIR="files" -TIS_PATCH_VER=0 +TIS_PATCH_VER=1 diff --git a/config-files/syslog-ng-config/files/syslog-ng.conf b/config-files/syslog-ng-config/files/syslog-ng.conf index 32c7a3f6c..e4ebcdb5e 100644 --- a/config-files/syslog-ng-config/files/syslog-ng.conf +++ b/config-files/syslog-ng-config/files/syslog-ng.conf @@ -107,6 +107,7 @@ destination d_sm { file("/var/log/sm.log"); }; destination d_rmon { file("/var/log/rmond.log" template(t_mtc)); }; destination d_rmon_notify { file("/var/log/rmond_notify.log" template(t_mtc)); }; destination d_pmon { file("/var/log/pmond.log" template(t_mtc)); }; +destination d_lmon { file("/var/log/lmond.log" template(t_mtc)); }; destination d_hostwd { file("/var/log/hostwd.log" template(t_mtc)); }; destination d_fsmon { file("/var/log/fsmond.log" template(t_mtc)); }; destination d_hwmon { file("/var/log/hwmond.log" template(t_mtc)); }; @@ -352,6 +353,7 @@ filter f_local7 { facility(local7); }; filter f_rmon { facility(local5) and program(rmond); }; filter f_rmon_notify { facility(local5) and program(rmon_resource_notify); }; filter f_pmon { facility(local5) and program(pmond); }; +filter f_lmon { facility(local5) and program(lmond); }; filter f_hostw { facility(local5) and program(hostwd); }; filter f_fsmon { facility(local5) and program(fsmond); }; filter f_hwmon { facility(local5) and program(hwmond); }; @@ -472,6 +474,7 @@ log { source(s_src); filter(f_local3); destination(d_sm); }; log { source(s_src); filter(f_rmon); destination(d_rmon); }; log { source(s_src); filter(f_rmon_notify); destination(d_rmon_notify); }; log { source(s_src); filter(f_pmon); destination(d_pmon); }; +log { source(s_src); filter(f_lmon); destination(d_lmon); }; log { source(s_src); filter(f_hostw); destination(d_hostwd); }; log { source(s_src); filter(f_fsmon); destination(d_fsmon); }; log { source(s_src); filter(f_hwmon); destination(d_hwmon); }; diff --git a/monitoring/collectd-extensions/centos/build_srpm.data b/monitoring/collectd-extensions/centos/build_srpm.data index 82cafe8bb..e7f74e208 100644 --- a/monitoring/collectd-extensions/centos/build_srpm.data +++ b/monitoring/collectd-extensions/centos/build_srpm.data @@ -5,6 +5,7 @@ COPY_LIST="$PKG_BASE/src/LICENSE \ $PKG_BASE/src/collectd.service \ $PKG_BASE/src/fm_notifier.py \ $PKG_BASE/src/mtce_notifier.py \ + $PKG_BASE/src/plugin_common.py \ $PKG_BASE/src/python_plugins.conf \ $PKG_BASE/src/cpu.py \ $PKG_BASE/src/cpu.conf \ @@ -13,7 +14,9 @@ COPY_LIST="$PKG_BASE/src/LICENSE \ $PKG_BASE/src/df.conf \ $PKG_BASE/src/ntpq.py \ $PKG_BASE/src/ntpq.conf \ + $PKG_BASE/src/interface.py \ + $PKG_BASE/src/interface.conf \ $PKG_BASE/src/example.py \ $PKG_BASE/src/example.conf" -TIS_PATCH_VER=6 +TIS_PATCH_VER=7 diff --git a/monitoring/collectd-extensions/centos/collectd-extensions.spec b/monitoring/collectd-extensions/centos/collectd-extensions.spec index 532c06720..0665fb650 100644 --- 
a/monitoring/collectd-extensions/centos/collectd-extensions.spec +++ b/monitoring/collectd-extensions/centos/collectd-extensions.spec @@ -15,12 +15,14 @@ Source2: collectd.conf.pmon # collectd python plugin files - notifiers Source3: fm_notifier.py Source4: mtce_notifier.py +Source5: plugin_common.py # collectd python plugin files - resource plugins Source11: cpu.py Source12: memory.py Source14: example.py Source15: ntpq.py +Source16: interface.py # collectd plugin conf files into /etc/collectd.d Source100: python_plugins.conf @@ -29,6 +31,7 @@ Source102: memory.conf Source103: df.conf Source104: example.conf Source105: ntpq.conf +Source106: interface.conf BuildRequires: systemd-devel @@ -64,12 +67,15 @@ install -m 600 %{SOURCE2} %{buildroot}%{local_config_extensions_dir} # collectd python plugin files - notifiers install -m 700 %{SOURCE3} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE4} %{buildroot}%{local_python_extensions_dir} +install -m 700 %{SOURCE5} %{buildroot}%{local_python_extensions_dir} # collectd python plugin files - resource plugins install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir} +install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir} + # collectd plugin conf files into /etc/collectd.d install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir} @@ -78,6 +84,7 @@ install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir} +install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir} %clean rm -rf $RPM_BUILD_ROOT diff --git a/monitoring/collectd-extensions/src/cpu.conf b/monitoring/collectd-extensions/src/cpu.conf index 75394cdb2..b1d862f18 100644 --- a/monitoring/collectd-extensions/src/cpu.conf +++ b/monitoring/collectd-extensions/src/cpu.conf @@ -13,8 +13,8 @@ Instance "used" Persist true PersistOK true - WarningMax 90.00 - FailureMax 95.00 + WarningMax 89.00 + FailureMax 94.00 Hits 2 Invert false diff --git a/monitoring/collectd-extensions/src/df.conf b/monitoring/collectd-extensions/src/df.conf index 5df943b8b..19eb764c7 100644 --- a/monitoring/collectd-extensions/src/df.conf +++ b/monitoring/collectd-extensions/src/df.conf @@ -13,6 +13,7 @@ MountPoint "/var/lock" MountPoint "/boot" MountPoint "/scratch" + MountPoint "/opt/etcd" MountPoint "/opt/cgcs" MountPoint "/opt/platform" MountPoint "/opt/extension" @@ -27,8 +28,8 @@ Instance "used" - WarningMax 80.00 - FailureMax 90.00 + WarningMax 79.00 + FailureMax 89.00 Persist true PersistOK true Hits 2 diff --git a/monitoring/collectd-extensions/src/example.conf b/monitoring/collectd-extensions/src/example.conf index fbcf5d4f9..574306027 100644 --- a/monitoring/collectd-extensions/src/example.conf +++ b/monitoring/collectd-extensions/src/example.conf @@ -4,8 +4,8 @@ Instance "used" Persist true PersistOK true - WarningMax 51.00 - FailureMax 75.00 + WarningMax 49.00 + FailureMax 74.00 Hits 1 Invert false diff --git a/monitoring/collectd-extensions/src/fm_notifier.py b/monitoring/collectd-extensions/src/fm_notifier.py index ba458dc2e..815fb07ac 100755 --- a/monitoring/collectd-extensions/src/fm_notifier.py +++ b/monitoring/collectd-extensions/src/fm_notifier.py @@ -90,6 +90,7 @@ from 
threading import RLock as Lock from fm_api import constants as fm_constants from fm_api import fm_api import tsconfig.tsconfig as tsc +import plugin_common as pc # only load influxdb on the controller if tsc.nodetype == 'controller': @@ -865,16 +866,19 @@ def _get_base_object(alarm_id): return None -def is_uuid_like(val): - """Returns validation of a value as a UUID. - - For our purposes, a UUID is a canonical form string: - aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa +def _get_object(alarm_id, eid): """ - try: - return str(uuid.UUID(val)) == val - except (TypeError, ValueError, AttributeError): - return False + Get the plugin object for the specified alarm id and eid + """ + + base_obj = _get_base_object(alarm_id) + if len(base_obj.instance_objects): + try: + return(base_obj.instance_objects[eid]) + except: + collectd.debug("%s %s has no instance objects" % + (PLUGIN, base_obj.plugin)) + return base_obj def _build_entity_id(plugin, plugin_instance): @@ -1530,7 +1534,7 @@ def notifier_func(nObject): suppression=base_obj.suppression) alarm_uuid = api.set_fault(fault) - if is_uuid_like(alarm_uuid) is False: + if pc.is_uuid_like(alarm_uuid) is False: collectd.error("%s %s:%s set_fault failed:%s" % (PLUGIN, base_obj.id, obj.entity_id, alarm_uuid)) return 0 diff --git a/monitoring/collectd-extensions/src/interface.conf b/monitoring/collectd-extensions/src/interface.conf index c7ef627f6..de3afaf23 100644 --- a/monitoring/collectd-extensions/src/interface.conf +++ b/monitoring/collectd-extensions/src/interface.conf @@ -1,11 +1,11 @@ - - Instance "state" + + Instance "used" Persist true PersistOK true - WarningMin 50 - FailureMin 0 + WarningMin 51 + FailureMin 1 # Hits 2 Invert false diff --git a/monitoring/collectd-extensions/src/interface.py b/monitoring/collectd-extensions/src/interface.py index ae42a47d6..7b44de8e8 100755 --- a/monitoring/collectd-extensions/src/interface.py +++ b/monitoring/collectd-extensions/src/interface.py @@ -1,129 +1,934 @@ # -# Copyright (c) 2018 Wind River Systems, Inc. +# Copyright (c) 2019 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # ############################################################################ # -# This is the Host Interface Monitor plugin for Collectd. +# This is the Host Interface Monitor plugin for collectd. # -# Only mgmnt , infra and oam interfaces are supported with the following +# Only mgmt, infra and oam interfaces are supported with the following # mapping specified in /etc/platform/platform.conf # -# mgmnt - management_interface | all hosts | manditory +# oam - oam_interface | controller | mandatory +# mgmnt - management_interface | all hosts | mandatory # infa - infrastructure_interface | any host | optional -# oam - oam_interface | controller | manditory # -# This plugin reports link state inb the following way. +# This plugin queries the maintenance Link Monitor daemon 'lmon' +# for a link status summary of that hosts configured networks. # -# The plugin init function learns interface names from platform.conf +# This plugin's read_func issues an http GET request to the Link Monitor +# which responds with a json string that represents a complete summary +# of the monitored links, state and the time of the last event or when +# initial status was learned. 
An example of the Link Monitor response is # +# { +# "status" : "pass" +# "link_info": [ +# { "network":"mgmt", +# "type":"vlan", +# "links": [ +# { "name":"enp0s8.1", "state":"Up", "time":"5674323454567" }, +# { "name":"enp0s8.2", "state":"Up", "time":"5674323454567" }] +# }, +# { "network":"infra", +# "type":"bond", +# "bond":"bond0", +# "links": [ +# { "name":"enp0s9f1", "state":"Down", "time":"5674323454567" }, +# { "name":"enp0s9f0", "state":"Up" , "time":"5674323454567" }] +# }, +# { "network":"oam", +# "type":"single", +# "links": [ +# { "name":"enp0s3", "state":"Up", "time":"5674323454567" }] +# }] +# } +# +# On failure +# +# { +# "status" : "fail ; bad request " +# } +# +# This plugin then uses this information to manage interface alarm +# assertion and clear with appropriate severity. +# +# Severity: Interface and Port levels +# +# Alarm Level Minor Major Critical +# ----------- ----- --------------------- ---------------------------- +# Interface N/A One of lag pair is Up All Interface ports are Down +# Port N/A Physical Link is Down N/A +# +# Sample Data: represented as % of total links Up for that network interface +# +# 100 or 100% percent used - all links of interface are up. +# 50 or 50% percent used - one of lag pair is Up and the other is Down +# 0 or 0% percent used - all ports for that network are Down # ############################################################################ -import os -import random -import collectd -import tsconfig.tsconfig as tsc +import os +import time +import datetime +import collectd +import plugin_common as pc +from fm_api import constants as fm_constants +from fm_api import fm_api + +# Fault manager API Object +api = fm_api.FaultAPIs() + +# name of the plugin - all logs produced by this plugin are prefixed with this PLUGIN = 'interface plugin' -# static variables +# Interface Monitoring Interval in seconds +PLUGIN_AUDIT_INTERVAL = 10 -PLATFORM_CONF_MGMNT_LABEL = "management_interface=" -PLATFORM_CONF_INFRA_LABEL = "infrastructure_interface=" -PLATFORM_CONF_OAM_LABEL = "oam_interface=" +# Sample Data 'type' and 'instance' database field values. +PLUGIN_TYPE = 'percent' +PLUGIN_TYPE_INSTANCE = 'usage' -NETWORK_MGMNT = 'mgmnt' +# The Link Status Query URL +PLUGIN_HTTP_URL_PREFIX = 'http://localhost:' + +# This plugin's timeout +PLUGIN_HTTP_TIMEOUT = 5 + +# Specify the link monitor as the maintenance destination service +# full path should look like ; http://localhost:2122/mtce/lmon +PLUGIN_HTTP_URL_PATH = '/mtce/lmon' + +# Port and Interface Alarm Identifiers +PLUGIN_OAM_PORT_ALARMID = '100.106' # OAM Network Port +PLUGIN_OAM_IFACE_ALARMID = '100.107' # OAM Network Interface + +PLUGIN_MGMT_PORT_ALARMID = '100.108' # Management Network Port +PLUGIN_MGMT_IFACE_ALARMID = '100.109' # Management Network Interface + +PLUGIN_INFRA_PORT_ALARMID = '100.110' # Infrastructure Network Port +PLUGIN_INFRA_IFACE_ALARMID = '100.111' # Infrastructure Nwk Interface + +# List of all alarm identifiers. 
+ALARM_ID_LIST = [PLUGIN_OAM_PORT_ALARMID, + PLUGIN_OAM_IFACE_ALARMID, + PLUGIN_MGMT_PORT_ALARMID, + PLUGIN_MGMT_IFACE_ALARMID, + PLUGIN_INFRA_PORT_ALARMID, + PLUGIN_INFRA_IFACE_ALARMID] + +# Monitored Network Name Strings +NETWORK_MGMT = 'mgmt' NETWORK_INFRA = 'infra' NETWORK_OAM = 'oam' +# Port / Interface State strings +LINK_UP = 'Up' +LINK_DOWN = 'Down' -class iface: - def __init__(self, n, m, s): - self.master = {'network': n, 'name': m, 'state': 'down', 'slaves': s} - self.slave1 = {} - self.slave2 = {} - self.state = int(100) +# Alarm control actions +ALARM_ACTION_RAISE = 'raise' +ALARM_ACTION_CLEAR = 'clear' + +# Alarm level. +# Ports are the lowest level and represent a physical link +# Interfaces are port groupings in terms of LAG +LEVEL_PORT = 'port' +LEVEL_IFACE = 'interface' -class object: - hostname = '' +# Link Object (aka Port or Physical interface) Structure +# and member functions. +class LinkObject: - def __init__(self): - self.NETWORKS = {} - self.NETWORKS[NETWORK_MGMNT] = None - self.NETWORKS[NETWORK_INFRA] = None - self.NETWORKS[NETWORK_OAM] = None + def __init__(self, alarm_id): -obj = object() + self.name = None + self.state = LINK_UP + self.timestamp = float(0) + self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR + self.alarm_id = alarm_id + self.state_change = True + + collectd.debug("%s LinkObject constructor: %s" % + (PLUGIN, alarm_id)) + + ################################################################## + # + # Name : raise_port_alarm + # + # Purpose : This link object member function is used to + # raise link/port alarms. + # + # Parameters : Network the link is part of. + # + # Returns : True on failure and False on success. + # + ################################################################## + def raise_port_alarm(self, network): + """ Raise a port alarm """ + + if self.severity != fm_constants.FM_ALARM_SEVERITY_MAJOR: + + if manage_alarm(self.name, + network, + LEVEL_PORT, + ALARM_ACTION_RAISE, + fm_constants.FM_ALARM_SEVERITY_MAJOR, + self.alarm_id, + self.timestamp) is False: + + self.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR + collectd.info("%s %s %s port alarm raised" % + (PLUGIN, self.name, self.alarm_id)) + return False + else: + return True + else: + return False + + ################################################################## + # + # Name : clear_port_alarm + # + # Purpose : This link object member function is used to + # clear link/port alarms. + # + # Parameters : Network the link is part of. + # + # Returns : True on failure and False on success. 
+ # + ################################################################## + def clear_port_alarm(self, network): + """ Clear a port alarm """ + + if self.severity != fm_constants.FM_ALARM_SEVERITY_CLEAR: + if manage_alarm(self.name, + network, + LEVEL_PORT, + ALARM_ACTION_CLEAR, + fm_constants.FM_ALARM_SEVERITY_CLEAR, + self.alarm_id, + self.timestamp) is False: + + collectd.info("%s %s %s port alarm cleared" % + (PLUGIN, self.name, self.alarm_id)) + self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR + return False + else: + return True + else: + return False + + +# Interface (aka Network) Level Object Structure and member functions +class NetworkObject: + + def __init__(self, name): + + self.name = name + self.sample = 0 + self.sample_last = 0 + self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR + self.degraded = False + self.timestamp = float(0) + + # add the respective alarm IDs to each object + alarm_id = None + if name == NETWORK_OAM: + alarm_id = PLUGIN_OAM_PORT_ALARMID + self.alarm_id = PLUGIN_OAM_IFACE_ALARMID + elif name == NETWORK_MGMT: + alarm_id = PLUGIN_MGMT_PORT_ALARMID + self.alarm_id = PLUGIN_MGMT_IFACE_ALARMID + elif name == NETWORK_INFRA: + alarm_id = PLUGIN_INFRA_PORT_ALARMID + self.alarm_id = PLUGIN_INFRA_IFACE_ALARMID + else: + self.alarm_id = "" + collectd.error("%s unexpected network (%s)" % (PLUGIN, name)) + + collectd.debug("%s %s NetworkObject constructor: %s" % + (PLUGIN, name, self.alarm_id)) + + if alarm_id: + self.link_one = LinkObject(alarm_id) + self.link_two = LinkObject(alarm_id) + + ################################################################## + # + # Name : raise_iface_alarm + # + # Purpose : This network object member function used to + # raise interface alarms. + # + # Parameters : None + # + # Returns : True on failure and False on success. + # + ################################################################## + def raise_iface_alarm(self, severity): + """ Raise an interface alarm """ + + if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR: + collectd.error("%s %s raise alarm called with clear severity" % + (PLUGIN, self.name)) + return True + + if self.severity != severity: + if manage_alarm(self.name, + self.name, + LEVEL_IFACE, + ALARM_ACTION_RAISE, + severity, + self.alarm_id, + self.timestamp) is False: + + self.severity = severity + collectd.info("%s %s %s %s interface alarm raised" % + (PLUGIN, + self.name, + self.alarm_id, + pc.get_severity_str(severity))) + return False + else: + return True + else: + return False + + ################################################################## + # + # Name : clear_iface_alarm + # + # Purpose : This network object member function used to + # clear interface alarms. + # + # Parameters : None + # + # Returns : True on failure and False on success. 
+ # + ################################################################## + def clear_iface_alarm(self): + """ Clear an interface alarm """ + + if self.severity != fm_constants.FM_ALARM_SEVERITY_CLEAR: + if manage_alarm(self.name, + self.name, + LEVEL_IFACE, + ALARM_ACTION_CLEAR, + fm_constants.FM_ALARM_SEVERITY_CLEAR, + self.alarm_id, + self.timestamp) is False: + + collectd.info("%s %s %s %s interface alarm cleared" % + (PLUGIN, + self.name, + self.alarm_id, + pc.get_severity_str(self.severity))) + self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR + return False + else: + return True + else: + return False + + ###################################################################### + # + # Name : manage_iface_alarm + # + # Purpose : clear or raise appropriate severity level interface alarm + # + # Returns : None + # + ###################################################################### + def manage_iface_alarm(self): + """ """ + # Single Link Config + if self.link_two.name is None: + if self.link_one.state == LINK_DOWN: + if self.severity != fm_constants.FM_ALARM_SEVERITY_CRITICAL: + self.timestamp = self.link_one.timestamp + self.raise_iface_alarm( + fm_constants.FM_ALARM_SEVERITY_CRITICAL) + elif self.link_one.state == LINK_UP: + if self.severity != fm_constants.FM_ALARM_SEVERITY_CLEAR: + self.clear_iface_alarm() + + # Lagged Link Config + # + # The interface level timestamp is updated based on the failed + # link timestamps + elif self.link_one.state == LINK_UP and \ + self.link_two.state == LINK_DOWN: + if self.severity != fm_constants.FM_ALARM_SEVERITY_MAJOR: + self.timestamp = self.link_two.timestamp + self.raise_iface_alarm(fm_constants.FM_ALARM_SEVERITY_MAJOR) + + elif self.link_one.state == LINK_DOWN and \ + self.link_two.state == LINK_UP: + if self.severity != fm_constants.FM_ALARM_SEVERITY_MAJOR: + self.timestamp = self.link_one.timestamp + self.raise_iface_alarm(fm_constants.FM_ALARM_SEVERITY_MAJOR) + + elif self.link_one.state == LINK_UP and self.link_two.state == LINK_UP: + if self.severity != fm_constants.FM_ALARM_SEVERITY_CLEAR: + self.clear_iface_alarm() + + elif self.link_one.state == LINK_DOWN and \ + self.link_two.state == LINK_DOWN: + if self.severity != fm_constants.FM_ALARM_SEVERITY_CRITICAL: + if self.link_one.timestamp > self.link_two.timestamp: + self.timestamp = self.link_one.timestamp + else: + self.timestamp = self.link_two.timestamp + self.raise_iface_alarm(fm_constants.FM_ALARM_SEVERITY_CRITICAL) + + +# Plugin Control Object +obj = pc.PluginObject(PLUGIN, PLUGIN_HTTP_URL_PREFIX) + + +# Network Object List - Primary Network/Link Control Object +NETWORKS = [NetworkObject(NETWORK_MGMT), + NetworkObject(NETWORK_OAM), + NetworkObject(NETWORK_INFRA)] + + +########################################################################## +# +# Name : get_timestamp +# +# Purpose : Convert the long long int microsecond time as string +# that accompany link info from the Link Monitor (lmond) +# and catch exceptions in doing so. +# +# Parameters: lmon_time - long long int as string +# +# Returns : float time that can be consumed by datetime.fromtimestamp +# +# Returns same unit of now time if provided lmon_time is +# invalid. 
+# +########################################################################## +def get_timestamp(lmon_time): + """ Convert lmon time to fm timestamp time """ + + if lmon_time: + try: + return(float(float(lmon_time)/1000000)) + except: + collectd.error("%s failed to parse timestamp ;" + " using current time" % PLUGIN) + else: + collectd.error("%s no timestamp ;" + " using current time" % PLUGIN) + + return(float(time.time())) + + +def dump_network_info(network): + """ Log the specified network info """ + + link_one_event_time = datetime.datetime.fromtimestamp( + float(network.link_one.timestamp)).strftime('%Y-%m-%d %H:%M:%S') + + link_two_info = '' + if network.link_two.name is not None: + link_two_event_time = datetime.datetime.fromtimestamp( + float(network.link_two.timestamp)).strftime('%Y-%m-%d %H:%M:%S') + + link_two_info += "; link two '" + link_two_info += network.link_two.name + link_two_info += "' went " + network.link_two.state + link_two_info += " at " + link_two_event_time + + pcnt = '%' + + collectd.info("%s %5s %3d%c ; " + "link one '%s' went %s at %s %s" % + (PLUGIN, + network.name, + network.sample, + pcnt, + network.link_one.name, + network.link_one.state, + link_one_event_time, + link_two_info)) + + +######################################################################### +# +# Name : this_hosts_alarm +# +# Purpose : Determine if the supplied eid is for this host. +# +# Description: The eid formats for the alarms managed by this plugin are +# +# host=.port= +# host=.interface= +# +# Assumptions: There is no restriction preventing the system +# administrator from creating hostnames with period's ('.') +# in them. Because so the eid cannot simply be split +# around '='s and '.'s. Instead its split around this +# plugins level type '.port' or '.interface'. +# +# Returns : True if hostname is a match +# False otherwise +# +########################################################################## +def this_hosts_alarm(hostname, eid): + """ Check if the specified eid is for this host """ + + if hostname: + if eid: + # 'host=controller-0.interface=mgmt' + try: + eid_host = None + eid_disected = eid.split('=') + if len(eid_disected) == 3: + # ['host', 'controller-0.interface', 'mgmt'] + if len(eid_disected[1].split('.port')) == 2: + eid_host = eid_disected[1].split('.port')[0] + if eid_host and eid_host == hostname: + return True + elif len(eid_disected[1].split('.interface')) == 2: + eid_host = eid_disected[1].split('.interface')[0] + if eid_host and eid_host == hostname: + return True + except Exception as ex: + collectd.error("%s failed to parse alarm eid (%s)" + " [eid:%s]" % (PLUGIN, str(ex), eid)) + + return False + + +########################################################################## +# +# Name : clear_alarms +# +# Purpose : Clear all interface alarms on process startup. +# +# Description: Called after first successful Link Status query. +# +# Loops over the provided alarm id list querying all alarms +# for each. Any that are raised are precisely cleared. +# +# Prevents stuck alarms over port and interface reconfig. +# +# If the original alarm case still exists the alarm will +# be re-raised with the original link event timestamp that +# is part of the Link Status query response. +# +# Parameters : A list of this plugin's alarm ids +# +# Returns : True on failure and False on success +# +########################################################################## +def clear_alarms(alarm_id_list): + """ Clear alarm state of all plugin alarms. 
""" + found = False + for alarm_id in alarm_id_list: + alarms = api.get_faults_by_id(alarm_id) + if alarms: + for alarm in alarms: + eid = alarm.entity_instance_id + if this_hosts_alarm(obj.hostname, eid) is False: + # ignore other host alarms + continue + + if alarm_id == PLUGIN_OAM_PORT_ALARMID or \ + alarm_id == PLUGIN_OAM_IFACE_ALARMID or \ + alarm_id == PLUGIN_MGMT_PORT_ALARMID or \ + alarm_id == PLUGIN_MGMT_IFACE_ALARMID or \ + alarm_id == PLUGIN_INFRA_PORT_ALARMID or \ + alarm_id == PLUGIN_INFRA_IFACE_ALARMID: + eid = alarm.entity_instance_id + if api.clear_fault(alarm_id, eid) is False: + collectd.error("%s %s:%s clear_fault failed" % + (PLUGIN, alarm_id, eid)) + return True + else: + found = True + collectd.info("%s %s clearing %s alarm %s:%s" % + (PLUGIN, + NETWORK_INFRA, + alarm.severity, + alarm_id, + alarm.entity_instance_id)) + + if found is False: + collectd.info("%s found no startup alarms" % PLUGIN) + + return False + + +########################################################################## +# +# Name : manage_alarm +# +# Purpose : Raises or clears port and interface alarms based on +# calling parameters. +# +# Returns : True on failure and False on success +# +########################################################################## +def manage_alarm(name, network, level, action, severity, alarm_id, timestamp): + """ Manage raise and clear of port and interface alarms """ + + ts = datetime.datetime.fromtimestamp( + float(timestamp)).strftime('%Y-%m-%d %H:%M:%S') + collectd.debug("%s %s %s %s alarm for %s:%s [%s] %s" % (PLUGIN, + severity, level, alarm_id, network, name, action, ts)) + + if action == ALARM_ACTION_CLEAR: + alarm_state = fm_constants.FM_ALARM_STATE_CLEAR + reason = '' + repair = '' + else: + # reason ad repair strings are only needed on alarm assertion + alarm_state = fm_constants.FM_ALARM_STATE_SET + reason = "'" + network.upper() + "' " + level + repair = 'Check cabling and far-end port configuration ' \ + 'and status on adjacent equipment.' + + # build the alarm eid and name string + if level == LEVEL_PORT: + eid = 'host=' + obj.hostname + "." + level + '=' + name + reason += " failed" + else: + eid = 'host=' + obj.hostname + "." + level + '=' + network + if severity == fm_constants.FM_ALARM_SEVERITY_MAJOR: + reason += " degraded" + else: + reason += " failed" + + if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR: + if api.clear_fault(alarm_id, eid) is False: + collectd.error("%s %s:%s clear_fault failed" % + (PLUGIN, alarm_id, eid)) + return True + else: + return False + else: + fault = fm_api.Fault( + uuid="", + alarm_id=alarm_id, + alarm_state=alarm_state, + entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, + entity_instance_id=eid, + severity=severity, + reason_text=reason, + alarm_type=fm_constants.FM_ALARM_TYPE_7, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN, + proposed_repair_action=repair, + service_affecting=True, + timestamp=ts, + suppression=True) + + alarm_uuid = api.set_fault(fault) + if pc.is_uuid_like(alarm_uuid) is False: + collectd.error("%s %s:%s set_fault failed:%s" % + (PLUGIN, alarm_id, eid, alarm_uuid)) + return True + else: + return False # The config function - called once on collectd process startup def config_func(config): - """ - Configure the plugin - """ + """ Configure the plugin """ - collectd.debug('%s config function' % PLUGIN) + # Need to update the Link Status Query URL with the port number. 
+ url_updated = False + + # The Link Monitor port number is first searched for in + # the /etc/mtc/lmond.conf file. + # If its not there then its taken from the plugin config. + + # /etc/mtc/lmond.conf + fn = '/etc/mtc/lmond.conf' + if (os.path.exists(fn)): + try: + with open(fn, 'r') as infile: + for line in infile: + if 'lmon_query_port' in line: + if isinstance(int(line.split()[2]), int): + + # add the port + obj.url += line.split()[2] + + # add the path /mtce/lmon + obj.url += PLUGIN_HTTP_URL_PATH + + url_updated = "config file" + break + except EnvironmentError as e: + collectd.error(str(e), UserWarning) + + if url_updated is False: + # Try the config as this might be updated by manifest + for node in config.children: + key = node.key.lower() + val = int(node.values[0]) + if key == 'port': + if isinstance(int(val), int): + + # add the port + obj.url += str(val) + + # add the path /mtce/lmon + obj.url += PLUGIN_HTTP_URL_PATH + + url_updated = "manifest" + break + + if url_updated: + collectd.info("%s configured by %s [%s]" % + (PLUGIN, url_updated, obj.url)) + obj.config_done = True + else: + collectd.error("%s config failure ; cannot monitor" % + (PLUGIN)) return 0 # The init function - called once on collectd process startup def init_func(): + """ Init the plugin """ - # get current hostname - obj.hostname = os.uname()[1] + if obj.config_done is False: + collectd.info("%s configuration failed" % PLUGIN) + time.sleep(300) + return False - # get the master interface names from /etc/platform/platform.conf - with open(tsc.PLATFORM_CONF_FILE, 'r') as infile: - for line in infile: + if obj.init_done is False: + if obj.init_ready() is False: + return False - # Management Interface - if PLATFORM_CONF_MGMNT_LABEL in line: - name = line.split('=')[1].replace('\n', '') - obj.NETWORKS[NETWORK_MGMNT] = iface(NETWORK_MGMNT, name, 0) - collectd.info("%s monitoring mgmnt interface : %s" % - (PLUGIN, - obj.NETWORKS[NETWORK_MGMNT].master['name'])) + obj.hostname = obj.gethostname() + obj.init_done = True + collectd.info("%s initialization complete" % PLUGIN) - # Infrastructure Interface - elif PLATFORM_CONF_INFRA_LABEL in line: - name = line.split('=')[1].replace('\n', '') - obj.NETWORKS[NETWORK_INFRA] = iface(NETWORK_INFRA, name, 0) - collectd.info("%s monitoring infra interface : %s" % - (PLUGIN, - obj.NETWORKS[NETWORK_INFRA].master['name'])) - - # OAM Interface - elif PLATFORM_CONF_OAM_LABEL in line: - name = line.split('=')[1].replace('\n', '') - obj.NETWORKS[NETWORK_OAM] = iface(NETWORK_OAM, name, 0) - collectd.info("%s monitoring oam interface: %s" % - (PLUGIN, - obj.NETWORKS[NETWORK_OAM].master['name'])) - - return 0 + return True # The sample read function - called on every audit interval def read_func(): + """ collectd interface monitor plugin read function """ - if obj.NETWORKS[NETWORK_MGMNT].state == 0: - obj.NETWORKS[NETWORK_MGMNT].state = 100 - else: - obj.NETWORKS[NETWORK_MGMNT].state -= 25 + if obj.init_done is False: + init_func() + return 0 + + if obj.audits == 0: + + # clear all alarms on first audit + + # block on fm availability + + # If existing raised the alarms are still valid then + # they will be re-raised with the same timestamp the + # original event occurred at once auditing resumes. + if clear_alarms(ALARM_ID_LIST) is True: + collectd.error("%s failed to clear existing alarms ; " + "retry next audit" % PLUGIN) + + # Don't proceed till we can communicate with FM and + # clear all existing interface and port alarms. 
+ return 0 + + try: + # Issue query and construct the monitoring object + error = obj.make_http_request(to=PLUGIN_HTTP_TIMEOUT) + + if len(obj.jresp) == 0: + collectd.error("%s no json response from http request" % PLUGIN) + return 1 + + if error: + return 1 + + # Check query status + try: + if obj.jresp['status'] != 'pass': + collectd.error("%s link monitor query %s" % + (PLUGIN, obj.jresp['status'])) + return 0 + + except Exception as ex: + collectd.error("%s http request get reason failed ; %s" % + (PLUGIN, str(ex))) + collectd.info("%s resp:%d:%s" % + (PLUGIN, len(obj.jresp), obj.jresp)) + return 1 + + # log the first query response + if obj.audits == 0: + collectd.info("%s Link Status Query Response:%d:\n%s" % + (PLUGIN, len(obj.jresp), obj.jresp)) + + # uncomment below for debug purposes + # + # for network in NETWORKS: + # dump_network_info(network) + + try: + link_info = obj.jresp['link_info'] + for network_link_info in link_info: + collectd.debug("%s parse link info:%s" % + (PLUGIN, network_link_info)) + for network in NETWORKS: + if network.name == network_link_info['network']: + links = network_link_info['links'] + nname = network.name + if len(links) > 0: + link_one = links[0] + + # get initial link one name + if network.link_one.name is None: + network.link_one.name = link_one['name'] + + network.link_one.timestamp =\ + float(get_timestamp(link_one['time'])) + + # load link one state + if link_one['state'] == LINK_UP: + collectd.debug("%s %s IS Up [%s]" % + (PLUGIN, network.link_one.name, + network.link_one.state)) + if network.link_one.state != LINK_UP: + network.link_one.state_change = True + network.link_one.clear_port_alarm(nname) + network.link_one.state = LINK_UP + else: + collectd.debug("%s %s IS Down [%s]" % + (PLUGIN, network.link_one.name, + network.link_one.state)) + if network.link_one.state == LINK_UP: + network.link_one.state_change = True + network.link_one.raise_port_alarm(nname) + network.link_one.state = LINK_DOWN + + if len(links) > 1: + link_two = links[1] + + # get initial link two name + if network.link_two.name is None: + network.link_two.name = link_two['name'] + + network.link_two.timestamp =\ + float(get_timestamp(link_two['time'])) + + # load link two state + if link_two['state'] == LINK_UP: + collectd.debug("%s %s IS Up [%s]" % + (PLUGIN, network.link_two.name, + network.link_two.state)) + if network.link_two.state != LINK_UP: + network.link_two.state_change = True + network.link_two.clear_port_alarm(nname) + network.link_two.state = LINK_UP + else: + collectd.debug("%s %s IS Down [%s]" % + (PLUGIN, network.link_two.name, + network.link_two.state)) + if network.link_two.state == LINK_UP: + network.link_two.state_change = True + network.link_two.raise_port_alarm(nname) + network.link_two.state = LINK_DOWN + + # manage interface alarms + network.manage_iface_alarm() + + except Exception as ex: + collectd.error("%s link monitor query parse error: %s " % + (PLUGIN, obj.resp)) + + # handle state changes + for network in NETWORKS: + if network.link_two.name is not None and \ + network.link_one.state_change is True: + + if network.link_one.state == LINK_UP: + collectd.info("%s %s link one '%s' is Up" % + (PLUGIN, + network.name, + network.link_one.name)) + else: + collectd.info("%s %s link one '%s' is Down" % + (PLUGIN, + network.name, + network.link_one.name)) + + if network.link_two.name is not None and \ + network.link_two.state_change is True: + + if network.link_two.state == LINK_UP: + collectd.info("%s %s link two '%s' is Up" % + (PLUGIN, + 
network.name, + network.link_two.name)) + else: + collectd.info("%s %s link two %s 'is' Down" % + (PLUGIN, + network.name, + network.link_two.name)) + + # Dispatch usage value to collectd + val = collectd.Values(host=obj.hostname) + val.plugin = 'interface' + val.type = 'percent' + val.type_instance = 'used' + + # For each interface [ mgmt, oam, infra ] + # calculate the percentage used sample + # sample = 100 % when all its links are up + # sample = 0 % when all its links are down + # sample = 50 % when one of a lagged group is down + for network in NETWORKS: + + if network.link_one.name is not None: + + val.plugin_instance = network.name + + network.sample = 0 + + if network.link_two.name is not None: + # lagged + + if network.link_one.state == LINK_UP: + network.sample = 50 + if network.link_two.state == LINK_UP: + network.sample += 50 + else: + if network.link_one.state == LINK_UP: + network.sample = 100 + val.dispatch(values=[network.sample]) + + if network.link_one.state_change is True or \ + network.link_two.state_change is True: + + dump_network_info(network) + + network.link_one.state_change = False + network.link_two.state_change = False + + network.sample_last = network.sample + + else: + collectd.debug("%s %s network not provisioned" % + (PLUGIN, network.name)) + obj.audits += 1 + + except Exception as ex: + collectd.info("%s http request failed: %s" % (PLUGIN, str(ex))) - # Dispatch usage value to collectd - val = collectd.Values(host=obj.hostname) - val.plugin = 'interface' - val.plugin_instance = 'mgmnt' - val.type = 'absolute' - val.type_instance = 'used' - val.dispatch(values=[obj.NETWORKS[NETWORK_MGMNT].state]) return 0 # register the config, init and read functions collectd.register_config(config_func) collectd.register_init(init_func) -collectd.register_read(read_func) +collectd.register_read(read_func, interval=PLUGIN_AUDIT_INTERVAL) diff --git a/monitoring/collectd-extensions/src/memory.conf b/monitoring/collectd-extensions/src/memory.conf index 5e5195f09..997bf2d48 100644 --- a/monitoring/collectd-extensions/src/memory.conf +++ b/monitoring/collectd-extensions/src/memory.conf @@ -12,8 +12,8 @@ Instance "used" Persist true PersistOK true - WarningMax 80.00 - FailureMax 90.00 + WarningMax 79.00 + FailureMax 89.00 Hits 2 Invert false diff --git a/monitoring/collectd-extensions/src/mtce_notifier.py b/monitoring/collectd-extensions/src/mtce_notifier.py index 98de81cf3..1f645e0d8 100755 --- a/monitoring/collectd-extensions/src/mtce_notifier.py +++ b/monitoring/collectd-extensions/src/mtce_notifier.py @@ -103,7 +103,7 @@ class collectdMtceNotifierObject: PLUGIN__VSWITCH_IFACE, PLUGIN_INTERFACE, PLUGIN__EXAMPLE] - self.degrade_list__warning = [] + self.degrade_list__warning = [PLUGIN_INTERFACE] # the running list of resources that require degrade. # a degrade clear message is sent whenever this list is empty. diff --git a/monitoring/collectd-extensions/src/plugin_common.py b/monitoring/collectd-extensions/src/plugin_common.py new file mode 100644 index 000000000..d6ba89894 --- /dev/null +++ b/monitoring/collectd-extensions/src/plugin_common.py @@ -0,0 +1,255 @@ +# +# Copyright (c) 2019 Wind River Systems, Inc. 
+# +# SPDX-License-Identifier: Apache-2.0 +# +############################################################################ +# +# This file contains common collectd plugin constructs and utilities +# +############################################################################ + +import collectd +import json +import uuid +import httplib2 +import socket +import os +from fm_api import constants as fm_constants +import tsconfig.tsconfig as tsc + +# http request constants +PLUGIN_TIMEOUT = 10 +PLUGIN_HTTP_HEADERS = {'Accept': 'application/json', 'Connection': 'close'} + +MIN_AUDITS_B4_FIRST_QUERY = 2 + + +class PluginObject(object): + + def __init__(self, plugin, url): + + # static variables set in init_func + self.plugin = plugin # the name of this plugin + self.hostname = '' # the name of this host + self.port = 0 # the port number for this plugin + + # dynamic gate variables + self.config_complete = False # set to True once config is complete + self.config_done = False # set true if config_func completed ok + self.init_done = False # set true if init_func completed ok + + # dynamic variables set in read_func + self.usage = float(0) # last usage value recorded as float + self.audits = 0 # number of audit since init + + # http and json specific variables + self.url = url # target url + self.jresp = None # used to store the json response + self.resp = '' + + # Log controls + self.config_logged = False # used to log once the plugin config + self.error_logged = False # used to prevent log flooding + self.log_throttle_count = 0 # used to count throttle logs + self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold + + collectd.debug("%s Common PluginObject constructor [%s]" % + (plugin, url)) + + ########################################################################### + # + # Name : init_ready + # + # Description: Test for init ready condition + # + # Parameters : plugin name + # + # Returns : False if initial config complete is not done + # True if initial config complete is done + # + ########################################################################### + + def init_ready(self): + """ Test for system init ready state """ + + if os.path.exists(tsc.INITIAL_CONFIG_COMPLETE_FLAG) is False: + self.log_throttle_count += 1 + if self.log_throttle_count > self.INIT_LOG_THROTTLE: + collectd.info("%s initialization needs retry" % self.plugin) + self.log_throttle_count = 0 + return False + else: + self.log_throttle_count = 0 + + return True + + ########################################################################### + # + # Name : gethostname + # + # Description: load the hostname + # + # Parameters : plugin name + # + # Returns : Success - hostname + # Failure - None + # + # Updates : obj.hostname + # + ########################################################################### + def gethostname(self): + """ Fetch the hostname """ + + # get current hostname + try: + hostname = socket.gethostname() + if hostname: + return hostname + except: + collectd.error("%s failed to get hostname" % self.plugin) + + return None + + ########################################################################### + # + # Name : check_for_fit + # + # Description: load FIT data if it is present + # + # Fit Format : unit data -> 0 89 + # - instance 0 value 89 + # + # Parameters : plugin name + # object to update with fit + # name in fit file + # unit + # + # Returns : Did a failure occur ? 
+ # False = no + # True = yes + # + # Updates : self.usage with FIT value if FIT conditions are present + # and apply + # + ########################################################################### + def check_for_fit(self, name, unit): + """ Load FIT data into usage if it exists """ + + fit_file = '/var/run/fit/' + name + '_data' + + if os.path.exists(fit_file): + valid = False + with open(fit_file, 'r') as infile: + for line in infile: + try: + inst, val = line.split(' ') + if int(unit) == int(inst): + self.usage = float(val) + valid = True + + except: + try: + val = float(line) + self.usage = float(val) + valid = True + + except: + collectd.error("%s bad FIT data; ignoring" % + self.plugin) + + if valid is True: + collectd.info("%s %.2f usage (unit %d) (FIT)" % + (self.plugin, unit, self.usage)) + return False + + return True + + ########################################################################### + # + # Name : make_http_request + # + # Description: Issue an http request to the specified URL. + # Load and return the response + # Handling execution errors + # + # Parameters : self as current context. + # + # Optional: + # + # url - override the default self url with http address to + # issue the get request to. + # to - timeout override + # hdrs - override use of the default header list + # + # Updates : self.jresp with the json string response from the request. + # + # Returns : Error indication (True/False) + # True on error + # False on success + # + ########################################################################### + def make_http_request(self, url=None, to=None, hdrs=None): + """ Make a blocking HTTP Request and return result """ + + try: + + # handle timeout override + if to is None: + to = PLUGIN_TIMEOUT + + # handle url override + if url is None: + url = self.url + + # handle header override + if hdrs is None: + hdrs = PLUGIN_HTTP_HEADERS + + http = httplib2.Http(timeout=to) + resp = http.request(url, headers=hdrs) + + except Exception as ex: + collectd.info("%s http request failure (%s)" % + (self.plugin, str(ex))) + return True + + try: + collectd.debug("%s Resp: %s" % + (self.plugin, resp[1])) + + self.resp = resp[1] + self.jresp = json.loads(resp[1]) + + except Exception as ex: + collectd.info("%s http request parse failure (%s) (%s)" % + (self.plugin, str(ex), resp)) + return True + return False + + +def is_uuid_like(val): + """Returns validation of a value as a UUID. 
+ + For our purposes, a UUID is a canonical form string: + aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa + """ + try: + return str(uuid.UUID(val)) == val + except (TypeError, ValueError, AttributeError): + return False + + +def get_severity_str(severity): + """ get string that represents the specified severity """ + + if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR: + return "clear" + elif severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL: + return "critical" + elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR: + return "major" + elif severity == fm_constants.FM_ALARM_SEVERITY_MINOR: + return "minor" + else: + return "unknown" diff --git a/monitoring/collectd-extensions/src/python_plugins.conf b/monitoring/collectd-extensions/src/python_plugins.conf index 52aa763d0..85ba02377 100644 --- a/monitoring/collectd-extensions/src/python_plugins.conf +++ b/monitoring/collectd-extensions/src/python_plugins.conf @@ -10,6 +10,10 @@ LoadPlugin python Path "/proc/meminfo" Import "ntpq" + Import "interface" + + Port 2122 + LogTraces = true Encoding "utf-8" diff --git a/utilities/platform-util/centos/build_srpm.data b/utilities/platform-util/centos/build_srpm.data index 880171162..260eb2124 100644 --- a/utilities/platform-util/centos/build_srpm.data +++ b/utilities/platform-util/centos/build_srpm.data @@ -1,4 +1,4 @@ SRC_DIR="platform-util" COPY_LIST_TO_TAR="scripts" -TIS_PATCH_VER=15 +TIS_PATCH_VER=16 diff --git a/utilities/platform-util/scripts/patch-restart-mtce b/utilities/platform-util/scripts/patch-restart-mtce index 357369288..0f888374b 100755 --- a/utilities/platform-util/scripts/patch-restart-mtce +++ b/utilities/platform-util/scripts/patch-restart-mtce @@ -131,6 +131,9 @@ do "mtcalarmd") pmon_managed_processes=(${pmon_managed_processes[@]} "mtcalarmd:0") ;; + "lmond") + pmon_managed_processes=(${pmon_managed_processes[@]} "lmond:0") + ;; *) loginfo "Unknown process:${process}"