From edf29d1b7a93564e1f0836bc20028f71c619f76e Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Sun, 10 Mar 2019 15:19:48 -0400 Subject: [PATCH] Add Remote Logging Server connectivity monitoring to collectd This update adds titled support to the starlingX set of collectd monitoring plugins. This update excludes monitoring of IPV6 remote logging servers. Only IPV4 remote logging servers are supported. Story: 2002823 Task: 28636 Test Plan: PASS: Verify monitoring on controller nodes only PASS: Verify system install PASS: Verify plugin logging is value added PASS: Verify connectivity failure to success handling PASS: Verify connectivity success to failure handling PASS: Verify connected / not connected logging on service state change PASS: Verify connected / not connected logging on connectivity state change PASS: Verify service enabled to disabled state transition with alarm asserted PASS: Verify service enabled to disabled state transition while connected PASS: Verify service disabled to enabled state transition with connectivity PASS: Verify service disabled to enabled state transition without connectivity PASS: Verify plugin audit interval is every 60 seconds PASS: Verify plugin alarm assert debounce of 2 PASS: Verify plugin alarm clear with no debounce PASS: Verify plugin alarm assert over process start on TCP conn failure PASS: Verify plugin alarm severity as Minor PASS: Verify plugin alarm clear over process restart PASS: Verify plugin alarm is cleared on service disable transition PASS: Verify plugin sample data Change-Id: I73cd35170ed19abce17bb4f511f0c5e04bc101c6 Signed-off-by: Eric MacDonald --- .../centos/build_srpm.data | 4 +- .../centos/collectd-extensions.spec | 4 + .../collectd-extensions/src/plugin_common.py | 3 + .../src/python_plugins.conf | 1 + .../collectd-extensions/src/remotels.conf | 13 + .../collectd-extensions/src/remotels.py | 345 ++++++++++++++++++ 6 files changed, 369 insertions(+), 1 deletion(-) create mode 100644 
monitoring/collectd-extensions/src/remotels.conf create mode 100755 monitoring/collectd-extensions/src/remotels.py diff --git a/monitoring/collectd-extensions/centos/build_srpm.data b/monitoring/collectd-extensions/centos/build_srpm.data index e7f74e208..8514ebc35 100644 --- a/monitoring/collectd-extensions/centos/build_srpm.data +++ b/monitoring/collectd-extensions/centos/build_srpm.data @@ -16,7 +16,9 @@ COPY_LIST="$PKG_BASE/src/LICENSE \ $PKG_BASE/src/ntpq.conf \ $PKG_BASE/src/interface.py \ $PKG_BASE/src/interface.conf \ + $PKG_BASE/src/remotels.py \ + $PKG_BASE/src/remotels.conf \ $PKG_BASE/src/example.py \ $PKG_BASE/src/example.conf" -TIS_PATCH_VER=7 +TIS_PATCH_VER=8 diff --git a/monitoring/collectd-extensions/centos/collectd-extensions.spec b/monitoring/collectd-extensions/centos/collectd-extensions.spec index 0665fb650..f8aa0936f 100644 --- a/monitoring/collectd-extensions/centos/collectd-extensions.spec +++ b/monitoring/collectd-extensions/centos/collectd-extensions.spec @@ -23,6 +23,7 @@ Source12: memory.py Source14: example.py Source15: ntpq.py Source16: interface.py +Source17: remotels.py # collectd plugin conf files into /etc/collectd.d Source100: python_plugins.conf @@ -32,6 +33,7 @@ Source103: df.conf Source104: example.conf Source105: ntpq.conf Source106: interface.conf +Source107: remotels.conf BuildRequires: systemd-devel @@ -75,6 +77,7 @@ install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir} +install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir} # collectd plugin conf files into /etc/collectd.d @@ -85,6 +88,7 @@ install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir} install -m 600 
%{SOURCE106} %{buildroot}%{local_plugin_dir} +install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir} %clean rm -rf $RPM_BUILD_ROOT diff --git a/monitoring/collectd-extensions/src/plugin_common.py b/monitoring/collectd-extensions/src/plugin_common.py index d6ba89894..6390024e0 100644 --- a/monitoring/collectd-extensions/src/plugin_common.py +++ b/monitoring/collectd-extensions/src/plugin_common.py @@ -33,6 +33,7 @@ class PluginObject(object): self.plugin = plugin # the name of this plugin self.hostname = '' # the name of this host self.port = 0 # the port number for this plugin + self.base_eid = '' # the base entity id host= # dynamic gate variables self.config_complete = False # set to True once config is complete @@ -42,6 +43,8 @@ class PluginObject(object): # dynamic variables set in read_func self.usage = float(0) # last usage value recorded as float self.audits = 0 # number of audit since init + self.enabled = False # tracks a plugin's enabled state + self.alarmed = False # tracks the current alarmed state # http and json specific variables self.url = url # target url diff --git a/monitoring/collectd-extensions/src/python_plugins.conf b/monitoring/collectd-extensions/src/python_plugins.conf index 85ba02377..e33de86a4 100644 --- a/monitoring/collectd-extensions/src/python_plugins.conf +++ b/monitoring/collectd-extensions/src/python_plugins.conf @@ -14,6 +14,7 @@ LoadPlugin python Port 2122 + Import "remotels" LogTraces = true Encoding "utf-8" diff --git a/monitoring/collectd-extensions/src/remotels.conf b/monitoring/collectd-extensions/src/remotels.conf new file mode 100644 index 000000000..f9e588992 --- /dev/null +++ b/monitoring/collectd-extensions/src/remotels.conf @@ -0,0 +1,13 @@ + + + + Instance "reachable" + Persist true + PersistOK true + WarningMin 1 + FailureMin 0 + Hits 2 + Invert false + + + diff --git a/monitoring/collectd-extensions/src/remotels.py b/monitoring/collectd-extensions/src/remotels.py new file mode 100755 index 000000000..9a766d9df 
--- /dev/null +++ b/monitoring/collectd-extensions/src/remotels.py @@ -0,0 +1,345 @@ +# +# Copyright (c) 2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +############################################################################ +# +# This is the Remote Logging Server plugin for collectd. +# +# The Remote Logging Server is enabled if /etc/syslog-ng/syslog-ng.conf +# contains '@include remotelogging.conf' +# +# There is no asynchronous notification of remote logging server +# configuration enable/disable state changes. Therefore, each audit +# interval needs to check whether its enabled or not. +# +# every audit interval ... +# +# read_func: +# check enabled: +# if disabled and alarmed: +# clear alarm +# if enabled: +# get ip and port +# query status +# if connected and alarmed: +# clear alarm +# if not connected and not alarmed: +# raise alarm +# +# system remotelogging-modify --ip_address +# --transport tcp +# --enabled True +# +############################################################################ + +import os +import collectd +import tsconfig.tsconfig as tsc +import plugin_common as pc +from fm_api import constants as fm_constants +from oslo_concurrency import processutils +from fm_api import fm_api + +# Fault manager API Object +api = fm_api.FaultAPIs() + +# name of the plugin +PLUGIN_NAME = 'remotels' + +# all logs produced by this plugin are prefixed with this +PLUGIN = 'remote logging server' + +# Interface Monitoring Interval in seconds +PLUGIN_AUDIT_INTERVAL = 60 + +# Sample Data 'type' and 'instance' database field values. 
PLUGIN_TYPE = 'absolute'
PLUGIN_TYPE_INSTANCE = 'reachable'

# Remote Logging Connectivity Alarm ID
PLUGIN_ALARMID = '100.118'

# The file where this plugin learns if remote logging is enabled
SYSLOG_CONF_FILE = '/etc/syslog-ng/syslog-ng.conf'

# Plugin Control Object
obj = pc.PluginObject(PLUGIN, "")


# Raise Remote Logging Server Alarm
def raise_alarm():
    """Raise the remote logging server connectivity alarm.

    Builds a minor severity, non service affecting fault against this
    host (entity obj.base_eid) and submits it through the fault manager
    API. obj.alarmed is set True only when set_fault returns a uuid-like
    value. Every failure is logged and leaves obj.alarmed unchanged, so
    read_func asks again on the next failed audit.
    """

    repair = ('Ensure Remote Log Server IP is reachable from '
              'Controller through OAM interface; otherwise '
              'contact next level of support.')

    reason = ('Controller cannot establish connection with '
              'remote logging server.')

    try:
        fault = fm_api.Fault(
            alarm_id=PLUGIN_ALARMID,
            alarm_state=fm_constants.FM_ALARM_STATE_SET,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
            entity_instance_id=obj.base_eid,
            severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
            reason_text=reason,
            alarm_type=fm_constants.FM_ALARM_TYPE_1,
            probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_6,
            proposed_repair_action=repair,
            service_affecting=False,
            suppression=False)

        alarm_uuid = api.set_fault(fault)
        if pc.is_uuid_like(alarm_uuid) is False:
            collectd.error("%s %s:%s set_fault failed:%s" %
                           (PLUGIN, PLUGIN_ALARMID,
                            obj.base_eid, alarm_uuid))
        else:
            collectd.info("%s %s:%s alarm raised" %
                          (PLUGIN, PLUGIN_ALARMID, obj.base_eid))
            obj.alarmed = True

    # Exception rather than a bare except so that interpreter level
    # events such as SystemExit and KeyboardInterrupt are not swallowed.
    except Exception:
        collectd.error("%s %s:%s set_fault exception" %
                       (PLUGIN, PLUGIN_ALARMID, obj.base_eid))


# Clear remote logging server alarm
def clear_alarm():
    """Clear the remote logging server connectivity alarm.

    The clear is issued blind ; the result of clear_fault only decides
    whether "alarm cleared" is logged.

    Returns True when the clear request completed without raising, in
    which case obj.alarmed is reset. Returns False when the fault manager
    call raised, so that the caller can retry on a later audit.
    """

    try:
        if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is True:
            collectd.info("%s alarm cleared" % PLUGIN)
        obj.alarmed = False
        return True

    except Exception:
        collectd.error("%s %s:%s clear failed ; will retry" %
                       (PLUGIN, PLUGIN_ALARMID, obj.base_eid))
        return False


# The config function - called once on collectd process startup
def config_func(config):
    """Configure the plugin.

    The config argument is not used ; everything this plugin needs is
    learned from SYSLOG_CONF_FILE on each audit. Always returns 0.
    """

    # all configuration is learned during normal monitoring
    obj.config_done = True
    return 0


# The init function - called once on collectd process startup
def init_func():
    """Init the plugin.

    Learns the hostname and derives the alarm entity id 'host=<hostname>'
    once the plugin object reports it is ready.

    Returns 0 on non-controller nodes, False while the plugin object is
    not yet ready and True once initialization is (or already was) done.
    """

    # remote logging server monitoring is for controllers only
    if tsc.nodetype != 'controller':
        return 0

    if obj.init_done is False:
        if obj.init_ready() is False:
            return False

        obj.hostname = obj.gethostname()
        obj.base_eid = 'host=' + obj.hostname
        obj.init_done = True
        collectd.info("%s initialization complete" % PLUGIN)

    return True


def _remote_logging_enabled():
    """Return True if SYSLOG_CONF_FILE includes "remotelogging.conf"."""

    if not os.path.exists(SYSLOG_CONF_FILE):
        return False

    with open(SYSLOG_CONF_FILE, 'r') as infile:
        for line in infile:
            if line.startswith('@include '):
                service = line.rstrip().split(' ')[1]
                if service == '"remotelogging.conf"':
                    return True
    return False


def _parse_remote_server():
    """Read the remote log server protocol, address and port.

    Returns a (protocol, address, port) tuple of non-empty strings taken
    from the first parsable destination line of SYSLOG_CONF_FILE, or None
    after logging an error when there is no such line.
    """

    # Get the ip and port from line that looks like this
    #
    #             tag               proto     address      port
    #  ----------------------------- ---  --------------   ---
    #  destination remote_log_server {tcp("128.224.186.65" port(514));};
    #
    with open(SYSLOG_CONF_FILE, 'r') as infile:
        for line in infile:
            if not line.startswith('destination remote_log_server'):
                continue

            fields = line.split('{')
            if len(fields) < 2:
                collectd.error("%s remote log server line parse error"
                               " ; %s" % (PLUGIN, line))
                continue

            try:
                protocol = fields[1][0:3]
                address = fields[1].split('"')[1]
                port = fields[1].split('(')[2].split(')')[0]
            except IndexError:
                # a quote or bracket is missing ; try any later line
                collectd.error("%s remote log server credentials "
                               "parse exception ; (%s)" % (PLUGIN, line))
                continue

            if not protocol or not address or not port:
                collectd.error("%s remote log server credentials "
                               "parse error ; (%s:%s:%s)" %
                               (PLUGIN, protocol, address, port))
                return None

            # line parsed ; move on ...
            return protocol, address, port

    # Without this the empty address/port would reach int('') below and
    # the audit would end in an unhandled ValueError.
    collectd.error("%s remote log server destination not learned from %s" %
                   (PLUGIN, SYSLOG_CONF_FILE))
    return None


def _proc_net_endpoint(address, port):
    """Return 'address:port' in the upper case hex form of /proc/net.

    The four ipv4 octets are emitted in reverse order as 2 hex digits
    each, followed by ':' and the port as 4 hex digits.

    Example: '128.224.186.65', '514' -> '41BAE080:0202'

    Raises ValueError if address is not a dotted quad of 0..255 values
    or port is not a number in 0..65535.
    """

    octets = [int(octet) for octet in address.split('.')]
    if len(octets) != 4 or any(not 0 <= octet <= 255 for octet in octets):
        raise ValueError("invalid ipv4 address: %s" % address)

    port_number = int(port)
    if not 0 <= port_number <= 0xFFFF:
        raise ValueError("invalid port: %s" % port)

    hex_ip = ''.join('%02X' % octet for octet in reversed(octets))
    return '%s:%04X' % (hex_ip, port_number)


# The sample read function - called on every audit interval
def read_func():
    """Remote logging server connectivity plugin read function.

    Every audit:
      - learn whether remote logging is enabled from SYSLOG_CONF_FILE
      - on the first audit issue a blind alarm clear
      - if disabled, clear the alarm if it is raised and stop
      - if enabled, learn the server protocol, address and port, look
        the server socket up in /proc/net/<protocol> and
          - clear the alarm as soon as a connected reading is seen
          - raise the alarm on a failed reading that follows a
            previous reading of 0
      - dispatch the reading (1 connected, 0 not connected) to collectd

    Returns 1 when the server settings cannot be learned or converted
    and 0 in all other cases.
    """

    # remote logging server monitoring is for controllers only
    if tsc.nodetype != 'controller':
        return 0

    if obj.init_done is False:
        init_func()
        return 0

    # check to see if remote logging is enabled, remembering the
    # previous state so that only transitions are logged
    was_enabled = obj.enabled
    obj.enabled = _remote_logging_enabled()

    if was_enabled == obj.enabled:
        logit = False
    else:
        if obj.enabled is False:
            collectd.info("%s is disabled" % PLUGIN)
        else:
            collectd.info("%s is enabled" % PLUGIN)
        logit = True

    # Handle startup case by clearing existing alarm if it's raised.
    # It's runtime cheaper and simpler to issue a blind clear than query.
    if obj.audits == 0:
        if clear_alarm() is False:
            # if clear fails then retry next time
            return 0
        if obj.enabled is False:
            collectd.info("%s is disabled" % PLUGIN)
        obj.audits = 1

    if obj.enabled is False:
        if obj.alarmed is True:
            clear_alarm()
        return 0

    # If we get here then the server is enabled ...
    # Need to query it
    server = _parse_remote_server()
    if server is None:
        return 1
    protocol, address, port = server

    if ':' in address:
        # Monitoring of IPV6 is not currently supported
        return 0
    ipv = 4

    # This plugin detects server connectivity through its socket status.
    # The files being looked at (/proc/net/tcp(udp)) use hex values, so
    # convert the address and port to that form first.
    try:
        endpoint = _proc_net_endpoint(address, port)
    except ValueError:
        collectd.error("%s remote log server address parse error ; "
                       "(%s:%s)" % (PLUGIN, address, port))
        return 1

    # log example   tcp:ipv4:128.224.186.65:514 : IP:41BAE080:0202
    collectd.debug("%s %s:ipv%d:%s:%s : IP:%s" %
                   (PLUGIN, protocol, ipv, address, port, endpoint))

    # cmd example:
    # cat /proc/net/tcp | awk '{print $3 " " $4}'
    #                   | grep 41BAE080:0202
    #                   | awk '{print $2}'
    cmd = "cat /proc/net/" + protocol
    cmd += " | awk '{print $3 \" \" $4}' | grep " + endpoint
    cmd += " | awk '{print $2}'"
    try:
        res, err = processutils.execute(cmd, shell=True)
    except (processutils.ProcessExecutionError, OSError) as ex:
        collectd.error("%s processutils exception:%s" % (PLUGIN, ex))
        return 0

    if err:
        collectd.error("%s processutils error:%s" % (PLUGIN, err))
        collectd.debug("%s Cmd:%s" % (PLUGIN, cmd))
        return 0

    # One state is printed per socket that matches the server, so more
    # than one line may come back. Any line reading 01 counts as
    # connected.
    if res and '01' in res.split():
        # connected state reads 01
        # Example log: Res:[01]

        # clear alarm if currently alarmed ; the clear is not debounced
        if obj.alarmed is True:
            clear_alarm()

        # Only log on state change
        if obj.usage != 1:
            logit = True

        obj.usage = 1
        conn = ''

    else:
        # res typically reads 02 when not connected
        # Example log: Res:[02]
        collectd.debug("%s Res:[%s] " % (PLUGIN, res.rstrip()))

        # raise alarm if
        #  - not already alarmed
        #  - the previous reading (obj.usage) was also 0 ; after a
        #    connected reading that takes 2 failures in a row
        if obj.alarmed is False and obj.usage == 0:
            raise_alarm()

        # only log on state change or on the first audit
        if obj.usage == 1 or obj.audits == 1:
            logit = True

        obj.usage = 0
        conn = 'not '

    if logit is True:
        collectd.info("%s is %sconnected [%s ipv%d %s:%s]" %
                      (PLUGIN, conn, protocol, ipv, address, port))
    obj.audits += 1

    # Dispatch usage value to collectd
    val = collectd.Values(host=obj.hostname)
    val.plugin = PLUGIN_NAME
    val.type = PLUGIN_TYPE
    val.type_instance = PLUGIN_TYPE_INSTANCE
    val.dispatch(values=[obj.usage])
    return 0


# register the config, init and read functions
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func, interval=PLUGIN_AUDIT_INTERVAL)