From abaff6b27525aaa91df53319f84004640f75e6a3 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 18 Jan 2019 16:29:56 -0500 Subject: [PATCH] Remove alarm query before clear in NTP plugin Issue titled 'NTP 100.14 alarm is not cleared' exposed an issue where the NTP plugin alarm clear operation is circumvented when its precursor fm_api.get_fault call returns None if the fm process is not running. From the caller's point of view the None return suggests that the alarm to be cleared does not exist so the code skips the call to clear. This update works around this by simply issuing the clear without the query. Change-Id: Idcc05bb0e7e1aa1082af1e8ecdcb1a5463b19440 Closes-Bug: 1812440 Signed-off-by: Eric MacDonald --- .../centos/build_srpm.data | 2 +- monitoring/collectd-extensions/src/ntpq.py | 44 +++++++++---------- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/monitoring/collectd-extensions/centos/build_srpm.data b/monitoring/collectd-extensions/centos/build_srpm.data index e5b3c5046..82cafe8bb 100644 --- a/monitoring/collectd-extensions/centos/build_srpm.data +++ b/monitoring/collectd-extensions/centos/build_srpm.data @@ -16,4 +16,4 @@ COPY_LIST="$PKG_BASE/src/LICENSE \ $PKG_BASE/src/example.py \ $PKG_BASE/src/example.conf" -TIS_PATCH_VER=5 +TIS_PATCH_VER=6 diff --git a/monitoring/collectd-extensions/src/ntpq.py b/monitoring/collectd-extensions/src/ntpq.py index 7b6f343db..3f7964656 100755 --- a/monitoring/collectd-extensions/src/ntpq.py +++ b/monitoring/collectd-extensions/src/ntpq.py @@ -222,15 +222,14 @@ def _raise_alarm(ip=None): def _clear_base_alarm(): """ Clear the NTP base alarm """ - if api.get_fault(PLUGIN_ALARMID, obj.base_eid) is not None: - if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is False: - collectd.error("%s failed to clear alarm %s:%s" % - (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) - return True - else: - collectd.info("%s cleared alarm %s:%s" % - (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) - obj.alarm_raised = False + if 
api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is False: + collectd.error("%s failed to clear alarm %s:%s" % + (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) + return True + else: + collectd.info("%s cleared alarm %s:%s" % + (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) + obj.alarm_raised = False return False @@ -263,23 +262,20 @@ def _remove_ip_from_unreachable_list(ip): if ip and ip in obj.unreachable_servers: eid = obj.base_eid + '=' + ip collectd.debug("%s trying to clear alarm %s" % (PLUGIN, eid)) + # clear the alarm if its asserted - if api.get_fault(PLUGIN_ALARMID, eid) is not None: - if api.clear_fault(PLUGIN_ALARMID, eid) is True: - collectd.info("%s cleared %s:%s alarm" % - (PLUGIN, PLUGIN_ALARMID, eid)) - obj.unreachable_servers.remove(ip) - else: - # Handle clear failure by not removing the IP from the list. - # It will retry on next audit. - # Error should only occur if FM is not running at the time - # this get or clear is called - collectd.error("%s failed alarm clear %s:%s" % - (PLUGIN, PLUGIN_ALARMID, eid)) - return True - else: + if api.clear_fault(PLUGIN_ALARMID, eid) is True: + collectd.info("%s cleared %s:%s alarm" % + (PLUGIN, PLUGIN_ALARMID, eid)) obj.unreachable_servers.remove(ip) - collectd.info("%s alarm %s not raised" % (PLUGIN, eid)) + else: + # Handle clear failure by not removing the IP from the list. + # It will retry on next audit. + # Error should only occur if FM is not running at the time + # this get or clear is called + collectd.error("%s failed alarm clear %s:%s" % + (PLUGIN, PLUGIN_ALARMID, eid)) + return True return False