From 5142fac49806c8b823c50be52119c878841f0955 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Thu, 20 Sep 2018 14:21:32 -0400 Subject: [PATCH] Make collectd alarm notifier retry alarm clear attempts that fail A failed Fault Manager (FM) call by the Starling-X collectd alarm notification handler to clear an alarm can lead to a stuck alarm: if the FM request fails, say due to a concurrent swact operation, the clear is not retried. The alarm will remain stuck until there is another assertion of the same alarm, followed by a deassertion that leads to a successful clear. The fix is to execute a 'return' in the alarm clear failure path so that the alarm notifier's alarm manager control structure is not updated with the clear state; the clear is then automatically retried on the next audit interval. Change-Id: Iddf4e0e7b99eab0bf0748230a25851419e7c06fa Closes-Bug: 1793314 Signed-off-by: Eric MacDonald --- monitoring/collectd-extensions/centos/build_srpm.data | 2 +- monitoring/collectd-extensions/src/fm_notifier.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/monitoring/collectd-extensions/centos/build_srpm.data b/monitoring/collectd-extensions/centos/build_srpm.data index fc2b9df86..cb4885304 100644 --- a/monitoring/collectd-extensions/centos/build_srpm.data +++ b/monitoring/collectd-extensions/centos/build_srpm.data @@ -16,4 +16,4 @@ COPY_LIST="$PKG_BASE/src/LICENSE \ $PKG_BASE/src/example.py \ $PKG_BASE/src/example.conf" -TIS_PATCH_VER=1 +TIS_PATCH_VER=2 diff --git a/monitoring/collectd-extensions/src/fm_notifier.py b/monitoring/collectd-extensions/src/fm_notifier.py index e7710a670..058c9aeb7 100755 --- a/monitoring/collectd-extensions/src/fm_notifier.py +++ b/monitoring/collectd-extensions/src/fm_notifier.py @@ -1143,6 +1143,7 @@ def notifier_func(nObject): if api.clear_fault(base_obj.id, obj.entity_id) is False: collectd.error("%s %s:%s clear_fault failed" % (PLUGIN, base_obj.id, obj.entity_id)) + return 0 else: reason = obj.resource_name reason += " 
threshold exceeded"