From a3a20fcf59174c8891d22bd396c3a589e9987172 Mon Sep 17 00:00:00 2001
From: rummadis
Date: Fri, 22 Mar 2024 03:14:27 -0400
Subject: [PATCH] Prune stale backup in progress alarm 210.001

A user is unable to take a subcloud backup while a stale backup in
progress alarm is present.

Example:
When a user tries to take a subcloud backup in a Distributed Cloud
environment and a stale 210.001 alarm is present in the subcloud, the
user cannot trigger the subsequent subcloud backup.

This fix identifies the 210.001 alarms and clears them if they have
been pending for more than twice the alarm audit interval (1 hour with
the default interval).

TEST PLAN:
PASS: DC-libvirt setup with 2 controllers and 2 subclouds
PASS: verified stale 210.001 getting removed

Closes-Bug: 2058516

Change-Id: Iedcc5e41cd4245c538d331d9aa8c2b6cc445acce
Signed-off-by: rummadis
---
 .../sysinv/sysinv/sysinv/conductor/manager.py | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
index 0257166089..1c1087576e 100644
--- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
@@ -186,6 +186,7 @@ audit_intervals_opts = [
     cfg.IntOpt('kube_upgrade_states', default=1800),
     cfg.IntOpt('prune_runtime_config', default=43200),
     cfg.IntOpt('k8s_cluster_health', default=180),
+    cfg.IntOpt('alarm_audit_interval', default=1800),
 ]
 
 app_framework_opts = [
@@ -19140,6 +19141,49 @@ class ConductorManager(service.PeriodicService):
     def _audit_prune_runtime_config(self):
         self._prune_runtime_config_table()
 
+    def _prune_stale_backup_alarms(self, context):
+        """Clear stale backup-in-progress (210.001) alarms.
+
+        A stale alarm blocks any new subcloud backup, so an alarm is
+        cleared once it is older than twice the 'alarm_audit_interval'
+        periodic task interval (1 hour with the default of 1800 seconds).
+
+        :param context: context forwarded by the periodic audit; unused here.
+        """
+        backup_alarms = self.fm_api.get_faults_by_id(
+            fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS)
+        if not backup_alarms:
+            return
+
+        # Hoisted: neither value changes while the alarms are examined.
+        max_age_secs = (
+            2 * CONF.conductor_periodic_task_intervals.alarm_audit_interval)
+        now = datetime.utcnow()
+        for alarm in backup_alarms:
+            try:
+                alarm_ts = datetime.strptime(
+                    alarm.timestamp, "%Y-%m-%d %H:%M:%S.%f")
+            except (TypeError, ValueError):
+                # Skip only this alarm so that one unparsable timestamp
+                # does not abort the audit of the remaining alarms.
+                LOG.warning("Skipping backup alarm alarm_id = %s with "
+                            "unparsable timestamp %s",
+                            alarm.alarm_id, alarm.timestamp)
+                continue
+            if (now - alarm_ts).total_seconds() > max_age_secs:
+                LOG.info("Pruning stale backup alarm alarm_id = %s",
+                         alarm.alarm_id)
+                self.fm_api.clear_fault(
+                    fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
+                    alarm.entity_instance_id)
+
+    @periodic_task.periodic_task(
+        spacing=CONF.conductor_periodic_task_intervals.alarm_audit_interval
+    )
+    def _audit_prune_stale_backup_alarms(self, context):
+        """Periodically prune stale backup-in-progress alarms."""
+        self._prune_stale_backup_alarms(context)
+
 
 def device_image_state_sort_key(dev_img_state):
     if dev_img_state.bitstream_type == dconstants.BITSTREAM_TYPE_ROOT_KEY: