sw-manager patch-strategy failed to install due to timeout

As part of this fix, the alarm-clear wait step
checks for the stale alarm 750.006 for 30 minutes. If the alarm
is still not cleared, patch-strategy ignores the alarm.
In the current case alarm 750.006 is not getting cleared,
so the patch-strategy times out.

Test Plan:
PASSED: Applying a patch - On a DX system,
create and apply a patch strategy while
fm alarm-list has an uncleared alarm (for test purposes,
100.103 - Memory threshold alarm was used). After 30 minutes the
alarm was ignored and the patch strategy was successfully applied.
PASSED: Removing a patch - On a DX system,
create and apply a patch strategy while
fm alarm-list has an uncleared alarm (for test purposes,
100.103 - Memory threshold alarm was used). After 30 minutes the
alarm was ignored and the patch strategy was successfully applied.
TODO: test with the actual alarm 750.006 in the
lab setup.

Closes-Bug: 2059305
Change-Id: I7ebaf5a24fa45a7e45f3af7e5ca588ce3ee06156
Signed-off-by: Vanathi.Selvaraju <vanathi.selvaraju@windriver.com>
This commit is contained in:
Vanathi.Selvaraju 2024-03-27 15:53:55 -04:00 committed by Vanathi Selvaraju
parent 791463f67f
commit 1d2e429800
2 changed files with 55 additions and 11 deletions

View File

@ -992,8 +992,9 @@ class UpdateControllerHostsMixin(object):
# OSDs configured, but the alarms should clear quickly in
# that case so this will not delay the update strategy.
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=30 * 60,
ignore_alarms=self._ignore_alarms))
timeout_in_secs=40 * 60,
ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
else:
# Less time required if host is not rebooting
stage.add_step(strategy.SystemStabilizeStep(
@ -1004,7 +1005,8 @@ class UpdateControllerHostsMixin(object):
host_list = [local_host]
stage = strategy.StrategyStage(strategy_stage_name)
stage.add_step(strategy.QueryAlarmsStep(
True, ignore_alarms=self._ignore_alarms))
True, ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
if reboot:
stage.add_step(strategy.SwactHostsStep(host_list))
stage.add_step(strategy.LockHostsStep(host_list))
@ -1025,8 +1027,9 @@ class UpdateControllerHostsMixin(object):
# OSDs configured, but the alarms should clear quickly in
# that case so this will not delay the update strategy.
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=30 * 60,
ignore_alarms=self._ignore_alarms))
timeout_in_secs=40 * 60,
ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
else:
# Less time required if host is not rebooting
stage.add_step(strategy.SystemStabilizeStep(
@ -1297,8 +1300,9 @@ class UpdateWorkerHostsMixin(object):
for host in hosts_to_lock + hosts_to_reboot]):
# Multiple personality nodes that need to wait for OSDs to sync:
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=30 * 60,
ignore_alarms=self._ignore_alarms))
timeout_in_secs=40 * 60,
ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
else:
if any([host.openstack_control or host.openstack_compute
for host in hosts_to_lock + hosts_to_reboot]):
@ -1393,9 +1397,13 @@ class SwPatchStrategy(SwUpdateStrategy,
'100.119', # PTP alarm for SyncE
'900.701', # Node tainted
]
IGNORE_ALARMS_CONDITIONAL = {'100.103': 1800}
self._ignore_alarms += IGNORE_ALARMS
self._single_controller = single_controller
# This is only for patch strategy to ignore 750.006 alarm when it becomes stale
self._ignore_alarms_conditional = IGNORE_ALARMS_CONDITIONAL
# initialize the variables required by the mixins
# ie: self._nfvi_sw_patches, self._nfvi_sw_patch_hosts
self.initialize_mixin()
@ -1409,7 +1417,8 @@ class SwPatchStrategy(SwUpdateStrategy,
stage = strategy.StrategyStage(
strategy.STRATEGY_STAGE_NAME.SW_PATCH_QUERY)
stage.add_step(
strategy.QueryAlarmsStep(ignore_alarms=self._ignore_alarms))
strategy.QueryAlarmsStep(ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
stage.add_step(strategy.QuerySwPatchesStep())
stage.add_step(strategy.QuerySwPatchHostsStep())
self.build_phase.add_stage(stage)

View File

@ -1912,13 +1912,17 @@ class QueryAlarmsStep(strategy.StrategyStep):
"""
Query Alarms - Strategy Step
"""
def __init__(self, fail_on_alarms=False, ignore_alarms=None):
def __init__(self, fail_on_alarms=False, ignore_alarms=None, ignore_alarms_conditional=None):
super(QueryAlarmsStep, self).__init__(
STRATEGY_STEP_NAME.QUERY_ALARMS, timeout_in_secs=60)
if ignore_alarms is None:
ignore_alarms = []
self._fail_on_alarms = fail_on_alarms
self._ignore_alarms = ignore_alarms
# For ignoring 750.006 alarm for patch strategy
if ignore_alarms_conditional is None:
ignore_alarms_conditional = []
self._ignore_alarms_conditional = ignore_alarms_conditional
@coroutine
def _query_alarms_callback(self, fm_service):
@ -1940,7 +1944,8 @@ class QueryAlarmsStep(strategy.StrategyStep):
"%s - uuid %s due to relaxed alarm "
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif nfvi_alarm.alarm_id not in self._ignore_alarms:
elif (nfvi_alarm.alarm_id not in self._ignore_alarms and
nfvi_alarm.alarm_id not in self._ignore_alarms_conditional):
DLOG.warn("Alarm: %s" % nfvi_alarm.alarm_id)
nfvi_alarms.append(nfvi_alarm)
else:
@ -2106,7 +2111,8 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
"""
Alarm Wait - Strategy Step
"""
def __init__(self, timeout_in_secs=300, first_query_delay_in_secs=60, ignore_alarms=None):
def __init__(self, timeout_in_secs=300, first_query_delay_in_secs=60, ignore_alarms=None,
ignore_alarms_conditional=None):
super(WaitAlarmsClearStep, self).__init__(
STRATEGY_STEP_NAME.WAIT_ALARMS_CLEAR, timeout_in_secs=timeout_in_secs)
self._first_query_delay_in_secs = first_query_delay_in_secs
@ -2115,12 +2121,17 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
self._ignore_alarms = ignore_alarms
self._wait_time = 0
self._query_inprogress = False
if ignore_alarms_conditional is None:
ignore_alarms_conditional = {}
self._ignore_alarms_conditional = ignore_alarms_conditional
@coroutine
def _query_alarms_callback(self):
"""
Query Alarms Callback
"""
from datetime import datetime
response = (yield)
DLOG.debug("Query-Alarms callback response=%s." % response)
@ -2138,6 +2149,25 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif nfvi_alarm.alarm_id not in self._ignore_alarms:
# For ignoring 750.006 alarm for patch strategy
if nfvi_alarm.alarm_id in self._ignore_alarms_conditional:
format_string = "%Y-%m-%dT%H:%M:%S.%f"
alarm_timestamp = nfvi_alarm.timestamp
alarm_timestamp_obj = datetime.strptime(
alarm_timestamp, format_string)
current_time = datetime.now()
time_in_sec = (
current_time - alarm_timestamp_obj).total_seconds()
# Ignore 750.006 alarm if present for more than 30 mins(1800s)
if self._ignore_alarms_conditional[nfvi_alarm.alarm_id] < int(time_in_sec):
self._ignore_alarms.append(
list(self._ignore_alarms_conditional.keys())[0])
else:
nfvi_alarms.append(nfvi_alarm)
else:
nfvi_alarms.append(nfvi_alarm)
nfvi_alarms.append(nfvi_alarm)
else:
DLOG.debug("Ignoring alarm %s - uuid %s" %
@ -2145,6 +2175,9 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
self.strategy.nfvi_alarms = nfvi_alarms
if self.strategy.nfvi_alarms:
for alarm in self.strategy.nfvi_alarms:
if alarm['alarm_id'] == list(self._ignore_alarms_conditional.keys())[0]:
self.strategy.nfvi_alarms.remove(alarm)
# Keep waiting for alarms to clear
pass
else:
@ -2193,6 +2226,7 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
super(WaitAlarmsClearStep, self).from_dict(data)
self._first_query_delay_in_secs = data['first_query_delay_in_secs']
self._ignore_alarms = data['ignore_alarms']
self._ignore_alarms_conditional = data['ignore_alarms_conditional']
self._wait_time = 0
self._query_inprogress = False
return self
@ -2207,6 +2241,7 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
data['entity_uuids'] = list()
data['first_query_delay_in_secs'] = self._first_query_delay_in_secs
data['ignore_alarms'] = self._ignore_alarms
data['ignore_alarms_conditional'] = self._ignore_alarms_conditional
return data