sw-manager patch-strategy failed to install due to timeout

As part of this fix, a new parameter ignore_alarms_conditional
is added, which holds the stale alarms that need to be
ignored after 30mins.
The alarm clear wait step checks for the stale alarm 750.006
for 30mins. If the alarm is still not cleared, the
patch-strategy ignores the alarm.
Now, since the stale alarms are monitored for 30mins,
the overall alarm clear timeout is increased to 2400sec.

In the current case, alarm 750.006 is not getting cleared
and it is also not part of the ignore alarm list,
so the patch-strategy times out.

Test Plan:
PASSED: Applying a patch - On DX system(VM),
Create and apply patch strategy,
fm alarm-list to have an uncleared alarm(for test purpose
100.103 - Memory threshold alarm was used). After 30mins
alarm was ignored and patch strategy successfully applied.
PASSED: Removing a patch - On DX system(VM),
Create and apply patch strategy,
fm alarm-list to have an uncleared alarm(for test purpose
100.103 - Memory threshold alarm was used). After 1800sec
alarm was ignored and patch strategy successfully applied.
PASSED: On DX system(lab), 4 consecutive patch orchestration
successfully applied. 750.006 - stale alarm tested.
PASSED: On DX system, create and apply strategy
with an alarm existing on the system (not part of the ignore
list); the strategy waits for 2400sec before timing out.
PASSED: On DX system, k8s upgrade from v1.21.8 to
v1.22.5 successfully executed.

Closes-Bug: 2059305
Change-Id: I7ebaf5a24fa45a7e45f3af7e5ca588ce3ee06156
Signed-off-by: Vanathi.Selvaraju <vanathi.selvaraju@windriver.com>
This commit is contained in:
Vanathi.Selvaraju 2024-03-27 15:53:55 -04:00 committed by Vanathi Selvaraju
parent 8dfb971980
commit eca1a05b83
4 changed files with 103 additions and 46 deletions

View File

@ -2990,7 +2990,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{'name': 'sw-patch-controllers',
@ -3008,7 +3008,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
]
@ -3100,7 +3100,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{'name': 'sw-patch-controllers',
@ -3118,7 +3118,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
]
@ -3224,7 +3224,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{'name': 'sw-patch-worker-hosts',
@ -3246,7 +3246,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_1']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
]
@ -3349,7 +3349,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3371,7 +3371,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_1']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
]
@ -3426,7 +3426,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3444,7 +3444,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
]
@ -3499,7 +3499,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3517,7 +3517,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
]
@ -3594,7 +3594,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{'name': 'sw-patch-worker-hosts',
@ -3616,7 +3616,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_1']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3758,7 +3758,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3780,7 +3780,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_1']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3878,7 +3878,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
{'name': 'sw-patch-worker-hosts',
@ -3896,7 +3896,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
{'name': 'sw-patch-worker-hosts',
@ -4014,7 +4014,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
]
@ -4072,7 +4072,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-instances',
'entity_names': ['test_instance_0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
]
@ -4123,7 +4123,7 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800}
'timeout': 2400}
]
},
]

View File

@ -136,7 +136,7 @@ class TestSystemConfigUpdateStrategy(sw_update_testcase.SwUpdateStrategyTestCase
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
]
@ -188,7 +188,7 @@ class TestSystemConfigUpdateStrategy(sw_update_testcase.SwUpdateStrategyTestCase
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{
@ -207,7 +207,7 @@ class TestSystemConfigUpdateStrategy(sw_update_testcase.SwUpdateStrategyTestCase
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
]
@ -276,7 +276,7 @@ class TestSystemConfigUpdateStrategy(sw_update_testcase.SwUpdateStrategyTestCase
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{
@ -295,7 +295,7 @@ class TestSystemConfigUpdateStrategy(sw_update_testcase.SwUpdateStrategyTestCase
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{
@ -433,7 +433,7 @@ class TestSystemConfigUpdateStrategy(sw_update_testcase.SwUpdateStrategyTestCase
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{
@ -452,7 +452,7 @@ class TestSystemConfigUpdateStrategy(sw_update_testcase.SwUpdateStrategyTestCase
{'name': 'unlock-hosts',
'entity_names': ['controller-1']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{
@ -573,7 +573,7 @@ class TestSystemConfigUpdateStrategy(sw_update_testcase.SwUpdateStrategyTestCase
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{
@ -763,7 +763,7 @@ class TestSystemConfigUpdateStrategy(sw_update_testcase.SwUpdateStrategyTestCase
{'name': 'unlock-hosts',
'entity_names': ['controller-0']},
{'name': 'wait-alarms-clear',
'timeout': 1800},
'timeout': 2400},
]
},
{

View File

@ -52,6 +52,7 @@ NO_REBOOT_DELAY = 30
# constants used by the patching API for state and repo state
PATCH_REPO_STATE_APPLIED = 'Applied'
PATCH_STATE_APPLIED = 'Applied'
WAIT_ALARM_TIMEOUT = 2400
###################################################################
@ -971,7 +972,8 @@ class UpdateControllerHostsMixin(object):
host_list = [host]
stage = strategy.StrategyStage(strategy_stage_name)
stage.add_step(strategy.QueryAlarmsStep(
True, ignore_alarms=self._ignore_alarms))
True, ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
if reboot:
stage.add_step(strategy.SwactHostsStep(host_list))
stage.add_step(strategy.LockHostsStep(host_list))
@ -992,8 +994,9 @@ class UpdateControllerHostsMixin(object):
# OSDs configured, but the alarms should clear quickly in
# that case so this will not delay the update strategy.
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=30 * 60,
ignore_alarms=self._ignore_alarms))
timeout_in_secs=WAIT_ALARM_TIMEOUT,
ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
else:
# Less time required if host is not rebooting
stage.add_step(strategy.SystemStabilizeStep(
@ -1004,7 +1007,8 @@ class UpdateControllerHostsMixin(object):
host_list = [local_host]
stage = strategy.StrategyStage(strategy_stage_name)
stage.add_step(strategy.QueryAlarmsStep(
True, ignore_alarms=self._ignore_alarms))
True, ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
if reboot:
stage.add_step(strategy.SwactHostsStep(host_list))
stage.add_step(strategy.LockHostsStep(host_list))
@ -1025,8 +1029,9 @@ class UpdateControllerHostsMixin(object):
# OSDs configured, but the alarms should clear quickly in
# that case so this will not delay the update strategy.
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=30 * 60,
ignore_alarms=self._ignore_alarms))
timeout_in_secs=WAIT_ALARM_TIMEOUT,
ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
else:
# Less time required if host is not rebooting
stage.add_step(strategy.SystemStabilizeStep(
@ -1105,7 +1110,8 @@ class UpdateStorageHostsMixin(object):
for host_list in host_lists:
stage = strategy.StrategyStage(strategy_stage_name)
stage.add_step(strategy.QueryAlarmsStep(
True, ignore_alarms=self._ignore_alarms))
True, ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
if reboot:
stage.add_step(strategy.LockHostsStep(host_list))
# Add the action step for these hosts (patch, etc..)
@ -1227,7 +1233,8 @@ class UpdateWorkerHostsMixin(object):
stage = strategy.StrategyStage(strategy_stage_name)
stage.add_step(strategy.QueryAlarmsStep(
True, ignore_alarms=self._ignore_alarms))
True, ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
if reboot:
if 1 == len(host_list):
@ -1297,8 +1304,9 @@ class UpdateWorkerHostsMixin(object):
for host in hosts_to_lock + hosts_to_reboot]):
# Multiple personality nodes that need to wait for OSDs to sync:
stage.add_step(strategy.WaitAlarmsClearStep(
timeout_in_secs=30 * 60,
ignore_alarms=self._ignore_alarms))
timeout_in_secs=WAIT_ALARM_TIMEOUT,
ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
else:
if any([host.openstack_control or host.openstack_compute
for host in hosts_to_lock + hosts_to_reboot]):
@ -1393,9 +1401,13 @@ class SwPatchStrategy(SwUpdateStrategy,
'100.119', # PTP alarm for SyncE
'900.701', # Node tainted
]
IGNORE_ALARMS_CONDITIONAL = {'750.006': 1800}
self._ignore_alarms += IGNORE_ALARMS
self._single_controller = single_controller
# This is to ignore the stale alarm(currently 750.006 is ignored).
self._ignore_alarms_conditional = IGNORE_ALARMS_CONDITIONAL
# initialize the variables required by the mixins
# ie: self._nfvi_sw_patches, self._nfvi_sw_patch_hosts
self.initialize_mixin()
@ -1409,7 +1421,8 @@ class SwPatchStrategy(SwUpdateStrategy,
stage = strategy.StrategyStage(
strategy.STRATEGY_STAGE_NAME.SW_PATCH_QUERY)
stage.add_step(
strategy.QueryAlarmsStep(ignore_alarms=self._ignore_alarms))
strategy.QueryAlarmsStep(ignore_alarms=self._ignore_alarms,
ignore_alarms_conditional=self._ignore_alarms_conditional))
stage.add_step(strategy.QuerySwPatchesStep())
stage.add_step(strategy.QuerySwPatchHostsStep())
self.build_phase.add_stage(stage)
@ -2370,7 +2383,7 @@ class SystemConfigUpdateStrategy(SwUpdateStrategy,
]
self._ignore_alarms += IGNORE_ALARMS
self._single_controller = single_controller
self._ignore_alarms_conditional = None
# initialize the variables required by the mixins
self.initialize_mixin()
@ -3325,7 +3338,7 @@ class KubeUpgradeStrategy(SwUpdateStrategy,
]
# self._ignore_alarms is declared in parent class
self._ignore_alarms += IGNORE_ALARMS
self._ignore_alarms_conditional = None
# to_version and single_controller MUST be serialized
self._to_version = to_version
self._single_controller = single_controller

View File

@ -1912,13 +1912,18 @@ class QueryAlarmsStep(strategy.StrategyStep):
"""
Query Alarms - Strategy Step
"""
def __init__(self, fail_on_alarms=False, ignore_alarms=None):
def __init__(self, fail_on_alarms=False, ignore_alarms=None, ignore_alarms_conditional=None):
super(QueryAlarmsStep, self).__init__(
STRATEGY_STEP_NAME.QUERY_ALARMS, timeout_in_secs=60)
if ignore_alarms is None:
ignore_alarms = []
self._fail_on_alarms = fail_on_alarms
self._ignore_alarms = ignore_alarms
# For ignoring stale alarm for the specified amount of time.
# Currently we are ignoring 750.006 alarm for patch strategy.
if ignore_alarms_conditional is None:
ignore_alarms_conditional = {}
self._ignore_alarms_conditional = ignore_alarms_conditional
@coroutine
def _query_alarms_callback(self, fm_service):
@ -1940,7 +1945,8 @@ class QueryAlarmsStep(strategy.StrategyStep):
"%s - uuid %s due to relaxed alarm "
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif nfvi_alarm.alarm_id not in self._ignore_alarms:
elif (nfvi_alarm.alarm_id not in self._ignore_alarms and
nfvi_alarm.alarm_id not in self._ignore_alarms_conditional):
DLOG.warn("Alarm: %s" % nfvi_alarm.alarm_id)
nfvi_alarms.append(nfvi_alarm)
else:
@ -1982,6 +1988,7 @@ class QueryAlarmsStep(strategy.StrategyStep):
super(QueryAlarmsStep, self).from_dict(data)
self._fail_on_alarms = data['fail_on_alarms']
self._ignore_alarms = data['ignore_alarms']
self._ignore_alarms_conditional = data['ignore_alarms_conditional']
return self
def as_dict(self):
@ -1994,6 +2001,7 @@ class QueryAlarmsStep(strategy.StrategyStep):
data['entity_uuids'] = list()
data['fail_on_alarms'] = self._fail_on_alarms
data['ignore_alarms'] = self._ignore_alarms
data['ignore_alarms_conditional'] = self._ignore_alarms_conditional
return data
@ -2106,7 +2114,8 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
"""
Alarm Wait - Strategy Step
"""
def __init__(self, timeout_in_secs=300, first_query_delay_in_secs=60, ignore_alarms=None):
def __init__(self, timeout_in_secs=300, first_query_delay_in_secs=60, ignore_alarms=None,
ignore_alarms_conditional=None):
super(WaitAlarmsClearStep, self).__init__(
STRATEGY_STEP_NAME.WAIT_ALARMS_CLEAR, timeout_in_secs=timeout_in_secs)
self._first_query_delay_in_secs = first_query_delay_in_secs
@ -2115,12 +2124,17 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
self._ignore_alarms = ignore_alarms
self._wait_time = 0
self._query_inprogress = False
if ignore_alarms_conditional is None:
ignore_alarms_conditional = {}
self._ignore_alarms_conditional = ignore_alarms_conditional
@coroutine
def _query_alarms_callback(self):
"""
Query Alarms Callback
"""
from datetime import datetime
response = (yield)
DLOG.debug("Query-Alarms callback response=%s." % response)
@ -2138,6 +2152,27 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif nfvi_alarm.alarm_id not in self._ignore_alarms:
# For ignoring stale alarm(currently 750.006)
if nfvi_alarm.alarm_id in self._ignore_alarms_conditional:
format_string = "%Y-%m-%dT%H:%M:%S.%f"
alarm_timestamp = nfvi_alarm.timestamp
alarm_timestamp_obj = datetime.strptime(
alarm_timestamp, format_string)
current_time = datetime.now()
time_in_sec = (
current_time - alarm_timestamp_obj).total_seconds()
# Ignoring stale alarm if present after specified amount of time
if self._ignore_alarms_conditional[nfvi_alarm.alarm_id] < int(time_in_sec):
# Appends stale alarm to list _ignore_alarms
# if specified timeout is reached.
self._ignore_alarms.append(nfvi_alarm.alarm_id)
else:
# Appends alarm to nfvi_alarms if, the specified
# timeout is not reached.
nfvi_alarms.append(nfvi_alarm)
else:
nfvi_alarms.append(nfvi_alarm)
nfvi_alarms.append(nfvi_alarm)
else:
DLOG.debug("Ignoring alarm %s - uuid %s" %
@ -2145,6 +2180,13 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
self.strategy.nfvi_alarms = nfvi_alarms
if self.strategy.nfvi_alarms:
ignore_alarm_list = list(self._ignore_alarms_conditional.keys())
for alarm in self.strategy.nfvi_alarms:
for remove_alarm in ignore_alarm_list:
if alarm['alarm_id'] == remove_alarm:
# Removes only the alarm which has
# not yet reached specified timeout.
self.strategy.nfvi_alarms.remove(alarm)
# Keep waiting for alarms to clear
pass
else:
@ -2193,6 +2235,7 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
super(WaitAlarmsClearStep, self).from_dict(data)
self._first_query_delay_in_secs = data['first_query_delay_in_secs']
self._ignore_alarms = data['ignore_alarms']
self._ignore_alarms_conditional = data['ignore_alarms_conditional']
self._wait_time = 0
self._query_inprogress = False
return self
@ -2207,6 +2250,7 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
data['entity_uuids'] = list()
data['first_query_delay_in_secs'] = self._first_query_delay_in_secs
data['ignore_alarms'] = self._ignore_alarms
data['ignore_alarms_conditional'] = self._ignore_alarms_conditional
return data