From 19b3a3f385d2449595dcf1546dd948aa093ed89b Mon Sep 17 00:00:00 2001 From: albailey Date: Wed, 10 Mar 2021 13:16:40 -0600 Subject: [PATCH] Add retry capability to host unlock during upgrade During the unlock of a host as part of an upgrade, the unlock can be rejected. This change introduces a retry mechanism for the unlock. Allow up to 5 retries with 2 minutes between attempts. Partial-Bug: 1914836 Signed-off-by: albailey Change-Id: Ic121e1a993c80e2fae32806181342c1e5ea8e688 --- .../tests/test_sw_upgrade_strategy.py | 180 ++++++------- nfv/nfv-vim/nfv_vim/strategy/_strategy.py | 16 +- .../nfv_vim/strategy/_strategy_steps.py | 242 +++++++++++++----- 3 files changed, 275 insertions(+), 163 deletions(-) diff --git a/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_upgrade_strategy.py b/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_upgrade_strategy.py index 177913fe..b8c9a72d 100755 --- a/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_upgrade_strategy.py +++ b/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_upgrade_strategy.py @@ -22,10 +22,27 @@ from nfv_vim.nfvi.objects.v1 import UPGRADE_STATE from . 
import sw_update_testcase # noqa: H304 -@mock.patch('nfv_vim.event_log._instance._event_issue', sw_update_testcase.fake_event_issue) -@mock.patch('nfv_vim.objects._sw_update.SwUpdate.save', sw_update_testcase.fake_save) -@mock.patch('nfv_vim.objects._sw_update.timers.timers_create_timer', sw_update_testcase.fake_timer) -@mock.patch('nfv_vim.nfvi.nfvi_compute_plugin_disabled', sw_update_testcase.fake_nfvi_compute_plugin_disabled) +# utility method for the formatting of unlock-hosts stage as dict +# workers default to 5 retries with 120 second delay between attempts +# std controllers and storage have 0 retries +def _unlock_hosts_stage_as_dict(host_names, retry_count=5, retry_delay=120): + return { + 'name': 'unlock-hosts', + 'entity_names': host_names, + 'retry_count': retry_count, + 'retry_delay': retry_delay, + 'timeout': 1800, + } + + +@mock.patch('nfv_vim.event_log._instance._event_issue', + sw_update_testcase.fake_event_issue) +@mock.patch('nfv_vim.objects._sw_update.SwUpdate.save', + sw_update_testcase.fake_save) +@mock.patch('nfv_vim.objects._sw_update.timers.timers_create_timer', + sw_update_testcase.fake_timer) +@mock.patch('nfv_vim.nfvi.nfvi_compute_plugin_disabled', + sw_update_testcase.fake_nfvi_compute_plugin_disabled) class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): def create_sw_upgrade_strategy(self, @@ -76,9 +93,10 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): "test_instance_1", 'compute-1') - self.create_instance_group('instance_group_1', - ['test_instance_0', 'test_instance_1'], - [nfvi.objects.v1.INSTANCE_GROUP_POLICY.ANTI_AFFINITY]) + self.create_instance_group( + 'instance_group_1', + ['test_instance_0', 'test_instance_1'], + [nfvi.objects.v1.INSTANCE_GROUP_POLICY.ANTI_AFFINITY]) worker_hosts = [] for host in self._host_table.values(): @@ -162,8 +180,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-2', 'compute-3']}, {'name': 
'upgrade-hosts', 'entity_names': ['compute-2', 'compute-3']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-2', 'compute-3']}, + _unlock_hosts_stage_as_dict(['compute-2', 'compute-3']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -179,8 +196,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-0']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-0']}, + _unlock_hosts_stage_as_dict(['compute-0']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -196,8 +212,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-1']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-1']}, + _unlock_hosts_stage_as_dict(['compute-1']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -268,8 +283,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-1', 'compute-5']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-1', 'compute-5']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-1', 'compute-5']}, + _unlock_hosts_stage_as_dict(['compute-1', 'compute-5']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -287,8 +301,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-0', 'compute-2', 'compute-3']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-0', 'compute-2', 'compute-3']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-0', 'compute-2', 'compute-3']}, + _unlock_hosts_stage_as_dict( + ['compute-0', 'compute-2', 'compute-3']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -306,8 +320,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-4', 'compute-6', 'compute-7']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-4', 'compute-6', 'compute-7']}, - 
{'name': 'unlock-hosts', - 'entity_names': ['compute-4', 'compute-6', 'compute-7']}, + _unlock_hosts_stage_as_dict( + ['compute-4', 'compute-6', 'compute-7']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -324,8 +338,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-8', 'compute-9']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-8', 'compute-9']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-8', 'compute-9']}, + _unlock_hosts_stage_as_dict( + ['compute-8', 'compute-9']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -399,8 +413,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-0']}, + _unlock_hosts_stage_as_dict(['controller-0']), {'name': 'wait-data-sync', 'timeout': 14400} ] @@ -418,8 +431,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-1']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-1']}, + _unlock_hosts_stage_as_dict(['controller-1']), {'name': 'wait-data-sync', 'timeout': 14400} ] @@ -432,8 +444,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-1']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-1']}, + _unlock_hosts_stage_as_dict(['compute-1']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -451,8 +462,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-0', 'compute-2', 'compute-3']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-0', 'compute-2', 'compute-3']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-0', 'compute-2', 'compute-3']}, + _unlock_hosts_stage_as_dict( + 
['compute-0', 'compute-2', 'compute-3']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -468,8 +479,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-4']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-4']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-4']}, + _unlock_hosts_stage_as_dict(['compute-4']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -558,8 +568,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': stage_hosts[0]}, {'name': 'upgrade-hosts', 'entity_names': stage_hosts[0]}, - {'name': 'unlock-hosts', - 'entity_names': stage_hosts[0]}, + _unlock_hosts_stage_as_dict(stage_hosts[0]), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -580,8 +589,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': stage_hosts[x]}, {'name': 'upgrade-hosts', 'entity_names': stage_hosts[x]}, - {'name': 'unlock-hosts', - 'entity_names': stage_hosts[x]}, + _unlock_hosts_stage_as_dict(stage_hosts[x]), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -644,8 +652,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-2']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-2']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-2']}, + _unlock_hosts_stage_as_dict(['compute-2']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -658,8 +665,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-3']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-3']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-3']}, + _unlock_hosts_stage_as_dict(['compute-3']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -674,8 +680,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-0']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-0']}, - {'name': 
'unlock-hosts', - 'entity_names': ['compute-0']}, + _unlock_hosts_stage_as_dict(['compute-0']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -690,8 +695,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-1']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-1']}, + _unlock_hosts_stage_as_dict(['compute-1']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -745,8 +749,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-0']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-0']}, + _unlock_hosts_stage_as_dict(['compute-0']), {'name': 'system-stabilize'} ] }, @@ -758,8 +761,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-1']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-1']}, + _unlock_hosts_stage_as_dict(['compute-1']), {'name': 'system-stabilize'} ] }, @@ -771,8 +773,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-2']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-2']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-2']}, + _unlock_hosts_stage_as_dict(['compute-2']), {'name': 'system-stabilize'} ] }, @@ -784,8 +785,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-3']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-3']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-3']}, + _unlock_hosts_stage_as_dict(['compute-3']), {'name': 'system-stabilize'} ] } @@ -936,8 +936,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-0']}, {'name': 'upgrade-hosts', 'entity_names': ['storage-0']}, - {'name': 'unlock-hosts', - 
'entity_names': ['storage-0']}, + _unlock_hosts_stage_as_dict(['storage-0'], retry_count=0), {'name': 'wait-data-sync', 'timeout': 7200} ] @@ -950,8 +949,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-1', 'storage-2']}, {'name': 'upgrade-hosts', 'entity_names': ['storage-1', 'storage-2']}, - {'name': 'unlock-hosts', - 'entity_names': ['storage-1', 'storage-2']}, + _unlock_hosts_stage_as_dict(['storage-1', 'storage-2'], + retry_count=0), {'name': 'wait-data-sync', 'timeout': 7200} ] @@ -964,8 +963,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-3']}, {'name': 'upgrade-hosts', 'entity_names': ['storage-3']}, - {'name': 'unlock-hosts', - 'entity_names': ['storage-3']}, + _unlock_hosts_stage_as_dict(['storage-3'], retry_count=0), {'name': 'wait-data-sync', 'timeout': 7200} ] @@ -1022,8 +1020,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-0']}, {'name': 'upgrade-hosts', 'entity_names': ['storage-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['storage-0']}, + _unlock_hosts_stage_as_dict(['storage-0'], retry_count=0), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 7200} @@ -1037,8 +1034,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-1']}, {'name': 'upgrade-hosts', 'entity_names': ['storage-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['storage-1']}, + _unlock_hosts_stage_as_dict(['storage-1'], retry_count=0), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 7200} @@ -1052,8 +1048,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-2']}, {'name': 'upgrade-hosts', 'entity_names': ['storage-2']}, - {'name': 'unlock-hosts', - 'entity_names': ['storage-2']}, + _unlock_hosts_stage_as_dict(['storage-2'], 
retry_count=0), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 7200} @@ -1067,8 +1062,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-3']}, {'name': 'upgrade-hosts', 'entity_names': ['storage-3']}, - {'name': 'unlock-hosts', - 'entity_names': ['storage-3']}, + _unlock_hosts_stage_as_dict(['storage-3'], retry_count=0), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 7200} @@ -1116,8 +1110,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-0']}, + _unlock_hosts_stage_as_dict(['controller-0'], + retry_count=0), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 14400} @@ -1164,8 +1158,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-1']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-1']}, + _unlock_hosts_stage_as_dict(['controller-1'], + retry_count=0), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 14400} @@ -1181,8 +1175,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-0']}, + _unlock_hosts_stage_as_dict(['controller-0'], + retry_count=0), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 14400} @@ -1253,8 +1247,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-0']}, - {'name': 'unlock-hosts', - 
'entity_names': ['controller-0']}, + _unlock_hosts_stage_as_dict(['controller-0']), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 14400} @@ -1270,8 +1263,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-1']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-1']}, + _unlock_hosts_stage_as_dict(['controller-1']), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 14400} @@ -1372,8 +1364,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-1']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-1']}, + _unlock_hosts_stage_as_dict(['controller-1']), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 14400} @@ -1387,8 +1378,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-0']}, + _unlock_hosts_stage_as_dict(['controller-0']), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 14400} @@ -1402,8 +1392,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-0']}, {'name': 'upgrade-hosts', 'entity_names': ['storage-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['storage-0']}, + _unlock_hosts_stage_as_dict(['storage-0']), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 7200} @@ -1417,8 +1406,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-1']}, {'name': 'upgrade-hosts', 'entity_names': ['storage-1']}, - {'name': 'unlock-hosts', - 
'entity_names': ['storage-1']}, + _unlock_hosts_stage_as_dict(['storage-1']), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 7200} @@ -1432,8 +1420,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-1']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-1']}, + _unlock_hosts_stage_as_dict(['compute-1']), {'name': 'system-stabilize', 'timeout': 60} ] @@ -1448,8 +1435,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-0']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-0']}, + _unlock_hosts_stage_as_dict(['compute-0']), {'name': 'system-stabilize', 'timeout': 60} ] @@ -1516,8 +1502,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-1']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-1']}, + _unlock_hosts_stage_as_dict(['controller-1'], + retry_count=0), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 14400} @@ -1533,8 +1519,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, {'name': 'upgrade-hosts', 'entity_names': ['controller-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['controller-0']}, + _unlock_hosts_stage_as_dict(['controller-0'], + retry_count=0), {'name': 'wait-data-sync', 'ignore_alarms': ['900.005', '900.201', '750.006'], 'timeout': 14400} @@ -1548,8 +1534,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-1']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-1']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-1']}, + _unlock_hosts_stage_as_dict(['compute-1']), {'name': 
'wait-alarms-clear', 'timeout': 600} ] @@ -1564,8 +1549,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['compute-0']}, {'name': 'upgrade-hosts', 'entity_names': ['compute-0']}, - {'name': 'unlock-hosts', - 'entity_names': ['compute-0']}, + _unlock_hosts_stage_as_dict(['compute-0']), {'name': 'wait-alarms-clear', 'timeout': 600} ] @@ -1794,7 +1778,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): expected_results = { 'total_stages': 0, 'result': 'failed', - 'result_reason': 'all controller hosts must be unlocked-enabled-available' + 'result_reason': + 'all controller hosts must be unlocked-enabled-available' } sw_update_testcase.validate_phase(build_phase, expected_results) @@ -1840,7 +1825,8 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): expected_results = { 'total_stages': 0, 'result': 'failed', - 'result_reason': 'all worker hosts must be unlocked-enabled-available' + 'result_reason': + 'all worker hosts must be unlocked-enabled-available' } sw_update_testcase.validate_phase(build_phase, expected_results) diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py index 709d6f5b..adf9f5de 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py @@ -1321,6 +1321,7 @@ class SwUpgradeStrategy(SwUpdateStrategy): True, ignore_alarms=self._ignore_alarms)) stage.add_step(strategy.LockHostsStep(host_list)) stage.add_step(strategy.UpgradeHostsStep(host_list)) + # Note: standard controllers do not need the same retry as AIO stage.add_step(strategy.UnlockHostsStep(host_list)) # Allow up to four hours for controller disks to synchronize stage.add_step(strategy.WaitDataSyncStep( @@ -1341,6 +1342,7 @@ class SwUpgradeStrategy(SwUpdateStrategy): stage.add_step(strategy.SwactHostsStep(host_list)) stage.add_step(strategy.LockHostsStep(host_list)) 
stage.add_step(strategy.UpgradeHostsStep(host_list)) + # Note: standard controllers do not need the same retry as AIO stage.add_step(strategy.UnlockHostsStep(host_list)) # Allow up to four hours for controller disks to synchronize stage.add_step(strategy.WaitDataSyncStep( @@ -1390,7 +1392,9 @@ class SwUpgradeStrategy(SwUpdateStrategy): True, ignore_alarms=self._ignore_alarms)) stage.add_step(strategy.LockHostsStep(host_list)) stage.add_step(strategy.UpgradeHostsStep(host_list)) + # storage hosts do not need the same retry logic as AIO stage.add_step(strategy.UnlockHostsStep(host_list)) + # After storage node(s) are unlocked, we need extra time to # allow the OSDs to go back in sync and the storage related # alarms to clear. We no longer wipe the OSD disks when upgrading @@ -1440,7 +1444,11 @@ class SwUpgradeStrategy(SwUpdateStrategy): stage.add_step(strategy.SwactHostsStep(host_list)) stage.add_step(strategy.LockHostsStep(host_list)) stage.add_step(strategy.UpgradeHostsStep(host_list)) - stage.add_step(strategy.UnlockHostsStep(host_list)) + # During an upgrade, unlock may need to retry. Bug details: + # https://bugs.launchpad.net/starlingx/+bug/1914836 + stage.add_step(strategy.UnlockHostsStep( + host_list, + retry_count=strategy.UnlockHostsStep.MAX_RETRIES)) if HOST_PERSONALITY.CONTROLLER in host_list[0].personality: # AIO Controller hosts will undergo WaitDataSyncStep step # Allow up to four hours for controller disks to synchronize @@ -1487,7 +1495,11 @@ class SwUpgradeStrategy(SwUpdateStrategy): stage.add_step(strategy.SwactHostsStep(host_list)) stage.add_step(strategy.LockHostsStep(host_list)) stage.add_step(strategy.UpgradeHostsStep(host_list)) - stage.add_step(strategy.UnlockHostsStep(host_list)) + # During an upgrade, unlock may need to retry. 
Bug details: + # https://bugs.launchpad.net/starlingx/+bug/1914836 + stage.add_step(strategy.UnlockHostsStep( + host_list, + retry_count=strategy.UnlockHostsStep.MAX_RETRIES)) if HOST_PERSONALITY.CONTROLLER in host_list[0].personality: # AIO Controller hosts will undergo WaitDataSyncStep step # Allow up to four hours for controller disks to synchronize diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py index b01a4f28..e8c88624 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py @@ -68,13 +68,44 @@ class StrategyStepNames(Constants): STRATEGY_STEP_NAME = StrategyStepNames() -class UnlockHostsStep(strategy.StrategyStep): - """ - Unlock Hosts - Strategy Step - """ - def __init__(self, hosts): - super(UnlockHostsStep, self).__init__( - STRATEGY_STEP_NAME.UNLOCK_HOSTS, timeout_in_secs=1800) +class AbstractStrategyStep(strategy.StrategyStep): + """An abstract base class for strategy steps""" + + def __init__(self, step_name, timeout_in_secs): + super(AbstractStrategyStep, self).__init__( + step_name, + timeout_in_secs=timeout_in_secs) + + def from_dict(self, data): + """ + Returns the step object initialized using the given dictionary + """ + super(AbstractStrategyStep, self).from_dict(data) + return self + + def as_dict(self): + """ + Represent the step as a dictionary + """ + data = super(AbstractStrategyStep, self).as_dict() + # Next 3 lines are required for all strategy steps and may be + # overridden by subclass in some cases + data['entity_type'] = '' + data['entity_names'] = list() + data['entity_uuids'] = list() + return data + + +class AbstractHostsStrategyStep(AbstractStrategyStep): + """An abstract base class for strategy steps performed on list of hosts""" + + def __init__(self, + step_name, + hosts, + timeout_in_secs=1800): + super(AbstractHostsStrategyStep, self).__init__( + step_name, + timeout_in_secs=timeout_in_secs) self._hosts = 
hosts self._host_names = list() self._host_uuids = list() @@ -82,6 +113,108 @@ class UnlockHostsStep(strategy.StrategyStep): self._host_names.append(host.name) self._host_uuids.append(host.uuid) + def from_dict(self, data): + """ + Returns the step object initialized using the given dictionary + """ + super(AbstractHostsStrategyStep, self).from_dict(data) + self._hosts = list() + self._host_uuids = list() + self._host_names = data['entity_names'] + host_table = tables.tables_get_host_table() + for host_name in self._host_names: + host = host_table.get(host_name, None) + if host is not None: + self._hosts.append(host) + self._host_uuids.append(host.uuid) + return self + + def as_dict(self): + """ + Represent the step as a dictionary + """ + data = super(AbstractHostsStrategyStep, self).as_dict() + data['entity_type'] = 'hosts' + data['entity_names'] = self._host_names + data['entity_uuids'] = self._host_uuids + return data + + +class UnlockHostsStep(AbstractHostsStrategyStep): + """ + Unlock Hosts - Strategy Step + """ + + # During an upgrade, an unlock may need to be retried several times + # https://bugs.launchpad.net/starlingx/+bug/1914836 + MAX_RETRIES = 5 + RETRY_DELAY = 120 + + def __init__(self, hosts, retry_count=0, retry_delay=RETRY_DELAY): + """ + hosts - the list of hosts to be unlocked + retry_count - the number of times to retry per host if unlock fails + retry_delay - the number of seconds to delay before retrying unlock + """ + super(UnlockHostsStep, self).__init__(STRATEGY_STEP_NAME.UNLOCK_HOSTS, + hosts, + timeout_in_secs=1800) + # step_name, hosts, timeout are serialized by parent classes + # retry_count and retry_delay must be serialized in from_dict/as_dict + self._retry_count = retry_count + self._retry_delay = retry_delay + # Do not persist: _retries, _wait_time, _retry_requested + self._retries = dict() + for host_name in self._host_names: + self._retries[host_name] = retry_count + self._wait_time = 0 + self._retry_requested = False + def 
from_dict(self, data): + """ + Returns unlock hosts step object initialized using the given dictionary + """ + super(UnlockHostsStep, self).from_dict(data) + # deserialize retry_count and retry_delay + self._retry_count = data['retry_count'] + self._retry_delay = data['retry_delay'] + + # Do not deserialize _retries, _wait_time and _retry_requested + self._wait_time = 0 + self._retry_requested = False + self._retries = dict() + host_table = tables.tables_get_host_table() + for host_name in self._host_names: + host = host_table.get(host_name, None) + if host is not None: + self._retries[host_name] = self._retry_count + + return self + + def as_dict(self): + """ + Represent the unlock hosts step as a dictionary + """ + data = super(UnlockHostsStep, self).as_dict() + # serialize retry_count + data['retry_count'] = self._retry_count + # serialize retry_delay + data['retry_delay'] = self._retry_delay + # Do not serialize _retries, _wait_time and _retry_requested + return data + + def _get_hosts_to_retry(self): + hosts = [] + host_table = tables.tables_get_host_table() + for host_name in self._host_names: + host = host_table.get(host_name, None) + if host is None: + continue + if host.is_locked() and self._retries[host_name] > 0: + self._retries[host_name] = self._retries[host_name] - 1 + hosts.append(host_name) + return hosts + def _total_hosts_unlocked_enabled(self): """ Returns the number of hosts that are unlocked and enabled @@ -98,6 +231,16 @@ class UnlockHostsStep(strategy.StrategyStep): return total_hosts_enabled + def _trigger_retry(self, host_name): + DLOG.info("Step (%s) retry due to failure for (%s)." 
% (self._name, + host_name)) + # set the retry trigger + self._retry_requested = True + # reset the retry "wait" delay + self._wait_time = timers.get_monotonic_timestamp_in_ms() + # decrement the number of allowed retries for the validated host + self._retries[host_name] = self._retries[host_name] - 1 + def apply(self): """ Unlock all hosts @@ -122,9 +265,12 @@ class UnlockHostsStep(strategy.StrategyStep): """ Handle Host events """ + from nfv_vim import directors + DLOG.debug("Step (%s) handle event (%s)." % (self._name, event)) - if event in [STRATEGY_EVENT.HOST_STATE_CHANGED, STRATEGY_EVENT.HOST_AUDIT]: + if event in [STRATEGY_EVENT.HOST_STATE_CHANGED, + STRATEGY_EVENT.HOST_AUDIT]: total_hosts_enabled = self._total_hosts_unlocked_enabled() if -1 == total_hosts_enabled: @@ -137,41 +283,36 @@ class UnlockHostsStep(strategy.StrategyStep): self.stage.step_complete(result, '') return True + # See if we have requested a retry and are not currently retrying + if self._retry_requested: + now_ms = timers.get_monotonic_timestamp_in_ms() + secs_expired = (now_ms - self._wait_time) / 1000 + if self._retry_delay <= secs_expired: + self._retry_requested = False + # re-issue unlock for all hosts. 
+ # Hosts that are already unlocked or unlocking get skipped + host_director = directors.get_host_director() + operation = host_director.unlock_hosts(self._host_names) + if operation.is_failed(): + result = strategy.STRATEGY_STEP_RESULT.FAILED + self.stage.step_complete(result, "host unlock failed") + return True + elif event == STRATEGY_EVENT.HOST_UNLOCK_FAILED: host = event_data if host is not None and host.name in self._host_names: - result = strategy.STRATEGY_STEP_RESULT.FAILED - self.stage.step_complete(result, "host unlock failed") + if host.is_locked() and self._retries[host.name] > 0: + # if any unlock fails and we have retries, trigger it + # even if the last round of unlocks has not returned + self._trigger_retry(host.name) + else: + # if ANY unlock fails and we are out of retries, fail + result = strategy.STRATEGY_STEP_RESULT.FAILED + self.stage.step_complete(result, "host unlock failed") return True return False - def from_dict(self, data): - """ - Returns the unlock hosts step object initialized using the given dictionary - """ - super(UnlockHostsStep, self).from_dict(data) - self._hosts = list() - self._host_uuids = list() - self._host_names = data['entity_names'] - host_table = tables.tables_get_host_table() - for host_name in self._host_names: - host = host_table.get(host_name, None) - if host is not None: - self._hosts.append(host) - self._host_uuids.append(host.uuid) - return self - - def as_dict(self): - """ - Represent the unlock hosts step as a dictionary - """ - data = super(UnlockHostsStep, self).as_dict() - data['entity_type'] = 'hosts' - data['entity_names'] = self._host_names - data['entity_uuids'] = self._host_uuids - return data - class LockHostsStep(strategy.StrategyStep): """ @@ -2558,33 +2699,6 @@ class EnableHostServicesStep(strategy.StrategyStep): return data -class AbstractStrategyStep(strategy.StrategyStep): - - def __init__(self, step_name, timeout_in_secs): - super(AbstractStrategyStep, self).__init__( - step_name, - 
timeout_in_secs=timeout_in_secs) - - def from_dict(self, data): - """ - Returns the step object initialized using the given dictionary - """ - super(AbstractStrategyStep, self).from_dict(data) - return self - - def as_dict(self): - """ - Represent the step as a dictionary - """ - data = super(AbstractStrategyStep, self).as_dict() - # Next 3 lines are required for all strategy steps and may be - # overridden by subclass in some cases - data['entity_type'] = '' - data['entity_names'] = list() - data['entity_uuids'] = list() - return data - - class ApplySwPatchesStep(AbstractStrategyStep): """ Apply Patches using patch API @@ -3053,7 +3167,7 @@ class KubeUpgradeNetworkingStep(AbstractKubeUpgradeStep): class AbstractKubeHostUpgradeStep(AbstractKubeUpgradeStep): - """Kube Upgrade Host - Abtsract Strategy Step + """Kube Upgrade Host - Abstract Strategy Step This operation issues a host command, which updates the kube upgrade object """