Unlock-host orchestration: wait until the host is both unlocked and enabled.

Summary of this patch (four files under distributedcloud/dcmanager):

  * common/consts.py
      Adds OPERATIONAL_ENABLED / OPERATIONAL_DISABLED host constants.
  * manager/states/unlock_host.py
      The polling loop now succeeds only when the host reports
      administrative == ADMIN_UNLOCKED and operational == OPERATIONAL_ENABLED.
      Queries that raise are counted and slept on separately from queries
      that succeed but do not yet show the expected state. A single
      "Authorization failed" error is tolerated by re-creating the keystone
      and sysinv clients; a second consecutive one aborts the state.
      The constructor no longer accepts max_queries / sleep_duration and
      check_async_counter() is removed.
  * tests/unit/manager/states/upgrade/test_base.py
      FakeController gains an 'operational' attribute and takes its defaults
      from dcmanager.common.consts instead of sysinv constants.
  * tests/unit/manager/states/upgrade/test_unlock_controller.py
      Patches the four DEFAULT_* module constants of unlock_host to small
      values for the test class and checks the get_host call count against
      unlock_host.DEFAULT_MAX_API_QUERIES + 1.

diff --git a/distributedcloud/dcmanager/common/consts.py b/distributedcloud/dcmanager/common/consts.py
index 2beb1a3d3..d6455d7a6 100644
--- a/distributedcloud/dcmanager/common/consts.py
+++ b/distributedcloud/dcmanager/common/consts.py
@@ -43,6 +43,10 @@ AVAILABILITY_ONLINE = "online"
 ADMIN_LOCKED = 'locked'
 ADMIN_UNLOCKED = 'unlocked'
 
+# operational status for hosts
+OPERATIONAL_ENABLED = 'enabled'
+OPERATIONAL_DISABLED = 'disabled'
+
 # Subcloud sync status
 SYNC_STATUS_UNKNOWN = "unknown"
 SYNC_STATUS_IN_SYNC = "in-sync"
diff --git a/distributedcloud/dcmanager/manager/states/unlock_host.py b/distributedcloud/dcmanager/manager/states/unlock_host.py
index e8b99c7a5..b399fa477 100644
--- a/distributedcloud/dcmanager/manager/states/unlock_host.py
+++ b/distributedcloud/dcmanager/manager/states/unlock_host.py
@@ -6,29 +6,32 @@
 import time
 
 from dcmanager.common.consts import ADMIN_UNLOCKED
+from dcmanager.common.consts import OPERATIONAL_ENABLED
 from dcmanager.manager.states.base import BaseState
 
-DEFAULT_MAX_QUERIES = 6
-DEFAULT_SLEEP_DURATION = 10
+# When an unlock occurs, a reboot is triggered. During reboot, API calls fail.
+# The max time allowed here is 30 minutes (ie: 30 queries with 1 minute sleep)
+DEFAULT_MAX_FAILED_QUERIES = 30
+DEFAULT_FAILED_SLEEP = 60
+
+# Before and after reboot, the unlock needs to prepare for shutdown and
+# do post-reboot activities during which time the API will succeed, but the
+# expected states will not yet be set.
+# The max time allowed here is 30 minutes (ie: 30 queries with 1 minute sleep)
+DEFAULT_MAX_API_QUERIES = 30
+DEFAULT_API_SLEEP = 60
 
 
 class UnlockHostState(BaseState):
-    """Orchestration state for unlocking a host"""
+    """Orchestration state for unlocking a host."""
 
-    def __init__(self,
-                 hostname='controller-0',
-                 max_queries=DEFAULT_MAX_QUERIES,
-                 sleep_duration=DEFAULT_SLEEP_DURATION):
+    def __init__(self, hostname='controller-0'):
         super(UnlockHostState, self).__init__()
         self.target_hostname = hostname
-        # max time to wait (in seconds) is: sleep_duration * max_queries
-        self.sleep_duration = sleep_duration
-        self.max_queries = max_queries
-
-    def check_async_counter(self, counter):
-        if counter >= self.max_queries:
-            raise Exception("Timeout waiting for unlock to complete")
-        time.sleep(self.sleep_duration)
+        self.max_api_queries = DEFAULT_MAX_API_QUERIES
+        self.api_sleep_duration = DEFAULT_API_SLEEP
+        self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
+        self.failed_sleep_duration = DEFAULT_FAILED_SLEEP
 
     def perform_state_action(self, strategy_step):
         """Unlocks a host on the subcloud
@@ -57,19 +60,66 @@ class UnlockHostState(BaseState):
         if (response.ihost_action != 'unlock' or response.task != 'Unlocking'):
             raise Exception("Unable to unlock host %s" % self.target_hostname)
 
-        # this action is asynchronous, query until it completes or times out
-        async_counter = 0
+        # unlock triggers a reboot.
+        # must ignore certain errors until the system completes the reboot
+        # or a timeout occurs
+
+        # Allow separate durations for failures (ie: reboot) and api retries
+        api_counter = 0
+        fail_counter = 0
+        # Allow just one failed auth (token expired)
+        auth_failure = False
+
         while True:
-            # query the administrative state to see if it is the new state.
-            host = sysinv_client.get_host(self.target_hostname)
-            if host.administrative == ADMIN_UNLOCKED:
-                msg = "Host: %s is now: %s" % (self.target_hostname,
-                                               host.administrative)
-                self.info_log(strategy_step, msg)
-                break
-            async_counter += 1
-            # check_async_counter throws exception if loops exceeded or aborted
-            self.check_async_counter(async_counter)
+            try:
+                # query the administrative state to see if it is the new state.
+                host = sysinv_client.get_host(self.target_hostname)
+                if (host.administrative == ADMIN_UNLOCKED and
+                        host.operational == OPERATIONAL_ENABLED):
+                    # Success. Break out of the loop.
+                    msg = "Host: %s is now: %s %s" % (self.target_hostname,
+                                                      host.administrative,
+                                                      host.operational)
+                    self.info_log(strategy_step, msg)
+                    break
+                # no exception was raised so reset fail and auth checks
+                auth_failure = False
+                fail_counter = 0
+            except Exception as e:
+                # Exceptions only carry .message on Python 2; fall back to
+                # str(e) so the check also works on Python 3.
+                if getattr(e, "message", str(e)) == "Authorization failed":
+                    # Since a token could expire while waiting, generate
+                    # a new token (by re-creating the client) and re-try the
+                    # request, but only once.
+                    if not auth_failure:
+                        auth_failure = True
+                        self.info_log(strategy_step,
+                                      "Authorization failure. Retrying...")
+                        ks_client = self.get_keystone_client(
+                            strategy_step.subcloud.name)
+                        sysinv_client = self.get_sysinv_client(
+                            strategy_step.subcloud.name,
+                            ks_client.session)
+                        continue
+                    else:
+                        raise Exception("Repeated authorization failures.")
+                else:
+                    # Handle other exceptions due to being unreachable
+                    # for a significant period of time when there is a
+                    # controller swact, or in the case of AIO-SX,
+                    # when the controller reboots.
+                    fail_counter += 1
+                    if fail_counter >= self.max_failed_queries:
+                        raise Exception("Timeout waiting for reboot to complete")
+                    time.sleep(self.failed_sleep_duration)
+                    # skip the api_counter
+                    continue
+            # If the max counter is exceeded, raise a timeout exception
+            api_counter += 1
+            if api_counter >= self.max_api_queries:
+                raise Exception("Timeout waiting for unlock to complete")
+            time.sleep(self.api_sleep_duration)
 
         # If we are here, the loop broke out cleanly and the action succeeded
         # When we return from this method without throwing an exception, the
diff --git a/distributedcloud/dcmanager/tests/unit/manager/states/upgrade/test_base.py b/distributedcloud/dcmanager/tests/unit/manager/states/upgrade/test_base.py
index 31afac3ac..d3e9e6c36 100644
--- a/distributedcloud/dcmanager/tests/unit/manager/states/upgrade/test_base.py
+++ b/distributedcloud/dcmanager/tests/unit/manager/states/upgrade/test_base.py
@@ -6,8 +6,8 @@
 import mock
 import uuid
 
+from dcmanager.common import consts
 from dcmanager.manager.states.base import BaseState
-from sysinv.common import constants as sysinv_constants
 
 from dcmanager.tests.unit.manager.test_sw_upgrade import TestSwUpgrade
 
@@ -70,14 +70,16 @@ class FakeController(object):
     def __init__(self,
                  host_id=1,
                  hostname='controller-0',
-                 administrative=sysinv_constants.ADMIN_UNLOCKED,
-                 availability=sysinv_constants.AVAILABILITY_AVAILABLE,
+                 administrative=consts.ADMIN_UNLOCKED,
+                 operational=consts.OPERATIONAL_ENABLED,
+                 availability=consts.AVAILABILITY_ONLINE,
                  ihost_action=None,
                  target_load=UPGRADED_VERSION,
                  task=None):
         self.id = host_id
         self.hostname = hostname
         self.administrative = administrative
+        self.operational = operational
         self.availability = availability
         self.ihost_action = ihost_action
         self.target_load = target_load
diff --git a/distributedcloud/dcmanager/tests/unit/manager/states/upgrade/test_unlock_controller.py b/distributedcloud/dcmanager/tests/unit/manager/states/upgrade/test_unlock_controller.py
index 14db98ede..f7cd1c2dd 100644
--- a/distributedcloud/dcmanager/tests/unit/manager/states/upgrade/test_unlock_controller.py
+++ b/distributedcloud/dcmanager/tests/unit/manager/states/upgrade/test_unlock_controller.py
@@ -7,14 +7,16 @@ import itertools
 import mock
 
 from dcmanager.common import consts
-from dcmanager.manager.states.unlock_host import DEFAULT_MAX_QUERIES
+from dcmanager.manager.states import unlock_host
 from dcmanager.tests.unit.manager.states.upgrade.test_base \
     import FakeController
 from dcmanager.tests.unit.manager.states.upgrade.test_base \
     import TestSwUpgradeState
 
 
-CONTROLLER_0_UNLOCKED = FakeController(administrative=consts.ADMIN_UNLOCKED)
+CONTROLLER_0_UNLOCKED = \
+    FakeController(administrative=consts.ADMIN_UNLOCKED,
+                   operational=consts.OPERATIONAL_ENABLED)
 CONTROLLER_0_LOCKED = FakeController(administrative=consts.ADMIN_LOCKED)
 CONTROLLER_0_UNLOCKING = FakeController(administrative=consts.ADMIN_LOCKED,
                                         ihost_action='unlock',
@@ -25,6 +27,11 @@ CONTROLLER_0_UNLOCKING_FAILED = \
                    task='Swacting')
 
 
+@mock.patch("dcmanager.manager.states.unlock_host.DEFAULT_MAX_API_QUERIES", 3)
+@mock.patch("dcmanager.manager.states.unlock_host.DEFAULT_MAX_FAILED_QUERIES",
+            3)
+@mock.patch("dcmanager.manager.states.unlock_host.DEFAULT_API_SLEEP", 1)
+@mock.patch("dcmanager.manager.states.unlock_host.DEFAULT_FAILED_SLEEP", 1)
 class TestSwUpgradeUnlockControllerStage(TestSwUpgradeState):
 
     def setUp(self):
@@ -102,7 +109,7 @@ class TestSwUpgradeUnlockControllerStage(TestSwUpgradeState):
         self.sysinv_client.unlock_host.assert_called()
 
         # verify the query was invoked: 1 + max_attempts times
-        self.assertEqual(DEFAULT_MAX_QUERIES + 1,
+        self.assertEqual(unlock_host.DEFAULT_MAX_API_QUERIES + 1,
                          self.sysinv_client.get_host.call_count)
 
         # verify that state failed due to subcloud never finishing the unlock