Hold off dc audit and sync until activation is complete

Currently, DC audit and sync resume as soon as the subcloud becomes
unlocked-enabled-available after data migration, while the DC
orchestrator proceeds to the activating phase for the subcloud.

During upgrade activation, many platform activities occur at the
same time on the subcloud. To reduce load on the subcloud, DC
audit and sync are held off until upgrade activation is complete.
  Note: Platform service changes to reduce CPU contention during the
        activation phase are addressed separately.

There are 2 caveats as a result of this change:
  a) extended subcloud offline status (an alarm condition) and
  b) the user is not able to log in to the subcloud using the
     sysadmin password until the subcloud upgrade is complete.

Test Plan:
  - Verify successful simplex subcloud upgrade and no audit or
    sync activities take place until upgrade activation is complete.
  - Induce data migration failure and verify that the strategy
    advances to "upgrading simplex" step upon retry.
  - Induce a data migration timeout (i.e. subcloud takes an abnormal
    amount of time to unlock but succeeds eventually with or without
    manual intervention) and verify that the strategy advances to
    "activating upgrade" step following online checks upon retry.
  - Induce activation failure and verify that the strategy advances
    to "activating upgrade" step upon retry.
  - Induce completing upgrade failure and verify that the strategy
    advances to "completing upgrade" step upon retry.
  - Induce a data migration timeout in the first upgrade and an
    activation failure in the second attempt and verify that the
    strategy advances to "migrating data" step upon the third
    attempt.
  - Induce a failure right after successful install and verify
    that the strategy advances to "migrating data" step upon retry.
    This is a rare corner case.

Story: 2010798
Task: 48672
Depends-On: https://review.opendev.org/c/starlingx/ansible-playbooks/+/893004

Signed-off-by: Tee Ngo <tee.ngo@windriver.com>
Change-Id: Iafc8cea145c314d325fafd0b4b25076053f751ba
This commit is contained in:
Tee Ngo 2023-08-29 18:07:18 -04:00
parent 7073e525b9
commit 76e56e73d9
8 changed files with 86 additions and 29 deletions

View File

@ -121,7 +121,7 @@ class SubcloudAuditWorkerManager(manager.Manager):
consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
consts.DEPLOY_STATE_INSTALLING,
consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
consts.DEPLOY_STATE_MIGRATED,
consts.DEPLOY_STATE_UPGRADE_ACTIVATED,
consts.DEPLOY_STATE_RESTORING,
consts.DEPLOY_STATE_RESTORE_PREP_FAILED,
consts.DEPLOY_STATE_RESTORE_FAILED]

View File

@ -207,6 +207,7 @@ DEPLOY_STATE_CONFIG_ABORTED = 'config-aborted'
DEPLOY_STATE_MIGRATING_DATA = 'migrating-data'
DEPLOY_STATE_DATA_MIGRATION_FAILED = 'data-migration-failed'
DEPLOY_STATE_MIGRATED = 'migrated'
DEPLOY_STATE_UPGRADE_ACTIVATED = 'upgrade-activated'
DEPLOY_STATE_PRE_RESTORE = 'pre-restore'
DEPLOY_STATE_RESTORE_PREP_FAILED = 'restore-prep-failed'
DEPLOY_STATE_RESTORING = 'restoring'

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2023 Wind River Systems, Inc.
# Copyright (c) 2020-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -21,7 +21,7 @@ ACTIVATING_IN_PROGRESS_STATES = ['activating', 'activating-hosts', ]
# Max time: 60 minutes = 60 queries x 60 seconds sleep between queries
DEFAULT_MAX_QUERIES = 60
DEFAULT_SLEEP_DURATION = 60
MAX_FAILED_RETRIES = 10
MAX_FAILED_RETRIES = 3
class ActivatingUpgradeState(BaseState):
@ -122,6 +122,7 @@ class ActivatingUpgradeState(BaseState):
upgrade_state = self.get_upgrade_state(strategy_step)
if upgrade_state in ACTIVATING_RETRY_STATES:
# We failed. Better try again
time.sleep(self.sleep_duration * activate_retry_counter)
activate_retry_counter += 1
self.info_log(strategy_step,
"Activation failed, retrying... State=%s"
@ -159,4 +160,6 @@ class ActivatingUpgradeState(BaseState):
# When we return from this method without throwing an exception, the
# state machine can proceed to the next state
db_api.subcloud_update(self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_UPGRADE_ACTIVATED)
return self.next_state

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2022 Wind River Systems, Inc.
# Copyright (c) 2020-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -30,9 +30,6 @@ DEFAULT_FAILED_SLEEP = 60
DEFAULT_MAX_API_QUERIES = 30
DEFAULT_API_SLEEP = 60
# sleep for 3 minutes after ansible completes
DEFAULT_ANSIBLE_SLEEP = 180
def migrate_subcloud_data(migrate_command, log_file):
try:
@ -51,7 +48,6 @@ class MigratingDataState(BaseState):
super(MigratingDataState, self).__init__(
next_state=consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_0, region_name=region_name)
self.ansible_sleep = DEFAULT_ANSIBLE_SLEEP
self.max_api_queries = DEFAULT_MAX_API_QUERIES
self.api_sleep_duration = DEFAULT_API_SLEEP
self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
@ -170,10 +166,6 @@ class MigratingDataState(BaseState):
self.error_log(strategy_step, str(e))
raise
# Ansible invokes an unlock. Need to wait for the unlock to complete.
# Wait for 3 minutes for mtc/scripts to shut down services
# todo(abailey): split this into smaller sleeps to allow stopping early
time.sleep(self.ansible_sleep)
# wait up to 60 minutes for reboot to complete
self.wait_for_unlock(strategy_step)

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2022 Wind River Systems, Inc.
# Copyright (c) 2020-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -245,8 +245,9 @@ class PreCheckState(BaseState):
return self.next_state
# Skip subcloud online checks if the subcloud deploy status is
# "migrated".
if subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
# either "migrated" or "upgrade-activated".
if subcloud.deploy_status in [consts.DEPLOY_STATE_MIGRATED,
consts.DEPLOY_STATE_UPGRADE_ACTIVATED]:
self.info_log(strategy_step, "Online subcloud checks skipped.")
else:
self._perform_subcloud_online_checks(strategy_step,
@ -254,20 +255,30 @@ class PreCheckState(BaseState):
subcloud_fm_client,
host, upgrades)
if subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
# If the subcloud has completed data migration, advance directly
# to activating upgrade step.
self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
if subcloud.deploy_status == consts.DEPLOY_STATE_UPGRADE_ACTIVATED:
# If the subcloud has completed upgrade activation, advance directly
# to completing step.
self.override_next_state(consts.STRATEGY_STATE_COMPLETING_UPGRADE)
elif subcloud.deploy_status == consts.DEPLOY_STATE_DATA_MIGRATION_FAILED:
# If the subcloud deploy status is data-migration-failed but
# it is online and has passed subcloud online checks, it must have
# timed out while waiting for the subcloud to reboot previously and
# timed out while waiting for the subcloud to unlock previously and
# has successfully been unlocked since. Update the subcloud deploy
# status and advance to activating upgrade step.
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_MIGRATED)
self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
elif subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
# If the subcloud deploy status is migrated but it is online, it
# must have undergone 2 upgrade attempts:
# - in 1st upgrade attempt: strategy timed out while waiting
# for the subcloud to unlock
# - in 2nd upgrade attempt: the subcloud was unlocked successfully
# (with or without manual interventions) but failed to activate.
# Advance to activating upgrade step so activation can be retried
# after the manual intervention.
self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
else:
# Duplex case
if upgrades:

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020, 2022 Wind River Systems, Inc.
# Copyright (c) 2020-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -7,6 +7,7 @@ import itertools
import mock
from dcmanager.common import consts
from dcmanager.db.sqlalchemy import api as db_api
from dcmanager.orchestrator.states.upgrade import activating
from dcmanager.tests.unit.orchestrator.states.fakes import FakeUpgrade
@ -85,6 +86,11 @@ class TestSwUpgradeActivatingStage(TestSwUpgradeState):
# verify the API call was invoked
self.sysinv_client.upgrade_activate.assert_called()
# verify the DB update was invoked
updated_subcloud = db_api.subcloud_get(self.ctx,
self.subcloud.id)
self.assertEqual(updated_subcloud.deploy_status, consts.DEPLOY_STATE_UPGRADE_ACTIVATED)
# On success, the state should be updated to the next state
self.assert_step_updated(self.strategy_step.subcloud_id,
self.on_success_state)

View File

@ -30,8 +30,6 @@ CONTROLLER_0_UNLOCKED = \
"DEFAULT_API_SLEEP", 1)
@mock.patch("dcmanager.orchestrator.states.upgrade.migrating_data."
"DEFAULT_FAILED_SLEEP", 1)
@mock.patch("dcmanager.orchestrator.states.upgrade.migrating_data."
"DEFAULT_ANSIBLE_SLEEP", 3)
class TestSwUpgradeMigratingDataStage(TestSwUpgradeState):
def setUp(self):

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2022 Wind River Systems, Inc.
# Copyright (c) 2020-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -423,16 +423,16 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
class TestSwUpgradePreCheckSimplexStage(TestSwUpgradePreCheckStage):
def test_upgrade_pre_check_subcloud_online_migrated(self):
def test_upgrade_pre_check_subcloud_online_activated(self):
"""Test pre check step where the subcloud is online and running N+1 load
The pre-check in this scenario should advance directly to 'activating upgrade'.
The pre-check in this scenario should advance directly to 'completing upgrade'.
"""
# Update the subcloud to have deploy state as "upgrade-activated"
db_api.subcloud_update(self.ctx,
self.subcloud.id,
deploy_status=consts.DEPLOY_STATE_MIGRATED)
deploy_status=consts.DEPLOY_STATE_UPGRADE_ACTIVATED)
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
@ -443,9 +443,9 @@ class TestSwUpgradePreCheckSimplexStage(TestSwUpgradePreCheckStage):
# verify the get host filesystem API call was not invoked
self.sysinv_client.get_host_filesystem.assert_not_called()
# Verify the expected next state happened (activating upgrade)
# Verify the expected next state happened (completing upgrade)
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
consts.STRATEGY_STATE_COMPLETING_UPGRADE)
def test_upgrade_pre_check_subcloud_online_migrate_failed(self):
"""Test pre check step where the subcloud is online following an unlock timeout
@ -484,6 +484,30 @@ class TestSwUpgradePreCheckSimplexStage(TestSwUpgradePreCheckStage):
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
def test_upgrade_pre_check_subcloud_online_migrated(self):
"""Test pre check step where the subcloud is online following an activation failure
The pre-check in this scenario should advance directly to 'activating upgrade'.
"""
# Update the subcloud to have deploy state as "migrated"
db_api.subcloud_update(self.ctx,
self.subcloud.id,
deploy_status=consts.DEPLOY_STATE_MIGRATED)
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
# verify the get system health API call was not invoked
self.sysinv_client.get_system_health_upgrade.assert_not_called()
# verify the get host filesystem API call was not invoked
self.sysinv_client.get_host_filesystem.assert_not_called()
# Verify the expected next state happened (activating upgrade)
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
def test_upgrade_pre_check_subcloud_online_no_data_install(self):
"""Test pre check step where the subcloud is online without data install
@ -621,6 +645,28 @@ class TestSwUpgradePreCheckSimplexStage(TestSwUpgradePreCheckStage):
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_MIGRATING_DATA)
def test_upgrade_pre_check_subcloud_jumps_to_activating(self):
"""Test pre check step which jumps to activating upgrade state
The pre-check should transition in this scenario to activating upgrade
state if the subcloud is now offline, and the deploy status can be
handled by that state.
"""
# Update the subcloud to have deploy state as "migrated",
# and availability status as "offline"
db_api.subcloud_update(self.ctx,
self.subcloud.id,
deploy_status=consts.DEPLOY_STATE_MIGRATED,
availability_status=dccommon_consts.AVAILABILITY_OFFLINE)
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
# Verify the expected next state happened (activating upgrade)
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
def test_upgrade_pre_check_subcloud_jumps_to_upgrading(self):
"""Test pre check step which jumps to the upgrading state