distcloud/distributedcloud/dcmanager/orchestrator/states/upgrade/migrating_data.py

186 lines
8.5 KiB
Python

#
# Copyright (c) 2020-2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import os
import time
from dccommon import consts as dccommon_consts
from dccommon.exceptions import PlaybookExecutionFailed
from dccommon.utils import run_playbook
from dcmanager.common import consts
from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
# Path of the ansible playbook run by the data migration command.
ANSIBLE_UPGRADE_PLAYBOOK = \
'/usr/share/ansible/stx-ansible/playbooks/upgrade_platform.yml'

# When an unlock occurs, a reboot is triggered. During the reboot, API calls
# fail. The max time allowed here is 30 minutes (i.e. 30 queries with a
# 1 minute sleep after each failed query).
DEFAULT_MAX_FAILED_QUERIES = 30
DEFAULT_FAILED_SLEEP = 60  # seconds

# After the reboot, the unlock needs to do post-reboot activities during
# which the API will succeed, but the expected states will not yet be set.
# The max time allowed here is 30 minutes (i.e. 30 queries with a 1 minute
# sleep after each query).
DEFAULT_MAX_API_QUERIES = 30
DEFAULT_API_SLEEP = 60  # seconds

# Sleep for 3 minutes (value is in seconds) after ansible completes.
DEFAULT_ANSIBLE_SLEEP = 180
def migrate_subcloud_data(migrate_command, log_file):
    """Run the data migration playbook command for a subcloud.

    :param migrate_command: command handed to run_playbook
    :param log_file: log file handed to run_playbook; its path is also
        quoted in the failure message
    :raises Exception: when run_playbook raises PlaybookExecutionFailed
    """
    try:
        run_playbook(log_file, migrate_command)
    except PlaybookExecutionFailed:
        # Replace the playbook failure with a message that tells the
        # operator where to look for the details.
        failure_template = ("Failed to migrate data, check individual "
                            "log at %s or run %s for details")
        raise Exception(failure_template
                        % (log_file, consts.ERROR_DESC_CMD))
class MigratingDataState(BaseState):
    """Upgrade step for migrating data"""

    def __init__(self, region_name):
        """Set the next state and the default wait/retry settings.

        :param region_name: forwarded unchanged to the BaseState constructor
        """
        super(MigratingDataState, self).__init__(
            next_state=consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_0,
            region_name=region_name)

        self.ansible_sleep = DEFAULT_ANSIBLE_SLEEP
        self.max_api_queries = DEFAULT_MAX_API_QUERIES
        self.api_sleep_duration = DEFAULT_API_SLEEP
        self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
        self.failed_sleep_duration = DEFAULT_FAILED_SLEEP
        # Longest uninterrupted sleep (in seconds) between two checks of
        # the stop request while this state is waiting.
        self.stop_poll_interval = 10

    def _sleep_unless_stopped(self, duration):
        """Sleep for up to 'duration' seconds, returning early on a stop.

        The wait is split into slices of at most self.stop_poll_interval
        seconds and self.stopped() is checked before each slice. This
        method never raises for a stop request; it only returns sooner, so
        the caller's next self.stopped() check raises as it did before.
        """
        remaining = duration
        while remaining > 0 and not self.stopped():
            time_slice = min(self.stop_poll_interval, remaining)
            time.sleep(time_slice)
            remaining -= time_slice

    def _update_deploy_status(self, strategy_step, deploy_status, **fields):
        """Store deploy_status (and any extra fields) for the subcloud."""
        db_api.subcloud_update(
            self.context, strategy_step.subcloud_id,
            deploy_status=deploy_status, **fields)

    def wait_for_unlock(self, strategy_step):
        """This method returns successfully when the unlock completes.

        The host is polled through the sysinv client until it is both
        unlocked and enabled. Queries that raise and queries that succeed
        without showing the expected states are counted separately, each
        with its own limit and sleep duration.

        :param strategy_step: the strategy step being processed
        :raises StrategyStoppedException: if a stop has been requested
        :raises Exception: if it does not recover on time; the subcloud
            deploy status is set to DEPLOY_STATE_DATA_MIGRATION_FAILED
            before raising
        """
        # This code is 'borrowed' from the unlock_host state
        # Allow separate durations for failures (ie: reboot) and api retries
        api_counter = 0
        fail_counter = 0
        # todo(abailey): only supports AIO-SX here
        target_hostname = 'controller-0'
        while True:
            # If event handler stop has been triggered, fail the state
            if self.stopped():
                raise StrategyStoppedException()
            try:
                # query the administrative state to see if it is the new
                # state.
                host = self.get_sysinv_client(
                    strategy_step.subcloud.name).get_host(target_hostname)
                if (host.administrative == consts.ADMIN_UNLOCKED and
                        host.operational == consts.OPERATIONAL_ENABLED):
                    # Success. Break out of the loop.
                    msg = "Host: %s is now: %s %s" % (target_hostname,
                                                      host.administrative,
                                                      host.operational)
                    self.info_log(strategy_step, msg)
                    break
                # no exception was raised so reset fail and auth checks
                fail_counter = 0
            except Exception:
                # Handle other exceptions due to being unreachable
                # for a significant period of time when there is a
                # controller swact, or in the case of AIO-SX,
                # when the controller reboots.
                fail_counter += 1
                if fail_counter >= self.max_failed_queries:
                    self._update_deploy_status(
                        strategy_step,
                        consts.DEPLOY_STATE_DATA_MIGRATION_FAILED)
                    raise Exception("Timeout waiting on reboot to complete")
                # Returns early on a stop request; the check at the top of
                # the loop then raises StrategyStoppedException.
                self._sleep_unless_stopped(self.failed_sleep_duration)
                # skip the api_counter
                continue
            # If the max counter is exceeded, raise a timeout exception
            api_counter += 1
            if api_counter >= self.max_api_queries:
                self._update_deploy_status(
                    strategy_step,
                    consts.DEPLOY_STATE_DATA_MIGRATION_FAILED)
                raise Exception("Timeout waiting for unlock to complete")
            self._sleep_unless_stopped(self.api_sleep_duration)

    def perform_state_action(self, strategy_step):
        """Migrate data for an upgrade on a subcloud

        Returns the next state in the state machine on success.
        Any exceptions raised by this method set the strategy to FAILED.

        :param strategy_step: the strategy step being processed
        :raises StrategyStoppedException: if a stop is requested while
            waiting for the unlock
        :raises Exception: if a previous migration was interrupted, the
            playbook fails, or the unlock does not complete in time
        """
        # To account for abrupt termination of dcmanager, check the last
        # known subcloud deploy status. If it is migrated/complete, advance
        # to the next stage. If it is 'migrating', fail the strategy. The
        # user will need to delete the existing strategy, create a new one
        # and apply. Pre-check will set the appropriate next step for this
        # subcloud.
        subcloud = db_api.subcloud_get(self.context,
                                       strategy_step.subcloud.id)
        if (subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED or
                subcloud.deploy_status == consts.DEPLOY_STATE_DONE):
            return self.next_state
        elif subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATING_DATA:
            self._update_deploy_status(
                strategy_step, consts.DEPLOY_STATE_DATA_MIGRATION_FAILED)
            raise Exception("Previous data migration was abruptly terminated. "
                            "Please try again with a new upgrade strategy.")

        # If it gets here, the subcloud deploy status must be 'installed'.
        self.info_log(strategy_step, "Start migrating data...")
        self._update_deploy_status(
            strategy_step, consts.DEPLOY_STATE_MIGRATING_DATA)

        ansible_subcloud_inventory_file = os.path.join(
            dccommon_consts.ANSIBLE_OVERRIDES_PATH,
            strategy_step.subcloud.name + consts.INVENTORY_FILE_POSTFIX)
        log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud.name) + \
            '_playbook_output.log'

        # Send skip_patching=true to prevent the playbook from applying any
        # patches present in the upgrade_data. All the required patches will
        # be included in the generated install iso.
        # NOTE(review): the temporary sysadmin password is embedded in the
        # command arguments - confirm run_playbook does not log or
        # otherwise expose them.
        data_migrating_cmd = [
            "ansible-playbook", ANSIBLE_UPGRADE_PLAYBOOK,
            "-i", ansible_subcloud_inventory_file, "-e",
            "ansible_ssh_pass=%s ansible_become_pass=%s skip_patching=true"
            % (consts.TEMP_SYSADMIN_PASSWORD, consts.TEMP_SYSADMIN_PASSWORD)]

        try:
            migrate_subcloud_data(data_migrating_cmd, log_file)
        except Exception as e:
            # Two error messages: one for subcloud error description and
            # logs and one for orchestrator strategy_step detail (shorter
            # than the previous).
            msg_subcloud = utils.find_ansible_error_msg(
                strategy_step.subcloud.name, log_file,
                consts.DEPLOY_STATE_MIGRATING_DATA)
            # Get script output in case it is available
            error_msg = utils.get_failure_msg(strategy_step.subcloud.name)
            failure = ('%s \n%s' % (error_msg, msg_subcloud))
            self._update_deploy_status(
                strategy_step, consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
                error_description=failure[0:consts.ERROR_DESCRIPTION_LENGTH])
            self.error_log(strategy_step, msg_subcloud)
            self.error_log(strategy_step, str(e))
            raise

        # Ansible invokes an unlock. Need to wait for the unlock to complete.
        # Wait for 3 minutes for mtc/scripts to shut down services. The wait
        # is sliced so that a stop request ends it early; wait_for_unlock
        # then raises StrategyStoppedException at its first stop check.
        self._sleep_unless_stopped(self.ansible_sleep)
        # wait up to 60 minutes for reboot to complete (with the default
        # settings: 30 minutes of failed queries plus 30 minutes of retries)
        self.wait_for_unlock(strategy_step)

        self._update_deploy_status(
            strategy_step, consts.DEPLOY_STATE_MIGRATED)

        self.info_log(strategy_step, "Data migration completed.")
        return self.next_state