186 lines
8.5 KiB
Python
186 lines
8.5 KiB
Python
#
|
|
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
import os
|
|
import time
|
|
|
|
from dccommon import consts as dccommon_consts
|
|
from dccommon.exceptions import PlaybookExecutionFailed
|
|
from dccommon.utils import run_playbook
|
|
from dcmanager.common import consts
|
|
from dcmanager.common.exceptions import StrategyStoppedException
|
|
from dcmanager.common import utils
|
|
from dcmanager.db import api as db_api
|
|
from dcmanager.orchestrator.states.base import BaseState
|
|
|
|
|
|
# Ansible playbook invoked to migrate the subcloud data during an upgrade.
ANSIBLE_UPGRADE_PLAYBOOK = (
    '/usr/share/ansible/stx-ansible/playbooks/upgrade_platform.yml'
)

# An unlock triggers a reboot, and API calls fail while the host is rebooting.
# Up to 30 consecutive failed queries are tolerated, sleeping 60 seconds
# between them (about 30 minutes in total).
DEFAULT_MAX_FAILED_QUERIES = 30
DEFAULT_FAILED_SLEEP = 60

# Once the reboot is over, the unlock still performs post-reboot activities.
# During that time the API answers, but the expected states are not yet set.
# Up to 30 successful queries are allowed, sleeping 60 seconds between them
# (about 30 minutes in total).
DEFAULT_MAX_API_QUERIES = 30
DEFAULT_API_SLEEP = 60

# Time to sleep once ansible completes: 3 minutes, expressed in seconds.
DEFAULT_ANSIBLE_SLEEP = 180
|
|
|
|
|
|
def migrate_subcloud_data(migrate_command, log_file):
    """Run the data migration playbook command.

    :param migrate_command: the command handed to run_playbook
    :param log_file: path of the log file handed to run_playbook; it is also
        quoted in the error message on failure
    :raises Exception: when run_playbook raises PlaybookExecutionFailed. The
        message points at the log file and at consts.ERROR_DESC_CMD.
    """
    try:
        run_playbook(log_file, migrate_command)
    except PlaybookExecutionFailed:
        failure_template = ("Failed to migrate data, check individual "
                            "log at %s or run %s for details")
        failure_detail = failure_template % (log_file, consts.ERROR_DESC_CMD)
        raise Exception(failure_detail)
|
|
|
|
|
|
class MigratingDataState(BaseState):
    """Upgrade step for migrating data

    Runs the upgrade playbook against the subcloud, waits for the unlock that
    ansible invokes to complete, and records the outcome in the subcloud
    deploy_status.
    """

    # Longest single sleep (in seconds) between two checks for a stop request
    # while waiting out the post-ansible delay.
    stop_check_interval = 10

    def __init__(self, region_name):
        super(MigratingDataState, self).__init__(
            next_state=consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_0,
            region_name=region_name)

        # Timing knobs are kept on the instance so they can be overridden.
        self.ansible_sleep = DEFAULT_ANSIBLE_SLEEP
        self.max_api_queries = DEFAULT_MAX_API_QUERIES
        self.api_sleep_duration = DEFAULT_API_SLEEP
        self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
        self.failed_sleep_duration = DEFAULT_FAILED_SLEEP

    def _set_migration_failed(self, strategy_step, **extra_fields):
        """Record the data-migration-failed deploy status for the subcloud.

        Any extra keyword arguments (for example error_description) are
        passed through to db_api.subcloud_update unchanged.
        """
        db_api.subcloud_update(
            self.context, strategy_step.subcloud_id,
            deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
            **extra_fields)

    def _sleep_unless_stopped(self, duration):
        """Sleep for 'duration' seconds in short slices.

        A stop request is checked before every slice so that a long wait
        can be abandoned early.

        :raises StrategyStoppedException: if a stop has been triggered
        """
        remaining = duration
        while remaining > 0:
            if self.stopped():
                raise StrategyStoppedException()
            chunk = min(remaining, self.stop_check_interval)
            if chunk <= 0:
                # A non-positive interval would never make progress; fall
                # back to a single sleep for whatever is left.
                chunk = remaining
            time.sleep(chunk)
            remaining -= chunk

    def wait_for_unlock(self, strategy_step):
        """This method returns successfully when the unlock completes.

        An exception is raised if it does not recover on time.

        :raises StrategyStoppedException: if a stop has been triggered
        :raises Exception: if the maximum number of consecutive failed
            queries, or of successful queries without reaching the expected
            state, is hit. The subcloud deploy status is set to
            data-migration-failed before raising.
        """

        # This code is 'borrowed' from the unlock_host state
        # Allow separate durations for failures (ie: reboot) and api retries
        api_counter = 0
        fail_counter = 0
        # todo(abailey): only supports AIO-SX here
        target_hostname = 'controller-0'
        while True:
            # If event handler stop has been triggered, fail the state
            if self.stopped():
                raise StrategyStoppedException()
            try:
                # query the administrative state to see if it is the new state.
                host = self.get_sysinv_client(
                    strategy_step.subcloud.name).get_host(target_hostname)
                if (host.administrative == consts.ADMIN_UNLOCKED and
                        host.operational == consts.OPERATIONAL_ENABLED):
                    # Success. Break out of the loop.
                    msg = "Host: %s is now: %s %s" % (target_hostname,
                                                      host.administrative,
                                                      host.operational)
                    self.info_log(strategy_step, msg)
                    break
                # no exception was raised so reset fail and auth checks
                fail_counter = 0
            except Exception as e:
                # Handle other exceptions due to being unreachable
                # for a significant period of time when there is a
                # controller swact, or in the case of AIO-SX,
                # when the controller reboots.
                fail_counter += 1
                if fail_counter >= self.max_failed_queries:
                    self._set_migration_failed(strategy_step)
                    # Keep the last underlying failure visible; the timeout
                    # exception raised below does not carry it.
                    self.error_log(
                        strategy_step,
                        "Last failed query of host %s: %s"
                        % (target_hostname, str(e)))
                    raise Exception("Timeout waiting on reboot to complete")
                time.sleep(self.failed_sleep_duration)
                # skip the api_counter
                continue
            # If the max counter is exceeded, raise a timeout exception
            api_counter += 1
            if api_counter >= self.max_api_queries:
                self._set_migration_failed(strategy_step)
                raise Exception("Timeout waiting for unlock to complete")
            time.sleep(self.api_sleep_duration)

    def perform_state_action(self, strategy_step):
        """Migrate data for an upgrade on a subcloud

        Returns the next state in the state machine on success.
        Any exceptions raised by this method set the strategy to FAILED.
        """

        # To account for abrupt termination of dcmanager, check the last known
        # subcloud deploy status. If it is migrated/complete, advance to the next
        # stage. If it is 'migrating', fail the strategy. The user will need to
        # delete the existing strategy, create a new one and apply. Pre-check will
        # set the appropriate next step for this subcloud.
        subcloud = db_api.subcloud_get(self.context, strategy_step.subcloud.id)
        if (subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED or
                subcloud.deploy_status == consts.DEPLOY_STATE_DONE):
            return self.next_state
        elif subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATING_DATA:
            self._set_migration_failed(strategy_step)
            raise Exception("Previous data migration was abruptly terminated. "
                            "Please try again with a new upgrade strategy.")

        # If it gets here, the subcloud deploy status must be 'installed'.
        self.info_log(strategy_step, "Start migrating data...")
        db_api.subcloud_update(
            self.context, strategy_step.subcloud_id,
            deploy_status=consts.DEPLOY_STATE_MIGRATING_DATA)

        ansible_subcloud_inventory_file = os.path.join(
            dccommon_consts.ANSIBLE_OVERRIDES_PATH,
            strategy_step.subcloud.name + consts.INVENTORY_FILE_POSTFIX)
        log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud.name) + \
            '_playbook_output.log'
        # Send skip_patching=true to prevent the playbook from applying any
        # patches present in the upgrade_data. All the required patches will
        # be included in the generated install iso.
        data_migrating_cmd = [
            "ansible-playbook", ANSIBLE_UPGRADE_PLAYBOOK,
            "-i", ansible_subcloud_inventory_file, "-e",
            "ansible_ssh_pass=%s ansible_become_pass=%s skip_patching=true"
            % (consts.TEMP_SYSADMIN_PASSWORD, consts.TEMP_SYSADMIN_PASSWORD)]

        try:
            migrate_subcloud_data(data_migrating_cmd, log_file)
        except Exception as e:
            # Two error messages: one for subcloud error description and logs and
            # one for orchestrator strategy_step detail (shorter than the previous).
            msg_subcloud = utils.find_ansible_error_msg(
                strategy_step.subcloud.name, log_file,
                consts.DEPLOY_STATE_MIGRATING_DATA)
            # Get script output in case it is available
            error_msg = utils.get_failure_msg(strategy_step.subcloud.name)
            failure = ('%s \n%s' % (error_msg, msg_subcloud))
            self._set_migration_failed(
                strategy_step,
                error_description=failure[0:consts.ERROR_DESCRIPTION_LENGTH])
            self.error_log(strategy_step, msg_subcloud)
            self.error_log(strategy_step, str(e))
            raise

        # Ansible invokes an unlock. Need to wait for the unlock to complete.
        # Wait for 3 minutes for mtc/scripts to shut down services. The wait
        # is sliced so that a stop request is honoured without sitting out
        # the whole delay.
        self._sleep_unless_stopped(self.ansible_sleep)
        # wait up to 60 minutes for reboot to complete
        self.wait_for_unlock(strategy_step)

        db_api.subcloud_update(
            self.context, strategy_step.subcloud_id,
            deploy_status=consts.DEPLOY_STATE_MIGRATED)

        self.info_log(strategy_step, "Data migration completed.")
        return self.next_state
|