Merge "Enhance error reporting on subcloud upgrade"

This commit is contained in:
Zuul 2022-12-07 14:17:15 +00:00 committed by Gerrit Code Review
commit b6badeaa4d
9 changed files with 120 additions and 40 deletions

View File

@ -31,6 +31,7 @@ from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dccommon import exceptions from dccommon import exceptions
from dccommon import install_consts from dccommon import install_consts
from dccommon import utils as common_utils from dccommon import utils as common_utils
from dcmanager.common import consts as common_consts
from dcmanager.common import utils from dcmanager.common import utils
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
@ -618,7 +619,7 @@ class SubcloudInstall(object):
# for cleanup on process restart/SWACT. # for cleanup on process restart/SWACT.
common_utils.run_playbook(log_file, install_command) common_utils.run_playbook(log_file, install_command)
except exceptions.PlaybookExecutionFailed: except exceptions.PlaybookExecutionFailed:
msg = ("Failed to install the subcloud %s, check individual " msg = ("Failed to install %s, check individual "
"log at %s for detailed output." "log at %s or run %s for details"
% (self.name, log_file)) % (self.name, log_file, common_consts.ERROR_DESC_CMD))
raise Exception(msg) raise Exception(msg)

View File

@ -184,6 +184,7 @@ DEPLOY_STATE_DONE = 'complete'
# Subcloud errors # Subcloud errors
ERROR_DESC_EMPTY = 'No errors present' ERROR_DESC_EMPTY = 'No errors present'
ERROR_DESC_CMD = 'dcmanager subcloud errors <subcloud-name>'
# error_description max length # error_description max length
ERROR_DESCRIPTION_LENGTH = 2048 ERROR_DESCRIPTION_LENGTH = 2048

View File

@ -178,7 +178,7 @@ class CertificateUploadError(DCManagerException):
class LicenseInstallError(DCManagerException): class LicenseInstallError(DCManagerException):
message = _("Error while installing license on subcloud: %(subcloud_id)s") message = _("Error while installing license on subcloud: %(subcloud_id)s. %(error_message)s")
class LicenseMissingError(DCManagerException): class LicenseMissingError(DCManagerException):

View File

@ -1,5 +1,5 @@
# #
# Copyright (c) 2020-2021 Wind River Systems, Inc. # Copyright (c) 2020-2022 Wind River Systems, Inc.
# #
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
@ -97,8 +97,15 @@ class CompletingUpgradeState(BaseState):
# invoke the API 'upgrade-complete' # invoke the API 'upgrade-complete'
# This is a partially blocking call that raises exception on failure. # This is a partially blocking call that raises exception on failure.
# We will re-attempt even if that failure is encountered # We will re-attempt even if that failure is encountered
self._upgrade_complete(strategy_step) try:
message = self._upgrade_complete(strategy_step)
except Exception as e:
msg = ("Failed to complete upgrade. %s" %
str(e))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
raise
# 'completion' deletes the upgrade. Need to loop until it is deleted # 'completion' deletes the upgrade. Need to loop until it is deleted
counter = 0 counter = 0
while True: while True:
@ -112,7 +119,12 @@ class CompletingUpgradeState(BaseState):
break break
counter += 1 counter += 1
if counter >= self.max_queries: if counter >= self.max_queries:
raise Exception("Timeout waiting for completion to complete") msg = ("Timeout waiting for completion to complete: %s:" %
message)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
raise Exception(msg)
time.sleep(self.sleep_duration) time.sleep(self.sleep_duration)
# When we return from this method without throwing an exception, the # When we return from this method without throwing an exception, the

View File

@ -11,6 +11,7 @@ from dcmanager.common import utils
from dcmanager.common.exceptions import StrategyStoppedException from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common.exceptions import VaultLoadMissingError from dcmanager.common.exceptions import VaultLoadMissingError
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState from dcmanager.orchestrator.states.base import BaseState
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \ from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
REGION_ONE_SYSTEM_INFO_CACHE_TYPE REGION_ONE_SYSTEM_INFO_CACHE_TYPE
@ -155,11 +156,20 @@ class ImportingLoadState(BaseState):
# Send only the required fields # Send only the required fields
creation_keys = ['software_version', 'compatible_version', 'required_patches'] creation_keys = ['software_version', 'compatible_version', 'required_patches']
target_load = {key: target_load[key] for key in creation_keys} target_load = {key: target_load[key] for key in creation_keys}
load = self.get_sysinv_client( try:
strategy_step.subcloud.name).import_load_metadata(target_load) load = self.get_sysinv_client(
self.info_log(strategy_step, strategy_step.subcloud.name).import_load_metadata(target_load)
"Load: %s is now: %s" % ( self.info_log(strategy_step,
load.software_version, load.state)) "Load: %s is now: %s" % (
load.software_version, load.state))
except Exception as e:
msg = ("Failed to import load metadata. %s" %
str(e))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
self.error_log(strategy_step, msg)
raise
else: else:
while True: while True:
# If event handler stop has been triggered, fail the state # If event handler stop has been triggered, fail the state

View File

@ -6,6 +6,7 @@
from dccommon import consts as dccommon_consts from dccommon import consts as dccommon_consts
from dcmanager.common import consts from dcmanager.common import consts
from dcmanager.common import exceptions from dcmanager.common import exceptions
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState from dcmanager.orchestrator.states.base import BaseState
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \ from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
REGION_ONE_LICENSE_CACHE_TYPE REGION_ONE_LICENSE_CACHE_TYPE
@ -51,8 +52,15 @@ class InstallingLicenseState(BaseState):
return self.next_state return self.next_state
else: else:
# An unexpected error occurred querying the license # An unexpected error occurred querying the license
message = ('An unexpected error occurred querying the license %s. Detail: %s' %
(dccommon_consts.SYSTEM_CONTROLLER_NAME,
target_error))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH])
raise exceptions.LicenseInstallError( raise exceptions.LicenseInstallError(
subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME) subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME,
error_message=target_error)
# retrieve the keystone session for the subcloud and query its license # retrieve the keystone session for the subcloud and query its license
subcloud_sysinv_client = \ subcloud_sysinv_client = \
@ -76,8 +84,17 @@ class InstallingLicenseState(BaseState):
install_rc = subcloud_sysinv_client.install_license(target_license) install_rc = subcloud_sysinv_client.install_license(target_license)
install_error = install_rc.get('error') install_error = install_rc.get('error')
if len(install_error) != 0: if len(install_error) != 0:
# Save error response from sysinv into subcloud error description.
# Provide exception with sysinv error response to strategy_step details
message = ('Error installing license on subcloud %s. Detail: %s' %
(strategy_step.subcloud.name,
install_error))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH])
raise exceptions.LicenseInstallError( raise exceptions.LicenseInstallError(
subcloud_id=strategy_step.subcloud_id) subcloud_id=strategy_step.subcloud_id,
error_message=install_error)
# The license has been successfully installed. Move to the next stage # The license has been successfully installed. Move to the next stage
self.info_log(strategy_step, "License installed.") self.info_log(strategy_step, "License installed.")

View File

@ -1,5 +1,5 @@
# #
# Copyright (c) 2020-2021 Wind River Systems, Inc. # Copyright (c) 2020-2022 Wind River Systems, Inc.
# #
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
@ -10,6 +10,7 @@ from dccommon.exceptions import PlaybookExecutionFailed
from dccommon.utils import run_playbook from dccommon.utils import run_playbook
from dcmanager.common import consts from dcmanager.common import consts
from dcmanager.common.exceptions import StrategyStoppedException from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common import utils
from dcmanager.db import api as db_api from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState from dcmanager.orchestrator.states.base import BaseState
@ -32,16 +33,14 @@ DEFAULT_API_SLEEP = 60
DEFAULT_ANSIBLE_SLEEP = 180 DEFAULT_ANSIBLE_SLEEP = 180
def migrate_subcloud_data(subcloud_name, migrate_command): def migrate_subcloud_data(migrate_command, log_file):
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud_name) + \
'_playbook_output.log'
try: try:
run_playbook(log_file, migrate_command) run_playbook(log_file, migrate_command)
except PlaybookExecutionFailed: except PlaybookExecutionFailed:
msg = ("Failed to migrate data for subcloud %s, check individual " msg_orch = ("Failed to migrate data, check individual "
"log at %s for detailed output." "log at %s or run %s for details"
% (subcloud_name, log_file)) % (log_file, consts.ERROR_DESC_CMD))
raise Exception(msg) raise Exception(msg_orch)
class MigratingDataState(BaseState): class MigratingDataState(BaseState):
@ -142,7 +141,8 @@ class MigratingDataState(BaseState):
ansible_subcloud_inventory_file = os.path.join( ansible_subcloud_inventory_file = os.path.join(
consts.ANSIBLE_OVERRIDES_PATH, consts.ANSIBLE_OVERRIDES_PATH,
strategy_step.subcloud.name + consts.INVENTORY_FILE_POSTFIX) strategy_step.subcloud.name + consts.INVENTORY_FILE_POSTFIX)
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud.name) + \
'_playbook_output.log'
# Send skip_patching=true to prevent the playbook from applying any patches present in the # Send skip_patching=true to prevent the playbook from applying any patches present in the
# upgrade_data. All the required patches will be included in the generated install iso. # upgrade_data. All the required patches will be included in the generated install iso.
data_migrating_cmd = [ data_migrating_cmd = [
@ -152,12 +152,17 @@ class MigratingDataState(BaseState):
% (consts.TEMP_SYSADMIN_PASSWORD, consts.TEMP_SYSADMIN_PASSWORD)] % (consts.TEMP_SYSADMIN_PASSWORD, consts.TEMP_SYSADMIN_PASSWORD)]
try: try:
migrate_subcloud_data(strategy_step.subcloud.name, migrate_subcloud_data(data_migrating_cmd, log_file)
data_migrating_cmd)
except Exception as e: except Exception as e:
# Two error messages: one for subcloud error description and logs and
# one for orchestrator strategy_step detail (shorter than the previous).
msg_subcloud = utils.find_ansible_error_msg(
strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_MIGRATING_DATA)
db_api.subcloud_update( db_api.subcloud_update(
self.context, strategy_step.subcloud_id, self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED) deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
error_description=msg_subcloud[0:consts.ERROR_DESCRIPTION_LENGTH])
self.error_log(strategy_step, msg_subcloud)
self.error_log(strategy_step, str(e)) self.error_log(strategy_step, str(e))
raise raise

View File

@ -90,6 +90,10 @@ class PreCheckState(BaseState):
if (host.administrative == consts.ADMIN_LOCKED and upgrades): if (host.administrative == consts.ADMIN_LOCKED and upgrades):
alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM) alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)
# Clean old error messages
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=consts.ERROR_DESC_EMPTY)
# The health conditions acceptable for upgrade are: # The health conditions acceptable for upgrade are:
# a) subcloud is completely healthy (i.e. no failed checks) # a) subcloud is completely healthy (i.e. no failed checks)
# b) subcloud only fails alarm check and it only has non-management # b) subcloud only fails alarm check and it only has non-management
@ -106,8 +110,14 @@ class PreCheckState(BaseState):
# #
# These could be Kubernetes or other related failure(s) which have not been # These could be Kubernetes or other related failure(s) which have not been
# converted into an alarm condition. # converted into an alarm condition.
details = "System health check failed. Please run 'system health-query' " \ error_desc_msg = ("System health check failed. \n %s" %
"command on the subcloud for more details." fails)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
details = ("System health check failed. Please run 'system health-query' "
"command on the subcloud or %s on central for details"
% (consts.ERROR_DESC_CMD))
self.error_log(strategy_step, "\n" + system_health) self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException( raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name, subcloud=strategy_step.subcloud.name,
@ -125,9 +135,16 @@ class PreCheckState(BaseState):
for alarm in alarms: for alarm in alarms:
if alarm.alarm_id not in alarm_ignore_list: if alarm.alarm_id not in alarm_ignore_list:
if alarm.mgmt_affecting == "True": if alarm.mgmt_affecting == "True":
details = "System health check failed due to alarm %s. " \ error_desc_msg = ("System health check failed due to alarm %s. "
"Please run 'system health-query' " \ "System health: \n %s" %
"command on the subcloud for more details." % alarm.alarm_id (alarm.alarm_id, system_health))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
details = ("System health check failed due to alarm %s. "
"Please run 'system health-query' "
"command on the subcloud or %s on central for details." %
(alarm.alarm_id, consts.ERROR_DESC_CMD))
self.error_log(strategy_step, "\n" + system_health) self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException( raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name, subcloud=strategy_step.subcloud.name,
@ -135,9 +152,16 @@ class PreCheckState(BaseState):
) )
else: else:
# Multiple failures # Multiple failures
details = "System health check failed due to multiple failures. " \ error_desc_msg = ("System health check failed due to multiple failures. "
"Please run 'system health-query' command on the " \ "Health: \n %s" %
"subcloud for more details." (system_health))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
details = ("System health check failed due to multiple failures. "
"Please run 'system health-query' command on the "
"subcloud or %s on central for details." %
(consts.ERROR_DESC_CMD))
self.error_log(strategy_step, "\n" + system_health) self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException( raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name, subcloud=strategy_step.subcloud.name,

View File

@ -1,5 +1,5 @@
# #
# Copyright (c) 2020-2021 Wind River Systems, Inc. # Copyright (c) 2020-2022 Wind River Systems, Inc.
# #
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
@ -201,11 +201,12 @@ class UpgradingSimplexState(BaseState):
if not subcloud.data_install: if not subcloud.data_install:
# Set the deploy status to pre-install-failed so it can be # Set the deploy status to pre-install-failed so it can be
# handled accordingly in pre check step. # handled accordingly in pre check step.
message = ("Failed to get upgrade data from install")
db_api.subcloud_update( db_api.subcloud_update(
self.context, strategy_step.subcloud_id, self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED) deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
error_description=message)
message = ("Failed to get upgrade data from install")
self.warn_log(strategy_step, message) self.warn_log(strategy_step, message)
raise Exception(message) raise Exception(message)
@ -337,6 +338,8 @@ class UpgradingSimplexState(BaseState):
def perform_subcloud_install(self, strategy_step, session, install_values): def perform_subcloud_install(self, strategy_step, session, install_values):
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, strategy_step.subcloud.name) + \
'_playbook_output.log'
db_api.subcloud_update( db_api.subcloud_update(
self.context, strategy_step.subcloud_id, self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL) deploy_status=consts.DEPLOY_STATE_PRE_INSTALL)
@ -350,7 +353,8 @@ class UpgradingSimplexState(BaseState):
except Exception as e: except Exception as e:
db_api.subcloud_update( db_api.subcloud_update(
self.context, strategy_step.subcloud_id, self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED) deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
error_description=str(e)[0:consts.ERROR_DESCRIPTION_LENGTH])
self.error_log(strategy_step, str(e)) self.error_log(strategy_step, str(e))
# TODO(jkung): cleanup to be implemented within SubcloudInstall # TODO(jkung): cleanup to be implemented within SubcloudInstall
install.cleanup() install.cleanup()
@ -379,9 +383,15 @@ class UpgradingSimplexState(BaseState):
try: try:
install.install(consts.DC_ANSIBLE_LOG_DIR, install_command) install.install(consts.DC_ANSIBLE_LOG_DIR, install_command)
except Exception as e: except Exception as e:
# Detailed error message for subcloud error description field.
# Exception message for strategy_step detail.
msg = utils.find_ansible_error_msg(
strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_INSTALLING)
db_api.subcloud_update( db_api.subcloud_update(
self.context, strategy_step.subcloud_id, self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED) deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED,
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
self.error_log(strategy_step, msg)
self.error_log(strategy_step, str(e)) self.error_log(strategy_step, str(e))
install.cleanup() install.cleanup()
raise raise