Merge "Fix 'secondary' and 'rehome-pending' subclouds stuck at 'online'"

This commit is contained in:
Zuul 2024-01-04 18:15:08 +00:00 committed by Gerrit Code Review
commit 6a22f7f7de
3 changed files with 50 additions and 21 deletions

View File

@@ -125,10 +125,13 @@ class SubcloudAuditWorkerManager(manager.Manager):
consts.DEPLOY_STATE_UPGRADE_ACTIVATED,
consts.DEPLOY_STATE_RESTORING,
consts.DEPLOY_STATE_RESTORE_PREP_FAILED,
consts.DEPLOY_STATE_RESTORE_FAILED]
consts.DEPLOY_STATE_RESTORE_FAILED,
consts.DEPLOY_STATE_REHOME_PENDING]
and not prestage.is_deploy_status_prestage(
subcloud.deploy_status)) or (
subcloud.deploy_status == consts.DEPLOY_STATE_INSTALLING and
(subcloud.deploy_status in [
consts.DEPLOY_STATE_INSTALLING,
consts.DEPLOY_STATE_REHOME_PENDING]) and
subcloud.availability_status == dccommon_consts.AVAILABILITY_OFFLINE):
LOG.debug("Skip subcloud %s audit, deploy_status: %s" %
(subcloud.name, subcloud.deploy_status))

View File

@@ -2708,7 +2708,8 @@ class SubcloudManager(manager.Manager):
raise exceptions.BadRequest(resource="subcloud", msg=msg)
if (subcloud.availability_status !=
dccommon_consts.AVAILABILITY_ONLINE):
dccommon_consts.AVAILABILITY_ONLINE) and (
subcloud.deploy_status != consts.DEPLOY_STATE_REHOME_PENDING):
LOG.warning(f"Subcloud {subcloud.name} is not online")
raise exceptions.SubcloudNotOnline()
@@ -2884,13 +2885,23 @@ class SubcloudManager(manager.Manager):
# set all endpoint statuses to unknown, except the dc-cert
# endpoint which continues to be audited for unmanaged
# subclouds
ignore_endpoints = [dccommon_consts.ENDPOINT_TYPE_DC_CERT]
# Do not ignore the dc-cert endpoint for secondary or rehome
# pending subclouds as cert-mon does not audit them
if subcloud.deploy_status in (
consts.DEPLOY_STATE_SECONDARY,
consts.DEPLOY_STATE_REHOME_PENDING
):
ignore_endpoints = None
self.state_rpc_client.update_subcloud_endpoint_status_sync(
context,
subcloud_name=subcloud.name,
subcloud_region=subcloud.region_name,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_UNKNOWN,
ignore_endpoints=[dccommon_consts.ENDPOINT_TYPE_DC_CERT])
ignore_endpoints=ignore_endpoints)
elif management_state == dccommon_consts.MANAGEMENT_MANAGED:
# Subcloud is managed
# Tell cert-mon to audit endpoint certificate
@@ -2898,16 +2909,15 @@ class SubcloudManager(manager.Manager):
dc_notification = dcmanager_rpc_client.DCManagerNotifications()
dc_notification.subcloud_managed(context, subcloud.region_name)
# Set all endpoint statuses to unknown, no endpoint
# will be audited for secondary or rehome-pending subclouds
if subcloud.deploy_status in (consts.DEPLOY_STATE_SECONDARY,
consts.DEPLOY_STATE_REHOME_PENDING):
self.state_rpc_client.update_subcloud_endpoint_status_sync(
# Request the state client to update the subcloud availability
# status to OFFLINE if subcloud is 'secondary'. The state
# service will set all endpoint statuses to 'unknown'.
if deploy_status == consts.DEPLOY_STATE_SECONDARY:
self.state_rpc_client.update_subcloud_availability(
context,
subcloud_name=subcloud.name,
subcloud_region=subcloud.region_name,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_UNKNOWN)
subcloud.name,
subcloud.region_name,
dccommon_consts.AVAILABILITY_OFFLINE)
# Clear existing fault alarm of secondary subcloud
if subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY:

View File

@@ -294,7 +294,8 @@ class SubcloudStateManager(manager.Manager):
# Rules for updating sync status:
#
# Skip audit any 'secondary' state subclouds
# For secondary subclouds, only update if the new sync_status is
# 'unknown'
#
# For others, always update if not in-sync.
#
@@ -308,11 +309,22 @@ class SubcloudStateManager(manager.Manager):
# This means if a subcloud is going offline or unmanaged, then
# the sync status update must be done first.
#
if ((sync_status != dccommon_consts.SYNC_STATUS_IN_SYNC or
((subcloud.availability_status == dccommon_consts.AVAILABILITY_ONLINE) and
(subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED
or endpoint_type == dccommon_consts.ENDPOINT_TYPE_DC_CERT))) and
subcloud.deploy_status != consts.DEPLOY_STATE_SECONDARY):
is_in_sync = sync_status == dccommon_consts.SYNC_STATUS_IN_SYNC
is_online = subcloud.availability_status == \
dccommon_consts.AVAILABILITY_ONLINE
is_managed = subcloud.management_state == \
dccommon_consts.MANAGEMENT_MANAGED
is_endpoint_type_dc_cert = endpoint_type == \
dccommon_consts.ENDPOINT_TYPE_DC_CERT
is_secondary = subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY
is_sync_unknown = sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
is_secondary_and_sync_unknown = is_secondary and is_sync_unknown
if (
(not is_in_sync
or (is_online and (is_managed or is_endpoint_type_dc_cert)))
and not is_secondary
) or is_secondary_and_sync_unknown:
# update a single subcloud
try:
self._do_update_subcloud_endpoint_status(context,
@@ -379,7 +391,8 @@ class SubcloudStateManager(manager.Manager):
'subcloud: %s' % subcloud_name)
def _raise_or_clear_subcloud_status_alarm(self, subcloud_name,
availability_status):
availability_status,
deploy_status=None):
entity_instance_id = "subcloud=%s" % subcloud_name
fault = self.fm_api.get_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
@@ -394,8 +407,11 @@ class SubcloudStateManager(manager.Manager):
LOG.exception("Failed to clear offline alarm for subcloud: %s",
subcloud_name)
# Raise the alarm if the subcloud became offline and it's not a
# secondary subcloud
elif not fault and \
(availability_status == dccommon_consts.AVAILABILITY_OFFLINE):
(availability_status == dccommon_consts.AVAILABILITY_OFFLINE and
deploy_status != consts.DEPLOY_STATE_SECONDARY):
try:
fault = fm_api.Fault(
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,