Merge "Fix 'secondary' and 'rehome-pending' subclouds stuck at 'online'"
This commit is contained in:
commit
6a22f7f7de
|
@ -125,10 +125,13 @@ class SubcloudAuditWorkerManager(manager.Manager):
|
|||
consts.DEPLOY_STATE_UPGRADE_ACTIVATED,
|
||||
consts.DEPLOY_STATE_RESTORING,
|
||||
consts.DEPLOY_STATE_RESTORE_PREP_FAILED,
|
||||
consts.DEPLOY_STATE_RESTORE_FAILED]
|
||||
consts.DEPLOY_STATE_RESTORE_FAILED,
|
||||
consts.DEPLOY_STATE_REHOME_PENDING]
|
||||
and not prestage.is_deploy_status_prestage(
|
||||
subcloud.deploy_status)) or (
|
||||
subcloud.deploy_status == consts.DEPLOY_STATE_INSTALLING and
|
||||
(subcloud.deploy_status in [
|
||||
consts.DEPLOY_STATE_INSTALLING,
|
||||
consts.DEPLOY_STATE_REHOME_PENDING]) and
|
||||
subcloud.availability_status == dccommon_consts.AVAILABILITY_OFFLINE):
|
||||
LOG.debug("Skip subcloud %s audit, deploy_status: %s" %
|
||||
(subcloud.name, subcloud.deploy_status))
|
||||
|
|
|
@ -2708,7 +2708,8 @@ class SubcloudManager(manager.Manager):
|
|||
raise exceptions.BadRequest(resource="subcloud", msg=msg)
|
||||
|
||||
if (subcloud.availability_status !=
|
||||
dccommon_consts.AVAILABILITY_ONLINE):
|
||||
dccommon_consts.AVAILABILITY_ONLINE) and (
|
||||
subcloud.deploy_status != consts.DEPLOY_STATE_REHOME_PENDING):
|
||||
LOG.warning(f"Subcloud {subcloud.name} is not online")
|
||||
raise exceptions.SubcloudNotOnline()
|
||||
|
||||
|
@ -2884,13 +2885,23 @@ class SubcloudManager(manager.Manager):
|
|||
# set all endpoint statuses to unknown, except the dc-cert
|
||||
# endpoint which continues to be audited for unmanaged
|
||||
# subclouds
|
||||
ignore_endpoints = [dccommon_consts.ENDPOINT_TYPE_DC_CERT]
|
||||
|
||||
# Do not ignore the dc-cert endpoint for secondary or
|
||||
# rehome-pending subclouds, as cert-mon does not audit them
|
||||
if subcloud.deploy_status in (
|
||||
consts.DEPLOY_STATE_SECONDARY,
|
||||
consts.DEPLOY_STATE_REHOME_PENDING
|
||||
):
|
||||
ignore_endpoints = None
|
||||
|
||||
self.state_rpc_client.update_subcloud_endpoint_status_sync(
|
||||
context,
|
||||
subcloud_name=subcloud.name,
|
||||
subcloud_region=subcloud.region_name,
|
||||
endpoint_type=None,
|
||||
sync_status=dccommon_consts.SYNC_STATUS_UNKNOWN,
|
||||
ignore_endpoints=[dccommon_consts.ENDPOINT_TYPE_DC_CERT])
|
||||
ignore_endpoints=ignore_endpoints)
|
||||
elif management_state == dccommon_consts.MANAGEMENT_MANAGED:
|
||||
# Subcloud is managed
|
||||
# Tell cert-mon to audit endpoint certificate
|
||||
|
@ -2898,16 +2909,15 @@ class SubcloudManager(manager.Manager):
|
|||
dc_notification = dcmanager_rpc_client.DCManagerNotifications()
|
||||
dc_notification.subcloud_managed(context, subcloud.region_name)
|
||||
|
||||
# Set all endpoint statuses to unknown, no endpoint
|
||||
# will be audited for secondary or rehome-pending subclouds
|
||||
if subcloud.deploy_status in (consts.DEPLOY_STATE_SECONDARY,
|
||||
consts.DEPLOY_STATE_REHOME_PENDING):
|
||||
self.state_rpc_client.update_subcloud_endpoint_status_sync(
|
||||
# Request the state client to update the subcloud availability
|
||||
# status to OFFLINE if subcloud is 'secondary'. The state
|
||||
# service will set all endpoint statuses to 'unknown'.
|
||||
if deploy_status == consts.DEPLOY_STATE_SECONDARY:
|
||||
self.state_rpc_client.update_subcloud_availability(
|
||||
context,
|
||||
subcloud_name=subcloud.name,
|
||||
subcloud_region=subcloud.region_name,
|
||||
endpoint_type=None,
|
||||
sync_status=dccommon_consts.SYNC_STATUS_UNKNOWN)
|
||||
subcloud.name,
|
||||
subcloud.region_name,
|
||||
dccommon_consts.AVAILABILITY_OFFLINE)
|
||||
|
||||
# Clear existing fault alarm of secondary subcloud
|
||||
if subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY:
|
||||
|
|
|
@ -294,7 +294,8 @@ class SubcloudStateManager(manager.Manager):
|
|||
|
||||
# Rules for updating sync status:
|
||||
#
|
||||
# Skip audit any 'secondary' state subclouds
|
||||
# For secondary subclouds, only update if the new sync_status is
|
||||
# 'unknown'
|
||||
#
|
||||
# For others, always update if not in-sync.
|
||||
#
|
||||
|
@ -308,11 +309,22 @@ class SubcloudStateManager(manager.Manager):
|
|||
# This means if a subcloud is going offline or unmanaged, then
|
||||
# the sync status update must be done first.
|
||||
#
|
||||
if ((sync_status != dccommon_consts.SYNC_STATUS_IN_SYNC or
|
||||
((subcloud.availability_status == dccommon_consts.AVAILABILITY_ONLINE) and
|
||||
(subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED
|
||||
or endpoint_type == dccommon_consts.ENDPOINT_TYPE_DC_CERT))) and
|
||||
subcloud.deploy_status != consts.DEPLOY_STATE_SECONDARY):
|
||||
is_in_sync = sync_status == dccommon_consts.SYNC_STATUS_IN_SYNC
|
||||
is_online = subcloud.availability_status == \
|
||||
dccommon_consts.AVAILABILITY_ONLINE
|
||||
is_managed = subcloud.management_state == \
|
||||
dccommon_consts.MANAGEMENT_MANAGED
|
||||
is_endpoint_type_dc_cert = endpoint_type == \
|
||||
dccommon_consts.ENDPOINT_TYPE_DC_CERT
|
||||
is_secondary = subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY
|
||||
is_sync_unknown = sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
|
||||
is_secondary_and_sync_unknown = is_secondary and is_sync_unknown
|
||||
|
||||
if (
|
||||
(not is_in_sync
|
||||
or (is_online and (is_managed or is_endpoint_type_dc_cert)))
|
||||
and not is_secondary
|
||||
) or is_secondary_and_sync_unknown:
|
||||
# update a single subcloud
|
||||
try:
|
||||
self._do_update_subcloud_endpoint_status(context,
|
||||
|
@ -379,7 +391,8 @@ class SubcloudStateManager(manager.Manager):
|
|||
'subcloud: %s' % subcloud_name)
|
||||
|
||||
def _raise_or_clear_subcloud_status_alarm(self, subcloud_name,
|
||||
availability_status):
|
||||
availability_status,
|
||||
deploy_status=None):
|
||||
entity_instance_id = "subcloud=%s" % subcloud_name
|
||||
fault = self.fm_api.get_fault(
|
||||
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
|
||||
|
@ -394,8 +407,11 @@ class SubcloudStateManager(manager.Manager):
|
|||
LOG.exception("Failed to clear offline alarm for subcloud: %s",
|
||||
subcloud_name)
|
||||
|
||||
# Raise the alarm if the subcloud became offline and it's not a
|
||||
# secondary subcloud
|
||||
elif not fault and \
|
||||
(availability_status == dccommon_consts.AVAILABILITY_OFFLINE):
|
||||
(availability_status == dccommon_consts.AVAILABILITY_OFFLINE and
|
||||
deploy_status != consts.DEPLOY_STATE_SECONDARY):
|
||||
try:
|
||||
fault = fm_api.Fault(
|
||||
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
|
||||
|
|
Loading…
Reference in New Issue