Merge "Allow rehome related data update when subcloud migration fails"

This commit is contained in:
Zuul 2024-03-20 17:14:34 +00:00 committed by Gerrit Code Review
commit 510f5df953
4 changed files with 162 additions and 47 deletions

View File

@ -305,7 +305,9 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController):
(system_leader_id and
system_leader_id != group.system_leader_id) or
(system_leader_name and
system_leader_name != group.system_leader_name)))
system_leader_name != group.system_leader_name) or
(migration_status and
migration_status != group.migration_status)))
if not any_update:
return db_api.subcloud_peer_group_db_model_to_dict(group)

View File

@ -343,13 +343,19 @@ class SubcloudsController(object):
else dccommon_consts.DEPLOY_CONFIG_UP_TO_DATE
return sync_status
def _validate_rehome_pending(self, subcloud, management_state):
def _validate_rehome_pending(self, subcloud, management_state, request):
unmanaged = dccommon_consts.MANAGEMENT_UNMANAGED
error_msg = None
# Can only set the subcloud to rehome-pending
# if the deployment is done
if subcloud.deploy_status != consts.DEPLOY_STATE_DONE:
# if the deployment is done or the request is from another site.
# The reason that we skip the validation if the request is from
# another site is when migrating the subcloud back to a peer site,
# the site will attempt to set the remote subcloud's deploy status
# to "rehome-pending." However, the remote subcloud might be in a
# "rehome-failed" state from a previous failed rehoming attempt.
if (subcloud.deploy_status != consts.DEPLOY_STATE_DONE and
not utils.is_req_from_another_dc(request)):
error_msg = (
"The deploy status can only be updated to "
f"'{consts.DEPLOY_STATE_REHOME_PENDING}' if the current "
@ -656,13 +662,16 @@ class SubcloudsController(object):
req_from_another_dc = utils.is_req_from_another_dc(request)
original_pgrp = None
leader_on_local_site = False
peer_site_available = True
pga = None
update_in_non_primary_site = False
if subcloud.peer_group_id is not None:
# Get the original peer group of the subcloud
original_pgrp = db_api.subcloud_peer_group_get(
context, subcloud.peer_group_id)
leader_on_local_site = utils.is_leader_on_local_site(original_pgrp)
# A sync command is required after updating a subcloud
# in an SPG that is already associated with a PGA on the primary
# in an SPG that is already associated with a PGA in the primary
# and leader site. The existence of the PGA will be checked
# by the update_association_sync_status method later.
if (original_pgrp.group_priority == 0 and
@ -670,6 +679,18 @@ class SubcloudsController(object):
not req_from_another_dc):
sync_peer_groups.add(subcloud.peer_group_id)
# Get the peer site availability and PGA sync status
# TODO(lzhu1): support multiple sites
associations = db_api.peer_group_association_get_by_peer_group_id(
context, original_pgrp.id)
for association in associations:
pga = association
system_peer = db_api.system_peer_get(
context, association.system_peer_id)
peer_site_available = \
system_peer.availability_state == \
consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE
peer_group = payload.get('peer_group')
# Verify the peer_group is valid
peer_group_id = None
@ -690,22 +711,13 @@ class SubcloudsController(object):
pecan.abort(400, _("Removing subcloud from a "
"peer group not led by the "
"current site is prohibited."))
# Get associations by peer group id
associations = db_api.\
peer_group_association_get_by_peer_group_id(
context, original_pgrp.id)
for association in associations:
system_peer = db_api.system_peer_get(
context, association.system_peer_id)
# If system peer is available, then does not allow
# to remove the subcloud from secondary peer group
if system_peer.availability_state == consts.\
SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE \
and original_pgrp.group_priority > 0:
pecan.abort(400, _(
"Removing subcloud from a peer group "
"associated with an available system peer "
"is prohibited."))
# If the system peer is available, removing the subcloud
# from a secondary peer group is not allowed
if peer_site_available and original_pgrp.group_priority > 0:
pecan.abort(400, _(
"Removing subcloud from a peer group "
"associated with an available system peer "
"is prohibited."))
peer_group_id = 'none'
else:
if not (subcloud.rehome_data or (
@ -744,16 +756,39 @@ class SubcloudsController(object):
sync_peer_groups.add(pgrp.id)
peer_group_id = pgrp.id
bootstrap_values = payload.get('bootstrap_values')
bootstrap_address = payload.get('bootstrap_address')
# Subcloud can only be updated while it is managed in
# the primary site because the sync command can only be issued
# in the site where the SPG was created.
# in the site where the SPG was created. However, bootstrap
# values or address update is an exception.
if original_pgrp and peer_group_id is None and not req_from_another_dc:
if original_pgrp.group_priority > 0:
pecan.abort(400, _("Subcloud update is only allowed when "
"its peer group priority value is 0."))
if bootstrap_values or bootstrap_address:
if any(field not in
('bootstrap_values', 'bootstrap_address')
for field in payload):
pecan.abort(400,
_("Only bootstrap values and address "
"can be updated in the non-primary site"))
if (subcloud.deploy_status ==
consts.DEPLOY_STATE_REHOME_FAILED and
not peer_site_available):
update_in_non_primary_site = True
else:
pecan.abort(400,
_("Subcloud bootstrap values or address "
"update in the non-primary site is only "
"allowed when rehome failed and the "
"primary site is unavailable."))
if not update_in_non_primary_site:
pecan.abort(400, _("Subcloud update is only allowed when "
"its peer group priority value is 0."))
# Updating a subcloud under a peer group on the primary site
# requires that the peer group be led by the primary site.
if not leader_on_local_site:
if not leader_on_local_site and not update_in_non_primary_site:
pecan.abort(400, _("Updating subcloud from a "
"peer group not led by the "
"current site is prohibited."))
@ -845,15 +880,13 @@ class SubcloudsController(object):
group_id = payload.get('group_id')
description = payload.get('description')
location = payload.get('location')
bootstrap_values = payload.get('bootstrap_values')
bootstrap_address = payload.get('bootstrap_address')
# If the migrate flag is present we need to update the deploy status
# to consts.DEPLOY_STATE_REHOME_PENDING
deploy_status = None
if (payload.get('migrate') == 'true' and subcloud.deploy_status !=
consts.DEPLOY_STATE_REHOME_PENDING):
self._validate_rehome_pending(subcloud, management_state)
self._validate_rehome_pending(subcloud, management_state, request)
deploy_status = consts.DEPLOY_STATE_REHOME_PENDING
# Syntax checking
@ -917,7 +950,23 @@ class SubcloudsController(object):
bootstrap_address=bootstrap_address,
deploy_status=deploy_status)
if sync_peer_groups:
# Update the PGA sync_status to out-of-sync locally
# in the non-primary site. This only occurs when the primary site
# is unavailable and rehome fails due to the issue with bootstrap
# values or address.
if (update_in_non_primary_site and
pga.sync_status !=
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC):
db_api.peer_group_association_update(
context,
pga.id,
sync_status=consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC)
LOG.debug(
f"Updated Local Peer Group Association {pga.id} "
f"sync_status to out-of-sync.")
# Sync the PGA out-of-sync status across all sites launched by
# the primary site.
elif sync_peer_groups:
# Collect the affected peer group association IDs.
association_ids = set()
for pg_id in sync_peer_groups:

View File

@ -43,10 +43,10 @@ class PeerGroupAuditManager(manager.Manager):
self.thread_lock = threading.Lock()
def _get_subclouds_by_peer_group_from_system_peer(self,
dc_client,
system_peer,
peer_group_name):
try:
dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
subclouds = dc_client.get_subcloud_list_by_peer_group(
peer_group_name)
return subclouds
@ -55,6 +55,22 @@ class PeerGroupAuditManager(manager.Manager):
f"{peer_group_name} from DC: "
f"{system_peer.peer_name}")
@staticmethod
def _get_association_sync_status_from_peer_site(dc_client,
system_peer,
peer_group_id):
try:
# Get peer site system peer
dc_peer_system_peer = dc_client.get_system_peer(
utils.get_local_system().uuid)
association = dc_client. \
get_peer_group_association_with_peer_id_and_pg_id(
dc_peer_system_peer.get('id'), peer_group_id)
return association.get("sync-status")
except Exception:
LOG.exception(f"Failed to get subclouds of peer group "
f"{peer_group_id} from DC: {system_peer.peer_name}")
def _update_remote_peer_group_migration_status(self,
system_peer,
peer_group_name,
@ -71,9 +87,11 @@ class PeerGroupAuditManager(manager.Manager):
def _get_local_subclouds_to_update_and_delete(self,
local_peer_group,
remote_subclouds):
remote_subclouds,
remote_sync_status):
local_subclouds_to_update = list()
local_subclouds_to_delete = list()
any_rehome_failed = False
remote_subclouds_dict = {remote_subcloud.get('region-name'):
remote_subcloud for remote_subcloud
in remote_subclouds}
@ -92,10 +110,30 @@ class PeerGroupAuditManager(manager.Manager):
not utils.subcloud_is_secondary_state(
local_subcloud.deploy_status)):
local_subclouds_to_update.append(local_subcloud)
# Sync rehome_data from remote to local subcloud if the remote
# PGA sync_status is out-of-sync once migration completes,
# indicating any bootstrap values/address updates to
# the subcloud on the remote site.
if remote_sync_status == \
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:
self._sync_rehome_data(
local_subcloud.id, remote_subcloud.get('rehome_data'))
elif remote_subcloud.get('deploy-status') in \
(consts.DEPLOY_STATE_REHOME_FAILED,
consts.DEPLOY_STATE_REHOME_PREP_FAILED):
# Set local subcloud to rehome-failed if the remote is
# rehome-failed or rehome-prep-failed, otherwise, the
# deploy_status will remain rehome-pending, which will
# block the correction of the bootstrap values/address.
db_api.subcloud_update(
self.context, local_subcloud.id,
deploy_status=consts.DEPLOY_STATE_REHOME_FAILED)
any_rehome_failed = True
else:
local_subclouds_to_delete.append(local_subcloud)
return local_subclouds_to_update, local_subclouds_to_delete
return local_subclouds_to_update, local_subclouds_to_delete, \
any_rehome_failed
def _set_local_subcloud_to_secondary(self, subcloud):
try:
@ -118,6 +156,9 @@ class PeerGroupAuditManager(manager.Manager):
f"and offline subcloud [{subcloud.name}], err: {e}")
raise e
def _sync_rehome_data(self, subcloud_id, rehome_data):
db_api.subcloud_update(self.context, subcloud_id, rehome_data=rehome_data)
def audit(self, system_peer, remote_peer_group, local_peer_group):
if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
LOG.info("Local peer group in migrating state, quit audit")
@ -187,14 +228,22 @@ class PeerGroupAuditManager(manager.Manager):
# set 'unmanaged+secondary' to local on same subclouds
elif remote_peer_group.get("migration_status") == \
consts.PEER_GROUP_MIGRATION_COMPLETE:
dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
remote_subclouds = \
self._get_subclouds_by_peer_group_from_system_peer(
dc_client,
system_peer,
remote_peer_group.get("peer_group_name"))
remote_sync_status = \
self._get_association_sync_status_from_peer_site(
dc_client,
system_peer,
remote_peer_group.get("id"))
local_subclouds_to_update, local_subclouds_to_delete = \
local_subclouds_to_update, local_subclouds_to_delete, \
any_rehome_failed = \
self._get_local_subclouds_to_update_and_delete(
local_peer_group, remote_subclouds)
local_peer_group, remote_subclouds, remote_sync_status)
for subcloud in local_subclouds_to_update:
self._set_local_subcloud_to_secondary(subcloud)
@ -218,7 +267,7 @@ class PeerGroupAuditManager(manager.Manager):
f"peer site, err: {e}")
raise e
if local_subclouds_to_update or local_subclouds_to_delete:
if remote_peer_group.get("system_leader_id") == system_peer.peer_uuid:
self._clear_or_raise_alarm(system_peer,
local_peer_group,
remote_peer_group)
@ -232,10 +281,13 @@ class PeerGroupAuditManager(manager.Manager):
system_peer,
remote_peer_group.get("peer_group_name"),
None)
SystemPeerManager.update_sync_status(
self.context, system_peer,
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
local_peer_group, remote_peer_group)
if not (remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC
and any_rehome_failed):
SystemPeerManager.update_sync_status(
self.context, system_peer,
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
local_peer_group, remote_peer_group)
self.require_audit_flag = False
else:
# If remote peer group migration_status is 'None'

View File

@ -236,10 +236,14 @@ class SystemPeerManager(manager.Manager):
f"it doesn't exist.")
return
is_secondary = SystemPeerManager.is_subcloud_secondary(peer_subcloud)
if not is_secondary:
if SystemPeerManager.get_subcloud_deploy_status(peer_subcloud) not in (
consts.DEPLOY_STATE_SECONDARY_FAILED,
consts.DEPLOY_STATE_SECONDARY,
consts.DEPLOY_STATE_REHOME_FAILED,
consts.DEPLOY_STATE_REHOME_PREP_FAILED
):
LOG.info(f"Ignoring delete Peer Site Subcloud {subcloud_ref} "
f"as is not in secondary state.")
f"as is not in secondary or rehome failed state.")
return
dc_client.delete_subcloud(subcloud_ref)
@ -340,7 +344,10 @@ class SystemPeerManager(manager.Manager):
# should be recorded as a failure.
peer_subcloud_deploy_status = self.get_subcloud_deploy_status(
peer_subcloud)
if peer_subcloud_deploy_status != consts.DEPLOY_STATE_SECONDARY:
if peer_subcloud_deploy_status not in \
(consts.DEPLOY_STATE_SECONDARY,
consts.DEPLOY_STATE_REHOME_FAILED,
consts.DEPLOY_STATE_REHOME_PREP_FAILED):
subcloud.msg = "Subcloud's deploy status not correct: %s" \
% peer_subcloud_deploy_status
return subcloud, False
@ -427,6 +434,9 @@ class SystemPeerManager(manager.Manager):
continue
try:
# TODO(lzhu1): Sending requests to fetch the subcloud one by one
# should be optimized to fetch them all with one request by calling
# the "get_subcloud_list_by_peer_group" method
peer_subcloud = self.get_peer_subcloud(dc_client, subcloud_name)
if not peer_subcloud:
LOG.info(f"Subcloud {subcloud_name} (region_name: "
@ -434,10 +444,12 @@ class SystemPeerManager(manager.Manager):
valid_subclouds.append(subcloud)
continue
if not self.is_subcloud_secondary(peer_subcloud):
msg = "Ignoring update Peer Site Subcloud " + \
f"{subcloud_name} (region_name: {region_name})" + \
" as is not in secondary state."
if not self.is_subcloud_secondary(peer_subcloud) and \
self.get_subcloud_deploy_status(peer_subcloud) not in \
(consts.DEPLOY_STATE_REHOME_FAILED,
consts.DEPLOY_STATE_REHOME_PREP_FAILED):
msg = (f"Subcloud {subcloud_name} is not in the right state "
f"for sync.")
LOG.info(msg)
error_msg[subcloud_name] = msg
continue