diff --git a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py
index 60d1df78d..6686ab5ed 100644
--- a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py
+++ b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py
@@ -305,7 +305,9 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController):
                 (system_leader_id and
                  system_leader_id != group.system_leader_id) or
                 (system_leader_name and
-                 system_leader_name != group.system_leader_name)))
+                 system_leader_name != group.system_leader_name) or
+                (migration_status and
+                 migration_status != group.migration_status)))

         if not any_update:
             return db_api.subcloud_peer_group_db_model_to_dict(group)
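Note: the controller change above makes a PATCH that carries only migration_status count as a real update. The detection pattern is generic: an update is needed only when a supplied, non-None field differs from the stored record. A minimal standalone sketch of that pattern; Record and detect_update are hypothetical names, not the dcmanager API:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Record:
        system_leader_id: Optional[str] = None
        system_leader_name: Optional[str] = None
        migration_status: Optional[str] = None

    def detect_update(record, **payload):
        # An update is needed only when a supplied, non-None field
        # differs from the stored value.
        return any(
            value is not None and value != getattr(record, field)
            for field, value in payload.items()
        )

    record = Record(migration_status="migrating")
    assert not detect_update(record, migration_status="migrating")  # no-op
    assert detect_update(record, migration_status="complete")       # real update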
diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py
index fa5b4ae66..68b528856 100644
--- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py
+++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py
@@ -343,13 +343,19 @@ class SubcloudsController(object):
             else dccommon_consts.DEPLOY_CONFIG_UP_TO_DATE
         return sync_status

-    def _validate_rehome_pending(self, subcloud, management_state):
+    def _validate_rehome_pending(self, subcloud, management_state, request):
         unmanaged = dccommon_consts.MANAGEMENT_UNMANAGED
         error_msg = None

         # Can only set the subcloud to rehome-pending
-        # if the deployment is done
-        if subcloud.deploy_status != consts.DEPLOY_STATE_DONE:
+        # if the deployment is done or the request is from another site.
+        # The validation is skipped for requests from another site because,
+        # when migrating the subcloud back to a peer site, that site will
+        # attempt to set the remote subcloud's deploy status to
+        # "rehome-pending", but the remote subcloud might be in a
+        # "rehome-failed" state from a previous failed rehoming attempt.
+        if (subcloud.deploy_status != consts.DEPLOY_STATE_DONE and
+                not utils.is_req_from_another_dc(request)):
             error_msg = (
                 "The deploy status can only be updated to "
                 f"'{consts.DEPLOY_STATE_REHOME_PENDING}' if the current "
@@ -656,13 +662,16 @@ class SubcloudsController(object):
         req_from_another_dc = utils.is_req_from_another_dc(request)
         original_pgrp = None
         leader_on_local_site = False
+        peer_site_available = True
+        pga = None
+        update_in_non_primary_site = False
         if subcloud.peer_group_id is not None:
             # Get the original peer group of the subcloud
             original_pgrp = db_api.subcloud_peer_group_get(
                 context, subcloud.peer_group_id)
             leader_on_local_site = utils.is_leader_on_local_site(original_pgrp)
             # A sync command is required after updating a subcloud
-            # in an SPG that is already associated with a PGA on the primary
+            # in an SPG that is already associated with a PGA in the primary
             # and leader site. The existence of the PGA will be checked
             # by the update_association_sync_status method later.
             if (original_pgrp.group_priority == 0 and
@@ -670,6 +679,18 @@ class SubcloudsController(object):
                     not req_from_another_dc):
                 sync_peer_groups.add(subcloud.peer_group_id)

+            # Get the peer site availability and PGA sync status
+            # TODO(lzhu1): support multiple sites
+            associations = db_api.peer_group_association_get_by_peer_group_id(
+                context, original_pgrp.id)
+            for association in associations:
+                pga = association
+                system_peer = db_api.system_peer_get(
+                    context, association.system_peer_id)
+                peer_site_available = \
+                    system_peer.availability_state == \
+                    consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE
+
         peer_group = payload.get('peer_group')
         # Verify the peer_group is valid
         peer_group_id = None
@@ -690,22 +711,13 @@ class SubcloudsController(object):
                     pecan.abort(400, _("Removing subcloud from a "
                                        "peer group not led by the "
                                        "current site is prohibited."))
-                # Get associations by peer group id
-                associations = db_api.\
-                    peer_group_association_get_by_peer_group_id(
-                        context, original_pgrp.id)
-                for association in associations:
-                    system_peer = db_api.system_peer_get(
-                        context, association.system_peer_id)
-                    # If system peer is available, then does not allow
-                    # to remove the subcloud from secondary peer group
-                    if system_peer.availability_state == consts.\
-                        SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE \
-                            and original_pgrp.group_priority > 0:
-                        pecan.abort(400, _(
-                            "Removing subcloud from a peer group "
-                            "associated with an available system peer "
-                            "is prohibited."))
+                # If the system peer is available, removing the subcloud
+                # from a secondary peer group is not allowed
+                if peer_site_available and original_pgrp.group_priority > 0:
+                    pecan.abort(400, _(
+                        "Removing subcloud from a peer group "
+                        "associated with an available system peer "
+                        "is prohibited."))
                 peer_group_id = 'none'
             else:
                 if not (subcloud.rehome_data or (
@@ -744,16 +756,39 @@ class SubcloudsController(object):
                 sync_peer_groups.add(pgrp.id)
                 peer_group_id = pgrp.id

+        bootstrap_values = payload.get('bootstrap_values')
+        bootstrap_address = payload.get('bootstrap_address')
+
         # Subcloud can only be updated while it is managed in
         # the primary site because the sync command can only be issued
-        # in the site where the SPG was created.
+        # in the site where the SPG was created. However, a bootstrap
+        # values or address update is an exception.
         if original_pgrp and peer_group_id is None and not req_from_another_dc:
             if original_pgrp.group_priority > 0:
-                pecan.abort(400, _("Subcloud update is only allowed when "
-                                   "its peer group priority value is 0."))
+                if bootstrap_values or bootstrap_address:
+                    if any(field not in
+                           ('bootstrap_values', 'bootstrap_address')
+                           for field in payload):
+                        pecan.abort(400,
+                                    _("Only bootstrap values and address "
+                                      "can be updated in the non-primary site"))
+                    if (subcloud.deploy_status ==
+                            consts.DEPLOY_STATE_REHOME_FAILED and
+                            not peer_site_available):
+                        update_in_non_primary_site = True
+                    else:
+                        pecan.abort(400,
+                                    _("Subcloud bootstrap values or address "
+                                      "update in the non-primary site is only "
+                                      "allowed when rehome failed and the "
+                                      "primary site is unavailable."))
+                if not update_in_non_primary_site:
+                    pecan.abort(400, _("Subcloud update is only allowed when "
+                                       "its peer group priority value is 0."))
+
             # Updating a subcloud under the peer group on primary site
             # that the peer group should be led by the primary site.
-            if not leader_on_local_site:
+            if not leader_on_local_site and not update_in_non_primary_site:
                 pecan.abort(400, _("Updating subcloud from a "
                                    "peer group not led by the "
                                    "current site is prohibited."))
@@ -845,15 +880,13 @@ class SubcloudsController(object):
         group_id = payload.get('group_id')
         description = payload.get('description')
         location = payload.get('location')
-        bootstrap_values = payload.get('bootstrap_values')
-        bootstrap_address = payload.get('bootstrap_address')

         # If the migrate flag is present we need to update the deploy status
         # to consts.DEPLOY_STATE_REHOME_PENDING
         deploy_status = None
         if (payload.get('migrate') == 'true' and
                 subcloud.deploy_status != consts.DEPLOY_STATE_REHOME_PENDING):
-            self._validate_rehome_pending(subcloud, management_state)
+            self._validate_rehome_pending(subcloud, management_state, request)
             deploy_status = consts.DEPLOY_STATE_REHOME_PENDING

         # Syntax checking
@@ -917,7 +950,23 @@ class SubcloudsController(object):
             bootstrap_address=bootstrap_address,
             deploy_status=deploy_status)

-        if sync_peer_groups:
+        # Update the PGA sync_status to out-of-sync locally
+        # in the non-primary site. This only occurs when the primary site
+        # is unavailable and rehome fails due to an issue with the
+        # bootstrap values or address.
+        if (update_in_non_primary_site and
+                pga.sync_status !=
+                consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC):
+            db_api.peer_group_association_update(
+                context,
+                pga.id,
+                sync_status=consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC)
+            LOG.debug(
+                f"Updated Local Peer Group Association {pga.id} "
+                f"sync_status to out-of-sync.")
+        # Sync the PGA out-of-sync status across all sites; this sync is
+        # initiated by the primary site.
+        elif sync_peer_groups:
             # Collect the affected peer group association IDs.
             association_ids = set()
             for pg_id in sync_peer_groups:
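Note: the gating added to subclouds.py reduces to one small decision: in a non-primary site, only bootstrap_values/bootstrap_address may be patched, and only while the subcloud is rehome-failed and the primary site is unavailable. A simplified, self-contained sketch of that rule; check_non_primary_update is a hypothetical helper, and plain strings stand in for the consts.* values:

    BOOTSTRAP_FIELDS = ("bootstrap_values", "bootstrap_address")

    def check_non_primary_update(payload, deploy_status, peer_site_available):
        # Without a bootstrap field the update is rejected outright, as in
        # the priority check above.
        if not any(field in BOOTSTRAP_FIELDS for field in payload):
            raise ValueError("Subcloud update is only allowed when its "
                             "peer group priority value is 0")
        # Reject any field other than the two bootstrap ones.
        if any(field not in BOOTSTRAP_FIELDS for field in payload):
            raise ValueError("Only bootstrap values and address can be "
                             "updated in the non-primary site")
        # Bootstrap corrections are only allowed after a failed rehome
        # while the primary site is unreachable.
        if deploy_status != "rehome-failed" or peer_site_available:
            raise ValueError("Update is only allowed when rehome failed "
                             "and the primary site is unavailable")

    # Allowed: correcting the bootstrap address after a failed rehome
    # attempt while the primary site is down.
    check_non_primary_update({"bootstrap_address": "10.10.10.2"},
                             deploy_status="rehome-failed",
                             peer_site_available=False)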
diff --git a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py
index cf287ae17..c71ec8402 100644
--- a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py
+++ b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py
@@ -43,10 +43,10 @@ class PeerGroupAuditManager(manager.Manager):
         self.thread_lock = threading.Lock()

     def _get_subclouds_by_peer_group_from_system_peer(self,
+                                                      dc_client,
                                                       system_peer,
                                                       peer_group_name):
         try:
-            dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
             subclouds = dc_client.get_subcloud_list_by_peer_group(
                 peer_group_name)
             return subclouds
@@ -55,6 +55,22 @@ class PeerGroupAuditManager(manager.Manager):
                           f"{peer_group_name} from DC: "
                           f"{system_peer.peer_name}")

+    @staticmethod
+    def _get_association_sync_status_from_peer_site(dc_client,
+                                                    system_peer,
+                                                    peer_group_id):
+        try:
+            # Get peer site system peer
+            dc_peer_system_peer = dc_client.get_system_peer(
+                utils.get_local_system().uuid)
+            association = dc_client. \
+                get_peer_group_association_with_peer_id_and_pg_id(
+                    dc_peer_system_peer.get('id'), peer_group_id)
+            return association.get("sync-status")
+        except Exception:
+            LOG.exception(f"Failed to get the PGA sync status of peer group "
+                          f"{peer_group_id} from DC: {system_peer.peer_name}")
+
     def _update_remote_peer_group_migration_status(self,
                                                    system_peer,
                                                    peer_group_name,
@@ -71,9 +87,11 @@ class PeerGroupAuditManager(manager.Manager):

     def _get_local_subclouds_to_update_and_delete(self,
                                                   local_peer_group,
-                                                  remote_subclouds):
+                                                  remote_subclouds,
+                                                  remote_sync_status):
         local_subclouds_to_update = list()
         local_subclouds_to_delete = list()
+        any_rehome_failed = False
         remote_subclouds_dict = {remote_subcloud.get('region-name'):
                                  remote_subcloud for remote_subcloud
                                  in remote_subclouds}
@@ -92,10 +110,30 @@ class PeerGroupAuditManager(manager.Manager):
                     not utils.subcloud_is_secondary_state(
                         local_subcloud.deploy_status)):
                 local_subclouds_to_update.append(local_subcloud)
+                # Sync rehome_data from the remote to the local subcloud if
+                # the remote PGA sync_status is out-of-sync once migration
+                # completes, which indicates that the subcloud's bootstrap
+                # values/address were updated on the remote site.
+                if remote_sync_status == \
+                        consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:
+                    self._sync_rehome_data(
+                        local_subcloud.id, remote_subcloud.get('rehome_data'))
+            elif remote_subcloud.get('deploy-status') in \
+                    (consts.DEPLOY_STATE_REHOME_FAILED,
+                     consts.DEPLOY_STATE_REHOME_PREP_FAILED):
+                # Set the local subcloud to rehome-failed if the remote is
+                # rehome-failed or rehome-prep-failed; otherwise, the
+                # deploy_status would remain rehome-pending, which would
+                # block the correction of the bootstrap values/address.
+                db_api.subcloud_update(
+                    self.context, local_subcloud.id,
+                    deploy_status=consts.DEPLOY_STATE_REHOME_FAILED)
+                any_rehome_failed = True
             else:
                 local_subclouds_to_delete.append(local_subcloud)

-        return local_subclouds_to_update, local_subclouds_to_delete
+        return local_subclouds_to_update, local_subclouds_to_delete, \
+            any_rehome_failed

     def _set_local_subcloud_to_secondary(self, subcloud):
         try:
@@ -118,6 +156,9 @@ class PeerGroupAuditManager(manager.Manager):
                           f"and offline subcloud [{subcloud.name}], err: {e}")
             raise e

+    def _sync_rehome_data(self, subcloud_id, rehome_data):
+        db_api.subcloud_update(self.context, subcloud_id, rehome_data=rehome_data)
+
     def audit(self, system_peer, remote_peer_group, local_peer_group):
         if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
             LOG.info("Local peer group in migrating state, quit audit")
@@ -187,14 +228,22 @@ class PeerGroupAuditManager(manager.Manager):
         # set 'unmanaged+secondary' to local on same subclouds
         elif remote_peer_group.get("migration_status") == \
                 consts.PEER_GROUP_MIGRATION_COMPLETE:
+            dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
             remote_subclouds = \
                 self._get_subclouds_by_peer_group_from_system_peer(
+                    dc_client,
                     system_peer,
                     remote_peer_group.get("peer_group_name"))
+            remote_sync_status = \
+                self._get_association_sync_status_from_peer_site(
+                    dc_client,
+                    system_peer,
+                    remote_peer_group.get("id"))

-            local_subclouds_to_update, local_subclouds_to_delete = \
+            local_subclouds_to_update, local_subclouds_to_delete, \
+                any_rehome_failed = \
                 self._get_local_subclouds_to_update_and_delete(
-                    local_peer_group, remote_subclouds)
+                    local_peer_group, remote_subclouds, remote_sync_status)

             for subcloud in local_subclouds_to_update:
                 self._set_local_subcloud_to_secondary(subcloud)
@@ -218,7 +267,7 @@ class PeerGroupAuditManager(manager.Manager):
                               f"peer site, err: {e}")
                 raise e

-            if local_subclouds_to_update or local_subclouds_to_delete:
+            if remote_peer_group.get("system_leader_id") == system_peer.peer_uuid:
                 self._clear_or_raise_alarm(system_peer,
                                            local_peer_group,
                                            remote_peer_group)
@@ -232,10 +281,13 @@ class PeerGroupAuditManager(manager.Manager):
                     system_peer,
                     remote_peer_group.get("peer_group_name"),
                     None)
-            SystemPeerManager.update_sync_status(
-                self.context, system_peer,
-                consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
-                local_peer_group, remote_peer_group)
+
+            if not (remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC
+                    and any_rehome_failed):
+                SystemPeerManager.update_sync_status(
+                    self.context, system_peer,
+                    consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
+                    local_peer_group, remote_peer_group)
             self.require_audit_flag = False
         else:
             # If remote peer group migration_status is 'None'
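Note: after a completed migration, the audit holds back the in-sync transition while the remote PGA still reports out-of-sync and at least one local subcloud was marked rehome-failed, so the corrected bootstrap data can still be pulled. A minimal sketch of that gate; should_mark_in_sync is a hypothetical helper and "out-of-sync" stands in for consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:

    def should_mark_in_sync(remote_sync_status, any_rehome_failed):
        # Keep the association out-of-sync while the remote PGA still
        # reports out-of-sync and a local subcloud ended up rehome-failed,
        # so the corrected rehome data can still be synced.
        return not (remote_sync_status == "out-of-sync" and any_rehome_failed)

    assert should_mark_in_sync("in-sync", any_rehome_failed=False)
    assert should_mark_in_sync("out-of-sync", any_rehome_failed=False)
    assert not should_mark_in_sync("out-of-sync", any_rehome_failed=True)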
diff --git a/distributedcloud/dcmanager/manager/system_peer_manager.py b/distributedcloud/dcmanager/manager/system_peer_manager.py
index 2db735c35..c171b527b 100644
--- a/distributedcloud/dcmanager/manager/system_peer_manager.py
+++ b/distributedcloud/dcmanager/manager/system_peer_manager.py
@@ -236,10 +236,14 @@ class SystemPeerManager(manager.Manager):
                      f"it doesn't exist.")
             return

-        is_secondary = SystemPeerManager.is_subcloud_secondary(peer_subcloud)
-        if not is_secondary:
+        if SystemPeerManager.get_subcloud_deploy_status(peer_subcloud) not in (
+            consts.DEPLOY_STATE_SECONDARY_FAILED,
+            consts.DEPLOY_STATE_SECONDARY,
+            consts.DEPLOY_STATE_REHOME_FAILED,
+            consts.DEPLOY_STATE_REHOME_PREP_FAILED
+        ):
             LOG.info(f"Ignoring delete Peer Site Subcloud {subcloud_ref} "
-                     f"as is not in secondary state.")
+                     f"as it is not in a secondary or rehome-failed state.")
             return

         dc_client.delete_subcloud(subcloud_ref)
@@ -337,7 +341,10 @@ class SystemPeerManager(manager.Manager):
         # should be recorded as a failure.
         peer_subcloud_deploy_status = self.get_subcloud_deploy_status(
             peer_subcloud)
-        if peer_subcloud_deploy_status != consts.DEPLOY_STATE_SECONDARY:
+        if peer_subcloud_deploy_status not in \
+                (consts.DEPLOY_STATE_SECONDARY,
+                 consts.DEPLOY_STATE_REHOME_FAILED,
+                 consts.DEPLOY_STATE_REHOME_PREP_FAILED):
             subcloud.msg = "Subcloud's deploy status not correct: %s" \
                 % peer_subcloud_deploy_status
             return subcloud, False
f"peer site, err: {e}") raise e - if local_subclouds_to_update or local_subclouds_to_delete: + if remote_peer_group.get("system_leader_id") == system_peer.peer_uuid: self._clear_or_raise_alarm(system_peer, local_peer_group, remote_peer_group) @@ -232,10 +281,13 @@ class PeerGroupAuditManager(manager.Manager): system_peer, remote_peer_group.get("peer_group_name"), None) - SystemPeerManager.update_sync_status( - self.context, system_peer, - consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, - local_peer_group, remote_peer_group) + + if not (remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC + and any_rehome_failed): + SystemPeerManager.update_sync_status( + self.context, system_peer, + consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, + local_peer_group, remote_peer_group) self.require_audit_flag = False else: # If remote peer group migration_status is 'None' diff --git a/distributedcloud/dcmanager/manager/system_peer_manager.py b/distributedcloud/dcmanager/manager/system_peer_manager.py index 2db735c35..c171b527b 100644 --- a/distributedcloud/dcmanager/manager/system_peer_manager.py +++ b/distributedcloud/dcmanager/manager/system_peer_manager.py @@ -236,10 +236,14 @@ class SystemPeerManager(manager.Manager): f"it doesn't exist.") return - is_secondary = SystemPeerManager.is_subcloud_secondary(peer_subcloud) - if not is_secondary: + if SystemPeerManager.get_subcloud_deploy_status(peer_subcloud) not in ( + consts.DEPLOY_STATE_SECONDARY_FAILED, + consts.DEPLOY_STATE_SECONDARY, + consts.DEPLOY_STATE_REHOME_FAILED, + consts.DEPLOY_STATE_REHOME_PREP_FAILED + ): LOG.info(f"Ignoring delete Peer Site Subcloud {subcloud_ref} " - f"as is not in secondary state.") + f"as is not in secondary or rehome failed state.") return dc_client.delete_subcloud(subcloud_ref) @@ -337,7 +341,10 @@ class SystemPeerManager(manager.Manager): # should be recorded as a failure. peer_subcloud_deploy_status = self.get_subcloud_deploy_status( peer_subcloud) - if peer_subcloud_deploy_status != consts.DEPLOY_STATE_SECONDARY: + if peer_subcloud_deploy_status not in \ + (consts.DEPLOY_STATE_SECONDARY, + consts.DEPLOY_STATE_REHOME_FAILED, + consts.DEPLOY_STATE_REHOME_PREP_FAILED): subcloud.msg = "Subcloud's deploy status not correct: %s" \ % peer_subcloud_deploy_status return subcloud, False @@ -424,6 +431,9 @@ class SystemPeerManager(manager.Manager): continue try: + # TODO(lzhu1): Sending requests to fetch the subcloud one by one + # should be optimized to fetch them all with one request by calling + # the "get_subcloud_list_by_peer_group" method peer_subcloud = self.get_peer_subcloud(dc_client, subcloud_name) if not peer_subcloud: LOG.info(f"Subcloud {subcloud_name} (region_name: " @@ -431,10 +441,12 @@ class SystemPeerManager(manager.Manager): valid_subclouds.append(subcloud) continue - if not self.is_subcloud_secondary(peer_subcloud): - msg = "Ignoring update Peer Site Subcloud " + \ - f"{subcloud_name} (region_name: {region_name})" + \ - " as is not in secondary state." + if not self.is_subcloud_secondary(peer_subcloud) and \ + self.get_subcloud_deploy_status(peer_subcloud) not in \ + (consts.DEPLOY_STATE_REHOME_FAILED, + consts.DEPLOY_STATE_REHOME_PREP_FAILED): + msg = (f"Subcloud {subcloud_name} is not in the right state " + f"for sync.") LOG.info(msg) error_msg[subcloud_name] = msg continue