From 5f132a43aba15e360e011ec5bc07789e8e1c6c75 Mon Sep 17 00:00:00 2001 From: Li Zhu Date: Fri, 16 Feb 2024 18:56:55 -0500 Subject: [PATCH] Set PGA status to out-of-sync after certain operations Fix the reported issue: The PGA fails to transition from 'in-sync' to 'out-of-sync' after updating subclouds. This commit includes the following changes: 1. Updates the SPG sync_status to 'out-of-sync' after performing any of the following operations and provides an informative message to the operator: a) Adding a subcloud to the SPG b) Removing a subcloud from the SPG c) Updating a subcloud in the SPG 2. Ensures that updates on SPG attributes, such as name, max_subcloud_rehoming, and group_state, are automatically propagated to the peer site. Test plan: Pre-Steps: 1. Create the system peer from Site A to Site B 2. Create System peer from Site B to Site A 3. Create the subcloud peer group in the Site A 4. Add subcloud(s) to the peer group 5. Create peer group association to associate system peer and subcloud peer group - Site A 6. Check current sync status on Sites A and B. Verify they are 'in-sync'. PASS: Verify 'out-of-sync' on both sites after running any of the following operations on site A, the primary and leader site: 1. Adding subcloud to the SPG. 2. Removing subcloud from the SPG. 3. Updating any field of a subcloud in the SPG, such as bootstrap address, bootstrap values, install values, etc. PASS: Repeat the above operations while site B is down and verify that PGA sync_status is set to "failed". PASS: Verify that SPG attribute updates are accepted if peer site is up and the updates are successfully synced to the peer site. PASS: Verify that SPG attribute updates are rejected if the peer site is down. PASS: Verify that if the subcloud does not belong to any peer group, or if it is part of a peer group but the peer group is not associated with any peer yet, updating the subcloud would not result in an "informative" message and no attempt to update the PGA sync_status should occur. Closes-Bug: 2054123 Closes-Bug: 2054124 Change-Id: I9f0e44e34c7db5d60d211b70e839606d0361cf83 Signed-off-by: lzhu1 --- .../controllers/v1/peer_group_association.py | 7 +- .../api/controllers/v1/subcloud_peer_group.py | 40 +++++ .../dcmanager/api/controllers/v1/subclouds.py | 75 ++++++---- .../dcmanager/common/exceptions.py | 4 + distributedcloud/dcmanager/common/utils.py | 16 ++ .../dcmanager/db/sqlalchemy/api.py | 5 +- .../manager/peer_group_audit_manager.py | 6 +- .../dcmanager/manager/peer_monitor_manager.py | 2 +- distributedcloud/dcmanager/manager/service.py | 19 +++ .../dcmanager/manager/system_peer_manager.py | 141 ++++++++++++++++-- distributedcloud/dcmanager/rpc/client.py | 20 ++- .../test_peer_group_association.py | 2 +- .../controllers/test_subcloud_peer_group.py | 30 +++- .../unit/api/v1/controllers/test_subclouds.py | 109 ++++++++++++++ .../unit/manager/test_system_peer_manager.py | 110 +++++++++++++- 15 files changed, 537 insertions(+), 49 deletions(-) diff --git a/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py b/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py index d13d8e09f..715338148 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py +++ b/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py @@ -37,7 +37,8 @@ ASSOCIATION_SYNC_STATUS_LIST = \ [consts.ASSOCIATION_SYNC_STATUS_SYNCING, consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC, - consts.ASSOCIATION_SYNC_STATUS_FAILED] + consts.ASSOCIATION_SYNC_STATUS_FAILED, + consts.ASSOCIATION_SYNC_STATUS_UNKNOWN] class PeerGroupAssociationsController(restcomm.GenericPathController): @@ -336,8 +337,8 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): try: # Ask dcmanager-manager to update the subcloud peer group priority # to peer site. It will do the real work... - return self.rpc_client.update_subcloud_peer_group(context, - association.id) + return self.rpc_client.sync_subcloud_peer_group_only( + context, association.id) except RemoteError as e: pecan.abort(httpclient.UNPROCESSABLE_ENTITY, e.value) except Exception as e: diff --git a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py index 35fcfb46c..f38b0653d 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py @@ -290,6 +290,25 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): ): pecan.abort(httpclient.BAD_REQUEST, _('nothing to update')) + # The flag to indicate if the update needs to be synced to + # the peer site(s). + sync_needed = ((peer_group_name and + peer_group_name != group.peer_group_name) or + (group_state and + group_state != group.group_state) or + (max_subcloud_rehoming is not None and + max_subcloud_rehoming != group.max_subcloud_rehoming)) + + any_update = (sync_needed or + ((group_priority is not None and + group_priority != group.group_priority) or + (system_leader_id and + system_leader_id != group.system_leader_id) or + (system_leader_name and + system_leader_name != group.system_leader_name))) + if not any_update: + return db_api.subcloud_peer_group_db_model_to_dict(group) + # Check value is not None or empty before calling validation function if (peer_group_name is not None and not utils.validate_name(peer_group_name, @@ -322,6 +341,27 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): pecan.abort(httpclient.BAD_REQUEST, _('Invalid migration_status')) + # Update on peer site(s) + if (not utils.is_req_from_another_dc(request)) and sync_needed: + success_peer_ids, failed_peer_ids = \ + self.rpc_client.update_subcloud_peer_group( + context, group.id, group_state, max_subcloud_rehoming, + group.peer_group_name, peer_group_name) + if failed_peer_ids: + if not success_peer_ids: + # Reject if all failed + pecan.abort( + httpclient.FORBIDDEN, + _('Unable to sync the update to the peer site(s)')) + # Local update will continue if it's only partial + # failure. + LOG.error(f"Failed to sync the subcloud peer group " + f"{group.id} update on the peer site(s) " + f"{failed_peer_ids} with the values: " + f"{group_state=}, " + f"{max_subcloud_rehoming=}, " + f"{peer_group_name=}") + try: updated_peer_group = db_api.subcloud_peer_group_update( context, diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py index 92c42964a..55582d9cb 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py @@ -648,14 +648,34 @@ class SubcloudsController(object): if not payload: pecan.abort(400, _('Body required')) + # Create a set to store the affected SPG(s) that need to be + # synced due to the subcloud update. This set will be utilized to + # update the sync_status in the corresponding PGA on each site. + sync_peer_groups = set() + + req_from_another_dc = utils.is_req_from_another_dc(request) + original_pgrp = None + leader_on_local_site = False + if subcloud.peer_group_id is not None: + # Get the original peer group of the subcloud + original_pgrp = db_api.subcloud_peer_group_get( + context, subcloud.peer_group_id) + leader_on_local_site = utils.is_leader_on_local_site(original_pgrp) + # A sync command is required after updating a subcloud + # in an SPG that is already associated with a PGA on the primary + # and leader site. The existence of the PGA will be checked + # by the update_association_sync_status method later. + if (original_pgrp.group_priority == 0 and + leader_on_local_site and + not req_from_another_dc): + sync_peer_groups.add(subcloud.peer_group_id) + peer_group = payload.get('peer_group') # Verify the peer_group is valid peer_group_id = None if peer_group is not None: # peer_group may be passed in the payload as an int or str peer_group = str(peer_group) - # Get current site system information - local_system_uuid = utils.get_local_system().uuid # Check if user wants to remove a subcloud # from a subcloud-peer-group by # setting peer_group_id as 'none', @@ -664,12 +684,9 @@ class SubcloudsController(object): # update_subcloud() will handle it and # Set the peer_group_id DB into None. if peer_group.lower() == 'none': - if subcloud.peer_group_id is not None: - # Get the peer group of the subcloud - original_pgrp = db_api.subcloud_peer_group_get( - context, subcloud.peer_group_id) + if original_pgrp: # Check the system leader is not on this site - if original_pgrp.system_leader_id != local_system_uuid: + if not leader_on_local_site: pecan.abort(400, _("Removing subcloud from a " "peer group not led by the " "current site is prohibited.")) @@ -691,23 +708,19 @@ class SubcloudsController(object): "is prohibited.")) peer_group_id = 'none' else: - if subcloud.peer_group_id is not None and \ + if original_pgrp and original_pgrp.group_priority > 0 and \ str(subcloud.peer_group_id) != peer_group: - original_pgrp = utils.subcloud_peer_group_get_by_ref( - context, str(subcloud.peer_group_id)) - if original_pgrp and original_pgrp.group_priority > 0: - pecan.abort(400, _( - "Cannot update subcloud to a new peer group " - "if the original peer group has non-zero " - "priority.")) + pecan.abort(400, _( + "Cannot update subcloud to a new peer group " + "if the original peer group has non-zero priority.")) pgrp = utils.subcloud_peer_group_get_by_ref(context, peer_group) if not pgrp: pecan.abort(400, _('Invalid peer group')) - if not utils.is_req_from_another_dc(request): + if not req_from_another_dc: if pgrp.group_priority > 0: pecan.abort(400, _("Cannot set the subcloud to a peer" " group with non-zero priority.")) - elif pgrp.system_leader_id != local_system_uuid: + elif not utils.is_leader_on_local_site(pgrp): pecan.abort(400, _("Update subcloud to a peer " "group that is not led by the " "current site is prohibited.")) @@ -720,24 +733,19 @@ class SubcloudsController(object): pecan.abort(400, _("Only subclouds that are " "managed and online can be " "added to a peer group.")) + sync_peer_groups.add(pgrp.id) peer_group_id = pgrp.id # Subcloud can only be updated while it is managed in # the primary site because the sync command can only be issued # in the site where the SPG was created. - if subcloud.peer_group_id is not None and peer_group_id is None \ - and not utils.is_req_from_another_dc(request): - # Get the peer group of the subcloud - original_pgrp = db_api.subcloud_peer_group_get( - context, subcloud.peer_group_id) + if original_pgrp and peer_group_id is None and not req_from_another_dc: if original_pgrp.group_priority > 0: pecan.abort(400, _("Subcloud update is only allowed when " "its peer group priority value is 0.")) - # Get current site system information - local_system_uuid = utils.get_local_system().uuid # Updating a subcloud under the peer group on primary site # that the peer group should be led by the primary site. - if original_pgrp.system_leader_id != local_system_uuid: + if not leader_on_local_site: pecan.abort(400, _("Updating subcloud from a " "peer group not led by the " "current site is prohibited.")) @@ -900,6 +908,23 @@ class SubcloudsController(object): bootstrap_values=bootstrap_values, bootstrap_address=bootstrap_address, deploy_status=deploy_status) + + if sync_peer_groups: + # Collect the affected peer group association IDs. + association_ids = set() + for pg_id in sync_peer_groups: + association_ids.update( + self.dcmanager_rpc_client. + update_association_sync_status( + context, pg_id, + consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC)) + # Generate an informative message for reminding the operator + # that the sync command(s) should be executed. + info_message = utils.generate_sync_info_message( + association_ids) + if info_message: + subcloud["info_message"] = info_message + return subcloud except RemoteError as e: pecan.abort(422, e.value) diff --git a/distributedcloud/dcmanager/common/exceptions.py b/distributedcloud/dcmanager/common/exceptions.py index 7e7bf4f41..3c9bf0031 100644 --- a/distributedcloud/dcmanager/common/exceptions.py +++ b/distributedcloud/dcmanager/common/exceptions.py @@ -210,6 +210,10 @@ class SubcloudBackupOperationFailed(DCManagerException): "'dcmanager subcloud error' command for details") +class SubcloudSyncFailedException(DCManagerException): + message = _("Failed to sync subcloud update to peer site %(peer_id)s") + + class ConnectionRefused(DCManagerException): message = _("Connection to the service endpoint is refused") diff --git a/distributedcloud/dcmanager/common/utils.py b/distributedcloud/dcmanager/common/utils.py index ade803632..1685c8f78 100644 --- a/distributedcloud/dcmanager/common/utils.py +++ b/distributedcloud/dcmanager/common/utils.py @@ -1595,3 +1595,19 @@ def get_msg_output_info(log_file, target_task, target_str): def get_subcloud_ansible_log_file(subcloud_name): return os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud_name + '_playbook_output.log') + + +def is_leader_on_local_site(peer_group): + return peer_group.system_leader_id == get_local_system().uuid + + +def generate_sync_info_message(association_ids): + info_message = None + if association_ids: + info_message = ("The operation has caused the SPG to be out-of-sync. " + "Please run peer-group-association sync command to push " + "the subcloud peer group changes to the peer site:\n") + for association_id in association_ids: + info_message += (f"$ dcmanager peer-group-association" + f" sync {association_id}\n") + return info_message diff --git a/distributedcloud/dcmanager/db/sqlalchemy/api.py b/distributedcloud/dcmanager/db/sqlalchemy/api.py index 81a6cbb52..39e52ca12 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/api.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/api.py @@ -1272,7 +1272,10 @@ def peer_group_association_update(context, if sync_status is not None: association_ref.sync_status = sync_status if sync_message is not None: - association_ref.sync_message = sync_message + if str(sync_message).lower() == 'none': + association_ref.sync_message = None + else: + association_ref.sync_message = sync_message association_ref.save(session) return association_ref diff --git a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py index 39f7e43ab..cf287ae17 100644 --- a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py +++ b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py @@ -176,7 +176,7 @@ class PeerGroupAuditManager(manager.Manager): LOG.exception(f"Fail to unmanage local subcloud " f"{subcloud.name}, err: {e}") raise e - SystemPeerManager.update_sync_status_on_peer_site( + SystemPeerManager.update_sync_status( self.context, system_peer, consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC, local_peer_group, remote_peer_group) @@ -208,7 +208,7 @@ class PeerGroupAuditManager(manager.Manager): self.context, subcloud.id) LOG.info(f"Deleted local subcloud {subcloud.name}") except Exception as e: - SystemPeerManager.update_sync_status_on_peer_site( + SystemPeerManager.update_sync_status( self.context, system_peer, consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC, local_peer_group, remote_peer_group) @@ -232,7 +232,7 @@ class PeerGroupAuditManager(manager.Manager): system_peer, remote_peer_group.get("peer_group_name"), None) - SystemPeerManager.update_sync_status_on_peer_site( + SystemPeerManager.update_sync_status( self.context, system_peer, consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, local_peer_group, remote_peer_group) diff --git a/distributedcloud/dcmanager/manager/peer_monitor_manager.py b/distributedcloud/dcmanager/manager/peer_monitor_manager.py index 408900b8f..afdd61a85 100644 --- a/distributedcloud/dcmanager/manager/peer_monitor_manager.py +++ b/distributedcloud/dcmanager/manager/peer_monitor_manager.py @@ -161,7 +161,7 @@ class PeerMonitor(object): sync_status = consts.ASSOCIATION_SYNC_STATUS_IN_SYNC dc_local_pg = db_api.subcloud_peer_group_get( self.context, association.peer_group_id) - SystemPeerManager.update_sync_status_on_peer_site( + SystemPeerManager.update_sync_status( self.context, self.peer, sync_status, dc_local_pg, association=association) diff --git a/distributedcloud/dcmanager/manager/service.py b/distributedcloud/dcmanager/manager/service.py index 83fc82d82..dd75f14b7 100644 --- a/distributedcloud/dcmanager/manager/service.py +++ b/distributedcloud/dcmanager/manager/service.py @@ -337,6 +337,16 @@ class DCManagerService(service.Service): return self.system_peer_manager.sync_subcloud_peer_group( context, association_id, sync_subclouds) + @request_context + def update_subcloud_peer_group(self, context, peer_group_id, + group_state, max_subcloud_rehoming, + group_name, new_group_name=None): + LOG.info("Handling update_subcloud_peer_group request for " + "peer group %s" % peer_group_id) + return self.system_peer_manager.update_subcloud_peer_group( + context, peer_group_id, group_state, max_subcloud_rehoming, + group_name, new_group_name) + @request_context def delete_peer_group_association(self, context, association_id): LOG.info("Handling delete_peer_group_association request for: %s", @@ -344,6 +354,15 @@ class DCManagerService(service.Service): return self.system_peer_manager.delete_peer_group_association( context, association_id) + @request_context + def update_association_sync_status(self, context, peer_group_id, + sync_status, sync_message=None): + # Updates peer group association sync_status + LOG.info("Handling update_peer_association_sync_status request for: %s" + % peer_group_id) + return self.system_peer_manager.update_association_sync_status( + context, peer_group_id, sync_status, sync_message) + def _stop_rpc_server(self): # Stop RPC connection to prevent new requests LOG.debug(_("Attempting to stop RPC service...")) diff --git a/distributedcloud/dcmanager/manager/system_peer_manager.py b/distributedcloud/dcmanager/manager/system_peer_manager.py index 320272e21..b2a15dc34 100644 --- a/distributedcloud/dcmanager/manager/system_peer_manager.py +++ b/distributedcloud/dcmanager/manager/system_peer_manager.py @@ -66,10 +66,9 @@ class SystemPeerManager(manager.Manager): return [association] if association else [] @staticmethod - def update_sync_status_on_peer_site(ctx, peer, sync_status, local_pg=None, - remote_pg=None, message="None", - association=None): - """Update sync status of association on peer site. + def update_sync_status(ctx, peer, sync_status, local_pg=None, + remote_pg=None, message="None", association=None): + """Update sync status of association. This function updates the sync status of the association on the peer site and then updates the sync status of the association on the @@ -84,7 +83,7 @@ class SystemPeerManager(manager.Manager): :param association: peer group association object """ - def _update_association_on_peer_site(association, peer, sync_status, + def _update_association_on_peer_site(peer, sync_status, local_pg, remote_pg, message): try: # Get peer site dcmanager client @@ -120,7 +119,6 @@ class SystemPeerManager(manager.Manager): return sync_status, message - associations = list() if association is None: associations = SystemPeerManager.get_local_associations( ctx, peer, local_pg) @@ -138,7 +136,7 @@ class SystemPeerManager(manager.Manager): local_pg = local_pg if local_pg is not None else db_api.\ subcloud_peer_group_get(ctx, association.peer_group_id) sync_status, message = _update_association_on_peer_site( - association, peer, sync_status, local_pg, remote_pg, message) + peer, sync_status, local_pg, remote_pg, message) if association.sync_status == sync_status and sync_status != \ consts.ASSOCIATION_SYNC_STATUS_FAILED: @@ -535,6 +533,120 @@ class SystemPeerManager(manager.Manager): failed_message, dc_peer_association_id) + def update_association_sync_status(self, context, peer_group_id, + sync_status, sync_message=None): + """Update PGA sync status on primary and peer site(s). + + The update of PGA sync status is always triggered on the primary site, + therefore, this method can only be called on the primary site. + + :param context: request context object + :param peer_group_id: id of the peer group used to fetch corresponding PGA + :param sync_status: the sync status to be updated to + :param sync_message: The sync message to be updated to + """ + # Collect the association IDs to be synced. + out_of_sync_associations_ids = set() + local_peer_gp = db_api.subcloud_peer_group_get(self.context, peer_group_id) + # Get associations by peer group id + associations = db_api.peer_group_association_get_by_peer_group_id( + context, peer_group_id) + if not associations: + LOG.debug("No association found for peer group %s" % peer_group_id) + else: + for association in associations: + if sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: + out_of_sync_associations_ids.add(association.id) + + pre_sync_status = association.sync_status + new_sync_status = sync_status + new_sync_message = sync_message + if sync_status in (consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, + consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC): + # We don't need sync_message for in-sync and out-of-sync + # status, so clear the previous remained message for these + # two status. + new_sync_message = 'None' + + # If the local sync_status already set to unknown, indicating + # that the peer site is unreachable, we'll change the sync_status + # to failed directly on the local site without trying to sync + # the status to the peer site. When the peer site becomes + # reachable, the primary site monitor will update the failed + # status to out-of-sync on both sites. + if pre_sync_status == consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: + if sync_status != consts.ASSOCIATION_SYNC_STATUS_IN_SYNC: + new_sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED + new_sync_message = ("Failed to update sync_status, " + "because the peer site is unreachable.") + # Update peer site peer association sync_status + else: + # Get system peer by peer id from the association + system_peer = db_api.system_peer_get( + context, association.system_peer_id) + if pre_sync_status != sync_status: + SystemPeerManager.update_sync_status( + context, system_peer, sync_status, local_peer_gp, + None, new_sync_message, association) + # Already update sync_status on both peer and local sites + continue + + if pre_sync_status != new_sync_status: + # Update local site peer association sync_status + db_api.peer_group_association_update( + context, + association.id, + sync_status=new_sync_status, + sync_message=new_sync_message) + LOG.debug( + f"Updated Local Peer Group Association {association.id} " + f"sync_status to {new_sync_status}.") + + return out_of_sync_associations_ids + + def update_subcloud_peer_group(self, context, peer_group_id, + group_state, max_subcloud_rehoming, + group_name, new_group_name=None): + # Collect the success and failed peer ids. + success_peer_ids = set() + failed_peer_ids = set() + + # Get associations by peer group id + associations = db_api.peer_group_association_get_by_peer_group_id( + context, peer_group_id) + if not associations: + LOG.info("No association found for peer group %s" % peer_group_id) + else: + for association in associations: + # Get system peer by peer id from the association + system_peer = db_api.system_peer_get( + context, association.system_peer_id) + # Get 'available' system peer + if system_peer.availability_state != \ + consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE: + LOG.warning("Peer system %s offline" % system_peer.id) + failed_peer_ids.add(system_peer.id) + else: + try: + # Get a client for sending request to this system peer + dc_client = self.get_peer_dc_client(system_peer) + peer_group_kwargs = { + 'peer-group-name': new_group_name, + 'group-state': group_state, + 'max-subcloud-rehoming': max_subcloud_rehoming + } + # Make an API call to update peer group on peer site + dc_client.update_subcloud_peer_group( + group_name, **peer_group_kwargs) + success_peer_ids.add(system_peer.id) + except Exception: + LOG.error(f"Failed to update Subcloud Peer Group " + f"{group_name} on peer site {system_peer.id}" + f" with the values: {peer_group_kwargs}") + failed_peer_ids.add(system_peer.id) + + return success_peer_ids, failed_peer_ids + def _get_non_primary_association(self, dc_client, dc_peer_system_peer_id, dc_peer_pg_id): """Get non-primary Association from peer site.""" @@ -690,9 +802,14 @@ class SystemPeerManager(manager.Manager): error_msg = self._sync_subclouds(context, peer, dc_local_pg.id, dc_peer_pg_id) if len(error_msg) > 0: + LOG.error(f"Failed to sync subcloud(s) in the Subcloud " + f"Peer Group {peer_group_name}: " + f"{json.dumps(error_msg)}") association_update['sync_status'] = \ consts.ASSOCIATION_SYNC_STATUS_FAILED - association_update['sync_message'] = json.dumps(error_msg) + association_update['sync_message'] = \ + (f"Failed to sync {error_msg.keys()} in the Subcloud " + f"Peer Group {peer_group_name}.") association = self._update_sync_status( context, association_id, **association_update) @@ -779,8 +896,14 @@ class SystemPeerManager(manager.Manager): 'peer subcloud clean', clean_function, delete_pool, subclouds) if delete_error_msg: + LOG.error(f"Failed to delete subcloud(s) from " + f"the Subcloud Peer Group {peer_group_name} " + f"on peer site: {json.dumps(delete_error_msg)}") + sync_message = (f"Deletion of {delete_error_msg.keys()} from " + f"the Subcloud Peer Group {peer_group_name} " + f"on the peer site failed.") self._update_sync_status_to_failed(context, association_id, - json.dumps(delete_error_msg)) + sync_message) return # System Peer does not exist on peer site, delete peer group diff --git a/distributedcloud/dcmanager/rpc/client.py b/distributedcloud/dcmanager/rpc/client.py index 88cf6d486..ac4ec50d2 100644 --- a/distributedcloud/dcmanager/rpc/client.py +++ b/distributedcloud/dcmanager/rpc/client.py @@ -265,15 +265,33 @@ class ManagerClient(RPCClient): return self.cast(ctxt, self.make_msg( 'sync_subcloud_peer_group', association_id=association_id)) - def update_subcloud_peer_group(self, ctxt, association_id): + def sync_subcloud_peer_group_only(self, ctxt, association_id): + # Without synchronizing subclouds return self.call(ctxt, self.make_msg( 'sync_subcloud_peer_group', association_id=association_id, sync_subclouds=False)) + def update_subcloud_peer_group(self, ctxt, peer_group_id, + group_state, max_subcloud_rehoming, + group_name, new_group_name=None): + return self.call(ctxt, self.make_msg( + 'update_subcloud_peer_group', + peer_group_id=peer_group_id, + group_state=group_state, + max_subcloud_rehoming=max_subcloud_rehoming, + group_name=group_name, new_group_name=new_group_name)) + def delete_peer_group_association(self, ctxt, association_id): return self.call(ctxt, self.make_msg('delete_peer_group_association', association_id=association_id)) + def update_association_sync_status(self, ctxt, peer_group_id, + sync_status, sync_message=None): + return self.call(ctxt, self.make_msg('update_association_sync_status', + peer_group_id=peer_group_id, + sync_status=sync_status, + sync_message=sync_message)) + def peer_monitor_notify(self, ctxt): return self.call(ctxt, self.make_msg('peer_monitor_notify')) diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py index c14c9c8b3..3f4cb29ec 100644 --- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py +++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py @@ -294,7 +294,7 @@ class TestPeerGroupAssociationUpdate(testroot.DCManagerApiTest, @mock.patch.object(rpc_client, 'ManagerClient') def test_update_success(self, mock_client): - mock_client().update_subcloud_peer_group.return_value = { + mock_client().sync_subcloud_peer_group_only.return_value = { 'peer-group-priority': SAMPLE_PEER_GROUP_PRIORITY_UPDATED } context = utils.dummy_context() diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py index fc4add3cb..fe902f209 100644 --- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py +++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py @@ -18,8 +18,12 @@ from dcmanager.tests import utils SAMPLE_SUBCLOUD_PEER_GROUP_NAME = 'GroupX' SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING = 50 +SAMPLE_SUBCLOUD_PEER_GROUP_STATE = 'enabled' SAMPLE_SUBCLOUD_PEER_GROUP_PIRORITY = 0 +NEW_SUBCLOUD_PEER_GROUP_NAME = 'GroupY' +NEW_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING = 20 + API_PREFIX = '/v1.0/subcloud-peer-groups' RESULT_KEY = 'subcloud_peer_groups' EXPECTED_FIELDS = ["id", @@ -74,7 +78,8 @@ class SubcloudPeerGroupAPIMixin(APIMixin): ), 'system_leader_name': kw.get('system_leader_name', 'dc-test'), 'group_priority': kw.get('group_priority', '0'), - 'group_state': kw.get('group_state', 'enabled'), + 'group_state': kw.get( + 'group_state', SAMPLE_SUBCLOUD_PEER_GROUP_STATE), 'max_subcloud_rehoming': kw.get( 'max_subcloud_rehoming', SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING @@ -343,6 +348,29 @@ class TestSubcloudPeerGroupUpdate(testroot.DCManagerApiTest, self.assertEqual(response.content_type, 'text/plain') self.assertEqual(response.status_code, http_client.BAD_REQUEST) + @mock.patch.object(rpc_client, 'ManagerClient') + def test_rename_subcloud_peer_group(self, mock_client): + mock_client().update_subcloud_peer_group.return_value = \ + (set(), set(mock.MagicMock())) + context = utils.dummy_context() + single_obj = self._create_db_object(context) + update_data = { + 'peer-group-name': NEW_SUBCLOUD_PEER_GROUP_NAME, + 'max-subcloud-rehoming': + NEW_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING + } + response = self.app.patch_json(self.get_single_url(single_obj.id), + headers=self.get_api_headers(), + params=update_data, + expect_errors=False) + self.assertEqual(response.status_code, http_client.OK) + + mock_client().update_subcloud_peer_group.assert_called_once_with( + mock.ANY, single_obj.id, None, + NEW_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING, + SAMPLE_SUBCLOUD_PEER_GROUP_NAME, + NEW_SUBCLOUD_PEER_GROUP_NAME) + class TestSubcloudPeerGroupDelete(testroot.DCManagerApiTest, SubcloudPeerGroupAPIMixin): diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py index 43a5953dd..d6a96ef40 100644 --- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py +++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py @@ -45,6 +45,7 @@ from dcmanager.tests.unit.api.v1.controllers.mixins import PostMixin from dcmanager.tests.unit.common import fake_strategy from dcmanager.tests.unit.common import fake_subcloud from dcmanager.tests.unit.fakes import FakeVimStrategy +from dcmanager.tests.unit.manager import test_system_peer_manager from dcmanager.tests import utils SAMPLE_SUBCLOUD_NAME = "SubcloudX" @@ -62,6 +63,7 @@ FAKE_SUBCLOUD_INSTALL_VALUES_WITH_PERSISTENT_SIZE = ( ) FAKE_SUBCLOUD_BOOTSTRAP_PAYLOAD = fake_subcloud.FAKE_SUBCLOUD_BOOTSTRAP_PAYLOAD OAM_FLOATING_IP = "10.10.10.12" +FAKE_PEER_GROUP_ID = 10 FAKE_PATCH = {"value": {"patchstate": "Partial-Apply"}} @@ -1313,6 +1315,9 @@ class TestSubcloudAPIOther(testroot.DCManagerApiTest): self.assertEqual( dccommon_consts.MANAGEMENT_UNMANAGED, updated_subcloud.management_state ) + # Verify that the update PGA sync_status is not called since subcloud + # isn't associated with an SPG. + self.mock_rpc_client().update_association_sync_status.assert_not_called() @mock.patch.object(subclouds.SubcloudsController, "_get_patch_data") def test_update_subcloud_group_value(self, mock_get_patch_data): @@ -1407,6 +1412,10 @@ class TestSubcloudAPIOther(testroot.DCManagerApiTest): deploy_status=None) self.assertEqual(response.status_int, 200) + # Verify that the update PGA sync_status is not called since subcloud + # isn't associated with an SPG. + self.mock_rpc_client().update_association_sync_status.assert_not_called() + @mock.patch.object(subclouds.SubcloudsController, "_check_existing_vim_strategy") @mock.patch.object(psd_common, "get_network_address_pool") @@ -1739,6 +1748,106 @@ class TestSubcloudAPIOther(testroot.DCManagerApiTest): ) self.mock_rpc_client().update_subcloud_endpoint_status.assert_not_called() + @mock.patch.object(subclouds.SubcloudsController, "_get_patch_data") + @mock.patch.object(cutils, 'is_leader_on_local_site') + @mock.patch.object(cutils, 'subcloud_peer_group_get_by_ref') + def test_add_subcloud_to_peer_group(self, mock_get_peer_group, + mock_is_leader_on_local_site, + mock_get_patch_data): + peer_group = test_system_peer_manager.TestSystemPeerManager. \ + create_subcloud_peer_group_static( + self.ctx, peer_group_name='SubcloudPeerGroup1') + mock_get_peer_group.return_value = peer_group + subcloud = fake_subcloud.create_fake_subcloud(self.ctx) + subcloud = db_api.subcloud_update( + self.ctx, + subcloud.id, + availability_status=dccommon_consts.AVAILABILITY_ONLINE, + deploy_status=consts.DEPLOY_STATE_DONE, + management_state=dccommon_consts.MANAGEMENT_MANAGED, + prestage_status=consts.PRESTAGE_STATE_COMPLETE + ) + mock_is_leader_on_local_site.return_value = True + data = {"peer_group": peer_group.id} + self.mock_rpc_client().update_subcloud.return_value = True + mock_get_patch_data.return_value = data + response = self.app.patch_json( + FAKE_URL + "/" + str(subcloud.id), headers=FAKE_HEADERS, + params=data + ) + self.assertEqual(response.status_int, 200) + self.mock_rpc_client().update_subcloud.assert_called_once() + self.mock_rpc_client().update_association_sync_status. \ + assert_called_once() + + @mock.patch.object(subclouds.SubcloudsController, "_get_patch_data") + @mock.patch.object(cutils, 'is_leader_on_local_site') + @mock.patch.object(cutils, 'subcloud_peer_group_get_by_ref') + def test_remove_subcloud_from_peer_group(self, mock_get_peer_group, + mock_is_leader_on_local_site, + mock_get_patch_data): + peer_group = test_system_peer_manager.TestSystemPeerManager. \ + create_subcloud_peer_group_static( + self.ctx, peer_group_name='SubcloudPeerGroup1') + mock_get_peer_group.return_value = peer_group + subcloud = fake_subcloud.create_fake_subcloud(self.ctx) + subcloud = db_api.subcloud_update( + self.ctx, + subcloud.id, + availability_status=dccommon_consts.AVAILABILITY_ONLINE, + deploy_status=consts.DEPLOY_STATE_DONE, + management_state=dccommon_consts.MANAGEMENT_MANAGED, + prestage_status=consts.PRESTAGE_STATE_COMPLETE, + peer_group_id=peer_group.id + ) + mock_is_leader_on_local_site.return_value = True + data = {"peer_group": "none"} + self.mock_rpc_client().update_subcloud.return_value = True + mock_get_patch_data.return_value = data + response = self.app.patch_json( + FAKE_URL + "/" + str(subcloud.id), headers=FAKE_HEADERS, + params=data + ) + self.assertEqual(response.status_int, 200) + self.mock_rpc_client().update_subcloud.assert_called_once() + self.mock_rpc_client().update_association_sync_status. \ + assert_called_once() + + @mock.patch.object(subclouds.SubcloudsController, "_get_patch_data") + @mock.patch.object(cutils, 'is_leader_on_local_site') + @mock.patch.object(cutils, 'subcloud_peer_group_get_by_ref') + def test_update_subcloud_on_non_primary_site(self, mock_get_peer_group, + mock_is_leader_on_local_site, + mock_get_patch_data): + peer_group = test_system_peer_manager.TestSystemPeerManager. \ + create_subcloud_peer_group_static( + self.ctx, peer_group_name='SubcloudPeerGroup1', group_priority=1) + mock_get_peer_group.return_value = peer_group + subcloud = fake_subcloud.create_fake_subcloud(self.ctx) + subcloud = db_api.subcloud_update( + self.ctx, + subcloud.id, + availability_status=dccommon_consts.AVAILABILITY_ONLINE, + deploy_status=consts.DEPLOY_STATE_DONE, + management_state=dccommon_consts.MANAGEMENT_MANAGED, + prestage_status=consts.PRESTAGE_STATE_COMPLETE, + peer_group_id=peer_group.id + ) + mock_is_leader_on_local_site.return_value = True + data = {"bootstrap-address": "10.10.10.11"} + mock_get_patch_data.return_value = data + six.assertRaisesRegex( + self, + webtest.app.AppError, + "400 *", + self.app.patch_json, + FAKE_URL + "/" + str(subcloud.id), + headers=FAKE_HEADERS, + params=data, + ) + self.mock_rpc_client().update_subcloud.assert_not_called() + self.mock_rpc_client().update_association_sync_status.assert_not_called() + def test_get_config_file_path(self): bootstrap_file = psd_common.get_config_file_path("subcloud1") install_values = psd_common.get_config_file_path( diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py index a9dcb49d9..419a5cc46 100644 --- a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py +++ b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py @@ -40,6 +40,9 @@ FAKE_SITE0_PEER_GROUP_STATE = 'enabled' # FAKE SUBCLOUD PEER GROUP DATA (SITE1) FAKE_SITE1_PEER_GROUP_ID = 9 +FAKE_SITE1_PEER_GROUP_NAME = 'PeerGroup2' +FAKE_SITE1_PEER_GROUP_MAX_SUBCLOUDS_REHOMING = 20 +FAKE_SITE1_PEER_GROUP_STATE = 'enabled' # FAKE SYSTEM PEER DATA (SITE1) FAKE_SITE1_SYSTEM_PEER_ID = 10 @@ -156,7 +159,9 @@ class TestSystemPeerManager(base.DCManagerTestCase): 'endpoint': FAKE_MANAGER_ENDPOINT, 'username': FAKE_MANAGER_USERNAME, 'password': FAKE_MANAGER_PASSWORD, - 'gateway_ip': FAKE_PEER_CONTROLLER_GATEWAY_IP + 'gateway_ip': FAKE_PEER_CONTROLLER_GATEWAY_IP, + 'availability_state': + consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE } values.update(kwargs) return db_api.system_peer_create(ctxt, **values) @@ -492,8 +497,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): 'utils.get_local_system') @mock.patch('dcmanager.manager.system_peer_manager.' 'SystemPeerManager.get_peer_dc_client') - def test_update_sync_status_on_peer_site( - self, mock_client, mock_utils): + def test_update_sync_status(self, mock_client, mock_utils): mock_dc_client = mock.MagicMock() mock_dc_client().get_subcloud_peer_group = mock.MagicMock() mock_dc_client().get_system_peer = mock.MagicMock() @@ -521,7 +525,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): return_value = {'id': FAKE_SITE1_ASSOCIATION_ID} spm = system_peer_manager.SystemPeerManager(mock.MagicMock()) - spm.update_sync_status_on_peer_site( + spm.update_sync_status( self.ctx, peer, consts.ASSOCIATION_SYNC_STATUS_IN_SYNC) mock_dc_client().get_subcloud_peer_group.assert_called_once_with( @@ -539,3 +543,101 @@ class TestSystemPeerManager(base.DCManagerTestCase): self.ctx, association.id) self.assertEqual(consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, association_new.sync_status) + + @mock.patch.object(system_peer_manager.SystemPeerManager, + 'update_sync_status') + def test_update_association_sync_status(self, mock_update_sync_status): + peer = self.create_system_peer_static( + self.ctx, + peer_name='SystemPeer1') + peer_group = self.create_subcloud_peer_group_static( + self.ctx, + peer_group_name='SubcloudPeerGroup1') + self.create_peer_group_association_static( + self.ctx, + system_peer_id=peer.id, + peer_group_id=peer_group.id, + sync_status=consts.ASSOCIATION_SYNC_STATUS_IN_SYNC) + + spm = system_peer_manager.SystemPeerManager(mock.MagicMock()) + spm.update_association_sync_status( + self.ctx, peer_group.id, + consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC) + + mock_update_sync_status.assert_called_once_with( + self.ctx, mock.ANY, consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC, + mock.ANY, None, 'None', mock.ANY) + + @mock.patch.object(system_peer_manager.SystemPeerManager, + 'update_sync_status') + def test_update_association_sync_status_when_peer_site_down( + self, mock_update_sync_status): + peer = self.create_system_peer_static( + self.ctx, + peer_name='SystemPeer1') + peer_group = self.create_subcloud_peer_group_static( + self.ctx, + peer_group_name='SubcloudPeerGroup1') + self.create_peer_group_association_static( + self.ctx, + system_peer_id=peer.id, + peer_group_id=peer_group.id, + sync_status=consts.ASSOCIATION_SYNC_STATUS_UNKNOWN) + + spm = system_peer_manager.SystemPeerManager(mock.MagicMock()) + spm.update_association_sync_status( + self.ctx, peer_group.id, + consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC) + + association = db_api. \ + peer_group_association_get_by_peer_group_and_system_peer_id( + self.ctx, peer_group.id, peer.id) + self.assertEqual(consts.ASSOCIATION_SYNC_STATUS_FAILED, + association.sync_status) + self.assertEqual("Failed to update sync_status, " + "because the peer site is unreachable.", + association.sync_message) + + mock_update_sync_status.assert_not_called() + + @mock.patch.object(system_peer_manager, 'PeerSiteDriver') + @mock.patch.object(system_peer_manager, 'SysinvClient') + @mock.patch.object(system_peer_manager, 'DcmanagerClient') + def test_update_subcloud_peer_group(self, + mock_dc_client, + mock_sysinv_client, + mock_keystone_client): + mock_keystone_client().keystone_client = FakeKeystoneClient() + mock_sysinv_client.return_value = FakeSysinvClient() + mock_dc_client.return_value = FakeDcmanagerClient() + mock_dc_client().get_subcloud_peer_group = mock.MagicMock() + mock_dc_client().update_subcloud_peer_group = mock.MagicMock() + mock_dc_client().get_system_peer = mock.MagicMock() + + peer = self.create_system_peer_static( + self.ctx, + peer_name='SystemPeer1') + peer_group = self.create_subcloud_peer_group_static( + self.ctx, + peer_group_name=FAKE_SITE0_PEER_GROUP_NAME) + self.create_peer_group_association_static( + self.ctx, + system_peer_id=peer.id, + peer_group_id=peer_group.id) + + spm = system_peer_manager.SystemPeerManager(mock.MagicMock()) + spm.update_subcloud_peer_group( + self.ctx, peer_group.id, + FAKE_SITE1_PEER_GROUP_STATE, + FAKE_SITE1_PEER_GROUP_MAX_SUBCLOUDS_REHOMING, + FAKE_SITE0_PEER_GROUP_NAME, + FAKE_SITE1_PEER_GROUP_NAME) + + peer_group_kwargs = { + 'peer-group-name': FAKE_SITE1_PEER_GROUP_NAME, + 'group-state': FAKE_SITE1_PEER_GROUP_STATE, + 'max-subcloud-rehoming': + FAKE_SITE1_PEER_GROUP_MAX_SUBCLOUDS_REHOMING + } + mock_dc_client().update_subcloud_peer_group.assert_called_once_with( + FAKE_SITE0_PEER_GROUP_NAME, **peer_group_kwargs)