From e4227317606d476b1a005e8253f25a907e8486bf Mon Sep 17 00:00:00 2001 From: "Zhang Rong(Jon)" Date: Tue, 19 Dec 2023 15:53:06 +0800 Subject: [PATCH] Fix unable to determine the SPG sync state if one site is down If Site1 (the local site) is down while setting up the protection group, the subcloud peer group sync state is unable to determine. This commit will automatically create the non-primary association on Site2 (the peer site) when creating a primary association, and update the sync state to the non-primary association. Then the operator can check the sync state on Site2 if Site1 is down. Test Plan: - PASS: Create a primary association and check the non-primary association on peer site. It was created, and sync_status will follow the primary association's sync_status. - PASS: Delete the primary association and check the non-primary association on peer site. It was deleted. - PASS: If you restart the "dcmanager-manager service" in the local site while the association sync_status is in "syncing", the sync_status will transition to "failed". - PASS: Create a primary association and wait for the sync_status change to "in-sync". Delete the subcloud peer group on peer site, the deletion will fail because it is associating to the non-primary association. Closes-Bug: 2046809 Change-Id: Ia917d0dc7c65fbea1e222fb52dbec79fdbe65b65 Signed-off-by: Zhang Rong(Jon) --- api-ref/source/api-ref-dcmanager-v1.rst | 20 +- api-ref/source/parameters.yaml | 6 + .../association-get-response.json | 1 + .../association-patch-response.json | 1 + .../associations-get-response.json | 1 + .../associations-post-response.json | 1 + .../drivers/openstack/dcmanager_v1.py | 122 ++++++- distributedcloud/dccommon/exceptions.py | 8 + .../controllers/v1/peer_group_association.py | 228 +++++++----- .../api/controllers/v1/subcloud_peer_group.py | 7 +- .../dcmanager/api/controllers/v1/subclouds.py | 2 +- .../api/controllers/v1/system_peers.py | 0 distributedcloud/dcmanager/common/consts.py | 8 +- distributedcloud/dcmanager/db/api.py | 6 +- .../dcmanager/db/sqlalchemy/api.py | 2 + ...add_subcloud_peer_group_and_association.py | 1 + .../dcmanager/db/sqlalchemy/models.py | 1 + .../manager/peer_group_audit_manager.py | 9 +- .../dcmanager/manager/peer_monitor_manager.py | 1 + distributedcloud/dcmanager/manager/service.py | 5 +- .../dcmanager/manager/system_peer_manager.py | 325 ++++++++++++++---- distributedcloud/dcmanager/rpc/client.py | 4 +- .../test_peer_group_association.py | 5 +- .../unit/manager/test_system_peer_manager.py | 117 ++++++- 24 files changed, 687 insertions(+), 194 deletions(-) mode change 100755 => 100644 distributedcloud/dcmanager/api/controllers/v1/system_peers.py diff --git a/api-ref/source/api-ref-dcmanager-v1.rst b/api-ref/source/api-ref-dcmanager-v1.rst index b128c2804..85bd8cbd3 100644 --- a/api-ref/source/api-ref-dcmanager-v1.rst +++ b/api-ref/source/api-ref-dcmanager-v1.rst @@ -2333,7 +2333,7 @@ Response - peer-controller-gateway-address: peer_controller_gateway_address - administrative-state: administrative_state - heartbeat-interval: heartbeat_interval - - heartbeat-failure-threshold: heartbeat_failure_threshold + - heartbeat-failure-threshold: heartbeat_failure_threshold - heartbeat-failure-policy: heartbeat_failure_policy - heartbeat-maintenance-timeout: heartbeat_maintenance_timeout - created-at: created_at @@ -2377,7 +2377,7 @@ serviceUnavailable (503) - peer_controller_gateway_address: peer_controller_gateway_address - administrative_state: administrative_state - heartbeat_interval: heartbeat_interval - - heartbeat_failure_threshold: heartbeat_failure_threshold + - heartbeat_failure_threshold: heartbeat_failure_threshold - heartbeat_failure_policy: heartbeat_failure_policy - heartbeat_maintenance_timeout: heartbeat_maintenance_timeout @@ -2400,7 +2400,7 @@ Request Example - peer-controller-gateway-address: peer_controller_gateway_address - administrative-state: administrative_state - heartbeat-interval: heartbeat_interval - - heartbeat-failure-threshold: heartbeat_failure_threshold + - heartbeat-failure-threshold: heartbeat_failure_threshold - heartbeat-failure-policy: heartbeat_failure_policy - heartbeat-maintenance-timeout: heartbeat_maintenance_timeout - created-at: created_at @@ -2449,7 +2449,7 @@ This operation does not accept a request body. - peer-controller-gateway-address: peer_controller_gateway_address - administrative-state: administrative_state - heartbeat-interval: heartbeat_interval - - heartbeat-failure-threshold: heartbeat_failure_threshold + - heartbeat-failure-threshold: heartbeat_failure_threshold - heartbeat-failure-policy: heartbeat_failure_policy - heartbeat-maintenance-timeout: heartbeat_maintenance_timeout - created-at: created_at @@ -2561,7 +2561,7 @@ serviceUnavailable (503) - peer_controller_gateway_address: peer_controller_gateway_address - administrative_state: administrative_state - heartbeat_interval: heartbeat_interval - - heartbeat_failure_threshold: heartbeat_failure_threshold + - heartbeat_failure_threshold: heartbeat_failure_threshold - heartbeat_failure_policy: heartbeat_failure_policy - heartbeat_maintenance_timeout: heartbeat_maintenance_timeout @@ -2582,7 +2582,7 @@ Request Example - peer-controller-gateway-address: peer_controller_gateway_address - administrative-state: administrative_state - heartbeat-interval: heartbeat_interval - - heartbeat-failure-threshold: heartbeat_failure_threshold + - heartbeat-failure-threshold: heartbeat_failure_threshold - heartbeat-failure-policy: heartbeat_failure_policy - heartbeat-maintenance-timeout: heartbeat_maintenance_timeout - created-at: created_at @@ -2625,7 +2625,7 @@ Subcloud Peer Groups Subcloud Peer Groups are logical groupings managed by a central System Controller. It's a group of the current managed subclouds which are supposed to be duplicated -in a peer site as secondary subclouds +in a peer site as secondary subclouds ****************************** Lists all subcloud peer groups @@ -3040,6 +3040,7 @@ internalServerError (500), serviceUnavailable (503) - peer-group-id: association_peer_group_id - system-peer-id: system_peer_id - peer-group-priority: association_peer_group_priority + - association-type: association_type - sync-status: association_sync_status - created-at: created_at - updated-at: updated_at @@ -3089,6 +3090,7 @@ Request Example - peer-group-id: association_peer_group_id - system-peer-id: system_peer_id - peer-group-priority: association_peer_group_priority + - association-type: association_type - sync-status: association_sync_status - sync-message: association_sync_message - created-at: created_at @@ -3133,6 +3135,7 @@ This operation does not accept a request body. - peer-group-id: association_peer_group_id - system-peer-id: system_peer_id - peer-group-priority: association_peer_group_priority + - association-type: association_type - sync-status: association_sync_status - sync-message: association_sync_message - created-at: created_at @@ -3175,6 +3178,7 @@ internalServerError (500), serviceUnavailable (503) - peer-group-id: association_peer_group_id - system-peer-id: system_peer_id - peer-group-priority: association_peer_group_priority + - association-type: association_type - sync-status: association_sync_status - sync-message: association_sync_message - created-at: created_at @@ -3213,6 +3217,7 @@ serviceUnavailable (503) - associate_id: peer_group_association_uri - peer_group_priority: association_peer_group_priority + - sync-status: association_sync_status Request Example ---------------- @@ -3227,6 +3232,7 @@ Request Example - peer-group-id: association_peer_group_id - system-peer-id: system_peer_id - peer-group-priority: association_peer_group_priority + - association-type: association_type - sync-status: association_sync_status - sync-message: association_sync_message - created-at: created_at diff --git a/api-ref/source/parameters.yaml b/api-ref/source/parameters.yaml index 5e53ab231..4d9786fbb 100644 --- a/api-ref/source/parameters.yaml +++ b/api-ref/source/parameters.yaml @@ -108,6 +108,12 @@ association_sync_status: in: body required: true type: string +association_type: + description: | + The type of association. + in: body + required: true + type: string availability_status: description: | The availability status of the subcloud. diff --git a/api-ref/source/samples/peer-group-associations/association-get-response.json b/api-ref/source/samples/peer-group-associations/association-get-response.json index 3aef416e5..7f69208fd 100644 --- a/api-ref/source/samples/peer-group-associations/association-get-response.json +++ b/api-ref/source/samples/peer-group-associations/association-get-response.json @@ -2,6 +2,7 @@ "id": 9, "peer-group-id": 1, "system-peer-id": 1, + "association-type": "primary", "peer-group-priority": 1, "sync-status": "synced", "sync-message": null, diff --git a/api-ref/source/samples/peer-group-associations/association-patch-response.json b/api-ref/source/samples/peer-group-associations/association-patch-response.json index 0d0b3917d..6e3c3df28 100644 --- a/api-ref/source/samples/peer-group-associations/association-patch-response.json +++ b/api-ref/source/samples/peer-group-associations/association-patch-response.json @@ -2,6 +2,7 @@ "id": 9, "peer-group-id": 1, "system-peer-id": 1, + "association-type": "primary", "peer-group-priority": 99, "sync-status": "synced", "sync-message": null, diff --git a/api-ref/source/samples/peer-group-associations/associations-get-response.json b/api-ref/source/samples/peer-group-associations/associations-get-response.json index 05152196e..2a3e5eb3d 100644 --- a/api-ref/source/samples/peer-group-associations/associations-get-response.json +++ b/api-ref/source/samples/peer-group-associations/associations-get-response.json @@ -4,6 +4,7 @@ "id": 9, "peer-group-id": 1, "system-peer-id": 1, + "association-type": "primary", "peer-group-priority": 1, "sync-status": "synced", "created-at": "2023-08-21 09:24:07.394961", diff --git a/api-ref/source/samples/peer-group-associations/associations-post-response.json b/api-ref/source/samples/peer-group-associations/associations-post-response.json index d01bacea8..0b6e9b6f6 100644 --- a/api-ref/source/samples/peer-group-associations/associations-post-response.json +++ b/api-ref/source/samples/peer-group-associations/associations-post-response.json @@ -2,6 +2,7 @@ "id": 9, "peer-group-id": 1, "system-peer-id": 1, + "association-type": "primary", "peer-group-priority": 1, "sync-status": "syncing", "sync-message": null, diff --git a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py index 22d527d14..56bfe440a 100644 --- a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py +++ b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py @@ -33,6 +33,27 @@ class DcmanagerClient(base.DriverBase): self.token = session.get_token() self.timeout = timeout + def get_system_peer(self, system_peer_uuid): + """Get system peer.""" + if system_peer_uuid is None: + raise ValueError("system_peer_uuid is required.") + url = f"{self.endpoint}/system-peers/{system_peer_uuid}" + + headers = {"X-Auth-Token": self.token} + response = requests.get(url, headers=headers, timeout=self.timeout) + + if response.status_code == 200: + return response.json() + else: + if response.status_code == 404 and \ + 'System Peer not found' in response.text: + raise exceptions.SystemPeerNotFound( + system_peer=system_peer_uuid) + message = "Get SystemPeer: system_peer_uuid %s failed with RC: %d" \ + % (system_peer_uuid, response.status_code) + LOG.error(message) + raise Exception(message) + def get_subcloud(self, subcloud_ref, is_region_name=False): """Get subcloud.""" if subcloud_ref is None: @@ -57,7 +78,7 @@ class DcmanagerClient(base.DriverBase): def get_subcloud_list(self): """Get subcloud list.""" - url = self.endpoint + '/subclouds' + url = f"{self.endpoint}/subclouds" headers = {"X-Auth-Token": self.token} response = requests.get(url, headers=headers, timeout=self.timeout) @@ -73,7 +94,7 @@ class DcmanagerClient(base.DriverBase): def get_subcloud_group_list(self): """Get subcloud group list.""" - url = self.endpoint + '/subcloud-groups' + url = f"{self.endpoint}/subcloud-groups" headers = {"X-Auth-Token": self.token} response = requests.get(url, headers=headers, timeout=self.timeout) @@ -89,7 +110,7 @@ class DcmanagerClient(base.DriverBase): def get_subcloud_peer_group_list(self): """Get subcloud peer group list.""" - url = self.endpoint + '/subcloud-peer-groups' + url = f"{self.endpoint}/subcloud-peer-groups" headers = {"X-Auth-Token": self.token} response = requests.get(url, headers=headers, timeout=self.timeout) @@ -147,9 +168,35 @@ class DcmanagerClient(base.DriverBase): LOG.error(message) raise Exception(message) + def get_peer_group_association_with_peer_id_and_pg_id(self, peer_id, + pg_id): + """Get peer group association with peer id and PG id.""" + for association in self.get_peer_group_association_list(): + if peer_id == association.get('system-peer-id') and \ + pg_id == association.get('peer-group-id'): + return association + raise exceptions.PeerGroupAssociationNotFound( + association_id=None) + + def get_peer_group_association_list(self): + """Get peer group association list.""" + url = f"{self.endpoint}/peer-group-associations" + + headers = {"X-Auth-Token": self.token} + response = requests.get(url, headers=headers, timeout=self.timeout) + + if response.status_code == 200: + data = response.json() + return data.get('peer_group_associations', []) + else: + message = "Get Peer Group Association list failed with RC: %d" % \ + response.status_code + LOG.error(message) + raise Exception(message) + def add_subcloud_peer_group(self, **kwargs): """Add a subcloud peer group.""" - url = self.endpoint + '/subcloud-peer-groups' + url = f"{self.endpoint}/subcloud-peer-groups" headers = {"X-Auth-Token": self.token, "Content-Type": "application/json"} @@ -166,7 +213,7 @@ class DcmanagerClient(base.DriverBase): def add_subcloud_with_secondary_status(self, files, data): """Add a subcloud with secondary status.""" - url = self.endpoint + '/subclouds' + url = f"{self.endpoint}/subclouds" # If not explicitly specified, set 'secondary' to true by default. # This action adds a secondary subcloud with rehoming data in the @@ -197,6 +244,49 @@ class DcmanagerClient(base.DriverBase): LOG.error(message) raise Exception(message) + def add_peer_group_association(self, **kwargs): + """Add a peer group association.""" + url = f"{self.endpoint}/peer-group-associations" + + headers = {"X-Auth-Token": self.token, + "Content-Type": "application/json"} + response = requests.post(url, json=kwargs, headers=headers, + timeout=self.timeout) + + if response.status_code == 200: + return response.json() + else: + message = "Add Peer Group Association: %s, failed with RC: %d" % \ + (kwargs, response.status_code) + LOG.error(message) + raise Exception(message) + + def update_peer_group_association_sync_status(self, association_id, + sync_status): + """Update the peer group association sync_status.""" + if association_id is None: + raise ValueError("association_id is required.") + url = f"{self.endpoint}/peer-group-associations/{association_id}" + update_kwargs = {"sync_status": sync_status} + + headers = {"X-Auth-Token": self.token, + "Content-Type": "application/json"} + response = requests.patch(url, json=update_kwargs, headers=headers, + timeout=self.timeout) + + if response.status_code == 200: + return response.json() + else: + if response.status_code == 404 and \ + 'Peer Group Association not found' in response.text: + raise exceptions.PeerGroupAssociationNotFound( + association_id=association_id) + message = "Update Peer Group Association: association_id %s, " \ + "sync_status %s, failed with RC: %d" % ( + association_id, sync_status, response.status_code) + LOG.error(message) + raise Exception(message) + def update_subcloud_peer_group(self, peer_group_ref, **kwargs): """Update the subcloud peer group.""" if peer_group_ref is None: @@ -280,6 +370,28 @@ class DcmanagerClient(base.DriverBase): LOG.error(message) raise Exception(message) + def delete_peer_group_association(self, association_id): + """Delete the peer group association.""" + if association_id is None: + raise ValueError("association_id is required.") + url = f"{self.endpoint}/peer-group-associations/{association_id}" + + headers = {"X-Auth-Token": self.token} + response = requests.delete(url, headers=headers, + timeout=self.timeout) + + if response.status_code == 200: + return response.json() + else: + if response.status_code == 404 and \ + 'Peer Group Association not found' in response.text: + raise exceptions.PeerGroupAssociationNotFound( + association_id=association_id) + message = "Delete Peer Group Association: association_id %s " \ + "failed with RC: %d" % (association_id, response.status_code) + LOG.error(message) + raise Exception(message) + def delete_subcloud_peer_group(self, peer_group_ref): """Delete the subcloud peer group.""" if peer_group_ref is None: diff --git a/distributedcloud/dccommon/exceptions.py b/distributedcloud/dccommon/exceptions.py index 40863f616..da174d4bc 100644 --- a/distributedcloud/dccommon/exceptions.py +++ b/distributedcloud/dccommon/exceptions.py @@ -125,6 +125,10 @@ class ApiException(DCCommonException): message = _("%(endpoint)s failed with status code: %(rc)d") +class SystemPeerNotFound(NotFound): + message = _("System Peer %(system_peer)s not found") + + class SubcloudNotFound(NotFound): message = _("Subcloud %(subcloud_ref)s not found") @@ -133,6 +137,10 @@ class SubcloudPeerGroupNotFound(NotFound): message = _("Subcloud Peer Group %(peer_group_ref)s not found") +class PeerGroupAssociationNotFound(NotFound): + message = _("Peer Group Association %(association_id)s not found") + + class SubcloudPeerGroupDeleteFailedAssociated(DCCommonException): message = _("Subcloud Peer Group %(peer_group_ref)s delete failed " "cause it is associated with a system peer.") diff --git a/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py b/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py index 2f6da213b..5c7143085 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py +++ b/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py @@ -31,8 +31,14 @@ from dcmanager.rpc import client as rpc_client CONF = cfg.CONF LOG = logging.getLogger(__name__) +PEER_GROUP_PRIMARY_PRIORITY = 0 MIN_PEER_GROUP_ASSOCIATION_PRIORITY = 1 MAX_PEER_GROUP_ASSOCIATION_PRIORITY = 65536 +ASSOCIATION_SYNC_STATUS_LIST = \ + [consts.ASSOCIATION_SYNC_STATUS_SYNCING, + consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, + consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC, + consts.ASSOCIATION_SYNC_STATUS_FAILED] class PeerGroupAssociationsController(restcomm.GenericPathController): @@ -150,6 +156,12 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): return False return True + def _validate_sync_status(self, sync_status): + if sync_status not in ASSOCIATION_SYNC_STATUS_LIST: + LOG.debug("Invalid sync_status: %s" % sync_status) + return False + return True + @index.when(method='POST', template='json') def post(self): """Create a new peer group association.""" @@ -174,20 +186,22 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): peer_group_priority = payload.get('peer_group_priority') peer_group = db_api.subcloud_peer_group_get(context, peer_group_id) - if (peer_group.group_priority == 0 and not peer_group_priority) or \ - (peer_group.group_priority > 0 and peer_group_priority): - pecan.abort(httpclient.BAD_REQUEST, - _('Peer Group Association create with peer_group_' - 'priority is required when the subcloud peer group ' - 'priority is 0, and it is not allowed when the ' - 'subcloud peer group priority is greater than 0.')) - - if peer_group_priority and not self._validate_peer_group_priority( - peer_group_priority): + if peer_group_priority is not None and not \ + self._validate_peer_group_priority(peer_group_priority): pecan.abort(httpclient.BAD_REQUEST, _('Invalid peer_group_priority')) - sync_enabled = peer_group.group_priority == 0 + if (peer_group.group_priority == PEER_GROUP_PRIMARY_PRIORITY and + peer_group_priority is None) or ( + peer_group.group_priority > PEER_GROUP_PRIMARY_PRIORITY and + peer_group_priority is not None): + pecan.abort(httpclient.BAD_REQUEST, + _('Peer Group Association create is not allowed when ' + 'the subcloud peer group priority is greater than 0 ' + 'and it is required when the subcloud peer group ' + 'priority is 0.')) + + is_primary = peer_group.group_priority == PEER_GROUP_PRIMARY_PRIORITY # only one combination of peer_group_id + system_peer_id can exists association = None @@ -198,9 +212,8 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): peer_group_id, system_peer_id) except exception.PeerGroupAssociationCombinationNotFound: - LOG.warning("Peer Group Association Combination Not Found, " - "peer_group_id: %s, system_peer_id: %s" - % (peer_group_id, system_peer_id)) + # This is a normal scenario, no need to log or raise an error + pass except Exception as e: LOG.warning("Peer Group Association get failed: %s;" "peer_group_id: %s, system_peer_id: %s" @@ -209,22 +222,22 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): _('peer_group_association_get_by_peer_group_and_' 'system_peer_id failed: %s' % e)) if association: - LOG.info("Failed to create Peer group association, association \ - with peer_group_id:[%s],system_peer_id:[%s] \ - already exists" % (peer_group_id, system_peer_id)) + LOG.warning("Failed to create Peer group association, association " + "with peer_group_id:[%s],system_peer_id:[%s] " + "already exists" % (peer_group_id, system_peer_id)) pecan.abort(httpclient.BAD_REQUEST, _('A Peer group association with same peer_group_id, ' 'system_peer_id already exists')) # Create the peer group association try: - sync_status = consts.ASSOCIATION_SYNC_STATUS_SYNCING if \ - sync_enabled else consts.ASSOCIATION_SYNC_STATUS_DISABLED + association_type = consts.ASSOCIATION_TYPE_PRIMARY if is_primary \ + else consts.ASSOCIATION_TYPE_NON_PRIMARY association = db_api.peer_group_association_create( context, peer_group_id, system_peer_id, peer_group_priority, - sync_status) + association_type, consts.ASSOCIATION_SYNC_STATUS_SYNCING) - if sync_enabled: + if is_primary: # Sync the subcloud peer group to peer site self.rpc_client.sync_subcloud_peer_group(context, association.id) else: @@ -237,6 +250,103 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): pecan.abort(httpclient.INTERNAL_SERVER_ERROR, _('Unable to create peer group association')) + def _sync_association(self, context, association, is_non_primary): + if is_non_primary: + self.rpc_client.peer_monitor_notify(context) + pecan.abort(httpclient.BAD_REQUEST, + _('Peer Group Association sync is not allowed ' + 'when the association type is non-primary. But the ' + 'peer monitor notify was triggered.')) + else: + peer_group = db_api.subcloud_peer_group_get( + context, association.peer_group_id) + if not self._validate_peer_group_leader_id(peer_group. + system_leader_id): + pecan.abort(httpclient.BAD_REQUEST, + _('Peer Group Association sync is not allowed when ' + 'the subcloud peer group system_leader_id is not ' + 'the current system controller UUID.')) + try: + # Sync the subcloud peer group to peer site + self.rpc_client.sync_subcloud_peer_group(context, + association.id) + association = db_api.peer_group_association_update( + context, id=association.id, + sync_status=consts.ASSOCIATION_SYNC_STATUS_SYNCING, + sync_message='None') + return db_api.peer_group_association_db_model_to_dict( + association) + except RemoteError as e: + pecan.abort(httpclient.UNPROCESSABLE_ENTITY, e.value) + except Exception as e: + # additional exceptions. + LOG.exception(e) + pecan.abort(httpclient.INTERNAL_SERVER_ERROR, + _('Unable to sync peer group association')) + + def _update_association(self, context, association, is_non_primary): + payload = self._get_payload(request) + if not payload: + pecan.abort(httpclient.BAD_REQUEST, _('Body required')) + + peer_group_priority = payload.get('peer_group_priority') + sync_status = payload.get('sync_status') + # Check value is not None or empty before calling validate + if not (peer_group_priority is not None or sync_status): + pecan.abort(httpclient.BAD_REQUEST, _('nothing to update')) + elif peer_group_priority is not None and sync_status: + pecan.abort(httpclient.BAD_REQUEST, + _('peer_group_priority and sync_status cannot be ' + 'updated at the same time.')) + if peer_group_priority is not None: + if not self._validate_peer_group_priority(peer_group_priority): + pecan.abort(httpclient.BAD_REQUEST, + _('Invalid peer_group_priority')) + if is_non_primary: + self.rpc_client.peer_monitor_notify(context) + pecan.abort(httpclient.BAD_REQUEST, + _('Peer Group Association peer_group_priority is ' + 'not allowed to update when the association type ' + 'is non-primary.')) + else: + db_api.peer_group_association_update( + context, id=association.id, + peer_group_priority=peer_group_priority) + if sync_status: + if not self._validate_sync_status(sync_status): + pecan.abort(httpclient.BAD_REQUEST, + _('Invalid sync_status')) + + if not is_non_primary: + self.rpc_client.peer_monitor_notify(context) + pecan.abort(httpclient.BAD_REQUEST, + _('Peer Group Association sync_status is not ' + 'allowed to update when the association type is ' + 'primary.')) + else: + sync_message = 'Primary association sync to current site ' + \ + 'failed.' if sync_status == \ + consts.ASSOCIATION_SYNC_STATUS_FAILED else 'None' + association = db_api.peer_group_association_update( + context, id=association.id, sync_status=sync_status, + sync_message=sync_message) + self.rpc_client.peer_monitor_notify(context) + return db_api.peer_group_association_db_model_to_dict( + association) + + try: + # Ask dcmanager-manager to update the subcloud peer group priority + # to peer site. It will do the real work... + return self.rpc_client.update_subcloud_peer_group(context, + association.id) + except RemoteError as e: + pecan.abort(httpclient.UNPROCESSABLE_ENTITY, e.value) + except Exception as e: + # additional exceptions. + LOG.exception(e) + pecan.abort(httpclient.INTERNAL_SERVER_ERROR, + _('Unable to update peer group association')) + @index.when(method='PATCH', template='json') def patch(self, association_id, sync=False): """Update a peer group association. @@ -262,64 +372,13 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): pecan.abort(httpclient.NOT_FOUND, _('Peer Group Association not found')) - sync_disabled = association.sync_status == consts.\ - ASSOCIATION_SYNC_STATUS_DISABLED - if sync_disabled: - pecan.abort(httpclient.BAD_REQUEST, - _('Peer Group Association sync or update is not allowed' - ' when the sync_status is disabled.')) + is_non_primary = association.association_type == consts.\ + ASSOCIATION_TYPE_NON_PRIMARY if sync: - peer_group = db_api.subcloud_peer_group_get( - context, association.peer_group_id) - if not self._validate_peer_group_leader_id(peer_group. - system_leader_id): - pecan.abort(httpclient.BAD_REQUEST, - _('Peer Group Association sync is not allowed when ' - 'the subcloud peer group system_leader_id is not ' - 'the current system controller UUID.')) - try: - # Sync the subcloud peer group to peer site - self.rpc_client.sync_subcloud_peer_group(context, - association.id) - association = db_api.peer_group_association_update( - context, id=association_id, - sync_status=consts.ASSOCIATION_SYNC_STATUS_SYNCING, - sync_message='None') - return db_api.peer_group_association_db_model_to_dict( - association) - except RemoteError as e: - pecan.abort(httpclient.UNPROCESSABLE_ENTITY, e.value) - except Exception as e: - # additional exceptions. - LOG.exception(e) - pecan.abort(httpclient.INTERNAL_SERVER_ERROR, - _('Unable to sync peer group association')) - - payload = self._get_payload(request) - if not payload: - pecan.abort(httpclient.BAD_REQUEST, _('Body required')) - - peer_group_priority = payload.get('peer_group_priority') - # Check value is not None or empty before calling validate - if not peer_group_priority: - pecan.abort(httpclient.BAD_REQUEST, _('nothing to update')) - if not self._validate_peer_group_priority(peer_group_priority): - pecan.abort(httpclient.BAD_REQUEST, - _('Invalid peer_group_priority')) - - try: - # Ask dcmanager-manager to update the subcloud peer group priority - # to peer site. It will do the real work... - return self.rpc_client.update_subcloud_peer_group( - context, association.id, peer_group_priority) - except RemoteError as e: - pecan.abort(httpclient.UNPROCESSABLE_ENTITY, e.value) - except Exception as e: - # additional exceptions. - LOG.exception(e) - pecan.abort(httpclient.INTERNAL_SERVER_ERROR, - _('Unable to update peer group association')) + return self._sync_association(context, association, is_non_primary) + else: + return self._update_association(context, association, is_non_primary) @index.when(method='delete', template='json') def delete(self, association_id): @@ -342,17 +401,18 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): try: association = db_api.peer_group_association_get(context, association_id) - sync_disabled = association.sync_status == consts.\ - ASSOCIATION_SYNC_STATUS_DISABLED - if sync_disabled: + is_non_primary = association.association_type == consts.\ + ASSOCIATION_TYPE_NON_PRIMARY + if is_non_primary: result = db_api.peer_group_association_destroy(context, association_id) self.rpc_client.peer_monitor_notify(context) return result - # Ask system-peer-manager to delete the association. - # It will do all the real work... - return self.rpc_client.delete_peer_group_association( - context, association_id) + else: + # Ask system-peer-manager to delete the association. + # It will do all the real work... + return self.rpc_client.delete_peer_group_association( + context, association_id) except exception.PeerGroupAssociationNotFound: pecan.abort(httpclient.NOT_FOUND, _('Peer Group Association not found')) diff --git a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py index ad7769466..2116da781 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py @@ -539,10 +539,11 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): LOG.info("Subcloud Peer Group [%s] not found" % group_ref) pecan.abort(httpclient.NOT_FOUND, _('Subcloud Peer Group not found')) - LOG.info("Handling delete subcloud peer group request for: %s" % group) + LOG.info("Handling delete subcloud peer group request for: %s" % + group) # A peer group cannot be deleted if it is used by any associations - association = db_api.peer_group_association_get_by_peer_group_id(context, - group.id) + association = db_api.peer_group_association_get_by_peer_group_id( + context, group.id) if len(association) > 0: pecan.abort(httpclient.BAD_REQUEST, _("Cannot delete a peer group " diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py index b0b413450..6ada99d73 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py @@ -556,7 +556,7 @@ class SubcloudsController(object): # Ask dcmanager-manager to add the subcloud. # It will do all the real work... # If the subcloud is secondary, it will be synchronous operation. - # A normal subcloud add will be a synchronous operation. + # A normal subcloud add will be asynchronous operation. if 'secondary' in payload: self.dcmanager_rpc_client.add_secondary_subcloud( context, subcloud.id, payload) diff --git a/distributedcloud/dcmanager/api/controllers/v1/system_peers.py b/distributedcloud/dcmanager/api/controllers/v1/system_peers.py old mode 100755 new mode 100644 diff --git a/distributedcloud/dcmanager/common/consts.py b/distributedcloud/dcmanager/common/consts.py index 9dcabd06d..72f47ce68 100644 --- a/distributedcloud/dcmanager/common/consts.py +++ b/distributedcloud/dcmanager/common/consts.py @@ -466,11 +466,15 @@ PEER_GROUP_MIGRATING = 'migrating' PEER_GROUP_MIGRATION_COMPLETE = 'complete' PEER_GROUP_MIGRATION_NONE = 'none' +# Peer group association type +ASSOCIATION_TYPE_PRIMARY = 'primary' +ASSOCIATION_TYPE_NON_PRIMARY = 'non-primary' + # Peer group association sync status ASSOCIATION_SYNC_STATUS_SYNCING = 'syncing' -ASSOCIATION_SYNC_STATUS_SYNCED = 'synced' +ASSOCIATION_SYNC_STATUS_IN_SYNC = 'in-sync' ASSOCIATION_SYNC_STATUS_FAILED = 'failed' -ASSOCIATION_SYNC_STATUS_DISABLED = 'disabled' +ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC = 'out-of-sync' # Peer monitor heartbeat policy HEARTBEAT_FAILURE_POLICY_ALARM = 'alarm' diff --git a/distributedcloud/dcmanager/db/api.py b/distributedcloud/dcmanager/db/api.py index 457550786..4d651b81d 100644 --- a/distributedcloud/dcmanager/db/api.py +++ b/distributedcloud/dcmanager/db/api.py @@ -555,6 +555,7 @@ def peer_group_association_db_model_to_dict(peer_group_association): "peer-group-id": peer_group_association.peer_group_id, "system-peer-id": peer_group_association.system_peer_id, "peer-group-priority": peer_group_association.peer_group_priority, + "association-type": peer_group_association.association_type, "sync-status": peer_group_association.sync_status, "sync-message": peer_group_association.sync_message, "created-at": peer_group_association.created_at, @@ -563,13 +564,14 @@ def peer_group_association_db_model_to_dict(peer_group_association): def peer_group_association_create(context, peer_group_id, system_peer_id, - peer_group_priority, sync_status=None, - sync_message=None): + peer_group_priority, association_type=None, + sync_status=None, sync_message=None): """Create a peer_group_association.""" return IMPL.peer_group_association_create(context, peer_group_id, system_peer_id, peer_group_priority, + association_type, sync_status, sync_message) diff --git a/distributedcloud/dcmanager/db/sqlalchemy/api.py b/distributedcloud/dcmanager/db/sqlalchemy/api.py index 69d6b8004..f98676985 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/api.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/api.py @@ -1241,6 +1241,7 @@ def peer_group_association_create(context, peer_group_id, system_peer_id, peer_group_priority, + association_type, sync_status, sync_message): with write_session() as session: @@ -1248,6 +1249,7 @@ def peer_group_association_create(context, peer_group_association_ref.peer_group_id = peer_group_id peer_group_association_ref.system_peer_id = system_peer_id peer_group_association_ref.peer_group_priority = peer_group_priority + peer_group_association_ref.association_type = association_type peer_group_association_ref.sync_status = sync_status peer_group_association_ref.sync_message = sync_message session.add(peer_group_association_ref) diff --git a/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py b/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py index 535afffa8..9d794e969 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py @@ -87,6 +87,7 @@ def upgrade(migrate_engine): sqlalchemy.ForeignKey('system_peer.id', ondelete='CASCADE')), sqlalchemy.Column('peer_group_priority', sqlalchemy.Integer), + sqlalchemy.Column('association_type', sqlalchemy.String(255)), sqlalchemy.Column('sync_status', sqlalchemy.String(255)), sqlalchemy.Column('sync_message', sqlalchemy.Text), sqlalchemy.Column('reserved_1', sqlalchemy.Text), diff --git a/distributedcloud/dcmanager/db/sqlalchemy/models.py b/distributedcloud/dcmanager/db/sqlalchemy/models.py index 3ab05d375..e637b31ed 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/models.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/models.py @@ -155,6 +155,7 @@ class PeerGroupAssociation(BASE, DCManagerBase): id = Column(Integer, primary_key=True, autoincrement=True, nullable=False) peer_group_id = Column(Integer) system_peer_id = Column(Integer) + association_type = Column(String(255)) peer_group_priority = Column(Integer) sync_status = Column(String(255)) sync_message = Column(Text()) diff --git a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py index 28377a5bc..f370a5270 100644 --- a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py +++ b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py @@ -156,11 +156,12 @@ class PeerGroupAuditManager(manager.Manager): raise e self.require_audit_flag = False - # if remote subcloud peer group's migration_status is 'complete', - # Get remote subclouds, for 'managed+online' subclouds, - # set 'unmanaged+secondary' to local on same subclouds + # if remote subcloud peer group's migration_status is 'complete' + # or self.require_audit_flag is True, get remote subclouds. + # For 'managed+online' subclouds, set 'unmanaged+secondary' to + # local on same subclouds elif remote_peer_group.get("migration_status") == \ - consts.PEER_GROUP_MIGRATION_COMPLETE: + consts.PEER_GROUP_MIGRATION_COMPLETE or self.require_audit_flag: remote_subclouds = \ self._get_subclouds_by_peer_group_from_system_peer( system_peer, diff --git a/distributedcloud/dcmanager/manager/peer_monitor_manager.py b/distributedcloud/dcmanager/manager/peer_monitor_manager.py index 44d4365d2..0f836280c 100644 --- a/distributedcloud/dcmanager/manager/peer_monitor_manager.py +++ b/distributedcloud/dcmanager/manager/peer_monitor_manager.py @@ -216,6 +216,7 @@ class PeerMonitor(object): pgam.PeerGroupAuditManager(self.subcloud_manager, peer_group_id) self.peer_group_id_set = peer_group_id_set + self._set_require_audit_flag_to_associated_peer_groups() def start(self): if self.thread is not None: diff --git a/distributedcloud/dcmanager/manager/service.py b/distributedcloud/dcmanager/manager/service.py index 4afa2b7be..ae58c19b5 100644 --- a/distributedcloud/dcmanager/manager/service.py +++ b/distributedcloud/dcmanager/manager/service.py @@ -115,6 +115,7 @@ class DCManagerService(service.Service): os.makedirs(dccommon_consts.ANSIBLE_OVERRIDES_PATH, 0o600, exist_ok=True) self.subcloud_manager.handle_subcloud_operations_in_progress() + self.system_peer_manager.handle_association_operations_in_progress() # Send notify to peer monitor. self.peer_monitor_manager.peer_monitor_notify(self.context) @@ -332,11 +333,11 @@ class DCManagerService(service.Service): @request_context def sync_subcloud_peer_group(self, context, association_id, - sync_subclouds=True, priority=None): + sync_subclouds=True): LOG.info("Handling sync_subcloud_peer_group request for: %s", association_id) return self.system_peer_manager.sync_subcloud_peer_group( - context, association_id, sync_subclouds, priority) + context, association_id, sync_subclouds) @request_context def delete_peer_group_association(self, context, association_id): diff --git a/distributedcloud/dcmanager/manager/system_peer_manager.py b/distributedcloud/dcmanager/manager/system_peer_manager.py index f3e7988e5..ce3afdd13 100644 --- a/distributedcloud/dcmanager/manager/system_peer_manager.py +++ b/distributedcloud/dcmanager/manager/system_peer_manager.py @@ -19,9 +19,11 @@ from dccommon.drivers.openstack.peer_site import PeerSiteDriver from dccommon.drivers.openstack.sysinv_v1 import SysinvClient from dccommon import exceptions as dccommon_exceptions from dcmanager.common import consts +from dcmanager.common import context as dcmanager_context from dcmanager.common import exceptions from dcmanager.common.i18n import _ from dcmanager.common import manager +from dcmanager.common import utils from dcmanager.db import api as db_api LOG = logging.getLogger(__name__) @@ -32,12 +34,17 @@ MAX_PARALLEL_SUBCLOUD_DELETE = 10 VERIFY_SUBCLOUD_SYNC_VALID = 'valid' VERIFY_SUBCLOUD_SYNC_IGNORE = 'ignore' +TRANSITORY_STATES = { + consts.ASSOCIATION_SYNC_STATUS_SYNCING: consts.ASSOCIATION_SYNC_STATUS_FAILED +} + class SystemPeerManager(manager.Manager): """Manages tasks related to system peers.""" def __init__(self, peer_monitor_manager, *args, **kwargs): LOG.debug(_('SystemPeerManager initialization...')) + self.context = dcmanager_context.get_admin_context() self.peer_monitor_manager = peer_monitor_manager super(SystemPeerManager, self).__init__( service_name="system_peer_manager", *args, **kwargs) @@ -192,7 +199,7 @@ class SystemPeerManager(manager.Manager): files, data) LOG.info(f"Updated Subcloud {dc_peer_subcloud.get('name')} " "(region_name: " - f"{dc_peer_subcloud.get('region_name')}) on peer " + f"{dc_peer_subcloud.get('region-name')}) on peer " "site.") else: # Create subcloud on peer site if not exist @@ -200,15 +207,18 @@ class SystemPeerManager(manager.Manager): add_subcloud_with_secondary_status(files, data) LOG.info(f"Created Subcloud {dc_peer_subcloud.get('name')} " "(region_name: " - f"{dc_peer_subcloud.get('region_name')}) on peer " + f"{dc_peer_subcloud.get('region-name')}) on peer " "site.") LOG.debug(f"Updating subcloud {subcloud_name} (region_name: " f"{region_name}) with subcloud peer group id " f"{dc_peer_pg_id} on peer site.") - # Update subcloud associated peer group on peer site + # Update subcloud associated peer group on peer site. + # The peer_group update will check the header and should + # use the region_name as subcloud_ref. peer_subcloud = dc_client.update_subcloud( - subcloud_name, files=None, data={"peer_group": - str(dc_peer_pg_id)}) + dc_peer_subcloud.get('region-name'), files=None, + data={"peer_group": str(dc_peer_pg_id)}, + is_region_name=True) # Need to check the subcloud only in secondary, otherwise it # should be recorded as a failure. @@ -382,14 +392,73 @@ class SystemPeerManager(manager.Manager): return error_msg + def _update_sync_status(self, context, association_id, sync_status, + sync_message, dc_peer_association_id=None, + dc_client=None, **kwargs): + """Update sync status of association.""" + if dc_peer_association_id is not None: + if dc_client is None: + association = db_api.peer_group_association_get(context, + association_id) + peer = db_api.system_peer_get(context, + association.system_peer_id) + dc_client = self.get_peer_dc_client(peer) + dc_client.update_peer_group_association_sync_status( + dc_peer_association_id, sync_status) + LOG.info(f"Updated non-primary Peer Group Association " + f"{dc_peer_association_id} sync_status to {sync_status}.") + return db_api.peer_group_association_update( + context, association_id, sync_status=sync_status, + sync_message=sync_message, **kwargs) + + def _update_sync_status_to_failed(self, context, association_id, + failed_message, + dc_peer_association_id=None): + """Update sync status to failed.""" + return self._update_sync_status(context, association_id, + consts.ASSOCIATION_SYNC_STATUS_FAILED, + failed_message, + dc_peer_association_id) + + def _get_non_primary_association(self, dc_client, dc_peer_system_peer_id, + dc_peer_pg_id): + """Get non-primary Association from peer site.""" + try: + return dc_client.get_peer_group_association_with_peer_id_and_pg_id( + dc_peer_system_peer_id, dc_peer_pg_id) + except dccommon_exceptions.PeerGroupAssociationNotFound: + LOG.error(f"Peer Group association does not exist on peer site." + f"Peer Group ID: {dc_peer_pg_id}, Peer System Peer ID: " + f"{dc_peer_system_peer_id}") + return None + + def _get_peer_site_pg_by_name(self, dc_client, peer_group_name): + """Get remote Peer Group from peer site by name.""" + try: + return dc_client.get_subcloud_peer_group(peer_group_name) + except dccommon_exceptions.SubcloudPeerGroupNotFound: + LOG.error(f"Peer Group {peer_group_name} does not exist on peer " + f"site.") + return None + + def _get_peer_site_system_peer(self, dc_client, peer_uuid=None): + """Get System Peer from peer site.""" + try: + peer_uuid = peer_uuid if peer_uuid is not None else \ + utils.get_local_system().uuid + return dc_client.get_system_peer(peer_uuid) + except dccommon_exceptions.SystemPeerNotFound: + LOG.error(f"Peer Site System Peer {peer_uuid} does not exist.") + return None + def sync_subcloud_peer_group(self, context, association_id, - sync_subclouds=True, priority=None): + sync_subclouds=True): """Sync subcloud peer group to peer site. This function synchronizes subcloud peer groups from current site to peer site, supporting two scenarios: - 1. When creating an association between the system peer and a subcloud + 1. When creating the association between the system peer and a subcloud peer group. This function creates the subcloud peer group on the peer site and synchronizes the subclouds to it. @@ -411,13 +480,12 @@ class SystemPeerManager(manager.Manager): dc_local_pg = db_api.subcloud_peer_group_get(context, association.peer_group_id) peer_group_name = dc_local_pg.peer_group_name + dc_peer_association_id = None try: - sysinv_client = self.get_peer_sysinv_client(peer) - # Check if the system_uuid of the peer site matches with the # peer_uuid - system = sysinv_client.get_system() + system = self.get_peer_sysinv_client(peer).get_system() if system.uuid != peer.peer_uuid: LOG.error(f"Peer site system uuid {system.uuid} does not match " f"with the peer_uuid {peer.peer_uuid}") @@ -426,6 +494,65 @@ class SystemPeerManager(manager.Manager): dc_client = self.get_peer_dc_client(peer) + # Get current site system information + local_system_uuid = utils.get_local_system().uuid + + # Get peer site system peer + dc_peer_system_peer = self._get_peer_site_system_peer( + dc_client, local_system_uuid) + + if dc_peer_system_peer is None: + failed_message = f"System Peer {local_system_uuid} does not" + \ + " exist on peer site." + return db_api.peer_group_association_db_model_to_dict( + self._update_sync_status_to_failed(context, association_id, + failed_message)) + dc_peer_system_peer_id = dc_peer_system_peer.get('id') + + # Get peer site peer group, create if not exist + dc_peer_pg = self._get_peer_site_pg_by_name(dc_client, + peer_group_name) + if dc_peer_pg is None: + peer_group_kwargs = { + 'group-priority': association.peer_group_priority, + 'group-state': dc_local_pg.group_state, + 'system-leader-id': dc_local_pg.system_leader_id, + 'system-leader-name': dc_local_pg.system_leader_name, + 'max-subcloud-rehoming': dc_local_pg.max_subcloud_rehoming + } + peer_group_kwargs['peer-group-name'] = peer_group_name + dc_peer_pg = dc_client.add_subcloud_peer_group( + **peer_group_kwargs) + LOG.info(f"Created Subcloud Peer Group {peer_group_name} on " + f"peer site. ID is {dc_peer_pg.get('id')}.") + dc_peer_pg_id = dc_peer_pg.get('id') + dc_peer_pg_priority = dc_peer_pg.get('group_priority') + # Check if the peer group priority is 0, if so, raise exception + if dc_peer_pg_priority == 0: + LOG.error(f"Skip update. Peer Site {peer_group_name} " + f"has priority 0.") + raise exceptions.SubcloudPeerGroupHasWrongPriority( + priority=dc_peer_pg_priority) + + # Get peer site non-primary association, create if not exist + dc_peer_association = self._get_non_primary_association( + dc_client, dc_peer_system_peer_id, dc_peer_pg_id) + if dc_peer_association is None: + non_primary_association_kwargs = { + 'peer_group_id': dc_peer_pg_id, + 'system_peer_id': dc_peer_system_peer_id + } + dc_peer_association = dc_client.add_peer_group_association( + **non_primary_association_kwargs) + LOG.info(f"Created \"non-primary\" Peer Group Association " + f"{dc_peer_association.get('id')} on peer site.") + dc_peer_association_id = dc_peer_association.get("id") + + # Update peer group association sync status to syncing + dc_client.update_peer_group_association_sync_status( + dc_peer_association_id, consts.ASSOCIATION_SYNC_STATUS_SYNCING) + + # Update peer group on peer site peer_group_kwargs = { 'group-priority': association.peer_group_priority, 'group-state': dc_local_pg.group_state, @@ -433,34 +560,16 @@ class SystemPeerManager(manager.Manager): 'system-leader-name': dc_local_pg.system_leader_name, 'max-subcloud-rehoming': dc_local_pg.max_subcloud_rehoming } - if priority: - peer_group_kwargs['group-priority'] = priority - try: - dc_peer_pg = dc_client.get_subcloud_peer_group( - peer_group_name) - dc_peer_pg_id = dc_peer_pg.get('id') - dc_peer_pg_priority = dc_peer_pg.get('group_priority') - if dc_peer_pg_priority == 0: - LOG.error(f"Skip update. Peer Site {peer_group_name} " - f"has priority 0.") - raise exceptions.SubcloudPeerGroupHasWrongPriority( - priority=dc_peer_pg_priority) - - dc_peer_pg = dc_client.update_subcloud_peer_group( - peer_group_name, **peer_group_kwargs) - LOG.info(f"Updated Subcloud Peer Group {peer_group_name} on " - f"peer site, ID is {dc_peer_pg.get('id')}.") - except dccommon_exceptions.SubcloudPeerGroupNotFound: - peer_group_kwargs['peer-group-name'] = peer_group_name - dc_peer_pg = dc_client.add_subcloud_peer_group( - **peer_group_kwargs) - dc_peer_pg_id = dc_peer_pg.get('id') - LOG.info(f"Created Subcloud Peer Group {peer_group_name} on " - f"peer site, ID is {dc_peer_pg_id}.") + dc_peer_pg = dc_client.update_subcloud_peer_group( + peer_group_name, **peer_group_kwargs) + LOG.info(f"Updated Subcloud Peer Group {peer_group_name} on " + f"peer site, ID is {dc_peer_pg.get('id')}.") association_update = { - 'sync_status': consts.ASSOCIATION_SYNC_STATUS_SYNCED, - 'sync_message': None + 'sync_status': consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, + 'sync_message': 'None', + 'dc_peer_association_id': dc_peer_association_id, + 'dc_client': dc_client } if sync_subclouds: error_msg = self._sync_subclouds(context, peer, dc_local_pg.id, @@ -469,9 +578,7 @@ class SystemPeerManager(manager.Manager): association_update['sync_status'] = \ consts.ASSOCIATION_SYNC_STATUS_FAILED association_update['sync_message'] = json.dumps(error_msg) - if priority: - association_update['peer_group_priority'] = priority - association = db_api.peer_group_association_update( + association = self._update_sync_status( context, association_id, **association_update) self.peer_monitor_manager.peer_monitor_notify(context) @@ -481,12 +588,17 @@ class SystemPeerManager(manager.Manager): except Exception as exception: LOG.exception(f"Failed to sync peer group {peer_group_name} to " f"peer site {peer.peer_name}") - db_api.peer_group_association_update( - context, association_id, - sync_status=consts.ASSOCIATION_SYNC_STATUS_FAILED, - sync_message=str(exception)) + self._update_sync_status_to_failed(context, association_id, + str(exception), + dc_peer_association_id) raise exception + def _delete_primary_association(self, context, association_id): + """Delete primary peer group association.""" + result = db_api.peer_group_association_destroy(context, association_id) + self.peer_monitor_manager.peer_monitor_notify(context) + return result + def delete_peer_group_association(self, context, association_id): """Delete association and remove related association from peer site. @@ -499,25 +611,52 @@ class SystemPeerManager(manager.Manager): association = db_api.peer_group_association_get(context, association_id) peer = db_api.system_peer_get(context, association.system_peer_id) - peer_group = db_api.subcloud_peer_group_get(context, - association.peer_group_id) + dc_local_pg = db_api.subcloud_peer_group_get(context, + association.peer_group_id) + peer_group_name = dc_local_pg.peer_group_name try: + # Check if the system_uuid of the peer site matches with the + # peer_uuid + system = self.get_peer_sysinv_client(peer).get_system() + if system.uuid != peer.peer_uuid: + LOG.warning(f"Peer site system uuid {system.uuid} does not " + f"match with the peer_uuid {peer.peer_uuid}") + return self._delete_primary_association(context, association_id) + dc_client = self.get_peer_dc_client(peer) - dc_peer_pg = dc_client.get_subcloud_peer_group( - peer_group.peer_group_name) + + # Get current site system information + local_system_uuid = utils.get_local_system().uuid + + # Get peer site system peer + dc_peer_system_peer = self._get_peer_site_system_peer( + dc_client, local_system_uuid) + + # Get peer site peer group + dc_peer_pg = self._get_peer_site_pg_by_name(dc_client, + peer_group_name) + + if dc_peer_pg is None: + # peer group does not exist on peer site, the association should + # be deleted + LOG.warning(f"Subcloud Peer Group {peer_group_name} does " + f"not exist on peer site.") + return self._delete_primary_association(context, association_id) + + dc_peer_pg_id = dc_peer_pg.get('id') dc_peer_pg_priority = dc_peer_pg.get('group_priority') + # Check if the peer group priority is 0, if so, raise exception if dc_peer_pg_priority == 0: - LOG.error(f"Failed to delete peer_group_association. " - f"Peer Group {peer_group.peer_group_name} " - f"has priority 0 on peer site.") + LOG.error(f"Failed to delete peer_group_association. Peer Group" + f" {peer_group_name} has priority 0 on peer site.") raise exceptions.SubcloudPeerGroupHasWrongPriority( priority=dc_peer_pg_priority) # Use thread pool to limit number of operations in parallel delete_pool = greenpool.GreenPool(size=MAX_PARALLEL_SUBCLOUD_DELETE) subclouds = db_api.subcloud_get_for_peer_group(context, - peer_group.id) + dc_local_pg.id) # Spawn threads to delete each subcloud clean_function = functools.partial(self._delete_subcloud, dc_client) @@ -525,28 +664,78 @@ class SystemPeerManager(manager.Manager): 'peer subcloud clean', clean_function, delete_pool, subclouds) if delete_error_msg: - association = db_api.peer_group_association_update( - context, association_id, - sync_status=consts.ASSOCIATION_SYNC_STATUS_FAILED, - sync_message=json.dumps(delete_error_msg)) + self._update_sync_status_to_failed(context, association_id, + json.dumps(delete_error_msg)) return - try: - dc_client.delete_subcloud_peer_group(peer_group.peer_group_name) - LOG.info("Deleted Subcloud Peer Group " - f"{peer_group.peer_group_name} on peer site.") - except dccommon_exceptions.SubcloudPeerGroupNotFound: - LOG.debug(f"Subcloud Peer Group {peer_group.peer_group_name} " - "does not exist on peer site.") - except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated: - LOG.debug(f"Subcloud Peer Group {peer_group.peer_group_name} " - "delete failed as it is associated with system peer " - "on peer site.") + # System Peer does not exist on peer site, delete peer group + if dc_peer_system_peer is None: + try: + dc_client.delete_subcloud_peer_group(peer_group_name) + LOG.info(f"Deleted Subcloud Peer Group {peer_group_name} " + f"on peer site.") + except dccommon_exceptions.\ + SubcloudPeerGroupDeleteFailedAssociated: + LOG.error(f"Subcloud Peer Group {peer_group_name} " + "delete failed as it is associated with System " + "Peer on peer site.") + return self._delete_primary_association(context, association_id) + dc_peer_system_peer_id = dc_peer_system_peer.get('id') - db_api.peer_group_association_destroy(context, association_id) - self.peer_monitor_manager.peer_monitor_notify(context) + # Get peer site non-primary association + dc_peer_association = self._get_non_primary_association( + dc_client, dc_peer_system_peer_id, dc_peer_pg_id) + # Delete peer group association on peer site if exist + if dc_peer_association is not None: + dc_peer_association_id = dc_peer_association.get("id") + dc_client.delete_peer_group_association( + dc_peer_association_id) + elif dc_peer_association is None: + LOG.warning(f"PeerGroupAssociation does not exist on peer site." + f"Peer Group ID: {dc_peer_pg_id}, peer site System " + f"Peer ID: {dc_peer_system_peer_id}") + + try: + dc_client.delete_subcloud_peer_group(peer_group_name) + LOG.info("Deleted Subcloud Peer Group " + f"{peer_group_name} on peer site.") + except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated: + failed_message = f"Subcloud Peer Group {peer_group_name} " \ + + "delete failed as it is associated with system peer " \ + + "on peer site." + self._update_sync_status_to_failed(context, association_id, + failed_message) + LOG.error(failed_message) + raise + + return self._delete_primary_association(context, association_id) except Exception as exception: LOG.exception("Failed to delete peer_group_association " f"{association.id}") raise exception + + def handle_association_operations_in_progress(self): + """Identify associations in transitory stages and update association + + state to failure. + """ + + LOG.info('Identifying associations in transitory stages.') + + associations = db_api.peer_group_association_get_all(self.context) + + for association in associations: + # Identify associations in transitory states + new_sync_status = TRANSITORY_STATES.get(association.sync_status) + + # update syncing states to the corresponding failure states + if new_sync_status: + LOG.info(f"Changing association {association.id} sync status " + f"from {association.sync_status} to {new_sync_status}") + + db_api.peer_group_association_update( + self.context, + association.id, + sync_status=new_sync_status or association.sync_status, + sync_message="Service restart during syncing") diff --git a/distributedcloud/dcmanager/rpc/client.py b/distributedcloud/dcmanager/rpc/client.py index 91b300ce7..5491c3b9d 100644 --- a/distributedcloud/dcmanager/rpc/client.py +++ b/distributedcloud/dcmanager/rpc/client.py @@ -265,10 +265,10 @@ class ManagerClient(RPCClient): return self.cast(ctxt, self.make_msg( 'sync_subcloud_peer_group', association_id=association_id)) - def update_subcloud_peer_group(self, ctxt, association_id, priority): + def update_subcloud_peer_group(self, ctxt, association_id): return self.call(ctxt, self.make_msg( 'sync_subcloud_peer_group', association_id=association_id, - sync_subclouds=False, priority=priority)) + sync_subclouds=False)) def delete_peer_group_association(self, ctxt, association_id): return self.call(ctxt, self.make_msg('delete_peer_group_association', diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py index a4b00f7c7..1fafabea2 100644 --- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py +++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py @@ -48,6 +48,7 @@ SAMPLE_PEER_GROUP_PRIORITY = 1 SAMPLE_PEER_GROUP_PRIORITY_UPDATED = 99 SAMPLE_SYNC_STATUS = 'synced' SAMPLE_SYNC_MESSAGE = 'None' +SAMPLE_ASSOCIATION_TYPE = 'primary' class FakeSystem(object): @@ -142,7 +143,9 @@ class PeerGroupAssociationAPIMixin(APIMixin): 'peer_group_priority': kw.get('peer_group_priority', SAMPLE_PEER_GROUP_PRIORITY), 'sync_status': kw.get('sync_status', SAMPLE_SYNC_STATUS), - 'sync_message': kw.get('sync_message', SAMPLE_SYNC_MESSAGE) + 'sync_message': kw.get('sync_message', SAMPLE_SYNC_MESSAGE), + 'association_type': kw.get('association_type', + SAMPLE_ASSOCIATION_TYPE) } return association diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py index b280b6b09..0ce0b4ffc 100644 --- a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py +++ b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py @@ -41,17 +41,19 @@ FAKE_SITE1_PEER_GROUP_ID = 9 # FAKE SUBCLOUD DATA (SITE1) FAKE_SITE1_SUBCLOUD1_ID = 11 +FAKE_SITE1_SUBCLOUD1_REGION_NAME = str(uuid.uuid4()) FAKE_SITE1_SUBCLOUD1_DEPLOY_STATUS = 'secondary' FAKE_SITE1_SUBCLOUD1_DATA = {"id": FAKE_SITE1_SUBCLOUD1_ID, "name": "subcloud1", - "region-name": "subcloud1", + "region-name": FAKE_SITE1_SUBCLOUD1_REGION_NAME, "deploy-status": FAKE_SITE1_SUBCLOUD1_DEPLOY_STATUS} FAKE_SITE1_SUBCLOUD2_ID = 12 +FAKE_SITE1_SUBCLOUD2_REGION_NAME = str(uuid.uuid4()) FAKE_SITE1_SUBCLOUD2_DEPLOY_STATUS = 'secondary-failed' FAKE_SITE1_SUBCLOUD2_DATA = {"id": FAKE_SITE1_SUBCLOUD2_ID, "name": "subcloud2", - "region-name": "subcloud2", + "region-name": FAKE_SITE1_SUBCLOUD2_REGION_NAME, "deploy-status": FAKE_SITE1_SUBCLOUD2_DEPLOY_STATUS} FAKE_SITE1_SUBCLOUD3_ID = 13 @@ -62,7 +64,7 @@ FAKE_SITE1_SUBCLOUD3_DATA = {"id": FAKE_SITE1_SUBCLOUD3_ID, "deploy-status": FAKE_SITE1_SUBCLOUD3_DEPLOY_STATUS} -# FAKE PEER GROUP ASSOCIATION DATA +# FAKE PEER GROUP ASSOCIATION DATA (SITE0) FAKE_ASSOCIATION_PEER_GROUP_ID = \ FAKE_SITE0_PEER_GROUP_ID FAKE_ASSOCIATION_SYSTEM_PEER_ID = \ @@ -70,6 +72,10 @@ FAKE_ASSOCIATION_SYSTEM_PEER_ID = \ FAKE_ASSOCIATION_PEER_GROUP_PRIORITY = 1 FAKE_ASSOCIATION_SYNC_STATUS = 'synced' FAKE_ASSOCIATION_SYNC_MESSAGE = 'None' +FAKE_ASSOCIATION_TYPE = 'primary' + +# FAKE PEER GROUP ASSOCIATION DATA (SITE1) +FAKE_SITE1_ASSOCIATION_ID = 10 class FakeDCManagerAuditAPI(object): @@ -172,7 +178,8 @@ class TestSystemPeerManager(base.DCManagerTestCase): "peer_group_id": FAKE_ASSOCIATION_PEER_GROUP_ID, "peer_group_priority": FAKE_ASSOCIATION_PEER_GROUP_PRIORITY, "sync_status": FAKE_ASSOCIATION_SYNC_STATUS, - "sync_message": FAKE_ASSOCIATION_SYNC_MESSAGE + "sync_message": FAKE_ASSOCIATION_SYNC_MESSAGE, + "association_type": FAKE_ASSOCIATION_TYPE } values.update(kwargs) return db_api.peer_group_association_create(ctxt, **values) @@ -193,6 +200,8 @@ class TestSystemPeerManager(base.DCManagerTestCase): mock_sysinv_client.return_value = FakeSysinvClient() mock_dc_client.return_value = FakeDcmanagerClient() mock_dc_client().add_subcloud_with_secondary_status = mock.MagicMock() + mock_dc_client().add_subcloud_with_secondary_status.return_value = { + "region-name": FAKE_SITE1_SUBCLOUD2_REGION_NAME} mock_dc_client().delete_subcloud = mock.MagicMock() peer = self.create_system_peer_static( @@ -247,8 +256,12 @@ class TestSystemPeerManager(base.DCManagerTestCase): ]) mock_dc_client().update_subcloud.assert_has_calls([ mock.call('subcloud1', mock.ANY, mock.ANY), - mock.call('subcloud1', files=None, - data={'peer_group': str(FAKE_SITE1_PEER_GROUP_ID)}) + mock.call(FAKE_SITE1_SUBCLOUD1_REGION_NAME, files=None, + data={'peer_group': str(FAKE_SITE1_PEER_GROUP_ID)}, + is_region_name=True), + mock.call(FAKE_SITE1_SUBCLOUD2_REGION_NAME, files=None, + data={'peer_group': str(FAKE_SITE1_PEER_GROUP_ID)}, + is_region_name=True) ]) mock_dc_client().add_subcloud_with_secondary_status. \ assert_called_once() @@ -256,6 +269,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): @mock.patch.object( system_peer_manager.SystemPeerManager, '_sync_subclouds') + @mock.patch.object(system_peer_manager, 'utils') @mock.patch.object(system_peer_manager, 'PeerSiteDriver') @mock.patch.object(system_peer_manager, 'SysinvClient') @mock.patch.object(system_peer_manager, 'DcmanagerClient') @@ -263,6 +277,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): mock_dc_client, mock_sysinv_client, mock_keystone_client, + mock_utils, mock_sync_subclouds): mock_sync_subclouds.return_value = True mock_keystone_client().keystone_client = FakeKeystoneClient() @@ -270,6 +285,12 @@ class TestSystemPeerManager(base.DCManagerTestCase): mock_dc_client.return_value = FakeDcmanagerClient() mock_dc_client().get_subcloud_peer_group = mock.MagicMock() mock_dc_client().update_subcloud_peer_group = mock.MagicMock() + mock_dc_client().get_system_peer = mock.MagicMock() + mock_dc_client().get_peer_group_association_with_peer_id_and_pg_id = \ + mock.MagicMock() + mock_dc_client().update_peer_group_association_sync_status = \ + mock.MagicMock() + mock_utils().get_local_system = mock.MagicMock() peer = self.create_system_peer_static( self.ctx, @@ -291,12 +312,14 @@ class TestSystemPeerManager(base.DCManagerTestCase): @mock.patch.object( system_peer_manager.SystemPeerManager, '_sync_subclouds') + @mock.patch.object(system_peer_manager, 'utils') @mock.patch.object(system_peer_manager, 'PeerSiteDriver') @mock.patch.object(system_peer_manager, 'SysinvClient') @mock.patch.object(system_peer_manager, 'DcmanagerClient') def test_sync_subcloud_peer_group_not_exist(self, mock_dc_client, mock_sysinv_client, mock_keystone_client, + mock_utils, mock_sync_subclouds): mock_sync_subclouds.return_value = True mock_keystone_client().keystone_client = FakeKeystoneClient() @@ -305,6 +328,12 @@ class TestSystemPeerManager(base.DCManagerTestCase): mock_dc_client().get_subcloud_peer_group = mock.MagicMock() mock_dc_client().add_subcloud_peer_group = mock.MagicMock() mock_dc_client().update_subcloud_peer_group = mock.MagicMock() + mock_dc_client().get_system_peer = mock.MagicMock() + mock_dc_client().get_peer_group_association_with_peer_id_and_pg_id = \ + mock.MagicMock() + mock_dc_client().update_peer_group_association_sync_status = \ + mock.MagicMock() + mock_utils().get_local_system = mock.MagicMock() peer = self.create_system_peer_static( self.ctx, @@ -333,17 +362,27 @@ class TestSystemPeerManager(base.DCManagerTestCase): 'system-leader-name': peer_group.system_leader_name, 'max-subcloud-rehoming': peer_group.max_subcloud_rehoming }) - mock_dc_client().update_subcloud_peer_group.assert_not_called() + @mock.patch.object(system_peer_manager, 'utils') @mock.patch.object(system_peer_manager, 'PeerSiteDriver') + @mock.patch.object(system_peer_manager, 'SysinvClient') @mock.patch.object(system_peer_manager, 'DcmanagerClient') def test_delete_peer_group_association(self, mock_dc_client, - mock_keystone_client): + mock_sysinv_client, + mock_keystone_client, + mock_utils): mock_keystone_client().keystone_client = FakeKeystoneClient() + mock_sysinv_client.return_value = FakeSysinvClient() mock_dc_client.return_value = FakeDcmanagerClient() mock_dc_client().delete_subcloud_peer_group = mock.MagicMock() mock_dc_client().delete_subcloud = mock.MagicMock() + mock_dc_client().get_subcloud_peer_group = mock.MagicMock() + mock_dc_client().get_system_peer = mock.MagicMock() + mock_dc_client().get_peer_group_association_with_peer_id_and_pg_id = \ + mock.MagicMock() + mock_dc_client().delete_peer_group_association = mock.MagicMock() + mock_utils().get_local_system = mock.MagicMock() peer = self.create_system_peer_static( self.ctx, @@ -370,11 +409,8 @@ class TestSystemPeerManager(base.DCManagerTestCase): mock_dc_client().get_subcloud = mock.MagicMock() mock_dc_client().get_subcloud.side_effect = [ peer_subcloud1, peer_subcloud2] - - mock_dc_client().get_subcloud_peer_group = mock.MagicMock() - mock_dc_client().get_subcloud_peer_group.return_value = { - 'group_priority': 1 - } + mock_dc_client().get_peer_group_association_with_peer_id_and_pg_id.\ + return_value = {'id': FAKE_SITE1_ASSOCIATION_ID} spm = system_peer_manager.SystemPeerManager(mock.MagicMock()) spm.delete_peer_group_association(self.ctx, association.id) @@ -385,6 +421,61 @@ class TestSystemPeerManager(base.DCManagerTestCase): ]) mock_dc_client().delete_subcloud_peer_group.assert_called_once_with( peer_group.peer_group_name) + mock_dc_client().delete_peer_group_association.assert_called_once_with( + FAKE_SITE1_ASSOCIATION_ID) + + associations = db_api.peer_group_association_get_all(self.ctx) + self.assertEqual(0, len(associations)) + + @mock.patch.object(system_peer_manager, 'utils') + @mock.patch.object(system_peer_manager, 'PeerSiteDriver') + @mock.patch.object(system_peer_manager, 'SysinvClient') + @mock.patch.object(system_peer_manager, 'DcmanagerClient') + def test_delete_peer_group_association_peer_site_association_not_exsit( + self, mock_dc_client, mock_sysinv_client, mock_keystone_client, + mock_utils): + mock_keystone_client().keystone_client = FakeKeystoneClient() + mock_sysinv_client.return_value = FakeSysinvClient() + mock_dc_client.return_value = FakeDcmanagerClient() + mock_dc_client().delete_subcloud_peer_group = mock.MagicMock() + mock_dc_client().delete_subcloud = mock.MagicMock() + mock_dc_client().get_subcloud_peer_group = mock.MagicMock() + mock_dc_client().get_system_peer = mock.MagicMock() + mock_dc_client().get_peer_group_association_with_peer_id_and_pg_id = \ + mock.MagicMock() + mock_dc_client().delete_peer_group_association = mock.MagicMock() + mock_utils().get_local_system = mock.MagicMock() + + peer = self.create_system_peer_static( + self.ctx, + peer_name='SystemPeer1') + peer_group = self.create_subcloud_peer_group_static( + self.ctx, + peer_group_name='SubcloudPeerGroup1') + # Create local dc subcloud1 mock data in database + subcloud1 = self.create_subcloud_with_pg_static( + self.ctx, + peer_group_id=peer_group.id, + name='subcloud1') + association = self.create_peer_group_association_static( + self.ctx, + system_peer_id=peer.id, + peer_group_id=peer_group.id) + peer_subcloud1 = FAKE_SITE1_SUBCLOUD1_DATA + mock_dc_client().get_subcloud = mock.MagicMock() + mock_dc_client().get_subcloud.side_effect = [ + peer_subcloud1, dccommon_exceptions.SubcloudNotFound] + mock_dc_client().get_peer_group_association_with_peer_id_and_pg_id.\ + side_effect = [dccommon_exceptions.PeerGroupAssociationNotFound] + + spm = system_peer_manager.SystemPeerManager(mock.MagicMock()) + spm.delete_peer_group_association(self.ctx, association.id) + + mock_dc_client().delete_subcloud.assert_has_calls([ + mock.call(subcloud1.name)]) + mock_dc_client().delete_subcloud_peer_group.assert_called_once_with( + peer_group.peer_group_name) + mock_dc_client().delete_peer_group_association.assert_not_called() associations = db_api.peer_group_association_get_all(self.ctx) self.assertEqual(0, len(associations))