From 6cb974bd50b2fb790cab5ec1589c4d317bfbc5fe Mon Sep 17 00:00:00 2001
From: "Zhang Rong(Jon)"
Date: Mon, 5 Feb 2024 15:09:09 +0800
Subject: [PATCH] Auditor automatic reconciliation of subclouds

This commit automatically deletes a subcloud that was removed from the
SPG on the peer site after the peer site migrated the SPG.

It also adds validation checks for managing, unmanaging, deleting, and
updating subclouds, and for adding a subcloud to or removing it from an
SPG. Specifically, if the secondary site leads the current SPG, a
subcloud can be removed from the SPG only when the primary site is
unavailable.

Test Plan:
PASS - Shut down site1, migrate the SPG to site2, and verify the status
       of the subclouds upon site1 recovery. As anticipated, the
       subclouds transitioned to secondary status.
PASS - Shut down site1, migrate the SPG to site2, then remove a
       subcloud from the SPG on site2. Upon site1 recovery, verify the
       subclouds' status. As expected, the subcloud removed from the
       SPG on site2 was deleted on site1.
PASS - Shut down site1, migrate the SPG to site2, then remove a
       subcloud from the SPG on site2 and subsequently delete it. Check
       the status of the subclouds upon site1 recovery. As expected,
       the subcloud removed from the SPG on site2 was deleted on site1.
PASS - Removing a subcloud from the SPG on the primary site succeeded
       when the primary site is the current leader, as expected.
PASS - Keep site1 online and migrate the SPG to site2; removing a
       subcloud from the SPG on site2 is not allowed while site1 is
       online, as expected.
PASS - Adding an offline/unmanaged subcloud to an SPG on the primary
       site failed as expected.
PASS - Managing/unmanaging a subcloud failed when it is associated
       with an SPG, as expected.
PASS - Updating a subcloud succeeded when it is associated with an SPG
       on the primary site and the SPG is led by the current site.
PASS - Updating a subcloud associated with an SPG on the secondary
       site failed as expected.
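For reference, the leadership rule exercised by the tests above can be
summarized by the following simplified sketch. This is illustrative
only, not the controller code: `pgrp` mirrors the subcloud peer group
DB record, `peers` stands for its system peer associations, and the
'available' string stands in for SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE.

def spg_removal_allowed(pgrp, peers, local_system_uuid):
    # Only the site that currently leads the SPG may remove a subcloud.
    if pgrp.system_leader_id != local_system_uuid:
        return False
    # On a secondary site (group_priority > 0), removal is allowed only
    # while no associated system peer (i.e. the primary) is available.
    if pgrp.group_priority > 0:
        return not any(p.availability_state == 'available' for p in peers)
    return True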
Closes-bug: 2052415
Closes-bug: 2052584
Signed-off-by: Zhang Rong(Jon)
Change-Id: I210e2865228d166d7f5a5b26015ab07b4d09db47
---
 .../drivers/openstack/dcmanager_v1.py         |   3 +-
 .../tests/unit/drivers/test_dcmanager_v1.py   |   3 +-
 .../dcmanager/api/controllers/v1/subclouds.py | 149 ++++++++++++----
 .../manager/peer_group_audit_manager.py       | 162 ++++++++++--------
 4 files changed, 203 insertions(+), 114 deletions(-)

diff --git a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py
index 490543b36..eff57bbc9 100644
--- a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py
+++ b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py
@@ -427,7 +427,8 @@ class DcmanagerClient(base.DriverBase):
             raise ValueError("subcloud_ref is required.")
         url = f"{self.endpoint}/subclouds/{subcloud_ref}"

-        headers = {"X-Auth-Token": self.token}
+        headers = {"X-Auth-Token": self.token,
+                   "User-Agent": consts.DCMANAGER_V1_HTTP_AGENT}
         response = requests.delete(url, headers=headers,
                                    timeout=self.timeout)

diff --git a/distributedcloud/dccommon/tests/unit/drivers/test_dcmanager_v1.py b/distributedcloud/dccommon/tests/unit/drivers/test_dcmanager_v1.py
index 26b192ef1..b37b269af 100644
--- a/distributedcloud/dccommon/tests/unit/drivers/test_dcmanager_v1.py
+++ b/distributedcloud/dccommon/tests/unit/drivers/test_dcmanager_v1.py
@@ -362,7 +362,8 @@ class TestDcmanagerClient(base.DCCommonTestCase):
         result = client.delete_subcloud(SUBCLOUD_NAME)
         mock_delete.assert_called_once_with(
             FAKE_ENDPOINT + '/subclouds/' + SUBCLOUD_NAME,
-            headers={"X-Auth-Token": FAKE_TOKEN},
+            headers={"X-Auth-Token": FAKE_TOKEN,
+                     "User-Agent": dccommon_consts.DCMANAGER_V1_HTTP_AGENT},
             timeout=FAKE_TIMEOUT
         )
         self.assertEqual(result, '')

diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py
index 4f9d0380f..d8d0ea9a9 100644
--- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py
+++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py
@@ -598,6 +598,100 @@ class SubcloudsController(object):
         if not payload:
             pecan.abort(400, _('Body required'))

+        peer_group = payload.get('peer_group')
+        # Verify the peer_group is valid
+        peer_group_id = None
+        if peer_group is not None:
+            # peer_group may be passed in the payload as an int or str
+            peer_group = str(peer_group)
+            # Get the current site's system information
+            local_system_uuid = utils.get_local_system().uuid
+            # The user removes a subcloud from a subcloud peer group by
+            # setting peer_group_id to 'none'. The 'none' string is
+            # passed through as the peer_group_id, and update_subcloud()
+            # handles it by setting the peer_group_id DB field to None.
+            if peer_group.lower() == 'none':
+                if subcloud.peer_group_id is not None:
+                    # Get the peer group of the subcloud
+                    original_pgrp = db_api.subcloud_peer_group_get(
+                        context, subcloud.peer_group_id)
+                    # Abort if the peer group is not led by the
+                    # current site
+                    if original_pgrp.system_leader_id != local_system_uuid:
+                        pecan.abort(400, _("Removing subcloud from a "
+                                           "peer group not led by the "
+                                           "current site is prohibited."))
+                    # Get associations by peer group id
+                    associations = db_api.\
+                        peer_group_association_get_by_peer_group_id(
+                            context, original_pgrp.id)
+                    for association in associations:
+                        system_peer = db_api.system_peer_get(
+                            context, association.system_peer_id)
+                        # If the system peer is available, do not allow
+                        # removing the subcloud from the secondary
+                        # peer group
+                        if system_peer.availability_state == consts.\
+                            SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE \
+                                and original_pgrp.group_priority > 0:
+                            pecan.abort(400, _(
+                                "Removing subcloud from a peer group "
+                                "associated with an available system peer "
+                                "is prohibited."))
+                peer_group_id = 'none'
+            else:
+                if subcloud.peer_group_id is not None and \
+                        str(subcloud.peer_group_id) != peer_group:
+                    original_pgrp = utils.subcloud_peer_group_get_by_ref(
+                        context, str(subcloud.peer_group_id))
+                    if original_pgrp and original_pgrp.group_priority > 0:
+                        pecan.abort(400, _(
+                            "Cannot update subcloud to a new peer group "
+                            "if the original peer group has non-zero "
+                            "priority."))
+                pgrp = utils.subcloud_peer_group_get_by_ref(context, peer_group)
+                if not pgrp:
+                    pecan.abort(400, _('Invalid peer group'))
+                if not utils.is_req_from_another_dc(request):
+                    if pgrp.group_priority > 0:
+                        pecan.abort(400, _("Cannot set the subcloud to a peer"
+                                           " group with non-zero priority."))
+                    elif pgrp.system_leader_id != local_system_uuid:
+                        pecan.abort(400, _("Updating subcloud to a peer "
+                                           "group that is not led by the "
+                                           "current site is prohibited."))
+                    elif not (
+                        subcloud.deploy_status == consts.DEPLOY_STATE_DONE
+                        and subcloud.management_state ==
+                        dccommon_consts.MANAGEMENT_MANAGED
+                        and subcloud.availability_status ==
+                        dccommon_consts.AVAILABILITY_ONLINE
+                    ):
+                        pecan.abort(400, _("Only subclouds that are "
+                                           "managed and online can be "
+                                           "added to a peer group."))
+                peer_group_id = pgrp.id
+
+        # A subcloud can only be updated while it is managed in the
+        # primary site, because the sync command can only be issued
+        # on the site where the SPG was created.
+        if subcloud.peer_group_id is not None and peer_group_id is None \
+                and not utils.is_req_from_another_dc(request):
+            # Get the peer group of the subcloud
+            original_pgrp = db_api.subcloud_peer_group_get(
+                context, subcloud.peer_group_id)
+            if original_pgrp.group_priority > 0:
+                pecan.abort(400, _("Subcloud update is only allowed when "
+                                   "its peer group priority value is 0."))
+            # Get the current site's system information
+            local_system_uuid = utils.get_local_system().uuid
+            # A subcloud in a peer group can be updated on the primary
+            # site only when the peer group is led by that site.
+            if original_pgrp.system_leader_id != local_system_uuid:
+                pecan.abort(400, _("Updating subcloud from a "
+                                   "peer group not led by the "
+                                   "current site is prohibited."))
+
         # Rename the subcloud
         new_subcloud_name = payload.get('name')
         if new_subcloud_name is not None:
@@ -677,7 +771,6 @@ class SubcloudsController(object):
         description = payload.get('description')
         location = payload.get('location')
         bootstrap_values = payload.get('bootstrap_values')
-        peer_group = payload.get('peer_group')
         bootstrap_address = payload.get('bootstrap_address')

         # If the migrate flag is present we need to update the deploy status
@@ -693,6 +786,11 @@ class SubcloudsController(object):
            management_state not in [dccommon_consts.MANAGEMENT_UNMANAGED,
                                     dccommon_consts.MANAGEMENT_MANAGED]:
             pecan.abort(400, _('Invalid management-state'))
+        if management_state and subcloud.peer_group_id is not None \
+                and not utils.is_req_from_another_dc(request):
+            pecan.abort(400, _('Cannot update the management state of a '
+                               'subcloud that is associated with '
+                               'a peer group.'))

         force_flag = payload.get('force')
         if force_flag is not None:
@@ -717,41 +815,6 @@ class SubcloudsController(object):
                 exceptions.SubcloudGroupNotFound):
             pecan.abort(400, _('Invalid group'))

-        # Verify the peer_group is valid
-        peer_group_id = None
-        if peer_group is not None:
-            # peer_group may be passed in the payload as an int or str
-            peer_group = str(peer_group)
-            # Check if user wants to remove a subcloud
-            # from a subcloud-peer-group by
-            # setting peer_group_id as 'none',
-            # then we will pass 'none' string as
-            # the peer_group_id,
-            # update_subcloud() will handle it and
-            # Set the peer_group_id DB into None.
-            if peer_group.lower() == 'none':
-                peer_group_id = 'none'
-            else:
-                pgrp = utils.subcloud_peer_group_get_by_ref(context, peer_group)
-                if not pgrp:
-                    pecan.abort(400, _('Invalid peer group'))
-                if not utils.is_req_from_another_dc(request):
-                    if pgrp.group_priority > 0:
-                        pecan.abort(400, _("Cannot set the subcloud to a peer"
-                                           " group with non-zero priority."))
-                    elif not (
-                        subcloud.deploy_status in [
-                            consts.DEPLOY_STATE_DONE,
-                            consts.PRESTAGE_STATE_COMPLETE
-                        ] and subcloud.management_state ==
-                        dccommon_consts.MANAGEMENT_MANAGED
-                        and subcloud.availability_status ==
-                        dccommon_consts.AVAILABILITY_ONLINE):
-                        pecan.abort(400, _("Only subclouds that are "
-                                           "managed and online can be "
-                                           "added to a peer group."))
-                peer_group_id = pgrp.id
-
         if consts.INSTALL_VALUES in payload:
             # install_values of secondary subclouds are validated on
             # peer site
@@ -939,6 +1002,20 @@ class SubcloudsController(object):
             pecan.abort(404, _('Subcloud not found'))

         subcloud_id = subcloud.id
+        peer_group_id = subcloud.peer_group_id
+        subcloud_management_state = subcloud.management_state
+
+        # Check if the subcloud is in "managed" status
+        if subcloud_management_state == dccommon_consts.MANAGEMENT_MANAGED \
+                and not utils.is_req_from_another_dc(request):
+            pecan.abort(400, _('Cannot delete a subcloud that is in '
+                               '"managed" status'))
+
+        # Check if the subcloud is part of a peer group
+        if peer_group_id is not None and \
+                not utils.is_req_from_another_dc(request):
+            pecan.abort(400, _('Cannot delete a subcloud that is part of '
+                               'a peer group on this site'))

         try:
             # Ask dcmanager-manager to delete the subcloud.
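For review context, the delete-time guards added above reduce to a
small predicate. This is a minimal sketch, not the controller code:
`from_peer_site` stands for utils.is_req_from_another_dc(request), and
the 'managed' string stands in for dccommon_consts.MANAGEMENT_MANAGED.

def local_delete_allowed(subcloud, from_peer_site):
    # Requests relayed from the peer site bypass the local guards.
    if from_peer_site:
        return True
    # A locally issued delete is rejected for a managed subcloud or for
    # a subcloud that still belongs to a peer group on this site.
    return (subcloud.management_state != 'managed'
            and subcloud.peer_group_id is None)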
diff --git a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py
index fc11a7dc2..d49b7efd2 100644
--- a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py
+++ b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py
@@ -51,9 +51,9 @@ class PeerGroupAuditManager(manager.Manager):
                 peer_group_name)
             return subclouds
         except Exception:
-            LOG.exception("Failed to get subclouds of peer group %s "
-                          "from DC: %s" %
-                          (peer_group_name, system_peer.peer_name))
+            LOG.exception(f"Failed to get subclouds of peer group "
+                          f"{peer_group_name} from DC: "
+                          f"{system_peer.peer_name}")

     def _update_remote_peer_group_migration_status(self,
                                                    system_peer,
@@ -65,37 +65,58 @@
         }
         dc_client.update_subcloud_peer_group(peer_group_name,
                                              **peer_group_kwargs)
-        LOG.info("Updated Subcloud Peer Group %s on "
-                 "peer site %s, set migration_status to: %s" %
-                 (peer_group_name, system_peer.peer_name, migration_status))
+        LOG.info(f"Updated Subcloud Peer Group {peer_group_name} on "
+                 f"peer site {system_peer.peer_name}, set migration_status "
+                 f"to: {migration_status}")

-    def _get_local_subclouds_to_update(self,
-                                       local_peer_group,
-                                       remote_subclouds):
+    def _get_local_subclouds_to_update_and_delete(self,
+                                                  local_peer_group,
+                                                  remote_subclouds):
         local_subclouds_to_update = list()
-        remote_managed_subcloud_region_names = list()
+        local_subclouds_to_delete = list()
+        remote_subclouds_dict = {remote_subcloud.get('region-name'):
+                                 remote_subcloud for remote_subcloud
+                                 in remote_subclouds}
         local_subclouds = db_api.subcloud_get_for_peer_group(
             self.context, local_peer_group.id)

-        # get the 'managed+online' remote subclouds
-        for remote_subcloud in remote_subclouds:
-            if (remote_subcloud.get('management-state') ==
-                    dccommon_consts.MANAGEMENT_MANAGED and
-                    remote_subcloud.get('availability-status') ==
-                    dccommon_consts.AVAILABILITY_ONLINE):
-                remote_managed_subcloud_region_names.append(
-                    remote_subcloud.get('region-name'))
-
-        # Compare with the 'non-secondary' local subclouds
         for local_subcloud in local_subclouds:
-            if local_subcloud.region_name in \
-                remote_managed_subcloud_region_names \
-                    and not utils.subcloud_is_secondary_state(
-                        local_subcloud.deploy_status):
+            remote_subcloud = remote_subclouds_dict.get(
+                local_subcloud.region_name)
+            if remote_subcloud:
+                # The remote subcloud qualifies for an update if it is
+                # 'managed' and the local subcloud is not in
+                # 'secondary' status
+                if (remote_subcloud.get('management-state') ==
+                        dccommon_consts.MANAGEMENT_MANAGED and
+                        not utils.subcloud_is_secondary_state(
+                            local_subcloud.deploy_status)):
+                    local_subclouds_to_update.append(local_subcloud)
+            else:
+                local_subclouds_to_delete.append(local_subcloud)

-                local_subclouds_to_update.append(local_subcloud)
+        return local_subclouds_to_update, local_subclouds_to_delete

-        return local_subclouds_to_update
+    def _set_local_subcloud_to_secondary(self, subcloud):
+        try:
+            LOG.info("Set local subcloud %s to secondary" % subcloud.name)
+            # There will be an exception when unmanaging
+            # a subcloud in 'unmanaged' state.
+            if subcloud.management_state != \
+                    dccommon_consts.MANAGEMENT_UNMANAGED:
+                self.subcloud_manager.update_subcloud(
+                    self.context,
+                    subcloud.id,
+                    management_state=dccommon_consts.
+                    MANAGEMENT_UNMANAGED)
+            self.subcloud_manager.update_subcloud(
+                self.context,
+                subcloud.id,
+                deploy_status=consts.DEPLOY_STATE_SECONDARY)
+        except Exception as e:
+            LOG.exception(f"Failed to update local non-secondary "
+                          f"and offline subcloud [{subcloud.name}], "
+                          f"err: {e}")
+            raise e

     def audit(self, system_peer, remote_peer_group, local_peer_group):
         if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
@@ -120,9 +141,9 @@
         if remote_peer_group.get("migration_status") == \
                 consts.PEER_GROUP_MIGRATING:
             # Unmanaged all local subclouds of peer group
-            LOG.info("Unmanaged all local subclouds of peer group %s "
-                     "since remote is in migrating state" %
-                     local_peer_group.peer_group_name)
+            LOG.info(f"Unmanage all local subclouds of peer group "
+                     f"{local_peer_group.peer_group_name} "
+                     f"since remote is in migrating state")
             subclouds = db_api.subcloud_get_for_peer_group(self.context,
                                                            local_peer_group.id)
             for subcloud in subclouds:
@@ -152,8 +173,8 @@
                         subcloud.id,
                         deploy_status=consts.DEPLOY_STATE_REHOME_PENDING)
                 except Exception as e:
-                    LOG.exception("Fail to unmanage local subcloud %s, err: "
-                                  "%s" % (subcloud.name, e))
+                    LOG.exception(f"Failed to unmanage local subcloud "
+                                  f"{subcloud.name}, err: {e}")
                     raise e
             self.require_audit_flag = False
@@ -167,39 +188,29 @@
                 system_peer,
                 remote_peer_group.get("peer_group_name"))
-            if not remote_subclouds:
-                LOG.error("No subclouds in remote DC:%s's peer group %s" %
-                          (system_peer.peer_name,
-                           remote_peer_group.get("peer_group_name")))
-                return
-            local_subclouds_to_update = \
-                self._get_local_subclouds_to_update(local_peer_group,
-                                                    remote_subclouds)
+            local_subclouds_to_update, local_subclouds_to_delete = \
+                self._get_local_subclouds_to_update_and_delete(
+                    local_peer_group, remote_subclouds)

             for subcloud in local_subclouds_to_update:
+                self._set_local_subcloud_to_secondary(subcloud)
+
+            # Set local subclouds that no longer exist in the SPG on
+            # the peer site to secondary status, then delete them
+            for subcloud in local_subclouds_to_delete:
+                self._set_local_subcloud_to_secondary(subcloud)
                 try:
-                    LOG.info("Set secondary to local subcloud %s" %
-                             subcloud.name)
-                    # There will be an exception when unmanage
-                    # a subcloud in 'unamaged' state.
-                    if subcloud.management_state != \
-                            dccommon_consts.MANAGEMENT_UNMANAGED:
-                        self.subcloud_manager.update_subcloud(
-                            self.context,
-                            subcloud.id,
-                            management_state=dccommon_consts.
-                            MANAGEMENT_UNMANAGED)
-                        self.subcloud_manager.update_subcloud(
-                            self.context,
-                            subcloud.id,
-                            deploy_status=consts.DEPLOY_STATE_SECONDARY)
+                    self.subcloud_manager.delete_subcloud(
+                        self.context, subcloud.id)
+                    LOG.info(f"Deleted local subcloud {subcloud.name}")
                 except Exception as e:
-                    LOG.exception("Failed to update local non-secondary "
-                                  "and offline subcloud [%s], err: %s" %
-                                  (subcloud.name, e))
+                    LOG.exception(f"Failed to delete local subcloud "
+                                  f"[{subcloud.name}] that does not exist "
+                                  f"under the same subcloud_peer_group on "
+                                  f"peer site, err: {e}")
                     raise e

-            if local_subclouds_to_update:
+            if local_subclouds_to_update or local_subclouds_to_delete:
                 self._clear_or_raise_alarm(system_peer,
                                            local_peer_group,
                                            remote_peer_group)
@@ -229,10 +240,10 @@
         entity_instance_id = "peer_group=%s,peer=%s" % \
             (local_peer_group.peer_group_name, system_peer.peer_uuid)
         if local_peer_group.group_priority < remote_peer_group.get('group_priority'):
-            LOG.warning("Alarm: local subcloud peer group [%s] "
-                        "is managed by remote system [%s]" %
-                        (local_peer_group.peer_group_name,
-                         system_peer.peer_name))
+            LOG.warning("Alarm: local subcloud peer group ["
+                        f"{local_peer_group.peer_group_name}] "
+                        f"is managed by remote system ["
+                        f"{system_peer.peer_name}]")
             try:
                 fault = fm_api.Fault(
                     alarm_id=fm_const.
@@ -266,15 +277,15 @@
                     fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
                     entity_instance_id)
                 if fault:
-                    LOG.info("Clear alarm: %s" % entity_instance_id)
+                    LOG.info(f"Clear alarm: {entity_instance_id}")
                     self.fm_api.clear_fault(
                         fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
                         entity_instance_id)
             except Exception:
                 LOG.exception(
-                    "Problem clearing fault [%s], alarm_id=%s" %
-                    (entity_instance_id,
-                     fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED))
+                    f"Problem clearing fault [{entity_instance_id}], "
+                    f"alarm_id="
+                    f"{fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED}")

     def _do_audit(self, system_peer, remote_peer_group, local_peer_group):
         with self.thread_lock:
@@ -286,15 +297,14 @@
     def stop(self):
         if self.thread:
             self.thread.join()
-            LOG.info("stopped peer group %s audit thread" % self.peer_group_id)
+            LOG.info(f"stopped peer group {self.peer_group_id} audit thread")
         else:
-            LOG.info("No peer group %s audit thread to stop" %
-                     self.peer_group_id)
+            LOG.info(f"No peer group {self.peer_group_id} audit thread to stop")

     def start(self, system_peer, remote_peer_group, local_peer_group):
         if self.thread_lock.locked():
-            LOG.warning('Audit thread for %s has already started' %
-                        local_peer_group.peer_group_name)
+            LOG.warning(f"Audit thread for {local_peer_group.peer_group_name} "
+                        f"has already started")
         else:
             self.thread = threading.Thread(
                 target=self._do_audit,
@@ -305,8 +315,8 @@
                     system_peer,
                     remote_peer_group,
                     local_peer_group):
-        LOG.info("Audit peer group [%s] with remote system %s" %
-                 (local_peer_group.peer_group_name, system_peer.peer_name))
+        LOG.info(f"Audit peer group [{local_peer_group.peer_group_name}] "
+                 f"with remote system {system_peer.peer_name}")
        self.start(system_peer, remote_peer_group, local_peer_group)

     @staticmethod
@@ -332,6 +342,6 @@
             if response:
                 return response
         except Exception:
-            LOG.exception("Failed to send audit request for peer group %s "
-                          "to DC: %s" %
-                          (peer_group.peer_group_name, system.peer_name))
+            LOG.exception("Failed to send audit request for peer group "
+                          f"{peer_group.peer_group_name} to DC: "
+                          f"{system.peer_name}")