diff --git a/api-ref/source/api-ref-dcmanager-v1.rst b/api-ref/source/api-ref-dcmanager-v1.rst index caf1c8431..fd28c2124 100644 --- a/api-ref/source/api-ref-dcmanager-v1.rst +++ b/api-ref/source/api-ref-dcmanager-v1.rst @@ -2653,6 +2653,7 @@ internalServerError (500), serviceUnavailable (503) - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status - created_at: created_at - updated_at: updated_at @@ -2707,6 +2708,7 @@ Request Example - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status - created_at: created_at - updated_at: updated_at @@ -2752,6 +2754,7 @@ This operation does not accept a request body. - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status - created_at: created_at - updated_at: updated_at @@ -2841,6 +2844,8 @@ The attributes of a subcloud peer group which are modifiable: - system_leader_name +- migration_status + **Normal response codes** @@ -2863,6 +2868,7 @@ serviceUnavailable (503) - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status Request Example ---------------- @@ -2880,6 +2886,7 @@ Request Example - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status - created_at: created_at - updated_at: updated_at @@ -2943,6 +2950,55 @@ internalServerError (500), serviceUnavailable (503) This operation does not accept a request body. + +************************************ +Audit a specific subcloud peer group +************************************ + +.. rest_method:: PATCH /v1.0/subcloud-peer-groups/{subcloud-peer-group}/audit + +**Normal response codes** + +200 + +**Error response codes** + +badRequest (400), unauthorized (401), forbidden (403), badMethod (405), +HTTPUnprocessableEntity (422), internalServerError (500), +serviceUnavailable (503) + +**Request parameters** + +.. rest_parameters:: parameters.yaml + + - peer_uuid: peer_uuid + - id: subcloud_peer_group_id + - peer_group_name: subcloud_peer_group_name + - group_priority: subcloud_peer_group_priority + - group_state: subcloud_peer_group_administrative_state + - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming + - system_leader_id: subcloud_peer_group_system_leader_id + - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status + +Request Example +---------------- +.. literalinclude:: samples/subcloud-peer-groups/subcloud-peer-group-audit-request.json + :language: json + +**Response parameters** + +.. rest_parameters:: parameters.yaml + + - message: subcloud_peer_group_audit_error_message + +Response Example +---------------- + +..
literalinclude:: samples/subcloud-peer-groups/subcloud-peer-group-audit-response.json + :language: json + + ---------------------- Peer Group Association ---------------------- diff --git a/api-ref/source/parameters.yaml b/api-ref/source/parameters.yaml index ca14f7b39..4f17780d9 100644 --- a/api-ref/source/parameters.yaml +++ b/api-ref/source/parameters.yaml @@ -727,6 +727,13 @@ subcloud_peer_group_administrative_state: in: body required: false type: string +subcloud_peer_group_audit_error_message: + description: | + The error message from a peer DC's peer group + audit result. + in: body + required: true + type: string subcloud_peer_group_id: description: | The ID of the subcloud peer group associated with this object. @@ -739,6 +746,12 @@ subcloud_peer_group_max_subcloud_rehoming: in: body required: false type: integer +subcloud_peer_group_migration_status: + description: | + The migration status of the subcloud peer group. + in: body + required: false + type: string subcloud_peer_group_name: description: | The NAME of the subcloud peer group. diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-request.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-request.json new file mode 100644 index 000000000..a926aa50a --- /dev/null +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-request.json @@ -0,0 +1,9 @@ +{ + "peer_group_name": "dc1-pg", + "group_priority": 0, + "group_state": "enabled", + "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", + "system_leader_name": "dc1-name", + "migration_status": null, + "peer_uuid": "ac62f555-9386-42f1-b3a1-51ecb709409d" +} \ No newline at end of file diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-response.json new file mode 100644 index 000000000..5eb6d677e --- /dev/null +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-response.json @@ -0,0 +1 @@ +{"message": "error message"} \ No newline at end of file diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-get-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-get-response.json index dc9dd01a8..53ab6d399 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-get-response.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-get-response.json @@ -6,6 +6,7 @@ "max_subcloud_rehoming": 10, "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", + "migration_status": null, "created-at": "2023-07-26 00:51:01.396694", "updated-at": "2023-07-26 00:57:35.941816" } \ No newline at end of file diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-request.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-request.json index 2783263bb..4ab092b86 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-request.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-request.json @@ -6,6 +6,5 @@ "max_subcloud_rehoming": 10, "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", - "created-at": "2023-07-26 00:51:01.396694", - "updated-at": "2023-08-07 06:09:04.086417" + "migration_status": null } \ No newline at end of file diff --git 
a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-response.json index 2783263bb..9e07c80a5 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-response.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-response.json @@ -6,6 +6,7 @@ "max_subcloud_rehoming": 10, "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", + "migration_status": null, "created-at": "2023-07-26 00:51:01.396694", "updated-at": "2023-08-07 06:09:04.086417" } \ No newline at end of file diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-get-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-get-response.json index ac59c05c1..526d9d9f5 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-get-response.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-get-response.json @@ -7,6 +7,7 @@ "max_subcloud_rehoming": 10, "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", + "migration_status": null, "created-at": "2023-07-26 00:51:01.396694", "updated-at": "2023-08-07 06:09:04.086417" }] diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-post-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-post-response.json index 38c67a477..f0399a3b3 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-post-response.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-post-response.json @@ -6,6 +6,7 @@ "max_subcloud_rehoming": 10, "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", + "migration_status": null, "created-at": "2023-08-07 06:13:52.664047", "updated-at": null } \ No newline at end of file diff --git a/distributedcloud/dccommon/consts.py b/distributedcloud/dccommon/consts.py index d4a2bb2f4..9affaea59 100644 --- a/distributedcloud/dccommon/consts.py +++ b/distributedcloud/dccommon/consts.py @@ -151,6 +151,10 @@ SSL_CERT_CA_DIR = "/etc/pki/ca-trust/source/anchors/" RVMC_NAME_PREFIX = 'rvmc' RVMC_CONFIG_FILE_NAME = 'rvmc-config.yaml' +# Required for GEO-redundancy +# User-Agent check for subcloud by region_name request. 
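+# Note: keep in sync with the identical constant in dcmanager.common.consts.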
+DCMANAGER_V1_HTTP_AGENT = 'dcmanager/1.0' + # Subcloud installation values BMC_INSTALL_VALUES = [ 'bmc_username', diff --git a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py index a6518b0df..22d527d14 100644 --- a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py +++ b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py @@ -37,11 +37,11 @@ class DcmanagerClient(base.DriverBase): """Get subcloud.""" if subcloud_ref is None: raise ValueError("subcloud_ref is required.") - url = f"{self.endpoint}/subclouds/{subcloud_ref}" + url = f"{self.endpoint}/subclouds/{subcloud_ref}/detail" headers = {"X-Auth-Token": self.token} if is_region_name: - headers["User-Agent"] = "dcmanager/1.0" + headers["User-Agent"] = consts.DCMANAGER_V1_HTTP_AGENT response = requests.get(url, headers=headers, timeout=self.timeout) if response.status_code == 200: @@ -221,7 +221,31 @@ class DcmanagerClient(base.DriverBase): LOG.error(message) raise Exception(message) - def update_subcloud(self, subcloud_ref, files, data): + def audit_subcloud_peer_group(self, peer_group_ref, **kwargs): + """Audit the subcloud peer group.""" + if peer_group_ref is None: + raise ValueError("peer_group_ref is required.") + url = f"{self.endpoint}/subcloud-peer-groups/{peer_group_ref}/audit" + + headers = {"X-Auth-Token": self.token, + "Content-Type": "application/json"} + response = requests.patch(url, json=kwargs, headers=headers, + timeout=self.timeout) + + if response.status_code == 200: + return response.json() + else: + if response.status_code == 404 and \ + 'Subcloud Peer Group not found' in response.text: + raise exceptions.SubcloudPeerGroupNotFound( + peer_group_ref=peer_group_ref) + message = "Audit Subcloud Peer Group: peer_group_ref %s, %s, " \ + "failed with RC: %d" % (peer_group_ref, kwargs, + response.status_code) + LOG.error(message) + raise Exception(message) + + def update_subcloud(self, subcloud_ref, files, data, is_region_name=False): """Update the subcloud.""" if subcloud_ref is None: raise ValueError("subcloud_ref is required.") @@ -237,6 +261,10 @@ class DcmanagerClient(base.DriverBase): enc = MultipartEncoder(fields=fields) headers = {"X-Auth-Token": self.token, "Content-Type": enc.content_type} + # Add header to flag the request is from another DC, + # server will treat subcloud_ref as a region_name + if is_region_name: + headers["User-Agent"] = consts.DCMANAGER_V1_HTTP_AGENT response = requests.patch(url, headers=headers, data=enc, timeout=self.timeout) diff --git a/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py b/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py index 79c3e7e1b..2f6da213b 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py +++ b/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py @@ -227,6 +227,8 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): if sync_enabled: # Sync the subcloud peer group to peer site self.rpc_client.sync_subcloud_peer_group(context, association.id) + else: + self.rpc_client.peer_monitor_notify(context) return db_api.peer_group_association_db_model_to_dict(association) except RemoteError as e: pecan.abort(httpclient.UNPROCESSABLE_ENTITY, e.value) @@ -343,8 +345,10 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): sync_disabled = association.sync_status == consts.\ ASSOCIATION_SYNC_STATUS_DISABLED if sync_disabled: - return 
db_api.peer_group_association_destroy(context, - association_id) + result = db_api.peer_group_association_destroy(context, + association_id) + self.rpc_client.peer_monitor_notify(context) + return result # Ask system-peer-manager to delete the association. # It will do all the real work... return self.rpc_client.delete_peer_group_association( diff --git a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py index 588de2538..ad7769466 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py @@ -263,6 +263,12 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): system_leader_id = payload.get('system-leader-id') system_leader_name = payload.get('system-leader-name') max_subcloud_rehoming = payload.get('max-subcloud-rehoming') + if 'migration_status' in payload: + migration_status = payload.get('migration_status') + if migration_status is None: + migration_status = consts.PEER_GROUP_MIGRATION_NONE + else: + migration_status = None if not ( peer_group_name @@ -271,6 +277,7 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): or system_leader_id or system_leader_name or max_subcloud_rehoming + or migration_status ): pecan.abort(httpclient.BAD_REQUEST, _('nothing to update')) @@ -294,6 +301,12 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): not self._validate_system_leader_name(system_leader_name)): pecan.abort(httpclient.BAD_REQUEST, _('Invalid system-leader-name')) + if (migration_status and + migration_status.lower() not in [consts.PEER_GROUP_MIGRATING, + consts.PEER_GROUP_MIGRATION_COMPLETE, + consts.PEER_GROUP_MIGRATION_NONE]): + pecan.abort(httpclient.BAD_REQUEST, + _('Invalid migration_status')) try: updated_peer_group = db_api.subcloud_peer_group_update( @@ -304,7 +317,8 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): group_state=group_state, max_subcloud_rehoming=max_subcloud_rehoming, system_leader_id=system_leader_id, - system_leader_name=system_leader_name) + system_leader_name=system_leader_name, + migration_status=migration_status) return db_api.subcloud_peer_group_db_model_to_dict(updated_peer_group) except RemoteError as e: pecan.abort(httpclient.UNPROCESSABLE_ENTITY, e.value) @@ -403,6 +417,47 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): "Unable to batch migrate peer group %s" % group.peer_group_name) pecan.abort(500, _('Unable to batch migrate ' 'peer group %s' % group.peer_group_name)) + elif verb == 'audit': + payload = json.loads(request.body) + if 'peer_uuid' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing peer_uuid' % + group.peer_group_name)) + if 'peer_group_name' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing peer_group_name' % + group.peer_group_name)) + if 'group_priority' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing group_priority' % + group.peer_group_name)) + if 'group_state' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing group_state' % + group.peer_group_name)) + if 'system_leader_id' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing system_leader_id' % + group.peer_group_name)) + if 'system_leader_name' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing system_leader_name' % 
+ group.peer_group_name)) + if 'migration_status' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing migration_status' % + group.peer_group_name)) + try: + msg = self.rpc_client.peer_group_audit_notify( + context, + group.peer_group_name, + payload) + return {"message": msg} + except Exception: + LOG.exception('Unable to audit peer group %s' % + group.peer_group_name) + pecan.abort(500, _('Unable to audit peer group %s' % + group.peer_group_name)) else: pecan.abort(400, _('Invalid request')) diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py index 95ceb36bd..20b88afa8 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py @@ -423,12 +423,13 @@ class SubcloudsController(object): pecan.abort(404, _('Subcloud not found')) else: try: - # When the request comes from the cert-monitor, it is - # based on the region name (which is UUID format). + # When the request comes from the cert-monitor or another + # DC, it is based on the region name (which is UUID format). # Whereas, if the request comes from a client other # than cert-monitor, it will do the lookup based on # the subcloud name. - if utils.is_req_from_cert_mon_agent(request): + if (utils.is_req_from_cert_mon_agent(request) or + utils.is_req_from_another_dc(request)): subcloud = db_api.\ subcloud_get_by_region_name(context, subcloud_ref) else: @@ -573,12 +574,13 @@ class SubcloudsController(object): pecan.abort(404, _('Subcloud not found')) else: try: - # When the request comes from the cert-monitor, it is - # based on the region name (which is UUID format). + # When the request comes from the cert-monitor or another DC, + # it is based on the region name (which is UUID format). # Whereas, if the request comes from a client other # than cert-monitor, it will do the lookup based on # the subcloud name. - if utils.is_req_from_cert_mon_agent(request): + if (utils.is_req_from_cert_mon_agent(request) or + utils.is_req_from_another_dc(request)): subcloud = db_api.\ subcloud_get_by_region_name(context, subcloud_ref) else: diff --git a/distributedcloud/dcmanager/api/policies/subcloud_peer_group.py b/distributedcloud/dcmanager/api/policies/subcloud_peer_group.py index 83bbdf4cc..14eda80c9 100644 --- a/distributedcloud/dcmanager/api/policies/subcloud_peer_group.py +++ b/distributedcloud/dcmanager/api/policies/subcloud_peer_group.py @@ -75,6 +75,11 @@ _subcloud_peer_groups_rules = [ { 'method': 'PATCH', 'path': '/v1.0/subcloud-peer-groups/{subcloud_peer_group}/migrate' + }, + # Trigger a peer group audit + { + 'method': 'PATCH', + 'path': '/v1.0/subcloud-peer-groups/{subcloud_peer_group}/audit' } ] ) diff --git a/distributedcloud/dcmanager/common/consts.py b/distributedcloud/dcmanager/common/consts.py index 50c3c3e38..57af3dd43 100644 --- a/distributedcloud/dcmanager/common/consts.py +++ b/distributedcloud/dcmanager/common/consts.py @@ -443,6 +443,10 @@ OS_REGION_NAME = "OS_REGION_NAME" STATES_FOR_SUBCLOUD_RENAME = [DEPLOY_STATE_DONE, PRESTAGE_STATE_COMPLETE] +# Required for GEO-redundancy +# User-Agent check for subcloud by region_name request. 
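+# Note: keep in sync with the identical constant in dccommon.consts.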
+DCMANAGER_V1_HTTP_AGENT = 'dcmanager/1.0' + # batch rehome manage state wait timeout BATCH_REHOME_MGMT_STATES_TIMEOUT = 900 @@ -450,8 +454,16 @@ BATCH_REHOME_MGMT_STATES_TIMEOUT = 900 SYSTEM_PEER_HEARTBEAT_STATUS_ALIVE = 'alive' SYSTEM_PEER_HEARTBEAT_STATUS_FAILURE = 'failure' +# Peer group migration status +PEER_GROUP_MIGRATING = 'migrating' +PEER_GROUP_MIGRATION_COMPLETE = 'complete' +PEER_GROUP_MIGRATION_NONE = 'none' + # Peer group association sync status ASSOCIATION_SYNC_STATUS_SYNCING = 'syncing' ASSOCIATION_SYNC_STATUS_SYNCED = 'synced' ASSOCIATION_SYNC_STATUS_FAILED = 'failed' ASSOCIATION_SYNC_STATUS_DISABLED = 'disabled' + +# Peer monitor heartbeat policy +HEARTBEAT_FAILURE_POLICY_ALARM = 'alarm' diff --git a/distributedcloud/dcmanager/common/utils.py b/distributedcloud/dcmanager/common/utils.py index ae40c5f04..e6e3d9401 100644 --- a/distributedcloud/dcmanager/common/utils.py +++ b/distributedcloud/dcmanager/common/utils.py @@ -1414,3 +1414,26 @@ def yaml_safe_load(contents, content_type): pecan.abort(400, _(msg)) return data + + +# Required for GEO-redundancy +# Determine whether a request to the API originates from another DC, +# based on the User-Agent header it carries. +def is_req_from_another_dc(request): + ua = request.headers.get("User-Agent") + return ua == consts.DCMANAGER_V1_HTTP_AGENT + + +def get_local_system(): + m_ks_client = OpenStackDriver( + region_name=dccommon_consts.DEFAULT_REGION_NAME, + region_clients=None).keystone_client + endpoint = m_ks_client.endpoint_cache.get_endpoint('sysinv') + sysinv_client = SysinvClient(dccommon_consts.DEFAULT_REGION_NAME, + m_ks_client.session, + endpoint=endpoint) + system = sysinv_client.get_system() + return system diff --git a/distributedcloud/dcmanager/db/api.py b/distributedcloud/dcmanager/db/api.py index e782e8f7c..aa07490d0 100644 --- a/distributedcloud/dcmanager/db/api.py +++ b/distributedcloud/dcmanager/db/api.py @@ -434,14 +434,14 @@ def peer_group_get_for_system_peer(context, peer_id): def system_peer_update(context, peer_id, - peer_uuid, peer_name, - endpoint, username, password, - gateway_ip, - administrative_state, - heartbeat_interval, - heartbeat_failure_threshold, - heartbeat_failure_policy, - heartbeat_maintenance_timeout, + peer_uuid=None, peer_name=None, + endpoint=None, username=None, password=None, + gateway_ip=None, + administrative_state=None, + heartbeat_interval=None, + heartbeat_failure_threshold=None, + heartbeat_failure_policy=None, + heartbeat_maintenance_timeout=None, heartbeat_status=None): """Update the system peer or raise if it does not exist.""" return IMPL.system_peer_update(context, peer_id, @@ -473,13 +473,16 @@ def subcloud_peer_group_db_model_to_dict(subcloud_peer_group): "max_subcloud_rehoming": subcloud_peer_group.max_subcloud_rehoming, "system_leader_id": subcloud_peer_group.system_leader_id, "system_leader_name": subcloud_peer_group.system_leader_name, + "migration_status": subcloud_peer_group.migration_status, "created-at": subcloud_peer_group.created_at, "updated-at": subcloud_peer_group.updated_at} return result -def subcloud_peer_group_create(context, peer_group_name, group_priority, group_state, - max_subcloud_rehoming, system_leader_id, system_leader_name): +def subcloud_peer_group_create(context, peer_group_name, group_priority, + group_state, max_subcloud_rehoming, + system_leader_id, system_leader_name, + migration_status=None): """Create a subcloud_peer_group.""" return IMPL.subcloud_peer_group_create(context, peer_group_name, @@ -487,7 +490,8 @@ def
subcloud_peer_group_create(context, peer_group_name, group_priority, group_s group_state, max_subcloud_rehoming, system_leader_id, - system_leader_name) + system_leader_name, + migration_status) def subcloud_peer_group_destroy(context, group_id): @@ -523,9 +527,12 @@ def subcloud_peer_group_get_all(context): return IMPL.subcloud_peer_group_get_all(context) -def subcloud_peer_group_update(context, group_id, peer_group_name, group_priority, - group_state, max_subcloud_rehoming, system_leader_id, - system_leader_name): +def subcloud_peer_group_update(context, group_id, peer_group_name=None, + group_priority=None, group_state=None, + max_subcloud_rehoming=None, + system_leader_id=None, + system_leader_name=None, + migration_status=None): """Update the subcloud peer group or raise if it does not exist.""" return IMPL.subcloud_peer_group_update(context, group_id, @@ -534,7 +541,8 @@ def subcloud_peer_group_update(context, group_id, peer_group_name, group_priorit group_state, max_subcloud_rehoming, system_leader_id, - system_leader_name) + system_leader_name, + migration_status) ################### diff --git a/distributedcloud/dcmanager/db/sqlalchemy/api.py b/distributedcloud/dcmanager/db/sqlalchemy/api.py index 3de3c10de..f28c66ea8 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/api.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/api.py @@ -1170,7 +1170,8 @@ def subcloud_peer_group_create(context, group_state, max_subcloud_rehoming, system_leader_id, - system_leader_name): + system_leader_name, + migration_status): with write_session() as session: subcloud_peer_group_ref = models.SubcloudPeerGroup() subcloud_peer_group_ref.peer_group_name = peer_group_name @@ -1179,6 +1180,7 @@ def subcloud_peer_group_create(context, subcloud_peer_group_ref.max_subcloud_rehoming = max_subcloud_rehoming subcloud_peer_group_ref.system_leader_id = system_leader_id subcloud_peer_group_ref.system_leader_name = system_leader_name + subcloud_peer_group_ref.migration_status = migration_status session.add(subcloud_peer_group_ref) return subcloud_peer_group_ref @@ -1198,7 +1200,8 @@ def subcloud_peer_group_update(context, group_state=None, max_subcloud_rehoming=None, system_leader_id=None, - system_leader_name=None): + system_leader_name=None, + migration_status=None): with write_session() as session: subcloud_peer_group_ref = subcloud_peer_group_get(context, group_id) if peer_group_name is not None: @@ -1213,6 +1216,11 @@ def subcloud_peer_group_update(context, subcloud_peer_group_ref.system_leader_id = system_leader_id if system_leader_name is not None: subcloud_peer_group_ref.system_leader_name = system_leader_name + if migration_status is not None: + if str(migration_status).lower() == 'none': + subcloud_peer_group_ref.migration_status = None + else: + subcloud_peer_group_ref.migration_status = migration_status subcloud_peer_group_ref.save(session) return subcloud_peer_group_ref ########################## diff --git a/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py b/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py index af2bedccd..800a1132b 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py @@ -29,6 +29,7 @@ def upgrade(migrate_engine): sqlalchemy.Column('system_leader_id', sqlalchemy.String(255)), 
sqlalchemy.Column('system_leader_name', sqlalchemy.String(255)), sqlalchemy.Column('max_subcloud_rehoming', sqlalchemy.Integer), + sqlalchemy.Column('migration_status', sqlalchemy.String(255)), sqlalchemy.Column('reserved_1', sqlalchemy.Text), sqlalchemy.Column('reserved_2', sqlalchemy.Text), sqlalchemy.Column('created_at', sqlalchemy.DateTime), diff --git a/distributedcloud/dcmanager/db/sqlalchemy/models.py b/distributedcloud/dcmanager/db/sqlalchemy/models.py index 51815c4b7..6972b4162 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/models.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/models.py @@ -144,6 +144,7 @@ class SubcloudPeerGroup(BASE, DCManagerBase): max_subcloud_rehoming = Column(Integer) system_leader_id = Column(String(255)) system_leader_name = Column(String(255)) + migration_status = Column(String(255)) class PeerGroupAssociation(BASE, DCManagerBase): diff --git a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py new file mode 100644 index 000000000..5ffd5a5c4 --- /dev/null +++ b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py @@ -0,0 +1,313 @@ +# +# Copyright (c) 2023 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +import threading + +from oslo_config import cfg +from oslo_log import log as logging + + +from fm_api import constants as fm_const +from fm_api import fm_api + +from dccommon import consts as dccommon_consts +from dcmanager.common import consts +from dcmanager.common import context +from dcmanager.common.i18n import _ +from dcmanager.common import manager +from dcmanager.common import utils +from dcmanager.db import api as db_api +from dcmanager.manager.system_peer_manager import SystemPeerManager + + +CONF = cfg.CONF +LOG = logging.getLogger(__name__) + + +class PeerGroupAuditManager(manager.Manager): + """Manages audit related tasks.""" + def __init__(self, subcloud_manager, peer_group_id, *args, **kwargs): + LOG.debug(_('PeerGroupAuditManager initialization...')) + super().__init__(service_name="peer_group_audit_manager", + *args, **kwargs) + self.context = context.get_admin_context() + self.fm_api = fm_api.FaultAPIs() + self.subcloud_manager = subcloud_manager + self.peer_group_id = peer_group_id + self.require_audit_flag = True + self.thread = None + self.thread_lock = threading.Lock() + + def _get_subclouds_by_peer_group_from_system_peer(self, + system_peer, + peer_group_name): + try: + dc_client = SystemPeerManager.get_peer_dc_client(system_peer) + subclouds = dc_client.get_subcloud_list_by_peer_group( + peer_group_name) + return subclouds + except Exception: + LOG.exception("Failed to get subclouds of peer group %s " + "from DC: %s" % + (peer_group_name, system_peer.peer_name)) + + def _update_remote_peer_group_migration_status(self, + system_peer, + peer_group_name, + migration_status): + dc_client = SystemPeerManager.get_peer_dc_client(system_peer) + peer_group_kwargs = { + 'migration_status': migration_status + } + dc_client.update_subcloud_peer_group(peer_group_name, + **peer_group_kwargs) + LOG.info("Updated Subcloud Peer Group %s on " + "peer site %s, set migration_status to: %s" % + (peer_group_name, system_peer.peer_name, migration_status)) + + def _get_local_subclouds_to_update(self, + local_peer_group, + remote_subclouds): + local_subclouds_to_update = list() + remote_managed_subcloud_region_names = list() + local_subclouds = db_api.subcloud_get_for_peer_group( + self.context, local_peer_group.id) + + # get the 
'managed+online' remote subclouds + for remote_subcloud in remote_subclouds: + if (remote_subcloud.get('management-state') == + dccommon_consts.MANAGEMENT_MANAGED and + remote_subcloud.get('availability-status') == + dccommon_consts.AVAILABILITY_ONLINE): + remote_managed_subcloud_region_names.append( + remote_subcloud.get('region-name')) + + # Compare with the 'non-secondary' local subclouds + for local_subcloud in local_subclouds: + if local_subcloud.region_name in \ + remote_managed_subcloud_region_names \ + and not utils.subcloud_is_secondary_state( + local_subcloud.deploy_status): + + local_subclouds_to_update.append(local_subcloud) + + return local_subclouds_to_update + + def audit(self, system_peer, remote_peer_group, local_peer_group): + if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING: + LOG.info("Local peer group is in migrating state, skipping audit") + return + + LOG.info("Auditing remote subcloud peer group:[%s] " + "migration_status:[%s] group_priority[%s], " + "local subcloud peer group:[%s] " + "migration_status:[%s] group_priority[%s]" % + (remote_peer_group.get("peer_group_name"), + remote_peer_group.get("migration_status"), + remote_peer_group.get("group_priority"), + local_peer_group.peer_group_name, + local_peer_group.migration_status, + local_peer_group.group_priority)) + + # If the remote subcloud peer group's migration_status is 'migrating', + # unmanage all local subclouds in the local peer group + if remote_peer_group.get("migration_status") == \ + consts.PEER_GROUP_MIGRATING: + # Unmanage all local subclouds of the peer group + LOG.info("Unmanaging all local subclouds of peer group %s " + "since remote is in migrating state" % + local_peer_group.peer_group_name) + subclouds = db_api.subcloud_get_for_peer_group(self.context, + local_peer_group.id) + for subcloud in subclouds: + try: + if subcloud.management_state != \ + dccommon_consts.MANAGEMENT_UNMANAGED: + self.subcloud_manager.update_subcloud( + self.context, + subcloud.id, + management_state=dccommon_consts. + MANAGEMENT_UNMANAGED) + except Exception as e: + LOG.exception("Failed to unmanage local subcloud %s, err: " + "%s" % (subcloud.name, e)) + raise e + self.require_audit_flag = False + + # If the remote subcloud peer group's migration_status is 'complete', + # get the remote subclouds and, for those that are 'managed+online', + # set the matching local subclouds to 'unmanaged+secondary' + elif remote_peer_group.get("migration_status") == \ + consts.PEER_GROUP_MIGRATION_COMPLETE: + remote_subclouds = \ + self._get_subclouds_by_peer_group_from_system_peer( + system_peer, + remote_peer_group.get("peer_group_name")) + + if not remote_subclouds: + LOG.error("No subclouds in remote DC:%s's peer group %s" % + (system_peer.peer_name, + remote_peer_group.get("peer_group_name"))) + return + local_subclouds_to_update = \ + self._get_local_subclouds_to_update(local_peer_group, + remote_subclouds) + + for subcloud in local_subclouds_to_update: + try: + LOG.info("Setting local subcloud %s to secondary" % + subcloud.name) + # Unmanaging a subcloud that is already in 'unmanaged' + # state raises an exception. + if subcloud.management_state != \ + dccommon_consts.MANAGEMENT_UNMANAGED: + self.subcloud_manager.update_subcloud( + self.context, + subcloud.id, + management_state=dccommon_consts.
+ MANAGEMENT_UNMANAGED) + self.subcloud_manager.update_subcloud( + self.context, + subcloud.id, + deploy_status=consts.DEPLOY_STATE_SECONDARY) + except Exception as e: + LOG.exception("Failed to update local non-secondary " + "and offline subcloud [%s], err: %s" % + (subcloud.name, e)) + raise e + + if local_subclouds_to_update: + self._clear_or_raise_alarm(system_peer, + local_peer_group, + remote_peer_group) + db_api.subcloud_peer_group_update( + self.context, + local_peer_group.id, + system_leader_id=system_peer.peer_uuid, + system_leader_name=system_peer.peer_name) + + self._update_remote_peer_group_migration_status( + system_peer, + remote_peer_group.get("peer_group_name"), + None) + self.require_audit_flag = False + else: + # If remote peer group migration_status is 'None' + self.require_audit_flag = False + + def _clear_or_raise_alarm(self, + system_peer, + local_peer_group, + remote_peer_group): + # If local subcloud peer group's group_priority is + # lower than remote subcloud peer group's group_priority, + # an alarm will be raised. + # lower number means higher priority + entity_instance_id = "peer_group=%s,peer=%s" % \ + (local_peer_group.peer_group_name, system_peer.peer_uuid) + if local_peer_group.group_priority < remote_peer_group.get('group_priority'): + LOG.warning("Alarm: local subcloud peer group [%s] " + "is managed by remote system [%s]" % + (local_peer_group.peer_group_name, + system_peer.peer_name)) + try: + fault = fm_api.Fault( + alarm_id=fm_const. + FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED, + alarm_state=fm_const.FM_ALARM_STATE_SET, + entity_type_id=fm_const. + FM_ENTITY_TYPE_SUBCLOUD_PEER_GROUP, + entity_instance_id=entity_instance_id, + severity=fm_const.FM_ALARM_SEVERITY_MAJOR, + reason_text=("Subcloud peer group " + "(peer_group_name=%s) " + "is managed by remote " + "system (peer_uuid=%s) " + "with a lower priority." % + (local_peer_group.peer_group_name, + system_peer.peer_uuid)), + alarm_type=fm_const.FM_ALARM_TYPE_0, + probable_cause=fm_const. + ALARM_PROBABLE_CAUSE_UNKNOWN, + proposed_repair_action="Check the reported peer group " + "state. Migrate it back to the current system if the " + "state is 'rehomed' and the current system is stable. 
" + "Otherwise, wait until these conditions are met.", + service_affecting=False) + self.fm_api.set_fault(fault) + except Exception as e: + LOG.exception(e) + else: + try: + fault = self.fm_api.get_fault( + fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED, + entity_instance_id) + if fault: + LOG.info("Clear alarm: %s" % entity_instance_id) + self.fm_api.clear_fault( + fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED, + entity_instance_id) + except Exception: + LOG.exception( + "Problem clearing fault [%s], alarm_id=%s" % + (entity_instance_id, + fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED)) + + def _do_audit(self, system_peer, remote_peer_group, local_peer_group): + with self.thread_lock: + try: + self.audit(system_peer, remote_peer_group, local_peer_group) + except Exception as e: + LOG.exception("audit error occured: %s" % e) + + def stop(self): + self.thread.join() + LOG.info("stopped peer group %s audit thread" % self.peer_group_id) + + def start(self, system_peer, remote_peer_group, local_peer_group): + if self.thread_lock.locked(): + LOG.warning('Audit thread for %s has already started' % + local_peer_group.peer_group_name) + else: + self.thread = threading.Thread( + target=self._do_audit, + args=(system_peer, remote_peer_group, local_peer_group)) + self.thread.start() + + def audit_peer_group_from_system(self, + system_peer, + remote_peer_group, + local_peer_group): + LOG.info("Audit peer group [%s] with remote system %s" % + (local_peer_group.peer_group_name, system_peer.peer_name)) + self.start(system_peer, remote_peer_group, local_peer_group) + + @staticmethod + def send_audit_peer_group(system_peers, peer_group): + if not system_peers: + return + local_system = utils.get_local_system() + for system in system_peers: + try: + dc_client = SystemPeerManager.get_peer_dc_client(system) + payload = db_api.subcloud_peer_group_db_model_to_dict( + peer_group) + if 'created-at' in payload: + del payload['created-at'] + if 'updated-at' in payload: + del payload['updated-at'] + payload['peer_uuid'] = local_system.uuid + LOG.info("Send audit payload [%s] of peer group %s" % + (payload, peer_group.peer_group_name)) + response = dc_client.audit_subcloud_peer_group( + peer_group.peer_group_name, + **payload) + if response: + return response + except Exception: + LOG.exception("Failed to send audit request for peer group %s " + "to DC: %s" % + (peer_group.peer_group_name, system.peer_name)) diff --git a/distributedcloud/dcmanager/manager/peer_monitor_manager.py b/distributedcloud/dcmanager/manager/peer_monitor_manager.py new file mode 100644 index 000000000..05f6caa05 --- /dev/null +++ b/distributedcloud/dcmanager/manager/peer_monitor_manager.py @@ -0,0 +1,316 @@ +# +# Copyright (c) 2023 Wind River Systems, Inc. 
+# +# SPDX-License-Identifier: Apache-2.0 +# + +from collections import defaultdict +import threading + +from fm_api import constants as fm_const +from fm_api import fm_api +from oslo_config import cfg +from oslo_log import log as logging + +from dcmanager.common import consts +from dcmanager.common import context +from dcmanager.common import manager +from dcmanager.db import api as db_api +from dcmanager.manager import peer_group_audit_manager as pgam +from dcmanager.manager.system_peer_manager import SystemPeerManager + + +CONF = cfg.CONF +LOG = logging.getLogger(__name__) + + +class PeerMonitor(object): + def __init__(self, peer, context, subcloud_manager): + self.peer = peer + self.thread = None + self.exit_flag = threading.Event() + self.fm_api = fm_api.FaultAPIs() + self.context = context + self.subcloud_manager = subcloud_manager + self.peer_group_id_set = set() + # key: peer_group_id + # value: PeerGroupAuditManager object + self.peer_group_audit_obj_map = dict() + + def _clear_failure(self): + alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED + entity_instance_id = "peer=%s" % self.peer.peer_uuid + try: + fault = self.fm_api.get_fault(alarm_id, entity_instance_id) + if fault: + self.fm_api.clear_fault(alarm_id, entity_instance_id) + except Exception as e: + LOG.exception( + "Problem clearing fault for peer %s, alarm_id=%s " + "error: %s" % (self.peer.peer_uuid, alarm_id, e)) + + def _raise_failure(self): + alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED + entity_instance_id = "peer=%s" % self.peer.peer_uuid + reason_text = ("Peer %s (peer_uuid=%s) connections in " + "disconnected state." % (self.peer.peer_name, + self.peer.peer_uuid)) + severity = fm_const.FM_ALARM_SEVERITY_MAJOR + + peer_groups = db_api.subcloud_peer_group_get_by_leader_id( + self.context, self.peer.peer_uuid) + if len(peer_groups) > 0: + peer_group_names = [peer_group.peer_group_name + for peer_group in peer_groups] + reason_text = ("Peer %s (peer_uuid=%s) is in disconnected " + "state. The following subcloud peer groups " + "are impacted: %s." % + (self.peer.peer_name, self.peer.peer_uuid, + ", ".join(peer_group_names))) + severity = fm_const.FM_ALARM_SEVERITY_CRITICAL + + try: + fault = fm_api.Fault( + alarm_id=alarm_id, + alarm_state=fm_const.FM_ALARM_STATE_SET, + entity_type_id=fm_const.FM_ENTITY_TYPE_SYSTEM_PEER, + entity_instance_id=entity_instance_id, + severity=severity, + reason_text=reason_text, + alarm_type=fm_const.FM_ALARM_TYPE_1, + probable_cause=fm_const.ALARM_PROBABLE_CAUSE_UNKNOWN, + proposed_repair_action="Check the connectivity between " + "the current system and the reported peer site. 
If the " + "peer system is down, migrate the affected peer group(s) " + "to the current system for continued subcloud management.", + service_affecting=False) + + self.fm_api.set_fault(fault) + + except Exception as e: + LOG.exception( + "Problem setting fault for peer %s, alarm_id=%s, " + "error: %s" % (self.peer.peer_uuid, alarm_id, e)) + + def _heartbeat_check_via_get_peer_group_list(self): + """Checking the heartbeat of system peer.""" + failed = True + dc_peer_subcloud_peer_group_list = list() + try: + dc_client = SystemPeerManager.get_peer_dc_client(self.peer) + dc_peer_subcloud_peer_group_list = \ + dc_client.get_subcloud_peer_group_list() + failed = False + + if not dc_peer_subcloud_peer_group_list: + LOG.warning("Resource subcloud peer group of dc:%s " + "not found" % self.peer.manager_endpoint) + + except Exception: + LOG.exception("Failed to access the dc: %s" % + self.peer.peer_name) + return failed, dc_peer_subcloud_peer_group_list + + def _do_monitor_peer(self): + failure_count = 0 + LOG.info("Start monitoring thread for peer %s" % + self.peer.peer_name) + # Do the actual peer monitor. + while not self.exit_flag.wait(timeout=self.peer.heartbeat_interval): + try: + # Get system peer from DB + self.peer = db_api.system_peer_get(self.context, self.peer.id) + failed, remote_pg_list = \ + self._heartbeat_check_via_get_peer_group_list() + if failed: + failure_count += 1 + if failure_count >= self.peer.heartbeat_failure_threshold: + # heartbeat_failure_threshold reached. + LOG.warning("DC %s heartbeat failed, Raising alarm" % + self.peer.peer_name) + self._raise_failure() + db_api.system_peer_update( + self.context, self.peer.id, + heartbeat_status=consts.SYSTEM_PEER_HEARTBEAT_STATUS_FAILURE) + failure_count = 0 + self._set_require_audit_flag_to_associated_peer_groups() + else: + failure_count = 0 + self._audit_local_peer_groups(remote_pg_list) + if self.peer.heartbeat_status != \ + consts.SYSTEM_PEER_HEARTBEAT_STATUS_ALIVE: + db_api.system_peer_update( + self.context, self.peer.id, + heartbeat_status=consts.SYSTEM_PEER_HEARTBEAT_STATUS_ALIVE) + LOG.info("DC %s back online, clear alarm" % + self.peer.peer_name) + self._clear_failure() + except Exception as e: + LOG.exception("Got exception monitoring peer %s error: %s" % + (self.peer.peer_name, e)) + LOG.info("Caught graceful exit signal for peer monitor %s" % + self.peer.peer_name) + + def _audit_local_peer_groups(self, remote_pg_list): + # Generate a dict index by remote peer group name + remote_pg_dict = { + remote_peer_group.get("peer_group_name"): remote_peer_group + for remote_peer_group in remote_pg_list + } + + # Only audit peer groups existing on both side + for peer_group_id, pgam_obj in self.peer_group_audit_obj_map.items(): + peer_group = db_api.subcloud_peer_group_get(self.context, + peer_group_id) + if peer_group.peer_group_name in remote_pg_dict: + remote_peer_group = remote_pg_dict[peer_group.peer_group_name] + # Audit for require_audit_flag is True or + # Remote peer group is in 'complete' state. 
+ if (pgam_obj.require_audit_flag + or remote_peer_group.get("migration_status") + == consts.PEER_GROUP_MIGRATION_COMPLETE + ): + pgam_obj.audit_peer_group_from_system( + self.peer, remote_peer_group, peer_group) + else: + LOG.warning("Peer group %s not found on remote DC %s; " + "nothing to audit, sync operation needed" % + (peer_group.peer_group_name, self.peer.peer_name)) + + def _set_require_audit_flag_to_associated_peer_groups(self): + for pgam_obj in self.peer_group_audit_obj_map.values(): + pgam_obj.require_audit_flag = True + + def audit_specific_local_peer_group(self, peer_group, remote_peer_group): + msg = None + if peer_group.id in self.peer_group_audit_obj_map: + pgam_obj = self.peer_group_audit_obj_map[peer_group.id] + pgam_obj.audit(self.peer, remote_peer_group, peer_group) + else: + msg = ("No audit manager found for peer group %s" % + peer_group.peer_group_name) + return msg + + def _clean_peer_group_audit_threads(self): + for peer_group_id in self.peer_group_audit_obj_map: + pgam_obj = \ + self.peer_group_audit_obj_map[peer_group_id] + pgam_obj.stop() + self.peer_group_audit_obj_map.clear() + + def update_peer_group_id_set(self, peer_group_id_set): + removed_peer_groups = self.peer_group_id_set - peer_group_id_set + new_peer_groups = peer_group_id_set - self.peer_group_id_set + + # Destroy audit objects for removed peer groups + for peer_group_id in removed_peer_groups: + LOG.info("Peer group [%s] removed from peer [%s]" % + (peer_group_id, self.peer.peer_name)) + if peer_group_id in self.peer_group_audit_obj_map: + self.peer_group_audit_obj_map[peer_group_id].stop() + del self.peer_group_audit_obj_map[peer_group_id] + # Add audit objects for new peer groups + for peer_group_id in new_peer_groups: + LOG.info("New peer group [%s] found for peer [%s]" % + (peer_group_id, self.peer.peer_name)) + self.peer_group_audit_obj_map[peer_group_id] = \ + pgam.PeerGroupAuditManager(self.subcloud_manager, + peer_group_id) + self.peer_group_id_set = peer_group_id_set + + def start(self): + if self.thread is not None: + LOG.error('Peer monitor thread for %s has already started' % + self.peer.peer_name) + else: + self.thread = threading.Thread(target=self._do_monitor_peer) + self.thread.start() + + def stop(self): + self.exit_flag.set() + self.thread.join() + self._clear_failure() + self._clean_peer_group_audit_threads() + + +class PeerMonitorManager(manager.Manager): + """Manages tasks related to peer monitoring.""" + def __init__(self, subcloud_manager): + LOG.debug('PeerMonitorManager initialization...') + + super(PeerMonitorManager, self).__init__( + service_name="peer_monitor_manager") + self.peer_monitor = dict() + self.context = context.get_admin_context() + self.subcloud_manager = subcloud_manager + + # key: system_peer_id + # value: PeerMonitor object + self.peer_monitor_thread_map = dict() + + def _remove_peer_monitor_task(self, system_peer_id): + peer_mon_obj = self.peer_monitor_thread_map[system_peer_id] + peer_mon_obj.stop() + + def _create_peer_monitor_task(self, system_peer_id): + peer = db_api.system_peer_get(self.context, + system_peer_id) + LOG.info("Creating monitoring thread for peer: %s" % + peer.peer_name) + self.peer_monitor_thread_map[system_peer_id] = PeerMonitor( + peer, self.context, self.subcloud_manager) + self.peer_monitor_thread_map[system_peer_id].start() + + @staticmethod + def _diff_dict(dict1, dict2): + return {key: value for key, value in dict1.items() if key not in dict2} + + def _create_or_destroy_peer_monitor_task(self, peer_system_peer_group_map): + new_peers =
self._diff_dict(peer_system_peer_group_map, + self.peer_monitor_thread_map) + removed_peers = self._diff_dict(self.peer_monitor_thread_map, + peer_system_peer_group_map) + for peer_id in new_peers: + self._create_peer_monitor_task(peer_id) + for peer_id in removed_peers: + self._remove_peer_monitor_task(peer_id) + + # Update peer_group_id set + for peer_id, pm_obj in self.peer_monitor_thread_map.items(): + pm_obj.update_peer_group_id_set( + peer_system_peer_group_map[peer_id]) + + def peer_monitor_notify(self, context): + LOG.info("Caught peer monitor notify...") + peer_system_peer_group_map = defaultdict(set) + # Get local associations + associations = db_api.peer_group_association_get_all(context) + for association in associations: + peer_system_peer_group_map[association.system_peer_id].add( + association.peer_group_id) + + self._create_or_destroy_peer_monitor_task(peer_system_peer_group_map) + + def peer_group_audit_notify(self, context, peer_group_name, payload): + LOG.info("Caught peer group audit notification for peer group %s" % + peer_group_name) + msg = None + try: + peer_group = db_api.subcloud_peer_group_get_by_name( + context, peer_group_name) + system_uuid = payload.get('peer_uuid') + system_peer = db_api.system_peer_get_by_uuid(context, + system_uuid) + if system_peer.id in self.peer_monitor_thread_map: + pmobj = self.peer_monitor_thread_map[system_peer.id] + msg = pmobj.audit_specific_local_peer_group(peer_group, + payload) + else: + msg = ("System peer with UUID=%s is not under monitoring. " + "Skipping audit for peer group %s" % + (system_uuid, peer_group_name)) + LOG.warning(msg) + return msg + except Exception as e: + LOG.exception('Handling peer group audit notify error: %s' % + str(e)) + return str(e) diff --git a/distributedcloud/dcmanager/manager/service.py b/distributedcloud/dcmanager/manager/service.py index a411dbc6a..2a00d5e68 100644 --- a/distributedcloud/dcmanager/manager/service.py +++ b/distributedcloud/dcmanager/manager/service.py @@ -32,6 +32,7 @@ from dcmanager.common import exceptions from dcmanager.common.i18n import _ from dcmanager.common import messaging as rpc_messaging from dcmanager.common import utils +from dcmanager.manager.peer_monitor_manager import PeerMonitorManager from dcmanager.manager.subcloud_manager import SubcloudManager from dcmanager.manager.system_peer_manager import SystemPeerManager @@ -85,11 +86,15 @@ class DCManagerService(service.Service): self.target = None self._rpc_server = None self.subcloud_manager = None + self.peer_monitor_manager = None + self.system_peer_manager = None self.audit_rpc_client = None + self.context = context.get_admin_context() def init_managers(self): self.subcloud_manager = SubcloudManager() - self.syspeer_manager = SystemPeerManager() + self.peer_monitor_manager = PeerMonitorManager(self.subcloud_manager) + self.system_peer_manager = SystemPeerManager(self.peer_monitor_manager) def start(self): utils.set_open_file_limit(cfg.CONF.worker_rlimit_nofile) @@ -110,6 +115,10 @@ class DCManagerService(service.Service): os.makedirs(dccommon_consts.ANSIBLE_OVERRIDES_PATH, 0o600, exist_ok=True) self.subcloud_manager.handle_subcloud_operations_in_progress() + + # Send notify to peer monitor. 
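+ # This rebuilds the peer monitor threads from the peer group + # associations persisted in the database after a service restart.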
+ self.peer_monitor_manager.peer_monitor_notify(self.context) + super(DCManagerService, self).start() @run_in_thread @@ -302,19 +311,31 @@ class DCManagerService(service.Service): payload['peer_group']) return self.subcloud_manager.batch_migrate_subcloud(context, payload) + @request_context + def peer_monitor_notify(self, context): + LOG.info("Handling peer monitor notify") + return self.peer_monitor_manager.peer_monitor_notify(context) + + @request_context + def peer_group_audit_notify(self, context, peer_group_name, payload): + LOG.info("Handling peer group audit notify of peer group " + f"{peer_group_name}") + return self.peer_monitor_manager.peer_group_audit_notify( + context, peer_group_name, payload) + @request_context def sync_subcloud_peer_group(self, context, association_id, sync_subclouds=True, priority=None): LOG.info("Handling sync_subcloud_peer_group request for: %s", association_id) - return self.syspeer_manager.sync_subcloud_peer_group( + return self.system_peer_manager.sync_subcloud_peer_group( context, association_id, sync_subclouds, priority) @request_context def delete_peer_group_association(self, context, association_id): LOG.info("Handling delete_peer_group_association request for: %s", association_id) - return self.syspeer_manager.delete_peer_group_association( + return self.system_peer_manager.delete_peer_group_association( context, association_id) def _stop_rpc_server(self): diff --git a/distributedcloud/dcmanager/manager/subcloud_manager.py b/distributedcloud/dcmanager/manager/subcloud_manager.py index c340bd5db..2d3b52883 100644 --- a/distributedcloud/dcmanager/manager/subcloud_manager.py +++ b/distributedcloud/dcmanager/manager/subcloud_manager.py @@ -43,6 +43,7 @@ from dccommon import consts as dccommon_consts from dccommon.drivers.openstack.sdk_platform import OpenStackDriver from dccommon.drivers.openstack.sysinv_v1 import SysinvClient from dccommon.exceptions import PlaybookExecutionFailed +from dccommon.exceptions import SubcloudNotFound from dccommon import kubeoperator from dccommon.subcloud_install import SubcloudInstall from dccommon.subcloud_install import SubcloudShutdown @@ -61,6 +62,8 @@ from dcmanager.common import prestage from dcmanager.common import utils from dcmanager.db import api as db_api from dcmanager.db.sqlalchemy.models import Subcloud +from dcmanager.manager.peer_group_audit_manager import PeerGroupAuditManager +from dcmanager.manager.system_peer_manager import SystemPeerManager from dcmanager.rpc import client as dcmanager_rpc_client from dcorch.rpc import client as dcorch_rpc_client @@ -388,10 +391,20 @@ class SubcloudManager(manager.Manager): "-e", extra_vars] return rehome_command - def _migrate_manage_subcloud(self, context, payload, subcloud): + def _migrate_manage_subcloud( + self, context, payload, alive_system_peers, subcloud): + success = True + # Try to unmanage the subcloud on peer system + if alive_system_peers: + if self._unmanage_system_peer_subcloud(alive_system_peers, + subcloud): + success = False + LOG.warning("Failed to unmanage subcloud %s on peer system, " + "exiting migration" % subcloud.name) + return subcloud, success + + # migrate and set managed for # online and complete subclouds.
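+ # (reached only when the peer-side unmanage above did not fail)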
- success = True self.migrate_subcloud(context, subcloud.id, payload) subcloud = db_api.subcloud_get(context, subcloud.id) @@ -430,6 +443,100 @@ class SubcloudManager(manager.Manager): return subcloud, success + def _get_peer_system_list(self, peer_group): + system_peers = list() + # Get associations by peer group + associations = db_api.peer_group_association_get_by_peer_group_id( + self.context, peer_group.id) + if not associations: + LOG.info("No association found for peer group %s" % + peer_group.peer_group_name) + return system_peers + for association in associations: + system_peer = db_api.system_peer_get( + self.context, association.system_peer_id) + # Keep only 'alive' system peers + if system_peer.heartbeat_status != \ + consts.SYSTEM_PEER_HEARTBEAT_STATUS_ALIVE: + LOG.warning("Peer system %s offline, skipping check" % + system_peer.peer_name) + continue + else: + system_peers.append(system_peer) + + return system_peers + + def _unmanage_system_peer_subcloud(self, system_peers, subcloud): + unmanaged_error = False + for system_peer in system_peers: + LOG.debug("Getting subcloud: %s from system_peer: %s" % + (subcloud.name, system_peer.peer_name)) + for attempt in range(3): + try: + dc_client = \ + SystemPeerManager.get_peer_dc_client(system_peer) + # Get remote subcloud by region_name from + # system peer + remote_subcloud = dc_client.get_subcloud( + subcloud.region_name, is_region_name=True) + if remote_subcloud.get('management-state') == \ + dccommon_consts.MANAGEMENT_UNMANAGED: + LOG.info("Remote subcloud %s from system peer %s is " + "already unmanaged, skipping unmanage attempt" + % (remote_subcloud.get('name'), + system_peer.peer_name)) + break + + payload = {"management-state": "unmanaged"} + try: + remote_subcloud = dc_client.update_subcloud( + subcloud.region_name, + files=None, + data=payload, + is_region_name=True) + LOG.info("Successfully updated subcloud: %s on " + "peer system %s to unmanaged state" + % (remote_subcloud.get('name'), + system_peer.peer_name)) + return unmanaged_error + except Exception as e: + raise exceptions.SubcloudNotUnmanaged() from e + + except SubcloudNotFound: + LOG.info("No matching subcloud: %s found on " + "peer system: %s" % + (subcloud.region_name, system_peer.peer_name)) + break + except exceptions.SubcloudNotUnmanaged: + LOG.exception("Failed to unmanage subcloud: %s " + "on system %s" % + (subcloud.region_name, + system_peer.peer_name)) + unmanaged_error = True + except Exception: + LOG.exception("Failed to set unmanaged for " + "subcloud: %s on system %s, attempt: %d" + % (subcloud.region_name, + system_peer.peer_name, attempt)) + time.sleep(1) + return unmanaged_error + + def _clear_alarm_for_peer_group(self, peer_group): + # Get alarms related to peer group + faults = self.fm_api.get_faults_by_id( + fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED) + if not faults: + return + for fault in faults: + entity_instance_id_str = "peer_group=%s,peer=" % \ + (peer_group.peer_group_name) + if entity_instance_id_str in fault.entity_instance_id: + LOG.info("Clearing alarm for peer group %s" % + peer_group.peer_group_name) + self.fm_api.clear_fault( + fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED, + fault.entity_instance_id) + def migrate_subcloud(self, context, subcloud_ref, payload): '''migrate_subcloud function is for day-2's rehome purpose.
@@ -463,6 +570,10 @@ class SubcloudManager(manager.Manager):
         # Update sysadmin_password
         sysadmin_password = base64.b64decode(payload['sysadmin_password']).decode('utf-8')
         saved_payload['sysadmin_password'] = sysadmin_password
+        # Decode admin_password
+        if 'admin_password' in saved_payload:
+            saved_payload['admin_password'] = base64.b64decode(
+                saved_payload['admin_password']).decode('utf-8')

         # Re-generate ansible config based on latest rehome_data
         subcloud = self.subcloud_migrate_generate_ansible_config(
@@ -547,6 +658,15 @@ class SubcloudManager(manager.Manager):
                       % str(peer_group.peer_group_name))
             return

+        # Set migration_status to migrating
+        db_api.subcloud_peer_group_update(
+            self.context,
+            peer_group.id,
+            migration_status=consts.PEER_GROUP_MIGRATING)
+
+        # Try to get the peer systems of the peer group
+        system_peers = self._get_peer_system_list(peer_group)
+
         # Use thread pool to limit number of operations in parallel
         migrate_pool = greenpool.GreenPool(
             size=peer_group.max_subcloud_rehoming)
@@ -554,12 +674,35 @@ class SubcloudManager(manager.Manager):
         tmp_payload = {'sysadmin_password': sysadmin_password}
         migrate_function = functools.partial(self._migrate_manage_subcloud,
                                              context,
-                                             tmp_payload)
+                                             tmp_payload,
+                                             system_peers)

         self._run_parallel_group_operation('migrate',
                                            migrate_function,
                                            migrate_pool,
                                            subclouds_ready_to_migrate)
+
+        # Set migration_status to complete and
+        # update the system leader id and name
+        local_system = utils.get_local_system()
+        peer_group = db_api.subcloud_peer_group_update(
+            self.context,
+            peer_group.id,
+            system_leader_id=local_system.uuid,
+            system_leader_name=local_system.name,
+            migration_status=consts.PEER_GROUP_MIGRATION_COMPLETE)
+
+        # Try to send an audit request to the system peers
+        resp = PeerGroupAuditManager.send_audit_peer_group(
+            system_peers, peer_group)
+        if resp:
+            LOG.warning("Audit peer group %s response: %s" %
+                        (peer_group.peer_group_name, resp))
+
+        # Try to clear an existing alarm if we rehomed a '0' priority peer group
+        if peer_group.group_priority == 0:
+            self._clear_alarm_for_peer_group(peer_group)
+
         LOG.info("Batch migrate operation finished")

     def rehome_subcloud(self, context, subcloud):
@@ -1043,6 +1186,13 @@ class SubcloudManager(manager.Manager):
                 str(keyring.get_password(
                     user, dccommon_consts.SERVICES_USER_NAME))

+        # TODO(Yuxing) remove replicating the smapi user when ending support
+        # for rehoming a subcloud with a software version below 22.12
+        if subcloud.software_version <= LAST_SW_VERSION_IN_CENTOS:
+            payload['users']['smapi'] = \
+                str(keyring.get_password(
+                    'smapi', dccommon_consts.SERVICES_USER_NAME))
+
         if 'region_name' not in payload:
             payload['region_name'] = subcloud.region_name

@@ -1238,6 +1388,12 @@ class SubcloudManager(manager.Manager):
             del original_payload['ansible_ssh_pass']
         if 'sysadmin_password' in original_payload:
             del original_payload['sysadmin_password']
+        if 'ansible_become_pass' in original_payload:
+            del original_payload['ansible_become_pass']
+        if 'admin_password' in original_payload:
+            # Encode admin_password
+            original_payload['admin_password'] = base64.b64encode(
+                original_payload['admin_password'].encode("utf-8")).decode('utf-8')
         bootstrap_info = utils.create_subcloud_rehome_data_template()
         bootstrap_info['saved_payload'] = original_payload
         rehome_data = json.dumps(bootstrap_info)
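# The encode steps above and in the next hunk mirror the decode step added
# to migrate_subcloud earlier: rehome_data keeps admin_password
# base64-encoded at rest. A minimal round trip, with a hypothetical value:
#
#     import base64
#
#     plain = 'Adm1n-Passw0rd'
#     encoded = base64.b64encode(plain.encode('utf-8')).decode('utf-8')
#     assert base64.b64decode(encoded).decode('utf-8') == plain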
@@ -2548,6 +2704,18 @@ class SubcloudManager(manager.Manager):
             if 'bootstrap-address' in rehome_data_dict['saved_payload']:
                 _bootstrap_address = rehome_data_dict['saved_payload']['bootstrap-address']
             bootstrap_values_dict = yaml.load(bootstrap_values, Loader=yaml.SafeLoader)
+
+            # Remove sysadmin_password, ansible_ssh_pass, ansible_become_pass
+            # and encode admin_password
+            if 'sysadmin_password' in bootstrap_values_dict:
+                del bootstrap_values_dict['sysadmin_password']
+            if 'ansible_ssh_pass' in bootstrap_values_dict:
+                del bootstrap_values_dict['ansible_ssh_pass']
+            if 'ansible_become_pass' in bootstrap_values_dict:
+                del bootstrap_values_dict['ansible_become_pass']
+            if 'admin_password' in bootstrap_values_dict:
+                bootstrap_values_dict['admin_password'] = base64.b64encode(
+                    bootstrap_values_dict['admin_password'].encode("utf-8")).decode('utf-8')
             rehome_data_dict['saved_payload'] = bootstrap_values_dict
             # put bootstrap_address back into rehome_data_dict
             if _bootstrap_address:
diff --git a/distributedcloud/dcmanager/manager/system_peer_manager.py b/distributedcloud/dcmanager/manager/system_peer_manager.py
index 9493d92b4..071aa1531 100644
--- a/distributedcloud/dcmanager/manager/system_peer_manager.py
+++ b/distributedcloud/dcmanager/manager/system_peer_manager.py
@@ -35,9 +35,9 @@ VERIFY_SUBCLOUD_SYNC_IGNORE = 'ignore'
 class SystemPeerManager(manager.Manager):
     """Manages tasks related to system peers."""

-    def __init__(self, *args, **kwargs):
+    def __init__(self, peer_monitor_manager, *args, **kwargs):
         LOG.debug(_('SystemPeerManager initialization...'))
-
+        self.peer_monitor_manager = peer_monitor_manager
         super(SystemPeerManager, self).__init__(
             service_name="system_peer_manager", *args, **kwargs)

@@ -445,6 +445,8 @@ class SystemPeerManager(manager.Manager):
             association = db_api.peer_group_association_update(
                 context, association_id, **association_update)

+            self.peer_monitor_manager.peer_monitor_notify(context)
+
             return db_api.peer_group_association_db_model_to_dict(association)

         except Exception as exception:
@@ -501,6 +503,7 @@ class SystemPeerManager(manager.Manager):
                             "on peer site.")

             db_api.peer_group_association_destroy(context, association_id)
+            self.peer_monitor_manager.peer_monitor_notify(context)

         except Exception as exception:
             LOG.exception("Failed to delete peer_group_association "
diff --git a/distributedcloud/dcmanager/rpc/client.py b/distributedcloud/dcmanager/rpc/client.py
index a309129d7..63add152f 100644
--- a/distributedcloud/dcmanager/rpc/client.py
+++ b/distributedcloud/dcmanager/rpc/client.py
@@ -274,6 +274,14 @@ class ManagerClient(RPCClient):
         return self.call(ctxt, self.make_msg('delete_peer_group_association',
                                              association_id=association_id))

+    def peer_monitor_notify(self, ctxt):
+        return self.call(ctxt, self.make_msg('peer_monitor_notify'))
+
+    def peer_group_audit_notify(self, ctxt, peer_group_name, payload):
+        return self.call(ctxt, self.make_msg('peer_group_audit_notify',
+                                             peer_group_name=peer_group_name,
+                                             payload=payload))
+

 class DCManagerNotifications(RPCClient):
     """DC Manager Notification interface to broadcast subcloud state changed
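The two new client methods give other dcmanager components a thin wrapper
over the manager-side handlers added earlier in this change. A hypothetical
caller (illustrative sketch only; `ctxt` stands for an existing request
context, and the payload keys follow the peer group audit request sample):

    from dcmanager.rpc import client as dcmanager_rpc_client

    rpc_client = dcmanager_rpc_client.ManagerClient()
    # Ask the manager service to refresh its peer monitors
    rpc_client.peer_monitor_notify(ctxt)
    # Forward a peer group audit payload received from a peer site
    rpc_client.peer_group_audit_notify(
        ctxt, 'dc1-pg',
        {'peer_uuid': 'ac62f555-9386-42f1-b3a1-51ecb709409d',
         'group_state': 'enabled'})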
diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py
index e0fc25f1b..48c1dd79f 100644
--- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py
+++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py
@@ -128,7 +128,8 @@ class PeerGroupAssociationAPIMixin(APIMixin):
                 SAMPLE_SUBCLOUD_PEER_GROUP_STATE),
             'max_subcloud_rehoming': kw.get(
                 'max_subcloud_rehoming',
-                SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING)
+                SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING),
+            'migration_status': None
         }
         return group
diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py
index d6ce993f3..43cda6322 100644
--- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py
+++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py
@@ -82,7 +82,9 @@ class SubcloudPeerGroupAPIMixin(APIMixin):
                 'enabled'),
             'max_subcloud_rehoming': kw.get(
                 'max_subcloud_rehoming',
-                SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING)
+                SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING),
+            'migration_status': None
+
         }
         return group
diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py
index fa551b2d7..4397ac621 100644
--- a/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py
+++ b/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py
@@ -30,6 +30,7 @@ sys.modules['fm_core'] = mock.Mock()
 import threading

 from dccommon import consts as dccommon_consts
+from dccommon.drivers.openstack import dcmanager_v1
 from dccommon import subcloud_install
 from dccommon.utils import AnsiblePlaybook
 from dcmanager.common import consts
@@ -38,9 +39,11 @@ from dcmanager.common import prestage
 from dcmanager.common import utils as cutils
 from dcmanager.db.sqlalchemy import api as db_api
 from dcmanager.manager import subcloud_manager
+from dcmanager.manager import system_peer_manager
 from dcmanager.state import subcloud_state_manager
 from dcmanager.tests import base
 from dcmanager.tests.unit.common import fake_subcloud
+from dcmanager.tests.unit.manager import test_system_peer_manager
 from dcmanager.tests import utils
 from tsconfig.tsconfig import SW_VERSION

@@ -439,7 +442,8 @@ class TestSubcloudManager(base.DCManagerTestCase):
             "system_leader_name": "DC0",
             "group_priority": 0,
             "group_state": "enabled",
-            "max_subcloud_rehoming": 50
+            "max_subcloud_rehoming": 50,
+            "migration_status": None
         }
         values.update(kwargs)
         return db_api.subcloud_peer_group_create(ctxt, **values)
@@ -3019,6 +3023,59 @@ class TestSubcloudManager(base.DCManagerTestCase):
         self.assertEqual(1, len(expect_subclouds))
         self.assertEqual("sub_migrateable", expect_subclouds[0].name)

+    @mock.patch.object(subcloud_manager.SubcloudManager,
+                       '_unmanage_system_peer_subcloud')
+    def test_migrate_manage_subcloud_called_unmanage_peer_subcloud(
+            self, mock_unmanage_system_peer_subcloud):
+        sm = subcloud_manager.SubcloudManager()
+        system_peer_test = test_system_peer_manager.TestSystemPeerManager
+        system_peer = system_peer_test.create_system_peer_static(self.ctx)
+        subcloud = self.create_subcloud_static(self.ctx)
+        sm._migrate_manage_subcloud(self.ctx, mock.ANY, [system_peer],
+                                    subcloud)
+        mock_unmanage_system_peer_subcloud.assert_called()
+
+    @mock.patch.object(subcloud_manager.SubcloudManager,
+                       '_unmanage_system_peer_subcloud')
+    def test_migrate_manage_subcloud_not_called_unmanage_peer_subcloud(
+            self, mock_unmanage_system_peer_subcloud):
+        sm = subcloud_manager.SubcloudManager()
+        subcloud = self.create_subcloud_static(self.ctx)
+        # Give an empty system peer list
+        system_peers = []
+        sm._migrate_manage_subcloud(self.ctx,
+                                    mock.ANY, system_peers, subcloud)
+        mock_unmanage_system_peer_subcloud.assert_not_called()
+
+    @mock.patch.object(system_peer_manager.SystemPeerManager,
+                       'get_peer_dc_client')
+    def test_unmanage_system_peer_subcloud_ret_false(
+            self, mock_get_peer_dc_client):
+        sm = subcloud_manager.SubcloudManager()
+        system_peer_test = test_system_peer_manager.TestSystemPeerManager
+        system_peer = system_peer_test.create_system_peer_static(self.ctx)
+        subcloud = self.create_subcloud_static(self.ctx)
+        # No client available: generic failures do not flag an unmanage error
+        mock_get_peer_dc_client.return_value = None
+        ret = sm._unmanage_system_peer_subcloud([system_peer], subcloud)
+        self.assertEqual(ret, False)
+
+    @mock.patch.object(system_peer_manager.SystemPeerManager,
+                       'get_peer_dc_client')
+    @mock.patch.object(dcmanager_v1.DcmanagerClient, 'update_subcloud')
+    def test_unmanage_system_peer_subcloud_ret_true(
+            self, mock_update_subcloud, mock_get_peer_dc_client):
+        sm = subcloud_manager.SubcloudManager()
+        system_peer_test = test_system_peer_manager.TestSystemPeerManager
+        system_peer = system_peer_test.create_system_peer_static(self.ctx)
+        subcloud = self.create_subcloud_static(self.ctx)
+        # Make the peer's client raise on the unmanage attempt
+        mock_dc_client = mock.MagicMock()
+        mock_dc_client.update_subcloud.side_effect = \
+            exceptions.SubcloudNotUnmanaged()
+        mock_get_peer_dc_client.return_value = mock_dc_client
+        ret = sm._unmanage_system_peer_subcloud([system_peer], subcloud)
+        self.assertEqual(ret, True)
+
     @mock.patch.object(subcloud_manager.SubcloudManager,
                        'subcloud_deploy_create')
     @mock.patch.object(subcloud_manager.SubcloudManager, 'rehome_subcloud')
     @mock.patch.object(subcloud_manager.SubcloudManager, 'run_deploy_phases')
diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py
index 5d2a53d90..b280b6b09 100644
--- a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py
+++ b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py
@@ -159,7 +159,8 @@ class TestSystemPeerManager(base.DCManagerTestCase):
             "group_priority": FAKE_SITE0_PEER_GROUP_PRIORITY,
             "group_state": FAKE_SITE0_PEER_GROUP_STATE,
             "max_subcloud_rehoming":
-                FAKE_SITE0_PEER_GROUP_MAX_SUBCLOUDS_REHOMING
+                FAKE_SITE0_PEER_GROUP_MAX_SUBCLOUDS_REHOMING,
+            "migration_status": None
         }
         values.update(kwargs)
         return db_api.subcloud_peer_group_create(ctxt, **values)
@@ -177,7 +178,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
         return db_api.peer_group_association_create(ctxt, **values)

     def test_init(self):
-        spm = system_peer_manager.SystemPeerManager()
+        spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
         self.assertIsNotNone(spm)
         self.assertEqual('system_peer_manager', spm.service_name)
         self.assertEqual('localhost', spm.host)
@@ -235,7 +236,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
         mock_dc_client().update_subcloud.side_effect = [
             peer_subcloud1, peer_subcloud1, peer_subcloud2]

-        spm = system_peer_manager.SystemPeerManager()
+        spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
         spm._sync_subclouds(self.ctx, peer, peer_group.id,
                             FAKE_SITE1_PEER_GROUP_ID)

@@ -281,7 +282,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
             system_peer_id=peer.id,
             peer_group_id=peer_group.id)

-        spm = system_peer_manager.SystemPeerManager()
+        spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
         spm.sync_subcloud_peer_group(self.ctx, association.id, False)

         mock_dc_client().get_subcloud_peer_group.assert_called_once_with(
@@ -319,7 +320,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
         mock_dc_client().get_subcloud_peer_group.side_effect = \
             dccommon_exceptions.SubcloudPeerGroupNotFound

-        spm = system_peer_manager.SystemPeerManager()
+        spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
         spm.sync_subcloud_peer_group(self.ctx, association.id, False)

         mock_dc_client().get_subcloud_peer_group.assert_called_once_with(
@@ -375,7 +376,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
             'group_priority': 1
         }

-        spm = system_peer_manager.SystemPeerManager()
+        spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
         spm.delete_peer_group_association(self.ctx, association.id)

         mock_dc_client().delete_subcloud.assert_has_calls([