From 4027eee3906531addcf568b86daf02b777204a31 Mon Sep 17 00:00:00 2001
From: twang4
Date: Wed, 1 Nov 2023 16:27:33 +0800
Subject: [PATCH] System peer and subcloud peer group monitoring and handling

System peer monitoring and handling:
Performs a peer health check by querying the peer group list. Once a
failure is detected and the number of consecutive heartbeat failures
reaches the threshold, an alarm is raised. Once the connection is back
online, the alarm is cleared and an audit is performed on the peer
groups associated with the system peer.

Subcloud peer group audit and handling:
If the remote peer group's migration_status is in the 'migrating'
state, unmanage the subclouds of the local peer group.
If the remote peer group's migration_status is in the 'complete' state,
compare the subclouds on both ends: for each remote subcloud in the
'managed+online' state, set the local subcloud with the same
region_name to 'unmanaged+secondary'.
If the remote peer group's migration_status is in the 'none' state, set
the migration_status of the local peer group to 'none' as well.

Batch rehome update:
When Subcloud Peer Group based batch rehoming is performed, check
whether the associated System Peer is alive:
If it is, and the remote subcloud with the same region_name is managed
on the peer site, unmanage it before rehoming it to the local site.
If it is not, the peer site is no longer available for subcloud
management, so rehoming can be performed directly.
If the subcloud peer group priority is 0, try to clear the
FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED alarm after batch
rehoming is complete.

Security enhancement:
Base64 encode/decode the admin_password when saving/loading rehome_data
to/from the DB.

Test Plan:
1. PASS - Add a system peer on DC0, then unplug the OAM network between
   DC0 and DC1. The alarm is raised.
2. PASS - Reconnect the OAM network between DC0 and DC1. The previous
   alarm is cleared.
3. PASS - Add a subcloud peer group on DC0. Add two subclouds on DC0
   under the subcloud peer group; they should be in the managed, online
   and complete states. Add a system peer on DC0 pointing to DC1, and
   add a peer-group-association associating the peer group and the
   system peer above on DC0. Create another association on DC1, for
   system DC0 and the peer group synced to DC1. Shut down DC0 and
   perform the migrate operation on the subcloud peer group from DC1.
   After all subclouds have been migrated and entered the online
   managed states, power on DC0 and check that an alarm like "Subcloud
   peer group xxx is managed by remote system (peer_uuid=xxx) with
   lower priority." has been raised. Check the subcloud state on DC0;
   it should be 'secondary'. Perform the migrate operation on the
   subcloud peer group from DC0. After all subclouds have been migrated
   and entered the online managed states, the alarm is cleared. Check
   the subclouds on DC1; their state should be secondary.
4. PASS - Migrate a subcloud peer group with priority 0. Before the
   migration, check that the
   FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED alarm exists. After
   the migration is done, check that the alarm is cleared.
5. PASS - When the remote peer group's migration_status is in the
   'migrating' state (DC0 comes back online while rehoming is still in
   progress), DC0's subclouds are automatically set to 'unmanaged';
   after DC1's migration is complete and all subclouds are in
   'managed+online', DC0's subclouds are automatically set to
   'secondary'.
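A minimal sketch of the threshold-based heartbeat handling described
above (check_alive, raise_alarm and clear_alarm are illustrative
stand-ins for the real client and FM calls; this is not the actual
PeerMonitor implementation):

    import threading

    def monitor_peer(check_alive, raise_alarm, clear_alarm, exit_flag,
                     interval=60, failure_threshold=3):
        # Count consecutive failed heartbeats; raise the alarm only once
        # the threshold is reached, and clear it when the peer answers.
        failure_count = 0
        alarmed = False
        while not exit_flag.wait(timeout=interval):
            if check_alive():
                failure_count = 0
                if alarmed:
                    clear_alarm()
                    alarmed = False
            else:
                failure_count += 1
                if failure_count >= failure_threshold:
                    raise_alarm()
                    alarmed = True
                    failure_count = 0

    # Usage: monitor_peer(ping_peer, fm_set, fm_clear, threading.Event());
    # calling set() on the Event stops the loop gracefully.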
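A minimal sketch of the three-way audit decision described above,
assuming simplified dict shapes for the remote peer group and its
subclouds (set_unmanaged/set_secondary are illustrative callbacks, not
the dcmanager API):

    def audit_local_peer_group(remote_pg, local_subclouds,
                               set_unmanaged, set_secondary):
        # Branch on the remote peer group's migration_status.
        status = remote_pg.get('migration_status')
        if status == 'migrating':
            # Peer is actively rehoming: release the local subclouds.
            for subcloud in local_subclouds:
                set_unmanaged(subcloud)
        elif status == 'complete':
            # Peer finished rehoming: demote the local copy of every
            # subcloud that is managed+online on the remote side.
            remote_online = {
                sc['region-name'] for sc in remote_pg.get('subclouds', [])
                if sc.get('management-state') == 'managed'
                and sc.get('availability-status') == 'online'}
            for subcloud in local_subclouds:
                if subcloud['region_name'] in remote_online:
                    set_unmanaged(subcloud)
                    set_secondary(subcloud)
        # status 'none'/None: nothing to demote; the local group stays.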
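A minimal sketch of the admin_password round-trip applied when
persisting rehome_data (helper names are illustrative; the actual
change lives in subcloud_manager.py in the diff below):

    import base64
    import json

    def save_rehome_data(saved_payload):
        # Encode admin_password before the payload is stored in the DB.
        payload = dict(saved_payload)
        if 'admin_password' in payload:
            payload['admin_password'] = base64.b64encode(
                payload['admin_password'].encode('utf-8')).decode('utf-8')
        return json.dumps({'saved_payload': payload})

    def load_rehome_data(rehome_data):
        # Decode admin_password after the payload is loaded from the DB.
        payload = json.loads(rehome_data)['saved_payload']
        if 'admin_password' in payload:
            payload['admin_password'] = base64.b64decode(
                payload['admin_password']).decode('utf-8')
        return payload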
Story: 2010852 Task: 48483 Task: 48509 Task: 48819 Change-Id: Ic97e7c4a7628445522adfba4b0b2e0cc945cbe22 Signed-off-by: Wang Tao --- api-ref/source/api-ref-dcmanager-v1.rst | 56 ++++ api-ref/source/parameters.yaml | 13 + .../subcloud-peer-group-audit-request.json | 9 + .../subcloud-peer-group-audit-response.json | 1 + .../subcloud-peer-group-get-response.json | 1 + .../subcloud-peer-group-patch-request.json | 3 +- .../subcloud-peer-group-patch-response.json | 1 + .../subcloud-peer-groups-get-response.json | 1 + .../subcloud-peer-groups-post-response.json | 1 + distributedcloud/dccommon/consts.py | 4 + .../drivers/openstack/dcmanager_v1.py | 34 +- .../controllers/v1/peer_group_association.py | 8 +- .../api/controllers/v1/subcloud_peer_group.py | 57 +++- .../dcmanager/api/controllers/v1/subclouds.py | 14 +- .../api/policies/subcloud_peer_group.py | 5 + distributedcloud/dcmanager/common/consts.py | 12 + distributedcloud/dcmanager/common/utils.py | 23 ++ distributedcloud/dcmanager/db/api.py | 38 ++- .../dcmanager/db/sqlalchemy/api.py | 12 +- ...add_subcloud_peer_group_and_association.py | 1 + .../dcmanager/db/sqlalchemy/models.py | 1 + .../manager/peer_group_audit_manager.py | 313 +++++++++++++++++ .../dcmanager/manager/peer_monitor_manager.py | 316 ++++++++++++++++++ distributedcloud/dcmanager/manager/service.py | 27 +- .../dcmanager/manager/subcloud_manager.py | 174 +++++++++- .../dcmanager/manager/system_peer_manager.py | 7 +- distributedcloud/dcmanager/rpc/client.py | 8 + .../test_peer_group_association.py | 3 +- .../controllers/test_subcloud_peer_group.py | 4 +- .../unit/manager/test_subcloud_manager.py | 57 +++- .../unit/manager/test_system_peer_manager.py | 13 +- 31 files changed, 1169 insertions(+), 48 deletions(-) create mode 100644 api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-request.json create mode 100644 api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-response.json create mode 100644 distributedcloud/dcmanager/manager/peer_group_audit_manager.py create mode 100644 distributedcloud/dcmanager/manager/peer_monitor_manager.py diff --git a/api-ref/source/api-ref-dcmanager-v1.rst b/api-ref/source/api-ref-dcmanager-v1.rst index caf1c8431..fd28c2124 100644 --- a/api-ref/source/api-ref-dcmanager-v1.rst +++ b/api-ref/source/api-ref-dcmanager-v1.rst @@ -2653,6 +2653,7 @@ internalServerError (500), serviceUnavailable (503) - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status - created_at: created_at - updated_at: updated_at @@ -2707,6 +2708,7 @@ Request Example - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status - created_at: created_at - updated_at: updated_at @@ -2752,6 +2754,7 @@ This operation does not accept a request body. 
- max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status - created_at: created_at - updated_at: updated_at @@ -2841,6 +2844,8 @@ The attributes of a subcloud peer group which are modifiable: - system_leader_name +- migration_status + **Normal response codes** @@ -2863,6 +2868,7 @@ serviceUnavailable (503) - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status Request Example ---------------- @@ -2880,6 +2886,7 @@ Request Example - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming - system_leader_id: subcloud_peer_group_system_leader_id - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status - created_at: created_at - updated_at: updated_at @@ -2943,6 +2950,55 @@ internalServerError (500), serviceUnavailable (503) This operation does not accept a request body. + +************************************ +Audit a specific subcloud peer group +************************************ + +.. rest_method:: PATCH /v1.0/subcloud-peer-groups/​{subcloud-peer-group}​/audit + +**Normal response codes** + +200 + +**Error response codes** + +badRequest (400), unauthorized (401), forbidden (403), badMethod (405), +HTTPUnprocessableEntity (422), internalServerError (500), +serviceUnavailable (503) + +**Request parameters** + +.. rest_parameters:: parameters.yaml + + - peer_uuid: peer_uuid + - id: subcloud_peer_group_id + - peer_group_name: subcloud_peer_group_name + - group_priority: subcloud_peer_group_priority + - group_state: subcloud_peer_group_administrative_state + - max_subcloud_rehoming: subcloud_peer_group_max_subcloud_rehoming + - system_leader_id: subcloud_peer_group_system_leader_id + - system_leader_name: subcloud_peer_group_system_leader_name + - migration_status: subcloud_peer_group_migration_status + +Request Example +---------------- +.. literalinclude:: samples/subcloud-peer-groups/subcloud-peer-group-audit-request.json + :language: json + +**Response parameters** + +.. rest_parameters:: parameters.yaml + + - message: subcloud_peer_group_audit_error_message + +Response Example +---------------- + +.. literalinclude:: samples/subcloud-peer-groups/subcloud-peer-group-audit-response.json + :language: json + + ---------------------- Peer Group Association ---------------------- diff --git a/api-ref/source/parameters.yaml b/api-ref/source/parameters.yaml index ca14f7b39..4f17780d9 100644 --- a/api-ref/source/parameters.yaml +++ b/api-ref/source/parameters.yaml @@ -727,6 +727,13 @@ subcloud_peer_group_administrative_state: in: body required: false type: string +subcloud_peer_group_audit_error_message: + description: | + The error message from a peer DC's peer group + audit result. + in: body + required: true + type: string subcloud_peer_group_id: description: | The ID of the subcloud peer group associated with this object. @@ -739,6 +746,12 @@ subcloud_peer_group_max_subcloud_rehoming: in: body required: false type: integer +subcloud_peer_group_migration_status: + description: | + The migration status of the subcloud peer group. + in: body + required: false + type: string subcloud_peer_group_name: description: | The NAME of the subcloud peer group. 
diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-request.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-request.json new file mode 100644 index 000000000..a926aa50a --- /dev/null +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-request.json @@ -0,0 +1,9 @@ +{ + "peer_group_name": "dc1-pg", + "group_priority": 0, + "group_state": "enabled", + "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", + "system_leader_name": "dc1-name", + "migration_status": null, + "peer_uuid": "ac62f555-9386-42f1-b3a1-51ecb709409d" +} \ No newline at end of file diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-response.json new file mode 100644 index 000000000..5eb6d677e --- /dev/null +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-audit-response.json @@ -0,0 +1 @@ +{"message": "error message"} \ No newline at end of file diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-get-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-get-response.json index dc9dd01a8..53ab6d399 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-get-response.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-get-response.json @@ -6,6 +6,7 @@ "max_subcloud_rehoming": 10, "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", + "migration_status": null, "created-at": "2023-07-26 00:51:01.396694", "updated-at": "2023-07-26 00:57:35.941816" } \ No newline at end of file diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-request.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-request.json index 2783263bb..4ab092b86 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-request.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-request.json @@ -6,6 +6,5 @@ "max_subcloud_rehoming": 10, "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", - "created-at": "2023-07-26 00:51:01.396694", - "updated-at": "2023-08-07 06:09:04.086417" + "migration_status": null } \ No newline at end of file diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-response.json index 2783263bb..9e07c80a5 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-response.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-group-patch-response.json @@ -6,6 +6,7 @@ "max_subcloud_rehoming": 10, "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", + "migration_status": null, "created-at": "2023-07-26 00:51:01.396694", "updated-at": "2023-08-07 06:09:04.086417" } \ No newline at end of file diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-get-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-get-response.json index ac59c05c1..526d9d9f5 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-get-response.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-get-response.json @@ -7,6 +7,7 @@ "max_subcloud_rehoming": 10, "system_leader_id": 
"ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", + "migration_status": null, "created-at": "2023-07-26 00:51:01.396694", "updated-at": "2023-08-07 06:09:04.086417" }] diff --git a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-post-response.json b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-post-response.json index 38c67a477..f0399a3b3 100644 --- a/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-post-response.json +++ b/api-ref/source/samples/subcloud-peer-groups/subcloud-peer-groups-post-response.json @@ -6,6 +6,7 @@ "max_subcloud_rehoming": 10, "system_leader_id": "ac62f555-9386-42f1-b3a1-51ecb709409d", "system_leader_name": "dc1-name", + "migration_status": null, "created-at": "2023-08-07 06:13:52.664047", "updated-at": null } \ No newline at end of file diff --git a/distributedcloud/dccommon/consts.py b/distributedcloud/dccommon/consts.py index d4a2bb2f4..9affaea59 100644 --- a/distributedcloud/dccommon/consts.py +++ b/distributedcloud/dccommon/consts.py @@ -151,6 +151,10 @@ SSL_CERT_CA_DIR = "/etc/pki/ca-trust/source/anchors/" RVMC_NAME_PREFIX = 'rvmc' RVMC_CONFIG_FILE_NAME = 'rvmc-config.yaml' +# Required for GEO-redundancy +# User-Agent check for subcloud by region_name request. +DCMANAGER_V1_HTTP_AGENT = 'dcmanager/1.0' + # Subcloud installation values BMC_INSTALL_VALUES = [ 'bmc_username', diff --git a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py index a6518b0df..22d527d14 100644 --- a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py +++ b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py @@ -37,11 +37,11 @@ class DcmanagerClient(base.DriverBase): """Get subcloud.""" if subcloud_ref is None: raise ValueError("subcloud_ref is required.") - url = f"{self.endpoint}/subclouds/{subcloud_ref}" + url = f"{self.endpoint}/subclouds/{subcloud_ref}/detail" headers = {"X-Auth-Token": self.token} if is_region_name: - headers["User-Agent"] = "dcmanager/1.0" + headers["User-Agent"] = consts.DCMANAGER_V1_HTTP_AGENT response = requests.get(url, headers=headers, timeout=self.timeout) if response.status_code == 200: @@ -221,7 +221,31 @@ class DcmanagerClient(base.DriverBase): LOG.error(message) raise Exception(message) - def update_subcloud(self, subcloud_ref, files, data): + def audit_subcloud_peer_group(self, peer_group_ref, **kwargs): + """Audit the subcloud peer group.""" + if peer_group_ref is None: + raise ValueError("peer_group_ref is required.") + url = f"{self.endpoint}/subcloud-peer-groups/{peer_group_ref}/audit" + + headers = {"X-Auth-Token": self.token, + "Content-Type": "application/json"} + response = requests.patch(url, json=kwargs, headers=headers, + timeout=self.timeout) + + if response.status_code == 200: + return response.json() + else: + if response.status_code == 404 and \ + 'Subcloud Peer Group not found' in response.text: + raise exceptions.SubcloudPeerGroupNotFound( + peer_group_ref=peer_group_ref) + message = "Audit Subcloud Peer Group: peer_group_ref %s, %s, " \ + "failed with RC: %d" % (peer_group_ref, kwargs, + response.status_code) + LOG.error(message) + raise Exception(message) + + def update_subcloud(self, subcloud_ref, files, data, is_region_name=False): """Update the subcloud.""" if subcloud_ref is None: raise ValueError("subcloud_ref is required.") @@ -237,6 +261,10 @@ class DcmanagerClient(base.DriverBase): enc = MultipartEncoder(fields=fields) headers = {"X-Auth-Token": self.token, 
"Content-Type": enc.content_type} + # Add header to flag the request is from another DC, + # server will treat subcloud_ref as a region_name + if is_region_name: + headers["User-Agent"] = consts.DCMANAGER_V1_HTTP_AGENT response = requests.patch(url, headers=headers, data=enc, timeout=self.timeout) diff --git a/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py b/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py index 79c3e7e1b..2f6da213b 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py +++ b/distributedcloud/dcmanager/api/controllers/v1/peer_group_association.py @@ -227,6 +227,8 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): if sync_enabled: # Sync the subcloud peer group to peer site self.rpc_client.sync_subcloud_peer_group(context, association.id) + else: + self.rpc_client.peer_monitor_notify(context) return db_api.peer_group_association_db_model_to_dict(association) except RemoteError as e: pecan.abort(httpclient.UNPROCESSABLE_ENTITY, e.value) @@ -343,8 +345,10 @@ class PeerGroupAssociationsController(restcomm.GenericPathController): sync_disabled = association.sync_status == consts.\ ASSOCIATION_SYNC_STATUS_DISABLED if sync_disabled: - return db_api.peer_group_association_destroy(context, - association_id) + result = db_api.peer_group_association_destroy(context, + association_id) + self.rpc_client.peer_monitor_notify(context) + return result # Ask system-peer-manager to delete the association. # It will do all the real work... return self.rpc_client.delete_peer_group_association( diff --git a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py index 588de2538..ad7769466 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py @@ -263,6 +263,12 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): system_leader_id = payload.get('system-leader-id') system_leader_name = payload.get('system-leader-name') max_subcloud_rehoming = payload.get('max-subcloud-rehoming') + if 'migration_status' in payload: + migration_status = payload.get('migration_status') + if migration_status is None: + migration_status = consts.PEER_GROUP_MIGRATION_NONE + else: + migration_status = None if not ( peer_group_name @@ -271,6 +277,7 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): or system_leader_id or system_leader_name or max_subcloud_rehoming + or migration_status ): pecan.abort(httpclient.BAD_REQUEST, _('nothing to update')) @@ -294,6 +301,12 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): not self._validate_system_leader_name(system_leader_name)): pecan.abort(httpclient.BAD_REQUEST, _('Invalid system-leader-name')) + if (migration_status and + migration_status.lower() not in [consts.PEER_GROUP_MIGRATING, + consts.PEER_GROUP_MIGRATION_COMPLETE, + consts.PEER_GROUP_MIGRATION_NONE]): + pecan.abort(httpclient.BAD_REQUEST, + _('Invalid migration_status')) try: updated_peer_group = db_api.subcloud_peer_group_update( @@ -304,7 +317,8 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): group_state=group_state, max_subcloud_rehoming=max_subcloud_rehoming, system_leader_id=system_leader_id, - system_leader_name=system_leader_name) + system_leader_name=system_leader_name, + migration_status=migration_status) return 
db_api.subcloud_peer_group_db_model_to_dict(updated_peer_group) except RemoteError as e: pecan.abort(httpclient.UNPROCESSABLE_ENTITY, e.value) @@ -403,6 +417,47 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController): "Unable to batch migrate peer group %s" % group.peer_group_name) pecan.abort(500, _('Unable to batch migrate ' 'peer group %s' % group.peer_group_name)) + elif verb == 'audit': + payload = json.loads(request.body) + if 'peer_uuid' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing peer_uuid' % + group.peer_group_name)) + if 'peer_group_name' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing peer_group_name' % + group.peer_group_name)) + if 'group_priority' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing group_priority' % + group.peer_group_name)) + if 'group_state' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing group_state' % + group.peer_group_name)) + if 'system_leader_id' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing system_leader_id' % + group.peer_group_name)) + if 'system_leader_name' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing system_leader_name' % + group.peer_group_name)) + if 'migration_status' not in payload: + pecan.abort(400, _('Unable to audit peer group ' + '%s, missing migration_status' % + group.peer_group_name)) + try: + msg = self.rpc_client.peer_group_audit_notify( + context, + group.peer_group_name, + payload) + return {"message": msg} + except Exception: + LOG.exception('Unable to audit peer group %s' % + group.peer_group_name) + pecan.abort(500, _('Unable to audit peer group %s' % + group.peer_group_name)) else: pecan.abort(400, _('Invalid request')) diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py index 95ceb36bd..20b88afa8 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py @@ -423,12 +423,13 @@ class SubcloudsController(object): pecan.abort(404, _('Subcloud not found')) else: try: - # When the request comes from the cert-monitor, it is - # based on the region name (which is UUID format). + # When the request comes from the cert-monitor or another + # DC, it is based on the region name (which is UUID format). # Whereas, if the request comes from a client other # than cert-monitor, it will do the lookup based on # the subcloud name. - if utils.is_req_from_cert_mon_agent(request): + if (utils.is_req_from_cert_mon_agent(request) or + utils.is_req_from_another_dc(request)): subcloud = db_api.\ subcloud_get_by_region_name(context, subcloud_ref) else: @@ -573,12 +574,13 @@ class SubcloudsController(object): pecan.abort(404, _('Subcloud not found')) else: try: - # When the request comes from the cert-monitor, it is - # based on the region name (which is UUID format). + # When the request comes from the cert-monitor or another DC, + # it is based on the region name (which is UUID format). # Whereas, if the request comes from a client other # than cert-monitor, it will do the lookup based on # the subcloud name. 
- if utils.is_req_from_cert_mon_agent(request): + if (utils.is_req_from_cert_mon_agent(request) or + utils.is_req_from_another_dc(request)): subcloud = db_api.\ subcloud_get_by_region_name(context, subcloud_ref) else: diff --git a/distributedcloud/dcmanager/api/policies/subcloud_peer_group.py b/distributedcloud/dcmanager/api/policies/subcloud_peer_group.py index 83bbdf4cc..14eda80c9 100644 --- a/distributedcloud/dcmanager/api/policies/subcloud_peer_group.py +++ b/distributedcloud/dcmanager/api/policies/subcloud_peer_group.py @@ -75,6 +75,11 @@ _subcloud_peer_groups_rules = [ { 'method': 'PATCH', 'path': '/v1.0/subcloud-peer-groups/{subcloud_peer_group}/migrate' + }, + # Trigger a peer group audit + { + 'method': 'PATCH', + 'path': '/v1.0/subcloud-peer-groups/{subcloud_peer_group}/audit' } ] ) diff --git a/distributedcloud/dcmanager/common/consts.py b/distributedcloud/dcmanager/common/consts.py index 50c3c3e38..57af3dd43 100644 --- a/distributedcloud/dcmanager/common/consts.py +++ b/distributedcloud/dcmanager/common/consts.py @@ -443,6 +443,10 @@ OS_REGION_NAME = "OS_REGION_NAME" STATES_FOR_SUBCLOUD_RENAME = [DEPLOY_STATE_DONE, PRESTAGE_STATE_COMPLETE] +# Required for GEO-redundancy +# User-Agent check for subcloud by region_name request. +DCMANAGER_V1_HTTP_AGENT = 'dcmanager/1.0' + # batch rehome manage state wait timeout BATCH_REHOME_MGMT_STATES_TIMEOUT = 900 @@ -450,8 +454,16 @@ BATCH_REHOME_MGMT_STATES_TIMEOUT = 900 SYSTEM_PEER_HEARTBEAT_STATUS_ALIVE = 'alive' SYSTEM_PEER_HEARTBEAT_STATUS_FAILURE = 'failure' +# Peer group migration status +PEER_GROUP_MIGRATING = 'migrating' +PEER_GROUP_MIGRATION_COMPLETE = 'complete' +PEER_GROUP_MIGRATION_NONE = 'none' + # Peer group association sync status ASSOCIATION_SYNC_STATUS_SYNCING = 'syncing' ASSOCIATION_SYNC_STATUS_SYNCED = 'synced' ASSOCIATION_SYNC_STATUS_FAILED = 'failed' ASSOCIATION_SYNC_STATUS_DISABLED = 'disabled' + +# Peer monitor heartbeat policy +HEARTBEAT_FAILURE_POLICY_ALARM = 'alarm' diff --git a/distributedcloud/dcmanager/common/utils.py b/distributedcloud/dcmanager/common/utils.py index ae40c5f04..e6e3d9401 100644 --- a/distributedcloud/dcmanager/common/utils.py +++ b/distributedcloud/dcmanager/common/utils.py @@ -1414,3 +1414,26 @@ def yaml_safe_load(contents, content_type): pecan.abort(400, _(msg)) return data + + +# Feature: Subcloud Name Reconfiguration +# This method is useful to determine the origin of the request +# towards the api. 
+def is_req_from_another_dc(request): + ua = request.headers.get("User-Agent") + if ua == consts.DCMANAGER_V1_HTTP_AGENT: + return True + else: + return False + + +def get_local_system(): + m_ks_client = OpenStackDriver( + region_name=dccommon_consts.DEFAULT_REGION_NAME, + region_clients=None).keystone_client + endpoint = m_ks_client.endpoint_cache.get_endpoint('sysinv') + sysinv_client = SysinvClient(dccommon_consts.DEFAULT_REGION_NAME, + m_ks_client.session, + endpoint=endpoint) + system = sysinv_client.get_system() + return system diff --git a/distributedcloud/dcmanager/db/api.py b/distributedcloud/dcmanager/db/api.py index e782e8f7c..aa07490d0 100644 --- a/distributedcloud/dcmanager/db/api.py +++ b/distributedcloud/dcmanager/db/api.py @@ -434,14 +434,14 @@ def peer_group_get_for_system_peer(context, peer_id): def system_peer_update(context, peer_id, - peer_uuid, peer_name, - endpoint, username, password, - gateway_ip, - administrative_state, - heartbeat_interval, - heartbeat_failure_threshold, - heartbeat_failure_policy, - heartbeat_maintenance_timeout, + peer_uuid=None, peer_name=None, + endpoint=None, username=None, password=None, + gateway_ip=None, + administrative_state=None, + heartbeat_interval=None, + heartbeat_failure_threshold=None, + heartbeat_failure_policy=None, + heartbeat_maintenance_timeout=None, heartbeat_status=None): """Update the system peer or raise if it does not exist.""" return IMPL.system_peer_update(context, peer_id, @@ -473,13 +473,16 @@ def subcloud_peer_group_db_model_to_dict(subcloud_peer_group): "max_subcloud_rehoming": subcloud_peer_group.max_subcloud_rehoming, "system_leader_id": subcloud_peer_group.system_leader_id, "system_leader_name": subcloud_peer_group.system_leader_name, + "migration_status": subcloud_peer_group.migration_status, "created-at": subcloud_peer_group.created_at, "updated-at": subcloud_peer_group.updated_at} return result -def subcloud_peer_group_create(context, peer_group_name, group_priority, group_state, - max_subcloud_rehoming, system_leader_id, system_leader_name): +def subcloud_peer_group_create(context, peer_group_name, group_priority, + group_state, max_subcloud_rehoming, + system_leader_id, system_leader_name, + migration_status=None): """Create a subcloud_peer_group.""" return IMPL.subcloud_peer_group_create(context, peer_group_name, @@ -487,7 +490,8 @@ def subcloud_peer_group_create(context, peer_group_name, group_priority, group_s group_state, max_subcloud_rehoming, system_leader_id, - system_leader_name) + system_leader_name, + migration_status) def subcloud_peer_group_destroy(context, group_id): @@ -523,9 +527,12 @@ def subcloud_peer_group_get_all(context): return IMPL.subcloud_peer_group_get_all(context) -def subcloud_peer_group_update(context, group_id, peer_group_name, group_priority, - group_state, max_subcloud_rehoming, system_leader_id, - system_leader_name): +def subcloud_peer_group_update(context, group_id, peer_group_name=None, + group_priority=None, group_state=None, + max_subcloud_rehoming=None, + system_leader_id=None, + system_leader_name=None, + migration_status=None): """Update the subcloud peer group or raise if it does not exist.""" return IMPL.subcloud_peer_group_update(context, group_id, @@ -534,7 +541,8 @@ def subcloud_peer_group_update(context, group_id, peer_group_name, group_priorit group_state, max_subcloud_rehoming, system_leader_id, - system_leader_name) + system_leader_name, + migration_status) ################### diff --git a/distributedcloud/dcmanager/db/sqlalchemy/api.py 
b/distributedcloud/dcmanager/db/sqlalchemy/api.py index 3de3c10de..f28c66ea8 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/api.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/api.py @@ -1170,7 +1170,8 @@ def subcloud_peer_group_create(context, group_state, max_subcloud_rehoming, system_leader_id, - system_leader_name): + system_leader_name, + migration_status): with write_session() as session: subcloud_peer_group_ref = models.SubcloudPeerGroup() subcloud_peer_group_ref.peer_group_name = peer_group_name @@ -1179,6 +1180,7 @@ def subcloud_peer_group_create(context, subcloud_peer_group_ref.max_subcloud_rehoming = max_subcloud_rehoming subcloud_peer_group_ref.system_leader_id = system_leader_id subcloud_peer_group_ref.system_leader_name = system_leader_name + subcloud_peer_group_ref.migration_status = migration_status session.add(subcloud_peer_group_ref) return subcloud_peer_group_ref @@ -1198,7 +1200,8 @@ def subcloud_peer_group_update(context, group_state=None, max_subcloud_rehoming=None, system_leader_id=None, - system_leader_name=None): + system_leader_name=None, + migration_status=None): with write_session() as session: subcloud_peer_group_ref = subcloud_peer_group_get(context, group_id) if peer_group_name is not None: @@ -1213,6 +1216,11 @@ def subcloud_peer_group_update(context, subcloud_peer_group_ref.system_leader_id = system_leader_id if system_leader_name is not None: subcloud_peer_group_ref.system_leader_name = system_leader_name + if migration_status is not None: + if str(migration_status).lower() == 'none': + subcloud_peer_group_ref.migration_status = None + else: + subcloud_peer_group_ref.migration_status = migration_status subcloud_peer_group_ref.save(session) return subcloud_peer_group_ref ########################## diff --git a/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py b/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py index af2bedccd..800a1132b 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/014_add_subcloud_peer_group_and_association.py @@ -29,6 +29,7 @@ def upgrade(migrate_engine): sqlalchemy.Column('system_leader_id', sqlalchemy.String(255)), sqlalchemy.Column('system_leader_name', sqlalchemy.String(255)), sqlalchemy.Column('max_subcloud_rehoming', sqlalchemy.Integer), + sqlalchemy.Column('migration_status', sqlalchemy.String(255)), sqlalchemy.Column('reserved_1', sqlalchemy.Text), sqlalchemy.Column('reserved_2', sqlalchemy.Text), sqlalchemy.Column('created_at', sqlalchemy.DateTime), diff --git a/distributedcloud/dcmanager/db/sqlalchemy/models.py b/distributedcloud/dcmanager/db/sqlalchemy/models.py index 51815c4b7..6972b4162 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/models.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/models.py @@ -144,6 +144,7 @@ class SubcloudPeerGroup(BASE, DCManagerBase): max_subcloud_rehoming = Column(Integer) system_leader_id = Column(String(255)) system_leader_name = Column(String(255)) + migration_status = Column(String(255)) class PeerGroupAssociation(BASE, DCManagerBase): diff --git a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py new file mode 100644 index 000000000..5ffd5a5c4 --- /dev/null +++ b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py @@ -0,0 
+1,313 @@ +# +# Copyright (c) 2023 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +import threading + +from oslo_config import cfg +from oslo_log import log as logging + + +from fm_api import constants as fm_const +from fm_api import fm_api + +from dccommon import consts as dccommon_consts +from dcmanager.common import consts +from dcmanager.common import context +from dcmanager.common.i18n import _ +from dcmanager.common import manager +from dcmanager.common import utils +from dcmanager.db import api as db_api +from dcmanager.manager.system_peer_manager import SystemPeerManager + + +CONF = cfg.CONF +LOG = logging.getLogger(__name__) + + +class PeerGroupAuditManager(manager.Manager): + """Manages audit related tasks.""" + def __init__(self, subcloud_manager, peer_group_id, *args, **kwargs): + LOG.debug(_('PeerGroupAuditManager initialization...')) + super().__init__(service_name="peer_group_audit_manager", + *args, **kwargs) + self.context = context.get_admin_context() + self.fm_api = fm_api.FaultAPIs() + self.subcloud_manager = subcloud_manager + self.peer_group_id = peer_group_id + self.require_audit_flag = True + self.thread = None + self.thread_lock = threading.Lock() + + def _get_subclouds_by_peer_group_from_system_peer(self, + system_peer, + peer_group_name): + try: + dc_client = SystemPeerManager.get_peer_dc_client(system_peer) + subclouds = dc_client.get_subcloud_list_by_peer_group( + peer_group_name) + return subclouds + except Exception: + LOG.exception("Failed to get subclouds of peer group %s " + "from DC: %s" % + (peer_group_name, system_peer.peer_name)) + + def _update_remote_peer_group_migration_status(self, + system_peer, + peer_group_name, + migration_status): + dc_client = SystemPeerManager.get_peer_dc_client(system_peer) + peer_group_kwargs = { + 'migration_status': migration_status + } + dc_client.update_subcloud_peer_group(peer_group_name, + **peer_group_kwargs) + LOG.info("Updated Subcloud Peer Group %s on " + "peer site %s, set migration_status to: %s" % + (peer_group_name, system_peer.peer_name, migration_status)) + + def _get_local_subclouds_to_update(self, + local_peer_group, + remote_subclouds): + local_subclouds_to_update = list() + remote_managed_subcloud_region_names = list() + local_subclouds = db_api.subcloud_get_for_peer_group( + self.context, local_peer_group.id) + + # get the 'managed+online' remote subclouds + for remote_subcloud in remote_subclouds: + if (remote_subcloud.get('management-state') == + dccommon_consts.MANAGEMENT_MANAGED and + remote_subcloud.get('availability-status') == + dccommon_consts.AVAILABILITY_ONLINE): + remote_managed_subcloud_region_names.append( + remote_subcloud.get('region-name')) + + # Compare with the 'non-secondary' local subclouds + for local_subcloud in local_subclouds: + if local_subcloud.region_name in \ + remote_managed_subcloud_region_names \ + and not utils.subcloud_is_secondary_state( + local_subcloud.deploy_status): + + local_subclouds_to_update.append(local_subcloud) + + return local_subclouds_to_update + + def audit(self, system_peer, remote_peer_group, local_peer_group): + if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING: + LOG.info("Local peer group in migrating state, quit audit") + return + + LOG.info("Auditing remote subcloud peer group:[%s] " + "migration_status:[%s] group_priority[%s], " + "local subcloud peer group:[%s] " + "migration_status:[%s] group_priority[%s]" % + (remote_peer_group.get("peer_group_name"), + remote_peer_group.get("migration_status"), + 
remote_peer_group.get("group_priority"), + local_peer_group.peer_group_name, + local_peer_group.migration_status, + local_peer_group.group_priority)) + + # if remote subcloud peer group's migration_status is 'migrating', + # unmanage all local subclouds in the local peer group + if remote_peer_group.get("migration_status") == \ + consts.PEER_GROUP_MIGRATING: + # Unmanage all local subclouds of the peer group + LOG.info("Unmanaging all local subclouds of peer group %s " + "since remote is in migrating state" % + local_peer_group.peer_group_name) + subclouds = db_api.subcloud_get_for_peer_group(self.context, + local_peer_group.id) + for subcloud in subclouds: + try: + if subcloud.management_state != \ + dccommon_consts.MANAGEMENT_UNMANAGED: + self.subcloud_manager.update_subcloud( + self.context, + subcloud.id, + management_state=dccommon_consts. + MANAGEMENT_UNMANAGED) + except Exception as e: + LOG.exception("Failed to unmanage local subcloud %s, err: " + "%s" % (subcloud.name, e)) + raise e + self.require_audit_flag = False + + # if remote subcloud peer group's migration_status is 'complete', + # get remote subclouds; for 'managed+online' subclouds, + # set the local subclouds with the same names to 'unmanaged+secondary' + elif remote_peer_group.get("migration_status") == \ + consts.PEER_GROUP_MIGRATION_COMPLETE: + remote_subclouds = \ + self._get_subclouds_by_peer_group_from_system_peer( + system_peer, + remote_peer_group.get("peer_group_name")) + + if not remote_subclouds: + LOG.error("No subclouds in remote DC:%s's peer group %s" % + (system_peer.peer_name, + remote_peer_group.get("peer_group_name"))) + return + local_subclouds_to_update = \ + self._get_local_subclouds_to_update(local_peer_group, + remote_subclouds) + + for subcloud in local_subclouds_to_update: + try: + LOG.info("Set secondary to local subcloud %s" % + subcloud.name) + # There will be an exception when unmanaging + # a subcloud already in the 'unmanaged' state. + if subcloud.management_state != \ + dccommon_consts.MANAGEMENT_UNMANAGED: + self.subcloud_manager.update_subcloud( + self.context, + subcloud.id, + management_state=dccommon_consts. + MANAGEMENT_UNMANAGED) + self.subcloud_manager.update_subcloud( + self.context, + subcloud.id, + deploy_status=consts.DEPLOY_STATE_SECONDARY) + except Exception as e: + LOG.exception("Failed to update local non-secondary " + "and offline subcloud [%s], err: %s" % + (subcloud.name, e)) + raise e + + if local_subclouds_to_update: + self._clear_or_raise_alarm(system_peer, + local_peer_group, + remote_peer_group) + db_api.subcloud_peer_group_update( + self.context, + local_peer_group.id, + system_leader_id=system_peer.peer_uuid, + system_leader_name=system_peer.peer_name) + + self._update_remote_peer_group_migration_status( + system_peer, + remote_peer_group.get("peer_group_name"), + None) + self.require_audit_flag = False + else: + # If remote peer group migration_status is 'None' + self.require_audit_flag = False + + def _clear_or_raise_alarm(self, + system_peer, + local_peer_group, + remote_peer_group): + # If the local subcloud peer group's group_priority is + # lower than the remote subcloud peer group's group_priority, + # an alarm will be raised.
+ # A lower number means a higher priority. + entity_instance_id = "peer_group=%s,peer=%s" % \ + (local_peer_group.peer_group_name, system_peer.peer_uuid) + if local_peer_group.group_priority < remote_peer_group.get('group_priority'): + LOG.warning("Alarm: local subcloud peer group [%s] " + "is managed by remote system [%s]" % + (local_peer_group.peer_group_name, + system_peer.peer_name)) + try: + fault = fm_api.Fault( + alarm_id=fm_const. + FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED, + alarm_state=fm_const.FM_ALARM_STATE_SET, + entity_type_id=fm_const. + FM_ENTITY_TYPE_SUBCLOUD_PEER_GROUP, + entity_instance_id=entity_instance_id, + severity=fm_const.FM_ALARM_SEVERITY_MAJOR, + reason_text=("Subcloud peer group " + "(peer_group_name=%s) " + "is managed by remote " + "system (peer_uuid=%s) " + "with a lower priority." % + (local_peer_group.peer_group_name, + system_peer.peer_uuid)), + alarm_type=fm_const.FM_ALARM_TYPE_0, + probable_cause=fm_const. + ALARM_PROBABLE_CAUSE_UNKNOWN, + proposed_repair_action="Check the reported peer group " + "state. Migrate it back to the current system if the " + "state is 'rehomed' and the current system is stable. " + "Otherwise, wait until these conditions are met.", + service_affecting=False) + self.fm_api.set_fault(fault) + except Exception as e: + LOG.exception(e) + else: + try: + fault = self.fm_api.get_fault( + fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED, + entity_instance_id) + if fault: + LOG.info("Clear alarm: %s" % entity_instance_id) + self.fm_api.clear_fault( + fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED, + entity_instance_id) + except Exception: + LOG.exception( + "Problem clearing fault [%s], alarm_id=%s" % + (entity_instance_id, + fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED)) + + def _do_audit(self, system_peer, remote_peer_group, local_peer_group): + with self.thread_lock: + try: + self.audit(system_peer, remote_peer_group, local_peer_group) + except Exception as e: + LOG.exception("audit error occurred: %s" % e) + + def stop(self): + self.thread.join() + LOG.info("stopped peer group %s audit thread" % self.peer_group_id) + + def start(self, system_peer, remote_peer_group, local_peer_group): + if self.thread_lock.locked(): + LOG.warning('Audit thread for %s has already started' % + local_peer_group.peer_group_name) + else: + self.thread = threading.Thread( + target=self._do_audit, + args=(system_peer, remote_peer_group, local_peer_group)) + self.thread.start() + + def audit_peer_group_from_system(self, + system_peer, + remote_peer_group, + local_peer_group): + LOG.info("Audit peer group [%s] with remote system %s" % + (local_peer_group.peer_group_name, system_peer.peer_name)) + self.start(system_peer, remote_peer_group, local_peer_group) + + @staticmethod + def send_audit_peer_group(system_peers, peer_group): + if not system_peers: + return + local_system = utils.get_local_system() + for system in system_peers: + try: + dc_client = SystemPeerManager.get_peer_dc_client(system) + payload = db_api.subcloud_peer_group_db_model_to_dict( + peer_group) + if 'created-at' in payload: + del payload['created-at'] + if 'updated-at' in payload: + del payload['updated-at'] + payload['peer_uuid'] = local_system.uuid + LOG.info("Send audit payload [%s] of peer group %s" % + (payload, peer_group.peer_group_name)) + response = dc_client.audit_subcloud_peer_group( + peer_group.peer_group_name, + **payload) + if response: + return response + except Exception: + LOG.exception("Failed to send audit request for peer group %s " + "to
DC: %s" % + (peer_group.peer_group_name, system.peer_name)) diff --git a/distributedcloud/dcmanager/manager/peer_monitor_manager.py b/distributedcloud/dcmanager/manager/peer_monitor_manager.py new file mode 100644 index 000000000..05f6caa05 --- /dev/null +++ b/distributedcloud/dcmanager/manager/peer_monitor_manager.py @@ -0,0 +1,316 @@ +# +# Copyright (c) 2023 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +from collections import defaultdict +import threading + +from fm_api import constants as fm_const +from fm_api import fm_api +from oslo_config import cfg +from oslo_log import log as logging + +from dcmanager.common import consts +from dcmanager.common import context +from dcmanager.common import manager +from dcmanager.db import api as db_api +from dcmanager.manager import peer_group_audit_manager as pgam +from dcmanager.manager.system_peer_manager import SystemPeerManager + + +CONF = cfg.CONF +LOG = logging.getLogger(__name__) + + +class PeerMonitor(object): + def __init__(self, peer, context, subcloud_manager): + self.peer = peer + self.thread = None + self.exit_flag = threading.Event() + self.fm_api = fm_api.FaultAPIs() + self.context = context + self.subcloud_manager = subcloud_manager + self.peer_group_id_set = set() + # key: peer_group_id + # value: PeerGroupAuditManager object + self.peer_group_audit_obj_map = dict() + + def _clear_failure(self): + alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED + entity_instance_id = "peer=%s" % self.peer.peer_uuid + try: + fault = self.fm_api.get_fault(alarm_id, entity_instance_id) + if fault: + self.fm_api.clear_fault(alarm_id, entity_instance_id) + except Exception as e: + LOG.exception( + "Problem clearing fault for peer %s, alarm_id=%s " + "error: %s" % (self.peer.peer_uuid, alarm_id, e)) + + def _raise_failure(self): + alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED + entity_instance_id = "peer=%s" % self.peer.peer_uuid + reason_text = ("Peer %s (peer_uuid=%s) connections in " + "disconnected state." % (self.peer.peer_name, + self.peer.peer_uuid)) + severity = fm_const.FM_ALARM_SEVERITY_MAJOR + + peer_groups = db_api.subcloud_peer_group_get_by_leader_id( + self.context, self.peer.peer_uuid) + if len(peer_groups) > 0: + peer_group_names = [peer_group.peer_group_name + for peer_group in peer_groups] + reason_text = ("Peer %s (peer_uuid=%s) is in disconnected " + "state. The following subcloud peer groups " + "are impacted: %s." % + (self.peer.peer_name, self.peer.peer_uuid, + ", ".join(peer_group_names))) + severity = fm_const.FM_ALARM_SEVERITY_CRITICAL + + try: + fault = fm_api.Fault( + alarm_id=alarm_id, + alarm_state=fm_const.FM_ALARM_STATE_SET, + entity_type_id=fm_const.FM_ENTITY_TYPE_SYSTEM_PEER, + entity_instance_id=entity_instance_id, + severity=severity, + reason_text=reason_text, + alarm_type=fm_const.FM_ALARM_TYPE_1, + probable_cause=fm_const.ALARM_PROBABLE_CAUSE_UNKNOWN, + proposed_repair_action="Check the connectivity between " + "the current system and the reported peer site. 
If the " + "peer system is down, migrate the affected peer group(s) " + "to the current system for continued subcloud management.", + service_affecting=False) + + self.fm_api.set_fault(fault) + + except Exception as e: + LOG.exception( + "Problem setting fault for peer %s, alarm_id=%s, " + "error: %s" % (self.peer.peer_uuid, alarm_id, e)) + + def _heartbeat_check_via_get_peer_group_list(self): + """Check the heartbeat of the system peer.""" + failed = True + dc_peer_subcloud_peer_group_list = list() + try: + dc_client = SystemPeerManager.get_peer_dc_client(self.peer) + dc_peer_subcloud_peer_group_list = \ + dc_client.get_subcloud_peer_group_list() + failed = False + + if not dc_peer_subcloud_peer_group_list: + LOG.warning("No subcloud peer group found " + "on DC: %s" % self.peer.manager_endpoint) + + except Exception: + LOG.exception("Failed to access the dc: %s" % + self.peer.peer_name) + return failed, dc_peer_subcloud_peer_group_list + + def _do_monitor_peer(self): + failure_count = 0 + LOG.info("Start monitoring thread for peer %s" % + self.peer.peer_name) + # Do the actual peer monitoring. + while not self.exit_flag.wait(timeout=self.peer.heartbeat_interval): + try: + # Get system peer from DB + self.peer = db_api.system_peer_get(self.context, self.peer.id) + failed, remote_pg_list = \ + self._heartbeat_check_via_get_peer_group_list() + if failed: + failure_count += 1 + if failure_count >= self.peer.heartbeat_failure_threshold: + # heartbeat_failure_threshold reached. + LOG.warning("DC %s heartbeat failed, raising alarm" % + self.peer.peer_name) + self._raise_failure() + db_api.system_peer_update( + self.context, self.peer.id, + heartbeat_status=consts.SYSTEM_PEER_HEARTBEAT_STATUS_FAILURE) + failure_count = 0 + self._set_require_audit_flag_to_associated_peer_groups() + else: + failure_count = 0 + self._audit_local_peer_groups(remote_pg_list) + if self.peer.heartbeat_status != \ + consts.SYSTEM_PEER_HEARTBEAT_STATUS_ALIVE: + db_api.system_peer_update( + self.context, self.peer.id, + heartbeat_status=consts.SYSTEM_PEER_HEARTBEAT_STATUS_ALIVE) + LOG.info("DC %s back online, clear alarm" % + self.peer.peer_name) + self._clear_failure() + except Exception as e: + LOG.exception("Got exception monitoring peer %s, error: %s" % + (self.peer.peer_name, e)) + LOG.info("Caught graceful exit signal for peer monitor %s" % + self.peer.peer_name) + + def _audit_local_peer_groups(self, remote_pg_list): + # Generate a dict indexed by remote peer group name + remote_pg_dict = { + remote_peer_group.get("peer_group_name"): remote_peer_group + for remote_peer_group in remote_pg_list + } + + # Only audit peer groups existing on both sides + for peer_group_id, pgam_obj in self.peer_group_audit_obj_map.items(): + peer_group = db_api.subcloud_peer_group_get(self.context, + peer_group_id) + if peer_group.peer_group_name in remote_pg_dict: + remote_peer_group = remote_pg_dict[peer_group.peer_group_name] + # Audit when require_audit_flag is True or the + # remote peer group is in 'complete' state.
+ if (pgam_obj.require_audit_flag + or remote_peer_group.get("migration_status") + == consts.PEER_GROUP_MIGRATION_COMPLETE + ): + pgam_obj.audit_peer_group_from_system( + self.peer, remote_peer_group, peer_group) + else: + LOG.warning("Peer group %s not found on remote DC %s, " + "nothing to audit, sync operation needed" % + (peer_group.peer_group_name, self.peer.peer_name)) + + def _set_require_audit_flag_to_associated_peer_groups(self): + for pgam_obj in self.peer_group_audit_obj_map.values(): + pgam_obj.require_audit_flag = True + + def audit_specific_local_peer_group(self, peer_group, remote_peer_group): + msg = None + if peer_group.id in self.peer_group_audit_obj_map: + pgam_obj = self.peer_group_audit_obj_map[peer_group.id] + pgam_obj.audit(self.peer, remote_peer_group, peer_group) + else: + msg = ("No peer group %s found" % peer_group.peer_group_name) + return msg + + def _clean_peer_group_audit_threads(self): + for peer_group_id in self.peer_group_audit_obj_map: + pgam_obj = \ + self.peer_group_audit_obj_map[peer_group_id] + pgam_obj.stop() + self.peer_group_audit_obj_map.clear() + + def update_peer_group_id_set(self, peer_group_id_set): + removed_peer_groups = self.peer_group_id_set - peer_group_id_set + new_peer_groups = peer_group_id_set - self.peer_group_id_set + + # Destroy audit objects for removed peer groups + for peer_group_id in removed_peer_groups: + LOG.info("Peer group [%s] removed from peer [%s]" % + (peer_group_id, self.peer.peer_name)) + if peer_group_id in self.peer_group_audit_obj_map: + self.peer_group_audit_obj_map[peer_group_id].stop() + del self.peer_group_audit_obj_map[peer_group_id] + # Add audit objects for new peer groups + for peer_group_id in new_peer_groups: + LOG.info("New peer group [%s] found for peer [%s]" % + (peer_group_id, self.peer.peer_name)) + self.peer_group_audit_obj_map[peer_group_id] = \ + pgam.PeerGroupAuditManager(self.subcloud_manager, + peer_group_id) + self.peer_group_id_set = peer_group_id_set + + def start(self): + if self.thread is not None: + LOG.error('Peer monitor thread for %s has already started' % + self.peer.peer_name) + else: + self.thread = threading.Thread(target=self._do_monitor_peer) + self.thread.start() + + def stop(self): + self.exit_flag.set() + self.thread.join() + self._clear_failure() + self._clean_peer_group_audit_threads() + + +class PeerMonitorManager(manager.Manager): + """Manages tasks related to peer monitor.""" + def __init__(self, subcloud_manager): + LOG.debug('PeerMonitorManager initialization...') + + super(PeerMonitorManager, self).__init__( + service_name="peer_monitor_manager") + self.peer_monitor = dict() + self.context = context.get_admin_context() + self.subcloud_manager = subcloud_manager + + # key: system_peer_id + # value: PeerMonitor object + self.peer_monitor_thread_map = dict() + + def _remove_peer_monitor_task(self, system_peer_id): + peer_mon_obj = self.peer_monitor_thread_map[system_peer_id] + peer_mon_obj.stop() + del self.peer_monitor_thread_map[system_peer_id] + + def _create_peer_monitor_task(self, system_peer_id): + peer = db_api.system_peer_get(self.context, + system_peer_id) + LOG.info("Create monitoring thread for peer: %s" % + peer.peer_name) + self.peer_monitor_thread_map[system_peer_id] = PeerMonitor( + peer, self.context, self.subcloud_manager) + self.peer_monitor_thread_map[system_peer_id].start() + + @staticmethod + def _diff_dict(dict1, dict2): + return {key: value for key, value in dict1.items() if key not in dict2} + + def _create_or_destroy_peer_monitor_task(self, peer_system_peer_group_map): + new_peers =
self._diff_dict(peer_system_peer_group_map, + self.peer_monitor_thread_map) + removed_peers = self._diff_dict(self.peer_monitor_thread_map, + peer_system_peer_group_map) + for peer_id in new_peers: + self._create_peer_monitor_task(peer_id) + for peer_id in removed_peers: + self._remove_peer_monitor_task(peer_id) + + # Update peer_group_id set + for peer_id, pm_obj in self.peer_monitor_thread_map.items(): + pm_obj.update_peer_group_id_set( + peer_system_peer_group_map[peer_id]) + + def peer_monitor_notify(self, context): + LOG.info("Caught peer monitor notify...") + peer_system_peer_group_map = defaultdict(set) + # Get local associations + associations = db_api.peer_group_association_get_all(context) + for association in associations: + peer_system_peer_group_map[association.system_peer_id].add( + association.peer_group_id) + + self._create_or_destroy_peer_monitor_task(peer_system_peer_group_map) + + def peer_group_audit_notify(self, context, peer_group_name, payload): + LOG.info("Caught peer group audit notification for peer group %s" % + peer_group_name) + msg = None + try: + peer_group = db_api.subcloud_peer_group_get_by_name( + context, peer_group_name) + system_uuid = payload.get('peer_uuid') + system_peer = db_api.system_peer_get_by_uuid(context, + system_uuid) + if system_peer.id in self.peer_monitor_thread_map: + pmobj = self.peer_monitor_thread_map[system_peer.id] + msg = pmobj.audit_specific_local_peer_group(peer_group, + payload) + else: + msg = ("System peer with UUID=%s is not under monitoring. " + "Skipping audit for peer group %s" % + (system_uuid, peer_group_name)) + LOG.warning(msg) + return msg + except Exception as e: + LOG.exception('Handling peer group audit notify error: %s' % + str(e)) + return str(e) diff --git a/distributedcloud/dcmanager/manager/service.py b/distributedcloud/dcmanager/manager/service.py index a411dbc6a..2a00d5e68 100644 --- a/distributedcloud/dcmanager/manager/service.py +++ b/distributedcloud/dcmanager/manager/service.py @@ -32,6 +32,7 @@ from dcmanager.common import exceptions from dcmanager.common.i18n import _ from dcmanager.common import messaging as rpc_messaging from dcmanager.common import utils +from dcmanager.manager.peer_monitor_manager import PeerMonitorManager from dcmanager.manager.subcloud_manager import SubcloudManager from dcmanager.manager.system_peer_manager import SystemPeerManager @@ -85,11 +86,15 @@ class DCManagerService(service.Service): self.target = None self._rpc_server = None self.subcloud_manager = None + self.peer_monitor_manager = None + self.system_peer_manager = None self.audit_rpc_client = None + self.context = context.get_admin_context() def init_managers(self): self.subcloud_manager = SubcloudManager() - self.syspeer_manager = SystemPeerManager() + self.peer_monitor_manager = PeerMonitorManager(self.subcloud_manager) + self.system_peer_manager = SystemPeerManager(self.peer_monitor_manager) def start(self): utils.set_open_file_limit(cfg.CONF.worker_rlimit_nofile) @@ -110,6 +115,10 @@ class DCManagerService(service.Service): os.makedirs(dccommon_consts.ANSIBLE_OVERRIDES_PATH, 0o600, exist_ok=True) self.subcloud_manager.handle_subcloud_operations_in_progress() + + # Send notify to peer monitor. 
+ self.peer_monitor_manager.peer_monitor_notify(self.context) + super(DCManagerService, self).start() @run_in_thread @@ -302,19 +311,31 @@ class DCManagerService(service.Service): payload['peer_group']) return self.subcloud_manager.batch_migrate_subcloud(context, payload) + @request_context + def peer_monitor_notify(self, context): + LOG.info("Handling peer monitor notify") + return self.peer_monitor_manager.peer_monitor_notify(context) + + @request_context + def peer_group_audit_notify(self, context, peer_group_name, payload): + LOG.info("Handling peer group audit notify of peer group " + f"{peer_group_name}") + return self.peer_monitor_manager.peer_group_audit_notify( + context, peer_group_name, payload) + @request_context def sync_subcloud_peer_group(self, context, association_id, sync_subclouds=True, priority=None): LOG.info("Handling sync_subcloud_peer_group request for: %s", association_id) - return self.syspeer_manager.sync_subcloud_peer_group( + return self.system_peer_manager.sync_subcloud_peer_group( context, association_id, sync_subclouds, priority) @request_context def delete_peer_group_association(self, context, association_id): LOG.info("Handling delete_peer_group_association request for: %s", association_id) - return self.syspeer_manager.delete_peer_group_association( + return self.system_peer_manager.delete_peer_group_association( context, association_id) def _stop_rpc_server(self): diff --git a/distributedcloud/dcmanager/manager/subcloud_manager.py b/distributedcloud/dcmanager/manager/subcloud_manager.py index c340bd5db..2d3b52883 100644 --- a/distributedcloud/dcmanager/manager/subcloud_manager.py +++ b/distributedcloud/dcmanager/manager/subcloud_manager.py @@ -43,6 +43,7 @@ from dccommon import consts as dccommon_consts from dccommon.drivers.openstack.sdk_platform import OpenStackDriver from dccommon.drivers.openstack.sysinv_v1 import SysinvClient from dccommon.exceptions import PlaybookExecutionFailed +from dccommon.exceptions import SubcloudNotFound from dccommon import kubeoperator from dccommon.subcloud_install import SubcloudInstall from dccommon.subcloud_install import SubcloudShutdown @@ -61,6 +62,8 @@ from dcmanager.common import prestage from dcmanager.common import utils from dcmanager.db import api as db_api from dcmanager.db.sqlalchemy.models import Subcloud +from dcmanager.manager.peer_group_audit_manager import PeerGroupAuditManager +from dcmanager.manager.system_peer_manager import SystemPeerManager from dcmanager.rpc import client as dcmanager_rpc_client from dcorch.rpc import client as dcorch_rpc_client @@ -388,10 +391,20 @@ class SubcloudManager(manager.Manager): "-e", extra_vars] return rehome_command - def _migrate_manage_subcloud(self, context, payload, subcloud): + def _migrate_manage_subcloud( + self, context, payload, alive_system_peers, subcloud): + success = True + # Try to unmanage the subcloud on peer system + if alive_system_peers: + if self._unmanage_system_peer_subcloud(alive_system_peers, + subcloud): + success = False + LOG.warning("Failed to unmanage subcloud %s on peer system, " + "exiting migration" % subcloud.name) + return subcloud, success + # migrate and set managed for # online and complete subclouds.
- success = True
self.migrate_subcloud(context, subcloud.id, payload)
subcloud = db_api.subcloud_get(context, subcloud.id)
@@ -430,6 +443,100 @@
return subcloud, success
+ def _get_peer_system_list(self, peer_group):
+ system_peers = list()
+ # Get associations by peer group
+ associations = db_api.peer_group_association_get_by_peer_group_id(
+ self.context, peer_group.id)
+ if not associations:
+ LOG.info("No association found for peer group %s" %
+ peer_group.peer_group_name)
+ return system_peers
+ for association in associations:
+ system_peer = db_api.system_peer_get(
+ self.context, association.system_peer_id)
+ # Keep only 'alive' system peers
+ if system_peer.heartbeat_status != \
+ consts.SYSTEM_PEER_HEARTBEAT_STATUS_ALIVE:
+ LOG.warning("Peer system %s is offline, skipping" %
+ system_peer.peer_name)
+ continue
+ system_peers.append(system_peer)
+
+ return system_peers
+
+ def _unmanage_system_peer_subcloud(self, system_peers, subcloud):
+ unmanaged_error = False
+ for system_peer in system_peers:
+ LOG.debug("Get subcloud: %s from system_peer: %s" %
+ (subcloud.name, system_peer.peer_name))
+ for attempt in range(3):
+ try:
+ dc_client = \
+ SystemPeerManager.get_peer_dc_client(system_peer)
+ # Get remote subcloud by region_name from
+ # system peer
+ remote_subcloud = dc_client.get_subcloud(
+ subcloud.region_name, is_region_name=True)
+ if remote_subcloud.get('management-state') == \
+ dccommon_consts.MANAGEMENT_UNMANAGED:
+ LOG.info("Remote subcloud %s from system peer %s is "
+ "already unmanaged, skipping unmanage attempt"
+ % (remote_subcloud.get('name'),
+ system_peer.peer_name))
+ break
+
+ payload = {"management-state": "unmanaged"}
+ try:
+ remote_subcloud = dc_client.update_subcloud(
+ subcloud.region_name,
+ files=None,
+ data=payload,
+ is_region_name=True)
+ LOG.info("Successfully updated subcloud: %s on "
+ "peer system %s to unmanaged state"
+ % (remote_subcloud.get('name'),
+ system_peer.peer_name))
+ return unmanaged_error
+ except Exception as e:
+ raise exceptions.SubcloudNotUnmanaged() from e
+
+ except SubcloudNotFound:
+ LOG.info("No matching subcloud %s found on "
+ "peer system %s" %
+ (subcloud.region_name, system_peer.peer_name))
+ break
+ except exceptions.SubcloudNotUnmanaged:
+ LOG.exception("Failed to unmanage subcloud %s "
+ "on peer system %s" %
+ (subcloud.region_name,
+ system_peer.peer_name))
+ unmanaged_error = True
+ except Exception:
+ LOG.exception("Error while unmanaging "
+ "subcloud %s on peer system %s, attempt %d"
+ % (subcloud.region_name,
+ system_peer.peer_name, attempt))
+ time.sleep(1)
+ return unmanaged_error
+
+ def _clear_alarm_for_peer_group(self, peer_group):
+ # Get alarms related to peer group
+ faults = self.fm_api.get_faults_by_id(
+ fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED)
+ if not faults:
+ return
+ for fault in faults:
+ entity_instance_id_str = "peer_group=%s,peer=" % \
+ (peer_group.peer_group_name)
+ if entity_instance_id_str in fault.entity_instance_id:
+ LOG.info("Clearing alarm for peer group %s" %
+ peer_group.peer_group_name)
+ self.fm_api.clear_fault(
+ fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
+ fault.entity_instance_id)
def migrate_subcloud(self, context, subcloud_ref, payload):
'''migrate_subcloud function is for day-2's rehome purpose.
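The hunks that follow decode admin_password when rehome_data is loaded and re-encode it when rehome_data is saved back to the DB. A minimal standalone sketch of that base64 round-trip (the helper names here are illustrative, not part of this patch):

    import base64

    def encode_admin_password(saved_payload):
        # Encode admin_password before rehome_data is persisted to the DB.
        if 'admin_password' in saved_payload:
            saved_payload['admin_password'] = base64.b64encode(
                saved_payload['admin_password'].encode('utf-8')).decode('utf-8')
        return saved_payload

    def decode_admin_password(saved_payload):
        # Decode admin_password after rehome_data is loaded from the DB.
        if 'admin_password' in saved_payload:
            saved_payload['admin_password'] = base64.b64decode(
                saved_payload['admin_password']).decode('utf-8')
        return saved_payload

    # Round-trip: decoding an encoded payload restores the original value.
    assert decode_admin_password(
        encode_admin_password({'admin_password': 'secret'})
    ) == {'admin_password': 'secret'}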
@@ -463,6 +570,10 @@ class SubcloudManager(manager.Manager):
# Update sysadmin_password
sysadmin_password = base64.b64decode(payload['sysadmin_password']).decode('utf-8')
saved_payload['sysadmin_password'] = sysadmin_password
+ # Decode admin_password
+ if 'admin_password' in saved_payload:
+ saved_payload['admin_password'] = base64.b64decode(
+ saved_payload['admin_password']).decode('utf-8')
# Re-generate ansible config based on latest rehome_data
subcloud = self.subcloud_migrate_generate_ansible_config(
@@ -547,6 +658,15 @@ class SubcloudManager(manager.Manager):
% str(peer_group.peer_group_name))
return
+ # Set migration_status to migrating
+ db_api.subcloud_peer_group_update(
+ self.context,
+ peer_group.id,
+ migration_status=consts.PEER_GROUP_MIGRATING)
+
+ # Get the alive system peers associated with the peer group
+ system_peers = self._get_peer_system_list(peer_group)
+
# Use thread pool to limit number of operations in parallel
migrate_pool = greenpool.GreenPool(
size=peer_group.max_subcloud_rehoming)
@@ -554,12 +674,35 @@
tmp_payload = {'sysadmin_password': sysadmin_password}
migrate_function = functools.partial(self._migrate_manage_subcloud,
context,
- tmp_payload)
+ tmp_payload,
+ system_peers)
self._run_parallel_group_operation('migrate',
migrate_function,
migrate_pool,
subclouds_ready_to_migrate)
+
+ # Set migration_status to complete and
+ # update the system leader id and name
+ local_system = utils.get_local_system()
+ peer_group = db_api.subcloud_peer_group_update(
+ self.context,
+ peer_group.id,
+ system_leader_id=local_system.uuid,
+ system_leader_name=local_system.name,
+ migration_status=consts.PEER_GROUP_MIGRATION_COMPLETE)
+
+ # Try to send audit request to system peer
+ resp = PeerGroupAuditManager.send_audit_peer_group(
+ system_peers, peer_group)
+ if resp:
+ LOG.warning("Audit peer group %s response: %s" %
+ (peer_group.peer_group_name, resp))
+
+ # Try to clear the existing alarm if we rehomed a '0' priority peer group
+ if peer_group.group_priority == 0:
+ self._clear_alarm_for_peer_group(peer_group)
+
LOG.info("Batch migrate operation finished")
def rehome_subcloud(self, context, subcloud):
@@ -1043,6 +1186,13 @@ class SubcloudManager(manager.Manager):
str(keyring.get_password(
user, dccommon_consts.SERVICES_USER_NAME))
+ # TODO(Yuxing) remove replication of the smapi user when support for
+ # rehoming subclouds with a software version below 22.12 ends
+ if subcloud.software_version <= LAST_SW_VERSION_IN_CENTOS:
+ payload['users']['smapi'] = \
+ str(keyring.get_password(
+ 'smapi', dccommon_consts.SERVICES_USER_NAME))
+
if 'region_name' not in payload:
payload['region_name'] = subcloud.region_name
@@ -1238,6 +1388,12 @@ class SubcloudManager(manager.Manager):
del original_payload['ansible_ssh_pass']
if 'sysadmin_password' in original_payload:
del original_payload['sysadmin_password']
+ if 'ansible_become_pass' in original_payload:
+ del original_payload['ansible_become_pass']
+ if 'admin_password' in original_payload:
+ # Encode admin_password
+ original_payload['admin_password'] = base64.b64encode(
+ original_payload['admin_password'].encode("utf-8")).decode('utf-8')
bootstrap_info = utils.create_subcloud_rehome_data_template()
bootstrap_info['saved_payload'] = original_payload
rehome_data = json.dumps(bootstrap_info)
@@ -2548,6 +2704,18 @@ class SubcloudManager(manager.Manager):
if 'bootstrap-address' in rehome_data_dict['saved_payload']:
_bootstrap_address = rehome_data_dict['saved_payload']['bootstrap-address']
bootstrap_values_dict =
yaml.load(bootstrap_values, Loader=yaml.SafeLoader)
+
+ # Remove sysadmin_password, ansible_ssh_pass and ansible_become_pass;
+ # encode admin_password
+ if 'sysadmin_password' in bootstrap_values_dict:
+ del bootstrap_values_dict['sysadmin_password']
+ if 'ansible_ssh_pass' in bootstrap_values_dict:
+ del bootstrap_values_dict['ansible_ssh_pass']
+ if 'ansible_become_pass' in bootstrap_values_dict:
+ del bootstrap_values_dict['ansible_become_pass']
+ if 'admin_password' in bootstrap_values_dict:
+ bootstrap_values_dict['admin_password'] = base64.b64encode(
+ bootstrap_values_dict['admin_password'].encode("utf-8")).decode('utf-8')
rehome_data_dict['saved_payload'] = bootstrap_values_dict
# put bootstrap_address back into rehome_data_dict
if _bootstrap_address:
diff --git a/distributedcloud/dcmanager/manager/system_peer_manager.py b/distributedcloud/dcmanager/manager/system_peer_manager.py
index 9493d92b4..071aa1531 100644
--- a/distributedcloud/dcmanager/manager/system_peer_manager.py
+++ b/distributedcloud/dcmanager/manager/system_peer_manager.py
@@ -35,9 +35,9 @@ VERIFY_SUBCLOUD_SYNC_IGNORE = 'ignore'
class SystemPeerManager(manager.Manager):
"""Manages tasks related to system peers."""
- def __init__(self, *args, **kwargs):
+ def __init__(self, peer_monitor_manager, *args, **kwargs):
LOG.debug(_('SystemPeerManager initialization...'))
-
+ self.peer_monitor_manager = peer_monitor_manager
super(SystemPeerManager, self).__init__(
service_name="system_peer_manager", *args, **kwargs)
@@ -445,6 +445,8 @@ class SystemPeerManager(manager.Manager):
association = db_api.peer_group_association_update(
context, association_id, **association_update)
+ self.peer_monitor_manager.peer_monitor_notify(context)
+
return db_api.peer_group_association_db_model_to_dict(association)
except Exception as exception:
@@ -501,6 +503,7 @@ class SystemPeerManager(manager.Manager):
"on peer site.")
db_api.peer_group_association_destroy(context, association_id)
+ self.peer_monitor_manager.peer_monitor_notify(context)
except Exception as exception:
LOG.exception("Failed to delete peer_group_association "
diff --git a/distributedcloud/dcmanager/rpc/client.py b/distributedcloud/dcmanager/rpc/client.py
index a309129d7..63add152f 100644
--- a/distributedcloud/dcmanager/rpc/client.py
+++ b/distributedcloud/dcmanager/rpc/client.py
@@ -274,6 +274,14 @@ class ManagerClient(RPCClient):
return self.call(ctxt, self.make_msg('delete_peer_group_association',
association_id=association_id))
+ def peer_monitor_notify(self, ctxt):
+ return self.call(ctxt, self.make_msg('peer_monitor_notify'))
+
+ def peer_group_audit_notify(self, ctxt, peer_group_name, payload):
+ return self.call(ctxt, self.make_msg('peer_group_audit_notify',
+ peer_group_name=peer_group_name,
+ payload=payload))
+
class DCManagerNotifications(RPCClient):
"""DC Manager Notification interface to broadcast subcloud state changed
diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py
index e0fc25f1b..48c1dd79f 100644
--- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py
+++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_peer_group_association.py
@@ -128,7 +128,8 @@ class PeerGroupAssociationAPIMixin(APIMixin):
SAMPLE_SUBCLOUD_PEER_GROUP_STATE),
'max_subcloud_rehoming': kw.get(
'max_subcloud_rehoming',
- SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING)
+ SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING),
+ 'migration_status': None
}
return group
diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py
index d6ce993f3..43cda6322 100644
--- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py
+++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subcloud_peer_group.py
@@ -82,7 +82,9 @@ class SubcloudPeerGroupAPIMixin(APIMixin):
'enabled'),
'max_subcloud_rehoming': kw.get(
'max_subcloud_rehoming',
- SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING)
+ SAMPLE_SUBCLOUD_PEER_GROUP_MAX_SUBCLOUDS_REHOMING),
+ 'migration_status': None
+ }
return group
diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py
index fa551b2d7..4397ac621 100644
--- a/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py
+++ b/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py
@@ -30,6 +30,7 @@ sys.modules['fm_core'] = mock.Mock()
import threading
from dccommon import consts as dccommon_consts
+from dccommon.drivers.openstack import dcmanager_v1
from dccommon import subcloud_install
from dccommon.utils import AnsiblePlaybook
from dcmanager.common import consts
@@ -38,9 +39,11 @@ from dcmanager.common import prestage
from dcmanager.common import utils as cutils
from dcmanager.db.sqlalchemy import api as db_api
from dcmanager.manager import subcloud_manager
+from dcmanager.manager import system_peer_manager
from dcmanager.state import subcloud_state_manager
from dcmanager.tests import base
from dcmanager.tests.unit.common import fake_subcloud
+from dcmanager.tests.unit.manager import test_system_peer_manager
from dcmanager.tests import utils
from tsconfig.tsconfig import SW_VERSION
@@ -439,7 +442,8 @@ class TestSubcloudManager(base.DCManagerTestCase):
"system_leader_name": "DC0",
"group_priority": 0,
"group_state": "enabled",
- "max_subcloud_rehoming": 50
+ "max_subcloud_rehoming": 50,
+ "migration_status": None
}
values.update(kwargs)
return db_api.subcloud_peer_group_create(ctxt, **values)
@@ -3019,6 +3023,57 @@
self.assertEqual(1, len(expect_subclouds))
self.assertEqual("sub_migrateable", expect_subclouds[0].name)
+ @mock.patch.object(subcloud_manager.SubcloudManager,
+ '_unmanage_system_peer_subcloud')
+ def test_migrate_manage_subcloud_called_unmanage_peer_subcloud(
+ self, mock_unmanage_system_peer_subcloud):
+ sm = subcloud_manager.SubcloudManager()
+ system_peer_test = test_system_peer_manager.TestSystemPeerManager
+ system_peer = system_peer_test.create_system_peer_static(self.ctx)
+ subcloud = self.create_subcloud_static(self.ctx)
+ sm._migrate_manage_subcloud(self.ctx, mock.ANY, [system_peer],
+ subcloud)
+ mock_unmanage_system_peer_subcloud.assert_called()
+
+ @mock.patch.object(subcloud_manager.SubcloudManager,
+ '_unmanage_system_peer_subcloud')
+ def test_migrate_manage_subcloud_not_called_unmanage_peer_subcloud(
+ self, mock_unmanage_system_peer_subcloud):
+ sm = subcloud_manager.SubcloudManager()
+ subcloud = self.create_subcloud_static(self.ctx)
+ # Use an empty system peer list
+ system_peers = []
+ sm._migrate_manage_subcloud(self.ctx,
+ mock.ANY, system_peers, subcloud)
+ mock_unmanage_system_peer_subcloud.assert_not_called()
+
+ @mock.patch.object(system_peer_manager.SystemPeerManager,
+ 'get_peer_dc_client')
+ def test_unmanage_system_peer_subcloud_ret_false(
+ self, mock_get_peer_dc_client):
+ sm = subcloud_manager.SubcloudManager()
+ system_peer_test = test_system_peer_manager.TestSystemPeerManager
+ system_peer = system_peer_test.create_system_peer_static(self.ctx)
+ subcloud = self.create_subcloud_static(self.ctx)
+ mock_get_peer_dc_client.return_value = None
+ ret = sm._unmanage_system_peer_subcloud([system_peer], subcloud)
+ self.assertEqual(ret, False)
+
+ @mock.patch.object(system_peer_manager.SystemPeerManager,
+ 'get_peer_dc_client')
+ @mock.patch.object(dcmanager_v1.DcmanagerClient, 'update_subcloud')
+ def test_unmanage_system_peer_subcloud_ret_true(self,
+ mock_update_subcloud,
+ mock_get_peer_dc_client):
+ sm = subcloud_manager.SubcloudManager()
+ system_peer_test = test_system_peer_manager.TestSystemPeerManager
+ system_peer = system_peer_test.create_system_peer_static(self.ctx)
+ subcloud = self.create_subcloud_static(self.ctx)
+ # The mocked client returned by get_peer_dc_client fails the
+ # update, which _unmanage_system_peer_subcloud reports as True
+ mock_get_peer_dc_client.return_value = mock.MagicMock()
+ mock_get_peer_dc_client.return_value.update_subcloud.side_effect = \
+ exceptions.SubcloudNotUnmanaged()
+ ret = sm._unmanage_system_peer_subcloud([system_peer], subcloud)
+ self.assertEqual(ret, True)
+
@mock.patch.object(subcloud_manager.SubcloudManager, 'subcloud_deploy_create')
@mock.patch.object(subcloud_manager.SubcloudManager, 'rehome_subcloud')
@mock.patch.object(subcloud_manager.SubcloudManager, 'run_deploy_phases')
diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py
index 5d2a53d90..b280b6b09 100644
--- a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py
+++ b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py
@@ -159,7 +159,8 @@ class TestSystemPeerManager(base.DCManagerTestCase):
"group_priority": FAKE_SITE0_PEER_GROUP_PRIORITY,
"group_state": FAKE_SITE0_PEER_GROUP_STATE,
"max_subcloud_rehoming":
- FAKE_SITE0_PEER_GROUP_MAX_SUBCLOUDS_REHOMING
+ FAKE_SITE0_PEER_GROUP_MAX_SUBCLOUDS_REHOMING,
+ "migration_status": None
}
values.update(kwargs)
return db_api.subcloud_peer_group_create(ctxt, **values)
@@ -177,7 +178,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
return db_api.peer_group_association_create(ctxt, **values)
def test_init(self):
- spm = system_peer_manager.SystemPeerManager()
+ spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
self.assertIsNotNone(spm)
self.assertEqual('system_peer_manager', spm.service_name)
self.assertEqual('localhost', spm.host)
@@ -235,7 +236,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
mock_dc_client().update_subcloud.side_effect = [
peer_subcloud1, peer_subcloud1, peer_subcloud2]
- spm = system_peer_manager.SystemPeerManager()
+ spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
spm._sync_subclouds(self.ctx, peer, peer_group.id,
FAKE_SITE1_PEER_GROUP_ID)
@@ -281,7 +282,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
system_peer_id=peer.id,
peer_group_id=peer_group.id)
- spm = system_peer_manager.SystemPeerManager()
+ spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
spm.sync_subcloud_peer_group(self.ctx, association.id, False)
mock_dc_client().get_subcloud_peer_group.assert_called_once_with(
@@ -319,7 +320,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
mock_dc_client().get_subcloud_peer_group.side_effect = \
dccommon_exceptions.SubcloudPeerGroupNotFound
- spm = system_peer_manager.SystemPeerManager()
+ spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
spm.sync_subcloud_peer_group(self.ctx, association.id, False)
mock_dc_client().get_subcloud_peer_group.assert_called_once_with(
@@ -375,7 +376,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
'group_priority': 1
}
- spm = system_peer_manager.SystemPeerManager()
+ spm = system_peer_manager.SystemPeerManager(mock.MagicMock())
spm.delete_peer_group_association(self.ctx, association.id)
mock_dc_client().delete_subcloud.assert_has_calls([