From cd355ca120e9aebec681b4c15ab518c115bc4ba1 Mon Sep 17 00:00:00 2001
From: Gustavo Herzmann
Date: Mon, 6 Nov 2023 11:28:04 -0300
Subject: [PATCH] Add rehome-pending deploy status

This commit adds the rehome-pending deploy status, which should be
used when unmanaging a subcloud before a rehoming/migration operation.
cert-mon uses this new status to determine when it should stop
auditing an unmanaged subcloud, avoiding certificate issues during the
rehoming operation.

A subcloud can only be switched to this state while it is being
unmanaged and its deploy status is 'complete'. If the rehoming
operation is no longer going to be executed, the subcloud can be
managed again; in that case its deploy status is reverted to
'complete' (see the additional example after the test plan).

Example usage:

dcmanager subcloud unmanage --migrate subcloud1

CURL:

curl -X PATCH -H "X-Auth-Token: ${TOKEN//[$'\t\r\n']}" \
  "http://$MGMT_IP:8119/v1.0/subclouds/{subcloud1}" \
  -F migrate="true" \
  -F management-state="unmanaged"

Test Plan:
1. PASS - Unmanage a subcloud without --migrate and verify that it
   still works and that cert-mon continues to audit it;
2. PASS - Manage a subcloud and verify that the operation still works
   as expected;
3. PASS - Try to unmanage with --migrate when the subcloud's deploy
   status is different from 'complete' and verify that the operation
   is rejected;
4. PASS - Unmanage a subcloud using the --migrate option and verify
   that its deploy status changes to 'rehome-pending', all the sync
   statuses change to 'unknown', and that cert-mon stops auditing the
   subcloud;
5. PASS - By directly calling the SubcloudManager.update_subcloud()
   method (internally, skipping API validation), verify that:
   - it's possible to update the deploy_status to 'secondary' while
     also updating its management_state to 'unmanaged';
   - it's possible to update the deploy_status to 'rehome-pending'
     while also updating its management_state to 'unmanaged';
   - it's NOT possible to update the deploy_status if the subcloud is
     not already unmanaged or its management_state is not being
     updated to 'unmanaged';
   - it's possible to update the deploy_status to 'secondary' for a
     subcloud that's already unmanaged;
   - it's possible to update the deploy_status to 'secondary' for a
     subcloud that's already unmanaged and currently in the
     'rehome-pending' state;
   - trying to manage a 'rehome-pending' subcloud while also changing
     its deploy status to something different from 'complete' fails;
6. PASS - Manage a 'rehome-pending' subcloud and verify that it
   succeeds while also reverting its deploy_status to 'complete';
7. PASS - Run subcloud update to populate the rehome_data with the
   bootstrap values and address, and verify that it still works.
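
Additional example (illustrative sketch only, reusing the same
environment variables and subcloud name as the example above): if the
rehoming operation is cancelled, managing the subcloud back clears the
pending state and reverts the deploy status to 'complete':

dcmanager subcloud manage subcloud1

CURL:

curl -X PATCH -H "X-Auth-Token: ${TOKEN//[$'\t\r\n']}" \
  "http://$MGMT_IP:8119/v1.0/subclouds/{subcloud1}" \
  -F management-state="managed"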
Story: 2010852 Task: 49059 Signed-off-by: Gustavo Herzmann Change-Id: I8bf76904e335e382407c2268a9c1879471ef2fc9 --- api-ref/source/api-ref-dcmanager-v1.rst | 5 + api-ref/source/parameters.yaml | 7 + .../dcmanager/api/controllers/v1/subclouds.py | 36 ++- distributedcloud/dcmanager/common/consts.py | 3 + .../dcmanager/manager/subcloud_manager.py | 296 ++++++++++-------- .../unit/api/v1/controllers/test_subclouds.py | 12 +- 6 files changed, 223 insertions(+), 136 deletions(-) diff --git a/api-ref/source/api-ref-dcmanager-v1.rst b/api-ref/source/api-ref-dcmanager-v1.rst index fd28c2124..b128c2804 100644 --- a/api-ref/source/api-ref-dcmanager-v1.rst +++ b/api-ref/source/api-ref-dcmanager-v1.rst @@ -343,6 +343,10 @@ The attributes of a subcloud which are modifiable: - bootstrap_address +Extra flags: + +- migrate + **Normal response codes** 200 @@ -371,6 +375,7 @@ serviceUnavailable (503) - bootstrap-address: bootstrap_address - sysadmin-password: sysadmin_password - bootstrap-values: bootstrap_values_for_rehome + - migrate: rehome_pending Request Example ---------------- diff --git a/api-ref/source/parameters.yaml b/api-ref/source/parameters.yaml index 4f17780d9..5e53ab231 100644 --- a/api-ref/source/parameters.yaml +++ b/api-ref/source/parameters.yaml @@ -470,6 +470,13 @@ rehome_data: in: body required: true type: string +rehome_pending: + description: | + A flag indicating if the subcloud will be rehomed away. Its deploy status + will be set to 'rehome-pending' + in: body + required: false + type: boolean release: description: | The subcloud software version. diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py index 20b88afa8..b0b413450 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py @@ -54,6 +54,7 @@ from dcmanager.db import api as db_api from dcmanager.rpc import client as rpc_client from fm_api.constants import FM_ALARM_ID_UNSYNCHRONIZED_RESOURCE + CONF = cfg.CONF LOG = logging.getLogger(__name__) @@ -319,6 +320,30 @@ class SubcloudsController(object): pecan.abort(500, _("Unable to migrate subcloud %s, " "need sysadmin_password" % subcloud.name)) + def _validate_rehome_pending(self, subcloud, management_state): + unmanaged = dccommon_consts.MANAGEMENT_UNMANAGED + error_msg = None + + # Can only set the subcloud to rehome-pending + # if the deployment is done + if subcloud.deploy_status != consts.DEPLOY_STATE_DONE: + error_msg = ( + "The deploy status can only be updated to " + f"'{consts.DEPLOY_STATE_REHOME_PENDING}' if the current " + f"deploy status is '{consts.DEPLOY_STATE_DONE}'") + + # Can only set the subcloud to rehome-pending if the subcloud is + # being unmanaged or is already unmanaged + if management_state != unmanaged and ( + management_state or subcloud.management_state != unmanaged + ): + error_msg = ( + f"Subcloud must be {unmanaged} for its deploy status to " + f"be updated to '{consts.DEPLOY_STATE_REHOME_PENDING}'") + + if error_msg: + pecan.abort(400, error_msg) + @staticmethod def _append_static_err_content(subcloud): err_dict = consts.ERR_MSG_DICT @@ -668,6 +693,14 @@ class SubcloudsController(object): peer_group = payload.get('peer_group') bootstrap_address = payload.get('bootstrap_address') + # If the migrate flag is present we need to update the deploy status + # to consts.DEPLOY_STATE_REHOME_PENDING + deploy_status = None + if (payload.get('migrate') == 'true' and subcloud.deploy_status != + 
consts.DEPLOY_STATE_REHOME_PENDING): + self._validate_rehome_pending(subcloud, management_state) + deploy_status = consts.DEPLOY_STATE_REHOME_PENDING + # Syntax checking if management_state and \ management_state not in [dccommon_consts.MANAGEMENT_UNMANAGED, @@ -733,7 +766,8 @@ class SubcloudsController(object): force=force_flag, peer_group_id=peer_group_id, bootstrap_values=bootstrap_values, - bootstrap_address=bootstrap_address) + bootstrap_address=bootstrap_address, + deploy_status=deploy_status) return subcloud except RemoteError as e: pecan.abort(422, e.value) diff --git a/distributedcloud/dcmanager/common/consts.py b/distributedcloud/dcmanager/common/consts.py index 57af3dd43..291415bf3 100644 --- a/distributedcloud/dcmanager/common/consts.py +++ b/distributedcloud/dcmanager/common/consts.py @@ -230,9 +230,12 @@ DEPLOY_STATE_RESTORE_PREP_FAILED = 'restore-prep-failed' DEPLOY_STATE_RESTORING = 'restoring' DEPLOY_STATE_RESTORE_FAILED = 'restore-failed' DEPLOY_STATE_PRE_REHOME = 'pre-rehome' +# If any of the following rehoming or secondary statuses +# are modified, cert-mon code will need to be updated. DEPLOY_STATE_REHOMING = 'rehoming' DEPLOY_STATE_REHOME_FAILED = 'rehome-failed' DEPLOY_STATE_REHOME_PREP_FAILED = 'rehome-prep-failed' +DEPLOY_STATE_REHOME_PENDING = 'rehome-pending' DEPLOY_STATE_SECONDARY = 'secondary' DEPLOY_STATE_SECONDARY_FAILED = 'secondary-failed' DEPLOY_STATE_DONE = 'complete' diff --git a/distributedcloud/dcmanager/manager/subcloud_manager.py b/distributedcloud/dcmanager/manager/subcloud_manager.py index 2d3b52883..b542a1e31 100644 --- a/distributedcloud/dcmanager/manager/subcloud_manager.py +++ b/distributedcloud/dcmanager/manager/subcloud_manager.py @@ -2508,33 +2508,7 @@ class SubcloudManager(manager.Manager): except Exception as e: LOG.exception(e) - def delete_subcloud(self, context, subcloud_id): - """Delete subcloud and notify orchestrators. - - :param context: request context object. - :param subcloud_id: id of subcloud to delete - """ - LOG.info("Deleting subcloud %s." % subcloud_id) - - # Retrieve the subcloud details from the database - subcloud = db_api.subcloud_get(context, subcloud_id) - - # Semantic checking - if subcloud.management_state != dccommon_consts.MANAGEMENT_UNMANAGED: - raise exceptions.SubcloudNotUnmanaged() - - if subcloud.availability_status == \ - dccommon_consts.AVAILABILITY_ONLINE: - raise exceptions.SubcloudNotOffline() - - # Ansible inventory filename for the specified subcloud - ansible_subcloud_inventory_file = self._get_ansible_filename( - subcloud.name, INVENTORY_FILE_POSTFIX) - - self._remove_subcloud_details(context, - subcloud, - ansible_subcloud_inventory_file) - + def _clear_subcloud_alarms(self, subcloud: Subcloud): # Clear any subcloud alarms. # Note that endpoint out-of-sync alarms should have been cleared when # the subcloud was unmanaged and the endpoint sync statuses were set to @@ -2565,6 +2539,35 @@ class SubcloudManager(manager.Manager): (subcloud.name, alarm_id)) LOG.exception(e) + def delete_subcloud(self, context, subcloud_id): + """Delete subcloud and notify orchestrators. + + :param context: request context object. + :param subcloud_id: id of subcloud to delete + """ + LOG.info("Deleting subcloud %s." 
% subcloud_id) + + # Retrieve the subcloud details from the database + subcloud = db_api.subcloud_get(context, subcloud_id) + + # Semantic checking + if subcloud.management_state != dccommon_consts.MANAGEMENT_UNMANAGED: + raise exceptions.SubcloudNotUnmanaged() + + if subcloud.availability_status == \ + dccommon_consts.AVAILABILITY_ONLINE: + raise exceptions.SubcloudNotOffline() + + # Ansible inventory filename for the specified subcloud + ansible_subcloud_inventory_file = self._get_ansible_filename( + subcloud.name, INVENTORY_FILE_POSTFIX) + + self._remove_subcloud_details(context, + subcloud, + ansible_subcloud_inventory_file) + + self._clear_subcloud_alarms(subcloud) + def rename_subcloud(self, context, subcloud_id, @@ -2621,73 +2624,54 @@ class SubcloudManager(manager.Manager): return subcloud_name - def update_subcloud(self, - context, - subcloud_id, - management_state=None, - description=None, - location=None, - group_id=None, - data_install=None, - force=None, - deploy_status=None, - peer_group_id=None, - bootstrap_values=None, - bootstrap_address=None): - """Update subcloud and notify orchestrators. + def _validate_management_state_update(self, new_management_state: str, + new_deploy_status: str, + subcloud: Subcloud, force: bool): + if new_management_state == dccommon_consts.MANAGEMENT_UNMANAGED: + if subcloud.management_state == dccommon_consts.MANAGEMENT_UNMANAGED: + msg = f"Subcloud {subcloud.name} already unmanaged" + LOG.warning(msg) + raise exceptions.BadRequest(resource="subcloud", msg=msg) - :param context: request context object - :param subcloud_id: id of subcloud to update - :param management_state: new management state - :param description: new description - :param location: new location - :param group_id: new subcloud group id - :param data_install: subcloud install values - :param force: force flag - :param deploy_status: update to expected deploy status - :param peer_group_id: id of peer group - :param bootstrap_values: bootstrap_values yaml content - :param bootstrap_address: oam IP for rehome - """ + elif new_management_state == dccommon_consts.MANAGEMENT_MANAGED: + if subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED: + msg = f"Subcloud {subcloud.name} already managed" + LOG.warning(msg) + raise exceptions.BadRequest(resource="subcloud", msg=msg) - LOG.info("Updating subcloud %s." 
% subcloud_id) + if force: + # No need for further validation + return - # Get the subcloud details from the database - subcloud = db_api.subcloud_get(context, subcloud_id) - original_management_state = subcloud.management_state + deploy_status_complete = ( + subcloud.deploy_status == consts.DEPLOY_STATE_DONE + or prestage.is_deploy_status_prestage(subcloud.deploy_status) + ) + allowed_deploy_transition = ( + subcloud.deploy_status == consts.DEPLOY_STATE_REHOME_PENDING + and new_deploy_status == consts.DEPLOY_STATE_DONE + ) - # Semantic checking - if management_state: - if management_state == dccommon_consts.MANAGEMENT_UNMANAGED: - if subcloud.management_state == dccommon_consts.MANAGEMENT_UNMANAGED: - LOG.warning("Subcloud %s already unmanaged" % subcloud_id) - raise exceptions.BadRequest( - resource='subcloud', - msg='Subcloud is already unmanaged') - elif management_state == dccommon_consts.MANAGEMENT_MANAGED: - if subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED: - LOG.warning("Subcloud %s already managed" % subcloud_id) - raise exceptions.BadRequest( - resource='subcloud', - msg='Subcloud is already managed') - elif not force: - if (subcloud.deploy_status != consts.DEPLOY_STATE_DONE and - not prestage.is_deploy_status_prestage( - subcloud.deploy_status)): - LOG.warning("Subcloud %s can be managed only when" - "deploy_status is complete" % subcloud_id) - raise exceptions.BadRequest( - resource='subcloud', - msg='Subcloud can be managed only if deploy status is complete') - if subcloud.availability_status != \ - dccommon_consts.AVAILABILITY_ONLINE: - LOG.warning("Subcloud %s is not online" % subcloud_id) - raise exceptions.SubcloudNotOnline() - else: - LOG.error("Invalid management_state %s" % management_state) - raise exceptions.InternalError() + if not deploy_status_complete and not allowed_deploy_transition: + msg = (f"Unable to manage {subcloud.name}: its deploy_status " + f"must be either '{consts.DEPLOY_STATE_DONE}' or " + f"'{consts.DEPLOY_STATE_REHOME_PENDING}'") + LOG.warning(msg) + raise exceptions.BadRequest(resource="subcloud", msg=msg) - # update bootstrap values into rehome_data + if (subcloud.availability_status != + dccommon_consts.AVAILABILITY_ONLINE): + LOG.warning(f"Subcloud {subcloud.name} is not online") + raise exceptions.SubcloudNotOnline() + + # The management state can be 'unmanaged', 'managed' or None (which + # means that it's not being changed), any other value is invalid + elif new_management_state is not None: + LOG.error(f"Invalid management_state {new_management_state}") + raise exceptions.InvalidInputError() + + def _prepare_rehome_data(self, subcloud: Subcloud, + bootstrap_values, bootstrap_address): rehome_data_dict = None # load the existing data if it exists if subcloud.rehome_data: @@ -2721,44 +2705,6 @@ class SubcloudManager(manager.Manager): if _bootstrap_address: rehome_data_dict['saved_payload']['bootstrap-address'] = _bootstrap_address - # update deploy status, ONLY apply for unmanaged subcloud - new_deploy_status = None - if deploy_status is not None: - if subcloud.management_state != dccommon_consts.MANAGEMENT_UNMANAGED: - raise exceptions.BadRequest( - resource='subcloud', - msg='deploy_status can only be updated on unmanaged subcloud') - new_deploy_status = deploy_status - # set all endpoint statuses to unknown - # no endpoint will be audited for secondary - # subclouds - self.state_rpc_client.update_subcloud_endpoint_status_sync( - context, - subcloud_name=subcloud.name, - endpoint_type=None, - 
sync_status=dccommon_consts.SYNC_STATUS_UNKNOWN) - - # clear existing fault alarm of secondary subcloud - for alarm_id, entity_instance_id in ( - (fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, - "subcloud=%s" % subcloud.name), - (fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC, - "subcloud=%s.resource=%s" % - (subcloud.name, dccommon_consts.ENDPOINT_TYPE_DC_CERT)), - (fm_const.FM_ALARM_ID_DC_SUBCLOUD_BACKUP_FAILED, - "subcloud=%s" % subcloud.name)): - try: - fault = self.fm_api.get_fault(alarm_id, - entity_instance_id) - if fault: - self.fm_api.clear_fault(alarm_id, - entity_instance_id) - except Exception as e: - LOG.info( - "Failed to clear fault for subcloud %s, alarm_id=%s" % - (subcloud.name, alarm_id)) - LOG.exception(e) - # update bootstrap_address if bootstrap_address: if rehome_data_dict is None: @@ -2768,10 +2714,79 @@ class SubcloudManager(manager.Manager): 'need to import bootstrap_values first') rehome_data_dict['saved_payload']['bootstrap-address'] = bootstrap_address + rehome_data = None if rehome_data_dict: rehome_data = json.dumps(rehome_data_dict) - else: - rehome_data = None + + return rehome_data + + def update_subcloud(self, + context, + subcloud_id, + management_state=None, + description=None, + location=None, + group_id=None, + data_install=None, + force=None, + deploy_status=None, + peer_group_id=None, + bootstrap_values=None, + bootstrap_address=None): + """Update subcloud and notify orchestrators. + + :param context: request context object + :param subcloud_id: id of subcloud to update + :param management_state: new management state + :param description: new description + :param location: new location + :param group_id: new subcloud group id + :param data_install: subcloud install values + :param force: force flag + :param deploy_status: update to expected deploy status + :param peer_group_id: id of peer group + :param bootstrap_values: bootstrap_values yaml content + :param bootstrap_address: oam IP for rehome + """ + + LOG.info("Updating subcloud %s." 
% subcloud_id) + + # Get the subcloud details from the database + subcloud: Subcloud = db_api.subcloud_get(context, subcloud_id) + original_management_state = subcloud.management_state + original_deploy_status = subcloud.deploy_status + + # When trying to manage a 'rehome-pending' subcloud, revert its deploy + # status back to 'complete' if its not specified + if (management_state == dccommon_consts.MANAGEMENT_MANAGED and + subcloud.deploy_status == consts.DEPLOY_STATE_REHOME_PENDING and + not deploy_status): + deploy_status = consts.DEPLOY_STATE_DONE + + # management_state semantic checking + self._validate_management_state_update(management_state, deploy_status, + subcloud, force) + + # Update bootstrap values into rehome_data + rehome_data = self._prepare_rehome_data(subcloud, bootstrap_values, + bootstrap_address) + if deploy_status: + msg = None + # Only update deploy_status if subcloud is or will be unmanaged + if dccommon_consts.MANAGEMENT_UNMANAGED not in ( + management_state, subcloud.management_state): + msg = ("Unable to update deploy_status of subcloud " + f"{subcloud.name} to {deploy_status}: subcloud " + "must also be unmanaged") + # Only allow managing if the deploy status is also set to 'complete' + if (management_state == dccommon_consts.MANAGEMENT_MANAGED and + deploy_status != consts.DEPLOY_STATE_DONE): + msg = (f"Unable to manage {subcloud.name} while also updating " + f"its deploy_status to {deploy_status}: not allowed") + if msg: + LOG.warning(msg) + raise exceptions.BadRequest(resource='subcloud', msg=msg) + subcloud = db_api.subcloud_update( context, subcloud_id, @@ -2780,7 +2795,7 @@ class SubcloudManager(manager.Manager): location=location, group_id=group_id, data_install=data_install, - deploy_status=new_deploy_status, + deploy_status=deploy_status, peer_group_id=peer_group_id, rehome_data=rehome_data ) @@ -2806,11 +2821,16 @@ class SubcloudManager(manager.Manager): 'state change, resume to original state, subcloud: %s' % subcloud.name) management_state = original_management_state + # Also revert the deploy_status otherwise we could have a + # managed subcloud with the 'secondary' or other invalid deploy + # status/management state combination. 
+ deploy_status = original_deploy_status subcloud = \ db_api.subcloud_update(context, subcloud_id, management_state=management_state, description=description, - location=location) + location=location, + deploy_status=deploy_status) if management_state == dccommon_consts.MANAGEMENT_UNMANAGED: # set all endpoint statuses to unknown, except the dc-cert @@ -2830,6 +2850,20 @@ class SubcloudManager(manager.Manager): dc_notification = dcmanager_rpc_client.DCManagerNotifications() dc_notification.subcloud_managed(context, subcloud.region_name) + # Set all endpoint statuses to unknown, no endpoint + # will be audited for secondary or rehome-pending subclouds + if subcloud.deploy_status in (consts.DEPLOY_STATE_SECONDARY, + consts.DEPLOY_STATE_REHOME_PENDING): + self.state_rpc_client.update_subcloud_endpoint_status_sync( + context, + subcloud_name=subcloud.name, + endpoint_type=None, + sync_status=dccommon_consts.SYNC_STATUS_UNKNOWN) + + # Clear existing fault alarm of secondary subcloud + if subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY: + self._clear_subcloud_alarms(subcloud) + return db_api.subcloud_db_model_to_dict(subcloud) def update_subcloud_with_network_reconfig(self, context, subcloud_id, payload): diff --git a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py index 6397cddf2..aa90cb95a 100644 --- a/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py +++ b/distributedcloud/dcmanager/tests/unit/api/v1/controllers/test_subclouds.py @@ -1281,7 +1281,8 @@ class TestSubcloudAPIOther(testroot.DCManagerApiTest): force=None, peer_group_id=None, bootstrap_values=None, - bootstrap_address=None) + bootstrap_address=None, + deploy_status=None) self.assertEqual(response.status_int, 200) @mock.patch.object(psd_common, 'get_network_address_pool') @@ -1355,7 +1356,8 @@ class TestSubcloudAPIOther(testroot.DCManagerApiTest): force=None, peer_group_id=None, bootstrap_values=None, - bootstrap_address=None) + bootstrap_address=None, + deploy_status=None) self.assertEqual(response.status_int, 200) @mock.patch.object(subclouds.SubcloudsController, '_get_patch_data') @@ -1395,7 +1397,8 @@ class TestSubcloudAPIOther(testroot.DCManagerApiTest): force=None, peer_group_id=None, bootstrap_values=None, - bootstrap_address=None) + bootstrap_address=None, + deploy_status=None) self.assertEqual(response.status_int, 200) @mock.patch.object(subclouds.SubcloudsController, '_get_patch_data') @@ -1461,7 +1464,8 @@ class TestSubcloudAPIOther(testroot.DCManagerApiTest): force=True, peer_group_id=None, bootstrap_values=None, - bootstrap_address=None) + bootstrap_address=None, + deploy_status=None) self.assertEqual(response.status_int, 200) @mock.patch.object(subclouds.SubcloudsController, '_get_updatestatus_payload')