From 7ad78ea2aea5b7adb1c3055d0f2cc4a708152b27 Mon Sep 17 00:00:00 2001
From: Li Zhu
Date: Thu, 14 Mar 2024 17:57:46 -0400
Subject: [PATCH] Allow rehome related data update when subcloud migration
 fails

If the subcloud rehome_data contains an incorrect bootstrap-address
in site A and the user migrates the corresponding peer group to
site B, the migration fails. The subcloud is then left with the
'rehome-failed' deploy-status in site B and the 'rehome-pending'
deploy-status in site A. At that point the user cannot update the
bootstrap-address in either site due to the following restrictions:
a) The primary site (site A) is not the current leader of the peer
   group;
b) Updates in the non-primary site (site B) are not allowed.

To fix this issue, the following changes are made:
1. In the non-primary site, if the subcloud deploy-status is
   rehome-failed and the primary site is unavailable, updating the
   bootstrap-values and bootstrap-address is allowed, and the PGA is
   marked as out-of-sync.
2. The audit is modified to automatically sync the rehome_data from
   the non-primary site to the primary site if the subcloud in the
   non-primary site is managed and online and the PGA is out-of-sync.

Additional fix for the system_leader_id issue:
When migrating an SPG from one site to another, if all of the
subclouds fail to rehome, the leader id of the SPG in the target site
has already been updated to the target site's UUID, but the leader id
in the source site is not updated to the target UUID. The fix ensures
that once the migration completes, the leader id in both sites is
updated to the target UUID, regardless of whether the rehoming
succeeded.

Test plan:
Pre-Steps:
1. Create the system peer from Site A to Site B
2. Create the system peer from Site B to Site A
3. Create the subcloud peer group in Site A
4. Add a subcloud with an incorrect bootstrap-address to the peer
   group
5. Create a peer group association to associate the system peer and
   subcloud peer group - Site A
6. Check the current sync status in sites A and B. Verify that both
   are 'in-sync'.
7. Run the migration for the subcloud peer group from Site B.
8. Verify the 'rehome-failed' deploy-status in both sites.

PASS: Verify that the bootstrap-address can be updated in site B when
      site A is down, and that the PGA sync status is set to
      out-of-sync in site B. Also, verify that the audit syncs the
      rehome_data to site A and changes the PGA back to in-sync once
      site A is up and the migration reattempt succeeds.
PASS: Verify that the bootstrap-values and bootstrap-address are the
      only fields that can be updated in site B when site A is down.
PASS: Verify that an update of the bootstrap-address is rejected in
      site B when site A is up.
PASS: Verify that even if all of the subclouds in an SPG experience
      rehome failures, the system_leader_id in both sites is updated
      to the target's UUID.
PASS: Verify that when site A stays online or recovers during the
      migration to site B, the subcloud deploy_status in both sites
      is "rehome-failed" after the migration completes. In this
      scenario, site A can migrate the subcloud back, even though the
      previous attempt failed. After correcting the bootstrap-address
      in site A, the migration reattempt in site A succeeds.
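The gating condition introduced by change 1 can be summarized by the
following simplified sketch (a restatement for reviewers, not the
literal controller code; plain strings stand in for the dcmanager
constants):

    # Hypothetical standalone helper mirroring the new validation:
    # a subcloud update in the non-primary site is accepted only for
    # bootstrap fields, and only while rehoming has failed and the
    # primary site is unreachable.
    ALLOWED_FIELDS = ('bootstrap_values', 'bootstrap_address')

    def allow_update_in_non_primary_site(payload, deploy_status,
                                         peer_site_available):
        if not any(field in ALLOWED_FIELDS for field in payload):
            return False  # nothing bootstrap-related to update
        if any(field not in ALLOWED_FIELDS for field in payload):
            raise ValueError("Only bootstrap values and address "
                             "can be updated in the non-primary site")
        return (deploy_status == 'rehome-failed'
                and not peer_site_available)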
Closes-Bug: 2057981
Change-Id: I999dbf035e29950fd823e9cdb087160ce40fd4ca
Signed-off-by: lzhu1
---
 .../api/controllers/v1/subcloud_peer_group.py |   4 +-
 .../dcmanager/api/controllers/v1/subclouds.py | 105 +++++++++++++-----
 .../manager/peer_group_audit_manager.py       |  72 ++++++++++--
 .../dcmanager/manager/system_peer_manager.py  |  28 +++--
 4 files changed, 162 insertions(+), 47 deletions(-)

diff --git a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py
index 60d1df78d..6686ab5ed 100644
--- a/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py
+++ b/distributedcloud/dcmanager/api/controllers/v1/subcloud_peer_group.py
@@ -305,7 +305,9 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController):
                 (system_leader_id and
                  system_leader_id != group.system_leader_id) or
                 (system_leader_name and
-                 system_leader_name != group.system_leader_name)))
+                 system_leader_name != group.system_leader_name) or
+                (migration_status and
+                 migration_status != group.migration_status)))

         if not any_update:
             return db_api.subcloud_peer_group_db_model_to_dict(group)
diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py
index fa5b4ae66..68b528856 100644
--- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py
+++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py
@@ -343,13 +343,19 @@ class SubcloudsController(object):
             else dccommon_consts.DEPLOY_CONFIG_UP_TO_DATE
         return sync_status

-    def _validate_rehome_pending(self, subcloud, management_state):
+    def _validate_rehome_pending(self, subcloud, management_state, request):
        unmanaged = dccommon_consts.MANAGEMENT_UNMANAGED
        error_msg = None

        # Can only set the subcloud to rehome-pending
-        # if the deployment is done
-        if subcloud.deploy_status != consts.DEPLOY_STATE_DONE:
+        # if the deployment is done or the request is from another site.
+        # We skip the validation for requests from another site because,
+        # when migrating the subcloud back to a peer site, that site will
+        # attempt to set the remote subcloud's deploy status to
+        # "rehome-pending". However, the remote subcloud might be in a
+        # "rehome-failed" state from a previous failed rehoming attempt.
+        if (subcloud.deploy_status != consts.DEPLOY_STATE_DONE and
+                not utils.is_req_from_another_dc(request)):
            error_msg = (
                "The deploy status can only be updated to "
                f"'{consts.DEPLOY_STATE_REHOME_PENDING}' if the current "
@@ -656,13 +662,16 @@
            req_from_another_dc = utils.is_req_from_another_dc(request)
            original_pgrp = None
            leader_on_local_site = False
+            peer_site_available = True
+            pga = None
+            update_in_non_primary_site = False
            if subcloud.peer_group_id is not None:
                # Get the original peer group of the subcloud
                original_pgrp = db_api.subcloud_peer_group_get(
                    context, subcloud.peer_group_id)
                leader_on_local_site = utils.is_leader_on_local_site(original_pgrp)
                # A sync command is required after updating a subcloud
-                # in an SPG that is already associated with a PGA on the primary
+                # in an SPG that is already associated with a PGA in the primary
                # and leader site. The existence of the PGA will be checked
                # by the update_association_sync_status method later.
                if (original_pgrp.group_priority == 0 and
@@ -670,6 +679,18 @@
                        not req_from_another_dc):
                    sync_peer_groups.add(subcloud.peer_group_id)

+                # Get the peer site availability and PGA sync status
+                # TODO(lzhu1): support multiple sites
+                associations = db_api.peer_group_association_get_by_peer_group_id(
+                    context, original_pgrp.id)
+                for association in associations:
+                    pga = association
+                    system_peer = db_api.system_peer_get(
+                        context, association.system_peer_id)
+                    peer_site_available = \
+                        system_peer.availability_state == \
+                        consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE
+
            peer_group = payload.get('peer_group')
            # Verify the peer_group is valid
            peer_group_id = None
@@ -690,22 +711,13 @@
                    pecan.abort(400, _("Removing subcloud from a "
                                       "peer group not led by the "
                                       "current site is prohibited."))
-                # Get associations by peer group id
-                associations = db_api.\
-                    peer_group_association_get_by_peer_group_id(
-                        context, original_pgrp.id)
-                for association in associations:
-                    system_peer = db_api.system_peer_get(
-                        context, association.system_peer_id)
-                    # If system peer is available, then does not allow
-                    # to remove the subcloud from secondary peer group
-                    if system_peer.availability_state == consts.\
-                        SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE \
-                            and original_pgrp.group_priority > 0:
-                        pecan.abort(400, _(
-                            "Removing subcloud from a peer group "
-                            "associated with an available system peer "
-                            "is prohibited."))
+                # If the system peer is available, removing the subcloud
+                # from a secondary peer group is not allowed
+                if peer_site_available and original_pgrp.group_priority > 0:
+                    pecan.abort(400, _(
+                        "Removing subcloud from a peer group "
+                        "associated with an available system peer "
+                        "is prohibited."))
                peer_group_id = 'none'
            else:
                if not (subcloud.rehome_data or (
@@ -744,16 +756,39 @@
                    sync_peer_groups.add(pgrp.id)
                peer_group_id = pgrp.id

+            bootstrap_values = payload.get('bootstrap_values')
+            bootstrap_address = payload.get('bootstrap_address')
+
            # Subcloud can only be updated while it is managed in
            # the primary site because the sync command can only be issued
-            # in the site where the SPG was created.
+            # in the site where the SPG was created. However, updating the
+            # bootstrap values or address is an exception.
            if original_pgrp and peer_group_id is None and not req_from_another_dc:
                if original_pgrp.group_priority > 0:
-                    pecan.abort(400, _("Subcloud update is only allowed when "
-                                       "its peer group priority value is 0."))
+                    if bootstrap_values or bootstrap_address:
+                        if any(field not in
+                               ('bootstrap_values', 'bootstrap_address')
+                               for field in payload):
+                            pecan.abort(400,
+                                        _("Only bootstrap values and address "
+                                          "can be updated in the non-primary site"))
+                        if (subcloud.deploy_status ==
+                                consts.DEPLOY_STATE_REHOME_FAILED and
+                                not peer_site_available):
+                            update_in_non_primary_site = True
+                        else:
+                            pecan.abort(400,
+                                        _("Subcloud bootstrap values or address "
+                                          "update in the non-primary site is only "
+                                          "allowed when rehome failed and the "
+                                          "primary site is unavailable."))
+                    if not update_in_non_primary_site:
+                        pecan.abort(400, _("Subcloud update is only allowed when "
+                                           "its peer group priority value is 0."))
+
                # Updating a subcloud under the peer group on primary site
                # that the peer group should be led by the primary site.
-                if not leader_on_local_site:
+                if not leader_on_local_site and not update_in_non_primary_site:
                    pecan.abort(400, _("Updating subcloud from a "
                                       "peer group not led by the "
                                       "current site is prohibited."))
@@ -845,15 +880,13 @@
            group_id = payload.get('group_id')
            description = payload.get('description')
            location = payload.get('location')
-            bootstrap_values = payload.get('bootstrap_values')
-            bootstrap_address = payload.get('bootstrap_address')

            # If the migrate flag is present we need to update the deploy status
            # to consts.DEPLOY_STATE_REHOME_PENDING
            deploy_status = None
            if (payload.get('migrate') == 'true' and
                    subcloud.deploy_status != consts.DEPLOY_STATE_REHOME_PENDING):
-                self._validate_rehome_pending(subcloud, management_state)
+                self._validate_rehome_pending(subcloud, management_state, request)
                deploy_status = consts.DEPLOY_STATE_REHOME_PENDING

            # Syntax checking
@@ -917,7 +950,23 @@
                bootstrap_address=bootstrap_address,
                deploy_status=deploy_status)

-            if sync_peer_groups:
+            # Update the PGA sync_status to out-of-sync locally
+            # in the non-primary site. This only occurs when the primary site
+            # is unavailable and rehome fails due to an issue with the
+            # bootstrap values or address.
+            if (update_in_non_primary_site and
+                    pga.sync_status !=
+                    consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC):
+                db_api.peer_group_association_update(
+                    context,
+                    pga.id,
+                    sync_status=consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC)
+                LOG.debug(
+                    f"Updated Local Peer Group Association {pga.id} "
+                    f"sync_status to out-of-sync.")
+            # Sync the PGA sync status across all sites; this sync is
+            # initiated by the primary site.
+            elif sync_peer_groups:
                # Collect the affected peer group association IDs.
                association_ids = set()
                for pg_id in sync_peer_groups:
diff --git a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py
index cf287ae17..c71ec8402 100644
--- a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py
+++ b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py
@@ -43,10 +43,10 @@ class PeerGroupAuditManager(manager.Manager):
        self.thread_lock = threading.Lock()

    def _get_subclouds_by_peer_group_from_system_peer(self,
+                                                      dc_client,
                                                      system_peer,
                                                      peer_group_name):
        try:
-            dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
            subclouds = dc_client.get_subcloud_list_by_peer_group(
                peer_group_name)
            return subclouds
@@ -55,6 +55,22 @@
                          f"{peer_group_name} from DC: "
                          f"{system_peer.peer_name}")

+    @staticmethod
+    def _get_association_sync_status_from_peer_site(dc_client,
+                                                    system_peer,
+                                                    peer_group_id):
+        try:
+            # Get peer site system peer
+            dc_peer_system_peer = dc_client.get_system_peer(
+                utils.get_local_system().uuid)
+            association = dc_client. \
+                get_peer_group_association_with_peer_id_and_pg_id(
+                    dc_peer_system_peer.get('id'), peer_group_id)
+            return association.get("sync-status")
+        except Exception:
+            LOG.exception(f"Failed to get association sync status of peer "
+                          f"group {peer_group_id} from DC: {system_peer.peer_name}")
+
    def _update_remote_peer_group_migration_status(self,
                                                   system_peer,
                                                   peer_group_name,
@@ -71,9 +87,11 @@ class PeerGroupAuditManager(manager.Manager):

    def _get_local_subclouds_to_update_and_delete(self,
                                                  local_peer_group,
-                                                  remote_subclouds):
+                                                  remote_subclouds,
+                                                  remote_sync_status):
        local_subclouds_to_update = list()
        local_subclouds_to_delete = list()
+        any_rehome_failed = False
        remote_subclouds_dict = {remote_subcloud.get('region-name'):
                                 remote_subcloud for remote_subcloud
                                 in remote_subclouds}
@@ -92,10 +110,30 @@ class PeerGroupAuditManager(manager.Manager):
                    not utils.subcloud_is_secondary_state(
                        local_subcloud.deploy_status)):
                local_subclouds_to_update.append(local_subcloud)
+                # Sync rehome_data from the remote to the local subcloud
+                # if the remote PGA sync_status is out-of-sync once the
+                # migration completes, which indicates that the bootstrap
+                # values/address were updated on the remote site.
+                if remote_sync_status == \
+                        consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:
+                    self._sync_rehome_data(
+                        local_subcloud.id, remote_subcloud.get('rehome_data'))
+                elif remote_subcloud.get('deploy-status') in \
+                    (consts.DEPLOY_STATE_REHOME_FAILED,
+                     consts.DEPLOY_STATE_REHOME_PREP_FAILED):
+                    # Set the local subcloud to rehome-failed if the remote
+                    # is rehome-failed or rehome-prep-failed; otherwise, the
+                    # deploy_status would remain rehome-pending, which would
+                    # block the correction of the bootstrap values/address.
+                    db_api.subcloud_update(
+                        self.context, local_subcloud.id,
+                        deploy_status=consts.DEPLOY_STATE_REHOME_FAILED)
+                    any_rehome_failed = True
            else:
                local_subclouds_to_delete.append(local_subcloud)

-        return local_subclouds_to_update, local_subclouds_to_delete
+        return local_subclouds_to_update, local_subclouds_to_delete, \
+            any_rehome_failed

    def _set_local_subcloud_to_secondary(self, subcloud):
        try:
@@ -118,6 +156,9 @@ class PeerGroupAuditManager(manager.Manager):
                f"and offline subcloud [{subcloud.name}], err: {e}")
            raise e

+    def _sync_rehome_data(self, subcloud_id, rehome_data):
+        db_api.subcloud_update(self.context, subcloud_id, rehome_data=rehome_data)
+
    def audit(self, system_peer, remote_peer_group, local_peer_group):
        if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
            LOG.info("Local peer group in migrating state, quit audit")
@@ -187,14 +228,22 @@
        # set 'unmanaged+secondary' to local on same subclouds
        elif remote_peer_group.get("migration_status") == \
                consts.PEER_GROUP_MIGRATION_COMPLETE:
+            dc_client = SystemPeerManager.get_peer_dc_client(system_peer)
            remote_subclouds = \
                self._get_subclouds_by_peer_group_from_system_peer(
+                    dc_client,
                    system_peer,
                    remote_peer_group.get("peer_group_name"))
+            remote_sync_status = \
+                self._get_association_sync_status_from_peer_site(
+                    dc_client,
+                    system_peer,
+                    remote_peer_group.get("id"))

-            local_subclouds_to_update, local_subclouds_to_delete = \
+            local_subclouds_to_update, local_subclouds_to_delete, \
+                any_rehome_failed = \
                self._get_local_subclouds_to_update_and_delete(
-                    local_peer_group, remote_subclouds)
+                    local_peer_group, remote_subclouds, remote_sync_status)

            for subcloud in local_subclouds_to_update:
                self._set_local_subcloud_to_secondary(subcloud)
@@ -218,7 +267,7 @@
f"peer site, err: {e}") raise e - if local_subclouds_to_update or local_subclouds_to_delete: + if remote_peer_group.get("system_leader_id") == system_peer.peer_uuid: self._clear_or_raise_alarm(system_peer, local_peer_group, remote_peer_group) @@ -232,10 +281,13 @@ class PeerGroupAuditManager(manager.Manager): system_peer, remote_peer_group.get("peer_group_name"), None) - SystemPeerManager.update_sync_status( - self.context, system_peer, - consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, - local_peer_group, remote_peer_group) + + if not (remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC + and any_rehome_failed): + SystemPeerManager.update_sync_status( + self.context, system_peer, + consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, + local_peer_group, remote_peer_group) self.require_audit_flag = False else: # If remote peer group migration_status is 'None' diff --git a/distributedcloud/dcmanager/manager/system_peer_manager.py b/distributedcloud/dcmanager/manager/system_peer_manager.py index 2db735c35..c171b527b 100644 --- a/distributedcloud/dcmanager/manager/system_peer_manager.py +++ b/distributedcloud/dcmanager/manager/system_peer_manager.py @@ -236,10 +236,14 @@ class SystemPeerManager(manager.Manager): f"it doesn't exist.") return - is_secondary = SystemPeerManager.is_subcloud_secondary(peer_subcloud) - if not is_secondary: + if SystemPeerManager.get_subcloud_deploy_status(peer_subcloud) not in ( + consts.DEPLOY_STATE_SECONDARY_FAILED, + consts.DEPLOY_STATE_SECONDARY, + consts.DEPLOY_STATE_REHOME_FAILED, + consts.DEPLOY_STATE_REHOME_PREP_FAILED + ): LOG.info(f"Ignoring delete Peer Site Subcloud {subcloud_ref} " - f"as is not in secondary state.") + f"as is not in secondary or rehome failed state.") return dc_client.delete_subcloud(subcloud_ref) @@ -337,7 +341,10 @@ class SystemPeerManager(manager.Manager): # should be recorded as a failure. peer_subcloud_deploy_status = self.get_subcloud_deploy_status( peer_subcloud) - if peer_subcloud_deploy_status != consts.DEPLOY_STATE_SECONDARY: + if peer_subcloud_deploy_status not in \ + (consts.DEPLOY_STATE_SECONDARY, + consts.DEPLOY_STATE_REHOME_FAILED, + consts.DEPLOY_STATE_REHOME_PREP_FAILED): subcloud.msg = "Subcloud's deploy status not correct: %s" \ % peer_subcloud_deploy_status return subcloud, False @@ -424,6 +431,9 @@ class SystemPeerManager(manager.Manager): continue try: + # TODO(lzhu1): Sending requests to fetch the subcloud one by one + # should be optimized to fetch them all with one request by calling + # the "get_subcloud_list_by_peer_group" method peer_subcloud = self.get_peer_subcloud(dc_client, subcloud_name) if not peer_subcloud: LOG.info(f"Subcloud {subcloud_name} (region_name: " @@ -431,10 +441,12 @@ class SystemPeerManager(manager.Manager): valid_subclouds.append(subcloud) continue - if not self.is_subcloud_secondary(peer_subcloud): - msg = "Ignoring update Peer Site Subcloud " + \ - f"{subcloud_name} (region_name: {region_name})" + \ - " as is not in secondary state." + if not self.is_subcloud_secondary(peer_subcloud) and \ + self.get_subcloud_deploy_status(peer_subcloud) not in \ + (consts.DEPLOY_STATE_REHOME_FAILED, + consts.DEPLOY_STATE_REHOME_PREP_FAILED): + msg = (f"Subcloud {subcloud_name} is not in the right state " + f"for sync.") LOG.info(msg) error_msg[subcloud_name] = msg continue