Auditor automatic reconciliation of subclouds

This commit will automatically delete the subcloud that was removed
from the SPG on the peer site after the peer site migrated the SPG.

Conduct syntax checks for managing, unmanaging, deleting, and updating
subclouds, and for adding or removing a subcloud from an SPG.
Specifically, if the secondary site leads the current SPG, the
subcloud can be removed from the SPG only when the primary site is
unavailable.

Test Plan:
PASS - Shutdown site1, migrate SPG to site2, and verify the status of
       subclouds upon site1 recovery. As anticipated, the subclouds
       transitioned to the secondary status.
PASS - Shutdown site1, migrate SPG to site2, then remove a subcloud
       from the SPG on site2. Upon site1 recovery, verify the subclouds'
       status. As expected, the subcloud was deleted in site1
       corresponding to the removal of the subcloud from the SPG in
       site2.
PASS - Shutdown site1, migrate SPG to site2, then remove a subcloud from
       the SPG on site2 and subsequently delete it. Check the status of
       subclouds upon site1 recovery. As expected, the subcloud was
       deleted in site1 corresponding to the removal of the subcloud
       from the SPG in site2.
PASS - Removing a subcloud from the SPG in the primary site when it is
       the current leader was successful as expected.
PASS - Keep site1 online, migrate SPG to site2, remove a subcloud from
       SPG in site2 should not be allowed if site1 is online.
PASS - Adding a subcloud that is offline/unmanaged to an SPG failed in
       the primary site as expected.
PASS - Managing/unmanaging a subcloud failed when it is associated with
       an SPG.
PASS - Updating a subcloud succeeded when it is associated with an SPG
       in the primary site and the SPG is the current leader.
PASS - Updating a subcloud failed when it is associated with an SPG in
       the secondary site, as expected.

Closes-bug: 2052415
Closes-bug: 2052584

Signed-off-by: Zhang Rong(Jon) <rong.zhang@windriver.com>
Change-Id: I210e2865228d166d7f5a5b26015ab07b4d09db47
This commit is contained in:
Zhang Rong(Jon) 2024-02-05 15:09:09 +08:00
parent 9510a10345
commit 6cb974bd50
4 changed files with 203 additions and 114 deletions

View File

@ -427,7 +427,8 @@ class DcmanagerClient(base.DriverBase):
raise ValueError("subcloud_ref is required.")
url = f"{self.endpoint}/subclouds/{subcloud_ref}"
headers = {"X-Auth-Token": self.token}
headers = {"X-Auth-Token": self.token,
"User-Agent": consts.DCMANAGER_V1_HTTP_AGENT}
response = requests.delete(url, headers=headers,
timeout=self.timeout)

View File

@ -362,7 +362,8 @@ class TestDcmanagerClient(base.DCCommonTestCase):
result = client.delete_subcloud(SUBCLOUD_NAME)
mock_delete.assert_called_once_with(
FAKE_ENDPOINT + '/subclouds/' + SUBCLOUD_NAME,
headers={"X-Auth-Token": FAKE_TOKEN},
headers={"X-Auth-Token": FAKE_TOKEN,
"User-Agent": dccommon_consts.DCMANAGER_V1_HTTP_AGENT},
timeout=FAKE_TIMEOUT
)
self.assertEqual(result, '')

View File

@ -598,6 +598,100 @@ class SubcloudsController(object):
if not payload:
pecan.abort(400, _('Body required'))
peer_group = payload.get('peer_group')
# Verify the peer_group is valid
peer_group_id = None
if peer_group is not None:
# peer_group may be passed in the payload as an int or str
peer_group = str(peer_group)
# Get current site system information
local_system_uuid = utils.get_local_system().uuid
# Check if user wants to remove a subcloud
# from a subcloud-peer-group by
# setting peer_group_id as 'none',
# then we will pass 'none' string as
# the peer_group_id,
# update_subcloud() will handle it and
# Set the peer_group_id DB into None.
if peer_group.lower() == 'none':
if subcloud.peer_group_id is not None:
# Get the peer group of the subcloud
original_pgrp = db_api.subcloud_peer_group_get(
context, subcloud.peer_group_id)
# Check the system leader is not on this site
if original_pgrp.system_leader_id != local_system_uuid:
pecan.abort(400, _("Removing subcloud from a "
"peer group not led by the "
"current site is prohibited."))
# Get associations by peer group id
associations = db_api.\
peer_group_association_get_by_peer_group_id(
context, original_pgrp.id)
for association in associations:
system_peer = db_api.system_peer_get(
context, association.system_peer_id)
# If system peer is available, then does not allow
# to remove the subcloud from secondary peer group
if system_peer.availability_state == consts.\
SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE \
and original_pgrp.group_priority > 0:
pecan.abort(400, _(
"Removing subcloud from a peer group "
"associated with an available system peer "
"is prohibited."))
peer_group_id = 'none'
else:
if subcloud.peer_group_id is not None and \
str(subcloud.peer_group_id) != peer_group:
original_pgrp = utils.subcloud_peer_group_get_by_ref(
context, str(subcloud.peer_group_id))
if original_pgrp and original_pgrp.group_priority > 0:
pecan.abort(400, _(
"Cannot update subcloud to a new peer group "
"if the original peer group has non-zero "
"priority."))
pgrp = utils.subcloud_peer_group_get_by_ref(context, peer_group)
if not pgrp:
pecan.abort(400, _('Invalid peer group'))
if not utils.is_req_from_another_dc(request):
if pgrp.group_priority > 0:
pecan.abort(400, _("Cannot set the subcloud to a peer"
" group with non-zero priority."))
elif pgrp.system_leader_id != local_system_uuid:
pecan.abort(400, _("Update subcloud to a peer "
"group that is not led by the "
"current site is prohibited."))
elif not (
subcloud.deploy_status == consts.DEPLOY_STATE_DONE
and subcloud.management_state ==
dccommon_consts.MANAGEMENT_MANAGED
and subcloud.availability_status ==
dccommon_consts.AVAILABILITY_ONLINE):
pecan.abort(400, _("Only subclouds that are "
"managed and online can be "
"added to a peer group."))
peer_group_id = pgrp.id
# Subcloud can only be updated while it is managed in
# the primary site because the sync command can only be issued
# in the site where the SPG was created.
if subcloud.peer_group_id is not None and peer_group_id is None \
and not utils.is_req_from_another_dc(request):
# Get the peer group of the subcloud
original_pgrp = db_api.subcloud_peer_group_get(
context, subcloud.peer_group_id)
if original_pgrp.group_priority > 0:
pecan.abort(400, _("Subcloud update is only allowed when "
"its peer group priority value is 0."))
# Get current site system information
local_system_uuid = utils.get_local_system().uuid
# Updating a subcloud under the peer group on primary site
# that the peer group should be led by the primary site.
if original_pgrp.system_leader_id != local_system_uuid:
pecan.abort(400, _("Updating subcloud from a "
"peer group not led by the "
"current site is prohibited."))
# Rename the subcloud
new_subcloud_name = payload.get('name')
if new_subcloud_name is not None:
@ -677,7 +771,6 @@ class SubcloudsController(object):
description = payload.get('description')
location = payload.get('location')
bootstrap_values = payload.get('bootstrap_values')
peer_group = payload.get('peer_group')
bootstrap_address = payload.get('bootstrap_address')
# If the migrate flag is present we need to update the deploy status
@ -693,6 +786,11 @@ class SubcloudsController(object):
management_state not in [dccommon_consts.MANAGEMENT_UNMANAGED,
dccommon_consts.MANAGEMENT_MANAGED]:
pecan.abort(400, _('Invalid management-state'))
if management_state and subcloud.peer_group_id is not None \
and not utils.is_req_from_another_dc(request):
pecan.abort(400, _('Cannot update the management state of a '
'subcloud that is associated with '
'a peer group.'))
force_flag = payload.get('force')
if force_flag is not None:
@ -717,41 +815,6 @@ class SubcloudsController(object):
exceptions.SubcloudGroupNotFound):
pecan.abort(400, _('Invalid group'))
# Verify the peer_group is valid
peer_group_id = None
if peer_group is not None:
# peer_group may be passed in the payload as an int or str
peer_group = str(peer_group)
# Check if user wants to remove a subcloud
# from a subcloud-peer-group by
# setting peer_group_id as 'none',
# then we will pass 'none' string as
# the peer_group_id,
# update_subcloud() will handle it and
# Set the peer_group_id DB into None.
if peer_group.lower() == 'none':
peer_group_id = 'none'
else:
pgrp = utils.subcloud_peer_group_get_by_ref(context, peer_group)
if not pgrp:
pecan.abort(400, _('Invalid peer group'))
if not utils.is_req_from_another_dc(request):
if pgrp.group_priority > 0:
pecan.abort(400, _("Cannot set the subcloud to a peer"
" group with non-zero priority."))
elif not (
subcloud.deploy_status in [
consts.DEPLOY_STATE_DONE,
consts.PRESTAGE_STATE_COMPLETE
] and subcloud.management_state ==
dccommon_consts.MANAGEMENT_MANAGED
and subcloud.availability_status ==
dccommon_consts.AVAILABILITY_ONLINE):
pecan.abort(400, _("Only subclouds that are "
"managed and online can be "
"added to a peer group."))
peer_group_id = pgrp.id
if consts.INSTALL_VALUES in payload:
# install_values of secondary subclouds are validated on
# peer site
@ -939,6 +1002,20 @@ class SubcloudsController(object):
pecan.abort(404, _('Subcloud not found'))
subcloud_id = subcloud.id
peer_group_id = subcloud.peer_group_id
subcloud_management_state = subcloud.management_state
# Check if the subcloud is "managed" status
if subcloud_management_state == dccommon_consts.MANAGEMENT_MANAGED \
and not utils.is_req_from_another_dc(request):
pecan.abort(400, _('Cannot delete a subcloud that is "managed" '
'status'))
# Check if the subcloud is part of a peer group
if peer_group_id is not None and \
not utils.is_req_from_another_dc(request):
pecan.abort(400, _('Cannot delete a subcloud that is part of '
'a peer group on this site'))
try:
# Ask dcmanager-manager to delete the subcloud.

View File

@ -51,9 +51,9 @@ class PeerGroupAuditManager(manager.Manager):
peer_group_name)
return subclouds
except Exception:
LOG.exception("Failed to get subclouds of peer group %s "
"from DC: %s" %
(peer_group_name, system_peer.peer_name))
LOG.exception(f"Failed to get subclouds of peer group "
f"{peer_group_name} from DC: "
f"{system_peer.peer_name}")
def _update_remote_peer_group_migration_status(self,
system_peer,
@ -65,37 +65,58 @@ class PeerGroupAuditManager(manager.Manager):
}
dc_client.update_subcloud_peer_group(peer_group_name,
**peer_group_kwargs)
LOG.info("Updated Subcloud Peer Group %s on "
"peer site %s, set migration_status to: %s" %
(peer_group_name, system_peer.peer_name, migration_status))
LOG.info(f"Updated Subcloud Peer Group {peer_group_name} on "
f"peer site {system_peer.peer_name}, set migration_status "
f"to: {migration_status}")
def _get_local_subclouds_to_update(self,
local_peer_group,
remote_subclouds):
def _get_local_subclouds_to_update_and_delete(self,
local_peer_group,
remote_subclouds):
local_subclouds_to_update = list()
remote_managed_subcloud_region_names = list()
local_subclouds_to_delete = list()
remote_subclouds_dict = {remote_subcloud.get('region-name'):
remote_subcloud for remote_subcloud
in remote_subclouds}
local_subclouds = db_api.subcloud_get_for_peer_group(
self.context, local_peer_group.id)
# get the 'managed+online' remote subclouds
for remote_subcloud in remote_subclouds:
if (remote_subcloud.get('management-state') ==
dccommon_consts.MANAGEMENT_MANAGED and
remote_subcloud.get('availability-status') ==
dccommon_consts.AVAILABILITY_ONLINE):
remote_managed_subcloud_region_names.append(
remote_subcloud.get('region-name'))
# Compare with the 'non-secondary' local subclouds
for local_subcloud in local_subclouds:
if local_subcloud.region_name in \
remote_managed_subcloud_region_names \
and not utils.subcloud_is_secondary_state(
local_subcloud.deploy_status):
remote_subcloud = remote_subclouds_dict.get(
local_subcloud.region_name)
if remote_subcloud:
# Check if the remote subcloud meets the conditions for update
# if it is 'managed' and the local subcloud is not
# in 'secondary' status
if (remote_subcloud.get('management-state') ==
dccommon_consts.MANAGEMENT_MANAGED and
not utils.subcloud_is_secondary_state(
local_subcloud.deploy_status)):
local_subclouds_to_update.append(local_subcloud)
else:
local_subclouds_to_delete.append(local_subcloud)
local_subclouds_to_update.append(local_subcloud)
return local_subclouds_to_update, local_subclouds_to_delete
return local_subclouds_to_update
def _set_local_subcloud_to_secondary(self, subcloud):
    """Transition a local subcloud into the 'secondary' deploy status.

    The subcloud is first unmanaged (only if it is currently managed)
    and then has its deploy status set to DEPLOY_STATE_SECONDARY.

    :param subcloud: local subcloud DB object to transition
    :raises Exception: re-raises any failure from update_subcloud so
        the caller can abort the audit step
    """
    try:
        LOG.info("Set local subcloud %s to secondary" % subcloud.name)
        # update_subcloud raises an exception when asked to unmanage
        # a subcloud that is already in the 'unmanaged' state, so only
        # unmanage when the subcloud is currently managed.
        if subcloud.management_state != \
                dccommon_consts.MANAGEMENT_UNMANAGED:
            self.subcloud_manager.update_subcloud(
                self.context,
                subcloud.id,
                management_state=dccommon_consts.
                MANAGEMENT_UNMANAGED)
        self.subcloud_manager.update_subcloud(
            self.context,
            subcloud.id,
            deploy_status=consts.DEPLOY_STATE_SECONDARY)
    except Exception as e:
        LOG.exception(f"Failed to update local non-secondary "
                      f"and offline subcloud [{subcloud.name}], err: {e}")
        raise e
def audit(self, system_peer, remote_peer_group, local_peer_group):
if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
@ -120,9 +141,9 @@ class PeerGroupAuditManager(manager.Manager):
if remote_peer_group.get("migration_status") == \
consts.PEER_GROUP_MIGRATING:
# Unmanaged all local subclouds of peer group
LOG.info("Unmanaged all local subclouds of peer group %s "
"since remote is in migrating state" %
local_peer_group.peer_group_name)
LOG.info(f"Unmanaged all local subclouds of peer group "
f"{local_peer_group.peer_group_name} "
f"since remote is in migrating state")
subclouds = db_api.subcloud_get_for_peer_group(self.context,
local_peer_group.id)
for subcloud in subclouds:
@ -152,8 +173,8 @@ class PeerGroupAuditManager(manager.Manager):
subcloud.id,
deploy_status=consts.DEPLOY_STATE_REHOME_PENDING)
except Exception as e:
LOG.exception("Fail to unmanage local subcloud %s, err: "
"%s" % (subcloud.name, e))
LOG.exception(f"Fail to unmanage local subcloud "
f"{subcloud.name}, err: {e}")
raise e
self.require_audit_flag = False
@ -167,39 +188,29 @@ class PeerGroupAuditManager(manager.Manager):
system_peer,
remote_peer_group.get("peer_group_name"))
if not remote_subclouds:
LOG.error("No subclouds in remote DC:%s's peer group %s" %
(system_peer.peer_name,
remote_peer_group.get("peer_group_name")))
return
local_subclouds_to_update = \
self._get_local_subclouds_to_update(local_peer_group,
remote_subclouds)
local_subclouds_to_update, local_subclouds_to_delete = \
self._get_local_subclouds_to_update_and_delete(
local_peer_group, remote_subclouds)
for subcloud in local_subclouds_to_update:
self._set_local_subcloud_to_secondary(subcloud)
# Change the local subcloud not exist on peer site's SPG to
# secondary status then delete it
for subcloud in local_subclouds_to_delete:
self._set_local_subcloud_to_secondary(subcloud)
try:
LOG.info("Set secondary to local subcloud %s" %
subcloud.name)
# There will be an exception when unmanage
# a subcloud in 'unamaged' state.
if subcloud.management_state != \
dccommon_consts.MANAGEMENT_UNMANAGED:
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
management_state=dccommon_consts.
MANAGEMENT_UNMANAGED)
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
deploy_status=consts.DEPLOY_STATE_SECONDARY)
self.subcloud_manager.delete_subcloud(
self.context, subcloud.id)
LOG.info(f"Deleted local subcloud {subcloud.name}")
except Exception as e:
LOG.exception("Failed to update local non-secondary "
"and offline subcloud [%s], err: %s" %
(subcloud.name, e))
LOG.exception(f"Failed to delete local subcloud "
f"[{subcloud.name}] that does not exist "
f"under the same subcloud_peer_group on "
f"peer site, err: {e}")
raise e
if local_subclouds_to_update:
if local_subclouds_to_update or local_subclouds_to_delete:
self._clear_or_raise_alarm(system_peer,
local_peer_group,
remote_peer_group)
@ -229,10 +240,10 @@ class PeerGroupAuditManager(manager.Manager):
entity_instance_id = "peer_group=%s,peer=%s" % \
(local_peer_group.peer_group_name, system_peer.peer_uuid)
if local_peer_group.group_priority < remote_peer_group.get('group_priority'):
LOG.warning("Alarm: local subcloud peer group [%s] "
"is managed by remote system [%s]" %
(local_peer_group.peer_group_name,
system_peer.peer_name))
LOG.warning("Alarm: local subcloud peer group ["
f"{local_peer_group.peer_group_name}] "
f"is managed by remote system ["
f"{system_peer.peer_name}]")
try:
fault = fm_api.Fault(
alarm_id=fm_const.
@ -266,15 +277,15 @@ class PeerGroupAuditManager(manager.Manager):
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
entity_instance_id)
if fault:
LOG.info("Clear alarm: %s" % entity_instance_id)
LOG.info(f"Clear alarm: {entity_instance_id}")
self.fm_api.clear_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
entity_instance_id)
except Exception:
LOG.exception(
"Problem clearing fault [%s], alarm_id=%s" %
(entity_instance_id,
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED))
f"Problem clearing fault [{entity_instance_id}], "
f"alarm_id="
f"{fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED}")
def _do_audit(self, system_peer, remote_peer_group, local_peer_group):
with self.thread_lock:
@ -286,15 +297,14 @@ class PeerGroupAuditManager(manager.Manager):
def stop(self):
if self.thread:
self.thread.join()
LOG.info("stopped peer group %s audit thread" % self.peer_group_id)
LOG.info(f"stopped peer group {self.peer_group_id} audit thread")
else:
LOG.info("No peer group %s audit thread to stop" %
self.peer_group_id)
LOG.info(f"No peer group {self.peer_group_id} audit thread to stop")
def start(self, system_peer, remote_peer_group, local_peer_group):
if self.thread_lock.locked():
LOG.warning('Audit thread for %s has already started' %
local_peer_group.peer_group_name)
LOG.warning(f"Audit thread for {local_peer_group.peer_group_name} "
f"has already started")
else:
self.thread = threading.Thread(
target=self._do_audit,
@ -305,8 +315,8 @@ class PeerGroupAuditManager(manager.Manager):
system_peer,
remote_peer_group,
local_peer_group):
LOG.info("Audit peer group [%s] with remote system %s" %
(local_peer_group.peer_group_name, system_peer.peer_name))
LOG.info(f"Audit peer group [{local_peer_group.peer_group_name}] "
f"with remote system {system_peer.peer_name}")
self.start(system_peer, remote_peer_group, local_peer_group)
@staticmethod
@ -332,6 +342,6 @@ class PeerGroupAuditManager(manager.Manager):
if response:
return response
except Exception:
LOG.exception("Failed to send audit request for peer group %s "
"to DC: %s" %
(peer_group.peer_group_name, system.peer_name))
LOG.exception("Failed to send audit request for peer group "
f"{peer_group.peer_group_name} to DC: "
f"{system.peer_name}")