Merge "Auditor automatic reconciliation of subclouds"

This commit is contained in:
Zuul 2024-03-05 21:31:09 +00:00 committed by Gerrit Code Review
commit 75296e1caa
4 changed files with 203 additions and 114 deletions

View File

@ -427,7 +427,8 @@ class DcmanagerClient(base.DriverBase):
raise ValueError("subcloud_ref is required.") raise ValueError("subcloud_ref is required.")
url = f"{self.endpoint}/subclouds/{subcloud_ref}" url = f"{self.endpoint}/subclouds/{subcloud_ref}"
headers = {"X-Auth-Token": self.token} headers = {"X-Auth-Token": self.token,
"User-Agent": consts.DCMANAGER_V1_HTTP_AGENT}
response = requests.delete(url, headers=headers, response = requests.delete(url, headers=headers,
timeout=self.timeout) timeout=self.timeout)

View File

@ -362,7 +362,8 @@ class TestDcmanagerClient(base.DCCommonTestCase):
result = client.delete_subcloud(SUBCLOUD_NAME) result = client.delete_subcloud(SUBCLOUD_NAME)
mock_delete.assert_called_once_with( mock_delete.assert_called_once_with(
FAKE_ENDPOINT + '/subclouds/' + SUBCLOUD_NAME, FAKE_ENDPOINT + '/subclouds/' + SUBCLOUD_NAME,
headers={"X-Auth-Token": FAKE_TOKEN}, headers={"X-Auth-Token": FAKE_TOKEN,
"User-Agent": dccommon_consts.DCMANAGER_V1_HTTP_AGENT},
timeout=FAKE_TIMEOUT timeout=FAKE_TIMEOUT
) )
self.assertEqual(result, '') self.assertEqual(result, '')

View File

@ -648,6 +648,100 @@ class SubcloudsController(object):
if not payload: if not payload:
pecan.abort(400, _('Body required')) pecan.abort(400, _('Body required'))
peer_group = payload.get('peer_group')
# Verify the peer_group is valid
peer_group_id = None
if peer_group is not None:
# peer_group may be passed in the payload as an int or str
peer_group = str(peer_group)
# Get current site system information
local_system_uuid = utils.get_local_system().uuid
# Check if user wants to remove a subcloud
# from a subcloud-peer-group by
# setting peer_group_id as 'none',
# then we will pass 'none' string as
# the peer_group_id,
# update_subcloud() will handle it and
# set the peer_group_id field in the DB to None.
if peer_group.lower() == 'none':
if subcloud.peer_group_id is not None:
# Get the peer group of the subcloud
original_pgrp = db_api.subcloud_peer_group_get(
context, subcloud.peer_group_id)
# Check the system leader is not on this site
if original_pgrp.system_leader_id != local_system_uuid:
pecan.abort(400, _("Removing subcloud from a "
"peer group not led by the "
"current site is prohibited."))
# Get associations by peer group id
associations = db_api.\
peer_group_association_get_by_peer_group_id(
context, original_pgrp.id)
for association in associations:
system_peer = db_api.system_peer_get(
context, association.system_peer_id)
# If the system peer is available, do not allow
# removing the subcloud from a secondary peer group
if system_peer.availability_state == consts.\
SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE \
and original_pgrp.group_priority > 0:
pecan.abort(400, _(
"Removing subcloud from a peer group "
"associated with an available system peer "
"is prohibited."))
peer_group_id = 'none'
else:
if subcloud.peer_group_id is not None and \
str(subcloud.peer_group_id) != peer_group:
original_pgrp = utils.subcloud_peer_group_get_by_ref(
context, str(subcloud.peer_group_id))
if original_pgrp and original_pgrp.group_priority > 0:
pecan.abort(400, _(
"Cannot update subcloud to a new peer group "
"if the original peer group has non-zero "
"priority."))
pgrp = utils.subcloud_peer_group_get_by_ref(context, peer_group)
if not pgrp:
pecan.abort(400, _('Invalid peer group'))
if not utils.is_req_from_another_dc(request):
if pgrp.group_priority > 0:
pecan.abort(400, _("Cannot set the subcloud to a peer"
" group with non-zero priority."))
elif pgrp.system_leader_id != local_system_uuid:
pecan.abort(400, _("Update subcloud to a peer "
"group that is not led by the "
"current site is prohibited."))
elif not (
subcloud.deploy_status == consts.DEPLOY_STATE_DONE
and subcloud.management_state ==
dccommon_consts.MANAGEMENT_MANAGED
and subcloud.availability_status ==
dccommon_consts.AVAILABILITY_ONLINE):
pecan.abort(400, _("Only subclouds that are "
"managed and online can be "
"added to a peer group."))
peer_group_id = pgrp.id
# Subcloud can only be updated while it is managed in
# the primary site because the sync command can only be issued
# in the site where the SPG was created.
if subcloud.peer_group_id is not None and peer_group_id is None \
and not utils.is_req_from_another_dc(request):
# Get the peer group of the subcloud
original_pgrp = db_api.subcloud_peer_group_get(
context, subcloud.peer_group_id)
if original_pgrp.group_priority > 0:
pecan.abort(400, _("Subcloud update is only allowed when "
"its peer group priority value is 0."))
# Get current site system information
local_system_uuid = utils.get_local_system().uuid
# Updating a subcloud under the peer group on primary site
# that the peer group should be led by the primary site.
if original_pgrp.system_leader_id != local_system_uuid:
pecan.abort(400, _("Updating subcloud from a "
"peer group not led by the "
"current site is prohibited."))
# Rename the subcloud # Rename the subcloud
new_subcloud_name = payload.get('name') new_subcloud_name = payload.get('name')
if new_subcloud_name is not None: if new_subcloud_name is not None:
@ -736,7 +830,6 @@ class SubcloudsController(object):
description = payload.get('description') description = payload.get('description')
location = payload.get('location') location = payload.get('location')
bootstrap_values = payload.get('bootstrap_values') bootstrap_values = payload.get('bootstrap_values')
peer_group = payload.get('peer_group')
bootstrap_address = payload.get('bootstrap_address') bootstrap_address = payload.get('bootstrap_address')
# If the migrate flag is present we need to update the deploy status # If the migrate flag is present we need to update the deploy status
@ -752,6 +845,11 @@ class SubcloudsController(object):
management_state not in [dccommon_consts.MANAGEMENT_UNMANAGED, management_state not in [dccommon_consts.MANAGEMENT_UNMANAGED,
dccommon_consts.MANAGEMENT_MANAGED]: dccommon_consts.MANAGEMENT_MANAGED]:
pecan.abort(400, _('Invalid management-state')) pecan.abort(400, _('Invalid management-state'))
if management_state and subcloud.peer_group_id is not None \
and not utils.is_req_from_another_dc(request):
pecan.abort(400, _('Cannot update the management state of a '
'subcloud that is associated with '
'a peer group.'))
force_flag = payload.get('force') force_flag = payload.get('force')
if force_flag is not None: if force_flag is not None:
@ -776,41 +874,6 @@ class SubcloudsController(object):
exceptions.SubcloudGroupNotFound): exceptions.SubcloudGroupNotFound):
pecan.abort(400, _('Invalid group')) pecan.abort(400, _('Invalid group'))
# Verify the peer_group is valid
peer_group_id = None
if peer_group is not None:
# peer_group may be passed in the payload as an int or str
peer_group = str(peer_group)
# Check if user wants to remove a subcloud
# from a subcloud-peer-group by
# setting peer_group_id as 'none',
# then we will pass 'none' string as
# the peer_group_id,
# update_subcloud() will handle it and
# Set the peer_group_id DB into None.
if peer_group.lower() == 'none':
peer_group_id = 'none'
else:
pgrp = utils.subcloud_peer_group_get_by_ref(context, peer_group)
if not pgrp:
pecan.abort(400, _('Invalid peer group'))
if not utils.is_req_from_another_dc(request):
if pgrp.group_priority > 0:
pecan.abort(400, _("Cannot set the subcloud to a peer"
" group with non-zero priority."))
elif not (
subcloud.deploy_status in [
consts.DEPLOY_STATE_DONE,
consts.PRESTAGE_STATE_COMPLETE
] and subcloud.management_state ==
dccommon_consts.MANAGEMENT_MANAGED
and subcloud.availability_status ==
dccommon_consts.AVAILABILITY_ONLINE):
pecan.abort(400, _("Only subclouds that are "
"managed and online can be "
"added to a peer group."))
peer_group_id = pgrp.id
if consts.INSTALL_VALUES in payload: if consts.INSTALL_VALUES in payload:
# install_values of secondary subclouds are validated on # install_values of secondary subclouds are validated on
# peer site # peer site
@ -998,6 +1061,20 @@ class SubcloudsController(object):
pecan.abort(404, _('Subcloud not found')) pecan.abort(404, _('Subcloud not found'))
subcloud_id = subcloud.id subcloud_id = subcloud.id
peer_group_id = subcloud.peer_group_id
subcloud_management_state = subcloud.management_state
# Check if the subcloud is "managed" status
if subcloud_management_state == dccommon_consts.MANAGEMENT_MANAGED \
and not utils.is_req_from_another_dc(request):
pecan.abort(400, _('Cannot delete a subcloud that is "managed" '
'status'))
# Check if the subcloud is part of a peer group
if peer_group_id is not None and \
not utils.is_req_from_another_dc(request):
pecan.abort(400, _('Cannot delete a subcloud that is part of '
'a peer group on this site'))
try: try:
# Ask dcmanager-manager to delete the subcloud. # Ask dcmanager-manager to delete the subcloud.

View File

@ -51,9 +51,9 @@ class PeerGroupAuditManager(manager.Manager):
peer_group_name) peer_group_name)
return subclouds return subclouds
except Exception: except Exception:
LOG.exception("Failed to get subclouds of peer group %s " LOG.exception(f"Failed to get subclouds of peer group "
"from DC: %s" % f"{peer_group_name} from DC: "
(peer_group_name, system_peer.peer_name)) f"{system_peer.peer_name}")
def _update_remote_peer_group_migration_status(self, def _update_remote_peer_group_migration_status(self,
system_peer, system_peer,
@ -65,37 +65,58 @@ class PeerGroupAuditManager(manager.Manager):
} }
dc_client.update_subcloud_peer_group(peer_group_name, dc_client.update_subcloud_peer_group(peer_group_name,
**peer_group_kwargs) **peer_group_kwargs)
LOG.info("Updated Subcloud Peer Group %s on " LOG.info(f"Updated Subcloud Peer Group {peer_group_name} on "
"peer site %s, set migration_status to: %s" % f"peer site {system_peer.peer_name}, set migration_status "
(peer_group_name, system_peer.peer_name, migration_status)) f"to: {migration_status}")
def _get_local_subclouds_to_update(self, def _get_local_subclouds_to_update_and_delete(self,
local_peer_group, local_peer_group,
remote_subclouds): remote_subclouds):
local_subclouds_to_update = list() local_subclouds_to_update = list()
remote_managed_subcloud_region_names = list() local_subclouds_to_delete = list()
remote_subclouds_dict = {remote_subcloud.get('region-name'):
remote_subcloud for remote_subcloud
in remote_subclouds}
local_subclouds = db_api.subcloud_get_for_peer_group( local_subclouds = db_api.subcloud_get_for_peer_group(
self.context, local_peer_group.id) self.context, local_peer_group.id)
# get the 'managed+online' remote subclouds
for remote_subcloud in remote_subclouds:
if (remote_subcloud.get('management-state') ==
dccommon_consts.MANAGEMENT_MANAGED and
remote_subcloud.get('availability-status') ==
dccommon_consts.AVAILABILITY_ONLINE):
remote_managed_subcloud_region_names.append(
remote_subcloud.get('region-name'))
# Compare with the 'non-secondary' local subclouds
for local_subcloud in local_subclouds: for local_subcloud in local_subclouds:
if local_subcloud.region_name in \ remote_subcloud = remote_subclouds_dict.get(
remote_managed_subcloud_region_names \ local_subcloud.region_name)
and not utils.subcloud_is_secondary_state( if remote_subcloud:
local_subcloud.deploy_status): # Check if the remote subcloud meets the conditions for update
# if it is 'managed' and the local subcloud is not
# in 'secondary' status
if (remote_subcloud.get('management-state') ==
dccommon_consts.MANAGEMENT_MANAGED and
not utils.subcloud_is_secondary_state(
local_subcloud.deploy_status)):
local_subclouds_to_update.append(local_subcloud)
else:
local_subclouds_to_delete.append(local_subcloud)
local_subclouds_to_update.append(local_subcloud) return local_subclouds_to_update, local_subclouds_to_delete
return local_subclouds_to_update def _set_local_subcloud_to_secondary(self, subcloud):
try:
LOG.info("Set local subcloud %s to secondary" % subcloud.name)
# There will be an exception when unmanaging
# a subcloud in 'unmanaged' state.
if subcloud.management_state != \
dccommon_consts.MANAGEMENT_UNMANAGED:
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
management_state=dccommon_consts.
MANAGEMENT_UNMANAGED)
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
deploy_status=consts.DEPLOY_STATE_SECONDARY)
except Exception as e:
LOG.exception(f"Failed to update local non-secondary "
f"and offline subcloud [{subcloud.name}], err: {e}")
raise e
def audit(self, system_peer, remote_peer_group, local_peer_group): def audit(self, system_peer, remote_peer_group, local_peer_group):
if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING: if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
@ -120,9 +141,9 @@ class PeerGroupAuditManager(manager.Manager):
if remote_peer_group.get("migration_status") == \ if remote_peer_group.get("migration_status") == \
consts.PEER_GROUP_MIGRATING: consts.PEER_GROUP_MIGRATING:
# Unmanaged all local subclouds of peer group # Unmanaged all local subclouds of peer group
LOG.info("Unmanaged all local subclouds of peer group %s " LOG.info(f"Unmanaged all local subclouds of peer group "
"since remote is in migrating state" % f"{local_peer_group.peer_group_name} "
local_peer_group.peer_group_name) f"since remote is in migrating state")
subclouds = db_api.subcloud_get_for_peer_group(self.context, subclouds = db_api.subcloud_get_for_peer_group(self.context,
local_peer_group.id) local_peer_group.id)
for subcloud in subclouds: for subcloud in subclouds:
@ -152,8 +173,8 @@ class PeerGroupAuditManager(manager.Manager):
subcloud.id, subcloud.id,
deploy_status=consts.DEPLOY_STATE_REHOME_PENDING) deploy_status=consts.DEPLOY_STATE_REHOME_PENDING)
except Exception as e: except Exception as e:
LOG.exception("Fail to unmanage local subcloud %s, err: " LOG.exception(f"Fail to unmanage local subcloud "
"%s" % (subcloud.name, e)) f"{subcloud.name}, err: {e}")
raise e raise e
self.require_audit_flag = False self.require_audit_flag = False
@ -167,39 +188,29 @@ class PeerGroupAuditManager(manager.Manager):
system_peer, system_peer,
remote_peer_group.get("peer_group_name")) remote_peer_group.get("peer_group_name"))
if not remote_subclouds: local_subclouds_to_update, local_subclouds_to_delete = \
LOG.error("No subclouds in remote DC:%s's peer group %s" % self._get_local_subclouds_to_update_and_delete(
(system_peer.peer_name, local_peer_group, remote_subclouds)
remote_peer_group.get("peer_group_name")))
return
local_subclouds_to_update = \
self._get_local_subclouds_to_update(local_peer_group,
remote_subclouds)
for subcloud in local_subclouds_to_update: for subcloud in local_subclouds_to_update:
self._set_local_subcloud_to_secondary(subcloud)
# Change local subclouds that do not exist in the peer
# site's SPG to secondary status, then delete them
for subcloud in local_subclouds_to_delete:
self._set_local_subcloud_to_secondary(subcloud)
try: try:
LOG.info("Set secondary to local subcloud %s" % self.subcloud_manager.delete_subcloud(
subcloud.name) self.context, subcloud.id)
# There will be an exception when unmanage LOG.info(f"Deleted local subcloud {subcloud.name}")
# a subcloud in 'unmanaged' state.
if subcloud.management_state != \
dccommon_consts.MANAGEMENT_UNMANAGED:
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
management_state=dccommon_consts.
MANAGEMENT_UNMANAGED)
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
deploy_status=consts.DEPLOY_STATE_SECONDARY)
except Exception as e: except Exception as e:
LOG.exception("Failed to update local non-secondary " LOG.exception(f"Failed to delete local subcloud "
"and offline subcloud [%s], err: %s" % f"[{subcloud.name}] that does not exist "
(subcloud.name, e)) f"under the same subcloud_peer_group on "
f"peer site, err: {e}")
raise e raise e
if local_subclouds_to_update: if local_subclouds_to_update or local_subclouds_to_delete:
self._clear_or_raise_alarm(system_peer, self._clear_or_raise_alarm(system_peer,
local_peer_group, local_peer_group,
remote_peer_group) remote_peer_group)
@ -229,10 +240,10 @@ class PeerGroupAuditManager(manager.Manager):
entity_instance_id = "peer_group=%s,peer=%s" % \ entity_instance_id = "peer_group=%s,peer=%s" % \
(local_peer_group.peer_group_name, system_peer.peer_uuid) (local_peer_group.peer_group_name, system_peer.peer_uuid)
if local_peer_group.group_priority < remote_peer_group.get('group_priority'): if local_peer_group.group_priority < remote_peer_group.get('group_priority'):
LOG.warning("Alarm: local subcloud peer group [%s] " LOG.warning("Alarm: local subcloud peer group ["
"is managed by remote system [%s]" % f"{local_peer_group.peer_group_name}] "
(local_peer_group.peer_group_name, f"is managed by remote system ["
system_peer.peer_name)) f"{system_peer.peer_name}]")
try: try:
fault = fm_api.Fault( fault = fm_api.Fault(
alarm_id=fm_const. alarm_id=fm_const.
@ -266,15 +277,15 @@ class PeerGroupAuditManager(manager.Manager):
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED, fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
entity_instance_id) entity_instance_id)
if fault: if fault:
LOG.info("Clear alarm: %s" % entity_instance_id) LOG.info(f"Clear alarm: {entity_instance_id}")
self.fm_api.clear_fault( self.fm_api.clear_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED, fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
entity_instance_id) entity_instance_id)
except Exception: except Exception:
LOG.exception( LOG.exception(
"Problem clearing fault [%s], alarm_id=%s" % f"Problem clearing fault [{entity_instance_id}], "
(entity_instance_id, f"alarm_id="
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED)) f"{fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED}")
def _do_audit(self, system_peer, remote_peer_group, local_peer_group): def _do_audit(self, system_peer, remote_peer_group, local_peer_group):
with self.thread_lock: with self.thread_lock:
@ -286,15 +297,14 @@ class PeerGroupAuditManager(manager.Manager):
def stop(self): def stop(self):
if self.thread: if self.thread:
self.thread.join() self.thread.join()
LOG.info("stopped peer group %s audit thread" % self.peer_group_id) LOG.info(f"stopped peer group {self.peer_group_id} audit thread")
else: else:
LOG.info("No peer group %s audit thread to stop" % LOG.info(f"No peer group {self.peer_group_id} audit thread to stop")
self.peer_group_id)
def start(self, system_peer, remote_peer_group, local_peer_group): def start(self, system_peer, remote_peer_group, local_peer_group):
if self.thread_lock.locked(): if self.thread_lock.locked():
LOG.warning('Audit thread for %s has already started' % LOG.warning(f"Audit thread for {local_peer_group.peer_group_name} "
local_peer_group.peer_group_name) f"has already started")
else: else:
self.thread = threading.Thread( self.thread = threading.Thread(
target=self._do_audit, target=self._do_audit,
@ -305,8 +315,8 @@ class PeerGroupAuditManager(manager.Manager):
system_peer, system_peer,
remote_peer_group, remote_peer_group,
local_peer_group): local_peer_group):
LOG.info("Audit peer group [%s] with remote system %s" % LOG.info(f"Audit peer group [{local_peer_group.peer_group_name}] "
(local_peer_group.peer_group_name, system_peer.peer_name)) f"with remote system {system_peer.peer_name}")
self.start(system_peer, remote_peer_group, local_peer_group) self.start(system_peer, remote_peer_group, local_peer_group)
@staticmethod @staticmethod
@ -332,6 +342,6 @@ class PeerGroupAuditManager(manager.Manager):
if response: if response:
return response return response
except Exception: except Exception:
LOG.exception("Failed to send audit request for peer group %s " LOG.exception("Failed to send audit request for peer group "
"to DC: %s" % f"{peer_group.peer_group_name} to DC: "
(peer_group.peer_group_name, system.peer_name)) f"{system.peer_name}")