Merge "Auditor automatic reconciliation of subclouds"

Zuul 2024-03-05 21:31:09 +00:00 committed by Gerrit Code Review
commit 75296e1caa
4 changed files with 203 additions and 114 deletions

View File

@ -427,7 +427,8 @@ class DcmanagerClient(base.DriverBase):
raise ValueError("subcloud_ref is required.")
url = f"{self.endpoint}/subclouds/{subcloud_ref}"
headers = {"X-Auth-Token": self.token}
headers = {"X-Auth-Token": self.token,
"User-Agent": consts.DCMANAGER_V1_HTTP_AGENT}
response = requests.delete(url, headers=headers,
timeout=self.timeout)
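
For context, the added User-Agent header is what lets the receiving dcmanager API recognize calls that originate from a peer system controller. A minimal sketch of how a check such as utils.is_req_from_another_dc could consume it is shown below; the helper name, import path, and header access are assumptions for illustration, not this repository's implementation.

from dccommon import consts as dccommon_consts

def request_is_from_peer_dc(request):
    # Hypothetical helper: calls issued through DcmanagerClient carry the
    # dcmanager v1 agent string, while local API/CLI requests normally do not.
    return request.headers.get("User-Agent") == \
        dccommon_consts.DCMANAGER_V1_HTTP_AGENT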

View File

@ -362,7 +362,8 @@ class TestDcmanagerClient(base.DCCommonTestCase):
result = client.delete_subcloud(SUBCLOUD_NAME)
mock_delete.assert_called_once_with(
FAKE_ENDPOINT + '/subclouds/' + SUBCLOUD_NAME,
headers={"X-Auth-Token": FAKE_TOKEN},
headers={"X-Auth-Token": FAKE_TOKEN,
"User-Agent": dccommon_consts.DCMANAGER_V1_HTTP_AGENT},
timeout=FAKE_TIMEOUT
)
self.assertEqual(result, '')

View File

@ -648,6 +648,100 @@ class SubcloudsController(object):
if not payload:
pecan.abort(400, _('Body required'))
peer_group = payload.get('peer_group')
# Verify the peer_group is valid
peer_group_id = None
if peer_group is not None:
# peer_group may be passed in the payload as an int or str
peer_group = str(peer_group)
# Get current site system information
local_system_uuid = utils.get_local_system().uuid
# If the user wants to remove a subcloud from a
# subcloud-peer-group by setting peer_group to 'none',
# pass the string 'none' as the peer_group_id;
# update_subcloud() handles it and sets the
# peer_group_id DB field to None.
if peer_group.lower() == 'none':
if subcloud.peer_group_id is not None:
# Get the peer group of the subcloud
original_pgrp = db_api.subcloud_peer_group_get(
context, subcloud.peer_group_id)
# The peer group must be led by the current site
if original_pgrp.system_leader_id != local_system_uuid:
pecan.abort(400, _("Removing subcloud from a "
"peer group not led by the "
"current site is prohibited."))
# Get associations by peer group id
associations = db_api.\
peer_group_association_get_by_peer_group_id(
context, original_pgrp.id)
for association in associations:
system_peer = db_api.system_peer_get(
context, association.system_peer_id)
# If the system peer is available, do not allow
# removing the subcloud from a secondary peer group
if system_peer.availability_state == consts.\
SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE \
and original_pgrp.group_priority > 0:
pecan.abort(400, _(
"Removing subcloud from a peer group "
"associated with an available system peer "
"is prohibited."))
peer_group_id = 'none'
else:
if subcloud.peer_group_id is not None and \
str(subcloud.peer_group_id) != peer_group:
original_pgrp = utils.subcloud_peer_group_get_by_ref(
context, str(subcloud.peer_group_id))
if original_pgrp and original_pgrp.group_priority > 0:
pecan.abort(400, _(
"Cannot update subcloud to a new peer group "
"if the original peer group has non-zero "
"priority."))
pgrp = utils.subcloud_peer_group_get_by_ref(context, peer_group)
if not pgrp:
pecan.abort(400, _('Invalid peer group'))
if not utils.is_req_from_another_dc(request):
if pgrp.group_priority > 0:
pecan.abort(400, _("Cannot set the subcloud to a peer"
" group with non-zero priority."))
elif pgrp.system_leader_id != local_system_uuid:
pecan.abort(400, _("Update subcloud to a peer "
"group that is not led by the "
"current site is prohibited."))
elif not (
subcloud.deploy_status == consts.DEPLOY_STATE_DONE
and subcloud.management_state ==
dccommon_consts.MANAGEMENT_MANAGED
and subcloud.availability_status ==
dccommon_consts.AVAILABILITY_ONLINE):
pecan.abort(400, _("Only subclouds that are "
"managed and online can be "
"added to a peer group."))
peer_group_id = pgrp.id
# Subcloud can only be updated while it is managed in
# the primary site because the sync command can only be issued
# in the site where the SPG was created.
if subcloud.peer_group_id is not None and peer_group_id is None \
and not utils.is_req_from_another_dc(request):
# Get the peer group of the subcloud
original_pgrp = db_api.subcloud_peer_group_get(
context, subcloud.peer_group_id)
if original_pgrp.group_priority > 0:
pecan.abort(400, _("Subcloud update is only allowed when "
"its peer group priority value is 0."))
# Get current site system information
local_system_uuid = utils.get_local_system().uuid
# When updating a subcloud of a peer group on the primary
# site, the peer group must be led by the primary site.
if original_pgrp.system_leader_id != local_system_uuid:
pecan.abort(400, _("Updating subcloud from a "
"peer group not led by the "
"current site is prohibited."))
# Rename the subcloud
new_subcloud_name = payload.get('name')
if new_subcloud_name is not None:
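
For reference, a hypothetical illustration of the two payload shapes the validation above accepts; the field name is taken from the code, everything else about the request is assumed.

# Detach a subcloud from its peer group: the literal 'none' is forwarded to
# update_subcloud(), which clears the peer_group_id column.
detach_payload = {"peer_group": "none"}

# Attach a subcloud to a peer group: an int or str reference is accepted and
# normalized to str before utils.subcloud_peer_group_get_by_ref() resolves it.
attach_payload = {"peer_group": 2}
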
@ -736,7 +830,6 @@ class SubcloudsController(object):
description = payload.get('description')
location = payload.get('location')
bootstrap_values = payload.get('bootstrap_values')
peer_group = payload.get('peer_group')
bootstrap_address = payload.get('bootstrap_address')
# If the migrate flag is present we need to update the deploy status
@ -752,6 +845,11 @@ class SubcloudsController(object):
management_state not in [dccommon_consts.MANAGEMENT_UNMANAGED,
dccommon_consts.MANAGEMENT_MANAGED]:
pecan.abort(400, _('Invalid management-state'))
if management_state and subcloud.peer_group_id is not None \
and not utils.is_req_from_another_dc(request):
pecan.abort(400, _('Cannot update the management state of a '
'subcloud that is associated with '
'a peer group.'))
force_flag = payload.get('force')
if force_flag is not None:
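
A condensed sketch of the new management-state guard added above, under the assumption that only requests carrying the peer-DC User-Agent bypass it; this helper is illustrative, not the controller code.

def management_state_change_allowed(subcloud, from_peer_dc):
    # Only the peer-site auditor (identified via
    # utils.is_req_from_another_dc) may change the management
    # state of a subcloud that belongs to a peer group.
    return subcloud.peer_group_id is None or from_peer_dc
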
@ -776,41 +874,6 @@ class SubcloudsController(object):
exceptions.SubcloudGroupNotFound):
pecan.abort(400, _('Invalid group'))
# Verify the peer_group is valid
peer_group_id = None
if peer_group is not None:
# peer_group may be passed in the payload as an int or str
peer_group = str(peer_group)
# Check if user wants to remove a subcloud
# from a subcloud-peer-group by
# setting peer_group_id as 'none',
# then we will pass 'none' string as
# the peer_group_id,
# update_subcloud() will handle it and
# Set the peer_group_id DB into None.
if peer_group.lower() == 'none':
peer_group_id = 'none'
else:
pgrp = utils.subcloud_peer_group_get_by_ref(context, peer_group)
if not pgrp:
pecan.abort(400, _('Invalid peer group'))
if not utils.is_req_from_another_dc(request):
if pgrp.group_priority > 0:
pecan.abort(400, _("Cannot set the subcloud to a peer"
" group with non-zero priority."))
elif not (
subcloud.deploy_status in [
consts.DEPLOY_STATE_DONE,
consts.PRESTAGE_STATE_COMPLETE
] and subcloud.management_state ==
dccommon_consts.MANAGEMENT_MANAGED
and subcloud.availability_status ==
dccommon_consts.AVAILABILITY_ONLINE):
pecan.abort(400, _("Only subclouds that are "
"managed and online can be "
"added to a peer group."))
peer_group_id = pgrp.id
if consts.INSTALL_VALUES in payload:
# install_values of secondary subclouds are validated on
# peer site
@ -998,6 +1061,20 @@ class SubcloudsController(object):
pecan.abort(404, _('Subcloud not found'))
subcloud_id = subcloud.id
peer_group_id = subcloud.peer_group_id
subcloud_management_state = subcloud.management_state
# Check if the subcloud is in "managed" state
if subcloud_management_state == dccommon_consts.MANAGEMENT_MANAGED \
and not utils.is_req_from_another_dc(request):
pecan.abort(400, _('Cannot delete a subcloud that is in '
'"managed" state'))
# Check if the subcloud is part of a peer group
if peer_group_id is not None and \
not utils.is_req_from_another_dc(request):
pecan.abort(400, _('Cannot delete a subcloud that is part of '
'a peer group on this site'))
try:
# Ask dcmanager-manager to delete the subcloud.
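
The delete path now enforces the same ownership rules; below is a minimal sketch of the precondition order added above (an illustrative helper, not the controller code).

def local_delete_blocked(subcloud, from_peer_dc):
    # Condensed restatement of the checks above; "managed" stands in for
    # dccommon_consts.MANAGEMENT_MANAGED.
    if from_peer_dc:
        return None  # peer-site requests are allowed through
    if subcloud.management_state == "managed":
        return 'Cannot delete a subcloud that is in "managed" state'
    if subcloud.peer_group_id is not None:
        return ('Cannot delete a subcloud that is part of '
                'a peer group on this site')
    return None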

View File

@ -51,9 +51,9 @@ class PeerGroupAuditManager(manager.Manager):
peer_group_name)
return subclouds
except Exception:
LOG.exception("Failed to get subclouds of peer group %s "
"from DC: %s" %
(peer_group_name, system_peer.peer_name))
LOG.exception(f"Failed to get subclouds of peer group "
f"{peer_group_name} from DC: "
f"{system_peer.peer_name}")
def _update_remote_peer_group_migration_status(self,
system_peer,
@ -65,37 +65,58 @@ class PeerGroupAuditManager(manager.Manager):
}
dc_client.update_subcloud_peer_group(peer_group_name,
**peer_group_kwargs)
LOG.info("Updated Subcloud Peer Group %s on "
"peer site %s, set migration_status to: %s" %
(peer_group_name, system_peer.peer_name, migration_status))
LOG.info(f"Updated Subcloud Peer Group {peer_group_name} on "
f"peer site {system_peer.peer_name}, set migration_status "
f"to: {migration_status}")
def _get_local_subclouds_to_update(self,
local_peer_group,
remote_subclouds):
def _get_local_subclouds_to_update_and_delete(self,
local_peer_group,
remote_subclouds):
local_subclouds_to_update = list()
remote_managed_subcloud_region_names = list()
local_subclouds_to_delete = list()
remote_subclouds_dict = {remote_subcloud.get('region-name'):
remote_subcloud for remote_subcloud
in remote_subclouds}
local_subclouds = db_api.subcloud_get_for_peer_group(
self.context, local_peer_group.id)
# get the 'managed+online' remote subclouds
for remote_subcloud in remote_subclouds:
if (remote_subcloud.get('management-state') ==
dccommon_consts.MANAGEMENT_MANAGED and
remote_subcloud.get('availability-status') ==
dccommon_consts.AVAILABILITY_ONLINE):
remote_managed_subcloud_region_names.append(
remote_subcloud.get('region-name'))
# Compare with the 'non-secondary' local subclouds
for local_subcloud in local_subclouds:
if local_subcloud.region_name in \
remote_managed_subcloud_region_names \
and not utils.subcloud_is_secondary_state(
local_subcloud.deploy_status):
remote_subcloud = remote_subclouds_dict.get(
local_subcloud.region_name)
if remote_subcloud:
# Check if the remote subcloud meets the conditions for update
# if it is 'managed' and the local subcloud is not
# in 'secondary' status
if (remote_subcloud.get('management-state') ==
dccommon_consts.MANAGEMENT_MANAGED and
not utils.subcloud_is_secondary_state(
local_subcloud.deploy_status)):
local_subclouds_to_update.append(local_subcloud)
else:
local_subclouds_to_delete.append(local_subcloud)
local_subclouds_to_update.append(local_subcloud)
return local_subclouds_to_update, local_subclouds_to_delete
return local_subclouds_to_update
def _set_local_subcloud_to_secondary(self, subcloud):
try:
LOG.info("Set local subcloud %s to secondary" % subcloud.name)
# Unmanaging a subcloud that is already in
# 'unmanaged' state would raise an exception.
if subcloud.management_state != \
dccommon_consts.MANAGEMENT_UNMANAGED:
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
management_state=dccommon_consts.
MANAGEMENT_UNMANAGED)
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
deploy_status=consts.DEPLOY_STATE_SECONDARY)
except Exception as e:
LOG.exception(f"Failed to update local non-secondary "
f"and offline subcloud [{subcloud.name}], err: {e}")
raise e
def audit(self, system_peer, remote_peer_group, local_peer_group):
if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
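
A self-contained toy run of the reconciliation split implemented by _get_local_subclouds_to_update_and_delete (data and names invented; the secondary-state check is omitted for brevity): subclouds still present and managed on the peer site are queued for update, while subclouds missing from the remote peer group are queued for deletion.

MANAGED = "managed"  # stands in for dccommon_consts.MANAGEMENT_MANAGED

remote_subclouds = [
    {"region-name": "subcloud1", "management-state": MANAGED},
    # "subcloud2" no longer exists in the peer group on the peer site.
]
local_region_names = ["subcloud1", "subcloud2"]

remote_by_region = {r["region-name"]: r for r in remote_subclouds}
to_update, to_delete = [], []
for name in local_region_names:
    remote = remote_by_region.get(name)
    if remote:
        if remote["management-state"] == MANAGED:
            to_update.append(name)
    else:
        to_delete.append(name)

print(to_update)  # ['subcloud1'] -> set to secondary locally
print(to_delete)  # ['subcloud2'] -> set to secondary, then deleted locally
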
@ -120,9 +141,9 @@ class PeerGroupAuditManager(manager.Manager):
if remote_peer_group.get("migration_status") == \
consts.PEER_GROUP_MIGRATING:
# Unmanage all local subclouds of the peer group
LOG.info("Unmanaged all local subclouds of peer group %s "
"since remote is in migrating state" %
local_peer_group.peer_group_name)
LOG.info(f"Unmanaged all local subclouds of peer group "
f"{local_peer_group.peer_group_name} "
f"since remote is in migrating state")
subclouds = db_api.subcloud_get_for_peer_group(self.context,
local_peer_group.id)
for subcloud in subclouds:
@ -152,8 +173,8 @@ class PeerGroupAuditManager(manager.Manager):
subcloud.id,
deploy_status=consts.DEPLOY_STATE_REHOME_PENDING)
except Exception as e:
LOG.exception("Fail to unmanage local subcloud %s, err: "
"%s" % (subcloud.name, e))
LOG.exception(f"Fail to unmanage local subcloud "
f"{subcloud.name}, err: {e}")
raise e
self.require_audit_flag = False
@ -167,39 +188,29 @@ class PeerGroupAuditManager(manager.Manager):
system_peer,
remote_peer_group.get("peer_group_name"))
if not remote_subclouds:
LOG.error("No subclouds in remote DC:%s's peer group %s" %
(system_peer.peer_name,
remote_peer_group.get("peer_group_name")))
return
local_subclouds_to_update = \
self._get_local_subclouds_to_update(local_peer_group,
remote_subclouds)
local_subclouds_to_update, local_subclouds_to_delete = \
self._get_local_subclouds_to_update_and_delete(
local_peer_group, remote_subclouds)
for subcloud in local_subclouds_to_update:
self._set_local_subcloud_to_secondary(subcloud)
# Set local subclouds that no longer exist in the peer
# site's SPG to secondary status, then delete them
for subcloud in local_subclouds_to_delete:
self._set_local_subcloud_to_secondary(subcloud)
try:
LOG.info("Set secondary to local subcloud %s" %
subcloud.name)
# Unmanaging a subcloud that is already in
# 'unmanaged' state would raise an exception.
if subcloud.management_state != \
dccommon_consts.MANAGEMENT_UNMANAGED:
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
management_state=dccommon_consts.
MANAGEMENT_UNMANAGED)
self.subcloud_manager.update_subcloud(
self.context,
subcloud.id,
deploy_status=consts.DEPLOY_STATE_SECONDARY)
self.subcloud_manager.delete_subcloud(
self.context, subcloud.id)
LOG.info(f"Deleted local subcloud {subcloud.name}")
except Exception as e:
LOG.exception("Failed to update local non-secondary "
"and offline subcloud [%s], err: %s" %
(subcloud.name, e))
LOG.exception(f"Failed to delete local subcloud "
f"[{subcloud.name}] that does not exist "
f"under the same subcloud_peer_group on "
f"peer site, err: {e}")
raise e
if local_subclouds_to_update:
if local_subclouds_to_update or local_subclouds_to_delete:
self._clear_or_raise_alarm(system_peer,
local_peer_group,
remote_peer_group)
@ -229,10 +240,10 @@ class PeerGroupAuditManager(manager.Manager):
entity_instance_id = "peer_group=%s,peer=%s" % \
(local_peer_group.peer_group_name, system_peer.peer_uuid)
if local_peer_group.group_priority < remote_peer_group.get('group_priority'):
LOG.warning("Alarm: local subcloud peer group [%s] "
"is managed by remote system [%s]" %
(local_peer_group.peer_group_name,
system_peer.peer_name))
LOG.warning("Alarm: local subcloud peer group ["
f"{local_peer_group.peer_group_name}] "
f"is managed by remote system ["
f"{system_peer.peer_name}]")
try:
fault = fm_api.Fault(
alarm_id=fm_const.
@ -266,15 +277,15 @@ class PeerGroupAuditManager(manager.Manager):
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
entity_instance_id)
if fault:
LOG.info("Clear alarm: %s" % entity_instance_id)
LOG.info(f"Clear alarm: {entity_instance_id}")
self.fm_api.clear_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED,
entity_instance_id)
except Exception:
LOG.exception(
"Problem clearing fault [%s], alarm_id=%s" %
(entity_instance_id,
fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED))
f"Problem clearing fault [{entity_instance_id}], "
f"alarm_id="
f"{fm_const.FM_ALARM_ID_DC_SUBCLOUD_PEER_GROUP_NOT_MANAGED}")
def _do_audit(self, system_peer, remote_peer_group, local_peer_group):
with self.thread_lock:
@ -286,15 +297,14 @@ class PeerGroupAuditManager(manager.Manager):
def stop(self):
if self.thread:
self.thread.join()
LOG.info("stopped peer group %s audit thread" % self.peer_group_id)
LOG.info(f"stopped peer group {self.peer_group_id} audit thread")
else:
LOG.info("No peer group %s audit thread to stop" %
self.peer_group_id)
LOG.info(f"No peer group {self.peer_group_id} audit thread to stop")
def start(self, system_peer, remote_peer_group, local_peer_group):
if self.thread_lock.locked():
LOG.warning('Audit thread for %s has already started' %
local_peer_group.peer_group_name)
LOG.warning(f"Audit thread for {local_peer_group.peer_group_name} "
f"has already started")
else:
self.thread = threading.Thread(
target=self._do_audit,
@ -305,8 +315,8 @@ class PeerGroupAuditManager(manager.Manager):
system_peer,
remote_peer_group,
local_peer_group):
LOG.info("Audit peer group [%s] with remote system %s" %
(local_peer_group.peer_group_name, system_peer.peer_name))
LOG.info(f"Audit peer group [{local_peer_group.peer_group_name}] "
f"with remote system {system_peer.peer_name}")
self.start(system_peer, remote_peer_group, local_peer_group)
@staticmethod
@ -332,6 +342,6 @@ class PeerGroupAuditManager(manager.Manager):
if response:
return response
except Exception:
LOG.exception("Failed to send audit request for peer group %s "
"to DC: %s" %
(peer_group.peer_group_name, system.peer_name))
LOG.exception("Failed to send audit request for peer group "
f"{peer_group.peer_group_name} to DC: "
f"{system.peer_name}")