Merge "Fix issues with PGA sync_status"

This commit is contained in:
Zuul 2024-03-07 19:22:57 +00:00 committed by Gerrit Code Review
commit 7c090cd7ba
8 changed files with 448 additions and 2 deletions

View File

@ -295,7 +295,8 @@ class DcmanagerClient(base.DriverBase):
url = f"{self.endpoint}/subcloud-peer-groups/{peer_group_ref}"
headers = {"X-Auth-Token": self.token,
"Content-Type": "application/json"}
"Content-Type": "application/json",
"User-Agent": consts.DCMANAGER_V1_HTTP_AGENT}
response = requests.patch(url, json=kwargs, headers=headers,
timeout=self.timeout)

View File

@ -259,6 +259,12 @@ class SubcloudPeerGroupsController(restcomm.GenericPathController):
if not payload:
pecan.abort(httpclient.BAD_REQUEST, _('Body required'))
if group.group_priority > 0 and \
not utils.is_req_from_another_dc(request):
pecan.abort(httpclient.BAD_REQUEST,
_("Cannot update a peer group from a non-primary "
"site."))
LOG.info("Handling update subcloud peer group request for: %s" % payload)
peer_group_name = payload.get('peer-group-name')
group_priority = payload.get('group-priority')

View File

@ -472,6 +472,7 @@ ASSOCIATION_SYNC_STATUS_SYNCING = 'syncing'
ASSOCIATION_SYNC_STATUS_IN_SYNC = 'in-sync'
ASSOCIATION_SYNC_STATUS_FAILED = 'failed'
ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC = 'out-of-sync'
ASSOCIATION_SYNC_STATUS_UNKNOWN = 'unknown'
# Peer monitor heartbeat policy
HEARTBEAT_FAILURE_POLICY_ALARM = 'alarm'

View File

@ -176,6 +176,10 @@ class PeerGroupAuditManager(manager.Manager):
LOG.exception(f"Fail to unmanage local subcloud "
f"{subcloud.name}, err: {e}")
raise e
SystemPeerManager.update_sync_status_on_peer_site(
self.context, system_peer,
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC,
local_peer_group, remote_peer_group)
self.require_audit_flag = False
# if remote subcloud peer group's migration_status is 'complete',
@ -204,6 +208,10 @@ class PeerGroupAuditManager(manager.Manager):
self.context, subcloud.id)
LOG.info(f"Deleted local subcloud {subcloud.name}")
except Exception as e:
SystemPeerManager.update_sync_status_on_peer_site(
self.context, system_peer,
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC,
local_peer_group, remote_peer_group)
LOG.exception(f"Failed to delete local subcloud "
f"[{subcloud.name}] that does not exist "
f"under the same subcloud_peer_group on "
@ -224,6 +232,10 @@ class PeerGroupAuditManager(manager.Manager):
system_peer,
remote_peer_group.get("peer_group_name"),
None)
SystemPeerManager.update_sync_status_on_peer_site(
self.context, system_peer,
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
local_peer_group, remote_peer_group)
self.require_audit_flag = False
else:
# If remote peer group migration_status is 'None'

View File

@ -111,6 +111,60 @@ class PeerMonitor(object):
self.peer.peer_name)
return failed, dc_peer_subcloud_peer_group_list
def _update_sync_status_when_secondary_site_becomes_unreachable(self):
    """Downgrade local primary associations when the peer is unreachable.

    Every primary association of ``self.peer`` is rewritten in the local
    database with an "unreachable" sync message and a new sync_status:

        "in-sync"     -> "unknown"
        "unknown"     -> "unknown"
        "out-of-sync" -> "failed"
        "syncing"     -> "failed"
        "failed"      -> "failed"

    Non-primary associations are skipped.
    """
    local_pgas = SystemPeerManager.get_local_associations(
        self.context, self.peer)
    # Statuses that map to "unknown"; any other status maps to "failed".
    maps_to_unknown = (
        consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
        consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
    )
    for pga in local_pgas:
        # Only the primary side owns the association status.
        if pga.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY:
            LOG.debug("Skip update the Association sync_status as "
                      "it is not primary.")
            continue

        unreachable_msg = f"Peer site ({self.peer.peer_name}) is unreachable."
        if pga.sync_status in maps_to_unknown:
            new_status = consts.ASSOCIATION_SYNC_STATUS_UNKNOWN
        else:
            new_status = consts.ASSOCIATION_SYNC_STATUS_FAILED

        db_api.peer_group_association_update(
            self.context, pga.id,
            sync_status=new_status,
            sync_message=unreachable_msg)
def _update_sync_status_when_secondary_site_becomes_reachable(self):
    """Restore association sync_status once the peer is reachable again.

    For every primary association of ``self.peer`` a target status is
    derived from the locally stored one:

        "unknown"      -> "in-sync"
        anything else  -> "out-of-sync"

    The target status is then handed to
    ``SystemPeerManager.update_sync_status_on_peer_site`` together with
    the association and its local peer group. Non-primary associations
    are skipped.
    """
    local_pgas = SystemPeerManager.get_local_associations(
        self.context, self.peer)
    for pga in local_pgas:
        # Only the primary side drives the status on both sites.
        if pga.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY:
            LOG.debug("Skip update Peer Site Association sync_status as "
                      "current site Association is not primary.")
            continue

        was_unknown = (
            pga.sync_status == consts.ASSOCIATION_SYNC_STATUS_UNKNOWN)
        target_status = (
            consts.ASSOCIATION_SYNC_STATUS_IN_SYNC if was_unknown
            else consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC)

        local_peer_group = db_api.subcloud_peer_group_get(
            self.context, pga.peer_group_id)
        SystemPeerManager.update_sync_status_on_peer_site(
            self.context, self.peer, target_status, local_peer_group,
            association=pga)
def _do_monitor_peer(self):
failure_count = 0
LOG.info("Start monitoring thread for peer %s" %
@ -134,6 +188,8 @@ class PeerMonitor(object):
availability_state= # noqa: E251
consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE
)
# pylint: disable=line-too-long
self._update_sync_status_when_secondary_site_becomes_unreachable() # noqa: E501
failure_count = 0
self._set_require_audit_flag_to_associated_peer_groups()
else:
@ -146,6 +202,8 @@ class PeerMonitor(object):
availability_state= # noqa: E251
consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE
)
# pylint: disable=line-too-long
self._update_sync_status_when_secondary_site_becomes_reachable() # noqa: E501
LOG.info("DC %s back online, clear alarm" %
self.peer.peer_name)
self._clear_failure()

View File

@ -52,6 +52,106 @@ class SystemPeerManager(manager.Manager):
super(SystemPeerManager, self).__init__(
service_name="system_peer_manager", *args, **kwargs)
@staticmethod
def get_local_associations(ctx, peer, local_pg=None):
    """Return the local peer group associations for a system peer.

    :param ctx: request context object
    :param peer: system peer object whose ``id`` is used for the lookup
    :param local_pg: optional local peer group object; when given, the
        lookup is narrowed to that peer group's ``id``
    :return: the result of the by-system-peer lookup when ``local_pg`` is
        None; otherwise a list holding the single matching association,
        or an empty list when the lookup returns a falsy value
    """
    if local_pg is not None:
        # Narrow lookup: one association per (peer group, system peer).
        pga = db_api.\
            peer_group_association_get_by_peer_group_and_system_peer_id(
                ctx, local_pg.id, peer.id)
        if pga:
            return [pga]
        return []

    # Broad lookup: everything attached to this system peer.
    return db_api.peer_group_association_get_by_system_peer_id(ctx, peer.id)
@staticmethod
def update_sync_status_on_peer_site(ctx, peer, sync_status, local_pg=None,
                                    remote_pg=None, message="None",
                                    association=None):
    """Update sync status of association on peer site.

    This function updates the sync status of the association on the peer
    site and then updates the sync status of the association on the
    primary site. Non-primary local associations are skipped.

    Each association is processed independently, starting from the
    caller's ``sync_status``, ``message`` and ``local_pg``. A failure (or
    a peer group lookup) for one association therefore never leaks into
    the handling of the next one.

    :param ctx: request context object
    :param peer: system peer object of the current site
    :param sync_status: sync status to update
    :param local_pg: local peer group object; when None it is looked up
        from each association's ``peer_group_id``
    :param remote_pg: remote peer group object; when None it is fetched
        from the peer site using the local peer group name
    :param message: sync message
    :param association: peer group association object; when None the
        associations are obtained from ``get_local_associations``
    """

    def _update_association_on_peer_site(peer, sync_status, local_pg,
                                         remote_pg, message):
        """Push ``sync_status`` to the matching association on the peer.

        Returns the ``(sync_status, message)`` pair to store locally: the
        inputs unchanged on success, or the "failed" status plus an error
        message when any step raised.
        """
        try:
            # Get peer site dcmanager client
            dc_client = SystemPeerManager.get_peer_dc_client(peer)

            # Get peer site peer group if not exist
            if remote_pg is None:
                remote_pg = dc_client.get_subcloud_peer_group(
                    local_pg.peer_group_name)

            # Get peer site system peer
            dc_peer_system_peer = dc_client.get_system_peer(
                utils.get_local_system().uuid)

            # Get peer site group association
            dc_peer_association = dc_client.\
                get_peer_group_association_with_peer_id_and_pg_id(
                    dc_peer_system_peer.get('id'),
                    remote_pg.get('id'))

            # Update peer site association sync_status only if the
            # sync_status is different from the current sync_status
            if dc_peer_association.get('sync_status') != sync_status:
                dc_peer_association_id = dc_peer_association.get('id')
                dc_client.update_peer_group_association_sync_status(
                    dc_peer_association_id, sync_status)
                LOG.info(f"Updated Peer site {dc_peer_system_peer.get('id')} "
                         f"Peer Group Association {dc_peer_association_id} "
                         f"sync_status to {sync_status}.")
        except Exception as e:
            # Deliberately broad: any failure talking to the peer site is
            # reported through the local association instead of raising.
            message = f"Failed to Update Peer Site ({peer.peer_uuid}) " \
                      f"Association sync_status to {sync_status}."
            LOG.exception(f"{message} Error: {e}")
            sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED

        return sync_status, message

    if association is None:
        associations = SystemPeerManager.get_local_associations(
            ctx, peer, local_pg)
    else:
        associations = [association]

    for pga in associations:
        if pga.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY:
            LOG.debug(f"Skip update Peer Site association "
                      f"sync_status to {sync_status} as current "
                      f"site Association is not primary.")
            continue

        # Resolve the peer group per association. Reusing the first
        # lookup for later associations would target the wrong peer
        # group on the peer site.
        pga_local_pg = local_pg
        if pga_local_pg is None:
            pga_local_pg = db_api.subcloud_peer_group_get(
                ctx, pga.peer_group_id)

        # Always start from the caller's requested status and message so
        # a failure on a previous association is not propagated here.
        pga_status, pga_message = _update_association_on_peer_site(
            peer, sync_status, pga_local_pg, remote_pg, message)

        if pga.sync_status == pga_status and \
                pga_status != consts.ASSOCIATION_SYNC_STATUS_FAILED:
            LOG.debug(f"Skip update current site association "
                      f"sync_status to {pga_status} as current "
                      f"site Association is already in the same status.")
            continue

        # Update primary site association sync_status
        db_api.peer_group_association_update(
            ctx, pga.id,
            sync_status=pga_status,
            sync_message=pga_message)
@staticmethod
def get_peer_ks_client(peer):
"""This will get a new peer keystone client (and new token)"""

View File

@ -0,0 +1,212 @@
#
# Copyright (c) 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import uuid
import mock
from dcmanager.common import consts
from dcmanager.db.sqlalchemy import api as db_api
from dcmanager.manager import peer_monitor_manager
from dcmanager.tests import base
# Fake sysinv data: one random system UUID per site
FAKE_SITE0_SYSTEM_UUID = str(uuid.uuid4())
FAKE_SITE1_SYSTEM_UUID = str(uuid.uuid4())

# Fake system peer data (the peer record points at site1's system UUID)
FAKE_SYSTEM_PEER_ID = 1
FAKE_SYSTEM_PEER_UUID = FAKE_SITE1_SYSTEM_UUID
FAKE_SYSTEM_PEER_NAME = "PeerSite1"
FAKE_MANAGER_ENDPOINT = "http://128.128.128.128:5000/v3"
FAKE_MANAGER_USERNAME = "admin"
FAKE_MANAGER_PASSWORD = "cGFzc3dvcmQ="
FAKE_PEER_CONTROLLER_GATEWAY_IP = "128.128.1.1"

# Fake subcloud peer group data (site0)
FAKE_SITE0_PEER_GROUP_ID = 1
FAKE_SITE0_PEER_GROUP_NAME = "PeerGroup1"
FAKE_SITE0_PEER_GROUP_SYSTEM_LEADER_ID = FAKE_SITE0_SYSTEM_UUID
FAKE_SITE0_PEER_GROUP_SYSTEM_LEADER_NAME = "site0"
FAKE_SITE0_PEER_GROUP_MAX_SUBCLOUDS_REHOMING = 50
FAKE_SITE0_PEER_GROUP_PRIORITY = 0
FAKE_SITE0_PEER_GROUP_STATE = "enabled"

# Fake system peer data (site1)
FAKE_SITE1_SYSTEM_PEER_ID = 10

# Fake subcloud peer group data (site1)
FAKE_SITE1_PEER_GROUP_ID = 9

# Fake peer group association data (site0)
FAKE_ASSOCIATION_PEER_GROUP_ID = FAKE_SITE0_PEER_GROUP_ID
FAKE_ASSOCIATION_SYSTEM_PEER_ID = FAKE_SYSTEM_PEER_ID
FAKE_ASSOCIATION_PEER_GROUP_PRIORITY = 1
FAKE_ASSOCIATION_SYNC_STATUS = "in-sync"
FAKE_ASSOCIATION_SYNC_MESSAGE = "None"
FAKE_ASSOCIATION_TYPE = "primary"

# Fake peer group association data (site1)
FAKE_SITE1_ASSOCIATION_ID = 10
class FakeLocalSystem:
    """Stand-in for the local system object; it only carries ``uuid``."""

    def __init__(self):
        # The local site in these tests is site0.
        self.uuid = FAKE_SITE0_SYSTEM_UUID
class TestPeerMonitor(base.DCManagerTestCase):
    """Tests for PeerMonitor's handling of association sync_status."""

    def setUp(self):
        super().setUp()

    @staticmethod
    def create_system_peer_static(ctxt, **kwargs):
        """Create a system peer record; ``kwargs`` override the defaults."""
        defaults = {
            'peer_uuid': FAKE_SYSTEM_PEER_UUID,
            'peer_name': FAKE_SYSTEM_PEER_NAME,
            'endpoint': FAKE_MANAGER_ENDPOINT,
            'username': FAKE_MANAGER_USERNAME,
            'password': FAKE_MANAGER_PASSWORD,
            'gateway_ip': FAKE_PEER_CONTROLLER_GATEWAY_IP
        }
        return db_api.system_peer_create(ctxt, **{**defaults, **kwargs})

    @staticmethod
    def create_subcloud_peer_group_static(ctxt, **kwargs):
        """Create a subcloud peer group record; ``kwargs`` override."""
        defaults = {
            "peer_group_name": FAKE_SITE0_PEER_GROUP_NAME,
            "system_leader_id": FAKE_SITE0_PEER_GROUP_SYSTEM_LEADER_ID,
            "system_leader_name": FAKE_SITE0_PEER_GROUP_SYSTEM_LEADER_NAME,
            "group_priority": FAKE_SITE0_PEER_GROUP_PRIORITY,
            "group_state": FAKE_SITE0_PEER_GROUP_STATE,
            "max_subcloud_rehoming":
                FAKE_SITE0_PEER_GROUP_MAX_SUBCLOUDS_REHOMING,
            "migration_status": None
        }
        return db_api.subcloud_peer_group_create(
            ctxt, **{**defaults, **kwargs})

    @staticmethod
    def create_peer_group_association_static(ctxt, **kwargs):
        """Create a peer group association record; ``kwargs`` override."""
        defaults = {
            "system_peer_id": FAKE_ASSOCIATION_SYSTEM_PEER_ID,
            "peer_group_id": FAKE_ASSOCIATION_PEER_GROUP_ID,
            "peer_group_priority": FAKE_ASSOCIATION_PEER_GROUP_PRIORITY,
            "sync_status": FAKE_ASSOCIATION_SYNC_STATUS,
            "sync_message": FAKE_ASSOCIATION_SYNC_MESSAGE,
            "association_type": FAKE_ASSOCIATION_TYPE
        }
        return db_api.peer_group_association_create(
            ctxt, **{**defaults, **kwargs})

    def _create_peer_group_and_association(self, **association_overrides):
        """Create a peer, a peer group and the association linking them."""
        system_peer = self.create_system_peer_static(
            self.ctx, peer_name='SystemPeer1')
        local_pg = self.create_subcloud_peer_group_static(
            self.ctx, peer_group_name='SubcloudPeerGroup1')
        pga = self.create_peer_group_association_static(
            self.ctx,
            system_peer_id=system_peer.id,
            peer_group_id=local_pg.id,
            **association_overrides)
        return system_peer, local_pg, pga

    def test_initialize_peer_monitor_manager(self):
        system_peer = self.create_system_peer_static(self.ctx)

        monitor = peer_monitor_manager.PeerMonitor(
            system_peer, self.ctx, mock.MagicMock())

        self.assertIsNotNone(monitor)
        self.assertEqual(FAKE_SYSTEM_PEER_NAME, monitor.peer.peer_name)

    def test_update_sync_status_when_secondary_site_becomes_unreachable(self):
        system_peer, _, pga = self._create_peer_group_and_association()

        monitor = peer_monitor_manager.PeerMonitor(
            system_peer, self.ctx, mock.MagicMock())
        monitor._update_sync_status_when_secondary_site_becomes_unreachable()

        # The default fake association starts "in-sync" -> "unknown".
        refreshed = db_api.peer_group_association_get(self.ctx, pga.id)
        self.assertEqual(consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
                         refreshed.sync_status)

    @mock.patch('dcmanager.manager.peer_monitor_manager.'
                'SystemPeerManager.get_peer_dc_client')
    def test_update_sync_status_and_association_is_non_primary(
            self, mock_client):
        dc_client = mock_client.return_value
        dc_client.get_subcloud_peer_group.return_value = \
            {'id': FAKE_SITE1_PEER_GROUP_ID}

        system_peer, _, pga = self._create_peer_group_and_association(
            association_type=consts.ASSOCIATION_TYPE_NON_PRIMARY)

        # Test the case where the association is non-primary
        monitor = peer_monitor_manager.PeerMonitor(
            system_peer, self.ctx, mock.MagicMock())
        monitor._update_sync_status_when_secondary_site_becomes_reachable()

        dc_client.get_subcloud_peer_group.assert_not_called()
        refreshed = db_api.peer_group_association_get(self.ctx, pga.id)
        self.assertEqual(consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
                         refreshed.sync_status)

    @mock.patch('dcmanager.manager.system_peer_manager.'
                'utils.get_local_system')
    @mock.patch('dcmanager.manager.peer_monitor_manager.'
                'SystemPeerManager.get_peer_dc_client')
    def test_update_sync_status_when_secondary_site_becomes_reachable(
            self, mock_client, mock_utils):
        dc_client = mock_client.return_value
        dc_client.get_subcloud_peer_group.return_value = \
            {'id': FAKE_SITE1_PEER_GROUP_ID}
        dc_client.get_system_peer.return_value = \
            {'id': FAKE_SITE1_SYSTEM_PEER_ID}
        dc_client.get_peer_group_association_with_peer_id_and_pg_id.\
            return_value = {'id': FAKE_SITE1_ASSOCIATION_ID}
        mock_utils.return_value = FakeLocalSystem()

        system_peer, local_pg, pga = \
            self._create_peer_group_and_association(
                sync_status=consts.ASSOCIATION_SYNC_STATUS_UNKNOWN)

        # Test the case where the association sync_status is unknown
        monitor = peer_monitor_manager.PeerMonitor(
            system_peer, self.ctx, mock.MagicMock())
        monitor._update_sync_status_when_secondary_site_becomes_reachable()

        dc_client.get_subcloud_peer_group.assert_called_once_with(
            local_pg.peer_group_name)
        dc_client.get_system_peer.assert_called_once_with(
            FAKE_SITE0_SYSTEM_UUID)
        dc_client.get_peer_group_association_with_peer_id_and_pg_id.\
            assert_called_once_with(FAKE_SITE1_SYSTEM_PEER_ID,
                                    FAKE_SITE1_PEER_GROUP_ID)
        dc_client.update_peer_group_association_sync_status.\
            assert_called_once_with(FAKE_SITE1_ASSOCIATION_ID,
                                    consts.ASSOCIATION_SYNC_STATUS_IN_SYNC)

        refreshed = db_api.peer_group_association_get(self.ctx, pga.id)
        self.assertEqual(consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
                         refreshed.sync_status)

View File

@ -10,6 +10,7 @@ import uuid
import mock
from dccommon import exceptions as dccommon_exceptions
from dcmanager.common import consts
from dcmanager.db.sqlalchemy import api as db_api
from dcmanager.manager import system_peer_manager
from dcmanager.tests import base
@ -40,6 +41,9 @@ FAKE_SITE0_PEER_GROUP_STATE = 'enabled'
# FAKE SUBCLOUD PEER GROUP DATA (SITE1)
FAKE_SITE1_PEER_GROUP_ID = 9
# FAKE SYSTEM PEER DATA (SITE1)
FAKE_SITE1_SYSTEM_PEER_ID = 10
# FAKE SUBCLOUD DATA (SITE1)
FAKE_SITE1_SUBCLOUD1_ID = 11
FAKE_SITE1_SUBCLOUD1_REGION_NAME = str(uuid.uuid4())
@ -71,7 +75,7 @@ FAKE_ASSOCIATION_PEER_GROUP_ID = \
FAKE_ASSOCIATION_SYSTEM_PEER_ID = \
FAKE_SYSTEM_PEER_ID
FAKE_ASSOCIATION_PEER_GROUP_PRIORITY = 1
FAKE_ASSOCIATION_SYNC_STATUS = 'synced'
FAKE_ASSOCIATION_SYNC_STATUS = 'in-sync'
FAKE_ASSOCIATION_SYNC_MESSAGE = 'None'
FAKE_ASSOCIATION_TYPE = 'primary'
@ -483,3 +487,55 @@ class TestSystemPeerManager(base.DCManagerTestCase):
associations = db_api.peer_group_association_get_all(self.ctx)
self.assertEqual(0, len(associations))
@mock.patch('dcmanager.manager.system_peer_manager.'
            'utils.get_local_system')
@mock.patch('dcmanager.manager.system_peer_manager.'
            'SystemPeerManager.get_peer_dc_client')
def test_update_sync_status_on_peer_site(
        self, mock_client, mock_utils):
    """An "unknown" association becomes "in-sync" on both sites."""
    # The patched factory hands out this mocked peer-site client.
    dc_client = mock_client.return_value
    dc_client.get_subcloud_peer_group.return_value = \
        {'id': FAKE_SITE1_PEER_GROUP_ID}
    dc_client.get_system_peer.return_value = \
        {'id': FAKE_SITE1_SYSTEM_PEER_ID}
    dc_client.get_peer_group_association_with_peer_id_and_pg_id.\
        return_value = {'id': FAKE_SITE1_ASSOCIATION_ID}
    mock_utils.return_value = FakeSystem(FAKE_SITE0_SYSTEM_UUID)

    system_peer = self.create_system_peer_static(
        self.ctx, peer_name='SystemPeer1')
    local_pg = self.create_subcloud_peer_group_static(
        self.ctx, peer_group_name='SubcloudPeerGroup1')
    pga = self.create_peer_group_association_static(
        self.ctx,
        system_peer_id=system_peer.id,
        peer_group_id=local_pg.id,
        sync_status=consts.ASSOCIATION_SYNC_STATUS_UNKNOWN)

    manager = system_peer_manager.SystemPeerManager(mock.MagicMock())
    manager.update_sync_status_on_peer_site(
        self.ctx, system_peer, consts.ASSOCIATION_SYNC_STATUS_IN_SYNC)

    # Peer-site lookups are keyed by the local peer group name, the local
    # system UUID and the ids returned by the peer site.
    dc_client.get_subcloud_peer_group.assert_called_once_with(
        local_pg.peer_group_name)
    dc_client.get_system_peer.assert_called_once_with(
        FAKE_SITE0_SYSTEM_UUID)
    dc_client.get_peer_group_association_with_peer_id_and_pg_id.\
        assert_called_once_with(FAKE_SITE1_SYSTEM_PEER_ID,
                                FAKE_SITE1_PEER_GROUP_ID)
    dc_client.update_peer_group_association_sync_status.\
        assert_called_once_with(FAKE_SITE1_ASSOCIATION_ID,
                                consts.ASSOCIATION_SYNC_STATUS_IN_SYNC)

    refreshed = db_api.peer_group_association_get(self.ctx, pga.id)
    self.assertEqual(consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
                     refreshed.sync_status)