Stop fernet key rotations from raising out-of-sync alarms

Fernet key rotation is expected to occur periodically. Currently the
280.002 out-of-sync alarm is raised every time the sync thread receives
a fernet key rotation request.

This commit makes the sync thread check the type of requests, setting
the alarmable parameter to False if all the requests are due to a
fernet key rotation.

It also improves the sync function so it doesn't unnecessarily call
the is_subcloud_enabled() function by providing an early exit when
there are no pending sync requests.

Test Plan:
1. PASS - Verify that the out-of-sync alarm is not raised when the
          fernet keys are rotated;
2. PASS - Check that the initial sync still works as expected;
3. PASS - Verify that identity sync due to user triggered identity
          resources change on the central cloud works as expected;
4. PASS - Check that platform resources sync due to user triggered
          platform resources change on the central cloud works as expected;
5. PASS - Trigger a fernet key sync and at the same time trigger a
          different sync request and verify that the alarm gets raised.

Closes-Bug: #2002171

Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
Change-Id: I694d6c3791222739921cd0f5141f54791f847414
This commit is contained in:
Gustavo Herzmann 2023-01-06 15:41:00 -03:00
parent 4b05f5dcc0
commit 18f54a44dd
2 changed files with 176 additions and 135 deletions

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2022 Wind River Systems, Inc.
# Copyright (c) 2017-2023 Wind River Systems, Inc.
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
@ -84,25 +84,29 @@ class SubcloudStateClient(RPCClient):
def update_subcloud_endpoint_status(self, ctxt, subcloud_name=None,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
ignore_endpoints=None):
ignore_endpoints=None,
alarmable=True):
# Note: This is an asynchronous operation.
# See below for synchronous method call
return self.cast(ctxt, self.make_msg('update_subcloud_endpoint_status',
subcloud_name=subcloud_name,
endpoint_type=endpoint_type,
sync_status=sync_status,
ignore_endpoints=ignore_endpoints))
ignore_endpoints=ignore_endpoints,
alarmable=alarmable))
def update_subcloud_endpoint_status_sync(self, ctxt, subcloud_name=None,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
ignore_endpoints=None):
ignore_endpoints=None,
alarmable=True):
# Note: synchronous
return self.call(ctxt, self.make_msg('update_subcloud_endpoint_status',
subcloud_name=subcloud_name,
endpoint_type=endpoint_type,
sync_status=sync_status,
ignore_endpoints=ignore_endpoints))
ignore_endpoints=ignore_endpoints,
alarmable=alarmable))
class ManagerClient(RPCClient):

View File

@ -1,4 +1,4 @@
# Copyright 2017-2022 Wind River
# Copyright 2017-2023 Wind River
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -30,6 +30,7 @@ from dcorch.common import context
from dcorch.common import exceptions
from dcorch.common import utils
from dcorch.db import api as db_api
from dcorch.engine.fernet_key_manager import FERNET_REPO_MASTER_ID
from dcorch.objects import orchrequest
from dcorch.objects import resource
from dcorch.objects.subcloud import Subcloud
@ -64,6 +65,12 @@ class SyncThread(object):
"""Manages tasks related to resource management."""
MAX_RETRY = 3
PENDING_SYNC_REQUEST_STATES = (
consts.ORCH_REQUEST_QUEUED,
consts.ORCH_REQUEST_IN_PROGRESS,
consts.ORCH_REQUEST_FAILED,
)
# used by the audit to cache the master resources
master_resources_dict = collections.defaultdict(dict)
@ -261,7 +268,7 @@ class SyncThread(object):
sync_request.orch_job.operation_type), extra=self.log_extra)
handler(sync_request, rsrc)
def set_sync_status(self, sync_status):
def set_sync_status(self, sync_status, alarmable=True):
# Only report sync_status when managed
subcloud_managed = self.is_subcloud_managed()
if not subcloud_managed:
@ -278,17 +285,23 @@ class SyncThread(object):
subcloud_sync.sync_status_report_time, timeutils.utcnow())
if delta < 3600:
if subcloud_sync.sync_status_reported == sync_status:
LOG.debug("skip set_sync_status sync_status_reported=%s, sync_status=%s " %
(subcloud_sync.sync_status_reported, sync_status, ),
extra=self.log_extra)
LOG.debug(
"skip set_sync_status sync_status_reported={}, "
"sync_status={}".format(
subcloud_sync.sync_status_reported, sync_status
),
extra=self.log_extra,
)
return
LOG.info("{}: set_sync_status {}".format(self.subcloud_name, sync_status),
LOG.info("{}: set_sync_status {}, alarmable: {}".format(
self.subcloud_name, sync_status, alarmable),
extra=self.log_extra)
self.dcmanager_state_rpc_client.update_subcloud_endpoint_status(
self.ctxt, self.subcloud_name,
self.endpoint_type, sync_status)
self.endpoint_type, sync_status,
alarmable=alarmable)
db_api.subcloud_sync_update(
self.ctxt, self.subcloud_name, self.endpoint_type,
@ -300,151 +313,175 @@ class SyncThread(object):
extra=self.log_extra)
region_name = self.subcloud_name
sync_requests = []
# We want to check for pending work even if subcloud is disabled.
states = [
consts.ORCH_REQUEST_QUEUED,
consts.ORCH_REQUEST_IN_PROGRESS,
consts.ORCH_REQUEST_FAILED,
]
sync_requests = orchrequest.OrchRequestList.get_by_attrs(
self.ctxt, self.endpoint_type,
target_region_name=region_name,
states=states)
if(len(sync_requests) != 0):
LOG.info("Got " + str(len(sync_requests)) + " sync request(s)",
states=self.PENDING_SYNC_REQUEST_STATES)
# Early exit in case there are no pending sync requests
if not sync_requests:
LOG.info("Sync resources done for subcloud - "
"no sync requests",
extra=self.log_extra)
self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
return
LOG.info(
"Got {} sync request(s)".format(len(sync_requests)),
extra=self.log_extra,
)
actual_sync_requests = []
alarmable = False
for req in sync_requests:
# Failed orch requests were taken into consideration when reporting
# sync status to the dcmanager. They need to be removed from the
# orch requests list before proceeding.
if req.state != consts.ORCH_REQUEST_STATE_FAILED:
actual_sync_requests.append(req)
else:
# Any failed state should be alarmable
alarmable = True
# Do not raise an alarm if all the sync requests are due to
# a fernet key rotation, as these are expected to occur
# periodically.
if req.orch_job.source_resource_id != FERNET_REPO_MASTER_ID:
alarmable = True
# todo: for each request look up sync handler based on
# resource type (I'm assuming here we're not storing a python
# object in the DB)
# Update dcmanager with the current sync status.
subcloud_enabled = self.is_subcloud_enabled()
if sync_requests:
self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
sync_status_start = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC
else:
self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
sync_status_start = dccommon_consts.SYNC_STATUS_IN_SYNC
self.set_sync_status(
dccommon_consts.SYNC_STATUS_OUT_OF_SYNC, alarmable=alarmable
)
sync_status_start = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC
# Failed orch requests were taken into consideration when reporting
# sync status to the dcmanager. They need to be removed from the
# orch requests list before proceeding.
actual_sync_requests = \
[r for r in sync_requests if r.state != consts.ORCH_REQUEST_STATE_FAILED]
if not actual_sync_requests:
LOG.info("Sync resources done for subcloud - "
"no valid sync requests",
extra=self.log_extra)
return
elif not self.is_subcloud_enabled():
LOG.info("Sync resources done for subcloud - "
"subcloud is disabled",
extra=self.log_extra)
return
if not actual_sync_requests or not subcloud_enabled:
# Either there are no sync requests, or subcloud is disabled,
# or we timed out trying to talk to it.
# We're not going to process any sync requests, just go
# back to sleep.
if not subcloud_enabled:
LOG.info("subcloud is disabled", extra=self.log_extra)
else:
# Subcloud is enabled and there are pending sync requests, so
# we have work to do.
# Subcloud is enabled and there are pending sync requests, so
# we have work to do.
request_aborted = False
try:
for request in actual_sync_requests:
if not self.is_subcloud_enabled() or \
self.should_exit():
# Oops, someone disabled the endpoint while
# we were processing work for it.
raise exceptions.EndpointNotReachable()
request.state = consts.ORCH_REQUEST_STATE_IN_PROGRESS
try:
request.save() # save to DB
except exceptions.OrchRequestNotFound:
# This case is handled in loop below, but should also be
# handled here as well.
LOG.info(
"Orch request already deleted request uuid=%s state=%s"
% (request.uuid, request.state),
extra=self.log_extra,
)
continue
request_aborted = False
try:
for request in actual_sync_requests:
if not self.is_subcloud_enabled() or \
self.should_exit():
# Oops, someone disabled the endpoint while
# we were processing work for it.
raise exceptions.EndpointNotReachable()
request.state = consts.ORCH_REQUEST_STATE_IN_PROGRESS
retry_count = 0
while retry_count < self.MAX_RETRY:
try:
request.save() # save to DB
self.sync_resource(request)
# Sync succeeded, mark the request as
# completed for tracking/debugging purpose
# and tag it for purge when its deleted
# time exceeds the data retention period.
request.state = \
consts.ORCH_REQUEST_STATE_COMPLETED
request.deleted = 1
request.deleted_at = timeutils.utcnow()
request.save()
break
except exceptions.OrchRequestNotFound:
# This case is handled in loop below, but should also be
# handled here as well.
LOG.info("Orch request already deleted request uuid=%s state=%s" %
(request.uuid, request.state),
extra=self.log_extra)
continue
retry_count = 0
while retry_count < self.MAX_RETRY:
try:
self.sync_resource(request)
# Sync succeeded, mark the request as
# completed for tracking/debugging purpose
# and tag it for purge when its deleted
# time exceeds the data retention period.
request.state = \
consts.ORCH_REQUEST_STATE_COMPLETED
request.deleted = 1
request.deleted_at = timeutils.utcnow()
request.save()
break
except exceptions.OrchRequestNotFound:
LOG.info("Orch request already deleted request uuid=%s state=%s" %
(request.uuid, request.state),
extra=self.log_extra)
break
except exceptions.SyncRequestTimeout:
request.try_count += 1
request.save()
retry_count += 1
if retry_count >= self.MAX_RETRY:
# todo: raise "unable to sync this
# subcloud/endpoint" alarm with fmapi
raise exceptions.EndpointNotReachable()
except exceptions.SyncRequestFailedRetry:
LOG.info(
"Orch request already deleted request uuid=%s state=%s"
% (request.uuid, request.state),
extra=self.log_extra,
)
break
except exceptions.SyncRequestTimeout:
request.try_count += 1
request.save()
retry_count += 1
if retry_count >= self.MAX_RETRY:
# todo: raise "unable to sync this
# subcloud/endpoint" alarm with fmapi
request.try_count += 1
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count += 1
# we'll retry
except exceptions.SyncRequestFailed:
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count = self.MAX_RETRY
except exceptions.SyncRequestAbortedBySystem:
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count = self.MAX_RETRY
request_aborted = True
raise exceptions.EndpointNotReachable()
except exceptions.SyncRequestFailedRetry:
# todo: raise "unable to sync this
# subcloud/endpoint" alarm with fmapi
request.try_count += 1
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count += 1
# we'll retry
except exceptions.SyncRequestFailed:
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count = self.MAX_RETRY
except exceptions.SyncRequestAbortedBySystem:
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count = self.MAX_RETRY
request_aborted = True
# If we fall out of the retry loop we either succeeded
# or failed multiple times and want to move to the next
# request.
# If we fall out of the retry loop we either succeeded
# or failed multiple times and want to move to the next
# request.
except exceptions.EndpointNotReachable:
# Endpoint not reachable, throw away all the sync requests.
LOG.info("EndpointNotReachable, {} sync requests pending"
.format(len(actual_sync_requests)))
# del sync_requests[:] #This fails due to:
# 'OrchRequestList' object does not support item deletion
except exceptions.EndpointNotReachable:
# Endpoint not reachable, throw away all the sync requests.
LOG.info(
"EndpointNotReachable, {} sync requests pending".format(
len(actual_sync_requests)), extra=self.log_extra)
# del sync_requests[:] #This fails due to:
# 'OrchRequestList' object does not support item deletion
sync_requests = orchrequest.OrchRequestList.get_by_attrs(
self.ctxt, self.endpoint_type,
target_region_name=region_name,
states=states)
sync_requests = orchrequest.OrchRequestList.get_by_attrs(
self.ctxt, self.endpoint_type,
target_region_name=region_name,
states=self.PENDING_SYNC_REQUEST_STATES)
if (sync_requests and
sync_status_start != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC):
if (sync_requests and
sync_status_start != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC):
self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
LOG.info(
"End of resource sync out-of-sync. {} sync request(s)".format(
len(sync_requests)), extra=self.log_extra)
elif sync_requests and request_aborted:
if sync_status_start != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC:
self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
LOG.info("End of resource sync out-of-sync. " +
str(len(sync_requests)) + " sync request(s)",
extra=self.log_extra)
elif sync_requests and request_aborted:
if sync_status_start != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC:
self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
LOG.info("End of resource sync out-of-sync. " +
str(len(sync_requests)) + " sync request(s)" +
": request_aborted", extra=self.log_extra)
elif sync_status_start != dccommon_consts.SYNC_STATUS_IN_SYNC:
self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
LOG.info("End of resource sync in-sync. " +
str(len(sync_requests)) + " sync request(s)",
extra=self.log_extra)
LOG.info(
"End of resource sync out-of-sync. {} sync request(s): "
"request_aborted".format(len(sync_requests)),
extra=self.log_extra)
elif sync_status_start != dccommon_consts.SYNC_STATUS_IN_SYNC:
self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
LOG.info(
"End of resource sync in-sync. {} sync request(s)".format(
len(sync_requests)), extra=self.log_extra)
LOG.info("Sync resources done for subcloud", extra=self.log_extra)
LOG.info("Sync resources done for subcloud - "
"synced {} request(s)".format(len(actual_sync_requests)),
extra=self.log_extra)
def run_sync_audit(self, engine_id=None):
if self.endpoint_type in cfg.CONF.disable_audit_endpoints: