Filter cert-mon for geo-redundancy in audit and DC_CertWatcher

This commit adds a filter to the query that retrieves all subclouds from
dcmanager, so that secondary subclouds, which should not be audited by
cert-mon on this system controller, are excluded. Subclouds are filtered
out when their deploy status matches an entry in a list of invalid
deploy states.

Likewise, the DC_CertWatcher -> DCIntermediateCertRenew flow must ensure
that subclouds which are secondary to this system controller are ignored
by the kubernetes watch in place for the DC intermediate cert renewal
detection. Subclouds are filtered by the watch based on their online
state and their deploy-status. A subcloud with invalid deploy state is
ignored by this system controller.

Test Cases

PASS:
- Trigger audits on service restart. Verify that offline/secondary
  subclouds are excluded.
- Ensure full daily audit is executed. Verify that all subclouds
  belonging to this system controller are audited. Secondary subclouds
  are not audited.
- Verify that DC_CertWatcher -> DCIntermediateCertRenew watch events are
  ignored for subclouds that are offline and/or in an invalid deploy state

Closes-Bug: 2060068

Change-Id: Iffe3d7c76db8d2f17aed0bfebc792af0f9d75ca2
Signed-off-by: Kyle MacLeod <kyle.macleod@windriver.com>
This commit is contained in:
Kyle MacLeod 2024-04-02 11:52:39 -04:00
parent 2cbdc83b04
commit 03443ef16c
3 changed files with 102 additions and 39 deletions

View File

@ -1,4 +1,4 @@
# Copyright (c) 2020-2022 Wind River Systems, Inc.
# Copyright (c) 2020-2024 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -36,12 +36,13 @@ TASK_NAME_PAUSE_AUDIT = 'pause'
INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES = [
# Secondary subclouds should not be audited as they are expected
# to be managed by a peer system controller (geo-redundancy feat.)
'create-complete',
'pre-rehome',
'rehome-failed',
'rehome-pending',
'rehoming',
'secondary',
'secondary-failed',
'rehome-pending',
'pre-rehome',
'rehoming',
'rehome-failed'
]
cert_mon_opts = [
@ -118,12 +119,19 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
# Do nothing if it is not systemcontroller
return
all_subclouds = utils.get_subclouds()[:]
all_subclouds = utils.get_subclouds_from_dcmanager(
self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
)
LOG.info("Periodic: begin subcloud certificate audit: %d subclouds"
% len(all_subclouds))
for subcloud_name in all_subclouds:
self.sc_audit_queue.enqueue(
subcloud_audit_queue.SubcloudAuditData(subcloud_name))
for sc in all_subclouds:
try:
self.sc_audit_queue.enqueue(
subcloud_audit_queue.SubcloudAuditData(sc['name']))
except subcloud_audit_queue.SubcloudAuditException as exc:
# Log as warn because we can see this if the watch has fired
# near the same time as we are auditing the subcloud
LOG.warn("Failed to enqueue subcloud audit: %s", str(exc))
def on_start_audit(self):
"""
@ -136,14 +144,18 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
return
if CONF.certmon.startup_audit_all:
LOG.info("Service start: audit all subclouds")
LOG.info("Service start startup_audit_all: audit all subclouds")
self.audit_sc_cert_start(None)
return
LOG.info("Service start: begin subcloud certificate audit [batch: %s]"
% CONF.certmon.audit_batch_size)
all_subclouds = utils.get_subclouds_from_dcmanager(
self.token_cache.get_token())
self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
)
LOG.info(
"Service start: begin subcloud certificate audit [#sc: %d, batch: %s]"
% (len(all_subclouds), CONF.certmon.audit_batch_size)
)
for subcloud in all_subclouds:
if subcloud[utils.ENDPOINT_TYPE_DC_CERT] != utils.SYNC_STATUS_IN_SYNC:
subcloud_name = subcloud['name']
@ -352,7 +364,8 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
self.dc_monitor = watcher.DC_CertWatcher()
self.dc_monitor.initialize(
audit_subcloud=lambda subcloud_name:
self.audit_subcloud(subcloud_name, allow_requeue=True))
self.audit_subcloud(subcloud_name, allow_requeue=True),
invalid_deploy_states=INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES)
def init_restapicert_monitor(self):
self.restapicert_monitor = watcher.RestApiCert_CertWatcher()

View File

@ -1,4 +1,4 @@
# Copyright (c) 2020-2023 Wind River Systems, Inc.
# Copyright (c) 2020-2024 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -204,28 +204,31 @@ def get_subcloud(token, subcloud_name):
return resp
def load_subclouds(resp):
def load_subclouds(resp, invalid_deploy_states=None):
sc_list = []
for obj in resp['subclouds']:
for obj in resp["subclouds"]:
if invalid_deploy_states and obj["deploy-status"] in invalid_deploy_states:
continue
sc = {}
sc['name'] = obj['name']
sc['management-state'] = obj['management-state']
sc['availability-status'] = obj['availability-status']
sc['sync_status'] = obj['sync_status']
for ss in obj['endpoint_sync_status']:
sc[ss['endpoint_type']] = ss['sync_status']
sc["name"] = obj["name"]
sc["region-name"] = obj["region-name"]
sc["management-state"] = obj["management-state"]
sc["availability-status"] = obj["availability-status"]
sc["sync_status"] = obj["sync_status"]
for ss in obj["endpoint_sync_status"]:
sc[ss["endpoint_type"]] = ss["sync_status"]
sc_list.append(sc)
return sc_list
def get_subclouds_from_dcmanager(token):
def get_subclouds_from_dcmanager(token, invalid_deploy_states=None):
api_url = dc_get_service_endpoint_url(token)
api_cmd = api_url + '/subclouds'
LOG.debug('api_cmd %s' % api_cmd)
resp = rest_api_request(token, "GET", api_cmd)
return load_subclouds(resp)
return load_subclouds(resp, invalid_deploy_states)
def is_subcloud_online(subcloud_name, token=None):
@ -239,6 +242,33 @@ def is_subcloud_online(subcloud_name, token=None):
return subcloud_info['availability-status'] == AVAILABILITY_ONLINE
def query_subcloud_online_with_deploy_state(
    subcloud_name, invalid_deploy_states=None, token=None
):
    """Report whether a subcloud is online and in an acceptable deploy state.

    :param subcloud_name: name of the subcloud to look up
    :param invalid_deploy_states: optional collection of deploy-status
        values that disqualify the subcloud; when empty or None, no
        deploy-status filtering is applied
    :param token: optional token used for the lookup; when not supplied,
        one is obtained via get_token()
    :returns: tuple of (valid, availability-status, deploy-status). The
        first element is True only when the deploy-status is not in
        invalid_deploy_states and the availability-status equals
        AVAILABILITY_ONLINE. If the subcloud cannot be found, the
        result is (False, None, None).
    """
    if not token:
        token = get_token()

    info = get_subcloud(token, subcloud_name)
    if not info:
        LOG.error("Cannot find subcloud %s" % subcloud_name)
        return False, None, None

    # The deploy-status is only consulted when a filter list was provided.
    excluded = bool(invalid_deploy_states) and (
        info["deploy-status"] in invalid_deploy_states
    )
    # An excluded subcloud is never valid, regardless of its availability.
    is_valid = (not excluded) and (
        info["availability-status"] == AVAILABILITY_ONLINE
    )
    return is_valid, info["availability-status"], info["deploy-status"]
def update_subcloud_status(token, subcloud_name, status):
api_url = dc_get_service_endpoint_url(token)
api_cmd = api_url + '/subclouds/%s/update_status' % subcloud_name

View File

@ -1,4 +1,4 @@
# Copyright (c) 2020-2022 Wind River Systems, Inc.
# Copyright (c) 2020-2024 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -375,7 +375,7 @@ class DC_CertWatcher(CertWatcher):
def __init__(self):
super(DC_CertWatcher, self).__init__()
def initialize(self, audit_subcloud):
def initialize(self, audit_subcloud, invalid_deploy_states):
self.context.initialize()
dc_role = self.context.dc_role
LOG.info('DC role: %s' % dc_role)
@ -390,7 +390,11 @@ class DC_CertWatcher(CertWatcher):
self.context.kubernete_namespace = ns
self.register_listener(AdminEndpointRenew(self.context))
if dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
self.register_listener(DCIntermediateCertRenew(self.context, audit_subcloud))
self.register_listener(
DCIntermediateCertRenew(
self.context, audit_subcloud, invalid_deploy_states
)
)
self.register_listener(RootCARenew(self.context))
@ -520,26 +524,42 @@ class AdminEndpointRenew(CertificateRenew):
class DCIntermediateCertRenew(CertificateRenew):
def __init__(self, context, audit_subcloud):
def __init__(self, context, audit_subcloud, invalid_deploy_states):
super(DCIntermediateCertRenew, self).__init__(context)
self.invalid_deploy_states = invalid_deploy_states
self.secret_pattern = re.compile('-adminep-ca-certificate$')
self.audit_subcloud = audit_subcloud
def check_filter(self, event_data):
m = self.secret_pattern.search(event_data.secret_name)
if m and m.start() > 0:
# Ensure subcloud is online (watch events can fire
# for secrets before the subcloud first comes online)
search_result = self.secret_pattern.search(event_data.secret_name)
if search_result and search_result.start() > 0:
# Ensure subcloud is in a valid deploy-status and online (watch
# events can fire for secrets before the subcloud first comes online)
subcloud_name = self._get_subcloud_name(event_data)
try:
if not utils.is_subcloud_online(subcloud_name,
token=self.context.get_token()):
LOG.info('%s check_filter[%s]: subcloud is not online' %
(self.__class__.__name__, subcloud_name))
(
subcloud_valid_state,
availability_status,
deploy_status,
) = utils.query_subcloud_online_with_deploy_state(
subcloud_name,
invalid_deploy_states=self.invalid_deploy_states,
token=self.context.get_token(),
)
if not subcloud_valid_state:
LOG.info(
"%s check_filter: subcloud %s is ignored, "
"availability=%s, deploy_status: %s",
self.__class__.__name__,
subcloud_name,
availability_status,
deploy_status,
)
return False
except Exception:
LOG.exception('Failed to check subcloud availability: %s'
% subcloud_name)
LOG.exception(
"Failed to check subcloud availability: %s" % subcloud_name
)
return False
return self.certificate_is_ready(event_data)
else: