From 03443ef16c0c47d15631eb9001b413a3b8ea39fc Mon Sep 17 00:00:00 2001 From: Kyle MacLeod Date: Tue, 2 Apr 2024 11:52:39 -0400 Subject: [PATCH] Filter cert-mon for geo-redundancy in audit and DC_CertWatcher This commit adds a filter for querying all subclouds from dcmanager, to account for secondary subclouds that should not be audited by cert-mon for this system controller. The filter is applied against a list of invalid deploy states: subclouds in any of these states are excluded when querying the list of subclouds from dcmanager. Likewise, the DC_CertWatcher -> DCIntermediateCertRenew flow must ensure that subclouds which are secondary to this system controller are ignored by the Kubernetes watch in place for the DC intermediate cert renewal detection. Subclouds are filtered by the watch based on their online state and their deploy-status. A subcloud with an invalid deploy state is ignored by this system controller. Test Cases PASS: - Trigger audits on service restart. Verify that offline/secondary subclouds are excluded. - Ensure full daily audit is executed. Verify that all subclouds belonging to this system controller are audited. Secondary subclouds are not audited. 
- Verify that DC_CertWatcher -> DCIntermediateCertRenew watch events are ignored for subclouds that are offline and/or in an invalid deploy state Closes-Bug: 2060068 Change-Id: Iffe3d7c76db8d2f17aed0bfebc792af0f9d75ca2 Signed-off-by: Kyle MacLeod --- .../cert_mon/certificate_mon_manager.py | 41 ++++++++++----- sysinv/sysinv/sysinv/sysinv/cert_mon/utils.py | 52 +++++++++++++++---- .../sysinv/sysinv/sysinv/cert_mon/watcher.py | 48 ++++++++++++----- 3 files changed, 102 insertions(+), 39 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/cert_mon/certificate_mon_manager.py b/sysinv/sysinv/sysinv/sysinv/cert_mon/certificate_mon_manager.py index 03f6898bdd..94770808f0 100644 --- a/sysinv/sysinv/sysinv/sysinv/cert_mon/certificate_mon_manager.py +++ b/sysinv/sysinv/sysinv/sysinv/cert_mon/certificate_mon_manager.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022 Wind River Systems, Inc. +# Copyright (c) 2020-2024 Wind River Systems, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -36,12 +36,13 @@ TASK_NAME_PAUSE_AUDIT = 'pause' INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES = [ # Secondary subclouds should not be audited as they are expected # to be managed by a peer system controller (geo-redundancy feat.) 
+ 'create-complete', + 'pre-rehome', + 'rehome-failed', + 'rehome-pending', + 'rehoming', 'secondary', 'secondary-failed', - 'rehome-pending', - 'pre-rehome', - 'rehoming', - 'rehome-failed' ] cert_mon_opts = [ @@ -118,12 +119,19 @@ class CertificateMonManager(periodic_task.PeriodicTasks): # Do nothing if it is not systemcontroller return - all_subclouds = utils.get_subclouds()[:] + all_subclouds = utils.get_subclouds_from_dcmanager( + self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES + ) LOG.info("Periodic: begin subcloud certificate audit: %d subclouds" % len(all_subclouds)) - for subcloud_name in all_subclouds: - self.sc_audit_queue.enqueue( - subcloud_audit_queue.SubcloudAuditData(subcloud_name)) + for sc in all_subclouds: + try: + self.sc_audit_queue.enqueue( + subcloud_audit_queue.SubcloudAuditData(sc['name'])) + except subcloud_audit_queue.SubcloudAuditException as exc: + # Log as warn because we can see this if the watch has fired + # near the same time as we are auditing the subcloud + LOG.warn("Failed to enqueue subcloud audit: %s", str(exc)) def on_start_audit(self): """ @@ -136,14 +144,18 @@ class CertificateMonManager(periodic_task.PeriodicTasks): return if CONF.certmon.startup_audit_all: - LOG.info("Service start: audit all subclouds") + LOG.info("Service start startup_audit_all: audit all subclouds") self.audit_sc_cert_start(None) return - LOG.info("Service start: begin subcloud certificate audit [batch: %s]" - % CONF.certmon.audit_batch_size) all_subclouds = utils.get_subclouds_from_dcmanager( - self.token_cache.get_token()) + self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES + ) + LOG.info( + "Service start: begin subcloud certificate audit [#sc: %d, batch: %s]" + % (len(all_subclouds), CONF.certmon.audit_batch_size) + ) + for subcloud in all_subclouds: if subcloud[utils.ENDPOINT_TYPE_DC_CERT] != utils.SYNC_STATUS_IN_SYNC: subcloud_name = subcloud['name'] @@ -352,7 +364,8 @@ class 
CertificateMonManager(periodic_task.PeriodicTasks): self.dc_monitor = watcher.DC_CertWatcher() self.dc_monitor.initialize( audit_subcloud=lambda subcloud_name: - self.audit_subcloud(subcloud_name, allow_requeue=True)) + self.audit_subcloud(subcloud_name, allow_requeue=True), + invalid_deploy_states=INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES) def init_restapicert_monitor(self): self.restapicert_monitor = watcher.RestApiCert_CertWatcher() diff --git a/sysinv/sysinv/sysinv/sysinv/cert_mon/utils.py b/sysinv/sysinv/sysinv/sysinv/cert_mon/utils.py index 98e5c50695..3024723db7 100644 --- a/sysinv/sysinv/sysinv/sysinv/cert_mon/utils.py +++ b/sysinv/sysinv/sysinv/sysinv/cert_mon/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Wind River Systems, Inc. +# Copyright (c) 2020-2024 Wind River Systems, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -204,28 +204,31 @@ def get_subcloud(token, subcloud_name): return resp -def load_subclouds(resp): +def load_subclouds(resp, invalid_deploy_states=None): sc_list = [] - for obj in resp['subclouds']: + for obj in resp["subclouds"]: + if invalid_deploy_states and obj["deploy-status"] in invalid_deploy_states: + continue sc = {} - sc['name'] = obj['name'] - sc['management-state'] = obj['management-state'] - sc['availability-status'] = obj['availability-status'] - sc['sync_status'] = obj['sync_status'] - for ss in obj['endpoint_sync_status']: - sc[ss['endpoint_type']] = ss['sync_status'] + sc["name"] = obj["name"] + sc["region-name"] = obj["region-name"] + sc["management-state"] = obj["management-state"] + sc["availability-status"] = obj["availability-status"] + sc["sync_status"] = obj["sync_status"] + for ss in obj["endpoint_sync_status"]: + sc[ss["endpoint_type"]] = ss["sync_status"] sc_list.append(sc) return sc_list -def get_subclouds_from_dcmanager(token): +def get_subclouds_from_dcmanager(token, invalid_deploy_states=None): api_url = 
dc_get_service_endpoint_url(token) api_cmd = api_url + '/subclouds' LOG.debug('api_cmd %s' % api_cmd) resp = rest_api_request(token, "GET", api_cmd) - return load_subclouds(resp) + return load_subclouds(resp, invalid_deploy_states) def is_subcloud_online(subcloud_name, token=None): @@ -239,6 +242,33 @@ def is_subcloud_online(subcloud_name, token=None): return subcloud_info['availability-status'] == AVAILABILITY_ONLINE +def query_subcloud_online_with_deploy_state( + subcloud_name, invalid_deploy_states=None, token=None +): + """Check if subcloud is online and not in an invalid deploy state""" + if not token: + token = get_token() + subcloud_info = get_subcloud(token, subcloud_name) + if not subcloud_info: + LOG.error("Cannot find subcloud %s" % subcloud_name) + return False, None, None + subcloud_valid_state = False + if ( + invalid_deploy_states + and subcloud_info["deploy-status"] in invalid_deploy_states + ): + subcloud_valid_state = False + else: + subcloud_valid_state = ( + subcloud_info["availability-status"] == AVAILABILITY_ONLINE + ) + return ( + subcloud_valid_state, + subcloud_info["availability-status"], + subcloud_info["deploy-status"], + ) + + def update_subcloud_status(token, subcloud_name, status): api_url = dc_get_service_endpoint_url(token) api_cmd = api_url + '/subclouds/%s/update_status' % subcloud_name diff --git a/sysinv/sysinv/sysinv/sysinv/cert_mon/watcher.py b/sysinv/sysinv/sysinv/sysinv/cert_mon/watcher.py index 68ffcee4a2..68e64a770c 100644 --- a/sysinv/sysinv/sysinv/sysinv/cert_mon/watcher.py +++ b/sysinv/sysinv/sysinv/sysinv/cert_mon/watcher.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022 Wind River Systems, Inc. +# Copyright (c) 2020-2024 Wind River Systems, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -375,7 +375,7 @@ class DC_CertWatcher(CertWatcher): def __init__(self): super(DC_CertWatcher, self).__init__() - def initialize(self, audit_subcloud): + def initialize(self, audit_subcloud, invalid_deploy_states): self.context.initialize() dc_role = self.context.dc_role LOG.info('DC role: %s' % dc_role) @@ -390,7 +390,11 @@ class DC_CertWatcher(CertWatcher): self.context.kubernete_namespace = ns self.register_listener(AdminEndpointRenew(self.context)) if dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER: - self.register_listener(DCIntermediateCertRenew(self.context, audit_subcloud)) + self.register_listener( + DCIntermediateCertRenew( + self.context, audit_subcloud, invalid_deploy_states + ) + ) self.register_listener(RootCARenew(self.context)) @@ -520,26 +524,42 @@ class AdminEndpointRenew(CertificateRenew): class DCIntermediateCertRenew(CertificateRenew): - def __init__(self, context, audit_subcloud): + def __init__(self, context, audit_subcloud, invalid_deploy_states): super(DCIntermediateCertRenew, self).__init__(context) + self.invalid_deploy_states = invalid_deploy_states self.secret_pattern = re.compile('-adminep-ca-certificate$') self.audit_subcloud = audit_subcloud def check_filter(self, event_data): - m = self.secret_pattern.search(event_data.secret_name) - if m and m.start() > 0: - # Ensure subcloud is online (watch events can fire - # for secrets before the subcloud first comes online) + search_result = self.secret_pattern.search(event_data.secret_name) + if search_result and search_result.start() > 0: + # Ensure subcloud is in a valid deploy-status and online (watch + # events can fire for secrets before the subcloud first comes online) subcloud_name = self._get_subcloud_name(event_data) try: - if not utils.is_subcloud_online(subcloud_name, - token=self.context.get_token()): - LOG.info('%s check_filter[%s]: subcloud is not online' % - (self.__class__.__name__, subcloud_name)) + ( + subcloud_valid_state, + availability_status, + 
deploy_status, + ) = utils.query_subcloud_online_with_deploy_state( + subcloud_name, + invalid_deploy_states=self.invalid_deploy_states, + token=self.context.get_token(), + ) + if not subcloud_valid_state: + LOG.info( + "%s check_filter: subcloud %s is ignored, " + "availability=%s, deploy_status: %s", + self.__class__.__name__, + subcloud_name, + availability_status, + deploy_status, + ) return False except Exception: - LOG.exception('Failed to check subcloud availability: %s' - % subcloud_name) + LOG.exception( + "Failed to check subcloud availability: %s" % subcloud_name + ) return False return self.certificate_is_ready(event_data) else: