diff --git a/sysinv/sysinv/sysinv/sysinv/cert_mon/certificate_mon_manager.py b/sysinv/sysinv/sysinv/sysinv/cert_mon/certificate_mon_manager.py
index 03f6898bdd..94770808f0 100644
--- a/sysinv/sysinv/sysinv/sysinv/cert_mon/certificate_mon_manager.py
+++ b/sysinv/sysinv/sysinv/sysinv/cert_mon/certificate_mon_manager.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022 Wind River Systems, Inc.
+# Copyright (c) 2020-2024 Wind River Systems, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -36,12 +36,13 @@ TASK_NAME_PAUSE_AUDIT = 'pause'
 INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES = [
     # Secondary subclouds should not be audited as they are expected
     # to be managed by a peer system controller (geo-redundancy feat.)
+    'create-complete',
+    'pre-rehome',
+    'rehome-failed',
+    'rehome-pending',
+    'rehoming',
     'secondary',
     'secondary-failed',
-    'rehome-pending',
-    'pre-rehome',
-    'rehoming',
-    'rehome-failed'
 ]
 
 cert_mon_opts = [
@@ -118,12 +119,19 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
             # Do nothing if it is not systemcontroller
             return
 
-        all_subclouds = utils.get_subclouds()[:]
+        all_subclouds = utils.get_subclouds_from_dcmanager(
+            self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
+        )
         LOG.info("Periodic: begin subcloud certificate audit: %d subclouds"
                  % len(all_subclouds))
-        for subcloud_name in all_subclouds:
-            self.sc_audit_queue.enqueue(
-                subcloud_audit_queue.SubcloudAuditData(subcloud_name))
+        for sc in all_subclouds:
+            try:
+                self.sc_audit_queue.enqueue(
+                    subcloud_audit_queue.SubcloudAuditData(sc['name']))
+            except subcloud_audit_queue.SubcloudAuditException as exc:
+                # Log as warn because we can see this if the watch has fired
+                # near the same time as we are auditing the subcloud
+                LOG.warning("Failed to enqueue subcloud audit: %s", exc)
 
     def on_start_audit(self):
         """
@@ -136,14 +144,18 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
             return
 
         if CONF.certmon.startup_audit_all:
-            LOG.info("Service start: audit all subclouds")
+            LOG.info("Service start startup_audit_all: audit all subclouds")
             self.audit_sc_cert_start(None)
             return
 
-        LOG.info("Service start: begin subcloud certificate audit [batch: %s]"
-                 % CONF.certmon.audit_batch_size)
         all_subclouds = utils.get_subclouds_from_dcmanager(
-            self.token_cache.get_token())
+            self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
+        )
+        LOG.info(
+            "Service start: begin subcloud certificate audit [#sc: %d, batch: %s]"
+            % (len(all_subclouds), CONF.certmon.audit_batch_size)
+        )
+
         for subcloud in all_subclouds:
             if subcloud[utils.ENDPOINT_TYPE_DC_CERT] != utils.SYNC_STATUS_IN_SYNC:
                 subcloud_name = subcloud['name']
@@ -352,7 +364,8 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
         self.dc_monitor = watcher.DC_CertWatcher()
         self.dc_monitor.initialize(
             audit_subcloud=lambda subcloud_name:
-            self.audit_subcloud(subcloud_name, allow_requeue=True))
+            self.audit_subcloud(subcloud_name, allow_requeue=True),
+            invalid_deploy_states=INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES)
 
     def init_restapicert_monitor(self):
         self.restapicert_monitor = watcher.RestApiCert_CertWatcher()
diff --git a/sysinv/sysinv/sysinv/sysinv/cert_mon/utils.py b/sysinv/sysinv/sysinv/sysinv/cert_mon/utils.py
index 98e5c50695..3024723db7 100644
--- a/sysinv/sysinv/sysinv/sysinv/cert_mon/utils.py
+++ b/sysinv/sysinv/sysinv/sysinv/cert_mon/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023 Wind River Systems, Inc.
+# Copyright (c) 2020-2024 Wind River Systems, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -204,28 +204,31 @@ def get_subcloud(token, subcloud_name):
     return resp
 
 
-def load_subclouds(resp):
+def load_subclouds(resp, invalid_deploy_states=None):
     sc_list = []
-    for obj in resp['subclouds']:
+    for obj in resp["subclouds"]:
+        if invalid_deploy_states and obj["deploy-status"] in invalid_deploy_states:
+            continue
         sc = {}
-        sc['name'] = obj['name']
-        sc['management-state'] = obj['management-state']
-        sc['availability-status'] = obj['availability-status']
-        sc['sync_status'] = obj['sync_status']
-        for ss in obj['endpoint_sync_status']:
-            sc[ss['endpoint_type']] = ss['sync_status']
+        sc["name"] = obj["name"]
+        sc["region-name"] = obj["region-name"]
+        sc["management-state"] = obj["management-state"]
+        sc["availability-status"] = obj["availability-status"]
+        sc["sync_status"] = obj["sync_status"]
+        for ss in obj["endpoint_sync_status"]:
+            sc[ss["endpoint_type"]] = ss["sync_status"]
         sc_list.append(sc)
 
     return sc_list
 
 
-def get_subclouds_from_dcmanager(token):
+def get_subclouds_from_dcmanager(token, invalid_deploy_states=None):
     api_url = dc_get_service_endpoint_url(token)
     api_cmd = api_url + '/subclouds'
     LOG.debug('api_cmd %s' % api_cmd)
     resp = rest_api_request(token, "GET", api_cmd)
 
-    return load_subclouds(resp)
+    return load_subclouds(resp, invalid_deploy_states)
 
 
 def is_subcloud_online(subcloud_name, token=None):
@@ -239,6 +242,36 @@ def is_subcloud_online(subcloud_name, token=None):
     return subcloud_info['availability-status'] == AVAILABILITY_ONLINE
 
 
+def query_subcloud_online_with_deploy_state(
+    subcloud_name, invalid_deploy_states=None, token=None
+):
+    """Check if subcloud is online and not in an invalid deploy state
+
+    Returns a (valid_state, availability_status, deploy_status) tuple;
+    (False, None, None) when the subcloud cannot be found.
+    """
+    if not token:
+        token = get_token()
+    subcloud_info = get_subcloud(token, subcloud_name)
+    if not subcloud_info:
+        LOG.error("Cannot find subcloud %s" % subcloud_name)
+        return False, None, None
+    if (
+        invalid_deploy_states
+        and subcloud_info["deploy-status"] in invalid_deploy_states
+    ):
+        subcloud_valid_state = False
+    else:
+        subcloud_valid_state = (
+            subcloud_info["availability-status"] == AVAILABILITY_ONLINE
+        )
+    return (
+        subcloud_valid_state,
+        subcloud_info["availability-status"],
+        subcloud_info["deploy-status"],
+    )
+
+
 def update_subcloud_status(token, subcloud_name, status):
     api_url = dc_get_service_endpoint_url(token)
     api_cmd = api_url + '/subclouds/%s/update_status' % subcloud_name
diff --git a/sysinv/sysinv/sysinv/sysinv/cert_mon/watcher.py b/sysinv/sysinv/sysinv/sysinv/cert_mon/watcher.py
index 68ffcee4a2..68e64a770c 100644
--- a/sysinv/sysinv/sysinv/sysinv/cert_mon/watcher.py
+++ b/sysinv/sysinv/sysinv/sysinv/cert_mon/watcher.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022 Wind River Systems, Inc.
+# Copyright (c) 2020-2024 Wind River Systems, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -375,7 +375,7 @@ class DC_CertWatcher(CertWatcher):
     def __init__(self):
         super(DC_CertWatcher, self).__init__()
 
-    def initialize(self, audit_subcloud):
+    def initialize(self, audit_subcloud, invalid_deploy_states):
         self.context.initialize()
         dc_role = self.context.dc_role
         LOG.info('DC role: %s' % dc_role)
@@ -390,7 +390,11 @@ class DC_CertWatcher(CertWatcher):
         self.context.kubernete_namespace = ns
         self.register_listener(AdminEndpointRenew(self.context))
         if dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
-            self.register_listener(DCIntermediateCertRenew(self.context, audit_subcloud))
+            self.register_listener(
+                DCIntermediateCertRenew(
+                    self.context, audit_subcloud, invalid_deploy_states
+                )
+            )
             self.register_listener(RootCARenew(self.context))
 
 
@@ -520,26 +524,42 @@ class AdminEndpointRenew(CertificateRenew):
 
 
 class DCIntermediateCertRenew(CertificateRenew):
-    def __init__(self, context, audit_subcloud):
+    def __init__(self, context, audit_subcloud, invalid_deploy_states):
         super(DCIntermediateCertRenew, self).__init__(context)
+        self.invalid_deploy_states = invalid_deploy_states
         self.secret_pattern = re.compile('-adminep-ca-certificate$')
         self.audit_subcloud = audit_subcloud
 
     def check_filter(self, event_data):
-        m = self.secret_pattern.search(event_data.secret_name)
-        if m and m.start() > 0:
-            # Ensure subcloud is online (watch events can fire
-            # for secrets before the subcloud first comes online)
+        search_result = self.secret_pattern.search(event_data.secret_name)
+        if search_result and search_result.start() > 0:
+            # Ensure subcloud is in a valid deploy-status and online (watch
+            # events can fire for secrets before the subcloud first comes online)
             subcloud_name = self._get_subcloud_name(event_data)
             try:
-                if not utils.is_subcloud_online(subcloud_name,
-                                                token=self.context.get_token()):
-                    LOG.info('%s check_filter[%s]: subcloud is not online' %
-                             (self.__class__.__name__, subcloud_name))
+                (
+                    subcloud_valid_state,
+                    availability_status,
+                    deploy_status,
+                ) = utils.query_subcloud_online_with_deploy_state(
+                    subcloud_name,
+                    invalid_deploy_states=self.invalid_deploy_states,
+                    token=self.context.get_token(),
+                )
+                if not subcloud_valid_state:
+                    LOG.info(
+                        "%s check_filter: subcloud %s is ignored, "
+                        "availability=%s, deploy_status: %s",
+                        self.__class__.__name__,
+                        subcloud_name,
+                        availability_status,
+                        deploy_status,
+                    )
                     return False
             except Exception:
-                LOG.exception('Failed to check subcloud availability: %s'
-                              % subcloud_name)
+                LOG.exception(
+                    "Failed to check subcloud availability: %s" % subcloud_name
+                )
                 return False
             return self.certificate_is_ready(event_data)
         else: