Merge "Filter cert-mon for geo-redundancy in audit and DC_CertWatcher"

This commit is contained in:
Zuul 2024-04-04 21:54:21 +00:00 committed by Gerrit Code Review
commit 8ea80c4b27
3 changed files with 102 additions and 39 deletions

View File

@ -1,4 +1,4 @@
# Copyright (c) 2020-2022 Wind River Systems, Inc. # Copyright (c) 2020-2024 Wind River Systems, Inc.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -36,12 +36,13 @@ TASK_NAME_PAUSE_AUDIT = 'pause'
INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES = [ INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES = [
# Secondary subclouds should not be audited as they are expected # Secondary subclouds should not be audited as they are expected
# to be managed by a peer system controller (geo-redundancy feat.) # to be managed by a peer system controller (geo-redundancy feat.)
'create-complete',
'pre-rehome',
'rehome-failed',
'rehome-pending',
'rehoming',
'secondary', 'secondary',
'secondary-failed', 'secondary-failed',
'rehome-pending',
'pre-rehome',
'rehoming',
'rehome-failed'
] ]
cert_mon_opts = [ cert_mon_opts = [
@ -118,12 +119,19 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
# Do nothing if it is not systemcontroller # Do nothing if it is not systemcontroller
return return
all_subclouds = utils.get_subclouds()[:] all_subclouds = utils.get_subclouds_from_dcmanager(
self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
)
LOG.info("Periodic: begin subcloud certificate audit: %d subclouds" LOG.info("Periodic: begin subcloud certificate audit: %d subclouds"
% len(all_subclouds)) % len(all_subclouds))
for subcloud_name in all_subclouds: for sc in all_subclouds:
self.sc_audit_queue.enqueue( try:
subcloud_audit_queue.SubcloudAuditData(subcloud_name)) self.sc_audit_queue.enqueue(
subcloud_audit_queue.SubcloudAuditData(sc['name']))
except subcloud_audit_queue.SubcloudAuditException as exc:
# Log as warn because we can see this if the watch has fired
# near the same time as we are auditing the subcloud
LOG.warn("Failed to enqueue subcloud audit: %s", str(exc))
def on_start_audit(self): def on_start_audit(self):
""" """
@ -136,14 +144,18 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
return return
if CONF.certmon.startup_audit_all: if CONF.certmon.startup_audit_all:
LOG.info("Service start: audit all subclouds") LOG.info("Service start startup_audit_all: audit all subclouds")
self.audit_sc_cert_start(None) self.audit_sc_cert_start(None)
return return
LOG.info("Service start: begin subcloud certificate audit [batch: %s]"
% CONF.certmon.audit_batch_size)
all_subclouds = utils.get_subclouds_from_dcmanager( all_subclouds = utils.get_subclouds_from_dcmanager(
self.token_cache.get_token()) self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
)
LOG.info(
"Service start: begin subcloud certificate audit [#sc: %d, batch: %s]"
% (len(all_subclouds), CONF.certmon.audit_batch_size)
)
for subcloud in all_subclouds: for subcloud in all_subclouds:
if subcloud[utils.ENDPOINT_TYPE_DC_CERT] != utils.SYNC_STATUS_IN_SYNC: if subcloud[utils.ENDPOINT_TYPE_DC_CERT] != utils.SYNC_STATUS_IN_SYNC:
subcloud_name = subcloud['name'] subcloud_name = subcloud['name']
@ -352,7 +364,8 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
self.dc_monitor = watcher.DC_CertWatcher() self.dc_monitor = watcher.DC_CertWatcher()
self.dc_monitor.initialize( self.dc_monitor.initialize(
audit_subcloud=lambda subcloud_name: audit_subcloud=lambda subcloud_name:
self.audit_subcloud(subcloud_name, allow_requeue=True)) self.audit_subcloud(subcloud_name, allow_requeue=True),
invalid_deploy_states=INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES)
def init_restapicert_monitor(self): def init_restapicert_monitor(self):
self.restapicert_monitor = watcher.RestApiCert_CertWatcher() self.restapicert_monitor = watcher.RestApiCert_CertWatcher()

View File

@ -1,4 +1,4 @@
# Copyright (c) 2020-2023 Wind River Systems, Inc. # Copyright (c) 2020-2024 Wind River Systems, Inc.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -204,28 +204,31 @@ def get_subcloud(token, subcloud_name):
return resp return resp
def load_subclouds(resp): def load_subclouds(resp, invalid_deploy_states=None):
sc_list = [] sc_list = []
for obj in resp['subclouds']: for obj in resp["subclouds"]:
if invalid_deploy_states and obj["deploy-status"] in invalid_deploy_states:
continue
sc = {} sc = {}
sc['name'] = obj['name'] sc["name"] = obj["name"]
sc['management-state'] = obj['management-state'] sc["region-name"] = obj["region-name"]
sc['availability-status'] = obj['availability-status'] sc["management-state"] = obj["management-state"]
sc['sync_status'] = obj['sync_status'] sc["availability-status"] = obj["availability-status"]
for ss in obj['endpoint_sync_status']: sc["sync_status"] = obj["sync_status"]
sc[ss['endpoint_type']] = ss['sync_status'] for ss in obj["endpoint_sync_status"]:
sc[ss["endpoint_type"]] = ss["sync_status"]
sc_list.append(sc) sc_list.append(sc)
return sc_list return sc_list
def get_subclouds_from_dcmanager(token): def get_subclouds_from_dcmanager(token, invalid_deploy_states=None):
api_url = dc_get_service_endpoint_url(token) api_url = dc_get_service_endpoint_url(token)
api_cmd = api_url + '/subclouds' api_cmd = api_url + '/subclouds'
LOG.debug('api_cmd %s' % api_cmd) LOG.debug('api_cmd %s' % api_cmd)
resp = rest_api_request(token, "GET", api_cmd) resp = rest_api_request(token, "GET", api_cmd)
return load_subclouds(resp) return load_subclouds(resp, invalid_deploy_states)
def is_subcloud_online(subcloud_name, token=None): def is_subcloud_online(subcloud_name, token=None):
@ -239,6 +242,33 @@ def is_subcloud_online(subcloud_name, token=None):
return subcloud_info['availability-status'] == AVAILABILITY_ONLINE return subcloud_info['availability-status'] == AVAILABILITY_ONLINE
def query_subcloud_online_with_deploy_state(
    subcloud_name, invalid_deploy_states=None, token=None
):
    """Report whether a subcloud is online and not in an invalid deploy state.

    :param subcloud_name: name of the subcloud passed to get_subcloud()
    :param invalid_deploy_states: optional collection of deploy-status values
        that disqualify the subcloud; when None or empty, the deploy-status
        is not used for filtering
    :param token: token handed to get_subcloud(); when falsy, one is
        obtained from get_token()
    :returns: a 3-tuple ``(valid, availability_status, deploy_status)``.
        ``valid`` is True only when the deploy-status is not listed in
        invalid_deploy_states and the availability-status equals
        AVAILABILITY_ONLINE. When get_subcloud() returns nothing, the
        result is ``(False, None, None)`` and an error is logged.
    """
    subcloud_info = get_subcloud(token if token else get_token(), subcloud_name)
    if not subcloud_info:
        LOG.error("Cannot find subcloud %s" % subcloud_name)
        return False, None, None

    # The deploy-status is only consulted when a filter list was supplied.
    excluded = bool(invalid_deploy_states) and (
        subcloud_info["deploy-status"] in invalid_deploy_states
    )
    is_online = subcloud_info["availability-status"] == AVAILABILITY_ONLINE

    # The raw statuses are returned alongside the verdict so the caller can
    # report them (e.g. in a log line) without a second lookup.
    return (
        not excluded and is_online,
        subcloud_info["availability-status"],
        subcloud_info["deploy-status"],
    )
def update_subcloud_status(token, subcloud_name, status): def update_subcloud_status(token, subcloud_name, status):
api_url = dc_get_service_endpoint_url(token) api_url = dc_get_service_endpoint_url(token)
api_cmd = api_url + '/subclouds/%s/update_status' % subcloud_name api_cmd = api_url + '/subclouds/%s/update_status' % subcloud_name

View File

@ -1,4 +1,4 @@
# Copyright (c) 2020-2022 Wind River Systems, Inc. # Copyright (c) 2020-2024 Wind River Systems, Inc.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -375,7 +375,7 @@ class DC_CertWatcher(CertWatcher):
def __init__(self): def __init__(self):
super(DC_CertWatcher, self).__init__() super(DC_CertWatcher, self).__init__()
def initialize(self, audit_subcloud): def initialize(self, audit_subcloud, invalid_deploy_states):
self.context.initialize() self.context.initialize()
dc_role = self.context.dc_role dc_role = self.context.dc_role
LOG.info('DC role: %s' % dc_role) LOG.info('DC role: %s' % dc_role)
@ -390,7 +390,11 @@ class DC_CertWatcher(CertWatcher):
self.context.kubernete_namespace = ns self.context.kubernete_namespace = ns
self.register_listener(AdminEndpointRenew(self.context)) self.register_listener(AdminEndpointRenew(self.context))
if dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER: if dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
self.register_listener(DCIntermediateCertRenew(self.context, audit_subcloud)) self.register_listener(
DCIntermediateCertRenew(
self.context, audit_subcloud, invalid_deploy_states
)
)
self.register_listener(RootCARenew(self.context)) self.register_listener(RootCARenew(self.context))
@ -520,26 +524,42 @@ class AdminEndpointRenew(CertificateRenew):
class DCIntermediateCertRenew(CertificateRenew): class DCIntermediateCertRenew(CertificateRenew):
def __init__(self, context, audit_subcloud): def __init__(self, context, audit_subcloud, invalid_deploy_states):
super(DCIntermediateCertRenew, self).__init__(context) super(DCIntermediateCertRenew, self).__init__(context)
self.invalid_deploy_states = invalid_deploy_states
self.secret_pattern = re.compile('-adminep-ca-certificate$') self.secret_pattern = re.compile('-adminep-ca-certificate$')
self.audit_subcloud = audit_subcloud self.audit_subcloud = audit_subcloud
def check_filter(self, event_data): def check_filter(self, event_data):
m = self.secret_pattern.search(event_data.secret_name) search_result = self.secret_pattern.search(event_data.secret_name)
if m and m.start() > 0: if search_result and search_result.start() > 0:
# Ensure subcloud is online (watch events can fire # Ensure subcloud is in a valid deploy-status and online (watch
# for secrets before the subcloud first comes online) # events can fire for secrets before the subcloud first comes online)
subcloud_name = self._get_subcloud_name(event_data) subcloud_name = self._get_subcloud_name(event_data)
try: try:
if not utils.is_subcloud_online(subcloud_name, (
token=self.context.get_token()): subcloud_valid_state,
LOG.info('%s check_filter[%s]: subcloud is not online' % availability_status,
(self.__class__.__name__, subcloud_name)) deploy_status,
) = utils.query_subcloud_online_with_deploy_state(
subcloud_name,
invalid_deploy_states=self.invalid_deploy_states,
token=self.context.get_token(),
)
if not subcloud_valid_state:
LOG.info(
"%s check_filter: subcloud %s is ignored, "
"availability=%s, deploy_status: %s",
self.__class__.__name__,
subcloud_name,
availability_status,
deploy_status,
)
return False return False
except Exception: except Exception:
LOG.exception('Failed to check subcloud availability: %s' LOG.exception(
% subcloud_name) "Failed to check subcloud availability: %s" % subcloud_name
)
return False return False
return self.certificate_is_ready(event_data) return self.certificate_is_ready(event_data)
else: else: