# config/sysinv/sysinv/sysinv/sysinv/cert_mon/certificate_mon_manager.py

# Copyright (c) 2020-2024 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

import time

import eventlet
import greenlet
from oslo_config import cfg
from oslo_log import log
from oslo_serialization import base64
from oslo_service import periodic_task

from sysinv.cert_mon import subcloud_audit_queue
from sysinv.cert_mon import utils
from sysinv.cert_mon import watcher
from sysinv.common import constants
from sysinv.common import utils as cutils

LOG = log.getLogger(__name__)

TASK_NAME_PAUSE_AUDIT = 'pause'

INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES = [
    # Secondary subclouds should not be audited, as they are expected
    # to be managed by a peer system controller (geo-redundancy feature)
    'create-complete',
    'pre-rehome',
    'rehome-failed',
    'rehome-pending',
    'rehoming',
    'secondary',
    'secondary-failed',
]

cert_mon_opts = [
    cfg.IntOpt('audit_interval',
               default=86400,  # 24 hours
               help='Interval to run certificate audit'),
    cfg.IntOpt('retry_interval',
               default=10 * 60,  # retry every 10 minutes
               help='Interval to reattempt accessing external system '
                    'if failure occurred'),
    cfg.IntOpt('max_retry',
               default=14,  # retry 14 times, giving at least 2 hours to recover
               help='Max number of reattempts accessing external system '
                    'if failure occurred'),
    cfg.BoolOpt('startup_audit_all',
                default=False,
                help='Audit all subclouds on startup'),
    cfg.IntOpt('network_retry_interval',
               default=180,  # every 3 minutes
               help='Interval to reattempt accessing external system '
                    'if network failure occurred'),
    cfg.IntOpt('network_max_retry',
               default=30,
               help='Max number of reattempts accessing external system '
                    'if network failure occurred'),
    cfg.IntOpt('audit_batch_size',
               default=10,
               help='Batch size of subcloud audits per audit_interval'),
    cfg.IntOpt('audit_greenpool_size',
               default=4,
               help='Size of the subcloud audit greenpool. '
                    'Set to 0 to disable use of the greenpool '
                    '(force serial audits).'),
    cfg.IntOpt('certificate_timeout_secs',
               default=10,
               help='Connection timeout for certificate check (in seconds)'),
]

CONF = cfg.CONF
CONF.register_opts(cert_mon_opts, 'certmon')
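
# For reference, these options are read from the [certmon] section of the
# service configuration file. A minimal sketch using the defaults
# registered above (the exact config file name and location vary by
# deployment and are not defined in this module):
#
#   [certmon]
#   audit_interval = 86400
#   retry_interval = 600
#   max_retry = 14
#   startup_audit_all = False
#   network_retry_interval = 180
#   network_max_retry = 30
#   audit_batch_size = 10
#   audit_greenpool_size = 4
#   certificate_timeout_secs = 10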


class CertificateMonManager(periodic_task.PeriodicTasks):

    def __init__(self):
        super(CertificateMonManager, self).__init__(CONF)
        self.mon_threads = []
        self.audit_thread = None
        self.token_cache = utils.TokenCache('internal')
        self.dc_token_cache = utils.TokenCache('dc')
        self.dc_monitor = None
        self.restapicert_monitor = None
        self.registrycert_monitor = None
        self.openldapcert_monitor = None
        self.reattempt_monitor_tasks = []
        self.sc_audit_queue = subcloud_audit_queue.SubcloudAuditPriorityQueue()
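        # Subcloud audits are scheduled through a priority queue ordered
        # by next-audit timestamp (see audit_sc_cert_task below)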
        if CONF.certmon.audit_greenpool_size > 0:
            self.sc_audit_pool = eventlet.greenpool.GreenPool(
                size=CONF.certmon.audit_greenpool_size)
        else:
            self.sc_audit_pool = None

    def periodic_tasks(self, context, raise_on_error=False):
        """Tasks to be run at a periodic interval."""
        return self.run_periodic_tasks(context, raise_on_error=raise_on_error)

    @periodic_task.periodic_task(spacing=CONF.certmon.audit_interval)
    def audit_sc_cert_start(self, context):
        """Kick off an audit of all subclouds.

        By default this task runs once every 24 hours.
        """
        dc_role = utils.get_dc_role()
        if dc_role != constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
            # Do nothing if this is not the system controller
            return

        all_subclouds = utils.get_subclouds_from_dcmanager(
            self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
        )
        LOG.info("Periodic: begin subcloud certificate audit: %d subclouds"
                 % len(all_subclouds))
        for sc in all_subclouds:
            try:
                self.sc_audit_queue.enqueue(
                    subcloud_audit_queue.SubcloudAuditData(sc['name']))
            except subcloud_audit_queue.SubcloudAuditException as exc:
                # Log as a warning only: this can happen when a certificate
                # watch fires at nearly the same time as this periodic audit
                LOG.warning("Failed to enqueue subcloud audit: %s", str(exc))

    def on_start_audit(self):
        """Audit subclouds on service start.

        Audits all subclouds that are out-of-sync, or every subcloud if
        startup_audit_all is enabled.
        """
        dc_role = utils.get_dc_role()
        if dc_role != constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
            # Do nothing if this is not the system controller
            return

        if CONF.certmon.startup_audit_all:
            LOG.info("Service start startup_audit_all: audit all subclouds")
            self.audit_sc_cert_start(None)
            return

        all_subclouds = utils.get_subclouds_from_dcmanager(
            self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
        )
        LOG.info(
            "Service start: begin subcloud certificate audit [#sc: %d, batch: %s]"
            % (len(all_subclouds), CONF.certmon.audit_batch_size)
        )
        for subcloud in all_subclouds:
            if subcloud[utils.ENDPOINT_TYPE_DC_CERT] != utils.SYNC_STATUS_IN_SYNC:
                subcloud_name = subcloud['name']
                if self.sc_audit_queue.contains(subcloud_name):
                    LOG.info("%s is not in-sync but already under audit"
                             % subcloud_name)
                else:
                    LOG.info("%s is not in-sync, adding it to audit"
                             % subcloud_name)
                    self.sc_audit_queue.enqueue(
                        subcloud_audit_queue.SubcloudAuditData(subcloud_name))

        if self.sc_audit_queue.qsize() > 0:
            LOG.info("Startup audit: %d subcloud(s) to be audited" %
                     self.sc_audit_queue.qsize())
        else:
            LOG.info("Startup audit: all subclouds are in-sync")

    @periodic_task.periodic_task(spacing=5)
    def audit_sc_cert_task(self, context):
        """Run queued subclouds through the next step of the audit process.

        This task runs every 5 seconds. It pulls up to audit_batch_size
        ready-to-audit items from sc_audit_queue and spawns each one via
        the GreenPool (or invokes the audit directly if the GreenPool is
        disabled).
        """
        for batch_count in range(CONF.certmon.audit_batch_size):
            if self.sc_audit_queue.qsize() < 1:
                # Nothing to do
                return

            # Only continue if the next item in the queue is ready to be
            # audited: peek at the timestamp of the head of the queue
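            # NOTE: this assumes SubcloudAuditPriorityQueue stores
            # (timestamp, audit_item) tuples on its underlying heap, so
            # queue[0][0] is the earliest scheduled audit time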
            next_audit_timestamp = self.sc_audit_queue.queue[0][0]
            if next_audit_timestamp > int(time.time()):
                LOG.debug("audit_sc_cert_task: no audits ready for "
                          "processing, qsize=%s"
                          % self.sc_audit_queue.qsize())
                return

            _, sc_audit_item = self.sc_audit_queue.get()
            LOG.debug(
                "audit_sc_cert_task: dequeued subcloud audit %s, "
                "qsize:%s, batch:%s" %
                (sc_audit_item, self.sc_audit_queue.qsize(), batch_count))

            # This item is ready for audit
            if self.sc_audit_pool is not None:
                self.sc_audit_pool.spawn_n(self.do_subcloud_audit,
                                           sc_audit_item)
            else:
                self.do_subcloud_audit(sc_audit_item)

            # Yield so spawned audits and other greenthreads can run
            eventlet.sleep()

    def do_subcloud_audit(self, sc_audit_item):
        """A wrapper to ensure the subcloud audit task is marked done
        within sc_audit_queue"""
        try:
            self._subcloud_audit(sc_audit_item)
        except Exception:
            LOG.exception("An error occurred during the subcloud audit task")
        finally:
            self.sc_audit_queue.task_done()

    def _subcloud_audit(self, sc_audit_item):
        """Invoke a subcloud audit"""
        subcloud_name = sc_audit_item.name
        LOG.info("Auditing subcloud %s, attempt #%s [qsize: %s]"
                 % (subcloud_name,
                    sc_audit_item.audit_count,
                    self.sc_audit_queue.qsize()))

        def my_dc_token():
            """Ensure we always have a valid token"""
            return self.dc_token_cache.get_token()

        # Abort the audit if the subcloud is in an invalid deploy status
        subcloud = utils.get_subcloud(my_dc_token(), subcloud_name)
        if subcloud['deploy-status'] in INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES:
            LOG.info(f"Subcloud {subcloud_name} is in an invalid deploy status:"
                     f" {subcloud['deploy-status']}, aborting audit")
            return

        subcloud_sysinv_url = None
        try:
            subcloud_sysinv_url = utils.dc_get_subcloud_sysinv_url(
                subcloud_name, my_dc_token())
            sc_ssl_cert = utils.get_endpoint_certificate(
                subcloud_sysinv_url,
                timeout_secs=CONF.certmon.certificate_timeout_secs)
        except Exception:
            if not utils.is_subcloud_online(subcloud_name, my_dc_token()):
                LOG.warning("Subcloud is not online, aborting audit: %s"
                            % subcloud_name)
                return

            # Handle network-level issues: re-enqueue the subcloud for
            # re-audit until the retry limit is reached
            max_attempts = CONF.certmon.network_max_retry
            if sc_audit_item.audit_count < max_attempts:
                LOG.exception("Cannot retrieve ssl certificate for %s "
                              "via: %s (requeuing audit)"
                              % (subcloud_name, subcloud_sysinv_url))
                self.requeue_audit_subcloud(sc_audit_item,
                                            CONF.certmon.network_retry_interval)
            else:
                LOG.exception("Cannot retrieve ssl certificate for %s via: %s; "
                              "maximum retry limit exceeded [%d], giving up"
                              % (subcloud_name, subcloud_sysinv_url,
                                 max_attempts))
                utils.update_subcloud_status(my_dc_token(), subcloud_name,
                                             utils.SYNC_STATUS_OUT_OF_SYNC)
            return

        try:
            secret = utils.get_sc_intermediate_ca_secret(subcloud_name)
            check_list = ['ca.crt', 'tls.crt', 'tls.key']
            for item in check_list:
                if item not in secret.data:
                    raise Exception('%s certificate data missing: %s'
                                    % (subcloud_name, item))

            txt_ssl_cert = base64.decode_as_text(secret.data['tls.crt'])
            txt_ssl_key = base64.decode_as_text(secret.data['tls.key'])
            txt_ca_cert = base64.decode_as_text(secret.data['ca.crt'])
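            # ca.crt/tls.crt/tls.key follow the standard Kubernetes TLS
            # secret layout; secret data is base64-encoded, so the decoded
            # values above are PEM text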
        except Exception:
            # Handle certificate-level issues
            if not utils.is_subcloud_online(subcloud_name, my_dc_token()):
                LOG.exception("Error getting subcloud intermediate cert. "
                              "Subcloud is not online, aborting audit: %s"
                              % subcloud_name)
                return

            LOG.exception("Cannot audit ssl certificate on %s. "
                          "Certificate is not ready." % subcloud_name)
            # The certificate is not ready, so there is no re-audit; the
            # subcloud will be picked up by a certificate MODIFIED event
            # if it comes back
            return
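
        # Build the expected chain (the subcloud TLS certificate followed
        # by the intermediate CA certificate, both PEM) and compare it with
        # the certificate actually served at the subcloud sysinv endpoint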
        cert_chain = txt_ssl_cert + txt_ca_cert
        if not cutils.verify_intermediate_ca_cert(cert_chain, sc_ssl_cert):
            # The subcloud needs renewal
            LOG.info("Updating %s intermediate CA as it is out-of-sync" %
                     subcloud_name)
            # Re-audit this subcloud after a delay
            self.requeue_audit_subcloud(sc_audit_item)
            try:
                utils.update_subcloud_ca_cert(my_dc_token(),
                                              subcloud_name,
                                              subcloud_sysinv_url,
                                              txt_ca_cert,
                                              txt_ssl_cert,
                                              txt_ssl_key)
            except Exception:
                LOG.exception("Failed to update intermediate CA on %s"
                              % subcloud_name)
                utils.update_subcloud_status(my_dc_token(), subcloud_name,
                                             utils.SYNC_STATUS_OUT_OF_SYNC)
        else:
            LOG.info("%s intermediate CA cert is in-sync" % subcloud_name)
            utils.update_subcloud_status(my_dc_token(), subcloud_name,
                                         utils.SYNC_STATUS_IN_SYNC)

    @periodic_task.periodic_task(spacing=CONF.certmon.retry_interval)
    def retry_monitor_task(self, context):
        # Failed tasks that need to be reattempted are taken care of here
        max_attempts = CONF.certmon.max_retry
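        # Iterate over a snapshot (shallow copy) of the retry list so that
        # tasks can be removed from self.reattempt_monitor_tasks while
        # looping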
        tasks = self.reattempt_monitor_tasks[:]
        num_tasks = len(tasks)
        if num_tasks > 0:
            LOG.info("Start retry_monitor_task: #tasks in queue: %s" %
                     num_tasks)

        # NOTE: this loop can potentially retry ALL subclouds, which
        # may be a resource concern.
        for task in tasks:
            task_id = task.get_id()
            LOG.info("retry_monitor_task: %s, attempt: %s"
                     % (task_id, task.number_of_reattempt))
            if task.run():
                self.reattempt_monitor_tasks.remove(task)
                LOG.info("retry_monitor_task: %s, reattempt has succeeded"
                         % task_id)
            elif task.number_of_reattempt >= max_attempts:
                LOG.error(("retry_monitor_task: %s, maximum attempts (%s) "
                           "have been reached; giving up")
                          % (task_id, max_attempts))
                if task in self.reattempt_monitor_tasks:
                    self.reattempt_monitor_tasks.remove(task)
                # The task has failed
                task.failed()

            # Pause and allow other eventlets to run
            eventlet.sleep(0.1)

        LOG.debug("End retry_monitor_task")

    def start_audit(self):
        LOG.info("Audit interval: %s" % CONF.certmon.audit_interval)
        utils.init_keystone_auth_opts()
        self.audit_thread = eventlet.greenthread.spawn(self.audit_cert_loop)
        self.on_start_audit()

    def init_dc_monitor(self):
        self.dc_monitor = watcher.DC_CertWatcher()
        self.dc_monitor.initialize(
            audit_subcloud=lambda subcloud_name:
                self.audit_subcloud(subcloud_name, allow_requeue=True),
            invalid_deploy_states=INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES)

    def init_restapicert_monitor(self):
        self.restapicert_monitor = watcher.RestApiCert_CertWatcher()
        self.restapicert_monitor.initialize()

    def init_registrycert_monitor(self):
        self.registrycert_monitor = watcher.RegistryCert_CertWatcher()
        self.registrycert_monitor.initialize()

    def init_openldapcert_monitor(self):
        self.openldapcert_monitor = watcher.OpenldapCert_CertWatcher()
        self.openldapcert_monitor.initialize()

    def start_monitor(self, dc_role):
        while True:
            try:
                # init platform cert monitors
                self.init_restapicert_monitor()
                self.init_registrycert_monitor()
                self.init_openldapcert_monitor()

                # init dc monitor only if running in DC role
                if dc_role in (constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER,
                               constants.DISTRIBUTED_CLOUD_ROLE_SUBCLOUD):
                    self.init_dc_monitor()
            except Exception as e:
                LOG.exception(e)
                time.sleep(5)
            else:
                break

        # spawn threads (DC thread spawned only if running in DC role)
        self.mon_threads.append(
            eventlet.greenthread.spawn(self.monitor_cert,
                                       self.restapicert_monitor))
        self.mon_threads.append(
            eventlet.greenthread.spawn(self.monitor_cert,
                                       self.registrycert_monitor))
        self.mon_threads.append(
            eventlet.greenthread.spawn(self.monitor_cert,
                                       self.openldapcert_monitor))

        if dc_role in (constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER,
                       constants.DISTRIBUTED_CLOUD_ROLE_SUBCLOUD):
            self.mon_threads.append(
                eventlet.greenthread.spawn(self.monitor_cert, self.dc_monitor))

    def stop_monitor(self):
        for mon_thread in self.mon_threads:
            mon_thread.kill()
            mon_thread.wait()
        self.mon_threads = []

    def stop_audit(self):
        if self.audit_thread:
            self.audit_thread.kill()
            self.audit_thread.wait()
            self.audit_thread = None

    def audit_cert_loop(self):
        while True:
            try:
                self.run_periodic_tasks(context=None)
                time.sleep(1)
            except greenlet.GreenletExit:
                break
            except Exception as e:
                LOG.exception(e)

    def requeue_audit_subcloud(self, sc_audit_item, delay_secs=60):
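        """Re-enqueue a subcloud audit after delay_secs; no-op if the
        subcloud is already queued."""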
        if not self.sc_audit_queue.contains(sc_audit_item.name):
            self.sc_audit_queue.enqueue(sc_audit_item, delay_secs)

    def audit_subcloud(self, subcloud_name, allow_requeue=False):
        """Enqueue a subcloud audit.

        allow_requeue: This can come from a watch after a DC certificate
        renewal, i.e., outside of the periodic subcloud audit tasks. We
        allow a re-enqueue here with a new delay.
        """
        if self.sc_audit_queue.contains(subcloud_name):
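            # Allow at most two queued entries per subcloud when requeueing
            # (see the enqueued_subcloud_names count check below)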
            if (allow_requeue
                    and self.sc_audit_queue.enqueued_subcloud_names.count(
                        subcloud_name) < 2):
                LOG.info("audit_subcloud: requeuing %s" % subcloud_name)
            else:
                LOG.debug("audit_subcloud: ignoring %s, already in queue"
                          % subcloud_name)
                return

        self.sc_audit_queue.enqueue(
            subcloud_audit_queue.SubcloudAuditData(subcloud_name),
            allow_requeue=allow_requeue)

    def subcloud_sysinv_endpoint_update(self, subcloud_name, sysinv_url):
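        """Update the stored sysinv endpoint for a subcloud if it has
        changed."""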
        dc_token = self.dc_token_cache.get_token()
        subcloud_sysinv_url = utils.dc_get_subcloud_sysinv_url(
            subcloud_name, dc_token)
        if subcloud_sysinv_url != sysinv_url:
            utils.dc_update_subcloud_sysinv_url(
                subcloud_name, sysinv_url, dc_token)

    def monitor_cert(self, monitor):
        while True:
            # never exit until exit signal received
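            # start_watch blocks while dispatching watch events: on_success
            # purges any pending retry for a task, while on_error queues
            # the task for the periodic retry_monitor_task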
            try:
                monitor.start_watch(
                    on_success=lambda task_id:
                        self._purge_reattempt_monitor_task(task_id,
                                                           'on success'),
                    on_error=lambda task:
                        self._add_reattempt_monitor_task(task))
            except greenlet.GreenletExit:
                break
            except Exception:
                # It shouldn't fall through to here, but log and restart
                # the watch if it does
                LOG.exception("Unexpected exception from start_watch")
                time.sleep(1)

    def _add_reattempt_monitor_task(self, task):
        task_id = task.get_id()
        self._purge_reattempt_monitor_task(task_id, 'for new reattempt')
        self.reattempt_monitor_tasks.append(task)

    def _purge_reattempt_monitor_task(self, task_id, reason_msg):
        for t in self.reattempt_monitor_tasks:
            if t.get_id() == task_id:
                self.reattempt_monitor_tasks.remove(t)
                LOG.info("Purging reattempt monitor task %s: %s"
                         % (reason_msg, task_id))
                break
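

# A minimal usage sketch (for illustration only; the actual wiring lives
# in the cert_mon service startup code, which is outside this module):
#
#   manager = CertificateMonManager()
#   manager.start_monitor(utils.get_dc_role())
#   manager.start_audit()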