diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index dd2c65dedb..4111acf907 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -34,6 +34,7 @@ import filecmp import glob import hashlib import io +import json import math import os import re @@ -50,6 +51,7 @@ import uuid import xml.etree.ElementTree as ElementTree from contextlib import contextmanager from datetime import datetime +from datetime import timedelta from distutils.util import strtobool from distutils.version import LooseVersion from copy import deepcopy @@ -158,13 +160,6 @@ conductor_opts = [ cfg.IntOpt('fw_update_small_timeout', default=300, help='Timeout interval in seconds for a small device image'), - cfg.IntOpt('config_out_of_date_timeout', - default=600, - help=('Timeout interval in seconds to consider a ' - 'host is stuck in out-of-date config status.')), - cfg.IntOpt('max_runtime_config_history_size', - default=1000, - help='Max number of records allowed on runtime config history') ] audit_intervals_opts = [ @@ -182,6 +177,7 @@ audit_intervals_opts = [ cfg.IntOpt('k8s_application', default=60), cfg.IntOpt('device_image_update', default=300), cfg.IntOpt('kube_upgrade_states', default=1800), + cfg.IntOpt('prune_runtime_config', default=43200), ] CONF = cfg.CONF @@ -296,9 +292,6 @@ class ConductorManager(service.PeriodicService): # track deferred runtime config which need to be applied self._host_deferred_runtime_config = [] - # store the config history to reapply if necessary - self._host_runtime_config_history = {} - # track whether runtime class apply may be in progress self._runtime_class_apply_in_progress = [] @@ -401,6 +394,9 @@ class ConductorManager(service.PeriodicService): self._clear_partition_config_flags() + # Runtime config tasks + self._prune_runtime_config_table() + LOG.info("sysinv-conductor start committed system=%s" % system.as_dict()) @@ -6171,6 +6167,7 @@ class ConductorManager(service.PeriodicService): config_uuid = imsg_dict['config_applied'] self._update_host_config_applied(context, ihost, config_uuid) + self._update_runtime_config_status(ihost, config_uuid, imsg_dict.get('status')) def initial_inventory_completed(self, context, host_uuid): host_uuid.strip() @@ -6829,6 +6826,85 @@ class ConductorManager(service.PeriodicService): return True + def _audit_pending_runtime_config(self): + """Query runtime config table for pending requests""" + expired_date = datetime.utcnow() - \ + timedelta(seconds=constants.RUNTIME_CONFIG_APPLY_TIMEOUT_IN_SECS) + pending_runtime_config = self.dbapi.runtime_config_get_all( + state=constants.RUNTIME_CONFIG_STATE_PENDING, + older_than=expired_date) + + if not pending_runtime_config: + return + + LOG.info("Found stale runtime config entries, retrying the requests...") + for rc in pending_runtime_config: + try: + host = self.dbapi.ihost_get(rc.forihostid) + config_uuid = rc.config_uuid + config_dict = json.loads(rc.config_dict) + config_dict.update({"host_uuids": [host.uuid]}) + config_type = config_dict["config_type"] + force = config_dict["force"] if "force" in config_dict else False + + # retry sending the runtime config only to the specific host + LOG.info("Attempting to reapply target config %s to host %s." % ( + config_uuid, host.hostname)) + self._update_host_deferred_runtime_config( + config_type, + config_uuid, + config_dict, + force) + except exception.ServerNotFound as e: + LOG.warn("Skipping request: %s" % e) + + # update the runtime config entry state in the database + rc_update_values = {"state": constants.RUNTIME_CONFIG_STATE_RETRIED} + self.dbapi.runtime_config_update(rc.id, rc_update_values) + + def _audit_config_out_of_date_hosts(self): + """Get alarms with ID 250.001 and check if any of them + is older than RUNTIME_CONFIG_APPLY_TIMEOUT_IN_SECS seconds. + """ + config_out_of_date_hosts = [] + alarms = self.fm_api.get_faults_by_id( + fm_constants.FM_ALARM_ID_SYSCONFIG_OUT_OF_DATE) + if not alarms: + return + + for alarm in alarms: + alarm_ts = datetime.strptime(alarm.timestamp, "%Y-%m-%d %H:%M:%S.%f") + if (datetime.utcnow() - alarm_ts).total_seconds() > \ + constants.RUNTIME_CONFIG_APPLY_TIMEOUT_IN_SECS: + config_out_of_date_hosts.append(alarm.entity_instance_id.split("=")[1]) + + # try to automatically recover out-of-date hosts + # by retrying the runtime manifest apply on them + for hostname in config_out_of_date_hosts: + try: + host = self.dbapi.ihost_get_by_hostname(hostname) + config_uuid = host.config_target + host_id = host.id + + rc = self.dbapi.runtime_config_get(config_uuid, host_id=host_id) + config_dict = json.loads(rc.config_dict) + config_dict.update({"host_uuids": [host.uuid]}) + config_type = config_dict["config_type"] + force = config_dict["force"] if "force" in config_dict else False + + LOG.info("Attempting to reapply target config %s to host %s." % ( + config_uuid, host.hostname)) + self._update_host_deferred_runtime_config( + config_type, + config_uuid, + config_dict, + force) + except exception.NodeNotFound as e: + LOG.warn("Host not found: %s" % e) + except Exception as e: + LOG.warn("Unable to reapply target config %s to host %s, host may require " + "manual lock/unlock to recover: %s" % (config_uuid, host.hostname, e)) + def _audit_deferred_runtime_config(self, context): """With rlock, apply deferred config runtime manifests when ready""" @@ -6883,68 +6959,13 @@ class ConductorManager(service.PeriodicService): with self.rlock_runtime_config: _cs_audit_deferred_runtime_config(self, context) - def _prune_host_runtime_config_history(self): - # prune oldest runtime config from history to keep - # at most max_runtime_config_history_size records - max_history_size = CONF.conductor.max_runtime_config_history_size - current_history_size = len(self._host_runtime_config_history) - if current_history_size < max_history_size: - return - - sorted_history = sorted(self._host_runtime_config_history.items(), - key=lambda x: x[1]["created_at"]) - for i in range(0, current_history_size - max_history_size): - self._host_runtime_config_history.pop(sorted_history[i][0]) - LOG.info("Pruned config '%s' from runtime config history" % - sorted_history[i][0]) - - def _add_stuck_config_out_of_date_to_deferred(self): - # call runtime config history pruning - self._prune_host_runtime_config_history() - - # get alarms with ID 250.001 and check if any of them - # is older than config_out_of_date_timeout seconds - config_out_of_date_hosts = [] - alarms = self.fm_api.get_faults_by_id( - fm_constants.FM_ALARM_ID_SYSCONFIG_OUT_OF_DATE) - if not alarms: - return - - for alarm in alarms: - alarm_ts = datetime.strptime(alarm.timestamp, "%Y-%m-%d %H:%M:%S.%f") - if (datetime.utcnow() - alarm_ts).total_seconds() > \ - CONF.conductor.config_out_of_date_timeout: - config_out_of_date_hosts.append(alarm.entity_instance_id.split("=")[1]) - - # try to automatically recover out-of-date hosts - # by resending the runtime manifest to them - for hostname in config_out_of_date_hosts: - ihost = self.dbapi.ihost_get_by_hostname(hostname) - ihost_uuid = ihost.uuid - config_uuid = ihost.config_target - try: - history_config_dict = deepcopy( - self._host_runtime_config_history[config_uuid]) - force = history_config_dict["force"] \ - if "force" in history_config_dict else False - config_type = history_config_dict["config_type"] - history_config_dict.update({"host_uuids": [ihost_uuid]}) - - LOG.info("Attempting to resend target config '%s' to host %s '%s'" % ( - config_uuid, ihost.hostname, ihost_uuid)) - self._update_host_deferred_runtime_config( - config_type, - config_uuid, - history_config_dict, - force) - except Exception as e: - LOG.warn("Unable to retrigger '%s' from runtime config history: %s" % ( - config_uuid, e)) - @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.deferred_runtime_config) def _audit_deferred_runtime_config_periodic(self, context): # check for possibly stuck out-of-date config hosts - self._add_stuck_config_out_of_date_to_deferred() + self._audit_config_out_of_date_hosts() + + # check for runtime config entries in pending status + self._audit_pending_runtime_config() # check whether there are deferred runtime manifests to apply self._audit_deferred_runtime_config(context) @@ -12151,6 +12172,26 @@ class ConductorManager(service.PeriodicService): _sync_update_host_config_applied(self, context, ihost_obj, config_uuid) + def _update_runtime_config_status(self, ihost, config_uuid, status=None): + """Check report status and update runtime_config entry. Ignore if + runtime_config entry is not found, as it should not be a blocking + issue for the system to operate. + + :param ihost: host corresponding to the runtime_config entry + :param config_uuid: target_config uuid retried on the host + :param status: runtime_config apply state returned from puppet + """ + try: + runtime_config = self.dbapi.runtime_config_get(config_uuid, host_id=ihost.id) + if status == puppet_common.REPORT_FAILURE: + runtime_state = constants.RUNTIME_CONFIG_STATE_FAILED + else: + runtime_state = constants.RUNTIME_CONFIG_STATE_APPLIED + self.dbapi.runtime_config_update(runtime_config.id, {"state": runtime_state}) + except exception.NotFound: + LOG.warn("Host %s applied config %s, which does not exist on " + "the database." % (ihost.hostname, config_uuid)) + def _config_reinstall_hosts(self, context, personalities): """ update the hosts configuration status for all host to be " reinstall is required. @@ -12322,9 +12363,6 @@ class ConductorManager(service.PeriodicService): iconfig_uuid=config_uuid, iconfig_dict=config_dict) config_dict["config_type"] = CONFIG_UPDATE_FILE - config_dict["created_at"] = datetime.utcnow() - if config_uuid not in self._host_runtime_config_history: - self._host_runtime_config_history[config_uuid] = config_dict except Exception as e: LOG.info("Error: %s" % str(e)) return False @@ -12466,6 +12504,44 @@ class ConductorManager(service.PeriodicService): return True + def _prune_runtime_config_table(self): + """Prune runtime_config entries older than 24 hours""" + cutoff_date = datetime.utcnow() - timedelta(hours=24) + LOG.info("Pruning runtime_config entries older than %s." % cutoff_date) + self.dbapi.runtime_config_prune(cutoff_date) + + def _create_runtime_config_entries(self, config_uuid, config_dict): + """Create runtime config entries in the database""" + # it is expected for config_dict to contain the host_uuids + # to which the runtime config must be applied, but the + # database entry is stored without the 'host_uuids' key + # since there should be one entry per host on the table + host_uuids = config_dict.get("host_uuids") + if not host_uuids: + host_uuids = [] + personalities = config_dict.get("personalities") + for personality in personalities: + hosts = self.dbapi.ihost_get_by_personality(personality) + for host in hosts: + host_uuids.append(host.uuid) + + tmp_config_dict = deepcopy(config_dict) + tmp_config_dict.pop("host_uuids", None) + + for host_uuid in host_uuids: + host = self.dbapi.ihost_get(host_uuid) + runtime_config = { + "config_uuid": config_uuid, + "config_dict": json.dumps(tmp_config_dict), + "forihostid": host.id, + } + try: + self.dbapi.runtime_config_create(runtime_config) + except Exception: + # can be ignored as runtime_config can + # already exists in the retry scenario + pass + def _config_apply_runtime_manifest(self, context, config_uuid, @@ -12571,9 +12647,7 @@ class ConductorManager(service.PeriodicService): config_uuid=config_uuid, config_dict=config_dict) config_dict["config_type"] = CONFIG_APPLY_RUNTIME_MANIFEST - config_dict["created_at"] = datetime.utcnow() - if config_uuid not in self._host_runtime_config_history: - self._host_runtime_config_history[config_uuid] = config_dict + self._create_runtime_config_entries(config_uuid, config_dict) if filter_classes: classes = [config_class for config_class in config_dict['classes'] if config_class in filter_classes] @@ -18234,6 +18308,10 @@ class ConductorManager(service.PeriodicService): except exception.NotFound: LOG.debug("A kubernetes upgrade is not in progress") + @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.prune_runtime_config) + def _audit_prune_runtime_config(self): + self._prune_runtime_config_table() + def device_image_state_sort_key(dev_img_state): if dev_img_state.bitstream_type == dconstants.BITSTREAM_TYPE_ROOT_KEY: