Query stale runtime config and reapply

This commit makes use of the new runtime_config table to
query pending runtime config requests (i.e. requests that were
not reported back as successful by the agent within a defined
timeout) and to retry those requests.
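
For illustration, the retry condition can be sketched as follows
(a sketch only: the timeout default and the state literal are
assumptions, since the real definitions live in sysinv constants
and in the dependent change):

    # sketch: when a runtime_config entry qualifies for the single
    # retry described in the notes below
    from datetime import datetime, timedelta

    RUNTIME_CONFIG_APPLY_TIMEOUT_IN_SECS = 600   # assumed default
    RUNTIME_CONFIG_STATE_PENDING = "pending"     # assumed literal

    def is_stale(state, created_at, now=None):
        now = now or datetime.utcnow()
        expired_date = now - timedelta(
            seconds=RUNTIME_CONFIG_APPLY_TIMEOUT_IN_SECS)
        return (state == RUNTIME_CONFIG_STATE_PENDING and
                created_at < expired_date)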

This partially reverts commit f94629b10d,
since this change improves on the same idea using different
storage and constants. The feature implemented by the referenced
commit, which queries out-of-date hosts and retries the runtime
config manifest only on those specific hosts, remains in place.

Notes:
1. A pruning mechanism was implemented to run when the conductor
   process starts and then periodically, twice a day, so that only
   the runtime config entries created in the last 24 hours are kept
2. The retry mechanism for pending runtime config requests retries
   each request only once, for now (the resulting entry states are
   sketched after these notes)
3. Storing the runtime config information does not apply to file
   update config requests (e.g. license-install), because in that
   case the config dict contains the full file content, which would
   require a very large table field to store such requests
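
The resulting handling of a runtime_config entry, mirroring
_update_runtime_config_status and _audit_pending_runtime_config in
the diff below, can be sketched as (the literal state values and the
puppet report constant are assumptions):

    # sketch: state of a runtime_config entry after the agent reports
    # back, or after the apply timeout expires without a report
    RUNTIME_CONFIG_STATE_APPLIED = "applied"   # assumed literals
    RUNTIME_CONFIG_STATE_FAILED = "failed"
    RUNTIME_CONFIG_STATE_RETRIED = "retried"
    REPORT_FAILURE = "failure"                 # assumed report value

    def next_state(report_status, timed_out):
        if timed_out:
            # no report within the timeout: the request is re-queued
            # once as a deferred runtime config and marked retried so
            # it is not picked up again
            return RUNTIME_CONFIG_STATE_RETRIED
        if report_status == REPORT_FAILURE:
            return RUNTIME_CONFIG_STATE_FAILED
        return RUNTIME_CONFIG_STATE_APPLIED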

Test Plan
PASS: install/bootstrap/unlock single and multinode deployments
PASS: simulate an error applying runtime config and verify the
      conductor retries the runtime manifest after the timeout passes
PASS: simulate an out-of-date alarm and verify the conductor retries
      the runtime manifest after the timeout passes
PASS: DX upgrade stx-6 to stx-8
PASS: SX upgrade stx-6 to stx-8

Closes-bug: 2037090

Depends-on: https://review.opendev.org/c/starlingx/config/+/893567

Change-Id: I24362cb79b6d638d4764467f9dd58832c4381578
Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>

@@ -34,6 +34,7 @@ import filecmp
import glob
import hashlib
import io
import json
import math
import os
import re
@@ -50,6 +51,7 @@ import uuid
import xml.etree.ElementTree as ElementTree
from contextlib import contextmanager
from datetime import datetime
from datetime import timedelta
from distutils.util import strtobool
from distutils.version import LooseVersion
from copy import deepcopy
@@ -158,13 +160,6 @@ conductor_opts = [
cfg.IntOpt('fw_update_small_timeout',
default=300,
help='Timeout interval in seconds for a small device image'),
cfg.IntOpt('config_out_of_date_timeout',
default=600,
help=('Timeout interval in seconds to consider a '
'host is stuck in out-of-date config status.')),
cfg.IntOpt('max_runtime_config_history_size',
default=1000,
help='Max number of records allowed on runtime config history')
]
audit_intervals_opts = [
@@ -182,6 +177,7 @@ audit_intervals_opts = [
cfg.IntOpt('k8s_application', default=60),
cfg.IntOpt('device_image_update', default=300),
cfg.IntOpt('kube_upgrade_states', default=1800),
cfg.IntOpt('prune_runtime_config', default=43200),
]
CONF = cfg.CONF
@@ -296,9 +292,6 @@ class ConductorManager(service.PeriodicService):
# track deferred runtime config which need to be applied
self._host_deferred_runtime_config = []
# store the config history to reapply if necessary
self._host_runtime_config_history = {}
# track whether runtime class apply may be in progress
self._runtime_class_apply_in_progress = []
@@ -401,6 +394,9 @@ class ConductorManager(service.PeriodicService):
self._clear_partition_config_flags()
# Runtime config tasks
self._prune_runtime_config_table()
LOG.info("sysinv-conductor start committed system=%s" %
system.as_dict())
@@ -6077,6 +6073,7 @@ class ConductorManager(service.PeriodicService):
config_uuid = imsg_dict['config_applied']
self._update_host_config_applied(context, ihost, config_uuid)
self._update_runtime_config_status(ihost, config_uuid, imsg_dict.get('status'))
def initial_inventory_completed(self, context, host_uuid):
host_uuid.strip()
@@ -6735,6 +6732,85 @@ class ConductorManager(service.PeriodicService):
return True
def _audit_pending_runtime_config(self):
"""Query runtime config table for pending requests"""
expired_date = datetime.utcnow() - \
timedelta(seconds=constants.RUNTIME_CONFIG_APPLY_TIMEOUT_IN_SECS)
pending_runtime_config = self.dbapi.runtime_config_get_all(
state=constants.RUNTIME_CONFIG_STATE_PENDING,
older_than=expired_date)
if not pending_runtime_config:
return
LOG.info("Found stale runtime config entries, retrying the requests...")
for rc in pending_runtime_config:
try:
host = self.dbapi.ihost_get(rc.forihostid)
config_uuid = rc.config_uuid
config_dict = json.loads(rc.config_dict)
config_dict.update({"host_uuids": [host.uuid]})
config_type = config_dict["config_type"]
force = config_dict["force"] if "force" in config_dict else False
# retry sending the runtime config only to the specific host
LOG.info("Attempting to reapply target config %s to host %s." % (
config_uuid, host.hostname))
self._update_host_deferred_runtime_config(
config_type,
config_uuid,
config_dict,
force)
except exception.ServerNotFound as e:
LOG.warn("Skipping request: %s" % e)
# update the runtime config entry state in the database
rc_update_values = {"state": constants.RUNTIME_CONFIG_STATE_RETRIED}
self.dbapi.runtime_config_update(rc.id, rc_update_values)
def _audit_config_out_of_date_hosts(self):
"""Get alarms with ID 250.001 and check if any of them
is older than RUNTIME_CONFIG_APPLY_TIMEOUT_IN_SECS seconds.
"""
config_out_of_date_hosts = []
alarms = self.fm_api.get_faults_by_id(
fm_constants.FM_ALARM_ID_SYSCONFIG_OUT_OF_DATE)
if not alarms:
return
for alarm in alarms:
alarm_ts = datetime.strptime(alarm.timestamp, "%Y-%m-%d %H:%M:%S.%f")
if (datetime.utcnow() - alarm_ts).total_seconds() > \
constants.RUNTIME_CONFIG_APPLY_TIMEOUT_IN_SECS:
config_out_of_date_hosts.append(alarm.entity_instance_id.split("=")[1])
# try to automatically recover out-of-date hosts
# by retrying the runtime manifest apply on them
for hostname in config_out_of_date_hosts:
try:
host = self.dbapi.ihost_get_by_hostname(hostname)
config_uuid = host.config_target
host_id = host.id
rc = self.dbapi.runtime_config_get(config_uuid, host_id=host_id)
config_dict = json.loads(rc.config_dict)
config_dict.update({"host_uuids": [host.uuid]})
config_type = config_dict["config_type"]
force = config_dict["force"] if "force" in config_dict else False
LOG.info("Attempting to reapply target config %s to host %s." % (
config_uuid, host.hostname))
self._update_host_deferred_runtime_config(
config_type,
config_uuid,
config_dict,
force)
except exception.NodeNotFound as e:
LOG.warn("Host not found: %s" % e)
except Exception as e:
LOG.warn("Unable to reapply target config %s to host %s, host may require "
"manual lock/unlock to recover: %s" % (config_uuid, host.hostname, e))
def _audit_deferred_runtime_config(self, context):
"""With rlock, apply deferred config runtime manifests when ready"""
@@ -6789,68 +6865,13 @@ class ConductorManager(service.PeriodicService):
with self.rlock_runtime_config:
_cs_audit_deferred_runtime_config(self, context)
def _prune_host_runtime_config_history(self):
# prune oldest runtime config from history to keep
# at most max_runtime_config_history_size records
max_history_size = CONF.conductor.max_runtime_config_history_size
current_history_size = len(self._host_runtime_config_history)
if current_history_size < max_history_size:
return
sorted_history = sorted(self._host_runtime_config_history.items(),
key=lambda x: x[1]["created_at"])
for i in range(0, current_history_size - max_history_size):
self._host_runtime_config_history.pop(sorted_history[i][0])
LOG.info("Pruned config '%s' from runtime config history" %
sorted_history[i][0])
def _add_stuck_config_out_of_date_to_deferred(self):
# call runtime config history pruning
self._prune_host_runtime_config_history()
# get alarms with ID 250.001 and check if any of them
# is older than config_out_of_date_timeout seconds
config_out_of_date_hosts = []
alarms = self.fm_api.get_faults_by_id(
fm_constants.FM_ALARM_ID_SYSCONFIG_OUT_OF_DATE)
if not alarms:
return
for alarm in alarms:
alarm_ts = datetime.strptime(alarm.timestamp, "%Y-%m-%d %H:%M:%S.%f")
if (datetime.utcnow() - alarm_ts).total_seconds() > \
CONF.conductor.config_out_of_date_timeout:
config_out_of_date_hosts.append(alarm.entity_instance_id.split("=")[1])
# try to automatically recover out-of-date hosts
# by resending the runtime manifest to them
for hostname in config_out_of_date_hosts:
ihost = self.dbapi.ihost_get_by_hostname(hostname)
ihost_uuid = ihost.uuid
config_uuid = ihost.config_target
try:
history_config_dict = deepcopy(
self._host_runtime_config_history[config_uuid])
force = history_config_dict["force"] \
if "force" in history_config_dict else False
config_type = history_config_dict["config_type"]
history_config_dict.update({"host_uuids": [ihost_uuid]})
LOG.info("Attempting to resend target config '%s' to host %s '%s'" % (
config_uuid, ihost.hostname, ihost_uuid))
self._update_host_deferred_runtime_config(
config_type,
config_uuid,
history_config_dict,
force)
except Exception as e:
LOG.warn("Unable to retrigger '%s' from runtime config history: %s" % (
config_uuid, e))
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.deferred_runtime_config)
def _audit_deferred_runtime_config_periodic(self, context):
# check for possibly stuck out-of-date config hosts
self._add_stuck_config_out_of_date_to_deferred()
self._audit_config_out_of_date_hosts()
# check for runtime config entries in pending status
self._audit_pending_runtime_config()
# check whether there are deferred runtime manifests to apply
self._audit_deferred_runtime_config(context)
@@ -12051,6 +12072,26 @@ class ConductorManager(service.PeriodicService):
_sync_update_host_config_applied(self, context, ihost_obj, config_uuid)
def _update_runtime_config_status(self, ihost, config_uuid, status=None):
"""Check report status and update runtime_config entry. Ignore if
runtime_config entry is not found, as it should not be a blocking
issue for the system to operate.
:param ihost: host corresponding to the runtime_config entry
:param config_uuid: target_config uuid retried on the host
:param status: runtime_config apply state returned from puppet
"""
try:
runtime_config = self.dbapi.runtime_config_get(config_uuid, host_id=ihost.id)
if status == puppet_common.REPORT_FAILURE:
runtime_state = constants.RUNTIME_CONFIG_STATE_FAILED
else:
runtime_state = constants.RUNTIME_CONFIG_STATE_APPLIED
self.dbapi.runtime_config_update(runtime_config.id, {"state": runtime_state})
except exception.NotFound:
LOG.warn("Host %s applied config %s, which does not exist on "
"the database." % (ihost.hostname, config_uuid))
def _config_reinstall_hosts(self, context, personalities):
""" update the hosts configuration status for all host to be "
reinstall is required.
@@ -12222,9 +12263,6 @@ class ConductorManager(service.PeriodicService):
iconfig_uuid=config_uuid,
iconfig_dict=config_dict)
config_dict["config_type"] = CONFIG_UPDATE_FILE
config_dict["created_at"] = datetime.utcnow()
if config_uuid not in self._host_runtime_config_history:
self._host_runtime_config_history[config_uuid] = config_dict
except Exception as e:
LOG.info("Error: %s" % str(e))
return False
@@ -12366,6 +12404,44 @@ class ConductorManager(service.PeriodicService):
return True
def _prune_runtime_config_table(self):
"""Prune runtime_config entries older than 24 hours"""
cutoff_date = datetime.utcnow() - timedelta(hours=24)
LOG.info("Pruning runtime_config entries older than %s." % cutoff_date)
self.dbapi.runtime_config_prune(cutoff_date)
def _create_runtime_config_entries(self, config_uuid, config_dict):
"""Create runtime config entries in the database"""
# it is expected for config_dict to contain the host_uuids
# to which the runtime config must be applied, but the
# database entry is stored without the 'host_uuids' key
# since there should be one entry per host on the table
host_uuids = config_dict.get("host_uuids")
if not host_uuids:
host_uuids = []
personalities = config_dict.get("personalities")
for personality in personalities:
hosts = self.dbapi.ihost_get_by_personality(personality)
for host in hosts:
host_uuids.append(host.uuid)
tmp_config_dict = deepcopy(config_dict)
tmp_config_dict.pop("host_uuids", None)
for host_uuid in host_uuids:
host = self.dbapi.ihost_get(host_uuid)
runtime_config = {
"config_uuid": config_uuid,
"config_dict": json.dumps(tmp_config_dict),
"forihostid": host.id,
}
try:
self.dbapi.runtime_config_create(runtime_config)
except Exception:
# can be ignored, as the runtime_config entry may
# already exist in the retry scenario
pass
def _config_apply_runtime_manifest(self,
context,
config_uuid,
@@ -12471,9 +12547,7 @@ class ConductorManager(service.PeriodicService):
config_uuid=config_uuid,
config_dict=config_dict)
config_dict["config_type"] = CONFIG_APPLY_RUNTIME_MANIFEST
config_dict["created_at"] = datetime.utcnow()
if config_uuid not in self._host_runtime_config_history:
self._host_runtime_config_history[config_uuid] = config_dict
self._create_runtime_config_entries(config_uuid, config_dict)
if filter_classes:
classes = [config_class for config_class in config_dict['classes'] if config_class in filter_classes]
@@ -18042,6 +18116,10 @@ class ConductorManager(service.PeriodicService):
except exception.NotFound:
LOG.debug("A kubernetes upgrade is not in progress")
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.prune_runtime_config)
def _audit_prune_runtime_config(self):
self._prune_runtime_config_table()
def device_image_state_sort_key(dev_img_state):
if dev_img_state.bitstream_type == dconstants.BITSTREAM_TYPE_ROOT_KEY: