Split _conductor_audit into individual audits

Currently, sysinv conductor audit _conductor_audit has a set of several sub audits. The purpose of this story is to split this audit into individual audits. Note that this change will keep the same configuration for the periodic intervals, for now, just the logic of the code has changed. TEST PLAN: PASS: AIO-SX: manually replaced these files into a Debian installation and no crashes happened. PASS: rebuild the whole system with the modification with no crashes. PASS: install the new iso inside a lab including bootstrap and first 'host-unlock'. PASS: AIO-SX: verify if the sysinv conductor audits (periodic tasks) are being called. PASS: change the install_state interval to different values (for test purposes) PASS: follow the sysinv logs seeking for errors. No error was found. PASS: guarantee that all audits are being called by the periodic tasks. Story: 2010087 Task: 45646 Depends-On: https://review.opendev.org/c/starlingx/config/+/848330 Signed-off-by: Bruno Costa <bruno.costa@windriver.com> Change-Id: I215fae7ccbbbaadd7b93f5a8efc11df0834d411a
2022-07-01 08:57:29 -03:00 · 2022-07-01 08:57:29 -03:00 · d2e2a67224
parent a71c4b6c4c
commit d2e2a67224
1 changed files with 97 additions and 107 deletions
--- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
@ -5657,6 +5657,8 @@ class ConductorManager(service.PeriodicService):

        return False

+    @periodic_task.periodic_task(
+        spacing=CONF.conductor_periodic_task_intervals.controller_config_active_apply)
    def _controller_config_active_apply(self, context):
        """Check whether target config has been applied to active
           controller to run postprocessing"""
@ -5768,86 +5770,91 @@ class ConductorManager(service.PeriodicService):

        return all_fs_resized

-    def _audit_ihost_action(self, ihost):
+    @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.ihost_action)
+    def _audit_ihost_action(self, context):
        """Audit whether the ihost_action needs to be terminated or escalated.
        """
+        hosts = self.dbapi.ihost_get_list()
+        for ihost in hosts:
+            # only audit configured hosts
+            if ihost.personality:
+                if ihost.administrative == constants.ADMIN_UNLOCKED:
+                    ihost_action_str = ihost.ihost_action or ""

-        if ihost.administrative == constants.ADMIN_UNLOCKED:
-            ihost_action_str = ihost.ihost_action or ""
+                    if (ihost_action_str.startswith(constants.FORCE_LOCK_ACTION) or
+                            ihost_action_str.startswith(constants.LOCK_ACTION)):

-            if (ihost_action_str.startswith(constants.FORCE_LOCK_ACTION) or
-                    ihost_action_str.startswith(constants.LOCK_ACTION)):
+                        task_str = ihost.task or ""
+                        if (('--' in ihost_action_str and
+                              ihost_action_str.startswith(
+                                   constants.FORCE_LOCK_ACTION)) or
+                              ('----------' in ihost_action_str and
+                              ihost_action_str.startswith(constants.LOCK_ACTION))):

-                task_str = ihost.task or ""
-                if (('--' in ihost_action_str and
-                      ihost_action_str.startswith(
-                           constants.FORCE_LOCK_ACTION)) or
-                      ('----------' in ihost_action_str and
-                      ihost_action_str.startswith(constants.LOCK_ACTION))):
+                            ihost_mtc = ihost.as_dict()
+                            keepkeys = ['ihost_action', 'vim_progress_status']
+                            ihost_mtc = cutils.removekeys_nonmtce(ihost_mtc,
+                                                                keepkeys)

-                    ihost_mtc = ihost.as_dict()
-                    keepkeys = ['ihost_action', 'vim_progress_status']
-                    ihost_mtc = cutils.removekeys_nonmtce(ihost_mtc,
-                                                          keepkeys)
+                            if ihost_action_str.startswith(constants.FORCE_LOCK_ACTION):
+                                timeout_in_secs = 6
+                                ihost_mtc['operation'] = 'modify'
+                                ihost_mtc['action'] = constants.FORCE_LOCK_ACTION
+                                ihost_mtc['task'] = constants.FORCE_LOCKING
+                                LOG.warn("ihost_action override %s" %
+                                        ihost_mtc)
+                                mtce_api.host_modify(
+                                    self._api_token, self._mtc_address, self._mtc_port,
+                                    ihost_mtc, timeout_in_secs)

-                    if ihost_action_str.startswith(constants.FORCE_LOCK_ACTION):
-                        timeout_in_secs = 6
-                        ihost_mtc['operation'] = 'modify'
-                        ihost_mtc['action'] = constants.FORCE_LOCK_ACTION
-                        ihost_mtc['task'] = constants.FORCE_LOCKING
-                        LOG.warn("ihost_action override %s" %
-                                 ihost_mtc)
-                        mtce_api.host_modify(
-                            self._api_token, self._mtc_address, self._mtc_port,
-                            ihost_mtc, timeout_in_secs)
+                            # need time for FORCE_LOCK mtce to clear
+                            if ('----' in ihost_action_str):
+                                ihost_action_str = ""
+                            else:
+                                ihost_action_str += "-"

-                    # need time for FORCE_LOCK mtce to clear
-                    if ('----' in ihost_action_str):
-                        ihost_action_str = ""
-                    else:
-                        ihost_action_str += "-"
+                            if (task_str.startswith(constants.FORCE_LOCKING) or
+                            task_str.startswith(constants.LOCKING)):
+                                val = {'task': "",
+                                    'ihost_action': ihost_action_str,
+                                    'vim_progress_status': ""}
+                            else:
+                                val = {'ihost_action': ihost_action_str,
+                                    'vim_progress_status': ""}
+                        else:
+                            ihost_action_str += "-"
+                            if (task_str.startswith(constants.FORCE_LOCKING) or
+                            task_str.startswith(constants.LOCKING)):
+                                task_str += "-"
+                                val = {'task': task_str,
+                                    'ihost_action': ihost_action_str}
+                            else:
+                                val = {'ihost_action': ihost_action_str}

+                        self.dbapi.ihost_update(ihost.uuid, val)
+                else:  # Administrative locked already
+                    task_str = ihost.task or ""
                    if (task_str.startswith(constants.FORCE_LOCKING) or
-                       task_str.startswith(constants.LOCKING)):
-                        val = {'task': "",
-                               'ihost_action': ihost_action_str,
-                               'vim_progress_status': ""}
+                    task_str.startswith(constants.LOCKING)):
+                        val = {'task': ""}
+                        self.dbapi.ihost_update(ihost.uuid, val)
+
+                vim_progress_status_str = ihost.get('vim_progress_status') or ""
+                if (vim_progress_status_str and
+                   (vim_progress_status_str != constants.VIM_SERVICES_ENABLED) and
+                   (vim_progress_status_str != constants.VIM_SERVICES_DISABLED)):
+                    if ('..' in vim_progress_status_str):
+                        LOG.info("Audit clearing vim_progress_status=%s" %
+                                vim_progress_status_str)
+                        vim_progress_status_str = ""
                    else:
-                        val = {'ihost_action': ihost_action_str,
-                               'vim_progress_status': ""}
-                else:
-                    ihost_action_str += "-"
-                    if (task_str.startswith(constants.FORCE_LOCKING) or
-                       task_str.startswith(constants.LOCKING)):
-                        task_str += "-"
-                        val = {'task': task_str,
-                               'ihost_action': ihost_action_str}
-                    else:
-                        val = {'ihost_action': ihost_action_str}
+                        vim_progress_status_str += ".."

-                self.dbapi.ihost_update(ihost.uuid, val)
-        else:  # Administrative locked already
-            task_str = ihost.task or ""
-            if (task_str.startswith(constants.FORCE_LOCKING) or
-               task_str.startswith(constants.LOCKING)):
-                val = {'task': ""}
-                self.dbapi.ihost_update(ihost.uuid, val)
+                    val = {'vim_progress_status': vim_progress_status_str}
+                    self.dbapi.ihost_update(ihost.uuid, val)

-        vim_progress_status_str = ihost.get('vim_progress_status') or ""
-        if (vim_progress_status_str and
-           (vim_progress_status_str != constants.VIM_SERVICES_ENABLED) and
-           (vim_progress_status_str != constants.VIM_SERVICES_DISABLED)):
-            if ('..' in vim_progress_status_str):
-                LOG.info("Audit clearing vim_progress_status=%s" %
-                         vim_progress_status_str)
-                vim_progress_status_str = ""
-            else:
-                vim_progress_status_str += ".."
-
-            val = {'vim_progress_status': vim_progress_status_str}
-            self.dbapi.ihost_update(ihost.uuid, val)
-
-    def _audit_upgrade_status(self):
+    @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.upgrade_status)
+    def _audit_upgrade_status(self, context):
        """Audit upgrade related status"""
        try:
            upgrade = self.dbapi.software_upgrade_get_one()
@ -5928,19 +5935,21 @@ class ConductorManager(service.PeriodicService):
                    LOG.info("Ceph Upgrade: Exception %s" % e)
                LOG.info("Ceph Upgrade: Enabled monitor msgr2")

-    def _audit_install_states(self, hosts):
+    @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.install_states)
+    def _audit_install_states(self, context):
        # A node could shutdown during it's installation and the install_state
        # for example could get stuck at the value "installing". To avoid
        # this situation we audit the sanity of the states by appending the
        # character '+' to the states in the database. After 15 minutes of the
        # states not changing, set the install_state to failed.

-        # The audit's interval is 60sec
+        # The duration (in minutes) to wait until the install_state fails
        MAX_COUNT = 15

        # Allow longer duration for booting phase
        MAX_COUNT_BOOTING = 40

+        hosts = self.dbapi.ihost_get_list()
        for host in hosts:
            LOG.debug("Auditing %s, install_state is %s",
                      host.hostname, host.install_state)
@ -5961,9 +5970,16 @@ class ConductorManager(service.PeriodicService):
                    if (install_state != constants.INSTALL_STATE_INSTALLED and
                            install_state !=
                            constants.INSTALL_STATE_COMPLETED):
+                        # define the quantity of '+' signs that will be added to install_state_info
+                        # accordingly to the interval set to this audit.
+                        periodic_interval = max(60,
+                            CONF.conductor_periodic_task_intervals.install_states)
+                        factor = periodic_interval // 60 + \
+                            (1 if periodic_interval % 60 > 0 else 0)
                        if (install_state ==
                                constants.INSTALL_STATE_INSTALLING and
                                host.install_state_info is not None):
+                            host.install_state_info += factor * "+"
                            if host.install_state_info.count('+') >= MAX_COUNT:
                                LOG.info(
                                    "Auditing %s, install_state changed from "
@ -5972,13 +5988,12 @@ class ConductorManager(service.PeriodicService):
                                    constants.INSTALL_STATE_FAILED)
                                host.install_state = \
                                    constants.INSTALL_STATE_FAILED
-                            else:
-                                host.install_state_info += "+"
                        else:
                            if install_state == constants.INSTALL_STATE_BOOTING:
                                max_count = MAX_COUNT_BOOTING
                            else:
                                max_count = MAX_COUNT
+                            host.install_state_info += factor * "+"
                            if host.install_state.count('+') >= max_count:
                                LOG.info(
                                    "Auditing %s, install_state changed from "
@ -5987,8 +6002,6 @@ class ConductorManager(service.PeriodicService):
                                    constants.INSTALL_STATE_FAILED)
                                host.install_state = \
                                    constants.INSTALL_STATE_FAILED
-                            else:
-                                host.install_state += "+"

                # It is possible we get stuck in an installed failed state. For
                # example if a node gets powered down during an install booting
@ -6090,6 +6103,11 @@ class ConductorManager(service.PeriodicService):
                    LOG.error("Removed unsupported deferred config_type %s" %
                              config_type)

+    @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.deferred_runtime_config)
+    def _audit_deferred_runtime_config_periodic(self, context):
+        # check whether there are deferred runtime manifests to apply
+        self._audit_deferred_runtime_config(context)
+
    @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.kubernetes_local_secrets)
    def _kubernetes_local_secrets_audit(self, context):
        # Audit kubernetes local registry secrets info
@ -6097,38 +6115,8 @@ class ConductorManager(service.PeriodicService):
        if self._app:
            self._app.audit_local_registry_secrets(context)

-    @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.default)
-    def _conductor_audit(self, context):
-        # periodically, perform audit of inventory
-        LOG.debug("Sysinv Conductor running periodic audit task.")
-
-        # check whether there are deferred runtime manifests to apply
-        self._audit_deferred_runtime_config(context)
-
-        # check whether we may have just become active with target config
-        self._controller_config_active_apply(context)
-
-        # Audit upgrade status
-        self._audit_upgrade_status()
-
-        hosts = self.dbapi.ihost_get_list()
-
-        # Audit install states
-        self._audit_install_states(hosts)
-
-        # Audit kubernetes node labels
-        self._audit_kubernetes_labels(hosts)
-
-        # Audit image conversion
-        self._audit_image_conversion(hosts)
-
-        for host in hosts:
-            # only audit configured hosts
-            if not host.personality:
-                continue
-            self._audit_ihost_action(host)
-
-    def _audit_kubernetes_labels(self, hosts):
+    @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.kubernetes_labels)
+    def _audit_kubernetes_labels(self, context):
        if not cutils.is_initial_config_complete():
            LOG.debug("_audit_kubernetes_labels skip")
            return
@ -6137,6 +6125,7 @@ class ConductorManager(service.PeriodicService):
        sysinv_labels = self.dbapi.label_get_all()
        nodes = self._kube.kube_get_nodes()

+        hosts = self.dbapi.ihost_get_list()
        for host in hosts:
            try:
                for node in nodes:
@ -6191,14 +6180,15 @@ class ConductorManager(service.PeriodicService):
            elif bk.backend in self._stor_bck_op_timeouts:
                del self._stor_bck_op_timeouts[bk.backend]

-    def _audit_image_conversion(self, hosts):
+    @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.image_conversion)
+    def _audit_image_conversion(self, context):
        """
        Raise alarm if:
           - image-conversion is not added on both controllers;
           - the size of the filesystem is not the same
             on both controllers
        """
-        chosts = [h for h in hosts if h.personality == constants.CONTROLLER]
+        chosts = self.dbapi.ihost_get_by_personality(constants.CONTROLLER)
        if len(chosts) <= 1:
            # No alarm is raised if setup has only one controller
            return