Split _conductor_audit into individual audits

Currently, the sysinv conductor audit _conductor_audit runs a set of
several sub-audits.
The purpose of this story is to split this audit into individual
periodic audits.

Note that this change keeps the same configuration for the periodic
intervals; for now, only the logic of the code has changed.

TEST PLAN:
PASS: AIO-SX: manually replaced these files in a Debian installation
and no crashes happened.
PASS: rebuild the whole system with the modification with no crashes.
PASS: install the new ISO inside a lab, including bootstrap and the
first 'host-unlock'.
PASS: AIO-SX: verify that the sysinv conductor audits (periodic tasks)
are being called.
PASS: change the install_states interval to different values (for test
purposes).
PASS: follow the sysinv logs looking for errors. No errors were found.
PASS: guarantee that all audits are being called by the periodic tasks.

Story: 2010087
Task: 45646

Depends-On: https://review.opendev.org/c/starlingx/config/+/848330
Signed-off-by: Bruno Costa <bruno.costa@windriver.com>
Change-Id: I215fae7ccbbbaadd7b93f5a8efc11df0834d411a
This commit is contained in:
Bruno Costa 2022-07-01 08:57:29 -03:00
parent a71c4b6c4c
commit d2e2a67224
1 changed files with 97 additions and 107 deletions

View File

@ -5657,6 +5657,8 @@ class ConductorManager(service.PeriodicService):
return False
@periodic_task.periodic_task(
spacing=CONF.conductor_periodic_task_intervals.controller_config_active_apply)
def _controller_config_active_apply(self, context):
"""Check whether target config has been applied to active
controller to run postprocessing"""
@ -5768,86 +5770,91 @@ class ConductorManager(service.PeriodicService):
return all_fs_resized
def _audit_ihost_action(self, ihost):
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.ihost_action)
def _audit_ihost_action(self, context):
"""Audit whether the ihost_action needs to be terminated or escalated.
"""
hosts = self.dbapi.ihost_get_list()
for ihost in hosts:
# only audit configured hosts
if ihost.personality:
if ihost.administrative == constants.ADMIN_UNLOCKED:
ihost_action_str = ihost.ihost_action or ""
if ihost.administrative == constants.ADMIN_UNLOCKED:
ihost_action_str = ihost.ihost_action or ""
if (ihost_action_str.startswith(constants.FORCE_LOCK_ACTION) or
ihost_action_str.startswith(constants.LOCK_ACTION)):
if (ihost_action_str.startswith(constants.FORCE_LOCK_ACTION) or
ihost_action_str.startswith(constants.LOCK_ACTION)):
task_str = ihost.task or ""
if (('--' in ihost_action_str and
ihost_action_str.startswith(
constants.FORCE_LOCK_ACTION)) or
('----------' in ihost_action_str and
ihost_action_str.startswith(constants.LOCK_ACTION))):
task_str = ihost.task or ""
if (('--' in ihost_action_str and
ihost_action_str.startswith(
constants.FORCE_LOCK_ACTION)) or
('----------' in ihost_action_str and
ihost_action_str.startswith(constants.LOCK_ACTION))):
ihost_mtc = ihost.as_dict()
keepkeys = ['ihost_action', 'vim_progress_status']
ihost_mtc = cutils.removekeys_nonmtce(ihost_mtc,
keepkeys)
ihost_mtc = ihost.as_dict()
keepkeys = ['ihost_action', 'vim_progress_status']
ihost_mtc = cutils.removekeys_nonmtce(ihost_mtc,
keepkeys)
if ihost_action_str.startswith(constants.FORCE_LOCK_ACTION):
timeout_in_secs = 6
ihost_mtc['operation'] = 'modify'
ihost_mtc['action'] = constants.FORCE_LOCK_ACTION
ihost_mtc['task'] = constants.FORCE_LOCKING
LOG.warn("ihost_action override %s" %
ihost_mtc)
mtce_api.host_modify(
self._api_token, self._mtc_address, self._mtc_port,
ihost_mtc, timeout_in_secs)
if ihost_action_str.startswith(constants.FORCE_LOCK_ACTION):
timeout_in_secs = 6
ihost_mtc['operation'] = 'modify'
ihost_mtc['action'] = constants.FORCE_LOCK_ACTION
ihost_mtc['task'] = constants.FORCE_LOCKING
LOG.warn("ihost_action override %s" %
ihost_mtc)
mtce_api.host_modify(
self._api_token, self._mtc_address, self._mtc_port,
ihost_mtc, timeout_in_secs)
# need time for FORCE_LOCK mtce to clear
if ('----' in ihost_action_str):
ihost_action_str = ""
else:
ihost_action_str += "-"
# need time for FORCE_LOCK mtce to clear
if ('----' in ihost_action_str):
ihost_action_str = ""
else:
ihost_action_str += "-"
if (task_str.startswith(constants.FORCE_LOCKING) or
task_str.startswith(constants.LOCKING)):
val = {'task': "",
'ihost_action': ihost_action_str,
'vim_progress_status': ""}
else:
val = {'ihost_action': ihost_action_str,
'vim_progress_status': ""}
else:
ihost_action_str += "-"
if (task_str.startswith(constants.FORCE_LOCKING) or
task_str.startswith(constants.LOCKING)):
task_str += "-"
val = {'task': task_str,
'ihost_action': ihost_action_str}
else:
val = {'ihost_action': ihost_action_str}
self.dbapi.ihost_update(ihost.uuid, val)
else: # Administrative locked already
task_str = ihost.task or ""
if (task_str.startswith(constants.FORCE_LOCKING) or
task_str.startswith(constants.LOCKING)):
val = {'task': "",
'ihost_action': ihost_action_str,
'vim_progress_status': ""}
task_str.startswith(constants.LOCKING)):
val = {'task': ""}
self.dbapi.ihost_update(ihost.uuid, val)
vim_progress_status_str = ihost.get('vim_progress_status') or ""
if (vim_progress_status_str and
(vim_progress_status_str != constants.VIM_SERVICES_ENABLED) and
(vim_progress_status_str != constants.VIM_SERVICES_DISABLED)):
if ('..' in vim_progress_status_str):
LOG.info("Audit clearing vim_progress_status=%s" %
vim_progress_status_str)
vim_progress_status_str = ""
else:
val = {'ihost_action': ihost_action_str,
'vim_progress_status': ""}
else:
ihost_action_str += "-"
if (task_str.startswith(constants.FORCE_LOCKING) or
task_str.startswith(constants.LOCKING)):
task_str += "-"
val = {'task': task_str,
'ihost_action': ihost_action_str}
else:
val = {'ihost_action': ihost_action_str}
vim_progress_status_str += ".."
self.dbapi.ihost_update(ihost.uuid, val)
else: # Administrative locked already
task_str = ihost.task or ""
if (task_str.startswith(constants.FORCE_LOCKING) or
task_str.startswith(constants.LOCKING)):
val = {'task': ""}
self.dbapi.ihost_update(ihost.uuid, val)
val = {'vim_progress_status': vim_progress_status_str}
self.dbapi.ihost_update(ihost.uuid, val)
vim_progress_status_str = ihost.get('vim_progress_status') or ""
if (vim_progress_status_str and
(vim_progress_status_str != constants.VIM_SERVICES_ENABLED) and
(vim_progress_status_str != constants.VIM_SERVICES_DISABLED)):
if ('..' in vim_progress_status_str):
LOG.info("Audit clearing vim_progress_status=%s" %
vim_progress_status_str)
vim_progress_status_str = ""
else:
vim_progress_status_str += ".."
val = {'vim_progress_status': vim_progress_status_str}
self.dbapi.ihost_update(ihost.uuid, val)
def _audit_upgrade_status(self):
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.upgrade_status)
def _audit_upgrade_status(self, context):
"""Audit upgrade related status"""
try:
upgrade = self.dbapi.software_upgrade_get_one()
@ -5928,19 +5935,21 @@ class ConductorManager(service.PeriodicService):
LOG.info("Ceph Upgrade: Exception %s" % e)
LOG.info("Ceph Upgrade: Enabled monitor msgr2")
def _audit_install_states(self, hosts):
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.install_states)
def _audit_install_states(self, context):
# A node could shutdown during it's installation and the install_state
# for example could get stuck at the value "installing". To avoid
# this situation we audit the sanity of the states by appending the
# character '+' to the states in the database. After 15 minutes of the
# states not changing, set the install_state to failed.
# The audit's interval is 60sec
# The duration (in minutes) to wait until the install_state fails
MAX_COUNT = 15
# Allow longer duration for booting phase
MAX_COUNT_BOOTING = 40
hosts = self.dbapi.ihost_get_list()
for host in hosts:
LOG.debug("Auditing %s, install_state is %s",
host.hostname, host.install_state)
@ -5961,9 +5970,16 @@ class ConductorManager(service.PeriodicService):
if (install_state != constants.INSTALL_STATE_INSTALLED and
install_state !=
constants.INSTALL_STATE_COMPLETED):
# define the quantity of '+' signs that will be added to install_state_info
# accordingly to the interval set to this audit.
periodic_interval = max(60,
CONF.conductor_periodic_task_intervals.install_states)
factor = periodic_interval // 60 + \
(1 if periodic_interval % 60 > 0 else 0)
if (install_state ==
constants.INSTALL_STATE_INSTALLING and
host.install_state_info is not None):
host.install_state_info += factor * "+"
if host.install_state_info.count('+') >= MAX_COUNT:
LOG.info(
"Auditing %s, install_state changed from "
@ -5972,13 +5988,12 @@ class ConductorManager(service.PeriodicService):
constants.INSTALL_STATE_FAILED)
host.install_state = \
constants.INSTALL_STATE_FAILED
else:
host.install_state_info += "+"
else:
if install_state == constants.INSTALL_STATE_BOOTING:
max_count = MAX_COUNT_BOOTING
else:
max_count = MAX_COUNT
host.install_state_info += factor * "+"
if host.install_state.count('+') >= max_count:
LOG.info(
"Auditing %s, install_state changed from "
@ -5987,8 +6002,6 @@ class ConductorManager(service.PeriodicService):
constants.INSTALL_STATE_FAILED)
host.install_state = \
constants.INSTALL_STATE_FAILED
else:
host.install_state += "+"
# It is possible we get stuck in an installed failed state. For
# example if a node gets powered down during an install booting
@ -6090,6 +6103,11 @@ class ConductorManager(service.PeriodicService):
LOG.error("Removed unsupported deferred config_type %s" %
config_type)
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.deferred_runtime_config)
def _audit_deferred_runtime_config_periodic(self, context):
# Periodic-task entry point: scheduled with the deferred_runtime_config
# interval and does nothing except delegate to
# _audit_deferred_runtime_config, passing the same context through.
# check whether there are deferred runtime manifests to apply
self._audit_deferred_runtime_config(context)
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.kubernetes_local_secrets)
def _kubernetes_local_secrets_audit(self, context):
# Audit kubernetes local registry secrets info
@ -6097,38 +6115,8 @@ class ConductorManager(service.PeriodicService):
if self._app:
self._app.audit_local_registry_secrets(context)
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.default)
def _conductor_audit(self, context):
# Aggregate periodic audit, scheduled with the "default" interval: each
# sub-audit below is invoked in order within a single pass.
# periodically, perform audit of inventory
LOG.debug("Sysinv Conductor running periodic audit task.")
# check whether there are deferred runtime manifests to apply
self._audit_deferred_runtime_config(context)
# check whether we may have just become active with target config
self._controller_config_active_apply(context)
# Audit upgrade status
self._audit_upgrade_status()
# The host list is read once here and shared by the three host-based
# audits and by the per-host loop that follows.
hosts = self.dbapi.ihost_get_list()
# Audit install states
self._audit_install_states(hosts)
# Audit kubernetes node labels
self._audit_kubernetes_labels(hosts)
# Audit image conversion
self._audit_image_conversion(hosts)
for host in hosts:
# only audit configured hosts
if not host.personality:
continue
# Audit the pending host action; called with a single host record.
self._audit_ihost_action(host)
def _audit_kubernetes_labels(self, hosts):
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.kubernetes_labels)
def _audit_kubernetes_labels(self, context):
if not cutils.is_initial_config_complete():
LOG.debug("_audit_kubernetes_labels skip")
return
@ -6137,6 +6125,7 @@ class ConductorManager(service.PeriodicService):
sysinv_labels = self.dbapi.label_get_all()
nodes = self._kube.kube_get_nodes()
hosts = self.dbapi.ihost_get_list()
for host in hosts:
try:
for node in nodes:
@ -6191,14 +6180,15 @@ class ConductorManager(service.PeriodicService):
elif bk.backend in self._stor_bck_op_timeouts:
del self._stor_bck_op_timeouts[bk.backend]
def _audit_image_conversion(self, hosts):
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.image_conversion)
def _audit_image_conversion(self, context):
"""
Raise alarm if:
- image-conversion is not added on both controllers;
- the size of the filesystem is not the same
on both controllers
"""
chosts = [h for h in hosts if h.personality == constants.CONTROLLER]
chosts = self.dbapi.ihost_get_by_personality(constants.CONTROLLER)
if len(chosts) <= 1:
# No alarm is raised if setup has only one controller
return