From eb1ac2e792fc48ebf285077dbd648a690fc70786 Mon Sep 17 00:00:00 2001 From: Heitor Matsui Date: Mon, 22 Apr 2024 17:41:10 -0300 Subject: [PATCH] Create deploy host alarms This commit creates two alarms: one raised when "deploy host" completes successfully and the host requires a reboot, and one raised when "deploy host" fails. The success alarm is cleared after the host is unlocked (implemented in [1]), and the failure alarm is cleared when "deploy host" is retried for the same host. [1] https://review.opendev.org/c/starlingx/config/+/916835 Test Plan PASS: run "deploy host" successfully and verify the alarm PASS: run "deploy host" with failure and verify the alarm Story: 2010676 Task: 49937 Depends-on: https://review.opendev.org/c/starlingx/fault/+/916786 Signed-off-by: Heitor Matsui Change-Id: Ie1a32f725511658616f016d56de38cd04f1d9c5d --- software/software/constants.py | 24 ++++++++++ software/software/software_agent.py | 3 ++ software/software/software_controller.py | 58 ++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 5 deletions(-) diff --git a/software/software/constants.py b/software/software/constants.py index d85f6ca8..49edfee8 100644 --- a/software/software/constants.py +++ b/software/software/constants.py @@ -14,6 +14,7 @@ try: except Exception: pass +from fm_api import constants as fm_constants from tsconfig.tsconfig import SW_VERSION ADDRESS_VERSION_IPV4 = 4 @@ -105,3 +106,26 @@ WORKER = 'worker' AVAILABILITY_ONLINE = 'online' ADMIN_LOCKED = 'locked' + +SOFTWARE_ALARMS = { + fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR: { + "entity_type_id": fm_constants.FM_ENTITY_TYPE_HOST, + "severity": fm_constants.FM_ALARM_SEVERITY_WARNING, + "reason_text": ("Deploy host completed, unlock the host to apply " + "the new software release"), + "alarm_type": fm_constants.FM_ALARM_TYPE_11, + "probable_cause": fm_constants.ALARM_PROBABLE_CAUSE_65, + "proposed_repair_action": "Unlock host", + "service_affecting": True, + }, + fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE: { 
"entity_type_id": fm_constants.FM_ENTITY_TYPE_HOST, + "severity": fm_constants.FM_ALARM_SEVERITY_MAJOR, + "reason_text": "Deploy host failed, check logs for errors", + "alarm_type": fm_constants.FM_ALARM_TYPE_11, + "probable_cause": fm_constants.ALARM_PROBABLE_CAUSE_65, + "proposed_repair_action": ("Check the logs for errors, fix the issues manually " + "and retry"), + "service_affecting": True, + } +} diff --git a/software/software/software_agent.py b/software/software/software_agent.py index 46549a4e..7f9598b4 100644 --- a/software/software/software_agent.py +++ b/software/software/software_agent.py @@ -359,6 +359,7 @@ class PatchMessageAgentInstallReq(messages.PatchMessage): if not self.force: setflag(node_is_software_updated_rr_file) + resp.reboot_required = True if not os.path.exists(node_is_locked_file): if self.force: @@ -383,6 +384,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage): messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP) self.status = False self.reject_reason = None + self.reboot_required = False def encode(self): global pa @@ -390,6 +392,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage): self.message['status'] = self.status if self.reject_reason is not None: self.message['reject_reason'] = self.reject_reason + self.message['reboot_required'] = self.reboot_required def handle(self, sock, addr): LOG.error("Should not get here") diff --git a/software/software/software_controller.py b/software/software/software_controller.py index 05bd7408..1e5da6d5 100644 --- a/software/software/software_controller.py +++ b/software/software/software_controller.py @@ -545,6 +545,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage): messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP) self.status = False self.reject_reason = None + self.reboot_required = False def decode(self, data): messages.PatchMessage.decode(self, data) @@ -552,6 +553,8 @@ class 
PatchMessageAgentInstallResp(messages.PatchMessage): self.status = data['status'] if 'reject_reason' in data: self.reject_reason = data['reject_reason'] + if 'reboot_required' in data: + self.reboot_required = data['reboot_required'] def encode(self): # Nothing to add, so just call the super class @@ -564,11 +567,11 @@ class PatchMessageAgentInstallResp(messages.PatchMessage): sc.hosts_lock.acquire() try: - # NOTE(bqian) seems like trying to tolerant a failure situation + # NOTE(bqian) seems like trying to tolerate a failure situation # that a host is directed to install a patch but during the installation # software-controller-daemon gets restarted # should remove the sc.hosts which is in memory volatile storage and replaced with - # armanent deploy-host entity + # permanent deploy-host entity ip = addr[0] if ip not in sc.hosts: sc.hosts[ip] = AgentNeighbour(ip) @@ -583,8 +586,15 @@ class PatchMessageAgentInstallResp(messages.PatchMessage): deploy_host_state = DeployHostState(hostname) if self.status: deploy_host_state.deployed() + if self.reboot_required: + sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR, + fm_constants.FM_ALARM_STATE_SET, + "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname)) else: deploy_host_state.deploy_failed() + sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE, + fm_constants.FM_ALARM_STATE_SET, + "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname)) def send(self, sock): # pylint: disable=unused-argument LOG.error("Should not get here") @@ -2683,10 +2693,10 @@ class PatchController(PatchService): if self._activate(): deploy_state.activate_completed() - msg_info += "Deployment has been activated\n" + msg_info += "Deployment has been activated.\n" else: deploy_state.activate_failed() - msg_error += "Dployment activation has failed.\n" + msg_error += "Deployment activation has failed.\n" return dict(info=msg_info, warning=msg_warning, error=msg_error) @@ -2715,6 +2725,13 @@ class 
PatchController(PatchService): deploy_state.deploy_host() deploy_host_state.deploy_started() + # if in a 'deploy host' reentrant scenario, i.e. retrying after + # a failure, then clear the failure alarm before retrying + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname) + self.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE, + fm_constants.FM_ALARM_STATE_CLEAR, + entity_instance_id) + # NOTE(bqian) Get IP address to fulfill the need of patching structure. # need to review the design ip = socket.getaddrinfo(hostname, 0)[0][4][0] @@ -2740,7 +2757,6 @@ class PatchController(PatchService): msg_info += msg + "\n" LOG.info(msg) set_host_target_load(hostname, major_release) - # TODO(heitormatsui) update host deploy status self.hosts_lock.acquire() self.hosts[ip].install_pending = True @@ -2970,6 +2986,38 @@ class PatchController(PatchService): func(*args, **kwargs) self._update_state_to_peer() + def manage_software_alarm(self, alarm_id, alarm_state, entity_instance_id): + try: + if alarm_id not in constants.SOFTWARE_ALARMS: + raise Exception("Unknown software alarm '%s'." 
% alarm_id) + + # deal with the alarm clear scenario + if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR: + LOG.info("Clearing alarm: %s for %s" % (alarm_id, entity_instance_id)) + self.fm_api.clear_fault(alarm_id, entity_instance_id) + return + + # if not clear alarm scenario, create the alarm + alarm_data = constants.SOFTWARE_ALARMS.get(alarm_id) + alarm = fm_api.Fault( + alarm_id=alarm_id, + alarm_state=alarm_state, + entity_type_id=alarm_data.get("entity_type_id"), + entity_instance_id=entity_instance_id, + severity=alarm_data.get("severity"), + reason_text=alarm_data.get("reason_text"), + alarm_type=alarm_data.get("alarm_type"), + probable_cause=alarm_data.get("probable_cause"), + proposed_repair_action=alarm_data.get("proposed_repair_action"), + service_affecting=alarm_data.get("service_affecting"), + ) + LOG.info("Raising alarm: %s for %s" % (alarm_id, entity_instance_id)) + self.fm_api.set_fault(alarm) + except Exception as e: + LOG.exception("Failed to manage alarm %s with action %s: %s" % ( + alarm_id, alarm_state, str(e) + )) + def handle_deploy_state_sync(self, alarm_instance_id): """ Handle the deploy state sync.