Merge "Create deploy host alarms"

This commit is contained in:
Zuul 2024-04-30 17:55:38 +00:00 committed by Gerrit Code Review
commit 5585c1fc3b
3 changed files with 80 additions and 5 deletions

View File

@ -14,6 +14,7 @@ try:
except Exception:
pass
from fm_api import constants as fm_constants
from tsconfig.tsconfig import SW_VERSION
ADDRESS_VERSION_IPV4 = 4
@ -106,3 +107,26 @@ WORKER = 'worker'
AVAILABILITY_ONLINE = 'online'
ADMIN_LOCKED = 'locked'
# Static fault attributes for each software alarm, keyed by FM alarm id.
# Every entry provides the same set of fields: entity type, severity,
# reason text, alarm type, probable cause, repair action and whether the
# condition is service affecting.
SOFTWARE_ALARMS = {
    # Deploy host succeeded; the host is still expected to be unlocked.
    fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR: {
        "entity_type_id": fm_constants.FM_ENTITY_TYPE_HOST,
        "severity": fm_constants.FM_ALARM_SEVERITY_WARNING,
        "reason_text": (
            "Deploy host completed, unlock the host "
            "to apply the new software release"
        ),
        "alarm_type": fm_constants.FM_ALARM_TYPE_11,
        "probable_cause": fm_constants.ALARM_PROBABLE_CAUSE_65,
        "proposed_repair_action": "Unlock host",
        "service_affecting": True,
    },
    # Deploy host failed; operator intervention and a retry are required.
    fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE: {
        "entity_type_id": fm_constants.FM_ENTITY_TYPE_HOST,
        "severity": fm_constants.FM_ALARM_SEVERITY_MAJOR,
        "reason_text": "Deploy host failed, check logs for errors",
        "alarm_type": fm_constants.FM_ALARM_TYPE_11,
        "probable_cause": fm_constants.ALARM_PROBABLE_CAUSE_65,
        "proposed_repair_action": (
            "Check the logs for errors, "
            "fix the issues manually and retry"
        ),
        "service_affecting": True,
    },
}

View File

@ -359,6 +359,7 @@ class PatchMessageAgentInstallReq(messages.PatchMessage):
if not self.force:
setflag(node_is_software_updated_rr_file)
resp.reboot_required = True
if not os.path.exists(node_is_locked_file):
if self.force:
@ -383,6 +384,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
self.status = False
self.reject_reason = None
self.reboot_required = False
def encode(self):
global pa
@ -390,6 +392,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
self.message['status'] = self.status
if self.reject_reason is not None:
self.message['reject_reason'] = self.reject_reason
self.message['reboot_required'] = self.reboot_required
def handle(self, sock, addr):
LOG.error("Should not get here")

View File

@ -545,6 +545,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
self.status = False
self.reject_reason = None
self.reboot_required = False
def decode(self, data):
messages.PatchMessage.decode(self, data)
@ -552,6 +553,8 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
self.status = data['status']
if 'reject_reason' in data:
self.reject_reason = data['reject_reason']
if 'reboot_required' in data:
self.reboot_required = data['reboot_required']
def encode(self):
# Nothing to add, so just call the super class
@ -564,11 +567,11 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
sc.hosts_lock.acquire()
try:
# NOTE(bqian) seems like trying to tolerant a failure situation
# NOTE(bqian) seems like trying to tolerate a failure situation
# that a host is directed to install a patch but during the installation
# software-controller-daemon gets restarted
# should remove the sc.hosts which is in memory volatile storage and replaced with
# armanent deploy-host entity
# permanent deploy-host entity
ip = addr[0]
if ip not in sc.hosts:
sc.hosts[ip] = AgentNeighbour(ip)
@ -583,8 +586,15 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
deploy_host_state = DeployHostState(hostname)
if self.status:
deploy_host_state.deployed()
if self.reboot_required:
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR,
fm_constants.FM_ALARM_STATE_SET,
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
else:
deploy_host_state.deploy_failed()
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
fm_constants.FM_ALARM_STATE_SET,
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
@ -2683,10 +2693,10 @@ class PatchController(PatchService):
if self._activate():
deploy_state.activate_completed()
msg_info += "Deployment has been activated\n"
msg_info += "Deployment has been activated.\n"
else:
deploy_state.activate_failed()
msg_error += "Dployment activation has failed.\n"
msg_error += "Deployment activation has failed.\n"
return dict(info=msg_info, warning=msg_warning, error=msg_error)
@ -2715,6 +2725,13 @@ class PatchController(PatchService):
deploy_state.deploy_host()
deploy_host_state.deploy_started()
# if in a 'deploy host' reentrant scenario, i.e. retrying after
# a failure, then clear the failure alarm before retrying
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname)
self.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
fm_constants.FM_ALARM_STATE_CLEAR,
entity_instance_id)
# NOTE(bqian) Get IP address to fulfill the need of patching structure.
# need to review the design
ip = socket.getaddrinfo(hostname, 0)[0][4][0]
@ -2740,7 +2757,6 @@ class PatchController(PatchService):
msg_info += msg + "\n"
LOG.info(msg)
set_host_target_load(hostname, major_release)
# TODO(heitormatsui) update host deploy status
self.hosts_lock.acquire()
self.hosts[ip].install_pending = True
@ -2970,6 +2986,38 @@ class PatchController(PatchService):
func(*args, **kwargs)
self._update_state_to_peer()
def manage_software_alarm(self, alarm_id, alarm_state, entity_instance_id):
    """
    Set or clear one of the alarms described in constants.SOFTWARE_ALARMS.

    Any failure, including an alarm_id that is not a key of
    constants.SOFTWARE_ALARMS, is logged with LOG.exception and swallowed;
    nothing is raised to the caller and nothing is returned.

    :param alarm_id: key into constants.SOFTWARE_ALARMS
    :param alarm_state: fm_constants.FM_ALARM_STATE_CLEAR clears the alarm;
                        any other value is handed to the new Fault as its state
    :param entity_instance_id: entity the alarm is cleared from / raised against
    """
    try:
        known_alarms = constants.SOFTWARE_ALARMS
        if alarm_id not in known_alarms:
            # Raised on purpose so the handler below reports it like any
            # other failure.
            raise Exception("Unknown software alarm '%s'." % alarm_id)

        if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
            LOG.info("Clearing alarm: %s for %s" % (alarm_id, entity_instance_id))
            self.fm_api.clear_fault(alarm_id, entity_instance_id)
        else:
            # Any non-clear state: build the fault from the static template.
            template = known_alarms.get(alarm_id)
            template_fields = (
                "entity_type_id",
                "severity",
                "reason_text",
                "alarm_type",
                "probable_cause",
                "proposed_repair_action",
                "service_affecting",
            )
            fault_kwargs = {
                field: template.get(field) for field in template_fields
            }
            alarm = fm_api.Fault(
                alarm_id=alarm_id,
                alarm_state=alarm_state,
                entity_instance_id=entity_instance_id,
                **fault_kwargs
            )
            LOG.info("Raising alarm: %s for %s" % (alarm_id, entity_instance_id))
            self.fm_api.set_fault(alarm)
    except Exception as e:
        failure_details = (alarm_id, alarm_state, str(e))
        LOG.exception("Failed to manage alarm %s with action %s: %s" % failure_details)
def handle_deploy_state_sync(self, alarm_instance_id):
"""
Handle the deploy state sync.