Merge "Create deploy host alarms"
This commit is contained in:
commit
5585c1fc3b
|
@ -14,6 +14,7 @@ try:
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
from fm_api import constants as fm_constants
|
||||
from tsconfig.tsconfig import SW_VERSION
|
||||
|
||||
ADDRESS_VERSION_IPV4 = 4
|
||||
|
@ -106,3 +107,26 @@ WORKER = 'worker'
|
|||
|
||||
AVAILABILITY_ONLINE = 'online'
|
||||
ADMIN_LOCKED = 'locked'
|
||||
|
||||
# Static definitions of the alarms raised by the software deploy flow.
# The mapping is keyed by FM alarm ID; each entry carries the fixed fields
# (entity type, severity, texts, ...) of that alarm. Per-host data such as
# the entity instance ID is not stored here.
SOFTWARE_ALARMS = {
    # Deploy host succeeded; the reason text asks for the host to be
    # unlocked so the new software release is applied.
    fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR: {
        "entity_type_id": fm_constants.FM_ENTITY_TYPE_HOST,
        "severity": fm_constants.FM_ALARM_SEVERITY_WARNING,
        "reason_text": ("Deploy host completed, unlock the host to apply "
                        "the new software release"),
        "alarm_type": fm_constants.FM_ALARM_TYPE_11,
        "probable_cause": fm_constants.ALARM_PROBABLE_CAUSE_65,
        "proposed_repair_action": "Unlock host",
        "service_affecting": True,
    },
    # Deploy host failed; the operator is pointed at the logs and a retry.
    fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE: {
        "entity_type_id": fm_constants.FM_ENTITY_TYPE_HOST,
        "severity": fm_constants.FM_ALARM_SEVERITY_MAJOR,
        "reason_text": "Deploy host failed, check logs for errors",
        "alarm_type": fm_constants.FM_ALARM_TYPE_11,
        "probable_cause": fm_constants.ALARM_PROBABLE_CAUSE_65,
        "proposed_repair_action": ("Check the logs for errors, fix the issues manually "
                                   "and retry"),
        "service_affecting": True,
    },
}
|
||||
|
|
|
@ -359,6 +359,7 @@ class PatchMessageAgentInstallReq(messages.PatchMessage):
|
|||
|
||||
if not self.force:
|
||||
setflag(node_is_software_updated_rr_file)
|
||||
resp.reboot_required = True
|
||||
|
||||
if not os.path.exists(node_is_locked_file):
|
||||
if self.force:
|
||||
|
@ -383,6 +384,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
|||
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
|
||||
self.status = False
|
||||
self.reject_reason = None
|
||||
self.reboot_required = False
|
||||
|
||||
def encode(self):
|
||||
global pa
|
||||
|
@ -390,6 +392,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
|||
self.message['status'] = self.status
|
||||
if self.reject_reason is not None:
|
||||
self.message['reject_reason'] = self.reject_reason
|
||||
self.message['reboot_required'] = self.reboot_required
|
||||
|
||||
def handle(self, sock, addr):
|
||||
LOG.error("Should not get here")
|
||||
|
|
|
@ -545,6 +545,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
|||
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
|
||||
self.status = False
|
||||
self.reject_reason = None
|
||||
self.reboot_required = False
|
||||
|
||||
def decode(self, data):
|
||||
messages.PatchMessage.decode(self, data)
|
||||
|
@ -552,6 +553,8 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
|||
self.status = data['status']
|
||||
if 'reject_reason' in data:
|
||||
self.reject_reason = data['reject_reason']
|
||||
if 'reboot_required' in data:
|
||||
self.reboot_required = data['reboot_required']
|
||||
|
||||
def encode(self):
|
||||
# Nothing to add, so just call the super class
|
||||
|
@ -564,11 +567,11 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
|||
|
||||
sc.hosts_lock.acquire()
|
||||
try:
|
||||
# NOTE(bqian) seems like trying to tolerant a failure situation
|
||||
# NOTE(bqian) seems like trying to tolerate a failure situation
|
||||
# that a host is directed to install a patch but during the installation
|
||||
# software-controller-daemon gets restarted
|
||||
# should remove the sc.hosts which is in memory volatile storage and replaced with
|
||||
# armanent deploy-host entity
|
||||
# permanent deploy-host entity
|
||||
ip = addr[0]
|
||||
if ip not in sc.hosts:
|
||||
sc.hosts[ip] = AgentNeighbour(ip)
|
||||
|
@ -583,8 +586,15 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
|||
deploy_host_state = DeployHostState(hostname)
|
||||
if self.status:
|
||||
deploy_host_state.deployed()
|
||||
if self.reboot_required:
|
||||
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR,
|
||||
fm_constants.FM_ALARM_STATE_SET,
|
||||
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
|
||||
else:
|
||||
deploy_host_state.deploy_failed()
|
||||
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||
fm_constants.FM_ALARM_STATE_SET,
|
||||
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
|
||||
|
||||
def send(self, sock): # pylint: disable=unused-argument
|
||||
LOG.error("Should not get here")
|
||||
|
@ -2683,10 +2693,10 @@ class PatchController(PatchService):
|
|||
|
||||
if self._activate():
|
||||
deploy_state.activate_completed()
|
||||
msg_info += "Deployment has been activated\n"
|
||||
msg_info += "Deployment has been activated.\n"
|
||||
else:
|
||||
deploy_state.activate_failed()
|
||||
msg_error += "Dployment activation has failed.\n"
|
||||
msg_error += "Deployment activation has failed.\n"
|
||||
|
||||
return dict(info=msg_info, warning=msg_warning, error=msg_error)
|
||||
|
||||
|
@ -2715,6 +2725,13 @@ class PatchController(PatchService):
|
|||
deploy_state.deploy_host()
|
||||
deploy_host_state.deploy_started()
|
||||
|
||||
# if in a 'deploy host' reentrant scenario, i.e. retrying after
|
||||
# a failure, then clear the failure alarm before retrying
|
||||
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname)
|
||||
self.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||
fm_constants.FM_ALARM_STATE_CLEAR,
|
||||
entity_instance_id)
|
||||
|
||||
# NOTE(bqian) Get IP address to fulfill the need of patching structure.
|
||||
# need to review the design
|
||||
ip = socket.getaddrinfo(hostname, 0)[0][4][0]
|
||||
|
@ -2740,7 +2757,6 @@ class PatchController(PatchService):
|
|||
msg_info += msg + "\n"
|
||||
LOG.info(msg)
|
||||
set_host_target_load(hostname, major_release)
|
||||
# TODO(heitormatsui) update host deploy status
|
||||
|
||||
self.hosts_lock.acquire()
|
||||
self.hosts[ip].install_pending = True
|
||||
|
@ -2970,6 +2986,38 @@ class PatchController(PatchService):
|
|||
func(*args, **kwargs)
|
||||
self._update_state_to_peer()
|
||||
|
||||
def manage_software_alarm(self, alarm_id, alarm_state, entity_instance_id):
    """Raise or clear a software alarm declared in constants.SOFTWARE_ALARMS.

    :param alarm_id: key into constants.SOFTWARE_ALARMS; an unknown ID is
        reported through the error log (see below), not propagated
    :param alarm_state: fm_constants.FM_ALARM_STATE_CLEAR clears the alarm;
        any other value is forwarded as the state of a newly set fault
    :param entity_instance_id: entity the alarm is set or cleared against

    Any exception raised while handling the alarm (including the one for
    an unknown alarm ID) is caught and logged with LOG.exception; nothing
    is re-raised to the caller.
    """
    try:
        known_alarms = constants.SOFTWARE_ALARMS
        if alarm_id not in known_alarms:
            raise Exception("Unknown software alarm '%s'." % alarm_id)

        if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
            # clear scenario: only the alarm ID and the entity are needed
            LOG.info("Clearing alarm: %s for %s" % (alarm_id, entity_instance_id))
            self.fm_api.clear_fault(alarm_id, entity_instance_id)
        else:
            # set scenario: combine the static definition of the alarm
            # with the caller-supplied state and entity instance
            definition = known_alarms.get(alarm_id)
            static_fields = (
                "entity_type_id",
                "severity",
                "reason_text",
                "alarm_type",
                "probable_cause",
                "proposed_repair_action",
                "service_affecting",
            )
            fault_kwargs = {name: definition.get(name) for name in static_fields}
            fault = fm_api.Fault(
                alarm_id=alarm_id,
                alarm_state=alarm_state,
                entity_instance_id=entity_instance_id,
                **fault_kwargs
            )
            LOG.info("Raising alarm: %s for %s" % (alarm_id, entity_instance_id))
            self.fm_api.set_fault(fault)
    except Exception as e:
        failure = "Failed to manage alarm %s with action %s: %s" % (
            alarm_id, alarm_state, str(e)
        )
        LOG.exception(failure)
|
||||
|
||||
def handle_deploy_state_sync(self, alarm_instance_id):
|
||||
"""
|
||||
Handle the deploy state sync.
|
||||
|
|
Loading…
Reference in New Issue