Create deploy host alarms
This commit creates two alarms: one raised when "deploy host" completes successfully (and the host requires a reboot to apply the new release), and one raised when "deploy host" fails. The success alarm is cleared after the host is unlocked, which is implemented in [1]; the failure alarm is cleared when "deploy host" is retried for the same host.

[1] https://review.opendev.org/c/starlingx/config/+/916835

Test Plan
PASS: run "deploy host" successfully and verify the alarm
PASS: run "deploy host" with failure and verify the alarm

Story: 2010676
Task: 49937

Depends-on: https://review.opendev.org/c/starlingx/fault/+/916786

Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>
Change-Id: Ie1a32f725511658616f016d56de38cd04f1d9c5d
This commit is contained in:
parent
c5a7d1d336
commit
d65d9f52a6
|
@ -14,6 +14,7 @@ try:
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
from fm_api import constants as fm_constants
|
||||||
from tsconfig.tsconfig import SW_VERSION
|
from tsconfig.tsconfig import SW_VERSION
|
||||||
|
|
||||||
ADDRESS_VERSION_IPV4 = 4
|
ADDRESS_VERSION_IPV4 = 4
|
||||||
|
@ -105,3 +106,26 @@ WORKER = 'worker'
|
||||||
|
|
||||||
AVAILABILITY_ONLINE = 'online'
|
AVAILABILITY_ONLINE = 'online'
|
||||||
ADMIN_LOCKED = 'locked'
|
ADMIN_LOCKED = 'locked'
|
||||||
|
|
||||||
|
SOFTWARE_ALARMS = {
|
||||||
|
fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR: {
|
||||||
|
"entity_type_id": fm_constants.FM_ENTITY_TYPE_HOST,
|
||||||
|
"severity": fm_constants.FM_ALARM_SEVERITY_WARNING,
|
||||||
|
"reason_text": ("Deploy host completed, unlock the host to apply "
|
||||||
|
"the new software release"),
|
||||||
|
"alarm_type": fm_constants.FM_ALARM_TYPE_11,
|
||||||
|
"probable_cause": fm_constants.ALARM_PROBABLE_CAUSE_65,
|
||||||
|
"proposed_repair_action": "Unlock host",
|
||||||
|
"service_affecting": True,
|
||||||
|
},
|
||||||
|
fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE: {
|
||||||
|
"entity_type_id": fm_constants.FM_ENTITY_TYPE_HOST,
|
||||||
|
"severity": fm_constants.FM_ALARM_SEVERITY_MAJOR,
|
||||||
|
"reason_text": "Deploy host failed, check logs for errors",
|
||||||
|
"alarm_type": fm_constants.FM_ALARM_TYPE_11,
|
||||||
|
"probable_cause": fm_constants.ALARM_PROBABLE_CAUSE_65,
|
||||||
|
"proposed_repair_action": ("Check the logs for errors, fix the issues manually "
|
||||||
|
"and retry"),
|
||||||
|
"service_affecting": True,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -359,6 +359,7 @@ class PatchMessageAgentInstallReq(messages.PatchMessage):
|
||||||
|
|
||||||
if not self.force:
|
if not self.force:
|
||||||
setflag(node_is_software_updated_rr_file)
|
setflag(node_is_software_updated_rr_file)
|
||||||
|
resp.reboot_required = True
|
||||||
|
|
||||||
if not os.path.exists(node_is_locked_file):
|
if not os.path.exists(node_is_locked_file):
|
||||||
if self.force:
|
if self.force:
|
||||||
|
@ -383,6 +384,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
||||||
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
|
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
|
||||||
self.status = False
|
self.status = False
|
||||||
self.reject_reason = None
|
self.reject_reason = None
|
||||||
|
self.reboot_required = False
|
||||||
|
|
||||||
def encode(self):
|
def encode(self):
|
||||||
global pa
|
global pa
|
||||||
|
@ -390,6 +392,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
||||||
self.message['status'] = self.status
|
self.message['status'] = self.status
|
||||||
if self.reject_reason is not None:
|
if self.reject_reason is not None:
|
||||||
self.message['reject_reason'] = self.reject_reason
|
self.message['reject_reason'] = self.reject_reason
|
||||||
|
self.message['reboot_required'] = self.reboot_required
|
||||||
|
|
||||||
def handle(self, sock, addr):
|
def handle(self, sock, addr):
|
||||||
LOG.error("Should not get here")
|
LOG.error("Should not get here")
|
||||||
|
|
|
@ -545,6 +545,7 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
||||||
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
|
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
|
||||||
self.status = False
|
self.status = False
|
||||||
self.reject_reason = None
|
self.reject_reason = None
|
||||||
|
self.reboot_required = False
|
||||||
|
|
||||||
def decode(self, data):
|
def decode(self, data):
|
||||||
messages.PatchMessage.decode(self, data)
|
messages.PatchMessage.decode(self, data)
|
||||||
|
@ -552,6 +553,8 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
||||||
self.status = data['status']
|
self.status = data['status']
|
||||||
if 'reject_reason' in data:
|
if 'reject_reason' in data:
|
||||||
self.reject_reason = data['reject_reason']
|
self.reject_reason = data['reject_reason']
|
||||||
|
if 'reboot_required' in data:
|
||||||
|
self.reboot_required = data['reboot_required']
|
||||||
|
|
||||||
def encode(self):
|
def encode(self):
|
||||||
# Nothing to add, so just call the super class
|
# Nothing to add, so just call the super class
|
||||||
|
@ -564,11 +567,11 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
||||||
|
|
||||||
sc.hosts_lock.acquire()
|
sc.hosts_lock.acquire()
|
||||||
try:
|
try:
|
||||||
# NOTE(bqian) seems like trying to tolerant a failure situation
|
# NOTE(bqian) seems like trying to tolerate a failure situation
|
||||||
# that a host is directed to install a patch but during the installation
|
# that a host is directed to install a patch but during the installation
|
||||||
# software-controller-daemon gets restarted
|
# software-controller-daemon gets restarted
|
||||||
# should remove the sc.hosts which is in memory volatile storage and replaced with
|
# should remove the sc.hosts which is in memory volatile storage and replaced with
|
||||||
# armanent deploy-host entity
|
# permanent deploy-host entity
|
||||||
ip = addr[0]
|
ip = addr[0]
|
||||||
if ip not in sc.hosts:
|
if ip not in sc.hosts:
|
||||||
sc.hosts[ip] = AgentNeighbour(ip)
|
sc.hosts[ip] = AgentNeighbour(ip)
|
||||||
|
@ -585,9 +588,16 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
|
||||||
# where there could be 4 combinations
|
# where there could be 4 combinations
|
||||||
if self.status:
|
if self.status:
|
||||||
deploy_host_state.deployed()
|
deploy_host_state.deployed()
|
||||||
|
if self.reboot_required:
|
||||||
|
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR,
|
||||||
|
fm_constants.FM_ALARM_STATE_SET,
|
||||||
|
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
|
||||||
return
|
return
|
||||||
elif self.reject_reason:
|
elif self.reject_reason:
|
||||||
deploy_host_state.deploy_failed()
|
deploy_host_state.deploy_failed()
|
||||||
|
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||||
|
fm_constants.FM_ALARM_STATE_SET,
|
||||||
|
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
|
||||||
return
|
return
|
||||||
|
|
||||||
LOG.error("Bug: shouldn't reach here")
|
LOG.error("Bug: shouldn't reach here")
|
||||||
|
@ -1346,7 +1356,7 @@ class PatchController(PatchService):
|
||||||
|
|
||||||
# Get the release_id from the patch's metadata
|
# Get the release_id from the patch's metadata
|
||||||
# and check to see if it's already uploaded
|
# and check to see if it's already uploaded
|
||||||
release_id = get_release_from_patch(patch_file,'id')
|
release_id = get_release_from_patch(patch_file, 'id')
|
||||||
|
|
||||||
release = self.release_collection.get_release_by_id(release_id)
|
release = self.release_collection.get_release_by_id(release_id)
|
||||||
|
|
||||||
|
@ -2689,10 +2699,10 @@ class PatchController(PatchService):
|
||||||
|
|
||||||
if self._activate():
|
if self._activate():
|
||||||
deploy_state.activate_completed()
|
deploy_state.activate_completed()
|
||||||
msg_info += "Deployment has been activated\n"
|
msg_info += "Deployment has been activated.\n"
|
||||||
else:
|
else:
|
||||||
deploy_state.activate_failed()
|
deploy_state.activate_failed()
|
||||||
msg_error += "Dployment activation has failed.\n"
|
msg_error += "Deployment activation has failed.\n"
|
||||||
|
|
||||||
return dict(info=msg_info, warning=msg_warning, error=msg_error)
|
return dict(info=msg_info, warning=msg_warning, error=msg_error)
|
||||||
|
|
||||||
|
@ -2721,6 +2731,13 @@ class PatchController(PatchService):
|
||||||
deploy_state.deploy_host()
|
deploy_state.deploy_host()
|
||||||
deploy_host_state.deploy_started()
|
deploy_host_state.deploy_started()
|
||||||
|
|
||||||
|
# if in a 'deploy host' reentrant scenario, i.e. retrying after
|
||||||
|
# a failure, then clear the failure alarm before retrying
|
||||||
|
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname)
|
||||||
|
self.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||||
|
fm_constants.FM_ALARM_STATE_CLEAR,
|
||||||
|
entity_instance_id)
|
||||||
|
|
||||||
# NOTE(bqian) Get IP address to fulfill the need of patching structure.
|
# NOTE(bqian) Get IP address to fulfill the need of patching structure.
|
||||||
# need to review the design
|
# need to review the design
|
||||||
ip = socket.getaddrinfo(hostname, 0)[0][4][0]
|
ip = socket.getaddrinfo(hostname, 0)[0][4][0]
|
||||||
|
@ -2746,7 +2763,6 @@ class PatchController(PatchService):
|
||||||
msg_info += msg + "\n"
|
msg_info += msg + "\n"
|
||||||
LOG.info(msg)
|
LOG.info(msg)
|
||||||
set_host_target_load(hostname, major_release)
|
set_host_target_load(hostname, major_release)
|
||||||
# TODO(heitormatsui) update host deploy status
|
|
||||||
|
|
||||||
self.hosts_lock.acquire()
|
self.hosts_lock.acquire()
|
||||||
self.hosts[ip].install_pending = True
|
self.hosts[ip].install_pending = True
|
||||||
|
@ -2976,6 +2992,38 @@ class PatchController(PatchService):
|
||||||
func(*args, **kwargs)
|
func(*args, **kwargs)
|
||||||
self._update_state_to_peer()
|
self._update_state_to_peer()
|
||||||
|
|
||||||
|
def manage_software_alarm(self, alarm_id, alarm_state, entity_instance_id):
|
||||||
|
try:
|
||||||
|
if alarm_id not in constants.SOFTWARE_ALARMS:
|
||||||
|
raise Exception("Unknown software alarm '%s'." % alarm_id)
|
||||||
|
|
||||||
|
# deal with the alarm clear scenario
|
||||||
|
if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
|
||||||
|
LOG.info("Clearing alarm: %s for %s" % (alarm_id, entity_instance_id))
|
||||||
|
self.fm_api.clear_fault(alarm_id, entity_instance_id)
|
||||||
|
return
|
||||||
|
|
||||||
|
# if not clear alarm scenario, create the alarm
|
||||||
|
alarm_data = constants.SOFTWARE_ALARMS.get(alarm_id)
|
||||||
|
alarm = fm_api.Fault(
|
||||||
|
alarm_id=alarm_id,
|
||||||
|
alarm_state=alarm_state,
|
||||||
|
entity_type_id=alarm_data.get("entity_type_id"),
|
||||||
|
entity_instance_id=entity_instance_id,
|
||||||
|
severity=alarm_data.get("severity"),
|
||||||
|
reason_text=alarm_data.get("reason_text"),
|
||||||
|
alarm_type=alarm_data.get("alarm_type"),
|
||||||
|
probable_cause=alarm_data.get("probable_cause"),
|
||||||
|
proposed_repair_action=alarm_data.get("proposed_repair_action"),
|
||||||
|
service_affecting=alarm_data.get("service_affecting"),
|
||||||
|
)
|
||||||
|
LOG.info("Raising alarm: %s for %s" % (alarm_id, entity_instance_id))
|
||||||
|
self.fm_api.set_fault(alarm)
|
||||||
|
except Exception as e:
|
||||||
|
LOG.exception("Failed to manage alarm %s with action %s: %s" % (
|
||||||
|
alarm_id, alarm_state, str(e)
|
||||||
|
))
|
||||||
|
|
||||||
def handle_deploy_state_sync(self, alarm_instance_id):
|
def handle_deploy_state_sync(self, alarm_instance_id):
|
||||||
"""
|
"""
|
||||||
Handle the deploy state sync.
|
Handle the deploy state sync.
|
||||||
|
|
|
@ -1352,6 +1352,7 @@ def validate_host_state_to_deploy_host(hostname):
|
||||||
f"{states.DEPLOY_HOST_STATES.PENDING.value}")
|
f"{states.DEPLOY_HOST_STATES.PENDING.value}")
|
||||||
raise SoftwareServiceError(msg)
|
raise SoftwareServiceError(msg)
|
||||||
|
|
||||||
|
|
||||||
def deploy_host_validations(hostname):
|
def deploy_host_validations(hostname):
|
||||||
"""
|
"""
|
||||||
Check the conditions below:
|
Check the conditions below:
|
||||||
|
|
|
@ -192,7 +192,6 @@ class TestSoftwareFunction(unittest.TestCase):
|
||||||
self.assertEqual(val["commit_id"], r.commit_id)
|
self.assertEqual(val["commit_id"], r.commit_id)
|
||||||
self.assertEqual(val["checksum"], r.commit_checksum)
|
self.assertEqual(val["checksum"], r.commit_checksum)
|
||||||
|
|
||||||
|
|
||||||
@patch('software.db.api.SoftwareAPI')
|
@patch('software.db.api.SoftwareAPI')
|
||||||
def test_validate_host_state_to_deploy_host_raises_exception_if_deploy_host_state_is_wrong(self, software_api_mock):
|
def test_validate_host_state_to_deploy_host_raises_exception_if_deploy_host_state_is_wrong(self, software_api_mock):
|
||||||
# Arrange
|
# Arrange
|
||||||
|
@ -200,7 +199,7 @@ class TestSoftwareFunction(unittest.TestCase):
|
||||||
deploy_by_hostname = MagicMock(return_value={"state": deploy_host_state})
|
deploy_by_hostname = MagicMock(return_value={"state": deploy_host_state})
|
||||||
software_api_mock.return_value = MagicMock(get_deploy_host_by_hostname=deploy_by_hostname)
|
software_api_mock.return_value = MagicMock(get_deploy_host_by_hostname=deploy_by_hostname)
|
||||||
with self.assertRaises(SoftwareServiceError) as error:
|
with self.assertRaises(SoftwareServiceError) as error:
|
||||||
# Actions
|
# Actions
|
||||||
validate_host_state_to_deploy_host(hostname="abc")
|
validate_host_state_to_deploy_host(hostname="abc")
|
||||||
# Assertions
|
# Assertions
|
||||||
error_msg = "Host state is deployed and should be pending"
|
error_msg = "Host state is deployed and should be pending"
|
||||||
|
|
Loading…
Reference in New Issue