Create deploy host alarms

This commit creates two alarms for "deploy host": one raised when
the operation completes successfully, and one raised when it
fails.

This commit also fixes small tox issues.

Test Plan
PASS: run "deploy host" successfully and verify the alarm
PASS: run "deploy host" with failure and verify the alarm

Story: 2010676
Task: 49937

Depends-on: https://review.opendev.org/c/starlingx/fault/+/916786

Signed-off-by: Heitor Matsui <heitorvieira.matsui@windriver.com>
Change-Id: Ie1a32f725511658616f016d56de38cd04f1d9c5d
This commit is contained in:
Heitor Matsui 2024-04-22 17:41:10 -03:00
parent c5a7d1d336
commit c476c018da
4 changed files with 69 additions and 8 deletions

View File

@ -14,6 +14,7 @@ try:
except Exception:
pass
from fm_api import constants as fm_constants
from tsconfig.tsconfig import SW_VERSION
ADDRESS_VERSION_IPV4 = 4
@ -105,3 +106,26 @@ WORKER = 'worker'
AVAILABILITY_ONLINE = 'online'
ADMIN_LOCKED = 'locked'
# Definitions of the alarms managed by the software service, keyed by FM
# alarm id. Every entry shares the same entity type, severity, alarm type,
# probable cause and service-affecting flag; only the reason text and the
# proposed repair action differ per alarm.
SOFTWARE_ALARMS = {
    alarm_id: {
        "entity_type_id": fm_constants.FM_ENTITY_TYPE_HOST,
        "severity": fm_constants.FM_ALARM_SEVERITY_MAJOR,
        "reason_text": reason_text,
        "alarm_type": fm_constants.FM_ALARM_TYPE_11,
        "probable_cause": fm_constants.ALARM_PROBABLE_CAUSE_65,
        "proposed_repair_action": proposed_repair_action,
        "service_affecting": True,
    }
    for alarm_id, reason_text, proposed_repair_action in (
        (
            fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR,
            "Deploy host completed, unlock the host to apply "
            "the new software release",
            "Unlock host",
        ),
        (
            fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
            "Deploy host failed, check logs for errors",
            "Check the logs for errors, fix the issues manually "
            "and retry",
        ),
    )
}

View File

@ -564,11 +564,11 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
sc.hosts_lock.acquire()
try:
# NOTE(bqian) seems like trying to tolerant a failure situation
# NOTE(bqian) seems like trying to tolerate a failure situation
# that a host is directed to install a patch but during the installation
# software-controller-daemon gets restarted
# should remove the sc.hosts which is in memory volatile storage and replaced with
# armanent deploy-host entity
# permanent deploy-host entity
ip = addr[0]
if ip not in sc.hosts:
sc.hosts[ip] = AgentNeighbour(ip)
@ -585,9 +585,15 @@ class PatchMessageAgentInstallResp(messages.PatchMessage):
# where there could be 4 combinations
if self.status:
deploy_host_state.deployed()
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR,
fm_constants.FM_ALARM_STATE_SET,
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
return
elif self.reject_reason:
deploy_host_state.deploy_failed()
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
fm_constants.FM_ALARM_STATE_SET,
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
return
LOG.error("Bug: shouldn't reach here")
@ -1346,7 +1352,7 @@ class PatchController(PatchService):
# Get the release_id from the patch's metadata
# and check to see if it's already uploaded
release_id = get_release_from_patch(patch_file,'id')
release_id = get_release_from_patch(patch_file, 'id')
release = self.release_collection.get_release_by_id(release_id)
@ -2689,10 +2695,10 @@ class PatchController(PatchService):
if self._activate():
deploy_state.activate_completed()
msg_info += "Deployment has been activated\n"
msg_info += "Deployment has been activated.\n"
else:
deploy_state.activate_failed()
msg_error += "Dployment activation has failed.\n"
msg_error += "Deployment activation has failed.\n"
return dict(info=msg_info, warning=msg_warning, error=msg_error)
@ -2746,7 +2752,6 @@ class PatchController(PatchService):
msg_info += msg + "\n"
LOG.info(msg)
set_host_target_load(hostname, major_release)
# TODO(heitormatsui) update host deploy status
self.hosts_lock.acquire()
self.hosts[ip].install_pending = True
@ -2976,6 +2981,38 @@ class PatchController(PatchService):
func(*args, **kwargs)
self._update_state_to_peer()
def manage_software_alarm(self, alarm_id, alarm_state, entity_instance_id):
    """Set or clear one of the alarms declared in constants.SOFTWARE_ALARMS.

    This method never propagates an exception: any failure (including an
    unknown alarm id) is logged with its traceback and swallowed.

    :param alarm_id: key of the alarm in constants.SOFTWARE_ALARMS
    :param alarm_state: fm_constants.FM_ALARM_STATE_CLEAR clears the alarm;
        any other value is stored as the fault's state and the alarm is set
    :param entity_instance_id: instance the alarm applies to; the callers
        visible in this change pass "<host entity type>=<hostname>"
    """
    try:
        # Single lookup: doubles as the "is this a known alarm" check.
        alarm_data = constants.SOFTWARE_ALARMS.get(alarm_id)
        if alarm_data is None:
            raise Exception("Unknown software alarm '%s'." % alarm_id)

        # deal with the alarm clear scenario
        if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
            LOG.info("Clearing alarm: %s for %s", alarm_id, entity_instance_id)
            self.fm_api.clear_fault(alarm_id, entity_instance_id)
            return

        # if not clear alarm scenario, create the alarm
        alarm = fm_api.Fault(
            alarm_id=alarm_id,
            alarm_state=alarm_state,
            entity_type_id=alarm_data.get("entity_type_id"),
            entity_instance_id=entity_instance_id,
            severity=alarm_data.get("severity"),
            reason_text=alarm_data.get("reason_text"),
            alarm_type=alarm_data.get("alarm_type"),
            probable_cause=alarm_data.get("probable_cause"),
            proposed_repair_action=alarm_data.get("proposed_repair_action"),
            service_affecting=alarm_data.get("service_affecting"),
        )
        LOG.info("Raising alarm: %s for %s", alarm_id, entity_instance_id)
        self.fm_api.set_fault(alarm)
    except Exception as e:
        # Best effort by design: alarm handling must not break the caller.
        LOG.exception("Failed to manage alarm %s with action %s: %s",
                      alarm_id, alarm_state, e)
def handle_deploy_state_sync(self, alarm_instance_id):
"""
Handle the deploy state sync.

View File

@ -1352,6 +1352,7 @@ def validate_host_state_to_deploy_host(hostname):
f"{states.DEPLOY_HOST_STATES.PENDING.value}")
raise SoftwareServiceError(msg)
def deploy_host_validations(hostname):
"""
Check the conditions below:

View File

@ -192,7 +192,6 @@ class TestSoftwareFunction(unittest.TestCase):
self.assertEqual(val["commit_id"], r.commit_id)
self.assertEqual(val["checksum"], r.commit_checksum)
@patch('software.db.api.SoftwareAPI')
def test_validate_host_state_to_deploy_host_raises_exception_if_deploy_host_state_is_wrong(self, software_api_mock):
# Arrange
@ -200,7 +199,7 @@ class TestSoftwareFunction(unittest.TestCase):
deploy_by_hostname = MagicMock(return_value={"state": deploy_host_state})
software_api_mock.return_value = MagicMock(get_deploy_host_by_hostname=deploy_by_hostname)
with self.assertRaises(SoftwareServiceError) as error:
# Actions
# Actions
validate_host_state_to_deploy_host(hostname="abc")
# Assertions
error_msg = "Host state is deployed and should be pending"