From 979bd27d90e3d7d4bc345cc56729ee009f5e950a Mon Sep 17 00:00:00 2001 From: junfeng-li Date: Fri, 12 Apr 2024 20:13:03 +0000 Subject: [PATCH] Raise deploy state out of sync alarm This commit raises the deploy state out of sync alarm when the deploy state in the software.json files on the two controllers differs. The deploy state is checked every 30 seconds during the deploying stage. If the states are in sync, the alarm is cleared. Depends-on: https://review.opendev.org/c/starlingx/fault/+/913581 Test Plan: PASS: the alarm is raised when the state is out of sync in both DX and SX PASS: the alarm is cleared when the state is in sync in both DX and SX Task: 49737 Story: 2010676 Change-Id: Ic31c7166135d03591fa4696445783895254dfc95 Signed-off-by: junfeng-li --- .zuul.yaml | 2 + software/debian/deb_folder/control | 4 +- software/software/constants.py | 6 ++ software/software/software_controller.py | 89 ++++++++++++++++++- software/software/software_functions.py | 36 ++++++++ software/software/tests/base.py | 4 + .../tests/test_software_controller.py | 47 +++++++++- software/software/utils.py | 20 +++++ software/tox.ini | 5 +- 9 files changed, 208 insertions(+), 5 deletions(-) create mode 100644 software/software/tests/base.py diff --git a/.zuul.yaml b/.zuul.yaml index e1ed9252..206ca5a1 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -53,6 +53,7 @@ nodeset: debian-bullseye required-projects: - starlingx/config + - starlingx/fault files: - software/* vars: @@ -66,6 +67,7 @@ nodeset: debian-bullseye required-projects: - starlingx/config + - starlingx/fault files: - software/* vars: diff --git a/software/debian/deb_folder/control b/software/debian/deb_folder/control index 334bc6e0..bc1d6302 100644 --- a/software/debian/deb_folder/control +++ b/software/debian/deb_folder/control @@ -9,6 +9,7 @@ Build-Depends: debhelper-compat (= 13), python3-wheel, build-info-dev Build-Depends-Indep: + python3-fm-api, python3-keystonemiddleware, python3-oslo.config Standards-Version: 
4.5.1 @@ -20,7 +21,8 @@ Architecture: all Depends: ${python3:Depends}, ${misc:Depends}, gir1.2-ostree-1.0, - python3-argcomplete + python3-argcomplete, + python3-fm-api, Description: StarlingX unified software deployment and management StarlingX unified software deployment and management. diff --git a/software/software/constants.py b/software/software/constants.py index a7859563..f20e44ff 100644 --- a/software/software/constants.py +++ b/software/software/constants.py @@ -179,6 +179,12 @@ WORKER_SUMMARY_DIR = "%s/summary" % SOFTWARE_STORAGE_DIR WORKER_DATETIME_FORMAT = "%Y%m%dT%H%M%S%f" UNKNOWN_SOFTWARE_VERSION = "0.0.0" +LAST_IN_SYNC = "last_in_sync" + +SYSTEM_MODE_SIMPLEX = "simplex" +SYSTEM_MODE_DUPLEX = "duplex" + + class DEPLOY_STATES(Enum): ACTIVATE = 'activate' diff --git a/software/software/software_controller.py b/software/software/software_controller.py index 241adeed..7056e7e7 100644 --- a/software/software/software_controller.py +++ b/software/software/software_controller.py @@ -23,6 +23,10 @@ import threading import time from wsgiref import simple_server +from fm_api import fm_api +from fm_api import constants as fm_constants + + from oslo_config import cfg as oslo_cfg import software.apt_utils as apt_utils @@ -63,6 +67,8 @@ from software.software_functions import LOG from software.software_functions import audit_log_info from software.software_functions import repo_root_dir from software.software_functions import ReleaseData +from software.software_functions import is_deploy_state_in_sync +from software.software_functions import is_deployment_in_progress from software.release_verify import verify_files import software.config as cfg import software.utils as utils @@ -786,6 +792,11 @@ class PatchController(PatchService): self.check_patch_states() self.base_pkgdata = BasePackageData() + # This is for alarm cache. 
It caches whether the deploy state was in sync at the last check + self.usm_alarm = {constants.LAST_IN_SYNC: False} + self.hostname = socket.gethostname() + self.fm_api = fm_api.FaultAPIs() + self.allow_insvc_patching = True if os.path.exists(app_dependency_filename): @@ -802,6 +813,14 @@ class PatchController(PatchService): else: self.write_state_file() + system_mode = utils.get_platform_conf("system_mode") + if system_mode == constants.SYSTEM_MODE_SIMPLEX: + self.standby_controller = "controller-0" + else: # any duplex variant, e.g. "duplex" or "duplex-direct" + self.standby_controller = "controller-0" \ + if self.hostname == "controller-1" \ + else "controller-1" + @property def release_collection(self): # for this stage, the SWReleaseCollection behaves as a broker which @@ -3062,6 +3081,65 @@ class PatchController(PatchService): func(*args, **kwargs) self._update_state_to_peer() + def handle_deploy_state_sync(self, alarm_instance_id): + """ + Raise or clear the deploy state out of sync alarm for alarm_instance_id. + If deploy state is in sync, clear the alarm. + If not, raise the alarm. 
+ """ + is_in_sync = is_deploy_state_in_sync() + + # Deploy in sync state is not changed, no need to update the alarm + if is_in_sync == self.usm_alarm.get(constants.LAST_IN_SYNC): + return + + try: + out_of_sync_alarm_fault = self.fm_api.get_fault( + fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC, alarm_instance_id) + + LOG.info("software.json in sync: %s", is_in_sync) + + if out_of_sync_alarm_fault and is_in_sync: + # There was an out of sync alarm raised, but local software.json is in sync, + # we clear the alarm + LOG.info("Clearing alarm: %s ", out_of_sync_alarm_fault.alarm_id) + self.fm_api.clear_fault( + fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC, + alarm_instance_id) + + # Deploy in sync state is changed, update the cache + self.usm_alarm[constants.LAST_IN_SYNC] = is_in_sync + + elif (not out_of_sync_alarm_fault) and (not is_in_sync): + # There was no out of sync alarm raised, but local software.json is not in sync, + # we raise the alarm + LOG.info("Raising alarm: %s ", + fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC) + out_of_sync_fault = fm_api.Fault( + alarm_id=fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, + entity_instance_id=alarm_instance_id, + severity=fm_constants.FM_ALARM_SEVERITY_MAJOR, + reason_text="Software deployment in progress", + alarm_type=fm_constants.FM_ALARM_TYPE_11, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65, + proposed_repair_action="Wait for deployment to complete", + service_affecting=False + ) + + self.fm_api.set_fault(out_of_sync_fault) + + # Deploy in sync state is changed, update the cache + self.usm_alarm[constants.LAST_IN_SYNC] = is_in_sync + + else: + # The alarm already matches the sync state, only the cache was stale + self.usm_alarm[constants.LAST_IN_SYNC] = is_in_sync + + except Exception as ex: + LOG.exception("Failed in handling deploy state sync. 
Error: %s", str(ex)) + def _get_software_upgrade(self): """ Get the current software upgrade from/to versions and state @@ -3244,7 +3322,15 @@ class PatchControllerMainThread(threading.Thread): # We only can use one inverval SEND_MSG_INTERVAL_IN_SECONDS = 30.0 + alarm_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, + sc.standby_controller) + try: + # Update the out of sync alarm cache when the thread starts + out_of_sync_alarm_fault = sc.fm_api.get_fault( + fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC, alarm_instance_id) + sc.usm_alarm[constants.LAST_IN_SYNC] = not out_of_sync_alarm_fault + sock_in = sc.setup_socket() while sock_in is None: @@ -3445,11 +3531,12 @@ class PatchControllerMainThread(threading.Thread): SEND_MSG_INTERVAL_IN_SECONDS) # Only send the deploy state update from the active controller - if utils.is_active_controller(): + if is_deployment_in_progress(sc.release_data.metadata) and utils.is_active_controller(): try: sc.socket_lock.acquire() deploy_state_update = SoftwareMessageDeployStateUpdate() deploy_state_update.send(sc.sock_out) + sc.handle_deploy_state_sync(alarm_instance_id) except Exception as e: LOG.exception("Failed to send deploy state update. 
Error: %s", str(e)) + finally: diff --git a/software/software/software_functions.py b/software/software/software_functions.py index 013c3993..0bff6986 100644 --- a/software/software/software_functions.py +++ b/software/software/software_functions.py @@ -1287,3 +1287,39 @@ def parse_release_metadata(filename): continue data[child.tag] = child.text return data + + +def is_deploy_state_in_sync(): + """ + Check if the working and synced software.json hold the same deploy state + :return: bool true if both files exist and their deploy and deploy_host data match, false otherwise + """ + if os.path.isfile(constants.SOFTWARE_JSON_FILE) \ + and os.path.isfile(constants.SYNCED_SOFTWARE_JSON_FILE): + + working_data_deploy_state = utils.load_from_json_file( + constants.SOFTWARE_JSON_FILE) + + synced_data_deploy_state = utils.load_from_json_file( + constants.SYNCED_SOFTWARE_JSON_FILE) + + working_deploy_state = working_data_deploy_state.get("deploy", {}) + + synced_deploy_state = synced_data_deploy_state.get("deploy", {}) + + working_deploy_host_state = working_data_deploy_state.get("deploy_host", {}) + + synced_deploy_host_state = synced_data_deploy_state.get("deploy_host", {}) + + return working_deploy_state == synced_deploy_state \ + and working_deploy_host_state == synced_deploy_host_state + return False + + +def is_deployment_in_progress(release_metadata): + """ + Check if at least one deployment is in progress + :param release_metadata: dict whose values are per-release metadata dicts + :return: bool true if any release state equals constants.DEPLOYING, false otherwise + """ + return any(release.get('state') == constants.DEPLOYING for release in release_metadata.values()) diff --git a/software/software/tests/base.py b/software/software/tests/base.py new file mode 100644 index 00000000..5d86a574 --- /dev/null +++ b/software/software/tests/base.py @@ -0,0 +1,4 @@ +import sys +from unittest import mock + +sys.modules['fm_core'] = mock.Mock() diff --git a/software/software/tests/test_software_controller.py b/software/software/tests/test_software_controller.py index 8ae947df..dd1a3729 100644 --- 
a/software/software/tests/test_software_controller.py +++ b/software/software/tests/test_software_controller.py @@ -3,6 +3,10 @@ # # Copyright (c) 2023-2024 Wind River Systems, Inc. # + +# This import has to be first +from software.tests import base # pylint: disable=unused-import + from software.software_controller import PatchController from software.software_controller import ReleaseValidationFailure import unittest @@ -144,8 +148,12 @@ class TestSoftwareController(unittest.TestCase): @patch('software.software_controller.os.path.isfile') @patch('software.software_controller.json.load') @patch('software.software_controller.open', new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_in_sync_controller_api_files_identical(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, mock_isfile): controller = PatchController() @@ -159,8 +167,12 @@ class TestSoftwareController(unittest.TestCase): @patch('software.software_controller.os.path.isfile') @patch('software.software_controller.json.load') @patch('software.software_controller.open', new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_in_sync_controller_api_files_not_identical(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, mock_isfile): controller = PatchController() @@ -174,8 +186,12 @@ class TestSoftwareController(unittest.TestCase): @patch('software.software_controller.os.path.isfile') @patch('software.software_controller.json.load') @patch('software.software_controller.open', 
new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_in_sync_controller_api_files_not_exist(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, # pylint: disable=unused-argument mock_isfile): controller = PatchController() @@ -188,8 +204,12 @@ class TestSoftwareController(unittest.TestCase): @patch('software.software_controller.os.path.isfile') @patch('software.software_controller.json.load') @patch('software.software_controller.open', new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_in_sync_controller_api_one_file_exist(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, # pylint: disable=unused-argument mock_isfile): controller = PatchController() @@ -204,8 +224,12 @@ class TestSoftwareController(unittest.TestCase): @patch('software.software_controller.json.load') @patch('software.software_controller.open', new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_get_software_host_upgrade_deployed(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, # pylint: disable=unused-argument ): controller = PatchController() @@ -229,8 +253,12 @@ class TestSoftwareController(unittest.TestCase): @patch('software.software_controller.json.load') @patch('software.software_controller.open', 
new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_get_software_host_upgrade_deploying(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, # pylint: disable=unused-argument ): controller = PatchController() @@ -254,8 +282,12 @@ class TestSoftwareController(unittest.TestCase): @patch('software.software_controller.json.load') @patch('software.software_controller.open', new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_get_all_software_host_upgrade_deploying(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, # pylint: disable=unused-argument ): controller = PatchController() @@ -284,22 +316,31 @@ class TestSoftwareController(unittest.TestCase): @patch('software.software_controller.json.load') @patch('software.software_controller.open', new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_get_software_host_upgrade_none_state(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, # pylint: disable=unused-argument ): controller = PatchController() # Test when the deploy or deploy_hosts is None - controller._get_software_upgrade = MagicMock(return_value=None) # pylint: disable=protected-access + controller._get_software_upgrade = MagicMock( # pylint: disable=protected-access + 
return_value=None) controller.db_api_instance.get_deploy_host.return_value = None result = controller.get_one_software_host_upgrade("host1") self.assertIsNone(result) @patch('software.software_controller.json.load') @patch('software.software_controller.open', new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_get_software_upgrade_get_deploy_all(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, # pylint: disable=unused-argument ): @@ -329,8 +370,12 @@ class TestSoftwareController(unittest.TestCase): @patch('software.software_controller.json.load') @patch('software.software_controller.open', new_callable=mock_open) + @patch('software.software_controller.utils.get_platform_conf', return_value='simplex') + @patch('software.software_controller.open', new_callable=mock_open) def test_get_software_upgrade_get_deploy_all_none(self, + mock_dummy_open_config, # pylint: disable=unused-argument mock_dummy, # pylint: disable=unused-argument + mock_dummy_open, # pylint: disable=unused-argument mock_json_load, # pylint: disable=unused-argument ): diff --git a/software/software/utils.py b/software/software/utils.py index b12c3320..618104fc 100644 --- a/software/software/utils.py +++ b/software/software/utils.py @@ -22,6 +22,7 @@ import webob import software.constants as constants from software.exceptions import StateValidationFailure from software.exceptions import SoftwareServiceError +from tsconfig.tsconfig import PLATFORM_CONF_FILE LOG = logging.getLogger('main_logger') @@ -439,3 +440,22 @@ def is_active_controller(): keyring_file = f"/opt/platform/.keyring/{constants.SW_VERSION}/.CREDENTIAL" return os.path.exists(keyring_file) + + +def get_platform_conf(key): + """ + Get the value of given key in platform.conf 
+ :param key: exact key name found on the left of '=' + :return: value with surrounding whitespace stripped, or None if the key is absent + """ + value = None + + with open(PLATFORM_CONF_FILE) as fp: + for line in fp: + # Match the whole key left of the first '=', not a substring + name, sep, raw_value = line.partition('=') + if sep and name.strip() == key: + value = raw_value.strip() + break + + return value diff --git a/software/tox.ini b/software/tox.ini index 15e67227..2c955e2b 100644 --- a/software/tox.ini +++ b/software/tox.ini @@ -16,10 +16,11 @@ allowlist_externals = find basepython = python3 deps = -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt + -e{[tox]stxdir}/fault/fm-api/source -e{[tox]stxdir}/config/tsconfig/tsconfig -install_command = pip install \ - -c{env:UPPER_CONSTRAINTS_FILE:https://opendev.org/starlingx/root/raw/branch/master/build-tools/requirements/debian/upper-constraints.txt} \ +install_command = pip install -v -v -v \ + -c {env:UPPER_CONSTRAINTS_FILE:https://opendev.org/starlingx/root/raw/branch/master/build-tools/requirements/debian/upper-constraints.txt} \ {opts} {packages} passenv = XDG_CACHE_HOME