Raise deploy state out of sync alarm

This commit raises the deploy state out-of-sync alarm
when the deploy state recorded in the software.json files
differs between the two controllers.

The deploy state is checked every 30 seconds during the
deploying stage. If the states are in sync, the alarm is cleared.

Depends-on: https://review.opendev.org/c/starlingx/fault/+/913581

Test Plan:

PASS: the alarm is raised when the state is out of sync
          in both DX and SX
PASS: the alarm is cleared when the state is in sync in
          both DX and SX

Task: 49737
Story: 2010676
Change-Id: Ic31c7166135d03591fa4696445783895254dfc95
Signed-off-by: junfeng-li <junfeng.li@windriver.com>
This commit is contained in:
junfeng-li 2024-04-12 20:13:03 +00:00
parent 4b433c3d46
commit 979bd27d90
9 changed files with 208 additions and 5 deletions

View File

@ -53,6 +53,7 @@
nodeset: debian-bullseye
required-projects:
- starlingx/config
- starlingx/fault
files:
- software/*
vars:
@ -66,6 +67,7 @@
nodeset: debian-bullseye
required-projects:
- starlingx/config
- starlingx/fault
files:
- software/*
vars:

View File

@ -9,6 +9,7 @@ Build-Depends: debhelper-compat (= 13),
python3-wheel,
build-info-dev
Build-Depends-Indep:
python3-fm-api,
python3-keystonemiddleware,
python3-oslo.config
Standards-Version: 4.5.1
@ -20,7 +21,8 @@ Architecture: all
Depends: ${python3:Depends},
${misc:Depends},
gir1.2-ostree-1.0,
python3-argcomplete
python3-argcomplete,
python3-fm-api,
Description: StarlingX unified software deployment and management
StarlingX unified software deployment and management.

View File

@ -179,6 +179,12 @@ WORKER_SUMMARY_DIR = "%s/summary" % SOFTWARE_STORAGE_DIR
WORKER_DATETIME_FORMAT = "%Y%m%dT%H%M%S%f"
UNKNOWN_SOFTWARE_VERSION = "0.0.0"
LAST_IN_SYNC = "last_in_sync"
SYSTEM_MODE_SIMPLEX = "simplex"
SYSTEM_MODE_DUPLEX = "duplex"
class DEPLOY_STATES(Enum):
ACTIVATE = 'activate'

View File

@ -23,6 +23,10 @@ import threading
import time
from wsgiref import simple_server
from fm_api import fm_api
from fm_api import constants as fm_constants
from oslo_config import cfg as oslo_cfg
import software.apt_utils as apt_utils
@ -63,6 +67,8 @@ from software.software_functions import LOG
from software.software_functions import audit_log_info
from software.software_functions import repo_root_dir
from software.software_functions import ReleaseData
from software.software_functions import is_deploy_state_in_sync
from software.software_functions import is_deployment_in_progress
from software.release_verify import verify_files
import software.config as cfg
import software.utils as utils
@ -786,6 +792,11 @@ class PatchController(PatchService):
self.check_patch_states()
self.base_pkgdata = BasePackageData()
# Alarm cache: remembers whether the deploy state was in sync at the last check
self.usm_alarm = {constants.LAST_IN_SYNC: False}
self.hostname = socket.gethostname()
self.fm_api = fm_api.FaultAPIs()
self.allow_insvc_patching = True
if os.path.exists(app_dependency_filename):
@ -802,6 +813,14 @@ class PatchController(PatchService):
else:
self.write_state_file()
system_mode = utils.get_platform_conf("system_mode")
if system_mode == constants.SYSTEM_MODE_SIMPLEX:
self.standby_controller = "controller-0"
elif system_mode == constants.SYSTEM_MODE_DUPLEX:
self.standby_controller = "controller-0" \
if self.hostname == "controller-1" \
else "controller-1"
@property
def release_collection(self):
# for this stage, the SWReleaseCollection behaves as a broker which
@ -3062,6 +3081,65 @@ class PatchController(PatchService):
func(*args, **kwargs)
self._update_state_to_peer()
def handle_deploy_state_sync(self, alarm_instance_id):
    """
    Handle the deploy state sync.
    If deploy state is in sync, clear the alarm.
    If not, raise the alarm.

    The last observed in-sync state is cached in self.usm_alarm so that
    the fault manager is only contacted when that state changes.

    :param alarm_instance_id: entity instance id the out of sync alarm is
        queried, raised and cleared against
    """
    is_in_sync = is_deploy_state_in_sync()

    # Deploy in sync state is not changed, no need to update the alarm
    if is_in_sync == self.usm_alarm.get(constants.LAST_IN_SYNC):
        return

    alarm_id = fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC
    try:
        # Query through this controller's own FM client, the same one used
        # for clear_fault/set_fault below, instead of the module-level "sc"
        out_of_sync_alarm_fault = self.fm_api.get_fault(alarm_id, alarm_instance_id)

        LOG.info("software.json in sync: %s", is_in_sync)

        if out_of_sync_alarm_fault and is_in_sync:
            # There was an out of sync alarm raised, but local software.json is in sync,
            # we clear the alarm
            LOG.info("Clearing alarm: %s ", out_of_sync_alarm_fault.alarm_id)
            self.fm_api.clear_fault(alarm_id, alarm_instance_id)
        elif (not out_of_sync_alarm_fault) and (not is_in_sync):
            # There was no out of sync alarm raised, but local software.json is not in sync,
            # we raise the alarm
            LOG.info("Raising alarm: %s ", alarm_id)
            out_of_sync_fault = fm_api.Fault(
                alarm_id=alarm_id,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
                entity_instance_id=alarm_instance_id,
                severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
                reason_text="Software deployment in progress",
                alarm_type=fm_constants.FM_ALARM_TYPE_11,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
                proposed_repair_action="Wait for deployment to complete",
                service_affecting=False
            )
            self.fm_api.set_fault(out_of_sync_fault)
        else:
            # The alarm already reflects the current sync state; only the
            # cache disagreed. Log it, then fall through to refresh the
            # cache so this is not re-queried and re-logged on every check.
            LOG.error("Unexpected case in handling deploy state sync. ")

        # Deploy in sync state is changed, update the cache
        self.usm_alarm[constants.LAST_IN_SYNC] = is_in_sync
    except Exception as ex:
        # Best effort: a fault manager failure must not break the caller's
        # periodic loop; the cache is left untouched so the next check retries
        LOG.exception("Failed in handling deploy state sync. Error: %s", str(ex))
def _get_software_upgrade(self):
"""
Get the current software upgrade from/to versions and state
@ -3244,7 +3322,15 @@ class PatchControllerMainThread(threading.Thread):
# Only one interval can be used
SEND_MSG_INTERVAL_IN_SECONDS = 30.0
alarm_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
sc.standby_controller)
try:
# Update the out of sync alarm cache when the thread starts
out_of_sync_alarm_fault = sc.fm_api.get_fault(
fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC, alarm_instance_id)
sc.usm_alarm[constants.LAST_IN_SYNC] = not out_of_sync_alarm_fault
sock_in = sc.setup_socket()
while sock_in is None:
@ -3445,11 +3531,12 @@ class PatchControllerMainThread(threading.Thread):
SEND_MSG_INTERVAL_IN_SECONDS)
# Only send the deploy state update from the active controller
if utils.is_active_controller():
if is_deployment_in_progress(sc.release_data.metadata) and utils.is_active_controller():
try:
sc.socket_lock.acquire()
deploy_state_update = SoftwareMessageDeployStateUpdate()
deploy_state_update.send(sc.sock_out)
sc.handle_deploy_state_sync(alarm_instance_id)
except Exception as e:
LOG.exception("Failed to send deploy state update. Error: %s", str(e))
finally:

View File

@ -1287,3 +1287,39 @@ def parse_release_metadata(filename):
continue
data[child.tag] = child.text
return data
def is_deploy_state_in_sync():
    """
    Compare the deploy data of the working and the synced software.json.

    Only the "deploy" and "deploy_host" sections are compared; a section
    missing from a file is treated as an empty dict.

    :return: bool true if both files exist and both sections are equal,
        false otherwise
    """
    both_files_present = os.path.isfile(constants.SOFTWARE_JSON_FILE) \
        and os.path.isfile(constants.SYNCED_SOFTWARE_JSON_FILE)
    if not both_files_present:
        return False

    working = utils.load_from_json_file(constants.SOFTWARE_JSON_FILE)
    synced = utils.load_from_json_file(constants.SYNCED_SOFTWARE_JSON_FILE)

    for section in ("deploy", "deploy_host"):
        if working.get(section, {}) != synced.get(section, {}):
            return False
    return True
def is_deployment_in_progress(release_metadata):
    """
    Tell whether any release is currently in the deploying state.

    :param release_metadata: dict of release metadata; each value must
        provide a 'state' entry
    :return: bool true if at least one release is deploying, false otherwise
    """
    for release in release_metadata.values():
        if release['state'] == constants.DEPLOYING:
            return True
    return False

View File

@ -0,0 +1,4 @@
# Register a Mock under the module name 'fm_core' so that any later
# "import fm_core" resolves to the mock instead of a real module.
# NOTE(review): presumably fm_api imports fm_core, which is not installable
# in the unit test environment - confirm.
import sys
from unittest import mock
sys.modules['fm_core'] = mock.Mock()

View File

@ -3,6 +3,10 @@
#
# Copyright (c) 2023-2024 Wind River Systems, Inc.
#
# This import has to be first
from software.tests import base # pylint: disable=unused-import
from software.software_controller import PatchController
from software.software_controller import ReleaseValidationFailure
import unittest
@ -144,8 +148,12 @@ class TestSoftwareController(unittest.TestCase):
@patch('software.software_controller.os.path.isfile')
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_in_sync_controller_api_files_identical(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load,
mock_isfile):
controller = PatchController()
@ -159,8 +167,12 @@ class TestSoftwareController(unittest.TestCase):
@patch('software.software_controller.os.path.isfile')
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_in_sync_controller_api_files_not_identical(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load,
mock_isfile):
controller = PatchController()
@ -174,8 +186,12 @@ class TestSoftwareController(unittest.TestCase):
@patch('software.software_controller.os.path.isfile')
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_in_sync_controller_api_files_not_exist(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load, # pylint: disable=unused-argument
mock_isfile):
controller = PatchController()
@ -188,8 +204,12 @@ class TestSoftwareController(unittest.TestCase):
@patch('software.software_controller.os.path.isfile')
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_in_sync_controller_api_one_file_exist(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load, # pylint: disable=unused-argument
mock_isfile):
controller = PatchController()
@ -204,8 +224,12 @@ class TestSoftwareController(unittest.TestCase):
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_get_software_host_upgrade_deployed(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load, # pylint: disable=unused-argument
):
controller = PatchController()
@ -229,8 +253,12 @@ class TestSoftwareController(unittest.TestCase):
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_get_software_host_upgrade_deploying(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load, # pylint: disable=unused-argument
):
controller = PatchController()
@ -254,8 +282,12 @@ class TestSoftwareController(unittest.TestCase):
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_get_all_software_host_upgrade_deploying(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load, # pylint: disable=unused-argument
):
controller = PatchController()
@ -284,22 +316,31 @@ class TestSoftwareController(unittest.TestCase):
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_get_software_host_upgrade_none_state(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load, # pylint: disable=unused-argument
):
controller = PatchController()
# Test when the deploy or deploy_hosts is None
controller._get_software_upgrade = MagicMock(return_value=None) # pylint: disable=protected-access
controller._get_software_upgrade = MagicMock( # pylint: disable=protected-access
return_value=None)
controller.db_api_instance.get_deploy_host.return_value = None
result = controller.get_one_software_host_upgrade("host1")
self.assertIsNone(result)
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_get_software_upgrade_get_deploy_all(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load, # pylint: disable=unused-argument
):
@ -329,8 +370,12 @@ class TestSoftwareController(unittest.TestCase):
@patch('software.software_controller.json.load')
@patch('software.software_controller.open', new_callable=mock_open)
@patch('software.software_controller.utils.get_platform_conf', return_value='simplex')
@patch('software.software_controller.open', new_callable=mock_open)
def test_get_software_upgrade_get_deploy_all_none(self,
mock_dummy_open_config, # pylint: disable=unused-argument
mock_dummy, # pylint: disable=unused-argument
mock_dummy_open, # pylint: disable=unused-argument
mock_json_load, # pylint: disable=unused-argument
):

View File

@ -22,6 +22,7 @@ import webob
import software.constants as constants
from software.exceptions import StateValidationFailure
from software.exceptions import SoftwareServiceError
from tsconfig.tsconfig import PLATFORM_CONF_FILE
LOG = logging.getLogger('main_logger')
@ -439,3 +440,22 @@ def is_active_controller():
keyring_file = f"/opt/platform/.keyring/{constants.SW_VERSION}/.CREDENTIAL"
return os.path.exists(keyring_file)
def get_platform_conf(key, conf_file=None):
    """
    Get the value of given key in platform.conf

    Lines are expected in "key=value" form. The key must match the text
    before the first '=' exactly (surrounding whitespace ignored), and the
    value is everything after the first '=' with surrounding whitespace
    removed. Lines without '=' are skipped.

    :param key: key to get
    :param conf_file: path of the file to read; defaults to
        PLATFORM_CONF_FILE when not given
    :return: value of the first matching line, or None if the key is absent
    """
    if conf_file is None:
        conf_file = PLATFORM_CONF_FILE

    with open(conf_file) as fp:
        for line in fp:
            # partition (not split) keeps any '=' that is part of the value
            # and never raises on a line that has no '=' at all
            name, separator, value = line.partition('=')
            if separator and name.strip() == key:
                return value.strip()
    return None

View File

@ -16,10 +16,11 @@ allowlist_externals = find
basepython = python3
deps = -r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt
-e{[tox]stxdir}/fault/fm-api/source
-e{[tox]stxdir}/config/tsconfig/tsconfig
install_command = pip install \
-c{env:UPPER_CONSTRAINTS_FILE:https://opendev.org/starlingx/root/raw/branch/master/build-tools/requirements/debian/upper-constraints.txt} \
install_command = pip install -v -v -v \
-c {env:UPPER_CONSTRAINTS_FILE:https://opendev.org/starlingx/root/raw/branch/master/build-tools/requirements/debian/upper-constraints.txt} \
{opts} {packages}
passenv =
XDG_CACHE_HOME