Merge "Ignore certain alarms during kube upgrade pre-check"

This commit is contained in:
Zuul 2023-05-04 21:28:23 +00:00 committed by Gerrit Code Review
commit 4af71c2e29
2 changed files with 50 additions and 1 deletions

View File

@ -14,6 +14,12 @@ from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
# The following alarms can occur during a VIM-orchestrated k8s upgrade on the
# subcloud. Ignoring them allows a subcloud k8s upgrade to be retried
# after a failure using the DC orchestrator.
ALARM_IGNORE_LIST = ['100.003', '200.001', '700.004', '750.006',
'900.007', '900.401']
class KubeUpgradePreCheckState(BaseState):
"""Perform pre check operations to determine if kube upgrade is required"""
@ -46,6 +52,22 @@ class KubeUpgradePreCheckState(BaseState):
system_health)
if not fails or (len(fails) == 1 and failed_alarm_check and no_mgmt_alarms):
self.info_log(strategy_step, "Kubernetes upgrade health check passed.")
elif (len(fails) == 1 and failed_alarm_check):
alarms = self.get_fm_client(self.region_name).get_alarms()
for alarm in alarms:
if alarm.alarm_id not in ALARM_IGNORE_LIST:
if alarm.mgmt_affecting == "True":
error_desc_msg = ("Kubernetes upgrade health check failed due to alarm %s. "
"Kubernetes upgrade health: \n %s" %
(alarm.alarm_id, system_health))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_desc_msg)
self.error_log(strategy_step, "\n" + system_health)
raise Exception(("Kubernetes upgrade health check failed due to alarm %s. "
"Please run 'system health-query-kube-upgrade' "
"command on the subcloud or %s on central for details." %
(alarm.alarm_id, ERROR_DESC_CMD)))
else:
error_desc_msg = ("Kubernetes upgrade health check failed. \n %s" %
system_health)

View File

@ -14,6 +14,7 @@ from dcmanager.common.consts import STRATEGY_STATE_KUBE_UPGRADE_PRE_CHECK
from dcmanager.db.sqlalchemy import api as db_api
from dcmanager.tests.unit.common import fake_strategy
from dcmanager.tests.unit.orchestrator.states.fakes import FakeAlarm
from dcmanager.tests.unit.orchestrator.states.fakes import FakeKubeUpgrade
from dcmanager.tests.unit.orchestrator.states.fakes import FakeKubeVersion
from dcmanager.tests.unit.orchestrator.states.fakes \
@ -58,6 +59,10 @@ KUBERNETES_UPGRADE_HEALTH_RESPONSE_NON_MGMT_AFFECTING_ALARM = \
"All kubernetes control plane pods are ready: [OK]\n" \
"All kubernetes applications are in a valid state: [OK]"
MEMORY_THRESHOLD_ALARM = FakeAlarm('100.101', 'True')
KUBERNETES_UPGRADE_ALARM = FakeAlarm('900.007', 'True')
CEPH_ALARM = FakeAlarm('250.002', 'False')
class TestKubeUpgradePreCheckStage(TestKubeUpgradeState):
@ -76,6 +81,7 @@ class TestKubeUpgradePreCheckStage(TestKubeUpgradeState):
self.sysinv_client.get_kube_upgrades = mock.MagicMock()
self.sysinv_client.get_kube_upgrades.return_value = []
self.fm_client.get_alarms = mock.MagicMock()
self.sysinv_client.get_kube_upgrade_health = mock.MagicMock()
self.sysinv_client.get_kube_upgrade_health.return_value = KUBERNETES_UPGRADE_HEALTH_RESPONSE_SUCCESS
@ -119,6 +125,7 @@ class TestKubeUpgradePreCheckStage(TestKubeUpgradeState):
self.subcloud.id,
deploy_status=DEPLOY_STATE_DONE)
self.fm_client.get_alarms.return_value = [MEMORY_THRESHOLD_ALARM, KUBERNETES_UPGRADE_ALARM]
self.sysinv_client.get_kube_upgrade_health.return_value = KUBERNETES_UPGRADE_HEALTH_RESPONSE_MGMT_AFFECTING_ALARM
self.sysinv_client.get_kube_upgrades.return_value = [FakeKubeUpgrade()]
self.sysinv_client.get_kube_versions.return_value = [
@ -129,10 +136,30 @@ class TestKubeUpgradePreCheckStage(TestKubeUpgradeState):
]
self.worker.perform_state_action(self.strategy_step)
self.sysinv_client.get_kube_upgrade_health.assert_called_once()
self.assert_step_updated(self.strategy_step.subcloud_id,
STRATEGY_STATE_FAILED)
def test_pre_check_subcloud_failed_health_check_with_allowed_management_alarms(self):
"""Test pre check step where the reported alarms must not block the upgrade

The health query returns the management affecting alarm response, but the
alarms returned by the FM client are either in ALARM_IGNORE_LIST or not
flagged 'True', so the step is expected to advance to creating the VIM
kube upgrade strategy instead of failing.
"""
db_api.subcloud_update(self.ctx,
self.subcloud.id,
deploy_status=DEPLOY_STATE_DONE)
# CEPH_ALARM is FakeAlarm('250.002', 'False') and KUBERNETES_UPGRADE_ALARM is
# FakeAlarm('900.007', 'True'); '900.007' is listed in ALARM_IGNORE_LIST.
# NOTE(review): presumably the second FakeAlarm argument is mgmt_affecting --
# confirm against the FakeAlarm definition in states/fakes.py.
self.fm_client.get_alarms.return_value = [CEPH_ALARM, KUBERNETES_UPGRADE_ALARM]
self.sysinv_client.get_kube_upgrade_health.return_value = KUBERNETES_UPGRADE_HEALTH_RESPONSE_MGMT_AFFECTING_ALARM
self.sysinv_client.get_kube_upgrades.return_value = [FakeKubeUpgrade()]
self.sysinv_client.get_kube_versions.return_value = [
FakeKubeVersion(obj_id=1,
version=UPGRADED_KUBE_VERSION,
target=True,
state='active'),
]
self.worker.perform_state_action(self.strategy_step)
# The health query must have been issued exactly once.
self.sysinv_client.get_kube_upgrade_health.assert_called_once()
# Unlike the failing case, the step proceeds to the next orchestration state.
self.assert_step_updated(self.strategy_step.subcloud_id,
STRATEGY_STATE_KUBE_CREATING_VIM_KUBE_UPGRADE_STRATEGY)
def test_pre_check_subcloud_failed_health_check_with_non_management_alarms(self):
"""Test pre check step where subcloud has non-management affecting alarms"""