Merge "Ignore certain alarms during kube upgrade pre-check"
This commit is contained in:
commit
4af71c2e29
|
@ -14,6 +14,12 @@ from dcmanager.common import utils
|
|||
from dcmanager.db import api as db_api
|
||||
from dcmanager.orchestrator.states.base import BaseState
|
||||
|
||||
# The following alarms can occur during a VIM-orchestrated k8s upgrade on the subcloud.
|
||||
# By ignoring these alarms, the subcloud k8s upgrade can be
|
||||
# retried after a failure using the DC orchestrator.
|
||||
ALARM_IGNORE_LIST = ['100.003', '200.001', '700.004', '750.006',
|
||||
'900.007', '900.401']
|
||||
|
||||
|
||||
class KubeUpgradePreCheckState(BaseState):
|
||||
"""Perform pre check operations to determine if kube upgrade is required"""
|
||||
|
@ -46,6 +52,22 @@ class KubeUpgradePreCheckState(BaseState):
|
|||
system_health)
|
||||
if not fails or (len(fails) == 1 and failed_alarm_check and no_mgmt_alarms):
|
||||
self.info_log(strategy_step, "Kubernetes upgrade health check passed.")
|
||||
elif (len(fails) == 1 and failed_alarm_check):
|
||||
alarms = self.get_fm_client(self.region_name).get_alarms()
|
||||
for alarm in alarms:
|
||||
if alarm.alarm_id not in ALARM_IGNORE_LIST:
|
||||
if alarm.mgmt_affecting == "True":
|
||||
error_desc_msg = ("Kubernetes upgrade health check failed due to alarm %s. "
|
||||
"Kubernetes upgrade health: \n %s" %
|
||||
(alarm.alarm_id, system_health))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=error_desc_msg)
|
||||
self.error_log(strategy_step, "\n" + system_health)
|
||||
raise Exception(("Kubernetes upgrade health check failed due to alarm %s. "
|
||||
"Please run 'system health-query-kube-upgrade' "
|
||||
"command on the subcloud or %s on central for details." %
|
||||
(alarm.alarm_id, ERROR_DESC_CMD)))
|
||||
else:
|
||||
error_desc_msg = ("Kubernetes upgrade health check failed. \n %s" %
|
||||
system_health)
|
||||
|
|
|
@ -14,6 +14,7 @@ from dcmanager.common.consts import STRATEGY_STATE_KUBE_UPGRADE_PRE_CHECK
|
|||
from dcmanager.db.sqlalchemy import api as db_api
|
||||
|
||||
from dcmanager.tests.unit.common import fake_strategy
|
||||
from dcmanager.tests.unit.orchestrator.states.fakes import FakeAlarm
|
||||
from dcmanager.tests.unit.orchestrator.states.fakes import FakeKubeUpgrade
|
||||
from dcmanager.tests.unit.orchestrator.states.fakes import FakeKubeVersion
|
||||
from dcmanager.tests.unit.orchestrator.states.fakes \
|
||||
|
@ -58,6 +59,10 @@ KUBERNETES_UPGRADE_HEALTH_RESPONSE_NON_MGMT_AFFECTING_ALARM = \
|
|||
"All kubernetes control plane pods are ready: [OK]\n" \
|
||||
"All kubernetes applications are in a valid state: [OK]"
|
||||
|
||||
MEMORY_THRESHOLD_ALARM = FakeAlarm('100.101', 'True')
|
||||
KUBERNETES_UPGRADE_ALARM = FakeAlarm('900.007', 'True')
|
||||
CEPH_ALARM = FakeAlarm('250.002', 'False')
|
||||
|
||||
|
||||
class TestKubeUpgradePreCheckStage(TestKubeUpgradeState):
|
||||
|
||||
|
@ -76,6 +81,7 @@ class TestKubeUpgradePreCheckStage(TestKubeUpgradeState):
|
|||
self.sysinv_client.get_kube_upgrades = mock.MagicMock()
|
||||
self.sysinv_client.get_kube_upgrades.return_value = []
|
||||
|
||||
self.fm_client.get_alarms = mock.MagicMock()
|
||||
self.sysinv_client.get_kube_upgrade_health = mock.MagicMock()
|
||||
self.sysinv_client.get_kube_upgrade_health.return_value = KUBERNETES_UPGRADE_HEALTH_RESPONSE_SUCCESS
|
||||
|
||||
|
@ -119,6 +125,7 @@ class TestKubeUpgradePreCheckStage(TestKubeUpgradeState):
|
|||
self.subcloud.id,
|
||||
deploy_status=DEPLOY_STATE_DONE)
|
||||
|
||||
self.fm_client.get_alarms.return_value = [MEMORY_THRESHOLD_ALARM, KUBERNETES_UPGRADE_ALARM]
|
||||
self.sysinv_client.get_kube_upgrade_health.return_value = KUBERNETES_UPGRADE_HEALTH_RESPONSE_MGMT_AFFECTING_ALARM
|
||||
self.sysinv_client.get_kube_upgrades.return_value = [FakeKubeUpgrade()]
|
||||
self.sysinv_client.get_kube_versions.return_value = [
|
||||
|
@ -129,10 +136,30 @@ class TestKubeUpgradePreCheckStage(TestKubeUpgradeState):
|
|||
]
|
||||
self.worker.perform_state_action(self.strategy_step)
|
||||
self.sysinv_client.get_kube_upgrade_health.assert_called_once()
|
||||
|
||||
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||
STRATEGY_STATE_FAILED)
|
||||
|
||||
def test_pre_check_subcloud_failed_health_check_with_allowed_management_alarms(self):
|
||||
"""Test pre check step where the only management affecting alarm is in the ignore list"""
|
||||
|
||||
db_api.subcloud_update(self.ctx,
|
||||
self.subcloud.id,
|
||||
deploy_status=DEPLOY_STATE_DONE)
|
||||
|
||||
self.fm_client.get_alarms.return_value = [CEPH_ALARM, KUBERNETES_UPGRADE_ALARM]
|
||||
self.sysinv_client.get_kube_upgrade_health.return_value = KUBERNETES_UPGRADE_HEALTH_RESPONSE_MGMT_AFFECTING_ALARM
|
||||
self.sysinv_client.get_kube_upgrades.return_value = [FakeKubeUpgrade()]
|
||||
self.sysinv_client.get_kube_versions.return_value = [
|
||||
FakeKubeVersion(obj_id=1,
|
||||
version=UPGRADED_KUBE_VERSION,
|
||||
target=True,
|
||||
state='active'),
|
||||
]
|
||||
self.worker.perform_state_action(self.strategy_step)
|
||||
self.sysinv_client.get_kube_upgrade_health.assert_called_once()
|
||||
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||
STRATEGY_STATE_KUBE_CREATING_VIM_KUBE_UPGRADE_STRATEGY)
|
||||
|
||||
def test_pre_check_subcloud_failed_health_check_with_non_management_alarms(self):
|
||||
"""Test pre check step where subcloud has non-management affecting alarms"""
|
||||
|
||||
|
|
Loading…
Reference in New Issue