diff --git a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/clients/kubernetes_client.py b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/clients/kubernetes_client.py index 5e568874..924d4ef3 100644 --- a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/clients/kubernetes_client.py +++ b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/clients/kubernetes_client.py @@ -4,7 +4,10 @@ # SPDX-License-Identifier: Apache-2.0 # +from fm_api import constants as fm_constants +from fm_api import fm_api import kubernetes + from kubernetes import __version__ as K8S_MODULE_VERSION from kubernetes.client.models.v1_container_image import V1ContainerImage from kubernetes.client.rest import ApiException @@ -15,6 +18,8 @@ from nfv_common.helpers import Result K8S_MODULE_MAJOR_VERSION = int(K8S_MODULE_VERSION.split('.', maxsplit=1)[0]) +fmapi = fm_api.FaultAPIs() + DLOG = debug.debug_get_logger('nfv_plugins.nfvi_plugins.clients.kubernetes_client') @@ -77,13 +82,42 @@ def get_customobjects_api_instance(): return client.CustomObjectsApi() +def raise_alarm(node_name): + + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, + node_name) + fault = fm_api.Fault( + alarm_id=fm_constants.FM_ALARM_ID_USM_NODE_TAINTED, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, + entity_instance_id=entity_instance_id, + severity=fm_constants.FM_ALARM_SEVERITY_MAJOR, + reason_text=("Node tainted."), + alarm_type=fm_constants.FM_ALARM_TYPE_7, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_8, + proposed_repair_action=("Execute 'kubectl taint nodes %s services=disabled:NoExecute-'. " + "If it fails, Execute 'system host-lock %s' followed by 'system host-unlock %s'. " + "If issue still persists, contact next level of support." + % (node_name, node_name, node_name)), + service_affecting=True) + DLOG.info("Raising alarm %s on %s " % (fm_constants.FM_ALARM_ID_USM_NODE_TAINTED, node_name)) + fmapi.set_fault(fault) + + +def clear_alarm(node_name): + + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, + node_name) + DLOG.info("Clearing alarm %s on %s " % (fm_constants.FM_ALARM_ID_USM_NODE_TAINTED, node_name)) + fmapi.clear_fault(fm_constants.FM_ALARM_ID_USM_NODE_TAINTED, entity_instance_id) + + def taint_node(node_name, effect, key, value): """ Apply a taint to a node """ # Get the client. kube_client = get_client() - # Retrieve the node to access any existing taints. try: response = kube_client.read_node(node_name) @@ -127,6 +161,10 @@ def taint_node(node_name, effect, key, value): new_taint = {"key": key, "value": value, "effect": effect} body["spec"]["taints"].append(new_taint) response = kube_client.patch_node(node_name, body) + # Clear taint node alarm if tainting is successful. + # Alarm not cleared if taint is already present in the system + # or the node is under configuration. + clear_alarm(node_name) return Result(response) @@ -156,8 +194,27 @@ def untaint_node(node_name, effect, key): # Preserve any existing taints updated_taints = [taint for taint in taints if taint.key != key or taint.effect != effect] + DLOG.info("Updated taints %s" % (updated_taints)) body = {"spec": {"taints": updated_taints}} response = kube_client.patch_node(node_name, body) + check_taints = kube_client.read_node(node_name) + taints = check_taints.spec.taints + DLOG.info("Existing taint %s" % (taints)) + if taints is not None: + for taint in taints: + if (taint.key == key and taint.effect == effect): + DLOG.info("Removing %s:%s taint from node %s failed" % (key, + effect, node_name)) + raise_alarm(node_name) + break + else: + # Taint removed successfully. If there are multiple taints + # on the system, removing the 'services' taint will clear the alarm. + clear_alarm(node_name) + else: + # If there is only 'services' taint on the system , then removing the taint + # should clear the alarm. + clear_alarm(node_name) return Result(response) diff --git a/nfv/nfv-tests/nfv_unit_tests/tests/test_nfvi_infrastructure_api.py b/nfv/nfv-tests/nfv_unit_tests/tests/test_nfvi_infrastructure_api.py index f58d696c..41ca467a 100755 --- a/nfv/nfv-tests/nfv_unit_tests/tests/test_nfvi_infrastructure_api.py +++ b/nfv/nfv-tests/nfv_unit_tests/tests/test_nfvi_infrastructure_api.py @@ -3,16 +3,19 @@ # # SPDX-License-Identifier: Apache-2.0 # +import sys import uuid -from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import host_state -from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import NFVIInfrastructureAPI from nfv_vim.nfvi.objects.v1 import HOST_AVAIL_STATUS from nfv_vim.nfvi.objects.v1 import HOST_LABEL_KEYS from nfv_vim.nfvi.objects.v1 import HOST_LABEL_VALUES from nfv_vim.nfvi.objects.v1 import HOST_OPER_STATE from nfv_unit_tests.tests import testcase +from unittest import mock +sys.modules['fm_core'] = mock.Mock() +from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import host_state # noqa: H306,E402 pylint: disable=C0413 +from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import NFVIInfrastructureAPI # noqa: H306,E402 pylint: disable=C0413 # todo(abailey): use already existing constants CONTROLLER_PERSONALITY = 'controller' diff --git a/nfv/nfv-tests/nfv_unit_tests/tests/test_plugin_kubernetes_client.py b/nfv/nfv-tests/nfv_unit_tests/tests/test_plugin_kubernetes_client.py index a98f3409..8ba5615d 100755 --- a/nfv/nfv-tests/nfv_unit_tests/tests/test_plugin_kubernetes_client.py +++ b/nfv/nfv-tests/nfv_unit_tests/tests/test_plugin_kubernetes_client.py @@ -8,11 +8,12 @@ import kubernetes from kubernetes.client.rest import ApiException -from unittest import mock - -from nfv_plugins.nfvi_plugins.clients import kubernetes_client from nfv_unit_tests.tests import testcase +import sys +from unittest import mock +sys.modules['fm_core'] = mock.Mock() +from nfv_plugins.nfvi_plugins.clients import kubernetes_client # noqa: H306,E402 pylint: disable=C0413 def mock_load_kube_config(path): diff --git a/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_patch_strategy.py b/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_patch_strategy.py index 114b30a1..244b4f4c 100755 --- a/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_patch_strategy.py +++ b/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_patch_strategy.py @@ -2701,7 +2701,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase): '200.001', '700.004', '280.002', - '100.119'], + '100.119', + '900.701'], 'timeout': 1800} ] }, @@ -2723,7 +2724,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase): '200.001', '700.004', '280.002', - '100.119'], + '100.119', + '900.701'], 'timeout': 1800} ] } @@ -2827,7 +2829,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase): '200.001', '700.004', '280.002', - '100.119'], + '100.119', + '900.701'], 'timeout': 1800} ] }, @@ -2850,7 +2853,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase): '200.001', '700.004', '280.002', - '100.119'], + '100.119', + '900.701'], 'timeout': 1800} ] }, @@ -2873,7 +2877,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase): '200.001', '700.004', '280.002', - '100.119'], + '100.119', + '900.701'], 'timeout': 1800} ] }, @@ -2896,7 +2901,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase): '200.001', '700.004', '280.002', - '100.119'], + '100.119', + '900.701'], 'timeout': 1800} ] }, diff --git a/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_upgrade_strategy.py b/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_upgrade_strategy.py index 55e6a3d4..a05cbc8f 100755 --- a/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_upgrade_strategy.py +++ b/nfv/nfv-tests/nfv_unit_tests/tests/test_sw_upgrade_strategy.py @@ -1022,7 +1022,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-0']}, _unlock_hosts_stage_as_dict(['storage-0']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 7200} ] }, @@ -1036,7 +1036,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-1']}, _unlock_hosts_stage_as_dict(['storage-1']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 7200} ] }, @@ -1050,7 +1050,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-2']}, _unlock_hosts_stage_as_dict(['storage-2']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 7200} ] }, @@ -1064,7 +1064,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-3']}, _unlock_hosts_stage_as_dict(['storage-3']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 7200} ] }, @@ -1112,7 +1112,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, _unlock_hosts_stage_as_dict(['controller-0']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 14400} ] } @@ -1159,7 +1159,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-1']}, _unlock_hosts_stage_as_dict(['controller-1']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 14400} ] }, @@ -1175,7 +1175,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, _unlock_hosts_stage_as_dict(['controller-0']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 14400} ] } @@ -1246,7 +1246,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, _unlock_hosts_stage_as_dict(['controller-0']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 14400} ] }, @@ -1262,7 +1262,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-1']}, _unlock_hosts_stage_as_dict(['controller-1']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 14400} ] }, @@ -1363,7 +1363,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-1']}, _unlock_hosts_stage_as_dict(['controller-1']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 14400} ] }, @@ -1377,7 +1377,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, _unlock_hosts_stage_as_dict(['controller-0']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 14400} ] }, @@ -1391,7 +1391,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-0']}, _unlock_hosts_stage_as_dict(['storage-0']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 7200} ] }, @@ -1405,7 +1405,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['storage-1']}, _unlock_hosts_stage_as_dict(['storage-1']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 7200} ] }, @@ -1501,7 +1501,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-1']}, _unlock_hosts_stage_as_dict(['controller-1']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 14400} ] }, @@ -1517,7 +1517,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase): 'entity_names': ['controller-0']}, _unlock_hosts_stage_as_dict(['controller-0']), {'name': 'wait-data-sync', - 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'], + 'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'], 'timeout': 14400} ] }, diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py index 464a94d1..1c1b03d6 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py @@ -1391,6 +1391,7 @@ class SwPatchStrategy(SwUpdateStrategy, '700.004', # VM stopped '280.002', # Subcloud resource out-of-sync '100.119', # PTP alarm for SyncE + '900.701', # Node tainted ] self._ignore_alarms += IGNORE_ALARMS self._single_controller = single_controller @@ -1741,6 +1742,7 @@ class SwUpgradeStrategy(SwUpdateStrategy): '900.201', # Software upgrade auto apply in progress '750.006', # Configuration change requires reapply of cert-manager '100.119', # PTP alarm for SyncE + '900.701', # Node tainted ] self._ignore_alarms += IGNORE_ALARMS self._single_controller = single_controller @@ -2347,6 +2349,7 @@ class SystemConfigUpdateStrategy(SwUpdateStrategy, '750.006', # Configuration change requires reapply of an application '900.010', # System Config Update in progress '900.601', # System Config Update Auto Apply in progress + '900.701', # Node tainted ] self._ignore_alarms += IGNORE_ALARMS self._single_controller = single_controller @@ -2530,6 +2533,7 @@ class FwUpdateStrategy(SwUpdateStrategy): '900.301', # Fw Update Auto Apply in progress '200.001', # Locked Host '100.119', # PTP alarm for SyncE + '900.701', # Node tainted ] self._ignore_alarms += IGNORE_ALARMS @@ -2898,6 +2902,7 @@ class KubeRootcaUpdateStrategy(SwUpdateStrategy, '900.008', # Kubernetes rootca update in progress '900.009', # Kubernetes rootca update aborted '900.501', # Kubernetes rootca update auto-apply inprogress + '900.701', # Node tainted ] # self._ignore_alarms is declared in parent class self._ignore_alarms += IGNORE_ALARMS @@ -3299,6 +3304,7 @@ class KubeUpgradeStrategy(SwUpdateStrategy, '750.006', # Configuration change requires reapply of cert-manager '900.007', # Kube Upgrade in progress '900.401', # kube-upgrade-auto-apply-inprogress + '900.701', # Node tainted ] # self._ignore_alarms is declared in parent class self._ignore_alarms += IGNORE_ALARMS