Merge "Alarm 900.701 raised on failing to remove node taint."

This commit is contained in:
Zuul 2024-02-27 17:41:22 +00:00 committed by Gerrit Code Review
commit d619a37a6a
6 changed files with 100 additions and 27 deletions

View File

@ -4,7 +4,10 @@
# SPDX-License-Identifier: Apache-2.0
#
from fm_api import constants as fm_constants
from fm_api import fm_api
import kubernetes
from kubernetes import __version__ as K8S_MODULE_VERSION
from kubernetes.client.models.v1_container_image import V1ContainerImage
from kubernetes.client.rest import ApiException
@ -15,6 +18,8 @@ from nfv_common.helpers import Result
# Major version of the installed kubernetes python module, parsed from the
# text that precedes the first '.' of its version string.
K8S_MODULE_MAJOR_VERSION = int(K8S_MODULE_VERSION.split('.', maxsplit=1)[0])
# Module-level fault management API handle; raise_alarm() and clear_alarm()
# use it to set and clear faults.
fmapi = fm_api.FaultAPIs()
# Debug logger used throughout this module.
DLOG = debug.debug_get_logger('nfv_plugins.nfvi_plugins.clients.kubernetes_client')
@ -77,13 +82,42 @@ def get_customobjects_api_instance():
return client.CustomObjectsApi()
def raise_alarm(node_name):
    """
    Raise the node tainted alarm (FM_ALARM_ID_USM_NODE_TAINTED) on a host

    The fault is set with major severity, is flagged as service affecting,
    and is raised against the entity "<host entity type>=<node_name>".
    """
    tainted_alarm_id = fm_constants.FM_ALARM_ID_USM_NODE_TAINTED

    # Operator guidance shown with the alarm; the node name appears in each
    # of the three suggested commands.
    repair_action = (
        "Execute 'kubectl taint nodes %s services=disabled:NoExecute-'. "
        "If it fails, Execute 'system host-lock %s' followed by 'system host-unlock %s'. "
        "If issue still persists, contact next level of support."
        % ((node_name,) * 3))

    fault_fields = {
        'alarm_id': tainted_alarm_id,
        'alarm_state': fm_constants.FM_ALARM_STATE_SET,
        'entity_type_id': fm_constants.FM_ENTITY_TYPE_HOST,
        'entity_instance_id': "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
                                         node_name),
        'severity': fm_constants.FM_ALARM_SEVERITY_MAJOR,
        'reason_text': "Node tainted.",
        'alarm_type': fm_constants.FM_ALARM_TYPE_7,
        'probable_cause': fm_constants.ALARM_PROBABLE_CAUSE_8,
        'proposed_repair_action': repair_action,
        'service_affecting': True,
    }
    tainted_fault = fm_api.Fault(**fault_fields)

    DLOG.info("Raising alarm %s on %s " % (tainted_alarm_id, node_name))
    fmapi.set_fault(tainted_fault)
def clear_alarm(node_name):
    """
    Clear the node tainted alarm (FM_ALARM_ID_USM_NODE_TAINTED) on a host

    The alarm is looked up by the same "<host entity type>=<node_name>"
    entity instance id that raise_alarm() uses when setting it.
    """
    host_entity = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, node_name)
    tainted_alarm_id = fm_constants.FM_ALARM_ID_USM_NODE_TAINTED

    DLOG.info("Clearing alarm %s on %s " % (tainted_alarm_id, node_name))
    fmapi.clear_fault(tainted_alarm_id, host_entity)
def taint_node(node_name, effect, key, value):
"""
Apply a taint to a node
"""
# Get the client.
kube_client = get_client()
# Retrieve the node to access any existing taints.
try:
response = kube_client.read_node(node_name)
@ -127,6 +161,10 @@ def taint_node(node_name, effect, key, value):
new_taint = {"key": key, "value": value, "effect": effect}
body["spec"]["taints"].append(new_taint)
response = kube_client.patch_node(node_name, body)
# Clear taint node alarm if tainting is successful.
# Alarm not cleared if taint is already present in the system
# or the node is under configuration.
clear_alarm(node_name)
return Result(response)
@ -156,8 +194,27 @@ def untaint_node(node_name, effect, key):
# Preserve any existing taints
updated_taints = [taint for taint in taints if taint.key != key or
taint.effect != effect]
DLOG.info("Updated taints %s" % (updated_taints))
body = {"spec": {"taints": updated_taints}}
response = kube_client.patch_node(node_name, body)
check_taints = kube_client.read_node(node_name)
taints = check_taints.spec.taints
DLOG.info("Existing taint %s" % (taints))
if taints is not None:
for taint in taints:
if (taint.key == key and taint.effect == effect):
DLOG.info("Removing %s:%s taint from node %s failed" % (key,
effect, node_name))
raise_alarm(node_name)
break
else:
# Taint removed successfully. If there are multiple taints
# on the system, removing the 'services' taint will clear the alarm.
clear_alarm(node_name)
else:
# If there is only the 'services' taint on the system, then removing the taint
# should clear the alarm.
clear_alarm(node_name)
return Result(response)

View File

@ -3,16 +3,19 @@
#
# SPDX-License-Identifier: Apache-2.0
#
import sys
import uuid
from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import host_state
from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import NFVIInfrastructureAPI
from nfv_vim.nfvi.objects.v1 import HOST_AVAIL_STATUS
from nfv_vim.nfvi.objects.v1 import HOST_LABEL_KEYS
from nfv_vim.nfvi.objects.v1 import HOST_LABEL_VALUES
from nfv_vim.nfvi.objects.v1 import HOST_OPER_STATE
from nfv_unit_tests.tests import testcase
from unittest import mock
sys.modules['fm_core'] = mock.Mock()
from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import host_state # noqa: H306,E402 pylint: disable=C0413
from nfv_plugins.nfvi_plugins.nfvi_infrastructure_api import NFVIInfrastructureAPI # noqa: H306,E402 pylint: disable=C0413
# todo(abailey): use already existing constants
CONTROLLER_PERSONALITY = 'controller'

View File

@ -8,11 +8,12 @@
import kubernetes
from kubernetes.client.rest import ApiException
from unittest import mock
from nfv_plugins.nfvi_plugins.clients import kubernetes_client
from nfv_unit_tests.tests import testcase
import sys
from unittest import mock
sys.modules['fm_core'] = mock.Mock()
from nfv_plugins.nfvi_plugins.clients import kubernetes_client # noqa: H306,E402 pylint: disable=C0413
def mock_load_kube_config(path):

View File

@ -2701,7 +2701,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'200.001',
'700.004',
'280.002',
'100.119'],
'100.119',
'900.701'],
'timeout': 1800}
]
},
@ -2723,7 +2724,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'200.001',
'700.004',
'280.002',
'100.119'],
'100.119',
'900.701'],
'timeout': 1800}
]
}
@ -2827,7 +2829,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'200.001',
'700.004',
'280.002',
'100.119'],
'100.119',
'900.701'],
'timeout': 1800}
]
},
@ -2850,7 +2853,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'200.001',
'700.004',
'280.002',
'100.119'],
'100.119',
'900.701'],
'timeout': 1800}
]
},
@ -2873,7 +2877,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'200.001',
'700.004',
'280.002',
'100.119'],
'100.119',
'900.701'],
'timeout': 1800}
]
},
@ -2896,7 +2901,8 @@ class TestSwPatchStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'200.001',
'700.004',
'280.002',
'100.119'],
'100.119',
'900.701'],
'timeout': 1800}
]
},

View File

@ -1022,7 +1022,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['storage-0']},
_unlock_hosts_stage_as_dict(['storage-0']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 7200}
]
},
@ -1036,7 +1036,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['storage-1']},
_unlock_hosts_stage_as_dict(['storage-1']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 7200}
]
},
@ -1050,7 +1050,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['storage-2']},
_unlock_hosts_stage_as_dict(['storage-2']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 7200}
]
},
@ -1064,7 +1064,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['storage-3']},
_unlock_hosts_stage_as_dict(['storage-3']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 7200}
]
},
@ -1112,7 +1112,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['controller-0']},
_unlock_hosts_stage_as_dict(['controller-0']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 14400}
]
}
@ -1159,7 +1159,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['controller-1']},
_unlock_hosts_stage_as_dict(['controller-1']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 14400}
]
},
@ -1175,7 +1175,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['controller-0']},
_unlock_hosts_stage_as_dict(['controller-0']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 14400}
]
}
@ -1246,7 +1246,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['controller-0']},
_unlock_hosts_stage_as_dict(['controller-0']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 14400}
]
},
@ -1262,7 +1262,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['controller-1']},
_unlock_hosts_stage_as_dict(['controller-1']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 14400}
]
},
@ -1363,7 +1363,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['controller-1']},
_unlock_hosts_stage_as_dict(['controller-1']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 14400}
]
},
@ -1377,7 +1377,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['controller-0']},
_unlock_hosts_stage_as_dict(['controller-0']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 14400}
]
},
@ -1391,7 +1391,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['storage-0']},
_unlock_hosts_stage_as_dict(['storage-0']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 7200}
]
},
@ -1405,7 +1405,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['storage-1']},
_unlock_hosts_stage_as_dict(['storage-1']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 7200}
]
},
@ -1501,7 +1501,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['controller-1']},
_unlock_hosts_stage_as_dict(['controller-1']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 14400}
]
},
@ -1517,7 +1517,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'entity_names': ['controller-0']},
_unlock_hosts_stage_as_dict(['controller-0']),
{'name': 'wait-data-sync',
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119'],
'ignore_alarms': ['900.005', '900.201', '750.006', '100.119', '900.701'],
'timeout': 14400}
]
},

View File

@ -1391,6 +1391,7 @@ class SwPatchStrategy(SwUpdateStrategy,
'700.004', # VM stopped
'280.002', # Subcloud resource out-of-sync
'100.119', # PTP alarm for SyncE
'900.701', # Node tainted
]
self._ignore_alarms += IGNORE_ALARMS
self._single_controller = single_controller
@ -1741,6 +1742,7 @@ class SwUpgradeStrategy(SwUpdateStrategy):
'900.201', # Software upgrade auto apply in progress
'750.006', # Configuration change requires reapply of cert-manager
'100.119', # PTP alarm for SyncE
'900.701', # Node tainted
]
self._ignore_alarms += IGNORE_ALARMS
self._single_controller = single_controller
@ -2347,6 +2349,7 @@ class SystemConfigUpdateStrategy(SwUpdateStrategy,
'750.006', # Configuration change requires reapply of an application
'900.010', # System Config Update in progress
'900.601', # System Config Update Auto Apply in progress
'900.701', # Node tainted
]
self._ignore_alarms += IGNORE_ALARMS
self._single_controller = single_controller
@ -2530,6 +2533,7 @@ class FwUpdateStrategy(SwUpdateStrategy):
'900.301', # Fw Update Auto Apply in progress
'200.001', # Locked Host
'100.119', # PTP alarm for SyncE
'900.701', # Node tainted
]
self._ignore_alarms += IGNORE_ALARMS
@ -2898,6 +2902,7 @@ class KubeRootcaUpdateStrategy(SwUpdateStrategy,
'900.008', # Kubernetes rootca update in progress
'900.009', # Kubernetes rootca update aborted
'900.501', # Kubernetes rootca update auto-apply inprogress
'900.701', # Node tainted
]
# self._ignore_alarms is declared in parent class
self._ignore_alarms += IGNORE_ALARMS
@ -3299,6 +3304,7 @@ class KubeUpgradeStrategy(SwUpdateStrategy,
'750.006', # Configuration change requires reapply of cert-manager
'900.007', # Kube Upgrade in progress
'900.401', # kube-upgrade-auto-apply-inprogress
'900.701', # Node tainted
]
# self._ignore_alarms is declared in parent class
self._ignore_alarms += IGNORE_ALARMS