From 021718688b014e905bb274643ee053bc40589880 Mon Sep 17 00:00:00 2001 From: rakshith mr Date: Mon, 29 Jan 2024 08:20:34 -0500 Subject: [PATCH] Adding alarm when kube-apiserver pod fails It is possible to put the system into a state where kubernetes does not work but no alarms are present. New alarm added to indicate k8s api server is down. Test Plan: PASS: Kube api server was interrupted/stopped by changing configuration files and alarm was raised. PASS: Alarm was cleared when configurations were reset and kube api server was restarted. Change-Id: I335179ea98ef63d7c35c89d82328a52ab2391f5c Signed-off-by: rakshith mr --- fm-api/source/fm_api/constants.py | 3 +++ fm-doc/fm_doc/events.yaml | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/fm-api/source/fm_api/constants.py b/fm-api/source/fm_api/constants.py index a890215e..08d08448 100755 --- a/fm-api/source/fm_api/constants.py +++ b/fm-api/source/fm_api/constants.py @@ -309,6 +309,9 @@ FM_ALARM_ID_KUBE_ROOTCA_UPDATE_IN_PROGRESS = ALARM_GROUP_SW_MGMT + ".008" # Kubernetes RootCA Update abort alarm id FM_ALARM_ID_KUBE_ROOTCA_UPDATE_ABORTED = ALARM_GROUP_SW_MGMT + ".009" +# Kubernetes down alarm id (raised when kube-apiserver is unavailable) +FM_ALARM_ID_KUBE_DOWN = ALARM_GROUP_K8S + ".002" + # The SYSTEM_CONFIG_UPDATE alarms are originated by vim strategy which is the # same as the other sw-mgmt alarms, put them in the same group # System Config Update alarm id diff --git a/fm-doc/fm_doc/events.yaml b/fm-doc/fm_doc/events.yaml index 47e39cde..e9741293 100755 --- a/fm-doc/fm_doc/events.yaml +++ b/fm-doc/fm_doc/events.yaml @@ -3431,6 +3431,25 @@ Degrade_Affecting_Severity: none Context: none +850.002: + Type: Alarm + Description: K8s nodes unreachable + Entity_Instance_ID: kubernetes=k8s-nodes + Severity: major + Proposed_Repair_Action: "Restart kubernetes service. + Consult the System Administration Manual + for more details. If problem persists + contact next level of support." 
+ Maintenance_Action: + Inhibit_Alarms: + Alarm_Type: communication + Probable_Cause: communication-subsystem-failure + Service_Affecting: False + Suppression: False + Management_Affecting_Severity: none + Degrade_Affecting_Severity: none + Context: none + #--------------------------------------------------------------------------- # SOFTWARE #---------------------------------------------------------------------------