From baead557fb75ac71bc37341ef040a61ecc9fe555 Mon Sep 17 00:00:00 2001 From: Agustin Carranza Date: Wed, 13 Jul 2022 16:40:08 -0300 Subject: [PATCH] Update events.yaml for specific alarms The following alarms need to be clearer for users. Some information was gathered in order to improve the 'description' and 'Proposed repair action' fields. Alarm that has been deprecated: 270.001 The list of alarms that were modified is the following: 500.210 500.200 750.002 750.006 800.003 800.102 900.002 900.003 900.004 900.009 900.103 900.203 900.303 900.503 Test plan: There is no need to test the alarms affected by the changes. Story: 2010143 Task: 45785 Signed-off-by: Agustin Carranza Change-Id: I57b86548a36da66119cb04779ce1f9147254316c --- fm-doc/fm_doc/events.yaml | 40 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/fm-doc/fm_doc/events.yaml b/fm-doc/fm_doc/events.yaml index 58c4bb89..507334ea 100755 --- a/fm-doc/fm_doc/events.yaml +++ b/fm-doc/fm_doc/events.yaml @@ -895,21 +895,6 @@ #--------------------------------------------------------------------------- # VM Compute Services #--------------------------------------------------------------------------- -270.001: - Type: Alarm - Description: "Host compute services failure[, reason = ]" - Entity_Instance_ID: host=.services=compute - Severity: critical - Proposed_Repair_Action: Wait for host services recovery to complete; if problem persists contact next level of support - Maintenance_Action: - Inhibit_Alarms: - Alarm_Type: processing-error - Probable_Cause: unspecified-reason - Service_Affecting: True - Suppression: True - Management_Affecting_Severity: warning - Degrade_Affecting_Severity: none - 270.101: Type: Log Description: "Host compute services failure[, reason = ]" @@ -1400,7 +1385,7 @@ OR system.certificate.k8sRootCA Severity: major - Proposed_Repair_Action: Renew certificate for the entity identified + Proposed_Repair_Action: Check certificate expiration 
time. Renew certificate for the entity identified. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation @@ -1427,7 +1412,7 @@ OR system.certificate.k8sRootCA Severity: critical - Proposed_Repair_Action: Renew certificate for the entity identified + Proposed_Repair_Action: Check certificate expiration time. Renew certificate for the entity identified. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation @@ -2788,7 +2773,8 @@ Description: "Application Apply Failure" Entity_Instance_ID: k8s_application= Severity: major - Proposed_Repair_Action: "Retry applying the application. If the issue persists, please check system inventory log for cause." + Proposed_Repair_Action: "Retry applying the application. Check that the application is managed by the system application framework. + If the issue persists, please check system inventory log for cause." Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error @@ -3083,10 +3069,10 @@ 900.002: Type: Alarm - Description: Patch host install failure. + Description: Patch host install failure. Command "sw-patch host-install" failed. Entity_Instance_ID: host= Severity: major - Proposed_Repair_Action: Undo patching operation. + Proposed_Repair_Action: Undo patching operation. Check patch logs on the target host (i.e. /var/log/patching.log). Maintenance_Action: Inhibit_Alarms: Alarm_Type: environmental @@ -3098,7 +3084,7 @@ 900.003: Type: Alarm - Description: Obsolete patch in system. + Description: A patch with state 'obsolete' in its metadata has been uploaded. Entity_Instance_ID: host=controller Severity: warning Proposed_Repair_Action: Remove and delete obsolete patches. @@ -3113,7 +3099,7 @@ 900.004: Type: Alarm - Description: Host version mismatch. + Description: The upgrade and running software versions do not match. Command host-upgrade failed. Entity_Instance_ID: host= Severity: major Proposed_Repair_Action: Reinstall host to update applied load. 
@@ -3188,7 +3174,7 @@ 900.009: Type: Alarm - Description: Kubernetes root CA update aborted, certificates may not be fully updated. + Description: Kubernetes root CA update aborted, certificates may not be fully updated. Command "system kube-rootca-update-abort" has been run. Entity_Instance_ID: host=controller Severity: minor Proposed_Repair_Action: Fully update certificates by a new root CA update. @@ -3233,7 +3219,7 @@ 900.103: Type: Alarm - Description: Software patch auto-apply failed + Description: Software patch auto-apply failed. Command "sw-manager patch-strategy apply" failed. Entity_Instance_ID: orchestration=sw-patch Severity: critical Proposed_Repair_Action: Attempt to apply software patches manually; if problem persists contact next level of support @@ -3377,7 +3363,7 @@ 900.203: Type: Alarm - Description: Software upgrade auto-apply failed + Description: Software upgrade auto-apply failed. Command "sw-manager upgrade-strategy apply" failed. Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Proposed_Repair_Action: Attempt to apply software upgrade manually; if problem persists contact next level of support @@ -3521,7 +3507,7 @@ 900.303: Type: Alarm - Description: Firmware Update auto-apply failed + Description: Firmware Update auto-apply failed. Command "sw-manager fw-update-strategy apply" failed. Entity_Instance_ID: orchestration=fw-update Severity: critical Proposed_Repair_Action: Attempt to apply firmware update manually; if problem persists contact next level of support @@ -3809,7 +3795,7 @@ 900.503: Type: Alarm - Description: Kubernetes rootca update auto-apply failed + Description: Kubernetes rootca update auto-apply failed. Command "sw-manager kube-rootca-update-strategy apply" failed. Entity_Instance_ID: orchestration=kube-rootca-update Severity: critical Proposed_Repair_Action: Attempt to apply kubernetes rootca update manually; if problem persists contact next level of support