From d9064e482269f51bffcfdda978df8673e6c12ffa Mon Sep 17 00:00:00 2001 From: Marcelo de Castro Loebens Date: Mon, 16 Jan 2023 14:47:57 -0400 Subject: [PATCH] Remove stale alarms in k8s apps upgrade script Added a step to remove "Application Update in Progress" alarms (750.005) that sometimes remain after updating k8s apps as part of a platform upgrade. The alarms remain due to an IPC error (root cause unclear). The issue is intermittent and very rarely seen. Test Plan: PASS: In a simplex upgrade activation stage (22.06 to 22.12), observe the logs in /var/log/sysinv.log. During the k8s apps upgrade script, when the app apply starts, as soon as you see a log with "Applying app <app name>. Overall completion ..", restart the fm-mgr service. Wait for the app to finish applying. Even if there was a failure while connecting to the FM manager, when activation completes, there shouldn't be any 750.005 alarms. Closes-Bug: 2003228. Signed-off-by: Marcelo de Castro Loebens Change-Id: I74b2a1f29302842452136eb3d3cb150f4c31ac0b --- .../controllerconfig/upgrade-scripts/65-k8s-app-upgrade.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/controllerconfig/controllerconfig/upgrade-scripts/65-k8s-app-upgrade.sh b/controllerconfig/controllerconfig/upgrade-scripts/65-k8s-app-upgrade.sh index 1403c653d8..2ed496b4ba 100644 --- a/controllerconfig/controllerconfig/upgrade-scripts/65-k8s-app-upgrade.sh +++ b/controllerconfig/controllerconfig/upgrade-scripts/65-k8s-app-upgrade.sh @@ -180,6 +180,11 @@ if [ "$ACTION" == "activate" ]; then if [ "${UPDATING_APP_NAME}" == "${UPGRADE_APP_NAME}" ] && \ [ "${UPDATING_APP_VERSION}" == "${UPGRADE_APP_VERSION}" ] && \ [ "${UPDATING_APP_STATUS}" == "applied" ]; then + ALARMS=$(fm alarm-list --nowrap --uuid --query "alarm_id=750.005;entity_type_id=k8s_application;entity_instance_id=${UPGRADE_APP_NAME}" | head -n-1 | tail -n+4 | awk '{print $2}') + for alarm in ${ALARMS}; do + log "$NAME: [Warning] A stale 750.005 Application Update In Progress alarm was found for 
${UPGRADE_APP_NAME}. Clearing it (UUID: ${alarm})." + fm alarm-delete $alarm + done log "$NAME: ${UPGRADE_APP_NAME} has been updated to version ${UPGRADE_APP_VERSION} from version ${EXISTING_APP_VERSION}" break fi