From da398e0c5f12a797a5a4039b99743d5648710527 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Thu, 20 Oct 2022 15:45:49 +0000 Subject: [PATCH] Debian: Make Mtce offline handler more resilient to slow shutdowns The current offline handler assumes the node is offline after 'offline_search_count' reaches 'offline_threshold' count regardless of whether mtcAlive messages were received during the search window. The offline algorithm requires that no mtcAlive messages be seen for the full offline_threshold count. During a slow shutdown the mtcClient runs for longer than it should and as a result can lead to maintenance seeing the node as recovered before it should. This update manages the offline search counter to ensure that it only reached the count threshold after seeing no mtcAlive messages for the full search count. Any mtcAlive message seen during the count triggers a count reset. This update also 1. Adjusts the reset retry cadence from 7 to 12 secs to prevent unnecessary reboot thrash during the current shutdown. 2. Clears the hbsClient ready event at the start of the subfunction handler so the heartbeat soak is only started after seeing heartbeat client ready events that follow the main config. Test Plan: PASS: Debian and CentOS Build and DX install PASS: Verify search count management PASS: Verify issue does not occur over lock/unlock soak (100+) - where the same test without update did show issue. PASS: Monitor alive logs for behavioral correctness PASS: Verify recovery reset occurs after expected extended time. Closes-Bug: 1993656 Signed-off-by: Eric MacDonald Change-Id: If10bb75a1fb01d0ecd3f88524d74c232658ca29e --- mtce-common/src/common/alarmUtil.cpp | 20 ++++---- mtce/src/common/nodeClass.cpp | 4 +- mtce/src/maintenance/mtcCmdHdlr.cpp | 6 +-- mtce/src/maintenance/mtcNodeHdlrs.cpp | 66 ++++++++++++++++++++------- mtce/src/maintenance/mtcSubfHdlrs.cpp | 1 + 5 files changed, 66 insertions(+), 31 deletions(-) diff --git a/mtce-common/src/common/alarmUtil.cpp b/mtce-common/src/common/alarmUtil.cpp index 09b4eca1..1ecc6002 100644 --- a/mtce-common/src/common/alarmUtil.cpp +++ b/mtce-common/src/common/alarmUtil.cpp @@ -151,7 +151,7 @@ EFmAlarmSeverityT alarmUtil_query ( string & hostname, ENTITY_PREFIX, hostname.data(), instance.data()); } - alog ("entity_instance:%s\n", alarm_filter.entity_instance_id ); + alog1 ("entity_instance:%s\n", alarm_filter.entity_instance_id ); if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK ) { dlog ("Found with Severity: %d\n", alarm_query.severity ); @@ -183,7 +183,7 @@ int alarmUtil_query_identity ( string identity, SFmAlarmDataT * alarm_list_ptr, memset(&alarm_filter, 0, sizeof(alarm_filter)); snprintf ( alarm_filter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", identity.data()); rc = fm_get_faults_by_id ( &alarm_filter.alarm_id, alarm_list_ptr, &max_alarms ); - alog ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc ); + alog1 ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc ); if ( rc == FM_ERR_OK ) { return (PASS); @@ -242,7 +242,7 @@ int alarmUtil ( string & hostname, if (( alarm.alarm_state != FM_ALARM_STATE_MSG ) && ( curr_sev == alarm.severity )) { - alog ("%s %s %s already at desired (%s) severity level\n", + alog1 ("%s %s %s already at desired (%s) severity level\n", hostname.c_str(), identity.c_str(), instance.c_str(), @@ -259,18 +259,18 @@ int alarmUtil ( string & hostname, { if ( alarm.alarm_state == FM_ALARM_STATE_SET ) { - alog ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id ); + alog1 ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id ); } else { - alog ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id ); + alog1 ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id ); } /* Debug Logs */ - alog ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text ); - alog ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action ); - alog ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id ); - alog ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n", + alog1 ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text ); + alog1 ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action ); + alog1 ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id ); + alog1 ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n", hostname.c_str(), alarm.alarm_state, alarm.severity, @@ -322,7 +322,7 @@ int alarmUtil ( string & hostname, // send_log_message ( mtclogd_ptr, hostname.data(), &__alarmObject.varlog_filename[0], // &__alarmObject.temp_str[0] ); - alog ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id); + alog1 ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id); nodeUtil_latency_log ( hostname, NODEUTIL_LATENCY_MON_START , 0 ); if ( ( rc = fm_clear_fault_async ( &filter )) != FM_ERR_OK ) { diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index cdccb360..657eba20 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -3902,7 +3902,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in { if ( node_ptr->mtcAlive_clstr == false ) { - alog2 ("%s %s mtcAlive received", + alog ("%s %s mtcAlive received", node_ptr->hostname.c_str(), get_iface_name_str(interface)); node_ptr->mtcAlive_clstr = true ; @@ -3912,7 +3912,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in { if ( node_ptr->mtcAlive_mgmnt == false ) { - alog2 ("%s %s mtcAlive received", + alog ("%s %s mtcAlive received", node_ptr->hostname.c_str(), get_iface_name_str(interface)); node_ptr->mtcAlive_mgmnt = true ; diff --git a/mtce/src/maintenance/mtcCmdHdlr.cpp b/mtce/src/maintenance/mtcCmdHdlr.cpp index d89580bc..c841c28c 100644 --- a/mtce/src/maintenance/mtcCmdHdlr.cpp +++ b/mtce/src/maintenance/mtcCmdHdlr.cpp @@ -454,7 +454,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) } else { - int delay = (((offline_period*offline_threshold)/1000)+3); + int delay = (((offline_period*offline_threshold)/1000)*3); ilog ("%s searching for offline ; next reboot attempt in %d seconds\n", node_ptr->hostname.c_str(), delay); @@ -507,7 +507,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) wlog ("%s Board Management Interface not accessible\n", node_ptr->hostname.c_str()); } } - int delay = (((offline_period*offline_threshold)/1000)+3); + int delay = (((offline_period*offline_threshold)/1000)*3); mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay ); node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ; break ; @@ -516,7 +516,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) { if ( node_ptr->mtcCmd_timer.ring == true ) { - int delay = (((offline_period*offline_threshold)/1000)+3); + int delay = (((offline_period*offline_threshold)/1000)*3); /* bmc power control reset by bmc */ rc = bmc_command_recv ( node_ptr ); diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index dfe9172d..bef43b4c 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -3221,6 +3221,7 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_OFFLINE__START: { + node_ptr->mtcAlive_count = 0 ; node_ptr->mtcAlive_mgmnt = false ; node_ptr->mtcAlive_clstr = false ; node_ptr->offline_log_throttle = 0 ; @@ -3245,6 +3246,27 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) availStatus_enum_to_str(node_ptr->availStatus).c_str()); this->ctl_mtcAlive_gate ( node_ptr, false ) ; + + /** + * Handle the race condition case where the + * mtcAlive was received after the last check + * while in MTC_OFFLINE__WAIT below and here when + * the node_ptr->mtcAlive_ state variables + * are cleared. Need to also clear the + * offline_search_count here as well. + **/ + if (( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr ) && node_ptr->offline_search_count ) + { + node_ptr->mtcAlive_online = true ; + ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_count, + node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', + node_ptr->mtcAlive_clstr ? 'Y' : 'n', + node_ptr->offline_search_count, + offline_threshold ); + node_ptr->offline_search_count = 0 ; /* reset the count */ + } node_ptr->mtcAlive_mgmnt = false ; node_ptr->mtcAlive_clstr = false ; @@ -3299,24 +3321,36 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) offline_threshold ); } } - else + else if ( node_ptr->offline_search_count ) { + /** + * This algorithm was assuming the node is offline after + * offline_search_count reached offline_threshold count. + * + * Note: The mtcClient sends periodic mtcAlive messages + * until it is shutdown. + * This algorithm also explicitely 'requests' the message. + * The algorithm depends on not receving the message, even + * when requested for offline_threshold counts 'in a row'. + * + * When shutdown is slowed or delayed, a late mtcAlive + * can trick this FSM into seeing the node as recovered + * when in fact its still shuttingdown. + * + * To maintain the intended absence of mtcAlive messages + * count 'in a row', this check resets the search count + * if a mtcAlive is seen during the search. + **/ + node_ptr->mtcAlive_online = true ; - if ( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr ) - { - ilog_throttled ( node_ptr->offline_log_throttle, 10, - "%s still seeing mtcAlive (%c:%c)\n", - node_ptr->hostname.c_str(), - node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', - node_ptr->mtcAlive_clstr ? 'Y' : 'n'); - } - else - { - alog ("%s still seeing mtcAlive (%c:%c)\n", - node_ptr->hostname.c_str(), - node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', - node_ptr->mtcAlive_clstr ? 'Y' : 'n'); - } + ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_count, + node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', + node_ptr->mtcAlive_clstr ? 'Y' : 'n', + node_ptr->offline_search_count, + offline_threshold ); + node_ptr->offline_search_count = 0 ; /* reset the search count */ } if ( node_ptr->offlineStage == MTC_OFFLINE__IDLE ) diff --git a/mtce/src/maintenance/mtcSubfHdlrs.cpp b/mtce/src/maintenance/mtcSubfHdlrs.cpp index 5c994f4a..a488e922 100644 --- a/mtce/src/maintenance/mtcSubfHdlrs.cpp +++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp @@ -84,6 +84,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->unknown_health_reported = false ; node_ptr->goEnabled_failed_subf = false ; + node_ptr->hbsClient_ready = false ; /* load worker subfunciton alarm state */ EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,