diff --git a/mtce-common/src/common/alarmUtil.cpp b/mtce-common/src/common/alarmUtil.cpp index 09b4eca1..1ecc6002 100644 --- a/mtce-common/src/common/alarmUtil.cpp +++ b/mtce-common/src/common/alarmUtil.cpp @@ -151,7 +151,7 @@ EFmAlarmSeverityT alarmUtil_query ( string & hostname, ENTITY_PREFIX, hostname.data(), instance.data()); } - alog ("entity_instance:%s\n", alarm_filter.entity_instance_id ); + alog1 ("entity_instance:%s\n", alarm_filter.entity_instance_id ); if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK ) { dlog ("Found with Severity: %d\n", alarm_query.severity ); @@ -183,7 +183,7 @@ int alarmUtil_query_identity ( string identity, SFmAlarmDataT * alarm_list_ptr, memset(&alarm_filter, 0, sizeof(alarm_filter)); snprintf ( alarm_filter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", identity.data()); rc = fm_get_faults_by_id ( &alarm_filter.alarm_id, alarm_list_ptr, &max_alarms ); - alog ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc ); + alog1 ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc ); if ( rc == FM_ERR_OK ) { return (PASS); @@ -242,7 +242,7 @@ int alarmUtil ( string & hostname, if (( alarm.alarm_state != FM_ALARM_STATE_MSG ) && ( curr_sev == alarm.severity )) { - alog ("%s %s %s already at desired (%s) severity level\n", + alog1 ("%s %s %s already at desired (%s) severity level\n", hostname.c_str(), identity.c_str(), instance.c_str(), @@ -259,18 +259,18 @@ int alarmUtil ( string & hostname, { if ( alarm.alarm_state == FM_ALARM_STATE_SET ) { - alog ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id ); + alog1 ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id ); } else { - alog ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id ); + alog1 ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id ); } /* Debug Logs */ - alog ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text ); - alog ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action ); - alog ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id ); - alog ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n", + alog1 ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text ); + alog1 ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action ); + alog1 ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id ); + alog1 ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n", hostname.c_str(), alarm.alarm_state, alarm.severity, @@ -322,7 +322,7 @@ int alarmUtil ( string & hostname, // send_log_message ( mtclogd_ptr, hostname.data(), &__alarmObject.varlog_filename[0], // &__alarmObject.temp_str[0] ); - alog ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id); + alog1 ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id); nodeUtil_latency_log ( hostname, NODEUTIL_LATENCY_MON_START , 0 ); if ( ( rc = fm_clear_fault_async ( &filter )) != FM_ERR_OK ) { diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index cdccb360..657eba20 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -3902,7 +3902,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in { if ( node_ptr->mtcAlive_clstr == false ) { - alog2 ("%s %s mtcAlive received", + alog ("%s %s mtcAlive received", node_ptr->hostname.c_str(), get_iface_name_str(interface)); node_ptr->mtcAlive_clstr = true ; @@ -3912,7 +3912,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in { if ( node_ptr->mtcAlive_mgmnt == false ) { - alog2 ("%s %s mtcAlive received", + alog ("%s %s mtcAlive received", node_ptr->hostname.c_str(), get_iface_name_str(interface)); node_ptr->mtcAlive_mgmnt = true ; diff --git a/mtce/src/maintenance/mtcCmdHdlr.cpp b/mtce/src/maintenance/mtcCmdHdlr.cpp index d89580bc..c841c28c 100644 --- a/mtce/src/maintenance/mtcCmdHdlr.cpp +++ b/mtce/src/maintenance/mtcCmdHdlr.cpp @@ -454,7 +454,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) } else { - int delay = (((offline_period*offline_threshold)/1000)+3); + int delay = (((offline_period*offline_threshold)/1000)*3); ilog ("%s searching for offline ; next reboot attempt in %d seconds\n", node_ptr->hostname.c_str(), delay); @@ -507,7 +507,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) wlog ("%s Board Management Interface not accessible\n", node_ptr->hostname.c_str()); } } - int delay = (((offline_period*offline_threshold)/1000)+3); + int delay = (((offline_period*offline_threshold)/1000)*3); mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay ); node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ; break ; @@ -516,7 +516,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) { if ( node_ptr->mtcCmd_timer.ring == true ) { - int delay = (((offline_period*offline_threshold)/1000)+3); + int delay = (((offline_period*offline_threshold)/1000)*3); /* bmc power control reset by bmc */ rc = bmc_command_recv ( node_ptr ); diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index dfe9172d..bef43b4c 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -3221,6 +3221,7 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_OFFLINE__START: { + node_ptr->mtcAlive_count = 0 ; node_ptr->mtcAlive_mgmnt = false ; node_ptr->mtcAlive_clstr = false ; node_ptr->offline_log_throttle = 0 ; @@ -3245,6 +3246,27 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) availStatus_enum_to_str(node_ptr->availStatus).c_str()); this->ctl_mtcAlive_gate ( node_ptr, false ) ; + + /** + * Handle the race condition case where the + * mtcAlive was received after the last check + * while in MTC_OFFLINE__WAIT below and here when + * the node_ptr->mtcAlive_ state variables + * are cleared. Need to also clear the + * offline_search_count here as well. + **/ + if (( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr ) && node_ptr->offline_search_count ) + { + node_ptr->mtcAlive_online = true ; + ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_count, + node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', + node_ptr->mtcAlive_clstr ? 'Y' : 'n', + node_ptr->offline_search_count, + offline_threshold ); + node_ptr->offline_search_count = 0 ; /* reset the count */ + } node_ptr->mtcAlive_mgmnt = false ; node_ptr->mtcAlive_clstr = false ; @@ -3299,24 +3321,36 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) offline_threshold ); } } - else + else if ( node_ptr->offline_search_count ) { + /** + * This algorithm was assuming the node is offline after + * offline_search_count reached offline_threshold count. + * + * Note: The mtcClient sends periodic mtcAlive messages + * until it is shutdown. + * This algorithm also explicitely 'requests' the message. + * The algorithm depends on not receving the message, even + * when requested for offline_threshold counts 'in a row'. + * + * When shutdown is slowed or delayed, a late mtcAlive + * can trick this FSM into seeing the node as recovered + * when in fact its still shuttingdown. + * + * To maintain the intended absence of mtcAlive messages + * count 'in a row', this check resets the search count + * if a mtcAlive is seen during the search. + **/ + node_ptr->mtcAlive_online = true ; - if ( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr ) - { - ilog_throttled ( node_ptr->offline_log_throttle, 10, - "%s still seeing mtcAlive (%c:%c)\n", - node_ptr->hostname.c_str(), - node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', - node_ptr->mtcAlive_clstr ? 'Y' : 'n'); - } - else - { - alog ("%s still seeing mtcAlive (%c:%c)\n", - node_ptr->hostname.c_str(), - node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', - node_ptr->mtcAlive_clstr ? 'Y' : 'n'); - } + ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_count, + node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', + node_ptr->mtcAlive_clstr ? 'Y' : 'n', + node_ptr->offline_search_count, + offline_threshold ); + node_ptr->offline_search_count = 0 ; /* reset the search count */ } if ( node_ptr->offlineStage == MTC_OFFLINE__IDLE ) diff --git a/mtce/src/maintenance/mtcSubfHdlrs.cpp b/mtce/src/maintenance/mtcSubfHdlrs.cpp index 5c994f4a..a488e922 100644 --- a/mtce/src/maintenance/mtcSubfHdlrs.cpp +++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp @@ -84,6 +84,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->unknown_health_reported = false ; node_ptr->goEnabled_failed_subf = false ; + node_ptr->hbsClient_ready = false ; /* load worker subfunciton alarm state */ EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,