From acd2d684f69bd4afc51c4cb4c9219e68aec39da2 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 8 Jun 2018 09:45:50 -0400 Subject: [PATCH] Mtce: Debounce heartbeat recovery For the event of Heartbeat Failure with a host, the Mtce Heartbeat Agent will declare heartbeat recovery upon the first successful heartbeat reply after the loss is declared; basically edge-triggered recovery. In cases where a networking issue causes heartbeat loss of a group of hosts, Maintenance tracks the group of hosts that experienced heartbeat loss and puts the system into 'Multi Node Failure Avoidance' (MNFA) mode. Maintenance then simply waits up to a configured timeout period for hosts to regain heartbeat. As heartbeat is regained for each host, 'Graceful Recovery' is attempted for that host. However, if the networking issue persists in a way that the occasional transient heartbeat pulse gets through, then the maintenance system can prematurely take hosts and then 'the system' out of MNFA mode, only to find that heartbeat is not actually recovered, and then fail and force reboot/reset each node that is still experiencing heartbeat loss. This update changes the heartbeat service from 'edge' to 'level' sensitive recovery by requiring a number of back-to-back heartbeat pulses following a failure before that host is declared as recovered and pulled out of the MNFA pool. Basically, this update makes the system's MNFA recovery algorithm more robust in the face of transient heartbeat loss for a group of hosts. 
Story: 2002882 Task: 22845 Change-Id: Ie36b73a14cfad317d900e3a3a9ddb434326737a1 Signed-off-by: Jack Ding --- .../cgts-mtce-common-1.0/common/nodeBase.h | 2 +- .../cgts-mtce-common-1.0/common/nodeClass.cpp | 91 +++++++++++++++---- .../cgts-mtce-common-1.0/common/nodeClass.h | 8 +- .../heartbeat/hbsAgent.cpp | 51 ++--------- 4 files changed, 88 insertions(+), 64 deletions(-) diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h b/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h index 332e4041..120ba0fa 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h @@ -165,7 +165,7 @@ void daemon_exit ( void ); #define MAX_API_LOG_LEN (0x1000) #define MAX_FILENAME_LEN (100) #define MAX_SYSTEM_CMD_LEN (200) - +#define HBS_PULSES_REQUIRED_FOR_RECOVERY (10) #define MAX_START_SERVICES_RETRY (20) #define DEFAULT_MTCALIVE_TIMEOUT (1200) diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp index e10f2528..d096e81a 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp @@ -690,7 +690,9 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->max_count[i] = 0 ; ptr->hbs_count[i] = 0 ; ptr->hbs_minor_count[i] = 0 ; + ptr->hbs_misses_count[i] = 0 ; ptr->b2b_misses_count[i] = 0 ; + ptr->b2b_pulses_count[i] = 0 ; ptr->hbs_degrade_count[i] = 0 ; ptr->hbs_failure_count[i] = 0 ; ptr->heartbeat_failed[i] = false; @@ -1139,25 +1141,26 @@ void nodeLinkClass::print_node_info ( void ) if (( i == INFRA_IFACE ) && ( infra_network_provisioned == false )) continue ; - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n"); - syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses | %s (%4d) |\n" , + syslog ( LOG_INFO, 
"+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); + syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" , get_iface_name_str ((iface_enum)i), hosts, hbs_disabled ? "DISABLED" : "Enabled ", hbs_pulse_period ); - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n"); + syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next ) { - syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %d msec\n", + syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %7x | %d msec\n", ptr->hostname.c_str(), ptr->monitor[i] ? 'Y' : 'n', - ptr->b2b_misses_count[i], + ptr->hbs_misses_count[i], ptr->max_count[i], ptr->hbs_degrade_count[i], ptr->hbs_failure_count[i], - ptr->hbs_count[i], + ptr->hbs_count[i], + ptr->b2b_pulses_count[i], hbs_pulse_period ); } } - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n"); + syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); } } @@ -7285,7 +7288,7 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_ int send_event ( string & hostname, unsigned int cmd, iface_enum iface ); -int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false ) +int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear ) { int rc = FAIL ; if ( ! 
hostname.empty() ) @@ -7299,12 +7302,17 @@ int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool tr { node_ptr->no_work_log_throttle = 0 ; node_ptr->b2b_misses_count[iface] = 0 ; + node_ptr->hbs_misses_count[iface] = 0 ; + node_ptr->b2b_pulses_count[iface] = 0 ; node_ptr->max_count[iface] = 0 ; node_ptr->hbs_failure[iface] = false ; - send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ; node_ptr->hbs_minor[iface] = false ; - send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ; node_ptr->hbs_degrade[iface] = false ; + if ( send_clear == true ) + { + send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ; + send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ; + } } return PASS ; } @@ -7693,12 +7701,57 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle if (( pulse_list[iface].head_ptr != NULL ) && ( ptr != NULL ) && ( ptr->linknum[iface] != 0)) { pulse_ptr = ptr ; - ptr->hbs_count[iface]++ ; manage_pulse_flags ( pulse_ptr , flags ); + /* clear_b2b_misses_count override check ; thresold recovery */ if ( clear_b2b_misses_count == true ) { + ptr->hbs_count[iface]++ ; + ptr->b2b_pulses_count[iface]++ ; + if ( ptr->hbs_failure[iface] == true ) + { + /* threshold failure recovery */ + if ( ptr->b2b_pulses_count[iface] < HBS_PULSES_REQUIRED_FOR_RECOVERY ) + { + /* don't clear the alarm or send clear notifications to mtc + * if this interfaces failed and has not yet received the + * required number of back to back pulses needed for recovery */ + clear_b2b_misses_count = false ; + ilog ("%s %s heartbeat failure recovery (%d of %d)\n", + node_ptr->hostname.c_str(), + get_iface_name_str(iface), + ptr->b2b_pulses_count[iface], + HBS_PULSES_REQUIRED_FOR_RECOVERY); + } + else + { + ptr->hbs_failure[iface] = false ; + ilog ("%s %s heartbeat failure recovery (%d)\n", + node_ptr->hostname.c_str(), + get_iface_name_str(iface), + 
ptr->b2b_pulses_count[iface]); + } + } + else + { + ptr->b2b_misses_count[iface] = 0 ; + } + } + else + { + if (( ptr->b2b_pulses_count[iface] != 0 ) && ( ptr->hbs_failure[iface] == true )) + { + ilog ("%s %s failed but %d\n", node_ptr->hostname.c_str(), + get_iface_name_str(iface), + ptr->b2b_pulses_count[iface]); + } + + } + + if ( clear_b2b_misses_count == true ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CLEAR, iface ); if ( ptr->b2b_misses_count[iface] > hbs_degrade_threshold ) { ilog ("%s %s Pulse Rxed (after %d misses)\n", @@ -7707,8 +7760,6 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle node_ptr->b2b_misses_count[iface]); } - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CLEAR, iface ); - ptr->b2b_misses_count[iface] = 0 ; if ( pulse_ptr->hbs_degrade[iface] == true ) { @@ -7964,15 +8015,15 @@ int nodeLinkClass::lost_pulses ( iface_enum iface ) { string flat = "Flat Line:" ; pulse_ptr->b2b_misses_count[iface]++ ; + pulse_ptr->hbs_misses_count[iface]++ ; + pulse_ptr->b2b_pulses_count[iface] = 0 ; // pulse_ptr->max_count[iface]++ ; /* Don't log single misses unless in debug mode */ if ( pulse_ptr->b2b_misses_count[iface] > 1 ) { - // if ( pulse_ptr->b2b_misses_count[iface] >= 25 ) if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) { - // if ( pulse_ptr->b2b_misses_count[iface] == 25 ) if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) { ilog ("%-13s %s Pulse Miss (%d) (log throttled to every %d)\n", @@ -8440,13 +8491,15 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr ) char str[MAX_MEM_LOG_DATA] ; for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Counts Minor:%d Degrade:%d Failed:%d Max:%d Cur:%d\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Counts Minor:%d Degrade:%d Failed:%d Misses:%d MaxB2BMisses:%d Cur:%d Tot:%d\n", + node_ptr->hostname.c_str(), 
get_iface_name_str(iface), - node_ptr->hbs_minor_count[iface], - node_ptr->hbs_degrade_count[iface], + node_ptr->hbs_minor_count[iface], + node_ptr->hbs_degrade_count[iface], node_ptr->hbs_failure_count[iface], + node_ptr->hbs_misses_count[iface], node_ptr->max_count[iface], + node_ptr->b2b_pulses_count[iface], node_ptr->hbs_count[iface]); mem_log (str); } diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h index 444e618b..1e2c80f4 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h @@ -515,9 +515,15 @@ private: /** Ongoing heartbeat count cleared on HBS_START reset */ int hbs_count [MAX_IFACES] ; + /** Keep track of the number of misses since heartbeat was started */ + int hbs_misses_count [MAX_IFACES]; + /** Immediate running count of consecutive heartbeat misses */ int b2b_misses_count [MAX_IFACES]; + /** Number of consecutive pulses received since last miss */ + int b2b_pulses_count [MAX_IFACES]; + /** Maximum heartbeat misses since node was last brought into service */ int max_count [MAX_IFACES]; @@ -1929,7 +1935,7 @@ public: void manage_pulse_flags ( string & hostname, unsigned int flags ); /** Control the heartbeat monitoring state of a host */ - int mon_host ( const string & hostname, iface_enum iface, bool true_false ); + int mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear ); /** Return true if the pulse list is empty */ bool pulse_list_empty ( iface_enum iface ); diff --git a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp index c1b6e14d..b69aefb2 100755 --- a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp +++ b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp @@ -132,33 +132,6 @@ void monitor_scheduling ( unsigned long long & this_time, unsigned long long & p prev_time = this_time ; } -void 
nodeLinkClass::recalibrate_thresholds ( void ) -{ - if ( hbsInv.hosts > hbs_config.hbs_calibrate_threshold ) - { - hbsInv.hbs_pulse_period = (hbsInv.hosts * hbs_config.hbs_calibrate_period_factor ) ; - hbsInv.hbs_minor_threshold = (hbsInv.hosts * hbs_config.hbs_calibrate_minor_factor ) ; - hbsInv.hbs_degrade_threshold = (hbsInv.hosts * hbs_config.hbs_calibrate_degrade_factor) ; - hbsInv.hbs_failure_threshold = (hbsInv.hosts * hbs_config.hbs_calibrate_fail_factor ) ; - } - else - { - hbsInv.hbs_pulse_period = hbs_config.hbs_pulse_period ; - hbsInv.hbs_minor_threshold = hbs_config.hbs_minor_threshold ; - hbsInv.hbs_degrade_threshold = hbs_config.hbs_degrade_threshold ; - hbsInv.hbs_failure_threshold = hbs_config.hbs_failure_threshold ; - } - - hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ; - - ilog ("Heartbeat Thresholds ; hosts:%d pulse:%d msecs - minor:%d degrade:%d failure:%d\n", - hbsInv.hosts, - hbsInv.hbs_pulse_period, - hbsInv.hbs_minor_threshold, - hbsInv.hbs_degrade_threshold, - hbsInv.hbs_failure_threshold); -} - /* Cleanup exit handler */ void daemon_exit ( void ) { @@ -253,18 +226,18 @@ static int hbs_config_handler ( void * user, if (MATCH("agent", "hbs_minor_threshold")) { - config_ptr->hbs_minor_threshold = atoi(value); + config_ptr->hbs_minor_threshold = hbsInv.hbs_minor_threshold = atoi(value); } if (MATCH("agent", "heartbeat_degrade_threshold")) { - config_ptr->hbs_degrade_threshold = atoi(value); + config_ptr->hbs_degrade_threshold = hbsInv.hbs_degrade_threshold = atoi(value); config_ptr->mask |= CONFIG_AGENT_HBS_DEGRADE ; } if (MATCH("agent", "heartbeat_failure_threshold")) { - config_ptr->hbs_failure_threshold = atoi(value); + config_ptr->hbs_failure_threshold = hbsInv.hbs_failure_threshold = atoi(value); config_ptr->mask |= CONFIG_AGENT_HBS_FAILURE ; } @@ -1387,14 +1360,12 @@ void daemon_service_run ( void ) /* clear any outstanding alarms on the ADD */ hbsAlarm_clear_all ( hostname ); - - // hbsInv.recalibrate_thresholds (); } else if 
( msg.cmd == MTC_CMD_DEL_HOST ) { for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - hbsInv.mon_host ( hostname, (iface_enum)iface, false ); + hbsInv.mon_host ( hostname, (iface_enum)iface, false, false ); } hbsInv.del_host ( hostname ); @@ -1402,35 +1373,29 @@ void daemon_service_run ( void ) /* clear any outstanding alarms on the DEL */ hbsAlarm_clear_all ( hostname ); - - hbsInv.print_node_info(); - - // hbsInv.recalibrate_thresholds (); } else if ( msg.cmd == MTC_CMD_STOP_HOST ) { for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - hbsInv.mon_host ( hostname, (iface_enum)iface, false ); + hbsInv.mon_host ( hostname, (iface_enum)iface, false, true ); } ilog ("%s stopping heartbeat service\n", hostname.c_str()); - hbsInv.print_node_info(); } else if ( msg.cmd == MTC_CMD_START_HOST ) { for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - hbsInv.mon_host ( hostname, (iface_enum)iface, true ); + hbsInv.mon_host ( hostname, (iface_enum)iface, true, true ); } ilog ("%s starting heartbeat service\n", hostname.c_str()); - hbsInv.print_node_info(); } else if ( msg.cmd == MTC_RESTART_HBS ) { for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - hbsInv.mon_host ( hostname, (iface_enum)iface, false ); - hbsInv.mon_host ( hostname, (iface_enum)iface, true ); + hbsInv.mon_host ( hostname, (iface_enum)iface, false, false ); + hbsInv.mon_host ( hostname, (iface_enum)iface, true, false ); } ilog ("%s restarting heartbeat service\n", hostname.c_str()); hbsInv.print_node_info();