diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h b/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h index 332e4041..120ba0fa 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeBase.h @@ -165,7 +165,7 @@ void daemon_exit ( void ); #define MAX_API_LOG_LEN (0x1000) #define MAX_FILENAME_LEN (100) #define MAX_SYSTEM_CMD_LEN (200) - +#define HBS_PULSES_REQUIRED_FOR_RECOVERY (10) #define MAX_START_SERVICES_RETRY (20) #define DEFAULT_MTCALIVE_TIMEOUT (1200) diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp index e10f2528..d096e81a 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp @@ -690,7 +690,9 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->max_count[i] = 0 ; ptr->hbs_count[i] = 0 ; ptr->hbs_minor_count[i] = 0 ; + ptr->hbs_misses_count[i] = 0 ; ptr->b2b_misses_count[i] = 0 ; + ptr->b2b_pulses_count[i] = 0 ; ptr->hbs_degrade_count[i] = 0 ; ptr->hbs_failure_count[i] = 0 ; ptr->heartbeat_failed[i] = false; @@ -1139,25 +1141,26 @@ void nodeLinkClass::print_node_info ( void ) if (( i == INFRA_IFACE ) && ( infra_network_provisioned == false )) continue ; - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n"); - syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses | %s (%4d) |\n" , + syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); + syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" , get_iface_name_str ((iface_enum)i), hosts, hbs_disabled ? 
"DISABLED" : "Enabled ", hbs_pulse_period ); - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n"); + syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next ) { - syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %d msec\n", + syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %7x | %d msec\n", ptr->hostname.c_str(), ptr->monitor[i] ? 'Y' : 'n', - ptr->b2b_misses_count[i], + ptr->hbs_misses_count[i], ptr->max_count[i], ptr->hbs_degrade_count[i], ptr->hbs_failure_count[i], - ptr->hbs_count[i], + ptr->hbs_count[i], + ptr->b2b_pulses_count[i], hbs_pulse_period ); } } - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n"); + syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); } } @@ -7285,7 +7288,7 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_ int send_event ( string & hostname, unsigned int cmd, iface_enum iface ); -int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false ) +int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear ) { int rc = FAIL ; if ( ! 
hostname.empty() ) @@ -7299,12 +7302,17 @@ int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool tr { node_ptr->no_work_log_throttle = 0 ; node_ptr->b2b_misses_count[iface] = 0 ; + node_ptr->hbs_misses_count[iface] = 0 ; + node_ptr->b2b_pulses_count[iface] = 0 ; node_ptr->max_count[iface] = 0 ; node_ptr->hbs_failure[iface] = false ; - send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ; node_ptr->hbs_minor[iface] = false ; - send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ; node_ptr->hbs_degrade[iface] = false ; + if ( send_clear == true ) + { + send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ; + send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ; + } } return PASS ; } @@ -7693,12 +7701,57 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle if (( pulse_list[iface].head_ptr != NULL ) && ( ptr != NULL ) && ( ptr->linknum[iface] != 0)) { pulse_ptr = ptr ; - ptr->hbs_count[iface]++ ; manage_pulse_flags ( pulse_ptr , flags ); + /* clear_b2b_misses_count override check ; threshold recovery */ if ( clear_b2b_misses_count == true ) { + ptr->hbs_count[iface]++ ; + ptr->b2b_pulses_count[iface]++ ; + if ( ptr->hbs_failure[iface] == true ) + { + /* threshold failure recovery */ + if ( ptr->b2b_pulses_count[iface] < HBS_PULSES_REQUIRED_FOR_RECOVERY ) + { + /* don't clear the alarm or send clear notifications to mtc + * if this interface failed and has not yet received the + * required number of back to back pulses needed for recovery */ + clear_b2b_misses_count = false ; + ilog ("%s %s heartbeat failure recovery (%d of %d)\n", + node_ptr->hostname.c_str(), + get_iface_name_str(iface), + ptr->b2b_pulses_count[iface], + HBS_PULSES_REQUIRED_FOR_RECOVERY); + } + else + { + ptr->hbs_failure[iface] = false ; + ilog ("%s %s heartbeat failure recovery (%d)\n", + node_ptr->hostname.c_str(), + get_iface_name_str(iface), + 
ptr->b2b_pulses_count[iface]); + } + } + else + { + ptr->b2b_misses_count[iface] = 0 ; + } + } + else + { + if (( ptr->b2b_pulses_count[iface] != 0 ) && ( ptr->hbs_failure[iface] == true )) + { + ilog ("%s %s failed but %d\n", node_ptr->hostname.c_str(), + get_iface_name_str(iface), + ptr->b2b_pulses_count[iface]); + } + + } + + if ( clear_b2b_misses_count == true ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CLEAR, iface ); if ( ptr->b2b_misses_count[iface] > hbs_degrade_threshold ) { ilog ("%s %s Pulse Rxed (after %d misses)\n", @@ -7707,8 +7760,6 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle node_ptr->b2b_misses_count[iface]); } - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CLEAR, iface ); - ptr->b2b_misses_count[iface] = 0 ; if ( pulse_ptr->hbs_degrade[iface] == true ) { @@ -7964,15 +8015,15 @@ int nodeLinkClass::lost_pulses ( iface_enum iface ) { string flat = "Flat Line:" ; pulse_ptr->b2b_misses_count[iface]++ ; + pulse_ptr->hbs_misses_count[iface]++ ; + pulse_ptr->b2b_pulses_count[iface] = 0 ; // pulse_ptr->max_count[iface]++ ; /* Don't log single misses unless in debug mode */ if ( pulse_ptr->b2b_misses_count[iface] > 1 ) { - // if ( pulse_ptr->b2b_misses_count[iface] >= 25 ) if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) { - // if ( pulse_ptr->b2b_misses_count[iface] == 25 ) if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) { ilog ("%-13s %s Pulse Miss (%d) (log throttled to every %d)\n", @@ -8440,13 +8491,15 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr ) char str[MAX_MEM_LOG_DATA] ; for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Counts Minor:%d Degrade:%d Failed:%d Max:%d Cur:%d\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Counts Minor:%d Degrade:%d Failed:%d Misses:%d MaxB2BMisses:%d Cur:%d Tot:%d\n", + node_ptr->hostname.c_str(), 
get_iface_name_str(iface), - node_ptr->hbs_minor_count[iface], - node_ptr->hbs_degrade_count[iface], + node_ptr->hbs_minor_count[iface], + node_ptr->hbs_degrade_count[iface], node_ptr->hbs_failure_count[iface], + node_ptr->hbs_misses_count[iface], node_ptr->max_count[iface], + node_ptr->b2b_pulses_count[iface], node_ptr->hbs_count[iface]); mem_log (str); } diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h index 444e618b..1e2c80f4 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h @@ -515,9 +515,15 @@ private: /** Ongoing heartbeat count cleared on HBS_START reset */ int hbs_count [MAX_IFACES] ; + /** Keep track of the number of misses since heartbeat was started */ + int hbs_misses_count [MAX_IFACES]; + /** Immediate running count of consecutive heartbeat misses */ int b2b_misses_count [MAX_IFACES]; + /** Number of consecutive pulses received since last miss */ + int b2b_pulses_count [MAX_IFACES]; + /** Maximum heartbeat misses since node was last brought into service */ int max_count [MAX_IFACES]; @@ -1929,7 +1935,7 @@ public: void manage_pulse_flags ( string & hostname, unsigned int flags ); /** Control the heartbeat monitoring state of a host */ - int mon_host ( const string & hostname, iface_enum iface, bool true_false ); + int mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear ); /** Return true if the pulse list is empty */ bool pulse_list_empty ( iface_enum iface ); diff --git a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp index c1b6e14d..b69aefb2 100755 --- a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp +++ b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp @@ -132,33 +132,6 @@ void monitor_scheduling ( unsigned long long & this_time, unsigned long long & p prev_time = this_time ; } -void 
nodeLinkClass::recalibrate_thresholds ( void ) -{ - if ( hbsInv.hosts > hbs_config.hbs_calibrate_threshold ) - { - hbsInv.hbs_pulse_period = (hbsInv.hosts * hbs_config.hbs_calibrate_period_factor ) ; - hbsInv.hbs_minor_threshold = (hbsInv.hosts * hbs_config.hbs_calibrate_minor_factor ) ; - hbsInv.hbs_degrade_threshold = (hbsInv.hosts * hbs_config.hbs_calibrate_degrade_factor) ; - hbsInv.hbs_failure_threshold = (hbsInv.hosts * hbs_config.hbs_calibrate_fail_factor ) ; - } - else - { - hbsInv.hbs_pulse_period = hbs_config.hbs_pulse_period ; - hbsInv.hbs_minor_threshold = hbs_config.hbs_minor_threshold ; - hbsInv.hbs_degrade_threshold = hbs_config.hbs_degrade_threshold ; - hbsInv.hbs_failure_threshold = hbs_config.hbs_failure_threshold ; - } - - hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ; - - ilog ("Heartbeat Thresholds ; hosts:%d pulse:%d msecs - minor:%d degrade:%d failure:%d\n", - hbsInv.hosts, - hbsInv.hbs_pulse_period, - hbsInv.hbs_minor_threshold, - hbsInv.hbs_degrade_threshold, - hbsInv.hbs_failure_threshold); -} - /* Cleanup exit handler */ void daemon_exit ( void ) { @@ -253,18 +226,18 @@ static int hbs_config_handler ( void * user, if (MATCH("agent", "hbs_minor_threshold")) { - config_ptr->hbs_minor_threshold = atoi(value); + config_ptr->hbs_minor_threshold = hbsInv.hbs_minor_threshold = atoi(value); } if (MATCH("agent", "heartbeat_degrade_threshold")) { - config_ptr->hbs_degrade_threshold = atoi(value); + config_ptr->hbs_degrade_threshold = hbsInv.hbs_degrade_threshold = atoi(value); config_ptr->mask |= CONFIG_AGENT_HBS_DEGRADE ; } if (MATCH("agent", "heartbeat_failure_threshold")) { - config_ptr->hbs_failure_threshold = atoi(value); + config_ptr->hbs_failure_threshold = hbsInv.hbs_failure_threshold = atoi(value); config_ptr->mask |= CONFIG_AGENT_HBS_FAILURE ; } @@ -1387,14 +1360,12 @@ void daemon_service_run ( void ) /* clear any outstanding alarms on the ADD */ hbsAlarm_clear_all ( hostname ); - - // hbsInv.recalibrate_thresholds (); } else if 
( msg.cmd == MTC_CMD_DEL_HOST ) { for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - hbsInv.mon_host ( hostname, (iface_enum)iface, false ); + hbsInv.mon_host ( hostname, (iface_enum)iface, false, false ); } hbsInv.del_host ( hostname ); @@ -1402,35 +1373,29 @@ void daemon_service_run ( void ) /* clear any outstanding alarms on the DEL */ hbsAlarm_clear_all ( hostname ); - - hbsInv.print_node_info(); - - // hbsInv.recalibrate_thresholds (); } else if ( msg.cmd == MTC_CMD_STOP_HOST ) { for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - hbsInv.mon_host ( hostname, (iface_enum)iface, false ); + hbsInv.mon_host ( hostname, (iface_enum)iface, false, true ); } ilog ("%s stopping heartbeat service\n", hostname.c_str()); - hbsInv.print_node_info(); } else if ( msg.cmd == MTC_CMD_START_HOST ) { for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - hbsInv.mon_host ( hostname, (iface_enum)iface, true ); + hbsInv.mon_host ( hostname, (iface_enum)iface, true, true ); } ilog ("%s starting heartbeat service\n", hostname.c_str()); - hbsInv.print_node_info(); } else if ( msg.cmd == MTC_RESTART_HBS ) { for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - hbsInv.mon_host ( hostname, (iface_enum)iface, false ); - hbsInv.mon_host ( hostname, (iface_enum)iface, true ); + hbsInv.mon_host ( hostname, (iface_enum)iface, false, false ); + hbsInv.mon_host ( hostname, (iface_enum)iface, true, false ); } ilog ("%s restarting heartbeat service\n", hostname.c_str()); hbsInv.print_node_info();