diff --git a/mtce-common/src/daemon/daemon_common.h b/mtce-common/src/daemon/daemon_common.h index 9ea16339..3f9ac031 100755 --- a/mtce-common/src/daemon/daemon_common.h +++ b/mtce-common/src/daemon/daemon_common.h @@ -245,6 +245,7 @@ typedef struct long msecs ; } time_delta_type ; +int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta ); int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta ); int gettime ( time_debug_type & nowtime ) ; unsigned long long gettime_monotonic_nsec ( void ); diff --git a/mtce-common/src/daemon/daemon_debug.cpp b/mtce-common/src/daemon/daemon_debug.cpp index 4a39ccfa..68e97652 100755 --- a/mtce-common/src/daemon/daemon_debug.cpp +++ b/mtce-common/src/daemon/daemon_debug.cpp @@ -26,20 +26,20 @@ unsigned long long gettime_monotonic_nsec ( void ) { struct timespec ts; clock_gettime (CLOCK_MONOTONIC, &ts); - return ((unsigned long long) ts.tv_sec) * 1000000000ULL + ts.tv_nsec; + return ((unsigned long long) ts.tv_sec) * 1000000000ULL + ts.tv_nsec; } - + int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta ) -{ +{ /* Subtract before from after */ if ((after.ts.tv_sec < before.ts.tv_sec) || ((after.ts.tv_sec == before.ts.tv_sec) && - (after.ts.tv_nsec <= before.ts.tv_nsec))) + (after.ts.tv_nsec <= before.ts.tv_nsec))) { delta.secs = delta.msecs = 1 ; - } - else + } + else { delta.secs = after.ts.tv_sec - before.ts.tv_sec ; if (after.ts.tv_nsec < before.ts.tv_nsec) @@ -51,7 +51,34 @@ int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_t { delta.msecs = after.ts.tv_nsec - before.ts.tv_nsec ; } - delta.msecs = (delta.msecs/1000); + /* convert nsec time to msec time */ + delta.msecs = (delta.msecs/1000000); + } + return (PASS) ; +} + +int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta ) +{ + /* Subtract before from after */ + + if ((after.tv_sec < before.tv_sec) || ((after.tv_sec == before.tv_sec) && (after.tv_nsec <= before.tv_nsec))) + { + delta.secs = delta.msecs = 0 ; + } + else + { + delta.secs = after.tv_sec - before.tv_sec ; + if (after.tv_nsec < before.tv_nsec) + { + delta.msecs = after.tv_nsec + 1000000000L - before.tv_nsec ; + delta.secs-- ; + } + else + { + delta.msecs = after.tv_nsec - before.tv_nsec ; + } + /* convert nsec time to msec time */ + delta.msecs = (delta.msecs/1000000); } return (PASS) ; } diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index b596baba..4eec5a29 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -69,6 +69,19 @@ using namespace std; #define MAX_LEN 1000 +/* Stores the MONOTONIC time the last SM heartbeat was received + * Heartbeat metrics + * SM heartbeat period definition */ +static struct timespec sm_heartbeat_timestamp_last = { 0 , 0 } ; +static struct timespec sm_heartbeat_timestamp_restart = { 0 , 0 } ; +static int sm_heartbeat_count_b2b_misses = 0 ; +static int sm_heartbeat_count = 0 ; +const int SM_HEARTBEAT_PULSE_INTERVAL_MSEC = 100; +const int SM_HEARTBEAT_PULSE_PERIOD_MSECS = 800; +const int SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC = SM_HEARTBEAT_PULSE_PERIOD_MSECS * 2; +const int SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER = + SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC / SM_HEARTBEAT_PULSE_INTERVAL_MSEC; + /* Historical String data for mem_logs */ static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ; static string arrival_histogram[MAX_IFACES] = { "" , "" } ; @@ -1398,6 +1411,7 @@ int daemon_init ( string iface, string nodetype ) * ****************************************************************************/ static int _hbs_sm_handler_log_throttle = 0 ; +static int _hbs_sm_heartbeat_log_throttle = 0 ; void hbs_sm_handler ( void ) { #define _MAX_MSG_LEN (80) @@ -1438,9 +1452,26 @@ void hbs_sm_handler ( void ) if (( service == SUPPERTED_SERVICE ) && ( request == SUPPORTED_REQUEST )) { - /* success path ... */ - hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" ); - + /* SM heartbeat pulses have a reqid = 0 and do not require a response */ + if ( reqid == 0 ) + { + time_delta_type delta ; + struct timespec ts = sm_heartbeat_timestamp_last ; + clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last ); + if(sm_heartbeat_count_b2b_misses && sm_heartbeat_timestamp_restart.tv_sec == 0) + { + sm_heartbeat_timestamp_restart = sm_heartbeat_timestamp_last; + } + timedelta (ts, sm_heartbeat_timestamp_last, delta ); + sm_heartbeat_count++ ; + ilog_throttled(_hbs_sm_heartbeat_log_throttle, 100, "SM Heartbeat %d (%ld.%03ld secs)", + sm_heartbeat_count, delta.secs, delta.msecs); + } + else + { + /* success path ... */ + hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" ); + } /* reset log throttle */ _hbs_sm_handler_log_throttle = 0 ; } @@ -1476,6 +1507,92 @@ void hbs_sm_handler ( void ) dlog ("... %s", sm_mesg ); } +/**************************************************************************** + * + * Name ; manage_sm_heartbeat + * + * Purpose : Determine if we received an SM heartbeat message within + * the last SM_HEARTBEAT_PULSE_PERIOD_MSECS + * + * Description: Compare the monotonic now time to the monotonic time + * of the last received SM heartbeat pulse. + * + * Returns : True if time dela is less than SM_HEARTBEAT_PULSE_PERIOD_MSECS + * False if time delta is greater + * + ***************************************************************************/ +bool manage_sm_heartbeat ( void ) +{ + struct timespec ts ; + time_delta_type delta ; + clock_gettime (CLOCK_MONOTONIC, &ts ); + timedelta (sm_heartbeat_timestamp_last, ts, delta ); + int64_t delta_in_ms = delta.secs * 1000 + delta.msecs; + bool heartbeat_ok; + if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS ) + { + sm_heartbeat_count = 0; + if (( ++sm_heartbeat_count_b2b_misses < 20 )|| + (!( sm_heartbeat_count_b2b_misses % 100 ))) + { + wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d", + delta.secs, delta.msecs, + sm_heartbeat_count_b2b_misses, + sm_heartbeat_count); + } + heartbeat_ok = false; + } + else + { + if(sm_heartbeat_count_b2b_misses) + { + int expected_beeps = delta_in_ms / SM_HEARTBEAT_PULSE_INTERVAL_MSEC - 1; + + if(sm_heartbeat_count >= expected_beeps) + { + if(sm_heartbeat_count >= SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER) + { + ilog("SM Heartbeat recovered (%d:%dbeeps/%ldms) after %d missing", + sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER, + delta_in_ms, + sm_heartbeat_count_b2b_misses); + + sm_heartbeat_count_b2b_misses = 0; + sm_heartbeat_count = 0; + sm_heartbeat_timestamp_restart.tv_sec = 0; + sm_heartbeat_timestamp_restart.tv_nsec = 0; + heartbeat_ok = true; + }else + { + ilog("SM Heartbeat recover continue (%d:%dbeeps/%ldms) after %d missing", + sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER, + delta_in_ms, + sm_heartbeat_count_b2b_misses); + heartbeat_ok = false; // not good enough to declare recovered yet + } + }else + { + ilog("SM Heartbeat recover is interrupted after %ldms, missing %d beeps. " + "Counting will restart.", + delta_in_ms, expected_beeps - sm_heartbeat_count); + sm_heartbeat_timestamp_restart = ts; + sm_heartbeat_count = 1; + heartbeat_ok = false; // recover is interrupted by further missing beep + } + }else + { + if(delta_in_ms >= SM_HEARTBEAT_PULSE_INTERVAL_MSEC * 2) + { + ilog("SM Heartbeat missing for %ldms:%dms. Not yet declare stall.", + delta_in_ms, SM_HEARTBEAT_PULSE_PERIOD_MSECS + ); + } + heartbeat_ok = true; // not bad enough to declare heartbeat failed yet + } + } + return heartbeat_ok; +} + /**************************************************************************** * * Name : daemon_service_run @@ -1552,7 +1669,8 @@ void daemon_service_run ( void ) hbs_state_audit (); } - /* run the first audit in 30 seconds */ + /* The first audit was run after 30 seconds but then the + * continuous rate is every hour */ mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 ); } @@ -1719,6 +1837,9 @@ void daemon_service_run ( void ) sockets_init = true ; monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START ); + /* Update Sm heartbeat time to now time */ + clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last ); + /* no need for the heartbeat audit in a simplex system */ if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) { @@ -2363,6 +2484,8 @@ void daemon_service_run ( void ) */ else { + bool heartbeat_ok = manage_sm_heartbeat(); + /* manage vault wrt peer controller */ hbs_cluster_peer(); @@ -2396,9 +2519,14 @@ void daemon_service_run ( void ) int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding); if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled ) { - hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding); + hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding, heartbeat_ok ); } } + /* log cluster throttled */ + if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 ))) + { + hbs_state_audit ( ); + } hbsTimer.ring = false ; heartbeat_request = true ; seq_num++ ; diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index 932ed543..bfa8f1d1 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -290,7 +290,8 @@ void hbs_cluster_peer ( void ); */ void hbs_cluster_update ( iface_enum iface, unsigned short not_responding_hosts, - bool storage_0_responding ); + bool storage_0_responding, + bool sm_heartbeat_ok); /* Called by the hbsAgent pulse transmitter to append this controllers * running cluster view in the next multicast pulse request. diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp index 10ac8991..780fa8e3 100644 --- a/mtce/src/heartbeat/hbsCluster.cpp +++ b/mtce/src/heartbeat/hbsCluster.cpp @@ -217,7 +217,7 @@ void cluster_list ( void ) void cluster_storage0_state ( bool enabled ) { - if ( ctrl.cluster.storage0_enabled != enabled ) + if ( (bool)ctrl.cluster.storage0_enabled != enabled ) { ctrl.cluster.storage0_enabled = enabled ; ilog ("storage-0 heartbeat state changed to %s", @@ -305,36 +305,40 @@ void hbs_cluster_add ( string & hostname ) void hbs_cluster_del ( string & hostname ) { std::list::iterator hostname_ptr ; + bool found = false ; for ( hostname_ptr = ctrl.monitored_hostname_list.begin(); hostname_ptr != ctrl.monitored_hostname_list.end() ; hostname_ptr++ ) { if ( hostname_ptr->compare(hostname) == 0 ) { - ctrl.monitored_hostname_list.remove(hostname) ; - ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); - - /* Manage storage-0 state. */ - if ( hostname.compare(STORAGE_0) == 0 ) - { - cluster_storage0_state ( false ); - } - - /* If we get down to 0 monitored hosts then just start fresh */ - if (( ctrl.monitored_hosts ) == 0 ) - { - hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); - } - - ilog ("%s deleted from cluster", hostname.c_str()); - - cluster_list (); - - hbs_cluster_change ( hostname + " deleted" ); - + found = true ; break ; } } + if ( found == true ) + { + ctrl.monitored_hostname_list.remove(hostname) ; + ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); + + /* Manage storage-0 state. */ + if ( hostname.compare(STORAGE_0) == 0 ) + cluster_storage0_state ( false ); + + /* If we get down to 0 monitored hosts then just start fresh */ + if (( ctrl.monitored_hosts ) == 0 ) + hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); + + ilog ("%s deleted from cluster", hostname.c_str()); + + cluster_list (); + + hbs_cluster_change ( hostname + " deleted" ); + } + else + { + slog("%s not found in cluster list", hostname.c_str()); + } } /**************************************************************************** @@ -371,7 +375,7 @@ void hbs_cluster_period_start ( void ) * 4. Maintain a back to back non-responding count for storage-0. * Once the count reaches the minimum threshold of * STORAGE_0_NR_THRESHOLD then the specific network history - * is updated to indicate storgae-0 is not responding. Once + * is updated to indicate storage-0 is not responding. Once * storage-0 starts responding again with a single response * then that network history is updated to indicate storage-0 * is responding. @@ -389,7 +393,8 @@ void hbs_cluster_period_start ( void ) void hbs_cluster_update ( iface_enum iface, unsigned short not_responding_hosts, - bool storage_0_responding ) + bool storage_0_responding, + bool sm_heartbeat_ok) { if ( ctrl.monitored_hosts == 0 ) return ; @@ -407,6 +412,11 @@ void hbs_cluster_update ( iface_enum iface, else return ; + if ( ctrl.monitored_hosts < not_responding_hosts ) + { + slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts ); + } + if ( not_responding_hosts ) { clog ("controller-%d %s enabled:%d not responding:%d", @@ -454,13 +464,23 @@ void hbs_cluster_update ( iface_enum iface, } } + /* update sm heartbeat status */ + if ( sm_heartbeat_ok == true ) + { + history_ptr->sm_heartbeat_fail = false ; + } + else + { + history_ptr->sm_heartbeat_fail = true ; + } + /* Manage storage-0 status. */ if ( ctrl.cluster.storage0_enabled ) { /* Handle storage-0 status change from not responding to responding. */ if ( storage_0_responding == true ) { - if (history_ptr->storage0_responding == false) + if ((bool)history_ptr->storage0_responding == false) { history_ptr->storage0_responding = true ; ilog ("controller-%d %s heartbeat ; storage-0 is ok", @@ -483,7 +503,7 @@ void hbs_cluster_update ( iface_enum iface, } /* Handle storage-0 status change from responding to not responding. */ - if (( history_ptr->storage0_responding == true ) && + if (( (bool)history_ptr->storage0_responding == true ) && ( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD )) { history_ptr->storage0_responding = false ; @@ -495,7 +515,7 @@ void hbs_cluster_update ( iface_enum iface, else { /* Typical path for storage-0 disabled or normal non-storage system case */ - if ( history_ptr->storage0_responding == true ) + if ( (bool)history_ptr->storage0_responding == true ) history_ptr->storage0_responding = false ; /* Handle clearing threshold count when storage-0 is not enabled. */ @@ -507,6 +527,10 @@ void hbs_cluster_update ( iface_enum iface, if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES ) history_ptr->entries++ ; + if ( ctrl.monitored_hosts < not_responding_hosts ) + { + slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts ); + } /* Update the history with this data. */ history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ; history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ; diff --git a/mtce/src/heartbeat/hbsUtil.cpp b/mtce/src/heartbeat/hbsUtil.cpp index ae2a81d3..a9875ed5 100644 --- a/mtce/src/heartbeat/hbsUtil.cpp +++ b/mtce/src/heartbeat/hbsUtil.cpp @@ -328,17 +328,19 @@ void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_e } if ( storage0_enabled ) { - syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s", + syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s SM:%s %s", history.controller, hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(), history.storage0_responding ? "y" : "n", + history.sm_heartbeat_fail ? "miss":" ok", str); } else { - syslog ( LOG_INFO, "Cluster Vault : C%d %s %s", + syslog ( LOG_INFO, "Cluster Vault : C%d %s SM:%s %s", history.controller, hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(), + history.sm_heartbeat_fail ? "miss":" ok ", str); } } diff --git a/mtce/src/heartbeat/mtceHbsCluster.h b/mtce/src/heartbeat/mtceHbsCluster.h index 02255226..7690285c 100644 --- a/mtce/src/heartbeat/mtceHbsCluster.h +++ b/mtce/src/heartbeat/mtceHbsCluster.h @@ -71,7 +71,10 @@ typedef struct { unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */ unsigned short network :4 ; /* see mtce_hbs_network_enum */ - unsigned short reserved_bits :7 ; /* future - initted to 0 */ + unsigned short reserved_bits :6 ; /* future - initted to 0 */ + unsigned short sm_heartbeat_fail :1 ; /* SM Health + 0 = SM Heartbeat OK, + 1 = SM Heartbeat Failure */ unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */ unsigned short entries ; /* # of valid values in .entry */ unsigned short entries_max ; /* max size of the enry array */ diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf index 1b431004..0a2b005c 100644 --- a/mtce/src/scripts/mtc.conf +++ b/mtce/src/scripts/mtc.conf @@ -30,6 +30,11 @@ barbican_port = 9311 ; The Barbican Port Number token_refresh_rate = 1200 ; Authentication token refresh rate in seconds. ; A value of zero means no refresh. ; range is 0 or 600-22800 + +sm_heartbeat_loss_thld = 800 ; number of msecs SM heartbeat fails before + ; declaring an SM process heartbeat loss in + ; the heartbeat cluster. + autorecovery_threshold = 3 ; The number of times maintenance will try to ; auto recover a critically failed controller ; while there is no backup controllers to fail