Merge "Add SM process heartbeat and status to the hbs cluster"

Zuul 2020-12-16 16:27:18 +00:00 committed by Gerrit Code Review
commit c88c4cb7ee
8 changed files with 234 additions and 43 deletions

View File

@ -245,6 +245,7 @@ typedef struct
long msecs ;
} time_delta_type ;
int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta );
int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta );
int gettime ( time_debug_type & nowtime ) ;
unsigned long long gettime_monotonic_nsec ( void );

View File

@ -51,7 +51,34 @@ int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_t
{
delta.msecs = after.ts.tv_nsec - before.ts.tv_nsec ;
}
delta.msecs = (delta.msecs/1000);
/* convert nsec time to msec time */
delta.msecs = (delta.msecs/1000000);
}
return (PASS) ;
}
int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta )
{
/* Subtract before from after */
if ((after.tv_sec < before.tv_sec) || ((after.tv_sec == before.tv_sec) && (after.tv_nsec <= before.tv_nsec)))
{
delta.secs = delta.msecs = 0 ;
}
else
{
delta.secs = after.tv_sec - before.tv_sec ;
if (after.tv_nsec < before.tv_nsec)
{
delta.msecs = after.tv_nsec + 1000000000L - before.tv_nsec ;
delta.secs-- ;
}
else
{
delta.msecs = after.tv_nsec - before.tv_nsec ;
}
/* convert nsec time to msec time */
delta.msecs = (delta.msecs/1000000);
}
return (PASS) ;
}
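For readers unfamiliar with the helper, a minimal usage sketch of the new struct timespec overload follows. It is illustrative only and not part of this commit; it assumes nodeTimers.h (above) is included and that time_delta_type carries secs and msecs as long, as the hbsAgent logging in this change implies.
#include <time.h>   /* clock_gettime, nanosleep, CLOCK_MONOTONIC */
#include <stdio.h>  /* printf */
void timedelta_demo ( void )
{
    struct timespec before , after ;
    time_delta_type delta ;
    clock_gettime ( CLOCK_MONOTONIC, &before );
    /* stand-in for real work ; sleep for roughly 150 msec */
    struct timespec pause = { 0 , 150 * 1000000L } ;
    nanosleep ( &pause, NULL );
    clock_gettime ( CLOCK_MONOTONIC, &after );
    /* delta.secs / delta.msecs now hold the elapsed time */
    timedelta ( before, after, delta );
    printf ( "elapsed %ld.%03ld secs\n", delta.secs, delta.msecs );
}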

View File

@ -69,6 +69,19 @@ using namespace std;
#define MAX_LEN 1000
/* Stores the MONOTONIC time the last SM heartbeat was received */
static struct timespec sm_heartbeat_timestamp_last = { 0 , 0 } ;
static struct timespec sm_heartbeat_timestamp_restart = { 0 , 0 } ;
/* Heartbeat metrics */
static int sm_heartbeat_count_b2b_misses = 0 ;
static int sm_heartbeat_count = 0 ;
/* SM heartbeat period definition */
const int SM_HEARTBEAT_PULSE_INTERVAL_MSEC = 100;
const int SM_HEARTBEAT_PULSE_PERIOD_MSECS = 800;
const int SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC = SM_HEARTBEAT_PULSE_PERIOD_MSECS * 2;
const int SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER =
    SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC / SM_HEARTBEAT_PULSE_INTERVAL_MSEC;
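The constants above encode simple arithmetic: pulses arrive every 100 msec, a loss is declared after 800 msec of silence, and recovery requires two loss periods (1600 msec) of back-to-back pulses, i.e. 16 consecutive beeps. A minimal illustrative restatement of that derivation (assuming a C++11 build; not part of the commit):
static_assert ( SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC == 1600,
                "recover duration is two 800 msec loss periods" );
static_assert ( SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER == 16,
                "16 back to back 100 msec pulses are needed to declare recovery" );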
/* Historical String data for mem_logs */
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
@ -1398,6 +1411,7 @@ int daemon_init ( string iface, string nodetype )
*
****************************************************************************/
static int _hbs_sm_handler_log_throttle = 0 ;
static int _hbs_sm_heartbeat_log_throttle = 0 ;
void hbs_sm_handler ( void )
{
#define _MAX_MSG_LEN (80)
@ -1437,10 +1451,27 @@ void hbs_sm_handler ( void )
{
if (( service == SUPPERTED_SERVICE ) &&
( request == SUPPORTED_REQUEST ))
{
/* SM heartbeat pulses have a reqid = 0 and do not require a response */
if ( reqid == 0 )
{
time_delta_type delta ;
struct timespec ts = sm_heartbeat_timestamp_last ;
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
if(sm_heartbeat_count_b2b_misses && sm_heartbeat_timestamp_restart.tv_sec == 0)
{
sm_heartbeat_timestamp_restart = sm_heartbeat_timestamp_last;
}
timedelta (ts, sm_heartbeat_timestamp_last, delta );
sm_heartbeat_count++ ;
ilog_throttled(_hbs_sm_heartbeat_log_throttle, 100, "SM Heartbeat %d (%ld.%03ld secs)",
sm_heartbeat_count, delta.secs, delta.msecs);
}
else
{
/* success path ... */
hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
}
/* reset log throttle */
_hbs_sm_handler_log_throttle = 0 ;
}
@ -1476,6 +1507,92 @@ void hbs_sm_handler ( void )
dlog ("... %s", sm_mesg );
}
/****************************************************************************
*
* Name : manage_sm_heartbeat
*
* Purpose : Determine if we received an SM heartbeat message within
* the last SM_HEARTBEAT_PULSE_PERIOD_MSECS
*
* Description: Compare the monotonic now time to the monotonic time
* of the last received SM heartbeat pulse.
*
* Returns : True if the time delta is less than SM_HEARTBEAT_PULSE_PERIOD_MSECS
*           False if the time delta is greater
*
***************************************************************************/
bool manage_sm_heartbeat ( void )
{
struct timespec ts ;
time_delta_type delta ;
clock_gettime (CLOCK_MONOTONIC, &ts );
timedelta (sm_heartbeat_timestamp_last, ts, delta );
int64_t delta_in_ms = delta.secs * 1000 + delta.msecs;
bool heartbeat_ok;
if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
{
sm_heartbeat_count = 0;
if (( ++sm_heartbeat_count_b2b_misses < 20 )||
(!( sm_heartbeat_count_b2b_misses % 100 )))
{
wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d",
delta.secs, delta.msecs,
sm_heartbeat_count_b2b_misses,
sm_heartbeat_count);
}
heartbeat_ok = false;
}
else
{
if(sm_heartbeat_count_b2b_misses)
{
int expected_beeps = delta_in_ms / SM_HEARTBEAT_PULSE_INTERVAL_MSEC - 1;
if(sm_heartbeat_count >= expected_beeps)
{
if(sm_heartbeat_count >= SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER)
{
ilog("SM Heartbeat recovered (%d:%dbeeps/%ldms) after %d missing",
sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
delta_in_ms,
sm_heartbeat_count_b2b_misses);
sm_heartbeat_count_b2b_misses = 0;
sm_heartbeat_count = 0;
sm_heartbeat_timestamp_restart.tv_sec = 0;
sm_heartbeat_timestamp_restart.tv_nsec = 0;
heartbeat_ok = true;
}else
{
ilog("SM Heartbeat recover continue (%d:%dbeeps/%ldms) after %d missing",
sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
delta_in_ms,
sm_heartbeat_count_b2b_misses);
heartbeat_ok = false; // not good enough to declare recovered yet
}
}else
{
ilog("SM Heartbeat recover is interrupted after %ldms, missing %d beeps. "
"Counting will restart.",
delta_in_ms, expected_beeps - sm_heartbeat_count);
sm_heartbeat_timestamp_restart = ts;
sm_heartbeat_count = 1;
heartbeat_ok = false; // recover is interrupted by further missing beep
}
}else
{
if(delta_in_ms >= SM_HEARTBEAT_PULSE_INTERVAL_MSEC * 2)
{
ilog("SM Heartbeat missing for %ldms:%dms. Not yet declare stall.",
delta_in_ms, SM_HEARTBEAT_PULSE_PERIOD_MSECS
);
}
heartbeat_ok = true; // not bad enough to declare heartbeat failed yet
}
}
return heartbeat_ok;
}
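In summary, the function above declares the SM heartbeat lost once no pulse has arrived for SM_HEARTBEAT_PULSE_PERIOD_MSECS, and only declares it recovered after SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER back-to-back pulses. The sketch below is a simplified, illustrative model of that miss/recover rule, not the mtce implementation; it assumes the constants defined earlier in this file and omits the logging, restart timestamp, and throttling bookkeeping.
#include <stdint.h>
class sm_heartbeat_model
{
    public:
        /* call when an SM pulse (reqid == 0 message) is received */
        void pulse_received ( int64_t now_msec )
        {
            if ( lost )
                consecutive_beeps++ ;
            if ( lost && ( consecutive_beeps >= SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER ))
            {
                lost = false ;
                consecutive_beeps = 0 ;
            }
            last_pulse_msec = now_msec ;
        }
        /* call on each audit pass ; returns true while the heartbeat is considered healthy */
        bool healthy ( int64_t now_msec )
        {
            if (( now_msec - last_pulse_msec ) > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
            {
                lost = true ;
                consecutive_beeps = 0 ;
            }
            return ( lost == false ) ;
        }
    private:
        int64_t last_pulse_msec   = 0 ;
        int     consecutive_beeps = 0 ;
        bool    lost              = false ;
};
With the 100 msec pulse interval this means roughly 1.6 seconds of clean pulses after an outage before the status reported to the cluster flips back to healthy.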
/****************************************************************************
*
* Name : daemon_service_run
@ -1552,7 +1669,8 @@ void daemon_service_run ( void )
hbs_state_audit ();
}
/* run the first audit in 30 seconds */
/* The first audit is run after 30 seconds ; thereafter the
 * audit runs at an hourly rate */
mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 );
}
@ -1719,6 +1837,9 @@ void daemon_service_run ( void )
sockets_init = true ;
monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START );
/* Update the SM heartbeat timestamp to the current time */
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
/* no need for the heartbeat audit in a simplex system */
if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
{
@ -2363,6 +2484,8 @@ void daemon_service_run ( void )
*/
else
{
bool heartbeat_ok = manage_sm_heartbeat();
/* manage vault wrt peer controller */
hbs_cluster_peer();
@ -2396,9 +2519,14 @@ void daemon_service_run ( void )
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
{
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding, heartbeat_ok );
}
}
/* While the SM heartbeat is failed, audit (log) the cluster state ; throttled to every 100th miss */
if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 )))
{
hbs_state_audit ( );
}
hbsTimer.ring = false ;
heartbeat_request = true ;
seq_num++ ;

View File

@ -290,7 +290,8 @@ void hbs_cluster_peer ( void );
*/
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
bool storage_0_responding );
bool storage_0_responding,
bool sm_heartbeat_ok);
/* Called by the hbsAgent pulse transmitter to append this controller's
* running cluster view in the next multicast pulse request.

View File

@ -217,7 +217,7 @@ void cluster_list ( void )
void cluster_storage0_state ( bool enabled )
{
if ( ctrl.cluster.storage0_enabled != enabled )
if ( (bool)ctrl.cluster.storage0_enabled != enabled )
{
ctrl.cluster.storage0_enabled = enabled ;
ilog ("storage-0 heartbeat state changed to %s",
@ -305,35 +305,39 @@ void hbs_cluster_add ( string & hostname )
void hbs_cluster_del ( string & hostname )
{
std::list<string>::iterator hostname_ptr ;
bool found = false ;
for ( hostname_ptr = ctrl.monitored_hostname_list.begin();
hostname_ptr != ctrl.monitored_hostname_list.end() ;
hostname_ptr++ )
{
if ( hostname_ptr->compare(hostname) == 0 )
{
found = true ;
break ;
}
}
if ( found == true )
{
ctrl.monitored_hostname_list.remove(hostname) ;
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
/* Manage storage-0 state. */
if ( hostname.compare(STORAGE_0) == 0 )
{
cluster_storage0_state ( false );
}
/* If we get down to 0 monitored hosts then just start fresh */
if (( ctrl.monitored_hosts ) == 0 )
{
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
}
ilog ("%s deleted from cluster", hostname.c_str());
cluster_list ();
hbs_cluster_change ( hostname + " deleted" );
break ;
}
else
{
slog("%s not found in cluster list", hostname.c_str());
}
}
@ -371,7 +375,7 @@ void hbs_cluster_period_start ( void )
* 4. Maintain a back to back non-responding count for storage-0.
* Once the count reaches the minimum threshold of
* STORAGE_0_NR_THRESHOLD then the specific network history
* is updated to indicate storgae-0 is not responding. Once
* is updated to indicate storage-0 is not responding. Once
* storage-0 starts responding again with a single response
* then that network history is updated to indicate storage-0
* is responding.
@ -389,7 +393,8 @@ void hbs_cluster_period_start ( void )
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
bool storage_0_responding )
bool storage_0_responding,
bool sm_heartbeat_ok)
{
if ( ctrl.monitored_hosts == 0 )
return ;
@ -407,6 +412,11 @@ void hbs_cluster_update ( iface_enum iface,
else
return ;
if ( ctrl.monitored_hosts < not_responding_hosts )
{
slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
}
if ( not_responding_hosts )
{
clog ("controller-%d %s enabled:%d not responding:%d",
@ -454,13 +464,23 @@ void hbs_cluster_update ( iface_enum iface,
}
}
/* update sm heartbeat status */
if ( sm_heartbeat_ok == true )
{
history_ptr->sm_heartbeat_fail = false ;
}
else
{
history_ptr->sm_heartbeat_fail = true ;
}
/* Manage storage-0 status. */
if ( ctrl.cluster.storage0_enabled )
{
/* Handle storage-0 status change from not responding to responding. */
if ( storage_0_responding == true )
{
if (history_ptr->storage0_responding == false)
if ((bool)history_ptr->storage0_responding == false)
{
history_ptr->storage0_responding = true ;
ilog ("controller-%d %s heartbeat ; storage-0 is ok",
@ -483,7 +503,7 @@ void hbs_cluster_update ( iface_enum iface,
}
/* Handle storage-0 status change from responding to not responding. */
if (( history_ptr->storage0_responding == true ) &&
if (( (bool)history_ptr->storage0_responding == true ) &&
( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
{
history_ptr->storage0_responding = false ;
@ -495,7 +515,7 @@ void hbs_cluster_update ( iface_enum iface,
else
{
/* Typical path for storage-0 disabled or normal non-storage system case */
if ( history_ptr->storage0_responding == true )
if ( (bool)history_ptr->storage0_responding == true )
history_ptr->storage0_responding = false ;
/* Handle clearing threshold count when storage-0 is not enabled. */
@ -507,6 +527,10 @@ void hbs_cluster_update ( iface_enum iface,
if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
history_ptr->entries++ ;
if ( ctrl.monitored_hosts < not_responding_hosts )
{
slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
}
/* Update the history with this data. */
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;

View File

@ -328,17 +328,19 @@ void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_e
}
if ( storage0_enabled )
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s SM:%s %s",
history.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
history.storage0_responding ? "y" : "n",
history.sm_heartbeat_fail ? "miss":" ok",
str);
}
else
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
syslog ( LOG_INFO, "Cluster Vault : C%d %s SM:%s %s",
history.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
history.sm_heartbeat_fail ? "miss":" ok ",
str);
}
}

View File

@ -71,7 +71,10 @@ typedef struct
{
unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */
unsigned short network :4 ; /* see mtce_hbs_network_enum */
unsigned short reserved_bits :7 ; /* future - initted to 0 */
unsigned short reserved_bits :6 ; /* future - initted to 0 */
unsigned short sm_heartbeat_fail :1 ; /* SM Health
0 = SM Heartbeat OK,
1 = SM Heartbeat Failure */
unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */
unsigned short entries ; /* # of valid values in .entry */
unsigned short entries_max ; /* max size of the entry array */
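The new sm_heartbeat_fail flag is carved out of the reserved bits, so the history record layout and size are unchanged. A minimal illustrative check of that bit arithmetic (assuming C++11 static_assert is available; not part of the commit):
static_assert ( 4   /* controller          */
              + 4   /* network             */
              + 6   /* reserved_bits       */
              + 1   /* sm_heartbeat_fail   */
              + 1   /* storage0_responding */ == 16,
                "cluster history flag bits must still pack into one unsigned short" );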

View File

@ -30,6 +30,11 @@ barbican_port = 9311 ; The Barbican Port Number
token_refresh_rate = 1200 ; Authentication token refresh rate in seconds.
; A value of zero means no refresh.
; range is 0 or 600-22800
sm_heartbeat_loss_thld = 800 ; number of msecs without an SM heartbeat
; before declaring an SM process heartbeat
; loss in the heartbeat cluster.
autorecovery_threshold = 3 ; The number of times maintenance will try to
; auto recover a critically failed controller
; while there are no backup controllers to fail