Implement Active-Active Heartbeat as HA Improvement Fix

A few small issues were found during integration testing with SM.

This update delivers those integration-tested fixes.

1. Send cluster event to SM only after the first 10 heartbeat
   pulses are received.
2. Only send inventory to hbsAgent on provisioned controllers.
3. Add new OOB SM_UNHEALTHY flag to detect and act on an SM
   declared unhealthy controller.
4. Fix network monitoring enable handling.
5. Fix oldest entry tracking when a network history is not full.
6. Prevent clearing local uptime for a host that is being enabled.
7. Refactor cluster state change notification logging and handling.

These fixes were unit (UT) and integration (IT) tested in multiple labs.

Change-Id: I28485f241ac47bb3ed3ec1e2a8f4c09a1ca2070a
Story: 2003576
Task: 24907
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Eric MacDonald 2018-12-07 13:24:27 -05:00
parent 286577940f
commit 9d7a4bf92c
13 changed files with 529 additions and 273 deletions

View File

@ -86,6 +86,7 @@ void daemon_exit ( void );
#define MTC_FLAG__SM_DEGRADED (0x00000080)
#define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */
#define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */
#define MTC_FLAG__SM_UNHEALTHY (0x00001000)
#define MTC_UNHEALTHY_THRESHOLD (3)
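
Note: MTC_FLAG__SM_UNHEALTHY is an out-of-band (OOB) bit carried in the mtcAlive flag word, and MTC_UNHEALTHY_THRESHOLD debounces it. The sketch below is illustrative only; the flag and threshold names come from this hunk, while the wrapping function and counter are assumptions added for clarity.

#define MTC_FLAG__SM_UNHEALTHY  (0x00001000)
#define MTC_UNHEALTHY_THRESHOLD (3)

/* Illustrative debounce sketch : act only once the OOB flag has been
 * seen in MTC_UNHEALTHY_THRESHOLD consecutive mtcAlive flag words so
 * that a transient unhealthy report does not fail the controller. */
static int sm_unhealthy_count = 0 ;

bool sm_unhealthy_failure ( unsigned int mtce_flags )
{
    if ( mtce_flags & MTC_FLAG__SM_UNHEALTHY )
        return ( ++sm_unhealthy_count >= MTC_UNHEALTHY_THRESHOLD );
    sm_unhealthy_count = 0 ; /* flag cleared ; reset the debounce */
    return ( false );
}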

View File

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=142
TIS_PATCH_VER=143
BUILD_IS_SLOW=5

View File

@ -4214,6 +4214,25 @@ int nodeLinkClass::num_controllers_enabled ( void )
return (cnt);
}
/** Returns true if the specified hostname is provisioned */
bool nodeLinkClass::hostname_provisioned ( string hostname )
{
bool provisioned = false ;
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ( ptr->hostname.compare(hostname) == 0 )
{
provisioned = true ;
break ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
return (provisioned);
}
int nodeLinkClass::service_netlink_events ( int nl_socket , int ioctl_socket )
{
std::list<string> links_gone_down ;
@ -7336,9 +7355,6 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
bool want_log = true ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
if ( node_ptr->monitor[iface] == true_false )
continue ;
if ( iface == INFRA_IFACE )
{
if ( this->infra_network_provisioned == false )
@ -7810,6 +7826,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
{
ptr->hbs_count[iface]++ ;
ptr->b2b_pulses_count[iface]++ ;
if ( ptr->b2b_pulses_count[iface] == hbs_failure_threshold )
{
hbs_cluster_change( ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat pass" );
}
else if ( ptr->b2b_pulses_count[iface] == 1 )
{
hbs_cluster_change( ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat start" );
}
if ( ptr->hbs_failure[iface] == true )
{
/* threshold failure recovery */
@ -8281,8 +8307,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
this->print_node_info ();
hbs_cluster_log ( this->my_hostname, "event", true );
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
}
@ -8303,8 +8328,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
this->print_node_info ();
hbs_cluster_log ( this->my_hostname, "event", true );
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
}
@ -8327,8 +8351,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
else
{
pulse_ptr->hbs_failure[iface] = true ;
this->print_node_info ();
hbs_cluster_log ( this->my_hostname, "event", true );
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
pulse_ptr->hbs_failure_count[iface]++ ;
}

View File

@ -1394,6 +1394,9 @@ public:
/** Sets a hosts's function and subfunction members */
int update_host_functions ( string hostname , string functions );
/** Returns true if the specified hostname is provisioned */
bool hostname_provisioned ( string hostname );
/***********************************************************/
/** Number of provisioned hosts (nodes) */

View File

@ -1277,6 +1277,9 @@ int daemon_init ( string iface, string nodetype )
/* init the utility module */
hbs_utils_init ();
/* init the cluster control structure */
hbs_cluster_ctrl_init ();
/* initialize the timer */
mtcTimer_init ( hbsTimer, "controller", "heartbeat" );
mtcTimer_init ( hbsTimer_audit, "controller", "state audit" );
@ -1398,7 +1401,7 @@ void hbs_sm_handler ( void )
( request == SUPPORTED_REQUEST ))
{
/* success path ... */
hbs_cluster_send( hbs_sock.sm_client_sock, reqid );
hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
/* reset log throttle */
_hbs_sm_handler_log_throttle = 0 ;
@ -1722,6 +1725,7 @@ void daemon_service_run ( void )
{
hbsInv.hbs_disabled = true ;
hbsInv.hbs_state_change = true ;
hbs_cluster_lock();
ilog ("heartbeat service going disabled (locked)");
/* force the throttle 'still disabled' log to wait for
@ -1900,8 +1904,18 @@ void daemon_service_run ( void )
}
else if ( msg.cmd == MTC_CMD_STOP_HOST )
{
hbsInv.mon_host ( hostname, false, true );
hbs_cluster_del ( hostname );
if ( hostname == hbsInv.my_hostname )
{
ilog ("%s heartbeat service disabled by stop command",
hostname.c_str());
hbs_manage_controller_state( hostname, false );
}
else
{
hbsInv.mon_host ( hostname, false, true );
hbs_cluster_del ( hostname );
}
}
else if ( msg.cmd == MTC_CMD_START_HOST )
{
@ -1938,9 +1952,7 @@ void daemon_service_run ( void )
hbsInv.hbs_pulse_period = (hbsInv.hbs_pulse_period_save * HBS_BACKOFF_FACTOR) ;
ilog ("%s starting heartbeat backoff (period:%d msecs)\n", hostname.c_str(), hbsInv.hbs_pulse_period );
/* Send SM cluster information at start of MNFA */
hbs_cluster_send( hbs_sock.sm_client_sock, 0 );
hbs_cluster_change ( "backoff" );
hbsInv.print_node_info();
}
else
@ -2170,6 +2182,9 @@ void daemon_service_run ( void )
* algorithm into 'receive' mode */
heartbeat_request = false ;
/* tell cluster module that a new pulse period has started */
hbs_cluster_period_start();
/* Start the heartbeat timer.
* All nodes are expected to send a
* pulse before this timer expires. */
@ -2263,6 +2278,9 @@ void daemon_service_run ( void )
*/
else
{
/* manage vault wrt peer controller */
hbs_cluster_peer();
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
/* Do not service the infrastructure interface if it is not provisioned */
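
Taken together, the hbsAgent hunks above hook the cluster module into three points of the service loop: the lock path, the start of each pulse period, and the end-of-period peer accounting. A condensed reading aid follows; the loop body is heavily simplified and the boolean inputs are assumptions, but the three calls are the real module entry points.

void hbs_cluster_lock ( void );
void hbs_cluster_period_start ( void );
void hbs_cluster_peer ( void );

/* condensed sketch of one service loop iteration */
void service_loop_iteration ( bool locked, bool period_done )
{
    if ( locked )
    {
        /* drop all cluster history while locked */
        hbs_cluster_lock ();
        return ;
    }

    /* tell the cluster module a new pulse period has started */
    hbs_cluster_period_start ();

    /* ... multicast pulse requests ; gather pulse responses ... */

    if ( period_done )
    {
        /* inject 0:0 history if the enabled peer controller
         * produced no cluster view this pulse period */
        hbs_cluster_peer ();
    }
}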

View File

@ -232,6 +232,9 @@ void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster );
/******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/
/* Init the control structure */
void hbs_cluster_ctrl_init ( void );
/* Set the cluster vault to default state.
* Called upon daemon init or heartbeat period change. */
void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr );
@ -240,16 +243,25 @@ void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr );
* Primarily to know how many history elements are missing. */
unsigned short hbs_cluster_unused_bytes ( void );
/* Inform the cluster module that there was a change to the cluster */
void hbs_cluster_change ( string cluster_change_reason );
/* Add and delete hosts from the monitored list.
* Automatically adjusts the numbers in the cluster vault. */
void hbs_cluster_add ( string & hostname );
void hbs_cluster_del ( string & hostname );
void hbs_cluster_rem ( unsigned short controller );
void hbs_cluster_lock ( void );
/* Do stuff in preparation for another pulse period start */
void hbs_cluster_period_start ( void );
/* Report status of storage-0 */
void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
/* Look for and clog changes in cluster state */
int hbs_cluster_cmp ( hbs_message_type & msg );
/* Compare 2 histories */
int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1,
mtce_hbs_cluster_history_type h2 );
/* Manage the enabled state of the controllers */
void hbs_manage_controller_state ( string & hostname, bool enabled );
@ -266,6 +278,9 @@ int hbs_cluster_save ( string & hostname,
mtce_hbs_network_enum network,
hbs_message_type & msg );
/* Manage peer controller vault history. */
void hbs_cluster_peer ( void );
/*
* Called by the hbsAgent pulse receiver to create a network specific
* history update entry consisting of
@ -285,6 +300,19 @@ void hbs_cluster_update ( iface_enum iface,
* the other controller back in its response. */
void hbs_cluster_append ( hbs_message_type & msg );
/* Inject a history entry at the next position for all networks of the
* specified controller.
*
* This is used to add a 0:0 entry into the vault history of the specified
* controller as an indication that no host provided history for this
* controller during this pulse period.
*
* The procedure was made generic so that it 'could' be used to add
* history of any values, for fault insertion or other potential
* future purposes.
*/
void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enabled, unsigned short hosts_responding );
/* Produce formatted clog's that characterize current and changing cluster
* history for a given network. Each log is controller/network specific. */
void hbs_cluster_log ( string & hostname, string prefix, bool force=false );
@ -295,13 +323,14 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri
void hbs_sm_handler ( void );
/* send the cluster vault to SM */
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid );
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
/* copy cluster data from src to dst */
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
/* print the contents of the vault */
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force );
void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_enabled );
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string reason );
/* Heartbeat service state audit */
void hbs_state_audit ( void );
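
The notable interface change in this header is that hbs_cluster_send and hbs_cluster_dump now take a human-readable reason string in place of the old force flag. A hypothetical call site, just to show the intent; the socket pointer, request id, and example function are stand-ins.

#include <string>
using std::string ;

class msgClassSock ; /* from the mtce messaging library */

/* prototypes repeated from the header above */
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid, string reason );
void hbs_cluster_change ( string cluster_change_reason );

void example ( msgClassSock * sm_sock, int reqid )
{
    /* SM query path ; respond with the caller's request id */
    hbs_cluster_send ( sm_sock, reqid, "query" );

    /* event path ; record why the cluster changed so the reason
     * accompanies the next vault update sent to SM */
    hbs_cluster_change ( "storage-0 state change" );
}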

View File

@ -108,6 +108,9 @@ static stallMon_type stallMon ;
/* Cached Cluster view from controllers */
mtce_hbs_cluster_type controller_cluster_cache[MTCE_HBS_MAX_CONTROLLERS];
/* Incremented every time the hbsClient fails to receive a summary from
* this controller for 2 back-to-back pulse intervals. */
int missed_controller_summary_tracker[MTCE_HBS_MAX_CONTROLLERS] ;
void daemon_sigchld_hdlr ( void )
{
@ -887,8 +890,9 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
if ( hbs_sock.rx_mesg[iface].cluster.histories > MTCE_HBS_MAX_NETWORKS )
{
slog ("controller-%d provided %d network histories ; max is %d per controller",
slog ("controller-%d %s provided %d network histories ; max is %d per controller",
controller,
get_iface_name_str(iface),
hbs_sock.rx_mesg[iface].cluster.histories,
MTCE_HBS_MAX_NETWORKS );
}
@ -903,29 +907,81 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
{
hbs_cluster_copy ( hbs_sock.rx_mesg[iface].cluster,
controller_cluster_cache[controller] );
clog1 ("controller-%d cluster info from %s pulse request saved to cache",
controller, get_iface_name_str(iface));
hbs_sock.rx_mesg[iface].cluster.histories = 0 ;
/* Clear the expecting count for this controller.
* Each heartbeat cycle should result in this being cleared for
* both controllers.
*
* Clearing this is indication that we got a pulse request from
* this controller. The code below will increment this count
* for its peer controller on every request.
* An accumulation of count is indication that we are not
* receiving response from the indexed controller */
missed_controller_summary_tracker[controller] = 0 ;
if ( have_other_controller_history ( controller ) == true )
{
/* Now copy the other controller's cached cluster info into
* this controller's response */
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
hbs_sock.rx_mesg[iface].cluster );
/******************************************************************
*
* Increment the expecting count for the other controller.
* If that other controller's expecting count reaches 2 or
* more then do not include a summary for that controller
* in this response.
*
* This avoids sending stale summary info.
*
*****************************************************************/
if ( daemon_get_cfg_ptr()->debug_state & 4 )
/* Since the controllers run asynchronously, missing one or two
* pulse requests from the same controller can happen. This is why
* we compare against the number of monitored networks (histories
* for this controller) times 2, following the Nyquist theorem. */
if ( ++missed_controller_summary_tracker[controller?0:1] >
controller_cluster_cache[controller?0:1].histories * 2 )
{
string dump_banner = "" ;
dump_banner.append("controller-") ;
dump_banner.append(itos(controller?0:1));
dump_banner.append(" cluster info from cache injected into controller-");
dump_banner.append(itos(controller));
dump_banner.append(":");
dump_banner.append(get_iface_name_str(iface));
dump_banner.append(" pulse response");
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner, true );
wlog ("controller-%d %s cluster info cleared (%d)",
controller?0:1,
get_iface_name_str(iface),
missed_controller_summary_tracker[controller?0:1]);
/* Clear the cached history for the controller whose
* heartbeat requests are no longer being seen.
* No need to clear the history entries ; just set the
* number of histories to 0 and update the byte count. */
controller_cluster_cache[controller?0:1].histories = 0 ;
controller_cluster_cache[controller?0:1].bytes = BYTES_IN_CLUSTER_VAULT(0) ;
/* now that the peer controller cluster info is cleared
* we will not see another log from above until we get
* another pulse request from the peer controller. */
}
else
{
clog ("controller-%d %s cluster info added to response (%d)",
controller?0:1,
get_iface_name_str(iface), missed_controller_summary_tracker[controller?0:1] );
/* Now copy the other controller's cached cluster info into
* this controller's response */
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
hbs_sock.rx_mesg[iface].cluster );
if ( daemon_get_cfg_ptr()->debug_state & 4 )
{
string dump_banner = "" ;
dump_banner.append("controller-") ;
dump_banner.append(itos(controller?0:1));
dump_banner.append(" cluster info from cache injected into controller-");
dump_banner.append(itos(controller));
dump_banner.append(":");
dump_banner.append(get_iface_name_str(iface));
dump_banner.append(" pulse response");
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner );
}
}
}
}
@ -1079,7 +1135,10 @@ int daemon_init ( string iface, string nodeType_str )
/* Initialize the controller cluster view data bounce structure */
for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ )
{
memset ( &controller_cluster_cache[c], 0, sizeof(mtce_hbs_cluster_type)) ;
missed_controller_summary_tracker[c] = 0 ;
}
/* init the utility module */
hbs_utils_init ();
@ -1306,7 +1365,7 @@ void daemon_service_run ( void )
int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type));
if ( bytes )
{
hbs_cluster_dump (msg, "Cluster info received", true );
hbs_cluster_dump (msg, "cluster info received" );
}
}
#endif
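
The tracker logic above is the crux of the stale-summary suppression: each pulse request clears the sender's own miss counter and increments the peer's, and the cached peer summary is dropped once the count exceeds twice the peer's history count. A distilled model, assuming two controllers; the function name and in/out parameter are illustrative.

#define MTCE_HBS_MAX_CONTROLLERS (2)

static int missed_summary[MTCE_HBS_MAX_CONTROLLERS] = { 0, 0 } ;

/* Returns true if the cached peer summary is still fresh enough to
 * append to this controller's pulse response. */
bool include_peer_summary ( int controller, int & peer_histories )
{
    int peer = controller ? 0 : 1 ;

    /* a request from this controller proves it is alive */
    missed_summary[controller] = 0 ;

    /* accumulate misses against the peer ; allow 2 passes over
     * all of its monitored networks before declaring it stale */
    if ( ++missed_summary[peer] > ( peer_histories * 2 ) )
    {
        peer_histories = 0 ; /* drop the stale cached view */
        return ( false );
    }
    return ( true );
}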

View File

@ -37,6 +37,7 @@ typedef struct
bool controller_2_enabled ;
#endif
bool peer_controller_enabled ;
/* Used to prevent log flooding in the presence of back-to-back errors. */
unsigned int log_throttle ;
@ -64,9 +65,9 @@ typedef struct
/* The working heartbeat cluster data vault. */
mtce_hbs_cluster_type cluster ;
bool cluster_change ;
int cluster_change_threshold_count ;
int cluster_change_difference_count ;
string cluster_change_reason ;
bool got_peer_controller_history ;
msgClassSock * sm_socket_ptr ;
@ -126,6 +127,24 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
ctrl.log_throttle = 0 ;
}
void hbs_cluster_ctrl_init ( void )
{
ctrl.this_controller = 0xffff ;
ctrl.controller_0_enabled = false ;
ctrl.controller_1_enabled = false ;
#ifdef THREE_CONTROLLER_SYSTEM
ctrl.controller_2_enabled = false ;
#endif
ctrl.peer_controller_enabled = false ;
ctrl.log_throttle = 0 ;
ctrl.monitored_networks = 0 ;
ctrl.monitored_hosts = 0 ;
ctrl.monitored_hostname_list.clear();
ctrl.cluster_change_reason = "" ;
ctrl.got_peer_controller_history = false ;
ctrl.sm_socket_ptr = NULL ;
memset(&ctrl.storage_0_not_responding_count[0], 0, sizeof(ctrl.storage_0_not_responding_count));
}
/****************************************************************************
*
@ -149,6 +168,23 @@ void hbs_cluster_nums ( unsigned short this_controller,
ctrl.monitored_networks = monitored_networks ;
}
/****************************************************************************
*
* Name : hbs_cluster_change
*
* Description : Maintain the cluster change reason string.
*
* It is cleared and printed in hbs_cluster_update.
*
***************************************************************************/
void hbs_cluster_change ( string cluster_change_reason )
{
if ( ctrl.cluster_change_reason.empty() )
ctrl.cluster_change_reason = cluster_change_reason ;
else if ( cluster_change_reason.find ( "peer controller cluster event" ) == std::string::npos )
ctrl.cluster_change_reason.append(" ; " + cluster_change_reason);
}
/****************************************************************************
*
@ -196,7 +232,7 @@ void cluster_storage0_state ( bool enabled )
ctrl.cluster.storage0_enabled = enabled ;
ilog ("storage-0 heartbeat state changed to %s",
enabled ? "enabled" : "disabled" );
ctrl.cluster_change = true ;
hbs_cluster_change ( "storage-0 state change" );
}
}
@ -211,21 +247,50 @@ void cluster_storage0_state ( bool enabled )
void hbs_manage_controller_state ( string & hostname, bool enabled )
{
int controller = -1 ;
/* track controller state */
if ( hostname == CONTROLLER_0 )
{
controller = 0 ;
ctrl.controller_0_enabled = enabled ;
}
else if ( hostname == CONTROLLER_1 )
{
controller = 1 ;
ctrl.controller_1_enabled = enabled ;
}
#ifdef THREE_CONTROLLER_SYSTEM
else if ( hostname == CONTROLLER_2 )
else
{
ctrl.controller_2_enabled = enabled ;
/* ignore all other host names */
return ;
}
/* manage the state of the peer controller */
if ( ctrl.this_controller != controller )
{
/* Clear peer controller cluster history when the peer
* controller goes disabled */
if (( ctrl.peer_controller_enabled == true ) &&
( enabled == false ))
{
hbs_cluster_rem ( controller );
}
if ( enabled == false )
{
hbs_cluster_change ( "peer controller disabled" ) ;
}
else
{
hbs_cluster_change ( "peer controller enabled" ) ;
}
ctrl.peer_controller_enabled = enabled ;
}
else if ( enabled == false )
{
hbs_cluster_change ( "this controller locked" ) ;
hbs_cluster_lock();
}
#endif
}
@ -267,7 +332,6 @@ void hbs_cluster_add ( string & hostname )
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
ilog ("%s added to cluster", hostname.c_str());
cluster_list ();
ctrl.cluster_change = true ;
}
/* Manage storage-0 state */
@ -284,13 +348,6 @@ void hbs_cluster_add ( string & hostname )
/* Manage controller state ; true means enabled in this case. */
hbs_manage_controller_state ( hostname, true );
if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr ))
{
hbs_cluster_send( ctrl.sm_socket_ptr, 0 );
ctrl.cluster_change = false ;
}
}
/****************************************************************************
@ -341,17 +398,32 @@ void hbs_cluster_del ( string & hostname )
cluster_list ();
ctrl.cluster_change = true ;
hbs_cluster_change ( hostname + " deleted" );
break ;
}
}
}
if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr ))
{
hbs_cluster_send( ctrl.sm_socket_ptr, 0 );
ctrl.cluster_change = false ;
}
/****************************************************************************
*
* Name : hbs_cluster_period_start
*
* Description : The following things need to be done at the start of
* every pulse period ...
*
- clear 'got_peer_controller_history' ; it gets set true
when at least one hbsClient response contains history
from the other controller.
*
***************************************************************************/
void hbs_cluster_period_start ( void )
{
clog3 ("Pulse Period Start ; waiting on responses (last:%d)",
ctrl.got_peer_controller_history );
if ( ctrl.got_peer_controller_history )
ctrl.got_peer_controller_history = false ;
}
/****************************************************************************
@ -500,114 +572,36 @@ void hbs_cluster_update ( iface_enum iface,
ctrl.storage_0_not_responding_count[n] = 0 ;
}
/*
* Manage the history entry index.
*
* Get the previous entry index ...
* ... which is the one before the oldest index.
* ... which is the index for the next entry.
*/
unsigned short last_entry_index ;
unsigned short oldest_entry_index = history_ptr->oldest_entry_index ;
if ( oldest_entry_index == 0 )
{
/* Go to the end of the array. */
last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ;
}
else
{
/* Otherwise, the previous index in the array */
last_entry_index = oldest_entry_index - 1 ;
}
bool logit = false ;
string logit_reason = "" ;
/* Update the history with this data. */
history_ptr->entry[oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
history_ptr->entry[oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
if (( history_ptr->entry[oldest_entry_index].hosts_enabled !=
history_ptr->entry[ last_entry_index].hosts_enabled ) ||
( history_ptr->entry[oldest_entry_index].hosts_responding !=
history_ptr->entry[ last_entry_index].hosts_responding))
{
/* Only log on change events. */
if ( history_ptr->entry[oldest_entry_index].hosts_enabled ==
history_ptr->entry[oldest_entry_index].hosts_responding )
{
ilog ("controller-%d %s cluster of %d is healthy",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
history_ptr->entry[oldest_entry_index].hosts_enabled);
ctrl.cluster_change_threshold_count = 0 ;
ctrl.cluster_change_difference_count = 0 ;
}
else
{
ctrl.cluster_change_threshold_count++ ;
ctrl.cluster_change_difference_count =
history_ptr->entry[oldest_entry_index].hosts_enabled -
history_ptr->entry[oldest_entry_index].hosts_responding ;
}
}
if ( daemon_get_cfg_ptr()->debug_state&4 )
{
logit = true ;
logit_reason = "(debug)" ;
}
// else if (( ctrl.cluster_change_threshold_count == 1 ) &&
// ( cluster_change == false ))
// {
// logit = true ;
// logit_reason = "" ;
// }
else if ( ctrl.cluster_change_threshold_count >= CLUSTER_CHANGE_THRESHOLD )
{
logit = true ;
ctrl.cluster_change_threshold_count = 0 ;
logit_reason = "(threshold)" ;
}
else
{
int delta =
history_ptr->entry[oldest_entry_index].hosts_enabled -
history_ptr->entry[oldest_entry_index].hosts_responding ;
if ( delta != ctrl.cluster_change_difference_count )
{
logit = true ;
ctrl.cluster_change_difference_count = delta ;
logit_reason = "(delta)" ;
}
}
if ( logit )
{
ilog ("controller-%d %s cluster of %d with %d responding (%d:%d) %s",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
history_ptr->entry[oldest_entry_index].hosts_enabled,
history_ptr->entry[oldest_entry_index].hosts_responding,
ctrl.cluster_change_difference_count,
not_responding_hosts,
logit_reason.c_str());
}
/* Increment the entries count till it reaches the max. */
if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
history_ptr->entries++ ;
/* Manage the next entry update index ; aka the oldest index. */
if ( oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
/* Update the history with this data. */
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
/* Manage the next entry update index ; aka the oldest index.
* - handle not full case ; oldest entry is the first entry
* - handle the full case ; wrap around */
if (( history_ptr->entries == 0 ) ||
( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1)))
history_ptr->oldest_entry_index = 0 ;
else
history_ptr->oldest_entry_index++ ;
/* send SM an update if the cluster has changed, as indicated
* by string content in ctrl.cluster_change_reason. */
if ( ! ctrl.cluster_change_reason.empty() )
{
hbs_cluster_send( ctrl.sm_socket_ptr, 0, ctrl.cluster_change_reason );
ctrl.cluster_change_reason = "" ;
}
/* clear the log throttle if we are updating history ok. */
ctrl.log_throttle = 0 ;
}
/****************************************************************************
*
* Name : hbs_cluster_append
@ -646,6 +640,23 @@ void hbs_cluster_append ( hbs_message_type & msg )
ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
}
/* Manage peer controller vault history. */
void hbs_cluster_peer ( void )
{
/* Update the local peer controller history with a 0:0 entry for
* this pulse period if there was no response from the peer
* controller. */
if (( ctrl.got_peer_controller_history == false ) &&
( ctrl.peer_controller_enabled == true ))
{
ilog ("missing peer controller cluster view" ); /* ERIK: DEBUG */
/* if no nodes have reported peer controller history then inject
* a 0:0 value in for this pulse period for that controller. */
hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 );
}
}
/****************************************************************************
*
* Name : hbs_cluster_unused_bytes
@ -679,7 +690,7 @@ unsigned short hbs_cluster_unused_bytes ( void )
*
***************************************************************************/
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
{
ctrl.cluster.reqid = (unsigned short)reqid ;
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
@ -691,16 +702,7 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
bytes , errno, strerror(errno));
}
else
{
string reason = "" ;
// ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
if ( reqid )
reason = "cluster query" ;
else
reason = "cluster event" ;
hbs_cluster_dump ( ctrl.cluster, reason, true );
}
hbs_cluster_dump ( ctrl.cluster, reason );
}
else
{
@ -725,6 +727,12 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
if (( ctrl.cluster.history[h].controller == sample.controller ) &&
( ctrl.cluster.history[h].network == sample.network ))
{
if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
{
hbs_cluster_change ("peer controller cluster event " +
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
}
memcpy( &ctrl.cluster.history[h], &sample,
sizeof(mtce_hbs_cluster_history_type));
@ -738,9 +746,13 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
}
}
hbs_cluster_change ( "peer controller cluster " +
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
/* not found ? Add a new one */
memcpy( &ctrl.cluster.history[ctrl.cluster.histories], &sample,
sizeof(mtce_hbs_cluster_history_type));
ctrl.cluster.histories++ ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
@ -753,7 +765,7 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
void hbs_state_audit ( void )
{
hbs_cluster_dump ( ctrl.cluster, "Audit", true );
hbs_cluster_dump ( ctrl.cluster, "Audit" );
}
@ -779,46 +791,39 @@ void hbs_cluster_log ( string & hostname,
*
* Name : hbs_cluster_cmp
*
* Description : Performs a sanity check over the cluster structure.
* Description : Compare 2 histories
*
* Assumptions : Debug tool, not called at runtime.
*
* Returns : PASS or FAIL
* Returns : 0 - when number of enabled hosts and responding
* hosts are the same for all the entries.
* # - the number of entries that are different.
*
***************************************************************************/
int hbs_cluster_cmp( hbs_message_type & msg )
int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1,
mtce_hbs_cluster_history_type h2 )
{
if ( msg.cluster.version < ctrl.cluster.version )
int h1_delta = 0 ;
int h2_delta = 0 ;
int delta = 0 ;
for ( int e = 0 ; e < h1.entries ; e++ )
if ( h1.entry[e].hosts_enabled != h1.entry[e].hosts_responding )
h1_delta++ ;
for ( int e = 0 ; e < h2.entries ; e++ )
if ( h2.entry[e].hosts_enabled != h2.entry[e].hosts_responding )
h2_delta++ ;
if ( h1_delta > h2_delta )
delta = h1_delta-h2_delta ;
else if ( h2_delta > h1_delta )
delta = h2_delta-h1_delta ;
if ( delta )
{
wlog ("Unexpected version (%d:%d)",
msg.cluster.version, ctrl.cluster.version );
clog3 ("peer controller reporting %d deltas", delta );
}
else if ( msg.cluster.revision != ctrl.cluster.revision )
{
wlog ("Unexpected revision (%d:%d)",
msg.cluster.revision, ctrl.cluster.revision );
}
else if ( msg.cluster.magic_number != ctrl.cluster.magic_number )
{
wlog ("Unexpected magic number (%d:%d)",
msg.cluster.magic_number, ctrl.cluster.magic_number );
}
else if ( msg.cluster.period_msec != ctrl.cluster.period_msec )
{
wlog ("Cluster Heartbeat period delta (%d:%d)",
msg.cluster.period_msec, ctrl.cluster.period_msec );
}
else if ( msg.cluster.storage0_enabled != ctrl.cluster.storage0_enabled )
{
wlog ("Cluster storage0 enabled state delta (%d:%d)",
msg.cluster.storage0_enabled, ctrl.cluster.storage0_enabled );
}
else
{
return (PASS);
}
return (FAIL);
return(delta);
}
/****************************************************************************
@ -843,23 +848,106 @@ int hbs_cluster_save ( string & hostname,
if ( ! ctrl.monitored_hosts )
return RETRY ;
if ( msg.cluster.histories == 0 )
return PASS ;
for ( int h = 0 ; h < msg.cluster.histories ; h++ )
if ( ! msg.cluster.histories )
{
if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS )
wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
"%s %s ; no peer controller history",
hostname.c_str(),
hbs_cluster_network_name(network).c_str());
}
if ( ctrl.peer_controller_enabled )
{
/* Should only contain the other controller's history */
for ( int h = 0 ; h < msg.cluster.histories ; h++ )
{
elog ("Invalid network id (%d:%d:%d)",
h,
msg.cluster.history[h].controller,
msg.cluster.history[h].network );
if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS )
{
elog ("Invalid network id (%d:%d:%d)",
h,
msg.cluster.history[h].controller,
msg.cluster.history[h].network );
}
else if ( msg.cluster.history[h].controller != ctrl.this_controller )
{
/* set that we got some history and save it */
ctrl.got_peer_controller_history = true ;
hbs_history_save ( hostname, msg.cluster.history[h] );
}
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
}
else if ( msg.cluster.history[h].controller != ctrl.this_controller )
{
hbs_history_save ( hostname, msg.cluster.history[h] );
}
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
}
return (PASS);
}
void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enabled, unsigned short hosts_responding )
{
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
if ( ctrl.cluster.history[h].controller == controller )
{
bool dumpit = false ;
if (( ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_enabled ) ||
( ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_responding ))
{
/* Inject requested data for all networks of specified controller */
ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_enabled = hosts_enabled ;
ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_responding = hosts_responding ;
wlog ("controller-%d injected %d:%d into controller-%d %s history (entry %d)",
controller?0:1,
hosts_enabled,
hosts_responding,
controller,
hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(),
ctrl.cluster.history[h].oldest_entry_index );
dumpit = true ;
}
/* manage the oldest index */
if ( ++ctrl.cluster.history[h].oldest_entry_index == MTCE_HBS_HISTORY_ENTRIES )
ctrl.cluster.history[h].oldest_entry_index = 0 ;
/* DEBUG: */
if ( dumpit )
hbs_cluster_dump( ctrl.cluster.history[h], ctrl.cluster.storage0_enabled );
}
}
}
void hbs_cluster_rem ( unsigned short controller )
{
int removed = 0 ;
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
if ( ctrl.cluster.history[h].controller == controller )
{
removed++ ;
wlog ("controller-%d %s network history removed from cluster (slot %d)",
controller,
hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(),
h );
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
}
}
if ( removed )
{
hbs_cluster_change ( "removed controller history" ) ;
}
ctrl.cluster.histories -= removed ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
}
/* remove all cluster history on a lock operation */
void hbs_cluster_lock( void )
{
ilog ("controller-%d lock ; clearing all cluster info", ctrl.this_controller );
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
}
ctrl.cluster.histories = 0 ;
}
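
Fix 5 (oldest-entry tracking when a network history is not full) is visible in the hbs_cluster_update hunk further above: the rewrite stores at oldest_entry_index first and only then advances it, treating the empty history as its own case instead of pre-computing a last_entry_index. The advance rule in isolation; the entries-count bookkeeping is elided and the array size is an assumed value.

#define MTCE_HBS_HISTORY_ENTRIES (20) /* assumed size */

/* Mirror of the replacement index rule : first entry when the history
 * is empty ; wrap-around when the end of the array is reached. */
void advance_oldest ( int entries, int & oldest_entry_index )
{
    if (( entries == 0 ) ||
        ( oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1) ))
        oldest_entry_index = 0 ;
    else
        oldest_entry_index++ ;
}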

View File

@ -113,11 +113,13 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network )
/****************************************************************************
*
* Name : hbs_cluster_copy
* Name : hbs_cluster_copy
*
* Description : Copies cluster from src to dst.
* Description : Copies cluster from src to dst.
*
* Returns : Nothing.
* Parameters : cluster type.
*
* Returns : Nothing.
*
***************************************************************************/
@ -206,7 +208,6 @@ void hbs_cluster_log ( string & hostname,
line.append (str);
str[0] = '\0' ;
}
//#ifdef WANT_DOTS
else if (( history_ptr->entry[this_index].hosts_enabled ==
e.hosts_enabled ) &&
( history_ptr->entry[this_index].hosts_responding ==
@ -214,7 +215,6 @@ void hbs_cluster_log ( string & hostname,
{
line.append(". ");
}
//#endif
else
{
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
@ -302,66 +302,83 @@ void hbs_cluster_log ( string & hostname,
/****************************************************************************
*
* name : hbs_cluster_dump
* Name : hbs_cluster_dump
*
* Description: Formatted dump of the vault contents to the log file.
* Description : Formatted dump of the specified history to the log file.
*
* Parameters :
*
* history is a single history type whose contents will be logged.
* storage0_enabled true suggests the storage state should also be logged.
*
***************************************************************************/
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force )
void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_enabled )
{
if ( vault.version == 0 )
#define MAX_LINE_LEN (500)
char str[MAX_LINE_LEN] ;
int i = 0 ;
for ( int e = 0 ; e < history.entries_max ; e++ )
{
snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" ,
history.oldest_entry_index==e ? '>' : ' ',
history.entry[e].hosts_enabled,
history.entry[e].hosts_responding);
i = strlen(str) ;
}
if ( storage0_enabled )
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
history.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
history.storage0_responding ? "y" : "n",
str);
}
else
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
history.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
str);
}
}
/****************************************************************************
*
* Name : hbs_cluster_dump
*
* Description : Formatted dump of the vault contents to the log file.
*
* Parameters :
*
* vault is a reference to a cluster type whose contents will be logged.
* reason is a string indicating the reason for the dump.
*
***************************************************************************/
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string reason )
{
if (( vault.version == 0 ) || ( vault.histories == 0 ))
return ;
int debug = daemon_get_cfg_ptr()->debug_state ;
if (( debug & 2 ) || ( force == true ))
/* The reason is cumulative ; if long, log it on its own line */
if ( reason.length() > 40 )
{
ilog ("%s", log_prefix.c_str());
syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec heartbeat period %s;%d network heartbeat response histories (%d bytes)",
vault.version,
vault.revision,
vault.period_msec,
vault.storage0_enabled ? " with storage-0: enabled " : "",
vault.histories,
vault.bytes );
syslog ( LOG_INFO, "Cluster Dump : %s", reason.c_str());
reason = "" ;
}
syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec period %s;%d network histories (%d bytes) %s",
vault.version,
vault.revision,
vault.period_msec,
vault.storage0_enabled ? " with storage-0: enabled " : "",
vault.histories,
vault.bytes,
reason.c_str());
if (( debug & 4 ) || ( force == true ))
for ( int h = 0 ; h < vault.histories ; h++ )
{
for ( int h = 0 ; h < vault.histories ; h++ )
{
#define MAX_LINE_LEN (500)
char str[MAX_LINE_LEN] ;
int i = 0 ;
for ( int e = 0 ; e < vault.history[h].entries_max ; e++ )
{
snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" ,
vault.history[h].oldest_entry_index==e ? '>' : ' ',
vault.history[h].entry[e].hosts_enabled,
vault.history[h].entry[e].hosts_responding);
i = strlen(str) ;
}
if ( vault.storage0_enabled )
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
vault.history[h].controller,
hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
vault.history[h].storage0_responding ? "y" : "n",
str);
}
else
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
vault.history[h].controller,
hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
str);
}
}
}
if ( debug & 8 )
{
dump_memory ( &vault, 16, vault.bytes );
hbs_cluster_dump ( vault.history[h], vault.storage0_enabled );
}
}

View File

@ -795,9 +795,9 @@ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int
}
if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) )
{
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ;
}
if ( daemon_is_file_present ( SMGMT_UNHEALTHY_FILE ) )
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_UNHEALTHY ;
/* add the interface and sequence number to the mtcAlive message */
identity.append ( ",\"interface\":\"");

View File

@ -794,8 +794,10 @@ int send_hbs_command ( string hostname, int cmd, string controller )
controllers.clear();
if ( controller == CONTROLLER )
{
controllers.push_back(CONTROLLER_0);
controllers.push_back(CONTROLLER_1);
if ( obj_ptr->hostname_provisioned(CONTROLLER_0) )
controllers.push_back(CONTROLLER_0);
if ( obj_ptr->hostname_provisioned(CONTROLLER_1) )
controllers.push_back(CONTROLLER_1);
}
else
{

View File

@ -1065,6 +1065,18 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
enableStageChange(node_ptr, MTC_ENABLE__FAILURE);
break ;
}
else if (( is_controller(node_ptr) == true ) &&
( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY ))
{
elog ("%s is SM UNHEALTHY",
node_ptr->hostname.c_str() );
elog ("%s ... enable failed ; controller needs to reboot\n",
node_ptr->hostname.c_str());
enableStageChange(node_ptr, MTC_ENABLE__FAILURE);
break ;
}
/* Set the node mtcAlive timer to configured value.
* This will revert back to normal timeout after any first
* unlock value that may be in effect. */

View File

@ -21,3 +21,7 @@ void hbs_cluster_log ( string & hostname, string prefix, bool force=false )
UNUSED(force);
}
void hbs_cluster_change ( string reason )
{
UNUSED(reason);
}