diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 788954b6..9e8586cd 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -86,6 +86,7 @@ void daemon_exit ( void ); #define MTC_FLAG__SM_DEGRADED (0x00000080) #define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */ #define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */ +#define MTC_FLAG__SM_UNHEALTHY (0x00001000) #define MTC_UNHEALTHY_THRESHOLD (3) diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data index 01e786b1..9e6662d0 100644 --- a/mtce/centos/build_srpm.data +++ b/mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=142 +TIS_PATCH_VER=143 BUILD_IS_SLOW=5 diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index e33c70ce..04d96330 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -4214,6 +4214,25 @@ int nodeLinkClass::num_controllers_enabled ( void ) return (cnt); } + +/** Returns true if the specified hostname is provisioned */ +bool nodeLinkClass::hostname_provisioned ( string hostname ) +{ + bool provisioned = false ; + for ( struct node * ptr = head ; ; ptr = ptr->next ) + { + if ( ptr->hostname.compare(hostname) == 0 ) + { + provisioned = true ; + break ; + } + if (( ptr->next == NULL ) || ( ptr == tail )) + break ; + } + return (provisioned); +} + + int nodeLinkClass::service_netlink_events ( int nl_socket , int ioctl_socket ) { std::list links_gone_down ; @@ -7336,9 +7355,6 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen bool want_log = true ; for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - if ( node_ptr->monitor[iface] == true_false ) - continue ; - if ( iface == INFRA_IFACE ) { if ( this->infra_network_provisioned == false ) @@ -7810,6 +7826,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle { ptr->hbs_count[iface]++ ; 
ptr->b2b_pulses_count[iface]++ ; + + if ( ptr->b2b_pulses_count[iface] == hbs_failure_threshold ) + { + hbs_cluster_change( ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat pass" ); + } + else if ( ptr->b2b_pulses_count[iface] == 1 ) + { + hbs_cluster_change( ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat start" ); + } + if ( ptr->hbs_failure[iface] == true ) { /* threshold failure recovery */ @@ -8281,8 +8307,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface) ); - this->print_node_info (); - hbs_cluster_log ( this->my_hostname, "event", true ); + hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); } } @@ -8303,8 +8328,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface) ); - this->print_node_info (); - hbs_cluster_log ( this->my_hostname, "event", true ); + hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); } } @@ -8327,8 +8351,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) else { pulse_ptr->hbs_failure[iface] = true ; - this->print_node_info (); - hbs_cluster_log ( this->my_hostname, "event", true ); + hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); } pulse_ptr->hbs_failure_count[iface]++ ; } diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 0701a749..e5c39172 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1394,6 +1394,9 @@ public: /** Sets a hosts's function and subfunction members */ int update_host_functions ( string hostname , string functions ); + /** Returns true if the specified hostname is provisioned */ + bool 
hostname_provisioned ( string hostname ); + /***********************************************************/ /** Number of provisioned hosts (nodes) */ diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index 1e2b2fc1..ffa0e0fa 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -1277,6 +1277,9 @@ int daemon_init ( string iface, string nodetype ) /* init the utility module */ hbs_utils_init (); + /* init the cluster control structure */ + hbs_cluster_ctrl_init (); + /* initialize the timer */ mtcTimer_init ( hbsTimer, "controller", "heartbeat" ); mtcTimer_init ( hbsTimer_audit, "controller", "state audit" ); @@ -1398,7 +1401,7 @@ void hbs_sm_handler ( void ) ( request == SUPPORTED_REQUEST )) { /* success path ... */ - hbs_cluster_send( hbs_sock.sm_client_sock, reqid ); + hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" ); /* reset log throttle */ _hbs_sm_handler_log_throttle = 0 ; @@ -1722,6 +1725,7 @@ void daemon_service_run ( void ) { hbsInv.hbs_disabled = true ; hbsInv.hbs_state_change = true ; + hbs_cluster_lock(); ilog ("heartbeat service going disabled (locked)"); /* force the throttle 'still disabled' log to wait for @@ -1900,8 +1904,18 @@ void daemon_service_run ( void ) } else if ( msg.cmd == MTC_CMD_STOP_HOST ) { - hbsInv.mon_host ( hostname, false, true ); - hbs_cluster_del ( hostname ); + if ( hostname == hbsInv.my_hostname ) + { + ilog ("%s heartbeat service disabled by stop command", + hostname.c_str()); + + hbs_manage_controller_state( hostname, false ); + } + else + { + hbsInv.mon_host ( hostname, false, true ); + hbs_cluster_del ( hostname ); + } } else if ( msg.cmd == MTC_CMD_START_HOST ) { @@ -1938,9 +1952,7 @@ void daemon_service_run ( void ) hbsInv.hbs_pulse_period = (hbsInv.hbs_pulse_period_save * HBS_BACKOFF_FACTOR) ; ilog ("%s starting heartbeat backoff (period:%d msecs)\n", hostname.c_str(), hbsInv.hbs_pulse_period ); - - /* Send SM cluster information at start of MNFA */ - 
hbs_cluster_send( hbs_sock.sm_client_sock, 0 ); + hbs_cluster_change ( "backoff" ); hbsInv.print_node_info(); } else @@ -2170,6 +2182,9 @@ void daemon_service_run ( void ) * algorithm into 'receive' mode */ heartbeat_request = false ; + /* tell cluster module that a new pulse period has started */ + hbs_cluster_period_start(); + /* Start the heartbeat timer. * All nodes are expected to send a * pulse before this timer expires. */ @@ -2263,6 +2278,9 @@ void daemon_service_run ( void ) */ else { + /* manage vault wrt peer controller */ + hbs_cluster_peer(); + for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { /* Do not service the infrastructure interface if it is not provisioned */ diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index b679fadf..f68a902a 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -232,6 +232,9 @@ void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster ); /******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/ +/* Init the control structure */ +void hbs_cluster_ctrl_init ( void ); + /* Set the cluster vault to default state. * Called upon daemon init or heartbeat period change. */ void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr ); @@ -240,16 +243,25 @@ void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr ); * Primarily to know how many history elements are missing. */ unsigned short hbs_cluster_unused_bytes ( void ); +/* Inform the cluster module that there was a change to the cluster */ +void hbs_cluster_change ( string cluster_change_reason ); + /* Add and delete hosts from the monitored list. * Automatically adjusts the numbers in the cluster vault. 
*/ void hbs_cluster_add ( string & hostname ); void hbs_cluster_del ( string & hostname ); +void hbs_cluster_rem ( unsigned short controller ); +void hbs_cluster_lock ( void ); + +/* Do stuff in preparation for another pulse period start */ +void hbs_cluster_period_start ( void ); /* Report status of storgate-0 */ void hbs_cluster_storage0_status ( iface_enum iface , bool responding ); -/* Look for and clog changes in cluster state */ -int hbs_cluster_cmp ( hbs_message_type & msg ); +/* Compare 2 histories */ +int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1, + mtce_hbs_cluster_history_type h2 ); /* Manage the enabled state of the controllers */ void hbs_manage_controller_state ( string & hostname, bool enabled ); @@ -266,6 +278,9 @@ int hbs_cluster_save ( string & hostname, mtce_hbs_network_enum network, hbs_message_type & msg ); +/* Manage peer controller vault history. */ +void hbs_cluster_peer ( void ); + /* * Called by the hbsAgent pulse receiver to create a network specific * history update entry consisting of @@ -285,6 +300,19 @@ void hbs_cluster_update ( iface_enum iface, * the other controller back in its response. */ void hbs_cluster_append ( hbs_message_type & msg ); +/* Inject a history entry at the next position for all networks of the + * specified controller. + * + * This is used to add a 0:0 entry into the vault history of the specified + * controller as indication that that no host for this pulse period + * provided history for this controller. + * + * Procedure was made generic so that it 'could' be used to add history + * of any values for fault insertion or other potential future purposes + */ +void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enabled, unsigned short hosts_responding ); + + /* Produce formatted clog's that characterize current and changing cluster * history for a given network. Each log is controller/network specific. 
*/ void hbs_cluster_log ( string & hostname, string prefix, bool force=false ); @@ -295,13 +323,14 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri void hbs_sm_handler ( void ); /* send the cluster vault to SM */ -void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid ); +void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ); /* copy cluster data from src to dst */ void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst ); /* print the contents of the vault */ -void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force ); +void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_enabled ); +void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string reason ); /* Heartbeat service state audit */ void hbs_state_audit ( void ); diff --git a/mtce/src/heartbeat/hbsClient.cpp b/mtce/src/heartbeat/hbsClient.cpp index c0dbad8d..21a9d62b 100644 --- a/mtce/src/heartbeat/hbsClient.cpp +++ b/mtce/src/heartbeat/hbsClient.cpp @@ -108,6 +108,9 @@ static stallMon_type stallMon ; /* Cached Cluster view from controllers */ mtce_hbs_cluster_type controller_cluster_cache[MTCE_HBS_MAX_CONTROLLERS]; +/* Incremented every time the hbsClient fails to receive a summary this + * controller for 2 back-to-back pulse intervals. 
*/ +int missed_controller_summary_tracker[MTCE_HBS_MAX_CONTROLLERS] ; void daemon_sigchld_hdlr ( void ) { @@ -887,8 +890,9 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) if ( hbs_sock.rx_mesg[iface].cluster.histories > MTCE_HBS_MAX_NETWORKS ) { - slog ("controller-%d provided %d network histories ; max is %d per controller", + slog ("controller-%d %s provided %d network histories ; max is %d per controller", controller, + get_iface_name_str(iface), hbs_sock.rx_mesg[iface].cluster.histories, MTCE_HBS_MAX_NETWORKS ); } @@ -903,29 +907,81 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) { hbs_cluster_copy ( hbs_sock.rx_mesg[iface].cluster, controller_cluster_cache[controller] ); + clog1 ("controller-%d cluster info from %s pulse request saved to cache", controller, get_iface_name_str(iface)); - hbs_sock.rx_mesg[iface].cluster.histories = 0 ; + /* Clear the expecting count for this controller. + * Each heartbeat cycle should result in this being cleared for + * both controllers. + * + * Clearing this is indication that we got a pulse request from + * this controller. The code below will increment this count + * for its peer controller on every request. + * An accumulation of count is indication that we are not + * receiving response from the indexed controller */ + missed_controller_summary_tracker[controller] = 0 ; if ( have_other_controller_history ( controller ) == true ) { - /* Now copy the other controller's cached cluster info into - * this controlers response */ - hbs_cluster_copy ( controller_cluster_cache[controller?0:1], - hbs_sock.rx_mesg[iface].cluster ); + /****************************************************************** + * + * Increment the expecting count for the other controller. + * If that other controller's expecting count reaches 2 or + * more then do not include a summary for that controller + * in this response. + * + * This avoids sending stale summary info. 
+ * + *****************************************************************/ - if ( daemon_get_cfg_ptr()->debug_state & 4 ) + /* Since the controllers run asynchronously the absence of + * one or 2 between pulse requests for the same controller + * can happen. This is why we compare against greater than + * the number of monitored networks (histories for this + * controller) times 2 ; following Nyquist Theorem . */ + if ( ++missed_controller_summary_tracker[controller?0:1] > + controller_cluster_cache[controller?0:1].histories * 2 ) { - string dump_banner = "" ; - dump_banner.append("controller-") ; - dump_banner.append(itos(controller?0:1)); - dump_banner.append(" cluster info from cache injected into controller-"); - dump_banner.append(itos(controller)); - dump_banner.append(":"); - dump_banner.append(get_iface_name_str(iface)); - dump_banner.append(" pulse response"); - hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner, true ); + wlog ("controller-%d %s cluster info cleared (%d)", + controller?0:1, + get_iface_name_str(iface), + missed_controller_summary_tracker[controller?0:1]); + + /* Clear the cached history for that controller who's + * heartbeat requests are no longer being seen. + * No need to clear the history entries, + * just the number of histories to 0 and update bytes. */ + controller_cluster_cache[controller?0:1].histories = 0 ; + controller_cluster_cache[controller?0:1].bytes = BYTES_IN_CLUSTER_VAULT(0) ; + + /* now that the peer controller cluster info is cleared + * we will not see another log from above until we get + * another pulse request from the peer controller. 
*/ + } + else + { + clog ("controller-%d %s cluster info added to response (%d)", + controller?0:1, + get_iface_name_str(iface), missed_controller_summary_tracker[controller?0:1] ); + + /* Now copy the other controller's cached cluster info into + * this controller's response */ + hbs_cluster_copy ( controller_cluster_cache[controller?0:1], + hbs_sock.rx_mesg[iface].cluster ); + + if ( daemon_get_cfg_ptr()->debug_state & 4 ) + { + string dump_banner = "" ; + dump_banner.append("controller-") ; + dump_banner.append(itos(controller?0:1)); + dump_banner.append(" cluster info from cache injected into controller-"); + dump_banner.append(itos(controller)); + dump_banner.append(":"); + dump_banner.append(get_iface_name_str(iface)); + dump_banner.append(" pulse response"); + hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner ); + } } } } @@ -1079,7 +1135,10 @@ int daemon_init ( string iface, string nodeType_str ) /* Initialize the controller cluster view data bounce structure */ for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ ) + { memset ( &controller_cluster_cache[c], 0, sizeof(mtce_hbs_cluster_type)) ; + missed_controller_summary_tracker[c] = 0 ; + } /* init the utility module */ hbs_utils_init (); @@ -1306,7 +1365,7 @@ void daemon_service_run ( void ) int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type)); if ( bytes ) { - hbs_cluster_dump (msg, "Cluster info received", true ); + hbs_cluster_dump (msg, "cluster info received" ); } } #endif diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp index 85b8f363..b46f95b6 100644 --- a/mtce/src/heartbeat/hbsCluster.cpp +++ b/mtce/src/heartbeat/hbsCluster.cpp @@ -37,6 +37,7 @@ typedef struct bool controller_2_enabled ; #endif + bool peer_controller_enabled ; /* Used to prevent log flooding in presence of back to back errors. */ unsigned int log_throttle ; @@ -64,9 +65,9 @@ typedef struct /* The working heartbeat cluster data vault. 
*/ mtce_hbs_cluster_type cluster ; - bool cluster_change ; - int cluster_change_threshold_count ; - int cluster_change_difference_count ; + string cluster_change_reason ; + + bool got_peer_controller_history ; msgClassSock * sm_socket_ptr ; @@ -126,6 +127,24 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr ) ctrl.log_throttle = 0 ; } +void hbs_cluster_ctrl_init ( void ) +{ + ctrl.this_controller = 0xffff ; + ctrl.controller_0_enabled = false ; + ctrl.controller_1_enabled = false ; +#ifdef THREE_CONTROLLER_SYSTEM + ctrl.controller_2_enabled = false ; +#endif + ctrl.peer_controller_enabled = false ; + ctrl.log_throttle = 0 ; + ctrl.monitored_networks = 0 ; + ctrl.monitored_hosts = 0 ; + ctrl.monitored_hostname_list.clear(); + ctrl.cluster_change_reason = "" ; + ctrl.got_peer_controller_history = false ; + ctrl.sm_socket_ptr = NULL ; + memset(&ctrl.storage_0_not_responding_count[0], 0, sizeof(ctrl.storage_0_not_responding_count)); +} /**************************************************************************** * @@ -149,6 +168,23 @@ void hbs_cluster_nums ( unsigned short this_controller, ctrl.monitored_networks = monitored_networks ; } +/**************************************************************************** + * + * Name : hbs_cluster_change + * + * Description : Maintain a the cluster change reason. + * + * cleared and printed in hbs_cluster_update. 
+ * + ***************************************************************************/ + +void hbs_cluster_change ( string cluster_change_reason ) +{ + if ( ctrl.cluster_change_reason.empty() ) + ctrl.cluster_change_reason = cluster_change_reason ; + else if ( cluster_change_reason.find ( "peer controller cluster event" ) == std::string::npos ) + ctrl.cluster_change_reason.append(" ; " + cluster_change_reason); +} /**************************************************************************** * @@ -196,7 +232,7 @@ void cluster_storage0_state ( bool enabled ) ctrl.cluster.storage0_enabled = enabled ; ilog ("storage-0 heartbeat state changed to %s", enabled ? "enabled" : "disabled" ); - ctrl.cluster_change = true ; + hbs_cluster_change ( "storage-0 state change" ); } } @@ -211,21 +247,50 @@ void cluster_storage0_state ( bool enabled ) void hbs_manage_controller_state ( string & hostname, bool enabled ) { + int controller = -1 ; + /* track controller state */ if ( hostname == CONTROLLER_0 ) { + controller = 0 ; ctrl.controller_0_enabled = enabled ; } else if ( hostname == CONTROLLER_1 ) { + controller = 1 ; ctrl.controller_1_enabled = enabled ; } -#ifdef THREE_CONTROLLER_SYSTEM - else if ( hostname == CONTROLLER_2 ) + else { - ctrl.controller_2_enabled = enabled ; + /* ignore all other host names */ + return ; + } + + /* manage the state of the peer controller */ + if ( ctrl.this_controller != controller ) + { + /* Clear peer controller cluster history when the peer + * controller goes disabled */ + if (( ctrl.peer_controller_enabled == true ) && + ( enabled == false )) + { + hbs_cluster_rem ( controller ); + } + if ( enabled == false ) + { + hbs_cluster_change ( "peer controller disabled" ) ; + } + else + { + hbs_cluster_change ( "peer controller enabled" ) ; + } + ctrl.peer_controller_enabled = enabled ; + } + else if ( enabled == false ) + { + hbs_cluster_change ( "this controller locked" ) ; + hbs_cluster_lock(); } -#endif } @@ -267,7 +332,6 @@ void hbs_cluster_add ( 
string & hostname ) ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); ilog ("%s added to cluster", hostname.c_str()); cluster_list (); - ctrl.cluster_change = true ; } /* Manage storage-0 state */ @@ -284,13 +348,6 @@ void hbs_cluster_add ( string & hostname ) /* Manage controller state ; true means enabled in this case. */ hbs_manage_controller_state ( hostname, true ); - - if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr )) - { - hbs_cluster_send( ctrl.sm_socket_ptr, 0 ); - ctrl.cluster_change = false ; - } - } /**************************************************************************** @@ -341,17 +398,32 @@ void hbs_cluster_del ( string & hostname ) cluster_list (); - ctrl.cluster_change = true ; + hbs_cluster_change ( hostname + " deleted" ); break ; } } +} - if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr )) - { - hbs_cluster_send( ctrl.sm_socket_ptr, 0 ); - ctrl.cluster_change = false ; - } +/**************************************************************************** + * + * Name : hbs_cluster_period_start + * + * Description : The following things need to be done at the start of + * every pulse period ... + * + * - set 'got_peer_controller_history' to false only to get + * set true when one at least one hbsClient response + * contains history from the other controller. + * + ***************************************************************************/ + +void hbs_cluster_period_start ( void ) +{ + clog3 ("Pulse Period Start ; waiting on responses (last:%d)", + ctrl.got_peer_controller_history ); + if ( ctrl.got_peer_controller_history ) + ctrl.got_peer_controller_history = false ; } /**************************************************************************** @@ -500,114 +572,36 @@ void hbs_cluster_update ( iface_enum iface, ctrl.storage_0_not_responding_count[n] = 0 ; } - /* - * Manage the history entry index. - * - * Get the previous entry index ... - * ... which is the one before the oldest index. - * ... 
which is the index for the next entry. - */ - unsigned short last_entry_index ; - unsigned short oldest_entry_index = history_ptr->oldest_entry_index ; - - if ( oldest_entry_index == 0 ) - { - /* Go to the end of the array. */ - last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ; - } - else - { - /* Otherwise, the previous index in the array */ - last_entry_index = oldest_entry_index - 1 ; - } - - bool logit = false ; - string logit_reason = "" ; - - /* Update the history with this data. */ - history_ptr->entry[oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ; - history_ptr->entry[oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ; - - if (( history_ptr->entry[oldest_entry_index].hosts_enabled != - history_ptr->entry[ last_entry_index].hosts_enabled ) || - ( history_ptr->entry[oldest_entry_index].hosts_responding != - history_ptr->entry[ last_entry_index].hosts_responding)) - { - /* Only log on change events. */ - if ( history_ptr->entry[oldest_entry_index].hosts_enabled == - history_ptr->entry[oldest_entry_index].hosts_responding ) - { - ilog ("controller-%d %s cluster of %d is healthy", - ctrl.this_controller, - hbs_cluster_network_name(n).c_str(), - history_ptr->entry[oldest_entry_index].hosts_enabled); - ctrl.cluster_change_threshold_count = 0 ; - ctrl.cluster_change_difference_count = 0 ; - } - else - { - ctrl.cluster_change_threshold_count++ ; - ctrl.cluster_change_difference_count = - history_ptr->entry[oldest_entry_index].hosts_enabled - - history_ptr->entry[oldest_entry_index].hosts_responding ; - } - } - if ( daemon_get_cfg_ptr()->debug_state&4 ) - { - logit = true ; - logit_reason = "(debug)" ; - } -// else if (( ctrl.cluster_change_threshold_count == 1 ) && -// ( cluster_change == false )) -// { -// logit = true ; -// logit_reason = "" ; -// } - else if ( ctrl.cluster_change_threshold_count >= CLUSTER_CHANGE_THRESHOLD ) - { - logit = true ; - ctrl.cluster_change_threshold_count = 0 ; - logit_reason = "(threshold)" ; 
- } - else - { - int delta = - history_ptr->entry[oldest_entry_index].hosts_enabled - - history_ptr->entry[oldest_entry_index].hosts_responding ; - if ( delta != ctrl.cluster_change_difference_count ) - { - logit = true ; - ctrl.cluster_change_difference_count = delta ; - logit_reason = "(delta)" ; - } - } - - if ( logit ) - { - ilog ("controller-%d %s cluster of %d with %d responding (%d:%d) %s", - ctrl.this_controller, - hbs_cluster_network_name(n).c_str(), - history_ptr->entry[oldest_entry_index].hosts_enabled, - history_ptr->entry[oldest_entry_index].hosts_responding, - ctrl.cluster_change_difference_count, - not_responding_hosts, - logit_reason.c_str()); - } - /* Increment the entries count till it reaches the max. */ if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES ) history_ptr->entries++ ; - /* Manage the next entry update index ; aka the oldest index. */ - if ( oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1)) + /* Update the history with this data. */ + history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ; + history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ; + + /* Manage the next entry update index ; aka the oldest index. + * - handle not full case ; oldest entry is the first entry + * - handle the full case ; wrap around */ + if (( history_ptr->entries == 0 ) || + ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))) history_ptr->oldest_entry_index = 0 ; else history_ptr->oldest_entry_index++ ; + /* send SM an update if the cluster has changed which is indicated + * by string content in ctrl.cluster_change_reason. */ + if ( ! ctrl.cluster_change_reason.empty() ) + { + hbs_cluster_send( ctrl.sm_socket_ptr, 0, ctrl.cluster_change_reason ); + ctrl.cluster_change_reason = "" ; + } + /* clear the log throttle if we are updating history ok. 
*/ ctrl.log_throttle = 0 ; } + /**************************************************************************** * * Name : hbs_cluster_append @@ -646,6 +640,23 @@ void hbs_cluster_append ( hbs_message_type & msg ) ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes ); } +/* Manage peer controller vault history. */ +void hbs_cluster_peer ( void ) +{ + /* Manage updating the local peer controller history data with 0:0 + * for this pulse period if there was no response from the peer + * controller for this pulse period. */ + if (( ctrl.got_peer_controller_history == false ) && + ( ctrl.peer_controller_enabled == true )) + { + ilog ("missing peer controller cluster view" ); /* ERIK: DEBUG */ + + /* if no nodes have reported peer controller history then inject + * a 0:0 value in for this pulse period for that controller. */ + hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 ); + } +} + /**************************************************************************** * * Name : hbs_cluster_unused_bytes @@ -679,7 +690,7 @@ unsigned short hbs_cluster_unused_bytes ( void ) * ***************************************************************************/ -void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid ) +void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ) { ctrl.cluster.reqid = (unsigned short)reqid ; if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true )) @@ -691,16 +702,7 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid ) elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n", bytes , errno, strerror(errno)); } - else - { - string reason = "" ; - // ilog ("heartbeat cluster vault sent to SM (%d bytes)", len ); - if ( reqid ) - reason = "cluster query" ; - else - reason = "cluster event" ; - hbs_cluster_dump ( ctrl.cluster, reason, true ); - } + hbs_cluster_dump ( ctrl.cluster, reason ); } else { @@ -725,6 +727,12 @@ void hbs_history_save ( string hostname, 
mtce_hbs_cluster_history_type & sample if (( ctrl.cluster.history[h].controller == sample.controller ) && ( ctrl.cluster.history[h].network == sample.network )) { + if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) ) + { + hbs_cluster_change ("peer controller cluster event " + + hbs_cluster_network_name((mtce_hbs_network_enum)sample.network)); + } + memcpy( &ctrl.cluster.history[h], &sample, sizeof(mtce_hbs_cluster_history_type)); @@ -738,9 +746,13 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample } } + hbs_cluster_change ( "peer controller cluster " + + hbs_cluster_network_name((mtce_hbs_network_enum)sample.network)); + /* not found ? Add a new one */ memcpy( &ctrl.cluster.history[ctrl.cluster.histories], &sample, sizeof(mtce_hbs_cluster_history_type)); + ctrl.cluster.histories++ ; ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories); @@ -753,7 +765,7 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample void hbs_state_audit ( void ) { - hbs_cluster_dump ( ctrl.cluster, "Audit", true ); + hbs_cluster_dump ( ctrl.cluster, "Audit" ); } @@ -779,46 +791,39 @@ void hbs_cluster_log ( string & hostname, * * Name : hbs_cluster_cmp * - * Descrition : Performs a sanity check over the cluster structure. + * Descrition : Compare 2 histories * - * Assumptions : Debug tool, not called at runtime. - * - * Returns : PASS or FAIL + * Returns : 0 - when number of enabled hosts and responding + * hosts are the same for all the entries. + * # - the number of entries that are different. 
* ***************************************************************************/ -int hbs_cluster_cmp( hbs_message_type & msg ) +int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1, + mtce_hbs_cluster_history_type h2 ) { - if ( msg.cluster.version < ctrl.cluster.version ) + int h1_delta = 0 ; + int h2_delta = 0 ; + int delta = 0 ; + + for ( int e = 0 ; e < h1.entries ; e++ ) + if ( h1.entry[e].hosts_enabled != h1.entry[e].hosts_responding ) + h1_delta++ ; + + for ( int e = 0 ; e < h2.entries ; e++ ) + if ( h2.entry[e].hosts_enabled != h2.entry[e].hosts_responding ) + h2_delta++ ; + + if ( h1_delta > h2_delta ) + delta = h1_delta-h2_delta ; + else if ( h2_delta > h1_delta ) + delta = h2_delta-h1_delta ; + + if ( delta ) { - wlog ("Unexpected version (%d:%d)", - msg.cluster.version, ctrl.cluster.version ); + clog3 ("peer controller reporting %d deltas", delta ); } - else if ( msg.cluster.revision != ctrl.cluster.revision ) - { - wlog ("Unexpected revision (%d:%d)", - msg.cluster.revision, ctrl.cluster.revision ); - } - else if ( msg.cluster.magic_number != ctrl.cluster.magic_number ) - { - wlog ("Unexpected magic number (%d:%d)", - msg.cluster.magic_number, ctrl.cluster.magic_number ); - } - else if ( msg.cluster.period_msec != ctrl.cluster.period_msec ) - { - wlog ("Cluster Heartbeat period delta (%d:%d)", - msg.cluster.period_msec, ctrl.cluster.period_msec ); - } - else if ( msg.cluster.storage0_enabled != ctrl.cluster.storage0_enabled ) - { - wlog ("Cluster storage0 enabled state delta (%d:%d)", - msg.cluster.storage0_enabled, ctrl.cluster.storage0_enabled ); - } - else - { - return (PASS); - } - return (FAIL); + return(delta); } /**************************************************************************** @@ -843,23 +848,106 @@ int hbs_cluster_save ( string & hostname, if ( ! ctrl.monitored_hosts ) return RETRY ; - if ( msg.cluster.histories == 0 ) - return PASS ; - - for ( int h = 0 ; h < msg.cluster.histories ; h++ ) + if ( ! 
msg.cluster.histories ) { - if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS ) + wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT, + "%s %s ; no peer controller history", + hostname.c_str(), + hbs_cluster_network_name(network).c_str()); + } + + if ( ctrl.peer_controller_enabled ) + { + /* Should only contain the other controllers history */ + for ( int h = 0 ; h < msg.cluster.histories ; h++ ) { - elog ("Invalid network id (%d:%d:%d)", - h, - msg.cluster.history[h].controller, - msg.cluster.history[h].network ); + if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS ) + { + elog ("Invalid network id (%d:%d:%d)", + h, + msg.cluster.history[h].controller, + msg.cluster.history[h].network ); + } + else if ( msg.cluster.history[h].controller != ctrl.this_controller ) + { + /* set that we got some history and save it */ + ctrl.got_peer_controller_history = true ; + hbs_history_save ( hostname, msg.cluster.history[h] ); + } + hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) ); } - else if ( msg.cluster.history[h].controller != ctrl.this_controller ) - { - hbs_history_save ( hostname, msg.cluster.history[h] ); - } - hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) ); } return (PASS); } + + +void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enabled, unsigned short hosts_responding ) +{ + for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) + { + if ( ctrl.cluster.history[h].controller == controller ) + { + bool dumpit = false ; + if (( ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_enabled ) || + ( ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_responding )) + { + /* Inject requested data for all networks of specified controller */ + ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_enabled = hosts_enabled ; + 
ctrl.cluster.history[h].entry[ctrl.cluster.history[h].oldest_entry_index].hosts_responding = hosts_responding ; + + wlog ("controller-%d injected %d:%d into controller-%d %s history (entry %d)", + controller?0:1, + hosts_enabled, + hosts_responding, + controller, + hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(), + ctrl.cluster.history[h].oldest_entry_index ); + dumpit = true ; + } + /* manage the oldest index */ + if ( ++ctrl.cluster.history[h].oldest_entry_index == MTCE_HBS_HISTORY_ENTRIES ) + ctrl.cluster.history[h].oldest_entry_index = 0 ; + + /* DEBUG: */ + if ( dumpit ) + hbs_cluster_dump( ctrl.cluster.history[h], ctrl.cluster.storage0_enabled ); + } + } +} + + +void hbs_cluster_rem ( unsigned short controller ) +{ + int removed = 0 ; + for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) + { + if ( ctrl.cluster.history[h].controller == controller ) + { + removed++ ; + wlog ("controller-%d %s network history removed from cluster (slot %d)", + controller, + hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(), + h ); + memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type)); + } + } + + if ( removed ) + { + hbs_cluster_change ( "removed controller history" ) ; + } + + ctrl.cluster.histories -= removed ; + ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories); +} + +/* remove all cluster history on a lock operation */ +void hbs_cluster_lock( void ) +{ + ilog ("controller-%d lock ; clearing all cluster info", ctrl.this_controller ); + for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) + { + memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type)); + } + ctrl.cluster.histories = 0 ; +} diff --git a/mtce/src/heartbeat/hbsUtil.cpp b/mtce/src/heartbeat/hbsUtil.cpp index 3980014a..5006b089 100644 --- a/mtce/src/heartbeat/hbsUtil.cpp +++ b/mtce/src/heartbeat/hbsUtil.cpp @@ -113,11 +113,13 @@ string hbs_cluster_network_name ( 
mtce_hbs_network_enum network ) /**************************************************************************** * - * Name : hbs_cluster_copy + * Name : hbs_cluster_copy * - * Descrition : Copies cluster from src to dst. + * Description : Copies cluster from src to dst. * - * Returns : Nothing. + * Parameters : cluster type. + * + * Returns : Nothing. * ***************************************************************************/ @@ -206,7 +208,6 @@ void hbs_cluster_log ( string & hostname, line.append (str); str[0] = '\0' ; } -//#ifdef WANT_DOTS else if (( history_ptr->entry[this_index].hosts_enabled == e.hosts_enabled ) && ( history_ptr->entry[this_index].hosts_responding == @@ -214,7 +215,6 @@ void hbs_cluster_log ( string & hostname, { line.append(". "); } -//#endif else { snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d", @@ -302,66 +302,83 @@ void hbs_cluster_log ( string & hostname, /**************************************************************************** * - * name : hbs_cluster_dump + * Name : hbs_cluster_dump * - * Description: Formatted dump of the vault contents to the log file. + * Description : Formatted dump of the specified history to the log file. + * + * Parameters : + * + * history is a single history type whose contents will be logged. + * storage0_enabled true suggests the storage state should also be logged. * ***************************************************************************/ -void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force ) + +void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_enabled ) { - if ( vault.version == 0 ) + #define MAX_LINE_LEN (500) + char str[MAX_LINE_LEN] ; + int i = 0 ; + for ( int e = 0 ; e < history.entries_max ; e++ ) + { + snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" , + history.oldest_entry_index==e ? 
'>' : ' ', + history.entry[e].hosts_enabled, + history.entry[e].hosts_responding); + i = strlen(str) ; + } + if ( storage0_enabled ) + { + syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s", + history.controller, + hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(), + history.storage0_responding ? "y" : "n", + str); + } + else + { + syslog ( LOG_INFO, "Cluster Vault : C%d %s %s", + history.controller, + hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(), + str); + } +} + +/**************************************************************************** + * + * Name : hbs_cluster_dump + * + * Description : Formatted dump of the vault contents to the log file. + * + * Parameters : + * + * vault is a reference to a cluster type whose contents will be logged. + * reason is a string indicating the reason for the dump. + * + ***************************************************************************/ + +void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string reason ) +{ + if (( vault.version == 0 ) || ( vault.histories == 0 )) return ; - int debug = daemon_get_cfg_ptr()->debug_state ; - - if (( debug & 2 ) || ( force == true )) + /* The reason is cumulative , if long then use a new line */ + if ( reason.length() > 40 ) { - ilog ("%s", log_prefix.c_str()); - syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec heartbeat period %s;%d network heartbeat response histories (%d bytes)", - vault.version, - vault.revision, - vault.period_msec, - vault.storage0_enabled ? " with storage-0: enabled " : "", - vault.histories, - vault.bytes ); + syslog ( LOG_INFO, "Cluster Dump : %s", reason.c_str()); + reason = "" ; } + syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec period %s;%d network histories (%d bytes) %s", + vault.version, + vault.revision, + vault.period_msec, + vault.storage0_enabled ? 
" with storage-0: enabled " : "", + vault.histories, + vault.bytes, + reason.c_str()); - if (( debug & 4 ) || ( force == true )) + for ( int h = 0 ; h < vault.histories ; h++ ) { - for ( int h = 0 ; h < vault.histories ; h++ ) - { - #define MAX_LINE_LEN (500) - char str[MAX_LINE_LEN] ; - int i = 0 ; - for ( int e = 0 ; e < vault.history[h].entries_max ; e++ ) - { - snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" , - vault.history[h].oldest_entry_index==e ? '>' : ' ', - vault.history[h].entry[e].hosts_enabled, - vault.history[h].entry[e].hosts_responding); - i = strlen(str) ; - } - if ( vault.storage0_enabled ) - { - syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s", - vault.history[h].controller, - hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(), - vault.history[h].storage0_responding ? "y" : "n", - str); - } - else - { - syslog ( LOG_INFO, "Cluster Vault : C%d %s %s", - vault.history[h].controller, - hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(), - str); - } - } - } - - if ( debug & 8 ) - { - dump_memory ( &vault, 16, vault.bytes ); + hbs_cluster_dump ( vault.history[h], vault.storage0_enabled ); } } diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index d0aa0284..d134ccc6 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -795,9 +795,9 @@ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int } if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) ) - { msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ; - } + if ( daemon_is_file_present ( SMGMT_UNHEALTHY_FILE ) ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_UNHEALTHY ; /* add the interface and sequence number to the mtcAlice message */ identity.append ( ",\"interface\":\""); diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 06dfd228..13b0a9fd 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp 
+++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -794,8 +794,10 @@ int send_hbs_command ( string hostname, int cmd, string controller ) controllers.clear(); if ( controller == CONTROLLER ) { - controllers.push_back(CONTROLLER_0); - controllers.push_back(CONTROLLER_1); + if ( obj_ptr->hostname_provisioned(CONTROLLER_0) ) + controllers.push_back(CONTROLLER_0); + if ( obj_ptr->hostname_provisioned(CONTROLLER_1) ) + controllers.push_back(CONTROLLER_1); } else { diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 2536a7b7..73f85b1d 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1065,6 +1065,18 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) enableStageChange(node_ptr, MTC_ENABLE__FAILURE); break ; } + + else if (( is_controller(node_ptr) == true ) && + ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY )) + { + elog ("%s is SM UNHEALTHY", + node_ptr->hostname.c_str() ); + elog ("%s ... enable failed ; controller needs to reboot\n", + node_ptr->hostname.c_str()); + enableStageChange(node_ptr, MTC_ENABLE__FAILURE); + break ; + } + /* Set the node mtcAlive timer to configured value. * This will revert bact to normal timeout after any first * unlock value that may be in effect. */ diff --git a/mtce/src/maintenance/mtcStubs.cpp b/mtce/src/maintenance/mtcStubs.cpp index 4fc3ff80..4d579928 100644 --- a/mtce/src/maintenance/mtcStubs.cpp +++ b/mtce/src/maintenance/mtcStubs.cpp @@ -21,3 +21,7 @@ void hbs_cluster_log ( string & hostname, string prefix, bool force=false ) UNUSED(force); } +void hbs_cluster_change ( string reason ) +{ + UNUSED(reason); +}