diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data index 9e6662d0..8784f945 100644 --- a/mtce/centos/build_srpm.data +++ b/mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=143 +TIS_PATCH_VER=144 BUILD_IS_SLOW=5 diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index e88b84be..cf08b441 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -8249,7 +8249,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p - +#define HBS_LOSS_REPORT_THROTTLE (100) int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) { int lost = 0 ; @@ -8408,16 +8408,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) ( infra_degrade_only == true )) { /* Only print the log at the threshold boundary */ - if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) + if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) { if ( this->active_controller ) { manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); } - wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, - "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), - get_iface_name_str(iface) ); + wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n", + pulse_ptr->hostname.c_str(), + get_iface_name_str(iface) ); hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); } } @@ -8430,21 +8430,20 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) { /* Only print the log at the threshold boundary */ - if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) + if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) { if ( this->active_controller ) { manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); } - wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, - "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), - get_iface_name_str(iface) ); + wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n", + pulse_ptr->hostname.c_str(), + get_iface_name_str(iface)); hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); } } - else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) && - ( pulse_ptr->hbs_failure[iface] == false )) + else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) { elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface) ); diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index ffa0e0fa..950c6e2c 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -1183,11 +1183,11 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface ) memset (&event, 0 , sizeof(mtc_message_type)); if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS ) { - // daemon_dump_membuf_banner (); - hbsInv.print_node_info (); + if ( hbs_config.debug_state == 2 ) + { + hbsInv.print_node_info (); + } hbs_cluster_log ( hbsInv.my_hostname, "event", true ); - - // daemon_dump_membuf (); snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header()); } else if ( event_cmd == MTC_EVENT_LOOPBACK ) @@ -1504,9 +1504,6 @@ void daemon_service_run ( void ) daemon_exit (); } - /* set this controller as provisioned */ - hbs_manage_controller_state ( hbsInv.my_hostname , true ); - /* Run heartbeat service forever or until stop condition */ for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; ) { @@ -1518,7 +1515,6 @@ void daemon_service_run ( void ) if ( sockets_init == true ) { hbsInv.print_node_info(); - hbs_state_audit (); } @@ -1725,7 +1721,7 @@ void daemon_service_run ( void ) { hbsInv.hbs_disabled = true ; hbsInv.hbs_state_change = true ; - hbs_cluster_lock(); + hbs_controller_lock (); ilog ("heartbeat service going disabled (locked)"); /* force the throttle 'still disabled' log to wait for @@ -1904,17 +1900,12 @@ void daemon_service_run ( void ) } else if ( msg.cmd == MTC_CMD_STOP_HOST ) { - if ( hostname == hbsInv.my_hostname ) - { - ilog ("%s heartbeat service disabled by stop command", - hostname.c_str()); - - hbs_manage_controller_state( hostname, false ); - } - else + if ( hostname != hbsInv.my_hostname ) { hbsInv.mon_host ( hostname, false, true ); hbs_cluster_del ( hostname ); + ilog ("%s heartbeat service disabled by stop command", + hostname.c_str()); } } else if ( msg.cmd == MTC_CMD_START_HOST ) @@ -2309,7 +2300,10 @@ void daemon_service_run ( void ) */ bool storage_0_responding = true ; int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding); - hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding); + if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled ) + { + hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding); + } } hbsTimer.ring = false ; heartbeat_request = true ; diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index f68a902a..a03a3cac 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -250,8 +250,9 @@ void hbs_cluster_change ( string cluster_change_reason ); * Automatically adjusts the numbers in the cluster vault. */ void hbs_cluster_add ( string & hostname ); void hbs_cluster_del ( string & hostname ); -void hbs_cluster_rem ( unsigned short controller ); -void hbs_cluster_lock ( void ); + +/* do actions when this controller is detected as locked */ +void hbs_controller_lock ( void ); /* Do stuff in preparation for another pulse period start */ void hbs_cluster_period_start ( void ); @@ -263,9 +264,6 @@ void hbs_cluster_storage0_status ( iface_enum iface , bool responding ); int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1, mtce_hbs_cluster_history_type h2 ); -/* Manage the enabled state of the controllers */ -void hbs_manage_controller_state ( string & hostname, bool enabled ); - /* Set the number of monitored hosts and this controller's * number in the cluster vault. */ void hbs_cluster_nums ( unsigned short this_controller, diff --git a/mtce/src/heartbeat/hbsClient.cpp b/mtce/src/heartbeat/hbsClient.cpp index 21a9d62b..6bbc88e2 100644 --- a/mtce/src/heartbeat/hbsClient.cpp +++ b/mtce/src/heartbeat/hbsClient.cpp @@ -750,6 +750,7 @@ static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ; static int rx_error_count[MAX_IFACES] = {0,0} ; static int tx_error_count[MAX_IFACES] = {0,0} ; +static int missing_history_count[MAX_IFACES] = {0,0} ; #define ERROR_LOG_THRESHOLD (200) @@ -961,18 +962,21 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) } else { + int debug_state = daemon_get_cfg_ptr()->debug_state ; + clog ("controller-%d %s cluster info added to response (%d)", controller?0:1, - get_iface_name_str(iface), missed_controller_summary_tracker[controller?0:1] ); + get_iface_name_str(iface), + missed_controller_summary_tracker[controller?0:1] ); /* Now copy the other controller's cached cluster info into * this controller's response */ hbs_cluster_copy ( controller_cluster_cache[controller?0:1], hbs_sock.rx_mesg[iface].cluster ); - if ( daemon_get_cfg_ptr()->debug_state & 4 ) + string dump_banner = "" ; + if ( debug_state ) { - string dump_banner = "" ; dump_banner.append("controller-") ; dump_banner.append(itos(controller?0:1)); dump_banner.append(" cluster info from cache injected into controller-"); @@ -980,10 +984,30 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) dump_banner.append(":"); dump_banner.append(get_iface_name_str(iface)); dump_banner.append(" pulse response"); + } + + if ( debug_state & 4 ) + { hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner ); } + else + { + clog ("%s", dump_banner.c_str()); + } } } + if (missing_history_count[iface]) + { + ilog ("controller-%d %s providing cluster history", + controller, get_iface_name_str(iface)); + missing_history_count[iface] = 0 ; + } + } + else + { + wlog_throttled ( missing_history_count[iface], 5000, + "controller-%d %s proividing no cluster history", + controller, get_iface_name_str(iface)); } } diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp index b46f95b6..175f5866 100644 --- a/mtce/src/heartbeat/hbsCluster.cpp +++ b/mtce/src/heartbeat/hbsCluster.cpp @@ -30,17 +30,17 @@ typedef struct /* Contains the controller number (0 or 1) for this controller. */ unsigned short this_controller ; - /* Preserves which controllers are enabled. */ - bool controller_0_enabled ; - bool controller_1_enabled ; -#ifdef THREE_CONTROLLER_SYSTEM - bool controller_2_enabled ; -#endif - + /* Used to manage the cluster based on this and peer controller state */ bool peer_controller_enabled ; + /* Used to prevent log flooding in presence of back to back errors. */ unsigned int log_throttle ; + /* Used to log when + * - peer history goes missing (false -> true change) + * - peer history starts being received ( true -> false change ) */ + bool peer_history_missing ; + /* Used to threshold storage-0 not responding state */ unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS]; @@ -130,12 +130,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr ) void hbs_cluster_ctrl_init ( void ) { ctrl.this_controller = 0xffff ; - ctrl.controller_0_enabled = false ; - ctrl.controller_1_enabled = false ; -#ifdef THREE_CONTROLLER_SYSTEM - ctrl.controller_2_enabled = false ; -#endif ctrl.peer_controller_enabled = false ; + ctrl.peer_history_missing = true ; ctrl.log_throttle = 0 ; ctrl.monitored_networks = 0 ; ctrl.monitored_hosts = 0 ; @@ -236,64 +232,6 @@ void cluster_storage0_state ( bool enabled ) } } - -/**************************************************************************** - * - * Name : hbs_manage_controller_state - * - * Description : Track the monitored enabled state of the controllers. - * - ***************************************************************************/ - -void hbs_manage_controller_state ( string & hostname, bool enabled ) -{ - int controller = -1 ; - - /* track controller state */ - if ( hostname == CONTROLLER_0 ) - { - controller = 0 ; - ctrl.controller_0_enabled = enabled ; - } - else if ( hostname == CONTROLLER_1 ) - { - controller = 1 ; - ctrl.controller_1_enabled = enabled ; - } - else - { - /* ignore all other host names */ - return ; - } - - /* manage the state of the peer controller */ - if ( ctrl.this_controller != controller ) - { - /* Clear peer controller cluster history when the peer - * controller goes disabled */ - if (( ctrl.peer_controller_enabled == true ) && - ( enabled == false )) - { - hbs_cluster_rem ( controller ); - } - if ( enabled == false ) - { - hbs_cluster_change ( "peer controller disabled" ) ; - } - else - { - hbs_cluster_change ( "peer controller enabled" ) ; - } - ctrl.peer_controller_enabled = enabled ; - } - else if ( enabled == false ) - { - hbs_cluster_change ( "this controller locked" ) ; - hbs_cluster_lock(); - } -} - - /**************************************************************************** * * Name : hbs_cluster_add @@ -346,8 +284,11 @@ void hbs_cluster_add ( string & hostname ) hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); } - /* Manage controller state ; true means enabled in this case. */ - hbs_manage_controller_state ( hostname, true ); + /* Catch enable/provisioning of the peer controller */ + if (( hostname == CONTROLLER_0 ) && ( ctrl.this_controller != 0 )) + ctrl.peer_controller_enabled = true ; + if (( hostname == CONTROLLER_1 ) && ( ctrl.this_controller != 1 )) + ctrl.peer_controller_enabled = true ; } /**************************************************************************** @@ -391,9 +332,6 @@ void hbs_cluster_del ( string & hostname ) hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); } - /* Manage controller state ; false means not enabled in this case. */ - hbs_manage_controller_state ( hostname , false ); - ilog ("%s deleted from cluster", hostname.c_str()); cluster_list (); @@ -455,7 +393,6 @@ void hbs_cluster_period_start ( void ) * ***************************************************************************/ - void hbs_cluster_update ( iface_enum iface, unsigned short not_responding_hosts, bool storage_0_responding ) @@ -636,7 +573,7 @@ void hbs_cluster_append ( hbs_message_type & msg ) } msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories); - clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)", + clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)", ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes ); } @@ -649,12 +586,22 @@ void hbs_cluster_peer ( void ) if (( ctrl.got_peer_controller_history == false ) && ( ctrl.peer_controller_enabled == true )) { - ilog ("missing peer controller cluster view" ); /* ERIK: DEBUG */ - + if ( ctrl.peer_history_missing == false ) + { + wlog ( "missing peer controller cluster view" ); + ctrl.peer_history_missing = true ; + } /* if no nodes have reported peer controller history then inject * a 0:0 value in for this pulse period for that controller. */ hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 ); } + else if (( ctrl.got_peer_controller_history == true ) && + ( ctrl.peer_controller_enabled == true ) && + ( ctrl.peer_history_missing == true )) + { + wlog ( "receiving peer controller cluster view" ); + ctrl.peer_history_missing = false ; + } } /**************************************************************************** @@ -720,7 +667,9 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason * ***************************************************************************/ -void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample ) +void hbs_history_save ( string hostname, + mtce_hbs_network_enum network, + mtce_hbs_cluster_history_type & sample ) { for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) { @@ -736,12 +685,12 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample memcpy( &ctrl.cluster.history[h], &sample, sizeof(mtce_hbs_cluster_history_type)); - clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)", + clog1 ("controller-%d vault update from controller-%d %s reply with %d histories (this:%s)", ctrl.this_controller, sample.controller, - hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(), - hostname.c_str(), - ctrl.cluster.histories); + hbs_cluster_network_name(network).c_str(), + ctrl.cluster.histories, + hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str()); return ; } } @@ -756,16 +705,17 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample ctrl.cluster.histories++ ; ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories); - ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views", + ilog ("controller-%d added new %s:%s history to vault ; now have %d network views", ctrl.this_controller, - sample.controller, + hostname.c_str(), hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(), ctrl.cluster.histories); } void hbs_state_audit ( void ) { - hbs_cluster_dump ( ctrl.cluster, "Audit" ); + if ( ctrl.monitored_hosts ) + hbs_cluster_dump ( ctrl.cluster, "Audit" ); } @@ -872,7 +822,7 @@ int hbs_cluster_save ( string & hostname, { /* set that we got some history and save it */ ctrl.got_peer_controller_history = true ; - hbs_history_save ( hostname, msg.cluster.history[h] ); + hbs_history_save ( hostname, network, msg.cluster.history[h] ); } hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) ); } @@ -915,39 +865,26 @@ void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enable } } +/**************************************************************************** + * + * Name : hbs_controller_lock + * + * Description : Clear all history for this controller. + * Called when this controller is detected as locked. + * + ***************************************************************************/ -void hbs_cluster_rem ( unsigned short controller ) +void hbs_controller_lock ( void ) { - int removed = 0 ; - for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) + if ( ctrl.cluster.histories ) { - if ( ctrl.cluster.history[h].controller == controller ) + ilog ("controller-%d locked ; clearing all cluster info", ctrl.this_controller ); + for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) { - removed++ ; - wlog ("controller-%d %s network history removed from cluster (slot %d)", - controller, - hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(), - h ); memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type)); } + ctrl.cluster.histories = 0 ; + hbs_cluster_change ( "this controller locked" ) ; } - - if ( removed ) - { - hbs_cluster_change ( "removed controller history" ) ; - } - - ctrl.cluster.histories -= removed ; - ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories); } -/* remove all cluster history on a lock operation */ -void hbs_cluster_lock( void ) -{ - ilog ("controller-%d lock ; clearing all cluster info", ctrl.this_controller ); - for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) - { - memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type)); - } - ctrl.cluster.histories = 0 ; -}