Mtce: Improve robustness of heartbeat Loss reporting

Closes-Bug: 1806963

In the case where the active controller experiences a
spontaneous reboot failure there is the potential for
a race condition in the new Active-Active Heartbeat
model between the inactive hbsAgent and mtcAgent
starting up on the newly active controller.

The inactive hbsAgent can report a heartbeat Loss before
SM starts up the mtcAgent. This results in no detection
of a heartbeat failed host.

This update modifies the hbsAgent to continue to report
heartbeat Loss at a throttled rate while the hbsAgent
continues to experience heartbeat loss of enabled monitored
hosts. This change is implemented in nodeClass.cpp.

Debug of this issue also revealed another undesirable race
condition and logging issue when a controller is locked. This
issue is remedied with the introduction of a control structure
'locked' state that is set on controller lock and looked at in
the hbs_cluster_update utility. hbsCluster.cpp

Two additional hbsAgent logging changes were implemented with
this update.

  1. Only print "missing peer controller cluster view" on a
     state change event. Otherwise, this becomes excessive
     whenever the inactive controller fails.
     hbsAgent.cpp

  2. Don't print the full heartbeat inventory and state banner
     with hbsInv.print_node_info on every heartbeat Loss event.
     Otherwise, this becomes excessive in large systems.
     hbsCluster.cpp

Test Plan:
PASS: Verify hbsAgent log stream for implemented improvements.
PASS: Verify Lock inactive controller several times.
PASS: Fail inactive controller several times. Verify detect.
PASS: Reboot active controller several times. Verify detect.
PASS: DOR System several times. Verify proper recovery.
PASS: DOR system but prevent power-up of several hosts. Verify detect.

Change-Id: I36e6309e141e9c7844b736cce0cf0cddff3eb588
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2018-12-14 11:48:57 -05:00
parent 8eb55b2b03
commit 4fb3ce1121
6 changed files with 105 additions and 153 deletions

View File

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=143
TIS_PATCH_VER=144
BUILD_IS_SLOW=5

View File

@ -8249,7 +8249,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
#define HBS_LOSS_REPORT_THROTTLE (100)
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
{
int lost = 0 ;
@ -8408,16 +8408,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
( infra_degrade_only == true ))
{
/* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
}
@ -8430,21 +8430,20 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
{
/* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface));
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
}
else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) &&
( pulse_ptr->hbs_failure[iface] == false ))
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );

View File

@ -1183,11 +1183,11 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface )
memset (&event, 0 , sizeof(mtc_message_type));
if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS )
{
// daemon_dump_membuf_banner ();
hbsInv.print_node_info ();
if ( hbs_config.debug_state == 2 )
{
hbsInv.print_node_info ();
}
hbs_cluster_log ( hbsInv.my_hostname, "event", true );
// daemon_dump_membuf ();
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header());
}
else if ( event_cmd == MTC_EVENT_LOOPBACK )
@ -1504,9 +1504,6 @@ void daemon_service_run ( void )
daemon_exit ();
}
/* set this controller as provisioned */
hbs_manage_controller_state ( hbsInv.my_hostname , true );
/* Run heartbeat service forever or until stop condition */
for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; )
{
@ -1518,7 +1515,6 @@ void daemon_service_run ( void )
if ( sockets_init == true )
{
hbsInv.print_node_info();
hbs_state_audit ();
}
@ -1725,7 +1721,7 @@ void daemon_service_run ( void )
{
hbsInv.hbs_disabled = true ;
hbsInv.hbs_state_change = true ;
hbs_cluster_lock();
hbs_controller_lock ();
ilog ("heartbeat service going disabled (locked)");
/* force the throttle 'still disabled' log to wait for
@ -1904,17 +1900,12 @@ void daemon_service_run ( void )
}
else if ( msg.cmd == MTC_CMD_STOP_HOST )
{
if ( hostname == hbsInv.my_hostname )
{
ilog ("%s heartbeat service disabled by stop command",
hostname.c_str());
hbs_manage_controller_state( hostname, false );
}
else
if ( hostname != hbsInv.my_hostname )
{
hbsInv.mon_host ( hostname, false, true );
hbs_cluster_del ( hostname );
ilog ("%s heartbeat service disabled by stop command",
hostname.c_str());
}
}
else if ( msg.cmd == MTC_CMD_START_HOST )
@ -2309,7 +2300,10 @@ void daemon_service_run ( void )
*/
bool storage_0_responding = true ;
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
{
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
}
}
hbsTimer.ring = false ;
heartbeat_request = true ;

View File

@ -250,8 +250,9 @@ void hbs_cluster_change ( string cluster_change_reason );
* Automatically adjusts the numbers in the cluster vault. */
void hbs_cluster_add ( string & hostname );
void hbs_cluster_del ( string & hostname );
void hbs_cluster_rem ( unsigned short controller );
void hbs_cluster_lock ( void );
/* do actions when this controller is detected as locked */
void hbs_controller_lock ( void );
/* Do stuff in preparation for another pulse period start */
void hbs_cluster_period_start ( void );
@ -263,9 +264,6 @@ void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1,
mtce_hbs_cluster_history_type h2 );
/* Manage the enabled state of the controllers */
void hbs_manage_controller_state ( string & hostname, bool enabled );
/* Set the number of monitored hosts and this controller's
* number in the cluster vault. */
void hbs_cluster_nums ( unsigned short this_controller,

View File

@ -750,6 +750,7 @@ static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ;
static int rx_error_count[MAX_IFACES] = {0,0} ;
static int tx_error_count[MAX_IFACES] = {0,0} ;
static int missing_history_count[MAX_IFACES] = {0,0} ;
#define ERROR_LOG_THRESHOLD (200)
@ -961,18 +962,21 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
}
else
{
int debug_state = daemon_get_cfg_ptr()->debug_state ;
clog ("controller-%d %s cluster info added to response (%d)",
controller?0:1,
get_iface_name_str(iface), missed_controller_summary_tracker[controller?0:1] );
get_iface_name_str(iface),
missed_controller_summary_tracker[controller?0:1] );
/* Now copy the other controller's cached cluster info into
* this controller's response */
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
hbs_sock.rx_mesg[iface].cluster );
if ( daemon_get_cfg_ptr()->debug_state & 4 )
string dump_banner = "" ;
if ( debug_state )
{
string dump_banner = "" ;
dump_banner.append("controller-") ;
dump_banner.append(itos(controller?0:1));
dump_banner.append(" cluster info from cache injected into controller-");
@ -980,10 +984,30 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
dump_banner.append(":");
dump_banner.append(get_iface_name_str(iface));
dump_banner.append(" pulse response");
}
if ( debug_state & 4 )
{
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner );
}
else
{
clog ("%s", dump_banner.c_str());
}
}
}
if (missing_history_count[iface])
{
ilog ("controller-%d %s providing cluster history",
controller, get_iface_name_str(iface));
missing_history_count[iface] = 0 ;
}
}
else
{
wlog_throttled ( missing_history_count[iface], 5000,
"controller-%d %s proividing no cluster history",
controller, get_iface_name_str(iface));
}
}

View File

@ -30,17 +30,17 @@ typedef struct
/* Contains the controller number (0 or 1) for this controller. */
unsigned short this_controller ;
/* Preserves which controllers are enabled. */
bool controller_0_enabled ;
bool controller_1_enabled ;
#ifdef THREE_CONTROLLER_SYSTEM
bool controller_2_enabled ;
#endif
/* Used to manage the cluster based on this and peer controller state */
bool peer_controller_enabled ;
/* Used to prevent log flooding in presence of back to back errors. */
unsigned int log_throttle ;
/* Used to log when
* - peer history goes missing (false -> true change)
* - peer history starts being received ( true -> false change ) */
bool peer_history_missing ;
/* Used to threshold storage-0 not responding state */
unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];
@ -130,12 +130,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
void hbs_cluster_ctrl_init ( void )
{
ctrl.this_controller = 0xffff ;
ctrl.controller_0_enabled = false ;
ctrl.controller_1_enabled = false ;
#ifdef THREE_CONTROLLER_SYSTEM
ctrl.controller_2_enabled = false ;
#endif
ctrl.peer_controller_enabled = false ;
ctrl.peer_history_missing = true ;
ctrl.log_throttle = 0 ;
ctrl.monitored_networks = 0 ;
ctrl.monitored_hosts = 0 ;
@ -236,64 +232,6 @@ void cluster_storage0_state ( bool enabled )
}
}
/****************************************************************************
*
* Name : hbs_manage_controller_state
*
* Description : Track the monitored enabled state of the controllers.
*
***************************************************************************/
void hbs_manage_controller_state ( string & hostname, bool enabled )
{
int controller = -1 ;
/* track controller state */
if ( hostname == CONTROLLER_0 )
{
controller = 0 ;
ctrl.controller_0_enabled = enabled ;
}
else if ( hostname == CONTROLLER_1 )
{
controller = 1 ;
ctrl.controller_1_enabled = enabled ;
}
else
{
/* ignore all other host names */
return ;
}
/* manage the state of the peer controller */
if ( ctrl.this_controller != controller )
{
/* Clear peer controller cluster history when the peer
* controller goes disabled */
if (( ctrl.peer_controller_enabled == true ) &&
( enabled == false ))
{
hbs_cluster_rem ( controller );
}
if ( enabled == false )
{
hbs_cluster_change ( "peer controller disabled" ) ;
}
else
{
hbs_cluster_change ( "peer controller enabled" ) ;
}
ctrl.peer_controller_enabled = enabled ;
}
else if ( enabled == false )
{
hbs_cluster_change ( "this controller locked" ) ;
hbs_cluster_lock();
}
}
/****************************************************************************
*
* Name : hbs_cluster_add
@ -346,8 +284,11 @@ void hbs_cluster_add ( string & hostname )
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
}
/* Manage controller state ; true means enabled in this case. */
hbs_manage_controller_state ( hostname, true );
/* Catch enable/provisioning of the peer controller */
if (( hostname == CONTROLLER_0 ) && ( ctrl.this_controller != 0 ))
ctrl.peer_controller_enabled = true ;
if (( hostname == CONTROLLER_1 ) && ( ctrl.this_controller != 1 ))
ctrl.peer_controller_enabled = true ;
}
/****************************************************************************
@ -391,9 +332,6 @@ void hbs_cluster_del ( string & hostname )
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
}
/* Manage controller state ; false means not enabled in this case. */
hbs_manage_controller_state ( hostname , false );
ilog ("%s deleted from cluster", hostname.c_str());
cluster_list ();
@ -455,7 +393,6 @@ void hbs_cluster_period_start ( void )
*
***************************************************************************/
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
bool storage_0_responding )
@ -636,7 +573,7 @@ void hbs_cluster_append ( hbs_message_type & msg )
}
msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories);
clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
}
@ -649,12 +586,22 @@ void hbs_cluster_peer ( void )
if (( ctrl.got_peer_controller_history == false ) &&
( ctrl.peer_controller_enabled == true ))
{
ilog ("missing peer controller cluster view" ); /* ERIK: DEBUG */
if ( ctrl.peer_history_missing == false )
{
wlog ( "missing peer controller cluster view" );
ctrl.peer_history_missing = true ;
}
/* if no nodes have reported peer controller history then inject
* a 0:0 value in for this pulse period for that controller. */
hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 );
}
else if (( ctrl.got_peer_controller_history == true ) &&
( ctrl.peer_controller_enabled == true ) &&
( ctrl.peer_history_missing == true ))
{
wlog ( "receiving peer controller cluster view" );
ctrl.peer_history_missing = false ;
}
}
/****************************************************************************
@ -720,7 +667,9 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
*
***************************************************************************/
void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample )
void hbs_history_save ( string hostname,
mtce_hbs_network_enum network,
mtce_hbs_cluster_history_type & sample )
{
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
@ -736,12 +685,12 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
memcpy( &ctrl.cluster.history[h], &sample,
sizeof(mtce_hbs_cluster_history_type));
clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)",
clog1 ("controller-%d vault update from controller-%d %s reply with %d histories (this:%s)",
ctrl.this_controller,
sample.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
hostname.c_str(),
ctrl.cluster.histories);
hbs_cluster_network_name(network).c_str(),
ctrl.cluster.histories,
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str());
return ;
}
}
@ -756,16 +705,17 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
ctrl.cluster.histories++ ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views",
ilog ("controller-%d added new %s:%s history to vault ; now have %d network views",
ctrl.this_controller,
sample.controller,
hostname.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
ctrl.cluster.histories);
}
void hbs_state_audit ( void )
{
hbs_cluster_dump ( ctrl.cluster, "Audit" );
if ( ctrl.monitored_hosts )
hbs_cluster_dump ( ctrl.cluster, "Audit" );
}
@ -872,7 +822,7 @@ int hbs_cluster_save ( string & hostname,
{
/* set that we got some history and save it */
ctrl.got_peer_controller_history = true ;
hbs_history_save ( hostname, msg.cluster.history[h] );
hbs_history_save ( hostname, network, msg.cluster.history[h] );
}
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
}
@ -915,39 +865,26 @@ void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enable
}
}
/****************************************************************************
*
* Name : hbs_controller_lock
*
* Description : Clear all history for this controller.
* Called when this controller is detected as locked.
*
***************************************************************************/
void hbs_cluster_rem ( unsigned short controller )
void hbs_controller_lock ( void )
{
int removed = 0 ;
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
if ( ctrl.cluster.histories )
{
if ( ctrl.cluster.history[h].controller == controller )
ilog ("controller-%d locked ; clearing all cluster info", ctrl.this_controller );
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
removed++ ;
wlog ("controller-%d %s network history removed from cluster (slot %d)",
controller,
hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(),
h );
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
}
ctrl.cluster.histories = 0 ;
hbs_cluster_change ( "this controller locked" ) ;
}
if ( removed )
{
hbs_cluster_change ( "removed controller history" ) ;
}
ctrl.cluster.histories -= removed ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
}
/* remove all cluster history on a lock operation */
void hbs_cluster_lock( void )
{
ilog ("controller-%d lock ; clearing all cluster info", ctrl.this_controller );
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
}
ctrl.cluster.histories = 0 ;
}