Make mtce ignore heartbeat events from the inactive controller.

There is the potential for a race condition that can lead to mtce incorrectly failing hosts due to heartbeat failure event messages sourced from the inactive controller. During a split-brain recovery scenario, a swact left the hbsAgent on the new standby controller thinking it was still on the active controller. In this specific split-brain failure mode, the controller that was active and then (after the swact) standby was failing heartbeat to its peer and to other nodes in the system, even though the new active controller saw heartbeat working fine. The problem was that the inactive controller detected and sent a heartbeat loss message to mtce before mtce was able to update the inactive controller's heartbeat activity status, which would have gated the loss event send. This update adds an additional layer of protection by intentionally ignoring heartbeat events from the inactive controller that might slip through due to this activity state change race condition. Also fixed a flooding log in the hbsAgent for big systems. Change-Id: I825a801166b3e80cbf67945c7f587851f4e0d90b Closes-Bug: 1813976 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2019-05-07 15:30:00 -04:00 · 2019-05-07 15:30:00 -04:00 · 5c043f7ca9
parent c46e7d1a2c
commit 5c043f7ca9
4 changed files with 25 additions and 4 deletions
--- a/mtce/centos/build_srpm.data
+++ b/mtce/centos/build_srpm.data
@@ -1,3 +1,3 @@
 SRC_DIR="src"
-TIS_PATCH_VER=150
+TIS_PATCH_VER=151
 BUILD_IS_SLOW=5
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -8299,8 +8299,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                else
                {
                    pulse_ptr->hbs_failure[iface] = true ;
-                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
                }
+                hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
                pulse_ptr->hbs_failure_count[iface]++ ;
            }
            if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@@ -128,9 +128,9 @@ void monitor_scheduling ( unsigned long long & this_time, unsigned long long & p
    this_time = gettime_monotonic_nsec () ;
    if ( label_ptr && strncmp ( label_ptr, NODEUTIL_LATENCY_MON_START, strlen(NODEUTIL_LATENCY_MON_START)))
    {
-        if ( ! strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > 10 ))
+        if ( ! strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > (int)hostname_inventory.size() ))
        {
-            ilog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld );
+            wlog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld );
        }
        else if ( this_time > (prev_time + (NSEC_TO_MSEC*(hbs_config.latency_thld))))
        {
--- a/mtce/src/maintenance/mtcCtrlMsg.cpp
+++ b/mtce/src/maintenance/mtcCtrlMsg.cpp
@@ -833,6 +833,27 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
        return (RETRY) ;
    }

+    string hostaddr = sock_ptr->mtc_event_rx_sock->get_src_str();
+    string hostname = obj_ptr->get_hostname ( hostaddr ) ;
+    if ( hostname.empty() )
+    {
+        wlog ("%s ignoring service event from unknown host (%s)",
+                obj_ptr->my_hostname.c_str(), hostaddr.c_str());
+        return (PASS);
+    }
+    if (( hostname != obj_ptr->my_hostname ) &&
+        (( msg.cmd == MTC_EVENT_HEARTBEAT_LOSS )       ||
+         ( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_SET )  ||
+         ( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_CLR )  ||
+         ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET )||
+         ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_CLR )))
+    {
+        wlog ("%s %s from %s heartbeat service",
+                &msg.buf[0],
+                get_mtcNodeCommand_str(msg.cmd),
+                hostname.c_str());
+        return (PASS);
+    }
    if ( msg.cmd == MTC_EVENT_LOOPBACK )
    {
        const char * event_hdr_ptr = get_loopback_header() ;