From 5c043f7ca94a1bf4d121c209d42687184ec58a18 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Tue, 7 May 2019 15:30:00 -0400 Subject: [PATCH] Make Mtce ignore heartbeat events from inactive controller. There is the potential for a race condition that can lead mtce to incorrectly fail hosts due to heartbeat failure event messages sourced from the inactive controller. During a split-brain recovery action scenario, a swact left the hbsAgent on the new standby controller thinking it was still on the active controller. In this specific split-brain failure mode, the controller that was active and then (after the swact) standby was failing heartbeat to its peer and to other nodes in the system, even though the new active controller saw heartbeat working fine. The problem was that the inactive controller detected and sent a heartbeat loss message to mtce before mtce was able to update the inactive controller's heartbeat activity status, which would have gated the loss event send. This update adds an additional layer of protection by intentionally ignoring heartbeat events from the inactive controller that might slip through due to this activity state change race condition. Also fixes a flooding log in the hbsAgent for big systems. 
Change-Id: I825a801166b3e80cbf67945c7f587851f4e0d90b Closes-Bug: 1813976 Signed-off-by: Eric MacDonald --- mtce/centos/build_srpm.data | 2 +- mtce/src/common/nodeClass.cpp | 2 +- mtce/src/heartbeat/hbsAgent.cpp | 4 ++-- mtce/src/maintenance/mtcCtrlMsg.cpp | 21 +++++++++++++++++++++ 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data index 86474af5..c3dfc2e7 100644 --- a/mtce/centos/build_srpm.data +++ b/mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=150 +TIS_PATCH_VER=151 BUILD_IS_SLOW=5 diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 51a5e383..4936cba8 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -8299,8 +8299,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) else { pulse_ptr->hbs_failure[iface] = true ; - hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); } + hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); pulse_ptr->hbs_failure_count[iface]++ ; } if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] ) diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index d673222e..c0b16071 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -128,9 +128,9 @@ void monitor_scheduling ( unsigned long long & this_time, unsigned long long & p this_time = gettime_monotonic_nsec () ; if ( label_ptr && strncmp ( label_ptr, NODEUTIL_LATENCY_MON_START, strlen(NODEUTIL_LATENCY_MON_START))) { - if ( ! strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > 10 )) + if ( ! 
strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > (int)hostname_inventory.size() )) { - ilog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld ); + wlog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld ); } else if ( this_time > (prev_time + (NSEC_TO_MSEC*(hbs_config.latency_thld)))) { diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 78270b53..6cbf86b1 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -833,6 +833,27 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) return (RETRY) ; } + string hostaddr = sock_ptr->mtc_event_rx_sock->get_src_str(); + string hostname = obj_ptr->get_hostname ( hostaddr ) ; + if ( hostname.empty() ) + { + wlog ("%s ignoring service event from unknown host (%s)", + obj_ptr->my_hostname.c_str(), hostaddr.c_str()); + return (PASS); + } + if (( hostname != obj_ptr->my_hostname ) && + (( msg.cmd == MTC_EVENT_HEARTBEAT_LOSS ) || + ( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_SET ) || + ( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_CLR ) || + ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET )|| + ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_CLR ))) + { + wlog ("%s %s from %s heartbeat service", + &msg.buf[0], + get_mtcNodeCommand_str(msg.cmd), + hostname.c_str()); + return (PASS); + } if ( msg.cmd == MTC_EVENT_LOOPBACK ) { const char * event_hdr_ptr = get_loopback_header() ;