Make Mtce ignore heartbeat events from the inactive controller.

There is the potential for a race condition that can lead to
mtce incorrectly failing hosts due to heartbeat failure event
messages sourced from the inactive controller.

During a split-brain recovery scenario, a swact left the hbsAgent
on the new standby controller thinking it was still running on the
active controller.

In this specific split-brain failure mode, the controller that was
active before the swact (and standby after it) was failing heartbeat
to its peer and to other nodes in the system, even though the new
active controller saw heartbeat working fine.

The problem is that the inactive controller detected the loss and
sent a heartbeat loss message to mtce before mtce was able to update
that controller's heartbeat activity status, which would have gated
sending the loss event.

This update adds an additional layer of protection by intentionally
ignoring heartbeat events from the inactive controller that might
slip through due to this activity state change race condition.

Also fixed a log that was flooding from the hbsAgent on large systems.

Change-Id: I825a801166b3e80cbf67945c7f587851f4e0d90b
Closes-Bug: 1813976
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2019-05-07 15:30:00 -04:00 committed by Al Bailey
parent c46e7d1a2c
commit 5c043f7ca9
4 changed files with 25 additions and 4 deletions

View File

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=150
TIS_PATCH_VER=151
BUILD_IS_SLOW=5

View File

@ -8299,8 +8299,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
else
{
pulse_ptr->hbs_failure[iface] = true ;
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
pulse_ptr->hbs_failure_count[iface]++ ;
}
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )

View File

@ -128,9 +128,9 @@ void monitor_scheduling ( unsigned long long & this_time, unsigned long long & p
this_time = gettime_monotonic_nsec () ;
if ( label_ptr && strncmp ( label_ptr, NODEUTIL_LATENCY_MON_START, strlen(NODEUTIL_LATENCY_MON_START)))
{
if ( ! strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > 10 ))
if ( ! strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > (int)hostname_inventory.size() ))
{
ilog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld );
wlog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld );
}
else if ( this_time > (prev_time + (NSEC_TO_MSEC*(hbs_config.latency_thld))))
{

View File

@ -833,6 +833,27 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
return (RETRY) ;
}
string hostaddr = sock_ptr->mtc_event_rx_sock->get_src_str();
string hostname = obj_ptr->get_hostname ( hostaddr ) ;
if ( hostname.empty() )
{
wlog ("%s ignoring service event from unknown host (%s)",
obj_ptr->my_hostname.c_str(), hostaddr.c_str());
return (PASS);
}
if (( hostname != obj_ptr->my_hostname ) &&
(( msg.cmd == MTC_EVENT_HEARTBEAT_LOSS ) ||
( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_SET ) ||
( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_CLR ) ||
( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET )||
( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_CLR )))
{
wlog ("%s %s from %s heartbeat service",
&msg.buf[0],
get_mtcNodeCommand_str(msg.cmd),
hostname.c_str());
return (PASS);
}
if ( msg.cmd == MTC_EVENT_LOOPBACK )
{
const char * event_hdr_ptr = get_loopback_header() ;