From f01fd854702d0c8654b9e441315551b28a0a66bc Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Sat, 28 Sep 2019 19:41:53 -0400 Subject: [PATCH] Fix MNFA recovery race condition that leads to stuck degrade Seeing from 0 to 10% of hosts get stuck in the degrade state after MNFA recovery. Clearing host degrade on Multi-Node Failure Avoidance (MNFA) recovery does not send degrade clear but does clear the hbs control states. Instead it relies on explicit events from hbsAgent per host/network to do so. If the MNFA Recovery (exit) event occurs before all hbsAgent clear messages arrive then the hbs control clear tricks the mtcAgent into thinking that there was no degrade event active when it actually may still be. This fix enables the clear option in the mon_host MNFA Recovery call so that the host's degrade condition is cleared. It also removes the unnecessary heartbeat disable call. Test Plan: PASS: soak MNFA in large system over and over to verify a 0-10% stuck degrade occurrence rate drops to 0 after many (more than 20) occurrences. Regression: PASS: Verify heartbeat. PASS: Verify single node graceful recovery. 
Change-Id: I699a376af5a95cc8dcc6ea5cc8266dc14fbacd09 Closes-Bug: 1845344 Signed-off-by: Eric MacDonald --- mtce/src/common/nodeClass.cpp | 16 ++-------------- mtce/src/heartbeat/hbsAgent.cpp | 9 ++++----- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 3bda323a..2ced49dc 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -7516,16 +7516,12 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) { - bool want_log = true ; for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { if ( iface == CLSTR_IFACE ) { if ( this->clstr_network_provisioned == false ) continue ; - - if ( node_ptr->monitor[MGMNT_IFACE] == true_false ) - want_log = false ; } if ( send_clear == true ) @@ -7536,11 +7532,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen if ( true_false == true ) { - if ( want_log ) - { - ilog ("%s starting heartbeat service \n", - hostname.c_str()); - } + ilog ("%s heartbeat start", hostname.c_str()); node_ptr->no_work_log_throttle = 0 ; node_ptr->b2b_misses_count[iface] = 0 ; node_ptr->hbs_misses_count[iface] = 0 ; @@ -7552,11 +7544,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen } else { - if ( want_log ) - { - ilog ("%s stopping heartbeat service\n", - hostname.c_str()); - } + ilog ("%s heartbeat stop", hostname.c_str()); } node_ptr->monitor[iface] = true_false ; } diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index 4ee52d13..9e4761d0 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -1963,11 +1963,11 @@ void daemon_service_run ( void ) hbsInv.mon_host ( hostname, true, true ); } } - else if ( msg.cmd == MTC_RESTART_HBS ) + else if (( msg.cmd == MTC_RESTART_HBS ) && + ( hostname != hbsInv.my_hostname )) { - 
hbsInv.mon_host ( hostname, false, false ); - hbsInv.mon_host ( hostname, true, false ); - ilog ("%s restarting heartbeat service\n", hostname.c_str()); + hbsInv.mon_host ( hostname, true, true ); + ilog ("%s heartbeat restart", hostname.c_str()); hbsInv.print_node_info(); } else if ( msg.cmd == MTC_RECOVER_HBS ) @@ -1978,7 +1978,6 @@ void daemon_service_run ( void ) } else if ( msg.cmd == MTC_BACKOFF_HBS ) { - hbsInv.hbs_pulse_period = (hbsInv.hbs_pulse_period_save * HBS_BACKOFF_FACTOR) ; ilog ("%s starting heartbeat backoff (period:%d msecs)\n", hostname.c_str(), hbsInv.hbs_pulse_period ); hbs_cluster_change ( "backoff" );