Fix MNFA recovery race condition that leads to stuck degrade

Between 0 and 10% of hosts are seen to get stuck in the degrade
state after MNFA recovery.

Clearing host degrade on Multi-Node Failure Avoidance (MNFA)
recovery does not send a degrade clear; it only clears the hbs
control states and instead relies on explicit per host/network
clear events from the hbsAgent to do so.

If the MNFA Recovery (exit) event occurs before all of the
hbsAgent clear messages arrive, the hbs control clear tricks the
mtcAgent into thinking that no degrade event is active when one
may still be, so the host's degrade condition is never cleared.
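
For reference, the mon_host interface as it appears in the diff
hunks below; the parameter comments are illustrative, inferred
from the surrounding code rather than taken from its definition:

    int nodeLinkClass::mon_host ( const string & hostname,
                                  bool true_false,   /* true=start, false=stop monitoring   */
                                  bool send_clear ); /* true=also send a degrade clear event */

With send_clear=false only the hbs control state is reset and no
degrade clear is issued for the host.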

This fix enables the clear option on the mon_host call made for
MNFA Recovery so that the host's degrade condition is cleared.
It also removes the unnecessary heartbeat disable call.
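
Sketch of the net effect in the hbsAgent message handler, based
on the daemon_service_run hunk below (simplified, not the full
handler):

    /* Before: stop then restart monitoring, never sending a clear */
    hbsInv.mon_host ( hostname, false, false );
    hbsInv.mon_host ( hostname, true,  false );

    /* After: a single restart with send_clear=true so the host's
     * degrade condition is cleared; requests that target this
     * controller itself are skipped */
    if ( hostname != hbsInv.my_hostname )
        hbsInv.mon_host ( hostname, true, true );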

Test Plan:

PASS: Soak MNFA repeatedly in a large system to verify that the
      0-10% stuck degrade occurrence rate drops to 0 over many
      (more than 20) occurrences.

Regression:

PASS: Verify heartbeat.
PASS: Verify single node graceful recovery.

Change-Id: I699a376af5a95cc8dcc6ea5cc8266dc14fbacd09
Closes-Bug: 1845344
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>

@@ -7516,16 +7516,12 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool send_clear )
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
bool want_log = true ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
if ( iface == CLSTR_IFACE )
{
if ( this->clstr_network_provisioned == false )
continue ;
if ( node_ptr->monitor[MGMNT_IFACE] == true_false )
want_log = false ;
}
if ( send_clear == true )
@@ -7536,11 +7532,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool send_clear )
if ( true_false == true )
{
if ( want_log )
{
ilog ("%s starting heartbeat service \n",
hostname.c_str());
}
ilog ("%s heartbeat start", hostname.c_str());
node_ptr->no_work_log_throttle = 0 ;
node_ptr->b2b_misses_count[iface] = 0 ;
node_ptr->hbs_misses_count[iface] = 0 ;
@@ -7552,11 +7544,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool send_clear )
}
else
{
if ( want_log )
{
ilog ("%s stopping heartbeat service\n",
hostname.c_str());
}
ilog ("%s heartbeat stop", hostname.c_str());
}
node_ptr->monitor[iface] = true_false ;
}

@@ -1963,11 +1963,11 @@ void daemon_service_run ( void )
hbsInv.mon_host ( hostname, true, true );
}
}
else if ( msg.cmd == MTC_RESTART_HBS )
else if (( msg.cmd == MTC_RESTART_HBS ) &&
( hostname != hbsInv.my_hostname ))
{
hbsInv.mon_host ( hostname, false, false );
hbsInv.mon_host ( hostname, true, false );
ilog ("%s restarting heartbeat service\n", hostname.c_str());
hbsInv.mon_host ( hostname, true, true );
ilog ("%s heartbeat restart", hostname.c_str());
hbsInv.print_node_info();
}
else if ( msg.cmd == MTC_RECOVER_HBS )
@@ -1978,7 +1978,6 @@ void daemon_service_run ( void )
}
else if ( msg.cmd == MTC_BACKOFF_HBS )
{
hbsInv.hbs_pulse_period = (hbsInv.hbs_pulse_period_save * HBS_BACKOFF_FACTOR) ;
ilog ("%s starting heartbeat backoff (period:%d msecs)\n", hostname.c_str(), hbsInv.hbs_pulse_period );
hbs_cluster_change ( "backoff" );