Add mtcAgent support for sm_node_unhealthy condition

When heartbeat over both networks fails, mtcAgent
provides a 5 second grace period for heartbeat to
recover before failing the node.

However, when heartbeat fails over only one of the
networks (management or cluster), mtcAgent does not
honour that 5 second grace period; this is a bug.

For peer controller heartbeat failure handling, SM
needs that 5 second grace period to perform a swact
before mtcAgent declares the peer controller failed,
resets the node and updates the database.

This update forces a 2 second wait between each fast
enable and fixes the fast enable threshold count to
the intended 3 retries. This ensures that at least
5 seconds, actually 6 in the single network heartbeat
loss case, pass before the node is declared failed.
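
For illustration only (this arithmetic is not part of the
change itself, and MTC_SECS_2 is assumed to equal 2 seconds):

    /* Minimal sketch of the worst case timing. Each of the 3 fast
     * enable attempts is now separated by a 2 second retry wait, so
     * a single network heartbeat loss takes at least 3 * 2 = 6
     * seconds before the node can be declared failed. The variable
     * name below is illustrative only. */
    #define MTC_MAX_FAST_ENABLES (3)
    #define MTC_SECS_2           (2)
    static const int minimum_failure_declare_secs =
        MTC_MAX_FAST_ENABLES * MTC_SECS_2 ; /* 6 ; covers SM's 5 second need */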

In addition, a special condition is added to detect
when the active controller is sm_node_unhealthy and
to stop work while it remains in that state. mtcAgent
must not make any database updates while in this
failure mode. This gives SM the time to handle the
failure according to the system's controller high
availability handling feature.
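
A minimal standalone sketch of that guard follows; the helper
name is hypothetical and only illustrates the flag test that the
fsm() hunk below performs inline on the node's mtce_flags:

    /* Hypothetical helper for illustration only. It assumes the
     * MTC_FLAG__SM_UNHEALTHY bit definition used in the hunk below
     * is in scope. While the active controller carries this flag,
     * mtcAgent skips all node work, including database updates,
     * until the flag clears or the process is shut down. */
    static bool self_is_sm_unhealthy ( unsigned int mtce_flags )
    {
        return (( mtce_flags & MTC_FLAG__SM_UNHEALTHY ) != 0 ) ;
    }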

Test Plan:

PASS: Verify mtcAgent behavior on set and clear of
      SM node unhealthy state.
PASS: Verify SM has at least 5 seconds to shut down
      mtcAgent when heartbeat to peer controller fails
      for one or both networks.
PASS: Test the real failure scenario using a link pull.
PASS: Verify logging in the presence of a real failure condition.

Change-Id: I8f8d6688040fe899aff6fc40aadda37894c2d5e9
Closes-Bug: 1847657
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>

@@ -376,6 +376,7 @@ void mtc_stages_init ( void )
enableStages_str [MTC_ENABLE__STAGES ] = "unknown" ;
recoveryStages_str[MTC_RECOVERY__START ] = "Handler-Start";
recoveryStages_str[MTC_RECOVERY__RETRY_WAIT ] = "Req-Retry-Wait";
recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE ] = "Req-MtcAlive";
recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE_WAIT ] = "Req-MtcAlive-Wait";
recoveryStages_str[MTC_RECOVERY__RESET_RECV_WAIT ] = "Reset-Recv-Wait";

@@ -936,10 +936,11 @@ typedef enum
string get_delStages_str ( mtc_delStages_enum stage );
#define MTC_MAX_FAST_ENABLES (2)
#define MTC_MAX_FAST_ENABLES (3)
typedef enum
{
MTC_RECOVERY__START = 0,
MTC_RECOVERY__RETRY_WAIT,
MTC_RECOVERY__REQ_MTCALIVE,
MTC_RECOVERY__REQ_MTCALIVE_WAIT,
MTC_RECOVERY__RESET_RECV_WAIT,

@@ -1173,6 +1173,9 @@ int _self_provision ( void )
return(rc);
}
static int sm_unhealthy_log_throttle = 0 ;
#define SM_UNHEALTHY_LOG_THROTTLE (100)
/* Main FSM Loop */
void nodeLinkClass::fsm ( void )
{
@@ -1181,7 +1184,23 @@
int rc ;
daemon_signal_hdlr ();
this->uptime_handler ();
for ( struct node * node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
/* Controller HA Improvements Feature
* Handle the SM unhealthy of self case.
* If the active controller is unhealthy then stop doing
* work while it's in this state. Allow for self healing */
struct node * node_ptr = nodeLinkClass::getNode ( this->my_hostname ) ;
if ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY )
{
elog_throttled (sm_unhealthy_log_throttle, SM_UNHEALTHY_LOG_THROTTLE,
"%s SM Unhealthy ; wait on health recovery or process shutdown",
node_ptr->hostname.c_str());
return ;
}
if ( sm_unhealthy_log_throttle )
sm_unhealthy_log_throttle = 0 ;
for ( node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
{
string hn = node_ptr->hostname ;
rc = fsm ( node_ptr ) ;
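
The throttled log in the hunk above follows the usual mtce logging
pattern; a rough standalone approximation (not the actual
elog_throttled macro) is:

    /* Rough approximation for illustration only ; the real
     * elog_throttled macro is part of the mtce common logging code.
     * The intent: emit the first occurrence, then only every Nth
     * repeat, so a persistent SM-unhealthy condition cannot flood
     * the log. */
    #include <cstdio>

    #define SM_UNHEALTHY_LOG_THROTTLE (100)
    static int sm_unhealthy_log_throttle = 0 ;

    static void log_sm_unhealthy ( const char * hostname )
    {
        if (( sm_unhealthy_log_throttle++ % SM_UNHEALTHY_LOG_THROTTLE ) == 0 )
            fprintf ( stderr, "%s SM Unhealthy ; wait on health recovery "
                              "or process shutdown\n", hostname ) ;
    }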


@@ -1685,16 +1685,29 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
/* TODO: Consider taking this log out as writing to the database
* during a fast graceful recovery might not be the best idea */
wlog ("%s Graceful Recovery (%d of %d)\n",
node_ptr->hostname.c_str(),
node_ptr->graceful_recovery_counter,
MTC_MAX_FAST_ENABLES );
if ( node_ptr->graceful_recovery_counter > 1 )
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" );
else
mtcInvApi_update_task ( node_ptr, "Graceful Recovery");
/* need to force a 2 second wait if we are in the
* graceful recovery retry so that we honor the 5
* second grace period */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 );
recoveryStageChange ( node_ptr, MTC_RECOVERY__RETRY_WAIT ) ;
}
break ;
}
case MTC_RECOVERY__RETRY_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
{
recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ;
}
break ;
}
case MTC_RECOVERY__REQ_MTCALIVE: