Merge "Add mtcAgent support for sm_node_unhealthy condition"

This commit is contained in:
Zuul 2019-10-16 19:15:03 +00:00 committed by Gerrit Code Review
commit 069daf1e22
4 changed files with 40 additions and 6 deletions

View File

@ -376,6 +376,7 @@ void mtc_stages_init ( void )
enableStages_str [MTC_ENABLE__STAGES ] = "unknown" ;
recoveryStages_str[MTC_RECOVERY__START ] = "Handler-Start";
recoveryStages_str[MTC_RECOVERY__RETRY_WAIT ] = "Req-Retry-Wait";
recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE ] = "Req-MtcAlive";
recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE_WAIT ] = "Req-MtcAlive-Wait";
recoveryStages_str[MTC_RECOVERY__RESET_RECV_WAIT ] = "Reset-Recv-Wait";

View File

@ -936,10 +936,11 @@ typedef enum
string get_delStages_str ( mtc_delStages_enum stage );
#define MTC_MAX_FAST_ENABLES (2)
#define MTC_MAX_FAST_ENABLES (3)
typedef enum
{
MTC_RECOVERY__START = 0,
MTC_RECOVERY__RETRY_WAIT,
MTC_RECOVERY__REQ_MTCALIVE,
MTC_RECOVERY__REQ_MTCALIVE_WAIT,
MTC_RECOVERY__RESET_RECV_WAIT,

View File

@ -1173,6 +1173,9 @@ int _self_provision ( void )
return(rc);
}
/* Throttle counter handed to elog_throttled for the "SM Unhealthy" log
 * in nodeLinkClass::fsm ; fsm resets it to 0 once the local node no
 * longer has MTC_FLAG__SM_UNHEALTHY set. */
static int sm_unhealthy_log_throttle = 0 ;
/* Throttle value passed with the counter above to elog_throttled.
 * NOTE(review): presumably the number of calls between emitted logs -
 * confirm against the elog_throttled definition. */
#define SM_UNHEALTHY_LOG_THROTTLE (100)
/* Main FSM Loop */
void nodeLinkClass::fsm ( void )
{
@ -1181,7 +1184,23 @@ void nodeLinkClass::fsm ( void )
int rc ;
daemon_signal_hdlr ();
this->uptime_handler ();
for ( struct node * node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
/* Controller HA Improvements Feature
* Handle the case where SM reports this (self) node as unhealthy.
* If the active controller is unhealthy then stop doing
* work while it is in this state. Allow for self healing */
struct node * node_ptr = nodeLinkClass::getNode ( this->my_hostname ) ;
if ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY )
{
elog_throttled (sm_unhealthy_log_throttle, SM_UNHEALTHY_LOG_THROTTLE,
"%s SM Unhealthy ; wait on health recovery or process shutdown",
node_ptr->hostname.c_str());
return ;
}
if ( sm_unhealthy_log_throttle )
sm_unhealthy_log_throttle = 0 ;
for ( node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
{
string hn = node_ptr->hostname ;
rc = fsm ( node_ptr ) ;

View File

@ -1685,16 +1685,29 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
/* TODO: Consider taking this log out as writing to the database
* during a fast graceful recovery might not be the best idea */
wlog ("%s Graceful Recovery (%d of %d)\n",
node_ptr->hostname.c_str(),
node_ptr->graceful_recovery_counter,
MTC_MAX_FAST_ENABLES );
if ( node_ptr->graceful_recovery_counter > 1 )
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" );
else
mtcInvApi_update_task ( node_ptr, "Graceful Recovery");
/* need to force a 2 second wait if we are in the
* graceful recovery retry so that we honor the 5
* second grace period */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 );
recoveryStageChange ( node_ptr, MTC_RECOVERY__RETRY_WAIT ) ;
}
break ;
}
case MTC_RECOVERY__RETRY_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
{
recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ;
}
break ;
}
case MTC_RECOVERY__REQ_MTCALIVE: