Merge "Add mtcAgent support for sm_node_unhealthy condition"
This commit is contained in:
commit
069daf1e22
|
@ -376,6 +376,7 @@ void mtc_stages_init ( void )
|
|||
enableStages_str [MTC_ENABLE__STAGES ] = "unknown" ;
|
||||
|
||||
recoveryStages_str[MTC_RECOVERY__START ] = "Handler-Start";
|
||||
recoveryStages_str[MTC_RECOVERY__RETRY_WAIT ] = "Req-Retry-Wait";
|
||||
recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE ] = "Req-MtcAlive";
|
||||
recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE_WAIT ] = "Req-MtcAlive-Wait";
|
||||
recoveryStages_str[MTC_RECOVERY__RESET_RECV_WAIT ] = "Reset-Recv-Wait";
|
||||
|
|
|
@ -936,10 +936,11 @@ typedef enum
|
|||
string get_delStages_str ( mtc_delStages_enum stage );
|
||||
|
||||
|
||||
#define MTC_MAX_FAST_ENABLES (2)
|
||||
#define MTC_MAX_FAST_ENABLES (3)
|
||||
typedef enum
|
||||
{
|
||||
MTC_RECOVERY__START = 0,
|
||||
MTC_RECOVERY__RETRY_WAIT,
|
||||
MTC_RECOVERY__REQ_MTCALIVE,
|
||||
MTC_RECOVERY__REQ_MTCALIVE_WAIT,
|
||||
MTC_RECOVERY__RESET_RECV_WAIT,
|
||||
|
|
|
@ -1173,6 +1173,9 @@ int _self_provision ( void )
|
|||
return(rc);
|
||||
}
|
||||
|
||||
static int sm_unhealthy_log_throttle = 0 ;
|
||||
#define SM_UNHEALTHY_LOG_THROTTLE (100)
|
||||
|
||||
/* Main FSM Loop */
|
||||
void nodeLinkClass::fsm ( void )
|
||||
{
|
||||
|
@ -1181,7 +1184,23 @@ void nodeLinkClass::fsm ( void )
|
|||
int rc ;
|
||||
daemon_signal_hdlr ();
|
||||
this->uptime_handler ();
|
||||
for ( struct node * node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
|
||||
|
||||
/* Controller HA Improvements Feature
|
||||
* Handle the SM unhealthy of self case.
|
||||
* If the active controller is unhealthy then stop doing
|
||||
* work while it's in this state. Allow for self healing */
|
||||
struct node * node_ptr = nodeLinkClass::getNode ( this->my_hostname ) ;
|
||||
if ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY )
|
||||
{
|
||||
elog_throttled (sm_unhealthy_log_throttle, SM_UNHEALTHY_LOG_THROTTLE,
|
||||
"%s SM Unhealthy ; wait on health recovery or process shutdown",
|
||||
node_ptr->hostname.c_str());
|
||||
return ;
|
||||
}
|
||||
if ( sm_unhealthy_log_throttle )
|
||||
sm_unhealthy_log_throttle = 0 ;
|
||||
|
||||
for ( node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
|
||||
{
|
||||
string hn = node_ptr->hostname ;
|
||||
rc = fsm ( node_ptr ) ;
|
||||
|
|
|
@ -1685,16 +1685,29 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
else
|
||||
{
|
||||
/* TODO: Consider taking this log out as writing to the database
|
||||
* during a fast graceful recovery might not be the best idea */
|
||||
wlog ("%s Graceful Recovery (%d of %d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->graceful_recovery_counter,
|
||||
MTC_MAX_FAST_ENABLES );
|
||||
|
||||
if ( node_ptr->graceful_recovery_counter > 1 )
|
||||
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" );
|
||||
else
|
||||
mtcInvApi_update_task ( node_ptr, "Graceful Recovery");
|
||||
|
||||
/* need to force a 2 second wait if we are in the
|
||||
* graceful recovery retry so that we honor the 5
|
||||
* second grace period */
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 );
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__RETRY_WAIT ) ;
|
||||
}
|
||||
break ;
|
||||
}
|
||||
case MTC_RECOVERY__RETRY_WAIT:
|
||||
{
|
||||
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
||||
{
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ;
|
||||
}
|
||||
|
||||
break ;
|
||||
}
|
||||
case MTC_RECOVERY__REQ_MTCALIVE:
|
||||
|
|
Loading…
Reference in New Issue