Mtce: Implement all token fetches as non-blocking operations.

Fetching a keystone token is seen to take a long time When the system
is overloaded. Such delay, due to overload, is most often seen over a
Swact in All-In-One Duplex (AIO-DX) configuration.

Any blocking call that takes a long time to complete can cause a process
stall. If that stall is sufficiently long then the command will either
finish on or before the timeout. The timeout is currently set to 15
seconds which is comparable to the SM monitored audit timeout.

Although rare, then race condition does exist and if there are 2
timeouts within 2 minutes then SM fails the process and triggers a
swact.

Rather than tune the timeouts, this update implements a more robust
fix by making all token fetches ; both 'initial token get' and
'runtime token refresh' as non-blocking.

The change applies to both mtcAgent and hwmond ; both of which need
to get and manage their own authentication token.

Change-Id: I2730b76ae78daec4b9edeaff5c1ca614b75ab52c
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2018-03-12 15:18:00 -04:00
parent 0328da92d2
commit 08e66abb1b
9 changed files with 334 additions and 114 deletions

View File

@ -103,6 +103,8 @@
#define FIT_CODE__NO_PULSE_REQUEST (15)
#define FIT_CODE__NO_PULSE_RESPONSE (16)
#define FIT_CODE__TOKEN (17)
#define FIT_CODE__FAST_PING_AUDIT_HOST (20)
#define FIT_CODE__FAST_PING_AUDIT_ALL (21)

View File

@ -911,7 +911,7 @@ int httpUtil_api_request ( libEvent & event )
event.status = RETRY ;
if ( event.blocking == true )
{
hlog ("%s Requested (blocking) (to:%d)\n", event.log_prefix.c_str(), event.timeout);
hlog ("%s Requested (blocking) (timeout:%d secs)\n", event.log_prefix.c_str(), event.timeout);
/* Send the message with timeout */
event_base_dispatch(event.base);
@ -920,7 +920,7 @@ int httpUtil_api_request ( libEvent & event )
}
else if ( event.request == KEYSTONE_GET_TOKEN )
{
httpUtil_event_info (event);
hlog ("%s Requested (non-blocking) (timeout:%d secs)\n", event.log_prefix.c_str(), event.timeout);
event.active = true ;
event.status = event_base_loop(event.base, EVLOOP_NONBLOCK);
httpUtil_latency_log ( event, label.c_str(), __LINE__, MAX_DELAY_B4_LATENCY_LOG ); /* Should be immediate ; non blocking */
@ -929,7 +929,7 @@ int httpUtil_api_request ( libEvent & event )
}
else
{
hlog ("%s Requested (blocking) (to:%d)\n", event.log_prefix.c_str(), event.timeout );
hlog ("%s Requested (blocking) (timeout:%d secs)\n", event.log_prefix.c_str(), event.timeout );
event_base_dispatch(event.base);
httpUtil_latency_log ( event, label.c_str(), __LINE__, MAX_DELAY_B4_LATENCY_LOG ) ;
goto httpUtil_api_request_done ;

View File

@ -46,7 +46,7 @@ using namespace std;
#define HTTP_SYSINV_CRIT_TIMEOUT (20)
#define HTTP_SYSINV_NONC_TIMEOUT (10)
#define HTTP_TOKEN_TIMEOUT (15)
#define HTTP_TOKEN_TIMEOUT (30)
#define HTTP_KEYSTONE_GET_TIMEOUT (10)
#define HTTP_SMGR_TIMEOUT (20)
#define HTTP_VIM_TIMEOUT (20)

View File

@ -30,6 +30,7 @@
#include "tokenUtil.h" /* for ... this module header */
#define GET_SERVICE_LIST_LABEL ((const char *)"OS-KSADM:services")
#define TOKEN_REFRESH_RETRY_DELAY (5)
/* The static token used for authentication by any
* daemon that includes this module */
@ -37,6 +38,301 @@ static keyToken_type __token__ ;
keyToken_type * tokenUtil_get_ptr ( void ) { return &__token__ ; };
keyToken_type tokenUtil_get_token ( void ) { return __token__ ; };
/* hold off for TOKEN_REFRESH_RETRY_DELAY seconds before trying again. */
static int __retries = 0 ;
static void __retry_holdoff( int delay )
{
for ( int i = 0 ; i < delay ; i++ )
{
daemon_signal_hdlr ();
__retries++ ;
sleep (1);
}
}
/***************************************************************************
*
* Name : tokenUtil_get_first
*
* Description: Get the first token. Should only be called at process startup
* time before entering the main loop.
*
* Assumptions: The token request is non-blocking but this interface does not
* return until a token has been acquired.
* The signal handler is serviced.
*
* Behavior : Issue non-blocking token request. If the request fails
* immediately the the far end did not connect and we backoff
* for RETRY_DELAY seconds. With a successful send we loop for
* up to 2 seconds longer than the specified timeout, Any failure
* results in a retry after RETRY_DELAY seconds.
* Return as soon as we get a token.
*
* Returns : nothing
*
***************************************************************************/
void tokenUtil_get_first ( libEvent & event, string & hostname )
{
int rc = FAIL ; /* error staus local variable */
bool got_token = false ; /* exit criteria when true */
bool blocking = false ; /* token request is non_blocking */
int log_throttle = 0 ; /* control var to prevent log flooding */
__retries = 0 ; /* count how long it took to get the token */
dlog ("%s Requesting initial token\n", hostname.c_str());
do
{
log_throttle = 0 ;
__token__.token.clear(); /* start with an empty token */
/* Issue the token request in non-blocking form */
rc = tokenUtil_new_token ( event, hostname, blocking );
#ifdef WANT_FIT_TESTING
string data = "" ;
if ( daemon_want_fit ( FIT_CODE__TOKEN , hostname, "fail_request", data ))
rc = atoi(data.data()) ;
#endif
if ( rc == PASS )
{
for ( int i = 0 ; i < (HTTP_TOKEN_TIMEOUT+2) ; i++ )
{
daemon_signal_hdlr ();
/* Look for the response */
if ( event.base )
{
/* look for the response */
event_base_loop( event.base, EVLOOP_NONBLOCK );
/* response is received once the active state is false */
if ( event.active )
{
ilog_throttled ( log_throttle, 20,
"%s waiting on token request completion loop:%d\n",
hostname.c_str(), i);
__retries++ ;
sleep (1);
}
else
{
#ifdef WANT_FIT_TESTING
string data = "" ;
if ( daemon_want_fit ( FIT_CODE__TOKEN, hostname, "timeout", data ))
{
event.status = atoi(data.data());
__token__.token.clear();
}
#endif
if ( event.status == PASS )
{
ilog ("%s got token after %d seconds\n",
hostname.c_str(),
__retries );
got_token = true ;
break ; /* will exit below if there is a token */
}
else
{
/* report warning for timeout and error otherwise */
if ( event.status == FAIL_TIMEOUT )
{
wlog ("%s token request timeout after %d seconds\n",
hostname.c_str(),
__retries);
}
else
{
elog ("%s token request failed after %d seconds (rc:%d)",
hostname.c_str(),
__retries,
event.status );
}
/* check connection pointers ; for debug and failure rca */
if ( ( event.base == NULL ) || ( event.conn == NULL ))
{
wlog ("%s ... base:%p conn:%p retries:%d\n",
hostname.c_str(),
event.base,
event.conn,
__retries);
}
break ; /* will retry below */
} /* end else status fail */
} /* end else no active */
}
else
{
/* should not get here with a null base pointer
* but if we do then break and try again. */
slog ("%s unexpected null event base (%d) - retrying\n",
hostname.c_str(),
__retries );
break ; /* retry below */
}
}
/* Check for a response string */
if ( __token__.token.empty() )
{
elog ("%s no token ; %d second hold-off before retry\n",
hostname.c_str(), TOKEN_REFRESH_RETRY_DELAY);
__retry_holdoff(TOKEN_REFRESH_RETRY_DELAY);
}
}
else
{
elog ("%s token request failed (rc:%d:%d) active:%d\n",
hostname.c_str(),
rc,
event.status,
event.active );
__retry_holdoff(TOKEN_REFRESH_RETRY_DELAY);
}
httpUtil_free_conn ( event );
httpUtil_free_base ( event );
} while (( __token__.token.empty() ) || ( got_token == false )) ;
dlog ("%s took %d seconds to get token\n", hostname.c_str(), __retries );
/* wait 5 seconds for sysinv to be ready if the number of retries > 1 */
if ( __retries > 1 )
{
__retry_holdoff (3);
}
}
/****************************************************************************
*
* Name : tokenUtil_manage_token
*
* Purpose : Manage token refresh and failure retries
*
* Description: There should always be an active token refresh timer running.
* If there is none (swerr) then one is started.
*
* Any maintenance daemon that needs to periodically refresh its
* token must periodicslly call this API as part of its main loop.
*
* All error conditions are handled with a small hold-off retry
* by timer rater than inline wait like in get_first.
*
* Returns : Nothing
*
***************************************************************************/
void tokenUtil_manage_token ( libEvent & event,
string & hostname,
int & refresh_rate,
struct mtc_timer & token_refresh_timer,
void (*handler)(int, siginfo_t*, void*))
{
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__TOKEN, hostname , "corrupt" ))
tokenUtil_fail_token ();
#endif
if ( token_refresh_timer.ring == true )
{
bool blocking = false ; /* token request is non_blocking */
int _rr = refresh_rate ; /* local copy of refresh rate */
dlog ("%s renewing token\n", hostname.c_str());
/* this is a non-blocking call with the 'false' spec */
int rc = tokenUtil_new_token ( event, hostname, blocking );
if ( rc )
{
/* go for a retry by delayed refresh if the request fails */
__token__.delay = true ;
}
#ifdef WANT_FIT_TESTING
else if ( daemon_want_fit ( FIT_CODE__TOKEN, hostname, "null_base" ))
httpUtil_free_base ( event );
#endif
if ( __token__.delay == true )
{
__token__.delay = false ;
_rr = TOKEN_REFRESH_RETRY_DELAY ;
}
mtcTimer_start(token_refresh_timer,handler,_rr );
}
else if ( token_refresh_timer.active == false )
{
slog ("%s no active token refresh timer ; starting new\n", hostname.c_str());
mtcTimer_start(token_refresh_timer,handler,TOKEN_REFRESH_RETRY_DELAY);
}
else if ( __token__.delay == true )
{
ilog ( "Token Refresh in %d seconds\n", TOKEN_REFRESH_RETRY_DELAY );
mtcTimer_stop ( token_refresh_timer );
__token__.delay = false ;
/* force refresh of token in 5 seconds */
mtcTimer_start(token_refresh_timer,handler,TOKEN_REFRESH_RETRY_DELAY);
}
else if ( event.active == true )
{
/* Look for the response */
if ( event.base )
{
event_base_loop( event.base, EVLOOP_NONBLOCK );
}
else
{
/* should not get here. event active while base is null
* try and recover from this error case. */
__token__.delay = true ;
event.active = false ;
}
}
else if ( event.base )
{
#ifdef WANT_FIT_TESTING
string data = "" ;
if ( daemon_want_fit ( FIT_CODE__TOKEN, hostname, "refresh", data ))
__token__.token.clear();
#endif
/* Check for a response string */
if ( __token__.token.empty() )
{
elog ("%s no token ; %d second hold-off before retry\n",
hostname.c_str(), TOKEN_REFRESH_RETRY_DELAY );
/* force refresh of token in 5 seconds */
mtcTimer_reset(token_refresh_timer);
mtcTimer_start(token_refresh_timer,handler,TOKEN_REFRESH_RETRY_DELAY);
}
dlog ("%s freeing token event base and conn data\n", hostname.c_str());
httpUtil_free_conn ( event );
httpUtil_free_base ( event );
}
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__TOKEN, hostname, "cancel_timer" ))
mtcTimer_reset ( token_refresh_timer );
#endif
}
void tokenUtil_log_refresh ( void )
{
if ( __token__.refreshed == true )
@ -58,7 +354,7 @@ int tokenUtil_token_refresh ( libEvent & event, string hostname )
if ( event.status != PASS )
{
event.status = tokenUtil_new_token( event, hostname );
event.status = tokenUtil_new_token( event, hostname );
}
else
{
@ -148,7 +444,7 @@ string _get_keystone_prefix_path ( )
int tokenUtil_handler ( libEvent & event )
{
jsonUtil_auth_type info ;
string hn = event.hostname ;
int rc = event.status ;
@ -185,6 +481,7 @@ int tokenUtil_handler ( libEvent & event )
token_ptr->token = token_str ;
token_ptr->url = info.adminURL ;
token_ptr->refreshed = true ;
}
}
else if ( event.request == KEYSTONE_GET_ENDPOINT_LIST )
@ -288,6 +585,7 @@ int tokenUtil_handler ( libEvent & event )
event.information.c_str(),
event.label.c_str());
}
event.active = false ;
return (event.status);
}
else if ( event.request == KEYSTONE_GET_SERVICE_LIST )
@ -393,6 +691,7 @@ int tokenUtil_handler ( libEvent & event )
{
dlog ("%s Token Refresh O.K.\n", event.hostname.c_str());
}
event.active = false ;
return (rc);
}
@ -403,7 +702,7 @@ void tokenUtil_fail_token ( void )
}
/* fetches an authorization token as a blocking request */
int tokenUtil_new_token ( libEvent & event, string hostname )
int tokenUtil_new_token ( libEvent & event, string hostname, bool blocking )
{
ilog ("%s Requesting Authentication Token\n", hostname.c_str());
@ -418,7 +717,8 @@ int tokenUtil_new_token ( libEvent & event, string hostname )
dlog ("%s fetching new token\n", event.hostname.c_str());
event.prefix_path = _get_keystone_prefix_path();
event.blocking = true ;
event.blocking = blocking ;
// event.blocking = true ;
event.request = KEYSTONE_GET_TOKEN ;
event.operation = "get new" ;
event.type = EVHTTP_REQ_POST ;

View File

@ -42,7 +42,8 @@ keyToken_type * tokenUtil_get_ptr ( void );
keyToken_type tokenUtil_get_token ( void );
int tokenUtil_handler ( libEvent & event );
int tokenUtil_new_token ( libEvent & event, string hostname );
int tokenUtil_new_token ( libEvent & event, string hostname, bool blocking=true );
void tokenUtil_get_first ( libEvent & event, string & hostname );
int tokenUtil_token_refresh( libEvent & event, string hostname );
int tokenUtil_get_endpoints( libEvent & event, string service_uuid );
string tokenUtil_get_svc_uuid ( libEvent & event, string service_name );
@ -50,9 +51,15 @@ string tokenUtil_get_svc_uuid ( libEvent & event, string service_name )
void tokenUtil_fail_token ( void );
void tokenUtil_log_refresh ( void );
int keystone_config_handler ( void * user,
int keystone_config_handler ( void * user,
const char * section,
const char * name,
const char * value);
void tokenUtil_manage_token ( libEvent & event,
string & hostname,
int & refresh_rate,
struct mtc_timer & token_refresh_timer,
void (*handler)(int, siginfo_t*, void*));
#endif /* __INCLUDE_TOKENUTIL_H__ */

View File

@ -253,7 +253,6 @@ void hwmon_service ( hwmon_ctrl_type * ctrl_ptr )
daemon_config_type * config_ptr = daemon_get_cfg_ptr();
hwmon_socket_type * sock_ptr = getSock_ptr();
keyToken_type * token_ptr = tokenUtil_get_ptr();
hostInv.hostBase.my_hostname = ctrl_ptr->my_hostname ;
hostInv.hostBase.my_local_ip = ctrl_ptr->my_local_ip ;
@ -362,37 +361,11 @@ void hwmon_service ( hwmon_ctrl_type * ctrl_ptr )
/* Handle refreshing the authentication token */
tokenUtil_log_refresh ();
if ( hwmonTimer_token.ring == true )
{
hwmonTimer_token.ring = false ;
tokenUtil_new_token ( ctrl_ptr->httpEvent, ctrl_ptr->my_hostname );
/* If the token fetch fails then we retry in a shorter interval */
if ( token_ptr->delay == true )
{
mtcTimer_start( hwmonTimer_token, hwmonTimer_handler, 5 );
token_ptr->delay = false ;
}
/* otherwise at the normal refresh rate */
else
{
mtcTimer_start ( hwmonTimer_token, hwmonTimer_handler, config_ptr->token_refresh_rate ) ;
}
}
/* handle the spontaneous error case */
else if ( token_ptr->delay == true )
{
mtcTimer_stop ( hwmonTimer_token );
mtcTimer_start( hwmonTimer_token, hwmonTimer_handler, 5 );
token_ptr->delay = false ;
}
/* Always ensure that the token refesh timer is running */
if ( hwmonTimer_token.tid == NULL )
{
slog ("auto-restarting dead token refresh timer\n");
mtcTimer_start ( hwmonTimer_token, hwmonTimer_handler, config_ptr->token_refresh_rate ) ;
}
tokenUtil_manage_token ( ctrl_ptr->httpEvent,
ctrl_ptr->my_hostname,
config_ptr->token_refresh_rate,
hwmonTimer_token,
hwmonTimer_handler );
/* Run the FSM */
hostInv.hwmon_fsm ( ) ;

View File

@ -371,30 +371,14 @@ void daemon_service_run ( void )
}
}
/* Get an Authentication Token */
ilog ("%s Requesting initial token\n", hwmon_ctrl.my_hostname.c_str() );
do
{
// rc = mtcInv.mtcKeyApi_get_token ( mtcInv.my_hostname );
rc = tokenUtil_new_token ( hwmon_ctrl.httpEvent, hwmon_ctrl.my_hostname );
if ( rc )
{
elog ("Failed to get authentication token (%d)\n", rc );
sleep (2);
}
if ( hwmon_ctrl.httpEvent.base )
{
slog ("%s token base:%p\n",
hwmon_ctrl.my_hostname.c_str(),
hwmon_ctrl.httpEvent.base );
}
} while ( rc != PASS ) ;
/* Get the initial token.
* This call does not return until a token is received */
tokenUtil_get_first ( hwmon_ctrl.httpEvent, hwmon_ctrl.my_hostname );
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__HWMON__CORRUPT_TOKEN ))
{
tokenUtil_fail_token ();
}
#endif
/* enable the base level signal handler latency monitor */
daemon_latency_monitor (true);

View File

@ -929,25 +929,9 @@ int _self_provision ( void )
daemon_exit();
}
/* Get an Authentication Token */
ilog ("%s Requesting initial token\n", mtcInv.my_hostname.c_str() );
do
{
rc = tokenUtil_new_token ( mtcInv.tokenEvent, mtcInv.my_hostname );
if ( rc )
{
elog ("Failed to get authentication token (%d)\n", rc );
}
/* Give system inventory some time after start-up.
* We see frequent first request failures requiring retry when this
* wait was inside the above if clause. Moving it here ensure there
* is always a small wait after the token fetch on process startup. */
daemon_signal_hdlr ();
sleep (5);
daemon_signal_hdlr ();
} while ( rc != PASS ) ;
/* Get the initial token.
* This call does not return until a token is received */
tokenUtil_get_first ( mtcInv.tokenEvent, mtcInv.my_hostname );
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__CORRUPT_TOKEN, mtcInv.my_hostname ))
@ -1299,6 +1283,7 @@ void daemon_service_run ( void )
/* Run Maintenance service forever */
for ( ; ; )
{
daemon_signal_hdlr ();
/**
* Can't just run 'mtcHttpSvr_look' off select as it is seen to miss events.
* Would like to use event_base_loopexit with event_base_loopcontinue
@ -1307,11 +1292,11 @@ void daemon_service_run ( void )
* mtcHttpSvr_work ( mtce_event );
**/
mtcHttpSvr_look ( mtce_event );
tokenUtil_manage_token ( mtcInv.tokenEvent, mtcInv.my_hostname, mtcInv.token_refresh_rate, mtcInv.mtcTimer_token, mtcTimer_handler );
tokenUtil_log_refresh ();
if ( mtcInv_ptr->num_hosts () == 0 )
{
// mtcHttpSvr_look ( mtce_event );
sleep (1);
continue ;
}

View File

@ -35,8 +35,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
{
int rc = PASS ;
keyToken_type * __token_ptr = tokenUtil_get_ptr();
if ( node_ptr == NULL )
{
slog ("Null Node Pointer\n");
@ -92,35 +90,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
*/
nodeLinkClass::online_handler ( node_ptr );
if ( this->mtcTimer_token.ring == true )
{
dlog ("%s renewing token\n", my_hostname.c_str());
/* this is a blocking call */
tokenUtil_new_token ( tokenEvent, my_hostname );
if ( __token_ptr->delay == true )
{
__token_ptr->delay = false ;
mtcTimer_start( mtcTimer_token, mtcTimer_handler, 5 );
}
else
{
mtcTimer_start ( mtcTimer_token, mtcTimer_handler, token_refresh_rate );
}
}
else if ( __token_ptr->delay == true )
{
ilog ( "Token Refresh in 5 seconds\n" );
if ( mtcTimer_token.tid )
mtcTimer_stop ( mtcTimer_token );
__token_ptr->delay = false ;
/* force refresh of token in 5 seconds */
mtcTimer_start( mtcTimer_token, mtcTimer_handler, 5 );
}
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__DELETE )
{
flog ("%s -> Delete Action\n", node_ptr->hostname.c_str());