Prevent process coredump due to missing token in response header

Both Maintenance and the Hardware Monitor use a common token refresh
utility that has been seen to crash the calling process when a token
'get' request is missing the token in its response header.

This update avoids that by exiting the token handler at the error
detection point rather than continuing to handle the response with
invalid data.

Significant fault insertion testing was performed on the update
which led to some additional improvements in token request error
handling that both processes benefit from.

Additional specific fixes include
- fixed race condition memory leak around authentication error handling
- differentiate token refresh from failure recovery renewal.
- fixed a few missing event status / rc updates.

Test Plan:
 - used mtce fault insertion tools to create failure modes
 - 24+ hr memory leak test run for both success & token error handling
 - all tests were done with both hwmond and mtcAgent

PASS: Verify build and AIO DX install.
PASS: Verify reported hwmon coredump issue is avoided/resolved.
PASS: Verify issue also exists in the mtcAgent and is also
      avoided/resolved by this update.

Regression:

PASS: Verify token get failure retry handling:
PASS: - get first token inline - retry cadence: 5 seconds
PASS: - refresh token by http  - retry cadence: 10, 30 and 1200 secs
PASS: Verify recovery handling cases:
PASS: - corrupt token
PASS: - no token present
PASS: - no token in header
PASS: Verify token renewal stress soak ; every 10 seconds for 24+ hrs
PASS: - repeat over token get failure cases
PASS: - in each success and failure case verify no memory leaks.
PASS: Verify authentication error handling soak
      - every 10-60 secs for 24+ hrs
      - token is corrupted followed by a sysinv request to
        exercise authentication error handling and renewal process.
PASS: Verify no coredumps.
PASS: Verify logging and token retry.
PASS: Verify process continues to use the previous token until a new
      one is acquired.
      - Token Refresh is on time.
      - Token Renew is on event.
PASS: Verify soak of persistent authentication error / token
      renewal cycle. No memory leak or coredumps.

Closes-Bug: 2063475
Change-Id: I5eef62518ac606e6b54323b46fbb6f9475b5c1ef
This commit is contained in:
Eric MacDonald 2024-04-29 13:09:00 +00:00
parent 975e868431
commit 4e62e3ac9f
8 changed files with 193 additions and 140 deletions

View File

@ -68,8 +68,9 @@
#define MTC_CMD_FIT__GOENABLE_AUDIT ("/var/run/fit/goenable_audit") /* mtcAgent */
#define MTC_CMD_FIT__JSON_LEAK_SOAK ("/var/run/fit/json_leak_soak") /* mtcAgent */
#define MTC_CMD_FIT__BMC_ACC_FAIL ("/var/run/fit/bmc_access_fail")/* mtcAgent */
#define MTC_CMD_FIT__MEM_LEAK_DEBUG ("/var/run/fit/mem_leak_debug")/* mtcAgent */
#define MTC_CMD_FIT__MEM_LEAK_DEBUG ("/var/run/fit/mem_leak_debug") /* mtcAgent */
#define MTC_CMD_FIT__FM_ERROR_CODE ("/var/run/fit/fm_error_code") /* mtcAgent */
#define MTC_CMD_FIT__CORRUPT_TOKEN ("/var/run/fit/corrupt_token") /* mtcAgent & hwmond */
/*****************************************************
* Fault Insertion Codes

View File

@ -1,8 +1,8 @@
/*
* Copyright (c) 2015-2016 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright (c) 2015-2016, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
@ -109,7 +109,7 @@ int httpUtil_event_init ( libEvent * ptr ,
ptr->token.token.clear();
ptr->token.issued.clear();
ptr->token.expiry.clear();
ptr->token.delay = false ;
ptr->token.renew = false ;
ptr->token.refreshed = false ;
/* Instance Specific Request Data */
@ -494,7 +494,7 @@ int httpUtil_status ( libEvent & event )
{
keyToken_type * token_ptr = tokenUtil_get_ptr() ;
rc = FAIL_AUTHENTICATION ;
token_ptr->delay = true ; /* force delayed token renewal on authentication error */
token_ptr->renew = true ; /* force delayed token renewal on authentication error */
break ;
}
case 0:

View File

@ -109,7 +109,7 @@ typedef struct
string expiry ; /**< Timestamp when token is expired */
string token ; /**< The huge 3kb token */
bool refreshed; /**< set true when refreshed */
bool delay ; /**< trigger renew with small delay
bool renew ; /**< trigger renew with small delay
error renewal - flood avoidance */
} keyToken_type ;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017 Wind River Systems, Inc.
* Copyright (c) 2015-2017, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -39,10 +39,12 @@ keyToken_type * tokenUtil_get_ptr ( void ) { return &__token__ ; };
keyToken_type tokenUtil_get_token ( void ) { return __token__ ; };
/* hold off for TOKEN_REFRESH_RETRY_DELAY seconds before trying again. */
/* Hold off for TOKEN_REFRESH_RETRY_DELAY seconds before trying again.
* This applies to getting the first token only. */
static int __retries = 0 ;
static void __retry_holdoff( int delay )
{
__retries = 0 ;
for ( int i = 0 ; i < delay ; i++ )
{
daemon_signal_hdlr ();
@ -227,7 +229,7 @@ void tokenUtil_get_first ( libEvent & event, string & hostname )
* token must periodically call this API as part of its main loop.
*
* All error conditions are handled with a small hold-off retry
* by timer rater than inline wait like in get_first.
* by timer rather than inline wait like in get_first.
*
* Returns : Nothing
*
@ -255,37 +257,50 @@ void tokenUtil_manage_token ( libEvent & event,
int rc = tokenUtil_new_token ( event, hostname, blocking );
if ( rc )
{
/* go for a retry by delayed refresh if the request fails */
__token__.delay = true ;
/* go for a retry by delayed renewal if the request fails */
__token__.renew = true ;
/* Override the default refresh timer duration to be at least
* the timeout value just in case the above request managed
* to launch something (even though it failed). */
_rr = event.timeout ;
}
#ifdef WANT_FIT_TESTING
else if ( daemon_want_fit ( FIT_CODE__TOKEN, hostname, "null_base" ))
httpUtil_free_base ( event );
#endif
if ( __token__.delay == true )
/* Clear the renewal flag */
if ( __token__.renew == true )
{
__token__.delay = false ;
_rr = TOKEN_REFRESH_RETRY_DELAY ;
ilog ("Token renew in %d seconds", _rr);
__token__.renew = false ;
}
else
{
dlog ("Token refresh in %d seconds", _rr);
}
mtcTimer_start(token_refresh_timer,handler,_rr );
}
/* Handle an unexpected state case */
else if ( token_refresh_timer.active == false )
{
slog ("%s no active token refresh timer ; starting new\n", hostname.c_str());
mtcTimer_start(token_refresh_timer,handler,TOKEN_REFRESH_RETRY_DELAY);
}
else if ( __token__.delay == true )
/* Handle the forced token renewal case
* set by receiving an Authentication error response (401) */
else if ( __token__.renew == true )
{
ilog ( "Token Refresh in %d seconds\n", TOKEN_REFRESH_RETRY_DELAY );
mtcTimer_stop ( token_refresh_timer );
ilog ( "%s Token Renewal in %d seconds",
hostname.c_str(),
TOKEN_REFRESH_RETRY_DELAY);
if ( token_refresh_timer.ring == false )
mtcTimer_reset ( token_refresh_timer );
__token__.renew = false ;
__token__.delay = false ;
/* force refresh of token in 5 seconds */
mtcTimer_start(token_refresh_timer,handler,TOKEN_REFRESH_RETRY_DELAY);
/* force refresh of token in TOKEN_REFRESH_RETRY_DELAY seconds */
mtcTimer_start(token_refresh_timer,handler, TOKEN_REFRESH_RETRY_DELAY);
}
/* launch the event */
else if ( event.active == true )
{
/* Look for the response */
@ -295,18 +310,19 @@ void tokenUtil_manage_token ( libEvent & event,
}
else
{
slog ("%s token renew libevent base is null ; cannot launch '%s' request",
hostname.c_str(), event.operation.c_str());
/* should not get here. event active while base is null
* try and recover from this error case. */
__token__.delay = true ;
__token__.renew = true ;
event.active = false ;
}
}
/* Look for the received token and free the event objects */
else if ( event.base )
{
#ifdef WANT_FIT_TESTING
string data = "" ;
if ( daemon_want_fit ( FIT_CODE__TOKEN, hostname, "refresh", data ))
if ( daemon_want_fit ( FIT_CODE__TOKEN, hostname, "refresh" ))
__token__.token.clear();
#endif
@ -314,9 +330,9 @@ void tokenUtil_manage_token ( libEvent & event,
if ( __token__.token.empty() )
{
elog ("%s no token ; %d second hold-off before retry\n",
hostname.c_str(), TOKEN_REFRESH_RETRY_DELAY );
hostname.c_str(), event.timeout );
/* force refresh of token in 5 seconds */
/* force refresh of token in TOKEN_REFRESH_RETRY_DELAY seconds */
mtcTimer_reset(token_refresh_timer);
mtcTimer_start(token_refresh_timer,handler,TOKEN_REFRESH_RETRY_DELAY);
}
@ -345,30 +361,10 @@ void tokenUtil_log_refresh ( void )
}
}
/* Handle refreshing the authentication token */
int tokenUtil_token_refresh ( libEvent & event, string hostname )
/* Handle renewing the token ; due to authentication error */
void tokenUtil_token_renew ( void )
{
struct tm tokenExpiry; // given token expired time (UTC)
time_t cTime = time(NULL); // current time (UTC)
double diffTime = 0;
if ( event.status != PASS )
{
event.status = tokenUtil_new_token( event, hostname );
}
else
{
strptime( __token__.expiry.c_str(), "%Y-%m-%dT%H:%M:%S", &tokenExpiry );
/* Get a new authentication token if the given token is about to expire */
diffTime = difftime( mktime( &tokenExpiry ), cTime );
if ( diffTime <= STALE_TOKEN_DURATION )
{
ilog ("The given token will expire in %f seconds\n", diffTime);
event.status = tokenUtil_new_token( event, hostname );
}
}
return (event.status);
__token__.renew = true ;
}
string _get_ip ( void )
@ -454,34 +450,46 @@ int tokenUtil_handler ( libEvent & event )
{
elog ( "%s Token Request Failed - Error Code (%d) \n", hn.c_str(), event.status );
}
if ( event.request == KEYSTONE_GET_TOKEN )
else if ( event.request == KEYSTONE_GET_TOKEN )
{
/* get the token from response header*/
struct evkeyvalq *header_ptr = evhttp_request_get_input_headers(event.req);
const char * header_token_ptr = evhttp_find_header (header_ptr, MTC_JSON_AUTH_ID);
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__TOKEN, hn , "header" ))
{
slog ("%s FIT token header", hn.c_str());
header_token_ptr = NULL ;
}
#endif
if ( !header_token_ptr )
{
rc = FAIL_JSON_PARSE ;
elog ( "%s Token Request Failed - no token in header\n", hn.c_str());
}
std::string token_str(header_token_ptr);
if ( jsonApi_auth_load ( hn, (char*)event.response.data(), info ) )
{
rc = FAIL_JSON_PARSE ;
elog ( "%s Token Request Failed - Json Parse Error\n", hn.c_str());
elog ( "%s Token Request Failed - no token in header", hn.c_str());
event.status = FAIL_TOKEN_GET ;
}
else
{
jlog ("%s Token Exp: %s\n", hn.c_str(), info.expiry.c_str() );
jlog ("%s Admin URL: %s\n" ,hn.c_str(), info.adminURL.c_str() );
jlog ("%s Token Len: %ld\n",hn.c_str(), token_str.length() );
token_ptr->issued = info.issued ;
token_ptr->expiry = info.expiry ;
token_ptr->token = token_str ;
token_ptr->url = info.adminURL ;
token_ptr->refreshed = true ;
std::string token_str(header_token_ptr);
if ( jsonApi_auth_load ( hn, (char*)event.response.data(), info ) )
{
elog ( "%s Token Request Failed - Json Parse Error", hn.c_str());
event.status = FAIL_JSON_PARSE ;
}
else
{
jlog ("%s Token Exp: %s", hn.c_str(), info.expiry.c_str() );
jlog ("%s Admin URL: %s" ,hn.c_str(), info.adminURL.c_str() );
jlog ("%s Token Len: %ld",hn.c_str(), token_str.length() );
token_ptr->issued = info.issued ;
token_ptr->expiry = info.expiry ;
token_ptr->token = token_str ;
token_ptr->url = info.adminURL ;
token_ptr->refreshed = true ;
event.status = PASS ;
}
}
}
else if ( event.request == KEYSTONE_GET_ENDPOINT_LIST )
@ -527,6 +535,7 @@ int tokenUtil_handler ( libEvent & event )
wlog ("%s '%s' failed to get interface type from endpoint list (%d)\n",
event.hostname.c_str(),
event.information.c_str(), rc);
event.status = FAIL_OPERATION ;
}
else if ( interface_type == "admin" )
{
@ -544,17 +553,23 @@ int tokenUtil_handler ( libEvent & event )
else
{
wlog ("%s '%s' service endpoint not found\n", event.hostname.c_str(), event.information.c_str());
event.status = FAIL_NOT_FOUND ;
}
}
else
{
elog ("%s Parse service endpoint list failed (rc:%d)\n", event.hostname.c_str(), rc);
elog ("%s Response: %s\n", event.hostname.c_str(), event.response.c_str() );
event.status = rc ;
event.status = FAIL_INVALID_DATA ;
}
}
if ( rc1 | rc2 | rc3 )
if ( event.status )
{
/* error already logged above */
dlog ("%s endpoint list get failed ; rc:%d", event.hostname.c_str(), event.status);
}
else if ( rc1 | rc2 | rc3 )
{
wlog ("%s '%s' one or mode endpoint parse failure (%d:%d:%d)\n",
event.hostname.c_str(),
@ -580,13 +595,12 @@ int tokenUtil_handler ( libEvent & event )
}
else
{
wlog ("%s '%s' service not found using '%s' label\n",
event.hostname.c_str(),
event.information.c_str(),
event.label.c_str());
wlog ("%s '%s' service not found using '%s' label",
event.hostname.c_str(),
event.information.c_str(),
event.label.c_str());
event.status = FAIL_NOT_FOUND ;
}
event.active = false ;
return (event.status);
}
else if ( event.request == KEYSTONE_GET_SERVICE_LIST )
{
@ -626,7 +640,6 @@ int tokenUtil_handler ( libEvent & event )
wlog ("%s '%s' service uuid not found\n",
event.hostname.c_str(),
event.information.c_str());
event.status = FAIL_KEY_VALUE_PARSE ;
}
}
}
@ -657,30 +670,40 @@ int tokenUtil_handler ( libEvent & event )
event.label.c_str());
event.status = FAIL_NOT_FOUND ;
}
return (event.status);
}
else
{
wlog ("%s Keystone Request Failed - Unsupported Request (%d)\n", hn.c_str(), event.request );
wlog ("%s Unsupported Token Request (%d)\n", hn.c_str(), event.request );
}
/* Check for a response string */
if ( token_ptr->token.empty() )
/* check for failed event status */
if ( event.status )
rc = event.status ;
else if ( event.request == KEYSTONE_GET_TOKEN )
{
elog ("%s Failed to get token\n", hn.c_str());
rc = FAIL_TOKEN_GET;
/* Check for a response string */
if ( token_ptr->token.empty() )
{
elog ("%s Failed to get token", hn.c_str());
rc = FAIL_TOKEN_GET;
}
/* Check for Key URL */
else if ( token_ptr->url.empty() )
{
elog ("%s Failed to get token URL", hn.c_str());
rc = FAIL_TOKEN_URL;
}
else
{
dlog ("%s Token Refresh O.K.", event.hostname.c_str());
rc = PASS ;
}
}
/* Check for Key URL */
else if ( token_ptr->url.empty() )
{
elog ("%s Failed to get token URL\n", hn.c_str());
rc = FAIL_TOKEN_URL;
}
else
{
dlog ("%s Token Refresh O.K.\n", event.hostname.c_str());
}
/* retry the token request if it failed */
if ( rc ) __token__.renew = true ;
event.active = false ;
return (rc);
}
@ -708,7 +731,6 @@ int tokenUtil_new_token ( libEvent & event, string hostname, bool blocking )
event.prefix_path = _get_keystone_prefix_path();
event.blocking = blocking ;
// event.blocking = true ;
event.request = KEYSTONE_GET_TOKEN ;
event.operation = "get new" ;
event.type = EVHTTP_REQ_POST ;

View File

@ -1,7 +1,7 @@
#ifndef __INCLUDE_TOKENUTIL_H__
#define __INCLUDE_TOKENUTIL_H__
/*
* Copyright (c) 2013, 2017 Wind River Systems, Inc.
* Copyright (c) 2013, 2017, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -11,13 +11,13 @@
* This module contains a single static __token__ object,
* an interface that updates/refreshes it with a valid token
* an interface that queries keystone service list uuids
* an interface tht queries the specified service admin
* an interface that queries the specified service admin
* endpoint using its service uuid.
*
*
* tokenUtil_get_svc_uuid - returns the service uuid for the
* specified service.
* tokenUtil_get_endpoint - returns the admin endpoint for the
* tokenUtil_get_endpoint - returns the admin endpoint for the
* specified service uuid.
*/
@ -42,12 +42,12 @@ keyToken_type tokenUtil_get_token ( void );
int tokenUtil_handler ( libEvent & event );
int tokenUtil_new_token ( libEvent & event, string hostname, bool blocking=true );
void tokenUtil_get_first ( libEvent & event, string & hostname );
int tokenUtil_token_refresh( libEvent & event, string hostname );
int tokenUtil_get_endpoints( libEvent & event, string service_uuid );
string tokenUtil_get_svc_uuid ( libEvent & event, string service_name );
void tokenUtil_fail_token ( void );
void tokenUtil_log_refresh ( void );
void tokenUtil_token_renew ( void );
int keystone_config_handler ( void * user,
const char * section,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
* Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -18,6 +18,10 @@
#include "hwmonThreads.h" /* for ... bmc_thread */
#include "secretUtil.h"
#ifdef WANT_FIT_TESTING
#include "tokenUtil.h"
static int token_corrupt_holdoff = 0 ;
#endif
/**************************************************************************
*
@ -177,6 +181,31 @@ void hwmonHostClass::hwmon_fsm ( void )
{
thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info );
}
#ifdef WANT_FIT_TESTING
if ((host_ptr->hostname == CONTROLLER_0 ) &&
(host_ptr->bm_provisioned ) &&
(daemon_is_file_present (MTC_CMD_FIT__CORRUPT_TOKEN)))
{
// The value in /var/run/fit/corrupt_token specifies the corruption cadence in seconds
if ( token_corrupt_holdoff == 0 )
{
token_corrupt_holdoff = daemon_get_file_int (MTC_CMD_FIT__CORRUPT_TOKEN) ;
slog ("FIT corrupting token and making sysinv request");
tokenUtil_fail_token();
hwmonHttp_mod_sensor ( host_ptr->hostname,
host_ptr->event,
host_ptr->sensor[0].uuid,
"status",
host_ptr->sensor[0].status);
}
else
{
token_corrupt_holdoff-- ;
sleep (1);
}
}
#endif
}
if ( host_ptr->want_degrade_audit )
{

View File

@ -1,8 +1,8 @@
/*
* Copyright (c) 2013-2018 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright (c) 2013-2018, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
@ -433,11 +433,11 @@ int mtcHttpUtil_status ( libEvent & event )
event.status = PASS ;
break;
}
/* Authentication error - refresh the token */
/* Authentication error - need to renew the token */
case 401:
{
keyToken_type * token_ptr = tokenUtil_get_ptr() ;
token_ptr->delay = true ;
token_ptr->renew = true ;
rc = FAIL_AUTHENTICATION ;
break ;
}
@ -944,15 +944,15 @@ mtcHttpUtil_api_request_done:
mtcHttpUtil_free_base ( event );
/**
* If tere is an authentication error then request a new token and
* return the error to the caller so that the request can be retried
* If there is an authentication error then free the event, force a refresh
* and return the error to the caller so that the request can be retried.
**/
if (( event.status == FAIL_AUTHENTICATION ) ||
( event.status == MTC_HTTP_UNAUTHORIZED ))
{
/* Find the host this handler instance is being run against */
nodeLinkClass * obj_ptr = get_mtcInv_ptr () ;
tokenUtil_new_token ( obj_ptr->tokenEvent, obj_ptr->my_hostname );
tokenUtil_token_renew ( );
mtcHttpUtil_free_conn ( obj_ptr->tokenEvent );
mtcHttpUtil_free_base ( obj_ptr->tokenEvent );
event.status = FAIL_AUTHENTICATION ;
@ -1068,13 +1068,6 @@ int mtcHttpUtil_get_length ( libEvent & event )
event.log_prefix.c_str());
event.status = FAIL_JSON_ZERO_LEN ;
}
// else if ( event.response_len > MAX_EVENT_LEN )
// {
// elog ("%s Request Failed - Length Too Long (%d:%ld)\n",
// event.log_prefix.c_str(), MAX_EVENT_LEN, event.response_len );
//
// event.status = FAIL_JSON_TOO_LONG ;
// }
return ( event.response_len );
}
@ -1370,31 +1363,20 @@ _handler_done:
gettime ( event.done_time );
timedelta ( event.send_time, event.done_time, event.diff_time );
// Redundant log - already logged in the work queue FSM
// if ( event.status )
// {
// elog ( "%s Failed (rc:%d)\n",
// event.log_prefix.c_str(),
// event.status );
// }
mtcHttpUtil_log_event ( event );
if ( event.blocking == false )
{
// mtcHttpUtil_free_conn ( event );
// mtcHttpUtil_free_base ( event );
/**
* If tere is an authentication error then request a new token and
* return the error to the caller so that the request can be retried
**/
/**
* If there is an authentication error then free the event, force a refresh
* and return the error to the caller so that the request can be retried.
**/
if (( event.status == FAIL_AUTHENTICATION ) ||
( event.status == MTC_HTTP_UNAUTHORIZED ))
{
/* Find the host this handler instance is being run against */
nodeLinkClass * obj_ptr = get_mtcInv_ptr () ;
tokenUtil_new_token ( obj_ptr->tokenEvent, obj_ptr->my_hostname );
tokenUtil_token_renew ( );
mtcHttpUtil_free_conn ( obj_ptr->tokenEvent );
mtcHttpUtil_free_base ( obj_ptr->tokenEvent );
event.status = FAIL_AUTHENTICATION ;

View File

@ -1421,7 +1421,9 @@ void nodeLinkClass::fsm ( void )
mtcInv.mtcInfo_handler();
}
}
#ifdef WANT_FIT_TESTING
static int token_corrupt_holdoff = 0 ;
#endif
void daemon_service_run ( void )
{
int rc ;
@ -1736,7 +1738,24 @@ void daemon_service_run ( void )
sleep (1);
continue ;
}
#ifdef WANT_FIT_TESTING
if ( daemon_is_file_present (MTC_CMD_FIT__CORRUPT_TOKEN))
{
// The value in /var/run/fit/corrupt_token specifies the corruption cadence in seconds
if ( token_corrupt_holdoff == 0 )
{
token_corrupt_holdoff = daemon_get_file_int (MTC_CMD_FIT__CORRUPT_TOKEN) ;
slog ("FIT corrupting token and making sysinv request");
tokenUtil_fail_token();
mtcInv.mtcInvApi_force_states ( CONTROLLER_0, "unlocked", "enabled", "degraded" );
}
else
{
token_corrupt_holdoff-- ;
sleep (1);
}
}
#endif
/* Handle recovery from MNFA */
mtcInv.mnfa_recovery_handler ( mtcInv.my_hostname );