Mtce: Improve non-blocking http request dispatch

Maintenance is seen to intermittently fail Swact requests early
after initial system provisioning, without logging an error
reason, only to always succeed later on.

The issue is difficult to reproduce so this update adds extra
logging to this code path and implements a speculative fix.

The event_base_loop call's non-zero return code is never logged.
The libevent documentation states that this API will
return 1 while the target has not yet provided any data.

The theory is that, because the call is local, it normally returns
with data even on the first dispatch. However, during early
system configuration, when the system is busy, that first dispatch
does not complete immediately as it normally does later on.

The speculation is that it instead returns 1, indicating a retry,
but the existing code path treats that as a failure.

This update modifies the code to return PASS if the command
dispatch returns 1, while the error case of -1 gets enhanced
logging and continues to be treated as a failure.

Test Plan:
PASS: Swact 5 times
PASS: Lock/Unlock Host
PASS: Large System DOR

Related Bug: https://bugs.launchpad.net/starlingx/+bug/1791381
Change-Id: I19b22e07d3224b2e9dd3f3569ecbe9aed7d9402f
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2018-09-10 14:38:52 -04:00
parent 74c5f89ab4
commit 316032b904
5 changed files with 93 additions and 38 deletions

View File

@ -1,3 +1,3 @@
SRC_DIR="cgts-mtce-common-1.0"
TIS_PATCH_VER=137
TIS_PATCH_VER=138
BUILD_IS_SLOW=5

View File

@ -112,6 +112,7 @@
#define FIT_CODE__LOCK_HOST (31)
#define FIT_CODE__FORCE_LOCK_HOST (32)
#define FIT_CODE__UNLOCK_HOST (33)
#define FIT_CODE__FAIL_SWACT (34)
#define FIT_CODE__FM_SET_ALARM (40)
#define FIT_CODE__FM_GET_ALARM (41)

View File

@ -1344,8 +1344,8 @@ int daemon_load_fit ( void )
{
daemon_log_value ( "/var/run/fit/fithits", "hits =", __fit_info.hits );
daemon_remove_file ( "/var/run/fit/fitdone" );
daemon_rename_file ( FIT__INFO_FILEPATH, FIT__INFO_FILENAME, FIT__INFO_FILENAME_RENAMED );
}
daemon_rename_file ( FIT__INFO_FILEPATH, FIT__INFO_FILENAME, FIT__INFO_FILENAME_RENAMED );
#endif
return (PASS);

View File

@ -451,7 +451,7 @@ int mtcHttpUtil_status ( libEvent & event )
}
default:
{
hlog3 ("%s Status: %d\n", event.hostname.c_str(), event.status );
wlog ("%s Status: %d\n", event.hostname.c_str(), event.status );
rc = event.status ;
break;
}
@ -868,10 +868,47 @@ int mtcHttpUtil_api_request ( libEvent & event )
hlog ("%s Dispatched (to:%d)\n", event.log_prefix.c_str(), event.timeout);
}
/* TODO: Set a command timer to free up the resources
* and deal with the error if the handler never runs */
/*
* non-blocking event_base_loop can return ...
*
* 0 - command complete ; data available
* 1 - command dispatched but not complete ; no data available
* -1 - error in dispatch ; check errno
*
*/
event.active = true ;
return (event_base_loop(event.base, EVLOOP_NONBLOCK));
rc = event_base_loop(event.base, EVLOOP_NONBLOCK);
#ifdef WANT_FIT_TESTING
string value = "" ;
if ( daemon_want_fit ( FIT_CODE__FAIL_SWACT, event.hostname, "query", value ))
{
if ( value == "-1" )
rc = -1 ;
else
rc = atoi(value.data());
}
#endif
if (( rc == 0 ) || // Dispatched and done with Data ready
( rc == 1 )) // Dispatched but no response yet
{
if (( event.request == SMGR_QUERY_SWACT ) ||
( event.request == SMGR_START_SWACT ))
{
ilog ("%s dispatched%s\n",
event.log_prefix.c_str(),
rc ? "" : " ; data ready" );
}
rc = PASS ;
}
else
{
elog ("%s command dispatch failed (%d)\n",
event.log_prefix.c_str(), errno );
event.active = false ;
rc = FAIL_REQUEST ;
}
return (rc);
}
else
{

View File

@ -58,6 +58,8 @@ void nodeLinkClass::mtcSmgrApi_handler ( struct evhttp_request *req, void *arg )
mtcSmgrApi_handler_out:
mtcHttpUtil_log_event ( smgrEvent );
if ( smgrEvent.blocking == true )
{
mtcHttpUtil_free_conn ( smgrEvent );
@ -120,6 +122,15 @@ int nodeLinkClass::mtcSmgrApi_request ( struct nodeLinkClass::node * node_ptr, m
elog ("%s failed to allocate libEvent memory (%d)\n", node_ptr->hostname.c_str(), rc );
return (rc);
}
#ifdef WANT_FIT_TESTING
string value = "" ;
if ( daemon_want_fit ( FIT_CODE__FAIL_SWACT, node_ptr->hostname, "port", value ))
{
smgrEvent.port = atoi(value.data());
}
#endif
/* Set the common context of this new operation */
smgrEvent.status = RETRY ;
smgrEvent.hostname = node_ptr->hostname ;
@ -143,7 +154,13 @@ int nodeLinkClass::mtcSmgrApi_request ( struct nodeLinkClass::node * node_ptr, m
ilog ("%s sending 'query services' request to HA Service Manager\n",
smgrEvent.hostname.c_str());
return ( mtcHttpUtil_api_request ( smgrEvent )) ;
rc = mtcHttpUtil_api_request ( smgrEvent ) ;
if ( rc )
{
elog ("%s mtcHttpUtil_api_request (rc:%d)\n",
node_ptr->hostname.c_str(), rc );
}
return ( rc ) ;
}
else if ( operation == CONTROLLER_SWACT )
{