From 316032b904ffc6a3ee2cdd0d4a3aa56992e1ed6b Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Mon, 10 Sep 2018 14:38:52 -0400 Subject: [PATCH] Mtce: Improve non-blocking http request dispatch Maintenance is seen to intermittently fail Swact requests early after initial system provisioning, without logging an error reason, only to always succeed later on. The issue is difficult to reproduce so this update adds extra logging to this code path and implements a speculative fix. The event_base_loop calls' non-zero return code is never being logged. The libevent documentation states that this API will return 1 while the target has not yet provided any data. Theory is, because the call is local, that normally it returns with data even on the first dispatch case. However, during early system configuration, when the system is busy, that first dispatch does not complete immediately like it normally does later on. Speculation is, instead it returns a 1 stating retry but the existing code path treats that as a failure. This update modifies the code to return a PASS if the command dispatch returns a 1 while the error case of -1 gets enhanced logging and continues to be treated as a failure. Test Plan: PASS: Swact 5 times PASS: Lock/Unlock Host PASS: Large System DOR Related Bug: https://bugs.launchpad.net/starlingx/+bug/1791381 Change-Id: I19b22e07d3224b2e9dd3f3569ecbe9aed7d9402f Signed-off-by: Eric MacDonald --- mtce-common/centos/build_srpm.data | 2 +- .../cgts-mtce-common-1.0/common/fitCodes.h | 1 + .../daemon/daemon_files.cpp | 2 +- .../maintenance/mtcHttpUtil.cpp | 107 ++++++++++++------ .../maintenance/mtcSmgrApi.cpp | 19 +++- 5 files changed, 93 insertions(+), 38 deletions(-) diff --git a/mtce-common/centos/build_srpm.data b/mtce-common/centos/build_srpm.data index 4b87b935..43e3cd2c 100644 --- a/mtce-common/centos/build_srpm.data +++ b/mtce-common/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="cgts-mtce-common-1.0" -TIS_PATCH_VER=137 +TIS_PATCH_VER=138 BUILD_IS_SLOW=5 diff --git a/mtce-common/cgts-mtce-common-1.0/common/fitCodes.h b/mtce-common/cgts-mtce-common-1.0/common/fitCodes.h index 74a9d2b8..308f1d2a 100644 --- a/mtce-common/cgts-mtce-common-1.0/common/fitCodes.h +++ b/mtce-common/cgts-mtce-common-1.0/common/fitCodes.h @@ -112,6 +112,7 @@ #define FIT_CODE__LOCK_HOST (31) #define FIT_CODE__FORCE_LOCK_HOST (32) #define FIT_CODE__UNLOCK_HOST (33) +#define FIT_CODE__FAIL_SWACT (34) #define FIT_CODE__FM_SET_ALARM (40) #define FIT_CODE__FM_GET_ALARM (41) diff --git a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_files.cpp b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_files.cpp index 002c4db4..8f2578ff 100755 --- a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_files.cpp +++ b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_files.cpp @@ -1344,8 +1344,8 @@ int daemon_load_fit ( void ) { daemon_log_value ( "/var/run/fit/fithits", "hits =", __fit_info.hits ); daemon_remove_file ( "/var/run/fit/fitdone" ); - daemon_rename_file ( FIT__INFO_FILEPATH, FIT__INFO_FILENAME, FIT__INFO_FILENAME_RENAMED ); } + daemon_rename_file ( FIT__INFO_FILEPATH, FIT__INFO_FILENAME, FIT__INFO_FILENAME_RENAMED ); #endif return (PASS); diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcHttpUtil.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcHttpUtil.cpp index 0265071c..73cce49f 100755 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcHttpUtil.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcHttpUtil.cpp @@ -260,7 +260,7 @@ int mtcHttpUtil_request_old ( libEvent & event, void(*hdlr)(struct evhttp_request *, void *)) { int rc = PASS ; - + /* make a new request and bind the event handler to it */ event.req = evhttp_request_new( hdlr , event.base ); if ( ! event.req ) @@ -275,7 +275,7 @@ int mtcHttpUtil_request_new ( libEvent & event, void(*hdlr)(struct evhttp_request *, void *)) { int rc = PASS ; - + /* make a new request and bind the event handler to it */ event.req = evhttp_request_new( hdlr , &event ); if ( ! event.req ) @@ -374,15 +374,15 @@ int mtcHttpUtil_header_add ( libEvent * ptr, http_headers_type * hdrs_ptr ) if ( hdrs_ptr->entries > MAX_HEADERS ) { - elog ("%s Too many headers (%d:%d)\n", + elog ("%s Too many headers (%d:%d)\n", ptr->hostname.c_str(), MAX_HEADERS, hdrs_ptr->entries ); return FAIL ; } for ( int i = 0 ; i < hdrs_ptr->entries ; i++ ) { /* Add the header */ - rc = evhttp_add_header( ptr->req->output_headers, - hdrs_ptr->entry[i].key.c_str() , + rc = evhttp_add_header( ptr->req->output_headers, + hdrs_ptr->entry[i].key.c_str() , hdrs_ptr->entry[i].value.c_str()); if ( rc ) { @@ -391,12 +391,12 @@ int mtcHttpUtil_header_add ( libEvent * ptr, http_headers_type * hdrs_ptr ) hdrs_ptr->entry[i].value.c_str()); rc = FAIL ; break ; - } + } } return (rc); } -//int mtcHttpUtil_request_make ( libEvent * ptr, +//int mtcHttpUtil_request_make ( libEvent * ptr, // enum evhttp_cmd_type type, // string path ) //{ @@ -451,7 +451,7 @@ int mtcHttpUtil_status ( libEvent & event ) } default: { - hlog3 ("%s Status: %d\n", event.hostname.c_str(), event.status ); + wlog ("%s Status: %d\n", event.hostname.c_str(), event.status ); rc = event.status ; break; } @@ -476,16 +476,16 @@ int mtcHttpUtil_status ( libEvent & event ) int mtcHttpUtil_api_request ( libEvent & event ) { - http_headers_type hdrs ; + http_headers_type hdrs ; int hdr_entry = 0 ; int rc = FAIL ; void(*handler)(struct evhttp_request *, void *) = NULL ; - + /* Default to PUT */ event.type = EVHTTP_REQ_PUT ; - if (( event.request == SERVICE_NONE ) || - ( event.request >= SERVICE_LAST )) + if (( event.request == SERVICE_NONE ) || + ( event.request >= SERVICE_LAST )) { slog ("Invalid request %d\n", event.request); event.status = FAIL_BAD_PARM ; @@ -577,7 +577,7 @@ int mtcHttpUtil_api_request ( libEvent & event ) event.payload.append (",\"task\":\"\""); event.payload.append (",\"action\":\"none\""); - + event.payload.append (",\"personality\":\""); event.payload.append ( event.inv_info.type ); event.payload.append ("\""); @@ -593,9 +593,9 @@ int mtcHttpUtil_api_request ( libEvent & event ) event.payload.append (",\"availability\":\""); event.payload.append ( event.inv_info.avail ); event.payload.append ("\""); - + event.payload.append (",\"bm_ip\":\"\""); - + if ( !event.inv_info.name.compare("controller-0") ) { event.payload.append (",\"invprovision\":\"provisioned\""); @@ -662,7 +662,7 @@ int mtcHttpUtil_api_request ( libEvent & event ) event.timeout = HTTP_VIM_TIMEOUT ; } - else if (( event.request == SMGR_QUERY_SWACT ) || + else if (( event.request == SMGR_QUERY_SWACT ) || ( event.request == SMGR_START_SWACT ) || ( event.request == SMGR_HOST_LOCKED ) || ( event.request == SMGR_HOST_UNLOCKED ) || @@ -719,7 +719,7 @@ int mtcHttpUtil_api_request ( libEvent & event ) jlog ("%s Address : %s\n", event.hostname.c_str(), event.token.url.c_str()); } - if (( event.type != EVHTTP_REQ_GET ) && + if (( event.type != EVHTTP_REQ_GET ) && ( event.type != EVHTTP_REQ_DELETE )) { /* Add payload to the output buffer but only for PUT, POST and PATCH requests */ @@ -730,15 +730,15 @@ int mtcHttpUtil_api_request ( libEvent & event ) } if ( daemon_get_cfg_ptr()->debug_json ) { - if ((!string_contains(event.payload,"token")) && + if ((!string_contains(event.payload,"token")) && (!string_contains(event.payload,"assword"))) { - jlog ("%s Payload : %s\n", event.hostname.c_str(), + jlog ("%s Payload : %s\n", event.hostname.c_str(), event.payload.c_str() ); } else { - jlog ("%s Payload : ... contains private content ...\n", + jlog ("%s Payload : ... contains private content ...\n", event.hostname.c_str()); } @@ -754,7 +754,7 @@ int mtcHttpUtil_api_request ( libEvent & event ) hdrs.entry[hdr_entry].value = "admin"; hdr_entry++; - if (( event.type != EVHTTP_REQ_GET ) && + if (( event.type != EVHTTP_REQ_GET ) && ( event.type != EVHTTP_REQ_DELETE )) { hdrs.entry[hdr_entry].key = "Content-Length" ; @@ -765,7 +765,7 @@ int mtcHttpUtil_api_request ( libEvent & event ) hdrs.entry[hdr_entry].key = "User-Agent" ; hdrs.entry[hdr_entry].value = "mtce/1.0" ; hdr_entry++; - + hdrs.entry[hdr_entry].key = "Content-Type" ; hdrs.entry[hdr_entry].value = "application/json" ; hdr_entry++; @@ -800,13 +800,13 @@ int mtcHttpUtil_api_request ( libEvent & event ) event.status = FAIL_HEADER_ADD ; goto mtcHttpUtil_api_request_done ; } - + /* get some timestamps and log the request */ - snprintf (&event.req_str[0], MAX_API_LOG_LEN-1, + snprintf (&event.req_str[0], MAX_API_LOG_LEN-1, "\n%s [%5d] %s %s '%s' seq:%d -> Address : %s:%d %s %s ... %s", - pt(), getpid(), + pt(), getpid(), event.hostname.c_str(), - event.service.c_str(), + event.service.c_str(), event.operation.c_str(), event.sequence, event.ip.c_str(), event.port, getHttpCmdType_str( event.type ), @@ -842,10 +842,10 @@ int mtcHttpUtil_api_request ( libEvent & event ) event.log_prefix.append (" "); event.log_prefix.append (event.operation) ; hlog ("%s Requested (blocking) (to:%d)\n", event.log_prefix.c_str(), event.timeout); - + /* Send the message with timeout */ event_base_dispatch(event.base); - + goto mtcHttpUtil_api_request_done ; } else if (( event.request == SYSINV_UPDATE ) || @@ -865,13 +865,50 @@ int mtcHttpUtil_api_request ( libEvent & event ) { if ( event.operation.compare(SYSINV_OPER__UPDATE_UPTIME) ) { - hlog ("%s Dispatched (to:%d)\n", event.log_prefix.c_str(), event.timeout); + hlog ("%s Dispatched (to:%d)\n", event.log_prefix.c_str(), event.timeout); } - /* TODO: Set a command timer to free up the resources - * and deal with the error if the handler never runs */ + /* + * non-blocking event_base_loop can return ... + * + * 0 - command complete ; data available + * 1 - command dispatched but not complete ; no data available + * -1 - error in dispatch ; check errno + * + */ event.active = true ; - return (event_base_loop(event.base, EVLOOP_NONBLOCK)); + rc = event_base_loop(event.base, EVLOOP_NONBLOCK); + +#ifdef WANT_FIT_TESTING + string value = "" ; + if ( daemon_want_fit ( FIT_CODE__FAIL_SWACT, event.hostname, "query", value )) + { + if ( value == "-1" ) + rc = -1 ; + else + rc = atoi(value.data()); + } +#endif + if (( rc == 0 ) || // Dispatched and done with Data ready + ( rc == 1 )) // Dispatched but no response yet + { + if (( event.request == SMGR_QUERY_SWACT ) || + ( event.request == SMGR_START_SWACT )) + { + ilog ("%s dispatched%s\n", + event.log_prefix.c_str(), + rc ? "" : " ; data ready" ); + } + rc = PASS ; + } + else + { + elog ("%s command dispatch failed (%d)\n", + event.log_prefix.c_str(), errno ); + event.active = false ; + rc = FAIL_REQUEST ; + } + return (rc); } else { @@ -881,7 +918,7 @@ int mtcHttpUtil_api_request ( libEvent & event ) event.log_prefix.append (event.service) ; event.log_prefix.append (" "); event.log_prefix.append (event.operation) ; - slog ("%s Requested (blocking) (to:%d) ----------------------------------------\n", event.log_prefix.c_str(), event.timeout ); + slog ("%s Requested (blocking) (to:%d) ----------------------------------------\n", event.log_prefix.c_str(), event.timeout ); event_base_dispatch(event.base); @@ -890,10 +927,10 @@ int mtcHttpUtil_api_request ( libEvent & event ) } else { - elog ("%s Call to 'evhttp_make_request' failed (rc:%d)\n", + elog ("%s Call to 'evhttp_make_request' failed (rc:%d)\n", event.hostname.c_str(), rc); } - + return (FAIL_MAKE_REQUEST); mtcHttpUtil_api_request_done: diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcSmgrApi.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcSmgrApi.cpp index f5431f11..e511228c 100644 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcSmgrApi.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcSmgrApi.cpp @@ -58,6 +58,8 @@ void nodeLinkClass::mtcSmgrApi_handler ( struct evhttp_request *req, void *arg ) mtcSmgrApi_handler_out: + mtcHttpUtil_log_event ( smgrEvent ); + if ( smgrEvent.blocking == true ) { mtcHttpUtil_free_conn ( smgrEvent ); @@ -120,6 +122,15 @@ int nodeLinkClass::mtcSmgrApi_request ( struct nodeLinkClass::node * node_ptr, m elog ("%s failed to allocate libEvent memory (%d)\n", node_ptr->hostname.c_str(), rc ); return (rc); } + +#ifdef WANT_FIT_TESTING + string value = "" ; + if ( daemon_want_fit ( FIT_CODE__FAIL_SWACT, node_ptr->hostname, "port", value )) + { + smgrEvent.port = atoi(value.data()); + } +#endif + /* Set the common context of this new operation */ smgrEvent.status = RETRY ; smgrEvent.hostname = node_ptr->hostname ; @@ -143,7 +154,13 @@ int nodeLinkClass::mtcSmgrApi_request ( struct nodeLinkClass::node * node_ptr, m ilog ("%s sending 'query services' request to HA Service Manager\n", smgrEvent.hostname.c_str()); - return ( mtcHttpUtil_api_request ( smgrEvent )) ; + rc = mtcHttpUtil_api_request ( smgrEvent ) ; + if ( rc ) + { + elog ("%s mtcHttpUtil_api_request (rc:%d)\n", + node_ptr->hostname.c_str(), rc ); + } + return ( rc ) ; } else if ( operation == CONTROLLER_SWACT ) {