Add a wait time between http request retries

Maintenance interfaces with sysinv, sm and the vim using http requests.
Request timeouts have an implicit delay between retries. However,
command failures and outright connection failures do not.

This has only become obvious in mtce's communication with the vim,
where there appears to be a process startup timing change that leaves
the 'vim' not yet ready to handle commands when mtcAgent starts
sending them following a platform services group startup by sm.

This update adds a 10 second http retry wait as a configuration option
to mtc.conf. The mtcAgent loads this value at startup and uses it
in a new HTTP__RETRY_WAIT state of the http request work FSM.

The number of retries remains unchanged. This update is only forcing
a minimum wait time between retries, regardless of cause.

Failure path testing was done using Fault Insertion Testing (FIT).

Test Plan:

PASS: Verify the reported issue is resolved by this update.
PASS: Verify http retry config value load on process startup.
PASS: Verify updated value is used over a process -sighup.
PASS: Verify default value if new mtc.conf config value is not found.
PASS: Verify http connection failure http retry handling.
PASS: Verify http request timeout failure retry handling.
PASS: Verify http request operation failure retry handling.

Regression:

PASS: Build and install ISO - Standard and AIO DX.
PASS: Verify http failures do not fail a lock operation.
PASS: Verify host unlock fails if its http done queue shows failures.
PASS: Verify host swact.
PASS: Verify handling of random and persistent http errors involving
      the need for retries.

Closes-Bug: 2047958
Change-Id: Icc758b0782be2a4f2882efd56f5de1a8dddea490
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2024-02-07 02:09:46 +00:00
parent 5a3a5ce8ea
commit 191c0aa6a8
9 changed files with 162 additions and 81 deletions

View File

@ -131,6 +131,11 @@
#define FIT_CODE__STOP_HOST_SERVICES (71)
#define FIT_CODE__SOCKET_SETUP (72)
#define FIT_CODE__READ_JSON_FROM_FILE (73)
#define FIT_CODE__HTTP_WORKQUEUE_OPERATION_FAILED (75)
#define FIT_CODE__HTTP_WORKQUEUE_REQUEST_TIMEOUT (76)
#define FIT_CODE__HTTP_WORKQUEUE_CONNECTION_LOSS (77)
/***************** Process Fit Codes ********************************/

View File

@ -2,10 +2,10 @@
#define __INCLUDE_HTTPUTIL_H__
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
#include <iostream> /* for ... string */
@ -93,12 +93,14 @@ typedef enum {
HTTP__RECEIVE_WAIT = 1,
HTTP__RECEIVE = 2,
HTTP__FAILURE = 3,
HTTP__DONE_FAIL = 4,
HTTP__DONE_PASS = 5,
HTTP__STAGES = 6
HTTP__RETRY_WAIT = 4,
HTTP__DONE_FAIL = 5,
HTTP__DONE_PASS = 6,
HTTP__STAGES = 7
} httpStages_enum ;
#define HTTP_RECEIVE_WAIT_MSEC (10)
#define HTTP_RETRY_WAIT_SECS (10)
typedef struct
{

View File

@ -1,10 +1,10 @@
#ifndef __INCLUDE_NODELOG_HH__
#define __INCLUDE_NODELOG_HH__
/*
* Copyright (c) 2013-2017,2023 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright (c) 2013-2017, 2023-2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
@ -116,6 +116,7 @@ typedef struct
int start_delay ; /**< startup delay, added for pmon */
int api_retries ; /**< api retries before failure */
int bmc_reset_delay ; /**< secs delay before bmc reset */
int http_retry_wait ; /**< secs to wait between http reg retries */
int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */
bool hostwd_reboot_on_err ; /**< should hostwd reboot on fault detected */
bool hostwd_kdump_on_stall ; /**< sysrq crash dump on quorum msg'ing stall */

View File

@ -1,8 +1,8 @@
/*
* Copyright (c) 2013-2020, 2023 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
@ -314,6 +314,7 @@ nodeLinkClass::nodeLinkClass()
sysinv_timeout = HTTP_SYSINV_CRIT_TIMEOUT ;
sysinv_noncrit_timeout = HTTP_SYSINV_NONC_TIMEOUT ;
work_queue_timeout = MTC_WORKQUEUE_TIMEOUT ;
http_retry_wait = HTTP_RETRY_WAIT_SECS ;
/* Init the auto recovery threshold and intervals to zero until
* modified by daemon config */

View File

@ -1866,6 +1866,9 @@ public:
* time for crashdumps to complete. */
int bmc_reset_delay ;
/** seconds to wait between http request retries */
int http_retry_wait ;
/* collectd event handler */
int collectd_notify_handler ( string & hostname,
string & resource,

View File

@ -376,6 +376,11 @@ static int mtc_config_handler ( void * user,
config_ptr->bmc_reset_delay = atoi(value);
mtcInv.bmc_reset_delay = config_ptr->bmc_reset_delay ;
}
else if (MATCH("agent", "http_retry_wait"))
{
config_ptr->http_retry_wait = atoi(value);
mtcInv.http_retry_wait = config_ptr->http_retry_wait ;
}
else if (MATCH("timeouts", "failsafe_shutdown_delay"))
{
config_ptr->failsafe_shutdown_delay = atoi(value);
@ -692,6 +697,7 @@ int daemon_configure ( void )
ilog ("TokenRefresh: %3d secs\n" , mtcInv.token_refresh_rate);
ilog ("API Retries : %3d secs\n" , mtcInv.api_retries);
ilog ("Reset Delay : %3d secs\n" , mtcInv.bmc_reset_delay);
ilog ("HTTP Retry : %3d secs\n" , mtcInv.http_retry_wait);
/* Verify loaded config against an expected mask
* as an ini file fault detection method */

View File

@ -69,6 +69,23 @@ string nodeLinkClass::mtcVimApi_state_get ( string hostname, int & http_status_c
http_status_code = HTTP_NOTFOUND ;
return ( payload );
}
#ifdef WANT_FIT_TESTING
static const char * fit_file = "/var/run/fit/mtcVimApi_state_get";
if ( daemon_want_fit ( FIT_CODE__READ_JSON_FROM_FILE, hostname, "mtcVimApi_state_get"))
{
if ( daemon_is_file_present (fit_file) )
{
payload = daemon_read_file(fit_file);
ilog("%s FIT Json: %s", hostname.c_str(), payload.c_str());
return (payload);
}
else
{
slog("%s FIT file %s not found ; aborting fit", hostname.c_str(), fit_file);
}
}
#endif
payload = ("{\"") ;
payload.append (MTC_JSON_INV_ADMIN);
payload.append ("\":\"");
@ -246,6 +263,22 @@ int nodeLinkClass::mtcVimApi_state_change ( struct nodeLinkClass::node * node_pt
node_ptr->httpReq.payload = "{\"state-change\": " ;
node_ptr->httpReq.payload.append (mtcVimApi_state_get ( node_ptr->hostname , http_status_code ));
#ifdef WANT_FIT_TESTING
static const char * fit_file = "/var/run/fit/mtcVimApi_state_change";
if ( daemon_want_fit ( FIT_CODE__READ_JSON_FROM_FILE, node_ptr->hostname, "mtcVimApi_state_change" ))
{
if ( daemon_is_file_present (fit_file) )
{
node_ptr->httpReq.payload = daemon_read_file(fit_file);
ilog("%s FIT Json: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
}
else
{
slog("%s FIT file %s not found ; aborting fit", node_ptr->hostname.c_str(), fit_file);
}
}
#endif
if (( request == VIM_HOST_FAILED ) || ( request == VIM_DPORT_FAILED ))
{
wlog ("%s %s\n", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());

View File

@ -1,8 +1,8 @@
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright (c) 2013, 2016, 2023-2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
@ -331,7 +331,6 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
syslog ( LOG_INFO, "+------+-------+--------------+---------+--------------+-----+----------------------+\n");
}
int size = node_ptr->libEvent_work_fifo.size() ;
if ( size > QUEUE_OVERLOAD )
{
@ -456,6 +455,24 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
break ;
}
}
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_OPERATION_FAILED, node_ptr->hostname, "" ))
{
ilog("%s FIT Operation Failed: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
node_ptr->thisReq.status = FAIL_AUTHENTICATION ;
rc = FAIL_OPERATION ;
}
else if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_REQUEST_TIMEOUT, node_ptr->hostname, "" ))
{
ilog("%s FIT Request Timeout Failed: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
rc = FAIL_TIMEOUT ;
}
else if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_CONNECTION_LOSS, node_ptr->hostname, "" ))
{
ilog("%s FIT Connection Loss: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
node_ptr->thisReq.status = rc = FAIL_HTTP_ZERO_STATUS ;
}
#endif
if ( rc != PASS )
{
node_ptr->libEvent_work_fifo_ptr->state =
@ -598,6 +615,18 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
node_ptr->libEvent_work_fifo_ptr->cur_retries =
node_ptr->thisReq.cur_retries ;
mtcTimer_start ( node_ptr->http_timer, mtcTimer_handler, HTTP_RETRY_WAIT_SECS );
node_ptr->libEvent_work_fifo_ptr->state =
node_ptr->thisReq.state = HTTP__RETRY_WAIT ;
dlog ("%s %d sec retry wait started", node_ptr->thisReq.log_prefix.c_str(), HTTP_RETRY_WAIT_SECS);
}
break ;
}
case HTTP__RETRY_WAIT:
{
if ( node_ptr->http_timer.ring == true )
{
dlog ("%s %d sec retry wait expired", node_ptr->thisReq.log_prefix.c_str(), HTTP_RETRY_WAIT_SECS);
node_ptr->libEvent_work_fifo_ptr->state =
node_ptr->thisReq.state = HTTP__TRANSMIT ;
}
@ -862,7 +891,6 @@ bool nodeLinkClass::workQueue_present ( libEvent & event )
}
}
}
wlog ("%s ... not found in work queue\n", event.log_prefix.c_str());
return (false);
}

View File

@ -78,6 +78,8 @@ bmc_reset_delay = 300 ; seconds to wait before issuing a bmc
; ACK reboot requests. The delay gives
; time for crashdumps to complete.
http_retry_wait = 10 ; secs to wait between http request retries
[client] ; Client Configuration
scheduling_priority = 45 ; realtime scheduling; range of 1 .. 99