From 191c0aa6a8618b5c3530fabc2733da985eb3acc3 Mon Sep 17 00:00:00 2001
From: Eric MacDonald <eric.macdonald@windriver.com>
Date: Wed, 7 Feb 2024 02:09:46 +0000
Subject: [PATCH] Add a wait time between http request retries

Maintenance interfaces with sysinv, sm and the vim using http requests.
Request timeouts have an implicit delay between retries. However,
command failures or outright connection failures don't.

This has only become obvious in mtce's communication with the vim,
where an apparent process startup timing change leaves the vim not
yet ready to handle commands when mtcAgent starts sending them
following a platform services group startup by sm.

This update adds a 10 second http retry wait as a configuration option
to mtc.conf. The mtcAgent loads this value at startup and uses it
in a new HTTP__RETRY_WAIT state of the http request work FSM.

The number of retries remains unchanged. This update is only forcing
a minimum wait time between retries, regardless of cause.

Failure path testing was done using Fault Insertion Testing (FIT).

Test Plan:

PASS: Verify the reported issue is resolved by this update.
PASS: Verify http retry config value load on process startup.
PASS: Verify updated value is used after a process SIGHUP.
PASS: Verify default value if new mtc.conf config value is not found.
PASS: Verify http connection failure http retry handling.
PASS: Verify http request timeout failure retry handling.
PASS: Verify http request operation failure retry handling.

Regression:

PASS: Build and install ISO - Standard and AIO DX.
PASS: Verify http failures do not fail a lock operation.
PASS: Verify host unlock fails if its http done queue shows failures.
PASS: Verify host swact.
PASS: Verify handling of random and persistent http errors involving
      the need for retries.

Closes-Bug: 2047958
Change-Id: Icc758b0782be2a4f2882efd56f5de1a8dddea490
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
---
 mtce-common/src/common/fitCodes.h     |   5 +
 mtce-common/src/common/httpUtil.h     |  18 +--
 mtce-common/src/common/logMacros.h    |   9 +-
 mtce/src/common/nodeClass.cpp         |   7 +-
 mtce/src/common/nodeClass.h           |   3 +
 mtce/src/maintenance/mtcNodeCtrl.cpp  |   6 +
 mtce/src/maintenance/mtcVimApi.cpp    |  33 ++++++
 mtce/src/maintenance/mtcWorkQueue.cpp | 160 +++++++++++++++-----------
 mtce/src/scripts/mtc.conf             |   2 +
 9 files changed, 162 insertions(+), 81 deletions(-)
diff --git a/mtce-common/src/common/fitCodes.h b/mtce-common/src/common/fitCodes.h
index cfbe7119..023f3483 100644
--- a/mtce-common/src/common/fitCodes.h
+++ b/mtce-common/src/common/fitCodes.h
@@ -131,6 +131,11 @@
 #define FIT_CODE__STOP_HOST_SERVICES                 (71)
 
 #define FIT_CODE__SOCKET_SETUP                       (72)
+#define FIT_CODE__READ_JSON_FROM_FILE                (73)
+
+#define FIT_CODE__HTTP_WORKQUEUE_OPERATION_FAILED    (75)
+#define FIT_CODE__HTTP_WORKQUEUE_REQUEST_TIMEOUT     (76)
+#define FIT_CODE__HTTP_WORKQUEUE_CONNECTION_LOSS     (77)
 
 /*****************      Process Fit Codes     ********************************/
 
diff --git a/mtce-common/src/common/httpUtil.h b/mtce-common/src/common/httpUtil.h
index c1956c3f..27f4724b 100644
--- a/mtce-common/src/common/httpUtil.h
+++ b/mtce-common/src/common/httpUtil.h
@@ -2,10 +2,10 @@
 #define __INCLUDE_HTTPUTIL_H__
 
 /*
- * Copyright (c) 2013, 2016 Wind River Systems, Inc.
-*
-* SPDX-License-Identifier: Apache-2.0
-*
+ * Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
  */
 
 #include <iostream>         /* for ... string               */
@@ -93,12 +93,14 @@ typedef enum {
    HTTP__RECEIVE_WAIT  = 1,
    HTTP__RECEIVE       = 2,
    HTTP__FAILURE       = 3,
-   HTTP__DONE_FAIL     = 4,
-   HTTP__DONE_PASS     = 5,
-   HTTP__STAGES        = 6
+   HTTP__RETRY_WAIT    = 4,
+   HTTP__DONE_FAIL     = 5,
+   HTTP__DONE_PASS     = 6,
+   HTTP__STAGES        = 7
 }  httpStages_enum ;
 
 #define HTTP_RECEIVE_WAIT_MSEC (10)
+#define HTTP_RETRY_WAIT_SECS   (10)
 
 typedef struct
 {
@@ -142,7 +144,7 @@ typedef enum {
 
     SYSINV_CONFIG_SHOW,
     SYSINV_CONFIG_MODIFY,
-    
+
     SYSINV_SENSOR_LOAD,
     SYSINV_SENSOR_LOAD_GROUPS,
     SYSINV_SENSOR_LOAD_GROUP,
diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h
index b6c758e6..55f6d293 100644
--- a/mtce-common/src/common/logMacros.h
+++ b/mtce-common/src/common/logMacros.h
@@ -1,10 +1,10 @@
 #ifndef __INCLUDE_NODELOG_HH__
 #define __INCLUDE_NODELOG_HH__
 /*
- * Copyright (c) 2013-2017,2023 Wind River Systems, Inc.
-*
-* SPDX-License-Identifier: Apache-2.0
-*
+ * Copyright (c) 2013-2017, 2023-2024 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
  */
 
  /**
@@ -116,6 +116,7 @@ typedef struct
     int   start_delay           ; /**< startup delay, added for pmon          */
     int   api_retries           ; /**< api retries before failure             */
     int   bmc_reset_delay       ; /**< secs delay before bmc reset            */
+    int   http_retry_wait       ; /**< secs to wait between http req retries  */
     int   hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */
     bool  hostwd_reboot_on_err  ; /**< should hostwd reboot on fault detected */
     bool  hostwd_kdump_on_stall ; /**< sysrq crash dump on quorum msg'ing stall */
diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp
index bb97253d..7a4ce958 100755
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -1,8 +1,8 @@
 /*
  * Copyright (c) 2013-2020, 2023 Wind River Systems, Inc.
-*
-* SPDX-License-Identifier: Apache-2.0
-*
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
  */
 
  /**
@@ -314,6 +314,7 @@ nodeLinkClass::nodeLinkClass()
     sysinv_timeout               = HTTP_SYSINV_CRIT_TIMEOUT ;
     sysinv_noncrit_timeout       = HTTP_SYSINV_NONC_TIMEOUT ;
     work_queue_timeout           = MTC_WORKQUEUE_TIMEOUT    ;
+    http_retry_wait              = HTTP_RETRY_WAIT_SECS     ;
 
     /* Init the auto recovery threshold and intervals to zero until
      * modified by daemon config */
diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h
index 514b25ef..1cd03813 100755
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@@ -1866,6 +1866,9 @@ public:
      * time for crashdumps to complete. */
     int bmc_reset_delay ;
 
+    /** seconds to wait between http request retries */
+    int http_retry_wait ;
+
     /* collectd event handler */
     int collectd_notify_handler ( string & hostname,
                                   string & resource,
diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp
index f18efc23..e5bdf25e 100644
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@@ -376,6 +376,11 @@ static int mtc_config_handler ( void * user,
         config_ptr->bmc_reset_delay = atoi(value);
         mtcInv.bmc_reset_delay = config_ptr->bmc_reset_delay ;
     }
+    else if (MATCH("agent", "http_retry_wait"))
+    {
+        config_ptr->http_retry_wait = atoi(value);
+        mtcInv.http_retry_wait = config_ptr->http_retry_wait ;
+    }
     else if (MATCH("timeouts", "failsafe_shutdown_delay"))
     {
         config_ptr->failsafe_shutdown_delay = atoi(value);
@@ -692,6 +697,7 @@ int daemon_configure ( void )
     ilog ("TokenRefresh: %3d secs\n" , mtcInv.token_refresh_rate);
     ilog ("API Retries : %3d secs\n" , mtcInv.api_retries);
     ilog ("Reset Delay : %3d secs\n" , mtcInv.bmc_reset_delay);
+    ilog ("HTTP Retry  : %3d secs\n" , mtcInv.http_retry_wait);
 
     /* Verify loaded config against an expected mask
      * as an ini file fault detection method */
diff --git a/mtce/src/maintenance/mtcVimApi.cpp b/mtce/src/maintenance/mtcVimApi.cpp
index a5169d14..fe58f1ab 100644
--- a/mtce/src/maintenance/mtcVimApi.cpp
+++ b/mtce/src/maintenance/mtcVimApi.cpp
@@ -69,6 +69,23 @@ string nodeLinkClass::mtcVimApi_state_get ( string hostname, int & http_status_c
         http_status_code = HTTP_NOTFOUND ;
         return ( payload );
     }
+    #ifdef WANT_FIT_TESTING
+    static const char * fit_file = "/var/run/fit/mtcVimApi_state_get";
+    if ( daemon_want_fit ( FIT_CODE__READ_JSON_FROM_FILE, hostname, "mtcVimApi_state_get"))
+    {
+        if ( daemon_is_file_present (fit_file) )
+        {
+            payload = daemon_read_file(fit_file);
+            ilog("%s FIT Json: %s", hostname.c_str(), payload.c_str());
+            return (payload);
+        }
+        else
+        {
+            slog("%s FIT file %s not found ; aborting fit", hostname.c_str(), fit_file);
+        }
+    }
+    #endif
+
     payload = ("{\"") ;
     payload.append (MTC_JSON_INV_ADMIN);
     payload.append ("\":\"");
@@ -246,6 +263,22 @@ int nodeLinkClass::mtcVimApi_state_change ( struct nodeLinkClass::node * node_pt
     node_ptr->httpReq.payload = "{\"state-change\": " ;
     node_ptr->httpReq.payload.append (mtcVimApi_state_get ( node_ptr->hostname , http_status_code ));
 
+    #ifdef WANT_FIT_TESTING
+    static const char * fit_file = "/var/run/fit/mtcVimApi_state_change";
+    if ( daemon_want_fit ( FIT_CODE__READ_JSON_FROM_FILE, node_ptr->hostname, "mtcVimApi_state_change" ))
+    {
+        if ( daemon_is_file_present (fit_file) )
+        {
+            node_ptr->httpReq.payload = daemon_read_file(fit_file);
+            ilog("%s FIT Json: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
+        }
+        else
+        {
+            slog("%s FIT file %s not found ; aborting fit", node_ptr->hostname.c_str(), fit_file);
+        }
+    }
+    #endif
+
     if (( request == VIM_HOST_FAILED ) || ( request == VIM_DPORT_FAILED ))
     {
         wlog ("%s %s\n", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
diff --git a/mtce/src/maintenance/mtcWorkQueue.cpp b/mtce/src/maintenance/mtcWorkQueue.cpp
index 3e74e3ea..2ef0a8c7 100644
--- a/mtce/src/maintenance/mtcWorkQueue.cpp
+++ b/mtce/src/maintenance/mtcWorkQueue.cpp
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2013, 2016 Wind River Systems, Inc.
-*
-* SPDX-License-Identifier: Apache-2.0
-*
+ * Copyright (c) 2013, 2016, 2023-2024 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
  */
 
 /**
@@ -38,7 +38,7 @@ string _get_work_state_str ( httpStages_enum state )
     else if ( state == HTTP__RECEIVE  ) return ("  Rx");
     else if ( state == HTTP__FAILURE  ) return (" Er ");
     else if ( state == HTTP__RECEIVE_WAIT  ) return ("Wait");
-    else 
+    else
     {
         elog ("Invalid Http Work Queue State: %d\n", state );
         return ("----");
@@ -58,7 +58,7 @@ void nodeLinkClass::workQueue_dump ( struct nodeLinkClass::node * node_ptr )
               node_ptr->libEvent_work_fifo_ptr != node_ptr->libEvent_work_fifo.end();
               node_ptr->libEvent_work_fifo_ptr ++ )
         {
-            syslog ( LOG_INFO, "| %-4s | %5d | %-12s | %-7s | %-13s | %15s:%d | %s\n", 
+            syslog ( LOG_INFO, "| %-4s | %5d | %-12s | %-7s | %-13s | %15s:%d | %s\n",
                 _get_work_state_str(node_ptr->libEvent_work_fifo_ptr->state).c_str(),
                 node_ptr->libEvent_work_fifo_ptr->sequence,
                 node_ptr->libEvent_work_fifo_ptr->hostname.c_str(),
@@ -101,11 +101,11 @@ void nodeLinkClass::doneQueue_dump ( struct nodeLinkClass::node * node_ptr )
               node_ptr->libEvent_done_fifo_ptr != node_ptr->libEvent_done_fifo.end();
               node_ptr->libEvent_done_fifo_ptr ++ )
         {
-            syslog ( LOG_INFO, "%15s httpReq doneQueue:%5d - %s '%s' -> Status:%d\n", 
-                         node_ptr->libEvent_done_fifo_ptr->hostname.c_str(), 
-                         node_ptr->libEvent_done_fifo_ptr->sequence, 
-                         node_ptr->libEvent_done_fifo_ptr->service.c_str(), 
-                         node_ptr->libEvent_done_fifo_ptr->operation.c_str(), 
+            syslog ( LOG_INFO, "%15s httpReq doneQueue:%5d - %s '%s' -> Status:%d\n",
+                         node_ptr->libEvent_done_fifo_ptr->hostname.c_str(),
+                         node_ptr->libEvent_done_fifo_ptr->sequence,
+                         node_ptr->libEvent_done_fifo_ptr->service.c_str(),
+                         node_ptr->libEvent_done_fifo_ptr->operation.c_str(),
                          node_ptr->libEvent_done_fifo_ptr->status );
         }
     }
@@ -240,9 +240,9 @@ int nodeLinkClass::doneQueue_dequeue ( libEvent & event )
  * Description: This is a Per Host Finite State Machine (FSM) that
  *              processes the work queue for the supplied host's
  *              node pointer.
- *              
+ *
  * Constructs:
- * 
+ *
  * node_ptr->libEvent_work_fifo - the current work queue/fifo
  * node_ptr->libEvent_done_fifo - queue/fifo of completed requests
  *
@@ -255,17 +255,17 @@ int nodeLinkClass::doneQueue_dequeue ( libEvent & event )
  *
  * In process libEvents are copied from the callers work queue to
  * its thisReq.
- * 
+ *
  * Completed events including execution status are copied to the host's
  * done fifo.
- * 
- * Failed events may be retried up to max_retries as specified by 
+ *
+ * Failed events may be retried up to max_retries as specified by
  * the callers libEvent.
  *
  * @param event is a reference to the callers libEvent.
  *
  * @return an integer with values of PASS, FAIL, RETRY
- *  
+ *
  * ************************************************************************/
 
 int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
@@ -280,18 +280,18 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
          * responses */
          if ( node_ptr->libEvent_done_fifo.size() > 10 )
          {
-             qlog ("%s Done Queue has %ld elements\n", 
+             qlog ("%s Done Queue has %ld elements\n",
                        node_ptr->hostname.c_str(),
                        node_ptr->libEvent_done_fifo.size());
 
              /* TODO: look at the status of the commands and print a log of those that failed */
-             
+
              /* Remove the first 8 - its a fifo the first ones at the front are the oldest */
              for ( int i=0 ; i < 8 ; i++ )
              {
                  node_ptr->libEvent_done_fifo.pop_front();
              }
-             qlog ("%s Done Queue has %ld elements remaining\n", 
+             qlog ("%s Done Queue has %ld elements remaining\n",
                        node_ptr->hostname.c_str(),
                        node_ptr->libEvent_done_fifo.size());
          }
@@ -299,8 +299,8 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
 
     if ( node_ptr->libEvent_work_fifo.empty() )
     {
-        // qlog_throttled ( node_ptr->no_work_log_throttle, 300, 
-        //                  "%s Idle ... \n", 
+        // qlog_throttled ( node_ptr->no_work_log_throttle, 300,
+        //                  "%s Idle ... \n",
         //                  node_ptr->hostname.c_str());
         node_ptr->no_work_log_throttle = 0 ;
         return (PASS);
@@ -317,7 +317,7 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
               node_ptr->libEvent_work_fifo_ptr != node_ptr->libEvent_work_fifo.end();
               node_ptr->libEvent_work_fifo_ptr ++ )
         {
-            syslog ( LOG_INFO, "| %-4s | %5d | %-12s | %-7s | %-13s | %3d | %15s:%d | %s\n", 
+            syslog ( LOG_INFO, "| %-4s | %5d | %-12s | %-7s | %-13s | %3d | %15s:%d | %s\n",
                 _get_work_state_str(node_ptr->libEvent_work_fifo_ptr->state).c_str(),
                 node_ptr->libEvent_work_fifo_ptr->sequence,
                 node_ptr->libEvent_work_fifo_ptr->hostname.c_str(),
@@ -331,7 +331,6 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
         syslog ( LOG_INFO, "+------+-------+--------------+---------+--------------+-----+----------------------+\n");
     }
 
-   
     int size = node_ptr->libEvent_work_fifo.size() ;
     if ( size > QUEUE_OVERLOAD )
     {
@@ -354,18 +353,18 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
         case HTTP__TRANSMIT:
         {
             node_ptr->thisReq = node_ptr->libEvent_work_fifo.front();
-            
+
             qlog ("%s Transmitted\n", node_ptr->thisReq.log_prefix.c_str() );
 
             rc = mtcHttpUtil_api_request ( node_ptr->thisReq ) ;
             if ( rc )
             {
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                 node_ptr->thisReq.state = HTTP__FAILURE ;
             }
             else
             {
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                 node_ptr->thisReq.state = HTTP__RECEIVE_WAIT ;
 
                 if ( node_ptr->http_timer.tid )
@@ -374,7 +373,7 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                 if ( rc != PASS )
                 {
                     elog ("%s failed to start http command timer ; failing command\n", node_ptr->thisReq.log_prefix.c_str());
-                    node_ptr->libEvent_work_fifo_ptr->state = 
+                    node_ptr->libEvent_work_fifo_ptr->state =
                     node_ptr->thisReq.state = HTTP__FAILURE ;
                 }
             }
@@ -408,20 +407,20 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                 slog ("%s has unexpected null HTTP request base pointer\n",
                           node_ptr->thisReq.log_prefix.c_str());
 
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                 node_ptr->thisReq.state = HTTP__FAILURE ;
                 break ;
             }
-            
+
             int msec_timeout = (node_ptr->thisReq.timeout*1000);
             int wait_time = (++node_ptr->thisReq.rx_retry_cnt)*HTTP_RECEIVE_WAIT_MSEC ;
 
             rc = mtcHttpUtil_receive ( node_ptr->thisReq );
             if ( rc == RETRY )
             {
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                 node_ptr->thisReq.state = HTTP__RECEIVE_WAIT ;
-                mtcTimer_start_msec ( node_ptr->http_timer, mtcTimer_handler, HTTP_RECEIVE_WAIT_MSEC ); 
+                mtcTimer_start_msec ( node_ptr->http_timer, mtcTimer_handler, HTTP_RECEIVE_WAIT_MSEC );
 
                 if ((wait_time > (msec_timeout/4)) && ( node_ptr->thisReq.low_wm == false ) )
                 {
@@ -449,48 +448,66 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                     /* Only print every 16 starting with 2 */
                     if ( (node_ptr->thisReq.rx_retry_cnt & 0xF) == 2 )
                     {
-                        qlog ("%s rx_retry_cnt:%d\n", 
+                        qlog ("%s rx_retry_cnt:%d\n",
                                   node_ptr->thisReq.log_prefix.c_str(),
                                   node_ptr->thisReq.rx_retry_cnt );
                     }
                     break ;
                 }
             }
+            #ifdef WANT_FIT_TESTING
+            if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_OPERATION_FAILED, node_ptr->hostname, "" ))
+            {
+               ilog("%s FIT Operation Failed: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
+               node_ptr->thisReq.status = FAIL_AUTHENTICATION ;
+               rc = FAIL_OPERATION ;
+            }
+            else if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_REQUEST_TIMEOUT, node_ptr->hostname, "" ))
+            {
+               ilog("%s FIT Request Timeout Failed: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
+               rc = FAIL_TIMEOUT ;
+            }
+            else if ( daemon_want_fit ( FIT_CODE__HTTP_WORKQUEUE_CONNECTION_LOSS, node_ptr->hostname, "" ))
+            {
+               ilog("%s FIT Connection Loss: %s", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str());
+               node_ptr->thisReq.status = rc = FAIL_HTTP_ZERO_STATUS ;
+            }
+            #endif
             if ( rc != PASS )
             {
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                node_ptr->libEvent_work_fifo_ptr->state =
                 node_ptr->thisReq.state = HTTP__FAILURE ;
             }
             else
             {
                 if ( node_ptr->thisReq.cur_retries )
                 {
-                    ilog ("%s Completed (after %d retries) (took %d of %d msecs)\n", 
-                              node_ptr->thisReq.log_prefix.c_str(), 
-                              node_ptr->thisReq.cur_retries, wait_time, 
+                    ilog ("%s Completed (after %d retries) (took %d of %d msecs)\n",
+                              node_ptr->thisReq.log_prefix.c_str(),
+                              node_ptr->thisReq.cur_retries, wait_time,
                               node_ptr->thisReq.timeout*1000);
                 }
                 else
                 {
-                    qlog ("%s Completed (took %d of %d msecs)\n", 
-                              node_ptr->thisReq.log_prefix.c_str(), 
-                              wait_time, 
+                    qlog ("%s Completed (took %d of %d msecs)\n",
+                              node_ptr->thisReq.log_prefix.c_str(),
+                              wait_time,
                               node_ptr->thisReq.timeout*1000);
                 }
                 node_ptr->thisReq.exec_time_msec = wait_time ;
 
                 node_ptr->thisReq.rx_retry_cnt = 0 ;
-                
+
                 mtcHttpUtil_free_conn ( node_ptr->thisReq );
                 mtcHttpUtil_free_base ( node_ptr->thisReq );
 
-                /* Don't add success responses to non-critical commands like  
+                /* Don't add success responses to non-critical commands like
                  * "update uptime" and "update task" to the done queue */
                 if ( !node_ptr->thisReq.noncritical )
                 {
                     /* Copy done event to the done queue */
                     node_ptr->libEvent_done_fifo.push_back(node_ptr->thisReq);
-            
+
                 }
                 /* Pop that done event off the work queue */
                 node_ptr->libEvent_work_fifo.pop_front();
@@ -503,21 +520,21 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
 
             mtcHttpUtil_free_conn ( node_ptr->thisReq );
             mtcHttpUtil_free_base ( node_ptr->thisReq );
-                
+
             node_ptr->http_retries_cur++ ;
             node_ptr->thisReq.cur_retries++ ;
 
-            if ( node_ptr->thisReq.noncritical == true ) 
-            { 
+            if ( node_ptr->thisReq.noncritical == true )
+            {
                 if ( node_ptr->thisReq.cur_retries > node_ptr->thisReq.max_retries )
                 {
                     node_ptr->oper_failures++ ;
 
                     wlog ("%s retry conjestion abort of non-critical command (%d:%d)\n",
-                              node_ptr->thisReq.log_prefix.c_str(), 
+                              node_ptr->thisReq.log_prefix.c_str(),
                               node_ptr->thisReq.cur_retries,
                               node_ptr->thisReq.max_retries );
-                
+
                     /* Pop this aborted event off the work queue */
                     node_ptr->libEvent_work_fifo.pop_front();
                 }
@@ -561,7 +578,7 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                           node_ptr->thisReq.max_retries,
                           node_ptr->thisReq.timeout,
                           node_ptr->thisReq.noncritical ? "No" : "Yes" );
-                
+
                 node_ptr->thisReq.response.clear();
 
                 node_ptr->thisReq.status      = PASS  ;
@@ -569,10 +586,10 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                 node_ptr->thisReq.active      = false ;
                 node_ptr->thisReq.response_len= 0     ;
 
-                /* 
+                /*
                  * If this is an inventory request ...
                  *
-                 * 1. Init the inv struct 
+                 * 1. Init the inv struct
                  * 2. increase the timeout if is a critical command
                  *
                  * */
@@ -583,30 +600,42 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
                     {
                         int temp = node_ptr->libEvent_work_fifo_ptr->timeout ;
 
-                        /* 
+                        /*
                          * Increase and update the timeout value for critical commands
                          * in hope that it will succeed on he next go around.
                          */
                         node_ptr->libEvent_work_fifo_ptr->timeout += get_mtcInv_ptr()->sysinv_timeout ;
-                        dlog ("%s timeout extended from %d to %d secs\n", 
+                        dlog ("%s timeout extended from %d to %d secs\n",
                                   node_ptr->thisReq.log_prefix.c_str(), temp,
                                   node_ptr->libEvent_work_fifo_ptr->timeout );
                     }
                 }
 
                 /* Save the retry count */
-                node_ptr->libEvent_work_fifo_ptr->cur_retries = 
+                node_ptr->libEvent_work_fifo_ptr->cur_retries =
                 node_ptr->thisReq.cur_retries ;
 
-                node_ptr->libEvent_work_fifo_ptr->state = 
+                mtcTimer_start ( node_ptr->http_timer, mtcTimer_handler, HTTP_RETRY_WAIT_SECS );
+                node_ptr->libEvent_work_fifo_ptr->state =
+                node_ptr->thisReq.state = HTTP__RETRY_WAIT ;
+                dlog ("%s %d sec retry wait started", node_ptr->thisReq.log_prefix.c_str(), HTTP_RETRY_WAIT_SECS);
+            }
+            break ;
+        }
+        case HTTP__RETRY_WAIT:
+        {
+            if ( node_ptr->http_timer.ring == true )
+            {
+                dlog ("%s %d sec retry wait expired", node_ptr->thisReq.log_prefix.c_str(), HTTP_RETRY_WAIT_SECS);
+                node_ptr->libEvent_work_fifo_ptr->state =
                 node_ptr->thisReq.state = HTTP__TRANSMIT ;
             }
             break ;
         }
         default:
         {
-            slog ("%s Bad libEvent work state (%d) ; clearing work/done queue\n", 
-                      node_ptr->hostname.c_str(), 
+            slog ("%s Bad libEvent work state (%d) ; clearing work/done queue\n",
+                      node_ptr->hostname.c_str(),
                       node_ptr->libEvent_work_fifo_ptr->state );
             node_ptr->libEvent_work_fifo.clear();
             node_ptr->libEvent_done_fifo.clear();
@@ -623,7 +652,7 @@ int nodeLinkClass::workQueue_process ( struct nodeLinkClass::node * node_ptr )
  * Description: To handle the pathalogical case where an event seems to
  *              have timed out at the callers level then this interface
  *              can be called to delete it from the work queue.
- *              
+ *
  * @param node_ptr so that the hosts work queue can be found
  * @param sequence to specify the specific sequence number to remove
  * @return always PASS since there is nothing the caller can or needs
@@ -660,7 +689,7 @@ int nodeLinkClass::workQueue_del_cmd ( struct nodeLinkClass::node * node_ptr, in
  *
  * Description: Removes all items from the done queue.
  *
- * Returns a failure, the sequence number of the first command 
+ * Returns a failure, the sequence number of the first command
  * in the done queue that did not PASS.
  *
  */
@@ -717,7 +746,7 @@ int nodeLinkClass::doneQueue_purge ( struct nodeLinkClass::node * node_ptr )
         {
             qlog ("%s all (%d) priority queued operations passed (qlog)\n", node_ptr->hostname.c_str(), size );
         }
-        
+
         qlog ("%s purging %d items from doneQueue\n", node_ptr->hostname.c_str(), size );
         node_ptr->libEvent_done_fifo.clear();
     }
@@ -738,7 +767,7 @@ int nodeLinkClass::workQueue_purge ( struct nodeLinkClass::node * node_ptr )
     {
         /* TODO: find out how to force close a connection.
          * Don't free the connection if it is in the receiving state or
-         * we might get a segfault 
+         * we might get a segfault
          * There is only ever one connection open at a time for a specific host
          * so its only 'thisReq' we need to worry about. */
         if ( node_ptr->libEvent_work_fifo_ptr->state != HTTP__RECEIVE )
@@ -754,12 +783,12 @@ int nodeLinkClass::workQueue_purge ( struct nodeLinkClass::node * node_ptr )
         {
             if ( node_ptr->libEvent_work_fifo_ptr->state == HTTP__TRANSMIT )
             {
-                wlog ("%s ... was not executed\n", 
+                wlog ("%s ... was not executed\n",
                            node_ptr->libEvent_work_fifo_ptr->log_prefix.c_str());
             }
             else
             {
-                wlog ("%s ... did not complete (%s)\n", 
+                wlog ("%s ... did not complete (%s)\n",
                            node_ptr->libEvent_work_fifo_ptr->log_prefix.c_str(),
                            _get_work_state_str(node_ptr->libEvent_work_fifo_ptr->state).c_str());
             }
@@ -771,7 +800,7 @@ int nodeLinkClass::workQueue_purge ( struct nodeLinkClass::node * node_ptr )
     {
         qlog ("%s all work done\n", node_ptr->hostname.c_str());
     }
-    
+
     // node_ptr->libEvent_work_fifo_ptr->state = HTTP__TRANSMIT ;
     return (PASS);
 }
@@ -793,7 +822,7 @@ int nodeLinkClass::workQueue_done ( struct nodeLinkClass::node * node_ptr )
               node_ptr->libEvent_work_fifo_ptr++ )
         {
             /* Don't report work queue timeout if there are only noncritical
-             * commands left in the work queue. Such commands might be 
+             * commands left in the work queue. Such commands might be
              * "update uptime" and "update task" */
             if ( !node_ptr->libEvent_work_fifo_ptr->noncritical )
             {
@@ -862,7 +891,6 @@ bool nodeLinkClass::workQueue_present ( libEvent & event )
             }
         }
     }
-    
     wlog ("%s ... not found in work queue\n", event.log_prefix.c_str());
     return (false);
 }
diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf
index 74b56206..53f252b0 100644
--- a/mtce/src/scripts/mtc.conf
+++ b/mtce/src/scripts/mtc.conf
@@ -78,6 +78,8 @@ bmc_reset_delay = 300        ; seconds to wait before issuing a bmc
                              ; ACK reboot requests. The delay gives
                              ; time for crashdumps to complete.
 
+http_retry_wait = 10         ; secs to wait between http request retries
+
 [client]                     ; Client Configuration
 
 scheduling_priority = 45     ; realtime scheduling; range of 1 .. 99