From 66ba248389e8059847d134eca89b5a9bbabd4e20 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Wed, 26 Sep 2018 15:44:32 -0400 Subject: [PATCH] Mtce: Increase swact receive retry delay Maintenance intermittently fails Swact requests when it does not receive a response from SM within 500 msecs of successfully issuing the request. A recent instrumentation update verified that the HTTP request is launched properly even in the failure cases. It seems the 500 msec timeout might not be long enough to account for SM's scheduling/handling. This update increases the receive retry delay from 50 msec to 1 second. Change-Id: I29d6ba03094843a2af9d8720dd074572d76a31a4 Related-Bug: https://bugs.launchpad.net/starlingx/+bug/1791381 Signed-off-by: Eric MacDonald --- mtce/centos/build_srpm.data | 2 +- mtce/src/maintenance/mtcNodeHdlrs.cpp | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data index f8556950..a6783bd5 100644 --- a/mtce/centos/build_srpm.data +++ b/mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=139 +TIS_PATCH_VER=140 BUILD_IS_SLOW=5 diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 3f2c2253..4fe185aa 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -3418,7 +3418,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr ) #define SWACT_FAIL_THRESHOLD (3) #define SWACT_RETRY_THRESHOLD (10) #define SWACT_FAIL_MSEC_DELAY (250) -#define SWACT_RECV_MSEC_DELAY (50) +#define SWACT_RECV_RETRY_DELAY (1) #define SWACT_POLL_DELAY (10) #define SWACT_TIMEOUT_DELAY (50) @@ -3507,7 +3507,7 @@ int nodeLinkClass::swact_handler ( struct nodeLinkClass::node * node_ptr ) * delay a bit and check for the response */ nodeLinkClass::smgrEvent.cur_retries = 0 ; nodeLinkClass::smgrEvent.fails = 0 ; - mtcTimer_start_msec ( 
node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY ); + mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_RETRY_DELAY ); node_ptr->swactStage = MTC_SWACT__QUERY_RECV ; } } @@ -3530,7 +3530,11 @@ int nodeLinkClass::swact_handler ( struct nodeLinkClass::node * node_ptr ) } else { - mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY ); + wlog ("%s Swact Query Request Receive Retry (%d of %d)\n", + node_ptr->hostname.c_str(), + nodeLinkClass::smgrEvent.cur_retries, + SWACT_RETRY_THRESHOLD); + mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_RETRY_DELAY ); break ; } } @@ -3605,7 +3609,7 @@ int nodeLinkClass::swact_handler ( struct nodeLinkClass::node * node_ptr ) nodeLinkClass::smgrEvent.status = PASS ; nodeLinkClass::smgrEvent.fails = 0 ; nodeLinkClass::smgrEvent.cur_retries = 0 ; - mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY ); + mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_RETRY_DELAY ); node_ptr->swactStage = MTC_SWACT__SWACT_RECV ; } break ; @@ -3627,7 +3631,11 @@ int nodeLinkClass::swact_handler ( struct nodeLinkClass::node * node_ptr ) } else { - mtcTimer_start_msec ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_MSEC_DELAY ); + wlog ("%s Swact Request Receive Retry (%d of %d)\n", + node_ptr->hostname.c_str(), + nodeLinkClass::smgrEvent.cur_retries, + SWACT_RETRY_THRESHOLD ); + mtcTimer_start ( node_ptr->mtcSwact_timer, mtcTimer_handler, SWACT_RECV_RETRY_DELAY ); break ; } }