From 4f5bf78f55ec8b0983262ee351183b1edd8443ad Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 12 Mar 2021 17:10:00 -0500 Subject: [PATCH] Improve mtcAgent interrupted thread cleanup A BMC command send will be rejected if its thread is not in the IDLE state going into the call. This issue is seen to occur over a reprovisioning action while the bmc access alarmable condition exists. Maintenance will do retries. So the only visible side affect of this issue is a failure to provision to 'redfish' over a provisioning switch to 'dynamic' (learn mode). Instead ipmi is selected. The non-return to idle can occur when the bmc handler FSM is interrupted by a reprovisioning request while a bmc command is in flight. This update enhances the thread management module by introducing a thread consumption utility that is called by the bmc command send utility. If the send finds that its thread is not in the IDLE state it will either kill the thread if it is running or free a completed but-not- consumed thread result. Note: Maintenance only supports the execution of a single thread per host per process at one time. Test Plan: PASS: Verify BMC provisioning change from ipmi to dynamic while the ipmi provisioning was failing prior to re-provisioning. Verify the previous error is cleaned up and the reprovisioning request succeeds as expected. PASS: Verify thread 'execution timeout kill' cleanup handling. PASS: Verify thread 'complete but not consumed' cleanup handling. PASS: Verify logging during regression soaks Regression: PASS: Verify bmc protocol reprovisioning script soak PASS: Verify sensor monitoring following BMC reprovisioning PASS: Verify product verification mtce regression test suite Change-Id: Ie5e9e89ed2f8db6888c0fc7de03d494c75517178 Closes-Bug: 1864906 Signed-off-by: Eric MacDonald --- mtce-common/src/common/threadUtil.cpp | 44 ++++++++++++++++++++++++++- mtce-common/src/common/threadUtil.h | 1 + mtce/src/maintenance/mtcBmcUtil.cpp | 27 ++++++++++++++++ mtce/src/maintenance/mtcNodeFsm.cpp | 5 +++ 4 files changed, 76 insertions(+), 1 deletion(-) diff --git a/mtce-common/src/common/threadUtil.cpp b/mtce-common/src/common/threadUtil.cpp index 034647eb..46e650bb 100644 --- a/mtce-common/src/common/threadUtil.cpp +++ b/mtce-common/src/common/threadUtil.cpp @@ -309,6 +309,48 @@ bool thread_idle ( thread_ctrl_type & ctrl ) return (false); } +/**************************************************************************** + * + * Name : thread_done_consume + * + * Description: Return to IDLE stage. + * + ****************************************************************************/ + +int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info ) +{ + if ( ctrl.stage == THREAD_STAGE__IDLE ) + { + return PASS ; + } + else if ( ctrl.done == false ) + { + if ( info.runcount > ctrl.runcount ) + { + ilog("%s thread cleanup ; cmd:%d ; cnt:%d:%d", + info.hostname.c_str(), + info.command, + ctrl.runcount, + info.runcount); + ctrl.done = true ; + ctrl.stage = THREAD_STAGE__DONE ; + thread_handler (ctrl, info); + return PASS ; + } + else + { + thread_kill(ctrl, info); + return RETRY ; + } + } + else + { + ctrl.stage = THREAD_STAGE__DONE ; + thread_handler( ctrl, info ); + return PASS ; + } +} + /**************************************************************************** * * Name : thread_launch @@ -381,7 +423,7 @@ void thread_kill ( thread_ctrl_type & ctrl, thread_info_type & info ) ( ctrl.stage != THREAD_STAGE__WAIT ) && ( ctrl.stage != THREAD_STAGE__IDLE )) { - blog ("%s kill request\n", ctrl.hostname.c_str() ); + wlog ("%s kill request\n", ctrl.hostname.c_str() ); _stage_change ( ctrl, THREAD_STAGE__KILL ); } } diff --git a/mtce-common/src/common/threadUtil.h b/mtce-common/src/common/threadUtil.h index 552d47bb..2cbabe41 100644 --- a/mtce-common/src/common/threadUtil.h +++ b/mtce-common/src/common/threadUtil.h @@ -284,6 +284,7 @@ bool thread_done ( thread_ctrl_type & ctrl ); bool thread_idle ( thread_ctrl_type & ctrl ); void thread_kill ( thread_ctrl_type & ctrl , thread_info_type & info ); string thread_stage ( thread_ctrl_type & ctrl ); +int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info ); /* Cooperative service of cancel and exit requests from parent */ void pthread_signal_handler ( thread_info_type * info_ptr ); diff --git a/mtce/src/maintenance/mtcBmcUtil.cpp b/mtce/src/maintenance/mtcBmcUtil.cpp index 2c76a654..817db71b 100644 --- a/mtce/src/maintenance/mtcBmcUtil.cpp +++ b/mtce/src/maintenance/mtcBmcUtil.cpp @@ -39,6 +39,26 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, { int rc = PASS ; + /* handle 'kill of in-progress' thread or 'done but not consumed' thread */ + if ( ! thread_idle ( node_ptr->bmc_thread_ctrl )) + { + if ( ! thread_done ( node_ptr->bmc_thread_ctrl )) + { + thread_kill ( node_ptr->bmc_thread_ctrl, + node_ptr->bmc_thread_info ); + return (RETRY); + } + else + { + mtcTimer_reset ( node_ptr->bmc_thread_ctrl.timer ); + if ( thread_done_consume ( node_ptr->bmc_thread_ctrl, + node_ptr->bmc_thread_info ) != PASS ) + { + return (RETRY); + } + } + } + node_ptr->bmc_thread_info.command = command ; /* Update / Setup the BMC access credentials */ @@ -437,6 +457,13 @@ bmc_command_recv_cleanup: if ( rc != RETRY ) { + ilog ("%s %s recv '%s' command (%s) (rc:%d)", + node_ptr->hostname.c_str(), + node_ptr->bmc_thread_ctrl.name.c_str(), + bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str(), + bmcUtil_getProtocol_str(node_ptr->bmc_protocol).c_str(), + rc); + node_ptr->bmc_thread_ctrl.done = true ; node_ptr->bmc_thread_ctrl.retries = 0 ; node_ptr->bmc_thread_ctrl.id = 0 ; diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index af5e9a26..6138b7b3 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -63,6 +63,11 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) /* Monitor and Manage active threads */ thread_handler ( node_ptr->bmc_thread_ctrl, node_ptr->bmc_thread_info ); + if ( node_ptr->bmc_thread_ctrl.stage == THREAD_STAGE__KILL ) + { + /* do nothing while thread is being killed */ + return RETRY ; + } /* manage the host connected state and board management alarms */ nodeLinkClass::bmc_handler ( node_ptr );