From 1350502720c0a0d908a359bf80bca73ee9767831 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Thu, 19 Nov 2020 12:46:52 -0500 Subject: [PATCH] Make Mtce Power-Off FSM verify power-off If a host's BMC server accepts a power-off command without error but does not actually power-off the host, the power-off FSM reports success yet the host power is still on. This update adds a verification component to the power-off FSM. Once the power-off command is issued and succeeds at the command level, the power-off FSM will now query power status and retry the power-off command until the server is verified to be powered-off or the retry max (10) is reached and the power-off command is failed. Test Plan: PASS: Verify 200+ Mtce Power Off/On cycles (ipmi & redfish) PASS: Verify 100+ Mtce Reinstalls with FIT (ipmi & redfish) Change-Id: Iddd120d89d1152fc0b26915df123f586c38b909b Closes-Bug: 1865087 Signed-off-by: Eric MacDonald --- mtce-common/src/common/nodeBase.cpp | 4 + mtce-common/src/common/nodeBase.h | 6 +- mtce-common/src/common/nodeTimers.h | 2 +- mtce/src/maintenance/mtcNodeHdlrs.cpp | 147 ++++++++++++++++++++++++-- 4 files changed, 147 insertions(+), 12 deletions(-) diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index 7214a67d..3c7c482a 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -425,6 +425,10 @@ void mtc_stages_init ( void ) powerStages_str [MTC_POWEROFF__START ] = "Power-Off-Start"; powerStages_str [MTC_POWEROFF__REQ_SEND ] = "Power-Off-Req-Send"; powerStages_str [MTC_POWEROFF__RESP_WAIT ] = "Power-Off-Resp-Wait"; + powerStages_str [MTC_POWEROFF__OFFLINE_WAIT ] = "Power-Off-Offline-Wait"; + powerStages_str [MTC_POWEROFF__POWERQRY ] = "Power-Off-Power-Query"; + powerStages_str [MTC_POWEROFF__POWERQRY_WAIT ] = "Power-Off-Power-Query-Wait"; + powerStages_str [MTC_POWEROFF__QUEUE ] = "Power-Off-Queue"; powerStages_str [MTC_POWEROFF__DONE ] = "Power-Off-Done"; powerStages_str [MTC_POWEROFF__FAIL ] = "Power-Off-Fail"; powerStages_str [MTC_POWEROFF__FAIL_WAIT ] = "Power-Off-Fail-Wait"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 8994f570..0603c0ce 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -1102,9 +1102,11 @@ typedef enum MTC_POWEROFF__FAIL, MTC_POWEROFF__FAIL_WAIT, MTC_POWEROFF__QUEUE, - + MTC_POWEROFF__OFFLINE_WAIT, + MTC_POWEROFF__POWERQRY, + MTC_POWEROFF__POWERQRY_WAIT, MTC_POWER__DONE, /* clear power action */ - MTC_POWER__STAGES + MTC_POWER__STAGES } mtc_powerStages_enum ; /** Return the string representing the specified 'power' stage */ diff --git a/mtce-common/src/common/nodeTimers.h b/mtce-common/src/common/nodeTimers.h index aac3e4ba..f309566f 100755 --- a/mtce-common/src/common/nodeTimers.h +++ b/mtce-common/src/common/nodeTimers.h @@ -66,7 +66,7 @@ #define MTC_SWACT_POLL_TIMER (10) #define MTC_TASK_UPDATE_DELAY (30) #define MTC_BM_PING_TIMEOUT (30) -#define MTC_BM_POWEROFF_TIMEOUT (30) +#define MTC_BM_POWEROFF_TIMEOUT (60) #define MTC_BM_POWERON_TIMEOUT (30) #define MTC_RESET_PROG_TIMEOUT (20) #define MTC_WORKQUEUE_TIMEOUT (60) diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index f47cf10a..de5ae2a4 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -4311,7 +4311,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) { wlog ("%s Reinstall power query receive failed ; retry %d of %d in %d seconds (rc:%d)", node_ptr->hostname.c_str(), - node_ptr->power_action_retries, + MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries, MTC_POWER_ACTION_RETRY_COUNT, MTC_RETRY_WAIT, rc ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); @@ -4917,30 +4917,159 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } else if ( rc ) { - node_ptr->power_action_retries--; elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str()); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); - powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + + // Need to handle retries in this case since we don't + // go through the QUEUE stage. + if ( --node_ptr->power_action_retries > 0 ) + { + char buffer[255] ; + int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; + snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT); + mtcInvApi_update_task ( node_ptr, buffer); + + // The power off command can fail due to connectivity + // issue or if the server is now already powered off. + // The latter could occur if the previous power off + // command failed 'in response' but actually did end up + // powering off. In that case, if we continue to just + // retry the power off when the power is already off + // then that will just fail again since most redfish + // implementations fail rather than wave-on a power off + // request while the power is already off. In this case + // its better to switch to power query power status + // again and allow that result to put this power off + // FSM into the correct state to continue/retry the + // quest for power off. + powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); + } + else + { + powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); + } } else { - ilog ("%s is Powering Off\n", node_ptr->hostname.c_str() ); + ilog ("%s is Powering Off ; waiting for offline\n", node_ptr->hostname.c_str() ); if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL ) { mtcInvApi_update_task ( node_ptr, "Powering Off" ); } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); - powerStageChange ( node_ptr , MTC_POWEROFF__DONE ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_BM_POWEROFF_TIMEOUT ); + powerStageChange ( node_ptr , MTC_POWEROFF__OFFLINE_WAIT ); } } break ; } + case MTC_POWEROFF__OFFLINE_WAIT: + { + if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) + { + mtcTimer_reset ( node_ptr->mtcTimer ); + + plog ("%s is now offline\n", node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); + } + else if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + elog ("%s Power-Off operation timeout - host did not go offline\n", node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + break ; + } + case MTC_POWEROFF__POWERQRY: + { + if ( node_ptr->bmc_thread_ctrl.done ) + { + /* Query Host Power Status */ + if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS ) + { + elog ("%s '%s' send failed\n", + node_ptr->hostname.c_str(), + bmcUtil_getCmd_str( + node_ptr->bmc_thread_info.command).c_str()); + pingUtil_restart ( node_ptr->bm_ping_info ); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + else + { + powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT ); + } + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + } + else + { + thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ; + } + break ; + } + case MTC_POWEROFF__POWERQRY_WAIT: + { + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + int rc = bmc_command_recv ( node_ptr ) ; + if ( rc == RETRY ) + { + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + break ; + } + else if ( rc != PASS ) + { + wlog ("%s '%s' failed receive (rc:%d)", + node_ptr->hostname.c_str(), + bmcUtil_getCmd_str( + node_ptr->bmc_thread_info.command).c_str(), + rc ); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + else if ( node_ptr->bmc_thread_info.data.empty() ) + { + wlog ("%s '%s' request yielded no response data", + node_ptr->hostname.c_str(), + bmcUtil_getCmd_str( + node_ptr->bmc_thread_info.command).c_str()); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + else + { + int rc = + bmcUtil_is_power_on ( node_ptr->hostname, + node_ptr->bmc_protocol, + node_ptr->bmc_thread_info.data, + node_ptr->power_on); + if ( rc == PASS ) + { + if ( node_ptr->power_on == true ) + { + ilog ("%s Power not Off ; retry power-off ", + node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + else + { + ilog ("%s Power-Off Verified", + node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWEROFF__DONE ); + mtcTimer_reset ( node_ptr->mtcTimer ); + break ; + } + } + else + { + elog ("%s Power query failed (rc:%d)", + node_ptr->hostname.c_str(), rc ); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + } + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + } + break ; + } case MTC_POWEROFF__QUEUE: { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - node_ptr->mtcTimer.ring = false ; - if ( node_ptr->power_action_retries > 0 ) + if ( --node_ptr->power_action_retries > 0 ) { char buffer[255] ; int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;