From 50dc29f6c025de0b9dfea3196cf3bedff8c36908 Mon Sep 17 00:00:00 2001 From: Eric Macdonald Date: Mon, 18 Sep 2023 18:48:56 +0000 Subject: [PATCH] Improve maintenance power/reset control command retry handling This update improves on and drives consistency into the maintenance power on/off and reset handling in terms of retries and use of graceful and immediate commands. This update maintains the 10 retries for both power-on and power-off commands and increases the number of retries for the reset command from 5 to 10 to line up with the power operation commands. This update also ensures that the first 5 retries are done with the graceful action command while the last 5 are done with the immediate command. This update also removes a power-on handling case that could have led to a stuck state. This case was virtually impossible to hit based on the required sequence of intermittent command failures but that scenario handling was fixed up anyway. Issues have been seen with the power-off handling on some servers. It is suspected that those servers need more time to power-off. So, this update introduces a 30 second delay following a power-off command before issuing the power status query to give the server some time to power-off before retrying the power-off command. 
Test Plan: Both IPMI and Redfish PASS: Verify power on/off and reset handling support up to 10 retries PASS: Verify graceful command is used for the first power on/off or reset try and the first 5 retries PASS: Verify immediate command is used for the final 5 retries PASS: Verify reset handling with/without retries (none/mid/max) PASS: Verify power-on handling with/without retries (none/mid/max) PASS: Verify power-off handling with/without retries (none/mid/max) PASS: Verify power status command failure handling for power on/off NOTE: FIT (fault insertion testing) was used to create retry scenarios PASS: Verify power-off inter retry delay feature PASS: Verify 30 second power-off to power query delay PASS: Verify redfish power/reset commands used are logged by default PASS: Verify power-off/on and reset logging Regression: PASS: verify power-on/off and reset handling without retries PASS: Verify power-off handling when power is already off PASS: Verify power-on handling when power is already on Closes-Bug: 2031945 Signed-off-by: Eric Macdonald Change-Id: Ie39326bcb205702df48ff9dd090f461c7110dd36 --- mtce-common/src/common/nodeBase.h | 12 +- mtce-common/src/common/nodeTimers.h | 11 +- mtce/src/common/nodeClass.cpp | 1 + mtce/src/common/nodeClass.h | 3 + mtce/src/maintenance/mtcBmcUtil.cpp | 55 ++++-- mtce/src/maintenance/mtcNodeHdlrs.cpp | 241 +++++++++++--------------- mtce/src/maintenance/mtcThreads.cpp | 10 +- 7 files changed, 164 insertions(+), 169 deletions(-) diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 3dabad0d..b6493a36 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -187,6 +187,7 @@ typedef enum #define DEFAULT_GOENABLE_TIMEOUT (300) #define DEFAULT_DOR_MODE_TIMEOUT (20) #define DEFAULT_DOR_MODE_AIO_TIMEOUT (600) +#define DEFAULT_POWER_OFF_RETRY_WAIT (30) /** TODO: Convert names to omit JSON part */ #define MTC_JSON_INV_LABEL "ihosts" @@ -323,9 +324,14 @@ typedef enum 
#define COMMAND_RETRY_DELAY (8) /* from sshUtil.h */ #define COMMAND_DELAY (2) /* from sshUtil.h */ -#define MTC_POWER_ACTION_RETRY_DELAY (20) -#define MTC_POWER_ACTION_RETRY_COUNT (10) -#define MTC_RESET_ACTION_RETRY_COUNT (5) +/* Define Reset and Power Action retry controls ; delay, count and switch threshold */ +#define MTC_POWER_ACTION_QUERY_WAIT (30) +#define MTC_POWER_ACTION_RETRY_DELAY (20) +#define MTC_POWER_ACTION_RETRY_COUNT (10) +#define MTC_POWER_ACTION_SWITCH_THRESHOLD (MTC_POWER_ACTION_RETRY_COUNT/2) +#define MTC_RESET_ACTION_RETRY_DELAY (20) +#define MTC_RESET_ACTION_RETRY_COUNT (10) +#define MTC_RESET_ACTION_SWITCH_THRESHOLD (MTC_RESET_ACTION_RETRY_COUNT/2) /* number of calls to the bmc_handler while bm_access is not confirmed */ #define MTC_MAX_B2B_BM_ACCESS_FAIL_COUNT_B4_ALARM (5) diff --git a/mtce-common/src/common/nodeTimers.h b/mtce-common/src/common/nodeTimers.h index 7cafe642..8a33c713 100755 --- a/mtce-common/src/common/nodeTimers.h +++ b/mtce-common/src/common/nodeTimers.h @@ -2,10 +2,10 @@ #define __INCLUDE_NODETIMERS_HH__ /* - * Copyright (c) 2013-2016 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2013-2023 Wind River Systems, Inc. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -94,6 +94,9 @@ #define MTC_AGENT_TIMEOUT_EXTENSION (5) #define MTC_LOCK_CEPH_DELAY (90) +#define MTC_RECV_RETRY_WAIT (MTC_RETRY_WAIT) +#define MTC_RECV_WAIT (MTC_RETRY_WAIT) + /** Host must stay enabled for this long for the * failed_recovery_counter to get cleared */ #define MTC_ENABLED_TIMER (5) diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index b39c8573..bb97253d 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -244,6 +244,7 @@ nodeLinkClass::nodeLinkClass() memory_used = 0 ; hosts = 0 ; host_deleted = false ; + power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ; /* Init the base level pulse info and pointers for all interfaces */ pulse_ptr = NULL ; diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 5703911e..514b25ef 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1508,6 +1508,9 @@ public: /** Host has been deleted */ bool host_deleted ; + /** seconds to wait between power-off retries */ + int power_off_retry_wait ; + /** Host Administrative State Change public member function */ int admin_state_change ( string hostname, string newAdminState ); diff --git a/mtce/src/maintenance/mtcBmcUtil.cpp b/mtce/src/maintenance/mtcBmcUtil.cpp index 75d723f6..6727658d 100644 --- a/mtce/src/maintenance/mtcBmcUtil.cpp +++ b/mtce/src/maintenance/mtcBmcUtil.cpp @@ -100,16 +100,18 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, } case BMC_THREAD_CMD__POWER_RESET: { - /* use immediate for all retries if server supports an immediate command */ - if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty() )) + /* Use graceful for the first half of the retry countdown + * and immediate for the remaining retries. 
*/ + if ((!node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) && + ( node_ptr->power_action_retries < MTC_RESET_ACTION_SWITCH_THRESHOLD)) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); - /* unfaulted graceful if it exists */ - else if ( ! node_ptr->bmc_info.power_ctrl.reset.graceful.empty() ) + /* Unfaulted graceful if it exists */ + else if (!node_ptr->bmc_info.power_ctrl.reset.graceful.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful); - /* unfaulted immediate if graceful does not exist */ - else if ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) + /* Unfaulted immediate if graceful does not exist */ + else if (!node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); else { @@ -120,18 +122,19 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, } case BMC_THREAD_CMD__POWER_ON: { - /* use immediate for all retries if server supports an immediate command */ - if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT) && ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty() )) + /* Use graceful for the first half of the retry countdown + * and immediate for the remaining retries. */ + if ((!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) && + ( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD)) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); - /* unfaulted graceful if it exists */ - else if ( ! node_ptr->bmc_info.power_ctrl.poweron.graceful.empty() ) + /* Unfaulted graceful if it exists */ + else if (!node_ptr->bmc_info.power_ctrl.poweron.graceful.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful); - /* unfaulted immediate if graceful does not exist */ - else if ( ! 
node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) + /* Unfaulted immediate if graceful does not exist */ + else if (!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); - else { elog("%s offers no supported poweron commands", node_ptr->hostname.c_str()); @@ -141,16 +144,18 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, } case BMC_THREAD_CMD__POWER_OFF: { - /* use immediate for all retries if server supports an immediate command */ - if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() )) + /* Use graceful for the first half of the retry countdown + * and immediate for the remaining retries. */ + if ((!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() ) && + ( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD)) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); - /* unfaulted graceful if it exists */ - else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() ) + /* Unfaulted graceful if it exists */ + else if (!node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() ) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful); - /* unfaulted immediate if graceful does not exist */ - else if ( ! 
node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty()) + /* Unfaulted immediate if graceful does not exist */ + else if (!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); else { @@ -193,11 +198,23 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, { want_fit = true ; } + else if (( command == BMC_THREAD_CMD__POWER_ON ) && + ( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true )) + { + /* Just change the command to query status */ + command = BMC_THREAD_CMD__POWER_STATUS ; + } else if (( command == BMC_THREAD_CMD__POWER_OFF ) && ( daemon_want_fit ( fit, node_ptr->hostname, "power_off" ) == true )) { want_fit = true ; } + else if (( command == BMC_THREAD_CMD__POWER_OFF ) && + ( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true )) + { + /* Just change the command to query status */ + command = BMC_THREAD_CMD__POWER_STATUS ; + } else if (( command == BMC_THREAD_CMD__POWER_CYCLE ) && ( daemon_want_fit ( fit, node_ptr->hostname, "power_cycle" ) == true )) { diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 3c1358e4..9ddc6c16 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -4007,7 +4007,6 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_RESET__REQ_SEND: { - node_ptr->power_action_retries--; /* Handle loss of connectivity over retries */ if ( node_ptr->bmc_provisioned == false ) @@ -4022,18 +4021,17 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) { wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n", node_ptr->hostname.c_str(), - MTC_POWER_ACTION_RETRY_DELAY); + MTC_RESET_ACTION_RETRY_DELAY); mtcTimer_reset ( node_ptr->mtcTimer ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); + mtcTimer_start ( 
node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY ); resetStageChange ( node_ptr , MTC_RESET__QUEUE ); break ; } else { - rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); - + rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); if ( rc ) { wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc ); @@ -4044,7 +4042,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) blog ("%s Reset requested\n", node_ptr->hostname.c_str()); resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT ); } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY ); } break ; } @@ -4053,17 +4051,16 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - rc = bmc_command_recv ( node_ptr ); - if ( rc == RETRY ) - { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); - break ; - } - - if ( rc ) + rc = bmc_command_recv ( node_ptr ); + if ( rc == RETRY ) + { + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + break ; + } + else if ( rc ) { elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY ); resetStageChange ( node_ptr, MTC_RESET__QUEUE ); } else @@ -4082,7 +4079,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { node_ptr->mtcTimer.ring = false ; - if ( node_ptr->power_action_retries > 0 ) + if ( --node_ptr->power_action_retries >= 0 ) { char buffer[64] ; int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; @@ -4455,7 +4452,8 @@ int nodeLinkClass::reinstall_handler ( struct 
nodeLinkClass::node * node_ptr ) case MTC_REINSTALL__POWEROFF: { node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; - powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); + mtcTimer_reset ( node_ptr->mtcTimer ) ; + powerStageChange ( node_ptr, MTC_POWEROFF__REQ_SEND ); reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT ); break ; } @@ -4975,54 +4973,56 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->bm_ip.c_str(), rc ); } - else - { - ; - } node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; - //the fall through to MTC_POWEROFF__REQ_SEND is intentional - MTCE_FALLTHROUGH; + + /* don't allow a timeout of zero to be passed in */ + if ( power_off_retry_wait == 0 ) + power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ; + + ilog ("%s power off retry wait is %d seconds", + node_ptr->hostname.c_str(), power_off_retry_wait); + + mtcTimer_reset ( node_ptr->mtcTimer ) ; + powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); + break ; } case MTC_POWEROFF__REQ_SEND: { - - /* Handle loss of connectivity over retries */ - if ( node_ptr->bmc_provisioned == false ) + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str()); - mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV ); - powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); - break ; - } - - if ( node_ptr->bmc_accessible == false ) - { - wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds\n", - node_ptr->hostname.c_str(), - MTC_POWER_ACTION_RETRY_DELAY); - - mtcTimer_reset ( node_ptr->mtcTimer ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); - powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); - break ; - } - - else - { - rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF ); - if ( rc ) + /* Handle loss of connectivity over retries */ + if ( node_ptr->bmc_provisioned == false ) { - 
node_ptr->power_action_retries--; - wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc ); - powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str()); + mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV ); + powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); + break ; } + + if ( node_ptr->bmc_accessible == false ) + { + wlog ("%s Power Off request rejected ; BMC not accessible", + node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + break ; + } + else { - ilog ("%s Power-Off requested\n", node_ptr->hostname.c_str()); - powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT ); + rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF ); + if ( rc ) + { + wlog ("%s Power-Off request failed (%d)", node_ptr->hostname.c_str(), rc ); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + else + { + ilog ("%s Power-Off requested", node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT ); + } + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT ); } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); } break ; } @@ -5034,41 +5034,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc = bmc_command_recv ( node_ptr ); if ( rc == RETRY ) { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT ); break ; } else if ( rc ) { elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str()); - - // Need to handle retries in this case since we don't - // go through the QUEUE stage. 
- if ( --node_ptr->power_action_retries > 0 ) - { - char buffer[255] ; - int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; - snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT); - mtcInvApi_update_task ( node_ptr, buffer); - - // The power off command can fail due to connectivity - // issue or if the server is now already powered off. - // The latter could occur if the previous power off - // command failed 'in response' but actually did end up - // powering off. In that case, if we continue to just - // retry the power off when the power is already off - // then that will just fail again since most redfish - // implementations fail rather than wave-on a power off - // request while the power is already off. In this case - // its better to switch to power query power status - // again and allow that result to put this power off - // FSM into the correct state to continue/retry the - // quest for power off. - powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); - } - else - { - powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); - } + powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT ); } else { @@ -5091,6 +5064,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) plog ("%s is now offline\n", node_ptr->hostname.c_str()); powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT ); } else if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { @@ -5101,27 +5075,31 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_POWEROFF__POWERQRY: { - if ( node_ptr->bmc_thread_ctrl.done ) + /* give the power off action some time to complete */ + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - /* Query Host Power Status */ - if ( bmc_command_send ( node_ptr, 
BMC_THREAD_CMD__POWER_STATUS ) != PASS ) + if ( node_ptr->bmc_thread_ctrl.done ) { - elog ("%s '%s' send failed\n", - node_ptr->hostname.c_str(), - bmcUtil_getCmd_str( - node_ptr->bmc_thread_info.command).c_str()); - pingUtil_restart ( node_ptr->bm_ping_info ); - powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + /* Query Host Power Status */ + if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS ) + { + elog ("%s '%s' send failed", + node_ptr->hostname.c_str(), + bmcUtil_getCmd_str( + node_ptr->bmc_thread_info.command).c_str()); + pingUtil_restart ( node_ptr->bm_ping_info ); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + else + { + powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT ); + } + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT ); } else { - powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT ); + thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ; } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); - } - else - { - thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ; } break ; } @@ -5132,7 +5110,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) int rc = bmc_command_recv ( node_ptr ) ; if ( rc == RETRY ) { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT ); break ; } else if ( rc != PASS ) @@ -5183,37 +5161,36 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); } } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); } break ; } case MTC_POWEROFF__QUEUE: { - if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + if ( --node_ptr->power_action_retries >= 0 ) { - if ( --node_ptr->power_action_retries > 0 ) - { - char buffer[255] ; - int attempts = MTC_POWER_ACTION_RETRY_COUNT - 
node_ptr->power_action_retries ; - snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT); - mtcInvApi_update_task ( node_ptr, buffer); + char buffer[255] ; + int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; + snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT); + mtcInvApi_update_task ( node_ptr, buffer); - /* Check the thread error status if there is one. Skip the - * typical system call log which just floods the log file. - * The failure is reported in the update task log above. */ - if (( node_ptr->bmc_thread_info.status ) && - ( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL)) - { - wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(), - node_ptr->bmc_thread_info.status_string.c_str(), - node_ptr->bmc_thread_info.status ); - } - powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); - } - else + /* Check the thread error status if there is one. Skip the + * typical system call log which just floods the log file. + * The failure is reported in the update task log above. */ + if (( node_ptr->bmc_thread_info.status ) && + ( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL)) { - powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); + wlog ("%s ... 
%s (rc:%d)", node_ptr->hostname.c_str(), + node_ptr->bmc_thread_info.status_string.c_str(), + node_ptr->bmc_thread_info.status ); } + powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); + ilog ("%s waiting %d seconds before next power off retry", + node_ptr->hostname.c_str(), power_off_retry_wait); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, power_off_retry_wait ); + } + else + { + powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); } break ; } @@ -5294,7 +5271,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY); - node_ptr->power_action_retries-- ; mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); @@ -5304,7 +5280,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ; if ( rc ) { - node_ptr->power_action_retries-- ; powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); } else @@ -5349,18 +5324,11 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); } } - /* failure path handling */ - else if ( node_ptr->power_action_retries <= 0 ) - { - wlog ("%s current power state query failed ; " - "proceeding with power-on", - node_ptr->hostname.c_str()); - powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); - node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; - } else { - powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS ); + wlog ("%s power state query failed", + node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); } } break ; @@ -5383,7 +5351,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->bmc_accessible == false ) { - node_ptr->power_action_retries--; wlog ("%s Power-On will fail ; 
not accessible to BMC ; retry in %d seconds \n", node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY); @@ -5397,7 +5364,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON ); if ( rc ) { - node_ptr->power_action_retries--; wlog ("%s Power-On request failed (%d)\n", node_ptr->hostname.c_str(), rc ); @@ -5429,7 +5395,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) if ( rc ) { - node_ptr->power_action_retries--; elog ("%s Power-On command failed\n", node_ptr->hostname.c_str()); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); @@ -5452,7 +5417,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { node_ptr->mtcTimer.ring = false ; - if ( node_ptr->power_action_retries > 0 ) + if ( --node_ptr->power_action_retries >= 0 ) { char buffer[64] ; int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; diff --git a/mtce/src/maintenance/mtcThreads.cpp b/mtce/src/maintenance/mtcThreads.cpp index 97db3af5..a93a7f88 100644 --- a/mtce/src/maintenance/mtcThreads.cpp +++ b/mtce/src/maintenance/mtcThreads.cpp @@ -1,9 +1,9 @@ /* - * Copyright (c) 2016-2017 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2016-2023 Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -316,7 +316,7 @@ void * mtcThread_bmc ( void * arg ) { string chopped_request = bmcUtil_chop_system_req(request); daemon_remove_file ( datafile.data() ) ; - blog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str()); + ilog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str()); /****** Make the system call ******/ rc =