diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 3dabad0d..b6493a36 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -187,6 +187,7 @@ typedef enum #define DEFAULT_GOENABLE_TIMEOUT (300) #define DEFAULT_DOR_MODE_TIMEOUT (20) #define DEFAULT_DOR_MODE_AIO_TIMEOUT (600) +#define DEFAULT_POWER_OFF_RETRY_WAIT (30) /** TODO: Convert names to omit JSON part */ #define MTC_JSON_INV_LABEL "ihosts" @@ -323,9 +324,14 @@ typedef enum #define COMMAND_RETRY_DELAY (8) /* from sshUtil.h */ #define COMMAND_DELAY (2) /* from sshUtil.h */ -#define MTC_POWER_ACTION_RETRY_DELAY (20) -#define MTC_POWER_ACTION_RETRY_COUNT (10) -#define MTC_RESET_ACTION_RETRY_COUNT (5) +/* Define Reset and Power Action retry controls ; delay, count and switch threshold */ +#define MTC_POWER_ACTION_QUERY_WAIT (30) +#define MTC_POWER_ACTION_RETRY_DELAY (20) +#define MTC_POWER_ACTION_RETRY_COUNT (10) +#define MTC_POWER_ACTION_SWITCH_THRESHOLD (MTC_POWER_ACTION_RETRY_COUNT/2) +#define MTC_RESET_ACTION_RETRY_DELAY (20) +#define MTC_RESET_ACTION_RETRY_COUNT (10) +#define MTC_RESET_ACTION_SWITCH_THRESHOLD (MTC_RESET_ACTION_RETRY_COUNT/2) /* number of calls to the bmc_handler while bm_access is not confirmed */ #define MTC_MAX_B2B_BM_ACCESS_FAIL_COUNT_B4_ALARM (5) diff --git a/mtce-common/src/common/nodeTimers.h b/mtce-common/src/common/nodeTimers.h index 7cafe642..8a33c713 100755 --- a/mtce-common/src/common/nodeTimers.h +++ b/mtce-common/src/common/nodeTimers.h @@ -2,10 +2,10 @@ #define __INCLUDE_NODETIMERS_HH__ /* - * Copyright (c) 2013-2016 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2013-2023 Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -94,6 +94,9 @@ #define MTC_AGENT_TIMEOUT_EXTENSION (5) #define MTC_LOCK_CEPH_DELAY (90) +#define MTC_RECV_RETRY_WAIT (MTC_RETRY_WAIT) +#define MTC_RECV_WAIT (MTC_RETRY_WAIT) + /** Host must stay enabled for this long for the * failed_recovery_counter to get cleared */ #define MTC_ENABLED_TIMER (5) diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index b39c8573..bb97253d 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -244,6 +244,7 @@ nodeLinkClass::nodeLinkClass() memory_used = 0 ; hosts = 0 ; host_deleted = false ; + power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ; /* Init the base level pulse info and pointers for all interfaces */ pulse_ptr = NULL ; diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 5703911e..514b25ef 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1508,6 +1508,9 @@ public: /** Host has been deleted */ bool host_deleted ; + /** seconds to wait between power-off retries */ + int power_off_retry_wait ; + /** Host Administrative State Change public member function */ int admin_state_change ( string hostname, string newAdminState ); diff --git a/mtce/src/maintenance/mtcBmcUtil.cpp b/mtce/src/maintenance/mtcBmcUtil.cpp index 75d723f6..6727658d 100644 --- a/mtce/src/maintenance/mtcBmcUtil.cpp +++ b/mtce/src/maintenance/mtcBmcUtil.cpp @@ -100,16 +100,18 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, } case BMC_THREAD_CMD__POWER_RESET: { - /* use immediate for all retries if server supports an immediate command */ - if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty() )) + /* Use graceful for the first half of the retry countdown + * and immediate for the remaining retries. */ + if ((!node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) && + ( node_ptr->power_action_retries < MTC_RESET_ACTION_SWITCH_THRESHOLD)) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); - /* unfaulted graceful if it exists */ - else if ( ! node_ptr->bmc_info.power_ctrl.reset.graceful.empty() ) + /* Unfaulted graceful if it exists */ + else if (!node_ptr->bmc_info.power_ctrl.reset.graceful.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful); - /* unfaulted immediate if graceful does not exist */ - else if ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) + /* Unfaulted immediate if graceful does not exist */ + else if (!node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); else { @@ -120,18 +122,19 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, } case BMC_THREAD_CMD__POWER_ON: { - /* use immediate for all retries if server supports an immediate command */ - if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT) && ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty() )) + /* Use graceful for the first half of the retry countdown + * and immediate for the remaining retries. */ + if ((!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) && + ( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD)) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); - /* unfaulted graceful if it exists */ - else if ( ! node_ptr->bmc_info.power_ctrl.poweron.graceful.empty() ) + /* Unfaulted graceful if it exists */ + else if (!node_ptr->bmc_info.power_ctrl.poweron.graceful.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful); - /* unfaulted immediate if graceful does not exist */ - else if ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) + /* Unfaulted immediate if graceful does not exist */ + else if (!node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); - else { elog("%s offers no supported poweron commands", node_ptr->hostname.c_str()); @@ -141,16 +144,18 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, } case BMC_THREAD_CMD__POWER_OFF: { - /* use immediate for all retries if server supports an immediate command */ - if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() )) + /* Use graceful for the first half of the retry countdown + * and immediate for the remaining retries. */ + if ((!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() ) && + ( node_ptr->power_action_retries < MTC_POWER_ACTION_SWITCH_THRESHOLD)) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); - /* unfaulted graceful if it exists */ - else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() ) + /* Unfaulted graceful if it exists */ + else if (!node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() ) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful); - /* unfaulted immediate if graceful does not exist */ - else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty()) + /* Unfaulted immediate if graceful does not exist */ + else if (!node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty()) node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); else { @@ -193,11 +198,23 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, { want_fit = true ; } + else if (( command == BMC_THREAD_CMD__POWER_ON ) && + ( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true )) + { + /* Just change the command to query status */ + command = BMC_THREAD_CMD__POWER_STATUS ; + } else if (( command == BMC_THREAD_CMD__POWER_OFF ) && ( daemon_want_fit ( fit, node_ptr->hostname, "power_off" ) == true )) { want_fit = true ; } + else if (( command == BMC_THREAD_CMD__POWER_OFF ) && + ( daemon_want_fit ( fit, node_ptr->hostname, "power_none" ) == true )) + { + /* Just change the command to query status */ + command = BMC_THREAD_CMD__POWER_STATUS ; + } else if (( command == BMC_THREAD_CMD__POWER_CYCLE ) && ( daemon_want_fit ( fit, node_ptr->hostname, "power_cycle" ) == true )) { diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 3c1358e4..9ddc6c16 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -4007,7 +4007,6 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_RESET__REQ_SEND: { - node_ptr->power_action_retries--; /* Handle loss of connectivity over retries */ if ( node_ptr->bmc_provisioned == false ) @@ -4022,18 +4021,17 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) { wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n", node_ptr->hostname.c_str(), - MTC_POWER_ACTION_RETRY_DELAY); + MTC_RESET_ACTION_RETRY_DELAY); mtcTimer_reset ( node_ptr->mtcTimer ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY ); resetStageChange ( node_ptr , MTC_RESET__QUEUE ); break ; } else { - rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); - + rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); if ( rc ) { wlog ("%s Reset request failed (%d)\n", node_ptr->hostname.c_str(), rc ); @@ -4044,7 +4042,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) blog ("%s Reset requested\n", node_ptr->hostname.c_str()); resetStageChange ( node_ptr , MTC_RESET__RESP_WAIT ); } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY ); } break ; } @@ -4053,17 +4051,16 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - rc = bmc_command_recv ( node_ptr ); - if ( rc == RETRY ) - { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); - break ; - } - - if ( rc ) + rc = bmc_command_recv ( node_ptr ); + if ( rc == RETRY ) + { + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + break ; + } + else if ( rc ) { elog ("%s Reset command failed (rc:%d)\n", node_ptr->hostname.c_str(), rc ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_ACTION_RETRY_DELAY ); resetStageChange ( node_ptr, MTC_RESET__QUEUE ); } else @@ -4082,7 +4079,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { node_ptr->mtcTimer.ring = false ; - if ( node_ptr->power_action_retries > 0 ) + if ( --node_ptr->power_action_retries >= 0 ) { char buffer[64] ; int attempts = MTC_RESET_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; @@ -4455,7 +4452,8 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) case MTC_REINSTALL__POWEROFF: { node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; - powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); + mtcTimer_reset ( node_ptr->mtcTimer ) ; + powerStageChange ( node_ptr, MTC_POWEROFF__REQ_SEND ); reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT ); break ; } @@ -4975,54 +4973,56 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->bm_ip.c_str(), rc ); } - else - { - ; - } node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; - //the fall through to MTC_POWEROFF__REQ_SEND is intentional - MTCE_FALLTHROUGH; + + /* don't allow a timeout of zero to be passed in */ + if ( power_off_retry_wait == 0 ) + power_off_retry_wait = DEFAULT_POWER_OFF_RETRY_WAIT ; + + ilog ("%s power off retry wait is %d seconds", + node_ptr->hostname.c_str(), power_off_retry_wait); + + mtcTimer_reset ( node_ptr->mtcTimer ) ; + powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); + break ; } case MTC_POWEROFF__REQ_SEND: { - - /* Handle loss of connectivity over retries */ - if ( node_ptr->bmc_provisioned == false ) + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str()); - mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV ); - powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); - break ; - } - - if ( node_ptr->bmc_accessible == false ) - { - wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds\n", - node_ptr->hostname.c_str(), - MTC_POWER_ACTION_RETRY_DELAY); - - mtcTimer_reset ( node_ptr->mtcTimer ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); - powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); - break ; - } - - else - { - rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF ); - if ( rc ) + /* Handle loss of connectivity over retries */ + if ( node_ptr->bmc_provisioned == false ) { - node_ptr->power_action_retries--; - wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc ); - powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + elog ("%s BMC not provisioned\n", node_ptr->hostname.c_str()); + mtcInvApi_force_task ( node_ptr, MTC_TASK_BMC_NOT_PROV ); + powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); + break ; } + + if ( node_ptr->bmc_accessible == false ) + { + wlog ("%s Power Off request rejected ; BMC not accessible", + node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + break ; + } + else { - ilog ("%s Power-Off requested\n", node_ptr->hostname.c_str()); - powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT ); + rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF ); + if ( rc ) + { + wlog ("%s Power-Off request failed (%d)", node_ptr->hostname.c_str(), rc ); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + else + { + ilog ("%s Power-Off requested", node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT ); + } + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT ); } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); } break ; } @@ -5034,41 +5034,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc = bmc_command_recv ( node_ptr ); if ( rc == RETRY ) { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT ); break ; } else if ( rc ) { elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str()); - - // Need to handle retries in this case since we don't - // go through the QUEUE stage. - if ( --node_ptr->power_action_retries > 0 ) - { - char buffer[255] ; - int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; - snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT); - mtcInvApi_update_task ( node_ptr, buffer); - - // The power off command can fail due to connectivity - // issue or if the server is now already powered off. - // The latter could occur if the previous power off - // command failed 'in response' but actually did end up - // powering off. In that case, if we continue to just - // retry the power off when the power is already off - // then that will just fail again since most redfish - // implementations fail rather than wave-on a power off - // request while the power is already off. In this case - // its better to switch to power query power status - // again and allow that result to put this power off - // FSM into the correct state to continue/retry the - // quest for power off. - powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); - } - else - { - powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); - } + powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT ); } else { @@ -5091,6 +5064,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) plog ("%s is now offline\n", node_ptr->hostname.c_str()); powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_QUERY_WAIT ); } else if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { @@ -5101,27 +5075,31 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_POWEROFF__POWERQRY: { - if ( node_ptr->bmc_thread_ctrl.done ) + /* give the power off action some time to complete */ + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - /* Query Host Power Status */ - if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS ) + if ( node_ptr->bmc_thread_ctrl.done ) { - elog ("%s '%s' send failed\n", - node_ptr->hostname.c_str(), - bmcUtil_getCmd_str( - node_ptr->bmc_thread_info.command).c_str()); - pingUtil_restart ( node_ptr->bm_ping_info ); - powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + /* Query Host Power Status */ + if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS ) + { + elog ("%s '%s' send failed", + node_ptr->hostname.c_str(), + bmcUtil_getCmd_str( + node_ptr->bmc_thread_info.command).c_str()); + pingUtil_restart ( node_ptr->bm_ping_info ); + powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); + } + else + { + powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT ); + } + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_WAIT ); } else { - powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT ); + thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ; } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); - } - else - { - thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ; } break ; } @@ -5132,7 +5110,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) int rc = bmc_command_recv ( node_ptr ) ; if ( rc == RETRY ) { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RECV_RETRY_WAIT ); break ; } else if ( rc != PASS ) @@ -5183,37 +5161,36 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); } } - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); } break ; } case MTC_POWEROFF__QUEUE: { - if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + if ( --node_ptr->power_action_retries >= 0 ) { - if ( --node_ptr->power_action_retries > 0 ) - { - char buffer[255] ; - int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; - snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT); - mtcInvApi_update_task ( node_ptr, buffer); + char buffer[255] ; + int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; + snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT); + mtcInvApi_update_task ( node_ptr, buffer); - /* Check the thread error status if there is one. Skip the - * typical system call log which just floods the log file. - * The failure is reported in the update task log above. */ - if (( node_ptr->bmc_thread_info.status ) && - ( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL)) - { - wlog ("%s ... %s (rc:%d)\n", node_ptr->hostname.c_str(), - node_ptr->bmc_thread_info.status_string.c_str(), - node_ptr->bmc_thread_info.status ); - } - powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); - } - else + /* Check the thread error status if there is one. Skip the + * typical system call log which just floods the log file. + * The failure is reported in the update task log above. */ + if (( node_ptr->bmc_thread_info.status ) && + ( node_ptr->bmc_thread_info.status != FAIL_SYSTEM_CALL)) { - powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); + wlog ("%s ... %s (rc:%d)", node_ptr->hostname.c_str(), + node_ptr->bmc_thread_info.status_string.c_str(), + node_ptr->bmc_thread_info.status ); } + powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); + ilog ("%s waiting %d seconds before next power off retry", + node_ptr->hostname.c_str(), power_off_retry_wait); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, power_off_retry_wait ); + } + else + { + powerStageChange ( node_ptr , MTC_POWEROFF__FAIL ); } break ; } @@ -5294,7 +5271,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY); - node_ptr->power_action_retries-- ; mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); @@ -5304,7 +5280,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) ; if ( rc ) { - node_ptr->power_action_retries-- ; powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); } else @@ -5349,18 +5324,11 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); } } - /* failure path handling */ - else if ( node_ptr->power_action_retries <= 0 ) - { - wlog ("%s current power state query failed ; " - "proceeding with power-on", - node_ptr->hostname.c_str()); - powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); - node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; - } else { - powerStageChange ( node_ptr , MTC_POWERON__POWER_STATUS ); + wlog ("%s power state query failed", + node_ptr->hostname.c_str()); + powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); } } break ; @@ -5383,7 +5351,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->bmc_accessible == false ) { - node_ptr->power_action_retries--; wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n", node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY); @@ -5397,7 +5364,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON ); if ( rc ) { - node_ptr->power_action_retries--; wlog ("%s Power-On request failed (%d)\n", node_ptr->hostname.c_str(), rc ); @@ -5429,7 +5395,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) if ( rc ) { - node_ptr->power_action_retries--; elog ("%s Power-On command failed\n", node_ptr->hostname.c_str()); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); @@ -5452,7 +5417,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { node_ptr->mtcTimer.ring = false ; - if ( node_ptr->power_action_retries > 0 ) + if ( --node_ptr->power_action_retries >= 0 ) { char buffer[64] ; int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ; diff --git a/mtce/src/maintenance/mtcThreads.cpp b/mtce/src/maintenance/mtcThreads.cpp index 97db3af5..a93a7f88 100644 --- a/mtce/src/maintenance/mtcThreads.cpp +++ b/mtce/src/maintenance/mtcThreads.cpp @@ -1,9 +1,9 @@ /* - * Copyright (c) 2016-2017 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2016-2023 Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -316,7 +316,7 @@ void * mtcThread_bmc ( void * arg ) { string chopped_request = bmcUtil_chop_system_req(request); daemon_remove_file ( datafile.data() ) ; - blog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str()); + ilog_t("%s %s", info_ptr->hostname.c_str(), chopped_request.c_str()); /****** Make the system call ******/ rc =