From 11960566125e395e2556af1719778d737d4b86e5 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 6 Nov 2020 09:21:22 -0500 Subject: [PATCH] Disable Redfish BMC audit and improve reinstall failure handling The Mtce Reinstall Handler can collide with the BMC Redfish audit resulting in reinstall failure. The BMC handler's 2 minute connection audit can collide with other BMC commands. The reinstall handler, with 4 bmc command operations, is particularly susceptible. Two additional bmc communication improvements are implemented: 1. Add 'retry' handling to all BMC requests in the Maintenance Reinstall Handler FSM to handle transient command failures. Note: There are already retries for all but the power status query and the netboot requests in that handler and retries in other administrative commands that involve bmc requests. 2. Switch BMC power control command management from 'static' to 'learned' lists. Some BMCs don't support both graceful and immediate power commands; Graceful Restart and Force Restart. To remove the possibility of using an unsupported BMC command, this update switches from static to learned power command lists with a log produced if a server is missing command support. Power commands escalate from graceful to immediate in the presence of retries.
Test Cases: PASS: Verify bmc handler redfish audit is disabled PASS: Verify reinstall soak using redfish PASS: Verify reinstall netboot and power status retry handling PASS: Verify all power control commands using redfish PASS: Verify graceful operations are used if available PASS: Verify immediate operations are used for retries Regression: PASS: Verify bmc ping audit success and failure handling PASS: Verify Reset Handling soak (redfish and ipmi) PASS: Verify Power-Off/On Handling soak (redfish and ipmi) PASS: Verify Reinstall Handling soak (redfish and ipmi) PASS: Verify Standard System Install (redfish and ipmi) PASS: Verify AIO DX System Install (redfish and ipmi) PASS: Verify this update as a patch Change-Id: Idb484512ccb1b16e2d0ea9aff4ab7965347b1322 Closes-Bug: 1880578 Signed-off-by: Eric MacDonald --- mtce-common/src/common/bmcUtil.cpp | 10 +- mtce-common/src/common/bmcUtil.h | 19 +- mtce-common/src/common/logMacros.h | 1 + mtce-common/src/common/nodeBase.cpp | 2 - mtce-common/src/common/nodeBase.h | 2 - mtce-common/src/common/redfishUtil.cpp | 297 ++++++++++------------- mtce-common/src/daemon/daemon_config.cpp | 13 + mtce/src/maintenance/mtcBmcUtil.cpp | 91 +++++-- mtce/src/maintenance/mtcNodeHdlrs.cpp | 255 +++++++++++-------- 9 files changed, 403 insertions(+), 287 deletions(-) diff --git a/mtce-common/src/common/bmcUtil.cpp b/mtce-common/src/common/bmcUtil.cpp index ce31d8b2..43bed414 100644 --- a/mtce-common/src/common/bmcUtil.cpp +++ b/mtce-common/src/common/bmcUtil.cpp @@ -197,10 +197,12 @@ void bmcUtil_info_init ( bmc_info_type & bmc_info ) bmc_info.power_on = false ; bmc_info.restart_cause.clear() ; - /* clear the supported actions lists */ - bmc_info.reset_action_list.clear(); - bmc_info.power_on_action_list.clear(); - bmc_info.power_off_action_list.clear(); + bmc_info.power_ctrl.reset.graceful = "" ; + bmc_info.power_ctrl.reset.immediate = "" ; + bmc_info.power_ctrl.poweron.graceful = "" ; + bmc_info.power_ctrl.poweron.immediate = "" ; + 
bmc_info.power_ctrl.poweroff.graceful = "" ; + bmc_info.power_ctrl.poweroff.immediate = "" ; } /************************************************************************* diff --git a/mtce-common/src/common/bmcUtil.h b/mtce-common/src/common/bmcUtil.h index d120284d..0208b88c 100644 --- a/mtce-common/src/common/bmcUtil.h +++ b/mtce-common/src/common/bmcUtil.h @@ -29,6 +29,19 @@ using namespace std; #define BMC_PROTOCOL__IPMITOOL_STR ((const char *)("ipmitool")) #define BMC_PROTOCOL__REDFISHTOOL_STR ((const char *)("redfishtool")) +/* learned graceful and immediate power control command strings */ +typedef struct +{ + string graceful ; + string immediate ; +} bmc_power_ctrl_type ; + +typedef struct +{ + bmc_power_ctrl_type reset ; + bmc_power_ctrl_type poweron ; + bmc_power_ctrl_type poweroff ; +} bmc_power_ctrl_cmds_type ; /* important BMC query info to log and track */ typedef struct @@ -57,10 +70,8 @@ typedef struct std::string mn ; std::string sn ; - /* actions */ - std::list reset_action_list ; - std::list power_on_action_list ; - std::list power_off_action_list ; + /* Used to store bmc power commands and severity levels */ + bmc_power_ctrl_cmds_type power_ctrl ; /* state info */ std::string restart_cause ; diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index 106a2c19..5574ecd3 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -162,6 +162,7 @@ typedef struct int failsafe_shutdown_delay ; /**< seconds before failsafe reboot */ int hostwd_update_period ; /**< expect hostwd to be updated */ int autorecovery_threshold ; /**< AIO stop autorecovery threshold */ + int bmc_audit_period ; /**< bmc audit period cadence */ /**< Auto Recovery Thresholds */ int ar_config_threshold ; /**< Configuration Failure Threshold */ diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index 1d067210..7214a67d 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ 
b/mtce-common/src/common/nodeBase.cpp @@ -471,8 +471,6 @@ void mtc_stages_init ( void ) reinstallStages_str [MTC_REINSTALL__NETBOOT_WAIT ] = "Reinstall-Netboot-Wait"; reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn"; reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait"; - reinstallStages_str [MTC_REINSTALL__RESET ] = "Reinstall-Reset"; - reinstallStages_str [MTC_REINSTALL__RESET_WAIT ] = "Reinstall-Reset-Wait"; reinstallStages_str [MTC_REINSTALL__WIPEDISK ] = "Reinstall-Wipedisk"; reinstallStages_str [MTC_REINSTALL__WIPEDISK_WAIT ] = "Reinstall-Wipedisk-Wait"; reinstallStages_str [MTC_REINSTALL__OFFLINE_WAIT ] = "Reinstall-Offline-Wait"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 0f069c51..8994f570 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -1069,8 +1069,6 @@ typedef enum MTC_REINSTALL__NETBOOT_WAIT, MTC_REINSTALL__POWERON, MTC_REINSTALL__POWERON_WAIT, - MTC_REINSTALL__RESET, - MTC_REINSTALL__RESET_WAIT, MTC_REINSTALL__WIPEDISK, MTC_REINSTALL__WIPEDISK_WAIT, MTC_REINSTALL__OFFLINE_WAIT, diff --git a/mtce-common/src/common/redfishUtil.cpp b/mtce-common/src/common/redfishUtil.cpp index 15055f37..e3b51599 100644 --- a/mtce-common/src/common/redfishUtil.cpp +++ b/mtce-common/src/common/redfishUtil.cpp @@ -21,11 +21,40 @@ using namespace std; #include "jsonUtil.h" /* for ... */ #include "redfishUtil.h" /* for ... this module header */ -/* static prioritized list of redfish actions. - * Higher priority action first. 
*/ -static std::list reset_actions ; -static std::list poweron_actions ; -static std::list poweroff_actions ; +/************************************************************************* + * + * Name: : POWER_CTRL_ACTIONS__RESET + * POWER_CTRL_ACTIONS__POWERON + * POWER_CTRL_ACTIONS__POWEROFF + * + * Description: Power control actions/severity levels + * + *************************************************************************/ + +typedef enum +{ + POWER_CTRL_ACTION__GRACEFUL, + POWER_CTRL_ACTION__IMMEDIATE, + POWER_CTRL_ACTION__MAX +} power_ctrl_severity_enum ; + +static std::string _reset_actions[POWER_CTRL_ACTION__MAX] = +{ + REDFISHTOOL_RESET__GRACEFUL_RESTART, + REDFISHTOOL_RESET__FORCE_RESTART +}; + +static std::string _poweron_actions[POWER_CTRL_ACTION__MAX] = +{ + REDFISHTOOL_POWER_ON__ON, + REDFISHTOOL_POWER_ON__FORCE_ON +}; + +static std::string _poweroff_actions[POWER_CTRL_ACTION__MAX] = +{ + REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN, + REDFISHTOOL_POWER_OFF__FORCE_OFF +}; /************************************************************************* * @@ -43,16 +72,6 @@ int redfishUtil_init ( void ) { daemon_make_dir(REDFISHTOOL_OUTPUT_DIR) ; - /* Stock reset actions in order of priority */ - reset_actions.push_front(REDFISHTOOL_RESET__GRACEFUL_RESTART); /* P1 */ - reset_actions.push_back (REDFISHTOOL_RESET__FORCE_RESTART); /* P2 */ - - poweron_actions.push_front(REDFISHTOOL_POWER_ON__ON); - poweron_actions.push_back (REDFISHTOOL_POWER_ON__FORCE_ON); - - poweroff_actions.push_front(REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN); - poweroff_actions.push_back (REDFISHTOOL_POWER_OFF__FORCE_OFF); - return (PASS); } @@ -62,12 +81,13 @@ int redfishUtil_init ( void ) * * Purpose : Load supported host actions. * - * Description: Filter stock actions through host actions. + * Description: Set host supported graceful and immediate power control + * commands into the node's power control action strings. 
* - * Parameters : hostname - this host amer - * host_action_list - what actions this host reports support for. - * - * Updates: bmc_info - reference that includes host action lists + * Parameters : hostname - name of this host (used in logs) + * host_action_list - what actions this host reports support for + * Updates: bmc_info - updated supported graceful and immediate + * power control commands. * *************************************************************************/ @@ -75,10 +95,6 @@ void _load_action_lists ( string & hostname, bmc_info_type & bmc_info, std::list & host_action_list) { - bmc_info.reset_action_list.clear(); - bmc_info.power_on_action_list.clear(); - bmc_info.power_off_action_list.clear(); - /* Walk through the host action list looking for and updating * this host's bmc_info supported actions lists */ std::list::iterator _host_action_list_ptr ; @@ -86,159 +102,114 @@ void _load_action_lists ( string & hostname, _host_action_list_ptr != host_action_list.end() ; _host_action_list_ptr++ ) { - std::list::iterator _action_list_ptr ; - for ( _action_list_ptr = poweroff_actions.begin(); - _action_list_ptr != poweroff_actions.end() ; - _action_list_ptr++ ) + /* Warning log for hosts that don't provide one of graceful or + * immediate action commands. + * + * Error log for hosts that don't provide either graceful or + * immediate action commands. 
+ */ + if ( (*_host_action_list_ptr) == REDFISHTOOL_RESET__GRACEFUL_RESTART ) + bmc_info.power_ctrl.reset.graceful = *_host_action_list_ptr ; + else if ( (*_host_action_list_ptr) == REDFISHTOOL_RESET__FORCE_RESTART ) + bmc_info.power_ctrl.reset.immediate = *_host_action_list_ptr ; + + else if ( (*_host_action_list_ptr) == REDFISHTOOL_POWER_ON__ON ) + bmc_info.power_ctrl.poweron.graceful = *_host_action_list_ptr ; + else if ( (*_host_action_list_ptr) == REDFISHTOOL_POWER_ON__FORCE_ON ) + bmc_info.power_ctrl.poweron.immediate = *_host_action_list_ptr ; + + else if ( (*_host_action_list_ptr) == REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN ) + bmc_info.power_ctrl.poweroff.graceful = *_host_action_list_ptr ; + else if ( (*_host_action_list_ptr) == REDFISHTOOL_POWER_OFF__FORCE_OFF ) + bmc_info.power_ctrl.poweroff.immediate = *_host_action_list_ptr ; + } + + if (( bmc_info.power_ctrl.reset.graceful.empty() ) || + ( bmc_info.power_ctrl.reset.immediate.empty() )) + { + if (( bmc_info.power_ctrl.reset.graceful.empty() ) && + ( bmc_info.power_ctrl.reset.immediate.empty() )) { - if ( (*_host_action_list_ptr) == (*_action_list_ptr) ) - { - bmc_info.power_off_action_list.push_back(*_action_list_ptr) ; - break ; - } + elog("%s bmc offers no 'Reset' commands (%s:%s)", + hostname.c_str(), + REDFISHTOOL_RESET__GRACEFUL_RESTART, + REDFISHTOOL_RESET__FORCE_RESTART); } - for ( _action_list_ptr = poweron_actions.begin(); - _action_list_ptr != poweron_actions.end() ; - _action_list_ptr++ ) + else if ( bmc_info.power_ctrl.reset.graceful.empty() ) { - if ( (*_host_action_list_ptr) == (*_action_list_ptr) ) - { - bmc_info.power_on_action_list.push_back(*_action_list_ptr) ; - break ; - } + wlog("%s bmc offers no 'Graceful Reset' command (%s)", + hostname.c_str(), + REDFISHTOOL_RESET__GRACEFUL_RESTART); } - for ( _action_list_ptr = reset_actions.begin(); - _action_list_ptr != reset_actions.end() ; - _action_list_ptr++ ) + else { - if ( (*_host_action_list_ptr) == (*_action_list_ptr) ) - { - 
bmc_info.reset_action_list.push_back(*_action_list_ptr) ; - break ; - } + wlog("%s bmc offers no 'Immediate Reset' command (%s)", + hostname.c_str(), + REDFISHTOOL_RESET__FORCE_RESTART); } } - string reset_tmp = "" ; - string poweron_tmp = "" ; - string poweroff_tmp = "" ; - std::list::iterator _ptr ; - for ( _ptr = bmc_info.reset_action_list.begin(); - _ptr != bmc_info.reset_action_list.end() ; - _ptr++ ) + + if (( bmc_info.power_ctrl.poweron.graceful.empty() ) || + ( bmc_info.power_ctrl.poweron.immediate.empty() )) { - if ( !reset_tmp.empty() ) - reset_tmp.append(","); - reset_tmp.append(*_ptr); + if (( bmc_info.power_ctrl.poweron.graceful.empty() ) && + ( bmc_info.power_ctrl.poweron.immediate.empty() )) + { + elog("%s bmc offers no 'Power-On' commands (%s:%s)", + hostname.c_str(), + REDFISHTOOL_POWER_ON__ON, + REDFISHTOOL_POWER_ON__FORCE_ON); + } + else if ( bmc_info.power_ctrl.poweron.graceful.empty() ) + { + wlog("%s bmc offers no 'Graceful Power-On' command (%s)", + hostname.c_str(), + REDFISHTOOL_POWER_ON__ON); + } + else + { + wlog("%s bmc offers no 'Immediate Power-On' command (%s)", + hostname.c_str(), + REDFISHTOOL_POWER_ON__FORCE_ON); + } } - for ( _ptr = bmc_info.power_on_action_list.begin(); - _ptr != bmc_info.power_on_action_list.end() ; - _ptr++ ) + + if (( bmc_info.power_ctrl.poweroff.graceful.empty() ) || + ( bmc_info.power_ctrl.poweroff.immediate.empty() )) { - if ( !poweron_tmp.empty() ) - poweron_tmp.append(","); - poweron_tmp.append(*_ptr); + if (( bmc_info.power_ctrl.poweroff.graceful.empty() ) && + ( bmc_info.power_ctrl.poweroff.immediate.empty() )) + { + elog("%s bmc offers no 'Power-Off' commands (%s:%s)", + hostname.c_str(), + REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN, + REDFISHTOOL_POWER_OFF__FORCE_OFF); + + } + else if ( bmc_info.power_ctrl.poweroff.graceful.empty() ) + { + wlog("%s bmc offers no 'Graceful Power-Off' command (%s)", + hostname.c_str(), + REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN); + } + else + { + wlog("%s bmc offers no 
'Immediate Power-Off' command %s)", + hostname.c_str(), + REDFISHTOOL_POWER_OFF__FORCE_OFF); + } } - for ( _ptr = bmc_info.power_off_action_list.begin(); - _ptr != bmc_info.power_off_action_list.end() ; - _ptr++ ) - { - if ( !poweroff_tmp.empty() ) - poweroff_tmp.append(","); - poweroff_tmp.append(*_ptr); - } - ilog ("%s bmc actions ; reset:%s power-on:%s power-off:%s", + + ilog ("%s bmc power ctrl actions ; reset:%s:%s power-on:%s:%s power-off:%s:%s", hostname.c_str(), - reset_tmp.empty() ? "none" : reset_tmp.c_str(), - poweron_tmp.empty() ? "none" : poweron_tmp.c_str(), - poweroff_tmp.empty() ? "none" : poweroff_tmp.c_str()); + bmc_info.power_ctrl.reset.graceful.empty() ? "none" : bmc_info.power_ctrl.reset.graceful.c_str(), + bmc_info.power_ctrl.reset.immediate.empty() ? "none" : bmc_info.power_ctrl.reset.immediate.c_str(), + bmc_info.power_ctrl.poweron.graceful.empty() ? "none" : bmc_info.power_ctrl.poweron.graceful.c_str(), + bmc_info.power_ctrl.poweron.immediate.empty() ? "none" : bmc_info.power_ctrl.poweron.immediate.c_str(), + bmc_info.power_ctrl.poweroff.graceful.empty() ? "none" : bmc_info.power_ctrl.poweroff.graceful.c_str(), + bmc_info.power_ctrl.poweroff.immediate.empty() ? 
"none" : bmc_info.power_ctrl.poweroff.immediate.c_str()); } -#ifdef SAVE_IMP -int _get_action_list ( string hostname, - redfish_action_enum action, - std::list host_action_list, - std::list & supp_action_list) -{ - int status = PASS ; - std::list * action_ptr = NULL ; - string action_str = "" ; - supp_action_list.clear(); - switch ( action ) - { - case REDFISH_ACTION__RESET: - { - action_ptr = &reset_actions ; - action_str = "reset" ; - break ; - } - case REDFISH_ACTION__POWER_ON: - { - action_ptr = &poweron_actions ; - action_str = "power-on" ; - break ; - } - case REDFISH_ACTION__POWER_OFF: - { - action_ptr = &poweroff_actions ; - action_str = "power-off" ; - break ; - } - default: - { - status = FAIL_BAD_CASE ; - } - } - - /* Filter */ - if (( status == PASS ) && (action_ptr)) - { - /* get the best supported action command - * for the specified action group. */ - std::list::iterator _action_list_ptr ; - std::list::iterator _host_action_list_ptr ; - for ( _action_list_ptr = action_ptr->begin(); - _action_list_ptr != action_ptr->end() ; - _action_list_ptr++ ) - { - for ( _host_action_list_ptr = host_action_list.begin(); - _host_action_list_ptr != host_action_list.end() ; - _host_action_list_ptr++ ) - { - if ( (*_host_action_list_ptr) == (*_action_list_ptr) ) - { - supp_action_list.push_back(*_action_list_ptr) ; - break ; - } - } - } - } - if ( supp_action_list.empty() ) - { - elog ("%s has no %s actions", hostname.c_str(), action_str.c_str()); - if ( status == PASS ) - status = FAIL_STRING_EMPTY ; - } - else - { - string tmp = "" ; - std::list::iterator _ptr ; - for ( _ptr = supp_action_list.begin(); - _ptr != supp_action_list.end() ; - _ptr++ ) - { - if ( !tmp.empty() ) - tmp.append(", "); - tmp.append(*_ptr); - } - ilog ("%s redfish %s actions: %s", - hostname.c_str(), - action_str.c_str(), - tmp.c_str()); - } - return (status); -} -#endif - /************************************************************************* * * Name : redfishUtil_is_supported diff --git 
a/mtce-common/src/daemon/daemon_config.cpp b/mtce-common/src/daemon/daemon_config.cpp index 31272f9a..713d3b65 100644 --- a/mtce-common/src/daemon/daemon_config.cpp +++ b/mtce-common/src/daemon/daemon_config.cpp @@ -50,6 +50,7 @@ void daemon_config_default ( daemon_config_type* config_ptr ) config_ptr->barbican_api_host = strdup("none"); config_ptr->hostwd_kdump_on_stall = 0 ; + config_ptr->bmc_audit_period = 0 ; config_ptr->debug_all = 0 ; config_ptr->debug_json = 0 ; @@ -194,6 +195,18 @@ int timeout_config_handler ( void * user, config_ptr->dor_recovery_timeout_ext = atoi(value); ilog ("DOR Time Ext: %3d secs\n", config_ptr->dor_recovery_timeout_ext ); } + else if (MATCH("timeouts", "bmc_audit_period")) + { + config_ptr->bmc_audit_period = atoi(value); + if ( config_ptr->bmc_audit_period ) + { + ilog ("BMC Audit : %3d secs", config_ptr->bmc_audit_period ); + } + else + { + ilog ("BMC Audit : disabled"); + } + } return (PASS); } diff --git a/mtce/src/maintenance/mtcBmcUtil.cpp b/mtce/src/maintenance/mtcBmcUtil.cpp index 9e97ff28..2c76a654 100644 --- a/mtce/src/maintenance/mtcBmcUtil.cpp +++ b/mtce/src/maintenance/mtcBmcUtil.cpp @@ -56,21 +56,72 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, if ( node_ptr->bmc_thread_info.proto == BMC_PROTOCOL__REDFISHTOOL ) { + node_ptr->bm_cmd = REDFISHTOOL_POWER_RESET_CMD ; + /* build the reset/power control command */ switch (command) { case BMC_THREAD_CMD__POWER_RESET: - node_ptr->bm_cmd = REDFISHTOOL_POWER_RESET_CMD ; - node_ptr->bm_cmd.append(node_ptr->bmc_info.reset_action_list.front()); + { + /* use immediate for all retries if server supports an immediate command */ + if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty() )) + node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); + + /* unfaulted graceful if it exists */ + else if ( ! 
node_ptr->bmc_info.power_ctrl.reset.graceful.empty() ) + node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful); + + /* unfaulted immediate if graceful does not exist */ + else if ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty()) + node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate); + else + { + elog("%s offers no supported reset commands", node_ptr->hostname.c_str()); + return(FAIL_NOT_SUPPORTED); + } break ; + } case BMC_THREAD_CMD__POWER_ON: - node_ptr->bm_cmd = REDFISHTOOL_POWER_RESET_CMD ; - node_ptr->bm_cmd.append(node_ptr->bmc_info.power_on_action_list.front()); + { + /* use immediate for all retries if server supports an immediate command */ + if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT) && ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty() )) + node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); + + /* unfaulted graceful if it exists */ + else if ( ! node_ptr->bmc_info.power_ctrl.poweron.graceful.empty() ) + node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful); + + /* unfaulted immediate if graceful does not exist */ + else if ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty()) + node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate); + + else + { + elog("%s offers no supported poweron commands", node_ptr->hostname.c_str()); + return(FAIL_NOT_SUPPORTED); + } break ; + } case BMC_THREAD_CMD__POWER_OFF: - node_ptr->bm_cmd = REDFISHTOOL_POWER_RESET_CMD ; - node_ptr->bm_cmd.append(node_ptr->bmc_info.power_off_action_list.front()); + { + /* use immediate for all retries if server supports an immediate command */ + if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() )) + node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); + + /* unfaulted graceful if it exists */ + else if ( ! 
node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() ) + node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful); + + /* unfaulted immediate if graceful does not exist */ + else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty()) + node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate); + else + { + elog("%s offers no supported poweroff commands", node_ptr->hostname.c_str()); + return(FAIL_NOT_SUPPORTED); + } break ; + } } node_ptr->thread_extra_info.bm_cmd = node_ptr->bm_cmd ; } @@ -145,10 +196,11 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr, } else { - blog ("%s %s thread launched with the '%s' command\n", + ilog ("%s %s send '%s' command (%s)", node_ptr->hostname.c_str(), node_ptr->bmc_thread_ctrl.name.c_str(), - bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str()); + bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str(), + bmcUtil_getProtocol_str(node_ptr->bmc_protocol).c_str()); } node_ptr->bmc_thread_ctrl.retries = 0 ; } @@ -319,6 +371,16 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr ) { want_fit = true ; } + else if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__POWER_STATUS ) && + ( daemon_want_fit ( fit, node_ptr->hostname, "power_status" ) == true )) + { + want_fit = true ; + } + else if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BOOTDEV_PXE ) && + ( daemon_want_fit ( fit, node_ptr->hostname, "netboot_pxe" ) == true )) + { + want_fit = true ; + } if ( want_fit == true ) { @@ -350,20 +412,15 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr ) { if ( node_ptr->bmc_thread_ctrl.id == 0 ) { - /* don't log a warning for redfish query failures. 
*/ - if (( node_ptr->bmc_thread_info.command != BMC_THREAD_CMD__BMC_QUERY ) && - ( node_ptr->bmc_thread_info.command != BMC_THREAD_CMD__BMC_INFO )) - { - wlog ("%s %s command not-running\n", - node_ptr->hostname.c_str(), - bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str()); - } + wlog ("%s %s command not-running\n", + node_ptr->hostname.c_str(), + bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str()); rc = FAIL_NOT_ACTIVE ; } else { /* The BMC is sometimes slow, - * No need to log till we reach lalf of the retry threshold */ + * No need to log till we reach half of the retry threshold */ if ( node_ptr->bmc_thread_ctrl.retries > (BMC__MAX_RECV_RETRIES/2) ) { ilog ("%s %s command in-progress (polling %d of %d)\n", diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 355fd021..f47cf10a 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -4121,6 +4121,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) } else { + node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY ); } } @@ -4204,16 +4205,31 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_REINSTALL__POWERQRY: { - if ( node_ptr->bmc_thread_ctrl.done ) + if ( mtcTimer_expired ( node_ptr->mtcTimer ) == false ) + ; // wait for time to expire + else if ( node_ptr->bmc_thread_ctrl.done ) { /* Query Host Power Status */ - if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS ) + int rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ); + if ( rc != PASS ) { - elog ("%s '%s' send failed\n", - node_ptr->hostname.c_str(), - bmcUtil_getCmd_str( - node_ptr->bmc_thread_info.command).c_str()); - pingUtil_restart ( node_ptr->bm_ping_info ); + if ( --node_ptr->power_action_retries <= 0 ) + { + elog ("%s Reinstall power query send failed ; max retries 
(rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + pingUtil_restart ( node_ptr->bm_ping_info ); + } + else + { + elog ("%s Reinstall power query send failed ; retry %d of %d in %d seconds (rc:%d)", + node_ptr->hostname.c_str(), + MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries, + MTC_POWER_ACTION_RETRY_COUNT, + MTC_RETRY_WAIT, rc ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + } } else { @@ -4224,6 +4240,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) else { thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ; + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); } break ; } @@ -4231,26 +4248,24 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { + bool retry = false ; /* force retry on any failure */ + int rc = bmc_command_recv ( node_ptr ) ; if ( rc == RETRY ) { + dlog ("%s power query receive retry in %d seconds", + node_ptr->hostname.c_str(), MTC_RETRY_WAIT); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); - break ; } - else if ( rc != PASS ) + else if ( rc ) { - wlog ("%s '%s' failed receive (rc:%d)", - node_ptr->hostname.c_str(), - bmcUtil_getCmd_str( - node_ptr->bmc_thread_info.command).c_str(), - rc ); + retry = true ; } else if ( node_ptr->bmc_thread_info.data.empty() ) { - wlog ("%s '%s' request yielded no response data", - node_ptr->hostname.c_str(), - bmcUtil_getCmd_str( - node_ptr->bmc_thread_info.command).c_str()); + retry = true ; + wlog ("%s Reinstall power query failed ; no response data", + node_ptr->hostname.c_str()); } else { @@ -4272,17 +4287,42 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) ilog ("%s Reinstall power-off already", node_ptr->hostname.c_str()); 
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT ); + node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; } break ; } else { + retry = true ; elog ("%s Reinstall power query failed (rc:%d)", node_ptr->hostname.c_str(), rc ); } } - mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PQ ); - reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + if ( retry == true ) + { + if ( --node_ptr->power_action_retries <= 0 ) + { + elog ("%s Reinstall power query receive failed ; max retries (rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PQ ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + else + { + wlog ("%s Reinstall power query receive failed ; retry %d of %d in %d seconds (rc:%d)", + node_ptr->hostname.c_str(), + node_ptr->power_action_retries, + MTC_POWER_ACTION_RETRY_COUNT, + MTC_RETRY_WAIT, rc ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY ); + /* stay in case ; send retry in MTC_RETRY_WAIT seconds */ + } + if ( ! 
node_ptr->bmc_thread_ctrl.done ) + { + thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ; + } + } } else { @@ -4293,6 +4333,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) case MTC_REINSTALL__POWEROFF: { + node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND ); reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT ); break ; @@ -4307,8 +4348,9 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->task != MTC_TASK_REINSTALL ) mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT ); + node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; } else { @@ -4334,16 +4376,30 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) int rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__BOOTDEV_PXE ); if ( rc ) { - elog ("%s Reinstall netboot request failed (rc:%d)", - node_ptr->hostname.c_str(), rc ); - mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB ); - reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + /* handle max retries */ + if ( --node_ptr->power_action_retries <= 0 ) + { + elog ("%s Reinstall netboot send failed ; max retries (rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + else + { + wlog ("%s netboot request send failed ; retry %d of %d in %d seconds (rc:%d)", + node_ptr->hostname.c_str(), + MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries, + MTC_POWER_ACTION_RETRY_COUNT, + MTC_RETRY_WAIT, rc ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + /* stay in case, retry in 5 
seconds */ + } } else { - ilog ("%s Reinstall netboot request sent", node_ptr->hostname.c_str() ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + dlog ("%s Reinstall netboot request sent", node_ptr->hostname.c_str() ); reinstallStageChange ( node_ptr, MTC_REINSTALL__NETBOOT_WAIT ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 ); } } break ; @@ -4352,29 +4408,59 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) { if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { + bool retry = false ; + int rc = bmc_command_recv ( node_ptr ); - if ( rc == PASS ) + if ( rc == RETRY ) { - ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str()); - reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON); + dlog ("%s netboot receive retry in %d seconds", + node_ptr->hostname.c_str(), MTC_SECS_2); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 ); } - else if ( rc == RETRY ) + else if ( rc ) { - wlog ("%s Reinstall netboot receive retry", node_ptr->hostname.c_str()); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + retry = true ; } else { - elog ("%s Reinstall netboot receive failed (rc:%d)", - node_ptr->hostname.c_str(), rc ); - mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB ); - reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str()); + reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + } + + if ( retry == true ) + { + if ( --node_ptr->power_action_retries <= 0 ) + { + elog ("%s Reinstall netboot receive failed ; max retries (rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + else + { + wlog ("%s Reinstall netboot receive failed ; retry %d of %d in 
%d seconds (rc:%d)", + node_ptr->hostname.c_str(), + MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries, + MTC_POWER_ACTION_RETRY_COUNT, + MTC_RETRY_WAIT, rc ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + } + if ( ! node_ptr->bmc_thread_ctrl.done ) + { + thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ; + } } } break ; } case MTC_REINSTALL__POWERON: { + if ( ! mtcTimer_expired ( node_ptr->mtcTimer )) + break ; + + node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ; powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT ); break ; @@ -4389,7 +4475,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->task != MTC_TASK_REINSTALL ) mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); reinstallStageChange ( node_ptr , MTC_REINSTALL__OFFLINE_WAIT ); } else @@ -4408,54 +4494,6 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) } break ; } - case MTC_REINSTALL__RESET: - { - int rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); - if ( rc ) - { - elog ("%s Reinstall reset request failed (rc:%d)", - node_ptr->hostname.c_str(), rc ); - mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR ); - reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); - } - else - { - ilog ("%s Reinstall reset request sent", node_ptr->hostname.c_str()); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); - reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET_WAIT ); - } - break ; - } - case MTC_REINSTALL__RESET_WAIT: - { - if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) - { - int rc = bmc_command_recv ( node_ptr ); - if 
( rc == PASS ) - { - ilog ("%s Reinstall reset request completed", node_ptr->hostname.c_str()); - - start_offline_handler ( node_ptr ); - - /* Wait for the host to go offline */ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT ); - reinstallStageChange ( node_ptr, MTC_REINSTALL__OFFLINE_WAIT); - } - else if ( rc == RETRY ) - { - wlog ("%s Reinstall reset receive retry", node_ptr->hostname.c_str()); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); - } - else - { - elog ("%s Reinstall reset receive failed ; rc:%d", - node_ptr->hostname.c_str(), rc ); - mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR ); - reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); - } - } - break ; - } /* BMC not provisioned case */ case MTC_REINSTALL__WIPEDISK: { @@ -4826,7 +4864,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_POWEROFF__REQ_SEND: { - node_ptr->power_action_retries--; /* Handle loss of connectivity over retries */ if ( node_ptr->bmc_provisioned == false ) @@ -4854,6 +4891,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF ); if ( rc ) { + node_ptr->power_action_retries--; wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc ); powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); } @@ -4879,6 +4917,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } else if ( rc ) { + node_ptr->power_action_retries--; elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str()); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE ); @@ -4967,7 +5006,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_POWERON__START: { - plog ("%s Administrative 'Power-On' Action\n", node_ptr->hostname.c_str()); + plog ("%s 
Administrative 'Power-On' Action (%d:%d:%lu:%lu:%d:idle:%s)", + node_ptr->hostname.c_str(), + node_ptr->bmc_thread_ctrl.done, + node_ptr->bmc_thread_ctrl.retries, + node_ptr->bmc_thread_ctrl.id, + node_ptr->bmc_thread_info.id, + node_ptr->bmc_thread_info.command, + node_ptr->bmc_thread_ctrl.idle ? "Yes":"No"); mtcInvApi_update_task ( node_ptr, "Power-On Requested" ); if ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip ) == false ) @@ -5070,7 +5116,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_POWERON__REQ_SEND: { - node_ptr->power_action_retries--; /* Ensure that mtce is updated with the latest board * management ip address for this host */ @@ -5087,6 +5132,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->bmc_accessible == false ) { + node_ptr->power_action_retries--; wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n", node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY); @@ -5100,6 +5146,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON ); if ( rc ) { + node_ptr->power_action_retries--; wlog ("%s Power-On request failed (%d)\n", node_ptr->hostname.c_str(), rc ); @@ -5131,6 +5178,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) if ( rc ) { + node_ptr->power_action_retries--; elog ("%s Power-On command failed\n", node_ptr->hostname.c_str()); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); powerStageChange ( node_ptr , MTC_POWERON__QUEUE ); @@ -6375,6 +6423,7 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->bmc_protocol_learning == false ) { mtcTimer_reset ( node_ptr->bm_timer ); + ilog("%s BMC Re-Connect Start", node_ptr->hostname.c_str()); /* send the BMC Query request ; redfish 'root' request */ if ( bmc_command_send ( node_ptr, 
BMC_THREAD_CMD__BMC_QUERY ) != PASS ) @@ -6414,6 +6463,7 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_QUERY ) && (( rc == FAIL_SYSTEM_CALL ) || ( rc == FAIL_NOT_ACTIVE ))) { + ilog("%s BMC Re-Connect End ; ipmi", node_ptr->hostname.c_str()); /* TODO: may need retries */ plog ("%s bmc does not support Redfish ; " \ "defaulting to ipmi", @@ -6436,6 +6486,7 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) } else { + ilog("%s BMC Re-Connect End", node_ptr->hostname.c_str()); mtcTimer_reset ( node_ptr->bm_timer ); /* check response for redfish support */ @@ -6539,8 +6590,18 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) { mtcTimer_reset ( node_ptr->bm_timer ); mtcTimer_reset ( node_ptr->bmc_audit_timer ); - mtcTimer_start ( node_ptr->bmc_audit_timer, mtcTimer_handler, MTC_MINS_2 ); - plog ("%s bmc audit timer started (%d secs)\n", node_ptr->hostname.c_str(), MTC_MINS_2); + + int bmc_audit_period = daemon_get_cfg_ptr()->bmc_audit_period ; + if ( bmc_audit_period ) + { + /* the time for the first audit is twice the configured period */ + mtcTimer_start ( node_ptr->bmc_audit_timer, mtcTimer_handler, bmc_audit_period*2 ); + plog ("%s bmc audit timer started (%d secs)", node_ptr->hostname.c_str(), bmc_audit_period*2); + } + else + { + ilog("%s bmc audit disabled", node_ptr->hostname.c_str()); + } /* success path */ node_ptr->bmc_accessible = true ; @@ -6767,10 +6828,12 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) ( node_ptr->bmc_provisioned ) && ( node_ptr->bmc_accessible ) && ( mtcTimer_expired ( node_ptr->bmc_audit_timer ) == true ) && - ( mtcTimer_expired ( node_ptr->bm_timer ) == true )) + ( mtcTimer_expired ( node_ptr->bm_timer ) == true ) && + ( daemon_get_cfg_ptr()->bmc_audit_period != 0)) { if ( node_ptr->bmc_thread_ctrl.done ) { + ilog("%s BMC Audit Start", node_ptr->hostname.c_str());
/* send the BMC Query command */ if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__BMC_INFO ) != PASS ) { @@ -6812,6 +6875,7 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) string filedata = daemon_read_file (node_ptr->bmc_thread_info.data.data()) ; struct json_object *json_obj = json_tokener_parse((char*)filedata.data()); + ilog("%s BMC Audit End", node_ptr->hostname.c_str()); if ( json_obj ) { /* load the power state */ @@ -6838,11 +6902,12 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) power_state.c_str()); } node_ptr->power_on = power_on ; - blog1 ("%s bmc audit timer re-started (%d secs)\n", - node_ptr->hostname.c_str(), MTC_MINS_2); mtcTimer_start ( node_ptr->bmc_audit_timer, mtcTimer_handler, - MTC_MINS_2 ); + daemon_get_cfg_ptr()->bmc_audit_period ); + blog ("%s bmc audit timer re-started (%d secs)", + node_ptr->hostname.c_str(), + daemon_get_cfg_ptr()->bmc_audit_period); } json_object_put(json_obj); }