Disable Redfish BMC audit and improve reinstall failure handling

The Mtce Reinstall Handler can collide with the BMC Redfish
audit, resulting in a reinstall failure. The BMC handler's 2 minute
connection audit can collide with other BMC commands.

The reinstall handler, with its 4 bmc command operations, is
particularly susceptible.

Two additional bmc communication improvements are implemented:

1. Add 'retry' handling to all BMC requests in the Maintenance
   Reinstall Handler FSM to handle transient command failures.

   Note: There are already retries to all but the power status
   query and the netboot requests in that handler and retries
   in other administrative commands that involve bmc requests.

2. Switch BMC power control command management from 'static' to
   'learned' lists. Some BMCs don't support both graceful and
   immediate power commands; Graceful Restart and Force Restart.
   To remove the possibility of using an unsupported BMC command,
   this update switches from static to learned power command lists,
   with a log produced if a server is missing command support.

   Power commands escalate from graceful to immediate in the
   presence of retries.

Test Cases:

PASS: Verify bmc handler redfish audit is disabled
PASS: Verify reinstall soak using redfish
PASS: Verify reinstall netboot and power status retry handling
PASS: Verify all power control commands using redfish
PASS: Verify graceful operations are used if available
PASS: Verify immediate operations are used for retries

Regression:

PASS: Verify bmc ping audit success and failure handling

PASS: Verify Reset        Handling soak (redfish and ipmi)
PASS: Verify Power-Off/On Handling soak (redfish and ipmi)
PASS: Verify Reinstall    Handling soak (redfish and ipmi)
PASS: Verify Standard System Install    (redfish and ipmi)
PASS: Verify AIO DX   System Install    (redfish and ipmi)

PASS: Verify this update as a patch

Change-Id: Idb484512ccb1b16e2d0ea9aff4ab7965347b1322
Closes-Bug: 1880578
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-11-06 09:21:22 -05:00
parent 2fc05673d1
commit 1196056612
9 changed files with 403 additions and 287 deletions

View File

@ -197,10 +197,12 @@ void bmcUtil_info_init ( bmc_info_type & bmc_info )
bmc_info.power_on = false ;
bmc_info.restart_cause.clear() ;
/* clear the supported actions lists */
bmc_info.reset_action_list.clear();
bmc_info.power_on_action_list.clear();
bmc_info.power_off_action_list.clear();
bmc_info.power_ctrl.reset.graceful = "" ;
bmc_info.power_ctrl.reset.immediate = "" ;
bmc_info.power_ctrl.poweron.graceful = "" ;
bmc_info.power_ctrl.poweron.immediate = "" ;
bmc_info.power_ctrl.poweroff.graceful = "" ;
bmc_info.power_ctrl.poweroff.immediate = "" ;
}
/*************************************************************************

View File

@ -29,6 +29,19 @@ using namespace std;
#define BMC_PROTOCOL__IPMITOOL_STR ((const char *)("ipmitool"))
#define BMC_PROTOCOL__REDFISHTOOL_STR ((const char *)("redfishtool"))
/* Learned graceful and immediate power control command strings for one
 * power control operation. An empty string means no command of that
 * severity has been stored. */
typedef struct
{
string graceful ; /* graceful command string ; empty when not learned */
string immediate ; /* immediate command string ; empty when not learned */
} bmc_power_ctrl_type ;
/* Groups the graceful/immediate command pairs for each of the three
 * power control operations: reset, power-on and power-off. */
typedef struct
{
bmc_power_ctrl_type reset ; /* reset commands */
bmc_power_ctrl_type poweron ; /* power-on commands */
bmc_power_ctrl_type poweroff ; /* power-off commands */
} bmc_power_ctrl_cmds_type ;
/* important BMC query info to log and track */
typedef struct
@ -57,10 +70,8 @@ typedef struct
std::string mn ;
std::string sn ;
/* actions */
std::list<string> reset_action_list ;
std::list<string> power_on_action_list ;
std::list<string> power_off_action_list ;
/* Used to store bmc power commands and severity levels */
bmc_power_ctrl_cmds_type power_ctrl ;
/* state info */
std::string restart_cause ;

View File

@ -162,6 +162,7 @@ typedef struct
int failsafe_shutdown_delay ; /**< seconds before failsafe reboot */
int hostwd_update_period ; /**< expect hostwd to be updated */
int autorecovery_threshold ; /**< AIO stop autorecovery threshold */
int bmc_audit_period ; /**< bmc audit period cadence */
/**< Auto Recovery Thresholds */
int ar_config_threshold ; /**< Configuration Failure Threshold */

View File

@ -471,8 +471,6 @@ void mtc_stages_init ( void )
reinstallStages_str [MTC_REINSTALL__NETBOOT_WAIT ] = "Reinstall-Netboot-Wait";
reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn";
reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait";
reinstallStages_str [MTC_REINSTALL__RESET ] = "Reinstall-Reset";
reinstallStages_str [MTC_REINSTALL__RESET_WAIT ] = "Reinstall-Reset-Wait";
reinstallStages_str [MTC_REINSTALL__WIPEDISK ] = "Reinstall-Wipedisk";
reinstallStages_str [MTC_REINSTALL__WIPEDISK_WAIT ] = "Reinstall-Wipedisk-Wait";
reinstallStages_str [MTC_REINSTALL__OFFLINE_WAIT ] = "Reinstall-Offline-Wait";

View File

@ -1069,8 +1069,6 @@ typedef enum
MTC_REINSTALL__NETBOOT_WAIT,
MTC_REINSTALL__POWERON,
MTC_REINSTALL__POWERON_WAIT,
MTC_REINSTALL__RESET,
MTC_REINSTALL__RESET_WAIT,
MTC_REINSTALL__WIPEDISK,
MTC_REINSTALL__WIPEDISK_WAIT,
MTC_REINSTALL__OFFLINE_WAIT,

View File

@ -21,11 +21,40 @@ using namespace std;
#include "jsonUtil.h" /* for ... */
#include "redfishUtil.h" /* for ... this module header */
/* static prioritized list of redfish <named> actions.
* Higher priority action first. */
static std::list<string> reset_actions ;
static std::list<string> poweron_actions ;
static std::list<string> poweroff_actions ;
/*************************************************************************
 *
 * Name       : power_ctrl_severity_enum
 *
 * Description: Power control severity levels ; graceful or immediate.
 *
 *              POWER_CTRL_ACTION__MAX is the number of severity levels
 *              and is used to size the static redfish action tables
 *              in this module.
 *
 *************************************************************************/
typedef enum
{
POWER_CTRL_ACTION__GRACEFUL,
POWER_CTRL_ACTION__IMMEDIATE,
POWER_CTRL_ACTION__MAX
} power_ctrl_severity_enum ;
/* Redfish reset action strings ; the POWER_CTRL_ACTION__GRACEFUL entry
 * first and the POWER_CTRL_ACTION__IMMEDIATE entry second.
 * NOTE(review): no reader of this table is visible in this part of the
 * file ; confirm it is still referenced. */
static std::string _reset_actions[POWER_CTRL_ACTION__MAX] =
{
REDFISHTOOL_RESET__GRACEFUL_RESTART,
REDFISHTOOL_RESET__FORCE_RESTART
};
/* Redfish power-on action strings ; the POWER_CTRL_ACTION__GRACEFUL entry
 * first and the POWER_CTRL_ACTION__IMMEDIATE entry second.
 * NOTE(review): no reader of this table is visible in this part of the
 * file ; confirm it is still referenced. */
static std::string _poweron_actions[POWER_CTRL_ACTION__MAX] =
{
REDFISHTOOL_POWER_ON__ON,
REDFISHTOOL_POWER_ON__FORCE_ON
};
/* Redfish power-off action strings ; the POWER_CTRL_ACTION__GRACEFUL entry
 * first and the POWER_CTRL_ACTION__IMMEDIATE entry second.
 * NOTE(review): no reader of this table is visible in this part of the
 * file ; confirm it is still referenced. */
static std::string _poweroff_actions[POWER_CTRL_ACTION__MAX] =
{
REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN,
REDFISHTOOL_POWER_OFF__FORCE_OFF
};
/*************************************************************************
*
@ -43,16 +72,6 @@ int redfishUtil_init ( void )
{
daemon_make_dir(REDFISHTOOL_OUTPUT_DIR) ;
/* Stock reset actions in order of priority */
reset_actions.push_front(REDFISHTOOL_RESET__GRACEFUL_RESTART); /* P1 */
reset_actions.push_back (REDFISHTOOL_RESET__FORCE_RESTART); /* P2 */
poweron_actions.push_front(REDFISHTOOL_POWER_ON__ON);
poweron_actions.push_back (REDFISHTOOL_POWER_ON__FORCE_ON);
poweroff_actions.push_front(REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN);
poweroff_actions.push_back (REDFISHTOOL_POWER_OFF__FORCE_OFF);
return (PASS);
}
@ -62,12 +81,13 @@ int redfishUtil_init ( void )
*
* Purpose : Load supported host actions.
*
* Description: Filter stock actions through host actions.
* Description: Set host supported graceful and immediate power control
* commands into the node's power control action strings.
*
* Parameters : hostname - this host amer
* host_action_list - what actions this host reports support for.
*
* Updates: bmc_info - reference that includes host action lists
* Parameters : hostname - pointer to the node object
* host_action_list - what actions this host reports support for
* Updates: bmc_info - updated supported graceful and immediate
* power control commands.
*
*************************************************************************/
@ -75,10 +95,6 @@ void _load_action_lists ( string & hostname,
bmc_info_type & bmc_info,
std::list<string> & host_action_list)
{
bmc_info.reset_action_list.clear();
bmc_info.power_on_action_list.clear();
bmc_info.power_off_action_list.clear();
/* Walk through the host action list looking for and updating
* this host's bmc_info supported actions lists */
std::list<string>::iterator _host_action_list_ptr ;
@ -86,159 +102,114 @@ void _load_action_lists ( string & hostname,
_host_action_list_ptr != host_action_list.end() ;
_host_action_list_ptr++ )
{
std::list<string>::iterator _action_list_ptr ;
for ( _action_list_ptr = poweroff_actions.begin();
_action_list_ptr != poweroff_actions.end() ;
_action_list_ptr++ )
/* Warning log for hosts that don't provide one of graceful or
* immediate action commands.
*
* Error log for hosts that don't provide either graceful or
* immediate action commands.
*/
if ( (*_host_action_list_ptr) == REDFISHTOOL_RESET__GRACEFUL_RESTART )
bmc_info.power_ctrl.reset.graceful = *_host_action_list_ptr ;
else if ( (*_host_action_list_ptr) == REDFISHTOOL_RESET__FORCE_RESTART )
bmc_info.power_ctrl.reset.immediate = *_host_action_list_ptr ;
else if ( (*_host_action_list_ptr) == REDFISHTOOL_POWER_ON__ON )
bmc_info.power_ctrl.poweron.graceful = *_host_action_list_ptr ;
else if ( (*_host_action_list_ptr) == REDFISHTOOL_POWER_ON__FORCE_ON )
bmc_info.power_ctrl.poweron.immediate = *_host_action_list_ptr ;
else if ( (*_host_action_list_ptr) == REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN )
bmc_info.power_ctrl.poweroff.graceful = *_host_action_list_ptr ;
else if ( (*_host_action_list_ptr) == REDFISHTOOL_POWER_OFF__FORCE_OFF )
bmc_info.power_ctrl.poweroff.immediate = *_host_action_list_ptr ;
}
if (( bmc_info.power_ctrl.reset.graceful.empty() ) ||
( bmc_info.power_ctrl.reset.immediate.empty() ))
{
if (( bmc_info.power_ctrl.reset.graceful.empty() ) &&
( bmc_info.power_ctrl.reset.immediate.empty() ))
{
if ( (*_host_action_list_ptr) == (*_action_list_ptr) )
{
bmc_info.power_off_action_list.push_back(*_action_list_ptr) ;
break ;
}
elog("%s bmc offers no 'Reset' commands (%s:%s)",
hostname.c_str(),
REDFISHTOOL_RESET__GRACEFUL_RESTART,
REDFISHTOOL_RESET__FORCE_RESTART);
}
for ( _action_list_ptr = poweron_actions.begin();
_action_list_ptr != poweron_actions.end() ;
_action_list_ptr++ )
else if ( bmc_info.power_ctrl.reset.graceful.empty() )
{
if ( (*_host_action_list_ptr) == (*_action_list_ptr) )
{
bmc_info.power_on_action_list.push_back(*_action_list_ptr) ;
break ;
}
wlog("%s bmc offers no 'Graceful Reset' command (%s)",
hostname.c_str(),
REDFISHTOOL_RESET__GRACEFUL_RESTART);
}
for ( _action_list_ptr = reset_actions.begin();
_action_list_ptr != reset_actions.end() ;
_action_list_ptr++ )
else
{
if ( (*_host_action_list_ptr) == (*_action_list_ptr) )
{
bmc_info.reset_action_list.push_back(*_action_list_ptr) ;
break ;
}
wlog("%s bmc offers no 'Immediate Reset' command (%s)",
hostname.c_str(),
REDFISHTOOL_RESET__FORCE_RESTART);
}
}
string reset_tmp = "" ;
string poweron_tmp = "" ;
string poweroff_tmp = "" ;
std::list<string>::iterator _ptr ;
for ( _ptr = bmc_info.reset_action_list.begin();
_ptr != bmc_info.reset_action_list.end() ;
_ptr++ )
if (( bmc_info.power_ctrl.poweron.graceful.empty() ) ||
( bmc_info.power_ctrl.poweron.immediate.empty() ))
{
if ( !reset_tmp.empty() )
reset_tmp.append(",");
reset_tmp.append(*_ptr);
if (( bmc_info.power_ctrl.poweron.graceful.empty() ) &&
( bmc_info.power_ctrl.poweron.immediate.empty() ))
{
elog("%s bmc offers no 'Power-On' commands (%s:%s)",
hostname.c_str(),
REDFISHTOOL_POWER_ON__ON,
REDFISHTOOL_POWER_ON__FORCE_ON);
}
else if ( bmc_info.power_ctrl.poweron.graceful.empty() )
{
wlog("%s bmc offers no 'Graceful Power-On' command (%s)",
hostname.c_str(),
REDFISHTOOL_POWER_ON__ON);
}
else
{
wlog("%s bmc offers no 'Immediate Power-On' command (%s)",
hostname.c_str(),
REDFISHTOOL_POWER_ON__FORCE_ON);
}
}
for ( _ptr = bmc_info.power_on_action_list.begin();
_ptr != bmc_info.power_on_action_list.end() ;
_ptr++ )
if (( bmc_info.power_ctrl.poweroff.graceful.empty() ) ||
( bmc_info.power_ctrl.poweroff.immediate.empty() ))
{
if ( !poweron_tmp.empty() )
poweron_tmp.append(",");
poweron_tmp.append(*_ptr);
if (( bmc_info.power_ctrl.poweroff.graceful.empty() ) &&
( bmc_info.power_ctrl.poweroff.immediate.empty() ))
{
elog("%s bmc offers no 'Power-Off' commands (%s:%s)",
hostname.c_str(),
REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN,
REDFISHTOOL_POWER_OFF__FORCE_OFF);
}
else if ( bmc_info.power_ctrl.poweroff.graceful.empty() )
{
wlog("%s bmc offers no 'Graceful Power-Off' command (%s)",
hostname.c_str(),
REDFISHTOOL_POWER_OFF__GRACEFUL_SHUTDOWN);
}
else
{
wlog("%s bmc offers no 'Immediate Power-Off' command %s)",
hostname.c_str(),
REDFISHTOOL_POWER_OFF__FORCE_OFF);
}
}
for ( _ptr = bmc_info.power_off_action_list.begin();
_ptr != bmc_info.power_off_action_list.end() ;
_ptr++ )
{
if ( !poweroff_tmp.empty() )
poweroff_tmp.append(",");
poweroff_tmp.append(*_ptr);
}
ilog ("%s bmc actions ; reset:%s power-on:%s power-off:%s",
ilog ("%s bmc power ctrl actions ; reset:%s:%s power-on:%s:%s power-off:%s:%s",
hostname.c_str(),
reset_tmp.empty() ? "none" : reset_tmp.c_str(),
poweron_tmp.empty() ? "none" : poweron_tmp.c_str(),
poweroff_tmp.empty() ? "none" : poweroff_tmp.c_str());
bmc_info.power_ctrl.reset.graceful.empty() ? "none" : bmc_info.power_ctrl.reset.graceful.c_str(),
bmc_info.power_ctrl.reset.immediate.empty() ? "none" : bmc_info.power_ctrl.reset.immediate.c_str(),
bmc_info.power_ctrl.poweron.graceful.empty() ? "none" : bmc_info.power_ctrl.poweron.graceful.c_str(),
bmc_info.power_ctrl.poweron.immediate.empty() ? "none" : bmc_info.power_ctrl.poweron.immediate.c_str(),
bmc_info.power_ctrl.poweroff.graceful.empty() ? "none" : bmc_info.power_ctrl.poweroff.graceful.c_str(),
bmc_info.power_ctrl.poweroff.immediate.empty() ? "none" : bmc_info.power_ctrl.poweroff.immediate.c_str());
}
#ifdef SAVE_IMP
/*************************************************************************
 *
 * Name       : _get_action_list
 *
 * Purpose    : Build the list of actions this host supports for one
 *              action group.
 *
 * Description: Selects the stock action list for the requested group
 *              (reset, power-on or power-off) and copies every stock
 *              action that also appears in the host's reported action
 *              list into supp_action_list, in stock list order.
 *
 * Parameters : hostname         - host name used in logs
 *              action           - which action group to filter for
 *              host_action_list - actions the host reports support for
 *
 * Updates    : supp_action_list - cleared, then loaded with the matches
 *
 * Returns    : PASS              - at least one supported action found
 *              FAIL_BAD_CASE     - unrecognized action group
 *              FAIL_STRING_EMPTY - valid group but no matching actions
 *
 *************************************************************************/
int _get_action_list ( string hostname,
                       redfish_action_enum action,
                       std::list<string> host_action_list,
                       std::list<string> & supp_action_list)
{
    int rc = PASS ;
    std::list<string> * stock_ptr = NULL ;
    string group = "" ;

    supp_action_list.clear();

    /* select the stock list and log label for the requested group */
    if ( action == REDFISH_ACTION__RESET )
    {
        stock_ptr = &reset_actions ;
        group = "reset" ;
    }
    else if ( action == REDFISH_ACTION__POWER_ON )
    {
        stock_ptr = &poweron_actions ;
        group = "power-on" ;
    }
    else if ( action == REDFISH_ACTION__POWER_OFF )
    {
        stock_ptr = &poweroff_actions ;
        group = "power-off" ;
    }
    else
    {
        rc = FAIL_BAD_CASE ;
    }

    /* keep each stock action the host also reports ; stock order wins */
    if (( rc == PASS ) && ( stock_ptr != NULL ))
    {
        std::list<string>::iterator stock_it = stock_ptr->begin() ;
        while ( stock_it != stock_ptr->end() )
        {
            std::list<string>::iterator host_it = host_action_list.begin() ;
            while ( host_it != host_action_list.end() )
            {
                if ( (*host_it) == (*stock_it) )
                {
                    supp_action_list.push_back(*stock_it) ;
                    break ;
                }
                host_it++ ;
            }
            stock_it++ ;
        }
    }

    if ( !supp_action_list.empty() )
    {
        /* log the supported actions as a comma separated list */
        string joined = "" ;
        std::list<string>::iterator supp_it ;
        for ( supp_it  = supp_action_list.begin();
              supp_it != supp_action_list.end() ;
              supp_it++ )
        {
            if ( !joined.empty() )
                joined.append(", ");
            joined.append(*supp_it);
        }
        ilog ("%s redfish %s actions: %s",
                  hostname.c_str(),
                  group.c_str(),
                  joined.c_str());
        return (rc);
    }

    /* nothing supported ; preserve an earlier failure code if there is one */
    elog ("%s has no %s actions", hostname.c_str(), group.c_str());
    if ( rc == PASS )
        rc = FAIL_STRING_EMPTY ;
    return (rc);
}
#endif
/*************************************************************************
*
* Name : redfishUtil_is_supported

View File

@ -50,6 +50,7 @@ void daemon_config_default ( daemon_config_type* config_ptr )
config_ptr->barbican_api_host = strdup("none");
config_ptr->hostwd_kdump_on_stall = 0 ;
config_ptr->bmc_audit_period = 0 ;
config_ptr->debug_all = 0 ;
config_ptr->debug_json = 0 ;
@ -194,6 +195,18 @@ int timeout_config_handler ( void * user,
config_ptr->dor_recovery_timeout_ext = atoi(value);
ilog ("DOR Time Ext: %3d secs\n", config_ptr->dor_recovery_timeout_ext );
}
else if (MATCH("timeouts", "bmc_audit_period"))
{
config_ptr->bmc_audit_period = atoi(value);
if ( config_ptr->bmc_audit_period )
{
ilog ("BMC Audit : %3d secs", config_ptr->bmc_audit_period );
}
else
{
ilog ("BMC Audit : disabled");
}
}
return (PASS);
}

View File

@ -56,21 +56,72 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
if ( node_ptr->bmc_thread_info.proto == BMC_PROTOCOL__REDFISHTOOL )
{
node_ptr->bm_cmd = REDFISHTOOL_POWER_RESET_CMD ;
/* build the reset/power control command */
switch (command)
{
case BMC_THREAD_CMD__POWER_RESET:
node_ptr->bm_cmd = REDFISHTOOL_POWER_RESET_CMD ;
node_ptr->bm_cmd.append(node_ptr->bmc_info.reset_action_list.front());
{
/* use immediate for all retries if server supports an immediate command */
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty() ))
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
/* unfaulted graceful if it exists */
else if ( ! node_ptr->bmc_info.power_ctrl.reset.graceful.empty() )
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.graceful);
/* unfaulted immediate if graceful does not exist */
else if ( ! node_ptr->bmc_info.power_ctrl.reset.immediate.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.reset.immediate);
else
{
elog("%s offers no supported reset commands", node_ptr->hostname.c_str());
return(FAIL_NOT_SUPPORTED);
}
break ;
}
case BMC_THREAD_CMD__POWER_ON:
node_ptr->bm_cmd = REDFISHTOOL_POWER_RESET_CMD ;
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_on_action_list.front());
{
/* use immediate for all retries if server supports an immediate command */
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT) && ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty() ))
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
/* unfaulted graceful if it exists */
else if ( ! node_ptr->bmc_info.power_ctrl.poweron.graceful.empty() )
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.graceful);
/* unfaulted immediate if graceful does not exist */
else if ( ! node_ptr->bmc_info.power_ctrl.poweron.immediate.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweron.immediate);
else
{
elog("%s offers no supported poweron commands", node_ptr->hostname.c_str());
return(FAIL_NOT_SUPPORTED);
}
break ;
}
case BMC_THREAD_CMD__POWER_OFF:
node_ptr->bm_cmd = REDFISHTOOL_POWER_RESET_CMD ;
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_off_action_list.front());
{
/* use immediate for all retries if server supports an immediate command */
if ( ( node_ptr->power_action_retries < MTC_RESET_ACTION_RETRY_COUNT ) && ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty() ))
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
/* unfaulted graceful if it exists */
else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.graceful.empty() )
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.graceful);
/* unfaulted immediate if graceful does not exist */
else if ( ! node_ptr->bmc_info.power_ctrl.poweroff.immediate.empty())
node_ptr->bm_cmd.append(node_ptr->bmc_info.power_ctrl.poweroff.immediate);
else
{
elog("%s offers no supported poweroff commands", node_ptr->hostname.c_str());
return(FAIL_NOT_SUPPORTED);
}
break ;
}
}
node_ptr->thread_extra_info.bm_cmd = node_ptr->bm_cmd ;
}
@ -145,10 +196,11 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
}
else
{
blog ("%s %s thread launched with the '%s' command\n",
ilog ("%s %s send '%s' command (%s)",
node_ptr->hostname.c_str(),
node_ptr->bmc_thread_ctrl.name.c_str(),
bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str());
bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str(),
bmcUtil_getProtocol_str(node_ptr->bmc_protocol).c_str());
}
node_ptr->bmc_thread_ctrl.retries = 0 ;
}
@ -319,6 +371,16 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
{
want_fit = true ;
}
else if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__POWER_STATUS ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "power_status" ) == true ))
{
want_fit = true ;
}
else if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BOOTDEV_PXE ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "netboot_pxe" ) == true ))
{
want_fit = true ;
}
if ( want_fit == true )
{
@ -350,20 +412,15 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr->bmc_thread_ctrl.id == 0 )
{
/* don't log a warning for redfish query failures. */
if (( node_ptr->bmc_thread_info.command != BMC_THREAD_CMD__BMC_QUERY ) &&
( node_ptr->bmc_thread_info.command != BMC_THREAD_CMD__BMC_INFO ))
{
wlog ("%s %s command not-running\n",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str());
}
wlog ("%s %s command not-running\n",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str());
rc = FAIL_NOT_ACTIVE ;
}
else
{
/* The BMC is sometimes slow,
* No need to log till we reach lalf of the retry threshold */
* No need to log till we reach half of the retry threshold */
if ( node_ptr->bmc_thread_ctrl.retries > (BMC__MAX_RECV_RETRIES/2) )
{
ilog ("%s %s command in-progress (polling %d of %d)\n",

View File

@ -4121,6 +4121,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY );
}
}
@ -4204,16 +4205,31 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_REINSTALL__POWERQRY:
{
if ( node_ptr->bmc_thread_ctrl.done )
if ( mtcTimer_expired ( node_ptr->mtcTimer ) == false )
; // wait for time to expire
else if ( node_ptr->bmc_thread_ctrl.done )
{
/* Query Host Power Status */
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
int rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS );
if ( rc != PASS )
{
elog ("%s '%s' send failed\n",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
pingUtil_restart ( node_ptr->bm_ping_info );
if ( --node_ptr->power_action_retries <= 0 )
{
elog ("%s Reinstall power query send failed ; max retries (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
pingUtil_restart ( node_ptr->bm_ping_info );
}
else
{
elog ("%s Reinstall power query send failed ; retry %d of %d in %d seconds (rc:%d)",
node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries,
MTC_POWER_ACTION_RETRY_COUNT,
MTC_RETRY_WAIT, rc );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
}
}
else
{
@ -4224,6 +4240,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
else
{
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
}
break ;
}
@ -4231,26 +4248,24 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
bool retry = false ; /* force retry on any failure */
int rc = bmc_command_recv ( node_ptr ) ;
if ( rc == RETRY )
{
dlog ("%s power query receive retry in %d seconds",
node_ptr->hostname.c_str(), MTC_RETRY_WAIT);
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
else if ( rc != PASS )
else if ( rc )
{
wlog ("%s '%s' failed receive (rc:%d)",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str(),
rc );
retry = true ;
}
else if ( node_ptr->bmc_thread_info.data.empty() )
{
wlog ("%s '%s' request yielded no response data",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
retry = true ;
wlog ("%s Reinstall power query failed ; no response data",
node_ptr->hostname.c_str());
}
else
{
@ -4272,17 +4287,42 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
ilog ("%s Reinstall power-off already",
node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
}
break ;
}
else
{
retry = true ;
elog ("%s Reinstall power query failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
}
}
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PQ );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
if ( retry == true )
{
if ( --node_ptr->power_action_retries <= 0 )
{
elog ("%s Reinstall power query receive failed ; max retries (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PQ );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
wlog ("%s Reinstall power query receive failed ; retry %d of %d in %d seconds (rc:%d)",
node_ptr->hostname.c_str(),
node_ptr->power_action_retries,
MTC_POWER_ACTION_RETRY_COUNT,
MTC_RETRY_WAIT, rc );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY );
/* stay in case ; send retry in MTC_RETRY_WAIT seconds */
}
if ( ! node_ptr->bmc_thread_ctrl.done )
{
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
}
}
}
else
{
@ -4293,6 +4333,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
case MTC_REINSTALL__POWEROFF:
{
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT );
break ;
@ -4307,8 +4348,9 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->task != MTC_TASK_REINSTALL )
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
}
else
{
@ -4334,16 +4376,30 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
int rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__BOOTDEV_PXE );
if ( rc )
{
elog ("%s Reinstall netboot request failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
/* handle max retries */
if ( --node_ptr->power_action_retries <= 0 )
{
elog ("%s Reinstall netboot send failed ; max retries (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
wlog ("%s netboot request send failed ; retry %d of %d in %d seconds (rc:%d)",
node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries,
MTC_POWER_ACTION_RETRY_COUNT,
MTC_RETRY_WAIT, rc );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
/* stay in case, retry in 5 seconds */
}
}
else
{
ilog ("%s Reinstall netboot request sent", node_ptr->hostname.c_str() );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
dlog ("%s Reinstall netboot request sent", node_ptr->hostname.c_str() );
reinstallStageChange ( node_ptr, MTC_REINSTALL__NETBOOT_WAIT );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 );
}
}
break ;
@ -4352,29 +4408,59 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
bool retry = false ;
int rc = bmc_command_recv ( node_ptr );
if ( rc == PASS )
if ( rc == RETRY )
{
ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON);
dlog ("%s netboot receive retry in %d seconds",
node_ptr->hostname.c_str(), MTC_SECS_2);
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 );
}
else if ( rc == RETRY )
else if ( rc )
{
wlog ("%s Reinstall netboot receive retry", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
retry = true ;
}
else
{
elog ("%s Reinstall netboot receive failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON);
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
}
if ( retry == true )
{
if ( --node_ptr->power_action_retries <= 0 )
{
elog ("%s Reinstall netboot receive failed ; max retries (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
wlog ("%s Reinstall netboot receive failed ; retry %d of %d in %d seconds (rc:%d)",
node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries,
MTC_POWER_ACTION_RETRY_COUNT,
MTC_RETRY_WAIT, rc );
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
}
if ( ! node_ptr->bmc_thread_ctrl.done )
{
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
}
}
}
break ;
}
case MTC_REINSTALL__POWERON:
{
if ( ! mtcTimer_expired ( node_ptr->mtcTimer ))
break ;
node_ptr->power_action_retries = MTC_POWER_ACTION_RETRY_COUNT ;
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT );
break ;
@ -4389,7 +4475,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->task != MTC_TASK_REINSTALL )
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
reinstallStageChange ( node_ptr , MTC_REINSTALL__OFFLINE_WAIT );
}
else
@ -4408,54 +4494,6 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
}
break ;
}
case MTC_REINSTALL__RESET:
{
int rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
if ( rc )
{
elog ("%s Reinstall reset request failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
ilog ("%s Reinstall reset request sent", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET_WAIT );
}
break ;
}
case MTC_REINSTALL__RESET_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = bmc_command_recv ( node_ptr );
if ( rc == PASS )
{
ilog ("%s Reinstall reset request completed", node_ptr->hostname.c_str());
start_offline_handler ( node_ptr );
/* Wait for the host to go offline */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT );
reinstallStageChange ( node_ptr, MTC_REINSTALL__OFFLINE_WAIT);
}
else if ( rc == RETRY )
{
wlog ("%s Reinstall reset receive retry", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
}
else
{
elog ("%s Reinstall reset receive failed ; rc:%d",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
}
break ;
}
/* BMC not provisioned case */
case MTC_REINSTALL__WIPEDISK:
{
@ -4826,7 +4864,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_POWEROFF__REQ_SEND:
{
node_ptr->power_action_retries--;
/* Handle loss of connectivity over retries */
if ( node_ptr->bmc_provisioned == false )
@ -4854,6 +4891,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
if ( rc )
{
node_ptr->power_action_retries--;
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
}
@ -4879,6 +4917,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
}
else if ( rc )
{
node_ptr->power_action_retries--;
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
@ -4967,7 +5006,14 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_POWERON__START:
{
plog ("%s Administrative 'Power-On' Action\n", node_ptr->hostname.c_str());
plog ("%s Administrative 'Power-On' Action (%d:%d:%lu:%lu:%d:idle:%s)",
node_ptr->hostname.c_str(),
node_ptr->bmc_thread_ctrl.done,
node_ptr->bmc_thread_ctrl.retries,
node_ptr->bmc_thread_ctrl.id,
node_ptr->bmc_thread_info.id,
node_ptr->bmc_thread_info.command,
node_ptr->bmc_thread_ctrl.idle ? "Yes":"No");
mtcInvApi_update_task ( node_ptr, "Power-On Requested" );
if ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip ) == false )
@ -5070,7 +5116,6 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_POWERON__REQ_SEND:
{
node_ptr->power_action_retries--;
/* Ensure that mtce is updated with the latest board
* management ip address for this host */
@ -5087,6 +5132,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->bmc_accessible == false )
{
node_ptr->power_action_retries--;
wlog ("%s Power-On will fail ; not accessible to BMC ; retry in %d seconds \n",
node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY);
@ -5100,6 +5146,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_ON );
if ( rc )
{
node_ptr->power_action_retries--;
wlog ("%s Power-On request failed (%d)\n",
node_ptr->hostname.c_str(), rc );
@ -5131,6 +5178,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
if ( rc )
{
node_ptr->power_action_retries--;
elog ("%s Power-On command failed\n", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
powerStageChange ( node_ptr , MTC_POWERON__QUEUE );
@ -6375,6 +6423,7 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->bmc_protocol_learning == false )
{
mtcTimer_reset ( node_ptr->bm_timer );
ilog("%s BMC Re-Connect Start", node_ptr->hostname.c_str());
/* send the BMC Query request ; redfish 'root' request */
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__BMC_QUERY ) != PASS )
@ -6414,6 +6463,7 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_QUERY ) &&
(( rc == FAIL_SYSTEM_CALL ) || ( rc == FAIL_NOT_ACTIVE )))
{
ilog("%s BMC REe-Connect End ; ipmi", node_ptr->hostname.c_str());
/* TODO: may need retries */
plog ("%s bmc does not support Redfish ; " \
"defaulting to ipmi",
@ -6436,6 +6486,7 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
ilog("%s BMC Re-Connect End", node_ptr->hostname.c_str());
mtcTimer_reset ( node_ptr->bm_timer );
/* check response for redfish support */
@ -6539,8 +6590,18 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
{
mtcTimer_reset ( node_ptr->bm_timer );
mtcTimer_reset ( node_ptr->bmc_audit_timer );
mtcTimer_start ( node_ptr->bmc_audit_timer, mtcTimer_handler, MTC_MINS_2 );
plog ("%s bmc audit timer started (%d secs)\n", node_ptr->hostname.c_str(), MTC_MINS_2);
/* Start the bmc audit timer only if an audit period is configured.
 * The period is read from the daemon config ; a value of zero leaves the
 * audit timer stopped and only logs that the audit is disabled. */
int bmc_audit_period = daemon_get_cfg_ptr()->bmc_audit_period ;
if ( bmc_audit_period )
{
/* the time for the first audit is twice the configured period */
mtcTimer_start ( node_ptr->bmc_audit_timer, mtcTimer_handler, bmc_audit_period*2 );
plog ("%s bmc audit timer started (%d secs)", node_ptr->hostname.c_str(), bmc_audit_period*2);
}
else
{
/* period of zero ; timer is intentionally not started */
ilog("%s bmc audit disabled", node_ptr->hostname.c_str());
}
/* success path */
node_ptr->bmc_accessible = true ;
@ -6767,10 +6828,12 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
( node_ptr->bmc_provisioned ) &&
( node_ptr->bmc_accessible ) &&
( mtcTimer_expired ( node_ptr->bmc_audit_timer ) == true ) &&
( mtcTimer_expired ( node_ptr->bm_timer ) == true ))
( mtcTimer_expired ( node_ptr->bm_timer ) == true ) &&
( daemon_get_cfg_ptr()->bmc_audit_period != 0))
{
if ( node_ptr->bmc_thread_ctrl.done )
{
ilog("%s BMC Audit Start", node_ptr->hostname.c_str());
/* send the BMC Query command */
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__BMC_INFO ) != PASS )
{
@ -6812,6 +6875,7 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
string filedata = daemon_read_file (node_ptr->bmc_thread_info.data.data()) ;
struct json_object *json_obj =
json_tokener_parse((char*)filedata.data());
ilog("%s BMC Audit End", node_ptr->hostname.c_str());
if ( json_obj )
{
/* load the power state */
@ -6838,11 +6902,12 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
power_state.c_str());
}
node_ptr->power_on = power_on ;
blog1 ("%s bmc audit timer re-started (%d secs)\n",
node_ptr->hostname.c_str(), MTC_MINS_2);
mtcTimer_start ( node_ptr->bmc_audit_timer,
mtcTimer_handler,
MTC_MINS_2 );
daemon_get_cfg_ptr()->bmc_audit_period );
blog ("%s bmc audit timer re-started (%d secs)",
node_ptr->hostname.c_str(),
daemon_get_cfg_ptr()->bmc_audit_period);
}
json_object_put(json_obj);
}