Add network boot support to mtce reinstall handling

The current maintenance 'reinstall' handling requires
a host to be booted and online in order to perform
a reinstall by asking the mtcClient to wipe the
disks and self reboot thereby forcing a network boot
and reinstall.

This re-install process is problematic for hosts that
don't install properly and never come online, or for new
system installs where the existing boot image on disk is
still valid and the local disk is the first boot device.

Getting around these issues prior to this update
requires manual BIOS intervention to force-select
a network boot.

This update continues to support the online-wipedisk
method for hosts that are not BMC provisioned and
adds offline reinstall support through IPMI commands
for hosts that are BMC provisioned.

For hosts that have the BMC provisioned, the re-install
handler will wait up to 10 minutes for maintenance to
establish connectivity to the BMC if it has not already.

Then it will issue a 'bootdev pxe' IPMI command to tell the
BMC to boot from the network on the 'next' reset and then
maintenance proceeds to reset that host by a second IPMI
command. This way the host will boot from the network and
perform a local install even if the current image on disk
is valid. No manual BIOS actions required.

This update requires a small system inventory update to
relax the online requirement for BMC provisioned hosts so
that the reinstall can proceed. That update depends on this one.

This update also does some minor cleanup in the unused
mtcAgent test head to fix a static analysis error.

Test Plan:
With BMC Test Cases: Success
----------------------------
PASS: Verify install requiring power on with valid image on disk ; pass case
PASS: Verify install while powered on but offline with invalid image on disk ; pass case
PASS: Verify install while powered on but offline with valid image on disk ; pass case
PASS: Verify install with UEFI boot
PASS: Verify BMC Reinstall on Dell (720)
PASS: Verify BMC Reinstall on WC
PASS: Verify BMC Reinstall on HP (hp380)
PEND: Verify BMC Reinstall on SM
PEND: Verify BMC Reinstall on WP
COND: Verify install Secure boot - 430 1-2 fails

With BMC Test Cases: Failure
---------------------------
PASS: Verify reinstall handling during install during online wait ; restarts the install
PASS: Verify reinstall handling during install before online wait ; no install interruption
PASS: Verify BMC not accessible at ReInstall start ; recovery
PASS: Verify BMC not accessible at ReInstall start ; timeout
PASS: Verify BMC accessibility loss over Install process
PASS: Verify netboot request failure handling ; no/bad response ; max retry
PASS: Verify reset request failure handling ; no retries
PASS: Verify BMC de-provisioning over install ; failure handling
PASS: Verify BMC re-provisioning over install ; BMC initially not accessible
PASS: Verify BMC re-provisioning over install ; BMC initially accessible
PASS: Verify install requiring power on but gets power-on receive failure
PASS: Verify install requiring power on but gets power-on request failure

No BMC Test Cases: Success
--------------------------
PASS: Verify install when host is powered on and online

No BMC Test Cases: Failure
--------------------------
PASS: Verify reinstall action handling during reinstall ; no install interruption
PASS: Verify install when host is powered off ; install fails
PASS: Verify install when host is powered on and offline ; install fails

Regression:
-----------
PASS: Verify host reset
PASS: Verify host power-off
PASS: Verify host power-on
PASS: Verify host sensor model and monitoring

Change-Id: Ic8c8232167c570e4f75c0bbe1604697966157184
Story: 2005650
Task: 30935
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2019-05-16 10:59:08 -04:00
parent 30365dd34a
commit 1011fd8a1a
11 changed files with 472 additions and 122 deletions

View File

@ -43,6 +43,9 @@
#define IPMITOOL_RESTART_CAUSE_CMD ((const char *)("chassis restart_cause")) #define IPMITOOL_RESTART_CAUSE_CMD ((const char *)("chassis restart_cause"))
#define IPMITOOL_BOOTDEV_PXE_CMD ((const char *)("chassis bootdev pxe"))
#define IPMITOOL_BOOTDEV_PXE_RESP ((const char *)("Set Boot Device to pxe"))
#define IPMITOOL_MC_INFO_CMD ((const char *)("mc info")) #define IPMITOOL_MC_INFO_CMD ((const char *)("mc info"))
#define IPMITOOL_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result")) #define IPMITOOL_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result"))
@ -65,6 +68,7 @@ typedef enum
IPMITOOL_THREAD_CMD__MC_INFO, IPMITOOL_THREAD_CMD__MC_INFO,
IPMITOOL_THREAD_CMD__POWER_STATUS, IPMITOOL_THREAD_CMD__POWER_STATUS,
IPMITOOL_THREAD_CMD__RESTART_CAUSE, IPMITOOL_THREAD_CMD__RESTART_CAUSE,
IPMITOOL_THREAD_CMD__BOOTDEV_PXE,
IPMITOOL_THREAD_CMD__READ_SENSORS, IPMITOOL_THREAD_CMD__READ_SENSORS,

View File

@ -325,7 +325,7 @@ static std::string sensorStages_str [MTC_SENSOR__STAGES +1] ;
static std::string powerStages_str [MTC_POWER__STAGES +1] ; static std::string powerStages_str [MTC_POWER__STAGES +1] ;
static std::string powercycleStages_str [MTC_POWERCYCLE__STAGES +1] ; static std::string powercycleStages_str [MTC_POWERCYCLE__STAGES +1] ;
static std::string resetStages_str [MTC_RESET__STAGES +1] ; static std::string resetStages_str [MTC_RESET__STAGES +1] ;
static std::string reinstallStages_str [MTC_RESET__STAGES +1] ; static std::string reinstallStages_str [MTC_REINSTALL__STAGES +1] ;
static std::string oosTestStages_str [MTC_OOS_TEST__STAGES +1] ; static std::string oosTestStages_str [MTC_OOS_TEST__STAGES +1] ;
static std::string insvTestStages_str [MTC_INSV_TEST__STAGES +1] ; static std::string insvTestStages_str [MTC_INSV_TEST__STAGES +1] ;
static std::string configStages_str [MTC_CONFIG__STAGES +1] ; static std::string configStages_str [MTC_CONFIG__STAGES +1] ;
@ -451,7 +451,17 @@ void mtc_stages_init ( void )
resetStages_str [MTC_RESET__STAGES ] = "Reset-Unknown"; resetStages_str [MTC_RESET__STAGES ] = "Reset-Unknown";
reinstallStages_str [MTC_REINSTALL__START ] = "Reinstall-Start"; reinstallStages_str [MTC_REINSTALL__START ] = "Reinstall-Start";
reinstallStages_str [MTC_REINSTALL__RESP_WAIT ] = "Reinstall-Response-Wait"; reinstallStages_str [MTC_REINSTALL__START_WAIT ] = "Reinstall-Start-Wait";
reinstallStages_str [MTC_REINSTALL__RESTART ] = "Reinstall-ReStart";
reinstallStages_str [MTC_REINSTALL__RESTART_WAIT ] = "Reinstall-ReStart-Wait";
reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn";
reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait";
reinstallStages_str [MTC_REINSTALL__NETBOOT ] = "Reinstall-Netboot";
reinstallStages_str [MTC_REINSTALL__NETBOOT_WAIT ] = "Reinstall-Netboot-Wait";
reinstallStages_str [MTC_REINSTALL__RESET ] = "Reinstall-Reset";
reinstallStages_str [MTC_REINSTALL__RESET_WAIT ] = "Reinstall-Reset-Wait";
reinstallStages_str [MTC_REINSTALL__WIPEDISK ] = "Reinstall-Wipedisk";
reinstallStages_str [MTC_REINSTALL__WIPEDISK_WAIT ] = "Reinstall-Wipedisk-Wait";
reinstallStages_str [MTC_REINSTALL__OFFLINE_WAIT ] = "Reinstall-Offline-Wait"; reinstallStages_str [MTC_REINSTALL__OFFLINE_WAIT ] = "Reinstall-Offline-Wait";
reinstallStages_str [MTC_REINSTALL__ONLINE_WAIT ] = "Reinstall-Online-Wait"; reinstallStages_str [MTC_REINSTALL__ONLINE_WAIT ] = "Reinstall-Online-Wait";
reinstallStages_str [MTC_REINSTALL__FAIL ] = "Reinstall-Failure"; reinstallStages_str [MTC_REINSTALL__FAIL ] = "Reinstall-Failure";

View File

@ -220,7 +220,17 @@ void daemon_exit ( void );
#define MTC_TASK_REBOOT_FAIL_RETRY "Reboot Failed, retrying (%d of %d)" #define MTC_TASK_REBOOT_FAIL_RETRY "Reboot Failed, retrying (%d of %d)"
#define MTC_TASK_REBOOT_ABORT "Reboot Failed, try again when host is 'online'" #define MTC_TASK_REBOOT_ABORT "Reboot Failed, try again when host is 'online'"
#define MTC_TASK_RESET_PROG "Rebooting/Resetting Host" #define MTC_TASK_RESET_PROG "Rebooting/Resetting Host"
#define MTC_TASK_REINSTALL "Reinstalling Host" #define MTC_TASK_REINSTALL "Reinstalling"
#define MTC_TASK_REINSTALL_WAIT_NA "Reinstall Wait ; BMC not accessible"
#define MTC_TASK_REINSTALL_RTRY_PC "Reinstall Retry ; BMC provisioned change during install"
#define MTC_TASK_REINSTALL_FAIL_CL "Reinstall Failed ; BMC connectivity lost"
#define MTC_TASK_REINSTALL_FAIL_OL "Reinstall Failed ; timeout waiting for offline"
#define MTC_TASK_REINSTALL_FAIL_TO "Reinstall Failed ; timeout waiting for online"
#define MTC_TASK_REINSTALL_FAIL_BA "Reinstall Failed ; timeout waiting BMC access"
#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power on host"
#define MTC_TASK_REINSTALL_FAIL_NB "Reinstall Failed ; netboot request"
#define MTC_TASK_REINSTALL_FAIL_PR "Reinstall Failed ; power reset request"
#define MTC_TASK_REINSTALL_FAIL "Reinstall Failed" #define MTC_TASK_REINSTALL_FAIL "Reinstall Failed"
#define MTC_TASK_REINSTALL_SUCCESS "Reinstall Succeeded" #define MTC_TASK_REINSTALL_SUCCESS "Reinstall Succeeded"
#define MTC_TASK_BOOTING "Booting" #define MTC_TASK_BOOTING "Booting"
@ -1008,7 +1018,17 @@ string get_resetStages_str ( mtc_resetStages_enum stage );
typedef enum typedef enum
{ {
MTC_REINSTALL__START = 0, MTC_REINSTALL__START = 0,
MTC_REINSTALL__RESP_WAIT, MTC_REINSTALL__START_WAIT,
MTC_REINSTALL__RESTART,
MTC_REINSTALL__RESTART_WAIT,
MTC_REINSTALL__POWERON,
MTC_REINSTALL__POWERON_WAIT,
MTC_REINSTALL__NETBOOT,
MTC_REINSTALL__NETBOOT_WAIT,
MTC_REINSTALL__RESET,
MTC_REINSTALL__RESET_WAIT,
MTC_REINSTALL__WIPEDISK,
MTC_REINSTALL__WIPEDISK_WAIT,
MTC_REINSTALL__OFFLINE_WAIT, MTC_REINSTALL__OFFLINE_WAIT,
MTC_REINSTALL__ONLINE_WAIT, MTC_REINSTALL__ONLINE_WAIT,
MTC_REINSTALL__FAIL, MTC_REINSTALL__FAIL,

View File

@ -234,7 +234,7 @@ int _timer_stop ( struct mtc_timer * mtcTimer_ptr , bool int_safe)
} }
else if ( int_safe == false ) else if ( int_safe == false )
{ {
elog ("%s (%s) called with null TID (count:%d)\n", wlog ("%s (%s) called with null TID (count:%d)\n",
mtcTimer_ptr->hostname.c_str(), mtcTimer_ptr->hostname.c_str(),
mtcTimer_ptr->service.c_str(), mtcTimer_ptr->service.c_str(),
timer_count); timer_count);

View File

@ -64,7 +64,7 @@
#define MTC_ALIVE_TIMER (5) #define MTC_ALIVE_TIMER (5)
#define MTC_POWEROFF_DELAY (5) #define MTC_POWEROFF_DELAY (5)
#define MTC_SWACT_POLL_TIMER (10) #define MTC_SWACT_POLL_TIMER (10)
#define MTC_TASK_UPDATE_DELAY (10) #define MTC_TASK_UPDATE_DELAY (30)
#define MTC_BM_PING_TIMEOUT (30) #define MTC_BM_PING_TIMEOUT (30)
#define MTC_BM_POWEROFF_TIMEOUT (30) #define MTC_BM_POWEROFF_TIMEOUT (30)
#define MTC_BM_POWERON_TIMEOUT (30) #define MTC_BM_POWERON_TIMEOUT (30)
@ -80,6 +80,7 @@
#define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5) #define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5)
#define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11) #define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11)
#define MTC_REINSTALL_TIMEOUT_DEFAULT (MTC_MINS_40) #define MTC_REINSTALL_TIMEOUT_DEFAULT (MTC_MINS_40)
#define MTC_REINSTALL_TIMEOUT_BMC_ACC (MTC_MINS_10)
#define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1) #define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1)
#define MTC_REINSTALL_TIMEOUT_MAX (MTC_HRS_4) #define MTC_REINSTALL_TIMEOUT_MAX (MTC_HRS_4)
#define MTC_REINSTALL_WAIT_TIMER (10) #define MTC_REINSTALL_WAIT_TIMER (10)

View File

@ -2181,6 +2181,25 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
else if ( !inv.action.compare ( "reinstall" ) ) else if ( !inv.action.compare ( "reinstall" ) )
{ {
plog ("%s Reinstall Action\n", node_ptr->hostname.c_str()); plog ("%s Reinstall Action\n", node_ptr->hostname.c_str());
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__REINSTALL )
{
/* Allow user to restart the re-install if
* - it's in progress,
* - there is a BMC provisioned and
* - we are waiting while the actual install is in progress */
if (( node_ptr->bm_provisioned == true ) &&
( node_ptr->reinstallStage == MTC_REINSTALL__ONLINE_WAIT))
{
reinstallStageChange ( node_ptr , MTC_REINSTALL__START );
}
else
{
/* Otherwise allow the current install to continue
* remind the user that there is a reinstall
* in progress */
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL);
}
}
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__REINSTALL ); adminActionChange ( node_ptr , MTC_ADMIN_ACTION__REINSTALL );
/* generate command=reinstall log */ /* generate command=reinstall log */

View File

@ -130,6 +130,11 @@ int nodeLinkClass::ipmi_command_send ( struct nodeLinkClass::node * node_ptr, in
{ {
want_fit = true ; want_fit = true ;
} }
else if (( command == IPMITOOL_THREAD_CMD__BOOTDEV_PXE ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "netboot_pxe" ) == true ))
{
want_fit = true ;
}
if ( want_fit == true ) if ( want_fit == true )
{ {

View File

@ -1615,31 +1615,6 @@ extern int mtcJsonInv_testhead ( void );
int daemon_run_testhead ( void ) int daemon_run_testhead ( void )
{ {
int rc = PASS; int rc = PASS;
mtc_config.testmode = true ;
nodeLinkClass * mtcInv_testhead_ptr = new nodeLinkClass ;
printf ("\n\n");
printf (TESTHEAD_BAR);
printf ("| Node Class Test Head - Private and Public Member Functions\n");
printf (TESTHEAD_BAR);
for ( int i = 0 ; i < 11 ; i++ )
{
if ( mtcInv_testhead_ptr->testhead ( i+1 ) )
{
FAILED_STR ;
rc = FAIL ;
}
else
PASSED ;
}
free(mtcInv_testhead_ptr);
printf (TESTHEAD_BAR);
printf ("| Maintenance Timer Test Head\n");
printf (TESTHEAD_BAR);
return (rc); return (rc);
} }

View File

@ -294,37 +294,16 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
{ {
flog ("%s -> OOS Action Check\n", node_ptr->hostname.c_str()); flog ("%s -> OOS Action Check\n", node_ptr->hostname.c_str());
/* TEMPORARY: To allow reset of unlocked host for fault insertion. */ elog ("%s Administrative '%s' Operation Rejected\n",
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RESET ) node_ptr->hostname.c_str(),
{ get_adminAction_str (node_ptr->adminAction) );
wlog ("%s Allowing Reset of unlocked host for FIT\n", node_ptr->hostname.c_str());
if ( node_ptr->hostname.compare(nodeLinkClass::my_hostname)) elog ("%s Cannot perform out-of-service action against in-service host\n",
{ node_ptr->hostname.c_str());
nodeLinkClass::reset_handler ( node_ptr ); adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
}
else
{
wlog ("%s Cowardly avoiding reset of self\n", node_ptr->hostname.c_str());
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
/* Clear the UI task since we are not really resetting */ /* Clear the UI task since we are not really taking this action */
mtcInvApi_update_task ( node_ptr, "" ); mtcInvApi_update_task ( node_ptr, "" );
}
}
else
{
elog ("%s Administrative '%s' Operation Rejected\n",
node_ptr->hostname.c_str(),
get_adminAction_str (node_ptr->adminAction) );
elog ("%s Cannot perform out-of-service action against in-service host\n",
node_ptr->hostname.c_str());
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
/* Clear the UI task since we are not really resetting */
mtcInvApi_update_task ( node_ptr, "" );
}
} }
/**************************************************************************** /****************************************************************************

View File

@ -1971,12 +1971,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
{ {
if ( mtcTimer_expired ( node_ptr->mtcTimer )) if ( mtcTimer_expired ( node_ptr->mtcTimer ))
{ {
rc = ipmi_command_recv ( node_ptr ); rc = ipmi_command_recv ( node_ptr );
if ( rc == RETRY ) if ( rc == RETRY )
{ {
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ; break ;
} }
if ( rc ) if ( rc )
{ {
@ -4011,63 +4011,364 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
return (PASS); return (PASS);
} }
/* Reinstall handler /****************************************************************************
* -------------- *
* Name : reinstall_handler
*
* Purpose : Perform actions that result in a network boot so that a new
* image is installed on the specified node's boot partition.
*
* Description: This FSM handles node (re)install with and without
* a provisioned Board Management Controller (BMC).
*
* BMC provisioned case: using IPMI commands to BMC ...
*
* - ensure host power is on
* - force network boot on next reset
* - issue node reset
*
* BMC not provisioned case: using mtce messaging to node ...
*
* - send mtcClient wipedisk command
* fail reinstall if no ACK
* - send mtcClient reboot command
*
* Both cases:
*
* - wait for offline
* - wait for online
* - install complete
*
* Failure Handling:
*
* BMC provisioned cases:
*
* BMC won't power on
* BMC ipmi command failure
* BMC connectivity lost mid-FSM.
* BMC access timeout
*
* BMC not provisioned cases:
*
* no wipedisk ACK
*
* failure to go offline after reset/reboot
* timeout waiting for online after reset/reboot
*
* Manage reinstall operations for a locked-disabled host */ * Manage reinstall operations for a locked-disabled host */
int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
{ {
/* Handle 'lost BMC connectivity during the install' case */
if (( node_ptr->bm_provisioned == true ) &&
( node_ptr->bm_accessible == false ))
{
if (( node_ptr->reinstallStage != MTC_REINSTALL__START ) &&
( node_ptr->reinstallStage != MTC_REINSTALL__START_WAIT ) &&
( node_ptr->reinstallStage != MTC_REINSTALL__FAIL ) &&
( node_ptr->reinstallStage != MTC_REINSTALL__MSG_DISPLAY ) &&
( node_ptr->reinstallStage != MTC_REINSTALL__DONE ))
{
mtcTimer_reset ( node_ptr->mtcTimer );
elog ("%s Reinstall lost bmc connection",
node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_CL );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
/* fall into switch to ...
* - handle failure
* - finish the FSM
*/
}
switch ( node_ptr->reinstallStage ) switch ( node_ptr->reinstallStage )
{ {
case MTC_REINSTALL__START: case MTC_REINSTALL__START:
{ {
int host_reinstall_wait_timer = node_ptr->mtcalive_timeout + node_reinstall_timeout ; LOAD_NODETYPE_TIMERS ;
node_ptr->retries = host_reinstall_wait_timer / MTC_REINSTALL_WAIT_TIMER ; mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
node_ptr->retries = ( node_ptr->mtcalive_timeout +
this->node_reinstall_timeout) /
MTC_REINSTALL_WAIT_TIMER ;
mtcTimer_reset ( node_ptr->mtcTimer );
if ( node_ptr->bm_provisioned == true )
{
if ( node_ptr->bm_accessible == false )
{
/* BMC is provisioned but not yet accessible ; wait for access */
wlog ("%s Reinstall wait for BMC access ; %d second timeout",
node_ptr->hostname.c_str(),
MTC_REINSTALL_TIMEOUT_BMC_ACC);
start_offline_handler ( node_ptr ); mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_WAIT_NA );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_TIMEOUT_BMC_ACC );
reinstallStageChange ( node_ptr, MTC_REINSTALL__START_WAIT );
}
else if ( node_ptr->power_on == false )
{
/* need to power on node */
wlog ("%s Reinstall power-on required", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON );
}
else
{
/* power is on so issue net boot command */
ilog ("%s Reinstall power is on", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
}
}
else
{
/* If the BMC is not provisioned coming into this handler
* then service the install by mtce commands by starting
* the install by wipedisk. */
reinstallStageChange ( node_ptr, MTC_REINSTALL__WIPEDISK );
}
break ;
}
/* BMC provisioned but bm_handler has not reported accessibility yet.
* Need to wait ... */
case MTC_REINSTALL__START_WAIT:
{
if ( node_ptr->bm_provisioned == true )
{
if ( node_ptr->bm_accessible == false )
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
/* wait period has timed out ; fail the install */
elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_BA);
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_BA );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
else
{
; /* ... wait longer */
}
}
else
{
/* the BMC is now accessible ; start the install over */
plog ("%s BMC access established ; starting install",
node_ptr->hostname.c_str());
mtcTimer_reset ( node_ptr->mtcTimer );
reinstallStageChange ( node_ptr , MTC_REINSTALL__START );
}
}
else
{
/*
* Handle case where BMC gets deprovisioned
* while waiting for accessibility.
*
* Restart the install in that case after a monitored
* wait period for reprovision.
*
* Has the side effect of allowing the admin to
* reprovision the BMC during a re-install.
*/
mtcTimer_reset ( node_ptr->mtcTimer );
wlog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_RTRY_PC );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_RTRY_PC );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_5 );
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESTART_WAIT );
}
break ;
}
case MTC_REINSTALL__RESTART_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
reinstallStageChange ( node_ptr , MTC_REINSTALL__START );
}
else if ( node_ptr->bm_provisioned == true )
{
mtcTimer_reset ( node_ptr->mtcTimer );
wlog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_RTRY_PC );
reinstallStageChange ( node_ptr , MTC_REINSTALL__START );
}
else
{
; /* ... wait longer */
}
break ;
}
case MTC_REINSTALL__POWERON:
{
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT );
break ;
}
case MTC_REINSTALL__POWERON_WAIT:
{
/* The power handler manages timeout */
if ( node_ptr->powerStage == MTC_POWER__DONE )
{
if ( node_ptr->power_on == true )
{
if ( node_ptr->task != MTC_TASK_REINSTALL )
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
}
else
{
elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_PO);
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PO );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
}
else
{
/* run the power handler till the host's power is on or
* the power-on handler times out */
power_handler ( node_ptr );
}
break ;
}
case MTC_REINSTALL__NETBOOT:
{
/* Issue netboot command after timed delay */
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__BOOTDEV_PXE );
if ( rc )
{
elog ("%s Reinstall netboot request failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
ilog ("%s Reinstall netboot request sent", node_ptr->hostname.c_str() );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
reinstallStageChange ( node_ptr, MTC_REINSTALL__NETBOOT_WAIT );
}
}
break ;
}
case MTC_REINSTALL__NETBOOT_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = ipmi_command_recv ( node_ptr );
if ( rc == PASS )
{
ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET);
}
else if ( rc == RETRY )
{
wlog ("%s Reinstall netboot receive retry", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
}
else
{
elog ("%s Reinstall netboot receive failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
}
break ;
}
case MTC_REINSTALL__RESET:
{
int rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_RESET );
if ( rc )
{
elog ("%s Reinstall reset request failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
ilog ("%s Reinstall reset request sent", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET_WAIT );
}
break ;
}
case MTC_REINSTALL__RESET_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = ipmi_command_recv ( node_ptr );
if ( rc == PASS )
{
ilog ("%s Reinstall reset request completed", node_ptr->hostname.c_str());
start_offline_handler ( node_ptr );
/* Wait for the host to go offline */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT );
reinstallStageChange ( node_ptr, MTC_REINSTALL__OFFLINE_WAIT);
}
else if ( rc == RETRY )
{
wlog ("%s Reinstall reset receive retry", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
}
else
{
elog ("%s Reinstall reset receive failed ; rc:%d",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
}
break ;
}
/* BMC not provisioned case */
case MTC_REINSTALL__WIPEDISK:
{
node_ptr->cmdReq = MTC_CMD_WIPEDISK ; node_ptr->cmdReq = MTC_CMD_WIPEDISK ;
plog ("%s Administrative Reinstall Requested\n", node_ptr->hostname.c_str()); plog ("%s Reinstall wipedisk requested", node_ptr->hostname.c_str());
if ( send_mtc_cmd ( node_ptr->hostname, MTC_CMD_WIPEDISK, MGMNT_INTERFACE ) != PASS ) if ( send_mtc_cmd ( node_ptr->hostname, MTC_CMD_WIPEDISK, MGMNT_INTERFACE ) != PASS )
{ {
elog ("Failed to send 'reinstall' request to %s\n", node_ptr->hostname.c_str()); elog ("%s Reinstall request send failed", node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
} }
else else
{ {
node_ptr->cmdRsp = MTC_CMD_NONE ; node_ptr->cmdRsp = MTC_CMD_NONE ;
if ( node_ptr->mtcTimer.tid ) mtcTimer_reset ( node_ptr->mtcTimer );
{
mtcTimer_stop ( node_ptr->mtcTimer );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );
ilog ("%s waiting for REINSTALL ACK \n", node_ptr->hostname.c_str() ); reinstallStageChange ( node_ptr , MTC_REINSTALL__WIPEDISK_WAIT );
reinstallStageChange ( node_ptr , MTC_REINSTALL__RESP_WAIT );
} }
break ; break ;
} }
case MTC_REINSTALL__RESP_WAIT: case MTC_REINSTALL__WIPEDISK_WAIT:
{ {
if ( node_ptr->cmdRsp != MTC_CMD_WIPEDISK ) if ( node_ptr->cmdRsp != MTC_CMD_WIPEDISK )
{ {
if ( node_ptr->mtcTimer.ring == true ) if ( node_ptr->mtcTimer.ring == true )
{ {
elog ("%s REINSTALL ACK Timeout\n", elog ("%s Reinstall wipedisk ACK timeout",
node_ptr->hostname.c_str()); node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
} }
} }
else else
{ {
/* declare successful reinstall request */ /* declare successful reinstall request */
plog ("%s REINSTALL Request Succeeded\n", node_ptr->hostname.c_str()); plog ("%s Reinstall request succeeded", node_ptr->hostname.c_str());
mtcTimer_stop ( node_ptr->mtcTimer ); mtcTimer_reset ( node_ptr->mtcTimer );
start_offline_handler ( node_ptr );
/* We need to wait for the host to go offline */ /* We need to wait for the host to go offline */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT );
@ -4085,49 +4386,57 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
clear_service_readies ( node_ptr ); clear_service_readies ( node_ptr );
ilog ("%s Reinstall Progress: host is offline ; waiting for host to come back\n", node_ptr->hostname.c_str()); ilog ("%s Reinstall in-progress ; waiting for 'online' state",
node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
reinstallStageChange ( node_ptr , MTC_REINSTALL__ONLINE_WAIT ); reinstallStageChange ( node_ptr , MTC_REINSTALL__ONLINE_WAIT );
} }
else if ( node_ptr->mtcTimer.ring == true ) else if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
elog ("%s offline timeout - reinstall failed\n", node_ptr->hostname.c_str()); elog ("%s failed to go offline ; timeout", node_ptr->hostname.c_str());
stop_offline_handler ( node_ptr );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_OL );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
} }
else
{
; // wait longer ...
}
break ; break ;
} }
case MTC_REINSTALL__ONLINE_WAIT: case MTC_REINSTALL__ONLINE_WAIT:
{ {
if ( node_ptr->mtcTimer.ring == true ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) if ( --node_ptr->retries < 0 )
{ {
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS); elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_TO);
mtcTimer_stop ( node_ptr->mtcTimer ); mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_TO );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE );
} }
else else
{ {
if ( --node_ptr->retries < 0 ) mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
{
elog ("%s online timeout - reinstall failed\n", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
else
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
}
} }
} }
else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )
{
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS);
mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE );
}
else
{
; // wait longer ...
}
break; break;
} }
case MTC_REINSTALL__FAIL: case MTC_REINSTALL__FAIL:
{ {
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL); mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_stop ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY ); reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED ); mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED );
@ -4135,23 +4444,33 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
} }
case MTC_REINSTALL__MSG_DISPLAY: case MTC_REINSTALL__MSG_DISPLAY:
{ {
if ( node_ptr->mtcTimer.ring == true ) if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{ {
node_ptr->mtcTimer.ring = false ;
reinstallStageChange ( node_ptr , MTC_REINSTALL__DONE ); reinstallStageChange ( node_ptr , MTC_REINSTALL__DONE );
} }
else
{
; // wait longer ...
}
break ; break ;
} }
case MTC_REINSTALL__DONE: case MTC_REINSTALL__DONE:
default: default:
{ {
plog ("%s Reinstall Completed\n", node_ptr->hostname.c_str()); if ( node_ptr->task == MTC_TASK_REINSTALL_SUCCESS )
{
plog ("%s Reinstall completed successfully",
node_ptr->hostname.c_str());
}
else
{
plog ("%s Reinstall complete ; operation failure",
node_ptr->hostname.c_str());
}
/* Default timeout values */ /* Default timeout values */
LOAD_NODETYPE_TIMERS ; LOAD_NODETYPE_TIMERS ;
mtcTimer_stop ( node_ptr->mtcTimer );
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE ); adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
recovery_ctrl_init ( node_ptr->hwmon_reset ); recovery_ctrl_init ( node_ptr->hwmon_reset );
@ -4583,6 +4902,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
} }
else else
{ {
node_ptr->power_on = false ;
ilog ("%s power is off ; powering on ...\n", node_ptr->hostname.c_str() ); ilog ("%s power is off ; powering on ...\n", node_ptr->hostname.c_str() );
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
} }
@ -4623,7 +4943,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
} }
else else
{ {
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON ); rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
if ( rc ) if ( rc )
{ {
wlog ("%s Power-On request failed (%d)\n", wlog ("%s Power-On request failed (%d)\n",
@ -4918,14 +5238,14 @@ int nodeLinkClass::powercycle_handler ( struct nodeLinkClass::node * node_ptr )
{ {
bool on = false ; bool on = false ;
ilog ("%s Power Status: %s\n", ilog ("%s Power Status: %s\n",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
node_ptr->ipmitool_thread_info.data.c_str()); node_ptr->ipmitool_thread_info.data.c_str());
if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos ) if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos )
{ {
on = true ; on = true ;
} }
if ( rc == PASS ) if ( rc == PASS )
{ {
/* maintain current power state */ /* maintain current power state */
@ -6033,6 +6353,12 @@ int nodeLinkClass::bm_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->ipmitool_thread_info.data.c_str()); node_ptr->ipmitool_thread_info.data.c_str());
plog ("%s bmc is accessible\n", node_ptr->hostname.c_str()); plog ("%s bmc is accessible\n", node_ptr->hostname.c_str());
/* set host power state ; on or off */
if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_ON_STATUS) != std::string::npos )
node_ptr->power_on = true ;
else
node_ptr->power_on = false ;
if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_OFF_STATUS) != std::string::npos ) if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_OFF_STATUS) != std::string::npos )
{ {
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
@ -6333,7 +6659,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
} }
node_ptr->ipmitool_thread_ctrl.done = true ; node_ptr->ipmitool_thread_ctrl.done = true ;
} }
#endif #endif
/* Audits for this controller host only */ /* Audits for this controller host only */

View File

@ -79,6 +79,7 @@ void * mtcThread_ipmitool ( void * arg )
switch ( info_ptr->command ) switch ( info_ptr->command )
{ {
/* control commands */
case IPMITOOL_THREAD_CMD__POWER_RESET: case IPMITOOL_THREAD_CMD__POWER_RESET:
{ {
command = IPMITOOL_POWER_RESET_CMD ; command = IPMITOOL_POWER_RESET_CMD ;
@ -103,6 +104,14 @@ void * mtcThread_ipmitool ( void * arg )
response = IPMITOOL_POWER_CYCLE_RESP ; response = IPMITOOL_POWER_CYCLE_RESP ;
break ; break ;
} }
case IPMITOOL_THREAD_CMD__BOOTDEV_PXE:
{
command = IPMITOOL_BOOTDEV_PXE_CMD ;
response = IPMITOOL_BOOTDEV_PXE_RESP ;
break ;
}
/* Status commands */
case IPMITOOL_THREAD_CMD__POWER_STATUS: case IPMITOOL_THREAD_CMD__POWER_STATUS:
{ {
command = IPMITOOL_POWER_STATUS_CMD ; command = IPMITOOL_POWER_STATUS_CMD ;
@ -118,6 +127,7 @@ void * mtcThread_ipmitool ( void * arg )
command = IPMITOOL_MC_INFO_CMD ; command = IPMITOOL_MC_INFO_CMD ;
break ; break ;
} }
default: default:
{ {
rc = info_ptr->status = FAIL_BAD_CASE ; rc = info_ptr->status = FAIL_BAD_CASE ;
@ -222,9 +232,11 @@ void * mtcThread_ipmitool ( void * arg )
else if ((( command == IPMITOOL_POWER_RESET_CMD ) || else if ((( command == IPMITOOL_POWER_RESET_CMD ) ||
( command == IPMITOOL_POWER_OFF_CMD ) || ( command == IPMITOOL_POWER_OFF_CMD ) ||
( command == IPMITOOL_POWER_ON_CMD ) || ( command == IPMITOOL_POWER_ON_CMD ) ||
( command == IPMITOOL_POWER_CYCLE_CMD )) && ( command == IPMITOOL_POWER_CYCLE_CMD ) ||
( command == IPMITOOL_BOOTDEV_PXE_CMD)) &&
( daemon_is_file_present ( MTC_CMD_FIT__POWER_CMD ))) ( daemon_is_file_present ( MTC_CMD_FIT__POWER_CMD )))
{ {
slog("%s FIT Bypass power or bootdev command", info_ptr->hostname.c_str());
bypass_ipmitool_request = true ; bypass_ipmitool_request = true ;
rc = PASS ; rc = PASS ;
} }