Merge "Add network boot support to mtce reinstall handling"

This commit is contained in:
Zuul 2019-05-28 23:16:39 +00:00 committed by Gerrit Code Review
commit dd9982a902
11 changed files with 472 additions and 122 deletions

View File

@ -43,6 +43,9 @@
#define IPMITOOL_RESTART_CAUSE_CMD ((const char *)("chassis restart_cause"))
#define IPMITOOL_BOOTDEV_PXE_CMD ((const char *)("chassis bootdev pxe"))
#define IPMITOOL_BOOTDEV_PXE_RESP ((const char *)("Set Boot Device to pxe"))
#define IPMITOOL_MC_INFO_CMD ((const char *)("mc info"))
#define IPMITOOL_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result"))
@ -65,6 +68,7 @@ typedef enum
IPMITOOL_THREAD_CMD__MC_INFO,
IPMITOOL_THREAD_CMD__POWER_STATUS,
IPMITOOL_THREAD_CMD__RESTART_CAUSE,
IPMITOOL_THREAD_CMD__BOOTDEV_PXE,
IPMITOOL_THREAD_CMD__READ_SENSORS,

View File

@ -325,7 +325,7 @@ static std::string sensorStages_str [MTC_SENSOR__STAGES +1] ;
static std::string powerStages_str [MTC_POWER__STAGES +1] ;
static std::string powercycleStages_str [MTC_POWERCYCLE__STAGES +1] ;
static std::string resetStages_str [MTC_RESET__STAGES +1] ;
static std::string reinstallStages_str [MTC_RESET__STAGES +1] ;
static std::string reinstallStages_str [MTC_REINSTALL__STAGES +1] ;
static std::string oosTestStages_str [MTC_OOS_TEST__STAGES +1] ;
static std::string insvTestStages_str [MTC_INSV_TEST__STAGES +1] ;
static std::string configStages_str [MTC_CONFIG__STAGES +1] ;
@ -451,7 +451,17 @@ void mtc_stages_init ( void )
resetStages_str [MTC_RESET__STAGES ] = "Reset-Unknown";
reinstallStages_str [MTC_REINSTALL__START ] = "Reinstall-Start";
reinstallStages_str [MTC_REINSTALL__RESP_WAIT ] = "Reinstall-Response-Wait";
reinstallStages_str [MTC_REINSTALL__START_WAIT ] = "Reinstall-Start-Wait";
reinstallStages_str [MTC_REINSTALL__RESTART ] = "Reinstall-ReStart";
reinstallStages_str [MTC_REINSTALL__RESTART_WAIT ] = "Reinstall-ReStart-Wait";
reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn";
reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait";
reinstallStages_str [MTC_REINSTALL__NETBOOT ] = "Reinstall-Netboot";
reinstallStages_str [MTC_REINSTALL__NETBOOT_WAIT ] = "Reinstall-Netboot-Wait";
reinstallStages_str [MTC_REINSTALL__RESET ] = "Reinstall-Reset";
reinstallStages_str [MTC_REINSTALL__RESET_WAIT ] = "Reinstall-Reset-Wait";
reinstallStages_str [MTC_REINSTALL__WIPEDISK ] = "Reinstall-Wipedisk";
reinstallStages_str [MTC_REINSTALL__WIPEDISK_WAIT ] = "Reinstall-Wipedisk-Wait";
reinstallStages_str [MTC_REINSTALL__OFFLINE_WAIT ] = "Reinstall-Offline-Wait";
reinstallStages_str [MTC_REINSTALL__ONLINE_WAIT ] = "Reinstall-Online-Wait";
reinstallStages_str [MTC_REINSTALL__FAIL ] = "Reinstall-Failure";

View File

@ -220,7 +220,17 @@ void daemon_exit ( void );
#define MTC_TASK_REBOOT_FAIL_RETRY "Reboot Failed, retrying (%d of %d)"
#define MTC_TASK_REBOOT_ABORT "Reboot Failed, try again when host is 'online'"
#define MTC_TASK_RESET_PROG "Rebooting/Resetting Host"
#define MTC_TASK_REINSTALL "Reinstalling Host"
#define MTC_TASK_REINSTALL "Reinstalling"
#define MTC_TASK_REINSTALL_WAIT_NA "Reinstall Wait ; BMC not accessible"
#define MTC_TASK_REINSTALL_RTRY_PC "Reinstall Retry ; BMC provisioned change during install"
#define MTC_TASK_REINSTALL_FAIL_CL "Reinstall Failed ; BMC connectivity lost"
#define MTC_TASK_REINSTALL_FAIL_OL "Reinstall Failed ; timeout waiting for offline"
#define MTC_TASK_REINSTALL_FAIL_TO "Reinstall Failed ; timeout waiting for online"
#define MTC_TASK_REINSTALL_FAIL_BA "Reinstall Failed ; timeout waiting BMC access"
#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power on host"
#define MTC_TASK_REINSTALL_FAIL_NB "Reinstall Failed ; netboot request"
#define MTC_TASK_REINSTALL_FAIL_PR "Reinstall Failed ; power reset request"
#define MTC_TASK_REINSTALL_FAIL "Reinstall Failed"
#define MTC_TASK_REINSTALL_SUCCESS "Reinstall Succeeded"
#define MTC_TASK_BOOTING "Booting"
@ -1008,7 +1018,17 @@ string get_resetStages_str ( mtc_resetStages_enum stage );
typedef enum
{
MTC_REINSTALL__START = 0,
MTC_REINSTALL__RESP_WAIT,
MTC_REINSTALL__START_WAIT,
MTC_REINSTALL__RESTART,
MTC_REINSTALL__RESTART_WAIT,
MTC_REINSTALL__POWERON,
MTC_REINSTALL__POWERON_WAIT,
MTC_REINSTALL__NETBOOT,
MTC_REINSTALL__NETBOOT_WAIT,
MTC_REINSTALL__RESET,
MTC_REINSTALL__RESET_WAIT,
MTC_REINSTALL__WIPEDISK,
MTC_REINSTALL__WIPEDISK_WAIT,
MTC_REINSTALL__OFFLINE_WAIT,
MTC_REINSTALL__ONLINE_WAIT,
MTC_REINSTALL__FAIL,

View File

@ -234,7 +234,7 @@ int _timer_stop ( struct mtc_timer * mtcTimer_ptr , bool int_safe)
}
else if ( int_safe == false )
{
elog ("%s (%s) called with null TID (count:%d)\n",
wlog ("%s (%s) called with null TID (count:%d)\n",
mtcTimer_ptr->hostname.c_str(),
mtcTimer_ptr->service.c_str(),
timer_count);

View File

@ -64,7 +64,7 @@
#define MTC_ALIVE_TIMER (5)
#define MTC_POWEROFF_DELAY (5)
#define MTC_SWACT_POLL_TIMER (10)
#define MTC_TASK_UPDATE_DELAY (10)
#define MTC_TASK_UPDATE_DELAY (30)
#define MTC_BM_PING_TIMEOUT (30)
#define MTC_BM_POWEROFF_TIMEOUT (30)
#define MTC_BM_POWERON_TIMEOUT (30)
@ -80,6 +80,7 @@
#define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5)
#define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11)
#define MTC_REINSTALL_TIMEOUT_DEFAULT (MTC_MINS_40)
#define MTC_REINSTALL_TIMEOUT_BMC_ACC (MTC_MINS_10)
#define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1)
#define MTC_REINSTALL_TIMEOUT_MAX (MTC_HRS_4)
#define MTC_REINSTALL_WAIT_TIMER (10)

View File

@ -2181,6 +2181,25 @@ int nodeLinkClass::mod_host ( node_inv_type & inv )
else if ( !inv.action.compare ( "reinstall" ) )
{
plog ("%s Reinstall Action\n", node_ptr->hostname.c_str());
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__REINSTALL )
{
/* Allow the user to restart the re-install if
* - it is already in progress,
* - there is a BMC provisioned and
* - we are waiting while the actual install is in progress */
if (( node_ptr->bm_provisioned == true ) &&
( node_ptr->reinstallStage == MTC_REINSTALL__ONLINE_WAIT))
{
reinstallStageChange ( node_ptr , MTC_REINSTALL__START );
}
else
{
/* Otherwise allow the current install to continue
* and remind the user that there is a reinstall
* in progress */
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL);
}
}
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__REINSTALL );
/* generate command=reinstall log */

View File

@ -130,6 +130,11 @@ int nodeLinkClass::ipmi_command_send ( struct nodeLinkClass::node * node_ptr, in
{
want_fit = true ;
}
else if (( command == IPMITOOL_THREAD_CMD__BOOTDEV_PXE ) &&
( daemon_want_fit ( fit, node_ptr->hostname, "netboot_pxe" ) == true ))
{
want_fit = true ;
}
if ( want_fit == true )
{

View File

@ -1615,31 +1615,6 @@ extern int mtcJsonInv_testhead ( void );
int daemon_run_testhead ( void )
{
int rc = PASS;
mtc_config.testmode = true ;
nodeLinkClass * mtcInv_testhead_ptr = new nodeLinkClass ;
printf ("\n\n");
printf (TESTHEAD_BAR);
printf ("| Node Class Test Head - Private and Public Member Functions\n");
printf (TESTHEAD_BAR);
for ( int i = 0 ; i < 11 ; i++ )
{
if ( mtcInv_testhead_ptr->testhead ( i+1 ) )
{
FAILED_STR ;
rc = FAIL ;
}
else
PASSED ;
}
free(mtcInv_testhead_ptr);
printf (TESTHEAD_BAR);
printf ("| Maintenance Timer Test Head\n");
printf (TESTHEAD_BAR);
return (rc);
}

View File

@ -294,37 +294,16 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
{
flog ("%s -> OOS Action Check\n", node_ptr->hostname.c_str());
/* TEMPORARY: To allow reset of unlocked host for fault insertion. */
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RESET )
{
wlog ("%s Allowing Reset of unlocked host for FIT\n", node_ptr->hostname.c_str());
elog ("%s Administrative '%s' Operation Rejected\n",
node_ptr->hostname.c_str(),
get_adminAction_str (node_ptr->adminAction) );
if ( node_ptr->hostname.compare(nodeLinkClass::my_hostname))
{
nodeLinkClass::reset_handler ( node_ptr );
}
else
{
wlog ("%s Cowardly avoiding reset of self\n", node_ptr->hostname.c_str());
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
elog ("%s Cannot perform out-of-service action against in-service host\n",
node_ptr->hostname.c_str());
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
/* Clear the UI task since we are not really resetting */
mtcInvApi_update_task ( node_ptr, "" );
}
}
else
{
elog ("%s Administrative '%s' Operation Rejected\n",
node_ptr->hostname.c_str(),
get_adminAction_str (node_ptr->adminAction) );
elog ("%s Cannot perform out-of-service action against in-service host\n",
node_ptr->hostname.c_str());
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
/* Clear the UI task since we are not really resetting */
mtcInvApi_update_task ( node_ptr, "" );
}
/* Clear the UI task since we are not really taking this action */
mtcInvApi_update_task ( node_ptr, "" );
}
/****************************************************************************

View File

@ -1971,12 +1971,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
{
rc = ipmi_command_recv ( node_ptr );
if ( rc == RETRY )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
rc = ipmi_command_recv ( node_ptr );
if ( rc == RETRY )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
if ( rc )
{
@ -4011,63 +4011,364 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
return (PASS);
}
/* Reinstall handler
* --------------
/****************************************************************************
*
* Name : reinstall_handler
*
* Purpose : Perform actions that result in a network boot so that a new
* image is installed on the specified node's boot partition.
*
* Description: This FSM handles node (re)install with and without
* a provisioned Board Management Controller (BMC).
*
* BMC provisioned case: using IPMI commands to BMC ...
*
* - ensure host power is on
* - force network boot on next reset
* - issue node reset
*
* BMC not provisioned case: using mtce messaging to node ...
*
* - send mtcClient wipedisk command
* fail reinstall if no ACK
* - send mtcClient reboot command
*
* Both cases:
*
* - wait for offline
* - wait for online
* - install complete
*
* Failure Handling:
*
* BMC provisioned cases:
*
* BMC won't power on
* BMC ipmi command failure
* BMC connectivity lost mid-FSM.
* BMC access timeout
*
* BMC not provisioned cases:
*
* no wipedisk ACK
*
* failure to go offline after reset/reboot
* timeout waiting for online after reset/reboot
*
* Manage reinstall operations for a locked-disabled host */
int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
{
/* Handle 'lost BMC connectivity during the install' case */
if (( node_ptr->bm_provisioned == true ) &&
( node_ptr->bm_accessible == false ))
{
if (( node_ptr->reinstallStage != MTC_REINSTALL__START ) &&
( node_ptr->reinstallStage != MTC_REINSTALL__START_WAIT ) &&
( node_ptr->reinstallStage != MTC_REINSTALL__FAIL ) &&
( node_ptr->reinstallStage != MTC_REINSTALL__MSG_DISPLAY ) &&
( node_ptr->reinstallStage != MTC_REINSTALL__DONE ))
{
mtcTimer_reset ( node_ptr->mtcTimer );
elog ("%s Reinstall lost bmc connection",
node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_CL );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
/* fall into switch to ...
* - handle failure
* - finish the FSM
*/
}
switch ( node_ptr->reinstallStage )
{
case MTC_REINSTALL__START:
{
int host_reinstall_wait_timer = node_ptr->mtcalive_timeout + node_reinstall_timeout ;
node_ptr->retries = host_reinstall_wait_timer / MTC_REINSTALL_WAIT_TIMER ;
LOAD_NODETYPE_TIMERS ;
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
node_ptr->retries = ( node_ptr->mtcalive_timeout +
this->node_reinstall_timeout) /
MTC_REINSTALL_WAIT_TIMER ;
mtcTimer_reset ( node_ptr->mtcTimer );
if ( node_ptr->bm_provisioned == true )
{
if ( node_ptr->bm_accessible == false )
{
/* BMC is provisioned but not yet accessible ; wait for access */
wlog ("%s Reinstall wait for BMC access ; %d second timeout",
node_ptr->hostname.c_str(),
MTC_REINSTALL_TIMEOUT_BMC_ACC);
start_offline_handler ( node_ptr );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_WAIT_NA );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_TIMEOUT_BMC_ACC );
reinstallStageChange ( node_ptr, MTC_REINSTALL__START_WAIT );
}
else if ( node_ptr->power_on == false )
{
/* need to power on node */
wlog ("%s Reinstall power-on required", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON );
}
else
{
/* power is on so issue net boot command */
ilog ("%s Reinstall power is on", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
}
}
else
{
/* If the BMC is not provisioned coming into this handler
* then service the install by mtce commands by starting
* the install by wipedisk. */
reinstallStageChange ( node_ptr, MTC_REINSTALL__WIPEDISK );
}
break ;
}
/* BMC provisioned but bm_handler has not reported accessibility yet.
* Need to wait ... */
case MTC_REINSTALL__START_WAIT:
{
if ( node_ptr->bm_provisioned == true )
{
if ( node_ptr->bm_accessible == false )
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
/* wait period has timed out ; fail the install */
elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_BA);
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_BA );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
else
{
; /* ... wait longer */
}
}
else
{
/* the BMC is now accessible ; start the install over */
plog ("%s BMC access established ; starting install",
node_ptr->hostname.c_str());
mtcTimer_reset ( node_ptr->mtcTimer );
reinstallStageChange ( node_ptr , MTC_REINSTALL__START );
}
}
else
{
/*
* Handle case where BMC gets deprovisioned
* while waiting for accessibility.
*
* Restart the install in that case after a monitored
* wait period for reprovision.
*
* Has the side effect of allowing the admin to
* reprovision the BMC during a re-install.
*/
mtcTimer_reset ( node_ptr->mtcTimer );
wlog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_RTRY_PC );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_RTRY_PC );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_5 );
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESTART_WAIT );
}
break ;
}
case MTC_REINSTALL__RESTART_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
reinstallStageChange ( node_ptr , MTC_REINSTALL__START );
}
else if ( node_ptr->bm_provisioned == true )
{
mtcTimer_reset ( node_ptr->mtcTimer );
wlog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_RTRY_PC );
reinstallStageChange ( node_ptr , MTC_REINSTALL__START );
}
else
{
; /* ... wait longer */
}
break ;
}
case MTC_REINSTALL__POWERON:
{
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT );
break ;
}
case MTC_REINSTALL__POWERON_WAIT:
{
/* The power handler manages timeout */
if ( node_ptr->powerStage == MTC_POWER__DONE )
{
if ( node_ptr->power_on == true )
{
if ( node_ptr->task != MTC_TASK_REINSTALL )
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
}
else
{
elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_PO);
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PO );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
}
else
{
/* run the power handler till the host's power is on or
* the power-on handler times out */
power_handler ( node_ptr );
}
break ;
}
case MTC_REINSTALL__NETBOOT:
{
/* Issue netboot command after timed delay */
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__BOOTDEV_PXE );
if ( rc )
{
elog ("%s Reinstall netboot request failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
ilog ("%s Reinstall netboot request sent", node_ptr->hostname.c_str() );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
reinstallStageChange ( node_ptr, MTC_REINSTALL__NETBOOT_WAIT );
}
}
break ;
}
case MTC_REINSTALL__NETBOOT_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = ipmi_command_recv ( node_ptr );
if ( rc == PASS )
{
ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET);
}
else if ( rc == RETRY )
{
wlog ("%s Reinstall netboot receive retry", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
}
else
{
elog ("%s Reinstall netboot receive failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
}
break ;
}
case MTC_REINSTALL__RESET:
{
int rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_RESET );
if ( rc )
{
elog ("%s Reinstall reset request failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
ilog ("%s Reinstall reset request sent", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET_WAIT );
}
break ;
}
case MTC_REINSTALL__RESET_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = ipmi_command_recv ( node_ptr );
if ( rc == PASS )
{
ilog ("%s Reinstall reset request completed", node_ptr->hostname.c_str());
start_offline_handler ( node_ptr );
/* Wait for the host to go offline */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT );
reinstallStageChange ( node_ptr, MTC_REINSTALL__OFFLINE_WAIT);
}
else if ( rc == RETRY )
{
wlog ("%s Reinstall reset receive retry", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
}
else
{
elog ("%s Reinstall reset receive failed ; rc:%d",
node_ptr->hostname.c_str(), rc );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
}
break ;
}
/* BMC not provisioned case */
case MTC_REINSTALL__WIPEDISK:
{
node_ptr->cmdReq = MTC_CMD_WIPEDISK ;
plog ("%s Administrative Reinstall Requested\n", node_ptr->hostname.c_str());
plog ("%s Reinstall wipedisk requested", node_ptr->hostname.c_str());
if ( send_mtc_cmd ( node_ptr->hostname, MTC_CMD_WIPEDISK, MGMNT_INTERFACE ) != PASS )
{
elog ("Failed to send 'reinstall' request to %s\n", node_ptr->hostname.c_str());
elog ("%s Reinstall request send failed", node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
else
{
node_ptr->cmdRsp = MTC_CMD_NONE ;
if ( node_ptr->mtcTimer.tid )
{
mtcTimer_stop ( node_ptr->mtcTimer );
}
mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT );
ilog ("%s waiting for REINSTALL ACK \n", node_ptr->hostname.c_str() );
reinstallStageChange ( node_ptr , MTC_REINSTALL__RESP_WAIT );
reinstallStageChange ( node_ptr , MTC_REINSTALL__WIPEDISK_WAIT );
}
break ;
}
case MTC_REINSTALL__RESP_WAIT:
case MTC_REINSTALL__WIPEDISK_WAIT:
{
if ( node_ptr->cmdRsp != MTC_CMD_WIPEDISK )
{
if ( node_ptr->mtcTimer.ring == true )
{
elog ("%s REINSTALL ACK Timeout\n",
elog ("%s Reinstall wipedisk ACK timeout",
node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
}
else
{
/* declare successful reinstall request */
plog ("%s REINSTALL Request Succeeded\n", node_ptr->hostname.c_str());
plog ("%s Reinstall request succeeded", node_ptr->hostname.c_str());
mtcTimer_stop ( node_ptr->mtcTimer );
mtcTimer_reset ( node_ptr->mtcTimer );
start_offline_handler ( node_ptr );
/* We need to wait for the host to go offline */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT );
@ -4085,49 +4386,57 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
clear_service_readies ( node_ptr );
ilog ("%s Reinstall Progress: host is offline ; waiting for host to come back\n", node_ptr->hostname.c_str());
ilog ("%s Reinstall in-progress ; waiting for 'online' state",
node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
reinstallStageChange ( node_ptr , MTC_REINSTALL__ONLINE_WAIT );
}
else if ( node_ptr->mtcTimer.ring == true )
else if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
elog ("%s offline timeout - reinstall failed\n", node_ptr->hostname.c_str());
elog ("%s failed to go offline ; timeout", node_ptr->hostname.c_str());
stop_offline_handler ( node_ptr );
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_OL );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
else
{
; // wait longer ...
}
break ;
}
case MTC_REINSTALL__ONLINE_WAIT:
{
if ( node_ptr->mtcTimer.ring == true )
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )
if ( --node_ptr->retries < 0 )
{
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS);
mtcTimer_stop ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE );
elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_TO);
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_TO );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
else
{
if ( --node_ptr->retries < 0 )
{
elog ("%s online timeout - reinstall failed\n", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
else
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER );
}
}
else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE )
{
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS);
mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE );
}
else
{
; // wait longer ...
}
break;
}
case MTC_REINSTALL__FAIL:
{
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL);
mtcTimer_stop ( node_ptr->mtcTimer );
mtcTimer_reset ( node_ptr->mtcTimer );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY );
mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED );
@ -4135,23 +4444,33 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_REINSTALL__MSG_DISPLAY:
{
if ( node_ptr->mtcTimer.ring == true )
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
node_ptr->mtcTimer.ring = false ;
reinstallStageChange ( node_ptr , MTC_REINSTALL__DONE );
}
else
{
; // wait longer ...
}
break ;
}
case MTC_REINSTALL__DONE:
default:
{
plog ("%s Reinstall Completed\n", node_ptr->hostname.c_str());
if ( node_ptr->task == MTC_TASK_REINSTALL_SUCCESS )
{
plog ("%s Reinstall completed successfully",
node_ptr->hostname.c_str());
}
else
{
plog ("%s Reinstall complete ; operation failure",
node_ptr->hostname.c_str());
}
/* Default timeout values */
LOAD_NODETYPE_TIMERS ;
mtcTimer_stop ( node_ptr->mtcTimer );
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE );
recovery_ctrl_init ( node_ptr->hwmon_reset );
@ -4583,6 +4902,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
node_ptr->power_on = false ;
ilog ("%s power is off ; powering on ...\n", node_ptr->hostname.c_str() );
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
}
@ -4623,7 +4943,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON );
if ( rc )
{
wlog ("%s Power-On request failed (%d)\n",
@ -4918,14 +5238,14 @@ int nodeLinkClass::powercycle_handler ( struct nodeLinkClass::node * node_ptr )
{
bool on = false ;
ilog ("%s Power Status: %s\n",
node_ptr->hostname.c_str(),
node_ptr->ipmitool_thread_info.data.c_str());
ilog ("%s Power Status: %s\n",
node_ptr->hostname.c_str(),
node_ptr->ipmitool_thread_info.data.c_str());
if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos )
{
on = true ;
}
if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos )
{
on = true ;
}
if ( rc == PASS )
{
/* maintain current power state */
@ -6033,6 +6353,12 @@ int nodeLinkClass::bm_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->ipmitool_thread_info.data.c_str());
plog ("%s bmc is accessible\n", node_ptr->hostname.c_str());
/* set host power state ; on or off */
if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_ON_STATUS) != std::string::npos )
node_ptr->power_on = true ;
else
node_ptr->power_on = false ;
if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_OFF_STATUS) != std::string::npos )
{
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
@ -6333,7 +6659,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
node_ptr->ipmitool_thread_ctrl.done = true ;
}
#endif
/* Audits for this controller host only */

View File

@ -79,6 +79,7 @@ void * mtcThread_ipmitool ( void * arg )
switch ( info_ptr->command )
{
/* control commands */
case IPMITOOL_THREAD_CMD__POWER_RESET:
{
command = IPMITOOL_POWER_RESET_CMD ;
@ -103,6 +104,14 @@ void * mtcThread_ipmitool ( void * arg )
response = IPMITOOL_POWER_CYCLE_RESP ;
break ;
}
case IPMITOOL_THREAD_CMD__BOOTDEV_PXE:
{
command = IPMITOOL_BOOTDEV_PXE_CMD ;
response = IPMITOOL_BOOTDEV_PXE_RESP ;
break ;
}
/* Status commands */
case IPMITOOL_THREAD_CMD__POWER_STATUS:
{
command = IPMITOOL_POWER_STATUS_CMD ;
@ -118,6 +127,7 @@ void * mtcThread_ipmitool ( void * arg )
command = IPMITOOL_MC_INFO_CMD ;
break ;
}
default:
{
rc = info_ptr->status = FAIL_BAD_CASE ;
@ -222,9 +232,11 @@ void * mtcThread_ipmitool ( void * arg )
else if ((( command == IPMITOOL_POWER_RESET_CMD ) ||
( command == IPMITOOL_POWER_OFF_CMD ) ||
( command == IPMITOOL_POWER_ON_CMD ) ||
( command == IPMITOOL_POWER_CYCLE_CMD )) &&
( command == IPMITOOL_POWER_CYCLE_CMD ) ||
( command == IPMITOOL_BOOTDEV_PXE_CMD)) &&
( daemon_is_file_present ( MTC_CMD_FIT__POWER_CMD )))
{
slog("%s FIT Bypass power or bootdev command", info_ptr->hostname.c_str());
bypass_ipmitool_request = true ;
rc = PASS ;
}