diff --git a/mtce-common/src/common/ipmiUtil.h b/mtce-common/src/common/ipmiUtil.h index 3a4c650f..6567b303 100644 --- a/mtce-common/src/common/ipmiUtil.h +++ b/mtce-common/src/common/ipmiUtil.h @@ -43,6 +43,9 @@ #define IPMITOOL_RESTART_CAUSE_CMD ((const char *)("chassis restart_cause")) +#define IPMITOOL_BOOTDEV_PXE_CMD ((const char *)("chassis bootdev pxe")) +#define IPMITOOL_BOOTDEV_PXE_RESP ((const char *)("Set Boot Device to pxe")) + #define IPMITOOL_MC_INFO_CMD ((const char *)("mc info")) #define IPMITOOL_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result")) @@ -65,6 +68,7 @@ typedef enum IPMITOOL_THREAD_CMD__MC_INFO, IPMITOOL_THREAD_CMD__POWER_STATUS, IPMITOOL_THREAD_CMD__RESTART_CAUSE, + IPMITOOL_THREAD_CMD__BOOTDEV_PXE, IPMITOOL_THREAD_CMD__READ_SENSORS, diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index bb15979d..e799815f 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -325,7 +325,7 @@ static std::string sensorStages_str [MTC_SENSOR__STAGES +1] ; static std::string powerStages_str [MTC_POWER__STAGES +1] ; static std::string powercycleStages_str [MTC_POWERCYCLE__STAGES +1] ; static std::string resetStages_str [MTC_RESET__STAGES +1] ; -static std::string reinstallStages_str [MTC_RESET__STAGES +1] ; +static std::string reinstallStages_str [MTC_REINSTALL__STAGES +1] ; static std::string oosTestStages_str [MTC_OOS_TEST__STAGES +1] ; static std::string insvTestStages_str [MTC_INSV_TEST__STAGES +1] ; static std::string configStages_str [MTC_CONFIG__STAGES +1] ; @@ -451,7 +451,17 @@ void mtc_stages_init ( void ) resetStages_str [MTC_RESET__STAGES ] = "Reset-Unknown"; reinstallStages_str [MTC_REINSTALL__START ] = "Reinstall-Start"; - reinstallStages_str [MTC_REINSTALL__RESP_WAIT ] = "Reinstall-Response-Wait"; + reinstallStages_str [MTC_REINSTALL__START_WAIT ] = "Reinstall-Start-Wait"; + reinstallStages_str [MTC_REINSTALL__RESTART ] = "Reinstall-ReStart"; + reinstallStages_str 
[MTC_REINSTALL__RESTART_WAIT ] = "Reinstall-ReStart-Wait"; + reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn"; + reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait"; + reinstallStages_str [MTC_REINSTALL__NETBOOT ] = "Reinstall-Netboot"; + reinstallStages_str [MTC_REINSTALL__NETBOOT_WAIT ] = "Reinstall-Netboot-Wait"; + reinstallStages_str [MTC_REINSTALL__RESET ] = "Reinstall-Reset"; + reinstallStages_str [MTC_REINSTALL__RESET_WAIT ] = "Reinstall-Reset-Wait"; + reinstallStages_str [MTC_REINSTALL__WIPEDISK ] = "Reinstall-Wipedisk"; + reinstallStages_str [MTC_REINSTALL__WIPEDISK_WAIT ] = "Reinstall-Wipedisk-Wait"; reinstallStages_str [MTC_REINSTALL__OFFLINE_WAIT ] = "Reinstall-Offline-Wait"; reinstallStages_str [MTC_REINSTALL__ONLINE_WAIT ] = "Reinstall-Online-Wait"; reinstallStages_str [MTC_REINSTALL__FAIL ] = "Reinstall-Failure"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 76bce0dd..eb23ffbd 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -220,7 +220,17 @@ void daemon_exit ( void ); #define MTC_TASK_REBOOT_FAIL_RETRY "Reboot Failed, retrying (%d of %d)" #define MTC_TASK_REBOOT_ABORT "Reboot Failed, try again when host is 'online'" #define MTC_TASK_RESET_PROG "Rebooting/Resetting Host" -#define MTC_TASK_REINSTALL "Reinstalling Host" +#define MTC_TASK_REINSTALL "Reinstalling" +#define MTC_TASK_REINSTALL_WAIT_NA "Reinstall Wait ; BMC not accessible" +#define MTC_TASK_REINSTALL_RTRY_PC "Reinstall Retry ; BMC provisioned change during install" +#define MTC_TASK_REINSTALL_FAIL_CL "Reinstall Failed ; BMC connectivity lost" +#define MTC_TASK_REINSTALL_FAIL_OL "Reinstall Failed ; timeout waiting for offline" +#define MTC_TASK_REINSTALL_FAIL_TO "Reinstall Failed ; timeout waiting for online" +#define MTC_TASK_REINSTALL_FAIL_BA "Reinstall Failed ; timeout waiting BMC access" +#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power on 
host" +#define MTC_TASK_REINSTALL_FAIL_NB "Reinstall Failed ; netboot request" +#define MTC_TASK_REINSTALL_FAIL_PR "Reinstall Failed ; power reset request" + #define MTC_TASK_REINSTALL_FAIL "Reinstall Failed" #define MTC_TASK_REINSTALL_SUCCESS "Reinstall Succeeded" #define MTC_TASK_BOOTING "Booting" @@ -1008,7 +1018,17 @@ string get_resetStages_str ( mtc_resetStages_enum stage ); typedef enum { MTC_REINSTALL__START = 0, - MTC_REINSTALL__RESP_WAIT, + MTC_REINSTALL__START_WAIT, + MTC_REINSTALL__RESTART, + MTC_REINSTALL__RESTART_WAIT, + MTC_REINSTALL__POWERON, + MTC_REINSTALL__POWERON_WAIT, + MTC_REINSTALL__NETBOOT, + MTC_REINSTALL__NETBOOT_WAIT, + MTC_REINSTALL__RESET, + MTC_REINSTALL__RESET_WAIT, + MTC_REINSTALL__WIPEDISK, + MTC_REINSTALL__WIPEDISK_WAIT, MTC_REINSTALL__OFFLINE_WAIT, MTC_REINSTALL__ONLINE_WAIT, MTC_REINSTALL__FAIL, diff --git a/mtce-common/src/common/nodeTimers.cpp b/mtce-common/src/common/nodeTimers.cpp index 390e9183..d3590378 100755 --- a/mtce-common/src/common/nodeTimers.cpp +++ b/mtce-common/src/common/nodeTimers.cpp @@ -234,7 +234,7 @@ int _timer_stop ( struct mtc_timer * mtcTimer_ptr , bool int_safe) } else if ( int_safe == false ) { - elog ("%s (%s) called with null TID (count:%d)\n", + wlog ("%s (%s) called with null TID (count:%d)\n", mtcTimer_ptr->hostname.c_str(), mtcTimer_ptr->service.c_str(), timer_count); diff --git a/mtce-common/src/common/nodeTimers.h b/mtce-common/src/common/nodeTimers.h index 823a7167..0d6d25ae 100755 --- a/mtce-common/src/common/nodeTimers.h +++ b/mtce-common/src/common/nodeTimers.h @@ -64,7 +64,7 @@ #define MTC_ALIVE_TIMER (5) #define MTC_POWEROFF_DELAY (5) #define MTC_SWACT_POLL_TIMER (10) -#define MTC_TASK_UPDATE_DELAY (10) +#define MTC_TASK_UPDATE_DELAY (30) #define MTC_BM_PING_TIMEOUT (30) #define MTC_BM_POWEROFF_TIMEOUT (30) #define MTC_BM_POWERON_TIMEOUT (30) @@ -80,6 +80,7 @@ #define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5) #define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11) #define 
MTC_REINSTALL_TIMEOUT_DEFAULT (MTC_MINS_40) +#define MTC_REINSTALL_TIMEOUT_BMC_ACC (MTC_MINS_10) #define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1) #define MTC_REINSTALL_TIMEOUT_MAX (MTC_HRS_4) #define MTC_REINSTALL_WAIT_TIMER (10) diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 4936cba8..8dbb755a 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -2181,6 +2181,25 @@ int nodeLinkClass::mod_host ( node_inv_type & inv ) else if ( !inv.action.compare ( "reinstall" ) ) { plog ("%s Reinstall Action\n", node_ptr->hostname.c_str()); + if ( node_ptr->adminAction == MTC_ADMIN_ACTION__REINSTALL ) + { + /* Allow user to restart the re-install if + * - its in progress, + * - there is a BMC provisioned and + * - are waiting while the actual install is in progress */ + if (( node_ptr->bm_provisioned == true ) && + ( node_ptr->reinstallStage == MTC_REINSTALL__ONLINE_WAIT)) + { + reinstallStageChange ( node_ptr , MTC_REINSTALL__START ); + } + else + { + /* Otherwise allow the current install to continue + * remind the user that there is a reinstall + * in progress */ + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL); + } + } adminActionChange ( node_ptr , MTC_ADMIN_ACTION__REINSTALL ); /* generate command=reinstall log */ diff --git a/mtce/src/maintenance/mtcIpmiUtil.cpp b/mtce/src/maintenance/mtcIpmiUtil.cpp index 349e2c58..1e865cde 100644 --- a/mtce/src/maintenance/mtcIpmiUtil.cpp +++ b/mtce/src/maintenance/mtcIpmiUtil.cpp @@ -130,6 +130,11 @@ int nodeLinkClass::ipmi_command_send ( struct nodeLinkClass::node * node_ptr, in { want_fit = true ; } + else if (( command == IPMITOOL_THREAD_CMD__BOOTDEV_PXE ) && + ( daemon_want_fit ( fit, node_ptr->hostname, "netboot_pxe" ) == true )) + { + want_fit = true ; + } if ( want_fit == true ) { diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index a0523149..1cd96655 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ 
b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -1615,31 +1615,6 @@ extern int mtcJsonInv_testhead ( void ); int daemon_run_testhead ( void ) { int rc = PASS; - - mtc_config.testmode = true ; - - nodeLinkClass * mtcInv_testhead_ptr = new nodeLinkClass ; - - printf ("\n\n"); - printf (TESTHEAD_BAR); - - printf ("| Node Class Test Head - Private and Public Member Functions\n"); - printf (TESTHEAD_BAR); - for ( int i = 0 ; i < 11 ; i++ ) - { - if ( mtcInv_testhead_ptr->testhead ( i+1 ) ) - { - FAILED_STR ; - rc = FAIL ; - } - else - PASSED ; - } - free(mtcInv_testhead_ptr); - - printf (TESTHEAD_BAR); - printf ("| Maintenance Timer Test Head\n"); - printf (TESTHEAD_BAR); return (rc); } diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index 19eba3ee..e9aca599 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -294,37 +294,16 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) { flog ("%s -> OOS Action Check\n", node_ptr->hostname.c_str()); - /* TEMPORARY: To allow reset of unlocked host for fault insertion. 
*/ - if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RESET ) - { - wlog ("%s Allowing Reset of unlocked host for FIT\n", node_ptr->hostname.c_str()); + elog ("%s Administrative '%s' Operation Rejected\n", + node_ptr->hostname.c_str(), + get_adminAction_str (node_ptr->adminAction) ); - if ( node_ptr->hostname.compare(nodeLinkClass::my_hostname)) - { - nodeLinkClass::reset_handler ( node_ptr ); - } - else - { - wlog ("%s Cowardly avoiding reset of self\n", node_ptr->hostname.c_str()); - adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE ); + elog ("%s Cannot perform out-of-service action against in-service host\n", + node_ptr->hostname.c_str()); + adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE ); - /* Clear the UI task since we are not really resetting */ - mtcInvApi_update_task ( node_ptr, "" ); - } - } - else - { - elog ("%s Administrative '%s' Operation Rejected\n", - node_ptr->hostname.c_str(), - get_adminAction_str (node_ptr->adminAction) ); - - elog ("%s Cannot perform out-of-service action against in-service host\n", - node_ptr->hostname.c_str()); - adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE ); - - /* Clear the UI task since we are not really resetting */ - mtcInvApi_update_task ( node_ptr, "" ); - } + /* Clear the UI task since we are not really taking this action */ + mtcInvApi_update_task ( node_ptr, "" ); } /**************************************************************************** diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index f7700411..6aa14f42 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1971,12 +1971,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { if ( mtcTimer_expired ( node_ptr->mtcTimer )) { - rc = ipmi_command_recv ( node_ptr ); - if ( rc == RETRY ) - { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); - break ; - } + rc = ipmi_command_recv ( node_ptr ); + if ( 
rc == RETRY ) + { + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + break ; + } if ( rc ) { @@ -4011,63 +4011,364 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) return (PASS); } -/* Reinstall handler - * -------------- +/**************************************************************************** + * + * Name : reinstall_handler + * + * Purpose : Perform actions that result in a network boot so that a new + * image is installed on the specified node's boot partition. + * + * Description: This FSM handles node (re)install with and without + * a provisioned Board Management Controller (BMC). + * + * BMC provisioned case: using IPMI commands to BMC ... + * + * - ensure host power is on + * - force network boot on next reset + * - issue node reset + * + * BMC not provisioned case: using mtce messaging to node ... + * + * - send mtcClient wipedisk command + * fail reinstall if no ACK + * - send mtcClient reboot command + * + * Both cases: + * + * - wait for offline + * - wait for online + * - install complete + * + * Failure Handling: + * + * BMC provisioned cases: + * + * BMC won't power on + * BMC ipmi command failure + * BMC connectivity lost mid-FSM. 
+ * BMC access timeout + * + * BMC not provisioned cases: + * + * no wipedisk ACK + * + * failure to go offline after reset/reboot + * timeout waiting for online after reset/reboot + * * Manage reinstall operations for a locked-disabled host */ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) { + /* Handle 'lost BMC connectivity during the install' case */ + if (( node_ptr->bm_provisioned == true ) && + ( node_ptr->bm_accessible == false )) + { + if (( node_ptr->reinstallStage != MTC_REINSTALL__START ) && + ( node_ptr->reinstallStage != MTC_REINSTALL__START_WAIT ) && + ( node_ptr->reinstallStage != MTC_REINSTALL__FAIL ) && + ( node_ptr->reinstallStage != MTC_REINSTALL__MSG_DISPLAY ) && + ( node_ptr->reinstallStage != MTC_REINSTALL__DONE )) + { + mtcTimer_reset ( node_ptr->mtcTimer ); + + elog ("%s Reinstall lost bmc connection", + node_ptr->hostname.c_str()); + + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_CL ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); + } + + /* fall into switch to ... 
+ * - handle failure + * - finish the FSM + */ + } switch ( node_ptr->reinstallStage ) { case MTC_REINSTALL__START: { - int host_reinstall_wait_timer = node_ptr->mtcalive_timeout + node_reinstall_timeout ; - node_ptr->retries = host_reinstall_wait_timer / MTC_REINSTALL_WAIT_TIMER ; + LOAD_NODETYPE_TIMERS ; + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL ); + node_ptr->retries = ( node_ptr->mtcalive_timeout + + this->node_reinstall_timeout) / + MTC_REINSTALL_WAIT_TIMER ; + mtcTimer_reset ( node_ptr->mtcTimer ); + if ( node_ptr->bm_provisioned == true ) + { + if ( node_ptr->bm_accessible == false ) + { + /* BMC is provisioned but not yet accessible ; start a timed wait for access */ + wlog ("%s Reinstall wait for BMC access ; %d second timeout", + node_ptr->hostname.c_str(), + MTC_REINSTALL_TIMEOUT_BMC_ACC); - start_offline_handler ( node_ptr ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_WAIT_NA ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_TIMEOUT_BMC_ACC ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__START_WAIT ); + } + else if ( node_ptr->power_on == false ) + { + /* need to power on node */ + wlog ("%s Reinstall power-on required", node_ptr->hostname.c_str()); + reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON ); + } + else + { + /* power is on so issue net boot command */ + ilog ("%s Reinstall power is on", node_ptr->hostname.c_str()); + reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT ); + } + } + else + { + /* If the BMC is not provisioned coming into this handler + * then service the install by mtce commands by starting + * the install by wipedisk. */ + reinstallStageChange ( node_ptr, MTC_REINSTALL__WIPEDISK ); + } + break ; + } + /* BMC provisioned but bm_handler has not reported accessibility yet. + * Need to wait ... 
*/ + case MTC_REINSTALL__START_WAIT: + { + if ( node_ptr->bm_provisioned == true ) + { + if ( node_ptr->bm_accessible == false ) + { + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + /* wait period has timed out ; fail the install */ + elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_BA); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_BA ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); + } + else + { + ; /* ... wait longer */ + } + } + else + { + /* the BMC is now accessible ; start the install over */ + plog ("%s BMC access established ; starting install", + node_ptr->hostname.c_str()); + mtcTimer_reset ( node_ptr->mtcTimer ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__START ); + } + } + else + { + /* + * Handle case where BMC gets deprovisioned + * while waiting for accessibility. + * + * Restart the install in that case after a monitored + * wait period for reprovision. + * + * Has the side effect of allowing the admin to + * reprovision the BMC during a re-install. + */ + mtcTimer_reset ( node_ptr->mtcTimer ); + wlog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_RTRY_PC ); + + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_RTRY_PC ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_5 ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__RESTART_WAIT ); + } + break ; + } + case MTC_REINSTALL__RESTART_WAIT: + { + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + reinstallStageChange ( node_ptr , MTC_REINSTALL__START ); + } + else if ( node_ptr->bm_provisioned == true ) + { + mtcTimer_reset ( node_ptr->mtcTimer ); + wlog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_RTRY_PC ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__START ); + } + else + { + ; /* ... 
wait longer */ + } + break ; + } + case MTC_REINSTALL__POWERON: + { + powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT ); + break ; + } + case MTC_REINSTALL__POWERON_WAIT: + { + /* The power handler manages timeout */ + if ( node_ptr->powerStage == MTC_POWER__DONE ) + { + if ( node_ptr->power_on == true ) + { + if ( node_ptr->task != MTC_TASK_REINSTALL ) + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL ); + + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT ); + } + else + { + elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_PO); + + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PO ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); + } + } + else + { + /* run the power handler till the host's power is on or + * the power-on handler times out */ + power_handler ( node_ptr ); + } + break ; + } + case MTC_REINSTALL__NETBOOT: + { + /* Issue netboot command after timed delay */ + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + int rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__BOOTDEV_PXE ); + if ( rc ) + { + elog ("%s Reinstall netboot request failed (rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + else + { + ilog ("%s Reinstall netboot request sent", node_ptr->hostname.c_str() ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__NETBOOT_WAIT ); + } + } + break ; + } + case MTC_REINSTALL__NETBOOT_WAIT: + { + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + int rc = ipmi_command_recv ( node_ptr ); + if ( rc == PASS ) + { + ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str()); + reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET); + } + else 
if ( rc == RETRY ) + { + wlog ("%s Reinstall netboot receive retry", node_ptr->hostname.c_str()); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + } + else + { + elog ("%s Reinstall netboot receive failed (rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + } + break ; + } + case MTC_REINSTALL__RESET: + { + int rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_RESET ); + if ( rc ) + { + elog ("%s Reinstall reset request failed (rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + else + { + ilog ("%s Reinstall reset request sent", node_ptr->hostname.c_str()); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET_WAIT ); + } + break ; + } + case MTC_REINSTALL__RESET_WAIT: + { + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + int rc = ipmi_command_recv ( node_ptr ); + if ( rc == PASS ) + { + ilog ("%s Reinstall reset request completed", node_ptr->hostname.c_str()); + + start_offline_handler ( node_ptr ); + + /* Wait for the host to go offline */ + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__OFFLINE_WAIT); + } + else if ( rc == RETRY ) + { + wlog ("%s Reinstall reset receive retry", node_ptr->hostname.c_str()); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + } + else + { + elog ("%s Reinstall reset receive failed ; rc:%d", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + } + break ; + } + /* BMC not provisioned case */ + case MTC_REINSTALL__WIPEDISK: + { 
node_ptr->cmdReq = MTC_CMD_WIPEDISK ; - plog ("%s Administrative Reinstall Requested\n", node_ptr->hostname.c_str()); + plog ("%s Reinstall wipedisk requested", node_ptr->hostname.c_str()); if ( send_mtc_cmd ( node_ptr->hostname, MTC_CMD_WIPEDISK, MGMNT_INTERFACE ) != PASS ) { - elog ("Failed to send 'reinstall' request to %s\n", node_ptr->hostname.c_str()); + elog ("%s Reinstall request send failed", node_ptr->hostname.c_str()); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); } else { node_ptr->cmdRsp = MTC_CMD_NONE ; - if ( node_ptr->mtcTimer.tid ) - { - mtcTimer_stop ( node_ptr->mtcTimer ); - } - + mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT ); - ilog ("%s waiting for REINSTALL ACK \n", node_ptr->hostname.c_str() ); - - reinstallStageChange ( node_ptr , MTC_REINSTALL__RESP_WAIT ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__WIPEDISK_WAIT ); } break ; } - case MTC_REINSTALL__RESP_WAIT: + case MTC_REINSTALL__WIPEDISK_WAIT: { if ( node_ptr->cmdRsp != MTC_CMD_WIPEDISK ) { if ( node_ptr->mtcTimer.ring == true ) { - elog ("%s REINSTALL ACK Timeout\n", + elog ("%s Reinstall wipedisk ACK timeout", node_ptr->hostname.c_str()); - + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); } } else { /* declare successful reinstall request */ - plog ("%s REINSTALL Request Succeeded\n", node_ptr->hostname.c_str()); + plog ("%s Reinstall request succeeded", node_ptr->hostname.c_str()); - mtcTimer_stop ( node_ptr->mtcTimer ); + mtcTimer_reset ( node_ptr->mtcTimer ); + + start_offline_handler ( node_ptr ); /* We need to wait for the host to go offline */ mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT ); @@ -4085,49 +4386,57 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) clear_service_readies ( 
node_ptr ); - ilog ("%s Reinstall Progress: host is offline ; waiting for host to come back\n", node_ptr->hostname.c_str()); + ilog ("%s Reinstall in-progress ; waiting for 'online' state", + node_ptr->hostname.c_str()); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER ); reinstallStageChange ( node_ptr , MTC_REINSTALL__ONLINE_WAIT ); } - else if ( node_ptr->mtcTimer.ring == true ) + else if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - elog ("%s offline timeout - reinstall failed\n", node_ptr->hostname.c_str()); + elog ("%s failed to go offline ; timeout", node_ptr->hostname.c_str()); + stop_offline_handler ( node_ptr ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_OL ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); } + else + { + ; // wait longer ... + } break ; } case MTC_REINSTALL__ONLINE_WAIT: { - if ( node_ptr->mtcTimer.ring == true ) + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) + if ( --node_ptr->retries < 0 ) { - mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS); - mtcTimer_stop ( node_ptr->mtcTimer ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); - reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY ); - mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE ); + elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_TO); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_TO ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); } else { - if ( --node_ptr->retries < 0 ) - { - elog ("%s online timeout - reinstall failed\n", node_ptr->hostname.c_str()); - reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); - } - else - { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER ); - } + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER ); } } + else if ( 
node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) + { + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS); + mtcTimer_reset ( node_ptr->mtcTimer ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY ); + mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE ); + } + else + { + ; // wait longer ... + } break; } case MTC_REINSTALL__FAIL: { - mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL); - mtcTimer_stop ( node_ptr->mtcTimer ); + mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY ); mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED ); @@ -4135,23 +4444,33 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_REINSTALL__MSG_DISPLAY: { - if ( node_ptr->mtcTimer.ring == true ) + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - node_ptr->mtcTimer.ring = false ; reinstallStageChange ( node_ptr , MTC_REINSTALL__DONE ); } + else + { + ; // wait longer ... 
+ } break ; } case MTC_REINSTALL__DONE: default: { - plog ("%s Reinstall Completed\n", node_ptr->hostname.c_str()); + if ( node_ptr->task == MTC_TASK_REINSTALL_SUCCESS ) + { + plog ("%s Reinstall completed successfully", + node_ptr->hostname.c_str()); + } + else + { + plog ("%s Reinstall complete ; operation failure", + node_ptr->hostname.c_str()); + } /* Default timeout values */ LOAD_NODETYPE_TIMERS ; - mtcTimer_stop ( node_ptr->mtcTimer ); - adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE ); recovery_ctrl_init ( node_ptr->hwmon_reset ); @@ -4583,6 +4902,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } else { + node_ptr->power_on = false ; ilog ("%s power is off ; powering on ...\n", node_ptr->hostname.c_str() ); powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); } @@ -4623,7 +4943,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } else { - rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON ); + rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON ); if ( rc ) { wlog ("%s Power-On request failed (%d)\n", @@ -4918,14 +5238,14 @@ int nodeLinkClass::powercycle_handler ( struct nodeLinkClass::node * node_ptr ) { bool on = false ; - ilog ("%s Power Status: %s\n", - node_ptr->hostname.c_str(), - node_ptr->ipmitool_thread_info.data.c_str()); + ilog ("%s Power Status: %s\n", + node_ptr->hostname.c_str(), + node_ptr->ipmitool_thread_info.data.c_str()); - if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos ) - { - on = true ; - } + if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos ) + { + on = true ; + } if ( rc == PASS ) { /* maintain current power state */ @@ -6033,6 +6353,12 @@ int nodeLinkClass::bm_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->ipmitool_thread_info.data.c_str()); plog ("%s bmc is accessible\n", node_ptr->hostname.c_str()); + /* set host power state ; on 
or off */ + if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_ON_STATUS) != std::string::npos ) + node_ptr->power_on = true ; + else + node_ptr->power_on = false ; + if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_OFF_STATUS) != std::string::npos ) { if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) @@ -6333,7 +6659,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } node_ptr->ipmitool_thread_ctrl.done = true ; } - #endif /* Audits for this controller host only */ diff --git a/mtce/src/maintenance/mtcThreads.cpp b/mtce/src/maintenance/mtcThreads.cpp index 0e7c4acd..5a970885 100644 --- a/mtce/src/maintenance/mtcThreads.cpp +++ b/mtce/src/maintenance/mtcThreads.cpp @@ -79,6 +79,7 @@ void * mtcThread_ipmitool ( void * arg ) switch ( info_ptr->command ) { + /* control commands */ case IPMITOOL_THREAD_CMD__POWER_RESET: { command = IPMITOOL_POWER_RESET_CMD ; @@ -103,6 +104,14 @@ void * mtcThread_ipmitool ( void * arg ) response = IPMITOOL_POWER_CYCLE_RESP ; break ; } + case IPMITOOL_THREAD_CMD__BOOTDEV_PXE: + { + command = IPMITOOL_BOOTDEV_PXE_CMD ; + response = IPMITOOL_BOOTDEV_PXE_RESP ; + break ; + } + + /* Status commands */ case IPMITOOL_THREAD_CMD__POWER_STATUS: { command = IPMITOOL_POWER_STATUS_CMD ; @@ -118,6 +127,7 @@ void * mtcThread_ipmitool ( void * arg ) command = IPMITOOL_MC_INFO_CMD ; break ; } + default: { rc = info_ptr->status = FAIL_BAD_CASE ; @@ -222,9 +232,11 @@ void * mtcThread_ipmitool ( void * arg ) else if ((( command == IPMITOOL_POWER_RESET_CMD ) || ( command == IPMITOOL_POWER_OFF_CMD ) || ( command == IPMITOOL_POWER_ON_CMD ) || - ( command == IPMITOOL_POWER_CYCLE_CMD )) && + ( command == IPMITOOL_POWER_CYCLE_CMD ) || + ( command == IPMITOOL_BOOTDEV_PXE_CMD)) && ( daemon_is_file_present ( MTC_CMD_FIT__POWER_CMD ))) { + slog("%s FIT Bypass power or bootdev command", info_ptr->hostname.c_str()); bypass_ipmitool_request = true ; rc = PASS ; }