From 1011fd8a1a18c5850cabf54267cd1b8175127225 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Thu, 16 May 2019 10:59:08 -0400 Subject: [PATCH] Add network boot support to mtce reinstall handling The current maintenance 'reinstall' handling requires a host to be booted and online in order to perform a reinstall by asking the mtcClient to wipe the disks and self reboot thereby forcing a network boot and reinstall. This re-install process is problematic for hosts that don't install properly and never come online or on new system installs where the existing boot image on disk is still valid ; local disk as the first boot device. Getting around these issues prior to this update requires manual BIOS intervention to force-select a network boot. This update continues to support the online-wipedisk method for hosts that are not BMC provisioned and adds offline reinstall support through IPMI commands for hosts that are BMC provisioned. For hosts that have the BMC provisioned, the re-install handler will wait up to 10 minutes for maintenance to establish connectivity to the BMC if it has not already. Then it will issue a bootdev pxe IPMI command to tell the BMC to boot from the network on the 'next' reset and then maintenance proceeds to reset that host by a second IPMI command. This way the host will boot from the network and perform a local install even if the current image on disk is valid. No manual BIOS actions required. This update requires a small system inventory update to relax the online requirement for BMC provisioned hosts so that the reinstall can proceed. That update depends on this. This update also does some minor cleanup in the unused mtcAgent test head to fix a static analysis error. 
Test Plan: With BMC Test Cases: Success ---------------------------- PASS: Verify install requiring power on with valid image on disk ; pass case PASS: Verify install while powered on but offline with invalid image on disk ; pass case PASS: Verify install while powered on but offline with valid image on disk ; pass case PASS: Verify install with UEFI boot PASS: Verify BMC Reinstall on Dell (720) PASS: Verify BMC Reinstall on WC PASS: Verify BMC Reinstall on HP (hp380) PEND: Verify BMC Reinstall on SM PEND: Verify BMC Reinstall on WP COND: Verify install Secure boot - 430 1-2 fails With BMC Test Cases: Failure --------------------------- PASS: Verify reinstall handling during install during online wait ; restarts the install PASS: Verify reinstall handling during install before online wait ; no install interruption PASS: Verify BMC not accessible at ReInstall start ; recovery PASS: Verify BMC not accessible at ReInstall start ; timeout PASS: Verify BMC accessibility loss over Install process PASS: Verify netboot request failure handling ; no/bad response ; max retry PASS: Verify reset request failure handling ; no retries PASS: Verify BMC de-provisioning over install ; failure handling PASS: Verify BMC re-provisioning over install ; BMC initially not accessible PASS: Verify BMC re-provisioning over install ; BMC initially accessible PASS: Verify install requiring power on but gets power-on receive failure PASS: Verify install requiring power on but gets power-on request failure No BMC Test Cases: Success -------------------------- PASS: Verify install when host is powered on and online No BMC Test Cases: Failure -------------------------- PASS: Verify reinstall action handling during reinstall ; no install interruption PASS: Verify install when host is powered off ; install fails PASS: Verify install when host is powered on and offline ; install fails Regression: ----------- PASS: Verify host reset PASS: Verify host power-off PASS: Verify host power-on PASS: Verify 
host sensor model and monitoring Change-Id: Ic8c8232167c570e4f75c0bbe1604697966157184 Story: 2005650 Task: 30935 Signed-off-by: Eric MacDonald --- mtce-common/src/common/ipmiUtil.h | 4 + mtce-common/src/common/nodeBase.cpp | 14 +- mtce-common/src/common/nodeBase.h | 24 +- mtce-common/src/common/nodeTimers.cpp | 2 +- mtce-common/src/common/nodeTimers.h | 3 +- mtce/src/common/nodeClass.cpp | 19 ++ mtce/src/maintenance/mtcIpmiUtil.cpp | 5 + mtce/src/maintenance/mtcNodeCtrl.cpp | 25 -- mtce/src/maintenance/mtcNodeFsm.cpp | 37 +-- mtce/src/maintenance/mtcNodeHdlrs.cpp | 447 ++++++++++++++++++++++---- mtce/src/maintenance/mtcThreads.cpp | 14 +- 11 files changed, 472 insertions(+), 122 deletions(-) diff --git a/mtce-common/src/common/ipmiUtil.h b/mtce-common/src/common/ipmiUtil.h index 3a4c650f..6567b303 100644 --- a/mtce-common/src/common/ipmiUtil.h +++ b/mtce-common/src/common/ipmiUtil.h @@ -43,6 +43,9 @@ #define IPMITOOL_RESTART_CAUSE_CMD ((const char *)("chassis restart_cause")) +#define IPMITOOL_BOOTDEV_PXE_CMD ((const char *)("chassis bootdev pxe")) +#define IPMITOOL_BOOTDEV_PXE_RESP ((const char *)("Set Boot Device to pxe")) + #define IPMITOOL_MC_INFO_CMD ((const char *)("mc info")) #define IPMITOOL_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result")) @@ -65,6 +68,7 @@ typedef enum IPMITOOL_THREAD_CMD__MC_INFO, IPMITOOL_THREAD_CMD__POWER_STATUS, IPMITOOL_THREAD_CMD__RESTART_CAUSE, + IPMITOOL_THREAD_CMD__BOOTDEV_PXE, IPMITOOL_THREAD_CMD__READ_SENSORS, diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index bb15979d..e799815f 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -325,7 +325,7 @@ static std::string sensorStages_str [MTC_SENSOR__STAGES +1] ; static std::string powerStages_str [MTC_POWER__STAGES +1] ; static std::string powercycleStages_str [MTC_POWERCYCLE__STAGES +1] ; static std::string resetStages_str [MTC_RESET__STAGES +1] ; -static std::string reinstallStages_str 
[MTC_RESET__STAGES +1] ; +static std::string reinstallStages_str [MTC_REINSTALL__STAGES +1] ; static std::string oosTestStages_str [MTC_OOS_TEST__STAGES +1] ; static std::string insvTestStages_str [MTC_INSV_TEST__STAGES +1] ; static std::string configStages_str [MTC_CONFIG__STAGES +1] ; @@ -451,7 +451,17 @@ void mtc_stages_init ( void ) resetStages_str [MTC_RESET__STAGES ] = "Reset-Unknown"; reinstallStages_str [MTC_REINSTALL__START ] = "Reinstall-Start"; - reinstallStages_str [MTC_REINSTALL__RESP_WAIT ] = "Reinstall-Response-Wait"; + reinstallStages_str [MTC_REINSTALL__START_WAIT ] = "Reinstall-Start-Wait"; + reinstallStages_str [MTC_REINSTALL__RESTART ] = "Reinstall-ReStart"; + reinstallStages_str [MTC_REINSTALL__RESTART_WAIT ] = "Reinstall-ReStart-Wait"; + reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn"; + reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait"; + reinstallStages_str [MTC_REINSTALL__NETBOOT ] = "Reinstall-Netboot"; + reinstallStages_str [MTC_REINSTALL__NETBOOT_WAIT ] = "Reinstall-Netboot-Wait"; + reinstallStages_str [MTC_REINSTALL__RESET ] = "Reinstall-Reset"; + reinstallStages_str [MTC_REINSTALL__RESET_WAIT ] = "Reinstall-Reset-Wait"; + reinstallStages_str [MTC_REINSTALL__WIPEDISK ] = "Reinstall-Wipedisk"; + reinstallStages_str [MTC_REINSTALL__WIPEDISK_WAIT ] = "Reinstall-Wipedisk-Wait"; reinstallStages_str [MTC_REINSTALL__OFFLINE_WAIT ] = "Reinstall-Offline-Wait"; reinstallStages_str [MTC_REINSTALL__ONLINE_WAIT ] = "Reinstall-Online-Wait"; reinstallStages_str [MTC_REINSTALL__FAIL ] = "Reinstall-Failure"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 76bce0dd..eb23ffbd 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -220,7 +220,17 @@ void daemon_exit ( void ); #define MTC_TASK_REBOOT_FAIL_RETRY "Reboot Failed, retrying (%d of %d)" #define MTC_TASK_REBOOT_ABORT "Reboot Failed, try again when host is 'online'" #define 
MTC_TASK_RESET_PROG "Rebooting/Resetting Host" -#define MTC_TASK_REINSTALL "Reinstalling Host" +#define MTC_TASK_REINSTALL "Reinstalling" +#define MTC_TASK_REINSTALL_WAIT_NA "Reinstall Wait ; BMC not accessible" +#define MTC_TASK_REINSTALL_RTRY_PC "Reinstall Retry ; BMC provisioned change during install" +#define MTC_TASK_REINSTALL_FAIL_CL "Reinstall Failed ; BMC connectivity lost" +#define MTC_TASK_REINSTALL_FAIL_OL "Reinstall Failed ; timeout waiting for offline" +#define MTC_TASK_REINSTALL_FAIL_TO "Reinstall Failed ; timeout waiting for online" +#define MTC_TASK_REINSTALL_FAIL_BA "Reinstall Failed ; timeout waiting BMC access" +#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power on host" +#define MTC_TASK_REINSTALL_FAIL_NB "Reinstall Failed ; netboot request" +#define MTC_TASK_REINSTALL_FAIL_PR "Reinstall Failed ; power reset request" + #define MTC_TASK_REINSTALL_FAIL "Reinstall Failed" #define MTC_TASK_REINSTALL_SUCCESS "Reinstall Succeeded" #define MTC_TASK_BOOTING "Booting" @@ -1008,7 +1018,17 @@ string get_resetStages_str ( mtc_resetStages_enum stage ); typedef enum { MTC_REINSTALL__START = 0, - MTC_REINSTALL__RESP_WAIT, + MTC_REINSTALL__START_WAIT, + MTC_REINSTALL__RESTART, + MTC_REINSTALL__RESTART_WAIT, + MTC_REINSTALL__POWERON, + MTC_REINSTALL__POWERON_WAIT, + MTC_REINSTALL__NETBOOT, + MTC_REINSTALL__NETBOOT_WAIT, + MTC_REINSTALL__RESET, + MTC_REINSTALL__RESET_WAIT, + MTC_REINSTALL__WIPEDISK, + MTC_REINSTALL__WIPEDISK_WAIT, MTC_REINSTALL__OFFLINE_WAIT, MTC_REINSTALL__ONLINE_WAIT, MTC_REINSTALL__FAIL, diff --git a/mtce-common/src/common/nodeTimers.cpp b/mtce-common/src/common/nodeTimers.cpp index 390e9183..d3590378 100755 --- a/mtce-common/src/common/nodeTimers.cpp +++ b/mtce-common/src/common/nodeTimers.cpp @@ -234,7 +234,7 @@ int _timer_stop ( struct mtc_timer * mtcTimer_ptr , bool int_safe) } else if ( int_safe == false ) { - elog ("%s (%s) called with null TID (count:%d)\n", + wlog ("%s (%s) called with null TID (count:%d)\n", 
mtcTimer_ptr->hostname.c_str(), mtcTimer_ptr->service.c_str(), timer_count); diff --git a/mtce-common/src/common/nodeTimers.h b/mtce-common/src/common/nodeTimers.h index 823a7167..0d6d25ae 100755 --- a/mtce-common/src/common/nodeTimers.h +++ b/mtce-common/src/common/nodeTimers.h @@ -64,7 +64,7 @@ #define MTC_ALIVE_TIMER (5) #define MTC_POWEROFF_DELAY (5) #define MTC_SWACT_POLL_TIMER (10) -#define MTC_TASK_UPDATE_DELAY (10) +#define MTC_TASK_UPDATE_DELAY (30) #define MTC_BM_PING_TIMEOUT (30) #define MTC_BM_POWEROFF_TIMEOUT (30) #define MTC_BM_POWERON_TIMEOUT (30) @@ -80,6 +80,7 @@ #define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5) #define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11) #define MTC_REINSTALL_TIMEOUT_DEFAULT (MTC_MINS_40) +#define MTC_REINSTALL_TIMEOUT_BMC_ACC (MTC_MINS_10) #define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1) #define MTC_REINSTALL_TIMEOUT_MAX (MTC_HRS_4) #define MTC_REINSTALL_WAIT_TIMER (10) diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 4936cba8..8dbb755a 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -2181,6 +2181,25 @@ int nodeLinkClass::mod_host ( node_inv_type & inv ) else if ( !inv.action.compare ( "reinstall" ) ) { plog ("%s Reinstall Action\n", node_ptr->hostname.c_str()); + if ( node_ptr->adminAction == MTC_ADMIN_ACTION__REINSTALL ) + { + /* Allow user to restart the re-install if + * - its in progress, + * - there is a BMC provisioned and + * - are waiting while the actual install is in progress */ + if (( node_ptr->bm_provisioned == true ) && + ( node_ptr->reinstallStage == MTC_REINSTALL__ONLINE_WAIT)) + { + reinstallStageChange ( node_ptr , MTC_REINSTALL__START ); + } + else + { + /* Otherwise allow the current install to continue + * remind the user that there is a reinstall + * in progress */ + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL); + } + } adminActionChange ( node_ptr , MTC_ADMIN_ACTION__REINSTALL ); /* generate command=reinstall log */ diff --git 
a/mtce/src/maintenance/mtcIpmiUtil.cpp b/mtce/src/maintenance/mtcIpmiUtil.cpp index 349e2c58..1e865cde 100644 --- a/mtce/src/maintenance/mtcIpmiUtil.cpp +++ b/mtce/src/maintenance/mtcIpmiUtil.cpp @@ -130,6 +130,11 @@ int nodeLinkClass::ipmi_command_send ( struct nodeLinkClass::node * node_ptr, in { want_fit = true ; } + else if (( command == IPMITOOL_THREAD_CMD__BOOTDEV_PXE ) && + ( daemon_want_fit ( fit, node_ptr->hostname, "netboot_pxe" ) == true )) + { + want_fit = true ; + } if ( want_fit == true ) { diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index a0523149..1cd96655 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -1615,31 +1615,6 @@ extern int mtcJsonInv_testhead ( void ); int daemon_run_testhead ( void ) { int rc = PASS; - - mtc_config.testmode = true ; - - nodeLinkClass * mtcInv_testhead_ptr = new nodeLinkClass ; - - printf ("\n\n"); - printf (TESTHEAD_BAR); - - printf ("| Node Class Test Head - Private and Public Member Functions\n"); - printf (TESTHEAD_BAR); - for ( int i = 0 ; i < 11 ; i++ ) - { - if ( mtcInv_testhead_ptr->testhead ( i+1 ) ) - { - FAILED_STR ; - rc = FAIL ; - } - else - PASSED ; - } - free(mtcInv_testhead_ptr); - - printf (TESTHEAD_BAR); - printf ("| Maintenance Timer Test Head\n"); - printf (TESTHEAD_BAR); return (rc); } diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index 19eba3ee..e9aca599 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -294,37 +294,16 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) { flog ("%s -> OOS Action Check\n", node_ptr->hostname.c_str()); - /* TEMPORARY: To allow reset of unlocked host for fault insertion. 
*/ - if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RESET ) - { - wlog ("%s Allowing Reset of unlocked host for FIT\n", node_ptr->hostname.c_str()); + elog ("%s Administrative '%s' Operation Rejected\n", + node_ptr->hostname.c_str(), + get_adminAction_str (node_ptr->adminAction) ); - if ( node_ptr->hostname.compare(nodeLinkClass::my_hostname)) - { - nodeLinkClass::reset_handler ( node_ptr ); - } - else - { - wlog ("%s Cowardly avoiding reset of self\n", node_ptr->hostname.c_str()); - adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE ); + elog ("%s Cannot perform out-of-service action against in-service host\n", + node_ptr->hostname.c_str()); + adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE ); - /* Clear the UI task since we are not really resetting */ - mtcInvApi_update_task ( node_ptr, "" ); - } - } - else - { - elog ("%s Administrative '%s' Operation Rejected\n", - node_ptr->hostname.c_str(), - get_adminAction_str (node_ptr->adminAction) ); - - elog ("%s Cannot perform out-of-service action against in-service host\n", - node_ptr->hostname.c_str()); - adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE ); - - /* Clear the UI task since we are not really resetting */ - mtcInvApi_update_task ( node_ptr, "" ); - } + /* Clear the UI task since we are not really taking this action */ + mtcInvApi_update_task ( node_ptr, "" ); } /**************************************************************************** diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index f7700411..6aa14f42 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1971,12 +1971,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { if ( mtcTimer_expired ( node_ptr->mtcTimer )) { - rc = ipmi_command_recv ( node_ptr ); - if ( rc == RETRY ) - { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); - break ; - } + rc = ipmi_command_recv ( node_ptr ); + if ( 
rc == RETRY ) + { + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT ); + break ; + } if ( rc ) { @@ -4011,63 +4011,364 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) return (PASS); } -/* Reinstall handler - * -------------- +/**************************************************************************** + * + * Name : reinstall_handler + * + * Purpose : Perform actions that result in a network boot so that a new + * image is installed on the specified node's boot partition. + * + * Description: This FSM handles node (re)install with and without + * a provisioned Board Management Controller (BMC). + * + * BMC provisioned case: using IPMI commands to BMC ... + * + * - ensure host power is on + * - force network boot on next reset + * - issue node reset + * + * BMC not provisioned case: using mtce messaging to node ... + * + * - send mtcClient wipedisk command + * fail reinstall if no ACK + * - send mtcClient reboot command + * + * Both casess: + * + * - wait for offline + * - wait for online + * - install complete + * + * Failure Handling: + * + * BMC provisioned cases: + * + * BMC won't power on + * BMC ipmi command failure + * BMC connectivity lost mid-FSM. 
+ * BMC access timeout + * + * BMC not provisioned cases: + * + * no wipedisk ACK\ + * + * failure to go offline after resaet/reboot + * timeout waiting for online after reset/reboot + * * Manage reinstall operations for a locked-disabled host */ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) { + /* Handle 'lost BMC connectivity during the install' case */ + if (( node_ptr->bm_provisioned == true ) && + ( node_ptr->bm_accessible == false )) + { + if (( node_ptr->reinstallStage != MTC_REINSTALL__START ) && + ( node_ptr->reinstallStage != MTC_REINSTALL__START_WAIT ) && + ( node_ptr->reinstallStage != MTC_REINSTALL__FAIL ) && + ( node_ptr->reinstallStage != MTC_REINSTALL__MSG_DISPLAY ) && + ( node_ptr->reinstallStage != MTC_REINSTALL__DONE )) + { + mtcTimer_reset ( node_ptr->mtcTimer ); + + elog ("%s Reinstall lost bmc connection", + node_ptr->hostname.c_str()); + + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_CL ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); + } + + /* fall into switch to ... 
+ * - handle failure + * - finish the FSM + */ + } switch ( node_ptr->reinstallStage ) { case MTC_REINSTALL__START: { - int host_reinstall_wait_timer = node_ptr->mtcalive_timeout + node_reinstall_timeout ; - node_ptr->retries = host_reinstall_wait_timer / MTC_REINSTALL_WAIT_TIMER ; + LOAD_NODETYPE_TIMERS ; + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL ); + node_ptr->retries = ( node_ptr->mtcalive_timeout + + this->node_reinstall_timeout) / + MTC_REINSTALL_WAIT_TIMER ; + mtcTimer_reset ( node_ptr->mtcTimer ); + if ( node_ptr->bm_provisioned == true ) + { + if ( node_ptr->bm_accessible == false ) + { + /* Handle 'lost BMC connectivity during the install' case */ + wlog ("%s Reinstall wait for BMC access ; %d second timeout", + node_ptr->hostname.c_str(), + MTC_REINSTALL_TIMEOUT_BMC_ACC); - start_offline_handler ( node_ptr ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_WAIT_NA ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_TIMEOUT_BMC_ACC ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__START_WAIT ); + } + else if ( node_ptr->power_on == false ) + { + /* need to power on node */ + wlog ("%s Reinstall power-on required", node_ptr->hostname.c_str()); + reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON ); + } + else + { + /* power is on so issue net boot command */ + ilog ("%s Reinstall power is on", node_ptr->hostname.c_str()); + reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT ); + } + } + else + { + /* If the BMC is not provisioned coming into this handler + * then service the install by mtce commands by starting + * the install by wipedisk. */ + reinstallStageChange ( node_ptr, MTC_REINSTALL__WIPEDISK ); + } + break ; + } + /* BMC provisioned but bm_handler has not reported accessability yet. + * Need to wait ... 
*/ + case MTC_REINSTALL__START_WAIT: + { + if ( node_ptr->bm_provisioned == true ) + { + if ( node_ptr->bm_accessible == false ) + { + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + /* wait period has timed out ; fail the install */ + elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_BA); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_BA ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); + } + else + { + ; /* ... wait longer */ + } + } + else + { + /* the BMC is not accessible to start the install over */ + plog ("%s BMC access established ; starting install", + node_ptr->hostname.c_str()); + mtcTimer_reset ( node_ptr->mtcTimer ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__START ); + } + } + else + { + /* + * Handle case where BMC gets deprovisioned + * while waiting for accessibility. + * + * Restart the install in that case after a monitored + * wait period for reprovision. + * + * Has the side effect of allowing the admin to + * reprovision the BMC during a re-install. + */ + mtcTimer_reset ( node_ptr->mtcTimer ); + wlog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_RTRY_PC ); + + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_RTRY_PC ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_5 ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__RESTART_WAIT ); + } + break ; + } + case MTC_REINSTALL__RESTART_WAIT: + { + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + reinstallStageChange ( node_ptr , MTC_REINSTALL__START ); + } + else if ( node_ptr->bm_provisioned == true ) + { + mtcTimer_reset ( node_ptr->mtcTimer ); + wlog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_RTRY_PC ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__START ); + } + else + { + ; /* ... 
wait longer */ + } + break ; + } + case MTC_REINSTALL__POWERON: + { + powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT ); + break ; + } + case MTC_REINSTALL__POWERON_WAIT: + { + /* The power handler manages timeout */ + if ( node_ptr->powerStage == MTC_POWER__DONE ) + { + if ( node_ptr->power_on == true ) + { + if ( node_ptr->task != MTC_TASK_REINSTALL ) + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL ); + + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT ); + } + else + { + elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_PO); + + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PO ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); + } + } + else + { + /* run the power handler till the host's power is on or + * the power-on handler times out */ + power_handler ( node_ptr ); + } + break ; + } + case MTC_REINSTALL__NETBOOT: + { + /* Issue netboot command after timed delay */ + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + int rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__BOOTDEV_PXE ); + if ( rc ) + { + elog ("%s Reinstall netboot request failed (rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + else + { + ilog ("%s Reinstall netboot request sent", node_ptr->hostname.c_str() ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__NETBOOT_WAIT ); + } + } + break ; + } + case MTC_REINSTALL__NETBOOT_WAIT: + { + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + int rc = ipmi_command_recv ( node_ptr ); + if ( rc == PASS ) + { + ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str()); + reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET); + } + else 
if ( rc == RETRY ) + { + wlog ("%s Reinstall netboot receive retry", node_ptr->hostname.c_str()); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + } + else + { + elog ("%s Reinstall netboot receive failed (rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_NB ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + } + break ; + } + case MTC_REINSTALL__RESET: + { + int rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_RESET ); + if ( rc ) + { + elog ("%s Reinstall reset request failed (rc:%d)", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + else + { + ilog ("%s Reinstall reset request sent", node_ptr->hostname.c_str()); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET_WAIT ); + } + break ; + } + case MTC_REINSTALL__RESET_WAIT: + { + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) + { + int rc = ipmi_command_recv ( node_ptr ); + if ( rc == PASS ) + { + ilog ("%s Reinstall reset request completed", node_ptr->hostname.c_str()); + + start_offline_handler ( node_ptr ); + + /* Wait for the host to go offline */ + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__OFFLINE_WAIT); + } + else if ( rc == RETRY ) + { + wlog ("%s Reinstall reset receive retry", node_ptr->hostname.c_str()); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 ); + } + else + { + elog ("%s Reinstall reset receive failed ; rc:%d", + node_ptr->hostname.c_str(), rc ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PR ); + reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL ); + } + } + break ; + } + /* BMC not provisioned case */ + case MTC_REINSTALL__WIPEDISK: + { 
node_ptr->cmdReq = MTC_CMD_WIPEDISK ; - plog ("%s Administrative Reinstall Requested\n", node_ptr->hostname.c_str()); + plog ("%s Reinstall wipedisk requested", node_ptr->hostname.c_str()); if ( send_mtc_cmd ( node_ptr->hostname, MTC_CMD_WIPEDISK, MGMNT_INTERFACE ) != PASS ) { - elog ("Failed to send 'reinstall' request to %s\n", node_ptr->hostname.c_str()); + elog ("%s Reinstall request send failed", node_ptr->hostname.c_str()); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); } else { node_ptr->cmdRsp = MTC_CMD_NONE ; - if ( node_ptr->mtcTimer.tid ) - { - mtcTimer_stop ( node_ptr->mtcTimer ); - } - + mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT ); - ilog ("%s waiting for REINSTALL ACK \n", node_ptr->hostname.c_str() ); - - reinstallStageChange ( node_ptr , MTC_REINSTALL__RESP_WAIT ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__WIPEDISK_WAIT ); } break ; } - case MTC_REINSTALL__RESP_WAIT: + case MTC_REINSTALL__WIPEDISK_WAIT: { if ( node_ptr->cmdRsp != MTC_CMD_WIPEDISK ) { if ( node_ptr->mtcTimer.ring == true ) { - elog ("%s REINSTALL ACK Timeout\n", + elog ("%s Reinstall wipedisk ACK timeout", node_ptr->hostname.c_str()); - + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); } } else { /* declare successful reinstall request */ - plog ("%s REINSTALL Request Succeeded\n", node_ptr->hostname.c_str()); + plog ("%s Reinstall request succeeded", node_ptr->hostname.c_str()); - mtcTimer_stop ( node_ptr->mtcTimer ); + mtcTimer_reset ( node_ptr->mtcTimer ); + + start_offline_handler ( node_ptr ); /* We need to wait for the host to go offline */ mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RESET_TO_OFFLINE_TIMEOUT ); @@ -4085,49 +4386,57 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) clear_service_readies ( 
node_ptr ); - ilog ("%s Reinstall Progress: host is offline ; waiting for host to come back\n", node_ptr->hostname.c_str()); + ilog ("%s Reinstall in-progress ; waiting for 'online' state", + node_ptr->hostname.c_str()); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER ); reinstallStageChange ( node_ptr , MTC_REINSTALL__ONLINE_WAIT ); } - else if ( node_ptr->mtcTimer.ring == true ) + else if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - elog ("%s offline timeout - reinstall failed\n", node_ptr->hostname.c_str()); + elog ("%s failed to go offline ; timeout", node_ptr->hostname.c_str()); + stop_offline_handler ( node_ptr ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_OL ); reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); } + else + { + ; // wait longer ... + } break ; } case MTC_REINSTALL__ONLINE_WAIT: { - if ( node_ptr->mtcTimer.ring == true ) + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - if ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) + if ( --node_ptr->retries < 0 ) { - mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS); - mtcTimer_stop ( node_ptr->mtcTimer ); - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); - reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY ); - mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE ); + elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_TO); + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_TO ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); } else { - if ( --node_ptr->retries < 0 ) - { - elog ("%s online timeout - reinstall failed\n", node_ptr->hostname.c_str()); - reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL ); - } - else - { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER ); - } + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_WAIT_TIMER ); } } + else if ( 
node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) + { + mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_SUCCESS); + mtcTimer_reset ( node_ptr->mtcTimer ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); + reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY ); + mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE ); + } + else + { + ; // wait longer ... + } break; } case MTC_REINSTALL__FAIL: { - mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL); - mtcTimer_stop ( node_ptr->mtcTimer ); + mtcTimer_reset ( node_ptr->mtcTimer ); mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY ); reinstallStageChange ( node_ptr , MTC_REINSTALL__MSG_DISPLAY ); mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED ); @@ -4135,23 +4444,33 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_REINSTALL__MSG_DISPLAY: { - if ( node_ptr->mtcTimer.ring == true ) + if ( mtcTimer_expired ( node_ptr->mtcTimer ) ) { - node_ptr->mtcTimer.ring = false ; reinstallStageChange ( node_ptr , MTC_REINSTALL__DONE ); } + else + { + ; // wait longer ... 
+ } break ; } case MTC_REINSTALL__DONE: default: { - plog ("%s Reinstall Completed\n", node_ptr->hostname.c_str()); + if ( node_ptr->task == MTC_TASK_REINSTALL_SUCCESS ) + { + plog ("%s Reinstall completed successfully", + node_ptr->hostname.c_str()); + } + else + { + plog ("%s Reinstall complete ; operation failure", + node_ptr->hostname.c_str()); + } /* Default timeout values */ LOAD_NODETYPE_TIMERS ; - mtcTimer_stop ( node_ptr->mtcTimer ); - adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE ); recovery_ctrl_init ( node_ptr->hwmon_reset ); @@ -4583,6 +4902,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } else { + node_ptr->power_on = false ; ilog ("%s power is off ; powering on ...\n", node_ptr->hostname.c_str() ); powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND ); } @@ -4623,7 +4943,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) } else { - rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON ); + rc = ipmi_command_send ( node_ptr, IPMITOOL_THREAD_CMD__POWER_ON ); if ( rc ) { wlog ("%s Power-On request failed (%d)\n", @@ -4918,14 +5238,14 @@ int nodeLinkClass::powercycle_handler ( struct nodeLinkClass::node * node_ptr ) { bool on = false ; - ilog ("%s Power Status: %s\n", - node_ptr->hostname.c_str(), - node_ptr->ipmitool_thread_info.data.c_str()); + ilog ("%s Power Status: %s\n", + node_ptr->hostname.c_str(), + node_ptr->ipmitool_thread_info.data.c_str()); - if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos ) - { - on = true ; - } + if ( node_ptr->ipmitool_thread_info.data.find ( IPMITOOL_POWER_ON_STATUS ) != std::string::npos ) + { + on = true ; + } if ( rc == PASS ) { /* maintain current power state */ @@ -6033,6 +6353,12 @@ int nodeLinkClass::bm_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->ipmitool_thread_info.data.c_str()); plog ("%s bmc is accessible\n", node_ptr->hostname.c_str()); + /* set host power state ; on 
or off */ + if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_ON_STATUS) != std::string::npos ) + node_ptr->power_on = true ; + else + node_ptr->power_on = false ; + if ( node_ptr->ipmitool_thread_info.data.find (IPMITOOL_POWER_OFF_STATUS) != std::string::npos ) { if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) @@ -6333,7 +6659,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } node_ptr->ipmitool_thread_ctrl.done = true ; } - #endif /* Audits for this controller host only */ diff --git a/mtce/src/maintenance/mtcThreads.cpp b/mtce/src/maintenance/mtcThreads.cpp index 0e7c4acd..5a970885 100644 --- a/mtce/src/maintenance/mtcThreads.cpp +++ b/mtce/src/maintenance/mtcThreads.cpp @@ -79,6 +79,7 @@ void * mtcThread_ipmitool ( void * arg ) switch ( info_ptr->command ) { + /* control commands */ case IPMITOOL_THREAD_CMD__POWER_RESET: { command = IPMITOOL_POWER_RESET_CMD ; @@ -103,6 +104,14 @@ void * mtcThread_ipmitool ( void * arg ) response = IPMITOOL_POWER_CYCLE_RESP ; break ; } + case IPMITOOL_THREAD_CMD__BOOTDEV_PXE: + { + command = IPMITOOL_BOOTDEV_PXE_CMD ; + response = IPMITOOL_BOOTDEV_PXE_RESP ; + break ; + } + + /* Status commands */ case IPMITOOL_THREAD_CMD__POWER_STATUS: { command = IPMITOOL_POWER_STATUS_CMD ; @@ -118,6 +127,7 @@ void * mtcThread_ipmitool ( void * arg ) command = IPMITOOL_MC_INFO_CMD ; break ; } + default: { rc = info_ptr->status = FAIL_BAD_CASE ; @@ -222,9 +232,11 @@ void * mtcThread_ipmitool ( void * arg ) else if ((( command == IPMITOOL_POWER_RESET_CMD ) || ( command == IPMITOOL_POWER_OFF_CMD ) || ( command == IPMITOOL_POWER_ON_CMD ) || - ( command == IPMITOOL_POWER_CYCLE_CMD )) && + ( command == IPMITOOL_POWER_CYCLE_CMD ) || + ( command == IPMITOOL_BOOTDEV_PXE_CMD)) && ( daemon_is_file_present ( MTC_CMD_FIT__POWER_CMD ))) { + slog("%s FIT Bypass power or bootdev command", info_ptr->hostname.c_str()); bypass_ipmitool_request = true ; rc = PASS ; }