From 1210ed450a291829b7ce0a30f63e2a038c70fce0 Mon Sep 17 00:00:00 2001
From: Jagatguru Prasad Mishra
Date: Mon, 20 Nov 2023 06:41:16 -0500
Subject: [PATCH] Failure case handling of LUKS service

The luks-fs-mgr service creates and unseals the LUKS volume used to
store keys/secrets. This change handles the failure case where this
essential service is inactive. It introduces a new alarm, LUKS_ALARM_ID
(200.016), which is raised when the service is inactive, indicating a
problem creating or unsealing the LUKS volume.

Test Plan:
PASS: build-pkgs -c -p mtce-common
PASS: build-pkgs -c -p mtce
PASS: build-image
PASS: AIO-SX bootstrap with LUKS volume status active
PASS: AIO-DX bootstrap with LUKS volume status active
PASS: Standard setup with 2 controllers and 1 compute node with LUKS
      volume status active. There should not be any alarm and node
      status should be unlocked/enabled/available.
PASS: AIO-DX node enable failure on the controller where the LUKS
      volume is inactive. Node availability should be failed. A
      critical alarm with id 200.016 should be displayed with
      'fm alarm-list'.
PASS: AIO-SX node enable failure on controller-0. Node availability
      should be failed. A critical alarm with id 200.016 should be
      displayed with 'fm alarm-list'.
PASS: Standard: node enable failure on the node (controller-0,
      controller-1, storage-0, compute-1). Node availability should be
      failed. A critical alarm with id 200.016 should be displayed with
      'fm alarm-list' for the failed host.
PASS: AIO-DX in-service: volume inactive status should be detected and
      a critical alarm should be raised with ID 200.016. Node
      availability should be changed to degraded.
PASS: AIO-SX in-service: volume inactive status should be detected and
      a critical alarm should be raised with ID 200.016. Node
      availability should be changed to degraded.
PASS: Standard (2 controllers, 1 storage, 1 compute) in-service: volume
      inactive status should be detected and a critical alarm should be
      raised with ID 200.016. Node availability should be changed to
      degraded.
PASS: AIO-DX in-service: if the volume becomes active while a LUKS
      alarm is active, the alarm should be cleared. Node availability
      should be changed to available.
PASS: AIO-SX in-service: if the volume becomes active while a LUKS
      alarm is active, the alarm should be cleared. Node availability
      should be changed to available.
PASS: Standard (2 controllers, 1 storage, 1 compute) in-service: if the
      volume becomes active while a LUKS alarm is active, the alarm
      should be cleared. Node availability should be changed to
      available.
PASS: AIO-SX, AIO-DX, Standard: if the in-service test fails and node
      availability is 'failed', a lock/unlock after fixing the volume
      issue should make the node available.
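Reviewer note: the detection itself is the single check added to
create_mtcAlive_msg() in mtcCompMsg.cpp below. The mtcClient shells out
to cryptsetup and, on a non-zero exit status, ORs
MTC_FLAG__LUKS_VOL_FAILED into the mtcAlive flags word that the agent
later examines in enable_handler() and insv_test_handler(). The
standalone sketch below only illustrates that check in isolation; the
flag value and the cryptsetup command are taken from this patch, while
the main() wrapper and the printed summary are illustrative and not
part of the change.

    // Illustrative sketch only -- not part of the patch.
    // Mirrors the check added to create_mtcAlive_msg(): a non-zero
    // exit status from cryptsetup means the LUKS volume is not active.
    #include <cstdio>
    #include <cstdlib>

    #define MTC_FLAG__LUKS_VOL_FAILED (0x00000400) /* same value as nodeBase.h */

    int main ( void )
    {
        unsigned int flags = 0 ;

        /* 'cryptsetup status <name>' exits 0 only when the mapping is active */
        int exitstatus = system("cryptsetup status luks_encrypted_vault");
        if ( 0 != exitstatus )
            flags |= MTC_FLAG__LUKS_VOL_FAILED ;

        printf ("LUKS volume %s (flags:%x)\n",
                (flags & MTC_FLAG__LUKS_VOL_FAILED) ? "failed" : "active",
                flags);
        return 0 ;
    }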
Story: 2010872
Task: 49108
Change-Id: I4621e7c546078c3cc22fe47079ba7725fbea5c8f
Signed-off-by: Jagatguru Prasad Mishra
---
 mtce-common/src/common/alarmUtil.h    |  1 +
 mtce-common/src/common/nodeBase.h     |  5 +++-
 mtce/src/alarm/alarm.h                |  1 +
 mtce/src/common/nodeClass.cpp         | 35 +++++++++++++++++++++++
 mtce/src/common/nodeClass.h           |  3 ++
 mtce/src/maintenance/mtcAlarm.cpp     | 29 +++++++++++++++++++
 mtce/src/maintenance/mtcAlarm.h       |  1 +
 mtce/src/maintenance/mtcCompMsg.cpp   |  5 ++++
 mtce/src/maintenance/mtcNodeCtrl.cpp  |  7 +++++
 mtce/src/maintenance/mtcNodeHdlrs.cpp | 40 +++++++++++++++++++++++++--
 mtce/src/scripts/mtc.conf             |  2 ++
 11 files changed, 125 insertions(+), 4 deletions(-)

diff --git a/mtce-common/src/common/alarmUtil.h b/mtce-common/src/common/alarmUtil.h
index e4aebfca..c2f7faed 100644
--- a/mtce-common/src/common/alarmUtil.h
+++ b/mtce-common/src/common/alarmUtil.h
@@ -41,6 +41,7 @@
 #define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */
 #define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */
 #define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */
+#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */
 
 #define EVENT_LOG_ID ((const char *)"200.020")
 #define COMMAND_LOG_ID ((const char *)"200.021")
diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h
index b6028fa8..3dabad0d 100755
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@@ -86,6 +86,7 @@ void daemon_exit ( void );
 #define MTC_FLAG__SM_DEGRADED (0x00000080)
 #define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */
 #define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */
+#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400)
 #define MTC_FLAG__SM_UNHEALTHY (0x00001000)
 
 #define MTC_UNHEALTHY_THRESHOLD (3)
@@ -289,6 +290,7 @@ typedef enum
 #define MTC_TASK_AR_DISABLED_SERVICES "Service Failure, threshold reached, Lock/Unlock to retry"
 #define MTC_TASK_AR_DISABLED_ENABLE "Enable Failure, threshold reached, Lock/Unlock to retry"
 #define MTC_TASK_AR_DISABLED_HEARTBEAT "Heartbeat Failure, threshold reached, Lock/Unlock to retry"
+#define MTC_TASK_AR_DISABLED_LUKS "LUKS volume failure, threshold reached, Lock/Unlock to retry"
 
 #define MTC_TASK_RESET_FAIL "Reset Failed"
 #define MTC_TASK_RESET_QUEUE "Reset Failed, retrying (%d of %d)"
@@ -1020,7 +1022,7 @@ string get_configStages_str ( mtc_configStages_enum stage );
 #define DEGRADE_MASK_CONFIG 0x00000400
 #define DEGRADE_MASK_COLLECTD 0x00000800
 #define DEGRADE_MASK_ENABLE 0x00001000
-#define DEGRADE_MASK_RES4 0x00002000
+#define DEGRADE_MASK_LUKS 0x00002000
 #define DEGRADE_MASK_RES5 0x00004000
 #define DEGRADE_MASK_RES6 0x00008000
 
@@ -1261,6 +1263,7 @@ typedef enum
     MTC_AR_DISABLE_CAUSE__GOENABLE,
     MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
     MTC_AR_DISABLE_CAUSE__HEARTBEAT,
+    MTC_AR_DISABLE_CAUSE__LUKS,
     MTC_AR_DISABLE_CAUSE__LAST,
     MTC_AR_DISABLE_CAUSE__NONE,
 } autorecovery_disable_cause_enum ;
diff --git a/mtce/src/alarm/alarm.h b/mtce/src/alarm/alarm.h
index 58e02e13..a9895ac4 100644
--- a/mtce/src/alarm/alarm.h
+++ b/mtce/src/alarm/alarm.h
@@ -39,6 +39,7 @@
 #define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */
 #define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */
 #define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */
+#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */
 
 #define EVENT_LOG_ID ((const char *)"200.020")
 #define COMMAND_LOG_ID ((const char *)"200.021")
diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp
index f8bee057..b39c8573 100755
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -1640,6 +1640,41 @@ int nodeLinkClass::lazy_graceful_fs_reboot ( struct nodeLinkClass::node * node_p
     return (FAIL);
 }
 
+/* Generate a log and a critical alarm if the LUKS volume config failed */
+int nodeLinkClass::alarm_luks_failure ( struct nodeLinkClass::node * node_ptr )
+{
+    if ( (node_ptr->degrade_mask & DEGRADE_MASK_LUKS) == 0 )
+    {
+        node_ptr->degrade_mask |= DEGRADE_MASK_LUKS ;
+    }
+
+    if ( node_ptr->alarms[MTC_ALARM_ID__LUKS] != FM_ALARM_SEVERITY_CRITICAL )
+    {
+        elog ("%s critical luks filesystem config failure\n", node_ptr->hostname.c_str());
+
+        mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__LUKS );
+        node_ptr->alarms[MTC_ALARM_ID__LUKS] = FM_ALARM_SEVERITY_CRITICAL ;
+    }
+    return (PASS);
+}
+
+/* Clear the luks alarm and degrade flag */
+int nodeLinkClass::alarm_luks_clear ( struct nodeLinkClass::node * node_ptr )
+{
+    if ( node_ptr->degrade_mask & DEGRADE_MASK_LUKS )
+    {
+        node_ptr->degrade_mask &= ~DEGRADE_MASK_LUKS ;
+    }
+
+    if ( node_ptr->alarms[MTC_ALARM_ID__LUKS] != FM_ALARM_SEVERITY_CLEAR )
+    {
+        ilog ("%s luks config alarm clear\n", node_ptr->hostname.c_str());
+
+        mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__LUKS );
+        node_ptr->alarms[MTC_ALARM_ID__LUKS] = FM_ALARM_SEVERITY_CLEAR ;
+    }
+    return (PASS);
+}
 
 /* Generate a log and a critical alarm if the node config failed */
 int nodeLinkClass::alarm_config_failure ( struct nodeLinkClass::node * node_ptr )
diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h
index 367bda87..5703911e 100755
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@@ -1106,6 +1106,9 @@ private:
     int alarm_config_clear ( struct nodeLinkClass::node * node_ptr );
     int alarm_config_failure ( struct nodeLinkClass::node * node_ptr );
 
+    int alarm_luks_clear ( struct nodeLinkClass::node * node_ptr );
+    int alarm_luks_failure ( struct nodeLinkClass::node * node_ptr );
+
     int alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, bool force );
     int alarm_compute_failure ( struct nodeLinkClass::node * node_ptr , EFmAlarmSeverityT sev );
 
diff --git a/mtce/src/maintenance/mtcAlarm.cpp b/mtce/src/maintenance/mtcAlarm.cpp
index 67c7e61e..be705d75 100644
--- a/mtce/src/maintenance/mtcAlarm.cpp
+++ b/mtce/src/maintenance/mtcAlarm.cpp
@@ -184,6 +184,34 @@ void mtcAlarm_init ( void )
If the alarm " "persists then Lock/Unlock host to recover its local compute service."); + /** LUKS volume config failure Alarm Entry *************************************/ + + ptr = &alarm_list[MTC_ALARM_ID__LUKS]; + memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); + snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", LUKS_ALARM_ID); + + ptr->name = "LUKS volume failure" ; + ptr->instc_prefix = "" ; + + ptr->minor_reason = + ptr->major_reason = + ptr->critl_reason = "LUKS volume is not active or functioning properly."; + ptr->clear_reason = "'LUKS volume' has been successfully unsealed and service is functioning properly."; + + ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL; + ptr->alarm.probable_cause = FM_ALARM_APP_SUBSYS_FAILURE ; + ptr->alarm.inhibit_alarms = FM_FALSE ; + ptr->alarm.service_affecting = FM_FALSE ; + ptr->alarm.suppression = FM_TRUE ; + + ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */ + ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */ + + snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, + "If this alarm does not automatically clear after some time and " + "continues to be asserted after Host is locked and unlocked then " + "contact next level of support for root cause analysis and recovery."); + /** Init Event Log Entry *************************************************/ ptr = &alarm_list[MTC_LOG_ID__EVENT]; @@ -315,6 +343,7 @@ string _getIdentity ( mtc_alarm_id_enum id ) case MTC_ALARM_ID__ENABLE: return (ENABLE_ALARM_ID); case MTC_ALARM_ID__BM: return (BM_ALARM_ID); case MTC_ALARM_ID__CH_COMP: return (CH_COMP_ALARM_ID); + case MTC_ALARM_ID__LUKS: return (LUKS_ALARM_ID); case MTC_LOG_ID__EVENT: return (EVENT_LOG_ID); case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID); case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID); diff --git a/mtce/src/maintenance/mtcAlarm.h b/mtce/src/maintenance/mtcAlarm.h index 3d998ff3..e0ddf87e 100644 --- a/mtce/src/maintenance/mtcAlarm.h +++ b/mtce/src/maintenance/mtcAlarm.h @@ -30,6 +30,7 @@ typedef enum MTC_ALARM_ID__ENABLE, MTC_ALARM_ID__BM, MTC_ALARM_ID__CH_COMP, /* Combo Host Compute Failure - on last Controller */ + MTC_ALARM_ID__LUKS, MTC_LOG_ID__EVENT, MTC_LOG_ID__COMMAND, diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index 41e37213..76858273 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -771,6 +771,11 @@ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int /* Insert the mtce flags */ msg.parm[MTC_PARM_FLAGS_IDX] = 0 ; + + //Check if LUKS FS manager service is active + int exitstatus = system("cryptsetup status luks_encrypted_vault"); + if ( 0 != exitstatus ) + msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__LUKS_VOL_FAILED ; if ( daemon_is_file_present ( CONFIG_COMPLETE_FILE ) ) msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_CONFIGURED ; if ( daemon_is_file_present ( CONFIG_FAIL_FILE ) ) diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index b7b6e4f9..f18efc23 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -406,6 +406,8 @@ static int mtc_config_handler ( void * user, mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value); else if (MATCH("agent", "ar_heartbeat_threshold")) mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value); + else if (MATCH("agent", "ar_luks_threshold")) + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__LUKS] = atoi(value); else if 
(MATCH("agent", "ar_config_interval")) mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value); @@ -415,6 +417,8 @@ static int mtc_config_handler ( void * user, mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value); else if (MATCH("agent", "ar_heartbeat_interval")) mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value); + else if (MATCH("agent", "ar_luks_interval")) + mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__LUKS] = atoi(value); else @@ -757,6 +761,9 @@ int daemon_configure ( void ) ilog("AR Heartbeat: %d (threshold) %d sec (retry interval)", mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT], mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HEARTBEAT]); + ilog("AR luks : %d (threshold) %d sec (retry interval)", + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__LUKS], + mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__LUKS]); /* Get this Controller Activity State */ mtc_config.active = daemon_get_run_option ("active") ; diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index bf3569f9..1486078d 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1099,8 +1099,27 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hbsClient_ready = false ; mtcTimer_reset ( node_ptr->mtcTimer ); + /* Check for LUKS volume availability */ + if ( node_ptr->mtce_flags & MTC_FLAG__LUKS_VOL_FAILED ) + { + elog ("%s LUKS volume failure (oob:%x)\n", + node_ptr->hostname.c_str(), + node_ptr->mtce_flags) + + /* raise an alarm for the failure of the config */ + alarm_luks_failure ( node_ptr ); + + mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_CONFIG_FAIL ); + enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__LUKS, + MTC_TASK_AR_DISABLED_LUKS ) != PASS ) + break ; + } /* Check to see if the host is/got configured correctly */ - if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) || + else if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) || (( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY ))) { elog ("%s configuration failed or incomplete (oob:%x)\n", @@ -6341,7 +6360,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) (( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG)) || ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_GOENABLE))|| ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_SERVICES))|| - ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT)))) + ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT))|| + (!node_ptr->task.compare(MTC_TASK_AR_DISABLED_LUKS)))) { if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG )) { @@ -6362,6 +6382,11 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) { node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__HEARTBEAT ; } + else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_LUKS )) + { + node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__LUKS ; + alarm_luks_failure ( node_ptr ); + } node_ptr->ar_disabled = true ; if ( THIS_HOST ) @@ -7949,10 +7974,19 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) { /* clear the SM degrade flag */ node_ptr->degrade_mask &= ~DEGRADE_MASK_SM ; - ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str()); } + /* In-service luks volume config failure handling */ + if ( !(node_ptr->mtce_flags & MTC_FLAG__LUKS_VOL_FAILED)) + { + alarm_luks_clear ( node_ptr ); + } + else + { + alarm_luks_failure ( node_ptr 
+            alarm_luks_failure ( node_ptr );
+        }
+
         /*
          * In-service Config Failure/Alarm handling
          */
diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf
index d029fc4c..74b56206 100644
--- a/mtce/src/scripts/mtc.conf
+++ b/mtce/src/scripts/mtc.conf
@@ -56,6 +56,7 @@
 ar_config_threshold = 2
 ar_goenable_threshold = 2
 ar_hostservices_threshold = 2
 ar_heartbeat_threshold = 2
+ar_luks_threshold = 2
 
 ; Service specific Auto Recovery retry interval.
 ;
@@ -68,6 +69,7 @@
 ar_config_interval = 30
 ar_goenable_interval = 30
 ar_hostservices_interval = 30
 ar_heartbeat_interval = 600
+ar_luks_interval = 30
 
 api_retries = 10 ; number of API retries b4 failure
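Verification note (follows from the test plan above): on a host where
the volume is unavailable, 'cryptsetup status luks_encrypted_vault'
exits non-zero and 'fm alarm-list' should show alarm 200.016 at
critical severity. Once luks-fs-mgr restores the volume, the in-service
test handler clears the alarm and the DEGRADE_MASK_LUKS degrade flag,
while a host that already failed enable and exhausted the
ar_luks_threshold retries needs a lock/unlock to recover.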