diff --git a/mtce-common/src/common/alarmUtil.h b/mtce-common/src/common/alarmUtil.h
index e4aebfca..c2f7faed 100644
--- a/mtce-common/src/common/alarmUtil.h
+++ b/mtce-common/src/common/alarmUtil.h
@@ -41,6 +41,7 @@
 #define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */
 #define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */
 #define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */
+#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */
 
 #define EVENT_LOG_ID ((const char *)"200.020")
 #define COMMAND_LOG_ID ((const char *)"200.021")
diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h
index b6028fa8..3dabad0d 100755
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@@ -86,6 +86,7 @@ void daemon_exit ( void );
 #define MTC_FLAG__SM_DEGRADED (0x00000080)
 #define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */
 #define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */
+#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400)
 #define MTC_FLAG__SM_UNHEALTHY (0x00001000)
 
 #define MTC_UNHEALTHY_THRESHOLD (3)
@@ -289,6 +290,7 @@ typedef enum
 #define MTC_TASK_AR_DISABLED_SERVICES "Service Failure, threshold reached, Lock/Unlock to retry"
 #define MTC_TASK_AR_DISABLED_ENABLE "Enable Failure, threshold reached, Lock/Unlock to retry"
 #define MTC_TASK_AR_DISABLED_HEARTBEAT "Heartbeat Failure, threshold reached, Lock/Unlock to retry"
+#define MTC_TASK_AR_DISABLED_LUKS "LUKS volume failure, threshold reached, Lock/Unlock to retry"
 
 #define MTC_TASK_RESET_FAIL "Reset Failed"
 #define MTC_TASK_RESET_QUEUE "Reset Failed, retrying (%d of %d)"
@@ -1020,7 +1022,7 @@ string get_configStages_str ( mtc_configStages_enum stage );
 #define DEGRADE_MASK_CONFIG 0x00000400
 #define DEGRADE_MASK_COLLECTD 0x00000800
 #define DEGRADE_MASK_ENABLE 0x00001000
-#define DEGRADE_MASK_RES4 0x00002000
+#define DEGRADE_MASK_LUKS 0x00002000
 #define DEGRADE_MASK_RES5 0x00004000
 #define DEGRADE_MASK_RES6 0x00008000
 
@@ -1261,6 +1263,7 @@ typedef enum
     MTC_AR_DISABLE_CAUSE__GOENABLE,
     MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
     MTC_AR_DISABLE_CAUSE__HEARTBEAT,
+    MTC_AR_DISABLE_CAUSE__LUKS,
     MTC_AR_DISABLE_CAUSE__LAST,
     MTC_AR_DISABLE_CAUSE__NONE,
 } autorecovery_disable_cause_enum ;
diff --git a/mtce/src/alarm/alarm.h b/mtce/src/alarm/alarm.h
index 58e02e13..a9895ac4 100644
--- a/mtce/src/alarm/alarm.h
+++ b/mtce/src/alarm/alarm.h
@@ -39,6 +39,7 @@
 #define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */
 #define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */
 #define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */
+#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */
 
 #define EVENT_LOG_ID ((const char *)"200.020")
 #define COMMAND_LOG_ID ((const char *)"200.021")
diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp
index f8bee057..b39c8573 100755
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -1640,6 +1640,50 @@ int nodeLinkClass::lazy_graceful_fs_reboot ( struct nodeLinkClass::node * node_p
     return (FAIL);
 }
 
+/* Generate a log and a critical alarm if the LUKS volume config failed.
+ *
+ * Sets the host's DEGRADE_MASK_LUKS degrade bit. The log and the critical
+ * alarm are only generated when the host's recorded MTC_ALARM_ID__LUKS
+ * severity is not already critical, so repeat calls do not re-raise it.
+ * Always returns PASS. */
+int nodeLinkClass::alarm_luks_failure ( struct nodeLinkClass::node * node_ptr )
+{
+    if ( (node_ptr->degrade_mask & DEGRADE_MASK_LUKS) == 0 )
+    {
+        node_ptr->degrade_mask |= DEGRADE_MASK_LUKS ;
+    }
+
+    if ( node_ptr->alarms[MTC_ALARM_ID__LUKS] != FM_ALARM_SEVERITY_CRITICAL )
+    {
+        elog ("%s critical luks filesystem config failure\n", node_ptr->hostname.c_str());
+
+        mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__LUKS );
+        node_ptr->alarms[MTC_ALARM_ID__LUKS] = FM_ALARM_SEVERITY_CRITICAL ;
+    }
+    return (PASS);
+}
+
+/* Clear the luks alarm and degrade flag.
+ *
+ * Clears the host's DEGRADE_MASK_LUKS degrade bit. The log and the alarm
+ * clear are only generated when the host's recorded MTC_ALARM_ID__LUKS
+ * severity is not already clear. Always returns PASS. */
+int nodeLinkClass::alarm_luks_clear ( struct nodeLinkClass::node * node_ptr )
+{
+    if ( node_ptr->degrade_mask & DEGRADE_MASK_LUKS )
+    {
+        node_ptr->degrade_mask &= ~DEGRADE_MASK_LUKS ;
+    }
+
+    if ( node_ptr->alarms[MTC_ALARM_ID__LUKS] != FM_ALARM_SEVERITY_CLEAR )
+    {
+        ilog ("%s luks config alarm clear\n", node_ptr->hostname.c_str());
+
+        mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__LUKS );
+        node_ptr->alarms[MTC_ALARM_ID__LUKS] = FM_ALARM_SEVERITY_CLEAR ;
+    }
+    return (PASS);
+}
 
 /* Generate a log and a critical alarm if the node config failed */
 int nodeLinkClass::alarm_config_failure ( struct nodeLinkClass::node * node_ptr )
diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h
index 367bda87..5703911e 100755
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@@ -1106,6 +1106,9 @@ private:
     int alarm_config_clear ( struct nodeLinkClass::node * node_ptr );
     int alarm_config_failure ( struct nodeLinkClass::node * node_ptr );
 
+    int alarm_luks_clear ( struct nodeLinkClass::node * node_ptr );
+    int alarm_luks_failure ( struct nodeLinkClass::node * node_ptr );
+
     int alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, bool force );
     int alarm_compute_failure ( struct nodeLinkClass::node * node_ptr , EFmAlarmSeverityT sev );
 
diff --git a/mtce/src/maintenance/mtcAlarm.cpp b/mtce/src/maintenance/mtcAlarm.cpp
index 67c7e61e..be705d75 100644
--- a/mtce/src/maintenance/mtcAlarm.cpp
+++ b/mtce/src/maintenance/mtcAlarm.cpp
@@ -184,6 +184,34 @@ void mtcAlarm_init ( void )
              "and Switch Activity (Swact) to it as soon as possible. If the alarm "
              "persists then Lock/Unlock host to recover its local compute service.");
 
+    /** LUKS volume config failure Alarm Entry *************************************/
+
+    ptr = &alarm_list[MTC_ALARM_ID__LUKS];
+    memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
+    snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", LUKS_ALARM_ID);
+
+    ptr->name = "LUKS volume failure" ;
+    ptr->instc_prefix = "" ;
+
+    ptr->minor_reason =
+    ptr->major_reason =
+    ptr->critl_reason = "LUKS volume is not active or functioning properly.";
+    ptr->clear_reason = "'LUKS volume' has been successfully unsealed and service is functioning properly.";
+
+    ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL;
+    ptr->alarm.probable_cause = FM_ALARM_APP_SUBSYS_FAILURE ;
+    ptr->alarm.inhibit_alarms = FM_FALSE ;
+    ptr->alarm.service_affecting = FM_FALSE ;
+    ptr->alarm.suppression = FM_TRUE ;
+
+    ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
+    ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
+
+    snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
+              "If this alarm does not automatically clear after some time and "
+              "continues to be asserted after Host is locked and unlocked then "
+              "contact next level of support for root cause analysis and recovery.");
+
     /** Init Event Log Entry *************************************************/
 
     ptr = &alarm_list[MTC_LOG_ID__EVENT];
@@ -315,6 +343,7 @@ string _getIdentity ( mtc_alarm_id_enum id )
         case MTC_ALARM_ID__ENABLE: return (ENABLE_ALARM_ID);
         case MTC_ALARM_ID__BM: return (BM_ALARM_ID);
         case MTC_ALARM_ID__CH_COMP: return (CH_COMP_ALARM_ID);
+        case MTC_ALARM_ID__LUKS: return (LUKS_ALARM_ID);
         case MTC_LOG_ID__EVENT: return (EVENT_LOG_ID);
         case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID);
         case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID);
diff --git a/mtce/src/maintenance/mtcAlarm.h b/mtce/src/maintenance/mtcAlarm.h
index 3d998ff3..e0ddf87e 100644
--- a/mtce/src/maintenance/mtcAlarm.h
+++ b/mtce/src/maintenance/mtcAlarm.h
@@ -30,6 +30,7 @@ typedef enum
     MTC_ALARM_ID__ENABLE,
     MTC_ALARM_ID__BM,
     MTC_ALARM_ID__CH_COMP, /* Combo Host Compute Failure - on last Controller */
+    MTC_ALARM_ID__LUKS,
 
     MTC_LOG_ID__EVENT,
     MTC_LOG_ID__COMMAND,
diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp
index 41e37213..76858273 100755
--- a/mtce/src/maintenance/mtcCompMsg.cpp
+++ b/mtce/src/maintenance/mtcCompMsg.cpp
@@ -771,6 +771,15 @@ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int
 
     /* Insert the mtce flags */
     msg.parm[MTC_PARM_FLAGS_IDX] = 0 ;
+
+    /* Check if the LUKS encrypted volume is active ; a non-zero exit
+     * status from 'cryptsetup status' is reported as a LUKS volume failure.
+     * Command output is discarded ; only the exit status is of interest.
+     * NOTE(review): system() forks a shell each time a mtcAlive message is
+     * created ; consider a cheaper or cached check - confirm send rate. */
+    int exitstatus = system("cryptsetup status luks_encrypted_vault > /dev/null 2>&1");
+    if ( 0 != exitstatus )
+        msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__LUKS_VOL_FAILED ;
     if ( daemon_is_file_present ( CONFIG_COMPLETE_FILE ) )
         msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_CONFIGURED ;
     if ( daemon_is_file_present ( CONFIG_FAIL_FILE ) )
diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp
index b7b6e4f9..f18efc23 100644
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@@ -406,6 +406,8 @@ static int mtc_config_handler ( void * user,
         mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
     else if (MATCH("agent", "ar_heartbeat_threshold"))
         mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
+    else if (MATCH("agent", "ar_luks_threshold"))
+        mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__LUKS] = atoi(value);
 
     else if (MATCH("agent", "ar_config_interval"))
         mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value);
@@ -415,6 +417,8 @@ static int mtc_config_handler ( void * user,
         mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
     else if (MATCH("agent", "ar_heartbeat_interval"))
         mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
+    else if (MATCH("agent", "ar_luks_interval"))
+        mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__LUKS] = atoi(value);
 
     else
 
@@ -757,6 +761,9 @@ int daemon_configure ( void )
     ilog("AR Heartbeat: %d (threshold) %d sec (retry interval)",
           mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT],
           mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HEARTBEAT]);
+    ilog("AR luks : %d (threshold) %d sec (retry interval)",
+          mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__LUKS],
+          mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__LUKS]);
 
     /* Get this Controller Activity State */
     mtc_config.active = daemon_get_run_option ("active") ;
diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp
index bf3569f9..1486078d 100755
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@@ -1099,8 +1099,27 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
             node_ptr->hbsClient_ready = false ;
             mtcTimer_reset ( node_ptr->mtcTimer );
 
+            /* Check for LUKS volume availability */
+            if ( node_ptr->mtce_flags & MTC_FLAG__LUKS_VOL_FAILED )
+            {
+                elog ("%s LUKS volume failure (oob:%x)\n",
+                          node_ptr->hostname.c_str(),
+                          node_ptr->mtce_flags);
+
+                /* raise an alarm for the LUKS volume failure */
+                alarm_luks_failure ( node_ptr );
+
+                mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_CONFIG_FAIL );
+                enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
+
+                /* handle auto recovery for this failure */
+                if ( ar_manage ( node_ptr,
+                                 MTC_AR_DISABLE_CAUSE__LUKS,
+                                 MTC_TASK_AR_DISABLED_LUKS ) != PASS )
+                    break ;
+            }
             /* Check to see if the host is/got configured correctly */
-            if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
+            else if ((( !( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ))) ||
                 (( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
             {
                 elog ("%s configuration failed or incomplete (oob:%x)\n",
@@ -6341,7 +6360,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                  (( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG)) ||
                   ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_GOENABLE))||
                   ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_SERVICES))||
-                  ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT))))
+                  ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT))||
+                  ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_LUKS))))
             {
                 if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG ))
                 {
@@ -6362,6 +6382,11 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                 {
                     node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__HEARTBEAT ;
                 }
+                else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_LUKS ))
+                {
+                    node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__LUKS ;
+                    alarm_luks_failure ( node_ptr );
+                }
                 node_ptr->ar_disabled = true ;
 
                 if ( THIS_HOST )
@@ -7949,10 +7974,19 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
         {
             /* clear the SM degrade flag */
             node_ptr->degrade_mask &= ~DEGRADE_MASK_SM ;
-
             ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
         }
 
+        /* In-service luks volume config failure handling */
+        if ( !(node_ptr->mtce_flags & MTC_FLAG__LUKS_VOL_FAILED))
+        {
+            alarm_luks_clear ( node_ptr );
+        }
+        else
+        {
+            alarm_luks_failure ( node_ptr );
+        }
+
         /*
          * In-service Config Failure/Alarm handling
          */
diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf
index d029fc4c..74b56206 100644
--- a/mtce/src/scripts/mtc.conf
+++ b/mtce/src/scripts/mtc.conf
@@ -56,6 +56,7 @@ ar_config_threshold = 2
 ar_goenable_threshold = 2
 ar_hostservices_threshold = 2
 ar_heartbeat_threshold = 2
+ar_luks_threshold = 2
 
 ; Service specific Auto Recovery retry interval.
 ;
@@ -68,6 +69,7 @@ ar_config_interval = 30
 ar_goenable_interval = 30
 ar_hostservices_interval = 30
 ar_heartbeat_interval = 600
+ar_luks_interval = 30
 
 api_retries = 10 ; number of API retries b4 failure
 