Failure case handling of LUKS service

The luks-fs-mgr service creates and unseals the LUKS volume used to
store keys/secrets. This change handles the failure case where this
essential service is inactive. It introduces an alarm, LUKS_ALARM_ID,
which is raised if the service is inactive, since that implies there is
an issue in creating or unsealing the LUKS volume.

Test Plan:
PASS: build-pkgs -c -p mtce-common
PASS: build-pkgs -c -p mtce
PASS: build-image
PASS: AIO-SX bootstrap with luks volume status active
PASS: AIO-DX bootstrap with volume status active
PASS: Standard setup with 2 controllers and 1 compute node with luks
      volume status active. There should not be any alarm and node
      status should be unlocked/enabled/available.
PASS: AIO-DX node enable failure on the controller where luks volume
      is inactive. Node availability should be failed. A critical
      alarm with id 200.016 should be displayed with 'fm alarm-list'
PASS: AIO-SX node enable failure on the controller-0. Node availability
      should be failed. A critical alarm with id 200.016 should be
      displayed with 'fm alarm-list'
PASS: Standard- node enable failure on the node (controller-0,
      controller-1, storage-0, compute-1). Node availability
      should be failed. A critical alarm with id 200.016 should be
      displayed with 'fm alarm-list' for the failed host.
PASS: AIO-DX In service volume inactive should be detected and a
      critical alarm should be raised with ID 200.016. Node
      availability should be changed to degraded.
PASS: AIO-SX In service volume inactive status should be detected
      and a critical alarm should be raised with ID 200.016. Node
      availability should be changed to degraded.
PASS: Standard ( 2 controller, 1 storage, 1 compute) In service
      volume inactive status should be detected and a
      critical alarm should be raised with ID 200.016. Node
      availability should be changed to degraded.
PASS: AIO-DX In service: If volume becomes active and a LUKS alarm
      is active, alarm should be cleared. Node availability should
      be changed to available.
PASS: AIO-SX In service: If volume becomes active and a LUKS alarm is
      active, alarm should be cleared. Node availability should be
      changed to available.
PASS: Standard ( 2 controller, 1 storage, 1 compute) In service:
      If volume becomes active and a LUKS alarm is active, alarm
      should be cleared. Node availability should be changed to
      available.
PASS: AIO-SX, AIO-DX, Standard- If intest fails and node availability
      is 'failed'. After fixing the volume issue, a lock/unlock should
      make the node available.

Story: 2010872
Task: 49108

Change-Id: I4621e7c546078c3cc22fe47079ba7725fbea5c8f
Signed-off-by: Jagatguru Prasad Mishra <jagatguruprasad.mishra@windriver.com>
This commit is contained in:
Jagatguru Prasad Mishra 2023-11-20 06:41:16 -05:00
parent 79d8644b1e
commit 1210ed450a
11 changed files with 125 additions and 4 deletions

View File

@ -41,6 +41,7 @@
#define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */
#define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */
#define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */
#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */
#define EVENT_LOG_ID ((const char *)"200.020")
#define COMMAND_LOG_ID ((const char *)"200.021")

View File

@ -86,6 +86,7 @@ void daemon_exit ( void );
#define MTC_FLAG__SM_DEGRADED (0x00000080)
#define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */
#define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */
#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400)
#define MTC_FLAG__SM_UNHEALTHY (0x00001000)
#define MTC_UNHEALTHY_THRESHOLD (3)
@ -289,6 +290,7 @@ typedef enum
#define MTC_TASK_AR_DISABLED_SERVICES "Service Failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_AR_DISABLED_ENABLE "Enable Failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_AR_DISABLED_HEARTBEAT "Heartbeat Failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_AR_DISABLED_LUKS "LUKS volume failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_RESET_FAIL "Reset Failed"
#define MTC_TASK_RESET_QUEUE "Reset Failed, retrying (%d of %d)"
@ -1020,7 +1022,7 @@ string get_configStages_str ( mtc_configStages_enum stage );
#define DEGRADE_MASK_CONFIG 0x00000400
#define DEGRADE_MASK_COLLECTD 0x00000800
#define DEGRADE_MASK_ENABLE 0x00001000
#define DEGRADE_MASK_RES4 0x00002000
#define DEGRADE_MASK_LUKS 0x00002000
#define DEGRADE_MASK_RES5 0x00004000
#define DEGRADE_MASK_RES6 0x00008000
@ -1261,6 +1263,7 @@ typedef enum
MTC_AR_DISABLE_CAUSE__GOENABLE,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_AR_DISABLE_CAUSE__HEARTBEAT,
MTC_AR_DISABLE_CAUSE__LUKS,
MTC_AR_DISABLE_CAUSE__LAST,
MTC_AR_DISABLE_CAUSE__NONE,
} autorecovery_disable_cause_enum ;

View File

@ -39,6 +39,7 @@
#define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */
#define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */
#define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */
#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */
#define EVENT_LOG_ID ((const char *)"200.020")
#define COMMAND_LOG_ID ((const char *)"200.021")

View File

@ -1640,6 +1640,41 @@ int nodeLinkClass::lazy_graceful_fs_reboot ( struct nodeLinkClass::node * node_p
return (FAIL);
}
/* Flag a LUKS volume failure against the specified host.
 *
 * Sets the LUKS bit in the host's degrade mask and, if the locally
 * tracked LUKS alarm severity is not already critical, logs the
 * failure, requests a critical alarm through mtcAlarm_critical and
 * records the new severity in the host's alarm table.
 *
 * Always returns PASS. */
int nodeLinkClass::alarm_luks_failure ( struct nodeLinkClass::node * node_ptr )
{
    /* OR-ing the bit in leaves the mask unchanged when it is already set */
    node_ptr->degrade_mask |= DEGRADE_MASK_LUKS ;

    /* Already tracked as critical ; avoid a duplicate log and alarm request */
    if ( node_ptr->alarms[MTC_ALARM_ID__LUKS] == FM_ALARM_SEVERITY_CRITICAL )
        return (PASS);

    elog ("%s critical luks filesystem config failure\n", node_ptr->hostname.c_str());
    mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__LUKS );
    node_ptr->alarms[MTC_ALARM_ID__LUKS] = FM_ALARM_SEVERITY_CRITICAL ;
    return (PASS);
}
/* Withdraw the LUKS volume failure condition from the specified host.
 *
 * Removes the LUKS bit from the host's degrade mask and, if the locally
 * tracked LUKS alarm severity is anything other than clear, logs the
 * event, requests the alarm clear through mtcAlarm_clear and records
 * the clear severity in the host's alarm table.
 *
 * Always returns PASS. */
int nodeLinkClass::alarm_luks_clear ( struct nodeLinkClass::node * node_ptr )
{
    /* Masking the bit off leaves the mask unchanged when it is not set */
    node_ptr->degrade_mask &= ~DEGRADE_MASK_LUKS ;

    /* Nothing tracked as raised ; avoid a redundant log and clear request */
    if ( node_ptr->alarms[MTC_ALARM_ID__LUKS] == FM_ALARM_SEVERITY_CLEAR )
        return (PASS);

    ilog ("%s luks config alarm clear\n", node_ptr->hostname.c_str());
    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__LUKS );
    node_ptr->alarms[MTC_ALARM_ID__LUKS] = FM_ALARM_SEVERITY_CLEAR ;
    return (PASS);
}
/* Generate a log and a critical alarm if the node config failed */
int nodeLinkClass::alarm_config_failure ( struct nodeLinkClass::node * node_ptr )

View File

@ -1106,6 +1106,9 @@ private:
int alarm_config_clear ( struct nodeLinkClass::node * node_ptr );
int alarm_config_failure ( struct nodeLinkClass::node * node_ptr );
int alarm_luks_clear ( struct nodeLinkClass::node * node_ptr );
int alarm_luks_failure ( struct nodeLinkClass::node * node_ptr );
int alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, bool force );
int alarm_compute_failure ( struct nodeLinkClass::node * node_ptr , EFmAlarmSeverityT sev );

View File

@ -184,6 +184,34 @@ void mtcAlarm_init ( void )
"and Switch Activity (Swact) to it as soon as possible. If the alarm "
"persists then Lock/Unlock host to recover its local compute service.");
/** LUKS volume config failure Alarm Entry *************************************/
ptr = &alarm_list[MTC_ALARM_ID__LUKS];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", LUKS_ALARM_ID);
ptr->name = "LUKS volume failure" ;
ptr->instc_prefix = "" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason = "LUKS volume is not active or functioning properly.";
ptr->clear_reason = "'LUKS volume' has been successfully unsealed and service is functioning properly.";
ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL;
ptr->alarm.probable_cause = FM_ALARM_APP_SUBSYS_FAILURE ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_TRUE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If this alarm does not automatically clear after some time and "
"continues to be asserted after Host is locked and unlocked then "
"contact next level of support for root cause analysis and recovery.");
/** Init Event Log Entry *************************************************/
ptr = &alarm_list[MTC_LOG_ID__EVENT];
@ -315,6 +343,7 @@ string _getIdentity ( mtc_alarm_id_enum id )
case MTC_ALARM_ID__ENABLE: return (ENABLE_ALARM_ID);
case MTC_ALARM_ID__BM: return (BM_ALARM_ID);
case MTC_ALARM_ID__CH_COMP: return (CH_COMP_ALARM_ID);
case MTC_ALARM_ID__LUKS: return (LUKS_ALARM_ID);
case MTC_LOG_ID__EVENT: return (EVENT_LOG_ID);
case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID);
case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID);

View File

@ -30,6 +30,7 @@ typedef enum
MTC_ALARM_ID__ENABLE,
MTC_ALARM_ID__BM,
MTC_ALARM_ID__CH_COMP, /* Combo Host Compute Failure - on last Controller */
MTC_ALARM_ID__LUKS,
MTC_LOG_ID__EVENT,
MTC_LOG_ID__COMMAND,

View File

@ -771,6 +771,11 @@ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int
/* Insert the mtce flags */
msg.parm[MTC_PARM_FLAGS_IDX] = 0 ;
//Check if LUKS FS manager service is active
int exitstatus = system("cryptsetup status luks_encrypted_vault");
if ( 0 != exitstatus )
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__LUKS_VOL_FAILED ;
if ( daemon_is_file_present ( CONFIG_COMPLETE_FILE ) )
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_CONFIGURED ;
if ( daemon_is_file_present ( CONFIG_FAIL_FILE ) )

View File

@ -406,6 +406,8 @@ static int mtc_config_handler ( void * user,
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
else if (MATCH("agent", "ar_heartbeat_threshold"))
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
else if (MATCH("agent", "ar_luks_threshold"))
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__LUKS] = atoi(value);
else if (MATCH("agent", "ar_config_interval"))
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value);
@ -415,6 +417,8 @@ static int mtc_config_handler ( void * user,
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
else if (MATCH("agent", "ar_heartbeat_interval"))
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
else if (MATCH("agent", "ar_luks_interval"))
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__LUKS] = atoi(value);
else
@ -757,6 +761,9 @@ int daemon_configure ( void )
ilog("AR Heartbeat: %d (threshold) %d sec (retry interval)",
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT],
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HEARTBEAT]);
ilog("AR luks : %d (threshold) %d sec (retry interval)",
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__LUKS],
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__LUKS]);
/* Get this Controller Activity State */
mtc_config.active = daemon_get_run_option ("active") ;

View File

@ -1099,8 +1099,27 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->hbsClient_ready = false ;
mtcTimer_reset ( node_ptr->mtcTimer );
/* Check for LUKS volume availability */
if ( node_ptr->mtce_flags & MTC_FLAG__LUKS_VOL_FAILED )
{
elog ("%s LUKS volume failure (oob:%x)\n",
node_ptr->hostname.c_str(),
node_ptr->mtce_flags)
/* raise an alarm for the failure of the config */
alarm_luks_failure ( node_ptr );
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_CONFIG_FAIL );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__LUKS,
MTC_TASK_AR_DISABLED_LUKS ) != PASS )
break ;
}
/* Check to see if the host is/got configured correctly */
if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
else if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
(( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
{
elog ("%s configuration failed or incomplete (oob:%x)\n",
@ -6341,7 +6360,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
(( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG)) ||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_GOENABLE))||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_SERVICES))||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT))))
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT))||
(!node_ptr->task.compare(MTC_TASK_AR_DISABLED_LUKS))))
{
if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG ))
{
@ -6362,6 +6382,11 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
{
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__HEARTBEAT ;
}
else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_LUKS ))
{
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__LUKS ;
alarm_luks_failure ( node_ptr );
}
node_ptr->ar_disabled = true ;
if ( THIS_HOST )
@ -7949,10 +7974,19 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
{
/* clear the SM degrade flag */
node_ptr->degrade_mask &= ~DEGRADE_MASK_SM ;
ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
}
/* In-service luks volume config failure handling */
if ( !(node_ptr->mtce_flags & MTC_FLAG__LUKS_VOL_FAILED))
{
alarm_luks_clear ( node_ptr );
}
else
{
alarm_luks_failure ( node_ptr );
}
/*
* In-service Config Failure/Alarm handling
*/

View File

@ -56,6 +56,7 @@ ar_config_threshold = 2
ar_goenable_threshold = 2
ar_hostservices_threshold = 2
ar_heartbeat_threshold = 2
ar_luks_threshold = 2
; Service specific Auto Recovery retry interval.
;
@ -68,6 +69,7 @@ ar_config_interval = 30
ar_goenable_interval = 30
ar_hostservices_interval = 30
ar_heartbeat_interval = 600
ar_luks_interval = 30
api_retries = 10 ; number of API retries b4 failure