Add SysRq crash dump support for pmon quorum health messaging loss

The hostwd process supports failure handling for two pmon
quorum failure modes.
 1. persistent pmon quorum process failure
 2. persistent absence of pmon's quorum health report

This update adds a new configuration option and associated
implementation required to force a crash dump action for
failure mode 2 above.

This means that if the Process Monitor itself stalls or stops
running for 3 minutes (the default configuration), hostwd will
trigger a SysRq to force a crash dump.

Test Plan:

PASS: Verify kdump for pmon quorum health report message loss
PASS: Verify no kdump when kdump_on_stall is disabled
PASS: Verify handling when kdump service is not active
PASS: Verify sighup config change detection and handling

Regression:

PASS: Verify softdog timeout handling and logs
PASS: Verify quorum threshold config change and handling
PASS: Verify handling with reboot/reset recovery methods disabled
PASS: Verify enable reboot_on_err config change handling
PASS: Verify reboot/reset actions are ignored while host is locked
PASS: Verify pmon failure recovery handling before threshold reached

Change-Id: Id926447574e02013f83c0170784e2a8f9a46bac1
Partial-Bug: 1894889
Depends-On: https://review.opendev.org/#/c/750806
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-11-12 15:39:52 -05:00
parent 3a6fec50c1
commit 2fc05673d1
6 changed files with 185 additions and 18 deletions

View File

@ -117,6 +117,7 @@ typedef struct
int api_retries ; /**< api retries before failure */
int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */
bool hostwd_reboot_on_err ; /**< should hostwd reboot on fault detected */
bool hostwd_kdump_on_stall ; /**< sysrq crash dump on quorum msg'ing stall */
bool hostwd_use_kern_wd ; /**< use the kernel watchdog for extra safety */
bool need_clstr_poll_audit ; /**< true if we need to poll for clstr */
char *hostwd_console_path ; /**< console on which to log extreme events */

View File

@ -49,6 +49,8 @@ void daemon_config_default ( daemon_config_type* config_ptr )
config_ptr->multicast = strdup("none");
config_ptr->barbican_api_host = strdup("none");
config_ptr->hostwd_kdump_on_stall = 0 ;
config_ptr->debug_all = 0 ;
config_ptr->debug_json = 0 ;
config_ptr->debug_timer = 0 ;

View File

@ -57,8 +57,16 @@ using namespace std;
#include "hostwMsg.h" /* message format */
/* Configuration Files */
#define HOSTWD_CONFIG_FILE ((const char *)"/etc/mtc/hostwd.conf")
#define PMOND_CONFIG_FILE ((const char *)"/etc/mtc/pmond.conf")
#define HOSTWD_CONFIG_FILE ((const char * const)"/etc/mtc/hostwd.conf")
#define PMOND_CONFIG_FILE ((const char * const)"/etc/mtc/pmond.conf")
/* kernel SysRq control interface file used to enable the magic SysRq key.
* Used to ensure that the SysRq service is enabled when needed. */
#define SYSRQ_CONTROL_INTERFACE ((const char * const)"/proc/sys/kernel/sysrq")
/* kernel SysRq trigger interface file used to trigger a SysRq command.
* commands @ https://www.kernel.org/doc/html/latest/admin-guide/sysrq.html */
#define SYSRQ_COMMAND_INTERFACE ((const char * const)"/proc/sysrq-trigger")
#define HOSTW_MIN_KERN_UPDATE_PERIOD 5 /* user can set how long until kernel
* watchdog panics, down to this
@ -109,6 +117,17 @@ typedef struct
struct sigaction prev ; /**< Action handler that was replaced */
/**< This is put back on the exit */
bool quorum_failed ; /** When true gates repeated failure handling */
/* Crash Dump Support */
/* ------------------ */
bool kdump_supported ; /** State of kdump support; not supported default.
* kdump support is queried through systemctl
* on process startup when this is updated to
* true if supported */
int fd_sysrq_enable ; /** SysRq magic key file descriptor */
int fd_sysrq_command ; /** SysRq command interface file descriptor */
} hostw_ctrl_type ;
/** Daemon Service messaging socket control structure */

View File

@ -38,7 +38,7 @@
int hostw_service_command ( hostw_socket_type * hostw_socket );
static void fork_hostwd_logger ( void );
void fork_hostwd_logger ( void );
char my_hostname [MAX_HOST_NAME_SIZE+1];
/* Push daemon state to log file */
@ -69,6 +69,130 @@ void hostwTimer_handler ( int sig, siginfo_t *si, void *uc)
mtcTimer_stop_tid_int_safe ( tid_ptr );
}
/***************************************************************************
 *
 * Name       : get_kdump_support
 *
 * Purpose    : Query the state of the kdump service
 *
 * Description: Runs 'systemctl is-active kdump'. If the reply starts with
 *              "active" the SysRq control and SysRq trigger interface
 *              files are opened for writing and the descriptors are
 *              stored in the hostwd control structure.
 *
 * Updates    : kdump_supported default of false is updated to true only
 *              if the kdump service query indicates that kdump is active
 *              and both SysRq interface files opened successfully.
 *              If either open fails, any descriptor that did open is
 *              closed and both descriptors are left at -1.
 *
 **************************************************************************/
void get_kdump_support ( void )
{
    /* Start from an empty string so that a query which writes nothing
     * into the buffer is seen as "no response" rather than whatever
     * happened to be on the stack. */
    char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] = { 0 } ;

    execute_pipe_cmd ( "/usr/bin/systemctl is-active kdump",
                       &pipe_cmd_output[0],
                       PIPE_COMMAND_RESPON_LEN );

    if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) == 0 )
    {
        elog("kdump status query failed ; assuming kdump is inactive");
        return ;
    }

    if ( strncmp (&pipe_cmd_output[0], "active", strlen("active")) != 0 )
    {
        ilog("kdump is inactive (%s)", pipe_cmd_output);
        return ;
    }

    hostw_ctrl_type * ctrl_ptr = get_ctrl_ptr() ;
    ctrl_ptr->fd_sysrq_enable = open(SYSRQ_CONTROL_INTERFACE,O_WRONLY);
    ctrl_ptr->fd_sysrq_command= open(SYSRQ_COMMAND_INTERFACE,O_WRONLY);

    /* open() reports failure with -1 and 0 is a legal descriptor, so the
     * descriptors must be range checked rather than tested for non-zero. */
    if (( ctrl_ptr->fd_sysrq_enable  >= 0 ) &&
        ( ctrl_ptr->fd_sysrq_command >= 0 ))
    {
        ilog("kdump is active");
        ctrl_ptr->kdump_supported = true ;
        return ;
    }

    ilog("kdump service setup failed ; %d:%d:%s",
          ctrl_ptr->fd_sysrq_enable,
          ctrl_ptr->fd_sysrq_command,
          pipe_cmd_output);

    /* Do not leak the descriptor that did open when the other one failed */
    if ( ctrl_ptr->fd_sysrq_enable >= 0 )
    {
        close ( ctrl_ptr->fd_sysrq_enable );
        ctrl_ptr->fd_sysrq_enable = -1 ;
    }
    if ( ctrl_ptr->fd_sysrq_command >= 0 )
    {
        close ( ctrl_ptr->fd_sysrq_command );
        ctrl_ptr->fd_sysrq_command = -1 ;
    }
}
/***************************************************************************
 *
 * Name       : force_crashdump
 *
 * Purpose    : Force a crash dump via SysRq command 'c'
 *
 * Description: Does nothing unless the hostwd_kdump_on_stall option is
 *              non-zero and kdump_supported is true. Otherwise writes '1'
 *              to the SysRq enable descriptor and then 'c' to the SysRq
 *              command descriptor, logging an error if either write
 *              returns zero or less.
 *
 * Warning    : Host will reset immediately, without graceful shutdown.
 *
 **************************************************************************/
void force_crashdump ( void )
{
    hostw_ctrl_type * ctrl_ptr = get_ctrl_ptr() ;

    /* Bail out when crash dump is disabled by config or not supported */
    if ( daemon_get_cfg_ptr()->hostwd_kdump_on_stall == 0 )
        return ;
    if ( ctrl_ptr->kdump_supported == false )
        return ;

    /* '1' enables all functions of sysrq */
    static char enable_all_sysrq = '1' ;

    /* 'c' is the SysRq crash dump command */
    static char crash_dump_request = 'c' ;

    /* Step 1: make sure the magic SysRq service is enabled */
    int written = write(ctrl_ptr->fd_sysrq_enable, &enable_all_sysrq, 1 );
    if ( written <= 0 )
    {
        elog("SysRq Enable failed (%d:%d:%s)", written, errno, strerror(errno) );
        return ;
    }

    /* Step 2: issue the crash dump command */
    written = write(ctrl_ptr->fd_sysrq_command, &crash_dump_request, 1);
    if ( written <= 0 )
    {
        elog("SysRq command failed (%d:%d:%s)",
              written, errno, strerror(errno) );
    }

    /* A successful command write is not expected to return here */
}
/***************************************************************************
*
* Name : manage_quorum_failed
*
* Purpose : permit recovery
*
* Description: If called while none of the reboot or sysRq reset failure
* recovery options are enabled then we are not going for a
* reboot or reset so allow recovery.
*
**************************************************************************/
void manage_quorum_failed ( void )
{
hostw_ctrl_type * ctrl_ptr = get_ctrl_ptr() ;
daemon_config_type* config_ptr = daemon_get_cfg_ptr() ;
if ((( config_ptr->hostwd_kdump_on_stall == 0 ) ||
( ctrl_ptr->kdump_supported == false )) &&
( config_ptr->hostwd_reboot_on_err == 0 ))
{
/* If we are not going for a reboot or reset then allow recovery
* by clearing the control boolean that prevents recovery. */
ilog ("Quorum failed but all reboot/reset recovery options disabled");
ilog ("... allowing auto recovery");
ctrl_ptr->quorum_failed = 0 ;
ctrl_ptr->pmon_grace_loops = config_ptr->hostwd_failure_threshold ;
}
}
/**
* This is the main loop of the program
*
@ -94,6 +218,9 @@ void hostw_service ( void )
daemon_config_type *config = daemon_get_cfg_ptr ();
get_hostname (&my_hostname[0], MAX_HOST_NAME_SIZE );
get_kdump_support(); /* query for kdump support */
mtcTimer_init ( pmonTimer, my_hostname, "pmon" );
mtcTimer_start( pmonTimer, hostwTimer_handler, config->hostwd_update_period);
@ -108,7 +235,7 @@ void hostw_service ( void )
socks.sort();
ilog("Host Watchdog Service running\n");
for ( int quorum_failed = 0 ; ; )
for ( ; ; )
{
timeout.tv_sec =1; /* 1 second select ; pet watchdog every second */
timeout.tv_usec=0;
@ -158,7 +285,7 @@ void hostw_service ( void )
pmonTimer.ring = false ;
}
}
else if ( quorum_failed == 0 )
else if ( ctrl->quorum_failed == 0 )
{
if (FD_ISSET(hostw_socket->status_sock, &(hostw_socket->readfds)))
{
@ -177,12 +304,12 @@ void hostw_service ( void )
}
}
else if ( rc != RETRY )
quorum_failed++ ;
ctrl->quorum_failed++ ;
}
}
if ( 0 >= ctrl->pmon_grace_loops )
{
if ( quorum_failed++ == 0 )
if ( ctrl->quorum_failed++ == 0 )
{
if (daemon_is_file_present(NODE_LOCKED_FILE))
{
@ -192,12 +319,19 @@ void hostw_service ( void )
}
else
{
/* force a crash dump if that feature is enabled */
force_crashdump();
emergency_log( "*** Host watchdog (hostwd) not receiving messages "
"from PMON ***\n");
"from PMON ***\n");
hostw_log_and_reboot();
}
}
}
if ( ctrl->quorum_failed )
manage_quorum_failed();
daemon_signal_hdlr ();
}
}
@ -269,8 +403,11 @@ void hostw_log_and_reboot()
emergency_log ("*** Host Watchdog declaring system unhealthy ***\n");
/* start the process to log as much data as possible */
fork_hostwd_logger ();
/* Start the process to log as much data as possible */
/* NOTE: This function currently does not do anything so it's commented
* out for now. Uncomment when actual value add logging is implemented.
fork_hostwd_logger (); */
if (config->hostwd_reboot_on_err) {
emergency_log ("*** Initiating reboot ***\n");
@ -288,7 +425,7 @@ void hostw_log_and_reboot()
* Initiate the thread which logs as much information about the system
* as possible.
*/
static void fork_hostwd_logger ( void )
void fork_hostwd_logger ( void )
{
int parent = double_fork ();
if (0 > parent) /* problem forking */

View File

@ -48,16 +48,10 @@ daemon_config_type * daemon_get_cfg_ptr ( void )
/* Cleanup exit handler */
void daemon_exit ( void )
{
int rc ;
char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ;
hostw_socket_type * hostw_socket = hostw_getSock_ptr();
rc = execute_pipe_cmd ( "systemctl is-system-running", &pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN );
if ( rc != 0 )
{
elog ("call to 'systemctl is-system-running' failed (%d:%d:%m)\n", rc, errno );
}
execute_pipe_cmd ( "systemctl is-system-running", &pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN );
if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) > 0 )
{
ilog ("systemctl is-system-running result: <%s>\n", pipe_cmd_output );
@ -102,9 +96,12 @@ int hostw_process_config ( void * user,
if (MATCH("config", "hostwd_failure_threshold"))
{
int previous = config_ptr->hostwd_failure_threshold ;
config_ptr->hostwd_failure_threshold = atoi(value);
config_ptr->mask |= CONFIG_HOSTWD_FAILURE_THRESHOLD ;
ilog("Quorum Thld : %d", config_ptr->hostwd_failure_threshold);
if ( config_ptr->hostwd_failure_threshold != previous )
hostw_ctrl.pmon_grace_loops = config_ptr->hostwd_failure_threshold;
}
else if (MATCH("config", "hostwd_reboot_on_err"))
{
@ -113,6 +110,14 @@ int hostw_process_config ( void * user,
ilog("Quorum Loss : %s",
config_ptr->hostwd_reboot_on_err ? "Reboot & Log" : "Log Only");
}
else if (MATCH("config", "hostwd_kdump_on_stall"))
{
config_ptr->hostwd_kdump_on_stall = atoi(value);
string state = "enabled" ;
if ( config_ptr->hostwd_kdump_on_stall == 0 )
state = "disabled" ;
ilog("Crash Dump : %s (%d)", state.c_str(), config_ptr->hostwd_kdump_on_stall );
}
else if (MATCH("config", "hostwd_use_kern_wd"))
{
config_ptr->hostwd_use_kern_wd = atoi(value);

View File

@ -2,10 +2,13 @@
[config] ; Configuration
hostwd_reboot_on_err = 1 ; host watchdog to reboot on detected failure
hostwd_kdump_on_stall = 1 ; issue a sysrq crash dump on quorum msg'ing stall
; - missing pmond quorum reports
hostwd_failure_threshold = 3 ; number of missed messages before action is taken
hostwd_use_kern_wd = 1 ; use kernel /dev/watchdog as backup watchdog
hostwd_console_path = /dev/console ; console on which to log extreme events, like
; notification of reboot
[timeouts]
kernwd_update_period = 300 ; timeout until kernel resets system due to dead
; hostwd process (kernel watchdog)