diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index 645c3681..106a2c19 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -117,6 +117,7 @@ typedef struct int api_retries ; /**< api retries before failure */ int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */ bool hostwd_reboot_on_err ; /**< should hostwd reboot on fault detected */ + bool hostwd_kdump_on_stall ; /**< sysrq crash dump on quorum msg'ing stall */ bool hostwd_use_kern_wd ; /**< use the kernel watchdog for extra safety */ bool need_clstr_poll_audit ; /**< true if we need to poll for clstr */ char *hostwd_console_path ; /**< console on which to log extreme events */ diff --git a/mtce-common/src/daemon/daemon_config.cpp b/mtce-common/src/daemon/daemon_config.cpp index 88fc6a64..31272f9a 100644 --- a/mtce-common/src/daemon/daemon_config.cpp +++ b/mtce-common/src/daemon/daemon_config.cpp @@ -49,6 +49,8 @@ void daemon_config_default ( daemon_config_type* config_ptr ) config_ptr->multicast = strdup("none"); config_ptr->barbican_api_host = strdup("none"); + config_ptr->hostwd_kdump_on_stall = 0 ; + config_ptr->debug_all = 0 ; config_ptr->debug_json = 0 ; config_ptr->debug_timer = 0 ; diff --git a/mtce/src/hostw/hostw.h b/mtce/src/hostw/hostw.h index d4f32940..28d04adb 100755 --- a/mtce/src/hostw/hostw.h +++ b/mtce/src/hostw/hostw.h @@ -57,8 +57,16 @@ using namespace std; #include "hostwMsg.h" /* message format */ /* Configuration Files */ -#define HOSTWD_CONFIG_FILE ((const char *)"/etc/mtc/hostwd.conf") -#define PMOND_CONFIG_FILE ((const char *)"/etc/mtc/pmond.conf") +#define HOSTWD_CONFIG_FILE ((const char * const)"/etc/mtc/hostwd.conf") +#define PMOND_CONFIG_FILE ((const char * const)"/etc/mtc/pmond.conf") + +/* kernel SysRq control interface file used to enable the magic SysRq key. + * Used to ensure that the SysRq service is enabled when needed. */ +#define SYSRQ_CONTROL_INTERFACE ((const char * const)"/proc/sys/kernel/sysrq") + +/* kernel SysRq trigger interface file used to trigger a SysRq command. + * commands @ https://www.kernel.org/doc/html/latest/admin-guide/sysrq.html */ +#define SYSRQ_COMMAND_INTERFACE ((const char * const)"/proc/sysrq-trigger") #define HOSTW_MIN_KERN_UPDATE_PERIOD 5 /* user can set how long until kernel * watchdog panics, down to this @@ -109,6 +117,17 @@ typedef struct struct sigaction prev ; /**< Action handler that was replaced */ /**< This is put back on the exit */ + bool quorum_failed ; /** When true gates repeated failure handling */ + + /* Crash Dump Support */ + /* ------------------ */ + bool kdump_supported ; /** State of kdump support; not supported default. + * kdump support is queried through systemctl + * on process startup when this is updated to + * true if supported */ + int fd_sysrq_enable ; /** SysRq magic key file descriptor */ + int fd_sysrq_command ; /** SysRq command interface file descriptor */ + } hostw_ctrl_type ; /** Daemon Service messaging socket control structure */ diff --git a/mtce/src/hostw/hostwHdlr.cpp b/mtce/src/hostw/hostwHdlr.cpp index 13a6ca76..f1c21dd8 100644 --- a/mtce/src/hostw/hostwHdlr.cpp +++ b/mtce/src/hostw/hostwHdlr.cpp @@ -38,7 +38,7 @@ int hostw_service_command ( hostw_socket_type * hostw_socket ); -static void fork_hostwd_logger ( void ); +void fork_hostwd_logger ( void ); char my_hostname [MAX_HOST_NAME_SIZE+1]; /* Push daemon state to log file */ @@ -69,6 +69,130 @@ void hostwTimer_handler ( int sig, siginfo_t *si, void *uc) mtcTimer_stop_tid_int_safe ( tid_ptr ); } +/*************************************************************************** + * + * Name : get_kdump_support + * + * Purpose : Query the state of the kdump service + * + * Updates : kdump_supported default of false is updated to true if + * the kdump service query indicates that kdump is active. + * + **************************************************************************/ + +void get_kdump_support ( void ) +{ + char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ; + execute_pipe_cmd ( "/usr/bin/systemctl is-active kdump", + &pipe_cmd_output[0], + PIPE_COMMAND_RESPON_LEN ); + if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) > 0 ) + { + if ( ! strncmp (&pipe_cmd_output[0], "active", strlen("active"))) + { + hostw_ctrl_type * ctrl_ptr = get_ctrl_ptr() ; + ctrl_ptr->fd_sysrq_enable = open(SYSRQ_CONTROL_INTERFACE,O_WRONLY); + ctrl_ptr->fd_sysrq_command= open(SYSRQ_COMMAND_INTERFACE,O_WRONLY); + if ( ctrl_ptr->fd_sysrq_enable && ctrl_ptr->fd_sysrq_command ) + { + ilog("kdump is active"); + ctrl_ptr->kdump_supported = true ; + return ; + } + ilog("kdump service setup failed ; %d:%d:%s", + ctrl_ptr->fd_sysrq_enable, + ctrl_ptr->fd_sysrq_command, + pipe_cmd_output); + } + else + { + ilog("kdump is inactive (%s)", pipe_cmd_output); + } + } + else + { + elog("kdump status query failed ; assuming kdump is inactive"); + } +} + +/*************************************************************************** + * + * Name : force_crashdump + * + * Purpose : Force a crash dump via SysRq command 'c' + * + * Warning : Host will reset immediately, without graceful shutdown. + * + **************************************************************************/ +void force_crashdump ( void ) +{ + hostw_ctrl_type * ctrl_ptr = get_ctrl_ptr() ; + if (( daemon_get_cfg_ptr()->hostwd_kdump_on_stall == 0 ) || + ( ctrl_ptr->kdump_supported == false )) + { + /* crash dump is disabled or not supported */ + return ; + } + + /* Go for the crash dump */ + + /* Enable all functions of sysrq */ + static char sysrq_enable_cmd = '1' ; + + /* Crash Dump by NULL pointer dereference */ + static char sysrq_crash_dump_cmd = 'c' ; + + int bytes = write(ctrl_ptr->fd_sysrq_enable, &sysrq_enable_cmd, 1 ); + if ( bytes <= 0 ) + { + elog("SysRq Enable failed (%d:%d:%s)", bytes, errno, strerror(errno) ); + } + else + { + /*************** force crash dump *****************/ + bytes = write(ctrl_ptr->fd_sysrq_command, &sysrq_crash_dump_cmd, 1); + if ( bytes <= 0 ) + { + elog("SysRq command failed (%d:%d:%s)", + bytes, errno, strerror(errno) ); + } + else + { + ; /* should not get here */ + } + } +} + +/*************************************************************************** + * + * Name : manage_quorum_failed + * + * Purpose : permit recovery + * + * Description: If called while none of the reboot or sysRq reset failure + * recovery options are enabled then we are not going for a + * reboot or reset so allow recovery. + * + **************************************************************************/ + +void manage_quorum_failed ( void ) +{ + hostw_ctrl_type * ctrl_ptr = get_ctrl_ptr() ; + daemon_config_type* config_ptr = daemon_get_cfg_ptr() ; + + if ((( config_ptr->hostwd_kdump_on_stall == 0 ) || + ( ctrl_ptr->kdump_supported == false )) && + ( config_ptr->hostwd_reboot_on_err == 0 )) + { + /* If we are not going for a reboot or reset then allow recovery + * by clearing the control boolean that prevents recovery. */ + ilog ("Quorum failed but all reboot/reset recovery options disabled"); + ilog ("... allowing auto recovery"); + ctrl_ptr->quorum_failed = 0 ; + ctrl_ptr->pmon_grace_loops = config_ptr->hostwd_failure_threshold ; + } +} + /** * This is the main loop of the program * @@ -94,6 +218,9 @@ void hostw_service ( void ) daemon_config_type *config = daemon_get_cfg_ptr (); get_hostname (&my_hostname[0], MAX_HOST_NAME_SIZE ); + + get_kdump_support(); /* query for kdump support */ + mtcTimer_init ( pmonTimer, my_hostname, "pmon" ); mtcTimer_start( pmonTimer, hostwTimer_handler, config->hostwd_update_period); @@ -108,7 +235,7 @@ void hostw_service ( void ) socks.sort(); ilog("Host Watchdog Service running\n"); - for ( int quorum_failed = 0 ; ; ) + for ( ; ; ) { timeout.tv_sec =1; /* 1 second select ; pet watchdog every second */ timeout.tv_usec=0; @@ -158,7 +285,7 @@ void hostw_service ( void ) pmonTimer.ring = false ; } } - else if ( quorum_failed == 0 ) + else if ( ctrl->quorum_failed == 0 ) { if (FD_ISSET(hostw_socket->status_sock, &(hostw_socket->readfds))) { @@ -177,12 +304,12 @@ void hostw_service ( void ) } } else if ( rc != RETRY ) - quorum_failed++ ; + ctrl->quorum_failed++ ; } } if ( 0 >= ctrl->pmon_grace_loops ) { - if ( quorum_failed++ == 0 ) + if ( ctrl->quorum_failed++ == 0 ) { if (daemon_is_file_present(NODE_LOCKED_FILE)) { @@ -192,12 +319,19 @@ void hostw_service ( void ) } else { + /* force a crash dump if that feature is enabled */ + force_crashdump(); + emergency_log( "*** Host watchdog (hostwd) not receiving messages " - "from PMON ***\n"); + "from PMON ***\n"); + hostw_log_and_reboot(); } } } + if ( ctrl->quorum_failed ) + manage_quorum_failed(); + daemon_signal_hdlr (); } } @@ -269,8 +403,11 @@ void hostw_log_and_reboot() emergency_log ("*** Host Watchdog declaring system unhealthy ***\n"); - /* start the process to log as much data as possible */ - fork_hostwd_logger (); + /* Start the process to log as much data as possible */ + + /* NOTE: This function currently does not do anything so its commented + * out for now. Uncomment when actual value add logging is implemented. + fork_hostwd_logger (); */ if (config->hostwd_reboot_on_err) { emergency_log ("*** Initiating reboot ***\n"); @@ -288,7 +425,7 @@ void hostw_log_and_reboot() * Initiate the thread which logs as much information about the system * as possible. */ -static void fork_hostwd_logger ( void ) +void fork_hostwd_logger ( void ) { int parent = double_fork (); if (0 > parent) /* problem forking */ diff --git a/mtce/src/hostw/hostwInit.cpp b/mtce/src/hostw/hostwInit.cpp index 7a71d064..4e55736f 100644 --- a/mtce/src/hostw/hostwInit.cpp +++ b/mtce/src/hostw/hostwInit.cpp @@ -48,16 +48,10 @@ daemon_config_type * daemon_get_cfg_ptr ( void ) /* Cleanup exit handler */ void daemon_exit ( void ) { - int rc ; char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ; hostw_socket_type * hostw_socket = hostw_getSock_ptr(); - rc = execute_pipe_cmd ( "systemctl is-system-running", &pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN ); - - if ( rc != 0 ) - { - elog ("call to 'systemctl is-system-running' failed (%d:%d:%m)\n", rc, errno ); - } + execute_pipe_cmd ( "systemctl is-system-running", &pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN ); if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) > 0 ) { ilog ("systemctl is-system-running result: <%s>\n", pipe_cmd_output ); @@ -102,9 +96,12 @@ int hostw_process_config ( void * user, if (MATCH("config", "hostwd_failure_threshold")) { + int previous = config_ptr->hostwd_failure_threshold ; config_ptr->hostwd_failure_threshold = atoi(value); config_ptr->mask |= CONFIG_HOSTWD_FAILURE_THRESHOLD ; ilog("Quorum Thld : %d", config_ptr->hostwd_failure_threshold); + if ( config_ptr->hostwd_failure_threshold != previous ) + hostw_ctrl.pmon_grace_loops = config_ptr->hostwd_failure_threshold; } else if (MATCH("config", "hostwd_reboot_on_err")) { @@ -113,6 +110,14 @@ int hostw_process_config ( void * user, ilog("Quorum Loss : %s", config_ptr->hostwd_reboot_on_err ? "Reboot & Log" : "Log Only"); } + else if (MATCH("config", "hostwd_kdump_on_stall")) + { + config_ptr->hostwd_kdump_on_stall = atoi(value); + string state = "enabled" ; + if ( config_ptr->hostwd_kdump_on_stall == 0 ) + state = "disabled" ; + ilog("Crash Dump : %s (%d)", state.c_str(), config_ptr->hostwd_kdump_on_stall ); + } else if (MATCH("config", "hostwd_use_kern_wd")) { config_ptr->hostwd_use_kern_wd = atoi(value); diff --git a/mtce/src/hostw/scripts/hostwd.conf b/mtce/src/hostw/scripts/hostwd.conf index 3b66927d..79bf85f9 100755 --- a/mtce/src/hostw/scripts/hostwd.conf +++ b/mtce/src/hostw/scripts/hostwd.conf @@ -2,10 +2,13 @@ [config] ; Configuration hostwd_reboot_on_err = 1 ; host watchdog to reboot on detected failure +hostwd_kdump_on_stall = 1 ; issue a sysrq crash dump on quorum msg'ing stall + ; - missing pmond quorum reports hostwd_failure_threshold = 3 ; number of # missed messages before action taken hostwd_use_kern_wd = 1 ; use kernel /dev/watchdog as backup watchdog hostwd_console_path = /dev/console ; console on which to log extreme events, like ; notification of reboot + [timeouts] kernwd_update_period = 300 ; timeout until kernel resets system due to dead ; hostwd process (kernel watchdog)