Reduce Maintenance Host Watchdog timeout for controllers

This update makes changes to the maintenance host watchdog
and reduces the timeout from 5 to 3 minutes for controllers.

This update also decouples the pmon quorum monitoring
feature handling from the host watchdog timeout. Both were
driven off the same select timer, which prevented the watchdog
timeout value from being changed without also affecting
quorum monitoring.

A new config label 'kernwd_update_period_stall_detect' is
added, and its value is loaded for hosts that need stricter
process stall detection.

The lower timeout value from this new label is loaded and
applied to hosts that run the system controller function.

A few logging improvements were made.

Test Plan:

PASS: Verify pmon quorum failure handling while unlocked.
             Was and remains at 3 misses, 60 seconds each.
PASS: Verify watchdog TO at 12 seconds on controllers.
             Was 300 secs.
PASS: Verify kernel watchdog is not enabled when loaded
             kernwd_update_period is less than 5 seconds.
             Was 60 secs.
PASS: Verify process logging ; startup, failure, transient
PASS: Verify all config values loaded by hostwd process

Regression:

PASS: Verify watchdog TO at 300 seconds on non-controllers
PASS: Verify handling of failed quorum process while locked
PASS: Verify handling of failed quorum process while unlocked
PASS: Verify handling of transient quorum messaging loss while
             unlocked
PASS: Verify hostwd process patching ; locked and unlocked
             cases

PASS: Verify AIO DX System Install
PASS: Verify Standard System Install

Note: There is no log file entry for a kernel watchdog timeout.
      The timeout message is output to the console.

Change-Id: Iad726436e28dfa48a06743aa166318969eb6915d
Closes-Bug: #1894889
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-09-09 15:39:48 -04:00
parent fdf6ff8650
commit 3a6fec50c1
5 changed files with 137 additions and 62 deletions

View File

@ -160,7 +160,6 @@ typedef struct
int oos_test_period ; /**< oos test period in secs */ int oos_test_period ; /**< oos test period in secs */
int failsafe_shutdown_delay ; /**< seconds before failsafe reboot */ int failsafe_shutdown_delay ; /**< seconds before failsafe reboot */
int hostwd_update_period ; /**< expect hostwd to be updated */ int hostwd_update_period ; /**< expect hostwd to be updated */
int kernwd_update_period ; /**< expect kernel watchdog to be updated */
int autorecovery_threshold ; /**< AIO stop autorecovery threshold */ int autorecovery_threshold ; /**< AIO stop autorecovery threshold */
/**< Auto Recovery Thresholds */ /**< Auto Recovery Thresholds */

View File

@ -60,7 +60,7 @@ using namespace std;
#define HOSTWD_CONFIG_FILE ((const char *)"/etc/mtc/hostwd.conf") #define HOSTWD_CONFIG_FILE ((const char *)"/etc/mtc/hostwd.conf")
#define PMOND_CONFIG_FILE ((const char *)"/etc/mtc/pmond.conf") #define PMOND_CONFIG_FILE ((const char *)"/etc/mtc/pmond.conf")
#define HOSTW_MIN_KERN_UPDATE_PERIOD 60 /* user can set how long until kernel #define HOSTW_MIN_KERN_UPDATE_PERIOD 5 /* user can set how long until kernel
* watchdog panics, down to this * watchdog panics, down to this
* minimum (seconds) */ * minimum (seconds) */
@ -93,9 +93,12 @@ using namespace std;
/* Context control structure */ /* Context control structure */
typedef struct typedef struct
{ {
int nodetype ;
/* Watchdog interface */ /* Watchdog interface */
/* ------------------ */ /* ------------------ */
int watchdog ; /** The opened /dev/watchdog file */ int watchdog ; /** The opened /dev/watchdog file */
int kernwd_update_period; /** period in seconds the watchdog is serviced */
/* Loop counters */ /* Loop counters */
/* ------------------ */ /* ------------------ */

View File

@ -39,6 +39,7 @@
int hostw_service_command ( hostw_socket_type * hostw_socket ); int hostw_service_command ( hostw_socket_type * hostw_socket );
static void fork_hostwd_logger ( void ); static void fork_hostwd_logger ( void );
char my_hostname [MAX_HOST_NAME_SIZE+1];
/* Push daemon state to log file */ /* Push daemon state to log file */
void daemon_dump_info ( void ) void daemon_dump_info ( void )
@ -49,6 +50,25 @@ void daemon_sigchld_hdlr ( void )
{ {
} }
static struct mtc_timer pmonTimer ;
/**
 * Timer signal handler for the host watchdog process.
 *
 * Reads the timer id pointer carried in the signal info and:
 *  - returns immediately if the referenced timer id is null,
 *  - sets the 'ring' flag of pmonTimer if the id matches pmonTimer.tid,
 *  - otherwise passes the id pointer to mtcTimer_stop_tid_int_safe.
 *
 * @param sig  signal number ; required by the handler signature, unused
 * @param si   signal info ; si_value.sival_ptr carries the timer id pointer
 * @param uc   user context ; required by the handler signature, unused
 */
void hostwTimer_handler ( int sig, siginfo_t *si, void *uc)
{
    /* Parameters the handler signature requires but that are not used here */
    UNUSED(sig);
    UNUSED(uc);

    timer_t * tid_ptr = (void**)si->si_value.sival_ptr ;

    /* Nothing to do for a null timer id */
    if ( !(*tid_ptr) )
    {
        return ;
    }

    /* The pmon timer fired ; just flag it */
    if ( *tid_ptr == pmonTimer.tid )
    {
        pmonTimer.ring = true ;
        return ;
    }

    /* Not the pmon timer ; hand the id to the timer stop utility */
    mtcTimer_stop_tid_int_safe ( tid_ptr );
}
/** /**
* This is the main loop of the program * This is the main loop of the program
* *
@ -73,6 +93,10 @@ void hostw_service ( void )
hostw_ctrl_type *ctrl = get_ctrl_ptr (); hostw_ctrl_type *ctrl = get_ctrl_ptr ();
daemon_config_type *config = daemon_get_cfg_ptr (); daemon_config_type *config = daemon_get_cfg_ptr ();
get_hostname (&my_hostname[0], MAX_HOST_NAME_SIZE );
mtcTimer_init ( pmonTimer, my_hostname, "pmon" );
mtcTimer_start( pmonTimer, hostwTimer_handler, config->hostwd_update_period);
ctrl->pmon_grace_loops = config->hostwd_failure_threshold; ctrl->pmon_grace_loops = config->hostwd_failure_threshold;
socks.clear(); socks.clear();
@ -84,12 +108,11 @@ void hostw_service ( void )
socks.sort(); socks.sort();
ilog("Host Watchdog Service running\n"); ilog("Host Watchdog Service running\n");
for ( ; ; ) for ( int quorum_failed = 0 ; ; )
{ {
timeout.tv_sec = config->hostwd_update_period; timeout.tv_sec =1; /* 1 second select ; pet watchdog every second */
timeout.tv_usec=0; timeout.tv_usec=0;
/* pet the watchdog */
kernel_watchdog_pet(); kernel_watchdog_pet();
/* set the master fd_set */ /* set the master fd_set */
@ -107,56 +130,75 @@ void hostw_service ( void )
if ( errno != EINTR ) if ( errno != EINTR )
{ {
elog ("Select Failed (rc:%d) %s \n", errno, strerror(errno)); elog ("Select Failed (rc:%d) %s \n", errno, strerror(errno));
ctrl->pmon_grace_loops--; if ( ctrl->pmon_grace_loops > 0 )
ctrl->pmon_grace_loops--;
} }
} }
else if ( rc == 0 ) else if ( rc == 0 )
{ {
if (daemon_is_file_present(NODE_LOCKED_FILE)) if ( pmonTimer.ring == true )
{ {
wlog( "Did not receive message from PMON, however node is" if (daemon_is_file_present(NODE_LOCKED_FILE))
" locked -- refusing to take reset action while locked\n" );
}
else
{
ctrl->pmon_grace_loops--;
if ( ctrl->pmon_grace_loops )
{ {
ilog ("Did not receive expected message from PMON - %d more missed messages allowed\n", wlog("Process Quorum Health not receive from PMON ; "
ctrl->pmon_grace_loops-1); "no action while node is locked");
} }
else
{
if ( ctrl->pmon_grace_loops )
ctrl->pmon_grace_loops--;
if ( ctrl->pmon_grace_loops > 0 )
{
wlog ("Process Quorum Health not received from PMON ; "
"%d more misses allowed before self-reboot",
ctrl->pmon_grace_loops-1);
}
}
pmonTimer.ring = false ;
} }
} }
else else if ( quorum_failed == 0 )
{ {
if (FD_ISSET(hostw_socket->status_sock, &(hostw_socket->readfds))) if (FD_ISSET(hostw_socket->status_sock, &(hostw_socket->readfds)))
{ {
rc = hostw_service_command ( hostw_socket); rc = hostw_service_command ( hostw_socket);
if ( rc == PASS ) /* got "all is well" message */ if ( rc == PASS ) /* got "all is well" message */
{ {
ctrl->pmon_grace_loops = config->hostwd_failure_threshold; /* reset the pmon quorum health timer */
mtcTimer_reset(pmonTimer);
mtcTimer_start(pmonTimer, hostwTimer_handler, config->hostwd_update_period);
/* reload pmon grace loops count down */
if ( ctrl->pmon_grace_loops != config->hostwd_failure_threshold )
{
ilog("Process Quorum Health messaging restored");
ctrl->pmon_grace_loops = config->hostwd_failure_threshold;
}
}
else if ( rc != RETRY )
quorum_failed++ ;
}
}
if ( 0 >= ctrl->pmon_grace_loops )
{
if ( quorum_failed++ == 0 )
{
if (daemon_is_file_present(NODE_LOCKED_FILE))
{
wlog( "Host watchdog (hostwd) not receiving messages from PMON"
" however host is locked - refusing to take reset action"
" while locked\n" );
}
else
{
emergency_log( "*** Host watchdog (hostwd) not receiving messages "
"from PMON ***\n");
hostw_log_and_reboot();
} }
} }
} }
if (0 >= ctrl->pmon_grace_loops)
{
if (daemon_is_file_present(NODE_LOCKED_FILE))
{
wlog( "Host watchdog (hostwd) not receiving messages from PMON"
" however host is locked - refusing to take reset action"
" while locked\n" );
}
else
{
emergency_log( "*** Host watchdog (hostwd) not receiving messages "
"from PMON ***\n");
hostw_log_and_reboot();
}
}
daemon_signal_hdlr (); daemon_signal_hdlr ();
} }
} }
@ -187,21 +229,22 @@ int hostw_service_command ( hostw_socket_type * hostw_socket)
{ {
case MTC_CMD_NONE: case MTC_CMD_NONE:
/* All is well */ /* All is well */
dlog ("pmon is happy");
return PASS; return PASS;
case MTC_EVENT_PMON_CRIT: case MTC_EVENT_PMON_CRIT:
if (daemon_is_file_present(NODE_LOCKED_FILE)) if (daemon_is_file_present(NODE_LOCKED_FILE))
{ {
ilog( "PMON reports unrecoverable system, however node is" wlog("PMON reports unrecoverable system - message '%s'", msg[0].buf );
" locked - considering this an OK message\n" ); ilog("... no action while node is locked");
return PASS; return PASS;
} }
else else
{ {
emergency_log( "*** PMON reports unrecoverable system - message '%s' ***\n", msg[0].buf); emergency_log( "*** PMON reports unrecoverable system - message '%s' ***\n", msg[0].buf);
hostw_log_and_reboot(); hostw_log_and_reboot();
return FAIL;
} }
return FAIL;
default: default:
elog("Unknown status reported\n"); elog("Unknown status reported\n");
@ -213,7 +256,7 @@ int hostw_service_command ( hostw_socket_type * hostw_socket)
/* bad message size */ /* bad message size */
elog("Host Watchdog received bad or corrupted message (length = %d)\n", len); elog("Host Watchdog received bad or corrupted message (length = %d)\n", len);
} }
return FAIL; return RETRY;
} }
/** /**

View File

@ -104,31 +104,56 @@ int hostw_process_config ( void * user,
{ {
config_ptr->hostwd_failure_threshold = atoi(value); config_ptr->hostwd_failure_threshold = atoi(value);
config_ptr->mask |= CONFIG_HOSTWD_FAILURE_THRESHOLD ; config_ptr->mask |= CONFIG_HOSTWD_FAILURE_THRESHOLD ;
ilog("Quorum Thld : %d", config_ptr->hostwd_failure_threshold);
} }
else if (MATCH("config", "hostwd_reboot_on_err")) else if (MATCH("config", "hostwd_reboot_on_err"))
{ {
config_ptr->hostwd_reboot_on_err = atoi(value); config_ptr->hostwd_reboot_on_err = atoi(value);
config_ptr->mask |= CONFIG_HOSTWD_REBOOT ; config_ptr->mask |= CONFIG_HOSTWD_REBOOT ;
ilog("Quorum Loss : %s",
config_ptr->hostwd_reboot_on_err ? "Reboot & Log" : "Log Only");
} }
else if (MATCH("config", "hostwd_use_kern_wd")) else if (MATCH("config", "hostwd_use_kern_wd"))
{ {
config_ptr->hostwd_use_kern_wd = atoi(value); config_ptr->hostwd_use_kern_wd = atoi(value);
config_ptr->mask |= CONFIG_HOSTWD_USE_KERN_WD ; config_ptr->mask |= CONFIG_HOSTWD_USE_KERN_WD ;
ilog("Use Kern Wd : %s", config_ptr->hostwd_use_kern_wd ? "Yes":"No");
} }
else if (MATCH("config", "hostwd_console_path")) else if (MATCH("config", "hostwd_console_path"))
{ {
config_ptr->hostwd_console_path = strdup(value); config_ptr->hostwd_console_path = strdup(value);
config_ptr->mask |= CONFIG_HOSTWD_CONSOLE_PATH ; config_ptr->mask |= CONFIG_HOSTWD_CONSOLE_PATH ;
} ilog("WD Console : %s", config_ptr->hostwd_console_path);
else if (MATCH("timeouts", "kernwd_update_period"))
{
config_ptr->kernwd_update_period = atoi(value);
config_ptr->mask |= CONFIG_KERNWD_UPDATE_PERIOD ;
} }
else if (MATCH("config", "hostwd_update_period")) /* in pmond.conf file */ else if (MATCH("config", "hostwd_update_period")) /* in pmond.conf file */
{ {
config_ptr->hostwd_update_period = atoi(value); config_ptr->hostwd_update_period = atoi(value);
config_ptr->mask |= CONFIG_HOSTWD_UPDATE_PERIOD ; config_ptr->mask |= CONFIG_HOSTWD_UPDATE_PERIOD ;
ilog("Quorum Rate : %d secs", config_ptr->hostwd_update_period);
}
/* Timeout config options */
/* kernwd_update_period is how many seconds the watchdog can run without
* petting it before timeout and kernel reboot.
* The timeout for controller nodes is loaded with the stall detection
* value compared to other nodes with the legacy value.
*/
else if (hostw_ctrl.nodetype & CONTROLLER_TYPE)
{
if (MATCH("timeouts", "kernwd_update_period_stall_detect"))
{
hostw_ctrl.kernwd_update_period = atoi(value);
config_ptr->mask |= CONFIG_KERNWD_UPDATE_PERIOD ;
}
}
else
{
if (MATCH("timeouts", "kernwd_update_period"))
{
hostw_ctrl.kernwd_update_period = atoi(value);
config_ptr->mask |= CONFIG_KERNWD_UPDATE_PERIOD ;
}
} }
return (PASS); return (PASS);
} }
@ -186,12 +211,10 @@ int socket_init ( void )
int daemon_init ( string iface, string nodetype_str ) int daemon_init ( string iface, string nodetype_str )
{ {
int rc = PASS ; int rc = PASS ;
hostw_ctrl_type* ctrl_ptr = get_ctrl_ptr();
UNUSED(iface); UNUSED(iface);
UNUSED(nodetype_str);
/* init the control struct */ /* init the control struct */
memset(ctrl_ptr, 0, sizeof(hostw_ctrl_type)); memset(&hostw_ctrl, 0, sizeof(hostw_ctrl_type));
if (daemon_files_init() != PASS) if (daemon_files_init() != PASS)
{ {
@ -206,6 +229,12 @@ int daemon_init ( string iface, string nodetype_str )
return ( FAIL_SIGNAL_INIT ); return ( FAIL_SIGNAL_INIT );
} }
/* convert node type to bit field integer */
hostw_ctrl.nodetype = get_host_function_mask ( nodetype_str ) ;
ilog ("Node Type : %s (0x%x)\n",
nodetype_str.c_str(),
hostw_ctrl.nodetype);
/************************************************************************ /************************************************************************
* There is no point continuing with init ; i.e. running daemon_configure, * There is no point continuing with init ; i.e. running daemon_configure,
* initializing sockets and trying to query for an ip address until the * initializing sockets and trying to query for an ip address until the
@ -262,39 +291,37 @@ void daemon_service_run ( void )
*/ */
static int kernel_watchdog_init ( void ) static int kernel_watchdog_init ( void )
{ {
hostw_ctrl_type * ctrl_ptr = get_ctrl_ptr();
daemon_config_type * config_ptr = daemon_get_cfg_ptr(); daemon_config_type * config_ptr = daemon_get_cfg_ptr();
/* open the watchdog */ /* open the watchdog */
if ( (config_ptr->hostwd_use_kern_wd == 0) || if ( (config_ptr->hostwd_use_kern_wd == 0) ||
(config_ptr->kernwd_update_period < HOSTW_MIN_KERN_UPDATE_PERIOD)) (hostw_ctrl.kernwd_update_period < HOSTW_MIN_KERN_UPDATE_PERIOD))
{ {
/* config file says don't use watchdog, or used too small a period */ /* config file says don't use watchdog, or used too small a period */
wlog("Watchdog NOT enabled ; %s",
config_ptr->hostwd_use_kern_wd ?
"kernwd_update_period too small":
"hostwd_use_kern_wd=0");
return PASS; return PASS;
} }
ilog ("Opening kernel watchdog device\n"); hostw_ctrl.watchdog = open("/dev/watchdog", O_WRONLY);
ctrl_ptr->watchdog = open("/dev/watchdog", O_WRONLY); if (0 >= hostw_ctrl.watchdog)
if (0 >= ctrl_ptr->watchdog)
{ {
elog("Could not open kernel watchdog\n"); elog("Could not open kernel watchdog\n");
return FAIL; return FAIL;
} }
/* set watchdog timeout (in seconds) */ /* set watchdog timeout (in seconds) */
ilog ("Setting kernel watchdog options - kernel timeout after %d seconds\n", if (ioctl(hostw_ctrl.watchdog, WDIOC_SETTIMEOUT, &hostw_ctrl.kernwd_update_period))
config_ptr->kernwd_update_period);
if (ioctl(ctrl_ptr->watchdog, WDIOC_SETTIMEOUT, &config_ptr->kernwd_update_period))
{ {
elog ("Error setting watchdog options -- closing watchdog\n") elog ("Failed to enable watchdog")
kernel_watchdog_close(); kernel_watchdog_close();
return FAIL; return FAIL;
} }
ilog ("Kernel watchdog enabled with %d second timeout",
/* do initial keep alive */ hostw_ctrl.kernwd_update_period);
ilog ("Watchdog options set\n");
kernel_watchdog_pet();
return PASS; return PASS;
} }

View File

@ -10,3 +10,6 @@ hostwd_console_path = /dev/console ; console on which to log extreme events, lik
kernwd_update_period = 300 ; timeout until kernel resets system due to dead kernwd_update_period = 300 ; timeout until kernel resets system due to dead
; hostwd process (kernel watchdog) ; hostwd process (kernel watchdog)
kernwd_update_period_stall_detect = 180; kernel watchdog timeout introduced to
; detect and react to excessive process
; stalls.