Reduce Maintenance Host Watchdog timeout for controllers

This update makes changes to the maintenance host watchdog
and reduces the timeout from 5 to 3 minutes for controllers.

This update also decouples the pmon quorum monitoring
feature handling from the host watchdog timeout. Both were
driven off the same select timer, which prevented the
watchdog timeout value from being changed independently
without affecting quorum monitoring.

A new config label 'kernwd_update_period_stall_detect' is
added, and its value is loaded for hosts that need more
rigid process stall detection.

This new lower timeout value label is loaded and applied to
hosts that run the system controller function.

A few logging improvements were made.

Test Plan:

PASS: Verify pmon quorum failure handling while unlocked.
             Was and remains at 3 misses, 60 seconds each.
PASS: Verify watchdog TO at 180 seconds on controllers.
             Was 300 secs.
PASS: Verify kernel watchdog is not enabled when loaded
             kernwd_update_period is less than 5 seconds.
             Was 60 secs.
PASS: Verify process logging ; startup, failure, transient
PASS: Verify all config values loaded by hostwd process

Regression:

PASS: Verify watchdog TO at 300 seconds on non-controllers
PASS: Verify handling of failed quorum process while locked
PASS: Verify handling of failed quorum process while unlocked
PASS: Verify handling of transient quorum messaging loss while
             unlocked
PASS: Verify hostwd process patching ; locked and unlocked
             cases

PASS: Verify AIO DX System Install
PASS: Verify Standard System Install

Note: There is no kernel WD TO log.
      The log is output to the console.

Change-Id: Iad726436e28dfa48a06743aa166318969eb6915d
Closes-Bug: #1894889
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-09-09 15:39:48 -04:00
parent fdf6ff8650
commit 3a6fec50c1
5 changed files with 137 additions and 62 deletions

View File

@ -160,7 +160,6 @@ typedef struct
int oos_test_period ; /**< oos test period in secs */
int failsafe_shutdown_delay ; /**< seconds before failsafe reboot */
int hostwd_update_period ; /**< expect hostwd to be updated */
int kernwd_update_period ; /**< expect kernel watchdog to be updated */
int autorecovery_threshold ; /**< AIO stop autorecovery threshold */
/**< Auto Recovery Thresholds */

View File

@ -60,7 +60,7 @@ using namespace std;
#define HOSTWD_CONFIG_FILE ((const char *)"/etc/mtc/hostwd.conf")
#define PMOND_CONFIG_FILE ((const char *)"/etc/mtc/pmond.conf")
#define HOSTW_MIN_KERN_UPDATE_PERIOD 60 /* user can set how long until kernel
#define HOSTW_MIN_KERN_UPDATE_PERIOD 5 /* user can set how long until kernel
* watchdog panics, down to this
* minimum (seconds) */
@ -93,9 +93,12 @@ using namespace std;
/* Context control structure */
typedef struct
{
int nodetype ;
/* Watchdog interface */
/* ------------------ */
int watchdog ; /** The opened /dev/watchdog file */
int watchdog ; /** The opened /dev/watchdog file */
int kernwd_update_period; /** period in seconds the watchdog is serviced */
/* Loop counters */
/* ------------------ */

View File

@ -39,6 +39,7 @@
int hostw_service_command ( hostw_socket_type * hostw_socket );
static void fork_hostwd_logger ( void );
char my_hostname [MAX_HOST_NAME_SIZE+1];
/* Push daemon state to log file */
void daemon_dump_info ( void )
@ -49,6 +50,25 @@ void daemon_sigchld_hdlr ( void )
{
}
static struct mtc_timer pmonTimer ;
/**
 * Timer handler registered with mtcTimer_start for the pmon timer.
 *
 * Reads the timer id pointer carried in si->si_value.sival_ptr and
 *  - does nothing if the id it points to is null,
 *  - sets pmonTimer.ring when the id matches pmonTimer.tid,
 *  - otherwise passes the id pointer to mtcTimer_stop_tid_int_safe.
 *
 * @param sig  signal number ; unused
 * @param si   signal info carrying the timer id pointer
 * @param uc   user context ; unused
 */
void hostwTimer_handler ( int sig, siginfo_t *si, void *uc)
{
    /* These parameters are required by the handler signature
     * but are not used here ; avoid compiler warnings */
    UNUSED(sig);
    UNUSED(uc);

    timer_t * fired_tid_ptr = (void**)si->si_value.sival_ptr ;

    /* Nothing to service for a null timer id */
    if ( !(*fired_tid_ptr) )
    {
        return ;
    }

    /* The pmon timer fired ; flag it for the main loop */
    if ( *fired_tid_ptr == pmonTimer.tid )
    {
        pmonTimer.ring = true ;
        return ;
    }

    /* Not a timer tracked here ; hand it to the common stop utility */
    mtcTimer_stop_tid_int_safe ( fired_tid_ptr );
}
/**
* This is the main loop of the program
*
@ -73,6 +93,10 @@ void hostw_service ( void )
hostw_ctrl_type *ctrl = get_ctrl_ptr ();
daemon_config_type *config = daemon_get_cfg_ptr ();
get_hostname (&my_hostname[0], MAX_HOST_NAME_SIZE );
mtcTimer_init ( pmonTimer, my_hostname, "pmon" );
mtcTimer_start( pmonTimer, hostwTimer_handler, config->hostwd_update_period);
ctrl->pmon_grace_loops = config->hostwd_failure_threshold;
socks.clear();
@ -84,12 +108,11 @@ void hostw_service ( void )
socks.sort();
ilog("Host Watchdog Service running\n");
for ( ; ; )
for ( int quorum_failed = 0 ; ; )
{
timeout.tv_sec = config->hostwd_update_period;
timeout.tv_sec =1; /* 1 second select ; pet watchdog every second */
timeout.tv_usec=0;
/* pet the watchdog */
kernel_watchdog_pet();
/* set the master fd_set */
@ -107,56 +130,75 @@ void hostw_service ( void )
if ( errno != EINTR )
{
elog ("Select Failed (rc:%d) %s \n", errno, strerror(errno));
ctrl->pmon_grace_loops--;
if ( ctrl->pmon_grace_loops > 0 )
ctrl->pmon_grace_loops--;
}
}
else if ( rc == 0 )
{
if (daemon_is_file_present(NODE_LOCKED_FILE))
if ( pmonTimer.ring == true )
{
wlog( "Did not receive message from PMON, however node is"
" locked -- refusing to take reset action while locked\n" );
}
else
{
ctrl->pmon_grace_loops--;
if ( ctrl->pmon_grace_loops )
if (daemon_is_file_present(NODE_LOCKED_FILE))
{
ilog ("Did not receive expected message from PMON - %d more missed messages allowed\n",
ctrl->pmon_grace_loops-1);
wlog("Process Quorum Health not receive from PMON ; "
"no action while node is locked");
}
else
{
if ( ctrl->pmon_grace_loops )
ctrl->pmon_grace_loops--;
if ( ctrl->pmon_grace_loops > 0 )
{
wlog ("Process Quorum Health not received from PMON ; "
"%d more misses allowed before self-reboot",
ctrl->pmon_grace_loops-1);
}
}
pmonTimer.ring = false ;
}
}
else
else if ( quorum_failed == 0 )
{
if (FD_ISSET(hostw_socket->status_sock, &(hostw_socket->readfds)))
{
rc = hostw_service_command ( hostw_socket);
if ( rc == PASS ) /* got "all is well" message */
{
ctrl->pmon_grace_loops = config->hostwd_failure_threshold;
/* reset the pmon quorum health timer */
mtcTimer_reset(pmonTimer);
mtcTimer_start(pmonTimer, hostwTimer_handler, config->hostwd_update_period);
/* reload pmon grace loops count down */
if ( ctrl->pmon_grace_loops != config->hostwd_failure_threshold )
{
ilog("Process Quorum Health messaging restored");
ctrl->pmon_grace_loops = config->hostwd_failure_threshold;
}
}
else if ( rc != RETRY )
quorum_failed++ ;
}
}
if ( 0 >= ctrl->pmon_grace_loops )
{
if ( quorum_failed++ == 0 )
{
if (daemon_is_file_present(NODE_LOCKED_FILE))
{
wlog( "Host watchdog (hostwd) not receiving messages from PMON"
" however host is locked - refusing to take reset action"
" while locked\n" );
}
else
{
emergency_log( "*** Host watchdog (hostwd) not receiving messages "
"from PMON ***\n");
hostw_log_and_reboot();
}
}
}
if (0 >= ctrl->pmon_grace_loops)
{
if (daemon_is_file_present(NODE_LOCKED_FILE))
{
wlog( "Host watchdog (hostwd) not receiving messages from PMON"
" however host is locked - refusing to take reset action"
" while locked\n" );
}
else
{
emergency_log( "*** Host watchdog (hostwd) not receiving messages "
"from PMON ***\n");
hostw_log_and_reboot();
}
}
daemon_signal_hdlr ();
}
}
@ -187,21 +229,22 @@ int hostw_service_command ( hostw_socket_type * hostw_socket)
{
case MTC_CMD_NONE:
/* All is well */
dlog ("pmon is happy");
return PASS;
case MTC_EVENT_PMON_CRIT:
if (daemon_is_file_present(NODE_LOCKED_FILE))
{
ilog( "PMON reports unrecoverable system, however node is"
" locked - considering this an OK message\n" );
wlog("PMON reports unrecoverable system - message '%s'", msg[0].buf );
ilog("... no action while node is locked");
return PASS;
}
else
{
emergency_log( "*** PMON reports unrecoverable system - message '%s' ***\n", msg[0].buf);
hostw_log_and_reboot();
return FAIL;
}
return FAIL;
default:
elog("Unknown status reported\n");
@ -213,7 +256,7 @@ int hostw_service_command ( hostw_socket_type * hostw_socket)
/* bad message size */
elog("Host Watchdog received bad or corrupted message (length = %d)\n", len);
}
return FAIL;
return RETRY;
}
/**

View File

@ -104,31 +104,56 @@ int hostw_process_config ( void * user,
{
config_ptr->hostwd_failure_threshold = atoi(value);
config_ptr->mask |= CONFIG_HOSTWD_FAILURE_THRESHOLD ;
ilog("Quorum Thld : %d", config_ptr->hostwd_failure_threshold);
}
else if (MATCH("config", "hostwd_reboot_on_err"))
{
config_ptr->hostwd_reboot_on_err = atoi(value);
config_ptr->mask |= CONFIG_HOSTWD_REBOOT ;
ilog("Quorum Loss : %s",
config_ptr->hostwd_reboot_on_err ? "Reboot & Log" : "Log Only");
}
else if (MATCH("config", "hostwd_use_kern_wd"))
{
config_ptr->hostwd_use_kern_wd = atoi(value);
config_ptr->mask |= CONFIG_HOSTWD_USE_KERN_WD ;
ilog("Use Kern Wd : %s", config_ptr->hostwd_use_kern_wd ? "Yes":"No");
}
else if (MATCH("config", "hostwd_console_path"))
{
config_ptr->hostwd_console_path = strdup(value);
config_ptr->mask |= CONFIG_HOSTWD_CONSOLE_PATH ;
}
else if (MATCH("timeouts", "kernwd_update_period"))
{
config_ptr->kernwd_update_period = atoi(value);
config_ptr->mask |= CONFIG_KERNWD_UPDATE_PERIOD ;
ilog("WD Console : %s", config_ptr->hostwd_console_path);
}
else if (MATCH("config", "hostwd_update_period")) /* in pmond.conf file */
{
config_ptr->hostwd_update_period = atoi(value);
config_ptr->mask |= CONFIG_HOSTWD_UPDATE_PERIOD ;
ilog("Quorum Rate : %d secs", config_ptr->hostwd_update_period);
}
/* Timeout config options */
/* kernwd_update_period is the number of seconds the watchdog can run
* without being petted before it times out and the kernel reboots.
* Controller nodes load the stall detection value
* (kernwd_update_period_stall_detect) ; all other nodes load the
* legacy kernwd_update_period value.
*/
else if (hostw_ctrl.nodetype & CONTROLLER_TYPE)
{
if (MATCH("timeouts", "kernwd_update_period_stall_detect"))
{
hostw_ctrl.kernwd_update_period = atoi(value);
config_ptr->mask |= CONFIG_KERNWD_UPDATE_PERIOD ;
}
}
else
{
if (MATCH("timeouts", "kernwd_update_period"))
{
hostw_ctrl.kernwd_update_period = atoi(value);
config_ptr->mask |= CONFIG_KERNWD_UPDATE_PERIOD ;
}
}
return (PASS);
}
@ -186,12 +211,10 @@ int socket_init ( void )
int daemon_init ( string iface, string nodetype_str )
{
int rc = PASS ;
hostw_ctrl_type* ctrl_ptr = get_ctrl_ptr();
UNUSED(iface);
UNUSED(nodetype_str);
/* init the control struct */
memset(ctrl_ptr, 0, sizeof(hostw_ctrl_type));
memset(&hostw_ctrl, 0, sizeof(hostw_ctrl_type));
if (daemon_files_init() != PASS)
{
@ -206,6 +229,12 @@ int daemon_init ( string iface, string nodetype_str )
return ( FAIL_SIGNAL_INIT );
}
/* convert node type to bit field integer */
hostw_ctrl.nodetype = get_host_function_mask ( nodetype_str ) ;
ilog ("Node Type : %s (0x%x)\n",
nodetype_str.c_str(),
hostw_ctrl.nodetype);
/************************************************************************
* There is no point continuing with init ; i.e. running daemon_configure,
* initializing sockets and trying to query for an ip address until the
@ -262,39 +291,37 @@ void daemon_service_run ( void )
*/
static int kernel_watchdog_init ( void )
{
hostw_ctrl_type * ctrl_ptr = get_ctrl_ptr();
daemon_config_type * config_ptr = daemon_get_cfg_ptr();
/* open the watchdog */
if ( (config_ptr->hostwd_use_kern_wd == 0) ||
(config_ptr->kernwd_update_period < HOSTW_MIN_KERN_UPDATE_PERIOD))
(hostw_ctrl.kernwd_update_period < HOSTW_MIN_KERN_UPDATE_PERIOD))
{
/* config file says don't use watchdog, or used too small a period */
wlog("Watchdog NOT enabled ; %s",
config_ptr->hostwd_use_kern_wd ?
"kernwd_update_period too small":
"hostwd_use_kern_wd=0");
return PASS;
}
ilog ("Opening kernel watchdog device\n");
ctrl_ptr->watchdog = open("/dev/watchdog", O_WRONLY);
if (0 >= ctrl_ptr->watchdog)
hostw_ctrl.watchdog = open("/dev/watchdog", O_WRONLY);
if (0 >= hostw_ctrl.watchdog)
{
elog("Could not open kernel watchdog\n");
return FAIL;
}
/* set watchdog timeout (in seconds) */
ilog ("Setting kernel watchdog options - kernel timeout after %d seconds\n",
config_ptr->kernwd_update_period);
if (ioctl(ctrl_ptr->watchdog, WDIOC_SETTIMEOUT, &config_ptr->kernwd_update_period))
if (ioctl(hostw_ctrl.watchdog, WDIOC_SETTIMEOUT, &hostw_ctrl.kernwd_update_period))
{
elog ("Error setting watchdog options -- closing watchdog\n")
elog ("Failed to enable watchdog")
kernel_watchdog_close();
return FAIL;
}
/* do initial keep alive */
ilog ("Watchdog options set\n");
kernel_watchdog_pet();
ilog ("Kernel watchdog enabled with %d second timeout",
hostw_ctrl.kernwd_update_period);
return PASS;
}

View File

@ -10,3 +10,6 @@ hostwd_console_path = /dev/console ; console on which to log extreme events, lik
kernwd_update_period = 300 ; timeout until kernel resets system due to dead
; hostwd process (kernel watchdog)
kernwd_update_period_stall_detect = 180; kernel watchdog timeout introduced to
; detect and react to excessive process
; stalls.