diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index 7d4b6677..645c3681 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -160,7 +160,6 @@ typedef struct int oos_test_period ; /**< oos test period in secs */ int failsafe_shutdown_delay ; /**< seconds before failsafe reboot */ int hostwd_update_period ; /**< expect hostwd to be updated */ - int kernwd_update_period ; /**< expect kernel watchdog to be updated */ int autorecovery_threshold ; /**< AIO stop autorecovery threshold */ /**< Auto Recovery Thresholds */ diff --git a/mtce/src/hostw/hostw.h b/mtce/src/hostw/hostw.h index 713afacd..d4f32940 100755 --- a/mtce/src/hostw/hostw.h +++ b/mtce/src/hostw/hostw.h @@ -60,7 +60,7 @@ using namespace std; #define HOSTWD_CONFIG_FILE ((const char *)"/etc/mtc/hostwd.conf") #define PMOND_CONFIG_FILE ((const char *)"/etc/mtc/pmond.conf") -#define HOSTW_MIN_KERN_UPDATE_PERIOD 60 /* user can set how long until kernel +#define HOSTW_MIN_KERN_UPDATE_PERIOD 5 /* user can set how long until kernel * watchdog panics, down to this * minimum (seconds) */ @@ -93,9 +93,12 @@ using namespace std; /* Context control structure */ typedef struct { + int nodetype ; + /* Watchdog interface */ /* ------------------ */ - int watchdog ; /** The opened /dev/watchdog file */ + int watchdog ; /** The opened /dev/watchdog file */ + int kernwd_update_period; /** period in seconds the watchdog is serviced */ /* Loop counters */ /* ------------------ */ diff --git a/mtce/src/hostw/hostwHdlr.cpp b/mtce/src/hostw/hostwHdlr.cpp index 55931911..13a6ca76 100644 --- a/mtce/src/hostw/hostwHdlr.cpp +++ b/mtce/src/hostw/hostwHdlr.cpp @@ -39,6 +39,7 @@ int hostw_service_command ( hostw_socket_type * hostw_socket ); static void fork_hostwd_logger ( void ); +char my_hostname [MAX_HOST_NAME_SIZE+1]; /* Push daemon state to log file */ void daemon_dump_info ( void ) @@ -49,6 +50,25 @@ void daemon_sigchld_hdlr ( void ) { } +static struct mtc_timer pmonTimer ; + +void hostwTimer_handler ( int sig, siginfo_t *si, void *uc) +{ + timer_t * tid_ptr = (void**)si->si_value.sival_ptr ; + + /* Avoid compiler errors/warnings for parms we must + * have but currently do nothing with */ + UNUSED(sig); + UNUSED(uc); + + if ( !(*tid_ptr) ) + return ; + else if (( *tid_ptr == pmonTimer.tid ) ) + pmonTimer.ring = true ; + else + mtcTimer_stop_tid_int_safe ( tid_ptr ); +} + /** * This is the main loop of the program * @@ -73,6 +93,10 @@ void hostw_service ( void ) hostw_ctrl_type *ctrl = get_ctrl_ptr (); daemon_config_type *config = daemon_get_cfg_ptr (); + get_hostname (&my_hostname[0], MAX_HOST_NAME_SIZE ); + mtcTimer_init ( pmonTimer, my_hostname, "pmon" ); + mtcTimer_start( pmonTimer, hostwTimer_handler, config->hostwd_update_period); + ctrl->pmon_grace_loops = config->hostwd_failure_threshold; socks.clear(); @@ -84,12 +108,11 @@ void hostw_service ( void ) socks.sort(); ilog("Host Watchdog Service running\n"); - for ( ; ; ) + for ( int quorum_failed = 0 ; ; ) { - timeout.tv_sec = config->hostwd_update_period; + timeout.tv_sec =1; /* 1 second select ; pet watchdog every second */ timeout.tv_usec=0; - /* pet the watchdog */ kernel_watchdog_pet(); /* set the master fd_set */ @@ -107,56 +130,75 @@ void hostw_service ( void ) if ( errno != EINTR ) { elog ("Select Failed (rc:%d) %s \n", errno, strerror(errno)); - ctrl->pmon_grace_loops--; + if ( ctrl->pmon_grace_loops > 0 ) + ctrl->pmon_grace_loops--; } } else if ( rc == 0 ) { - if (daemon_is_file_present(NODE_LOCKED_FILE)) + if ( pmonTimer.ring == true ) { - wlog( "Did not receive message from PMON, however node is" - " locked -- refusing to take reset action while locked\n" ); - } - else - { - ctrl->pmon_grace_loops--; - - if ( ctrl->pmon_grace_loops ) + if (daemon_is_file_present(NODE_LOCKED_FILE)) { - ilog ("Did not receive expected message from PMON - %d more missed messages allowed\n", - ctrl->pmon_grace_loops-1); + wlog("Process Quorum Health not receive from PMON ; " + "no action while node is locked"); } + else + { + if ( ctrl->pmon_grace_loops ) + ctrl->pmon_grace_loops--; + + if ( ctrl->pmon_grace_loops > 0 ) + { + wlog ("Process Quorum Health not received from PMON ; " + "%d more misses allowed before self-reboot", + ctrl->pmon_grace_loops-1); + } + } + pmonTimer.ring = false ; } } - else + else if ( quorum_failed == 0 ) { if (FD_ISSET(hostw_socket->status_sock, &(hostw_socket->readfds))) { rc = hostw_service_command ( hostw_socket); if ( rc == PASS ) /* got "all is well" message */ { - ctrl->pmon_grace_loops = config->hostwd_failure_threshold; + /* reset the pmon quorum health timer */ + mtcTimer_reset(pmonTimer); + mtcTimer_start(pmonTimer, hostwTimer_handler, config->hostwd_update_period); + + /* reload pmon grace loops count down */ + if ( ctrl->pmon_grace_loops != config->hostwd_failure_threshold ) + { + ilog("Process Quorum Health messaging restored"); + ctrl->pmon_grace_loops = config->hostwd_failure_threshold; + } + } + else if ( rc != RETRY ) + quorum_failed++ ; + } + } + if ( 0 >= ctrl->pmon_grace_loops ) + { + if ( quorum_failed++ == 0 ) + { + if (daemon_is_file_present(NODE_LOCKED_FILE)) + { + wlog( "Host watchdog (hostwd) not receiving messages from PMON" + " however host is locked - refusing to take reset action" + " while locked\n" ); + } + else + { + emergency_log( "*** Host watchdog (hostwd) not receiving messages " + "from PMON ***\n"); + hostw_log_and_reboot(); } } } - if (0 >= ctrl->pmon_grace_loops) - { - if (daemon_is_file_present(NODE_LOCKED_FILE)) - { - wlog( "Host watchdog (hostwd) not receiving messages from PMON" - " however host is locked - refusing to take reset action" - " while locked\n" ); - } - else - { - emergency_log( "*** Host watchdog (hostwd) not receiving messages " - "from PMON ***\n"); - hostw_log_and_reboot(); - } - } - daemon_signal_hdlr (); - } } @@ -187,21 +229,22 @@ int hostw_service_command ( hostw_socket_type * hostw_socket) { case MTC_CMD_NONE: /* All is well */ + dlog ("pmon is happy"); return PASS; case MTC_EVENT_PMON_CRIT: if (daemon_is_file_present(NODE_LOCKED_FILE)) { - ilog( "PMON reports unrecoverable system, however node is" - " locked - considering this an OK message\n" ); + wlog("PMON reports unrecoverable system - message '%s'", msg[0].buf ); + ilog("... no action while node is locked"); return PASS; } else { emergency_log( "*** PMON reports unrecoverable system - message '%s' ***\n", msg[0].buf); hostw_log_and_reboot(); + return FAIL; } - return FAIL; default: elog("Unknown status reported\n"); @@ -213,7 +256,7 @@ int hostw_service_command ( hostw_socket_type * hostw_socket) /* bad message size */ elog("Host Watchdog received bad or corrupted message (length = %d)\n", len); } - return FAIL; + return RETRY; } /** diff --git a/mtce/src/hostw/hostwInit.cpp b/mtce/src/hostw/hostwInit.cpp index 66532daf..7a71d064 100644 --- a/mtce/src/hostw/hostwInit.cpp +++ b/mtce/src/hostw/hostwInit.cpp @@ -104,31 +104,56 @@ int hostw_process_config ( void * user, { config_ptr->hostwd_failure_threshold = atoi(value); config_ptr->mask |= CONFIG_HOSTWD_FAILURE_THRESHOLD ; + ilog("Quorum Thld : %d", config_ptr->hostwd_failure_threshold); } else if (MATCH("config", "hostwd_reboot_on_err")) { config_ptr->hostwd_reboot_on_err = atoi(value); config_ptr->mask |= CONFIG_HOSTWD_REBOOT ; + ilog("Quorum Loss : %s", + config_ptr->hostwd_reboot_on_err ? "Reboot & Log" : "Log Only"); } else if (MATCH("config", "hostwd_use_kern_wd")) { config_ptr->hostwd_use_kern_wd = atoi(value); config_ptr->mask |= CONFIG_HOSTWD_USE_KERN_WD ; + ilog("Use Kern Wd : %s", config_ptr->hostwd_use_kern_wd ? "Yes":"No"); } else if (MATCH("config", "hostwd_console_path")) { config_ptr->hostwd_console_path = strdup(value); config_ptr->mask |= CONFIG_HOSTWD_CONSOLE_PATH ; - } - else if (MATCH("timeouts", "kernwd_update_period")) - { - config_ptr->kernwd_update_period = atoi(value); - config_ptr->mask |= CONFIG_KERNWD_UPDATE_PERIOD ; + ilog("WD Console : %s", config_ptr->hostwd_console_path); } else if (MATCH("config", "hostwd_update_period")) /* in pmond.conf file */ { config_ptr->hostwd_update_period = atoi(value); config_ptr->mask |= CONFIG_HOSTWD_UPDATE_PERIOD ; + ilog("Quorum Rate : %d secs", config_ptr->hostwd_update_period); + } + + /* Timeout config options */ + + /* kernwd_update_period is how many seconds the watchdog can run without + * petting it before timeout and kernel reboot. + * The timeout for controller nodes is loaded with the stall detection + * value compared to other nodes with the legacy value. + */ + else if (hostw_ctrl.nodetype & CONTROLLER_TYPE) + { + if (MATCH("timeouts", "kernwd_update_period_stall_detect")) + { + hostw_ctrl.kernwd_update_period = atoi(value); + config_ptr->mask |= CONFIG_KERNWD_UPDATE_PERIOD ; + } + } + else + { + if (MATCH("timeouts", "kernwd_update_period")) + { + hostw_ctrl.kernwd_update_period = atoi(value); + config_ptr->mask |= CONFIG_KERNWD_UPDATE_PERIOD ; + } } return (PASS); } @@ -186,12 +211,10 @@ int socket_init ( void ) int daemon_init ( string iface, string nodetype_str ) { int rc = PASS ; - hostw_ctrl_type* ctrl_ptr = get_ctrl_ptr(); UNUSED(iface); - UNUSED(nodetype_str); /* init the control struct */ - memset(ctrl_ptr, 0, sizeof(hostw_ctrl_type)); + memset(&hostw_ctrl, 0, sizeof(hostw_ctrl_type)); if (daemon_files_init() != PASS) { @@ -206,6 +229,12 @@ int daemon_init ( string iface, string nodetype_str ) return ( FAIL_SIGNAL_INIT ); } + /* convert node type to bit field integer */ + hostw_ctrl.nodetype = get_host_function_mask ( nodetype_str ) ; + ilog ("Node Type : %s (0x%x)\n", + nodetype_str.c_str(), + hostw_ctrl.nodetype); + /************************************************************************ * There is no point continuing with init ; i.e. running daemon_configure, * initializing sockets and trying to query for an ip address until the @@ -262,39 +291,37 @@ void daemon_service_run ( void ) */ static int kernel_watchdog_init ( void ) { - hostw_ctrl_type * ctrl_ptr = get_ctrl_ptr(); daemon_config_type * config_ptr = daemon_get_cfg_ptr(); /* open the watchdog */ if ( (config_ptr->hostwd_use_kern_wd == 0) || - (config_ptr->kernwd_update_period < HOSTW_MIN_KERN_UPDATE_PERIOD)) + (hostw_ctrl.kernwd_update_period < HOSTW_MIN_KERN_UPDATE_PERIOD)) { /* config file says don't use watchdog, or used too small a period */ + wlog("Watchdog NOT enabled ; %s", + config_ptr->hostwd_use_kern_wd ? + "kernwd_update_period too small": + "hostwd_use_kern_wd=0"); return PASS; } - ilog ("Opening kernel watchdog device\n"); - ctrl_ptr->watchdog = open("/dev/watchdog", O_WRONLY); - if (0 >= ctrl_ptr->watchdog) + hostw_ctrl.watchdog = open("/dev/watchdog", O_WRONLY); + if (0 >= hostw_ctrl.watchdog) { elog("Could not open kernel watchdog\n"); return FAIL; } /* set watchdog timeout (in seconds) */ - ilog ("Setting kernel watchdog options - kernel timeout after %d seconds\n", - config_ptr->kernwd_update_period); - if (ioctl(ctrl_ptr->watchdog, WDIOC_SETTIMEOUT, &config_ptr->kernwd_update_period)) + if (ioctl(hostw_ctrl.watchdog, WDIOC_SETTIMEOUT, &hostw_ctrl.kernwd_update_period)) { - elog ("Error setting watchdog options -- closing watchdog\n") + elog ("Failed to enable watchdog") kernel_watchdog_close(); return FAIL; } - - /* do initial keep alive */ - ilog ("Watchdog options set\n"); - kernel_watchdog_pet(); + ilog ("Kernel watchdog enabled with %d second timeout", + hostw_ctrl.kernwd_update_period); return PASS; } diff --git a/mtce/src/hostw/scripts/hostwd.conf b/mtce/src/hostw/scripts/hostwd.conf index 9b487356..3b66927d 100755 --- a/mtce/src/hostw/scripts/hostwd.conf +++ b/mtce/src/hostw/scripts/hostwd.conf @@ -10,3 +10,6 @@ hostwd_console_path = /dev/console ; console on which to log extreme events, lik kernwd_update_period = 300 ; timeout until kernel resets system due to dead ; hostwd process (kernel watchdog) +kernwd_update_period_stall_detect = 180; kernel watchdog timeout introduced to + ; detect and react to excessive process + ; stalls.