From 04055390fa644060b41533498b98bfa2510250cc Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Sun, 13 May 2018 17:53:32 -0400 Subject: [PATCH] pmond: add support for no script label in conf files Many new services being added to our system are no longer accompanied by an init script; they ship only a service file. Despite the migration from sysvinit to systemd, pmond still requires process conf files to provide a script label. This update removes that dependency. Instead, pmond will use the service or script label to find the most appropriate process failure recovery method while handling the omission of either, but not both, of the service and script labels. The change is to first search for a service file that corresponds with the service label in the conf file. If the service label does not exist then the script label is looked at. If the basename of the script has a corresponding service file then use it. If no service file is found then the fully pathed script is searched for. If no script file is found then the process monitor errors out. This update also makes an improvement to how pmond deals with the absence of the hostw process. The current code base blocks startup if it cannot connect to the hostw process. This update implements host watchdog socket failure auto recovery while continuing to monitor processes. With this update, if the host watchdog process is restarted or is not running then pmond will continue to monitor processes while periodically retrying the connection, reconnecting once the host watchdog is available again. 
Change-Id: Icf27090d4d00954195b0ac931474587c67341207 Signed-off-by: Jack Ding --- mtce-common/cgts-mtce-common-1.0/pmon/pmon.h | 9 +- .../cgts-mtce-common-1.0/pmon/pmonHdlr.cpp | 129 +++++++++++++----- .../cgts-mtce-common-1.0/pmon/pmonInit.cpp | 25 +--- .../cgts-mtce-common-1.0/pmon/pmonMsg.cpp | 79 +++++------ .../pmon/scripts/acpid.conf | 1 - .../pmon/scripts/nslcd.conf | 1 - .../pmon/scripts/ntpd.conf | 1 - .../pmon/scripts/sshd.conf | 1 - .../pmon/scripts/syslog-ng.conf | 1 - 9 files changed, 144 insertions(+), 103 deletions(-) diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/pmon.h b/mtce-common/cgts-mtce-common-1.0/pmon/pmon.h index 69fbc24e..c7a03a14 100755 --- a/mtce-common/cgts-mtce-common-1.0/pmon/pmon.h +++ b/mtce-common/cgts-mtce-common-1.0/pmon/pmon.h @@ -194,6 +194,9 @@ typedef enum PMOND_RECOVERY_METHOD__SYSTEMD = 1, } recovery_method_type ; +#define SYSTEMD_SERVICE_FILE_DIR1 ((const char *)"/etc/systemd/system") +#define SYSTEMD_SERVICE_FILE_DIR2 ((const char *)"/usr/lib/systemd/system") + /* * Used to mark a configured process * This aids in freeing duped memory over a process re-config @@ -267,7 +270,7 @@ int setup_signal_handler ( int rt_signal_num ); /* Monitored Process Config Bit Mask */ #define CONF_PROCESS (0x0001) -#define CONF_SCRIPT (0x0002) +#define CONF_RECOVERY (0x0002) #define CONF_STYLE (0x0004) #define CONF_PIDFILE (0x0008) #define CONF_RESTARTS (0x0010) @@ -286,8 +289,8 @@ int setup_signal_handler ( int rt_signal_num ); /* Monitored Passive Process Config Mask */ #define CONF_MASK (CONF_PROCESS | \ - CONF_SCRIPT | \ CONF_STYLE | \ + CONF_RECOVERY | \ CONF_PIDFILE | \ CONF_SEVERITY | \ CONF_RESTARTS | \ @@ -302,8 +305,8 @@ int setup_signal_handler ( int rt_signal_num ); /* Monitored Status Process Config Mask */ #define CONF_STATUS_MON_MASK (CONF_PROCESS | \ - CONF_SCRIPT | \ CONF_STYLE | \ + CONF_RECOVERY | \ CONF_SEVERITY | \ CONF_RESTARTS | \ CONF_INTERVAL | \ diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/pmonHdlr.cpp 
b/mtce-common/cgts-mtce-common-1.0/pmon/pmonHdlr.cpp index eb1b56a1..daf31540 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/pmonHdlr.cpp +++ b/mtce-common/cgts-mtce-common-1.0/pmon/pmonHdlr.cpp @@ -524,6 +524,43 @@ void pmon_timer_handler ( int sig, siginfo_t *si, void *uc) } } +/**************************************************************************** + * + * Name : service_file_exists + * + * Description: Look in some well known places for the specified service file. + * + * Returns : Return true if the specified service file is found. + * + * Updates : If the service file is found then update the supplied + * character string buffer with the full path/name of that + * service file. + * + ****************************************************************************/ +bool service_file_exists ( string service_filename, + char * path_n_name_ptr, + int max_len ) +{ + /* load the name of the service file */ + snprintf ( path_n_name_ptr, max_len, "%s/%s", + SYSTEMD_SERVICE_FILE_DIR1, + service_filename.data()); + if (( path_n_name_ptr ) && (strnlen ( path_n_name_ptr, max_len ))) + { + if ( daemon_is_file_present ( path_n_name_ptr ) == true ) + return true ; + } + snprintf ( path_n_name_ptr, max_len, "%s/%s", + SYSTEMD_SERVICE_FILE_DIR2, + service_filename.data()); + if (( path_n_name_ptr ) && ( strnlen ( path_n_name_ptr, max_len ))) + { + if ( daemon_is_file_present ( path_n_name_ptr ) == true ) + return true ; + } + return false ; +} + /***************************************************************************** * * Name : process_config_load @@ -533,8 +570,8 @@ void pmon_timer_handler ( int sig, siginfo_t *si, void *uc) *****************************************************************************/ int process_config_load (process_config_type * pc_ptr, const char * config_file_ptr ) { - char service_name_buf [_MAX_LEN_] ; - memset (service_name_buf,0, sizeof(service_name_buf)); + char recovery_method_buf [_MAX_LEN_] ; + memset (recovery_method_buf,0, 
sizeof(recovery_method_buf)); if ( _pmon_ctrl_ptr->processes >= MAX_PROCESSES ) { @@ -566,40 +603,59 @@ int process_config_load (process_config_type * pc_ptr, const char * config_file_ pc_ptr->startuptime = PMON_MIN_START_DELAY ; } - /* default recovery method to process init script */ - snprintf ( &service_name_buf[0], _MAX_LEN_, "%s", pc_ptr->script ); + /* Many process conf files came from a sysvinit origin and might not + * have a service file label. Account for that in the following + * load of recovery_method_buf. + * Accept a script name if the service name is missing. */ + bool recovery_method_found = false ; - /* Print error logs if there is no recovery method present for this service/process */ - if ( _pmon_ctrl_ptr->recovery_method == PMOND_RECOVERY_METHOD__SYSTEMD ) + /* look for the service file */ + if ( pc_ptr->service ) { - /* If the config file does not specify a service name - * then the service name defaults to the process name */ - if ( ! pc_ptr->service ) - { - snprintf ( &service_name_buf[0], _MAX_LEN_, "%s/%s.service", SYSTEMD_SERVICE_FILE_DIR, pc_ptr->process ); - if ( daemon_is_file_present ( service_name_buf ) == false ) - { - if ( daemon_is_file_present ( pc_ptr->script ) == false ) - { - /* print a log if we have no recovery method */ - wlog ("%s has no recovery method\n", pc_ptr->process ); - wlog ("... neither %s nor %s exist\n", service_name_buf, pc_ptr->script ); - } - } - } - else - { - snprintf ( &service_name_buf[0], _MAX_LEN_, "%s/%s.service", SYSTEMD_SERVICE_FILE_DIR, pc_ptr->service ); - if ( daemon_is_file_present ( service_name_buf ) == false ) - { - /* print a log if we have no recovery method */ - wlog ("%s service has no recovery method\n", pc_ptr->service ); - wlog ("... 
%s does not exist\n", service_name_buf ); - } - } + string service = pc_ptr->service ; + if ( service.find(".service") == string::npos ) + service.append(".service"); + if ( service_file_exists(service, &recovery_method_buf[0], _MAX_LEN_) == true ) + recovery_method_found = true ; + } + else if ( pc_ptr->script ) + { + string script = basename((char*)pc_ptr->script); + if ( script.find(".service") == string::npos ) + script.append(".service"); + if ( service_file_exists(script, &recovery_method_buf[0], _MAX_LEN_) == true ) + recovery_method_found = true ; + else + { + /* resort to the script file only */ + /* load the name of the process init script */ + snprintf ( &recovery_method_buf[0], _MAX_LEN_, "%s", pc_ptr->script ); + if ( daemon_is_file_present ( recovery_method_buf ) == true ) + { + recovery_method_found = true ; + } + else + { + wlog ("%s has script but not found (%s)\n", + pc_ptr->process, recovery_method_buf ); + } + } + } + else + { + /* print a log if we have no recovery method */ + wlog ("%s has no recovery method ; process not monitored\n", pc_ptr->process ); + wlog ("... conf file has no 'service' or 'script' recovery entry\n"); + return (FAIL_NOT_FOUND); } - update_config_option ( &pc_ptr->recovery_method , service_name_buf ); + if ( recovery_method_found == false ) + { + wlog ("%s has no recovery method found ; process not monitored\n", pc_ptr->process ); + return (FAIL_NOT_FOUND); + } + + update_config_option ( &pc_ptr->recovery_method , recovery_method_buf ); if ( !strcmp ( pc_ptr->mode, "status" ) ) { @@ -710,7 +766,7 @@ int process_config_load (process_config_type * pc_ptr, const char * config_file_ * that subfunction init is complete */ ilog ("%7s Def : %-30s %-8s - %s (%s)\n", pc_ptr->mode, pc_ptr->process, - pc_ptr->ignore ? "ignored" : pc_ptr->severity, service_name_buf, + pc_ptr->ignore ? 
"ignored" : pc_ptr->severity, recovery_method_buf, pc_ptr->subfunction); /* defer subfunction processes to the FSM to get enabled */ pc_ptr->stage = PMON_STAGE__POLLING ; @@ -724,7 +780,7 @@ int process_config_load (process_config_type * pc_ptr, const char * config_file_ ilog ("%7s Mon : %-30s %-8s - %s\n", pc_ptr->mode, pc_ptr->process, - pc_ptr->ignore ? "ignored" : pc_ptr->severity, service_name_buf); + pc_ptr->ignore ? "ignored" : pc_ptr->severity, recovery_method_buf); pc_ptr->stage = PMON_STAGE__MANAGE ; } // mem_log_process ( pc_ptr ); @@ -1870,6 +1926,11 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) if ( pmonTimer_hostwd.ring == true ) { + /* inservice recovery from hostw connection failures */ + if ( sock_ptr->hostwd_sock == 0 ) + { + hostwd_port_init(); + } if ( ctrl_ptr->event_mode == true ) { pmon_send_hostwd ( ); diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/pmonInit.cpp b/mtce-common/cgts-mtce-common-1.0/pmon/pmonInit.cpp index 6837100b..dd87c9e7 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/pmonInit.cpp +++ b/mtce-common/cgts-mtce-common-1.0/pmon/pmonInit.cpp @@ -116,14 +116,15 @@ int pmon_process_config ( void * user, } if (MATCH("process", "service")) { + ptr->mask |= CONF_RECOVERY ; ptr->service = strdup(value); dlog1 ("Service : %s\n", ptr->service ); rc = PASS ; } else if (MATCH("process", "script")) { - ptr->mask |= CONF_SCRIPT ; - ptr->status_mask |= CONF_SCRIPT ; + ptr->mask |= CONF_RECOVERY ; + ptr->status_mask |= CONF_RECOVERY ; ptr->script = strdup(value); dlog1 ("Script : %s\n", ptr->script ); } @@ -423,7 +424,7 @@ int socket_init ( void ) * host watchdog process */ if ( rc == PASS ) { - rc = hostwd_port_init ( ); + hostwd_port_init ( ); } pmon_inbox_init ( ); @@ -500,22 +501,8 @@ int daemon_init ( string iface, string nodetype_str ) pmon_timer_init (); } - /* - * Setup the recovery method based on the O/S - * - * WRL - SYSVINIT - * CENTOS - SYSTEMD - * - **/ - if ( daemon_is_file_present ( CENTOS_RELEASE_FILE ) ) - { - 
pmon_ctrl.recovery_method = PMOND_RECOVERY_METHOD__SYSTEMD ; - pmon_ctrl.system_state = get_system_state(); - } - else - { - pmon_ctrl.recovery_method = PMOND_RECOVERY_METHOD__SYSVINIT ; - } + pmon_ctrl.recovery_method = PMOND_RECOVERY_METHOD__SYSTEMD ; + pmon_ctrl.system_state = get_system_state(); ilog ("Recovery Method: %s\n", pmon_ctrl.recovery_method ? "systemd via systemctl" : "sysvinit via script" ); return (rc); } diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/pmonMsg.cpp b/mtce-common/cgts-mtce-common-1.0/pmon/pmonMsg.cpp index 4b388b01..8e0daf6a 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/pmonMsg.cpp +++ b/mtce-common/cgts-mtce-common-1.0/pmon/pmonMsg.cpp @@ -78,53 +78,42 @@ int pulse_port_init ( void ) } /* Setup the Unix Host Watchdog Socket */ -#define _THROTTLE_LEVEL (5) int hostwd_port_init ( void ) { - int rc = FAIL ; - int fail_count = 0 ; memset(&pmon_sock.hostwd_addr, 0, sizeof(pmon_sock.hostwd_addr)); - while (rc == FAIL) + pmon_sock.hostwd_sock = socket(AF_UNIX, SOCK_DGRAM, 0); + + if (pmon_sock.hostwd_sock <= 0) { - int len; - int connected; - pmon_sock.hostwd_sock = socket(AF_UNIX, SOCK_DGRAM, 0); - - if (pmon_sock.hostwd_sock <= 0) { - if ( fail_count++ > _THROTTLE_LEVEL ) { - wlog("Could not connect to create hostwd socket - will retry\n"); - } - sleep(1); - continue; - } - - /* Set up the socket address */ - memset (&pmon_sock.hostwd_addr, 0, sizeof(pmon_sock.hostwd_addr)); - pmon_sock.hostwd_addr.sun_family = AF_UNIX; - - /* Unix abstract namespace takes a string that starts with a NULL - * as the identifier. 
Thus, we need a pointer to byte[1] of the - * sockaddr_un.sun_path (a char array) - */ - strncpy( &(pmon_sock.hostwd_addr.sun_path[1]), - HOSTW_UNIX_SOCKNAME, - UNIX_PATH_MAX-1); - len = sizeof(pmon_sock.hostwd_addr); - - connected = connect( pmon_sock.hostwd_sock, (sockaddr*) &pmon_sock.hostwd_addr, - len); - if (connected == -1) { - if ( fail_count++ > _THROTTLE_LEVEL ) { - wlog("Could not connect to hostwd port - will retry\n"); - } - close(pmon_sock.hostwd_sock); - pmon_sock.hostwd_sock = 0; - sleep(1); - } else { - rc = PASS; - } + wlog("Could not connect to create hostwd socket - will retry\n"); + pmon_sock.hostwd_sock = 0 ; + return (FAIL_SOCKET_CREATE); } - return (rc); + + /* Set up the socket address */ + memset (&pmon_sock.hostwd_addr, 0, sizeof(pmon_sock.hostwd_addr)); + pmon_sock.hostwd_addr.sun_family = AF_UNIX; + + /* Unix abstract namespace takes a string that starts with a NULL + * as the identifier. Thus, we need a pointer to byte[1] of the + * sockaddr_un.sun_path (a char array) + */ + strncpy( &(pmon_sock.hostwd_addr.sun_path[1]), + HOSTW_UNIX_SOCKNAME, + UNIX_PATH_MAX-1); + int len = sizeof(pmon_sock.hostwd_addr); + int connected = connect( pmon_sock.hostwd_sock, (sockaddr*) &pmon_sock.hostwd_addr, + len); + if (connected == -1) + { + wlog("Could not connect to hostwd port - will retry\n"); + if ( pmon_sock.hostwd_sock ) + close(pmon_sock.hostwd_sock); + pmon_sock.hostwd_sock = 0; + return (FAIL_CONNECT); + } + ilog ("connected to host watchdog\n"); + return (PASS); } /* Build a message for host watchdog, and send it */ @@ -174,7 +163,13 @@ int pmon_send_hostwd ( void ) { elog("Error sending message to host watchdog -- error %d (%s)\n", errno, strerror(errno)); + if ( pmon_sock.hostwd_sock ) + { + close(pmon_sock.hostwd_sock); + pmon_sock.hostwd_sock = 0; + } return (FAIL); + } } return (FAIL); diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/acpid.conf b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/acpid.conf index 
7d5740c4..e1c88cfc 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/acpid.conf +++ b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/acpid.conf @@ -2,7 +2,6 @@ process = acpid service = acpid pidfile = /var/run/acpid.pid -script = /etc/init.d/acpid style = lsb ; ocf or lsb severity = minor ; minor, major, critical restarts = 3 ; restart retries before error assertion diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/nslcd.conf b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/nslcd.conf index ad9dd838..63cc2f13 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/nslcd.conf +++ b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/nslcd.conf @@ -2,7 +2,6 @@ process = nslcd service = nslcd pidfile = /var/run/nslcd/nslcd.pid -script = /etc/init.d/openldap style = lsb ; ocf or lsb severity = major ; minor, major, critical restarts = 3 ; restart retries before error assertion diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/ntpd.conf b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/ntpd.conf index 568a89ee..524573b7 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/ntpd.conf +++ b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/ntpd.conf @@ -2,7 +2,6 @@ process = ntpd service = ntpd pidfile = /var/run/ntp.pid -script = /etc/init.d/ntpd style = lsb ; ocf or lsb severity = minor ; minor, major, critical restarts = 0 ; restart retries before error assertion diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/sshd.conf b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/sshd.conf index 80c507a7..dfa3a21c 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/sshd.conf +++ b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/sshd.conf @@ -2,7 +2,6 @@ process = sshd service = sshd pidfile = /var/run/sshd.pid -script = /etc/init.d/sshd style = lsb ; ocf or lsb severity = minor ; minor, major, critical restarts = 10 ; restart retries before error assertion diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/syslog-ng.conf 
b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/syslog-ng.conf index 5bb48129..9d78fcad 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/scripts/syslog-ng.conf +++ b/mtce-common/cgts-mtce-common-1.0/pmon/scripts/syslog-ng.conf @@ -2,7 +2,6 @@ process = syslog-ng service = syslog-ng pidfile = /var/run/syslog-ng/syslog-ng.pid -script = /etc/init.d/syslog style = lsb ; ocf or lsb severity = minor ; minor, major, critical restarts = 2 ; restart retries before error assertion