From bd9e560d4bed07b4f4f83ce8d5af3f1975c6a07a Mon Sep 17 00:00:00 2001 From: Davi Frossard Date: Thu, 11 Aug 2022 15:59:17 -0400 Subject: [PATCH] Remove sm-watchdog service since NFS is now stable sm-watchdog was introduced as a workaround for NFS hangs. A clean fix for those hangs has since been provided, but sm-watchdog was never removed. Test plan: [centos] build, install and unlock. [debian] build, install and unlock. Story: 2010087 Task: 46007 Signed-off-by: Davi Frossard Change-Id: I29fffff4e8982dc504f104f49c6586f7c74527fb --- .zuul.yaml | 1 - devstack/lib/ha | 22 - service-mgmt/sm-common/Makefile | 4 - service-mgmt/sm-common/centos/sm-common.spec | 15 - .../sm-common/debian/deb_folder/rules | 3 - .../debian/deb_folder/sm-common-dev.install | 1 - .../debian/deb_folder/sm-common-libs.dirs | 2 - .../debian/deb_folder/sm-common-libs.install | 1 - .../debian/deb_folder/sm-common.install | 4 - .../deb_folder/systemd/00-sm-common.preset | 1 - .../sm-common/opensuse/sm-common.spec | 21 - service-mgmt/sm-common/scripts/Makefile | 2 +- service-mgmt/sm-common/scripts/sm-watchdog | 131 ---- .../sm-common/scripts/sm-watchdog.conf | 15 - .../sm-common/scripts/sm-watchdog.service | 15 - service-mgmt/sm-common/src/Makefile | 19 +- service-mgmt/sm-common/src/sm_types.h | 3 - service-mgmt/sm-common/src/sm_utils.c | 77 --- service-mgmt/sm-common/src/sm_utils.h | 12 - service-mgmt/sm-common/src/sm_watchdog_main.c | 49 -- .../sm-common/src/sm_watchdog_module.c | 247 ------- .../sm-common/src/sm_watchdog_module.h | 31 - service-mgmt/sm-common/src/sm_watchdog_nfs.c | 608 ------------------ service-mgmt/sm-common/src/sm_watchdog_nfs.h | 37 -- .../sm-common/src/sm_watchdog_process.c | 241 ------- .../sm-common/src/sm_watchdog_process.h | 25 - service-mgmt/sm/scripts/sm.service | 2 +- service-mgmt/sm/scripts/sm.troubleshoot | 3 - service-mgmt/sm/src/sm_service_action.c | 12 - .../sm/src/sm_service_group_notification.c | 10 - 30 files changed, 4 insertions(+), 1610 deletions(-) delete mode 
100755 service-mgmt/sm-common/scripts/sm-watchdog delete mode 100644 service-mgmt/sm-common/scripts/sm-watchdog.conf delete mode 100644 service-mgmt/sm-common/scripts/sm-watchdog.service delete mode 100644 service-mgmt/sm-common/src/sm_watchdog_main.c delete mode 100644 service-mgmt/sm-common/src/sm_watchdog_module.c delete mode 100644 service-mgmt/sm-common/src/sm_watchdog_module.h delete mode 100644 service-mgmt/sm-common/src/sm_watchdog_nfs.c delete mode 100644 service-mgmt/sm-common/src/sm_watchdog_nfs.h delete mode 100644 service-mgmt/sm-common/src/sm_watchdog_process.c delete mode 100644 service-mgmt/sm-common/src/sm_watchdog_process.h diff --git a/.zuul.yaml b/.zuul.yaml index 4631e0d3..eda83670 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -44,7 +44,6 @@ sm-tools: true sm-api: true sm-eru: true - sm-watchdog: true mysql: false postgresql: true tls-proxy: false diff --git a/devstack/lib/ha b/devstack/lib/ha index 12ee724b..a50b3dcd 100644 --- a/devstack/lib/ha +++ b/devstack/lib/ha @@ -156,14 +156,9 @@ function cleanup_sm_common { $STX_INST_DIR/lib64/libsm_common.so.* \ $STX_BIN_DIR/sm-eru \ $STX_BIN_DIR/sm-eru-dump \ - $STX_BIN_DIR/sm-watchdog \ - $STX_SM_VAR_DIR/watchdog/modules/libsm_watchdog_nfs.so.* \ $STX_SYSCONFDIR/systemd/system/sm-eru.service \ - $STX_SYSCONFDIR/systemd/system/sm-watchdog.service \ $STX_SYSCONFDIR/pmon.d/sm-eru.conf \ - $STX_SYSCONFDIR/pmon.d/sm-watchdog.conf \ $STX_SYSCONFDIR/init.d/sm-eru \ - $STX_SYSCONFDIR/init.d/sm-watchdog \ /etc/ld.so.conf.d/stx-ha.conf popd @@ -190,7 +185,6 @@ function configure_ha { if is_service_enabled sm-common; then config_eru - config_watchdog fi if is_service_enabled sm-daemon; then @@ -215,12 +209,6 @@ function config_eru { iniset -sudo ${STX_SYSCONFDIR}/systemd/system/devstack@sm-eru.service "Service" "PIDFile" "/var/run/sm-eru.pid" } -function config_watchdog { - sudo sed -i "s%SM_WATCHDOG=\"/usr/bin/\${SM_WATCHDOG_NAME}\"%SM_WATCHDOG=\"$STX_INST_DIR/bin/\${SM_WATCHDOG_NAME}\"%" 
$STX_SYSCONFDIR/init.d/sm-watchdog - iniset -sudo ${STX_SYSCONFDIR}/systemd/system/devstack@sm-watchdog.service "Service" "Type" "forking" - iniset -sudo ${STX_SYSCONFDIR}/systemd/system/devstack@sm-watchdog.service "Service" "PIDFile" "/var/run/sm-watchdog.pid" -} - function create_sm_accounts { create_service_user "smapi" get_or_create_service "smapi" "servicemanagement" "Service Management" @@ -340,15 +328,11 @@ function install_sm_common { install_sm_common_libs - sudo install -m 0755 -p -D -t $STX_SM_VAR_DIR/watchdog/modules src/libsm_watchdog_nfs.so.${STX_SM_COMMON_VERSION} - sudo cp -P src/libsm_watchdog_nfs.so src/libsm_watchdog_nfs.so.${STX_SM_COMMON_VERSION%%.*} $STX_SM_VAR_DIR/watchdog/modules - # scripts/ (cd scripts; sudo make DEST_DIR= UNIT_DIR=$STX_SYSCONFDIR/systemd/system install) sudo install -m 750 -p -D src/sm_eru $STX_BIN_DIR/sm-eru sudo install -m 750 -p -D src/sm_eru_dump $STX_BIN_DIR/sm-eru-dump - sudo install -m 750 -p -D src/sm_watchdog $STX_BIN_DIR/sm-watchdog echo $STX_INST_DIR/lib64 | sudo tee /etc/ld.so.conf.d/stx-ha.conf sudo ldconfig @@ -411,10 +395,6 @@ function start_eru { run_process sm-eru "${STX_SYSCONFDIR}/init.d/sm-eru start" root root } -function start_watchdog { - run_process sm-watchdog "${STX_SYSCONFDIR}/init.d/sm-watchdog start" root root -} - function start_ha { if is_service_enabled sm-daemon; then start_sm @@ -426,14 +406,12 @@ function start_ha { if is_service_enabled sm-common; then start_eru - start_watchdog fi } function stop_ha { if is_service_enabled sm-common; then stop_process sm-eru - stop_process sm-watchdog fi if is_service_enabled sm-api; then diff --git a/service-mgmt/sm-common/Makefile b/service-mgmt/sm-common/Makefile index 2ed37b77..02fe0e1a 100644 --- a/service-mgmt/sm-common/Makefile +++ b/service-mgmt/sm-common/Makefile @@ -16,14 +16,10 @@ install: install -m 750 -d $(DEST_DIR)/usr/bin install -m 750 -p -D $(BUILDSUBDIR)/src/sm_eru $(DEST_DIR)/$(BIN_DIR)/sm-eru install -m 750 -p -D 
$(BUILDSUBDIR)/src/sm_eru_dump $(DEST_DIR)/$(BIN_DIR)/sm-eru-dump - install -m 750 -p -D $(BUILDSUBDIR)/src/sm_watchdog $(DEST_DIR)/$(BIN_DIR)/sm-watchdog install -m 644 -p -D $(BUILDSUBDIR)/scripts/sm-eru.service $(DEST_DIR)/$(UNIT_DIR)/sm-eru.service - install -m 644 -p -D $(BUILDSUBDIR)/scripts/sm-watchdog.service $(DEST_DIR)/$(UNIT_DIR)/sm-watchdog.service install -m 750 -d $(DEST_DIR)/$(ETC_DIR)/pmon.d install -m 640 -p -D $(BUILDSUBDIR)/scripts/sm-eru.conf $(DEST_DIR)/$(ETC_DIR)/pmon.d/sm-eru.conf - install -m 640 -p -D $(BUILDSUBDIR)/scripts/sm-watchdog.conf $(DEST_DIR)/$(ETC_DIR)/pmon.d/sm-watchdog.conf install -m 750 -p -D $(BUILDSUBDIR)/scripts/sm-eru $(DEST_DIR)/$(ETC_DIR)/init.d/sm-eru - install -m 750 -p -D $(BUILDSUBDIR)/scripts/sm-watchdog $(DEST_DIR)/$(ETC_DIR)/init.d/sm-watchdog clean: @( cd src; make clean ) diff --git a/service-mgmt/sm-common/centos/sm-common.spec b/service-mgmt/sm-common/centos/sm-common.spec index 32be8df4..c796a1ad 100644 --- a/service-mgmt/sm-common/centos/sm-common.spec +++ b/service-mgmt/sm-common/centos/sm-common.spec @@ -91,9 +91,6 @@ MAJOR=`echo $VER | awk -F . '{print $1}'` MINOR=`echo $VER | awk -F . 
'{print $2}'` make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_libdir} INC_DIR=%{_includedir} BUILDSUBDIR=%{_buildsubdir} VER=$VER VER_MJR=$MAJOR install -%post -/usr/bin/systemctl enable sm-watchdog.service >/dev/null 2>&1 - %post -n sm-eru /usr/bin/systemctl enable sm-eru.service >/dev/null 2>&1 @@ -101,10 +98,6 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li %files %license LICENSE %defattr(-,root,root,-) -/etc/init.d/sm-watchdog -/etc/pmon.d/sm-watchdog.conf -/usr/bin/sm-watchdog -/usr/lib/systemd/system/sm-watchdog.service #%{_unitdir}/* #%{_bindir}/* @@ -113,10 +106,6 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li %files libs %{_libdir}/*.so.* -%dir "/var/lib/sm" -%dir "/var/lib/sm/watchdog" -%dir "/var/lib/sm/watchdog/modules" -/var/lib/sm/watchdog/modules/*.so.* %files -n sm-eru %defattr(-,root,root,-) @@ -135,18 +124,14 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li #"/usr/lib64/.debug/libsm_common.so.1.0.0" #%dir "/usr/bin/.debug" #"/usr/bin/.debug/sm-eru-dump" -#"/usr/bin/.debug/sm-watchdog" #"/usr/bin/.debug/sm-eru" #%dir "/usr/src/debug/sm-common" #%dir "/usr/src/debug/sm-common/1.0.0-r7" #%dir "/usr/src/debug/sm-common/1.0.0-r7/src" #/usr/src/debug/sm-common/1.0.0-r7/src/*.h #/usr/src/debug/sm-common/1.0.0-r7/src/*.c -#%dir "/var/lib/sm/watchdog/modules/.debug" -#"/var/lib/sm/watchdog/modules/.debug/libsm_watchdog_nfs.so.1.0.0" %files dev %defattr(-,root,root,-) %{_includedir}/* %{_libdir}/*.so -/var/lib/sm/watchdog/modules/libsm_watchdog_nfs.so diff --git a/service-mgmt/sm-common/debian/deb_folder/rules b/service-mgmt/sm-common/debian/deb_folder/rules index bc09ebeb..f3f89f27 100755 --- a/service-mgmt/sm-common/debian/deb_folder/rules +++ b/service-mgmt/sm-common/debian/deb_folder/rules @@ -23,11 +23,8 @@ override_dh_auto_install: # Prevents dh_fixperms from changing the permissions defined in the makefiles 
override_dh_fixperms: dh_fixperms \ - -Xsm-watchdog* \ -Xlibsm_common.so.* \ - -Xlibsm_watchdog_nfs.so.* \ -Xsm-eru* override_dh_installsystemd: - dh_installsystemd -psm-common sm-watchdog.service dh_installsystemd -psm-eru sm-eru.service diff --git a/service-mgmt/sm-common/debian/deb_folder/sm-common-dev.install b/service-mgmt/sm-common/debian/deb_folder/sm-common-dev.install index 64a86973..1537de11 100644 --- a/service-mgmt/sm-common/debian/deb_folder/sm-common-dev.install +++ b/service-mgmt/sm-common/debian/deb_folder/sm-common-dev.install @@ -1,3 +1,2 @@ usr/include/* usr/lib/*.so -var/lib/sm/watchdog/modules/libsm_watchdog_nfs.so diff --git a/service-mgmt/sm-common/debian/deb_folder/sm-common-libs.dirs b/service-mgmt/sm-common/debian/deb_folder/sm-common-libs.dirs index c9f53c45..69d063f5 100644 --- a/service-mgmt/sm-common/debian/deb_folder/sm-common-libs.dirs +++ b/service-mgmt/sm-common/debian/deb_folder/sm-common-libs.dirs @@ -1,3 +1 @@ /var/lib/sm -/var/lib/sm/watchdog -/var/lib/sm/watchdog/modules diff --git a/service-mgmt/sm-common/debian/deb_folder/sm-common-libs.install b/service-mgmt/sm-common/debian/deb_folder/sm-common-libs.install index 258af2c3..093956b1 100644 --- a/service-mgmt/sm-common/debian/deb_folder/sm-common-libs.install +++ b/service-mgmt/sm-common/debian/deb_folder/sm-common-libs.install @@ -1,2 +1 @@ usr/lib/*.so.* -var/lib/sm/watchdog/modules/*.so.* diff --git a/service-mgmt/sm-common/debian/deb_folder/sm-common.install b/service-mgmt/sm-common/debian/deb_folder/sm-common.install index 3fe883c3..012f61d3 100644 --- a/service-mgmt/sm-common/debian/deb_folder/sm-common.install +++ b/service-mgmt/sm-common/debian/deb_folder/sm-common.install @@ -1,5 +1 @@ -etc/init.d/sm-watchdog -etc/pmon.d/sm-watchdog.conf -usr/bin/sm-watchdog -lib/systemd/system/sm-watchdog.service debian/systemd/00-sm-common.preset etc/systemd/system-preset diff --git a/service-mgmt/sm-common/debian/deb_folder/systemd/00-sm-common.preset 
b/service-mgmt/sm-common/debian/deb_folder/systemd/00-sm-common.preset index a5bc3dcd..e69de29b 100644 --- a/service-mgmt/sm-common/debian/deb_folder/systemd/00-sm-common.preset +++ b/service-mgmt/sm-common/debian/deb_folder/systemd/00-sm-common.preset @@ -1 +0,0 @@ -enable sm-watchdog.service diff --git a/service-mgmt/sm-common/opensuse/sm-common.spec b/service-mgmt/sm-common/opensuse/sm-common.spec index 3ced7265..dad7dd84 100644 --- a/service-mgmt/sm-common/opensuse/sm-common.spec +++ b/service-mgmt/sm-common/opensuse/sm-common.spec @@ -72,19 +72,6 @@ MAJOR=`echo $VER | awk -F . '{print $1}'` MINOR=`echo $VER | awk -F . '{print $2}'` make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_libdir} INC_DIR=%{_includedir} BUILDSUBDIR=%{_buildsubdir} VER=$VER VER_MJR=$MAJOR install -%pre -%service_add_pre sm-watchdog.service sm-watchdog.target - -%preun -%service_del_preun sm-watchdog.service sm-watchdog.target - -%post -%service_add_post sm-watchdog.service sm-watchdog.target -/usr/bin/systemctl enable sm-watchdog.service - -%postun -%service_del_postun sm-watchdog.service sm-watchdog.target - %pre -n sm-eru %service_add_pre sm-eru.service sm-eru.target @@ -108,17 +95,10 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li %files %license LICENSE %defattr(-,root,root,-) -%{_sysconfdir}/init.d/sm-watchdog -%config %{_sysconfdir}/pmon.d/sm-watchdog.conf -%{_bindir}/sm-watchdog -%{_unitdir}/sm-watchdog.service %files libs %{_libdir}/*.so.* %dir %{_sharedstatedir}/sm -%dir %{_sharedstatedir}/sm/watchdog -%dir %{_sharedstatedir}/sm/watchdog/modules -%{_sharedstatedir}/sm/watchdog/modules/*.so.* %files -n sm-eru %defattr(-,root,root,-) @@ -134,6 +114,5 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li %defattr(-,root,root,-) %{_includedir}/* %{_libdir}/*.so -%{_sharedstatedir}/sm/watchdog/modules/libsm_watchdog_nfs.so %changelog diff --git a/service-mgmt/sm-common/scripts/Makefile 
b/service-mgmt/sm-common/scripts/Makefile index 2824793f..7656ee85 100644 --- a/service-mgmt/sm-common/scripts/Makefile +++ b/service-mgmt/sm-common/scripts/Makefile @@ -6,7 +6,7 @@ install: install -d $(DEST_DIR)$(UNIT_DIR) install -m 644 *.service $(DEST_DIR)$(UNIT_DIR) install -d $(DEST_DIR)/etc/init.d - install sm-watchdog sm-eru $(DEST_DIR)/etc/init.d + install sm-eru $(DEST_DIR)/etc/init.d install -d $(DEST_DIR)/etc/pmon.d install *.conf $(DEST_DIR)/etc/pmon.d diff --git a/service-mgmt/sm-common/scripts/sm-watchdog b/service-mgmt/sm-common/scripts/sm-watchdog deleted file mode 100755 index 1588b6c0..00000000 --- a/service-mgmt/sm-common/scripts/sm-watchdog +++ /dev/null @@ -1,131 +0,0 @@ -#! /bin/sh -# -# Copyright (c) 2014 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -# chkconfig: - 87 87 -# processname: sm-watchdog -# description: Service Management Watchdog -# -### BEGIN INIT INFO -# Description: sm-watchdog -# -# Short-Description: Service Management Watchdog -# Provides: sm-watchdog -# Required-Start: $network -# Should-Start: $syslog -# Required-Stop: $network -# Default-Start: 3 5 -# Default-Stop: 0 6 -### END INIT INFO - -. /etc/init.d/functions - -RETVAL=0 - -SM_WATCHDOG_NAME="sm-watchdog" -SM_WATCHDOG="/usr/bin/${SM_WATCHDOG_NAME}" -SM_WATCHDOG_PIDFILE="/var/run/${SM_WATCHDOG_NAME}.pid" - -if [ ! -e "${SM_WATCHDOG}" ] -then - logger "${SM_WATCHDOG} is missing" - exit 5 -fi - -PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin - -case "$1" in - start) - echo -n "Starting ${SM_WATCHDOG_NAME}: " - if [ -n "`pidof ${SM_WATCHDOG}`" ] - then - # PMOND might have restarted SM-WATCHDOG already. - RETVAL=0 - else - start-stop-daemon --start -b -x ${SM_WATCHDOG} - RETVAL=$? 
- fi - if [ ${RETVAL} -eq 0 ] - then - echo "OK" - else - echo "FAIL" - RETVAL=1 - fi - ;; - - stop) - echo -n "Stopping ${SM_WATCHDOG_NAME}: " - if [ -n "`pidof ${SM_WATCHDOG}`" ] - then - killproc ${SM_WATCHDOG} - fi - - SHUTDOWN_TIMEOUT=5 - count=0 - while [ ${count} -lt ${SHUTDOWN_TIMEOUT} ] - do - pidof ${SM_WATCHDOG} &> /dev/null - rc=$? - if [ ${rc} -eq 1 ] - then - echo "OK" - break - fi - count=`expr ${count} + 1` - sleep 1 - done - - pidof ${SM_WATCHDOG} &> /dev/null - rc=$? - if [ ${rc} -eq 0 ] - then - echo "FAIL" - RETVAL=7 - fi - - rm -f ${SM_WATCHDOG_PIDFILE} - ;; - - status) - pid=`cat ${SM_WATCHDOG_PIDFILE} 2>/dev/null` - if [ -n "${pid}" ] - then - if ps -p ${pid} &>/dev/null - then - echo "${SM_WATCHDOG_NAME} is running" - RETVAL=0 - else - echo "${SM_WATCHDOG_NAME} is not running but has pid file" - RETVAL=1 - fi - else - echo "${SM_WATCHDOG_NAME} is not running" - RETVAL=3 - fi - ;; - - restart) - $0 stop - sleep 1 - $0 start - ;; - - reload) - echo "${SM_WATCHDOG_NAME} reload" - $0 restart - ;; - - force-reload) - echo "${SM_WATCHDOG_NAME} force-reload" - $0 restart - ;; - - *) - echo "usage: $0 { start | stop | status | restart | reload | force-reload }" - ;; -esac - -exit ${RETVAL} diff --git a/service-mgmt/sm-common/scripts/sm-watchdog.conf b/service-mgmt/sm-common/scripts/sm-watchdog.conf deleted file mode 100644 index 8ae37adf..00000000 --- a/service-mgmt/sm-common/scripts/sm-watchdog.conf +++ /dev/null @@ -1,15 +0,0 @@ -; -; Copyright (c) 2014 Wind River Systems, Inc. 
-; -; SPDX-License-Identifier: Apache-2.0 -; -[process] -process = sm-watchdog -pidfile = /var/run/sm-watchdog.pid -script = /etc/init.d/sm-watchdog -style = lsb ; lsb -severity = major ; minor, major, critical -restarts = 3 ; restarts before error assertion -startuptime = 5 ; seconds to wait after process start -interval = 5 ; number of seconds to wait between restarts -debounce = 20 ; number of seconds to wait before degrade clear diff --git a/service-mgmt/sm-common/scripts/sm-watchdog.service b/service-mgmt/sm-common/scripts/sm-watchdog.service deleted file mode 100644 index 5fd147a5..00000000 --- a/service-mgmt/sm-common/scripts/sm-watchdog.service +++ /dev/null @@ -1,15 +0,0 @@ -[Unit] -Description=Service Management Watchdog -After=network-online.target syslog-ng.service config.service -Before=sm.service pmon.service - -[Service] -Type=forking -RemainAfterExit=yes -User=root -ExecStart=/etc/init.d/sm-watchdog start -ExecStop=/etc/init.d/sm-watchdog stop -PIDFile=/var/run/sm-watchdog.pid - -[Install] -WantedBy=multi-user.target diff --git a/service-mgmt/sm-common/src/Makefile b/service-mgmt/sm-common/src/Makefile index 3629f8f3..bf1c7c34 100644 --- a/service-mgmt/sm-common/src/Makefile +++ b/service-mgmt/sm-common/src/Makefile @@ -34,7 +34,7 @@ EXTRACCFLAGS+= -Wformat -Wformat-security LDLIBS= -lsqlite3 -lglib-2.0 -lgmodule-2.0 -luuid -lrt -lpthread LDFLAGS = -shared -rdynamic -build: libsm_common.so libsm_watchdog_nfs.so sm_watchdog sm_eru sm_eru_dump +build: libsm_common.so sm_eru sm_eru_dump .c.o: $(CXX) $(INCLUDES) $(CCFLAGS) $(EXTRACCFLAGS) -c $< -o $@ @@ -48,18 +48,6 @@ libsm_common.so.$(VER_MJR): libsm_common.so.$(VER) libsm_common.so.$(VER): ${OBJS} $(CXX) ${LDFLAGS} -Wl,--start-group $(LDLIBS) -Wl,-soname,libsm_common.so.$(VER_MJR) -o $@ $^ -libsm_watchdog_nfs.so: libsm_watchdog_nfs.so.$(VER_MJR) - ln -sf $^ $@ - -libsm_watchdog_nfs.so.$(VER_MJR): libsm_watchdog_nfs.so.$(VER) - ln -sf $^ $@ - -libsm_watchdog_nfs.so.$(VER): libsm_common.so.$(VER) 
libsm_common.so - $(CXX) $(INCLUDES) $(CCFLAGS) $(EXTRACCFLAGS) sm_watchdog_nfs.c ${LDFLAGS} $(LDLIBS) -L./ -lsm_common -Wl,-soname,libsm_watchdog_nfs.so.$(VER_MJR) -o $@ - -sm_watchdog: libsm_common.so - $(CXX) $(INCLUDES) $(CCFLAGS) $(EXTRACCFLAGS) $(OBJS) sm_watchdog_module.c sm_watchdog_process.c sm_watchdog_main.c $(LDLIBS) -L./ -lsm_common -o sm_watchdog - sm_eru: libsm_common.so $(CXX) $(INCLUDES) $(CCFLAGS) $(EXTRACCFLAGS) $(OBJS) sm_eru_process.c sm_eru_main.c $(LDLIBS) -L./ -lsm_common -o sm_eru @@ -71,15 +59,12 @@ install: # renamed with '-' like they are in the bitbake file. # # install -d $(DEST_DIR)$(BIN_DIR) - # install sm_watchdog sm_eru sm_eru_dump $(DEST_DIR)$(BIN_DIR) + # install sm_eru sm_eru_dump $(DEST_DIR)$(BIN_DIR) install -d $(DEST_DIR)$(LIB_DIR) install libsm_common.so.${VER} $(DEST_DIR)$(LIB_DIR) cp -P libsm_common.so libsm_common.so.$(VER_MJR) $(DEST_DIR)$(LIB_DIR) install -d $(DEST_DIR)$(INC_DIR) install -m 644 *.h $(DEST_DIR)$(INC_DIR) - install -d $(DEST_DIR)/var/lib/sm/watchdog/modules - install libsm_watchdog_nfs.so.${VER} $(DEST_DIR)/var/lib/sm/watchdog/modules - cp -P libsm_watchdog_nfs.so libsm_watchdog_nfs.so.${VER_MJR} $(DEST_DIR)/var/lib/sm/watchdog/modules clean: rm -f *.o *.so *.so.* diff --git a/service-mgmt/sm-common/src/sm_types.h b/service-mgmt/sm-common/src/sm_types.h index bc7ba28a..ec843e15 100644 --- a/service-mgmt/sm-common/src/sm_types.h +++ b/service-mgmt/sm-common/src/sm_types.h @@ -77,15 +77,12 @@ extern "C" { #define SM_PROCESS_PID_FILENAME "/var/run/sm.pid" #define SM_TRAP_PROCESS_PID_FILENAME "/var/run/sm-trap.pid" -#define SM_WATCHDOG_PROCESS_PID_FILENAME "/var/run/sm-watchdog.pid" #define SM_ERU_PROCESS_PID_FILENAME "/var/run/sm-eru.pid" #define SM_BOOT_COMPLETE_FILENAME "/var/run/sm_boot_complete" #define SM_INDICATE_DEGRADED_FILENAME "/var/run/.sm_degraded" -#define SM_WATCHDOG_HEARTBEAT_FILENAME "/var/run/.sm_watchdog_heartbeat" - #define SM_DUMP_DATA_FILE "/tmp/sm_data_dump.txt" #define 
SM_TROUBLESHOOT_LOG_FILE "/var/log/sm-troubleshoot.log" diff --git a/service-mgmt/sm-common/src/sm_utils.c b/service-mgmt/sm-common/src/sm_utils.c index ade30e07..aff627a1 100644 --- a/service-mgmt/sm-common/src/sm_utils.c +++ b/service-mgmt/sm-common/src/sm_utils.c @@ -15,9 +15,6 @@ #include #include #include -#include -#include -#include #include // **************************************************************************** @@ -127,77 +124,3 @@ SmErrorT sm_utils_clear_degraded( void ) return( SM_OKAY ); } // **************************************************************************** - -// **************************************************************************** -// Utils - Watchdog Heartbeat -// ========================== -void sm_utils_watchdog_heartbeat( void ) -{ - struct utimbuf file_times; - struct timespec ts_mono; - - clock_gettime( CLOCK_MONOTONIC_RAW, &ts_mono ); - - memset( &file_times, 0, sizeof(struct utimbuf) ); - - file_times.actime = ts_mono.tv_sec; - file_times.modtime = ts_mono.tv_sec; - - if( 0 > access( SM_WATCHDOG_HEARTBEAT_FILENAME, F_OK ) ) - { - int fd = open( SM_WATCHDOG_HEARTBEAT_FILENAME, O_RDWR | O_CREAT, - S_IRUSR | S_IRGRP | S_IROTH | O_CLOEXEC ); - if( 0 > fd ) - { - DPRINTFE( "Failed to create/open watchdog heartbeat, error=%s.", - strerror(errno) ); - return; - } - - close( fd ); - } - - if( 0 > utime( SM_WATCHDOG_HEARTBEAT_FILENAME, &file_times ) ) - { - DPRINTFE( "Failed to update watchdog heartbeat timings, error=%s.", - strerror(errno) ); - return; - } -} -// **************************************************************************** - -// **************************************************************************** -// Utils - Watchdog Delayed -// ========================= -bool sm_utils_watchdog_delayed( int max_delay_secs ) -{ - struct stat stat_data; - - if( 0 == access( SM_WATCHDOG_HEARTBEAT_FILENAME, F_OK ) ) - { - int elapsed_secs; - struct timespec ts_mono; - - clock_gettime( CLOCK_MONOTONIC_RAW, &ts_mono ); - - 
if( 0 > stat( SM_WATCHDOG_HEARTBEAT_FILENAME, &stat_data ) ) - { - DPRINTFE( "Stat failed on file (%s), error=%s.", - SM_WATCHDOG_HEARTBEAT_FILENAME, strerror( errno ) ); - return( false ); - } - - // Make sure that the elapsed seconds drift is in a valid range. - elapsed_secs = ts_mono.tv_sec - stat_data.st_mtime; - if(( max_delay_secs < elapsed_secs )&&( elapsed_secs <= 300 )) - { - DPRINTFI( "SM-Watchdog has been delayed by more than %d " - "seconds, elapsed_secs=%d", max_delay_secs, - elapsed_secs ); - return( true ); - } - } - - return( false ); -} -// **************************************************************************** diff --git a/service-mgmt/sm-common/src/sm_utils.h b/service-mgmt/sm-common/src/sm_utils.h index f76c7940..b05276e5 100644 --- a/service-mgmt/sm-common/src/sm_utils.h +++ b/service-mgmt/sm-common/src/sm_utils.h @@ -50,18 +50,6 @@ extern SmErrorT sm_utils_indicate_degraded( void ); extern SmErrorT sm_utils_clear_degraded( void ); // **************************************************************************** -// **************************************************************************** -// Utils - Watchdog Heartbeat -// ========================== -extern void sm_utils_watchdog_heartbeat( void ); -// **************************************************************************** - -// **************************************************************************** -// Utils - Watchdog Delayed -// ========================= -extern bool sm_utils_watchdog_delayed( int max_delay_secs ); -// **************************************************************************** - #ifdef __cplusplus } #endif diff --git a/service-mgmt/sm-common/src/sm_watchdog_main.c b/service-mgmt/sm-common/src/sm_watchdog_main.c deleted file mode 100644 index c9b807d9..00000000 --- a/service-mgmt/sm-common/src/sm_watchdog_main.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// Copyright (c) 2014 Wind River Systems, Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 -// -#include -#include -#include -#include -#include -#include -#include - -#include "sm_types.h" -#include "sm_debug.h" -#include "sm_watchdog_process.h" - -// **************************************************************************** -// Main - Thread -// ============= -int main( int argc, char *argv[], char *envp[] ) -{ - SmErrorT error; - - error = sm_debug_initialize(); - if( SM_OKAY != error ) - { - printf( "Debug initialization failed, error=%s.\n", - sm_error_str( error ) ); - return( EXIT_FAILURE ); - } - - error = sm_watchdog_process_main( argc, argv, envp ); - if( SM_OKAY != error ) - { - printf( "Process failure, error=%s.\n", sm_error_str( error ) ); - return( EXIT_FAILURE ); - } - - error = sm_debug_finalize(); - if( SM_OKAY != error ) - { - printf( "Debug finalization failed, error=%s.\n", - sm_error_str( error ) ); - } - - return( EXIT_SUCCESS ); -} -// **************************************************************************** diff --git a/service-mgmt/sm-common/src/sm_watchdog_module.c b/service-mgmt/sm-common/src/sm_watchdog_module.c deleted file mode 100644 index b489faac..00000000 --- a/service-mgmt/sm-common/src/sm_watchdog_module.c +++ /dev/null @@ -1,247 +0,0 @@ -// -// Copyright (c) 2014 Wind River Systems, Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 -// -#include "sm_watchdog_module.h" - -#include -#include -#include -#include -#include -#include - -#include "sm_types.h" -#include "sm_list.h" -#include "sm_timer.h" -#include "sm_debug.h" - -#define SM_WATCHDOG_MODULE_FILENAME_MAX_SIZE 128 -#define SM_WATCHDOG_MODULE_PATH "/var/lib/sm/watchdog/modules" -#define SM_WATCHDOG_MODULE_DO_CHECK_FUNC "sm_watchdog_module_do_check" -#define SM_WATCHDOG_MODULE_INITIALIZE_FUNC "sm_watchdog_module_initialize" -#define SM_WATCHDOG_MODULE_FINALIZE_FUNC "sm_watchdog_module_finalize" - -typedef void (*SmWatchdogModuleDoCheckT) (void); -typedef bool (*SmWatchdogModuleInitializeT) (int* do_check_in_ms); -typedef bool (*SmWatchdogModuleFinalizeT) (void); - -typedef struct -{ - gchar filename[SM_WATCHDOG_MODULE_FILENAME_MAX_SIZE]; - GModule* glibmod; - int do_check_in_ms; - SmTimerIdT do_check_timer_id; - SmWatchdogModuleDoCheckT do_check; - SmWatchdogModuleInitializeT initialize; - SmWatchdogModuleFinalizeT finalize; -} SmWatchdogModuleT; - -static SmListT* _modules = NULL; - -// **************************************************************************** -// Watchdog Module - Do Check Timer -// ================================ -static bool sm_watchdog_module_do_check_timer( SmTimerIdT timer_id, - int64_t user_data ) -{ - SmListT* entry = NULL; - SmListEntryDataPtrT entry_data; - SmWatchdogModuleT* module = NULL; - - SM_LIST_FOREACH( _modules, entry, entry_data ) - { - module = (SmWatchdogModuleT*) entry_data; - if( NULL == module ) - { - continue; - } - - if( timer_id == module->do_check_timer_id ) - { - DPRINTFD( "Found do-check timer for module (%s).", - g_module_name(module->glibmod) ); - break; - } - } - - if( NULL != module ) - { - if( NULL != module->do_check ) - { - DPRINTFD( "Calling do-check for module (%s).", - g_module_name(module->glibmod) ); - module->do_check(); - return( true ); - } - } else { - DPRINTFE( "Module not found for do-check timer." 
); - } - - return( false ); -} -// **************************************************************************** - -// *************************************************************************** -// Watchdog Module - Load -// ====================== -static SmErrorT sm_watchdog_module_load( const gchar* filename ) -{ - gchar* filepath; - SmWatchdogModuleT* module; - - module = (SmWatchdogModuleT*) malloc( sizeof(SmWatchdogModuleT) ); - if( NULL == module ) - { - DPRINTFE( "Failed to allocate watchdog module." ); - return( SM_FAILED ); - } - - memset( module, 0, sizeof(SmWatchdogModuleT) ); - - g_snprintf(module->filename, SM_WATCHDOG_MODULE_FILENAME_MAX_SIZE, - "%s", filename); - - filepath = g_module_build_path( SM_WATCHDOG_MODULE_PATH, filename ); - - module->glibmod = g_module_open( filepath, G_MODULE_BIND_LAZY ); - if( NULL == module->glibmod ) - { - DPRINTFE( "Failed to open module (%s).", filepath ); - free( module ); - g_free( filepath ); - return( SM_FAILED ); - } - - g_free( filepath ); - - g_module_symbol( module->glibmod, SM_WATCHDOG_MODULE_INITIALIZE_FUNC, - (gpointer*) &(module->initialize) ); - - g_module_symbol( module->glibmod, SM_WATCHDOG_MODULE_FINALIZE_FUNC, - (gpointer*) &(module->finalize) ); - - g_module_symbol( module->glibmod, SM_WATCHDOG_MODULE_DO_CHECK_FUNC, - (gpointer*) &(module->do_check) ); - - SM_LIST_PREPEND( _modules, (SmListEntryDataPtrT) module ); - - return( SM_OKAY ); -} -// *************************************************************************** - -// *************************************************************************** -// Watchdog Module - Load All -// ========================== -SmErrorT sm_watchdog_module_load_all( void ) -{ - const gchar* file; - GDir* directory; - GError* g_error; - SmListT* entry = NULL; - SmListEntryDataPtrT entry_data; - SmWatchdogModuleT* module; - SmErrorT error; - - directory = g_dir_open( SM_WATCHDOG_MODULE_PATH, 0, &g_error ); - if( NULL == directory ) - { - DPRINTFE( "Failed to open 
directory( %s), error=%s", - SM_WATCHDOG_MODULE_PATH, g_error->message ); - g_error_free( g_error ); - return( SM_FAILED ); - } - - file = g_dir_read_name( directory ); - while( NULL != file ) - { - DPRINTFI( "Loading module (%s).", file ); - - error = sm_watchdog_module_load( file ); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to load module (%s), error=%s.", - file, sm_error_str(error) ); - } - - file = g_dir_read_name( directory ); - } - - g_dir_close( directory ); - - SM_LIST_FOREACH( _modules, entry, entry_data ) - { - module = (SmWatchdogModuleT*) entry_data; - if( NULL == module ) - { - continue; - } - - if( NULL != module->initialize ) - { - DPRINTFI( "Initializing module (%s).", - g_module_name(module->glibmod) ); - - if( !(module->initialize( &(module->do_check_in_ms) )) ) - { - DPRINTFE( "Failed to initialize %s.", - g_module_name(module->glibmod) ); - return( SM_FAILED ); - } - - error = sm_timer_register( module->filename, - module->do_check_in_ms, - sm_watchdog_module_do_check_timer, - 0, &(module->do_check_timer_id) ); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to create module (%s) do-check timer, " - "error=%s.", g_module_name(module->glibmod), - sm_error_str( error ) ); - return( error ); - } - } - } - - return( SM_OKAY ); -} -// *************************************************************************** - -// *************************************************************************** -// Watchdog Module - Unload All -// ============================ -SmErrorT sm_watchdog_module_unload_all( void ) -{ - SmListT* entry = NULL; - SmListEntryDataPtrT entry_data; - SmWatchdogModuleT* module; - - SM_LIST_FOREACH( _modules, entry, entry_data ) - { - module = (SmWatchdogModuleT*) entry_data; - if( NULL == module ) - { - continue; - } - - if( NULL != module->finalize ) - { - DPRINTFI( "Finalizing module (%s).", - g_module_name(module->glibmod) ); - - if( !(module->finalize()) ) - { - DPRINTFE( "Failed to finalize %s.", - 
g_module_name(module->glibmod) ); - } - } - - g_module_close( module->glibmod ); - } - - SM_LIST_CLEANUP_ALL( _modules ); - - return( SM_OKAY ); -} -// *************************************************************************** diff --git a/service-mgmt/sm-common/src/sm_watchdog_module.h b/service-mgmt/sm-common/src/sm_watchdog_module.h deleted file mode 100644 index acd7f997..00000000 --- a/service-mgmt/sm-common/src/sm_watchdog_module.h +++ /dev/null @@ -1,31 +0,0 @@ -// -// Copyright (c) 2014 Wind River Systems, Inc. -// -// SPDX-License-Identifier: Apache-2.0 -// -#ifndef __SM_WATCHDOG_MODULE_H__ -#define __SM_WATCHDOG_MODULE_H__ - -#include "sm_types.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// **************************************************************************** -// Watchdog Module - Load All -// ========================== -extern SmErrorT sm_watchdog_module_load_all( void ); -// **************************************************************************** - -// **************************************************************************** -// Watchdog Module - Unload All -// ============================ -extern SmErrorT sm_watchdog_module_unload_all( void ); -// **************************************************************************** - -#ifdef __cplusplus -} -#endif - -#endif // __SM_WATCHDOG_MODULE_H__ diff --git a/service-mgmt/sm-common/src/sm_watchdog_nfs.c b/service-mgmt/sm-common/src/sm_watchdog_nfs.c deleted file mode 100644 index db085999..00000000 --- a/service-mgmt/sm-common/src/sm_watchdog_nfs.c +++ /dev/null @@ -1,608 +0,0 @@ -// -// Copyright (c) 2014 Wind River Systems, Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 -// -#include "sm_watchdog_nfs.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sm_types.h" -#include "sm_time.h" -#include "sm_debug.h" -#include "sm_node_utils.h" -#include "sm_node_stats.h" - -#define SM_WATCHDOG_NFS_THREAD_NAME "(nfsd)" -#define SM_WATCHDOG_NFS_REBOOT_INPROGRESS 0xA5A5A5A5 -#define SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS 32 -#define SM_WATCHDOG_NFS_CHECK_IN_MS 10000 -#define SM_WATCHDOG_NFS_MAX_UNINTERRUPTIBLE_SLEEP 60000 -#define SM_WATCHDOG_NFS_DELAY_REBOOT_IN_MS 60000 -#define SM_WATCHDOG_NFS_DELAY_REBOOT_FORCE_IN_MS 480000 -#define SM_WATCHDOG_NFS_DEBUG_FILE "/var/log/nfs.debug" - -typedef struct -{ - bool inuse; - bool stale; - int pid; - SmTimeT timestamp; - SmNodeProcessStatusT status; -} SmWatchDogNfsBlockedInfoT; - -static uint32_t _nfs_reboot_inprogress; - -static SmWatchDogNfsBlockedInfoT - _nfs_blocked_threads[SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS]; - -// **************************************************************************** -// Watchdog NFS - Find Blocked Thread -// ================================== -static SmWatchDogNfsBlockedInfoT* sm_watchdog_nfs_find_blocked_thread( int pid ) -{ - SmWatchDogNfsBlockedInfoT* entry; - - int thread_i; - for( thread_i=0; SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS > thread_i; - ++thread_i ) - { - entry = &(_nfs_blocked_threads[thread_i]); - - if( entry->inuse ) - { - if( pid == entry->pid ) - { - return( entry ); - } - } - } - - return( NULL ); -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog NFS - Add Blocked Thread -// ================================= -static void sm_watchdog_nfs_add_blocked_thread( int pid, - SmNodeProcessStatusT* status ) -{ - SmWatchDogNfsBlockedInfoT* 
entry; - - int thread_i; - for( thread_i=0; SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS > thread_i; - ++thread_i ) - { - entry = &(_nfs_blocked_threads[thread_i]); - - if( !(entry->inuse) ) - { - entry->inuse = true; - entry->stale = false; - entry->pid = pid; - sm_time_get( &(entry->timestamp) ); - memcpy( &(entry->status), status, sizeof(SmNodeProcessStatusT) ); - return; - } - } - - DPRINTFE( "Not enough room for all the NFS blocked threads." ); -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog NFS - Delete Blocked Thread -// ==================================== -static void sm_watchdog_nfs_delete_blocked_thread( int pid ) -{ - SmWatchDogNfsBlockedInfoT* entry; - - entry = sm_watchdog_nfs_find_blocked_thread( pid ); - if( NULL != entry ) - { - memset( entry, 0, sizeof(SmWatchDogNfsBlockedInfoT) ); - entry->inuse = false; - } -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog NFS - Do Reboot -// ======================== -static void sm_watchdog_nfs_do_reboot( void ) -{ - char cmd[2048]; - pid_t reboot_pid; - pid_t reboot_force_pid; - pid_t sm_troubleshoot_pid; - pid_t collect_pid; - SmWatchDogNfsBlockedInfoT* entry; - SmErrorT error; - - if( SM_WATCHDOG_NFS_REBOOT_INPROGRESS == _nfs_reboot_inprogress ) - { - DPRINTFD( "Reboot already inprogress." ); - return; - } - - // Fork child to do the reboot. - reboot_pid = fork(); - if( 0 > reboot_pid ) - { - DPRINTFE( "Failed to fork process for reboot, error=%s.", - strerror( errno ) ); - return; - - } else if( 0 == reboot_pid ) { - // Child process. 
- long ms_expired; - char reboot_cmd[] = "reboot"; - char* reboot_argv[] = {reboot_cmd, NULL}; - char* reboot_env[] = {NULL}; - struct rlimit file_limits; - SmTimeT timestamp; - - setpgid( 0, 0 ); - - if( 0 == getrlimit( RLIMIT_NOFILE, &file_limits ) ) - { - unsigned int fd_i; - for( fd_i=0; fd_i < file_limits.rlim_cur; ++fd_i ) - { - close( fd_i ); - } - - open( "/dev/null", O_RDONLY ); // stdin - open( "/dev/null", O_WRONLY ); // stdout - open( "/dev/null", O_WRONLY ); // stderr - } - - sm_time_get( ×tamp ); - - while( true ) - { - ms_expired = sm_time_get_elapsed_ms( ×tamp ); - if( SM_WATCHDOG_NFS_DELAY_REBOOT_IN_MS < ms_expired ) - { - break; - } - - sleep( 10 ); // 10 seconds - } - - execve( "/sbin/reboot", reboot_argv, reboot_env ); - - // Shouldn't get this far, else there was an error. - exit(-1); - } - - // Fork child to do reboot force. - reboot_force_pid = fork(); - if( 0 > reboot_force_pid ) - { - DPRINTFE( "Failed to fork process for reboot escalation, " - "error=%s.", strerror( errno ) ); - return; - - } else if( 0 == reboot_force_pid ) { - // Child process. - long ms_expired; - int sysrq_handler_fd; - int sysrq_tigger_fd; - struct rlimit file_limits; - SmTimeT timestamp; - - setpgid( 0, 0 ); - - if( 0 == getrlimit( RLIMIT_NOFILE, &file_limits ) ) - { - unsigned int fd_i; - for( fd_i=0; fd_i < file_limits.rlim_cur; ++fd_i ) - { - close( fd_i ); - } - - open( "/dev/null", O_RDONLY ); // stdin - open( "/dev/null", O_WRONLY ); // stdout - open( "/dev/null", O_WRONLY ); // stderr - } - - sm_time_get( ×tamp ); - - while( true ) - { - ms_expired = sm_time_get_elapsed_ms( ×tamp ); - if( SM_WATCHDOG_NFS_DELAY_REBOOT_FORCE_IN_MS < ms_expired ) - { - break; - } - - sleep( 10 ); // 10 seconds - } - - // Enable sysrq handling. - sysrq_handler_fd = open( "/proc/sys/kernel/sysrq", O_RDWR | O_CLOEXEC ); - if( 0 > sysrq_handler_fd ) - { - return; - } - - write( sysrq_handler_fd, "1", 1 ); - close( sysrq_handler_fd ); - - // Trigger sysrq command. 
- sysrq_tigger_fd = open( "/proc/sysrq-trigger", O_RDWR | O_CLOEXEC ); - if( 0 > sysrq_tigger_fd ) - { - return; - } - - write( sysrq_tigger_fd, "b", 1 ); - close( sysrq_tigger_fd ); - - exit( EXIT_SUCCESS ); - } - - _nfs_reboot_inprogress = SM_WATCHDOG_NFS_REBOOT_INPROGRESS; - - // Fork child to do the sm-troubleshoot. - sm_troubleshoot_pid = fork(); - if( 0 > sm_troubleshoot_pid ) - { - DPRINTFE( "Failed to fork process for sm-trouble, error=%s.", - strerror( errno ) ); - - } else if( 0 == sm_troubleshoot_pid ) { - // Child process. - char cmd[] = "sm-troubleshoot"; - char log_file[] = SM_TROUBLESHOOT_LOG_FILE; - char* argv[] = {cmd, log_file, NULL}; - char* env[] = {NULL}; - struct rlimit file_limits; - - setpgid( 0, 0 ); - - if( 0 == getrlimit( RLIMIT_NOFILE, &file_limits ) ) - { - unsigned int fd_i; - for( fd_i=0; fd_i < file_limits.rlim_cur; ++fd_i ) - { - close( fd_i ); - } - - open( "/dev/null", O_RDONLY ); // stdin - open( "/dev/null", O_WRONLY ); // stdout - open( "/dev/null", O_WRONLY ); // stderr - } - - execve( SM_TROUBLESHOOT_SCRIPT, argv, env ); - - // Shouldn't get this far, else there was an error. - exit(-1); - } - - // Fork child to run collect. - collect_pid = fork(); - if( 0 > collect_pid ) - { - DPRINTFE( "Failed to fork process for collect, error=%s.", - strerror( errno ) ); - - } else if( 0 == collect_pid ) { - // Child process. - char cmd[] = "collect"; - char* argv[] = {cmd, NULL}; - char* env[] = {NULL}; - struct rlimit file_limits; - - setpgid( 0, 0 ); - - if( 0 == getrlimit( RLIMIT_NOFILE, &file_limits ) ) - { - unsigned int fd_i; - for( fd_i=0; fd_i < file_limits.rlim_cur; ++fd_i ) - { - close( fd_i ); - } - - open( "/dev/null", O_RDONLY ); // stdin - open( "/dev/null", O_WRONLY ); // stdout - open( "/dev/null", O_WRONLY ); // stderr - } - - execve( "/usr/local/sbin/collect", argv, env ); - - // Shouldn't get this far, else there was an error. 
- exit(-1); - } - - error = sm_node_utils_set_unhealthy(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to set node unhealthy, error=%s.", - sm_error_str(error) ); - } - - DPRINTFI( "*******************************************************" ); - DPRINTFI( "** Issuing a reboot of the system, NFS hang detected **" ); - DPRINTFI( "*******************************************************" ); - - DPRINTFI( "Reboot (%i) process created.", (int) reboot_pid ); - DPRINTFI( "Reboot force (%i) process created.", (int) reboot_force_pid ); - DPRINTFI( "SM troubleshoot (%i) process created.", (int) sm_troubleshoot_pid ); - DPRINTFI( "Collect (%i) process created.", (int) collect_pid ); - - snprintf( cmd, sizeof(cmd), - "date >> %s; " - "echo \"*******************************************\" >> %s; " - "echo \"NFS HANG DETECTED\" >> %s", SM_WATCHDOG_NFS_DEBUG_FILE, - SM_WATCHDOG_NFS_DEBUG_FILE, SM_WATCHDOG_NFS_DEBUG_FILE ); - system( cmd ); - - int thread_i; - for( thread_i=0; SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS > thread_i; - ++thread_i ) - { - entry = &(_nfs_blocked_threads[thread_i]); - - if( entry->inuse ) - { - snprintf( cmd, sizeof(cmd), - "date >> %s; " - "echo \"cat /proc/%i/sched\" >> %s; " - "cat /proc/%i/sched >> %s", SM_WATCHDOG_NFS_DEBUG_FILE, - entry->pid, SM_WATCHDOG_NFS_DEBUG_FILE, entry->pid, - SM_WATCHDOG_NFS_DEBUG_FILE ); - system( cmd ); - - snprintf( cmd, sizeof(cmd), - "date >> %s; " - "echo \"cat /proc/%i/stack\" >> %s; " - "cat /proc/%i/stack >> %s", SM_WATCHDOG_NFS_DEBUG_FILE, - entry->pid, SM_WATCHDOG_NFS_DEBUG_FILE, entry->pid, - SM_WATCHDOG_NFS_DEBUG_FILE ); - system( cmd ); - } - } - - snprintf( cmd, sizeof(cmd), - "echo \"*******************************************\" >> %s", - SM_WATCHDOG_NFS_DEBUG_FILE ); - system( cmd ); -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog NFS - Search -// ===================== -static void 
sm_watchdog_nfs_search( const char dir_name[] ) -{ - bool is_dir; - DIR* dir; - char path[PATH_MAX]; - int path_len; - SmNodeProcessStatusT status; - SmErrorT error; - - dir = opendir( dir_name ); - if( NULL == dir ) - { - DPRINTFE( "Failed to open directory (%s), error=%s.", dir_name, - strerror( errno ) ); - return; - } - - struct dirent* entry; - for( entry = readdir( dir ); NULL != entry; entry = readdir( dir ) ) - { - is_dir = false; - - path_len = snprintf( path, sizeof(path), "%s/%s", dir_name, - entry->d_name ); - if( PATH_MAX <= path_len ) - { - DPRINTFE( "Path (%s/%s) is too long, max_len=%i.", - dir_name, entry->d_name, path_len ); - break; - } - - if( 0 != (DT_REG & entry->d_type) ) - { - if( '.' != entry->d_name[0] ) - { - struct stat stat_data; - - if( 0 > lstat( path, &stat_data ) ) - { - DPRINTFE( "Stat on (%s) failed, error=%s.", entry->d_name, - strerror( errno ) ); - continue; - } - - is_dir = S_ISDIR( stat_data.st_mode ); - } - } else if( 0 != (DT_DIR & entry->d_type) ) { - if(( 0 != strcmp( ".", entry->d_name ) )&& - ( 0 != strcmp( "..", entry->d_name ) )) - { - is_dir = true; - } - } - - if( is_dir ) - { - long val; - char* end; - - val = strtol( entry->d_name, &end, 10 ); - if(( ERANGE == errno )&& - (( LONG_MIN == val ) ||( LONG_MAX == val ))) - { - DPRINTFD( "Directory (%s) name out of range.", - entry->d_name ); - continue; - } - - if( end == entry->d_name ) - { - DPRINTFD( "Directory (%s) is not a pid directory.", - entry->d_name ); - continue; - } - - error = sm_node_stats_get_process_status( val, &status ); - if( SM_OKAY != error ) - { - if( SM_NOT_FOUND == error ) - { - DPRINTFD( "Failed to get %ld pid status, error=%s.", - val, sm_error_str(error) ); - } else { - DPRINTFE( "Failed to get %ld pid status, error=%s.", - val, sm_error_str(error) ); - } - continue; - } - - DPRINTFD( "Looking at pid=%i, name=%s", status.pid, status.name ); - - if( 0 != strcmp( SM_WATCHDOG_NFS_THREAD_NAME, status.name ) ) - { - DPRINTFD( "Process (%s) not an 
nfs thread, pid=%i.", - status.name, status.pid ); - continue; - } - - DPRINTFD( "NFS thread, pid=%i, state=%c, block_start_ns=%lld.", - status.pid, status.state, status.block_start_ns ); - - if(( 0 != status.block_start_ns )&&( 'D' == status.state )) - { - SmWatchDogNfsBlockedInfoT* entry; - - entry = sm_watchdog_nfs_find_blocked_thread( (int) val ); - if( NULL == entry ) - { - sm_watchdog_nfs_add_blocked_thread( (int) val, &status ); - - } else if( status.block_start_ns == entry->status.block_start_ns ) { - long ms_expired; - - entry->stale = false; - ms_expired = sm_time_get_elapsed_ms( &(entry->timestamp) ); - if( SM_WATCHDOG_NFS_MAX_UNINTERRUPTIBLE_SLEEP < ms_expired ) - { - sm_watchdog_nfs_do_reboot(); - DPRINTFI( "Rebooting stuck nfs thread (%i).", - (int) val ); - break; - } else { - if( (SM_WATCHDOG_NFS_MAX_UNINTERRUPTIBLE_SLEEP/2) - < ms_expired ) - { - DPRINTFI( "WARNING: NFS thread, pid=%i, state=%c, " - "block_start_ns=%lld, elapsed_ms=%ld.", - status.pid, status.state, - status.block_start_ns, ms_expired ); - } - } - } else { - sm_watchdog_nfs_delete_blocked_thread( (int) val ); - sm_watchdog_nfs_add_blocked_thread( (int) val, &status ); - } - } else { - sm_watchdog_nfs_delete_blocked_thread( (int) val ); - } - } - } - - closedir( dir ); -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog NFS - Do Check -// ======================= -void sm_watchdog_module_do_check( void ) -{ - DPRINTFD( "NFS do check called." ); - - if( SM_WATCHDOG_NFS_REBOOT_INPROGRESS != _nfs_reboot_inprogress ) - { - int thread_i; - SmWatchDogNfsBlockedInfoT* entry; - - // Mark entries as stale. - for( thread_i=0; SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS > thread_i; - ++thread_i ) - { - entry = &(_nfs_blocked_threads[thread_i]); - - if( entry->inuse ) - { - entry->stale = true; - } - } - - // Audit NFS threads. 
- sm_watchdog_nfs_search( "/proc" ); - - // Cleanup stale entries. - for( thread_i=0; SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS > thread_i; - ++thread_i ) - { - entry = &(_nfs_blocked_threads[thread_i]); - - if(( entry->inuse )&&( entry->stale )) - { - memset( entry, 0, sizeof(SmWatchDogNfsBlockedInfoT) ); - entry->inuse = false; - } - } - } else { - DPRINTFD( "Reboot inprogress." ); - } -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog NFS - Initialize -// ========================= -bool sm_watchdog_module_initialize( int* do_check_in_ms ) -{ - *do_check_in_ms = SM_WATCHDOG_NFS_CHECK_IN_MS; - _nfs_reboot_inprogress = 0; - memset( &_nfs_blocked_threads, 0, sizeof(_nfs_blocked_threads) ); - return( true ); -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog NFS - Finalize -// ======================= -bool sm_watchdog_module_finalize( void ) -{ - _nfs_reboot_inprogress = 0; - memset( &_nfs_blocked_threads, 0, sizeof(_nfs_blocked_threads) ); - return( true ); -} -// **************************************************************************** diff --git a/service-mgmt/sm-common/src/sm_watchdog_nfs.h b/service-mgmt/sm-common/src/sm_watchdog_nfs.h deleted file mode 100644 index cd80d43b..00000000 --- a/service-mgmt/sm-common/src/sm_watchdog_nfs.h +++ /dev/null @@ -1,37 +0,0 @@ -// -// Copyright (c) 2014 Wind River Systems, Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 -// -#ifndef __SM_WATCHDOG_NFS_H__ -#define __SM_WATCHDOG_NFS_H__ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// **************************************************************************** -// Watchdog NFS - Do Check -// ======================= -extern void sm_watchdog_module_do_check( void ); -// **************************************************************************** - -// **************************************************************************** -// Watchdog NFS - Initialize -// ========================= -extern bool sm_watchdog_module_initialize( int* do_check_in_ms ); -// **************************************************************************** - -// **************************************************************************** -// Watchdog NFS - Finalize -// ======================= -extern bool sm_watchdog_module_finalize( void ); -// **************************************************************************** - -#ifdef __cplusplus -} -#endif - -#endif // __SM_WATCHDOG_NFS_H__ diff --git a/service-mgmt/sm-common/src/sm_watchdog_process.c b/service-mgmt/sm-common/src/sm_watchdog_process.c deleted file mode 100644 index b252483b..00000000 --- a/service-mgmt/sm-common/src/sm_watchdog_process.c +++ /dev/null @@ -1,241 +0,0 @@ -// -// Copyright (c) 2014 Wind River Systems, Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 -// -#include "sm_watchdog_process.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sm_limits.h" -#include "sm_types.h" -#include "sm_debug.h" -#include "sm_utils.h" -#include "sm_selobj.h" -#include "sm_time.h" -#include "sm_timer.h" -#include "sm_node_stats.h" -#include "sm_watchdog_module.h" - -#define SM_WATCHDOG_PROCESS_TICK_INTERVAL_IN_MS 1000 - -static sig_atomic_t _stay_on = 1; - -// **************************************************************************** -// Watchdog Process - Signal Handler -// ================================= -static void sm_watchdog_process_signal_handler( int signum ) -{ - switch( signum ) - { - case SIGINT: - case SIGTERM: - case SIGQUIT: - _stay_on = 0; - break; - - case SIGCONT: - DPRINTFD( "Ignoring signal SIGCONT (%i).", signum ); - break; - - default: - DPRINTFD( "Signal (%i) ignored.", signum ); - break; - } -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog Process - Setup Signal Handler -// ======================================= -static void sm_watchdog_process_setup_signal_handler( void ) -{ - struct sigaction sa; - - memset( &sa, 0, sizeof(sa) ); - sa.sa_handler = sm_watchdog_process_signal_handler; - - sigaction( SIGINT, &sa, NULL ); - sigaction( SIGTERM, &sa, NULL ); - sigaction( SIGQUIT, &sa, NULL ); - sigaction( SIGCONT, &sa, NULL ); - - signal( SIGCHLD, SIG_IGN ); -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog Process - Initialize -// ============================= -static SmErrorT sm_watchdog_process_initialize( void ) -{ - SmErrorT error; - - 
error = sm_selobj_initialize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to initialize selection object module, error=%s.", - sm_error_str( error ) ); - return( error ); - } - - error = sm_timer_initialize( SM_WATCHDOG_PROCESS_TICK_INTERVAL_IN_MS ); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to initialize timer module, error=%s.", - sm_error_str( error ) ); - return( error ); - } - - error = sm_node_stats_initialize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to initialize node stats, error=%s.", - sm_error_str( error ) ); - return( error ); - } - - return( SM_OKAY ); -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog Process - Finalize -// =========================== -static SmErrorT sm_watchdog_process_finalize( void ) -{ - SmErrorT error; - - error = sm_node_stats_finalize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to finialize node stats, error=%s.", - sm_error_str( error ) ); - } - - error = sm_timer_finalize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to finalize timer module, error=%s.", - sm_error_str( error ) ); - } - - error = sm_selobj_finalize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to finalize selection object module, error=%s.", - sm_error_str( error ) ); - } - - return( SM_OKAY ); -} -// **************************************************************************** - -// **************************************************************************** -// Watchdog Process - Main -// ======================= -SmErrorT sm_watchdog_process_main( int argc, char *argv[], char *envp[] ) -{ - long ms_expired; - SmTimeT watchdog_heartbeat_time_prev; - SmErrorT error; - - sm_watchdog_process_setup_signal_handler(); - - DPRINTFI( "Starting" ); - - if( sm_utils_process_running( SM_WATCHDOG_PROCESS_PID_FILENAME ) ) - { - DPRINTFI( "Already running an instance of sm-watchdog." 
); - return( SM_OKAY ); - } - - if( !sm_utils_set_pid_file( SM_WATCHDOG_PROCESS_PID_FILENAME ) ) - { - DPRINTFE( "Failed to write pid file for sm-watchdog, error=%s.", - strerror(errno) ); - return( SM_FAILED ); - } - - error = sm_watchdog_process_initialize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed initialize process, error=%s.", - sm_error_str(error) ); - return( error ); - } - - error = sm_watchdog_module_load_all(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed load modules, error=%s.", - sm_error_str(error) ); - return( error ); - } - - DPRINTFI( "Started." ); - - sm_time_get( &watchdog_heartbeat_time_prev ); - sm_utils_watchdog_heartbeat(); - - while( _stay_on ) - { - error = sm_selobj_dispatch( SM_WATCHDOG_PROCESS_TICK_INTERVAL_IN_MS ); - if( SM_OKAY != error ) - { - DPRINTFE( "Selection object dispatch failed, error=%s.", - sm_error_str(error) ); - break; - } - - ms_expired = sm_time_get_elapsed_ms( &watchdog_heartbeat_time_prev ); - if( SM_WATCHDOG_PROCESS_TICK_INTERVAL_IN_MS <= ms_expired ) - { - if( sm_timer_scheduling_on_time() ) - { - sm_utils_watchdog_heartbeat(); - sm_time_get( &watchdog_heartbeat_time_prev ); - } - } - } - - DPRINTFI( "Shutting down." ); - - error = sm_watchdog_module_unload_all(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed unload modules, error=%s.", - sm_error_str(error) ); - } - - error = sm_watchdog_process_finalize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to finalize process, error=%s.", - sm_error_str( error ) ); - } - - DPRINTFI( "Shutdown complete." ); - - return( SM_OKAY ); -} -// **************************************************************************** diff --git a/service-mgmt/sm-common/src/sm_watchdog_process.h b/service-mgmt/sm-common/src/sm_watchdog_process.h deleted file mode 100644 index 10a97408..00000000 --- a/service-mgmt/sm-common/src/sm_watchdog_process.h +++ /dev/null @@ -1,25 +0,0 @@ -// -// Copyright (c) 2014 Wind River Systems, Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 -// -#ifndef __SM_WATCHDOG_PROCESS_H__ -#define __SM_WATCHDOG_PROCESS_H__ - -#include "sm_types.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// **************************************************************************** -// Watchdog Process - Main -// ======================= -extern SmErrorT sm_watchdog_process_main( int argc, char *argv[], char *envp[] ); -// **************************************************************************** - -#ifdef __cplusplus -} -#endif - -#endif // __SM_WATCHDOG_PROCESS_H__ diff --git a/service-mgmt/sm/scripts/sm.service b/service-mgmt/sm/scripts/sm.service index 1186692b..1c46068f 100644 --- a/service-mgmt/sm/scripts/sm.service +++ b/service-mgmt/sm/scripts/sm.service @@ -1,6 +1,6 @@ [Unit] Description=Service Management Unit -After=network-online.target syslog-ng.service config.service sm-watchdog.service systemd-udev-settle.service drbd.service +After=network-online.target syslog-ng.service config.service systemd-udev-settle.service drbd.service Before=sm-shutdown.service sm-api.service pmon.service [Service] diff --git a/service-mgmt/sm/scripts/sm.troubleshoot b/service-mgmt/sm/scripts/sm.troubleshoot index 6a2b2f3d..d0830906 100644 --- a/service-mgmt/sm/scripts/sm.troubleshoot +++ b/service-mgmt/sm/scripts/sm.troubleshoot @@ -67,9 +67,6 @@ timeout --signal KILL 5s pmap -x `cat /var/run/sm-trap.pid` delimiter "pmap -x cat /var/run/sm-eru.pid" timeout --signal KILL 5s pmap -x `cat /var/run/sm-eru.pid` -delimiter "pmap -x cat /var/run/sm-watchdog.pid" -timeout --signal KILL 5s pmap -x `cat /var/run/sm-watchdog.pid` - delimiter "top -b -n 1 -H -c" timeout --signal KILL 5s top -b -n 1 -H -c diff --git a/service-mgmt/sm/src/sm_service_action.c b/service-mgmt/sm/src/sm_service_action.c index 973a9137..b0df6717 100644 --- a/service-mgmt/sm/src/sm_service_action.c +++ b/service-mgmt/sm/src/sm_service_action.c @@ -19,14 +19,11 @@ #include #include "sm_types.h" -#include "sm_utils.h" 
#include "sm_debug.h" #include "sm_sha512.h" #include "sm_service_action_table.h" #include "sm_service_action_result_table.h" -#define SM_SERVICE_ACTION_MAX_DELAY_IN_SECS 4 -#define SM_SERVICE_ACTION_TIMER_SKEW_IN_MS 60000 #define SM_SERVICE_ACTION_VALIDATE_TIMER_IN_MS 60000 // **************************************************************************** @@ -839,15 +836,6 @@ SmErrorT sm_service_action_run( char service_name[], char instance_name[], *process_id = (int) pid; *timeout_in_ms = action_data->timeout_in_secs * 1000; - if( sm_utils_watchdog_delayed( SM_SERVICE_ACTION_MAX_DELAY_IN_SECS ) ) - { - DPRINTFI( "Service (%s) timeout %d secs increased by %d ms, " - "sm-watchdog delayed.", action_data->service_name, - action_data->timeout_in_secs, - SM_SERVICE_ACTION_TIMER_SKEW_IN_MS ); - *timeout_in_ms += SM_SERVICE_ACTION_TIMER_SKEW_IN_MS; - } - DPRINTFD( "Child process (%i) created for service (%s).", *process_id, action_data->service_name ); } diff --git a/service-mgmt/sm/src/sm_service_group_notification.c b/service-mgmt/sm/src/sm_service_group_notification.c index b3bbd747..39129fd0 100644 --- a/service-mgmt/sm/src/sm_service_group_notification.c +++ b/service-mgmt/sm/src/sm_service_group_notification.c @@ -40,9 +40,7 @@ typedef struct SmServiceGroupNotificationT service_group_notification; } SmNotificationEnvT; -#define SM_NOTIFICATION_SCRIPT_MAX_DELAY_IN_SECS 4 #define SM_NOTIFICATION_SCRIPT_TIMEOUT_IN_MS 30000 -#define SM_NOTIFICATION_SCRIPT_TIMER_SKEW_IN_MS 60000 #define SM_NOTIFICATION_SCRIPT_SUCCESS 0 #define SM_NOTIFICATION_SCRIPT_TIMEOUT -65534 #define SM_NOTIFICATION_SCRIPT_FAILURE -65535 @@ -712,14 +710,6 @@ SmErrorT sm_service_group_notification_notify( SmServiceGroupT* service_group, snprintf( timer_name, sizeof(timer_name), "%s %s notification ", service_group->name, notification_str ); - if( sm_utils_watchdog_delayed( SM_NOTIFICATION_SCRIPT_MAX_DELAY_IN_SECS ) ) - { - DPRINTFI( "Notification timeout %d secs increased by %d ms, " - "sm-watchdog 
delayed.", timeout_in_ms, - SM_NOTIFICATION_SCRIPT_TIMER_SKEW_IN_MS ); - timeout_in_ms += SM_NOTIFICATION_SCRIPT_TIMER_SKEW_IN_MS; - } - error = sm_timer_register( timer_name, timeout_in_ms, sm_service_group_notification_timeout, service_group->id, &timer_id );