diff --git a/service-mgmt/sm/scripts/sm b/service-mgmt/sm/scripts/sm
index 55e89e68..83b1eaf9 100755
--- a/service-mgmt/sm/scripts/sm
+++ b/service-mgmt/sm/scripts/sm
@@ -55,19 +55,51 @@ case "$1" in
         fi
 
         echo -n "Starting ${SM_NAME}: "
-        if [ -n "`pidof ${SM}`" ]
-        then
-            # PMOND might have restarted SM already.
-            RETVAL=0
-        else
+        # pidof /usr/bin/sm returns 2 pids. When the SM main process is
+        # killed, the subprocess pid is still returned until the subprocess
+        # goes away, and calling start-stop-daemon --start too early will
+        # fail.
+        #
+        # Loop for up to 10 seconds waiting for the subprocess to finish.
+        # It terminates by itself in around 5 seconds; try killing it.
+        # A slightly longer wait for the main process to go away is not a
+        # concern: if SM is actually running (possibly started by pmon or
+        # systemd), it is already functioning.
+        c=0
+        p=$(pidof "${SM}")
+        while [[ ${c} -lt 10 && -n "${p}" ]]
+        do
+            logger "SM waiting ${p}"
+            if [[ "${p}" =~ ^[0-9]+$ ]]; then
+                # only subprocess running, try killing it
+                kill -9 "${p}"
+            fi
+            sleep 1
+            c=$(( c + 1 ))
+            p=$(pidof "${SM}")
+        done
+
+        running=0
+        if [[ "${c}" == "10" ]]; then
+            if [[ "${p}" =~ ^[0-9]+$ ]]; then
+                kill -9 "${p}"
+            elif [[ "${p}" =~ ^[0-9]+[[:space:]][0-9]+$ ]]; then
+                running=1
+                logger "pidof ${SM} still running."
+                RETVAL=0
+            fi
+        fi
+
+        if [[ "${running}" == "0" ]]; then
             start-stop-daemon --start -b -x ${SM} -- ${sm_args}
             RETVAL=$?
         fi
+
         if [ ${RETVAL} -eq 0 ]
         then
             echo "OK"
         else
-            echo "FAIL"
+            echo "FAIL ${RETVAL}"
             RETVAL=1
         fi
         ;;
diff --git a/service-mgmt/sm/scripts/sm.conf b/service-mgmt/sm/scripts/sm.conf
index 9d454ee2..97747ec9 100644
--- a/service-mgmt/sm/scripts/sm.conf
+++ b/service-mgmt/sm/scripts/sm.conf
@@ -10,7 +10,7 @@ script = /etc/init.d/sm
 style = lsb ; lsb
 severity = critical ; minor, major, critical
 restarts = 3 ; restarts before error assertion
-startuptime = 15 ; seconds to wait after process start
+startuptime = 5 ; seconds to wait after process start
 interval = 5 ; number of seconds to wait between restarts
 debounce = 20 ; number of seconds to wait before degrade clear
 quorum = 1 ; process is in the host watchdog quorum