metal/mtce/src/hwmon/scripts/ocf/hwmon

502 lines
13 KiB
Bash

#!/bin/sh
#
# Copyright (c) 2013-2017 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# Support: www.windriver.com
#
# Purpose: This resource agent manages the CGCS Platform
#          Hardware Monitor Daemon (hwmond).
#
# RA Spec:
#
# http://www.opencf.org/cgi-bin/viewcvs.cgi/specs/ra/resource-agent-api.txt?rev=HEAD
#
#######################################################################
# Initialization:
# Load the OCF shell helper library. OCF_FUNCTIONS_DIR is honoured when the
# caller already set it; otherwise it is derived from OCF_ROOT.
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
#######################################################################
# Resource parameters.
# Every OCF_RESKEY_<name> keeps the value supplied by the caller and falls
# back to OCF_RESKEY_<name>_default only when it is unset (an empty value
# is kept as-is). The *_default variables are also read by meta_data.

# Name of the daemon executable
OCF_RESKEY_binary_default="hwmond"
: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"

# Daemon configuration file; its presence is checked by hwmond_validate
OCF_RESKEY_config_default="/etc/mtc/hwmond.conf"
: "${OCF_RESKEY_config=${OCF_RESKEY_config_default}}"

# "true" selects the daemon's -l option in hwmond_start
OCF_RESKEY_logging_default="true"
: "${OCF_RESKEY_logging=${OCF_RESKEY_logging_default}}"

# "true" enables extra agent logs and the daemon's "-d debug" option
OCF_RESKEY_dbg_default="false"
: "${OCF_RESKEY_dbg=${OCF_RESKEY_dbg_default}}"

# "passive" selects the daemon's -p option in hwmond_start
OCF_RESKEY_mode_default="normal"
: "${OCF_RESKEY_mode=${OCF_RESKEY_mode_default}}"

# Not referenced by any function visible in this file
OCF_RESKEY_user_default="admin"
: "${OCF_RESKEY_user=${OCF_RESKEY_user_default}}"

# File the agent reads the daemon's pid from
OCF_RESKEY_pid_default="/var/run/hwmond.pid"
: "${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}"

# "active" selects the daemon's -a option in hwmond_start
OCF_RESKEY_state_default="standby"
: "${OCF_RESKEY_state=${OCF_RESKEY_state_default}}"

# Paths derived from the binary name: the executable to launch and the
# status ("info") file the agent polls for.
mydaemon="/usr/local/bin/${OCF_RESKEY_binary}"
statusfile="/var/run/${OCF_RESKEY_binary}.info"
#######################################################################
# Print the supported actions, one line of help per action, to stdout.
# $0 is expanded so the text shows the name the agent was invoked with.
usage() {
    printf '%s\n' \
        "usage: $0 (start|stop|reload|status|monitor|validate-all|meta-data)" \
        "$0 manages the Platform's Controller Hardware Monitor (hwmond) process as an HA resource" \
        "The 'start' ..... operation starts the hardware monitor service in the active state." \
        "The 'stop' ...... operation stops the hardware monitor service." \
        "The 'reload' .... operation stops and then starts the hardware monitor service." \
        "The 'status' .... operation checks the status of the hardware monitor service." \
        "The 'monitor' ... operation indicates the in-service status of the hardware monitor service." \
        "The 'validate-all' operation reports whether the parameters are valid." \
        "The 'meta-data' . operation reports the hwmond's meta-data information."
}
#######################################################################
# Emit the agent's OCF meta-data (XML) on stdout.
#
# Globals read: OCF_RESKEY_dbg and the OCF_RESKEY_{state,mode,logging,dbg}_default
#               values, which are interpolated into the XML.
# Returns:      OCF_SUCCESS
meta_data() {
    # Quoted so that an empty dbg value is a plain "not true" instead of a
    # '[' syntax error on stderr.
    if [ "${OCF_RESKEY_dbg}" = "true" ]; then
        ocf_log info "hwmond:meta_data"
    fi
    # Unquoted delimiter: the ${..._default} references below are expanded.
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="hwmond">
<version>1.0</version>
<longdesc lang="en">
This 'hwmond' is an OCF Compliant Resource Agent that manages start, stop
and in-service monitoring of the Hardware Monitor service on Wind River's
Titanium Cloud in an active mode.
</longdesc>
<shortdesc lang="en">
Manages the Titanium Cloud's Hardware Monitor (hwmond) Daemon.
</shortdesc>
<parameters>
<parameter name="state" unique="0" required="0">
<longdesc lang="en">
state = standby ... run daemon in 'standby' mode (default)
state = active ... run daemon in 'active' mode
</longdesc>
<shortdesc lang="en">Hardware Monitor Daemon Activity State Option</shortdesc>
<content type="string" default="${OCF_RESKEY_state_default}"/>
</parameter>
<parameter name="mode" unique="0" required="0">
<longdesc lang="en">
mode = normal ... run hardware monitor daemon in 'normal' mode (default)
mode = passive ... run hardware monitor daemon in 'passive' mode
</longdesc>
<shortdesc lang="en">Maintenance Mode Option</shortdesc>
<content type="string" default="${OCF_RESKEY_mode_default}"/>
</parameter>
<parameter name="logging" unique="0" required="0">
<longdesc lang="en">
This option is used to direct the hwmon dameon log stream.
logging = true ... /var/log/hwmond.log (default)
logging = false ... /dev/null
See also debug option which sets the verbosity of logging.
</longdesc>
<shortdesc lang="en">Service Logging Control Option</shortdesc>
<content type="boolean" default="${OCF_RESKEY_logging_default}"/>
</parameter>
<parameter name="dbg" unique="0" required="0">
<longdesc lang="en">
dbg = false ... info, warn and err logs sent to output stream (default)
dbg = true ... Additional dbg logs are also sent to the output stream
</longdesc>
<shortdesc lang="en">Service Debug Control Option</shortdesc>
<content type="boolean" default="${OCF_RESKEY_dbg_default}"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="10s" />
<action name="stop" timeout="10s" />
<action name="monitor" timeout="10s" interval="300s" />
<action name="meta-data" timeout="10s" />
<action name="validate-all" timeout="10s" />
</actions>
</resource-agent>
END
    return "${OCF_SUCCESS}"
}
# Validate the environment the agent needs before any action is run.
#
# Globals read: OCF_RESKEY_dbg, OCF_RESKEY_binary, OCF_RESKEY_config
# Returns:      OCF_SUCCESS when the config file exists,
#               OCF_ERR_CONFIGURED when it is missing (or the path is empty).
#
# check_binary is not defined in this file; presumably it comes from the
# sourced OCF helper library and aborts the agent when a program is
# missing -- confirm against that library.
hwmond_validate() {
    if [ "${OCF_RESKEY_dbg}" = "true" ]; then
        ocf_log info "hwmond:validate"
    fi

    check_binary "/usr/local/bin/${OCF_RESKEY_binary}"
    check_binary "/usr/local/bin/mtcAgent"
    check_binary "/usr/local/bin/mtcClient"
    check_binary sysinv-api
    check_binary pidof

    # The path must be quoted: with an empty value the unquoted form becomes
    # '[ ! -f ]', which is false, and a missing config would pass validation.
    if [ ! -f "${OCF_RESKEY_config}" ]; then
        ocf_log err "${OCF_RESKEY_binary} conf file missing ${OCF_RESKEY_config}"
        return "${OCF_ERR_CONFIGURED}"
    fi
    return "${OCF_SUCCESS}"
}
# In-service test: signal the daemon with USR1 and wait for it to
# (re)create the status file.
#
# Globals read: OCF_RESKEY_dbg, OCF_RESKEY_binary, OCF_RESKEY_pid, statusfile
# Returns:      OCF_SUCCESS     status file appeared within 10 polls
#               OCF_ERR_GENERIC process is alive but no status file appeared
#               OCF_NOT_RUNNING pid file missing/empty or process not alive
#
# ocf_run and log_procfs are not defined in this file; presumably they are
# provided by the sourced helper library -- confirm.
hwmond_status () {
    local pid pid_stat log_sig loop

    if [ "${OCF_RESKEY_dbg}" = "true" ]; then
        ocf_log info "hwmond:status"
    fi

    # Remove the old status file so a fresh one proves the daemon answered
    rm -f "${statusfile}"

    # Give the pid file up to 3 seconds to show up
    loop=0
    while [ "${loop}" -lt 3 ] && [ ! -f "${OCF_RESKEY_pid}" ]; do
        sleep 1
        loop=$((loop + 1))
    done

    # See if the daemon is running. A missing pid file is an expected
    # condition here, so the read error is silenced and an empty pid is
    # never handed to kill.
    pid=$(cat "${OCF_RESKEY_pid}" 2>/dev/null)
    if [ -z "${pid}" ] || ! kill -0 "${pid}" 2>/dev/null; then
        return "${OCF_NOT_RUNNING}"
    fi

    log_sig="${OCF_RESKEY_binary} In-Service Active Monitor Test"

    # Ask the daemon to produce status
    ocf_run kill -s USR1 "${pid}"

    # Wait for the response, polling once per second
    loop=0
    while [ "${loop}" -lt 10 ]; do
        sleep 1
        if [ -f "${statusfile}" ]; then
            ocf_log info "${log_sig} Passed (${loop})"
            return "${OCF_SUCCESS}"
        fi
        case "${loop}" in
            5)
                # Half way through: send the signal again and record the
                # process state
                ocf_run kill -s USR1 "${pid}"
                pid_stat=$(cat "/proc/${pid}/stat" 2>/dev/null)
                ocf_log notice "${log_sig} is slow to respond"
                ocf_log notice "${pid_stat}"
                ;;
            8)
                pid_stat=$(cat "/proc/${pid}/stat" 2>/dev/null)
                ocf_log warn "${log_sig} is very slow to respond"
                ocf_log warn "${pid_stat}"
                ;;
        esac
        loop=$((loop + 1))
    done

    log_procfs
    ocf_log err "${log_sig} Failed"
    return "${OCF_ERR_GENERIC}"
}
# Monitor action: report OCF_NOT_RUNNING when the pid recorded in the pid
# file is not alive, otherwise run the in-service test (hwmond_status) and
# return its result.
#
# Globals read: OCF_RESKEY_dbg, OCF_RESKEY_binary, OCF_RESKEY_pid
hwmond_monitor () {
    local pid
    local monitor_tag="hwmond:monitor"

    if [ "${OCF_RESKEY_dbg}" = "true" ]; then
        ocf_log info "${monitor_tag}"
    fi

    # Uncomment if you want the monitor function to force-pass
    # return ${OCF_SUCCESS}

    # A missing pid file simply means "not running": silence the read error
    # and never call kill without a pid.
    pid=$(cat "${OCF_RESKEY_pid}" 2>/dev/null)
    if [ -z "${pid}" ] || ! kill -0 "${pid}" 2>/dev/null; then
        if [ "${OCF_RESKEY_dbg}" = "true" ]; then
            ocf_log info "${monitor_tag} called while ${OCF_RESKEY_binary} not running."
        fi
        return "${OCF_NOT_RUNNING}"
    fi

    hwmond_status
    return $?
}
# Start action.
#
# If the daemon is already running it is ping-tested with hwmond_status:
# a healthy instance yields OCF_SUCCESS, an unhealthy one is stopped and a
# new instance is launched. After launch the pid file, the process and the
# status file are verified.
#
# Globals read: OCF_RESKEY_dbg, OCF_RESKEY_binary, OCF_RESKEY_pid,
#               OCF_RESKEY_state, OCF_RESKEY_mode, OCF_RESKEY_logging,
#               mydaemon, statusfile
# Returns:      OCF_SUCCESS        running (already, or freshly started)
#               OCF_RUNNING_MASTER an unhealthy instance could not be stopped
#               OCF_NOT_RUNNING    any launch / verification failure
hwmond_start () {
    local rc
    local pid loop
    local start_proc="hwmond:start"

    if [ "${OCF_RESKEY_dbg}" = "true" ]; then
        ocf_log info "${start_proc}"
    fi

    # Uncomment if you want the start function to force-pass without starting
    # return ${OCF_SUCCESS}

    # If running then issue a ping test
    pid=$(cat "${OCF_RESKEY_pid}" 2>/dev/null)
    if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
        hwmond_status
        rc=$?
        if [ "${rc}" -ne "${OCF_SUCCESS}" ]; then
            ocf_log err "${start_proc} ping test failed rc=${rc}"
            hwmond_stop
        else
            # Spec says to return success if process is already running for start
            pid=$(cat "${OCF_RESKEY_pid}" 2>/dev/null)
            if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
                ocf_log info "${start_proc} called while ${OCF_RESKEY_binary} is already running"
                return "${OCF_SUCCESS}"
            fi
        fi
    fi

    # should not be running now or error
    pid=$(cat "${OCF_RESKEY_pid}" 2>/dev/null)
    if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
        ocf_log err "${start_proc} cannot kill off existing instance of ${OCF_RESKEY_binary}"
        # NOTE(review): OCF_RUNNING_MASTER is an unusual code for a failed
        # start; kept because callers may depend on it -- confirm.
        return "${OCF_RUNNING_MASTER}"
    fi

    rm -f "${statusfile}"

    # Build the daemon's option list in the function's own positional
    # parameters (state, logging, mode, debug -- same order as before) so no
    # string needs to be word-split.
    set --
    if [ "${OCF_RESKEY_state}" = "active" ]; then
        set -- "$@" -a
    fi
    if [ "${OCF_RESKEY_logging}" = "true" ]; then
        set -- "$@" -l
    fi
    if [ "${OCF_RESKEY_mode}" = "passive" ]; then
        set -- "$@" -p
    fi
    if [ "${OCF_RESKEY_dbg}" = "true" ]; then
        set -- "$@" -d debug
    fi

    # default PID to null
    pid=""

    # Try to Start the daemon
    "${mydaemon}" "$@"
    rc=$?

    # verify it was started and set return code appropriately
    if [ "${rc}" -eq "${OCF_SUCCESS}" ]; then
        # Give the pid file up to 3 seconds to show up
        loop=0
        while [ "${loop}" -lt 3 ] && [ ! -f "${OCF_RESKEY_pid}" ]; do
            ocf_log info "${start_proc} waiting ... loop=${loop}"
            sleep 1
            loop=$((loop + 1))
        done

        pid=$(cat "${OCF_RESKEY_pid}" 2>/dev/null)
        if [ -z "${pid}" ] || ! kill -0 "${pid}" 2>/dev/null; then
            rc=${OCF_FAILED_MASTER}
        elif [ ! -f "${statusfile}" ]; then
            ocf_log info "hwmond: Startup Health Test Failed - missing info"
            # This used to read 'rc = ...', which runs a command called
            # 'rc' and assigns nothing, so the failure was reported as a
            # successful start.
            # NOTE(review): the check is now effective; confirm the daemon
            # writes the status file before the launch command returns.
            rc=${OCF_ERR_GENERIC}
        fi
    else
        ocf_log info "${start_proc} failed ${mydaemon} daemon rc=${rc}"
        rc=${OCF_ERR_GENERIC}
    fi

    # Record success or failure and return status; every failure is
    # reported to the caller as OCF_NOT_RUNNING.
    if [ "${rc}" -eq "${OCF_SUCCESS}" ]; then
        ocf_log info "${start_proc}ed pid=${pid}"
    else
        ocf_log err "${start_proc} failed rc=${rc}"
        rc=${OCF_NOT_RUNNING}
    fi
    return "${rc}"
}
# Last step of a stop: if pidof still finds the daemon, kill it with
# SIGKILL, then remove the pid file.
#
# Globals read:    OCF_RESKEY_binary, OCF_RESKEY_pid
# Globals written: proc, pid (not declared local)
# Returns:         exit status of the final 'rm -f'
hwmond_confirm_stop () {
    proc="hwmond:confirm_stop"
    ocf_log info "${proc}"

    # pidof may print several pids; ${pid} is deliberately left unquoted
    # below so each one reaches kill as its own argument.
    pid=$(pidof ${OCF_RESKEY_binary})

    if kill -0 ${pid} 2> /dev/null; then
        ocf_log info "${proc} 'kill -9 ${pid}'"
        kill -9 ${pid}
        ocf_log info "${proc}ed (by emergency kill -9 ${pid})"
        sleep 1
    fi

    rm -f ${OCF_RESKEY_pid}
}
# Stop action: ask the daemon to exit with SIGINT (up to 3 attempts, one
# second apart), then let hwmond_confirm_stop force-kill whatever is left
# and remove the pid file.
#
# Globals read: OCF_RESKEY_binary, OCF_RESKEY_pid
# Returns:      OCF_SUCCESS (always)
hwmond_stop () {
    local pid loop
    local max_tries=3
    local stop_tag="hwmond:stop"

    # See if any instance of the process is running (by name)
    pid=$(pidof "${OCF_RESKEY_binary}")
    ocf_log info "${stop_tag} PID:${pid}"
    # ${pid} is unquoted on purpose: pidof may print several pids
    if [ -z "${pid}" ] || ! kill -0 ${pid} 2> /dev/null; then
        ocf_log info "${stop_tag} called while already stopped (no process)"
        hwmond_confirm_stop
        return "${OCF_SUCCESS}"
    fi

    loop=0
    while [ "${loop}" -lt "${max_tries}" ]; do
        # Without a pid file there is no pid to signal; leave the rest to
        # hwmond_confirm_stop.
        if [ ! -f "${OCF_RESKEY_pid}" ]; then
            break
        fi

        pid=$(cat "${OCF_RESKEY_pid}")
        if [ -z "${pid}" ]; then
            # Empty pid file. The comparison must be quoted: unquoted, an
            # empty value is a '[' syntax error and kill is then called
            # without a pid.
            ocf_log info "${stop_tag}ped (by -int)"
            break
        elif ! kill -0 "${pid}" 2> /dev/null; then
            # The recorded pid is no longer alive
            ocf_log info "${stop_tag}ped (by pid)"
            break
        fi

        # Still alive: request a graceful exit and re-check after a second
        ocf_log info "${stop_tag}ping (by -int - loop:${loop})"
        kill -s INT "${pid}"
        sleep 1
        loop=$((loop + 1))
    done

    hwmond_confirm_stop
    return "${OCF_SUCCESS}"
}
# Reload action: stop the daemon and, when that succeeds, start it again.
#
# Globals read: OCF_RESKEY_dbg, OCF_RESKEY_binary
# Returns:      the status of hwmond_stop when it fails, otherwise the
#               status of hwmond_start.
hwmond_reload () {
    local rc
    # A dedicated name is used because the functions called below assign
    # the shared 'proc' variable and would overwrite this tag.
    local reload_tag="hwmond:reload"

    if [ "${OCF_RESKEY_dbg}" = "true" ]; then
        ocf_log info "${reload_tag}"
    fi

    hwmond_stop
    rc=$?
    if [ "${rc}" -eq "${OCF_SUCCESS}" ]; then
        hwmond_start
        rc=$?
    fi

    # Both messages used to be logged through a misspelled variable
    # (mgs instead of msg) and therefore came out empty.
    if [ "${rc}" -eq "${OCF_SUCCESS}" ]; then
        ocf_log info "${reload_tag}ed"
    else
        # Logged at 'err' like the failure path of hwmond_start
        ocf_log err "${OCF_RESKEY_binary}: failed to restart rc=${rc}"
    fi
    return "${rc}"
}
# Entry point: dispatch on the requested action.
# __OCF_ACTION is not assigned in this file; presumably the sourced OCF
# helper library sets it from the first command-line argument -- confirm.
# It is quoted everywhere so that a missing action falls through to the
# usage branch instead of producing a '[' syntax error.

# meta-data and usage/help are answered without validating the environment
case "${__OCF_ACTION}" in
    meta-data)
        meta_data
        exit "${OCF_SUCCESS}"
        ;;
    usage|help)
        usage
        exit "${OCF_SUCCESS}"
        ;;
esac

# The monitor action is logged at debug level, every other action at info
if [ "${__OCF_ACTION}" = "monitor" ]; then
    ocf_log debug "hwmond:${__OCF_ACTION} action"
else
    ocf_log info "hwmond:${__OCF_ACTION} action"
fi

# Anything except meta-data and help must pass validation
hwmond_validate || exit $?

# The status of the selected handler is the last command's status here
case "${__OCF_ACTION}" in
    start)
        hwmond_start
        ;;
    stop)
        hwmond_stop
        ;;
    status)
        hwmond_status
        ;;
    reload)
        hwmond_reload
        ;;
    monitor)
        hwmond_monitor
        ;;
    validate-all)
        hwmond_validate
        ;;
    *)
        usage
        exit "${OCF_ERR_UNIMPLEMENTED}"
        ;;
esac