#!/bin/sh
# Start/stop ceph daemons
# chkconfig: 2345 60 80

### BEGIN INIT INFO
# Provides:          ceph
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Required-Start:    $remote_fs $named $network $time
# Required-Stop:     $remote_fs $named $network $time
# Short-Description: Start Ceph distributed file system daemons at boot time
# Description:       Enable Ceph distributed file system services.
### END INIT INFO

if [ `dirname $0` = "." ] && [ $PWD != "/etc/init.d" ]; then
    # looks like an autotools src dir build
    BINDIR=.
    SBINDIR=.
    LIBDIR=.
    LIBEXECDIR=.
    ETCDIR=.
    ASSUME_DEV=1
else
    if [ -e CMakeCache.txt ] && [ -e bin/init-ceph ]; then
        # looks like a cmake build directory
        CEPH_ROOT=`grep ceph_SOURCE_DIR CMakeCache.txt | cut -d "=" -f 2`
        BINDIR=bin
        SBINDIR=bin
        LIBEXECDIR=$CEPH_ROOT/src
        ETCDIR=.
        ASSUME_DEV=1
        CEPH_LIB=$CEPH_ROOT/build/lib
        echo "$PYTHONPATH" | grep -q $CEPH_LIB || export PYTHONPATH=$CEPH_LIB/cython_modules/lib.@MGR_PYTHON_VERSION_MAJOR@:$PYTHONPATH
        echo "$LD_LIBRARY_PATH" | grep -q $CEPH_LIB || export LD_LIBRARY_PATH=$CEPH_LIB:$LD_LIBRARY_PATH
        echo "$DYLD_LIBRARY_PATH" | grep -q $CEPH_LIB || export DYLD_LIBRARY_PATH=$CEPH_LIB:$DYLD_LIBRARY_PATH
    else
        BINDIR=/usr/bin
        SBINDIR=/usr/sbin
        LIBDIR=/usr/lib/ceph
        LIBEXECDIR=$LIBDIR
        ETCDIR=/etc/ceph
        ASSUME_DEV=0
    fi
fi

if [ -n "$CEPH_BIN" ] && [ -n "$CEPH_ROOT" ] && [ -n "$CEPH_BUILD_DIR" ]; then
    BINDIR=$CEPH_BIN
    SBINDIR=$CEPH_ROOT/src
    ETCDIR=$CEPH_BIN
    LIBEXECDIR=$CEPH_ROOT/src
    ASSUME_DEV=1
fi

####################################################
#### StarlingX: Check if a ceph process is hung ####
####################################################
# Each Ceph process has a state and a status.
# - States are used by this script for decisions.
# - Statuses are $UP or $DOWN and represent a process' working state as
#   reported by Ceph.
# OSD processes can have a status of "up" or "down" based on what Ceph is
# reporting. We ignore the "in" status (we consider it as "down").
#
# Ceph process states:
# 1. STARTUP     - a process is started but has not yet joined the cluster
# 2. OPERATIONAL - a process joined the cluster
# 3. HANGED      - a process is hung
# 4. STOPPED     - a process is offline (/etc/init.d start was not executed)
#
# Monitoring is done differently based on state.
# All timers are in seconds.
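#
# Illustrative transition summary for the state machine implemented below
# (comments only, not executed; derived from the functions in this file):
#   STARTED        -> OPERATIONAL  once Ceph reports the daemon "up"
#   STARTED        -> HANGED       if still not "up" after WAIT_FOR_*_OPERATIONAL
#   OPERATIONAL    -> HANGED       if reported "down" for WAIT_FOR_*_DOWN_CONFIRM
#   HANGED/STOPPED -> STARTED      reset externally by /etc/ceph/ceph_pmon_wrapper.sh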
# FSM states
ST_STARTUP="STARTED"
ST_OPER="OPERATIONAL"
ST_HANGED="HANGED"
ST_STOPPED="STOPPED"

# Ceph OSD status
UP="up"
DOWN="down"

# paths
source /usr/bin/tsconfig
DATA_PATH=$VOLATILE_PATH/ceph_hang    # folder where we keep state information
LOG_PATH=/var/log/ceph
LOG_FILE=$LOG_PATH/ceph-process-states.log
mkdir -p $DATA_PATH                   # make sure folder exists

# timeouts
WAIT_FOR_CMD=10                  # max wait for a response from Ceph
WAIT_FOR_OSD_OPERATIONAL=300     # max wait for OSD to go 'up' when in startup state
WAIT_FOR_OSD_DOWN_CONFIRM=300    # even if OSD is down it may be flapping, wait for a while
WAIT_FOR_MON_OPERATIONAL=60      # max wait for MON to go 'up' when in startup state
WAIT_FOR_MON_DOWN_CONFIRM=300    # even if MON is down it may be flapping, wait for a while

# Blocked Ops Detection (values in seconds)
BLOCKED_OPS_DETECTION_ENABLED="true"    # Enable/Disable detection
BLOCKED_OPS_START_DETECTION=300         # Wait on OSD startup before handling blocked ops
BLOCKED_OPS_RESTART_THRESH=480          # Restart OSD if blocked reqs are longer than this

# Stuck Peering Detection (values in seconds)
STUCK_PEERING_DETECTION_ENABLED="true"  # Enable/Disable detection
STUCK_PEERING_START_DETECTION=300       # Wait on OSD startup before handling stuck peering
STUCK_PEERING_RESTART_THRESH=180        # Restart OSD if stuck peering is longer than this

LOG_LEVEL=NORMAL  # DEBUG

save_proc_state() {
    # Set the state of a process in the state machine and store it in a file
    local name=$1
    local state=$2
    if [ "$state" != "$ST_STARTUP" ] && [ "$state" != "$ST_OPER" ] && \
       [ "$state" != "$ST_HANGED" ] && [ "$state" != "$ST_STOPPED" ]; then
        wlog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP"
        print_trace
        state=$ST_STARTUP
    fi
    echo "$state" > ${DATA_PATH}/.${name}_state
}

load_proc_state() {
    # Get the state of the state machine from file and validate content
    local name=$1
    local state=$ST_STARTUP
    if [ -f ${DATA_PATH}/.${name}_state ]; then
        state=$(cat ${DATA_PATH}/.${name}_state)
    fi
    if [ "$state" != "$ST_STARTUP" ] && [ "$state" != "$ST_OPER" ] && \
       [ "$state" != "$ST_HANGED" ] && [ "$state" != "$ST_STOPPED" ]; then
        wlog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP"
        print_trace
        state=$ST_STARTUP
        save_proc_state $name $state
    fi
    echo "$state"; return
}

save_proc_startup_ok() {
    # Reset to initial state after a process started successfully (i.e. it has a valid pid)
    local name=$1
    # Process just started, clear all records, reset state
    rm -f ${DATA_PATH}/.${name}_start_time
    rm -f ${DATA_PATH}/.${name}_down_time
    save_proc_state $name $ST_STARTUP
    # Save the time when the process was started
    wlog $name INFO "Process $ST_STARTUP successfully, waiting for it to become $ST_OPER"
    echo $(date +%s) > ${DATA_PATH}/.${name}_start_time
}

save_proc_status() {
    # Store the status of a process ($UP or $DOWN) and its down time
    # Note that a process status is different from its state
    local name=$1
    local status=$2
    if [ "$status" == $UP ]; then
        rm -f ${DATA_PATH}/.${name}_down_time
    else
        if [ ! -f ${DATA_PATH}/.${name}_down_time ]; then
            echo $(date +%s) > ${DATA_PATH}/.${name}_down_time
        fi
    fi
}

load_proc_status() {
    # Load a process status ($UP or $DOWN)
    local name=$1
    if [ -f ${DATA_PATH}/.${name}_down_time ]; then
        echo $DOWN
    else
        echo $UP
    fi
}
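# Example usage of the record helpers above (illustrative only; the file names
# follow the $DATA_PATH conventions used by these functions):
#   save_proc_status osd.1 $DOWN    # creates ${DATA_PATH}/.osd.1_down_time on first call
#   load_proc_status osd.1          # prints "down" while that record exists
#   load_proc_state  osd.1          # prints the persisted FSM state, e.g. "OPERATIONAL"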
get_duration() {
    # Get duration based on file record and current time
    local name=$1
    local record=$2  # filename holding the previously recorded time
    # Check that we have a filename
    if [ ! -f $record ]; then
        wlog $name ERROR "Failed to compute duration, time was never stored in $record!"
        print_trace
        echo "-1"; return
    fi
    # Get and validate time previously saved in file
    local start_time=$(cat $record)
    re="^[0-9]+$"
    if ! [[ "$start_time" =~ $re ]] ; then
        wlog $name ERROR "Recorded time '$start_time' is not a number!"
        print_trace
        echo "-1"; return
    fi
    # Compute duration
    local now=$(date +%s)
    local duration=$((now-start_time))
    if [ "$duration" -lt 0 ]; then
        wlog $name ERROR "Duration is less than 0!"
        print_trace
        echo "-1"; return
    fi
    echo $duration
}

get_proc_run_time() {
    local name=$1
    local time=$(get_duration $name ${DATA_PATH}/.${name}_start_time)
    wlog $name DEBUG ">>> process running for: ${time}s"
    echo $time
}

get_proc_down_time() {
    local name=$1
    local time=$(get_duration $name ${DATA_PATH}/.${name}_down_time)
    wlog $name DEBUG ">>> process down for: ${time}s"
    echo $time
}

run_state_machine() {
    # Small state machine, returns process state as defined in the ST_* constants.
    # The same logic applies to both ceph-osd and ceph-mon, only the timeouts differ.
    local name=$1                    # 'osd.<id>' or 'mon.<id>'
    local type=$2                    # 'osd' or 'mon'
    local status=$3                  # daemon current status ($UP or $DOWN)
    local wait_for_operational=$4    # how much time to wait for a process to go up
    local wait_for_down_confirm=$5   # how much time to wait before reporting a process as down

    local state=$(load_proc_state $name)
    wlog $name "DEBUG" ">>> state: $state"

    # state machine
    if [ "$state" = "$ST_STARTUP" ]; then
        wlog $name "DEBUG" ">>> status: $status"
        if [ "$status" = $UP ]; then
            save_proc_state $name $ST_OPER
            wlog $name "INFO" "Process is OPERATIONAL"
            echo $ST_OPER; return
        else
            # the process should be 'up' within ${wait_for_operational} seconds!
            if [ $(get_proc_run_time $name) -gt $wait_for_operational ]; then
                # process hung!
                wlog $name "ERROR" "Process failed to go up in ${wait_for_operational}s after start, reporting it as $ST_HANGED!"
                save_proc_state $name $ST_HANGED
                echo $ST_HANGED; return
            fi
        fi
    elif [ "$state" = "$ST_OPER" ]; then
        if [ "$status" = "$DOWN" ]; then
            if [ $(load_proc_status $name) = $UP ]; then
                wlog $name "WARN" "Process went down!"
                save_proc_status $name $DOWN
            fi
            # if a process is down we don't report it as hung for a while;
            # this should avoid status flapping
            if [ $(get_proc_down_time $name) -gt $wait_for_down_confirm ]; then
                # the process has been down for a long time, report it as hung!
                wlog $name "ERROR" "Process went down for more than ${wait_for_down_confirm}s, reporting it as $ST_HANGED"
                save_proc_state $name $ST_HANGED
                echo $ST_HANGED; return
            fi
        elif [ "$status" = "$UP" ]; then
            if [ $(load_proc_status $name) = $DOWN ]; then
                wlog $name "WARN" "Process went up, flapping status or busy process?"
                save_proc_status $name $UP
                return
            fi
        fi
    elif [ "$state" = "$ST_HANGED" ] || [ "$state" = "$ST_STOPPED" ]; then
        # nothing to do, resetting from these states is done externally in /etc/ceph/ceph_pmon_wrapper.sh
        echo $state; return
    fi
}
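# Example call (illustrative): evaluate an OSD that 'ceph osd tree' reports as
# down, using the OSD timeouts defined above:
#   state=$(run_state_machine osd.1 osd $DOWN \
#               $WAIT_FOR_OSD_OPERATIONAL $WAIT_FOR_OSD_DOWN_CONFIRM)
#   [ "$state" = "$ST_HANGED" ] && echo "osd.1 considered hung"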
CEPH_FAILURE=""
execute_ceph_cmd() {
    # Execute a command and, in case it times out, mark Ceph as failed.
    # $1 - name of the variable that receives the command output
    # $2 - process name, used for logging
    # $3 - command to execute
    local outvar=$1
    local name=$2
    local cmd="timeout $WAIT_FOR_CMD $3"
    local output=""
    set -o pipefail
    eval "$cmd >$DATA_PATH/.ceph_cmd_out"
    errcode=$?
    set +o pipefail
    output=$(cat $DATA_PATH/.ceph_cmd_out)
    if [ -z "$output" ] && [ $errcode -eq 124 ]; then
        # 'timeout' returns 124 when timing out
        wlog $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd"
        CEPH_FAILURE="true"
        eval $outvar=""
        return 1
    fi
    if [ -z "$output" ] || [ $errcode -ne 0 ]; then
        wlog $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output"
        eval $outvar=""
        return 1
    fi
    eval $outvar='$output'
    return $errcode
}

CEPH_OSD_TREE=""
CEPH_HEALTH_DETAIL=""

is_process_hung() {
    local name=$1
    local type=$2  # 'osd' or 'mon'

    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        wlog $name "WARN" "Ceph cluster is marked as failed, aborting hang check"
        echo "false"; return
    fi

    # Cache Ceph health for later use, as calling Ceph takes time
    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
        if [ $? -ne 0 ]; then
            wlog $name "WARN" "Aborting hang check"
            echo "false"; return
        fi
    fi

    # Check if an OSD is hung
    if [ "$type" = "osd" ]; then
        # Ignore health check if OSDs are administratively down
        # Note this can be done with: 'ceph osd set noup; ceph osd down <id>'
        $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
        if [ $? -eq 0 ]; then
            wlog $name "WARN" "Ceph 'noup' flag is set, aborting hang check"
            echo "false"; return
        fi
        # Multiple OSD processes may be running, so we only run
        # 'ceph osd tree' once as it takes some time to execute
        if [ -z "$CEPH_OSD_TREE" ]; then
            execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
            if [ $? -ne 0 ]; then
                wlog $name "WARN" "Ceph cmd exec failed, aborting hang check"
                echo "false"; return
            fi
        fi
        # Get osd status as 'up' or, for any other output, as 'down'
        echo "$CEPH_OSD_TREE" | grep $name | grep -q "up"
        if [ "$?" -eq 0 ]; then
            osd_status=$UP
        else
            osd_status=$DOWN
        fi
        local state=$(run_state_machine $name $type $osd_status \
                          $WAIT_FOR_OSD_OPERATIONAL $WAIT_FOR_OSD_DOWN_CONFIRM)
        if [ "$state" = "$ST_HANGED" ]; then
            echo "true"; return
        else
            echo "false"; return
        fi
    # Check if a Monitor is hung
    elif [ "$type" = "mon" ]; then
        # Get monitor status info
        local mon_status=$UP
        echo "$CEPH_HEALTH_DETAIL" | grep -q -e "^[[:space:]]*$name.*down"
        if [ $? -eq 0 ]; then
            mon_status=$DOWN
        fi
        local state=$(run_state_machine $name $type $mon_status \
                          $WAIT_FOR_MON_OPERATIONAL $WAIT_FOR_MON_DOWN_CONFIRM)
        if [ "$state" = "$ST_HANGED" ]; then
            echo "true"; return
        else
            echo "false"; return
        fi
    else
        wlog $name "WARN" "Unknown process type: $type"
    fi
    echo "false"
}
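# Example (illustrative): how the status handler further below drives the hang
# check. 'ceph health detail' output is fetched once and cached in
# CEPH_HEALTH_DETAIL by the call above:
#   is_hung=$(is_process_hung osd.1 osd)   # prints "true" or "false"
#   [ "$is_hung" = "true" ] && echo "osd.1: hung."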
osd_has_blocked_ops() {
    local name=$1

    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        wlog $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check"
        echo "false"; return
    fi

    # Cache Ceph health for later use, as calling Ceph takes time. This is
    # initially cached by the hang check, but check and call again here if
    # needed.
    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
        if [ $? -ne 0 ]; then
            wlog $name "WARN" "Aborting blocked ops check"
            echo "false"; return
        fi
    fi

    # Ignore health check if OSDs are administratively down
    # Note this can be done with: 'ceph osd set noup; ceph osd down <id>'
    $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
    if [ $? -eq 0 ]; then
        wlog $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check"
        echo "false"; return
    fi

    # Multiple OSD processes may be running, so we only run 'ceph osd tree' once
    # as it takes some time to execute. This is initially cached by the hang
    # check, but check and call again here if needed.
    if [ -z "$CEPH_OSD_TREE" ]; then
        execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
        if [ $? -ne 0 ]; then
            wlog $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
            echo "false"; return
        fi
    fi

    # Get osd status as 'up' or, for any other output, as 'down'
    echo "$CEPH_OSD_TREE" | grep $name | grep -q "up"
    if [ "$?" -eq 0 ]; then
        # Look for and parse: '1 ops are blocked > 1048.58 sec on osd.1'
        local blocked_time=$(echo "$CEPH_HEALTH_DETAIL" | grep $name | sed -rn 's/.*ops are blocked > ([[:digit:]]*).*/\1/p')
        [[ "$blocked_time" == "" ]] && blocked_time=0
        if [ $blocked_time -gt $BLOCKED_OPS_RESTART_THRESH ]; then
            wlog $name "WARN" "Detected blocked operations for $blocked_time seconds"
            echo "true"; return
        else
            echo "false"; return
        fi
    fi
}

osd_has_stuck_peering() {
    local name=$1
    local id=$2

    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        wlog $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check"
        echo "false"; return
    fi

    # Cache Ceph health for later use, as calling Ceph takes time. This is
    # initially cached by the hang check, but check and call again here if
    # needed.
    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
        if [ $? -ne 0 ]; then
            wlog $name "WARN" "Aborting stuck peering check"
            echo "false"; return
        fi
    fi

    # Ignore health check if OSDs are administratively kept up
    # Note this can be done with: 'ceph osd set nodown; /etc/init.d/ceph stop <id>'
    $(echo "$CEPH_HEALTH_DETAIL" | grep -q "nodown.*set")
    if [ $? -eq 0 ]; then
        wlog $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check"
        echo "false"; return
    fi

    file="${DATA_PATH}/.${name}_stuck_peering_start"
    max_blocked_time=0
    $(echo "$CEPH_HEALTH_DETAIL" | grep "stuck peering" | awk '{split($0,a,"acting"); print a[2]}' | grep -q $id)
    if [ "$?" -eq 0 ]; then
        while read -r line; do
            $(echo $line | awk '{split($0,a,"acting"); print a[2]}' | grep -q $id)
            [ $? -eq 0 ] || continue
            # Look for and parse: 'pg 1.3b is stuck peering for 69.893513, current state peering, last acting [1,0]'
            local blocked_time=$(echo "$line" | sed -rn 's/.*stuck peering for ([[:digit:]]*).*/\1/p')
            if [ $blocked_time -gt $max_blocked_time ]; then
                max_blocked_time="$blocked_time"
            fi
        done <<< "$(echo "$CEPH_HEALTH_DETAIL" | grep "stuck peering")"
    fi

    if [ $max_blocked_time -gt 0 ]; then
        if [ -f ${file} ]; then
            # check against old time
            first_blocked_time=$(cat ${file})
            blocked_time=$((max_blocked_time - first_blocked_time))
            # if the stuck peering time restarted between status commands
            # then allow it
            if [ $blocked_time -lt 0 ]; then
                blocked_time=$max_blocked_time
                echo $max_blocked_time > ${file}
            fi
            if [ $blocked_time -gt $STUCK_PEERING_RESTART_THRESH ]; then
                wlog $name "WARN" "Detected stuck peering for $blocked_time seconds"
                echo "true"; return
            else
                echo "false"; return
            fi
        else
            # register the time for the first detected stuck peering
            echo $max_blocked_time > ${file}
        fi
    else
        rm -f ${file} 2>/dev/null
    fi
}
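# Example (illustrative): the status handler further below gates these checks
# on daemon uptime and calls them as
#   has_blocked_ops=$(osd_has_blocked_ops osd.1)          # "true", "false" or empty
#   has_stuck_peering=$(osd_has_stuck_peering osd.1 1)    # "true", "false" or empty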
######################
#### StarlingX END ###
######################

if [ `uname` = FreeBSD ]; then
    GETOPT=/usr/local/bin/getopt
else
    GETOPT=getopt
fi

if id ceph > /dev/null 2>&1; then
    SET_CEPHUSER_ARGS=" --setuser ceph --setgroup ceph"
fi

usage_exit() {
    echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
    printf "Core options:\n"
    printf "\t--allhosts / -a           execute (via ssh) on all hosts in conf file\n"
    printf "\t--cluster [cluster name]  define the cluster name\n"
    printf "\t--conf / -c [conf file]   use [conf file] instead of default\n"
    printf "\t--help / -h               show this usage message\n"
    printf "\t--hostname [hostname]     override hostname lookup\n"
    printf "\t-m [mon addr]             mon address\n"
    printf "\n"
    printf "Other options:\n"
    printf "\t--btrfs                   btrfs\n"
    printf "\t--nobtrfs                 no btrfs\n"
    printf "\t--btrfsumount             btrfs umount\n"
    printf "\t--fsmount                 fsmount\n"
    printf "\t--nofsmount               no fsmount\n"
    printf "\t--fsumount                fsumount\n"
    printf "\t--restart                 restart on core dump\n"
    printf "\t--norestart               do not restart on core dump\n"
    printf "\t--valgrind                run via valgrind\n"
    printf "\t--novalgrind              do not run via valgrind\n"
    printf "\t--verbose / -v            be verbose\n"
    exit
}

# behave if we are not completely installed (e.g., Debian "removed,
# config remains" state)
test -f $LIBEXECDIR/ceph_common.sh || exit 0

. $LIBEXECDIR/ceph_common.sh

EXIT_STATUS=0

signal_daemon() {
    name=$1
    daemon=$2
    pidfile=$3
    signal=$4
    action=$5
    [ -z "$action" ] && action="Stopping"
    printf "$action Ceph $name on $host..."
    do_cmd "if [ -e $pidfile ]; then
        pid=\`cat $pidfile\`
        if ps -p \$pid -o args= | grep -q $daemon; then
            cmd=\"kill $signal \$pid\"
            printf \"\$cmd...\"
            \$cmd
        fi
    fi"
    echo done
}

daemon_is_running() {
    name=$1
    daemon=$2
    daemon_id=$3
    pidfile=$4
    do_cmd "[ -e $pidfile ] || exit 1   # no pid, presumably not running
        pid=\`cat $pidfile\`
        ps -p \$pid -o args= | grep $daemon | grep -qwe -i.$daemon_id && exit 0  # running
        exit 1  # pid is something else" "" "okfail"
}

stop_daemon() {
    name=$1
    daemon=$2
    pidfile=$3
    signal=$4
    action=$5
    timeout=$6
    [ -z "$action" ] && action="Stopping"
    printf "$action Ceph $name on $host..."
    do_cmd "if [ -e $pidfile ] ; then
        pid=\`cat $pidfile\`
        timeout=$timeout
        while ps -p \$pid -o args= | grep -q $daemon; do
            if [ ! -z \"\$timeout\" ]; then
                if [ \$timeout -lt 0 ]; then
                    break
                fi
                timeout=\$((timeout - 1))
            fi
            cmd=\"kill $signal \$pid\"
            printf \"\$cmd...\"
            \$cmd
            sleep 1
            continue
        done
    fi"
    echo done
}

## command line options
options=

IFS=" " read -r -a args <<< "$@"
wlog "-" INFO "$@"

OPTS=$(${GETOPT} -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "${args[@]}")
if [ $? != 0 ]
then
    exit 1
fi

eval set -- "$OPTS"

dovalgrind=
docrun=
allhosts=0
monaddr=
dofsmount=1
dofsumount=0
verbose=0
use_default_conf=1

## set variables like cluster or conf
[ -e /etc/sysconfig/ceph ] && . /etc/sysconfig/ceph
[ -e /etc/default/ceph ] && . /etc/default/ceph
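# Example invocations parsed by the option loop below (illustrative; hosts,
# daemon names and paths are placeholders):
#   /etc/init.d/ceph start osd.2
#   /etc/init.d/ceph -a -c /etc/ceph/ceph.conf stop
#   /etc/init.d/ceph --cluster ceph status mon.0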
while echo $1 | grep -q '^-'; do    # FIXME: why not '^-'?
    case $1 in
        -v | --verbose)
            verbose=1
            ;;
        --valgrind)
            dovalgrind=1
            ;;
        --novalgrind)
            dovalgrind=0
            ;;
        --allhosts | -a)
            allhosts=1
            ;;
        --restart)
            docrun=1
            ;;
        --norestart)
            docrun=0
            ;;
        -h | --help)
            usage_exit
            ;;
        -m )
            [ -z "$2" ] && usage_exit
            options="$options $1"
            shift
            MON_ADDR=$1
            ;;
        --btrfs | --fsmount)
            dofsmount=1
            ;;
        --nobtrfs | --nofsmount)
            dofsmount=0
            ;;
        --btrfsumount | --fsumount)
            dofsumount=1
            ;;
        --conf | -c)
            [ -z "$2" ] && usage_exit
            options="$options $1"
            shift
            use_default_conf=0
            conf=$1
            ;;
        --cluster )
            [ -z "$2" ] && usage_exit
            options="$options $1"
            shift
            cluster=$1
            ;;
        --hostname )
            [ -z "$2" ] && usage_exit
            options="$options $1"
            shift
            hostname=$1
            ;;
        -- )
            shift
            break
            ;;
        *)
            echo unrecognized option \'$1\'
            usage_exit
            ;;
    esac
    options="$options $1"
    shift
done

# if `--cluster` was not passed in, fall back to looking at the config name
if [ -z "$cluster" ]; then
    cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
else
    # if we were told to use a given cluster name then $conf needs to be updated,
    # but only define it if `--conf` was not specified, otherwise we would be silently
    # overriding $conf even if it was defined with `--conf`
    if [ $use_default_conf -eq 1 ]; then
        conf="/etc/ceph/$cluster.conf"
    fi
fi

verify_conf

command=$1
[ -n "$*" ] && shift

get_local_name_list
get_name_list "$@"

# Reverse the order if we are stopping
if [ "$command" = "stop" -o "$command" = "onestop" ]; then
    for f in $what; do
        new_order="$f $new_order"
    done
    what="$new_order"
fi

. /etc/platform/platform.conf

# On AIO-DX, pmon monitors the ceph-mds process. If ceph-mon is not running,
# ceph-mds will hang when starting, so check whether we are trying to bring up
# ceph-mds while ceph-mon is not ready yet.
if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then
    if [ "${command}" = "start" -o "${command}" = "onestart" ]; then
        what_out=
        what_mds=
        re="\s*mon"
        if [[ ${what} =~ ${re} ]]; then
            has_mon=1
        else
            has_mon=0
            CEPH_STATUS=''
            execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s"
            if [ $? -eq 0 ]; then
                has_mon=1
            fi
        fi
        for name in ${what}; do
            type=$(echo "${name}" | cut -c 1-3)
            if [ "${type}" == "mds" ]; then
                what_mds="${name}"
                continue
            fi
            what_out+=" ${name}"
        done
        if [ ${has_mon} -eq 1 ] && [ ! -z "${what_mds}" ]; then
            what_out+=" ${what_mds}"
        fi
        what="${what_out}"
    fi
    # If the variable 'what' is empty, then we were asked to bring up only
    # ceph-mds while ceph-mon is not active. In that case ceph-mds cannot be
    # started yet, so return an error.
    if [ -z "${what}" ]; then
        EXIT_STATUS=1
    fi
fi

# Check if the monitors are up before starting any mds.
# This is needed only for Standard deployments.
if [ "$system_type" == "Standard" ]; then
    CEPH_STATUS=''
    execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s"
    if [ "$?" -ne 0 ]; then
        what_out=
        for name in $what; do
            type=$(echo $name | cut -c 1-3)
            if [ "$type" == "mds" ]; then
                continue
            fi
            what_out+=" $name"
        done
        what=$what_out
    fi
fi
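# Illustrative example for the per-daemon loop below: for the name "osd.2",
# type="osd" and id=num=2; the pid file then defaults to /var/run/ceph/osd.2.pid
# and the admin socket to /var/run/ceph/$cluster-osd.2.asok (values taken from
# the get_conf defaults used in the loop).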
for name in $what; do
    type=`echo $name | cut -c 1-3`   # e.g. 'mon', if $item is 'mon1'
    id=`echo $name | cut -c 4- | sed 's/^\\.//'`
    num=$id
    name="$type.$id"

    check_host $cluster || continue

    binary="$BINDIR/ceph-$type"
    cmd="$binary -i $id"
    if [ $ASSUME_DEV -eq 1 ]; then
        cmd="PATH=$PWD:$PATH $cmd"
    fi

    get_conf run_dir "/var/run/ceph" "run dir"
    get_conf pid_file "$run_dir/$type.$id.pid" "pid file"

    if [ "$command" = "start" -o "$command" = "onestart" ]; then
        if [ -n "$pid_file" ]; then
            do_cmd "mkdir -p "`dirname $pid_file`
            cmd="$cmd --pid-file $pid_file"
        fi

        get_conf log_dir "" "log dir"
        [ -n "$log_dir" ] && do_cmd "mkdir -p $log_dir"

        get_conf auto_start "" "auto start"
        if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
            if [ -z "$@" ]; then
                echo "Skipping Ceph $name on $host... auto start is disabled"
                continue
            fi
        fi

        if daemon_is_running $name ceph-$type $id $pid_file; then
            echo "Starting Ceph $name on $host...already running"
            continue
        fi

        get_conf copy_executable_to "" "copy executable to"
        if [ -n "$copy_executable_to" ]; then
            scp $binary "$host:$copy_executable_to"
            binary="$copy_executable_to"
        fi
    fi

    # conf file
    cmd="$cmd -c $conf"

    if echo $name | grep -q ^osd; then
        get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
        get_conf fs_path "$osd_data" "fs path"   # mount point defaults to osd data
        get_conf fs_devs "" "devs"
        if [ -z "$fs_devs" ]; then
            # try to fall back to old keys
            get_conf tmp_btrfs_devs "" "btrfs devs"
            if [ -n "$tmp_btrfs_devs" ]; then
                fs_devs="$tmp_btrfs_devs"
            fi
        fi
        first_dev=`echo $fs_devs | cut '-d ' -f 1`
    fi

    # do lockfile, if RH
    get_conf lockfile "/var/lock/subsys/ceph" "lock file"
    lockdir=`dirname $lockfile`
    if [ ! -d "$lockdir" ]; then
        lockfile=""
    fi

    get_conf asok "$run_dir/$cluster-$type.$id.asok" "admin socket"

    case "$command" in
        start|onestart)
            # Increase max_open_files, if the configuration calls for it.
            get_conf max_open_files "32768" "max open files"

            # Remove stale admin socket
            [ -n "$asok" ] && rm -f $asok

            # Wait for pending systemd core dumps
            deadline=$(( $(date '+%s') + 300 ))
            while [[ $(date '+%s') -lt "${deadline}" ]]; do
                systemd_coredump_pid=$(pgrep -f "systemd-coredump.*ceph-${type}")
                [[ -z "${systemd_coredump_pid}" ]] && break
                wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
                sleep 1
            done

            # build final command
            wrap=""
            runmode=""
            runarg=""

            [ -z "$docrun" ] && get_conf_bool docrun "0" "restart on core dump"
            [ "$docrun" -eq 1 ] && wrap="$BINDIR/ceph-run"

            [ -z "$dovalgrind" ] && get_conf_bool valgrind "" "valgrind"
            [ -n "$valgrind" ] && wrap="$wrap valgrind $valgrind"

            [ -n "$wrap" ] && runmode="-f &" && runarg="-f"
            [ -n "$max_open_files" ] && files="ulimit -n $max_open_files;"

            [ -n "$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES" ] && tcmalloc="TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES"

            # StarlingX: start processes in a scope under the system-ceph.slice
            # so that ceph processes do not start under the cgroup of this
            # script's caller
            if [ "$type" = "osd" ]; then
                cmd="systemd-run --scope --unit=ceph-${type}-${id} --slice=system-ceph $cmd"
            else
                cmd="systemd-run --scope --unit=ceph-${type} --slice=system-ceph $cmd"
            fi

            # StarlingX: not running as ceph user/group
            cmd="$files $tcmalloc $wrap $cmd --cluster $cluster $runmode"
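            # Example of a fully composed start command for osd.2 (illustrative;
            # actual paths come from $BINDIR, $conf and the get_conf values above):
            #   ulimit -n 32768; systemd-run --scope --unit=ceph-osd-2 --slice=system-ceph \
            #       /usr/bin/ceph-osd -i 2 --pid-file /var/run/ceph/osd.2.pid \
            #       -c /etc/ceph/ceph.conf --cluster ceph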
            if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then
                get_conf pre_mount "true" "pre mount command"
                get_conf fs_type "" "osd mkfs type"

                if [ -z "$fs_type" ]; then
                    # try to fall back to old keys
                    get_conf tmp_devs "" "btrfs devs"
                    if [ -n "$tmp_devs" ]; then
                        fs_type="btrfs"
                    else
                        echo No filesystem type defined!
                        exit 0
                    fi
                fi

                get_conf fs_opt "" "osd mount options $fs_type"
                if [ -z "$fs_opt" ]; then
                    if [ "$fs_type" = "btrfs" ]; then
                        # try to fall back to old keys
                        get_conf fs_opt "" "btrfs options"
                    fi
                    if [ -z "$fs_opt" ]; then
                        if [ "$fs_type" = "xfs" ]; then
                            fs_opt="rw,noatime,inode64"
                        else
                            # fall back to using at least noatime
                            fs_opt="rw,noatime"
                        fi
                    fi
                fi

                [ -n "$fs_opt" ] && fs_opt="-o $fs_opt"
                [ -n "$pre_mount" ] && do_cmd "$pre_mount"

                do_root_cmd_okfail "mkdir -p $fs_path"
                if [ "$fs_type" = "btrfs" ]; then
                    echo Mounting Btrfs on $host:$fs_path
                    do_root_cmd_okfail "modprobe btrfs ; btrfs device scan || btrfsctl -a ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts && umount $fs_path ; mount -t btrfs $fs_opt $first_dev $fs_path"
                else
                    echo Mounting $fs_type on $host:$fs_path
                    do_root_cmd_okfail "modprobe $fs_type ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts && umount $fs_path ; MOUNT_RESULT=\$(mount -t $fs_type $fs_opt $first_dev $fs_path 2>&1); [ \$? == 0 ] || (echo \$MOUNT_RESULT | grep 'already mounted on ${fs_path}')"
                fi
                if [ "$ERR" != "0" ]; then
                    EXIT_STATUS=$ERR
                    continue
                fi
            fi

            save_proc_startup_ok $name
            echo Starting Ceph $name on $host...
            if [ ! -d $run_dir ]; then
                # assume /var/run exists
                install -d -m0770 -o ceph -g ceph /var/run/ceph
            fi
            get_conf pre_start_eval "" "pre start eval"
            [ -n "$pre_start_eval" ] && $pre_start_eval
            get_conf pre_start "" "pre start command"
            get_conf post_start "" "post start command"
            [ -n "$pre_start" ] && do_cmd "$pre_start"
            do_cmd_okfail "$cmd" $runarg
            if [ "$ERR" != "0" ]; then
                EXIT_STATUS=$ERR
                continue
            fi

            . /etc/platform/platform.conf
            if [ "${nodetype}" = "controller" ]; then
                # StarlingX: Hook the transient services launched by systemd-run
                # to allow for proper cleanup and orderly shutdown

                # Set nullglob so wildcards will return empty string if no match
                shopt -s nullglob

                OSD_SERVICES=$(for svc in /run/systemd/system/ceph-osd*.service; do basename $svc; done | xargs echo)
                for d in /run/systemd/transient/ceph-osd*.scope; do
                    do_cmd "mkdir -p $d.d"
                    cat <<EOF > $d.d/starlingx-overrides.conf
[Unit]
Before=containerd.service
After=sm-shutdown.service
EOF
                done

                for d in /run/systemd/transient/ceph-mds*.scope; do
                    do_cmd "mkdir -p $d.d"
                    cat <<EOF > $d.d/starlingx-overrides.conf
[Unit]
Before=containerd.service
After=sm-shutdown.service
EOF
                done

                for d in /run/systemd/transient/ceph-mon*.scope; do
                    do_cmd "mkdir -p $d.d"
                    cat <<EOF > $d.d/starlingx-overrides.conf
[Unit]
Before=containerd.service
After=sm-shutdown.service ${OSD_SERVICES}
EOF
                done

                shopt -u nullglob

                systemctl daemon-reload
            fi
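            # Illustrative result of the hook above: each transient scope gets a
            # drop-in such as
            #   /run/systemd/transient/ceph-osd-2.scope.d/starlingx-overrides.conf
            # (the unit name follows the --unit passed to systemd-run earlier).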
            [ -n "$post_start" ] && do_cmd "$post_start"
            [ -n "$lockfile" ] && [ "$?" -eq 0 ] && touch $lockfile
            ;;

        stop|onestop)
            get_conf pre_stop "" "pre stop command"
            get_conf post_stop "" "post stop command"
            [ -n "$pre_stop" ] && do_cmd "$pre_stop"
            wlog $name "INFO" "Stopping process"
            if [ $(load_proc_state $name) != "$ST_HANGED" ]; then
                stop_daemon $name ceph-$type $pid_file
            else
                # first try to gracefully close the process; this should be fast if
                # its threads still respond to the TERM signal
                wlog $name "DEBUG" ">>> Sending term signal"
                stop_daemon $name ceph-$type $pid_file TERM "" 5
                wlog $name "DEBUG" ">>> Sending kill signal"
                # then just kill it
                stop_daemon $name ceph-$type $pid_file KILL
            fi
            [ -n "$pidfile" ] && rm -f $pidfile
            [ -n "$asok" ] && rm -f $asok
            [ -n "$post_stop" ] && do_cmd "$post_stop"
            [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
            # flush journal to data disk in background
            if [ "${type}" = "osd" ]; then
                CMD_OUTPUT=''
                execute_ceph_cmd CMD_OUTPUT "Ceph Status" "ceph -s"
                if [ $? == 0 ]; then
                    wlog "${name}" "INFO" "Flushing journal"
                    $(/usr/bin/ceph-osd -i $id --flush-journal) &
                else
                    wlog "${name}" "INFO" "Skipping journal flush"
                fi
            fi
            wlog $name "INFO" "Process stopped, setting state to $ST_STOPPED"
            save_proc_state $name $ST_STOPPED

            if [ $dofsumount -eq 1 ] && [ -n "$fs_devs" ]; then
                echo Unmounting OSD volume on $host:$fs_path
                do_root_cmd "umount $fs_path || true"
            fi
            ;;

        status)
            if daemon_is_running $name ceph-$type $id $pid_file; then
                # ceph processes answer in around 100ms when the process works correctly
                do_cmd "timeout 1 $BINDIR/ceph --admin-daemon $asok version 2>/dev/null || echo unknown"
                # log ceph osd state
                if [ "$type" = "osd" ]; then
                    CEPH_DAEMON_STATUS=""
                    execute_ceph_cmd CEPH_DAEMON_STATUS $name "ceph daemon $name status"
                    if [ $? -eq 0 ]; then
                        state=$(echo "$CEPH_DAEMON_STATUS" | awk -F ':' '/state/{gsub(/[,[:space:]]/, "", $2);print $2}')
                        if [ "$state" != "active" ]; then
                            wlog $name "INFO" "$name has the following state: $state"
                        fi
                    fi
                fi
                # check if daemon is hung
                is_hung=$(is_process_hung $name $type)
                if [ "$is_hung" = "true" ]; then
                    echo "$name: hung."
                    # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
                    # exit codes from 150 to 199 are application specific, therefore we define one here
                    EXIT_STATUS=150
                else
                    # Wait a period of time after OSD start before restarting based on slow/blocked requests
                    if [ "$type" = "osd" ] && [ $BLOCKED_OPS_DETECTION_ENABLED = "true" ]; then
                        up_time=$(get_proc_run_time $name)
                        if [ $up_time -gt $BLOCKED_OPS_START_DETECTION ]; then
                            has_blocked_ops=$(osd_has_blocked_ops $name)
                            if [ "$has_blocked_ops" = "true" ]; then
                                echo "$name: blocked ops."
                                # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
                                # exit codes from 150 to 199 are application specific, therefore we define one here
                                EXIT_STATUS=151
                            else
                                echo "$name: running."
                            fi
                        else
                            echo "$name: running."
                        fi
                    else
                        echo "$name: running."
                    fi
                    # Wait a period of time after OSD start before restarting based on stuck peering
                    if [ "$type" = "osd" ] && [ $STUCK_PEERING_DETECTION_ENABLED = "true" ]; then
                        up_time=$(get_proc_run_time $name)
                        if [ $up_time -gt $STUCK_PEERING_START_DETECTION ]; then
                            has_stuck_peering=$(osd_has_stuck_peering $name $id)
                            if [ "$has_stuck_peering" = "true" ]; then
                                echo "$name: stuck peering."
                                # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
                                # exit codes from 150 to 199 are application specific, therefore we define one here
                                EXIT_STATUS=152
                            else
                                echo "$name: running."
                            fi
                        else
                            echo "$name: running."
                        fi
                    else
                        echo "$name: running."
                    fi
                fi
            elif [ -e "$pid_file" ]; then
                # daemon is dead, but pid file still exists
                echo "$name: dead."
                EXIT_STATUS=1
            else
                # daemon is dead, and pid file is gone
                echo "$name: not running."
                EXIT_STATUS=3
            fi
            ;;
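        # Summary of the status exit codes set above (informational only):
        #   0 running, 1 dead but pid file exists, 3 not running (LSB),
        #   150 hung, 151 blocked ops, 152 stuck peering (application specific).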
        ssh)
            $ssh
            ;;

        forcestop)
            get_conf pre_forcestop "" "pre forcestop command"
            get_conf post_forcestop "" "post forcestop command"
            [ -n "$pre_forcestop" ] && do_cmd "$pre_forcestop"
            stop_daemon $name ceph-$type $pid_file -9
            [ -n "$post_forcestop" ] && do_cmd "$post_forcestop"
            [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
            ;;

        killall)
            echo "killall ceph-$type on $host"
            do_cmd "pkill ^ceph-$type || true"
            [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
            ;;

        force-reload | reload)
            signal_daemon $name ceph-$type $pid_file -1 "Reloading"
            ;;

        restart|onerestart)
            $0 $options stop $name
            $0 $options start $name
            ;;

        condrestart)
            if daemon_is_running $name ceph-$type $id $pid_file; then
                $0 $options stop $name
                $0 $options start $name
            else
                echo "$name: not running."
            fi
            ;;

        cleanlogs)
            echo removing logs
            [ -n "$log_dir" ] && do_cmd "rm -f $log_dir/$type.$id.*"
            ;;

        cleanalllogs)
            echo removing all logs
            [ -n "$log_dir" ] && do_cmd "rm -f $log_dir/* || true"
            ;;

        *)
            usage_exit
            ;;
    esac
done

# wait for journal flushing to complete
if [ $command == "stop" ]; then
    wait
fi

exit $EXIT_STATUS