#!/bin/sh
# Start/stop ceph daemons
# chkconfig: 2345 60 80

### BEGIN INIT INFO
# Provides:          ceph
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Required-Start:    $remote_fs $named $network $time
# Required-Stop:     $remote_fs $named $network $time
# Short-Description: Start Ceph distributed file system daemons at boot time
# Description:       Enable Ceph distributed file system services.
### END INIT INFO

if [ `dirname $0` = "." ] && [ $PWD != "/etc/init.d" ]; then
    # looks like an autotools src dir build
    BINDIR=.
    SBINDIR=.
    LIBDIR=.
    LIBEXECDIR=.
    ETCDIR=.
    ASSUME_DEV=1
else
    if [ -e CMakeCache.txt ] && [ -e bin/init-ceph ]; then
        # looks like a cmake build directory
        CEPH_ROOT=`grep ceph_SOURCE_DIR CMakeCache.txt | cut -d "=" -f 2`
        BINDIR=bin
        SBINDIR=bin
        LIBEXECDIR=$CEPH_ROOT/src
        ETCDIR=.
        ASSUME_DEV=1
        CEPH_LIB=$CEPH_ROOT/build/lib
        echo "$PYTHONPATH" | grep -q $CEPH_LIB || export PYTHONPATH=$CEPH_LIB/cython_modules/lib.@MGR_PYTHON_VERSION_MAJOR@:$PYTHONPATH
        echo "$LD_LIBRARY_PATH" | grep -q $CEPH_LIB || export LD_LIBRARY_PATH=$CEPH_LIB:$LD_LIBRARY_PATH
        echo "$DYLD_LIBRARY_PATH" | grep -q $CEPH_LIB || export DYLD_LIBRARY_PATH=$CEPH_LIB:$DYLD_LIBRARY_PATH
    else
        BINDIR=/usr/bin
        SBINDIR=/usr/sbin
        LIBDIR=/usr/lib/ceph
        LIBEXECDIR=$LIBDIR
        ETCDIR=/etc/ceph
        ASSUME_DEV=0
    fi
fi

if [ -n "$CEPH_BIN" ] && [ -n "$CEPH_ROOT" ] && [ -n "$CEPH_BUILD_DIR" ]; then
    BINDIR=$CEPH_BIN
    SBINDIR=$CEPH_ROOT/src
    ETCDIR=$CEPH_BIN
    LIBEXECDIR=$CEPH_ROOT/src
    ASSUME_DEV=1
fi

####################################################
#### StarlingX: Check if a ceph process is hung ####
####################################################
# Each Ceph process has a state and a status.
# - States are used by this script for decisions.
# - Statuses are $UP or $DOWN and represent a process' working state as
#   reported by Ceph.
# OSD processes can have a status of "up" or "down" based on what Ceph is
# reporting. We ignore the "in" status (we consider it as "down").
#
# Ceph process states:
# 1. STARTUP     - a process is started but has not yet joined the cluster
# 2. OPERATIONAL - a process joined the cluster
# 3. HANGED      - a process is hung
# 4. STOPPED     - a process is offline (/etc/init.d start was not executed)
#
# Monitoring is done differently based on state.
# All timers are in seconds.
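#
# Illustrative transition summary for the state machine implemented below
# (comments only, not executed; derived from the functions in this file):
#   STARTED        -> OPERATIONAL  once Ceph reports the daemon "up"
#   STARTED        -> HANGED       if still not "up" after WAIT_FOR_*_OPERATIONAL
#   OPERATIONAL    -> HANGED       if reported "down" for WAIT_FOR_*_DOWN_CONFIRM
#   HANGED/STOPPED -> STARTED      reset externally by /etc/ceph/ceph_pmon_wrapper.sh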
# FSM states
ST_STARTUP="STARTED"
ST_OPER="OPERATIONAL"
ST_HANGED="HANGED"
ST_STOPPED="STOPPED"

# Ceph OSD status
UP="up"
DOWN="down"

# paths
source /usr/bin/tsconfig
DATA_PATH=$VOLATILE_PATH/ceph_hang    # folder where we keep state information
LOG_PATH=/var/log/ceph
LOG_FILE=$LOG_PATH/ceph-process-states.log
mkdir -p $DATA_PATH                   # make sure folder exists

# timeouts
WAIT_FOR_CMD=10                  # max wait for a response from Ceph
WAIT_FOR_OSD_OPERATIONAL=300     # max wait for OSD to go 'up' when in startup state
WAIT_FOR_OSD_DOWN_CONFIRM=300    # even if OSD is down it may be flapping, wait for a while
WAIT_FOR_MON_OPERATIONAL=60      # max wait for MON to go 'up' when in startup state
WAIT_FOR_MON_DOWN_CONFIRM=300    # even if MON is down it may be flapping, wait for a while

# Blocked Ops Detection (values in seconds)
BLOCKED_OPS_DETECTION_ENABLED="true"    # Enable/Disable detection
BLOCKED_OPS_START_DETECTION=300         # Wait on OSD startup before handling blocked ops
BLOCKED_OPS_RESTART_THRESH=480          # Restart OSD if blocked reqs are longer than this

# Stuck Peering Detection (values in seconds)
STUCK_PEERING_DETECTION_ENABLED="true"  # Enable/Disable detection
STUCK_PEERING_START_DETECTION=300       # Wait on OSD startup before handling stuck peering
STUCK_PEERING_RESTART_THRESH=180        # Restart OSD if stuck peering is longer than this

LOG_LEVEL=NORMAL  # DEBUG

save_proc_state() {
    # Set the state of a process in the state machine and store it in a file
    local name=$1
    local state=$2
    if [ "$state" != "$ST_STARTUP" ] && [ "$state" != "$ST_OPER" ] && \
       [ "$state" != "$ST_HANGED" ] && [ "$state" != "$ST_STOPPED" ]; then
        wlog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP"
        print_trace
        state=$ST_STARTUP
    fi
    echo "$state" > ${DATA_PATH}/.${name}_state
}

load_proc_state() {
    # Get the state of the state machine from file and validate content
    local name=$1
    local state=$ST_STARTUP
    if [ -f ${DATA_PATH}/.${name}_state ]; then
        state=$(cat ${DATA_PATH}/.${name}_state)
    fi
    if [ "$state" != "$ST_STARTUP" ] && [ "$state" != "$ST_OPER" ] && \
       [ "$state" != "$ST_HANGED" ] && [ "$state" != "$ST_STOPPED" ]; then
        wlog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP"
        print_trace
        state=$ST_STARTUP
        save_proc_state $name $state
    fi
    echo "$state"; return
}

save_proc_startup_ok() {
    # Reset to initial state after a process started successfully (i.e. it has a valid pid)
    local name=$1
    # Process just started, clear all records, reset state
    rm -f ${DATA_PATH}/.${name}_start_time
    rm -f ${DATA_PATH}/.${name}_down_time
    save_proc_state $name $ST_STARTUP
    # Save the time when the process was started
    wlog $name INFO "Process $ST_STARTUP successfully, waiting for it to become $ST_OPER"
    echo $(date +%s) > ${DATA_PATH}/.${name}_start_time
}

save_proc_status() {
    # Store the status of a process ($UP or $DOWN) and its down time
    # Note that a process status is different from its state
    local name=$1
    local status=$2
    if [ "$status" == $UP ]; then
        rm -f ${DATA_PATH}/.${name}_down_time
    else
        if [ ! -f ${DATA_PATH}/.${name}_down_time ]; then
            echo $(date +%s) > ${DATA_PATH}/.${name}_down_time
        fi
    fi
}

load_proc_status() {
    # Load a process status ($UP or $DOWN)
    local name=$1
    if [ -f ${DATA_PATH}/.${name}_down_time ]; then
        echo $DOWN
    else
        echo $UP
    fi
}
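# Example usage of the record helpers above (illustrative only; the file names
# follow the $DATA_PATH conventions used by these functions):
#   save_proc_status osd.1 $DOWN    # creates ${DATA_PATH}/.osd.1_down_time on first call
#   load_proc_status osd.1          # prints "down" while that record exists
#   load_proc_state  osd.1          # prints the persisted FSM state, e.g. "OPERATIONAL"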
get_duration() {
    # Get duration based on file record and current time
    local name=$1
    local record=$2  # filename holding the previously recorded time
    # Check that we have a filename
    if [ ! -f $record ]; then
        wlog $name ERROR "Failed to compute duration, time was never stored in $record!"
        print_trace
        echo "-1"; return
    fi
    # Get and validate time previously saved in file
    local start_time=$(cat $record)
    re="^[0-9]+$"
    if ! [[ "$start_time" =~ $re ]] ; then
        wlog $name ERROR "Recorded time '$start_time' is not a number!"
        print_trace
        echo "-1"; return
    fi
    # Compute duration
    local now=$(date +%s)
    local duration=$((now-start_time))
    if [ "$duration" -lt 0 ]; then
        wlog $name ERROR "Duration is less than 0!"
        print_trace
        echo "-1"; return
    fi
    echo $duration
}

get_proc_run_time() {
    local name=$1
    local time=$(get_duration $name ${DATA_PATH}/.${name}_start_time)
    wlog $name DEBUG ">>> process running for: ${time}s"
    echo $time
}

get_proc_down_time() {
    local name=$1
    local time=$(get_duration $name ${DATA_PATH}/.${name}_down_time)
    wlog $name DEBUG ">>> process down for: ${time}s"
    echo $time
}

run_state_machine() {
    # Small state machine, returns process state as defined in the ST_* constants.
    # The same logic applies to both ceph-osd and ceph-mon, only the timeouts differ.
    local name=$1                    # 'osd.<id>' or 'mon.<id>'
    local type=$2                    # 'osd' or 'mon'
    local status=$3                  # daemon current status ($UP or $DOWN)
    local wait_for_operational=$4    # how much time to wait for a process to go up
    local wait_for_down_confirm=$5   # how much time to wait before reporting a process as down

    local state=$(load_proc_state $name)
    wlog $name "DEBUG" ">>> state: $state"

    # state machine
    if [ "$state" = "$ST_STARTUP" ]; then
        wlog $name "DEBUG" ">>> status: $status"
        if [ "$status" = $UP ]; then
            save_proc_state $name $ST_OPER
            wlog $name "INFO" "Process is OPERATIONAL"
            echo $ST_OPER; return
        else
            # the process should be 'up' within ${wait_for_operational} seconds!
            if [ $(get_proc_run_time $name) -gt $wait_for_operational ]; then
                # process hung!
                wlog $name "ERROR" "Process failed to go up in ${wait_for_operational}s after start, reporting it as $ST_HANGED!"
                save_proc_state $name $ST_HANGED
                echo $ST_HANGED; return
            fi
        fi
    elif [ "$state" = "$ST_OPER" ]; then
        if [ "$status" = "$DOWN" ]; then
            if [ $(load_proc_status $name) = $UP ]; then
                wlog $name "WARN" "Process went down!"
                save_proc_status $name $DOWN
            fi
            # if a process is down we don't report it as hung for a while;
            # this should avoid status flapping
            if [ $(get_proc_down_time $name) -gt $wait_for_down_confirm ]; then
                # the process has been down for a long time, report it as hung!
                wlog $name "ERROR" "Process went down for more than ${wait_for_down_confirm}s, reporting it as $ST_HANGED"
                save_proc_state $name $ST_HANGED
                echo $ST_HANGED; return
            fi
        elif [ "$status" = "$UP" ]; then
            if [ $(load_proc_status $name) = $DOWN ]; then
                wlog $name "WARN" "Process went up, flapping status or busy process?"
                save_proc_status $name $UP
                return
            fi
        fi
    elif [ "$state" = "$ST_HANGED" ] || [ "$state" = "$ST_STOPPED" ]; then
        # nothing to do, resetting from these states is done externally in /etc/ceph/ceph_pmon_wrapper.sh
        echo $state; return
    fi
}
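# Example call (illustrative): evaluate an OSD that 'ceph osd tree' reports as
# down, using the OSD timeouts defined above:
#   state=$(run_state_machine osd.1 osd $DOWN \
#               $WAIT_FOR_OSD_OPERATIONAL $WAIT_FOR_OSD_DOWN_CONFIRM)
#   [ "$state" = "$ST_HANGED" ] && echo "osd.1 considered hung"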
CEPH_FAILURE=""
execute_ceph_cmd() {
    # Execute a command and, in case it times out, mark Ceph as failed.
    # $1 - name of the variable that receives the command output
    # $2 - process name, used for logging
    # $3 - command to execute
    local outvar=$1
    local name=$2
    local cmd="timeout $WAIT_FOR_CMD $3"
    local output=""
    set -o pipefail
    eval "$cmd >$DATA_PATH/.ceph_cmd_out"
    errcode=$?
    set +o pipefail
    output=$(cat $DATA_PATH/.ceph_cmd_out)
    if [ -z "$output" ] && [ $errcode -eq 124 ]; then
        # 'timeout' returns 124 when timing out
        wlog $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd"
        CEPH_FAILURE="true"
        eval $outvar=""
        return 1
    fi
    if [ -z "$output" ] || [ $errcode -ne 0 ]; then
        wlog $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output"
        eval $outvar=""
        return 1
    fi
    eval $outvar='$output'
    return $errcode
}

CEPH_OSD_TREE=""
CEPH_HEALTH_DETAIL=""

is_process_hung() {
    local name=$1
    local type=$2  # 'osd' or 'mon'

    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        wlog $name "WARN" "Ceph cluster is marked as failed, aborting hang check"
        echo "false"; return
    fi

    # Cache Ceph health for later use, as calling Ceph takes time
    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
        if [ $? -ne 0 ]; then
            wlog $name "WARN" "Aborting hang check"
            echo "false"; return
        fi
    fi

    # Check if an OSD is hung
    if [ "$type" = "osd" ]; then
        # Ignore health check if OSDs are administratively down
        # Note this can be done with: 'ceph osd set noup; ceph osd down <id>'
        $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
        if [ $? -eq 0 ]; then
            wlog $name "WARN" "Ceph 'noup' flag is set, aborting hang check"
            echo "false"; return
        fi
        # Multiple OSD processes may be running, so we only run
        # 'ceph osd tree' once as it takes some time to execute
        if [ -z "$CEPH_OSD_TREE" ]; then
            execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
            if [ $? -ne 0 ]; then
                wlog $name "WARN" "Ceph cmd exec failed, aborting hang check"
                echo "false"; return
            fi
        fi
        # Get osd status as 'up' or, for any other output, as 'down'
        echo "$CEPH_OSD_TREE" | grep $name | grep -q "up"
        if [ "$?" -eq 0 ]; then
            osd_status=$UP
        else
            osd_status=$DOWN
        fi
        local state=$(run_state_machine $name $type $osd_status \
                          $WAIT_FOR_OSD_OPERATIONAL $WAIT_FOR_OSD_DOWN_CONFIRM)
        if [ "$state" = "$ST_HANGED" ]; then
            echo "true"; return
        else
            echo "false"; return
        fi
    # Check if a Monitor is hung
    elif [ "$type" = "mon" ]; then
        # Get monitor status info
        local mon_status=$UP
        echo "$CEPH_HEALTH_DETAIL" | grep -q -e "^[[:space:]]*$name.*down"
        if [ $? -eq 0 ]; then
            mon_status=$DOWN
        fi
        local state=$(run_state_machine $name $type $mon_status \
                          $WAIT_FOR_MON_OPERATIONAL $WAIT_FOR_MON_DOWN_CONFIRM)
        if [ "$state" = "$ST_HANGED" ]; then
            echo "true"; return
        else
            echo "false"; return
        fi
    else
        wlog $name "WARN" "Unknown process type: $type"
    fi
    echo "false"
}
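# Example (illustrative): how the status handler further below drives the hang
# check. 'ceph health detail' output is fetched once and cached in
# CEPH_HEALTH_DETAIL by the call above:
#   is_hung=$(is_process_hung osd.1 osd)   # prints "true" or "false"
#   [ "$is_hung" = "true" ] && echo "osd.1: hung."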
osd_has_blocked_ops() {
    local name=$1

    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        wlog $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check"
        echo "false"; return
    fi

    # Cache Ceph health for later use, as calling Ceph takes time. This is
    # initially cached by the hang check, but check and call again here if
    # needed.
    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
        if [ $? -ne 0 ]; then
            wlog $name "WARN" "Aborting blocked ops check"
            echo "false"; return
        fi
    fi

    # Ignore health check if OSDs are administratively down
    # Note this can be done with: 'ceph osd set noup; ceph osd down <id>'
    $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
    if [ $? -eq 0 ]; then
        wlog $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check"
        echo "false"; return
    fi

    # Multiple OSD processes may be running, so we only run 'ceph osd tree' once
    # as it takes some time to execute. This is initially cached by the hang
    # check, but check and call again here if needed.
    if [ -z "$CEPH_OSD_TREE" ]; then
        execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
        if [ $? -ne 0 ]; then
            wlog $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
            echo "false"; return
        fi
    fi

    # Get osd status as 'up' or, for any other output, as 'down'
    echo "$CEPH_OSD_TREE" | grep $name | grep -q "up"
    if [ "$?" -eq 0 ]; then
        # Look for and parse: '1 ops are blocked > 1048.58 sec on osd.1'
        local blocked_time=$(echo "$CEPH_HEALTH_DETAIL" | grep $name | sed -rn 's/.*ops are blocked > ([[:digit:]]*).*/\1/p')
        [[ "$blocked_time" == "" ]] && blocked_time=0
        if [ $blocked_time -gt $BLOCKED_OPS_RESTART_THRESH ]; then
            wlog $name "WARN" "Detected blocked operations for $blocked_time seconds"
            echo "true"; return
        else
            echo "false"; return
        fi
    fi
}

osd_has_stuck_peering() {
    local name=$1
    local id=$2

    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        wlog $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check"
        echo "false"; return
    fi

    # Cache Ceph health for later use, as calling Ceph takes time. This is
    # initially cached by the hang check, but check and call again here if
    # needed.
    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
        if [ $? -ne 0 ]; then
            wlog $name "WARN" "Aborting stuck peering check"
            echo "false"; return
        fi
    fi

    # Ignore health check if OSDs are administratively kept up
    # Note this can be done with: 'ceph osd set nodown; /etc/init.d/ceph stop <id>'
    $(echo "$CEPH_HEALTH_DETAIL" | grep -q "nodown.*set")
    if [ $? -eq 0 ]; then
        wlog $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check"
        echo "false"; return
    fi

    file="${DATA_PATH}/.${name}_stuck_peering_start"
    max_blocked_time=0
    $(echo "$CEPH_HEALTH_DETAIL" | grep "stuck peering" | awk '{split($0,a,"acting"); print a[2]}' | grep -q $id)
    if [ "$?" -eq 0 ]; then
        while read -r line; do
            $(echo $line | awk '{split($0,a,"acting"); print a[2]}' | grep -q $id)
            [ $? -eq 0 ] || continue
            # Look for and parse: 'pg 1.3b is stuck peering for 69.893513, current state peering, last acting [1,0]'
            local blocked_time=$(echo "$line" | sed -rn 's/.*stuck peering for ([[:digit:]]*).*/\1/p')
            if [ $blocked_time -gt $max_blocked_time ]; then
                max_blocked_time="$blocked_time"
            fi
        done <<< "$(echo "$CEPH_HEALTH_DETAIL" | grep "stuck peering")"
    fi

    if [ $max_blocked_time -gt 0 ]; then
        if [ -f ${file} ]; then
            # check against old time
            first_blocked_time=$(cat ${file})
            blocked_time=$((max_blocked_time - first_blocked_time))
            # if the stuck peering time restarted between status commands
            # then allow it
            if [ $blocked_time -lt 0 ]; then
                blocked_time=$max_blocked_time
                echo $max_blocked_time > ${file}
            fi
            if [ $blocked_time -gt $STUCK_PEERING_RESTART_THRESH ]; then
                wlog $name "WARN" "Detected stuck peering for $blocked_time seconds"
                echo "true"; return
            else
                echo "false"; return
            fi
        else
            # register the time for the first detected stuck peering
            echo $max_blocked_time > ${file}
        fi
    else
        rm -f ${file} 2>/dev/null
    fi
}
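# Example (illustrative): the status handler further below gates these checks
# on daemon uptime and calls them as
#   has_blocked_ops=$(osd_has_blocked_ops osd.1)          # "true", "false" or empty
#   has_stuck_peering=$(osd_has_stuck_peering osd.1 1)    # "true", "false" or empty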
######################
#### StarlingX END ###
######################

if [ `uname` = FreeBSD ]; then
    GETOPT=/usr/local/bin/getopt
else
    GETOPT=getopt
fi

if id ceph > /dev/null 2>&1; then
    SET_CEPHUSER_ARGS=" --setuser ceph --setgroup ceph"
fi

usage_exit() {
    echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
    printf "Core options:\n"
    printf "\t--allhosts / -a           execute (via ssh) on all hosts in conf file\n"
    printf "\t--cluster [cluster name]  define the cluster name\n"
    printf "\t--conf / -c [conf file]   use [conf file] instead of default\n"
    printf "\t--help / -h               show this usage message\n"
    printf "\t--hostname [hostname]     override hostname lookup\n"
    printf "\t-m [mon addr]             mon address\n"
    printf "\n"
    printf "Other options:\n"
    printf "\t--btrfs                   btrfs\n"
    printf "\t--nobtrfs                 no btrfs\n"
    printf "\t--btrfsumount             btrfs umount\n"
    printf "\t--fsmount                 fsmount\n"
    printf "\t--nofsmount               no fsmount\n"
    printf "\t--fsumount                fsumount\n"
    printf "\t--restart                 restart on core dump\n"
    printf "\t--norestart               do not restart on core dump\n"
    printf "\t--valgrind                run via valgrind\n"
    printf "\t--novalgrind              do not run via valgrind\n"
    printf "\t--verbose / -v            be verbose\n"
    exit
}

# behave if we are not completely installed (e.g., Debian "removed,
# config remains" state)
test -f $LIBEXECDIR/ceph_common.sh || exit 0

. $LIBEXECDIR/ceph_common.sh

EXIT_STATUS=0

signal_daemon() {
    name=$1
    daemon=$2
    pidfile=$3
    signal=$4
    action=$5
    [ -z "$action" ] && action="Stopping"
    printf "$action Ceph $name on $host..."
    do_cmd "if [ -e $pidfile ]; then
        pid=\`cat $pidfile\`
        if ps -p \$pid -o args= | grep -q $daemon; then
            cmd=\"kill $signal \$pid\"
            printf \"\$cmd...\"
            \$cmd
        fi
    fi"
    echo done
}

daemon_is_running() {
    name=$1
    daemon=$2
    daemon_id=$3
    pidfile=$4
    do_cmd "[ -e $pidfile ] || exit 1   # no pid, presumably not running
        pid=\`cat $pidfile\`
        ps -p \$pid -o args= | grep $daemon | grep -qwe -i.$daemon_id && exit 0  # running
        exit 1  # pid is something else" "" "okfail"
}

stop_daemon() {
    name=$1
    daemon=$2
    pidfile=$3
    signal=$4
    action=$5
    timeout=$6
    [ -z "$action" ] && action="Stopping"
    printf "$action Ceph $name on $host..."
    do_cmd "if [ -e $pidfile ] ; then
        pid=\`cat $pidfile\`
        timeout=$timeout
        while ps -p \$pid -o args= | grep -q $daemon; do
            if [ ! -z \"\$timeout\" ]; then
                if [ \$timeout -lt 0 ]; then
                    break
                fi
                timeout=\$((timeout - 1))
            fi
            cmd=\"kill $signal \$pid\"
            printf \"\$cmd...\"
            \$cmd
            sleep 1
            continue
        done
    fi"
    echo done
}

## command line options
options=

IFS=" " read -r -a args <<< "$@"
wlog "-" INFO "$@"

OPTS=$(${GETOPT} -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "${args[@]}")
if [ $? != 0 ]
then
    exit 1
fi

eval set -- "$OPTS"

dovalgrind=
docrun=
allhosts=0
monaddr=
dofsmount=1
dofsumount=0
verbose=0
use_default_conf=1

## set variables like cluster or conf
[ -e /etc/sysconfig/ceph ] && . /etc/sysconfig/ceph
[ -e /etc/default/ceph ] && . /etc/default/ceph
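# Example invocations parsed by the option loop below (illustrative; hosts,
# daemon names and paths are placeholders):
#   /etc/init.d/ceph start osd.2
#   /etc/init.d/ceph -a -c /etc/ceph/ceph.conf stop
#   /etc/init.d/ceph --cluster ceph status mon.0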
while echo $1 | grep -q '^-'; do    # FIXME: why not '^-'?
    case $1 in
        -v | --verbose)
            verbose=1
            ;;
        --valgrind)
            dovalgrind=1
            ;;
        --novalgrind)
            dovalgrind=0
            ;;
        --allhosts | -a)
            allhosts=1
            ;;
        --restart)
            docrun=1
            ;;
        --norestart)
            docrun=0
            ;;
        -h | --help)
            usage_exit
            ;;
        -m )
            [ -z "$2" ] && usage_exit
            options="$options $1"
            shift
            MON_ADDR=$1
            ;;
        --btrfs | --fsmount)
            dofsmount=1
            ;;
        --nobtrfs | --nofsmount)
            dofsmount=0
            ;;
        --btrfsumount | --fsumount)
            dofsumount=1
            ;;
        --conf | -c)
            [ -z "$2" ] && usage_exit
            options="$options $1"
            shift
            use_default_conf=0
            conf=$1
            ;;
        --cluster )
            [ -z "$2" ] && usage_exit
            options="$options $1"
            shift
            cluster=$1
            ;;
        --hostname )
            [ -z "$2" ] && usage_exit
            options="$options $1"
            shift
            hostname=$1
            ;;
        -- )
            shift
            break
            ;;
        *)
            echo unrecognized option \'$1\'
            usage_exit
            ;;
    esac
    options="$options $1"
    shift
done

# if `--cluster` was not passed in, fall back to looking at the config name
if [ -z "$cluster" ]; then
    cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
else
    # if we were told to use a given cluster name then $conf needs to be updated,
    # but only define it if `--conf` was not specified, otherwise we would be silently
    # overriding $conf even if it was defined with `--conf`
    if [ $use_default_conf -eq 1 ]; then
        conf="/etc/ceph/$cluster.conf"
    fi
fi

verify_conf

command=$1
[ -n "$*" ] && shift

get_local_name_list
get_name_list "$@"

# Reverse the order if we are stopping
if [ "$command" = "stop" -o "$command" = "onestop" ]; then
    for f in $what; do
        new_order="$f $new_order"
    done
    what="$new_order"
fi

. /etc/platform/platform.conf

# On AIO-DX, pmon monitors the ceph-mds process. If ceph-mon is not running,
# ceph-mds will hang when starting, so check whether we are trying to bring up
# ceph-mds while ceph-mon is not ready yet.
if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then
    if [ "${command}" = "start" -o "${command}" = "onestart" ]; then
        what_out=
        what_mds=
        re="\s*mon"
        if [[ ${what} =~ ${re} ]]; then
            has_mon=1
        else
            has_mon=0
            CEPH_STATUS=''
            execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s"
            if [ $? -eq 0 ]; then
                has_mon=1
            fi
        fi
        for name in ${what}; do
            type=$(echo "${name}" | cut -c 1-3)
            if [ "${type}" == "mds" ]; then
                what_mds="${name}"
                continue
            fi
            what_out+=" ${name}"
        done
        if [ ${has_mon} -eq 1 ] && [ ! -z "${what_mds}" ]; then
            what_out+=" ${what_mds}"
        fi
        what="${what_out}"
    fi
    # If the variable 'what' is empty, then we were asked to bring up only
    # ceph-mds while ceph-mon is not active. In that case ceph-mds cannot be
    # started yet, so return an error.
    if [ -z "${what}" ]; then
        EXIT_STATUS=1
    fi
fi

# Check if the monitors are up before starting any mds.
# This is needed only for Standard deployments.
if [ "$system_type" == "Standard" ]; then
    CEPH_STATUS=''
    execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s"
    if [ "$?" -ne 0 ]; then
        what_out=
        for name in $what; do
            type=$(echo $name | cut -c 1-3)
            if [ "$type" == "mds" ]; then
                continue
            fi
            what_out+=" $name"
        done
        what=$what_out
    fi
fi
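# Illustrative example for the per-daemon loop below: for the name "osd.2",
# type="osd" and id=num=2; the pid file then defaults to /var/run/ceph/osd.2.pid
# and the admin socket to /var/run/ceph/$cluster-osd.2.asok (values taken from
# the get_conf defaults used in the loop).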
for name in $what; do
    type=`echo $name | cut -c 1-3`   # e.g. 'mon', if $item is 'mon1'
    id=`echo $name | cut -c 4- | sed 's/^\\.//'`
    num=$id
    name="$type.$id"

    check_host $cluster || continue

    binary="$BINDIR/ceph-$type"
    cmd="$binary -i $id"
    if [ $ASSUME_DEV -eq 1 ]; then
        cmd="PATH=$PWD:$PATH $cmd"
    fi

    get_conf run_dir "/var/run/ceph" "run dir"
    get_conf pid_file "$run_dir/$type.$id.pid" "pid file"

    if [ "$command" = "start" -o "$command" = "onestart" ]; then
        if [ -n "$pid_file" ]; then
            do_cmd "mkdir -p "`dirname $pid_file`
            cmd="$cmd --pid-file $pid_file"
        fi

        get_conf log_dir "" "log dir"
        [ -n "$log_dir" ] && do_cmd "mkdir -p $log_dir"

        get_conf auto_start "" "auto start"
        if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
            if [ -z "$@" ]; then
                echo "Skipping Ceph $name on $host... auto start is disabled"
                continue
            fi
        fi

        if daemon_is_running $name ceph-$type $id $pid_file; then
            echo "Starting Ceph $name on $host...already running"
            continue
        fi

        get_conf copy_executable_to "" "copy executable to"
        if [ -n "$copy_executable_to" ]; then
            scp $binary "$host:$copy_executable_to"
            binary="$copy_executable_to"
        fi
    fi

    # conf file
    cmd="$cmd -c $conf"

    if echo $name | grep -q ^osd; then
        get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
        get_conf fs_path "$osd_data" "fs path"   # mount point defaults to osd data
        get_conf fs_devs "" "devs"
        if [ -z "$fs_devs" ]; then
            # try to fall back to old keys
            get_conf tmp_btrfs_devs "" "btrfs devs"
            if [ -n "$tmp_btrfs_devs" ]; then
                fs_devs="$tmp_btrfs_devs"
            fi
        fi
        first_dev=`echo $fs_devs | cut '-d ' -f 1`
    fi

    # do lockfile, if RH
    get_conf lockfile "/var/lock/subsys/ceph" "lock file"
    lockdir=`dirname $lockfile`
    if [ ! -d "$lockdir" ]; then
        lockfile=""
    fi

    get_conf asok "$run_dir/$cluster-$type.$id.asok" "admin socket"

    case "$command" in
        start|onestart)
            # Increase max_open_files, if the configuration calls for it.
            get_conf max_open_files "32768" "max open files"

            # Remove stale admin socket
            [ -n "$asok" ] && rm -f $asok

            # Wait for pending systemd core dumps
            deadline=$(( $(date '+%s') + 300 ))
            while [[ $(date '+%s') -lt "${deadline}" ]]; do
                systemd_coredump_pid=$(pgrep -f "systemd-coredump.*ceph-${type}")
                [[ -z "${systemd_coredump_pid}" ]] && break
                wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
                sleep 1
            done

            # build final command
            wrap=""
            runmode=""
            runarg=""

            [ -z "$docrun" ] && get_conf_bool docrun "0" "restart on core dump"
            [ "$docrun" -eq 1 ] && wrap="$BINDIR/ceph-run"

            [ -z "$dovalgrind" ] && get_conf_bool valgrind "" "valgrind"
            [ -n "$valgrind" ] && wrap="$wrap valgrind $valgrind"

            [ -n "$wrap" ] && runmode="-f &" && runarg="-f"
            [ -n "$max_open_files" ] && files="ulimit -n $max_open_files;"

            [ -n "$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES" ] && tcmalloc="TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES"

            # StarlingX: start processes in a scope under the system-ceph.slice
            # so that ceph processes do not start under the cgroup of this
            # script's caller
            if [ "$type" = "osd" ]; then
                cmd="systemd-run --scope --unit=ceph-${type}-${id} --slice=system-ceph $cmd"
            else
                cmd="systemd-run --scope --unit=ceph-${type} --slice=system-ceph $cmd"
            fi

            # StarlingX: not running as ceph user/group
            cmd="$files $tcmalloc $wrap $cmd --cluster $cluster $runmode"
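            # Example of a fully composed start command for osd.2 (illustrative;
            # actual paths come from $BINDIR, $conf and the get_conf values above):
            #   ulimit -n 32768; systemd-run --scope --unit=ceph-osd-2 --slice=system-ceph \
            #       /usr/bin/ceph-osd -i 2 --pid-file /var/run/ceph/osd.2.pid \
            #       -c /etc/ceph/ceph.conf --cluster ceph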
            if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then
                get_conf pre_mount "true" "pre mount command"
                get_conf fs_type "" "osd mkfs type"

                if [ -z "$fs_type" ]; then
                    # try to fall back to old keys
                    get_conf tmp_devs "" "btrfs devs"
                    if [ -n "$tmp_devs" ]; then
                        fs_type="btrfs"
                    else
                        echo No filesystem type defined!
                        exit 0
                    fi
                fi

                get_conf fs_opt "" "osd mount options $fs_type"
                if [ -z "$fs_opt" ]; then
                    if [ "$fs_type" = "btrfs" ]; then
                        # try to fall back to old keys
                        get_conf fs_opt "" "btrfs options"
                    fi
                    if [ -z "$fs_opt" ]; then
                        if [ "$fs_type" = "xfs" ]; then
                            fs_opt="rw,noatime,inode64"
                        else
                            # fall back to using at least noatime
                            fs_opt="rw,noatime"
                        fi
                    fi
                fi

                [ -n "$fs_opt" ] && fs_opt="-o $fs_opt"
                [ -n "$pre_mount" ] && do_cmd "$pre_mount"

                do_root_cmd_okfail "mkdir -p $fs_path"
                if [ "$fs_type" = "btrfs" ]; then
                    echo Mounting Btrfs on $host:$fs_path
                    do_root_cmd_okfail "modprobe btrfs ; btrfs device scan || btrfsctl -a ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts && umount $fs_path ; mount -t btrfs $fs_opt $first_dev $fs_path"
                else
                    echo Mounting $fs_type on $host:$fs_path
                    do_root_cmd_okfail "modprobe $fs_type ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts && umount $fs_path ; MOUNT_RESULT=\$(mount -t $fs_type $fs_opt $first_dev $fs_path 2>&1); [ \$? == 0 ] || (echo \$MOUNT_RESULT | grep 'already mounted on ${fs_path}')"
                fi
                if [ "$ERR" != "0" ]; then
                    EXIT_STATUS=$ERR
                    continue
                fi
            fi

            save_proc_startup_ok $name
            echo Starting Ceph $name on $host...
            if [ ! -d $run_dir ]; then
                # assume /var/run exists
                install -d -m0770 -o ceph -g ceph /var/run/ceph
            fi
            get_conf pre_start_eval "" "pre start eval"
            [ -n "$pre_start_eval" ] && $pre_start_eval
            get_conf pre_start "" "pre start command"
            get_conf post_start "" "post start command"
            [ -n "$pre_start" ] && do_cmd "$pre_start"
            do_cmd_okfail "$cmd" $runarg
            if [ "$ERR" != "0" ]; then
                EXIT_STATUS=$ERR
                continue
            fi

            . /etc/platform/platform.conf
            if [ "${nodetype}" = "controller" ]; then
                # StarlingX: Hook the transient services launched by systemd-run
                # to allow for proper cleanup and orderly shutdown

                # Set nullglob so wildcards will return empty string if no match
                shopt -s nullglob

                OSD_SERVICES=$(for svc in /run/systemd/system/ceph-osd*.service; do basename $svc; done | xargs echo)
                for d in /run/systemd/transient/ceph-osd*.scope; do
                    do_cmd "mkdir -p $d.d"
                    cat <<EOF > $d.d/starlingx-overrides.conf
[Unit]
Before=containerd.service
After=sm-shutdown.service
EOF
                done

                for d in /run/systemd/transient/ceph-mds*.scope; do
                    do_cmd "mkdir -p $d.d"
                    cat <<EOF > $d.d/starlingx-overrides.conf
[Unit]
Before=containerd.service
After=sm-shutdown.service
EOF
                done

                for d in /run/systemd/transient/ceph-mon*.scope; do
                    do_cmd "mkdir -p $d.d"
                    cat <<EOF > $d.d/starlingx-overrides.conf
[Unit]
Before=containerd.service
After=sm-shutdown.service ${OSD_SERVICES}
EOF
                done

                shopt -u nullglob

                systemctl daemon-reload
            fi
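            # Illustrative result of the hook above: each transient scope gets a
            # drop-in such as
            #   /run/systemd/transient/ceph-osd-2.scope.d/starlingx-overrides.conf
            # (the unit name follows the --unit passed to systemd-run earlier).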
            [ -n "$post_start" ] && do_cmd "$post_start"
            [ -n "$lockfile" ] && [ "$?" -eq 0 ] && touch $lockfile
            ;;

        stop|onestop)
            get_conf pre_stop "" "pre stop command"
            get_conf post_stop "" "post stop command"
            [ -n "$pre_stop" ] && do_cmd "$pre_stop"
            wlog $name "INFO" "Stopping process"
            if [ $(load_proc_state $name) != "$ST_HANGED" ]; then
                stop_daemon $name ceph-$type $pid_file
            else
                # first try to gracefully close the process; this should be fast if
                # its threads still respond to the TERM signal
                wlog $name "DEBUG" ">>> Sending term signal"
                stop_daemon $name ceph-$type $pid_file TERM "" 5
                wlog $name "DEBUG" ">>> Sending kill signal"
                # then just kill it
                stop_daemon $name ceph-$type $pid_file KILL
            fi
            [ -n "$pidfile" ] && rm -f $pidfile
            [ -n "$asok" ] && rm -f $asok
            [ -n "$post_stop" ] && do_cmd "$post_stop"
            [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
            # flush journal to data disk in background
            if [ "${type}" = "osd" ]; then
                CMD_OUTPUT=''
                execute_ceph_cmd CMD_OUTPUT "Ceph Status" "ceph -s"
                if [ $? == 0 ]; then
                    wlog "${name}" "INFO" "Flushing journal"
                    $(/usr/bin/ceph-osd -i $id --flush-journal) &
                else
                    wlog "${name}" "INFO" "Skipping journal flush"
                fi
            fi
            wlog $name "INFO" "Process stopped, setting state to $ST_STOPPED"
            save_proc_state $name $ST_STOPPED

            if [ $dofsumount -eq 1 ] && [ -n "$fs_devs" ]; then
                echo Unmounting OSD volume on $host:$fs_path
                do_root_cmd "umount $fs_path || true"
            fi
            ;;

        status)
            if daemon_is_running $name ceph-$type $id $pid_file; then
                # ceph processes answer in around 100ms when the process works correctly
                do_cmd "timeout 1 $BINDIR/ceph --admin-daemon $asok version 2>/dev/null || echo unknown"
                # log ceph osd state
                if [ "$type" = "osd" ]; then
                    CEPH_DAEMON_STATUS=""
                    execute_ceph_cmd CEPH_DAEMON_STATUS $name "ceph daemon $name status"
                    if [ $? -eq 0 ]; then
                        state=$(echo "$CEPH_DAEMON_STATUS" | awk -F ':' '/state/{gsub(/[,[:space:]]/, "", $2);print $2}')
                        if [ "$state" != "active" ]; then
                            wlog $name "INFO" "$name has the following state: $state"
                        fi
                    fi
                fi
                # check if daemon is hung
                is_hung=$(is_process_hung $name $type)
                if [ "$is_hung" = "true" ]; then
                    echo "$name: hung."
                    # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
                    # exit codes from 150 to 199 are application specific, therefore we define one here
                    EXIT_STATUS=150
                else
                    # Wait a period of time after OSD start before restarting based on slow/blocked requests
                    if [ "$type" = "osd" ] && [ $BLOCKED_OPS_DETECTION_ENABLED = "true" ]; then
                        up_time=$(get_proc_run_time $name)
                        if [ $up_time -gt $BLOCKED_OPS_START_DETECTION ]; then
                            has_blocked_ops=$(osd_has_blocked_ops $name)
                            if [ "$has_blocked_ops" = "true" ]; then
                                echo "$name: blocked ops."
                                # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
                                # exit codes from 150 to 199 are application specific, therefore we define one here
                                EXIT_STATUS=151
                            else
                                echo "$name: running."
                            fi
                        else
                            echo "$name: running."
                        fi
                    else
                        echo "$name: running."
                    fi
                    # Wait a period of time after OSD start before restarting based on stuck peering
                    if [ "$type" = "osd" ] && [ $STUCK_PEERING_DETECTION_ENABLED = "true" ]; then
                        up_time=$(get_proc_run_time $name)
                        if [ $up_time -gt $STUCK_PEERING_START_DETECTION ]; then
                            has_stuck_peering=$(osd_has_stuck_peering $name $id)
                            if [ "$has_stuck_peering" = "true" ]; then
                                echo "$name: stuck peering."
                                # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
                                # exit codes from 150 to 199 are application specific, therefore we define one here
                                EXIT_STATUS=152
                            else
                                echo "$name: running."
                            fi
                        else
                            echo "$name: running."
                        fi
                    else
                        echo "$name: running."
                    fi
                fi
            elif [ -e "$pid_file" ]; then
                # daemon is dead, but pid file still exists
                echo "$name: dead."
                EXIT_STATUS=1
            else
                # daemon is dead, and pid file is gone
                echo "$name: not running."
                EXIT_STATUS=3
            fi
            ;;
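        # Summary of the status exit codes set above (informational only):
        #   0 running, 1 dead but pid file exists, 3 not running (LSB),
        #   150 hung, 151 blocked ops, 152 stuck peering (application specific).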
        ssh)
            $ssh
            ;;

        forcestop)
            get_conf pre_forcestop "" "pre forcestop command"
            get_conf post_forcestop "" "post forcestop command"
            [ -n "$pre_forcestop" ] && do_cmd "$pre_forcestop"
            stop_daemon $name ceph-$type $pid_file -9
            [ -n "$post_forcestop" ] && do_cmd "$post_forcestop"
            [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
            ;;

        killall)
            echo "killall ceph-$type on $host"
            do_cmd "pkill ^ceph-$type || true"
            [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
            ;;

        force-reload | reload)
            signal_daemon $name ceph-$type $pid_file -1 "Reloading"
            ;;

        restart|onerestart)
            $0 $options stop $name
            $0 $options start $name
            ;;

        condrestart)
            if daemon_is_running $name ceph-$type $id $pid_file; then
                $0 $options stop $name
                $0 $options start $name
            else
                echo "$name: not running."
            fi
            ;;

        cleanlogs)
            echo removing logs
            [ -n "$log_dir" ] && do_cmd "rm -f $log_dir/$type.$id.*"
            ;;

        cleanalllogs)
            echo removing all logs
            [ -n "$log_dir" ] && do_cmd "rm -f $log_dir/* || true"
            ;;

        *)
            usage_exit
            ;;
    esac
done

# wait for journal flushing to complete
if [ $command == "stop" ]; then
    wait
fi

exit $EXIT_STATUS