diff --git a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init index cad117111..3415c0417 100755 --- a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init +++ b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init @@ -104,13 +104,35 @@ STUCK_PEERING_RESTART_THRESH=180 # Restart OSD if stuck peering is longer LOG_LEVEL=NORMAL # DEBUG +# Internal Log Management +# Adding PID and PPID informations +ilog () { + local name="" + local log_level="$1" + # Checking if the first parameter is not a log level + if grep -q -v ${log_level} <<< "INFO DEBUG WARN ERROR"; then + name=" ($1)"; + log_level="$2" + shift + fi + + shift + + local message="$@" + # prefix = [] + local prefix="${BASHPID} $(cat /proc/${PPID}/comm)[${PPID}]${name}" + # yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph : + wlog "${prefix}" "${log_level}" "${message}" + return 0 +} + save_proc_state() { # Set the state of a process in the state machine and store it in a file local name=$1 local state=$2 if [ "$state" != "$ST_STARTUP" ] && [ "$state" != "$ST_OPER" ] && \ [ "$state" != "$ST_HANGED" ] && [ "$state" != "$ST_STOPPED" ]; then - wlog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace + ilog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace state=$ST_STARTUP fi echo "$state" > ${DATA_PATH}/.${name}_state @@ -125,7 +147,7 @@ load_proc_state() { fi if [ "$state" != "$ST_STARTUP" ] && [ "$state" != "$ST_OPER" ] && \ [ "$state" != "$ST_HANGED" ] && [ "$state" != "$ST_STOPPED" ]; then - wlog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace + ilog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace state=$ST_STARTUP save_proc_state $name $state fi @@ -142,7 +164,7 @@ save_proc_startup_ok() { save_proc_state $name $ST_STARTUP # Save the time when a process was started - wlog $name INFO "Process $ST_STARTUP successfully, waiting for it to become $ST_OPER" + ilog $name INFO "Process $ST_STARTUP successfully, waiting for it to become $ST_OPER" echo $(date +%s) > ${DATA_PATH}/.${name}_start_time } @@ -177,7 +199,7 @@ get_duration(){ # Check that we have a filename if [ ! -f $record ]; then - wlog $name ERROR "Failed to compute duration, time was never stored in $record!" print_trace + ilog $name ERROR "Failed to compute duration, time was never stored in $record!" print_trace echo "-1"; return fi @@ -185,7 +207,7 @@ get_duration(){ local start_time=$(cat $record) re="^[0-9]+$" if ! [[ "$start_time" =~ $re ]] ; then - wlog $name ERROR "Recorded time '$start_time' is not a number!" print_trace + ilog $name ERROR "Recorded time '$start_time' is not a number!" print_trace echo "-1"; return fi @@ -193,7 +215,7 @@ get_duration(){ local now=$(date +%s) local duration=$((now-start_time)) if [ "$duration" -lt 0 ]; then - wlog $name ERROR "Duration less than 0!" print_trace + ilog $name ERROR "Duration less than 0!" print_trace echo "-1"; return fi echo $duration @@ -202,14 +224,14 @@ get_duration(){ get_proc_run_time() { local name=$1 local time=$(get_duration $name ${DATA_PATH}/.${name}_start_time) - wlog $name DEBUG ">>> process running for: ${time}s" + ilog $name DEBUG ">>> process running for: ${time}s" echo $time } get_proc_down_time() { local name=$1 local time=$(get_duration $name ${DATA_PATH}/.${name}_down_time) - wlog $name DEBUG ">>> process down for: ${time}s" + ilog $name DEBUG ">>> process down for: ${time}s" echo $time } @@ -224,19 +246,19 @@ run_state_machine() { local state=$(load_proc_state $name) - wlog $name "DEBUG" ">>> state: $state" + ilog $name "DEBUG" ">>> state: $state" # state machine if [ "$state" = "$ST_STARTUP" ]; then - wlog $name "DEBUG" ">>> status: $status" + ilog $name "DEBUG" ">>> status: $status" if [ "$status" = $UP ]; then save_proc_state $name $ST_OPER - wlog $name "INFO" "Process is OPERATIONAL" + ilog $name "INFO" "Process is OPERATIONAL" echo $ST_OPER; return else # the process should be 'up' in $WAIT_FOR_OSD_OPERATIONAL seconds! if [ $(get_proc_run_time $name) -gt $wait_for_operational ]; then # process hung! - wlog $name "ERROR" "Process failed to go up in ${wait_for_operational}s after start, reporting it as $ST_HANGED!" + ilog $name "ERROR" "Process failed to go up in ${wait_for_operational}s after start, reporting it as $ST_HANGED!" save_proc_state $name $ST_HANGED echo $ST_HANGED; return fi @@ -244,20 +266,20 @@ run_state_machine() { elif [ "$state" = "$ST_OPER" ]; then if [ "$status" = "$DOWN" ]; then if [ $(load_proc_status $name) = $UP ];then - wlog $name "WARN" "Process went down!" + ilog $name "WARN" "Process went down!" save_proc_status $name $DOWN fi # if a process is down we don't report it as hung for a while # this should avoid status flapping if [ $(get_proc_down_time $name) -gt $wait_for_down_confirm ]; then # the process is down for a long time, report it as hung! - wlog $name "ERROR" "Process went down for more than ${wait_for_down_confirm}s, reporting it as $ST_HANGED" + ilog $name "ERROR" "Process went down for more than ${wait_for_down_confirm}s, reporting it as $ST_HANGED" save_proc_state $name $ST_HANGED echo $ST_HANGED; return fi elif [ "$status" = "$UP" ]; then if [ $(load_proc_status $name) = $DOWN ]; then - wlog $name "WARN" "Process went up, flapping status or busy process?" + ilog $name "WARN" "Process went up, flapping status or busy process?" save_proc_status $name $UP return fi @@ -279,13 +301,13 @@ execute_ceph_cmd() { errcode=$? set +o pipefail if [ -z "$output" ] && [ $errcode -eq 124 ]; then # 'timeout' returns 124 when timing out - wlog $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd" + ilog $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd" CEPH_FAILURE="true" echo ""; return 1 fi output=$(cat $DATA_PATH/.ceph_cmd_out) if [ -z "$output" ] || [ $errcode -ne 0 ]; then - wlog $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output" + ilog $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output" echo ""; return 1 fi echo "$output"; return $errcode @@ -299,7 +321,7 @@ is_process_hung() { # Abort if we had previous errors with Ceph if [ "$CEPH_FAILURE" = "true" ]; then - wlog $name "WARN" "Ceph cluster is marked as failed, aborting hang check" + ilog $name "WARN" "Ceph cluster is marked as failed, aborting hang check" echo "false"; return fi @@ -307,7 +329,7 @@ is_process_hung() { if [ -z "$CEPH_HEALTH_DETAIL" ]; then execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail" if [ $? -ne 0 ]; then - wlog $name "WARN" "Aborting hang check" + ilog $name "WARN" "Aborting hang check" echo "false"; return fi fi @@ -318,7 +340,7 @@ is_process_hung() { # Note this can be done with: 'ceph osd set noup; ceph osd down ' $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set") if [ $? -eq 0 ]; then - wlog $name "WARN" "Ceph 'noup' flag is set, aborting hang check" + ilog $name "WARN" "Ceph 'noup' flag is set, aborting hang check" echo "false"; return fi @@ -327,7 +349,7 @@ is_process_hung() { if [ -z "$CEPH_OSD_TREE" ]; then execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree" if [ $? -ne 0 ]; then - wlog $name "WARN" "Ceph cmd exec failed, aborting hang check" + ilog $name "WARN" "Ceph cmd exec failed, aborting hang check" echo "false"; return fi fi @@ -349,8 +371,8 @@ is_process_hung() { fi - # Check if a Monitor is hung - elif [ "$type" = "mon" ]; then + # Check if a Monitor is hung + elif [ "$type" = "mon" ]; then # Get monitor status info local mon_status=$UP echo "$CEPH_HEALTH_DETAIL" | grep -q -e "^[[:space:]]*$name.*down" @@ -366,10 +388,13 @@ is_process_hung() { echo "false"; return fi - else - wlog $name "WARN" "Unknown process type: $type" - fi - echo "false" + elif [ "$type" = "mds" ]; then + echo "false"; return + + else + ilog $name "WARN" "Unknown process type: $type" + fi + echo "false" } osd_has_blocked_ops() { @@ -377,7 +402,7 @@ osd_has_blocked_ops() { # Abort if we had previous errors with Ceph if [ "$CEPH_FAILURE" = "true" ]; then - wlog $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check" + ilog $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check" echo "false"; return fi @@ -387,7 +412,7 @@ osd_has_blocked_ops() { if [ -z "$CEPH_HEALTH_DETAIL" ]; then execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail" if [ $? -ne 0 ]; then - wlog $name "WARN" "Aborting blocked ops check" + ilog $name "WARN" "Aborting blocked ops check" echo "false"; return fi fi @@ -396,7 +421,7 @@ osd_has_blocked_ops() { # Note this can be done with: 'ceph osd set noup; ceph osd down ' $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set") if [ $? -eq 0 ]; then - wlog $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check" + ilog $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check" echo "false"; return fi @@ -406,7 +431,7 @@ osd_has_blocked_ops() { if [ -z "$CEPH_OSD_TREE" ]; then execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree" if [ $? -ne 0 ]; then - wlog $name "WARN" "Ceph cmd exec failed, aborting blocked ops check" + ilog $name "WARN" "Ceph cmd exec failed, aborting blocked ops check" echo "false"; return fi fi @@ -418,7 +443,7 @@ osd_has_blocked_ops() { local blocked_time=$(echo "$CEPH_HEALTH_DETAIL" | grep $name | sed -rn 's/.*ops are blocked > ([[:digit:]]*).*/\1/p') [[ "$blocked_time" == "" ]] && blocked_time=0 if [ $blocked_time -gt $BLOCKED_OPS_RESTART_THRESH ]; then - wlog $name "WARN" "Detected blocked operations for $blocked_time seconds" + ilog $name "WARN" "Detected blocked operations for $blocked_time seconds" echo "true"; return else echo "false"; return @@ -432,7 +457,7 @@ osd_has_stuck_peering() { # Abort if we had previous errors with Ceph if [ "$CEPH_FAILURE" = "true" ]; then - wlog $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check" + ilog $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check" echo "false"; return fi @@ -442,7 +467,7 @@ osd_has_stuck_peering() { if [ -z "$CEPH_HEALTH_DETAIL" ]; then execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail" if [ $? -ne 0 ]; then - wlog $name "WARN" "Aborting stuck peering check" + ilog $name "WARN" "Aborting stuck peering check" echo "false"; return fi fi @@ -451,7 +476,7 @@ osd_has_stuck_peering() { # Note this can be done with: 'ceph osd set nodown; /etc/init.d/ceph stop ' $(echo "$CEPH_HEALTH_DETAIL" | grep -q "nodown.*set") if [ $? -eq 0 ]; then - wlog $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check" + ilog $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check" echo "false"; return fi @@ -487,7 +512,7 @@ osd_has_stuck_peering() { fi if [ $blocked_time -gt $STUCK_PEERING_RESTART_THRESH ]; then - wlog $name "WARN" "Detected stuck peering for $blocked_time seconds" + ilog $name "WARN" "Detected stuck peering for $blocked_time seconds" echo "true"; return else echo "false"; return @@ -613,7 +638,7 @@ stop_daemon() { ## command line options options= IFS=" " read -r -a args <<< "$@" -wlog "-" INFO "$@" +ilog INFO "$@" OPTS=$(${GETOPT} -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "${args[@]}") if [ $? != 0 ] @@ -883,7 +908,7 @@ for name in $what; do while [[ $(date '+%s') -lt "${deadline}" ]]; do systemd_coredump_pid=$(pgrep -f "systemd-coredump.*ceph-${type}") [[ -z "${systemd_coredump_pid}" ]] && break - wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}" + ilog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}" sleep 1 done @@ -1034,16 +1059,16 @@ EOF get_conf post_stop "" "post stop command" [ -n "$pre_stop" ] && do_cmd "$pre_stop" - wlog $name "INFO" "Stopping process" + ilog $name "INFO" "Stopping process" if [ $(load_proc_state $name) != "$ST_HANGED" ]; then stop_daemon $name ceph-$type $pid_file else # first try to gracefully close process, this should be fast if # its threads still respond to the TERM signal - wlog $name "DEBUG" ">>> Sending term signal" + ilog $name "DEBUG" ">>> Sending term signal" stop_daemon $name ceph-$type $pid_file TERM "" 5 - wlog $name "DEBUG" ">>> Sending kill signal" + ilog $name "DEBUG" ">>> Sending kill signal" # then just kill it stop_daemon $name ceph-$type $pid_file KILL fi @@ -1057,13 +1082,13 @@ EOF CMD_OUTPUT='' execute_ceph_cmd CMD_OUTPUT "Ceph Status" "ceph -s" if [ $? == 0 ]; then - wlog "${name}" "INFO" "Flushing journal" + ilog "${name}" "INFO" "Flushing journal" $(/usr/bin/ceph-osd -i $id --flush-journal) & else - wlog "${name}" "INFO" "Skipping journal flush" + ilog "${name}" "INFO" "Skipping journal flush" fi fi - wlog $name "INFO" "Process stopped, setting state to $ST_STOPPED" + ilog $name "INFO" "Process stopped, setting state to $ST_STOPPED" save_proc_state $name $ST_STOPPED if [ $dofsumount -eq 1 ] && [ -n "$fs_devs" ]; then echo Unmounting OSD volume on $host:$fs_path @@ -1084,7 +1109,7 @@ EOF if [ $? -eq 0 ]; then state=$(echo "$CEPH_DAEMON_STATUS" | awk -F ':' '/state/{gsub(/[,[:space:]]/, "", $2);print $2}') if [ "$state" != "active" ]; then - wlog $name "INFO" "$name has the following state: $state" + ilog $name "INFO" "$name has the following state: $state" fi fi fi diff --git a/ceph/ceph/files/ceph-init-wrapper.sh b/ceph/ceph/files/ceph-init-wrapper.sh index 637856740..b8638a7f4 100755 --- a/ceph/ceph/files/ceph-init-wrapper.sh +++ b/ceph/ceph/files/ceph-init-wrapper.sh @@ -86,16 +86,38 @@ if [ ! -z $ARGS ]; then args+=("${new_args[@]}") fi +# Internal Log Management +# Adding PID and PPID informations +ilog () { + local name="" + local log_level="$1" + # Checking if the first parameter is not a log level + if grep -q -v ${log_level} <<< "INFO DEBUG WARN ERROR"; then + name=" ($1)"; + log_level="$2" + shift + fi + + shift + + local message="$@" + # prefix = [] + local prefix="${BASHPID} $(cat /proc/${PPID}/comm)[${PPID}]${name}" + # yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph-init-wrapper : + wlog "${prefix}" "${log_level}" "${message}" + return 0 +} + # Verify if drbd-cephmon role is primary, checking the output of 'drbdadm role' # Return 0 on success and 1 if drbd-cephmon is not primary is_drbd_cephmon_primary () { drbdadm role drbd-cephmon | grep -q 'Primary/' if [ $? -eq 0 ]; then - wlog "-" INFO "drbd-cephmon role is Primary" + ilog INFO "drbd-cephmon role is Primary" return 0 fi - wlog "-" INFO "drbd-cephmon role is NOT Primary" + ilog INFO "drbd-cephmon role is NOT Primary" return 1 } @@ -105,10 +127,10 @@ is_drbd_cephmon_mounted () { findmnt -no SOURCE "${CEPH_MON_LIB_PATH}" | grep -q drbd if [ $? -eq 0 ]; then - wlog "-" INFO "drbd-cephmon partition is mounted" + ilog INFO "drbd-cephmon partition is mounted" return 0 fi - wlog "-" INFO "drbd-cephmon partition is NOT mounted" + ilog INFO "drbd-cephmon partition is NOT mounted" return 1 } @@ -129,7 +151,7 @@ has_all_network_no_carrier() # Check if all networks have no carrier, meaning the other host is down if [ "${oam_carrier}" -eq 0 ] && [ "${cluster_host_carrier}" -eq 0 ] && [ "${mgmt_carrier}" -eq 0 ]; then - wlog "-" INFO "No carrier detected from all network interfaces" + ilog INFO "No carrier detected from all network interfaces" return 0 fi return 1 @@ -142,10 +164,10 @@ has_mgmt_network_carrier() # If no-carrier message is detected, then the interface has no physical link ip link show "${management_interface}" | grep NO-CARRIER if [ $? -eq 0 ]; then - wlog "-" INFO "management interface '${management_interface}' has NO-CARRIER, cannot start ceph mon" + ilog INFO "Management Interface '${management_interface}' has NO-CARRIER, cannot start ceph-mon" return 1 fi - wlog "-" INFO "management interface '${management_interface}' is working" + ilog "-" DEBUG "Management Interface '${management_interface}' is working" return 0 } @@ -168,7 +190,7 @@ can_start_ceph_mon () done if [ ${times} -eq 0 ]; then - wlog "-" ERROR "drbd-cephmon is not primary, cannot start ceph mon" + ilog ERROR "drbd-cephmon is NOT Primary, cannot start ceph-mon" return 1 fi @@ -184,7 +206,7 @@ can_start_ceph_mon () done if [ ${times} -eq 0 ]; then - wlog "-" ERROR "drbd-cephmon is not mounted, cannot start ceph mon" + ilog ERROR "drbd-cephmon is NOT mounted, cannot start ceph-mon" return 1 fi @@ -201,12 +223,12 @@ with_service_lock () # used for locking service actions ( # Grab service locks - wlog "-" INFO "Grab service locks" + ilog INFO "Grab service locks" [[ "${target}" == *"mon"* ]] && flock ${LOCK_CEPH_MON_SERVICE_FD} [[ "${target}" == *"osd"* ]] && flock ${LOCK_CEPH_OSD_SERVICE_FD} # Try to lock status with a timeout in case status is stuck - wlog "-" INFO "Lock service status" + ilog INFO "Lock service status" deadline=$((SECONDS + MAX_STATUS_TIMEOUT + 1)) if [[ "${target}" == *"mon"* ]]; then flock --exclusive --timeout ${MONITOR_STATUS_TIMEOUT} ${LOCK_CEPH_MON_STATUS_FD} @@ -221,7 +243,7 @@ with_service_lock () # Close lock file descriptors so they are # not inherited by the spawned process then # run service action - wlog "-" INFO "Run service action: $@" + ilog INFO "Run service action: $@" "$@" {LOCK_CEPH_MON_SERVICE_FD}>&- \ {LOCK_CEPH_MON_STATUS_FD}>&- \ {LOCK_CEPH_OSD_SERVICE_FD}>&- \ @@ -250,7 +272,7 @@ start () if [ "${service}" == "mon" ]; then can_start_ceph_mon if [ $? -ne 0 ]; then - wlog "-" INFO "Ceph monitor is not ready to start because drbd-cephmon is not ready and mounted" + ilog INFO "Ceph Monitor is not ready to start because drbd-cephmon is not ready and mounted" exit 1 fi fi @@ -262,33 +284,33 @@ start () if [ "${system_mode}" == "duplex-direct" ]; then has_all_network_no_carrier if [ $? -eq 0 ]; then - wlog "-" INFO "All network interfaces are not functional, considering the other host is down. Let Ceph start." + ilog INFO "All network interfaces are not functional, considering the other host is down. Let Ceph start." else # Else AIO-DX Direct mgmt network is NOT functional - wlog "-" INFO "Mgmt network is not functional, defer starting Ceph processes until recovered" + ilog INFO "Management Interface is not functional, defer starting Ceph processes until recovered" exit 1 fi else # Else AIO-DX mgmt network is NOT functional - wlog "-" INFO "Mgmt network is not functional, defer starting Ceph processes until recovered" + ilog INFO "Management Interface is not functional, defer starting Ceph processes until recovered" exit 1 fi fi fi # Start the service - wlog "-" INFO "Ceph START ${service} command received" + ilog INFO "Ceph START ${service} command received" with_service_lock "${service}" ${CEPH_SCRIPT} start ${service} - wlog "-" INFO "Ceph START ${service} command finished." + ilog INFO "Ceph START ${service} command finished." } stop () { local service="$1" - wlog "-" INFO "Ceph STOP $1 command received." + ilog INFO "Ceph STOP $1 command received." with_service_lock "$1" ${CEPH_SCRIPT} stop $1 - wlog "-" INFO "Ceph STOP $1 command finished." + ilog INFO "Ceph STOP $1 command finished." } restart () @@ -297,10 +319,10 @@ restart () # Ceph is not running on this node, return success exit 0 fi - wlog "-" INFO "Ceph RESTART $1 command received." + ilog INFO "Ceph RESTART $1 command received." stop "$1" start "$1" - wlog "-" INFO "Ceph RESTART $1 command finished." + ilog INFO "Ceph RESTART $1 command finished." } log_and_restart_blocked_osds () @@ -309,7 +331,7 @@ log_and_restart_blocked_osds () local names=$1 local message=$2 for name in $names; do - wlog $name "INFO" "$message" + ilog $name "INFO" "$message" # Restart the daemons but release ceph mon and osd file descriptors ${CEPH_SCRIPT} restart $name {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&- done @@ -325,17 +347,17 @@ log_and_kill_hung_procs () get_conf run_dir "/var/run/ceph" "run dir" get_conf pid_file "$run_dir/$type.$id.pid" "pid file" pid=$(cat $pid_file) - wlog $name "INFO" "Dealing with hung process (pid:$pid)" + ilog $name "INFO" "Dealing with hung process (pid:$pid)" # monitoring interval - wlog $name "INFO" "Increasing log level" + ilog $name "INFO" "Increasing log level" WAIT_FOR_CMD=10 execute_ceph_cmd ret $name "ceph daemon $name config set debug_$type 20/20" monitoring=$MONITORING_INTERVAL while [ $monitoring -gt 0 ]; do if [ $(($monitoring % $TRACE_LOOP_INTERVAL)) -eq 0 ]; then date=$(date "+%Y-%m-%d_%H-%M-%S") log_file="$LOG_PATH/hang_trace_${name}_${pid}_${date}.log" - wlog $name "INFO" "Dumping stack trace to: $log_file" + ilog $name "INFO" "Dumping stack trace to: $log_file" if grep -q "Debian" /etc/os-release; then $(eu-stack -p $pid >$log_file) & elif grep -q "CentOS" /etc/os-release; then @@ -345,7 +367,7 @@ log_and_kill_hung_procs () let monitoring-=1 sleep 1 done - wlog $name "INFO" "Trigger core dump" + ilog $name "INFO" "Trigger core dump" kill -ABRT $pid &>/dev/null rm -f $pid_file # process is dead, core dump is archiving, preparing for restart # Wait for pending systemd core dumps @@ -354,7 +376,7 @@ log_and_kill_hung_procs () while [[ $(date '+%s') -lt "${deadline}" ]]; do systemd_coredump_pid=$(pgrep -f "systemd-coredump.*${pid}.*ceph-${type}") [[ -z "${systemd_coredump_pid}" ]] && break - wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}" + ilog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}" sleep 2 done kill -KILL $pid &>/dev/null @@ -375,19 +397,19 @@ status () has_mgmt_network_carrier if [ $? -eq 0 ]; then # Network is functional, continue - wlog "-" INFO "mgmt network active..." + ilog DEBUG "Management Interface active." else if [ "${system_mode}" == "duplex-direct" ]; then has_all_network_no_carrier if [ $? -ne 0 ]; then # Network is NOT functional, prevent split brain corruptions - wlog "-" INFO "mgmt network inactive... stop OSDs to force a re-peering once the network has recovered" + ilog INFO "Management Interface inactive. Stopping OSDs to force a re-peering once the network has recovered" stop "$1" exit 0 fi else # Network is NOT functional, prevent split brain corruptions - wlog "-" INFO "mgmt network inactive... stop OSDs to force a re-peering once the network has recovered" + ilog INFO "Management Interface inactive. Stopping OSDs to force a re-peering once the network has recovered" stop "$1" exit 0 fi @@ -397,7 +419,7 @@ status () if [ "$?" -ne 0 ]; then # Ceph cluster is not accessible. Don't panic, controller swact # may be in progress. - wlog "-" INFO "Ceph is down, ignoring OSD status." + ilog INFO "Ceph is DOWN, ignoring OSD status." exit 0 fi fi @@ -425,7 +447,7 @@ status () flock --shared ${LOCK_CEPH_OSD_STATUS_FD} fi - result=`${CEPH_SCRIPT} status $1 {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-` + result=`ilog INFO "status $1"; ${CEPH_SCRIPT} status $1 {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-` RC=$? if [ "$RC" -ne 0 ]; then erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` @@ -488,13 +510,13 @@ status () has_all_network_no_carrier if [ $? -ne 0 ]; then # Network is NOT functional, prevent split brain corruptions - wlog "-" INFO "mgmt network inactive... stop MON to prevent localized operation" + ilog INFO "Management Interface inactive. Stopping ceph-mon to prevent localized operation" stop "$1" exit 0 fi else # Network is NOT functional, prevent split brain corruptions - wlog "-" INFO "mgmt network inactive... stop MON to prevent localized operation" + ilog INFO "Management Interface inactive. Stopping ceph-mon to prevent localized operation" stop "$1" exit 0 fi