From 11fd5d9cd48a1539b9c7a4ebc8aaad69ed24ae5b Mon Sep 17 00:00:00 2001 From: Dan Voiculeasa Date: Thu, 21 Nov 2019 15:01:36 +0200 Subject: [PATCH] ceph-init-wrapper: Detect stuck peering OSDs and restart them OSDs might become stuck peering. Recover from such state. Closes-bug: 1851287 Change-Id: I2ef1a0e93d38c3d041ee0c5c1e66a4ac42785a68 Signed-off-by: Dan Voiculeasa --- ceph/ceph/files/ceph-init-wrapper.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ceph/ceph/files/ceph-init-wrapper.sh b/ceph/ceph/files/ceph-init-wrapper.sh index de1bb3549..ddbbc8443 100755 --- a/ceph/ceph/files/ceph-init-wrapper.sh +++ b/ceph/ceph/files/ceph-init-wrapper.sh @@ -156,8 +156,9 @@ log_and_restart_blocked_osds () { # Log info about the blocked osd daemons and then restart it local names=$1 + local message=$2 for name in $names; do - wlog $name "INFO" "Restarting OSD with blocked operations" + wlog $name "INFO" "$message" ${CEPH_SCRIPT} restart $name done } @@ -253,6 +254,7 @@ status () erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` + stuck_peering_procs=`echo "$result" | sort | uniq | awk ' /stuck peering/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` invalid=0 host=`hostname` if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then @@ -267,14 +269,12 @@ status () fi done - log_and_restart_blocked_osds $blocked_ops_procs + log_and_restart_blocked_osds "$blocked_ops_procs"\ + "Restarting OSD with blocked operations" + log_and_restart_blocked_osds "$stuck_peering_procs"\ + "Restarting OSD stuck peering" log_and_kill_hung_procs $hung_procs - hung_procs_text="" - for i in $(echo $hung_procs); do - hung_procs_text+="$i(process hung) " - done - rm -f $CEPH_STATUS_FAILURE_TEXT_FILE if [ $invalid -eq 0 ]; then text=""