AIO-DX Ceph Optimizations

This change is part of the solution to resolve the scenario where Ceph MON starts without having data in sync when there is no communication with the peer, leading to PG issues. Improvements: Removed starting Ceph MON and MDS from ceph.sh script called by mtcClient for AIO-DX: - Ceph MDS was not being managed, only started by ceph.sh script called from mtcClient. Now it will be managed by PMON. - Ceph MON will continue to be managed by SM. Ceph-init-wrapper script will verify some conditions to start Ceph MON safely: - First, check if drbd-cephmon role is Primary. - Then, check if drbd-cephmon partition is mounted correctly. - Check flags (inside drbd-cephmon path) for last active Ceph MON process (Controller-0 or Controller-1). This flag will be created by the last Ceph MON successful start. - If the last active monitor is the other one, check if drbd-cephmon is UpToDate/UpToDate, meaning that data is synchronized between controllers. We also made some improvements to /etc/init.d/ceph script to be able to stop Ceph OSD even if Ceph MON was not available. Stopping OSD without a Ceph Monitor was hanging when the command to flush the journal would wait forever to communicate to any available Ceph Monitor. Test Plan: PASS: system host-swact. PASS: Ceph recovery after mgmt network outage for a few minutes even when rebooting controllers. PASS: Ceph recovery after rebooting active controller. PASS: Ceph recovery after case of dead office recovery (DOR). PASS: Running shellcheck on ceph-base.ceph.init, ceph.sh, and ceph-init-wrapper.sh files without any complaints about the lines related to the changes. Closes-bug: 2004183 Signed-off-by: Hediberto Cavalcante da Silva <hediberto.cavalcantedasilva@windriver.com> Change-Id: Id09432aecef68b39adabf633c74545f2efa02e99
2023-01-30 13:09:42 -05:00 · 2023-01-30 13:09:42 -05:00 · b629db6b9f
parent 14d53c3566
commit b629db6b9f
6 changed files with 271 additions and 38 deletions
--- a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init
+++ b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init
@ -612,8 +612,10 @@ stop_daemon() {

 ## command line options
 options=
+IFS=" " read -r -a args <<< "$@"
+wlog "-" INFO "$@"

-OPTS=$(${GETOPT} -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "$@")
+OPTS=$(${GETOPT} -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "${args[@]}")
 if [ $? != 0 ]
 then
    exit 1
@ -735,11 +737,51 @@ if [ "$command" = "stop" -o "$command" = "onestop" ]; then
    what="$new_order"
 fi

-# Check if the monitors are up before starting any mds
-# This is needed only for Standard deployments

 . /etc/platform/platform.conf

+
+# When this is a AIO-DX pmon is monitoring ceph-mds process.
+# If ceph-mon is not running, ceph-mds will hang when starting.
+# Check if we are trying to bring up ceph-mds and ceph-mon is not ready yet
+if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then
+    if [ "${command}" = "start" -o "${command}" = "onestart" ]; then
+        what_out=
+        what_mds=
+        re="\s*mon"
+        if [[ ${what} =~ ${re} ]]; then
+            has_mon=1
+        else
+            has_mon=0
+            CEPH_STATUS=''
+            execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s"
+            if [ $? -eq 0 ]; then
+                has_mon=1
+            fi
+        fi
+        for name in ${what}; do
+            type=$(echo "${name}" | cut -c 1-3)
+            if [ "${type}" == "mds" ]; then
+                what_mds="${name}"
+                continue
+            fi
+            what_out+=" ${name}"
+        done
+        if [ ${has_mon} -eq 1 ] && [ ! -z "${what_mds}" ]; then
+            what_out+=" ${what_mds}"
+        fi
+        what="${what_out}"
+    fi
+    # If the variable 'what' is empty, then it was trying to bring up ceph-mds but ceph-mon is not active.
+    # When ceph-mon is not active, we cannot execute ceph-mds yet, thus returning error.
+    if [ -z "${what}" ]; then
+        EXIT_STATUS=1
+    fi
+fi
+
+# Check if the monitors are up before starting any mds
+# This is needed only for Standard deployments
+
 if [ "$system_type" == "Standard" ]; then
    CEPH_STATUS=''
    execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s"
@ -999,8 +1041,15 @@ EOF
 	    [ -n "$post_stop" ] && do_cmd "$post_stop"
 	    [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
 	    # flush journal to data disk in background
-	    if [ "$type" = "osd" ];then
-	        $(/usr/bin/ceph-osd -i $id --flush-journal) &
+	    if [ "${type}" = "osd" ];then
+                CMD_OUTPUT=''
+                execute_ceph_cmd CMD_OUTPUT "Ceph Status" "ceph -s"
+                if [ $? == 0 ]; then
+                    wlog "${name}" "INFO" "Flushing journal"
+                    $(/usr/bin/ceph-osd -i $id --flush-journal) &
+                else
+                    wlog "${name}" "INFO" "Skipping journal flush"
+                fi
 	    fi
 	    wlog $name "INFO" "Process stopped, setting state to $ST_STOPPED"
 	    save_proc_state $name $ST_STOPPED
--- a/ceph/ceph/debian/deb_folder/ceph-base.install
+++ b/ceph/ceph/debian/deb_folder/ceph-base.install
@ -26,6 +26,7 @@ etc/init.d/ceph
 etc/init.d/mgr-restful-plugin
 etc/init.d/ceph-init-wrapper
 etc/ceph/ceph.conf.pmon
+etc/ceph/ceph-mds.conf.pmon
 etc/ceph/ceph.conf
 etc/services.d/*
 usr/sbin/ceph-preshutdown.sh
--- a/ceph/ceph/debian/deb_folder/rules
+++ b/ceph/ceph/debian/deb_folder/rules
@ -6,13 +6,15 @@
 SOURCE1 := ceph.sh
 SOURCE2 := mgr-restful-plugin.py
 SOURCE3 := ceph.conf.pmon
-SOURCE4 := ceph-init-wrapper.sh
-SOURCE5 := ceph.conf
-SOURCE6 := ceph-manage-journal.py
-SOURCE7 := ceph.service
-SOURCE8 := mgr-restful-plugin.service
-SOURCE9 := ceph-preshutdown.sh
-SOURCE10 := starlingx-docker-override.conf
+SOURCE4 := ceph-mds.conf.pmon
+SOURCE5 := ceph-init-wrapper.sh
+SOURCE6 := ceph.conf
+SOURCE7 := ceph-manage-journal.py
+SOURCE8 := ceph.service
+SOURCE9 := mgr-restful-plugin.service
+SOURCE10 := ceph-preshutdown.sh
+SOURCE11 := starlingx-docker-override.conf
+

 # Paths
 export DESTDIR = $(CURDIR)/debian/tmp
@ -188,13 +190,14 @@ override_dh_auto_install:
 	install -D -m 750 ${SOURCE1} $(DESTDIR)/${SYSCONFDIR}/services.d/worker/
 	install -D -m 750 ${SOURCE2} $(DESTDIR)/${INITDIR}/mgr-restful-plugin
 	install -D -m 750 ${SOURCE3} $(DESTDIR)/${SYSCONFDIR}/ceph/
-	install -D -m 750 ${SOURCE4} $(DESTDIR)/${INITDIR}/ceph-init-wrapper
-	install -D -m 640 ${SOURCE5} $(DESTDIR)/${SYSCONFDIR}/ceph/
-	install -D -m 700 ${SOURCE6} $(DESTDIR)/${SBINDIR}/ceph-manage-journal
-	install -D -m 644 ${SOURCE7} $(DESTDIR)/${UNITDIR}/ceph.service
-	install -D -m 644 ${SOURCE8} $(DESTDIR)/${UNITDIR}/mgr-restful-plugin.service
-	install -D -m 700 ${SOURCE9} $(DESTDIR)/${SBINDIR}/ceph-preshutdown.sh
-	install -D -m 644 ${SOURCE10} $(DESTDIR)/${UNITDIR}/docker.service.d/starlingx-docker-override.conf
+	install -D -m 750 ${SOURCE4} $(DESTDIR)/${SYSCONFDIR}/ceph/
+	install -D -m 750 ${SOURCE5} $(DESTDIR)/${INITDIR}/ceph-init-wrapper
+	install -D -m 640 ${SOURCE6} $(DESTDIR)/${SYSCONFDIR}/ceph/
+	install -D -m 700 ${SOURCE7} $(DESTDIR)/${SBINDIR}/ceph-manage-journal
+	install -D -m 644 ${SOURCE8} $(DESTDIR)/${UNITDIR}/ceph.service
+	install -D -m 644 ${SOURCE9} $(DESTDIR)/${UNITDIR}/mgr-restful-plugin.service
+	install -D -m 700 ${SOURCE10} $(DESTDIR)/${SBINDIR}/ceph-preshutdown.sh
+	install -D -m 644 ${SOURCE11} $(DESTDIR)/${UNITDIR}/docker.service.d/starlingx-docker-override.conf
 	install -m 750 src/init-radosgw $(DESTDIR)/${INITDIR}/ceph-radosgw
 	sed -i '/### END INIT INFO/a SYSTEMCTL_SKIP_REDIRECT=1' $(DESTDIR)/${INITDIR}/ceph-radosgw
 	install -m 750 src/init-rbdmap $(DESTDIR)/${INITDIR}/rbdmap
@ -275,6 +278,7 @@ override_dh_fixperms:
 	-Xceph.sh  \
 	-Xmgr-restful-plugin  \
 	-Xceph.conf.pmon  \
+	-Xceph-mds.conf.pmon  \
 	-Xceph-init-wrapper  \
 	-Xceph.conf  \
 	-Xceph-manage-journal  \
--- a/ceph/ceph/files/ceph-init-wrapper.sh
+++ b/ceph/ceph/files/ceph-init-wrapper.sh
@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2019 Wind River Systems, Inc.
+# Copyright (c) 2019-2023 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@ -14,8 +14,8 @@
 # "/var/run/.ceph_started" when ceph is running and remove it when
 # is not.
 #
-# The script also extracts  one or more ceph process names  that are
-# reported as 'not running' or 'dead' or 'failed'  by '/etc/intit.d/ceph status'
+# The script also extracts  one or more ceph process names that are
+# reported as 'not running' or 'dead' or 'failed' by '/etc/init.d/ceph status'
 # and writes the names to a text file: /tmp/ceph_status_failure.txt for
 # pmond to access. The pmond adds the text to logs and alarms. Example of text
 # samples written to file by this script are:
@ -24,7 +24,7 @@
 #   'mon.storage-0'
 #   'mon.storage-0, osd.2'
 #
-# Moreover, for processes that are reported as 'hung' by '/etc/intit.d/ceph status'
+# Moreover, for processes that are reported as 'hung' by '/etc/init.d/ceph status'
 # the script will try to increase their logging to 'debug' for a configurable interval.
 # With logging increased it will output a few stack traces then, at the end of this
 # interval, it dumps its stack core and kills it.
@ -43,6 +43,14 @@ CEPH_GET_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_mon_status"
 CEPH_GET_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_osd_status"
 CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt"

+# For All-in-one duplex, set some variables
+# The flag files live under CEPH_MON_LIB_PATH and record which controller
+# last ran ceph-mon; they are consulted by can_start_ceph_mon and refreshed
+# by start() after ceph-mon is started successfully.
+if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then
+    # Directory expected to be backed by the drbd-cephmon device
+    CEPH_MON_LIB_PATH=/var/lib/ceph/mon
+    # One flag per controller
+    CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_0"
+    CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_1"
+    # Flag of the controller running this script: the first '-' of HOSTNAME is
+    # replaced by '_' (presumably controller-0 -> controller_0; confirm HOSTNAME values)
+    CEPH_LAST_ACTIVE_CONTROLLER_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_${HOSTNAME/-/_}"
+fi
+
 BINDIR=/usr/bin
 SBINDIR=/usr/sbin
 if grep -q "Debian" /etc/os-release; then
@ -85,6 +93,114 @@ if [ ! -z $ARGS ]; then
    args+=("${new_args[@]}")
 fi

+# Check whether the drbd-cephmon resource is fully synchronized.
+# The disk state printed by 'drbdadm dstate' is logged and must be exactly
+# "UpToDate/UpToDate".
+# Return 0 when in sync and 1 otherwise
+is_drbd_cephmon_in_sync ()
+{
+    local disk_state
+
+    disk_state=$(drbdadm dstate drbd-cephmon)
+    wlog "-" INFO "drbd-cephmon status: ${disk_state}"
+    [ "${disk_state}" == "UpToDate/UpToDate" ] || return 1
+    return 0
+}
+
+# Check whether the output of 'drbdadm role' for drbd-cephmon contains
+# "Primary/". The outcome is logged either way.
+# Return 0 when the role is Primary and 1 otherwise
+is_drbd_cephmon_primary ()
+{
+    if ! drbdadm role drbd-cephmon | grep -q 'Primary/'; then
+        wlog "-" INFO "drbd-cephmon role is NOT Primary"
+        return 1
+    fi
+    wlog "-" INFO "drbd-cephmon role is Primary"
+    return 0
+}
+
+# Check whether CEPH_MON_LIB_PATH is a mount point whose source device name
+# contains "drbd". The outcome is logged either way.
+# Return 0 when the drbd-cephmon partition is mounted and 1 otherwise
+is_drbd_cephmon_mounted ()
+{
+    if findmnt -no SOURCE "${CEPH_MON_LIB_PATH}" | grep -q drbd; then
+        wlog "-" INFO "drbd-cephmon partition is mounted"
+        return 0
+    fi
+
+    wlog "-" INFO "drbd-cephmon partition is NOT mounted"
+    return 1
+}
+
+# Verify if ceph mon can be started on AIO-DX configuration.
+# This function must be called only on AIO-DX.
+# Return 0 on success and 1 if ceph mon cannot be started
+can_start_ceph_mon ()
+{
+    local times=""
+
+    # Verify if drbd-cephmon has role Primary
+    # Retries 10 times, 1 second interval
+    for times in {9..0}; do
+        is_drbd_cephmon_primary
+        if [ $? -eq 0 ]; then
+            times=-1
+            break;
+        fi
+        sleep 1
+    done
+
+    if [ ${times} -eq 0 ]; then
+        wlog "-" ERROR "drbd-cephmon is not primary, cannot start ceph mon"
+        return 1
+    fi
+
+    # Check if drbd-cephmon partition is mounted
+    # Retries 10 times, 1 second interval
+    for times in {9..0}; do
+        is_drbd_cephmon_mounted
+        if [ $? -eq 0 ]; then
+            times=-1
+            break;
+        fi
+        sleep 1
+    done
+
+    if [ ${times} -eq 0 ]; then
+        wlog "-" ERROR "drbd-cephmon is not mounted, cannot start ceph mon"
+        return 1
+    fi
+
+    # Ceph mon was last active in this controller. Can run safely.
+    if [ -f "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}" ]; then
+        return 0
+    fi
+
+    # Check if last active ceph-mon was in another controller
+    if [ "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}" == "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" ]; then
+        local CEPH_OTHER_ACTIVE_CONTROLLER_FLAG="${CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG}"
+    else
+        local CEPH_OTHER_ACTIVE_CONTROLLER_FLAG="${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}"
+    fi
+    if [ -f "${CEPH_OTHER_ACTIVE_CONTROLLER_FLAG}" ]; then
+        # Verify drbd-cephmon status
+        for times in {9..0}; do
+            is_drbd_cephmon_in_sync
+            if [ $? -eq 0 ]; then
+                # drbd-cephmon is in sync, it is safe to run.
+                return 0
+            fi
+            sleep 1
+        done
+
+        # drbd-cephmon is not in sync, it is not safe to run
+        wlog "-" ERROR "drbd-cephmon is not in sync, cannot start ceph mon"
+        return 1
+    fi
+
+    # This is safe to run ceph mon
+    return 0
+}
+
 with_service_lock ()
 {
    local target="$1"; shift
@ -133,9 +249,45 @@ start ()
        # Ceph is not running on this node, return success
        exit 0
    fi
-    wlog "-" INFO "Ceph START $1 command received"
-    with_service_lock "$1" ${CEPH_SCRIPT} start $1
-    wlog "-" INFO "Ceph START $1 command finished."
+
+    local service="$1"
+
+    # For AIO-DX, the mon service has special treatment
+    if [ "${service}" == "mon" ] && [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then
+        # After the first controller unlock, ceph-mon is started by
+        # puppet-ceph module via sysvinit using /etc/init.d/ceph directly.
+        # Setting the controller-0 flag to the default prevents
+        # another controller from starting before any host-swact.
+        if [ ! -e "${CEPH_MON_LIB_PATH}"/.last_ceph_mon_active_controller_* ]; then
+            touch "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}"
+        fi
+
+        # NOTE: In case of uncontrolled swact, to force start ceph-mon service
+        #       it will be needed to rename the flag to the desired controller.
+        can_start_ceph_mon
+        if [ $? -ne 0 ]; then
+            wlog "-" ERROR "Ceph mon cannot be started now."
+            exit 1
+        fi
+    fi
+
+    # Start the service
+    wlog "-" INFO "Ceph START ${service} command received"
+    with_service_lock "${service}" ${CEPH_SCRIPT} start ${service}
+    wlog "-" INFO "Ceph START ${service} command finished."
+
+    # For AIO-DX, the mon service has special treatment
+    if [ "${service}" == "mon" ] && [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then
+        # If ceph-mon is successfully running, clear old flags and set the new one
+        # RC global variable is set by the with_service_lock function trying to start ceph-mon
+        if [ ${RC} -eq 0 ]; then
+            # Remove old flags
+            rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}"
+            rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG}"
+            # Create new flag
+            touch "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}"
+        fi
+    fi
 }

 stop ()
--- a/ceph/ceph/files/ceph-mds.conf.pmon
+++ b/ceph/ceph/files/ceph-mds.conf.pmon
@ -0,0 +1,26 @@
+; Process monitor settings for the ceph-mds process
+[process]
+process  = ceph-mds
+script   = /etc/init.d/ceph
+
+style    = lsb
+severity = major          ; minor, major, critical
+restarts = 5              ; restart retries before error assertion
+interval = 30             ; number of seconds to wait between restarts
+
+mode = status             ; Monitoring mode: passive (default), active, status or ignore
+                          ; passive: process death monitoring (default: always)
+                          ; active : heartbeat monitoring, i.e. request / response messaging
+                          ; status : determine process health by executing the "status" command
+                          ;          "start" is used to start the process(es) again
+                          ; ignore : do not monitor or stop monitoring
+
+; Status and Active Monitoring Options
+
+period     = 30           ; monitor period in seconds
+timeout    = 120          ; for active mode, messaging timeout period in seconds, must be shorter than period
+                          ; for status mode, max amount of time for a command to execute
+
+; Status Monitoring Options
+start_arg   = start mds   ; start argument for the script
+status_arg  = status mds  ; status argument for the script
+status_failure_text = /tmp/ceph_status_failure.txt   ; text to be added to alarms or logs, this is optional
--- a/ceph/ceph/files/ceph.sh
+++ b/ceph/ceph/files/ceph.sh
@ -1,4 +1,8 @@
 #!/bin/bash
+#
+# Copyright (c) 2023 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0

 INITDIR=/etc/init.d
 LOGFILE=/var/log/ceph/ceph-init.log
@ -22,20 +26,17 @@ logecho ()

 start ()
 {
-    SERVICES=""
-    if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" == "duplex" ]]; then
-        # In an AIO-DX configuration SM manages the floating MON and OSDs. Here
-        # we defer starting OSDs directly via the init script to allow SM to
-        # start them at the appropriate time. This will eliminate a race between
-        # MTC and SM starting OSDs simultaneously. Continue to start MON/MDS
-        # service here so that MDS is operational after the monitor is up.
-        SERVICES="mon mds"
+    if [[ "$system_type" != "All-in-one" ]] || [[ "$system_mode" != "duplex" ]]; then
+        logecho "Starting ceph services..."
+        ${INITDIR}/ceph start >> ${LOGFILE} 2>&1
+        RC=$?
+    else
+        # In an AIO-DX configuration SM manages the floating MON and OSDs and pmon manages
+        # the ceph-mds process. Here we defer starting all ceph process to allow SM and pmon
+        # to start them at the appropriate time.
+        RC=0
    fi

-    logecho "Starting ceph ${SERVICES} services..."
-    ${INITDIR}/ceph start ${SERVICES} >> ${LOGFILE} 2>&1
-    RC=$?
-
    if [ ! -f ${CEPH_FILE} ]; then
        touch ${CEPH_FILE}
    fi