From b629db6b9fc58a96171f2142b347463cc9fd288f Mon Sep 17 00:00:00 2001 From: Hediberto Cavalcante da Silva Date: Mon, 30 Jan 2023 13:09:42 -0500 Subject: [PATCH] AIO-DX Ceph Optimizations This change is part of the solution to resolve the scenario where Ceph MON starts without having data in sync when there is no communication with the peer, leading to PG issues. Improvements: Removed starting Ceph MON and MDS from ceph.sh script called by mtcClient for AIO-DX: - Ceph MDS was not being managed, only started by ceph.sh script called from mtcClient. Now it will be managed by PMON. - Ceph MON will continue to be managed by SM. Ceph-init-wrapper script will verify some conditions to start Ceph MON safely: - First, check if drbd-cephmon role is Primary. - Then, check if drbd-cephmon partition is mounted correctly. - Check flags (inside drbd-cephmon path) for last active Ceph MON process (Controller-0 or Controller-1). This flag is created by the last successful Ceph MON start. - If the last active monitor is the other one, check if drbd-cephmon is UpToDate/UpToDate, meaning that data is synchronized between controllers. We also made some improvements to /etc/init.d/ceph script to be able to stop Ceph OSD even if Ceph MON was not available. Stopping OSD without a Ceph Monitor hung because the command to flush the journal waited forever to communicate with any available Ceph Monitor. Test Plan: PASS: system host-swact. PASS: Ceph recovery after mgmt network outage for a few minutes even when rebooting controllers. PASS: Ceph recovery after rebooting active controller. PASS: Ceph recovery after case of dead office recovery (DOR). PASS: Running shellcheck on ceph-base.ceph.init, ceph.sh, and ceph-init-wrapper.sh files without any complaints about the lines related to the changes. 
Closes-bug: 2004183 Signed-off-by: Hediberto Cavalcante da Silva Change-Id: Id09432aecef68b39adabf633c74545f2efa02e99 --- .../debian/deb_folder/ceph-base.ceph.init | 59 ++++++- ceph/ceph/debian/deb_folder/ceph-base.install | 1 + ceph/ceph/debian/deb_folder/rules | 32 ++-- ceph/ceph/files/ceph-init-wrapper.sh | 166 +++++++++++++++++- ceph/ceph/files/ceph-mds.conf.pmon | 26 +++ ceph/ceph/files/ceph.sh | 25 +-- 6 files changed, 271 insertions(+), 38 deletions(-) create mode 100644 ceph/ceph/files/ceph-mds.conf.pmon diff --git a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init index badd1ea9b..d29d2849a 100755 --- a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init +++ b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init @@ -612,8 +612,10 @@ stop_daemon() { ## command line options options= +IFS=" " read -r -a args <<< "$@" +wlog "-" INFO "$@" -OPTS=$(${GETOPT} -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "$@") +OPTS=$(${GETOPT} -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "${args[@]}") if [ $? != 0 ] then exit 1 @@ -735,11 +737,51 @@ if [ "$command" = "stop" -o "$command" = "onestop" ]; then what="$new_order" fi -# Check if the monitors are up before starting any mds -# This is needed only for Standard deployments . /etc/platform/platform.conf + +# When this is a AIO-DX pmon is monitoring ceph-mds process. +# If ceph-mon is not running, ceph-mds will hang when starting. 
+# Check if we are trying to bring up ceph-mds and ceph-mon is not ready yet +if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then + if [ "${command}" = "start" -o "${command}" = "onestart" ]; then + what_out= + what_mds= + re="\s*mon" + if [[ ${what} =~ ${re} ]]; then + has_mon=1 + else + has_mon=0 + CEPH_STATUS='' + execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s" + if [ $? -eq 0 ]; then + has_mon=1 + fi + fi + for name in ${what}; do + type=$(echo "${name}" | cut -c 1-3) + if [ "${type}" == "mds" ]; then + what_mds="${name}" + continue + fi + what_out+=" ${name}" + done + if [ ${has_mon} -eq 1 ] && [ ! -z "${what_mds}" ]; then + what_out+=" ${what_mds}" + fi + what="${what_out}" + fi + # If the variable 'what' is empty, then it was trying to bring up ceph-mds but ceph-mon is not active. + # When ceph-mon is not active, we cannot execute ceph-mds yet, thus returning error. + if [ -z "${what}" ]; then + EXIT_STATUS=1 + fi +fi + +# Check if the monitors are up before starting any mds +# This is needed only for Standard deployments + if [ "$system_type" == "Standard" ]; then CEPH_STATUS='' execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s" @@ -999,8 +1041,15 @@ EOF [ -n "$post_stop" ] && do_cmd "$post_stop" [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile # flush journal to data disk in background - if [ "$type" = "osd" ];then - $(/usr/bin/ceph-osd -i $id --flush-journal) & + if [ "${type}" = "osd" ];then + CMD_OUTPUT='' + execute_ceph_cmd CMD_OUTPUT "Ceph Status" "ceph -s" + if [ $? 
== 0 ]; then + wlog "${name}" "INFO" "Flushing journal" + $(/usr/bin/ceph-osd -i $id --flush-journal) & + else + wlog "${name}" "INFO" "Skipping journal flush" + fi fi wlog $name "INFO" "Process stopped, setting state to $ST_STOPPED" save_proc_state $name $ST_STOPPED diff --git a/ceph/ceph/debian/deb_folder/ceph-base.install b/ceph/ceph/debian/deb_folder/ceph-base.install index 9cecc1aa1..549058c0c 100644 --- a/ceph/ceph/debian/deb_folder/ceph-base.install +++ b/ceph/ceph/debian/deb_folder/ceph-base.install @@ -26,6 +26,7 @@ etc/init.d/ceph etc/init.d/mgr-restful-plugin etc/init.d/ceph-init-wrapper etc/ceph/ceph.conf.pmon +etc/ceph/ceph-mds.conf.pmon etc/ceph/ceph.conf etc/services.d/* usr/sbin/ceph-preshutdown.sh diff --git a/ceph/ceph/debian/deb_folder/rules b/ceph/ceph/debian/deb_folder/rules index 7709b7aff..a4590e681 100755 --- a/ceph/ceph/debian/deb_folder/rules +++ b/ceph/ceph/debian/deb_folder/rules @@ -6,13 +6,15 @@ SOURCE1 := ceph.sh SOURCE2 := mgr-restful-plugin.py SOURCE3 := ceph.conf.pmon -SOURCE4 := ceph-init-wrapper.sh -SOURCE5 := ceph.conf -SOURCE6 := ceph-manage-journal.py -SOURCE7 := ceph.service -SOURCE8 := mgr-restful-plugin.service -SOURCE9 := ceph-preshutdown.sh -SOURCE10 := starlingx-docker-override.conf +SOURCE4 := ceph-mds.conf.pmon +SOURCE5 := ceph-init-wrapper.sh +SOURCE6 := ceph.conf +SOURCE7 := ceph-manage-journal.py +SOURCE8 := ceph.service +SOURCE9 := mgr-restful-plugin.service +SOURCE10 := ceph-preshutdown.sh +SOURCE11 := starlingx-docker-override.conf + # Paths export DESTDIR = $(CURDIR)/debian/tmp @@ -188,13 +190,14 @@ override_dh_auto_install: install -D -m 750 ${SOURCE1} $(DESTDIR)/${SYSCONFDIR}/services.d/worker/ install -D -m 750 ${SOURCE2} $(DESTDIR)/${INITDIR}/mgr-restful-plugin install -D -m 750 ${SOURCE3} $(DESTDIR)/${SYSCONFDIR}/ceph/ - install -D -m 750 ${SOURCE4} $(DESTDIR)/${INITDIR}/ceph-init-wrapper - install -D -m 640 ${SOURCE5} $(DESTDIR)/${SYSCONFDIR}/ceph/ - install -D -m 700 ${SOURCE6} 
$(DESTDIR)/${SBINDIR}/ceph-manage-journal - install -D -m 644 ${SOURCE7} $(DESTDIR)/${UNITDIR}/ceph.service - install -D -m 644 ${SOURCE8} $(DESTDIR)/${UNITDIR}/mgr-restful-plugin.service - install -D -m 700 ${SOURCE9} $(DESTDIR)/${SBINDIR}/ceph-preshutdown.sh - install -D -m 644 ${SOURCE10} $(DESTDIR)/${UNITDIR}/docker.service.d/starlingx-docker-override.conf + install -D -m 750 ${SOURCE4} $(DESTDIR)/${SYSCONFDIR}/ceph/ + install -D -m 750 ${SOURCE5} $(DESTDIR)/${INITDIR}/ceph-init-wrapper + install -D -m 640 ${SOURCE6} $(DESTDIR)/${SYSCONFDIR}/ceph/ + install -D -m 700 ${SOURCE7} $(DESTDIR)/${SBINDIR}/ceph-manage-journal + install -D -m 644 ${SOURCE8} $(DESTDIR)/${UNITDIR}/ceph.service + install -D -m 644 ${SOURCE9} $(DESTDIR)/${UNITDIR}/mgr-restful-plugin.service + install -D -m 700 ${SOURCE10} $(DESTDIR)/${SBINDIR}/ceph-preshutdown.sh + install -D -m 644 ${SOURCE11} $(DESTDIR)/${UNITDIR}/docker.service.d/starlingx-docker-override.conf install -m 750 src/init-radosgw $(DESTDIR)/${INITDIR}/ceph-radosgw sed -i '/### END INIT INFO/a SYSTEMCTL_SKIP_REDIRECT=1' $(DESTDIR)/${INITDIR}/ceph-radosgw install -m 750 src/init-rbdmap $(DESTDIR)/${INITDIR}/rbdmap @@ -275,6 +278,7 @@ override_dh_fixperms: -Xceph.sh \ -Xmgr-restful-plugin \ -Xceph.conf.pmon \ + -Xceph-mds.conf.pmon \ -Xceph-init-wrapper \ -Xceph.conf \ -Xceph-manage-journal \ diff --git a/ceph/ceph/files/ceph-init-wrapper.sh b/ceph/ceph/files/ceph-init-wrapper.sh index 33f56c441..3b1c1ce19 100755 --- a/ceph/ceph/files/ceph-init-wrapper.sh +++ b/ceph/ceph/files/ceph-init-wrapper.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2019 Wind River Systems, Inc. +# Copyright (c) 2019-2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -14,8 +14,8 @@ # "/var/run/.ceph_started" when ceph is running and remove it when # is not. 
# -# The script also extracts one or more ceph process names that are -# reported as 'not running' or 'dead' or 'failed' by '/etc/intit.d/ceph status' +# The script also extracts one or more ceph process names that are +# reported as 'not running' or 'dead' or 'failed' by '/etc/init.d/ceph status' # and writes the names to a text file: /tmp/ceph_status_failure.txt for # pmond to access. The pmond adds the text to logs and alarms. Example of text # samples written to file by this script are: @@ -24,7 +24,7 @@ # 'mon.storage-0' # 'mon.storage-0, osd.2' # -# Moreover, for processes that are reported as 'hung' by '/etc/intit.d/ceph status' +# Moreover, for processes that are reported as 'hung' by '/etc/init.d/ceph status' # the script will try increase their logging to 'debug' for a configurable interval. # With logging increased it will outputs a few stack traces then, at the end of this # interval, it dumps its stack core and kills it. @@ -43,6 +43,14 @@ CEPH_GET_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_mon_status" CEPH_GET_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_osd_status" CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt" +# For All-in-one duplex, set some variables +if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then + CEPH_MON_LIB_PATH=/var/lib/ceph/mon + CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_0" + CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_1" + CEPH_LAST_ACTIVE_CONTROLLER_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_${HOSTNAME/-/_}" +fi + BINDIR=/usr/bin SBINDIR=/usr/sbin if grep -q "Debian" /etc/os-release; then @@ -85,6 +93,114 @@ if [ ! 
-z $ARGS ]; then args+=("${new_args[@]}") fi +# Verify if drbd-cephmon is in sync, checking the output of 'drbdadm dstate' +# Return 0 on success and 1 if drbd-cephmon is not ready +is_drbd_cephmon_in_sync () +{ + local DRBD_CEPHMON_STATUS=$(drbdadm dstate drbd-cephmon) + wlog "-" INFO "drbd-cephmon status: ${DRBD_CEPHMON_STATUS}" + if [ "${DRBD_CEPHMON_STATUS}" == "UpToDate/UpToDate" ]; then + return 0 + fi + return 1 +} + +# Verify if drbd-cephmon role is primary, checking the output of 'drbdadm role' +# Return 0 on success and 1 if drbd-cephmon is not primary +is_drbd_cephmon_primary () +{ + drbdadm role drbd-cephmon | grep -q 'Primary/' + if [ $? -eq 0 ]; then + wlog "-" INFO "drbd-cephmon role is Primary" + return 0 + fi + wlog "-" INFO "drbd-cephmon role is NOT Primary" + return 1 +} + +# Verify if drbd-cephmon partition is mounted. +# Return 0 on success and 1 if drbd-cephmon partition is not mounted +is_drbd_cephmon_mounted () +{ + findmnt -no SOURCE "${CEPH_MON_LIB_PATH}" | grep -q drbd + if [ $? -eq 0 ]; then + wlog "-" INFO "drbd-cephmon partition is mounted" + return 0 + fi + wlog "-" INFO "drbd-cephmon partition is NOT mounted" + return 1 +} + +# Verify if ceph mon can be started on AIO-DX configuration. +# This function must be called only on AIO-DX. +# Return 0 on success and 1 if ceph mon cannot be started +can_start_ceph_mon () +{ + local times="" + + # Verify if drbd-cephmon has role Primary + # Retries 10 times, 1 second interval + for times in {9..0}; do + is_drbd_cephmon_primary + if [ $? -eq 0 ]; then + times=-1 + break; + fi + sleep 1 + done + + if [ ${times} -eq 0 ]; then + wlog "-" ERROR "drbd-cephmon is not primary, cannot start ceph mon" + return 1 + fi + + # Check if drbd-cephmon partition is mounted + # Retries 10 times, 1 second interval + for times in {9..0}; do + is_drbd_cephmon_mounted + if [ $? 
-eq 0 ]; then + times=-1 + break; + fi + sleep 1 + done + + if [ ${times} -eq 0 ]; then + wlog "-" ERROR "drbd-cephmon is not mounted, cannot start ceph mon" + return 1 + fi + + # Ceph mon was last active in this controller. Can run safely. + if [ -f "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}" ]; then + return 0 + fi + + # Check if last active ceph-mon was in another controller + if [ "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}" == "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" ]; then + local CEPH_OTHER_ACTIVE_CONTROLLER_FLAG="${CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG}" + else + local CEPH_OTHER_ACTIVE_CONTROLLER_FLAG="${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" + fi + if [ -f "${CEPH_OTHER_ACTIVE_CONTROLLER_FLAG}" ]; then + # Verify drbd-cephmon status + for times in {9..0}; do + is_drbd_cephmon_in_sync + if [ $? -eq 0 ]; then + # drbd-cephmon is in sync, it is safe to run. + return 0 + fi + sleep 1 + done + + # drbd-cephmon is not in sync, it is not safe to run + wlog "-" ERROR "drbd-cephmon is not in sync, cannot start ceph mon" + return 1 + fi + + # This is safe to run ceph mon + return 0 +} + with_service_lock () { local target="$1"; shift @@ -133,9 +249,45 @@ start () # Ceph is not running on this node, return success exit 0 fi - wlog "-" INFO "Ceph START $1 command received" - with_service_lock "$1" ${CEPH_SCRIPT} start $1 - wlog "-" INFO "Ceph START $1 command finished." + + local service="$1" + + # For AIO-DX, the mon service has special treatment + if [ "${service}" == "mon" ] && [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then + # After the first controller unlock, ceph-mon is started by + # puppet-ceph module via sysvinit using /etc/init.d/ceph directly. + # Setting the controller-0 flag to the default prevents + # another controller from starting before any host-swact. + if [ ! 
-e "${CEPH_MON_LIB_PATH}"/.last_ceph_mon_active_controller_* ]; then + touch "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" + fi + + # NOTE: In case of uncontrolled swact, to force start ceph-mon service + # it will be needed to rename the flag to the desired controller. + can_start_ceph_mon + if [ $? -ne 0 ]; then + wlog "-" ERROR "Ceph mon cannot be started now." + exit 1 + fi + fi + + # Start the service + wlog "-" INFO "Ceph START ${service} command received" + with_service_lock "${service}" ${CEPH_SCRIPT} start ${service} + wlog "-" INFO "Ceph START ${service} command finished." + + # For AIO-DX, the mon service has special treatment + if [ "${service}" == "mon" ] && [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then + # If ceph-mon is successfully running, clear old flags and set the new one + # RC global variable is set by the with_service_lock function trying to start ceph-mon + if [ ${RC} -eq 0 ]; then + # Remove old flags + rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" + rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG}" + # Create new flag + touch "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}" + fi + fi } stop () diff --git a/ceph/ceph/files/ceph-mds.conf.pmon b/ceph/ceph/files/ceph-mds.conf.pmon new file mode 100644 index 000000000..f02b22b42 --- /dev/null +++ b/ceph/ceph/files/ceph-mds.conf.pmon @@ -0,0 +1,26 @@ +[process] +process = ceph-mds +script = /etc/init.d/ceph + +style = lsb +severity = major ; minor, major, critical +restarts = 5 ; restart retries before error assertion +interval = 30 ; number of seconds to wait between restarts + +mode = status ; Monitoring mode: passive (default) or active + ; passive: process death monitoring (default: always) + ; active : heartbeat monitoring, i.e. 
request / response messaging + ; status : determine process health with executing "status" command + ; "start" is used to start the process(es) again + ; ignore : do not monitor or stop monitoring + +; Status and Active Monitoring Options + +period = 30 ; monitor period in seconds +timeout = 120 ; for active mode, messaging timeout period in seconds, must be shorter than period + ; for status mode, max amount of time for a command to execute + +; Status Monitoring Options +start_arg = start mds ; start argument for the script +status_arg = status mds ; status argument for the script +status_failure_text = /tmp/ceph_status_failure.txt ; text to be added to alarms or logs, this is optional diff --git a/ceph/ceph/files/ceph.sh b/ceph/ceph/files/ceph.sh index e646a149f..be72061e0 100644 --- a/ceph/ceph/files/ceph.sh +++ b/ceph/ceph/files/ceph.sh @@ -1,4 +1,8 @@ #!/bin/bash +# +# Copyright (c) 2023 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 INITDIR=/etc/init.d LOGFILE=/var/log/ceph/ceph-init.log @@ -22,20 +26,17 @@ logecho () start () { - SERVICES="" - if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" == "duplex" ]]; then - # In an AIO-DX configuration SM manages the floating MON and OSDs. Here - # we defer starting OSDs directly via the init script to allow SM to - # start them at the appropriate time. This will eliminate a race between - # MTC and SM starting OSDs simultaneously. Continue to start MON/MDS - # service here so that MDS is operational after the monitor is up. - SERVICES="mon mds" + if [[ "$system_type" != "All-in-one" ]] || [[ "$system_mode" != "duplex" ]]; then + logecho "Starting ceph services..." + ${INITDIR}/ceph start >> ${LOGFILE} 2>&1 + RC=$? + else + # In an AIO-DX configuration SM manages the floating MON and OSDs and pmon manages + # the ceph-mds process. Here we defer starting all ceph process to allow SM and pmon + # to start them at the appropriate time. 
+ RC=0 fi - logecho "Starting ceph ${SERVICES} services..." - ${INITDIR}/ceph start ${SERVICES} >> ${LOGFILE} 2>&1 - RC=$? - if [ ! -f ${CEPH_FILE} ]; then touch ${CEPH_FILE} fi