From 09e29800cb4b55ebd4370cf5f23a333c70259c4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pedro=20Vin=C3=ADcius=20Silva=20da=20Cruz?= Date: Thu, 20 Apr 2023 08:36:22 -0400 Subject: [PATCH] Fix AIO-DX Uncontrolled Swact ceph-mon failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change resolves the scenario where, after an uncontrolled swact caused by killing one of the critical processes twice, the ceph-mon service does not start on the new active controller, causing another swact. A flag file was added to signal a complete shutdown of ceph-mon. After an uncontrolled swact, the system checks whether the flag exists and, if so, starts the ceph-mon service on the new active controller. Test Plan: PASS: System host-swact. PASS: Ceph recovery after rebooting the active controller. PASS: Ceph recovery after an uncontrolled swact caused by killing a critical process twice. PASS: Ceph recovery after a mgmt network outage of a few minutes, even when rebooting controllers. PASS: Ceph recovery after a dead office recovery (DOR). PASS: Successful upgrade from stx 7.0 to 8.0 in a duplex lab. 
Closes-bug: 2017133 Signed-off-by: Pedro Vinícius Silva da Cruz Change-Id: I6784ec76afa3e62ee14e8ca8f3d6c0212a9f6f3e --- ceph/ceph/files/ceph-init-wrapper.sh | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/ceph/ceph/files/ceph-init-wrapper.sh b/ceph/ceph/files/ceph-init-wrapper.sh index 3b1c1ce19..84d8a1eb8 100755 --- a/ceph/ceph/files/ceph-init-wrapper.sh +++ b/ceph/ceph/files/ceph-init-wrapper.sh @@ -49,6 +49,8 @@ if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; the CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_0" CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_1" CEPH_LAST_ACTIVE_CONTROLLER_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_${HOSTNAME/-/_}" + + CEPH_MON_SHUTDOWN_COMPLETE="${CEPH_MON_LIB_PATH}/.ceph_mon_shutdown_complete" fi BINDIR=/usr/bin @@ -181,7 +183,13 @@ can_start_ceph_mon () else local CEPH_OTHER_ACTIVE_CONTROLLER_FLAG="${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" fi + if [ -f "${CEPH_OTHER_ACTIVE_CONTROLLER_FLAG}" ]; then + + if [ -f "${CEPH_MON_SHUTDOWN_COMPLETE}" ]; then + return 0 + fi + # Verify drbd-cephmon status for times in {9..0}; do is_drbd_cephmon_in_sync @@ -284,6 +292,8 @@ start () # Remove old flags rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG}" + rm -f "${CEPH_MON_SHUTDOWN_COMPLETE}" + # Create new flag touch "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}" fi @@ -292,9 +302,15 @@ start () stop () { + local service="$1" + wlog "-" INFO "Ceph STOP $1 command received." with_service_lock "$1" ${CEPH_SCRIPT} stop $1 wlog "-" INFO "Ceph STOP $1 command finished." + + if [ "${service}" == "mon" ] && [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then + touch "${CEPH_MON_SHUTDOWN_COMPLETE}" + fi } restart () @@ -304,7 +320,8 @@ restart () exit 0 fi wlog "-" INFO "Ceph RESTART $1 command received." 
- with_service_lock "$1" ${CEPH_SCRIPT} restart $1 + stop "$1" + start "$1" wlog "-" INFO "Ceph RESTART $1 command finished." }