From 655ab05b712663eb99d9a82aad48983d5ddd954b Mon Sep 17 00:00:00 2001 From: Felipe Sanches Zanoni Date: Tue, 30 May 2023 16:51:59 -0300 Subject: [PATCH] Fix Ceph mon and osd processes start/stop conditions For AIO-DX, Ceph monitor was not being started after an uncontrolled swact caused by sudden power off/reboot of the active controller, breaking the system high availability. This happens because there is a flag to indicate on which controller the last active ceph monitor was running to prevent starting ceph monitor without drbd-cephmon data in sync, which could cause Ceph data corruption. That flag was also avoiding data corruption caused when mgmt network was down and both controllers were set to be active, starting ceph monitor without drbd-cephmon in sync. To prevent data corruption and to maintain system high availability, this fix checks the mgmt network carrier instead of managing flags. If no carrier is detected on mgmt network interface, then ceph mon and osd are stopped and only allowed to start again after mgmt network has carrier. For the AIO-DX Direct, all networks are also verified. If all networks have no carrier, then the other controller is considered down, letting the working controller be in active state even if mgmt network has no carrier. Test-Plan: PASS: Run system host-swact on AIO-DX and verify ceph is running with status HEALTH_OK PASS: Force an uncontrolled swact on AIO-DX by killing a critical process and verify if ceph is running with status HEALTH_OK PASS: Disconnect OAM and MGMT networks for both controllers on AIO-DX and verify ceph mon and osd stop on both controllers. Reconnect OAM and MGMT networks and verify if ceph is running and status is HEALTH_OK PASS: Reboot or power off active controller and verify on the other controller if ceph is running with status HEALTH_WARN because one host is down. Power on the controller, wait until it is online/available. 
Verify if ceph is HEALTH_OK after all OSDs are up and data is recovered. Closes-bug: 2020889 Signed-off-by: Felipe Sanches Zanoni Change-Id: I38470f43eba86f88fb9cfe47869d2393cacbd365 --- ceph/ceph/files/ceph-init-wrapper.sh | 191 +++++++++++++++------------ 1 file changed, 104 insertions(+), 87 deletions(-) diff --git a/ceph/ceph/files/ceph-init-wrapper.sh b/ceph/ceph/files/ceph-init-wrapper.sh index 84d8a1eb8..59367480f 100755 --- a/ceph/ceph/files/ceph-init-wrapper.sh +++ b/ceph/ceph/files/ceph-init-wrapper.sh @@ -42,16 +42,7 @@ CEPH_FILE="$VOLATILE_PATH/.ceph_started" CEPH_GET_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_mon_status" CEPH_GET_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_osd_status" CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt" - -# For All-in-one duplex, set some variables -if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then - CEPH_MON_LIB_PATH=/var/lib/ceph/mon - CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_0" - CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_1" - CEPH_LAST_ACTIVE_CONTROLLER_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_${HOSTNAME/-/_}" - - CEPH_MON_SHUTDOWN_COMPLETE="${CEPH_MON_LIB_PATH}/.ceph_mon_shutdown_complete" -fi +CEPH_MON_LIB_PATH=/var/lib/ceph/mon BINDIR=/usr/bin SBINDIR=/usr/sbin @@ -95,18 +86,6 @@ if [ ! 
-z $ARGS ]; then args+=("${new_args[@]}") fi -# Verify if drbd-cephmon is in sync, checking the output of 'drbdadm dstate' -# Return 0 on success and 1 if drbd-cephmon is not ready -is_drbd_cephmon_in_sync () -{ - local DRBD_CEPHMON_STATUS=$(drbdadm dstate drbd-cephmon) - wlog "-" INFO "drbd-cephmon status: ${DRBD_CEPHMON_STATUS}" - if [ "${DRBD_CEPHMON_STATUS}" == "UpToDate/UpToDate" ]; then - return 0 - fi - return 1 -} - # Verify if drbd-cephmon role is primary, checking the output of 'drbdadm role' # Return 0 on success and 1 if drbd-cephmon is not primary is_drbd_cephmon_primary () @@ -133,6 +112,43 @@ is_drbd_cephmon_mounted () return 1 } +# Verify if oam, cluster host and mgmt networks have carrier. +# This is a special condition for AIO-DX Direct setup. +# If all networks have no carrier, then the other host is down. +# When the other host is down, ceph must start on this host. +# Return 0 if no carrier is detected on all network interfaces. +# Return 1 if carrier has been detected in at least one network interface. +has_all_network_no_carrier() +{ + ip link show "${oam_interface}" | grep NO-CARRIER + oam_carrier=$? + ip link show "${cluster_host_interface}" | grep NO-CARRIER + cluster_host_carrier=$? + ip link show "${management_interface}" | grep NO-CARRIER + mgmt_carrier=$? + + # Check if all networks have no carrier, meaning the other host is down + if [ "${oam_carrier}" -eq 0 ] && [ "${cluster_host_carrier}" -eq 0 ] && [ "${mgmt_carrier}" -eq 0 ]; then + wlog "-" INFO "No carrier detected from all network interfaces" + return 0 + fi + return 1 +} + +# Check mgmt network carrier signal +has_mgmt_network_carrier() +{ + # Checks the carrier (cable connected) for management interface + # If no-carrier message is detected, then the interface has no physical link + ip link show "${management_interface}" | grep NO-CARRIER + if [ $? 
-eq 0 ]; then + wlog "-" INFO "management interface '${management_interface}' has NO-CARRIER, cannot start ceph mon" + return 1 + fi + wlog "-" INFO "management interface '${management_interface}' is working" + return 0 +} + # Verify if ceph mon can be started on AIO-DX configuration. # This function must be called only on AIO-DX. # Return 0 on success and 1 if ceph mon cannot be started @@ -172,39 +188,6 @@ can_start_ceph_mon () return 1 fi - # Ceph mon was last active in this controller. Can run safely. - if [ -f "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}" ]; then - return 0 - fi - - # Check if last active ceph-mon was in another controller - if [ "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}" == "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" ]; then - local CEPH_OTHER_ACTIVE_CONTROLLER_FLAG="${CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG}" - else - local CEPH_OTHER_ACTIVE_CONTROLLER_FLAG="${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" - fi - - if [ -f "${CEPH_OTHER_ACTIVE_CONTROLLER_FLAG}" ]; then - - if [ -f "${CEPH_MON_SHUTDOWN_COMPLETE}" ]; then - return 0 - fi - - # Verify drbd-cephmon status - for times in {9..0}; do - is_drbd_cephmon_in_sync - if [ $? -eq 0 ]; then - # drbd-cephmon is in sync, it is safe to run. - return 0 - fi - sleep 1 - done - - # drbd-cephmon is not in sync, it is not safe to run - wlog "-" ERROR "drbd-cephmon is not in sync, cannot start ceph mon" - return 1 - fi - # This is safe to run ceph mon return 0 } @@ -260,22 +243,36 @@ start () local service="$1" - # For AIO-DX, the mon service has special treatment - if [ "${service}" == "mon" ] && [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then - # After the first controller unlock, ceph-mon is started by - # puppet-ceph module via sysvinit using /etc/init.d/ceph directly. - # Setting the controller-0 flag to the default prevents - # another controller from starting before any host-swact. - if [ ! 
-e "${CEPH_MON_LIB_PATH}"/.last_ceph_mon_active_controller_* ]; then - touch "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" + # For AIO-DX, ceph services have special treatment + if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; then + + # For ceph mon, check if drbd-cephmon is ready + if [ "${service}" == "mon" ]; then + can_start_ceph_mon + if [ $? -ne 0 ]; then + wlog "-" INFO "Ceph monitor is not ready to start because drbd-cephmon is not ready and mounted" + exit 1 + fi fi - # NOTE: In case of uncontrolled swact, to force start ceph-mon service - # it will be needed to rename the flag to the desired controller. - can_start_ceph_mon + # Check mgmt network state + has_mgmt_network_carrier if [ $? -ne 0 ]; then - wlog "-" ERROR "Ceph mon cannot be started now." - exit 1 + # If this is a AIO-DX Direct, check if all other network interfaces are down + if [ "${system_mode}" == "duplex-direct" ]; then + has_all_network_no_carrier + if [ $? -eq 0 ]; then + wlog "-" INFO "All network interfaces are not functional, considering the other host is down. Let Ceph start." + else + # Else AIO-DX Direct mgmt network is NOT functional + wlog "-" INFO "Mgmt network is not functional, defer starting Ceph processes until recovered" + exit 1 + fi + else + # Else AIO-DX mgmt network is NOT functional + wlog "-" INFO "Mgmt network is not functional, defer starting Ceph processes until recovered" + exit 1 + fi fi fi @@ -283,21 +280,6 @@ start () wlog "-" INFO "Ceph START ${service} command received" with_service_lock "${service}" ${CEPH_SCRIPT} start ${service} wlog "-" INFO "Ceph START ${service} command finished." 
- - # For AIO-DX, the mon service has special treatment - if [ "${service}" == "mon" ] && [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then - # If ceph-mon is successfully running, clear old flags and set the new one - # RC global variable is set by the with_service_lock function trying to start ceph-mon - if [ ${RC} -eq 0 ]; then - # Remove old flags - rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}" - rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG}" - rm -f "${CEPH_MON_SHUTDOWN_COMPLETE}" - - # Create new flag - touch "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}" - fi - fi } stop () @@ -307,10 +289,6 @@ stop () wlog "-" INFO "Ceph STOP $1 command received." with_service_lock "$1" ${CEPH_SCRIPT} stop $1 wlog "-" INFO "Ceph STOP $1 command finished." - - if [ "${service}" == "mon" ] && [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then - touch "${CEPH_MON_SHUTDOWN_COMPLETE}" - fi } restart () @@ -394,6 +372,27 @@ status () fi if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then + has_mgmt_network_carrier + if [ $? -eq 0 ]; then + # Network is functional, continue + wlog "-" INFO "mgmt network active..." + else + if [ "${system_mode}" == "duplex-direct" ]; then + has_all_network_no_carrier + if [ $? -ne 0 ]; then + # Network is NOT functional, prevent split brain corruptions + wlog "-" INFO "mgmt network inactive... stop OSDs to force a re-peering once the network has recovered" + stop "$1" + exit 0 + fi + else + # Network is NOT functional, prevent split brain corruptions + wlog "-" INFO "mgmt network inactive... stop OSDs to force a re-peering once the network has recovered" + stop "$1" + exit 0 + fi + fi + timeout $CEPH_STATUS_TIMEOUT ceph -s if [ "$?" -ne 0 ]; then # Ceph cluster is not accessible. Don't panic, controller swact @@ -482,6 +481,24 @@ status () test -e "/var/lib/ceph/mon/ceph-controller" if [ "$?" 
-ne 0 ]; then exit 3 + else + has_mgmt_network_carrier + if [ $? -ne 0 ]; then + if [ "${system_mode}" == "duplex-direct" ]; then + has_all_network_no_carrier + if [ $? -ne 0 ]; then + # Network is NOT functional, prevent split brain corruptions + wlog "-" INFO "mgmt network inactive... stop MON to prevent localized operation" + stop "$1" + exit 0 + fi + else + # Network is NOT functional, prevent split brain corruptions + wlog "-" INFO "mgmt network inactive... stop MON to prevent localized operation" + stop "$1" + exit 0 + fi + fi fi fi }