From 696f987a174725cb2d87a56935ac45e3b2fa56cb Mon Sep 17 00:00:00 2001
From: Jim Gauld
Date: Tue, 16 Jul 2019 15:35:36 -0400
Subject: [PATCH] AIO reaffine DRBD tasks during startup

This will speed up the initial DRBD sync on AIO when there is a limited
number of platform cores by reaffining DRBD tasks to use all cpus.

This enhances the affine-tasks init script to dynamically reaffine
CPU-intensive DRBD tasks. The receiver threads (i.e., drbd_r_*) may use
a full core each. On systems with fast disks, we notice the receiver
threads and softirq processing get CPU limited by the number of platform
cores configured.

The DRBD receiver tasks are reaffined initially to float across all
cores. This will poll for newly created DRBD resources and reaffine
them as they are found until all DRBD resources have started.

This script waits for sufficient platform readiness criteria. Once the
system is at steady-state, this will ensure that DRBD tasks are
constrained to platform cores and do not run on cores with
VMs/containers.

The DRBD configuration file affinity option is left as-is in case the
DRBD kernel threads are restarted for some reason.
Change-Id: I019137ea1cf3736768ad8882bd8d8628cc5c2857
Closes-Bug: 1832781
Signed-off-by: Jim Gauld
---
 worker-utils/worker-utils/affine-tasks.sh | 89 ++++++++++++++++++++++-
 1 file changed, 88 insertions(+), 1 deletion(-)

diff --git a/worker-utils/worker-utils/affine-tasks.sh b/worker-utils/worker-utils/affine-tasks.sh
index 303865ebb5..ddf566e53f 100644
--- a/worker-utils/worker-utils/affine-tasks.sh
+++ b/worker-utils/worker-utils/affine-tasks.sh
@@ -44,7 +44,7 @@
 #
 ################################################################################
 # Define minimal path
-PATH=/bin:/usr/bin:/usr/local/bin
+PATH=/bin:/usr/bin:/usr/sbin:/usr/local/bin
 
 CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"}
 [[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS}
@@ -273,6 +273,72 @@ END { printf "%d\n", n; }
     return ${PASS}
 }
 
+# Get number of DRBD resources started.
+# Counts the /proc/drbd lines that carry a connection state ("cs:") and
+# prints the count on stdout.
+# Prints 0 if DRBD not ready (/proc/drbd unreadable or no "cs:" lines).
+function number_drbd_resources_started {
+    local started
+
+    # Number of started DRBD resources. cat's error is discarded so that
+    # awk sees empty input, and its END rule still prints 0.
+    started=$(cat /proc/drbd 2>/dev/null | \
+        awk '/cs:/ { n+=1; } END {printf "%d\n", n}')
+    echo "${started}"
+}
+
+# Check criteria for all drbd resources started, i.e., the number of
+# started resources equals the number of resources listed by
+# 'drbdadm sh-resources'.
+# Globals: NOT_READY_REASON (written) - cleared on entry; describes the
+#          shortfall when the check fails.
+# Returns: 0 if all resources have started, 1 otherwise.
+function all_drbd_resources_started {
+    local PASS=0
+    local FAIL=1
+    local -i started=0
+    local -i resources=0
+
+    # Global variable
+    NOT_READY_REASON=""
+
+    # Number of started DRBD resources
+    started=$(number_drbd_resources_started)
+    if [ ${started} -eq 0 ]; then
+        NOT_READY_REASON="no drbd resources started"
+        return ${FAIL}
+    fi
+
+    # Number of expected DRBD resources (count of fields on the last line
+    # of the 'drbdadm sh-resources' output)
+    resources=$(drbdadm sh-resources | \
+        awk -vFS='[[:space:]]' 'END {print NF}')
+    if [ ${started} -ne ${resources} ]; then
+        NOT_READY_REASON="${started} of ${resources} drbd resources started"
+        return ${FAIL}
+    fi
+
+    return ${PASS}
+}
+
+# Affine the DRBD receiver threads (drbd_r_*) to the given cpus.
+# Arguments: $1 - cpu list, passed to 'taskset --cpu-list'
+function affine_drbd_tasks {
+    local CPUS=$1
+    local pidlist
+    local pid
+
+    LOG "Affine drbd tasks, CPUS=${CPUS}"
+
+    # Affine only the drbd_r_* threads. The DRBD receiver threads are
+    # particularly CPU intensive. Leave the other DRBD threads alone.
+    pidlist=$(pgrep drbd_r_)
+    # pidlist is a plain string of pids; iterate over its words.
+    for pid in ${pidlist}; do
+        taskset --pid --cpu-list "${CPUS}" "${pid}" > /dev/null 2>&1
+    done
+}
+
 # Return list of reaffineable pids. This includes all processes, but excludes
 # kernel threads, vSwitch, and anything in K8S or qemu/kvm.
 function reaffineable_pids {
@@ -332,6 +398,9 @@ function affine_tasks_to_platform_cores {
             taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1
     done
 
+    # Reaffine drbd_r_* threads to platform cpus
+    affine_drbd_tasks ${PLATFORM_CPUS}
+
     LOG "Affined ${count} processes to platform cores."
 }
 
@@ -381,6 +450,24 @@ function start {
         update_cgroup_cpuset_k8s_infra_all
     fi
 
+    # Wait for all DRBD resources to have started. Affine DRBD tasks
+    # to float on all cores as we find them.
+    t0=${SECONDS}
+    until all_drbd_resources_started; do
+        started=$(number_drbd_resources_started)
+        if [ ${started} -gt 0 ]; then
+            affine_drbd_tasks ${NONISOL_CPUS}
+        fi
+        dt=$(( ${SECONDS} - ${t0} ))
+        if [ ${dt} -ge ${PRINT_INTERVAL_SECONDS} ]; then
+            t0=${SECONDS}
+            LOG "Recovery wait, elapsed ${SECONDS} seconds." \
+                "Reason: ${NOT_READY_REASON}"
+        fi
+        sleep ${INIT_INTERVAL_SECONDS}
+    done
+    affine_drbd_tasks ${NONISOL_CPUS}
+
     # Wait until K8s pods have recovered and nova-compute is running
     t0=${SECONDS}
     until is_k8s_platform_steady_state_ready; do