Fix memory overcommit that caused OOM killer

Parallel package builds use large ramdisks.
It's important not to commit too much memory to these
ramdisks, or we may push the system into memory exhaustion.
At that stage the kernel will invoke the OOM killer, which
will likely select our build, or worse, someone else's build,
to sacrifice.

The current algorithm only considers free memory at the instant
the parallel build starts.  It does not consider how many
other builds are in flight but have not yet allocated their
ramdisks.  Such a build intends to use the memory, but we see
that memory as free and try to use it ourselves.

The solution is to consider total memory and the number of builds
already running, or which might foreseeably start in the near future
(the share factor), to derive an alternate estimate of available memory.
We then allocate the lesser of the two estimates.

Also fixed some issues with cleaning up of child processes when
a newer mockchain-parallel is in use.

Closes-Bug: 1917525
Signed-off-by: Scott Little <scott.little@windriver.com>
Change-Id: Iab178c6f9acbd5a209d66d0da21f367911f34905
This commit is contained in:
Scott Little 2021-03-03 11:02:43 -05:00
parent 0972ffe246
commit 78be59c758
2 changed files with 72 additions and 17 deletions

View File

@ -122,6 +122,10 @@ number_of_users () {
users | tr ' ' '\n' | sort --uniq | wc -l
}
# Print the total memory of this machine, in the units of 'free -g'
# (gibibytes), taken from field 2 ('total') of the 'Mem:' row.
#
# 'free' is run under the C locale because the row labels are subject
# to translation; without it the 'Mem:' match can fail and the function
# would print nothing, breaking the arithmetic done by callers.
total_mem_gb () {
    LC_ALL=C free -g | awk '/Mem:/ { print $2 }'
}
# Print the memory currently available on this machine, in the units of
# 'free -g' (gibibytes), taken from field 7 of the 'Mem:' row.
# NOTE(review): field 7 is the 'available' column only in 'free' versions
# that print that column -- confirm for the build hosts in use.
#
# 'free' is run under the C locale because the row labels are subject
# to translation; without it the 'Mem:' match can fail and the function
# would print nothing, breaking the arithmetic done by callers.
available_mem_gb () {
    LC_ALL=C free -g | awk '/Mem:/ { print $7 }'
}
@ -238,26 +242,41 @@ compute_resources () {
local users=$(number_of_users)
if [ $users -lt 1 ]; then users=1; fi
local mem=$(available_mem_gb)
local total_mem=$(total_mem_gb)
local disk=$(available_disk_gb)
local cpus=$(number_of_cpus)
local num_users=$(sqrt $users)
local num_build=$(number_of_builds_in_progress)
num_build=$((num_build+1))
echo "compute_resources: total: cpus=$cpus, mem=$mem, disk=$disk, weight=$weight, num_build=$num_build"
echo "compute_resources: total: cpus=$cpus, total_mem=$total_mem, avail_mem=$mem, disk=$disk, weight=$weight, num_build=$num_build"
# What fraction of the machine will we use
local share_factor=$num_users
if [ $share_factor -gt $((MAX_SHARE_FACTOR+num_build-1)) ]; then share_factor=$((MAX_SHARE_FACTOR+num_build-1)); fi
if [ $share_factor -lt $num_build ]; then share_factor=$num_build; fi
local mem_share_factor=$((share_factor-num_build))
# What fraction of free memory can we use.
# e.g.
# We intend to support 4 concurrent builds (share_factor)
# Two builds (excluding ours) are already underway (num_build-1)
# So we should be able to support 2 more builds (mem_share_factor)
local mem_share_factor=$((share_factor-(num_build-1)))
if [ $mem_share_factor -lt 1 ]; then mem_share_factor=1; fi
echo "compute_resources: share_factor=$share_factor mem_share_factor=$mem_share_factor"
# What resources are we permitted to use
# Continuing the example from above ... memory share is the lesser of
# - Half the available memory (mem/mem_share_factor)
# - A quarter of the total memory (total_mem/share_factor)
local mem_share=$(((mem-MEMORY_RESERVE)/mem_share_factor))
if [ $mem_share -lt 0 ]; then mem_share=0; fi
local total_mem_share=$(((total_mem-MEMORY_RESERVE)/share_factor))
if [ $total_mem_share -lt 0 ]; then total_mem_share=0; fi
if [ $mem_share -gt $total_mem_share ]; then mem_share=$total_mem_share; fi
local disk_share=$((disk/share_factor))
local cpus_share=$((cpus/share_factor))
echo "compute_resources: our share: cpus=$cpus_share, mem=$mem_share, disk=$disk_share"
# How many build jobs, how many jobs will use tmpfs, and how much mem for each tmpfs
@ -293,7 +312,7 @@ compute_resources () {
fi
done
# Our output is saved in environmnet variables
# Our output is saved in environment variables
MOCKCHAIN_RESOURCE_ALLOCATION=$(echo $x | sed 's#^:##')
MAX_WORKERS=$workers
echo "compute_resources: MAX_WORKERS=$MAX_WORKERS, MOCKCHAIN_RESOURCE_ALLOCATION=$MOCKCHAIN_RESOURCE_ALLOCATION"
@ -654,7 +673,7 @@ kill_descendents ()
local relevant_recursive_children="$ME"
local relevant_recursive_promote_children="mock"
local relevant_other_children="mockchain-parallel mockchain-parallel-1.3.4 mockchain-parallel-1.4.16"
local relevant_other_children="mockchain-parallel mockchain-parallel-1.3.4 mockchain-parallel-1.4.16 mockchain-parallel-2.6 mockchain-parallel-2.7"
local recursive_promote_children=$(for relevant_child in $relevant_recursive_promote_children; do pgrep -P $kill_pid $relevant_child; done)
local recursive_children=$(for relevant_child in $relevant_recursive_children; do pgrep -P $kill_pid $relevant_child; done)
@ -1181,14 +1200,24 @@ mock_clean_metadata_cfg () {
return 1
fi
CMD=$((cat $CFG; \
grep config_opts\\[\'yum.conf\'\\\] $CFG | \
sed 's#\\n#\n#g') | \
grep '^[[]' | \
grep -v main | \
sed -e 's/[][]//g' -e "s#^#${PKG_MANAGER} --enablerepo=#" -e 's#$# clean metadata#' | \
sort -u | \
tr '\n' ';')
#
# From mock config, extract the embedded yum/dnf config.
# Then extract the repo definitions,
# and convert to a series of yum commands to clean the
# metadata one repo at a time. e.g.
# CMD="yum --disablerepo=* --enablerepo=StxCentos7Distro clean metadata; \
# yum --disablerepo=* --enablerepo=StxCentos7Distro-rt clean metadata;
# ...
# "
#
CMD=$((grep -e config_opts\\[\'yum.conf\'\\\] $CFG \
-e config_opts\\[\'dnf.conf\'\\\] $CFG | \
sed 's#\\n#\n#g') | \
grep '^[[]' | \
grep -v main | \
sed -e 's/[][]//g' -e "s#^#${PKG_MANAGER} --disablerepo=* --enablerepo=#" -e 's#$# clean metadata#' | \
sort -u | \
tr '\n' ';')
echo "$MOCK --root $CFG --configdir $(dirname $CFG) --chroot bash -c $CMD" &> $TMP
trapwrap_n $CFG $MOCK --root $CFG --configdir $(dirname $CFG) --chroot "bash -c '($CMD)'" &>>$TMP
RC=$?
@ -2338,6 +2367,7 @@ if [ $CAREFUL -eq 1 ]; then
CMD_OPTIONS="$MOCK_PASSTHROUGH --no-cleanup-after"
fi
CMD_OPTIONS+=" $MOCK_PASSTHROUGH --enable-plugin=package_state"
CMD_OPTIONS+=" --log=$MOCKCHAIN_LOG"
echo "CAREFUL=$CAREFUL"

View File

@ -25,7 +25,14 @@
export ME=$(basename "$0")
CMDLINE="$ME $@"
BUILD_RPMS_PARALLEL_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}" )" )"
# Set PKG_MANAGER for our build environment.
source "${BUILD_RPMS_PARALLEL_DIR}/pkg-manager-utils.sh"
# Build for distribution. Currently 'centos' is only supported value.
export DISTRO="centos"
CREATEREPO=$(which createrepo_c)
if [ $? -ne 0 ]; then
@ -42,6 +49,7 @@ if [ ! -d ${LOCAL_REPO} ]; then
fi
fi
# Make sure we have a dependency cache
DEPENDANCY_DIR="${LOCAL_REPO}/dependancy-cache"
SRPM_DIRECT_REQUIRES_FILE="$DEPENDANCY_DIR/SRPM-direct-requires"
SRPM_TRANSITIVE_REQUIRES_FILE="$DEPENDANCY_DIR/SRPM-transitive-requires"
@ -118,7 +126,7 @@ create-no-clean-list () {
local g
for g in $install_groups; do
# Find manditory packages in the group.
# Find mandatory packages in the group.
# Discard anything before (and including) 'Mandatory Packages:'
# and anything after (and including) 'Optional Packages:'.
# Also discard leading spaces or '+' characters.
@ -135,7 +143,7 @@ create-no-clean-list () {
while [ $noclean_list_len -gt $noclean_last_list_len ]; do
noclean_last_list_len=$noclean_list_len
noclean_list=$( (yum -c $MY_YUM_CONF deplist $noclean_list 2>> /dev/null | grep provider: | awk '{ print $2 }' | awk -F . '{ print $1 }'; for p in $noclean_list; do echo $p; done) | sort --uniq)
noclean_list=$( (${PKG_MANAGER} -c $MY_YUM_CONF deplist $noclean_list 2>> /dev/null | grep provider: | awk '{ print $2 }' | awk -F . '{ print $1 }'; for p in $noclean_list; do echo $p; done) | sort --uniq)
noclean_list_len=$(echo $noclean_list | wc -w)
done
@ -475,7 +483,7 @@ kill_descendents ()
local relevant_recursive_children="$ME"
local relevant_recursive_promote_children="mock"
local relevant_other_children="mockchain-parallel"
local relevant_other_children="mockchain-parallel mockchain-parallel-1.3.4 mockchain-parallel-1.4.16 mockchain-parallel-2.6 mockchain-parallel-2.7"
local recursive_promote_children=$(for relevant_child in $relevant_recursive_promote_children; do pgrep -P $kill_pid $relevant_child; done)
local recursive_children=$(for relevant_child in $relevant_recursive_children; do pgrep -P $kill_pid $relevant_child; done)
@ -964,7 +972,24 @@ mock_clean_metadata_cfg () {
return 1
fi
CMD=$((cat $CFG; grep config_opts\\[\'yum.conf\'\\\] $CFG | sed 's#\\n#\n#g') | grep '^[[]' | grep -v main | sed 's/[][]//g' | sed 's#^#yum --enablerepo=#' | sed 's#$# clean metadata#' | sort -u | tr '\n' ';')
#
# From mock config, extract the embedded yum/dnf config.
# Then extract the repo definitions,
# and convert to a series of yum commands to clean the
# metadata one repo at a time. e.g.
# CMD="yum --disablerepo=* --enablerepo=StxCentos7Distro clean metadata; \
# yum --disablerepo=* --enablerepo=StxCentos7Distro-rt clean metadata;
# ...
# "
#
CMD=$((grep -e config_opts\\[\'yum.conf\'\\\] $CFG \
-e config_opts\\[\'dnf.conf\'\\\] $CFG | \
sed 's#\\n#\n#g') | \
grep '^[[]' | \
grep -v main | \
sed -e 's/[][]//g' -e "s#^#${PKG_MANAGER} --disablerepo=* --enablerepo=#" -e 's#$# clean metadata#' | \
sort -u | \
tr '\n' ';')
echo "$MOCK --root $CFG --configdir $(dirname $CFG) --chroot bash -c $CMD" &> $TMP
trapwrap_n $CFG $MOCK --root $CFG --configdir $(dirname $CFG) --chroot "bash -c '($CMD)'" &>>$TMP
RC=$?
@ -1129,6 +1154,7 @@ clean_yum_cache_cfg () {
return $RC
}
clean_yum_cache () {
echo "${FUNCNAME[0]}: in"
clean_yum_cache_cfg $BUILD_CFG
@ -1249,7 +1275,6 @@ while true ; do
esac
done
# Reset variables
if [ -n "$MY_WORKSPACE" ]; then
export MY_WORKSPACE_TOP=${MY_WORKSPACE_TOP:-$MY_WORKSPACE}