From 69365bb834c37dbfd21cde86bd1084df505dff74 Mon Sep 17 00:00:00 2001 From: Matt Peters Date: Thu, 7 Jun 2018 15:59:14 -0500 Subject: [PATCH] Integrate host configuration into configuration framework Integrates the following host configuration into the configuration framework: - Host boot parameters - CPU reservation - Process affinity - Memory huge page allocations Change-Id: I2259e0e93eefd5ce5000271fa32ecaa8d13fa411 Signed-off-by: Matt Peters --- compute-huge/centos/compute-huge.spec | 6 - .../compute-huge/affine-platform.sh.service | 2 +- .../compute-huge/compute-huge-goenabled.sh | 2 +- compute-huge/compute-huge/compute-huge.sh | 1512 ----------------- .../compute-huge/compute-huge.sh.service | 14 - .../compute-huge/compute_hugepages_total.conf | 0 .../src/bin/puppet-manifest-apply.sh | 18 +- puppet-manifests/src/manifests/compute.pp | 1 + .../platform/lib/facter/get_cmdline.rb | 5 + .../lib/facter/is_broadwell_processor.rb | 8 + .../lib/facter/is_gb_page_supported.rb | 7 + .../lib/facter/is_hugetlbfs_enabled.rb | 7 + .../lib/facter/is_per_numa_supported.rb | 6 + .../lib/facter/is_resctrl_supported.rb | 6 + .../lib/facter/number_of_logical_cpus.rb | 4 + .../lib/facter/number_of_numa_nodes.rb | 4 + .../parser/functions/check_grub_config.rb | 34 + .../src/modules/platform/manifests/compute.pp | 246 +++ .../templates/compute_extend.conf.erb | 12 + sysinv/sysinv/sysinv/sysinv/agent/manager.py | 47 +- sysinv/sysinv/sysinv/sysinv/agent/node.py | 329 ++-- .../sysinv/sysinv/api/controllers/v1/cpu.py | 3 +- .../sysinv/sysinv/api/controllers/v1/host.py | 49 +- .../sysinv/sysinv/sysinv/common/constants.py | 2 + .../sysinv/sysinv/sysinv/conductor/manager.py | 66 +- .../sysinv/sysinv/sysinv/conductor/rpcapi.py | 14 +- sysinv/sysinv/sysinv/sysinv/puppet/base.py | 9 + .../sysinv/sysinv/sysinv/puppet/platform.py | 161 +- 28 files changed, 846 insertions(+), 1728 deletions(-) delete mode 100755 compute-huge/compute-huge/compute-huge.sh delete mode 100644 compute-huge/compute-huge/compute-huge.sh.service delete mode 100644 compute-huge/compute-huge/compute_hugepages_total.conf create mode 100644 puppet-manifests/src/modules/platform/lib/facter/get_cmdline.rb create mode 100644 puppet-manifests/src/modules/platform/lib/facter/is_broadwell_processor.rb create mode 100644 puppet-manifests/src/modules/platform/lib/facter/is_gb_page_supported.rb create mode 100644 puppet-manifests/src/modules/platform/lib/facter/is_hugetlbfs_enabled.rb create mode 100644 puppet-manifests/src/modules/platform/lib/facter/is_per_numa_supported.rb create mode 100644 puppet-manifests/src/modules/platform/lib/facter/is_resctrl_supported.rb create mode 100644 puppet-manifests/src/modules/platform/lib/facter/number_of_logical_cpus.rb create mode 100644 puppet-manifests/src/modules/platform/lib/facter/number_of_numa_nodes.rb create mode 100644 puppet-manifests/src/modules/platform/lib/puppet/parser/functions/check_grub_config.rb create mode 100644 puppet-manifests/src/modules/platform/manifests/compute.pp create mode 100644 puppet-manifests/src/modules/platform/templates/compute_extend.conf.erb diff --git a/compute-huge/centos/compute-huge.spec b/compute-huge/centos/compute-huge.spec index e778b12524..067b3ff4a8 100644 --- a/compute-huge/centos/compute-huge.spec +++ b/compute-huge/centos/compute-huge.spec @@ -35,7 +35,6 @@ Initial compute node hugepages and reserved cpus configuration # compute init scripts install -d -m 755 %{buildroot}%{local_etc_initd} install -p -D -m 755 affine-platform.sh 
%{buildroot}%{local_etc_initd}/affine-platform.sh -install -p -D -m 755 compute-huge.sh %{buildroot}%{local_etc_initd}/compute-huge.sh # utility scripts install -p -D -m 755 cpumap_functions.sh %{buildroot}%{local_etc_initd}/cpumap_functions.sh @@ -53,7 +52,6 @@ install -p -D -m 755 bin/topology %{buildroot}%{local_bindir}/topology # compute config data install -d -m 755 %{buildroot}%{local_etc_nova} install -p -D -m 755 compute_reserved.conf %{buildroot}%{local_etc_nova}/compute_reserved.conf -install -p -D -m 755 compute_hugepages_total.conf %{buildroot}%{local_etc_nova}/compute_hugepages_total.conf # goenabled check install -d -m 755 %{buildroot}%{local_etc_goenabledd} @@ -62,11 +60,9 @@ install -p -D -m 755 compute-huge-goenabled.sh %{buildroot}%{local_etc_goenabled # systemd services install -d -m 755 %{buildroot}%{_unitdir} install -p -D -m 664 affine-platform.sh.service %{buildroot}%{_unitdir}/affine-platform.sh.service -install -p -D -m 664 compute-huge.sh.service %{buildroot}%{_unitdir}/compute-huge.sh.service %post /bin/systemctl enable affine-platform.sh.service >/dev/null 2>&1 -/bin/systemctl enable compute-huge.sh.service >/dev/null 2>&1 %clean rm -rf $RPM_BUILD_ROOT @@ -79,7 +75,5 @@ rm -rf $RPM_BUILD_ROOT %{local_etc_initd}/* %{local_etc_goenabledd}/* %config(noreplace) %{local_etc_nova}/compute_reserved.conf -%config(noreplace) %{local_etc_nova}/compute_hugepages_total.conf -%{_unitdir}/compute-huge.sh.service %{_unitdir}/affine-platform.sh.service diff --git a/compute-huge/compute-huge/affine-platform.sh.service b/compute-huge/compute-huge/affine-platform.sh.service index 43b8567314..7ab9bbe89a 100644 --- a/compute-huge/compute-huge/affine-platform.sh.service +++ b/compute-huge/compute-huge/affine-platform.sh.service @@ -1,7 +1,7 @@ [Unit] Description=Titanium Cloud Affine Platform After=syslog.service network.service dbus.service sw-patch.service -Before=compute-huge.sh.service +Before=computeconfig.service [Service] Type=oneshot diff --git a/compute-huge/compute-huge/compute-huge-goenabled.sh b/compute-huge/compute-huge/compute-huge-goenabled.sh index c4a617b088..fc909a00ca 100644 --- a/compute-huge/compute-huge/compute-huge-goenabled.sh +++ b/compute-huge/compute-huge/compute-huge-goenabled.sh @@ -17,7 +17,7 @@ source "/etc/init.d/log_functions.sh" source "/usr/bin/tsconfig" if [ -e ${VOLATILE_COMPUTE_CONFIG_COMPLETE} -a ! -f ${COMPUTE_HUGE_GOENABLED} ]; then - log_error "compute-huge.sh CPU configuration check failed. Failing goenabled check." + log_error "Compute manifest CPU configuration check failed. Failing goenabled check." exit 1 fi diff --git a/compute-huge/compute-huge/compute-huge.sh b/compute-huge/compute-huge/compute-huge.sh deleted file mode 100755 index 3afcf67034..0000000000 --- a/compute-huge/compute-huge/compute-huge.sh +++ /dev/null @@ -1,1512 +0,0 @@ -#!/bin/bash -################################################################################ -# Copyright (c) 2013-2016 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -################################################################################ -# compute-huge.sh -# - mounts hugepages memory backing for libvirt/qemu and vswitch -# - allocates per-NUMA node hugepages values based on compute node -# topology and memory engineered parameters. -# - IMPORTANT: mount of hugetlbfs must be called after udev is -# initialized, otherwise libvirt/qemu will not properly recognize -# the mount as HugeTLBFS. 
-# - generates /etc/nova/compute_extend.conf which nova-compute reads on init -# - updates grub.conf kernel boot arg parameters based on hugepages and cores - -. /usr/bin/tsconfig - -# Enable the 'extglob' feature to allow grouping in pattern matching -shopt -s extglob - -# Utility functions -LOG_FUNCTIONS=${LOG_FUNCTIONS:-"/etc/init.d/log_functions.sh"} -CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"} -source /etc/init.d/functions -[[ -e ${LOG_FUNCTIONS} ]] && source ${LOG_FUNCTIONS} -[[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS} - -# Configuration -PRODUCT_NAME=$(dmidecode --string 'system-product-name' 2>/dev/null) -RESERVE_CONF=${RESERVE_CONF:-"/etc/nova/compute_reserved.conf"} -VSWITCH_CONF=${VSWITCH_CONF:-"/etc/vswitch/vswitch.conf"} -linkname=$(readlink -n -f $0) -scriptname=$(basename $linkname) - -# Enable debug logs (uncomment) -LOG_DEBUG=1 - -# Flag file that is touched to signal that it is safe to enable the board -COMPUTE_HUGE_GOENABLED="/var/run/compute_huge_goenabled" - -# Flag file that is touched to signal that compute-huge has run at least once -COMPUTE_HUGE_RUN_ONCE="/etc/platform/.compute_huge_run_once" - -# Flag file that is touched to indicate that hei host needs a reboot to finish the config -RECONFIG_REBOOT_REQUIRED="/var/run/.reconfig_reboot_required" - -# Grub configuration files -GRUB_DEFAULTS=/etc/default/grub -if [ -f /etc/centos-release ] ; then - GRUB=grub2-mkconfig - if [ -d /sys/firmware/efi ] ; then - GRUB_CONFIG=/boot/efi/EFI/centos/grub.cfg - else - GRUB_CONFIG=/boot/grub2/grub.cfg - fi -else - GRUB=grub-mkconfig - GRUB_CONFIG=/boot/grub/grub.cfg -fi - -# Various globals -declare -i N_CPUS=1 -declare -i N_SOCKETS=1 -declare -i N_SIBLINGS_IN_PKG=1 -declare -i N_CORES_IN_PKG=1 -declare -i N_THREADS=1 -declare -i N_NUMA=1 -declare -i MEMTOTAL_MiB=0 -declare -i do_huge=1 -declare -i is_reconfig=0 - -# Disable Broadwell kvm-intel.eptad flag to prevent kernel oops/memory issues. -declare BROADWELL_EPTAD="0" # Broadwell flag kvm-intel.eptad (0=disable, 1=enable) - -# NOTE: cgroups currently disabled - this was previously working with DEV 0001, -# however we now get write permission errors. cgroups is supported by libvirt -# to give domain accounting, but is optional. Likely need to re-enable this to -# support performance measurements. -declare -i do_cgroups=0 - -# Ensure that first configuration doesn't contain stale info, -# clear these fields prior to reading config file. -if [ ! -f ${COMPUTE_HUGE_RUN_ONCE} ]; then - sed -i "s#^COMPUTE_VM_MEMORY_2M=.*\$#COMPUTE_VM_MEMORY_2M=\(\)#" ${RESERVE_CONF} - sed -i "s#^COMPUTE_VM_MEMORY_1G=.*\$#COMPUTE_VM_MEMORY_1G=\(\)#" ${RESERVE_CONF} -fi - -# Load configuration files (declare arrays that get sourced) -declare -a COMPUTE_PLATFORM_CORES -declare -a COMPUTE_VSWITCH_CORES -declare -a COMPUTE_VSWITCH_MEMORY -declare -a COMPUTE_VM_MEMORY_2M -declare -a COMPUTE_VM_MEMORY_1G -[[ -e ${RESERVE_CONF} ]] && source ${RESERVE_CONF} -[[ -e ${VSWITCH_CONF} ]] && source ${VSWITCH_CONF} -. /etc/platform/platform.conf - -################################################################################ -# vswitch_cpu_list() - compute the vswitch cpu list, including it's siblings -################################################################################ -function vswitch_cpu_list() { - local CONF_FILE=${VSWITCH_CONF} - local KEY="VSWITCH_CPU_LIST=" - - provision_list=$(curl -sf http://controller:6385/v1/ihosts/${UUID}/icpus/vswitch_cpu_list) - if [ $? 
-eq 0 ]; then - list=`echo ${provision_list} | bc` - grep ${KEY} ${CONF_FILE} > /dev/null - if [ $? -ne 0 ]; then - echo "$KEY\"$list"\" >> ${CONF_FILE} - else - #update vswitch.conf - sed -i "s/^VSWITCH_CPU_LIST=.*/VSWITCH_CPU_LIST=\"${list}\"/" /etc/vswitch/vswitch.conf - fi - else - list=$(get_vswitch_cpu_list) - fi - # Expand vswitch cpulist - vswitch_cpulist=$(expand_sequence ${list} " ") - - cpulist="" - for e in $vswitch_cpulist - do - # claim hyperthread siblings if SMT enabled - SIBLINGS_CPULIST=$(cat /sys/devices/system/cpu/cpu${e}/topology/thread_siblings_list 2>/dev/null) - siblings_cpulist=$(expand_sequence ${SIBLINGS_CPULIST} " ") - for s in $siblings_cpulist - do - in_list ${s} ${cpulist} - if [ $? -eq 1 ] - then - cpulist=$(append_list ${s} ${cpulist}) - fi - done - done - - echo "$cpulist" - return 0 -} - -################################################################################ -# platform_cpu_list() - compute the platform cpu list, including it's siblings -################################################################################ -function platform_cpu_list() { - local CONF_FILE=${RESERVE_CONF} - local KEY="PLATFORM_CPU_LIST=" - - provision_list=$(curl -sf http://controller:6385/v1/ihosts/${UUID}/icpus/platform_cpu_list) - if [ $? -eq 0 ]; then - list=`echo ${provision_list} | bc` - grep ${KEY} ${CONF_FILE} > /dev/null - if [ $? -ne 0 ]; then - echo "$KEY\"$list"\" >> ${CONF_FILE} - else - #update compute_reserved.conf - sed -i "s/^${KEY}.*/${KEY}\"${list}\"/" ${CONF_FILE} - fi - else - list=$(get_platform_cpu_list) - fi - # Expand platform cpulist - platform_cpulist=$(expand_sequence ${list} " ") - - cpulist="" - for e in $platform_cpulist - do - # claim hyperthread siblings if SMT enabled - SIBLINGS_CPULIST=$(cat /sys/devices/system/cpu/cpu${e}/topology/thread_siblings_list 2>/dev/null) - siblings_cpulist=$(expand_sequence ${SIBLINGS_CPULIST} " ") - for s in $siblings_cpulist - do - in_list ${s} ${cpulist} - if [ $? -eq 1 ] - then - cpulist=$(append_list ${s} ${cpulist}) - fi - done - done - - echo "$cpulist" - return 0 -} - -################################################################################ -# check_cpu_configuration() - check that the current state of the CPU (e.g., -# hyperthreading enabled/disabled) matches the expected state that was last -# written to the configuration file. -# -# NOTE: Puppet manifests are generated on unlock via sysinv profile. -# Config file is updated via manifest (cgcs_vswitch_095). -# -################################################################################ -function check_cpu_configuration() { - local CONFIGURED=$(condense_sequence $(expand_sequence ${COMPUTE_CPU_LIST} " ")) - local ACTUAL="0-$((${N_CPUS} - 1))" - local INIT="0-1" - - if [ -z "${CONFIGURED}" -o -z "${ACTUAL}" ]; then - log_error "Unable to compare configured=${CONFIGURED} and actual=${ACTUAL} CPU configurations" - return 2 - fi - - if [ "${CONFIGURED}" == "${INIT}" ]; then - log_debug "CPU configuration init: configured=${CONFIGURED} and actual=${ACTUAL}" - return 0 - fi - - if [ "${CONFIGURED}" != "${ACTUAL}" ]; then - log_error "CPU configurations mismatched: configured=${CONFIGURED} and actual=${ACTUAL}" - return 1 - fi - - return 0 -} - -################################################################################ -# check_kernel_boot_args() - check that the kernel boot arguments are in -# agreement with the current set of logical CPU instances. 
That is, check that -# the hyperthreading state has not changed since the last time we updated our -# grub configuration. -# - check Broadwell kvm-intel.eptad flag is in agreement with current setting -# -################################################################################ -function check_kernel_boot_args() { - local BASE_CPULIST=$1 - local ISOL_CPULIST=$2 - - local BASE_CPUMAP=$(cpulist_to_cpumap ${BASE_CPULIST} ${N_CPUS}) - local RCU_NOCBS_CPUMAP=$(invert_cpumap ${BASE_CPUMAP} ${N_CPUS}) - local RCU_NOCBS_CPULIST=$(cpumap_to_cpulist ${RCU_NOCBS_CPUMAP} ${N_CPUS}) - - ## Query the current boot args and store them in a hash/map for easy access - local CMDLINE=($(cat /proc/cmdline)) - declare -A BOOTARGS - for ITEM in ${CMDLINE[@]}; do - KV=(${ITEM//=/ }) - BOOTARGS[${KV[0]}]=${KV[1]} - done - - ## Audit the attributes that impacts VM scheduling behaviour - if [ "${BOOTARGS[isolcpus]}" != "${ISOL_CPULIST}" ]; then - log_error "Kernel boot argument mismatch: isolcpus=${BOOTARGS[isolcpus]} expecting ${ISOL_CPULIST}" - return 1 - fi - - if [ "${BOOTARGS[rcu_nocbs]}" != "${RCU_NOCBS_CPULIST}" ]; then - log_error "Kernel boot argument mismatch: rcu_nocbs=${BOOTARGS[rcu_nocbs]} expecting ${RCU_NOCBS_CPULIST}" - return 1 - fi - - if [ "${BOOTARGS[kthread_cpus]}" != "${BASE_CPULIST}" ]; then - log_error "Kernel boot argument mismatch: kthread_cpus=${BOOTARGS[kthread_cpus]} expecting ${BASE_CPULIST}" - return 1 - fi - - if [ "${BOOTARGS[irqaffinity]}" != "${BASE_CPULIST}" ]; then - log_error "Kernel boot argument mismatch: irqaffinity=${BOOTARGS[irqaffinity]} expecting ${BASE_CPULIST}" - return 1 - fi - - if grep -q -E "^model\s+:\s+79$" /proc/cpuinfo - then - if [ "${BOOTARGS[kvm-intel.eptad]}" != "${BROADWELL_EPTAD}" ]; then - log_error "Kernel boot argument mismatch: kvm-intel.eptad=${BOOTARGS[kvm-intel.eptad]} expecting ${BROADWELL_EPTAD}" - return 1 - fi - fi - - return 0 -} - -################################################################################ -# update_grub_configuration() - update the grub configuration so that the -# kernel boot arguments are correct on the next reboot. -# -################################################################################ -function update_grub_configuration() { - local BASE_CPULIST=$1 - local ISOL_CPULIST=$2 - - local BASE_CPUMAP=$(cpulist_to_cpumap ${BASE_CPULIST} ${N_CPUS}) - local RCU_NOCBS_CPUMAP=$(invert_cpumap ${BASE_CPUMAP} ${N_CPUS}) - local RCU_NOCBS_CPULIST=$(cpumap_to_cpulist ${RCU_NOCBS_CPUMAP} ${N_CPUS}) - - log "Updating grub configuration:" - - if [ ! -f ${GRUB_DEFAULTS} ]; then - log_error "Missing grub defaults file ${GRUB_DEFAULTS}" - return 1 - fi - - if [ ! 
-f ${GRUB_CONFIG} ]; then - log_error "Missing grub config file ${GRUB_CONFIG}" - return 1 - fi - - source ${GRUB_DEFAULTS} - if [ -z "${GRUB_CMDLINE_LINUX}" ]; then - log_error "Missing grub cmdline variable: GRUB_CMDLINE_LINUX" - return 1 - fi - - ## Remove the arguments that we need to update (or remove) - VALUE="${GRUB_CMDLINE_LINUX//?([[:blank:]])+(kvm-intel.eptad|default_hugepagesz|hugepagesz|hugepages|isolcpus|nohz_full|rcu_nocbs|kthread_cpus|irqaffinity)=+([-,0-9MG])/}" - - ## Add the new argument values - - # Broadwell specific flags (model: 79) - if grep -q -E "^model\s+:\s+79$" /proc/cpuinfo - then - VALUE="${VALUE} kvm-intel.eptad=${BROADWELL_EPTAD}" - fi - if grep -q pdpe1gb /proc/cpuinfo - then - VALUE="${VALUE} hugepagesz=1G hugepages=${N_NUMA}" - fi - VALUE="${VALUE} hugepagesz=2M hugepages=0" - VALUE="${VALUE} default_hugepagesz=2M" - VALUE="${VALUE} isolcpus=${ISOL_CPULIST}" - VALUE="${VALUE} rcu_nocbs=${RCU_NOCBS_CPULIST}" - VALUE="${VALUE} kthread_cpus=${BASE_CPULIST}" - VALUE="${VALUE} irqaffinity=${BASE_CPULIST}" - if [[ "$subfunction" == *"compute,lowlatency" ]]; then - # As force_grub_update() and check_cpu_grub_configuration call this - # function with an ISOL_CPULIST with from lowlatency compute checks we'll - # use it here for the nohz_full option - VALUE="${VALUE} nohz_full=${ISOL_CPULIST}" - fi - - if [ "${VALUE}" == "${GRUB_CMDLINE_LINUX}" ] && - grep -q -e "${GRUB_CMDLINE_LINUX}" /proc/cmdline - then - log_debug "Unchanged cmdline: ${GRUB_CMDLINE_LINUX}" - return 0 - fi - - ## Replace the value in the file and re-run the grub config tool - perl -pi -e 's/(GRUB_CMDLINE_LINUX)=.*/\1=\"'"${VALUE}"'\"/g' ${GRUB_DEFAULTS} - ${GRUB} -o ${GRUB_CONFIG} 2>/dev/null - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to run grub-mkconfig, rc=${RET}" - return 1 - fi - source ${GRUB_DEFAULTS} - if [ -z "${GRUB_CMDLINE_LINUX}" ]; then - log_error "Missing grub cmdline variable: GRUB_CMDLINE_LINUX" - return 1 - else - log_debug "Updated cmdline: ${GRUB_CMDLINE_LINUX}" - fi - sync - - return 0 -} - -################################################################################ -# force_grub_update() - force an update to the grub configuration so that the -# kernel boot arguments are correct on the next reboot. -# -################################################################################ -function force_grub_update() { - log_debug "stop: force_grub_update" - - ## fetch the cpu topology - get_topology - - ## calculate the base and isolation cpu lists - local BASE_CPULIST=$(platform_cpu_list) - local ISOL_CPULIST=$(vswitch_cpu_list) - - if [[ "$subfunction" == *"compute,lowlatency" ]]; then - local BASE_CPUMAP=$(cpulist_to_cpumap ${BASE_CPULIST} ${N_CPUS}) - local RCU_NOCBS_CPUMAP=$(invert_cpumap ${BASE_CPUMAP} ${N_CPUS}) - local RCU_NOCBS_CPULIST=$(cpumap_to_cpulist ${RCU_NOCBS_CPUMAP} ${N_CPUS}) - - ISOL_CPULIST=$RCU_NOCBS_CPULIST - fi - - if [ -z "${ISOL_CPULIST}" ]; then - log_error "isolcpus cpu list is empty" - return 1 - fi - - ## update grub with new settings - update_grub_configuration ${BASE_CPULIST} ${ISOL_CPULIST} - RET=$? - - return ${RET} -} - -################################################################################ -# check_cpu_grub_configuration() - check kernel boot arguments to ensure -# that the current CPU configuration matches the isolation and platform arguments -# passed to the kernel at boot time. 
-# -################################################################################ -function check_cpu_grub_configuration() { - ## calculate the base and isolation cpu lists - local BASE_CPULIST=$(platform_cpu_list) - local ISOL_CPULIST=$(vswitch_cpu_list) - - if [[ "$subfunction" == *"compute,lowlatency" ]]; then - local BASE_CPUMAP=$(cpulist_to_cpumap ${BASE_CPULIST} ${N_CPUS}) - local RCU_NOCBS_CPUMAP=$(invert_cpumap ${BASE_CPUMAP} ${N_CPUS}) - local RCU_NOCBS_CPULIST=$(cpumap_to_cpulist ${RCU_NOCBS_CPUMAP} ${N_CPUS}) - - ISOL_CPULIST=$RCU_NOCBS_CPULIST - fi - - if [ -z "${ISOL_CPULIST}" ]; then - log_error "isolcpus cpu list is empty" - return 1 - fi - - if [ -z "${BASE_CPULIST}" ]; then - log_error "platform cpu list is empty" - return 1 - fi - - ## check that the boot arguments are consistent with the current - ## base/isolation cpu lists - check_kernel_boot_args ${BASE_CPULIST} ${ISOL_CPULIST} - RET=$? - if [ ${RET} -eq 1 ]; then - log_error "Boot args check failed; updating grub configuration" - update_grub_configuration ${BASE_CPULIST} ${ISOL_CPULIST} - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to update grub configuration, rc=${RET}" - return 2 - fi - - return 1 - fi - - return 0 -} - -################################################################################ -# check_configuration() - check system configuration -# -################################################################################ -function check_configuration() { - ## Since script is called multiple times, remove previous flag - rm -f ${COMPUTE_HUGE_GOENABLED} - - if [ -z "${N_CPUS}" ]; then - log_error "N_CPUS environment variable not set" - return 1 - fi - - # Check that the actual CPU configuration matches configured settings - check_cpu_configuration - RET1=$? - if [ ${RET1} -gt 1 ]; then - return ${RET1} - fi - - # Check that CPU isolation and platform configuration has been applied according to the - # current CPU configuration - check_cpu_grub_configuration - RET2=$? - if [ ${RET2} -gt 1 ]; then - return ${RET2} - fi - - RET=$[ ${RET1} + ${RET2} ] - if [ ${RET} -eq 0 ]; then - ## All checks passed; safe to enable - log_debug "compute-huge-goenabled: pass" - touch ${COMPUTE_HUGE_GOENABLED} - elif [ "$nodetype" = "controller" \ - -a ! -f ${COMPUTE_HUGE_RUN_ONCE} \ - -a ! -f ${PLATFORM_SIMPLEX_FLAG} ]; then - touch ${COMPUTE_HUGE_RUN_ONCE} - log_debug "Rebooting to process config changes" - /sbin/reboot - else - log_error "compute-huge-goenabled: failed" - if [ ! -f ${COMPUTE_HUGE_RUN_ONCE} ]; then - touch ${RECONFIG_REBOOT_REQUIRED} - fi - fi - - # Mark when configuration run via compute_config packstack applyscript - if [ ${is_reconfig} -eq 1 ]; then - if [ ! -f ${COMPUTE_HUGE_RUN_ONCE} ]; then - log_debug "check_configuration: config FIRST_RUN" - else - log_debug "check_configuration: config" - fi - touch ${COMPUTE_HUGE_RUN_ONCE} - fi - - return 0 -} - - -################################################################################ -# get_topology() - deduce CPU and NUMA topology -# -################################################################################ -function get_topology() { - # number of logical cpus - N_CPUS=$(cat /proc/cpuinfo 2>/dev/null | \ - awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? n : 1}') - - # number of sockets (i.e. packages) - N_SOCKETS=$(cat /proc/cpuinfo 2>/dev/null | \ - awk '/physical id/ { a[$4] = 1; } END { n=0; for (i in a) n++; print (n>0) ? 
n : 1 }') - - # number of logical cpu siblings per package - N_SIBLINGS_IN_PKG=$(cat /proc/cpuinfo 2>/dev/null | \ - awk '/^siblings/ {n = $3} END { print (n>0) ? n: 1 }') - - # number of cores per package - N_CORES_IN_PKG=$(cat /proc/cpuinfo 2>/dev/null | \ - awk '/^cpu cores/ {n = $4} END { print (n>0) ? n : 1 }') - - # number of SMT threads per core - N_THREADS=$[ $N_SIBLINGS_IN_PKG / $N_CORES_IN_PKG ] - - # number of numa nodes - N_NUMA=$(ls -d /sys/devices/system/node/node* 2>/dev/null | wc -l) - - # Total physical memory - MEMTOTAL_MiB=$(cat /proc/meminfo 2>/dev/null | \ - awk '/^MemTotal/ {n = int($2/1024)} END { print (n>0) ? n : 0 }') - - log_debug "TOPOLOGY: CPUS:${N_CPUS} SOCKETS:${N_SOCKETS}" \ - "SIBLINGS:${N_SIBLINGS_IN_PKG} CORES:${N_CORES_IN_PKG} THREADS:${N_THREADS}" \ - "NODES:${N_NUMA} MEMTOTAL:${MEMTOTAL_MiB} MiB" - - # Get kernel command line options - CMDLINE=$(cat /proc/cmdline 2>/dev/null) - if [[ $CMDLINE =~ (console=.*) ]]; then - log_debug "cmdline: ${BASH_REMATCH[1]}" - fi -} - -################################################################################ -# is_strict() - determine whether we are using strict memory accounting -# -################################################################################ -function is_strict() { - RET=0 - OC_MEM=$(cat /proc/sys/vm/overcommit_memory 2>/dev/null) - if [ ${OC_MEM} -eq 2 ]; then - echo 1 # strict - else - echo 0 # non-strict - fi -} - -################################################################################ -# get_memory() - determine memory breakdown for standard linux memory and -# default hugepages -# -################################################################################ -function get_memory() { - local NODESYSFS=/sys/devices/system/node - local HTLBSYSFS="" - local -i Ki=1024 - local -i Ki2=512 - local -i SZ_2M_Ki=2048 - local -i SZ_1G_Ki=1048576 - - # number of numa nodes - local n_numa=$(ls -d /sys/devices/system/node/node* 2>/dev/null | wc -l) - - # Parse all values of /proc/meminfo - declare -gA meminfo - while read -r line - do - if [[ $line =~ ^([[:alnum:]_]+):[[:space:]]+([[:digit:]]+) ]]; then - meminfo[${BASH_REMATCH[1]}]=${BASH_REMATCH[2]} - fi - done < "/proc/meminfo" - - # Parse all values of /sys/devices/system/node/node*/meminfo - declare -gA memnode - for ((node=0; node < n_numa; node++)) - do - while read -r line - do - if [[ $line =~ ^Node[[:space:]]+[[:digit:]]+[[:space:]]+([[:alnum:]_]+):[[:space:]]+([[:digit:]]+) ]]; then - memnode[$node,${BASH_REMATCH[1]}]=${BASH_REMATCH[2]} - fi - done < "/sys/devices/system/node/node${node}/meminfo" - done - - # Parse all values of /sys/devices/system/node/node*/meminfo_extra - for ((node=0; node < n_numa; node++)) - do - memnode[$node,'MemFreeInit']=${memnode[$node,'MemTotal']} - if [ -f /sys/devices/system/node/node${node}/meminfo_extra ]; then - while read -r line - do - if [[ $line =~ ^Node[[:space:]]+[[:digit:]]+[[:space:]]+([[:alnum:]_]+):[[:space:]]+([[:digit:]]+) ]]; then - memnode[$node,${BASH_REMATCH[1]}]=${BASH_REMATCH[2]} - fi - done < "/sys/devices/system/node/node${node}/meminfo_extra" - fi - done - - # Parse all values of /sys/devices/system/node/node*/hugepages/hugepages-${pgsize}kB - declare -a pgsizes - pgsizes+=(${SZ_2M_Ki}) - pgsizes+=(${SZ_1G_Ki}) - for ((node=0; node < n_numa; node++)) - do - for pgsize in ${pgsizes[@]} - do - memnode[$node,$pgsize,'nr']=0 - memnode[$node,$pgsize,'nf']=0 - done - done - for ((node=0; node < n_numa; node++)) - do - for pgsize in ${pgsizes[@]} - do - 
HTLBSYSFS=${NODESYSFS}/node${node}/hugepages/hugepages-${pgsize}kB - if [ -d ${HTLBSYSFS} ]; then - memnode[$node,$pgsize,'nr']=$(cat ${HTLBSYSFS}/nr_hugepages) - memnode[$node,$pgsize,'nf']=$(cat ${HTLBSYSFS}/free_hugepages) - fi - done - done - - # Calculate available memory - is_strict=$(is_strict) - if [ $is_strict -eq 1 ]; then - strict_msg='strict accounting' - meminfo['Avail']=$[ ${meminfo['CommitLimit']} - ${meminfo['Committed_AS']} ] - else - strict_msg='non-strict accounting' - meminfo['Avail']=$[ ${meminfo['MemFree']} + - ${meminfo['Cached']} + - ${meminfo['Buffers']} + - ${meminfo['SReclaimable']} ] - fi - # Used memory (this includes kernel overhead, so it is a bit bogus) - meminfo['Used']=$[ ${meminfo['MemTotal']} - ${meminfo['Avail']} ] - for ((node=0; node < n_numa; node++)) - do - memnode[${node},'Avail']=$[ ${memnode[$node,'MemFree']} + - ${memnode[$node,'FilePages']} + - ${memnode[$node,'SReclaimable']} ] - memnode[${node},'HTot']=0 - memnode[${node},'HFree']=0 - for pgsize in ${pgsizes[@]} - do - memnode[${node},'HTot']=$[ ${memnode[${node},'HTot']} + - ${pgsize} * ${memnode[$node,${pgsize},'nr']} ] - memnode[${node},'HFree']=$[ ${memnode[${node},'HFree']} + - ${pgsize} * ${memnode[$node,${pgsize},'nf']} ] - done - done - - # Print memory usage summary - log_debug "MEMORY OVERALL: MiB (${strict_msg})" - - # Print overall memory - MEM=$(printf "%6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s" \ - 'Tot' 'Used' 'Free' 'Ca' 'Buf' 'Slab' 'CAS' 'CLim' 'Dirty' 'WBack' 'Active' 'Inact' 'Avail') - log_debug "${MEM}" - MEM=$(printf "%6d %6d %6d %6d %6d %6d %6d %6d %6d %6d %6d %6d %6d" \ - $[ (${meminfo['MemTotal']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Used']} + $Ki2) / $Ki ] \ - $[ (${meminfo['MemFree']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Cached']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Buffers']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Slab']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Committed_AS']} + $Ki2) / $Ki ] \ - $[ (${meminfo['CommitLimit']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Dirty']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Writeback']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Active']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Inactive']} + $Ki2) / $Ki ] \ - $[ (${meminfo['Avail']} + $Ki2) / $Ki ]) - log_debug "${MEM}" - - # Print per-numa node memorybreakdown - log_debug "MEMORY PER-NUMA NODE: MiB" - MEM="" - for ((node=0; node < n_numa; node++)) - do - L=$(printf " %7s %7s %7s %7s" "$node:Init" "$node:Avail" "$node:Htot" "$node:HFree") - MEM="${MEM}${L}" - done - log_debug "${MEM}" - MEM="" - for ((node=0; node < n_numa; node++)) - do - L=$(printf " %7d %7d %7d %7d" \ - $[ (${memnode[$node,'MemFreeInit']} + $Ki2) / $Ki ] \ - $[ (${memnode[$node,'Avail']} + $Ki2) / $Ki ] \ - $[ (${memnode[$node,'HTot']} + $Ki2) / $Ki ] \ - $[ (${memnode[$node,'HFree']} + $Ki2) / $Ki ]) - MEM="${MEM}${L}" - done - log_debug "${MEM}" -} - -################################################################################ -# mount_cgroups() -# - mounts cgroups and all available controllers. -# - cgroup domains used by libvirt/qemu -# -################################################################################ -function mount_cgroups() { - local RET=0 - - # mount /sys/fs/cgroup - log_debug "Mounting cgroups" - mountpoint -q /sys/fs/cgroup || \ - mount -t tmpfs -o uid=0,gid=0,mode=0755 cgroup /sys/fs/cgroup - RET=$? 
- if [ ${RET} -ne 0 ]; then - log_error "Failed to mount cgroups, rc=${RET}" - return ${RET} - fi - - # mount each available cgroup controller - for cnt in $(cat /proc/cgroups | awk '!/#/ {print $1;}') - do - mkdir -p /sys/fs/cgroup/$cnt - mountpoint -q /sys/fs/cgroup/$cnt || \ - (mount -n -t cgroup -o $cnt cgroup /sys/fs/cgroup/$cnt || \ - rmdir /sys/fs/cgroup/$cnt || true) - done - return ${RET} -} - -################################################################################ -# mount_resctrl() -# - mounts resctrl for Cache Allocation Technology -# -################################################################################ -function mount_resctrl() { - local RET=0 - - # mount /sys/fs/resctrl - log_debug "Mounting resctrl" - mountpoint -q /sys/fs/resctrl || \ - mount -t resctrl resctrl /sys/fs/resctrl - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to mount resctrl, rc=${RET}" - return ${RET} - fi - - return ${RET} -} - - -################################################################################ -# Set Power Management QoS resume latency constraints for CPUs. -# The PM QoS resume latency limit is set to shalow C-state for vswitch CPUs. -# All other CPUs are allowed to go to the deepest C-state available. -# -################################################################################ -set_pmqos_policy() { - local RET=0 - - if [[ "$subfunction" == *"compute,lowlatency" ]]; then - ## Set low wakeup latency (shalow C-state) for vswitch CPUs using PM QoS interface - local VSWITCH_CPULIST=$(vswitch_cpu_list) - /bin/bash -c "/usr/bin/set-cpu-wakeup-latency.sh low ${VSWITCH_CPULIST}" 2>/dev/null - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to set low wakeup CPU latency for vswitch CPUs ${VSWITCH_CPULIST}, rc=${RET}" - fi - ## Set high wakeup latency (deep C-state) for non-vswitch CPUs using PM QoS interface - local NON_VSWITCH_CPULIST=$(invert_cpulist ${VSWITCH_CPULIST} ${N_CPUS}) - /bin/bash -c "/usr/bin/set-cpu-wakeup-latency.sh high ${NON_VSWITCH_CPULIST}" 2>/dev/null - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to set high wakeup CPU latency for non-vswitch CPUs ${NON_VSWITCH_CPULIST}, rc=${RET}" - fi - fi - - return ${RET} -} - -################################################################################ -# Mounts virtual hugetlbfs filesystems for each supported page size. -# return: 0 - success; 1 - failure -# -################################################################################ -function mount_hugetlbfs_auto -{ - local SYSFSLIST=($(ls -1d /sys/kernel/mm/hugepages/hugepages-*)) - local SYSFS="" - local RET=0 - - if ! grep -q hugetlbfs /proc/filesystems - then - log_error "hugetlbfs not enabled" - return 1 - fi - - for SYSFS in ${SYSFSLIST[@]}; do - local PGNAME=$(basename $SYSFS) - local PGSIZE=${PGNAME/hugepages-/} - - local HUGEMNT=/mnt/huge-${PGSIZE} - log_debug "Mounting hugetlbfs at: $HUGEMNT" - if [ ! -d ${HUGEMNT} ]; then - mkdir -p ${HUGEMNT} - fi - - grep -q ${HUGEMNT} /proc/mounts || \ - mount -t hugetlbfs -o pagesize=${PGSIZE} none ${HUGEMNT} - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to mount hugetlbfs at ${HUGEMNT}, rc=${RET}" - return ${RET} - fi - done - - return ${RET} -} - -################################################################################ -# Mounts virtual hugetlbfs filesystems for specific supported page size. 
-# param: MNT_HUGE - mount point for hugepages -# param: PGSIZE - pagesize attribute (eg, 2M, 1G) -# return: 0 - success; 1 - failure -# -################################################################################ -function mount_hugetlbfs -{ - local MNT_HUGE=$1 - local PGSIZE=$2 - local RET=0 - log_debug "Mounting hugetlbfs at: $MNT_HUGE" - - if ! grep -q hugetlbfs /proc/filesystems - then - log_error "hugetlbfs not enabled" - return 1 - fi - - mountpoint -q ${MNT_HUGE} - if [ $? -eq 1 ] - then - mkdir -p ${MNT_HUGE} - mount -t hugetlbfs -o pagesize=${PGSIZE} hugetlbfs ${MNT_HUGE} - RET=$? - if [ ${RET} -ne 0 ] - then - log_error "Failed to mount hugetlbfs at ${MNT_HUGE}, rc=${RET}" - return ${RET} - fi - fi - return 0 -} - -################################################################################ -# Allocates a set of HugeTLB pages according to the specified parameters. -# The first parameter specifies the NUMA node (e.g., node0, node1, etc.). -# The second parameter specifies the HugeTLB page size (e.g, 2048kB, -# 1048576kB, etc). -# The third parameter specifies the number of pages for the given page size. -################################################################################ -function allocate_one_pagesize -{ - local NODE=$1 - local PGSIZE=$2 - local PGCOUNT=$3 - local NODESYSFS=/sys/devices/system/node - local HTLBSYSFS="" - local RET=0 - - log_debug "Allocating ${PGCOUNT} HugeTLB pages of ${PGSIZE} on ${NODE}" - - if [ ! -d "${NODESYSFS}" ]; then - ## Single NUMA node - if [ "${NODE}" != "node0" ]; then - log_error "${NODE} is not valid on a single NUMA node system" - return 1 - fi - NODESYSFS=/sys/kernel/mm/ - else - NODESYSFS=${NODESYSFS}/${NODE} - if [ ! -d "${NODESYSFS}" ]; then - log_error "NUMA node ${NODE} does not exist" - return 1 - fi - fi - - HTLBSYSFS=${NODESYSFS}/hugepages/hugepages-${PGSIZE} - if [ ! -d ${HTLBSYSFS} ]; then - log_error "No HugeTLB support for ${PGSIZE} pages on ${NODE}" - return 1 - fi - - ## Request pages - echo ${PGCOUNT} > ${HTLBSYSFS}/nr_hugepages - RET=$? - if [ ${RET} -ne 0 ] - then - log_error "Failed to allocate ${PGCOUNT} pages on ${HTLBSYSFS}, rc=${RET}" - return ${RET} - fi - - return ${RET} -} - -################################################################################ -# Allocates HugeTLB memory according to the attributes specified in the -# parameter list. The first parameters is expected to be a reference to an -# array rather than the actual contents of an array. -# -# Each element of the array is expected to be in the following format. -# "::" -# For example, -# ("node0:2048kB:256" "node0:1048576kB:2") -# -################################################################################ -function allocate_hugetlb_memory -{ - local MEMLIST=("${!1}") - local MEMDESC="" - local ARRAY="" - local RET=0 - - ## Reserve memory for each node + pagesize - for MEMDESC in ${MEMLIST[@]} - do - ARRAY=(${MEMDESC//:/ }) - if [ ${#ARRAY[@]} -ne 3 ]; then - log_error "Invalid element format ${MEMDESC}, expecting 'node:pgsize:pgcount'" - return 1 - fi - - NODE=${ARRAY[0]} - PGSIZE=${ARRAY[1]} - PGCOUNT=${ARRAY[2]} - allocate_one_pagesize ${NODE} ${PGSIZE} ${PGCOUNT} - RET=$? 
- if [ ${RET} -ne 0 ]; then - log_error "Failed to setup HugeTLB for ${NODE}:${PGSIZE}:${PGCOUNT}, rc=${RET}" - return ${RET} - fi - done - - return 0 -} - -################################################################################ -# per_numa_resources() -# - mounts and allocates hugepages for Compute node libvirt -# - hugepage requirements are calculated per NUMA node -# based on engineering of BASE and VSWITCH -# - it is assumed this is done very early in init to prevent fragmentation -# - calculates reserved cpulists for BASE and vswitch -# -################################################################################ -function per_numa_resources() { - local err=0 - local NODESYSFS=/sys/devices/system/node - local HTLBSYSFS="" - local node - - do_huge=${do_huge:-1} - - log_debug "Setting per-NUMA resources: ${PRODUCT_NAME}" - - # Check for per-node NUMA topology - NODESYSFS0=${NODESYSFS}/node0 - if [ ! -d "${NODESYSFS0}" ]; then - log_error "NUMA node0 does not exist" - return 1 - fi - - # Check that we have support for 2MB hugepages - if [ ${do_huge} -eq 1 ] - then - node=0 - pgsize=2048 - HTLBSYSFS=${NODESYSFS}/node${node}/hugepages/hugepages-${pgsize}kB - if [ ! -d ${HTLBSYSFS} ]; then - do_huge=0 - log_error "No HugeTLB support for ${pgsize}kB pages on node${node}, do_huge=0" - fi - fi - - # Workaround: customize /etc/nova/rootwrap.d/ - ROOTWRAP=/etc/nova/rootwrap.d - FILTER=${ROOTWRAP}/compute-extend.filters - mkdir -p ${ROOTWRAP} - PERM=$(stat --format=%a ${ROOTWRAP}) - chmod 755 ${ROOTWRAP} - : > ${FILTER} - echo "# nova-rootwrap command filters for compute nodes" >> ${FILTER} - echo "# This file should be owned by (and only-writeable by) the root user" >> ${FILTER} - echo "[Filters]" >> ${FILTER} - echo "cat: CommandFilter, cat, root" >> ${FILTER} - echo "taskset: CommandFilter, taskset, root" >> ${FILTER} - chmod ${PERM} ${ROOTWRAP} - - # Minimally need 1GB for compute in VirtualBox - declare -i compute_min_MB=1600 - declare -i compute_min_non0_MB=500 - - # Minimally need 6GB for controller in VirtualBox - declare -i controller_min_MB=6000 - - # Some constants - local -i Ki=1024 - local -i Ki2=512 - local -i SZ_4K_Ki=4 - local -i SZ_2M_Ki=2048 - local -i SZ_1G_Ki=1048576 - - # Declare memory page sizes - declare -A pgsizes - pgsizes[${SZ_4K_Ki}]='4K' - pgsizes[${SZ_2M_Ki}]='2M' - pgsizes[${SZ_1G_Ki}]='1G' - - # Declare per-numa memory storage - declare -A do_manual - declare -A tot_memory - declare -A base_memory - declare -A vs_pages - declare -A vm_pages - declare -A max_vm_pages - for ((node=0; node < N_NUMA; node++)) - do - do_manual[$node]=0 - tot_memory[$node]=0 - base_memory[$node]=0 - for pgsize in "${!pgsizes[@]}" - do - vm_pages[${node},${pgsize}]=0 - max_vm_pages[${node},${pgsize}]=0 - vs_pages[${node},${pgsize}]=0 - done - done - - # Track vswitch hugepages. Note that COMPUTE_VSWITCH_MEMORY is defined in - # /etc/nova/compute_reserved.conf . - for MEMDESC in ${COMPUTE_VSWITCH_MEMORY[@]} - do - ARRAY=(${MEMDESC//:/ }) - if [ ${#ARRAY[@]} -ne 3 ]; then - log_error "Invalid element format ${MEMDESC}, expecting 'node:pgsize:pgcount'" - return 1 - fi - node=${ARRAY[0]#node} - pgsize=${ARRAY[1]%kB} - pgcount=${ARRAY[2]} - if [ ${node} -ge ${N_NUMA} ]; then - continue - fi - HTLBSYSFS=${NODESYSFS}/node${node}/hugepages/hugepages-${pgsize}kB - if [ ! 
-d ${HTLBSYSFS} ]; then - log_debug "SKIP: No HugeTLB support for ${pgsize}kB pages on node${node}" - continue - fi - - # Keep track of vswitch pages (we'll add them back in later) - vs_pages[${node},${pgsize}]=$[ ${vs_pages[${node},${pgsize}]} + $pgcount ] - done - - # Track total VM memory. Note that COMPUTE_VM_MEMORY_2M and - # COMPUTE_VM_MEMORY_1G is defined in /etc/nova/compute_reserved.conf . - for MEMDESC in ${COMPUTE_VM_MEMORY_2M[@]} ${COMPUTE_VM_MEMORY_1G[@]} - do - ARRAY=(${MEMDESC//:/ }) - if [ ${#ARRAY[@]} -ne 3 ]; then - log_debug "Invalid element format ${MEMDESC}, expecting 'node:pgsize:pgcount'" - break - fi - node=${ARRAY[0]#node} - pgsize=${ARRAY[1]%kB} - pgcount=${ARRAY[2]} - if [ ${node} -ge ${N_NUMA} ]; then - continue - fi - HTLBSYSFS=${NODESYSFS}/node${node}/hugepages/hugepages-${pgsize}kB - if [ ! -d ${HTLBSYSFS} ]; then - log_debug "SKIP: No HugeTLB support for ${pgsize}kB pages on node${node}" - continue - fi - - # Cumulate total VM memory - do_manual[${node}]=1 - vm_pages[${node},${pgsize}]=$[ ${vm_pages[${node},${pgsize}]} + $pgcount ] - done - - # Track base reserved cores and memory. Note that COMPUTE_BASE_RESERVED is - # defined in /etc/nova/compute_reserved.conf . - for MEMDESC in ${COMPUTE_BASE_RESERVED[@]} - do - ARRAY=(${MEMDESC//:/ }) - if [ ${#ARRAY[@]} -ne 3 ]; then - log_error "Invalid element format ${MEMDESC}, expecting 'node:memory:cores'" - return 1 - fi - local -i node=${ARRAY[0]#node} - local -i memory=${ARRAY[1]%MB} - local -i cores=${ARRAY[2]} - - # On small systems, clip memory overhead to more reasonable minimal - # settings in the case sysinv hasn't set run yet. - INIT_MiB=$[ (${memnode[${node},'MemFreeInit']} + ${Ki2}) / ${Ki} ] - MEMFREE=$[ ${INIT_MiB} - ${memory} ] - if [ ${MEMFREE} -lt 1000 ]; then - if [ ${node} -eq 0 ]; then - memory=${compute_min_MB} - if [ "$nodetype" = "controller" ]; then - ((memory += controller_min_MB)) - fi - else - memory=${compute_min_non0_MB} - fi - fi - - base_memory[$node]=$memory - done - - # Declare array to store hugepage allocation info - declare -a HUGE_MEMORY - declare -a VM_MEMORY_2M - declare -a VM_MEMORY_1G - HUGE_MEMORY=() - VM_MEMORY_2M=() - VM_MEMORY_1G=() - - # Calculate memory breakdown for this numa node - for ((node=0; node < N_NUMA; node++)) - do - # Top-down memory calculation: - # NODE_TOTAL_MiB = MemFreeInit - if [ -f /sys/devices/system/node/node${node}/meminfo_extra ]; then - NODE_TOTAL_INIT_MiB=$(grep MemFreeInit \ - /sys/devices/system/node/node${node}/meminfo_extra | \ - awk '{printf "%d", ($4+512)/1024;}') - else - NODE_TOTAL_INIT_MiB=$(grep MemTotal \ - /sys/devices/system/node/node${node}/meminfo | \ - awk '{printf "%d", ($4+512)/1024;}') - fi - - # Bottom-up memory calculation (total hugepages + usable linux mem) - # NODE_TOTAL_MiB = HTOT + (AVAIL + PSS) - HTOT_MiB=$[ (${memnode[${node},'HTot']} + ${Ki2}) / ${Ki} ] - AVAIL_MiB=$[ (${memnode[${node},'Avail']} + ${Ki2}) / ${Ki} ] - if [ $node -eq 0 ]; then - # Assume calling this when VMs not launched, so assume numa 0 - PSS_MiB=$(cat /proc/*/smaps 2>/dev/null | \ - awk '/^Pss:/ {a += $2;} END {printf "%d\n", a/1024.0;}') - else - PSS_MiB=0 - fi - NODE_TOTAL_MiB=$[ ${HTOT_MiB} + ${AVAIL_MiB} + ${PSS_MiB} ] - tot_memory[${node}]=${NODE_TOTAL_MiB} - - # Engineered amount of memory for vswitch plus VMs. 
- ENG_MiB=$[ ${NODE_TOTAL_MiB} - ${base_memory[$node]} ] - if [ ${ENG_MiB} -lt 0 ]; then - ENG_MiB=0 - fi - - # Amount of memory left for VMs - VM_MiB=$[ ${ENG_MiB} - - ${SZ_2M_Ki} * ${vs_pages[$node,${SZ_2M_Ki}]} / ${Ki} - - ${SZ_1G_Ki} * ${vs_pages[$node,${SZ_1G_Ki}]} / ${Ki} ] - - # Prevent allocating hugepages if host is too small - if [ ${do_huge} -eq 0 -o $VM_MiB -le 16 ] - then - VM_MiB=0 - log_error "insufficient memory on node $node to allocate hugepages" - fi - - # Maximize use of 2M pages if not using pre-determined 2M and 1G pages. - if [ ${do_manual[${node}]} -ne 1 ]; then - vm_pages[${node},${SZ_2M_Ki}]=$[ ${Ki} * ${VM_MiB} / ${SZ_2M_Ki} / 16 * 16 ] - fi - - # Calculate remaining memory as 4K pages - vm_pages[${node},${SZ_4K_Ki}]=$[ (${Ki} * ${VM_MiB} - - ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]} - - ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]}) / ${SZ_4K_Ki} ] - min_4K=$[ 32 * ${Ki} / ${SZ_4K_Ki} ] - if [ ${vm_pages[${node},${SZ_4K_Ki}]} -lt ${min_4K} ]; then - vm_pages[${node},${SZ_4K_Ki}]=0 - fi - - # Sanity check - # The memory pages specifed in the $RESERVE_CONF file should not - # exceed the available memory in the system. Validate the values by - # calculating the memory required for specified pages, and comparing - # with available memory. - # - # We will override configured pages if the specified values are out of - # range. Note that we do not expect this to happen (unless a DIMM - # fails, or some other error) as we check available pages before - # allowing user to change allocated pages. - local requested_VM_MiB=$[ - ${SZ_4K_Ki} * ${vm_pages[${node},${SZ_4K_Ki}]} / ${Ki} - + ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]} / ${Ki} - + ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]} / ${Ki} ] - - if [ ${requested_VM_MiB} -gt ${VM_MiB} ]; then - - # We're over comitted - clamp memory usage to actual available - # memory. In addition to the log files, we also want to output - # to console - log_error "Over-commited VM memory: " \ - "Requested ${requested_VM_MiB} MiB through ${RESERVE_CONF} " \ - "but ${VM_MiB} MiB available." - - # Reduce 1G pages to the max number that will fit (leave 1G pages - # unchanged if it's already small enough) - if [ $[ ${VM_MiB} * ${Ki} / ${SZ_1G_Ki} ] -lt \ - ${vm_pages[${node},${SZ_1G_Ki}]} ]; then - vm_pages[${node},${SZ_1G_Ki}]=$[ ${VM_MiB} * ${Ki} / ${SZ_1G_Ki} ] - fi - - # Calculate the 2M pages based on amount of memory left over after - # 1G pages accounted for - vm_pages[${node},${SZ_2M_Ki}]=$[ (${Ki} * ${VM_MiB} - - ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]}) - / ${SZ_2M_Ki} / 16 * 16 ] - - # Anything left over is 4K pages - vm_pages[${node},${SZ_4K_Ki}]=$[ (${Ki} * ${VM_MiB} - - ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]} - - ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]}) / ${SZ_4K_Ki} ] - - if [ ${vm_pages[${node},${SZ_4K_Ki}]} -lt ${min_4K} ]; then - vm_pages[${node},${SZ_4K_Ki}]=0 - fi - - requested_VM_MiB=$[ - ${SZ_4K_Ki} * ${vm_pages[${node},${SZ_4K_Ki}]} / ${Ki} - + ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]} / ${Ki} - + ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]} / ${Ki} ] - log_error "VM memory reduced to ${requested_VM_MiB} MiB " \ - "using ${vm_pages[${node},${SZ_1G_Ki}]} 1G pages and " \ - "${vm_pages[${node},${SZ_2M_Ki}]} 2M pages" - fi - - # Calculate total hugepages to be allocated. Setting HUGE_MEMORY will - # reset nr_hugepages. Always set values even if 0. 
- if grep -q pdpe1gb /proc/cpuinfo - then - pages_1G=$[ ${vm_pages[${node},${SZ_1G_Ki}]} + ${vs_pages[${node},${SZ_1G_Ki}]} ] - HUGE_MEMORY+=("node${node}:${SZ_1G_Ki}kB:${pages_1G}") - pages_1G=$[ ${vm_pages[${node},${SZ_1G_Ki}]} ] - VM_MEMORY_1G+=("node${node}:${SZ_1G_Ki}kB:${pages_1G}") - fi - pages_2M=$[ ${vm_pages[${node},${SZ_2M_Ki}]} + ${vs_pages[${node},${SZ_2M_Ki}]} ] - HUGE_MEMORY+=("node${node}:${SZ_2M_Ki}kB:${pages_2M}") - pages_2M=$[ ${vm_pages[${node},${SZ_2M_Ki}]} ] - VM_MEMORY_2M+=("node${node}:${SZ_2M_Ki}kB:${pages_2M}") - - # Calculate maximum possible VM pages of a given pagesize - max_vm_pages[${node},${SZ_2M_Ki}]=$[ ${Ki} * ${VM_MiB} / ${SZ_2M_Ki} / 16 * 16 ] - max_vm_pages[${node},${SZ_1G_Ki}]=$[ ${Ki} * ${VM_MiB} / ${SZ_1G_Ki} ] - - # Calculate a few things to print out - max_2M=${max_vm_pages[${node},${SZ_2M_Ki}]} - max_1G=${max_vm_pages[${node},${SZ_1G_Ki}]} - vm_4K_MiB=$[ ${SZ_4K_Ki} * ${vm_pages[${node},${SZ_4K_Ki}]} / ${Ki} ] - vm_2M_MiB=$[ ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]} / ${Ki} ] - vm_1G_MiB=$[ ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]} / ${Ki} ] - vs_2M_MiB=$[ ${SZ_2M_Ki} * ${vs_pages[${node},${SZ_2M_Ki}]} / ${Ki} ] - vs_1G_MiB=$[ ${SZ_1G_Ki} * ${vs_pages[${node},${SZ_1G_Ki}]} / ${Ki} ] - log_debug "Memory: node:${node}, TOTAL:${NODE_TOTAL_MiB} MiB," \ - "INIT:${NODE_TOTAL_INIT_MiB} MiB," \ - "AVAIL:${AVAIL_MiB} MiB, PSS:${PSS_MiB} MiB," \ - "HTOT:${HTOT_MiB} MiB" - log_debug "Memory: node:${node}," \ - "ENG:${ENG_MiB} MiB, VM:${VM_MiB} MiB," \ - "4K:${vm_4K_MiB} MiB, 2M:${vm_2M_MiB} MiB, 1G:${vm_1G_MiB} MiB," \ - "manual-set:${do_manual[$node]}" - log_debug "Memory: node:${node}," \ - "max: 2M:${max_2M} pages, 1G:${max_1G} pages" - log_debug "Memory: node:${node}," \ - "vswitch: 2M:${vs_2M_MiB} MiB, 1G:${vs_1G_MiB} MiB;" \ - "BASE:${base_memory[$node]} MiB reserved" - done - - # Summarize overall lists and hugetlb - log_debug "compute_hugetlb: ${HUGE_MEMORY[@]}" - - # Write out maximum possible hugepages of each type and total memory - max_2M=""; max_1G=""; tot_MiB="" - for ((node=0; node < N_NUMA; node++)) - do - max_2M=$(append_list ${max_vm_pages[${node},${SZ_2M_Ki}]} ${max_2M}) - max_1G=$(append_list ${max_vm_pages[${node},${SZ_1G_Ki}]} ${max_1G}) - tot_MiB=$(append_list ${tot_memory[${node}]} ${tot_MiB}) - done - CONF=/etc/nova/compute_hugepages_total.conf - echo "# Compute total possible hugepages to allocate (generated: do not modify)" > ${CONF} - echo "compute_hp_total_2M=${max_2M}" >> ${CONF} - echo "compute_hp_total_1G=${max_1G}" >> ${CONF} - echo "compute_total_MiB=${tot_MiB}" >> ${CONF} - echo "" >> ${CONF} - - # Write out extended nova compute options; used with nova accounting. 
- CONF=/etc/nova/compute_extend.conf - echo "# Compute extended nova options (generated: do not modify)" > ${CONF} - - # memory allocations of each type - vs_2M=""; vs_1G=""; vm_4K=""; vm_2M=""; vm_1G="" - for ((node=0; node < N_NUMA; node++)) - do - vs_2M=$(append_list ${vs_pages[${node},${SZ_2M_Ki}]} ${vs_2M}) - vs_1G=$(append_list ${vs_pages[${node},${SZ_1G_Ki}]} ${vs_1G}) - vm_4K=$(append_list ${vm_pages[${node},${SZ_4K_Ki}]} ${vm_4K}) - vm_2M=$(append_list ${vm_pages[${node},${SZ_2M_Ki}]} ${vm_2M}) - vm_1G=$(append_list ${vm_pages[${node},${SZ_1G_Ki}]} ${vm_1G}) - done - echo "# memory options" >> ${CONF} - echo "compute_vswitch_2M_pages=${vs_2M}" >> ${CONF} - echo "compute_vswitch_1G_pages=${vs_1G}" >> ${CONF} - echo "compute_vm_4K_pages=${vm_4K}" >> ${CONF} - echo "compute_vm_2M_pages=${vm_2M}" >> ${CONF} - echo "compute_vm_1G_pages=${vm_1G}" >> ${CONF} - echo "" >> ${CONF} - - # Allocate hugepages of each pgsize for each NUMA node - if [ ${do_huge} -eq 1 ]; then - allocate_hugetlb_memory HUGE_MEMORY[@] - - # Write out current hugepages to configuration file, - # keeping each individual array element quoted. - q=(); for e in "${VM_MEMORY_2M[@]}"; do q+="\"${e}\" "; done - r="${q[@]}"; r="${r%"${r##*[![:space:]]}"}" - sed -i "s#^COMPUTE_VM_MEMORY_2M=.*\$#COMPUTE_VM_MEMORY_2M=\($r\)#" ${RESERVE_CONF} - - q=(); for e in "${VM_MEMORY_1G[@]}"; do q+="\"${e}\" "; done - r="${q[@]}"; r="${r%"${r##*[![:space:]]}"}" - sed -i "s#^COMPUTE_VM_MEMORY_1G=.*\$#COMPUTE_VM_MEMORY_1G=\($r\)#" ${RESERVE_CONF} - fi -} - -################################################################################ -# Start/Setup all Compute node resources -# - Enabled a performance boost by mounting HugeTLBFS. -# This reduces TLB entries, hence reduces processor cache-thrash. -# - Allocates aggregate nr_hugepages per NUMA node. -# - Mounts cgroups . -# -################################################################################ -function start_compute() { - local RET=0 - log_debug "start_compute" - - # Flush page cache - sync; echo 3 > /proc/sys/vm/drop_caches - - # Determine cpu topology - get_topology - - # Determine memory breakdown - get_memory - - check_configuration - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to check configuration, rc=${RET}" - return ${RET} - fi - - # Mount HugeTLBFS for vswitch and libvirt - mount_hugetlbfs_auto - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to auto mount HugeTLB filesystem(s), rc=${RET}" - return ${RET} - fi - - # Check that 2MB hugepages are available for libvirt - MOUNT=/mnt/huge-2048kB - mountpoint -q $MOUNT - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to mount 2048kB HugeTLB pages for libvirt, rc=${RET}, disabling huge" - do_huge=0 - fi - - # Calculate aggregate hugepage memory requirements for vswitch + libvirt. - # Set nr_hugepages per NUMA node. - per_numa_resources - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to allocate sufficient resources, rc=${RET}" - return ${RET} - fi - - # Mount cgroups to take advantage of per domain accounting. - if [ ${do_cgroups} -eq 1 ]; then - mount_cgroups - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to mount cgroups, rc=${RET}" - return ${RET} - fi - fi - - # Mount resctrl to allow Cache Allocation Technology per VM - RESCTRL=/sys/fs/resctrl - if [ -d $RESCTRL ]; then - mount_resctrl - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to mount resctrl, rc=${RET}" - return ${RET} - fi - fi - - # Set Power Management QoS resume latency constraints for all CPUs. 
- set_pmqos_policy - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to set Power Management QoS policy, rc=${RET}" - return ${RET} - fi - - # Disable IRQ balance service - IRQBALANCED=/etc/init.d/irqbalanced - if [ -x ${IRQBALANCED} ]; then - ${IRQBALANCED} stop &> /dev/null - RET=$? - if [ ${RET} -ne 0 ]; then - log_error "Failed to stop IRQ balance service, rc=${RET}" - return ${RET} - fi - fi - - return ${RET} -} - -################################################################################ -# Start Action -################################################################################ -function start() { - local RET=0 - echo -n "Starting ${scriptname}: " - - # COMPUTE Node related setup - if [ -x /etc/init.d/nova-compute ] - then - start_compute - RET=$? - fi - - print_status ${RET} - return ${RET} -} - -################################################################################ -# Stop Action -################################################################################ -function stop -{ - local RET=0 - echo -n "Stopping ${scriptname}: " - - force_grub_update - RET=$? - - print_status ${RET} - return ${RET} -} - - -################################################################################ -# Restart Action -################################################################################ -function restart() { - stop - start -} - -################################################################################ -# Main Entry -# -################################################################################ -case "$1" in -start) - start - ;; -stop) - stop - ;; -restart|reload) - is_reconfig=1 - restart - ;; -status) - echo -n "OK" - ;; -*) - echo $"Usage: $0 {start|stop|restart|reload|status}" - exit 1 -esac - -exit $? diff --git a/compute-huge/compute-huge/compute-huge.sh.service b/compute-huge/compute-huge/compute-huge.sh.service deleted file mode 100644 index a4ce0d91e8..0000000000 --- a/compute-huge/compute-huge/compute-huge.sh.service +++ /dev/null @@ -1,14 +0,0 @@ -[Unit] -Description=Titanium Cloud Compute Huge -After=syslog.service network.service affine-platform.sh.service sw-patch.service -Before=sshd.service sw-patch-agent.service sysinv-agent.service - -[Service] -Type=oneshot -RemainAfterExit=yes -ExecStart=/etc/init.d/compute-huge.sh start -ExecStop=/etc/init.d/compute-huge.sh stop -ExecReload=/etc/init.d/compute-huge.sh restart - -[Install] -WantedBy=multi-user.target diff --git a/compute-huge/compute-huge/compute_hugepages_total.conf b/compute-huge/compute-huge/compute_hugepages_total.conf deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/puppet-manifests/src/bin/puppet-manifest-apply.sh b/puppet-manifests/src/bin/puppet-manifest-apply.sh index 3774de15c3..c98ac74883 100755 --- a/puppet-manifests/src/bin/puppet-manifest-apply.sh +++ b/puppet-manifests/src/bin/puppet-manifest-apply.sh @@ -52,7 +52,23 @@ rm -rf ${PUPPET_TMP} mkdir -p ${PUPPET_TMP}/hieradata cp /etc/puppet/hieradata/global.yaml ${PUPPET_TMP}/hieradata/global.yaml cp /etc/puppet/hieradata/${PERSONALITY}.yaml ${PUPPET_TMP}/hieradata/personality.yaml -cp -f ${HIERADATA}/${HOST}.yaml ${PUPPET_TMP}/hieradata/host.yaml + +# When the compute node is first booted and goes online, sysinv-agent reports +# host CPU inventory which triggers the first runtime manifest apply that updates +# the grub. At this time, copying the host file failed due to a timing issue that +# has not yet been fully understood. Subsequent retries worked. 
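+# As a workaround, the copy below is retried up to three times, 15 seconds
+# apart, logging each failure before the next attempt.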
+if [ "${PERSONALITY}" = "compute" ]; then + n=0 + until [ $n -ge 3 ] + do + cp -f ${HIERADATA}/${HOST}.yaml ${PUPPET_TMP}/hieradata/host.yaml && break + n=$[$n+1] + logger -t $0 "Failed to copy /etc/puppet/hieradata/${HOST}.yaml" + sleep 15 + done +else + cp -f ${HIERADATA}/${HOST}.yaml ${PUPPET_TMP}/hieradata/host.yaml +fi cp -f ${HIERADATA}/system.yaml \ ${HIERADATA}/secure_system.yaml \ ${HIERADATA}/static.yaml \ diff --git a/puppet-manifests/src/manifests/compute.pp b/puppet-manifests/src/manifests/compute.pp index 52f4c2e2a8..46226316d1 100644 --- a/puppet-manifests/src/manifests/compute.pp +++ b/puppet-manifests/src/manifests/compute.pp @@ -13,6 +13,7 @@ include ::platform::sysctl::compute include ::platform::dhclient include ::platform::partitions include ::platform::lvm::compute +include ::platform::compute include ::platform::vswitch include ::platform::network include ::platform::fstab diff --git a/puppet-manifests/src/modules/platform/lib/facter/get_cmdline.rb b/puppet-manifests/src/modules/platform/lib/facter/get_cmdline.rb new file mode 100644 index 0000000000..9074d7fcea --- /dev/null +++ b/puppet-manifests/src/modules/platform/lib/facter/get_cmdline.rb @@ -0,0 +1,5 @@ +# Returns the current boot parameters +Facter.add(:get_cmdline) do + setcode "cat /proc/cmdline 2>/dev/null" +end + diff --git a/puppet-manifests/src/modules/platform/lib/facter/is_broadwell_processor.rb b/puppet-manifests/src/modules/platform/lib/facter/is_broadwell_processor.rb new file mode 100644 index 0000000000..9429a29fbe --- /dev/null +++ b/puppet-manifests/src/modules/platform/lib/facter/is_broadwell_processor.rb @@ -0,0 +1,8 @@ +# Returns true if it is Broadwell processor +# Broadwell specific flags (model: 79) +Facter.add("is_broadwell_processor") do + setcode do + Facter::Core::Execution.exec('grep -q -E "^model\s+:\s+79$" /proc/cpuinfo') + $?.exitstatus == 0 + end +end diff --git a/puppet-manifests/src/modules/platform/lib/facter/is_gb_page_supported.rb b/puppet-manifests/src/modules/platform/lib/facter/is_gb_page_supported.rb new file mode 100644 index 0000000000..122768ce2b --- /dev/null +++ b/puppet-manifests/src/modules/platform/lib/facter/is_gb_page_supported.rb @@ -0,0 +1,7 @@ +# Returns true if one GB pages is supported +Facter.add("is_gb_page_supported") do + setcode do + Facter::Core::Execution.exec('grep -q pdpe1gb /proc/cpuinfo') + $?.exitstatus == 0 + end +end diff --git a/puppet-manifests/src/modules/platform/lib/facter/is_hugetlbfs_enabled.rb b/puppet-manifests/src/modules/platform/lib/facter/is_hugetlbfs_enabled.rb new file mode 100644 index 0000000000..aadada4f7a --- /dev/null +++ b/puppet-manifests/src/modules/platform/lib/facter/is_hugetlbfs_enabled.rb @@ -0,0 +1,7 @@ +# Returns true if hugetlbfs not enabled +Facter.add("is_hugetlbfs_enabled") do + setcode do + Facter::Core::Execution.exec('grep -q hugetlbfs /proc/filesystems') + $?.exitstatus == 0 + end +end diff --git a/puppet-manifests/src/modules/platform/lib/facter/is_per_numa_supported.rb b/puppet-manifests/src/modules/platform/lib/facter/is_per_numa_supported.rb new file mode 100644 index 0000000000..70061a0f82 --- /dev/null +++ b/puppet-manifests/src/modules/platform/lib/facter/is_per_numa_supported.rb @@ -0,0 +1,6 @@ +# Returns true if Resource Control is supported on this node +Facter.add("is_per_numa_supported") do + setcode do + Dir.exist?('/sys/devices/system/node/node0') + end +end diff --git a/puppet-manifests/src/modules/platform/lib/facter/is_resctrl_supported.rb 
diff --git a/puppet-manifests/src/modules/platform/lib/facter/is_resctrl_supported.rb b/puppet-manifests/src/modules/platform/lib/facter/is_resctrl_supported.rb
new file mode 100644
index 0000000000..4a25e065a4
--- /dev/null
+++ b/puppet-manifests/src/modules/platform/lib/facter/is_resctrl_supported.rb
@@ -0,0 +1,6 @@
+# Returns true if Resource Control is supported on this node
+Facter.add("is_resctrl_supported") do
+  setcode do
+    Dir.exist?('/sys/fs/resctrl')
+  end
+end
diff --git a/puppet-manifests/src/modules/platform/lib/facter/number_of_logical_cpus.rb b/puppet-manifests/src/modules/platform/lib/facter/number_of_logical_cpus.rb
new file mode 100644
index 0000000000..652a3db9b0
--- /dev/null
+++ b/puppet-manifests/src/modules/platform/lib/facter/number_of_logical_cpus.rb
@@ -0,0 +1,4 @@
+# Returns number of logical cpus
+Facter.add(:number_of_logical_cpus) do
+  setcode "cat /proc/cpuinfo 2>/dev/null | awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? n : 1}'"
+end
diff --git a/puppet-manifests/src/modules/platform/lib/facter/number_of_numa_nodes.rb b/puppet-manifests/src/modules/platform/lib/facter/number_of_numa_nodes.rb
new file mode 100644
index 0000000000..b8962abf34
--- /dev/null
+++ b/puppet-manifests/src/modules/platform/lib/facter/number_of_numa_nodes.rb
@@ -0,0 +1,4 @@
+# Returns number of numa nodes
+Facter.add(:number_of_numa_nodes) do
+  setcode "ls -d /sys/devices/system/node/node* 2>/dev/null | wc -l"
+end
diff --git a/puppet-manifests/src/modules/platform/lib/puppet/parser/functions/check_grub_config.rb b/puppet-manifests/src/modules/platform/lib/puppet/parser/functions/check_grub_config.rb
new file mode 100644
index 0000000000..c6840432f8
--- /dev/null
+++ b/puppet-manifests/src/modules/platform/lib/puppet/parser/functions/check_grub_config.rb
@@ -0,0 +1,34 @@
+module Puppet::Parser::Functions
+  newfunction(:check_grub_config,
+              :type => :rvalue,
+              :doc => <<-EOD
+    This internal function checks whether a list of arguments is configured
+    in the current boot args based on the input parameters
+
+    EOD
+  ) do |args|
+
+    func_name = "check_grub_config()"
+
+    raise(Puppet::ParseError, "#{func_name}: Requires 1 argument, " +
+      "#{args.size} given") if args.size != 1
+
+    expected = args[0]
+    raise(Puppet::ParseError, "#{func_name}: first argument must be a string") \
+      unless expected.instance_of? String
+
+    # get the current boot args
+    cmd = Facter.value(:get_cmdline)
+    cmd_array = cmd.split()
+
+    value = true
+    expected.split().each do |element|
+      value = cmd_array.include?(element)
+      if value == false
+        Puppet.debug("#{element} is not present in #{cmd}")
+        return value
+      end
+    end
+    value
+  end
+end
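check_grub_config() reduces to a subset test: every expected boot argument must appear verbatim in /proc/cmdline. The same check in Python, as a minimal sketch (not part of the change):

    def check_grub_config(expected, cmdline):
        # True only if every expected token is present on the boot command line
        tokens = set(cmdline.split())
        return all(arg in tokens for arg in expected.split())

    # e.g. check_grub_config("hugepagesz=2M hugepages=0",
    #                        open('/proc/cmdline').read())

The manifest calls it with $grub_updates, so the audit passes only when the kernel booted with all of the engineered arguments.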
diff --git a/puppet-manifests/src/modules/platform/manifests/compute.pp b/puppet-manifests/src/modules/platform/manifests/compute.pp
new file mode 100644
index 0000000000..4846432e6d
--- /dev/null
+++ b/puppet-manifests/src/modules/platform/manifests/compute.pp
@@ -0,0 +1,246 @@
+class platform::compute::grub::params (
+  $n_cpus = '',
+  $cpu_options = '',
+  $m_hugepages = 'hugepagesz=2M hugepages=0',
+  $default_pgsz = 'default_hugepagesz=2M',
+  $keys = ['kvm-intel.eptad', 'default_hugepagesz', 'hugepagesz', 'hugepages', 'isolcpus', 'nohz_full', 'rcu_nocbs', 'kthread_cpus', 'irqaffinity'],
+) {
+
+  if $::is_broadwell_processor {
+    $eptad = 'kvm-intel.eptad=0'
+  } else {
+    $eptad = ''
+  }
+
+  if $::is_gb_page_supported {
+    $gb_hugepages = "hugepagesz=1G hugepages=$::number_of_numa_nodes"
+  } else {
+    $gb_hugepages = ''
+  }
+
+  $grub_updates = strip("${eptad} ${gb_hugepages} ${m_hugepages} ${default_pgsz} ${cpu_options}")
+}
+
+class platform::compute::grub::update
+  inherits ::platform::compute::grub::params {
+
+  notice("Updating grub configuration")
+
+  $to_be_removed = join($keys, " ")
+  exec { "Remove the cpu arguments":
+    command => "grubby --update-kernel=ALL --remove-args='$to_be_removed'",
+  } ->
+  exec { "Add the cpu arguments":
+    command => "grubby --update-kernel=ALL --args='$grub_updates'",
+  }
+}
+
+class platform::compute::grub::recovery {
+
+  notice("Update Grub and Reboot")
+
+  class {'platform::compute::grub::update': } -> Exec['reboot-recovery']
+
+  exec { "reboot-recovery":
+    command => "reboot",
+  }
+}
+
+class platform::compute::grub::audit
+  inherits ::platform::compute::grub::params {
+
+  if ! str2bool($::is_initial_config_primary) {
+
+    notice("Audit CPU and Grub Configuration")
+
+    $expected_n_cpus = $::number_of_logical_cpus
+    $n_cpus_ok = ("$n_cpus" == "$expected_n_cpus")
+
+    $cmd_ok = check_grub_config($grub_updates)
+
+    if $cmd_ok and $n_cpus_ok {
+      $ensure = present
+      notice("CPU and Boot Argument audit passed.")
+    } else {
+      $ensure = absent
+      if !$cmd_ok {
+        notice("Kernel Boot Argument Mismatch")
+        include ::platform::compute::grub::recovery
+      }
+    }
+
+    file { "/var/run/compute_huge_goenabled":
+      ensure => $ensure,
+      owner  => 'root',
+      group  => 'root',
+      mode   => '0644',
+    }
+  }
+}
+
+class platform::compute::grub::runtime {
+  include ::platform::compute::grub::update
+}
+
+# Mounts virtual hugetlbfs filesystems for each supported page size
+class platform::compute::hugetlbf {
+
+  if str2bool($::is_hugetlbfs_enabled) {
+
+    $fs_list = generate("/bin/bash", "-c", "ls -1d /sys/kernel/mm/hugepages/hugepages-*")
+    $array = split($fs_list, '\n')
+    $array.each | String $val | {
+      $page_name = generate("/bin/bash", "-c", "basename $val")
+      $page_size = strip(regsubst($page_name, 'hugepages-', ''))
+      $hugemnt = "/mnt/huge-$page_size"
+      $options = "pagesize=${page_size}"
+
+      notice("Mounting hugetlbfs at: $hugemnt")
+      exec { "create $hugemnt":
+        command => "mkdir -p ${hugemnt}",
+        onlyif  => "test ! -d ${hugemnt}",
+      } ->
+      mount { "${hugemnt}":
+        name     => "${hugemnt}",
+        device   => 'none',
+        fstype   => 'hugetlbfs',
+        ensure   => 'mounted',
+        options  => "${options}",
+        atboot   => 'yes',
+        remounts => true,
+      }
+    }
+  }
+}
+
+class platform::compute::hugepage::params (
+  $nr_hugepages_2M = undef,
+  $nr_hugepages_1G = undef,
+  $vswitch_2M_pages = '',
+  $vswitch_1G_pages = '',
+  $vm_4K_pages = '',
+  $vm_2M_pages = '',
+  $vm_1G_pages = '',
+) {}
+
+
+define allocate_pages (
+  $path,
+  $page_count,
+) {
+  exec { "Allocate ${page_count} ${path}":
+    command => "echo $page_count > $path",
+    onlyif  => "test -f $path",
+  }
+}
+
+# Allocates HugeTLB memory according to the attributes specified in
+# nr_hugepages_2M and nr_hugepages_1G
+class platform::compute::allocate
+  inherits ::platform::compute::hugepage::params {
+
+  # determine the node file system
+  if str2bool($::is_per_numa_supported) {
+    $nodefs = '/sys/devices/system/node'
+  } else {
+    $nodefs = '/sys/kernel/mm'
+  }
+
+  if $nr_hugepages_2M != undef {
+    $nr_hugepages_2M_array = regsubst($nr_hugepages_2M, '[\(\)\"]', '', 'G').split(' ')
+    $nr_hugepages_2M_array.each | String $val | {
+      $per_node_2M = $val.split(':')
+      if size($per_node_2M) == 3 {
+        $node = $per_node_2M[0]
+        $page_size = $per_node_2M[1]
+        allocate_pages { "Start ${node} ${page_size}":
+          path       => "${nodefs}/${node}/hugepages/hugepages-${page_size}/nr_hugepages",
+          page_count => $per_node_2M[2],
+        }
+      }
+    }
+  }
+
+  if $nr_hugepages_1G != undef {
+    $nr_hugepages_1G_array = regsubst($nr_hugepages_1G, '[\(\)\"]', '', 'G').split(' ')
+    $nr_hugepages_1G_array.each | String $val | {
+      $per_node_1G = $val.split(':')
+      if size($per_node_1G) == 3 {
+        $node = $per_node_1G[0]
+        $page_size = $per_node_1G[1]
+        allocate_pages { "Start ${node} ${page_size}":
+          path       => "${nodefs}/${node}/hugepages/hugepages-${page_size}/nr_hugepages",
+          page_count => $per_node_1G[2],
+        }
+      }
+    }
+  }
+}
+
+class platform::compute::extend
+  inherits ::platform::compute::hugepage::params {
+
+  # extended nova compute options, read by nova-compute on init and
+  # used for nova accounting
+  file { "/etc/nova/compute_extend.conf":
+    ensure  => 'present',
+    replace => true,
+    content => template('platform/compute_extend.conf.erb')
+  }
+}
+
+# Mount resctrl to allow Cache Allocation Technology per VM
+class platform::compute::resctrl {
+
+  if str2bool($::is_resctrl_supported) {
+    mount { "/sys/fs/resctrl":
+      name     => '/sys/fs/resctrl',
+      device   => 'resctrl',
+      fstype   => 'resctrl',
+      ensure   => 'mounted',
+      atboot   => 'yes',
+      remounts => true,
+    }
+  }
+}
+
+# Set Power Management QoS resume latency constraints for CPUs.
+# The PM QoS resume latency limit is set to shallow C-state for vswitch CPUs.
+# All other CPUs are allowed to go to the deepest C-state available.
+class platform::compute::pmqos (
+  $low_wakeup_cpus = '',
+  $hight_wakeup_cpus = '',
+) {
+
+  if str2bool($::is_compute_subfunction) and str2bool($::is_lowlatency_subfunction) {
+
+    $script = "/usr/bin/set-cpu-wakeup-latency.sh"
+
+    # Set low wakeup latency (shallow C-state) for vswitch CPUs using PM QoS interface
+    exec { "low-wakeup-latency":
+      command   => "${script} low ${low_wakeup_cpus}",
+      onlyif    => "test -f ${script}",
+      logoutput => true,
+    }
+
+    # Set high wakeup latency (deep C-state) for non-vswitch CPUs using PM QoS interface
+    exec { "high-wakeup-latency":
+      command   => "${script} high ${hight_wakeup_cpus}",
+      onlyif    => "test -f ${script}",
+      logoutput => true,
+    }
+  }
+}
+
+class platform::compute {
+
+  Class[$name] -> Class['::platform::vswitch']
+  Class[$name] -> Class['::nova::compute']
+
+  require ::platform::compute::grub::audit
+  require ::platform::compute::hugetlbf
+  require ::platform::compute::allocate
+  require ::platform::compute::pmqos
+  require ::platform::compute::resctrl
+  require ::platform::compute::extend
+}
diff --git a/puppet-manifests/src/modules/platform/templates/compute_extend.conf.erb b/puppet-manifests/src/modules/platform/templates/compute_extend.conf.erb
new file mode 100644
index 0000000000..d11d1a2e2c
--- /dev/null
+++ b/puppet-manifests/src/modules/platform/templates/compute_extend.conf.erb
@@ -0,0 +1,12 @@
+###########################################################################
+#
+# compute_extend.conf contains compute extended nova options
+#
+# - This file is managed by Puppet. DO NOT EDIT.
+#
+###########################################################################
+compute_vswitch_2M_pages=<%= @vswitch_2M_pages.gsub(/\A"|"\Z/, '') %>
+compute_vswitch_1G_pages=<%= @vswitch_1G_pages.gsub(/\A"|"\Z/, '') %>
+compute_vm_4K_pages=<%= @vm_4K_pages.gsub(/\A"|"\Z/, '') %>
+compute_vm_2M_pages=<%= @vm_2M_pages.gsub(/\A"|"\Z/, '') %>
+compute_vm_1G_pages=<%= @vm_1G_pages.gsub(/\A"|"\Z/, '') %>
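platform::compute::allocate consumes hiera strings shaped like ("node0:2048kB:512" "node1:2048kB:512"). A small Python sketch of how one such value maps onto the sysfs writes performed by the allocate_pages execs (illustrative only; the helper is hypothetical):

    def parse_hugepage_spec(spec, nodefs='/sys/devices/system/node'):
        # '("node0:2048kB:512" "node1:2048kB:512")' -> sysfs path/count pairs
        entries = spec.replace('(', '').replace(')', '').replace('"', '')
        for entry in entries.split():
            node, page_size, count = entry.split(':')
            yield ('%s/%s/hugepages/hugepages-%s/nr_hugepages'
                   % (nodefs, node, page_size), count)

Each resulting pair corresponds to one exec of the form echo <count> > <path>.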
""" @@ -712,11 +723,13 @@ class AgentManager(service.PeriodicService): LOG.exception("Sysinv Agent uncaught exception updating inuma.") pass + force_grub_update = self._force_grub_update() try: # may get duplicate key if already sent on earlier init rpcapi.icpus_update_by_ihost(icontext, ihost['uuid'], - icpus) + icpus, + force_grub_update) except RemoteError as e: LOG.error("icpus_update_by_ihost RemoteError exc_type=%s" % e.exc_type) @@ -731,19 +744,21 @@ class AgentManager(service.PeriodicService): pass imemory = self._inode_operator.inodes_get_imemory() - try: - # may get duplicate key if already sent on earlier init - rpcapi.imemory_update_by_ihost(icontext, - ihost['uuid'], - imemory) - except RemoteError as e: - LOG.error("imemory_update_by_ihost RemoteError exc_type=%s" % - e.exc_type) - # Allow the audit to update - pass - except: - LOG.exception("Sysinv Agent exception updating imemory conductor.") - pass + if imemory: + try: + # may get duplicate key if already sent on earlier init + rpcapi.imemory_update_by_ihost(icontext, + ihost['uuid'], + imemory) + except RemoteError as e: + LOG.error("imemory_update_by_ihost RemoteError exc_type=%s" % + e.exc_type) + # Allow the audit to update + pass + except: + LOG.exception("Sysinv Agent exception updating imemory " + "conductor.") + pass idisk = self._idisk_operator.idisk_get() try: @@ -1283,7 +1298,9 @@ class AgentManager(service.PeriodicService): try: # runtime manifests can not be applied without the initial # configuration applied - if not os.path.isfile(tsc.INITIAL_CONFIG_COMPLETE_FLAG): + force = config_dict.get('force', False) + if (not force and + not os.path.isfile(tsc.INITIAL_CONFIG_COMPLETE_FLAG)): return personalities = config_dict.get('personalities') diff --git a/sysinv/sysinv/sysinv/sysinv/agent/node.py b/sysinv/sysinv/sysinv/sysinv/agent/node.py index 5f66e44de5..543665efbf 100644 --- a/sysinv/sysinv/sysinv/sysinv/agent/node.py +++ b/sysinv/sysinv/sysinv/sysinv/agent/node.py @@ -19,18 +19,13 @@ from os import listdir from os.path import isfile, join import random import re -import shlex -import shutil -import signal -import six -import socket import subprocess -import tempfile from sysinv.common import exception from sysinv.common import utils from sysinv.openstack.common import log as logging +import tsconfig.tsconfig as tsc LOG = logging.getLogger(__name__) @@ -97,6 +92,30 @@ class NodeOperator(object): # self._get_free_memory_MiB() # self._get_free_memory_nodes_MiB() + def _is_strict(self): + with open(os.devnull, "w") as fnull: + try: + output = subprocess.check_output( + ["cat", "/proc/sys/vm/overcommit_memory"], + stderr=fnull) + if int(output) == 2: + return True + except subprocess.CalledProcessError as e: + LOG.info("Failed to check for overcommit, error (%s)", + e.output) + return False + + def _is_hugepages_allocated(self): + with open(os.devnull, "w") as fnull: + try: + output = subprocess.check_output( + ["cat", "/proc/sys/vm/nr_hugepages"], stderr=fnull) + if int(output) > 0: + return True + except subprocess.CalledProcessError as e: + LOG.info("Failed to check hugepages, error (%s)", e.output) + return False + def convert_range_string_to_list(self, s): olist = [] s = s.strip() @@ -267,7 +286,7 @@ class NodeOperator(object): return [name for name in listdir(dir) if os.path.isdir(join(dir, name))] - def _set_default_avs_hugesize(self, attr): + def _set_default_avs_hugesize(self): ''' Set the default memory size for avs hugepages when it must fallback to 2MB pages because there are no 1GB pages. 
diff --git a/sysinv/sysinv/sysinv/sysinv/agent/node.py b/sysinv/sysinv/sysinv/sysinv/agent/node.py
index 5f66e44de5..543665efbf 100644
--- a/sysinv/sysinv/sysinv/sysinv/agent/node.py
+++ b/sysinv/sysinv/sysinv/sysinv/agent/node.py
@@ -19,18 +19,13 @@ from os import listdir
 from os.path import isfile, join
 import random
 import re
-import shlex
-import shutil
-import signal
-import six
-import socket
 import subprocess
-import tempfile
 
 from sysinv.common import exception
 from sysinv.common import utils
 from sysinv.openstack.common import log as logging
+import tsconfig.tsconfig as tsc
 
 LOG = logging.getLogger(__name__)
@@ -97,6 +92,30 @@ class NodeOperator(object):
         # self._get_free_memory_MiB()
         # self._get_free_memory_nodes_MiB()
 
+    def _is_strict(self):
+        with open(os.devnull, "w") as fnull:
+            try:
+                output = subprocess.check_output(
+                    ["cat", "/proc/sys/vm/overcommit_memory"],
+                    stderr=fnull)
+                if int(output) == 2:
+                    return True
+            except subprocess.CalledProcessError as e:
+                LOG.info("Failed to check for overcommit, error (%s)",
+                         e.output)
+        return False
+
+    def _is_hugepages_allocated(self):
+        with open(os.devnull, "w") as fnull:
+            try:
+                output = subprocess.check_output(
+                    ["cat", "/proc/sys/vm/nr_hugepages"], stderr=fnull)
+                if int(output) > 0:
+                    return True
+            except subprocess.CalledProcessError as e:
+                LOG.info("Failed to check hugepages, error (%s)", e.output)
+        return False
+
     def convert_range_string_to_list(self, s):
         olist = []
         s = s.strip()
@@ -267,7 +286,7 @@
         return [name for name in listdir(dir)
                 if os.path.isdir(join(dir, name))]
 
-    def _set_default_avs_hugesize(self, attr):
+    def _set_default_avs_hugesize(self):
         '''
         Set the default memory size for avs hugepages when it must fallback to
         2MB pages because there are no 1GB pages.  In a virtual environment we
@@ -281,18 +300,10 @@
         else:
             avs_hugepages_nr = AVS_REAL_MEMORY_MB / hugepage_size
 
-        memtotal_mib = attr.get('memtotal_mib', 0)
-        memavail_mib = attr.get('memavail_mib', 0)
-        memtotal_mib = memtotal_mib - (hugepage_size * avs_hugepages_nr)
-        memavail_mib = min(memtotal_mib, memavail_mib)
-
         ## Create a new set of dict attributes
         hp_attr = {'avs_hugepages_size_mib': hugepage_size,
                    'avs_hugepages_nr': avs_hugepages_nr,
-                   'avs_hugepages_avail': 0,
-                   'vm_hugepages_use_1G': 'False',
-                   'memtotal_mib': memtotal_mib,
-                   'memavail_mib': memavail_mib}
+                   'avs_hugepages_avail': 0}
         return hp_attr
 
     def _inode_get_memory_hugepages(self):
@@ -303,17 +314,34 @@
         '''
         imemory = []
 
-        num_2M_for_1G = 512
-        num_4K_for_2M = 512
+        Ki = 1024
+        SZ_2M_Ki = 2048
+        SZ_1G_Ki = 1048576
+        controller_min_MB = 6000
+        compute_min_MB = 1600
+        compute_min_non0_MB = 500
 
-        re_node_MemFreeInit = re.compile(r'^Node\s+\d+\s+\MemFreeInit:\s+(\d+)')
+        initial_compute_config_completed = \
+            os.path.exists(tsc.INITIAL_COMPUTE_CONFIG_COMPLETE)
+
+        # check if this is the initial report, before the huge pages are
+        # allocated
+        initial_report = not initial_compute_config_completed
+
+        # do not send a report if the initial compute config is completed and
+        # the huge pages have not been allocated, i.e. during a subsequent
+        # reboot before the manifest allocates the huge pages
+        if (initial_compute_config_completed and
+                not self._is_hugepages_allocated()):
+            return imemory
 
         for node in range(self.num_nodes):
             attr = {}
-            Total_MiB = 0
-            Free_MiB = 0
+            Total_HP_MiB = 0  # Total memory (MiB) currently configured in HPs
+            Free_HP_MiB = 0
 
+            # Check AVS and Libvirt memory
+            # Loop through configured hugepage sizes of this node and record
+            # the total number and the number free
             hugepages = "/sys/devices/system/node/node%d/hugepages" % node
 
             try:
@@ -325,15 +353,14 @@
                     # role via size; also from /etc/nova/compute_reserved.conf
                     if sizesplit[1].startswith("1048576kB"):
                         hugepages_role = "avs"
-                        size = int(1048576 / 1024)
+                        size = int(SZ_1G_Ki / Ki)
                     else:
                         hugepages_role = "vm"
-                        size = int(2048 / 1024)
+                        size = int(SZ_2M_Ki / Ki)
 
                     nr_hugepages = 0
                     free_hugepages = 0
 
-                    # files = os.walk(subdir).next()[2]
                     mydir = hugepages + '/' + subdir
                     files = [f for f in listdir(mydir) if isfile(join(mydir, f))]
 
@@ -345,11 +372,11 @@
                         if file.startswith("free_hugepages"):
                             free_hugepages = int(f.readline())
 
+                    Total_HP_MiB = Total_HP_MiB + int(nr_hugepages * size)
+                    Free_HP_MiB = Free_HP_MiB + int(free_hugepages * size)
+
                     # Libvirt hugepages can now be 1G and 2M, can't only look
                     # at 2M pages
-                    Total_MiB = Total_MiB + int(nr_hugepages * size)
-                    Free_MiB = Free_MiB + int(free_hugepages * size)
-
                     if hugepages_role == "avs":
                         avs_hugepages_nr = AVS_REAL_MEMORY_MB / size
                         hp_attr = {
@@ -359,18 +386,19 @@
                             'vm_hugepages_nr_1G':
                                 (nr_hugepages - avs_hugepages_nr),
                             'vm_hugepages_avail_1G': free_hugepages,
+                            'vm_hugepages_use_1G': 'True'
                         }
                     else:
                         if len(subdirs) == 1:
-                            hp_attr = {
-                                'vm_hugepages_nr_2M': (nr_hugepages - 256),
-                                'vm_hugepages_avail_2M': free_hugepages,
-                            }
-                        else:
-                            hp_attr = {
-                                'vm_hugepages_nr_2M': nr_hugepages,
-                                'vm_hugepages_avail_2M': free_hugepages,
-                            }
+                            hp_attr = self._set_default_avs_hugesize()
+                            hp_attr.update({'vm_hugepages_use_1G': 'False'})
+
+                        avs_hugepages_nr = hp_attr.get('avs_hugepages_nr', 0)
+                        hp_attr.update({
+                            'vm_hugepages_avail_2M': free_hugepages,
+                            'vm_hugepages_nr_2M':
+                                (nr_hugepages - avs_hugepages_nr)
+                        })
 
                     attr.update(hp_attr)
@@ -378,115 +406,134 @@
                 # silently ignore IO errors (eg. file missing)
                 pass
 
-            # Read the total possible number of libvirt (2M and 1G) hugepages,
-            # and total available memory determined by compute-huge.
-            hp_pages_2M = []
-            hp_pages_1G = []
-            tot_memory = []
-            huge_total_attrs = {}
-            hp_total_info = "/etc/nova/compute_hugepages_total.conf"
-            try:
-                with open(hp_total_info, 'r') as infile:
-                    for line in infile:
-                        possible_memorys = line.split("=")
-                        if possible_memorys[0] == 'compute_hp_total_2M':
-                            hp_pages_2M = map(int, possible_memorys[1].split(','))
-                            continue
-
-                        elif possible_memorys[0] == 'compute_hp_total_1G':
-                            hp_pages_1G = map(int, possible_memorys[1].split(','))
-                            continue
-
-                        elif possible_memorys[0] == 'compute_total_MiB':
-                            tot_memory = map(int, possible_memorys[1].split(','))
-                            continue
-
-            except IOError:
-                # silently ignore IO errors (eg. file missing)
-                pass
-
-            huge_total_attrs = {
-                'vm_hugepages_possible_2M': hp_pages_2M[node],
-                'vm_hugepages_possible_1G': hp_pages_1G[node],
-            }
-
-            # The remaining VM pages are allocated to 4K pages
-            vm_hugepages_2M = attr.get('vm_hugepages_nr_2M')
-            vm_hugepages_1G = attr.get('vm_hugepages_nr_1G')
-
-            vm_hugepages_4K = (hp_pages_2M[node] - vm_hugepages_2M)
-            if vm_hugepages_1G:
-                vm_hugepages_4K -= (vm_hugepages_1G * num_2M_for_1G)
-
-            vm_hugepages_4K = vm_hugepages_4K * num_4K_for_2M
-
-            # Clip 4K pages, just like compute-huge.
-            min_4K = 32 * 1024 / 4
-            if vm_hugepages_4K < min_4K:
-                vm_hugepages_4K = 0
-
-            hp_attrs_4K = {
-                'vm_hugepages_nr_4K': vm_hugepages_4K,
-            }
-
-            attr.update(huge_total_attrs)
-            attr.update(hp_attrs_4K)
-
-            # Include 4K pages in the displayed VM memtotal.
-            # Since there is no way to track used VM 4K pages, we treat them
-            # as available, but that is bogus.
-            vm_4K_MiB = vm_hugepages_4K * 4 / 1024
-            Total_MiB += vm_4K_MiB
-            Free_MiB += vm_4K_MiB
-            self.total_memory_nodes_MiB.append(Total_MiB)
-            attroverview = {
-                'numa_node': node,
-                'memtotal_mib': Total_MiB,
-                'memavail_mib': Free_MiB,
-                'hugepages_configured': 'True',
-            }
-
-            attr.update(attroverview)
-
-            new_attrs = {}
-            if 'avs_hugepages_size_mib' not in attr:
-                ## No 1GB pages were found so borrow from the VM 2MB pool
-                ##
-                ## FIXME:
-                ## It is unfortunate that memory is categorized as VM or
-                ## AVS here on the compute node.  It would have been more
-                ## flexible if memory parameters were collected and sent
-                ## up to the controller without making any decisions about
-                ## what the memory was going to be used for.  That type of
-                ## decision is better left to the controller (or better
-                ## yet, to the user)
-                new_attrs = self._set_default_avs_hugesize(attr)
-            else:
-                new_attrs = {'vm_hugepages_use_1G': 'True'}
-
-            attr.update(new_attrs)
-
-            # Get the total memory of the numa node
-            memTotal_mib = 0
-            meminfo = "/sys/devices/system/node/node%d/meminfo_extra" % node
+            # Get the free and total memory from meminfo for this node
+            re_node_MemTotal = re.compile(r'^Node\s+\d+\s+\MemTotal:\s+(\d+)')
+            re_node_MemFree = re.compile(r'^Node\s+\d+\s+\MemFree:\s+(\d+)')
+            re_node_FilePages = \
+                re.compile(r'^Node\s+\d+\s+\FilePages:\s+(\d+)')
+            re_node_SReclaim = \
+                re.compile(r'^Node\s+\d+\s+\SReclaimable:\s+(\d+)')
+            re_node_CommitLimit = \
+                re.compile(r'^Node\s+\d+\s+\CommitLimit:\s+(\d+)')
+            re_node_Committed_AS = \
+                re.compile(r'^Node\s+\d+\s+\'Committed_AS:\s+(\d+)')
+
+            Free_KiB = 0  # Free Memory (KiB) available
+            Total_KiB = 0  # Total Memory (KiB)
+            limit = 0  # only used in strict accounting
+            committed = 0  # only used in strict accounting
+
+            meminfo = "/sys/devices/system/node/node%d/meminfo" % node
             try:
                 with open(meminfo, 'r') as infile:
                     for line in infile:
-                        match = re_node_MemFreeInit.search(line)
+                        match = re_node_MemTotal.search(line)
                         if match:
-                            memTotal_mib = int(match.group(1))
+                            Total_KiB += int(match.group(1))
                             continue
+                        match = re_node_MemFree.search(line)
+                        if match:
+                            Free_KiB += int(match.group(1))
+                            continue
+                        match = re_node_FilePages.search(line)
+                        if match:
+                            Free_KiB += int(match.group(1))
+                            continue
+                        match = re_node_SReclaim.search(line)
+                        if match:
+                            Free_KiB += int(match.group(1))
+                            continue
+                        match = re_node_CommitLimit.search(line)
+                        if match:
+                            limit = int(match.group(1))
+                            continue
+                        match = re_node_Committed_AS.search(line)
+                        if match:
+                            committed = int(match.group(1))
+                            continue
+
+                if self._is_strict():
+                    Free_KiB = limit - committed
+
             except IOError:
                 # silently ignore IO errors (eg. file missing)
                 pass
-            memTotal_mib /= 1024
-            if tot_memory[node]:
-                memTotal_mib = tot_memory[node]
-            node_attr = {
-                'node_memtotal_mib': memTotal_mib,
-            }
-            attr.update(node_attr)
+            # Calculate PSS
+            Pss_MiB = 0
+            if node == 0:
+                cmd = 'cat /proc/*/smaps 2>/dev/null | awk \'/^Pss:/ ' \
+                      '{a += $2;} END {printf "%d\\n", a/1024.0;}\''
+                try:
+                    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                            shell=True)
+                    result = proc.stdout.read().strip()
+                    Pss_MiB = int(result)
+                except subprocess.CalledProcessError as e:
+                    LOG.error("Cannot calculate PSS (%s) (%d)", cmd,
+                              e.returncode)
+                except OSError as e:
+                    LOG.error("Failed to execute (%s) OS error (%d)", cmd,
+                              e.errno)
+
+            # multiply Total_HP_MiB by Ki (to KiB) to match the compute-huge
+            # accounting
+            node_total_kib = Total_HP_MiB * Ki + Free_KiB + Pss_MiB * Ki
+
+            # Read base memory from compute_reserved.conf
+            base_mem_MiB = 0
+            with open('/etc/nova/compute_reserved.conf', 'r') as infile:
+                for line in infile:
+                    if "COMPUTE_BASE_RESERVED" in line:
+                        val = line.split("=")
+                        base_reserves = val[1].strip('\n')[1:-1]
+                        for reserve in base_reserves.split():
+                            reserve = reserve.split(":")
+                            if reserve[0].strip('"') == "node%d" % node:
+                                base_mem_MiB = int(reserve[1].strip('MB'))
+
+            # On small systems, clip memory overhead to more reasonable
+            # minimal settings
+            if (Total_KiB / Ki - base_mem_MiB) < 1000:
+                if node == 0:
+                    base_mem_MiB = compute_min_MB
+                    if tsc.nodetype == 'controller':
+                        base_mem_MiB += controller_min_MB
+                else:
+                    base_mem_MiB = compute_min_non0_MB
+
+            Eng_KiB = node_total_kib - base_mem_MiB * Ki
+
+            vswitch_mem_kib = (attr.get('avs_hugepages_size_mib', 0) *
+                               attr.get('avs_hugepages_nr', 0) * Ki)
+
+            VM_KiB = (Eng_KiB - vswitch_mem_kib)
+
+            max_vm_pages_2M = VM_KiB / SZ_2M_Ki
+            max_vm_pages_1G = VM_KiB / SZ_1G_Ki
+
+            attr.update({
+                'vm_hugepages_possible_2M': max_vm_pages_2M,
+                'vm_hugepages_possible_1G': max_vm_pages_1G,
+            })
+
+            # calculate 100% 2M pages if this is the initial report and the
+            # huge pages have not been allocated
+            if initial_report:
+                Total_HP_MiB += int(max_vm_pages_2M * (SZ_2M_Ki / Ki))
+                Free_HP_MiB = Total_HP_MiB
+                attr.update({
+                    'vm_hugepages_nr_2M': max_vm_pages_2M,
+                    'vm_hugepages_avail_2M': max_vm_pages_2M,
+                    'vm_hugepages_nr_1G': 0
+                })
+
+            attr.update({
+                'numa_node': node,
+                'memtotal_mib': Total_HP_MiB,
+                'memavail_mib': Free_HP_MiB,
+                'hugepages_configured': 'True',
+                'node_memtotal_mib': node_total_kib / 1024,
+            })
 
             imemory.append(attr)
 
@@ -502,7 +549,6 @@
         self.total_memory_MiB = 0
 
         re_node_MemTotal = re.compile(r'^Node\s+\d+\s+\MemTotal:\s+(\d+)')
-        re_node_MemFreeInit = re.compile(r'^Node\s+\d+\s+\MemFreeInit:\s+(\d+)')
         re_node_MemFree = re.compile(r'^Node\s+\d+\s+\MemFree:\s+(\d+)')
         re_node_FilePages = re.compile(r'^Node\s+\d+\s+\FilePages:\s+(\d+)')
         re_node_SReclaim = re.compile(r'^Node\s+\d+\s+\SReclaimable:\s+(\d+)')
@@ -538,19 +584,6 @@
                 # silently ignore IO errors (eg. file missing)
                 pass
 
-            # WRS kernel customization to exclude kernel overheads
-            meminfo = "/sys/devices/system/node/node%d/meminfo_extra" % node
-            try:
-                with open(meminfo, 'r') as infile:
-                    for line in infile:
-                        match = re_node_MemFreeInit.search(line)
-                        if match:
-                            Total_MiB = int(match.group(1))
-                            continue
-            except IOError:
-                # silently ignore IO errors (eg. file missing)
-                pass
-
             Total_MiB /= 1024
             Free_MiB /= 1024
             self.total_memory_nodes_MiB.append(Total_MiB)
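The per-node accounting above boils down to a short formula: node memory is the configured huge-page total plus free memory plus recoverable PSS; subtracting the base reservation gives engineered memory, and what is left after the vswitch pool bounds the possible VM pages. As a compact sketch with the same KiB units (hypothetical helper, not part of the change):

    Ki, SZ_2M_Ki, SZ_1G_Ki = 1024, 2048, 1048576

    def possible_vm_pages(total_hp_mib, free_kib, pss_mib,
                          base_mem_mib, vswitch_kib):
        node_total_kib = total_hp_mib * Ki + free_kib + pss_mib * Ki
        eng_kib = node_total_kib - base_mem_mib * Ki   # engineered memory
        vm_kib = eng_kib - vswitch_kib                 # left for VM pages
        return vm_kib // SZ_2M_Ki, vm_kib // SZ_1G_Ki  # (2M, 1G) page caps
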
+ """ + ihost_inodes = pecan.request.dbapi.inode_get_by_ihost(ihost['uuid']) + + for node in ihost_inodes: + mems = pecan.request.dbapi.imemory_get_by_inode(node['id']) + for m in mems: + if m.hugepages_configured: + vm_hugepages_nr_2M = m.vm_hugepages_nr_2M_pending \ + if m.vm_hugepages_nr_2M_pending is not None \ + else m.vm_hugepages_nr_2M + vm_hugepages_nr_1G = m.vm_hugepages_nr_1G_pending \ + if m.vm_hugepages_nr_1G_pending is not None \ + else m.vm_hugepages_nr_1G + + vm_hugepages_4K = \ + (m.node_memtotal_mib - m.platform_reserved_mib) + vm_hugepages_4K -= \ + (m.avs_hugepages_nr * m.avs_hugepages_size_mib) + vm_hugepages_4K -= \ + (constants.MIB_2M * vm_hugepages_nr_2M) + vm_hugepages_4K -= \ + (constants.MIB_1G * vm_hugepages_nr_1G) + vm_hugepages_4K = \ + (constants.NUM_4K_PER_MiB * vm_hugepages_4K) + + # Clip 4K pages + min_4K = 32 * constants.Ki / 4 + if vm_hugepages_4K < min_4K: + vm_hugepages_4K = 0 + + value = {'vm_hugepages_nr_4K': vm_hugepages_4K} + LOG.info("Set VM 4K pages for host (%s) node (%d) pages " + "(%d)" % (ihost['hostname'], node['id'], + vm_hugepages_4K)) + pecan.request.dbapi.imemory_update(m.uuid, value) + @staticmethod def _semantic_mtc_check_action(hostupdate, action): """ @@ -4739,6 +4779,9 @@ class HostController(rest.RestController): if align_2M_memory or align_1G_memory: self._align_pending_memory(ihost, align_2M_memory, align_1G_memory) + # calculate the VM 4K huge pages for nova + self._update_vm_4k_pages(ihost) + if cutils.is_virtual() or cutils.is_virtual_compute(ihost): mib_platform_reserved_no_io = mib_reserved required_platform = \ diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index 1fcc2950ca..2c749e6265 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -206,6 +206,8 @@ REGION_SECONDARY = "External" # Hugepage sizes in MiB MIB_2M = 2 MIB_1G = 1024 +Ki = 1024 +NUM_4K_PER_MiB = 256 # Dynamic IO Resident Set Size(RSS) in MiB per socket DISK_IO_RESIDENT_SET_SIZE_MIB = 2000 diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index 85e380e385..6cf3a8b3c4 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -2553,7 +2553,8 @@ class ConductorManager(service.PeriodicService): LOG.info('%9s : %s' % ('thread_id', t)) def icpus_update_by_ihost(self, context, - ihost_uuid, icpu_dict_array): + ihost_uuid, icpu_dict_array, + force_grub_update=False): """Create cpus for an ihost with the supplied data. This method allows records for cpus for ihost to be created. 
diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py
index 1fcc2950ca..2c749e6265 100644
--- a/sysinv/sysinv/sysinv/sysinv/common/constants.py
+++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py
@@ -206,6 +206,8 @@ REGION_SECONDARY = "External"
 # Hugepage sizes in MiB
 MIB_2M = 2
 MIB_1G = 1024
+Ki = 1024
+NUM_4K_PER_MiB = 256
 
 # Dynamic IO Resident Set Size(RSS) in MiB per socket
 DISK_IO_RESIDENT_SET_SIZE_MIB = 2000
diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
index 85e380e385..6cf3a8b3c4 100644
--- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
@@ -2553,7 +2553,8 @@ class ConductorManager(service.PeriodicService):
             LOG.info('%9s : %s' % ('thread_id', t))
 
     def icpus_update_by_ihost(self, context,
-                              ihost_uuid, icpu_dict_array):
+                              ihost_uuid, icpu_dict_array,
+                              force_grub_update=False):
         """Create cpus for an ihost with the supplied data.
 
         This method allows records for cpus for ihost to be created.
@@ -2561,6 +2562,7 @@
         :param context: an admin context
         :param ihost_uuid: ihost uuid unique id
         :param icpu_dict_array: initial values for cpu objects
+        :param force_grub_update: bool value to force grub update
         :returns: pass or fail
         """
 
@@ -2626,6 +2628,9 @@
                         subfunctions=ihost.get('subfunctions'),
                         reference='current (unchanged)',
                         sockets=cs, cores=cc, threads=ct)
+                if ihost.administrative == constants.ADMIN_LOCKED and \
+                        force_grub_update:
+                    self.update_cpu_config(context, ihost_uuid)
                 return
 
         self.print_cpu_topology(hostname=ihost.get('hostname'),
@@ -2679,9 +2684,15 @@
             # info may have already been posted
             pass
 
-        if (utils.is_host_simplex_controller(ihost) and
-                ihost.administrative == constants.ADMIN_LOCKED):
-            self.update_cpu_config(context)
+        # if it is the first controller, wait for the initial config to
+        # be completed
+        if ((utils.is_host_simplex_controller(ihost) and
+                os.path.isfile(tsc.INITIAL_CONFIG_COMPLETE_FLAG)) or
+                (not utils.is_host_simplex_controller(ihost) and
+                 ihost.administrative == constants.ADMIN_LOCKED)):
+            LOG.info("Update CPU grub config, host_uuid (%s), name (%s)"
+                     % (ihost_uuid, ihost.get('hostname')))
+            self.update_cpu_config(context, ihost_uuid)
 
         return
 
@@ -2753,6 +2764,13 @@
                     mem = self.dbapi.imemory_create(forihostid, mem_dict)
             else:
                 for imem in imems:
+                    # Include 4K pages in the displayed VM memtotal
+                    if imem.vm_hugepages_nr_4K is not None:
+                        vm_4K_mib = \
+                            (imem.vm_hugepages_nr_4K /
+                             constants.NUM_4K_PER_MiB)
+                        mem_dict['memtotal_mib'] += vm_4K_mib
+                        mem_dict['memavail_mib'] += vm_4K_mib
                     pmem = self.dbapi.imemory_update(imem['uuid'],
                                                      mem_dict)
         except:
@@ -6689,19 +6707,28 @@
             # discard temporary file
             os.remove(hosts_file_temp)
 
-    def update_cpu_config(self, context):
-        """Update the cpu assignment configuration on an AIO system"""
-        LOG.info("update_cpu_config")
+    def update_cpu_config(self, context, host_uuid):
+        """Update the cpu assignment configuration on a host"""
 
-        try:
-            hostname = socket.gethostname()
-            host = self.dbapi.ihost_get(hostname)
-        except Exception as e:
-            LOG.warn("Failed to get local host object: %s", str(e))
-            return
-        command = ['/etc/init.d/compute-huge.sh', 'reload']
-        rpcapi = agent_rpcapi.AgentAPI()
-        rpcapi.execute_command(context, host_uuid=host.uuid, command=command)
+        # only apply the manifest on a host that has the compute subfunction
+        host = self.dbapi.ihost_get(host_uuid)
+        if constants.COMPUTE in host.subfunctions:
+            force = (not utils.is_host_simplex_controller(host))
+            LOG.info("update_cpu_config, host uuid: (%s), force: (%s)",
+                     host_uuid, str(force))
+            personalities = [constants.CONTROLLER, constants.COMPUTE]
+            config_uuid = self._config_update_hosts(context,
+                                                    personalities,
+                                                    host_uuid=host_uuid)
+            config_dict = {
+                "personalities": personalities,
+                "host_uuids": [host_uuid],
+                "classes": ['platform::compute::grub::runtime']
+            }
+            self._config_apply_runtime_manifest(context, config_uuid,
+                                                config_dict,
+                                                force=force,
+                                                host_uuid=host_uuid)
 
     def _update_resolv_file(self, context, config_uuid, personalities):
         """Generate and update the resolv.conf files on the system"""
@@ -7403,7 +7430,8 @@
                                        context,
                                        config_uuid,
                                        config_dict,
-                                       host_uuid=None):
+                                       host_uuid=None,
+                                       force=False):
         """Apply manifests on all hosts affected by the supplied
        personalities. If host_uuid is set, only update hiera data for
        that host
        """
 
@@ -7413,8 +7441,10 @@
         # is not set. If host_uuid is set only update hiera data for that host
         self._config_update_puppet(config_uuid,
                                    config_dict,
-                                   host_uuid=host_uuid)
+                                   host_uuid=host_uuid,
+                                   force=force)
 
+        config_dict.update({'force': force})
         rpcapi = agent_rpcapi.AgentAPI()
         rpcapi.config_apply_runtime_manifest(context,
                                              config_uuid=config_uuid,
diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py
index 736be1b5b1..454af0d022 100644
--- a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py
@@ -282,7 +282,9 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
                                      inuma_dict_array=inuma_dict_array))
 
     def icpus_update_by_ihost(self, context,
-                              ihost_uuid, icpu_dict_array):
+                              ihost_uuid, icpu_dict_array,
+                              force_grub_update,
+                              ):
         """Create cpus for an ihost with the supplied data.
 
         This method allows records for cpus for ihost to be created.
@@ -290,13 +292,15 @@
         :param context: an admin context
         :param ihost_uuid: ihost uuid unique id
         :param icpu_dict_array: initial values for cpu objects
+        :param force_grub_update: bool value to force grub update
         :returns: pass or fail
         """
 
         return self.call(context,
                          self.make_msg('icpus_update_by_ihost',
                                        ihost_uuid=ihost_uuid,
-                                       icpu_dict_array=icpu_dict_array))
+                                       icpu_dict_array=icpu_dict_array,
+                                       force_grub_update=force_grub_update))
 
     def imemory_update_by_ihost(self, context,
                                 ihost_uuid, imemory_dict_array):
@@ -834,13 +838,15 @@
                                        status=status,
                                        error=error))
 
-    def update_cpu_config(self, context):
+    def update_cpu_config(self, context, host_uuid):
         """Synchronously, have the conductor update the cpu configuration.
 
         :param context: request context.
+        :param host_uuid: host unique uuid
         """
 
-        return self.call(context, self.make_msg('update_cpu_config'))
+        return self.call(context, self.make_msg('update_cpu_config',
+                                                host_uuid=host_uuid))
 
     def iconfig_update_by_ihost(self, context,
                                 ihost_uuid, imsg_dict):
diff --git a/sysinv/sysinv/sysinv/sysinv/puppet/base.py b/sysinv/sysinv/sysinv/sysinv/puppet/base.py
index c28a89bdae..173cdf2b0d 100644
--- a/sysinv/sysinv/sysinv/sysinv/puppet/base.py
+++ b/sysinv/sysinv/sysinv/sysinv/puppet/base.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+import collections
 import abc
 import itertools
 import netaddr
@@ -213,3 +214,11 @@ class BasePuppet(object):
                 s = "%s-%s" % (rng[0][1], rng[-1][1])
             ranges.append(s)
         return ','.join(ranges)
+
+    def _get_numa_index_list(self, obj):
+        """Create map of objects indexed by numa node"""
+        obj_lists = collections.defaultdict(list)
+        for index, o in enumerate(obj):
+            o["_index"] = index
+            obj_lists[o.numa_node].append(o)
+        return obj_lists
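_get_numa_index_list() buckets inventory records by their numa_node attribute while recording each record's original position. A small usage sketch of the same idea (the Mem record type is hypothetical):

    import collections

    class Mem(object):
        def __init__(self, numa_node):
            self.numa_node = numa_node

    def get_numa_index_list(objs):
        buckets = collections.defaultdict(list)
        for index, o in enumerate(objs):
            o._index = index  # remember the original position
            buckets[o.numa_node].append(o)
        return buckets

    by_node = get_numa_index_list([Mem(0), Mem(1), Mem(0)])
    # by_node[0] -> first and third records, by_node[1] -> second

The real method stores the index with o["_index"] because sysinv DB objects support item assignment; the sketch uses a plain attribute instead.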
diff --git a/sysinv/sysinv/sysinv/sysinv/puppet/platform.py b/sysinv/sysinv/sysinv/sysinv/puppet/platform.py
index dc9f9bb9ba..66eb924a62 100644
--- a/sysinv/sysinv/sysinv/sysinv/puppet/platform.py
+++ b/sysinv/sysinv/sysinv/sysinv/puppet/platform.py
@@ -4,18 +4,18 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+import copy
+import itertools
 import os
 
 from sysinv.common import constants
 from sysinv.common import exception
-from sysinv.openstack.common import log as logging
+from sysinv.common import utils
 
 from tsconfig import tsconfig
 
 from . import base
 
-LOG = logging.getLogger(__name__)
-
 HOSTNAME_INFRA_SUFFIX = '-infra'
 
 NOVA_UPGRADE_LEVEL_NEWTON = 'newton'
@@ -65,6 +65,8 @@ class PlatformPuppet(base.BasePuppet):
         config.update(self._get_host_sysctl_config(host))
         config.update(self._get_host_drbd_config(host))
         config.update(self._get_host_upgrade_config(host))
+        config.update(self._get_host_cpu_config(host))
+        config.update(self._get_host_hugepage_config(host))
         return config
 
     def _get_static_software_config(self):
@@ -480,6 +482,159 @@ class PlatformPuppet(base.BasePuppet):
             })
         return config
 
+    def _get_host_cpu_config(self, host):
+        config = {}
+        if constants.COMPUTE in utils.get_personalities(host):
+            host_cpus = self._get_host_cpu_list(host, threads=True)
+            if not host_cpus:
+                return config
+
+            host_cpus = sorted(host_cpus, key=lambda c: c.cpu)
+            n_cpus = len(host_cpus)
+            host_cpu_list = [c.cpu for c in host_cpus]
+
+            platform_cpus = self._get_host_cpu_list(
+                host, function=constants.PLATFORM_FUNCTION, threads=True)
+            platform_cpus = sorted(platform_cpus, key=lambda c: c.cpu)
+            platform_cpu_list = \
+                "%s" % ','.join([str(c.cpu) for c in platform_cpus])
+
+            vswitch_cpus = self._get_host_cpu_list(
+                host, constants.VSWITCH_FUNCTION, threads=True)
+            vswitch_cpus = sorted(vswitch_cpus, key=lambda c: c.cpu)
+            vswitch_cpu_list = \
+                "%s" % ','.join([str(c.cpu) for c in vswitch_cpus])
+
+            # rcu_nocbs = all cores - platform cores
+            rcu_nocbs = copy.deepcopy(host_cpu_list)
+            for i in [int(s) for s in platform_cpu_list.split(',')]:
+                rcu_nocbs.remove(i)
+
+            # change the CPU list to ranges
+            rcu_nocbs_ranges = ""
+            for key, group in itertools.groupby(enumerate(rcu_nocbs),
+                                                lambda (x, y): y - x):
+                group = list(group)
+                rcu_nocbs_ranges += "%s-%s," % (group[0][1], group[-1][1])
+            rcu_nocbs_ranges = rcu_nocbs_ranges.rstrip(',')
+
+            # non-vswitch CPUs = all cores - vswitch cores
+            non_vswitch_cpus = host_cpu_list
+            for i in [int(s) for s in vswitch_cpu_list.split(',')]:
+                non_vswitch_cpus.remove(i)
+
+            # change the CPU list to ranges
+            non_vswitch_cpus_ranges = ""
+            for key, group in itertools.groupby(enumerate(non_vswitch_cpus),
+                                                lambda (x, y): y - x):
+                group = list(group)
+                non_vswitch_cpus_ranges += "\"%s-%s\"," % (group[0][1], group[-1][1])
+
+            cpu_options = ""
+            if constants.LOWLATENCY in host.subfunctions:
+                vswitch_cpu_list_with_quotes = \
+                    "\"%s\"" % ','.join([str(c.cpu) for c in vswitch_cpus])
+                config.update({
+                    'platform::compute::pmqos::low_wakeup_cpus':
+                        vswitch_cpu_list_with_quotes,
+                    'platform::compute::pmqos::hight_wakeup_cpus':
+                        non_vswitch_cpus_ranges.rstrip(',')})
+                vswitch_cpu_list = rcu_nocbs_ranges
+                cpu_options += "nohz_full=%s " % vswitch_cpu_list
+
+            cpu_options += "isolcpus=%s rcu_nocbs=%s kthread_cpus=%s " \
+                           "irqaffinity=%s" % (vswitch_cpu_list,
+                                               rcu_nocbs_ranges,
+                                               platform_cpu_list,
+                                               platform_cpu_list)
+            config.update({
+                'platform::compute::grub::params::n_cpus': n_cpus,
+                'platform::compute::grub::params::cpu_options': cpu_options,
+            })
+        return config
+
+    def _get_host_hugepage_config(self, host):
+        config = {}
+        if constants.COMPUTE in utils.get_personalities(host):
+            host_memory = self.dbapi.imemory_get_by_ihost(host.id)
+
+            memory_numa_list = self._get_numa_index_list(host_memory)
+
+            hugepages_2Ms = []
+            hugepages_1Gs = []
+            vswitch_2M_pages = []
+            vswitch_1G_pages = []
+            vm_4K_pages = []
+            vm_2M_pages = []
+            vm_1G_pages = []
+
+            for node, memory_list in memory_numa_list.items():
+
+                memory = memory_list[0]
+                vswitch_2M_page = 0
+                vswitch_1G_page = 0
+
+                vm_hugepages_nr_2M = memory.vm_hugepages_nr_2M_pending \
+                    if memory.vm_hugepages_nr_2M_pending is not None \
+                    else memory.vm_hugepages_nr_2M
+                vm_hugepages_nr_1G = memory.vm_hugepages_nr_1G_pending \
+                    if memory.vm_hugepages_nr_1G_pending is not None \
+                    else memory.vm_hugepages_nr_1G
+                vm_hugepages_nr_4K = memory.vm_hugepages_nr_4K \
+                    if memory.vm_hugepages_nr_4K is not None else 0
+
+                total_hugepages_2M = vm_hugepages_nr_2M
+                total_hugepages_1G = vm_hugepages_nr_1G
+
+                if memory.avs_hugepages_size_mib == constants.MIB_2M:
+                    total_hugepages_2M += memory.avs_hugepages_nr
+                    vswitch_2M_page += memory.avs_hugepages_nr
+                elif memory.avs_hugepages_size_mib == constants.MIB_1G:
+                    total_hugepages_1G += memory.avs_hugepages_nr
+                    vswitch_1G_page += memory.avs_hugepages_nr
+
+                vswitch_2M_pages.append(vswitch_2M_page)
+                vswitch_1G_pages.append(vswitch_1G_page)
+
+                hugepages_2M = "\"node%d:%dkB:%d\"" % (
+                    node, constants.MIB_2M * 1024, total_hugepages_2M)
+                hugepages_1G = "\"node%d:%dkB:%d\"" % (
+                    node, constants.MIB_1G * 1024, total_hugepages_1G)
+                hugepages_2Ms.append(hugepages_2M)
+                hugepages_1Gs.append(hugepages_1G)
+
+                vm_4K_pages.append(vm_hugepages_nr_4K)
+                vm_2M_pages.append(vm_hugepages_nr_2M)
+                vm_1G_pages.append(vm_hugepages_nr_1G)
+
+            nr_hugepages_2Ms = "(%s)" % ' '.join(hugepages_2Ms)
+            nr_hugepages_1Gs = "(%s)" % ' '.join(hugepages_1Gs)
+
+            vswitch_2M = "\"%s\"" % ','.join([str(i) for i in vswitch_2M_pages])
+            vswitch_1G = "\"%s\"" % ','.join([str(i) for i in vswitch_1G_pages])
+            vm_4K = "\"%s\"" % ','.join([str(i) for i in vm_4K_pages])
+            vm_2M = "\"%s\"" % ','.join([str(i) for i in vm_2M_pages])
+            vm_1G = "\"%s\"" % ','.join([str(i) for i in vm_1G_pages])
+
+            config.update({
+                'platform::compute::hugepage::params::nr_hugepages_2M':
+                    nr_hugepages_2Ms,
+                'platform::compute::hugepage::params::nr_hugepages_1G':
+                    nr_hugepages_1Gs,
+                'platform::compute::hugepage::params::vswitch_2M_pages':
+                    vswitch_2M,
+                'platform::compute::hugepage::params::vswitch_1G_pages':
+                    vswitch_1G,
+                'platform::compute::hugepage::params::vm_4K_pages':
+                    vm_4K,
+                'platform::compute::hugepage::params::vm_2M_pages':
+                    vm_2M,
+                'platform::compute::hugepage::params::vm_1G_pages':
+                    vm_1G,
+            })
+
+        return config
+
     def _get_drbd_link_speed(self):
         # return infra link speed if provisioned, otherwise mgmt
         try:
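The groupby(enumerate(...), lambda (x, y): y - x) idiom used in _get_host_cpu_config() collapses a sorted CPU list into contiguous ranges: consecutive ids keep a constant value-minus-index, so each group is one run. The idiom in isolation, as a Python 2 sketch (illustrative only):

    import itertools

    def to_ranges(cpus):
        # cpus must be sorted unique ints, e.g. [1, 2, 3, 6, 7, 10]
        ranges = []
        for _, group in itertools.groupby(enumerate(cpus),
                                          lambda (i, cpu): cpu - i):
            group = list(group)
            ranges.append("%s-%s" % (group[0][1], group[-1][1]))
        return ",".join(ranges)

    # to_ranges([1, 2, 3, 6, 7, 10]) -> "1-3,6-7,10-10"

Like the patch, a singleton renders as n-n, which the kernel's cpulist parser accepts for these boot parameters.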