Add staged kubernetes version 1.21.3

Multiple versions of kubernetes are required to support upgrade.

This adds a staged version of kubernetes 1.21.3, built with a
specific version of golang.

All subpackage versions are included in the iso image without
collisions.

The following patches are ported to this specific kubernetes version:
kubelet-cpumanager-disable-CFS-quota-throttling-for-.patch
kubelet-cpumanager-keep-normal-containers-off-reserv.patch
kubelet-cpumanager-infrastructure-pods-use-system-re.patch
kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
kubeadm-create-platform-pods-with-zero-CPU-resources.patch
enable-support-for-kubernetes-to-ignore-isolcpus.patch

The following changes were made for 1.21.3:
- upstream commit 38a41e1557649a7cc763bf737779db9aa03ec75e
  ("use subpath for coredns only for default repository")
  was reverted by:
  Revert-use-subpath-for-coredns-only-for-default-repo.patch

- kubelet-cpumanager-disable-CFS-quota-throttling-for-.patch
  was refactored due to new internal_container_lifecycle framework
  We leverage the same mechanism to set Linux resources as:
  cpu manager: specify the container CPU set during the creation
  (commit 38dc7509f862f081828e7d9167107b8c6e98ea23).

- kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
  was refactored due to upstream API change:
  node: podresources: make GetDevices() consistent
  (commit ad68f9588c72d6477b5a290c548a9031063ac659).

  The routine podIsolCPUs() was refactored in 1.21.3 since the
  API p.deviceManager.GetDevices() is returning multiple devices
  with a device per cpu. The resultant cpuset needs to be the
  aggregate.

Story: 2008972
Task: 43056

Signed-off-by: Jim Gauld <james.gauld@windriver.com>
Change-Id: I5ba7ff2e6aebb744af265698c0f90256ac5e70f4
This commit is contained in:
Jim Gauld 2021-09-22 16:31:39 -04:00
parent 229a6b32af
commit 872dd513fc
17 changed files with 6336 additions and 0 deletions

View File

@ -165,6 +165,9 @@ kubernetes-1.19.13-client
kubernetes-1.20.9-node
kubernetes-1.20.9-kubeadm
kubernetes-1.20.9-client
kubernetes-1.21.3-node
kubernetes-1.21.3-kubeadm
kubernetes-1.21.3-client
containerd
k8s-pod-recovery
containernetworking-plugins

View File

@ -57,6 +57,7 @@ kubernetes/cni/plugins
kubernetes/kubernetes-1.18.1
kubernetes/kubernetes-1.19.13
kubernetes/kubernetes-1.20.9
kubernetes/kubernetes-1.21.3
kubernetes/kubernetes-unversioned
kubernetes/docker-distribution
kubernetes/etcd

View File

@ -34,6 +34,7 @@ kubernetes-contrib-v1.18.1.tar.gz#kubernetes-contrib-1.18.1#https://github.com/k
kubernetes-v1.18.1.tar.gz#kubernetes-1.18.1#https://github.com/kubernetes/kubernetes/archive/7879fc12a63337efff607952a323df90cdc7a335.tar.gz#http##
kubernetes-v1.19.13.tar.gz#kubernetes-1.19.13#https://github.com/kubernetes/kubernetes/archive/refs/tags/v1.19.13.tar.gz#http##
kubernetes-v1.20.9.tar.gz#kubernetes-1.20.9#https://github.com/kubernetes/kubernetes/archive/refs/tags/v1.20.9.tar.gz#http##
kubernetes-v1.21.3.tar.gz#kubernetes-1.21.3#https://github.com/kubernetes/kubernetes/archive/refs/tags/v1.21.3.tar.gz#http##
kvm-unit-tests.git-4ea7633.tar.bz2#kvm-unit-tests#https://git.kernel.org/pub/scm/virt/kvm/kvm-unit-tests.git/snapshot/kvm-unit-tests-20171020.tar.gz#http##
ldapscripts-2.0.8.tgz#ldapscripts-2.0.8#https://sourceforge.net/projects/ldapscripts/files/ldapscripts/ldapscripts-2.0.8/ldapscripts-2.0.8.tgz/download#http##
libtpms-0.6.0-4f0d59d.tar.gz#libtpms-0.6.0#https://github.com/stefanberger/libtpms/tarball/c421ca0f4d00c0caceeda8d62c1efb2b7e47db04#http##

View File

@ -0,0 +1,9 @@
The spec file used here was from the kubernetes 1.10.0 src rpm.
The orig file is included to help show modifications made to that
spec file, to help understand which changes were needed and to
assist with future upversioning.
The contrib tarball does not have the same versioning as kubernetes and
there is little activity in that repo.
The version for the contrib tarball is arbitrary.

View File

@ -0,0 +1,5 @@
# Kubernetes release packaged from this directory.
VERSION=1.21.3
TAR_NAME=kubernetes
# Source tarball file name; expands to kubernetes-v1.21.3.tar.gz.
TAR="$TAR_NAME-v$VERSION.tar.gz"
# The tarball under ${CGCS_BASE}/downloads plus everything under $FILES_BASE.
# NOTE(review): presumably the inputs copied into the source rpm build --
# confirm against the build tooling that reads this file.
COPY_LIST="${CGCS_BASE}/downloads/$TAR $FILES_BASE/*"
# NOTE(review): PKG_GITREVCOUNT looks like a placeholder resolved by the
# build tooling rather than a literal version -- confirm.
TIS_PATCH_VER=PKG_GITREVCOUNT

View File

@ -0,0 +1,117 @@
From b90e3858a8d319c7526dd8190ee05edce24ba072 Mon Sep 17 00:00:00 2001
From: Jim Gauld <james.gauld@windriver.com>
Date: Thu, 9 Sep 2021 13:16:26 -0400
Subject: [PATCH 7/7] Revert "use subpath for coredns only for default
repository"
This reverts commit 38a41e1557649a7cc763bf737779db9aa03ec75e.
---
cmd/kubeadm/app/constants/constants.go | 2 +-
cmd/kubeadm/app/images/images.go | 5 ---
cmd/kubeadm/app/images/images_test.go | 50 --------------------------
3 files changed, 1 insertion(+), 56 deletions(-)
diff --git a/cmd/kubeadm/app/constants/constants.go b/cmd/kubeadm/app/constants/constants.go
index aed3a713020..3cb2d11ad45 100644
--- a/cmd/kubeadm/app/constants/constants.go
+++ b/cmd/kubeadm/app/constants/constants.go
@@ -328,7 +328,7 @@ const (
CoreDNSDeploymentName = "coredns"
// CoreDNSImageName specifies the name of the image for CoreDNS add-on
- CoreDNSImageName = "coredns"
+ CoreDNSImageName = "coredns/coredns"
// CoreDNSVersion is the version of CoreDNS to be deployed if it is used
CoreDNSVersion = "v1.8.0"
diff --git a/cmd/kubeadm/app/images/images.go b/cmd/kubeadm/app/images/images.go
index 7ada3b75018..edf087e9c46 100644
--- a/cmd/kubeadm/app/images/images.go
+++ b/cmd/kubeadm/app/images/images.go
@@ -21,7 +21,6 @@ import (
"k8s.io/klog/v2"
kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
- kubeadmapiv1beta2 "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/v1beta2"
"k8s.io/kubernetes/cmd/kubeadm/app/constants"
kubeadmutil "k8s.io/kubernetes/cmd/kubeadm/app/util"
)
@@ -53,10 +52,6 @@ func GetDNSImage(cfg *kubeadmapi.ClusterConfiguration) string {
if cfg.DNS.ImageRepository != "" {
dnsImageRepository = cfg.DNS.ImageRepository
}
- // Handle the renaming of the official image from "k8s.gcr.io/coredns" to "k8s.gcr.io/coredns/coredns
- if dnsImageRepository == kubeadmapiv1beta2.DefaultImageRepository {
- dnsImageRepository = fmt.Sprintf("%s/coredns", dnsImageRepository)
- }
// DNS uses an imageTag that corresponds to the DNS version matching the Kubernetes version
dnsImageTag := constants.GetDNSVersion(cfg.DNS.Type)
diff --git a/cmd/kubeadm/app/images/images_test.go b/cmd/kubeadm/app/images/images_test.go
index e5b417bcdc7..f5165406e0f 100644
--- a/cmd/kubeadm/app/images/images_test.go
+++ b/cmd/kubeadm/app/images/images_test.go
@@ -22,7 +22,6 @@ import (
"testing"
kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
- kubeadmapiv1beta2 "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/v1beta2"
"k8s.io/kubernetes/cmd/kubeadm/app/constants"
)
@@ -238,52 +237,3 @@ func TestGetAllImages(t *testing.T) {
})
}
}
-
-func TestGetDNSImage(t *testing.T) {
- var tests = []struct {
- expected string
- cfg *kubeadmapi.ClusterConfiguration
- }{
- {
- expected: "foo.io/coredns:v1.8.0",
- cfg: &kubeadmapi.ClusterConfiguration{
- ImageRepository: "foo.io",
- DNS: kubeadmapi.DNS{
- Type: kubeadmapi.CoreDNS,
- },
- },
- },
- {
- expected: kubeadmapiv1beta2.DefaultImageRepository + "/coredns/coredns:v1.8.0",
- cfg: &kubeadmapi.ClusterConfiguration{
- ImageRepository: kubeadmapiv1beta2.DefaultImageRepository,
- DNS: kubeadmapi.DNS{
- Type: kubeadmapi.CoreDNS,
- },
- },
- },
- {
- expected: "foo.io/coredns/coredns:v1.8.0",
- cfg: &kubeadmapi.ClusterConfiguration{
- ImageRepository: "foo.io",
- DNS: kubeadmapi.DNS{
- Type: kubeadmapi.CoreDNS,
- ImageMeta: kubeadmapi.ImageMeta{
- ImageRepository: "foo.io/coredns",
- },
- },
- },
- },
- }
-
- for _, test := range tests {
- actual := GetDNSImage(test.cfg)
- if actual != test.expected {
- t.Errorf(
- "failed to GetDNSImage:\n\texpected: %s\n\t actual: %s",
- test.expected,
- actual,
- )
- }
- }
-}
--
2.17.1

View File

@ -0,0 +1,79 @@
From a0011e7749f7e54d3f1a689e717ded88e284860f Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Fri, 23 Oct 2020 17:46:10 -0600
Subject: [PATCH 6/7] enable support for kubernetes to ignore isolcpus
The normal mechanisms for allocating isolated CPUs do not allow
a mix of isolated and exclusive CPUs in the same container. In
order to allow this in *very* limited cases where the pod spec
is known in advance we will add the ability to disable the normal
isolcpus behaviour.
If the file "/etc/kubernetes/ignore_isolcpus" exists, then kubelet
will basically forget everything it knows about isolcpus and just
treat them like regular CPUs.
The admin user can then rely on the fact that CPU allocation is
deterministic to ensure that the isolcpus they configure end up being
allocated to the correct pods.
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 9 +++++++++
pkg/kubelet/cm/cpumanager/policy_static.go | 8 ++++++++
2 files changed, 17 insertions(+)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 2563f61e7b5..1b226187fef 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -19,6 +19,7 @@ package cpumanager
import (
"fmt"
"math"
+ "os"
"sync"
"time"
"strings"
@@ -55,6 +56,14 @@ const cpuManagerStateFileName = "cpu_manager_state"
// get the system-level isolated CPUs
func getIsolcpus() cpuset.CPUSet {
+
+ // This is a gross hack to basically turn off awareness of isolcpus to enable
+ // isolated cpus to be allocated to pods the same way as non-isolated CPUs.
+ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil {
+ klog.Infof("[cpumanager] turning off isolcpus awareness")
+ return cpuset.NewCPUSet()
+ }
+
dat, err := ioutil.ReadFile("/sys/devices/system/cpu/isolated")
if err != nil {
klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir")
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index 2ad14a98911..73b74d5c4cc 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
+ "os"
"strconv"
v1 "k8s.io/api/core/v1"
@@ -613,6 +614,13 @@ func isKubeInfra(pod *v1.Pod) bool {
// get the isolated CPUs (if any) from the devices associated with a specific container
func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet {
+
+ // This is a gross hack to basically turn off awareness of isolcpus to enable
+ // isolated cpus to be allocated to pods the same way as non-isolated CPUs.
+ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil {
+ return cpuset.NewCPUSet()
+ }
+
// NOTE: This is required for TestStaticPolicyAdd() since makePod() does
// not create UID. We also need a way to properly stub devicemanager.
if len(string(pod.UID)) == 0 {
--
2.17.1

View File

@ -0,0 +1,84 @@
# Markdown sources that this script turns into man pages.
MDSFORMANPAGES="kube-apiserver.md kube-controller-manager.md kube-proxy.md kube-scheduler.md kubelet.md"
# remove comments from man pages
for manpage in ${MDSFORMANPAGES}; do
    # Line number of the marker that closes the generated warning header;
    # empty when the page does not contain the marker.
    pos=$(grep -n "<\!-- END MUNGE: UNVERSIONED_WARNING -->" "${manpage}" | cut -d':' -f1)
    # ${pos} must be quoted: an unquoted empty expansion collapses the test to
    # `[ -n ]`, which is always true, and sed would then be handed the invalid
    # address "1,".
    if [ -n "${pos}" ]; then
        # Delete every line from the top of the file through the marker line.
        sed -i "1,${pos}{/.*/d}" "${manpage}"
    fi
done
# for each man page add NAME and SYNOPSIS section
# Every component below gets the same two-step treatment:
#   1. sed rewrites the "## <component>" heading text into "# NAME" and
#      "# SYNOPSIS" sections (relies on sed expanding \n in the replacement
#      to a newline).
#   2. cat appends an "# EXAMPLES" section to the end of the file; the quoted
#      'EOF' delimiter keeps the here-document text literal.
# kube-apiserver
sed -i -s "s/## kube-apiserver/# NAME\nkube-apiserver \- Provides the API for kubernetes orchestration.\n\n# SYNOPSIS\n**kube-apiserver** [OPTIONS]\n/" kube-apiserver.md
cat << 'EOF' >> kube-apiserver.md
# EXAMPLES
```
/usr/bin/kube-apiserver --logtostderr=true --v=0 --etcd_servers=http://127.0.0.1:4001 --insecure_bind_address=127.0.0.1 --insecure_port=8080 --kubelet_port=10250 --service-cluster-ip-range=10.1.1.0/24 --allow_privileged=false
```
EOF
# kube-controller-manager
sed -i -s "s/## kube-controller-manager/# NAME\nkube-controller-manager \- Enforces kubernetes services.\n\n# SYNOPSIS\n**kube-controller-manager** [OPTIONS]\n/" kube-controller-manager.md
cat << 'EOF' >> kube-controller-manager.md
# EXAMPLES
```
/usr/bin/kube-controller-manager --logtostderr=true --v=0 --master=127.0.0.1:8080
```
EOF
# kube-proxy
sed -i -s "s/## kube-proxy/# NAME\nkube-proxy \- Provides network proxy services.\n\n# SYNOPSIS\n**kube-proxy** [OPTIONS]\n/" kube-proxy.md
cat << 'EOF' >> kube-proxy.md
# EXAMPLES
```
/usr/bin/kube-proxy --logtostderr=true --v=0 --master=http://127.0.0.1:8080
```
EOF
# kube-scheduler
sed -i -s "s/## kube-scheduler/# NAME\nkube-scheduler \- Schedules containers on hosts.\n\n# SYNOPSIS\n**kube-scheduler** [OPTIONS]\n/" kube-scheduler.md
cat << 'EOF' >> kube-scheduler.md
# EXAMPLES
```
/usr/bin/kube-scheduler --logtostderr=true --v=0 --master=127.0.0.1:8080
```
EOF
# kubelet
sed -i -s "s/## kubelet/# NAME\nkubelet \- Processes a container manifest so the containers are launched according to how they are described.\n\n# SYNOPSIS\n**kubelet** [OPTIONS]\n/" kubelet.md
cat << 'EOF' >> kubelet.md
# EXAMPLES
```
/usr/bin/kubelet --logtostderr=true --v=0 --api_servers=http://127.0.0.1:8080 --address=127.0.0.1 --port=10250 --hostname_override=127.0.0.1 --allow-privileged=false
```
EOF
# for all man-pages
# Rewrite the remaining markdown conventions in place, then render each file
# to man/man1/<name>.1 with go-md2man. The sed steps run in the order listed.
for md in $MDSFORMANPAGES; do
# correct section names
sed -i -s "s/### Synopsis/# DESCRIPTION/" $md
sed -i -s "s/### Options/# OPTIONS/" $md
# add header
# (puts a "% KUBERNETES(1) kubernetes User Manuals" line ahead of "# NAME")
sed -i "s/# NAME/% KUBERNETES(1) kubernetes User Manuals\n# NAME/" $md
# modify list of options
# Each expression below turns an option line of the form
# "<option>:<description>" into a bold "**<option>**" line, followed by a
# tab-indented description line and an empty line.
# options with no value in ""
sed -i -r 's/(^ )(-[^":][^":]*)(:)(.*)/\*\*\2\*\*\n\t\4\n/' $md
# option with value in ""
sed -i -r 's/(^ )(-[^":][^":]*)("[^"]*")(:)(.*)/\*\*\2\3\*\*\n\t\5\n/' $md
# options in -s, --long
sed -i -r 's/(^ )(-[a-z], -[^":][^":]*)(:)(.*)/\*\*\2\*\*\n\t\4\n/' $md
sed -i -r 's/(^ )(-[a-z], -[^":][^":]*)("[^"]*")(:)(.*)/\*\*\2\3\*\*\n\t\5\n/' $md
# remove ```
# (only the first ``` on each line is removed; there is no g flag)
sed -i 's/```//' $md
# remove all lines starting with ######
# (the line content is blanked; the now-empty line itself stays in the file)
sed -i 's/^######.*//' $md
# modify footer
# (blanks any line that starts with the "[![Analytics]" badge markup)
sed -i -r "s/^\[!\[Analytics\].*//" $md
# md does not contain section => taking 1
# Strip the ".md" suffix to get the page name used for the output file.
name="${md%.md}"
go-md2man -in $md -out man/man1/$name.1
done

View File

@ -0,0 +1,108 @@
From de653bd0823b248d623a39c17a3872e85ce952b0 Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Fri, 3 Sep 2021 18:05:15 -0400
Subject: [PATCH 5/7] kubeadm: create platform pods with zero CPU resources
We want to specify zero CPU resources when creating the manifests
for the static platform pods, as a workaround for the lack of
separate resource tracking for platform resources.
We also specify zero CPU resources for the coredns deployment.
manifests.go appears to be the main file for this, not sure if the
others are used but I changed them just in case.
Signed-off-by: Jim Gauld <james.gauld@windriver.com>
---
cluster/addons/dns/coredns/coredns.yaml.base | 2 +-
cluster/addons/dns/coredns/coredns.yaml.in | 2 +-
cluster/addons/dns/coredns/coredns.yaml.sed | 2 +-
cmd/kubeadm/app/phases/addons/dns/manifests.go | 2 +-
cmd/kubeadm/app/phases/controlplane/manifests.go | 6 +++---
5 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base
index 4ee054f8ba5..d2b58f4af0e 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.base
+++ b/cluster/addons/dns/coredns/coredns.yaml.base
@@ -138,7 +138,7 @@ spec:
limits:
memory: __DNS__MEMORY__LIMIT__
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in
index 1f791e447c9..ff03a801646 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.in
+++ b/cluster/addons/dns/coredns/coredns.yaml.in
@@ -138,7 +138,7 @@ spec:
limits:
memory: 'dns_memory_limit'
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed
index 4d64278aaa4..38fc9196b28 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.sed
+++ b/cluster/addons/dns/coredns/coredns.yaml.sed
@@ -138,7 +138,7 @@ spec:
limits:
memory: $DNS_MEMORY_LIMIT
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go
index 3ac6856bfc6..0763b4c63db 100644
--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go
+++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go
@@ -95,7 +95,7 @@ spec:
limits:
memory: 170Mi
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cmd/kubeadm/app/phases/controlplane/manifests.go b/cmd/kubeadm/app/phases/controlplane/manifests.go
index 8181bea63a4..4c4b4448dd4 100644
--- a/cmd/kubeadm/app/phases/controlplane/manifests.go
+++ b/cmd/kubeadm/app/phases/controlplane/manifests.go
@@ -60,7 +60,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", int(endpoint.BindPort), v1.URISchemeHTTPS),
ReadinessProbe: staticpodutil.ReadinessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/readyz", int(endpoint.BindPort), v1.URISchemeHTTPS),
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", int(endpoint.BindPort), v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane),
- Resources: staticpodutil.ComponentResources("250m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.GetProxyEnvVars(),
}, mounts.GetVolumes(kubeadmconstants.KubeAPIServer),
map[string]string{kubeadmconstants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey: endpoint.String()}),
@@ -72,7 +72,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeControllerManager)),
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS),
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane),
- Resources: staticpodutil.ComponentResources("200m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.GetProxyEnvVars(),
}, mounts.GetVolumes(kubeadmconstants.KubeControllerManager), nil),
kubeadmconstants.KubeScheduler: staticpodutil.ComponentPod(v1.Container{
@@ -83,7 +83,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeScheduler)),
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS),
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane),
- Resources: staticpodutil.ComponentResources("100m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.GetProxyEnvVars(),
}, mounts.GetVolumes(kubeadmconstants.KubeScheduler), nil),
}
--
2.17.1

View File

@ -0,0 +1,17 @@
# Note: This dropin only works with kubeadm and kubelet v1.11+
[Service]
Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf"
Environment="KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml"
# This is a file that "kubeadm init" and "kubeadm join" generate at runtime, populating the KUBELET_KUBEADM_ARGS variable dynamically
EnvironmentFile=-/var/lib/kubelet/kubeadm-flags.env
# This is a file that the user can use for overrides of the kubelet args as a last resort. Preferably, the user should use
# the .NodeRegistration.KubeletExtraArgs object in the configuration files instead. KUBELET_EXTRA_ARGS should be sourced from this file.
EnvironmentFile=-/etc/sysconfig/kubelet
ExecStart=
ExecStart=/usr/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_KUBEADM_ARGS $KUBELET_EXTRA_ARGS
ExecStartPre=-/usr/bin/kubelet-cgroup-setup.sh
ExecStartPost=/bin/bash -c 'echo $MAINPID > /var/run/kubelet.pid;'
ExecStopPost=/bin/rm -f /var/run/kubelet.pid
Restart=always
StartLimitInterval=0
RestartSec=10

View File

@ -0,0 +1,132 @@
#!/bin/bash
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script does minimal cgroup setup for kubelet. This creates k8s-infra
# cgroup for a minimal set of resource controllers, and configures cpuset
# attributes to span all online cpus and nodes. This will do nothing if
# the k8s-infra cgroup already exists (i.e., assume already configured).
# NOTE: Directories created under /sys/fs/cgroup are volatile and do not
# persist across reboots. The cpuset.mems and cpuset.cpus values are later
# updated by the puppet kubernetes.pp manifest.
#
# Define minimal path
PATH=/bin:/usr/bin:/usr/local/bin
# Log info message to /var/log/daemon.log
# The message is sent through logger at priority daemon.info, prefixed with
# the script name and PID.
# Arguments: the message words, joined with the first character of IFS
# (a space by default).
function LOG {
    # "$*" rather than "$@": inside a larger quoted string "$@" still splits
    # into one word per argument, handing logger several operands instead of
    # one message.
    logger -p daemon.info "$0($$): $*"
}
# Log error message to /var/log/daemon.log
# The message is sent through logger at priority daemon.error, prefixed with
# the script name, PID and "ERROR:"; -s also echoes it to standard error.
# Arguments: the message words, joined with the first character of IFS
# (a space by default).
function ERROR {
    # "$*" rather than "$@": inside a larger quoted string "$@" still splits
    # into one word per argument, handing logger several operands instead of
    # one message.
    logger -s -p daemon.error "$0($$): ERROR: $*"
}
# Create minimal cgroup directories and configure cpuset attributes if required
#
# Arguments:
#   $1 - cgroup name, created under each controller below /sys/fs/cgroup
#   $2 - nodeset written to the cgroup's cpuset.mems
#   $3 - cpuset written to the cgroup's cpuset.cpus
# Returns:
#   0 once the cpuset attributes, ownership and mode have been applied.
# Exits the whole script (does not return) in two cases:
#   - with status 0 as soon as one of the preserved controller directories
#     already exists, i.e. the cgroup is treated as already configured;
#   - with the failing command's status when mkdir, a write, chown or chmod
#     fails.
function create_cgroup {
    local cg_name="$1"
    local cg_nodeset="$2"
    local cg_cpuset="$3"

    local CGROUP=/sys/fs/cgroup
    local CONTROLLERS_AUTO_DELETED=("pids" "hugetlb")
    local CONTROLLERS_PRESERVED=("cpuset" "memory" "cpu,cpuacct" "systemd")
    local cnt=''
    local CGDIR=''
    local RC=0

    # Ensure that these cgroups are created every time as they are auto deleted
    for cnt in "${CONTROLLERS_AUTO_DELETED[@]}"; do
        CGDIR="${CGROUP}/${cnt}/${cg_name}"
        if [ -d "${CGDIR}" ]; then
            LOG "Nothing to do, already configured: ${CGDIR}."
            continue
        fi
        LOG "Creating: ${CGDIR}"
        mkdir -p "${CGDIR}"
        RC=$?
        if [ "${RC}" -ne 0 ]; then
            ERROR "Creating: ${CGDIR}, rc=${RC}"
            exit "${RC}"
        fi
    done

    # These cgroups are preserved so if any of these are encountered additional
    # cgroup setup is not required
    for cnt in "${CONTROLLERS_PRESERVED[@]}"; do
        CGDIR="${CGROUP}/${cnt}/${cg_name}"
        if [ -d "${CGDIR}" ]; then
            LOG "Nothing to do, already configured: ${CGDIR}."
            # RC is still 0 here: any earlier failure has already exited.
            exit "${RC}"
        fi
        LOG "Creating: ${CGDIR}"
        mkdir -p "${CGDIR}"
        RC=$?
        if [ "${RC}" -ne 0 ]; then
            ERROR "Creating: ${CGDIR}, rc=${RC}"
            exit "${RC}"
        fi
    done

    # Customize cpuset attributes
    LOG "Configuring cgroup: ${cg_name}, nodeset: ${cg_nodeset}, cpuset: ${cg_cpuset}"
    CGDIR="${CGROUP}/cpuset/${cg_name}"
    local CGMEMS="${CGDIR}/cpuset.mems"
    local CGCPUS="${CGDIR}/cpuset.cpus"
    local CGTASKS="${CGDIR}/tasks"

    # Assign cgroup memory nodeset
    LOG "Assign nodeset ${cg_nodeset} to ${CGMEMS}"
    /bin/echo "${cg_nodeset}" > "${CGMEMS}"
    RC=$?
    if [ "${RC}" -ne 0 ]; then
        ERROR "Unable to write to: ${CGMEMS}, rc=${RC}"
        exit "${RC}"
    fi

    # Assign cgroup cpus
    LOG "Assign cpuset ${cg_cpuset} to ${CGCPUS}"
    /bin/echo "${cg_cpuset}" > "${CGCPUS}"
    RC=$?
    if [ "${RC}" -ne 0 ]; then
        ERROR "Assigning: ${cg_cpuset} to ${CGCPUS}, rc=${RC}"
        exit "${RC}"
    fi

    # Set file ownership
    chown root:root "${CGMEMS}" "${CGCPUS}" "${CGTASKS}"
    RC=$?
    if [ "${RC}" -ne 0 ]; then
        ERROR "Setting owner for: ${CGMEMS}, ${CGCPUS}, ${CGTASKS}, rc=${RC}"
        exit "${RC}"
    fi

    # Set file mode permissions
    chmod 644 "${CGMEMS}" "${CGCPUS}" "${CGTASKS}"
    RC=$?
    if [ "${RC}" -ne 0 ]; then
        ERROR "Setting mode for: ${CGMEMS}, ${CGCPUS}, ${CGTASKS}, rc=${RC}"
        exit "${RC}"
    fi

    return "${RC}"
}
# Writing below /sys/fs/cgroup needs root; refuse to run otherwise.
if [ "${UID}" -ne 0 ]; then
    ERROR "Require sudo/root."
    exit 1
fi

# Configure default kubepods cpuset to span all online cpus and nodes.
ONLINE_NODESET=$(/bin/cat /sys/devices/system/node/online)
ONLINE_CPUSET=$(/bin/cat /sys/devices/system/cpu/online)

# Configure kubelet cgroup to match cgroupRoot.
# The arguments are quoted so that an empty value stays in its own position
# instead of vanishing and shifting the cpuset into the nodeset slot.
create_cgroup 'k8s-infra' "${ONLINE_NODESET}" "${ONLINE_CPUSET}"
exit $?

View File

@ -0,0 +1,256 @@
From 3f69868f7bca99f6875dd4d197b3a974d1b558ed Mon Sep 17 00:00:00 2001
From: Jim Gauld <james.gauld@windriver.com>
Date: Wed, 22 Sep 2021 10:09:06 -0400
Subject: [PATCH 1/7] kubelet cpumanager disable CFS quota throttling for
Guaranteed pods
This disables CFS CPU quota to avoid performance degradation due to
Linux kernel CFS quota implementation. Note that 4.18 kernel attempts
to solve the CFS throttling problem, but there are reports that it is
not completely effective.
This disables CFS quota throttling for Guaranteed pods for both
parent and container cgroups by writing -1 to cgroup cpu.cfs_quota_us.
Disabling has a dramatic latency improvement for HTTP response times.
This patch is refactored in 1.21.3 due to new internal_container_lifecycle
framework. We leverage the same mechanism to set Linux resources as:
cpu manager: specify the container CPU set during the creation
(commit 38dc7509f862f081828e7d9167107b8c6e98ea23).
Signed-off-by: Jim Gauld <james.gauld@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 7 ++++
pkg/kubelet/cm/cpumanager/fake_cpu_manager.go | 6 +++
pkg/kubelet/cm/helpers_linux.go | 10 +++++
pkg/kubelet/cm/helpers_linux_test.go | 42 ++++++++++---------
.../cm/internal_container_lifecycle_linux.go | 9 ++++
5 files changed, 54 insertions(+), 20 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 5a6e5082f15..f7b9c8d07bf 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -72,6 +72,9 @@ type Manager interface {
// State returns a read-only interface to the internal CPU manager state.
State() state.Reader
+ // GetCPUPolicy returns the assigned CPU manager policy
+ GetCPUPolicy() string
+
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
@@ -291,6 +294,10 @@ func (m *manager) State() state.Reader {
return m.state
}
+func (m *manager) GetCPUPolicy() string {
+ return m.policy.Name()
+}
+
func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded resources before providing TopologyHints
m.removeStaleState()
diff --git a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
index 2c38b52b374..1cb0ea10923 100644
--- a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
@@ -28,6 +28,7 @@ import (
)
type fakeManager struct {
+ policy Policy
state state.State
}
@@ -69,6 +70,10 @@ func (m *fakeManager) State() state.Reader {
return m.state
}
+func (m *fakeManager) GetCPUPolicy() string {
+ return m.policy.Name()
+}
+
func (m *fakeManager) GetCPUs(podUID, containerName string) cpuset.CPUSet {
klog.InfoS("GetCPUs", "podUID", podUID, "containerName", containerName)
return cpuset.CPUSet{}
@@ -82,6 +87,7 @@ func (m *fakeManager) GetAllocatableCPUs() cpuset.CPUSet {
// NewFakeManager creates empty/fake cpu manager
func NewFakeManager() Manager {
return &fakeManager{
+ policy: &nonePolicy{},
state: state.NewMemoryState(),
}
}
diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go
index aa5c37639dc..302284ef408 100644
--- a/pkg/kubelet/cm/helpers_linux.go
+++ b/pkg/kubelet/cm/helpers_linux.go
@@ -169,6 +169,16 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64)
// build the result
result := &ResourceConfig{}
if qosClass == v1.PodQOSGuaranteed {
+ // Disable CFS CPU quota to avoid performance degradation due to
+ // Linux kernel CFS throttle implementation.
+ // NOTE: 4.18 kernel attempts to solve CFS throttling problem,
+ // but there are reports that it is not completely effective.
+ // This will configure cgroup CFS parameters at pod level:
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/cpu.cfs_quota_us
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/cpu.cfs_period_us
+ cpuQuota = int64(-1)
+ cpuPeriod = uint64(100000)
+
result.CpuShares = &cpuShares
result.CpuQuota = &cpuQuota
result.CpuPeriod = &cpuPeriod
diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go
index 56d765fbc22..8c7309937dd 100644
--- a/pkg/kubelet/cm/helpers_linux_test.go
+++ b/pkg/kubelet/cm/helpers_linux_test.go
@@ -63,8 +63,9 @@ func TestResourceConfigForPod(t *testing.T) {
burstablePartialShares := MilliCPUToShares(200)
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
guaranteedShares := MilliCPUToShares(100)
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
+ guaranteedQuotaPeriod := uint64(100000)
+ guaranteedQuota := int64(-1)
+ guaranteedTunedQuota := int64(-1)
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
@@ -203,8 +204,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement": {
pod: &v1.Pod{
@@ -217,8 +218,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-with-tuned-quota": {
pod: &v1.Pod{
@@ -231,8 +232,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement-with-tuned-quota": {
pod: &v1.Pod{
@@ -245,8 +246,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
}
@@ -283,8 +284,9 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
burstablePartialShares := MilliCPUToShares(200)
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
guaranteedShares := MilliCPUToShares(100)
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
+ guaranteedQuotaPeriod := uint64(100000)
+ guaranteedQuota := int64(-1)
+ guaranteedTunedQuota := int64(-1)
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
@@ -423,8 +425,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement": {
pod: &v1.Pod{
@@ -437,8 +439,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-with-tuned-quota": {
pod: &v1.Pod{
@@ -451,8 +453,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement-with-tuned-quota": {
pod: &v1.Pod{
@@ -465,8 +467,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
}
diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
index 9cf41620b8c..fa15dbe1671 100644
--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go
+++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
@@ -19,10 +19,12 @@ limitations under the License.
package cm
import (
+ //"fmt"
"strconv"
"strings"
"k8s.io/api/core/v1"
+ v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
)
@@ -32,6 +34,13 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain
if !allocatedCPUs.IsEmpty() {
containerConfig.Linux.Resources.CpusetCpus = allocatedCPUs.String()
}
+ // Disable cgroup CFS throttle at the container level.
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_quota_us
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_period_us
+ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed {
+ containerConfig.Linux.Resources.CpuPeriod = int64(100000)
+ containerConfig.Linux.Resources.CpuQuota = int64(-1)
+ }
}
if i.memoryManager != nil {
--
2.17.1

View File

@ -0,0 +1,139 @@
From 66f4a61ce77624dcc0b520e01fccacec61dfea37 Mon Sep 17 00:00:00 2001
From: Jim Gauld <james.gauld@windriver.com>
Date: Tue, 7 Sep 2021 14:21:03 -0400
Subject: [PATCH 3/7] kubelet cpumanager infrastructure pods use system
reserved CPUs
This assigns system infrastructure pods to the "reserved" cpuset
to isolate them from the shared pool of CPUs.
Infrastructure pods include any pods that belong to the kube-system,
armada, cert-manager, vault, platform-deployment-manager, portieris,
or notification namespaces.
The implementation is a bit simplistic: it is assumed that the
"reserved" cpuset is large enough to handle all infrastructure pods
CPU allocations.
This also prevents infrastructure pods from using Guaranteed resources.
Signed-off-by: Jim Gauld <james.gauld@windriver.com>
---
pkg/kubelet/cm/cpumanager/policy_static.go | 44 +++++++++++++++++++
.../cm/cpumanager/policy_static_test.go | 19 +++++++-
2 files changed, 62 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index 94d81796132..b48a5b997a3 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -32,6 +32,11 @@ import (
// PolicyStatic is the name of the static policy
const PolicyStatic policyName = "static"
+// Define namespaces used by platform infrastructure pods
+var infraNamespaces = [...]string{
+ "kube-system", "armada", "cert-manager", "platform-deployment-manager", "portieris", "vault", "notification",
+}
+
// staticPolicy is a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
@@ -232,6 +237,31 @@ func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, c
}
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
+ // Process infra pods before guaranteed pods
+ if isKubeInfra(pod) {
+ // Container belongs in reserved pool.
+ // We don't want to fall through to the p.guaranteedCPUs() clause below so return either nil or error.
+ if _, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
+ klog.Infof("[cpumanager] static policy: reserved container already present in state, skipping " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s)",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ return nil
+ }
+
+ cpuset := p.reserved
+ if cpuset.IsEmpty() {
+ // If this happens then someone messed up.
+ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved)
+ }
+ s.SetCPUSet(string(pod.UID), container.Name, cpuset)
+ klog.Infof("[cpumanager] static policy: reserved: AddContainer " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset)
+ return nil
+ }
+
if numCPUs := p.guaranteedCPUs(pod, container); numCPUs != 0 {
klog.InfoS("Static policy: Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
// container belongs in an exclusively allocated pool
@@ -321,6 +351,10 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
return 0
}
+ // Infrastructure pods use reserved CPUs even if they're in the Guaranteed QoS class
+ if isKubeInfra(pod) {
+ return 0
+ }
// Safe downcast to do for all systems with < 2.1 billion CPUs.
// Per the language spec, `int` is guaranteed to be at least 32 bits wide.
// https://golang.org/ref/spec#Numeric_types
@@ -523,3 +557,13 @@ func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, reu
return hints
}
+
+// isKubeInfra reports whether pod is in a platform infrastructure namespace.
+func isKubeInfra(pod *v1.Pod) bool {
+	for i := range infraNamespaces {
+		if pod.Namespace == infraNamespaces[i] {
+			return true
+		}
+	}
+	return false
+}
\ No newline at end of file
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index 93a2870532a..90ad98e71bc 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -747,7 +747,8 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
}
func TestStaticPolicyAddWithResvList(t *testing.T) {
-
+ infraPod := makePod("fakePod", "fakeContainer2", "200m", "200m")
+ infraPod.Namespace = "kube-system"
testCases := []staticPolicyTestWithResvList{
{
description: "GuPodSingleCore, SingleSocketHT, ExpectError",
@@ -789,6 +790,22 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(4, 5),
},
+ {
+ description: "InfraPod, SingleSocketHT, ExpectAllocReserved",
+ topo: topoSingleSocketHT,
+ numReservedCPUs: 2,
+ reserved: cpuset.NewCPUSet(0, 1),
+ stAssignments: state.ContainerCPUAssignments{
+ "fakePod": map[string]cpuset.CPUSet{
+ "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
+ },
+ },
+ stDefaultCPUSet: cpuset.NewCPUSet(4, 5),
+ pod: infraPod,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.NewCPUSet(0, 1),
+ },
}
testExcl := true
--
2.17.1

View File

@ -0,0 +1,535 @@
From 81a6d41690c45b168034df04df0199cd5abe60e9 Mon Sep 17 00:00:00 2001
From: Jim Gauld <james.gauld@windriver.com>
Date: Wed, 8 Sep 2021 09:28:40 -0400
Subject: [PATCH 4/7] kubelet cpumanager introduce concept of isolated CPUs
This introduces the concept of "isolated CPUs", which are CPUs that
have been isolated at the kernel level via the "isolcpus" kernel boot
parameter.
When starting the kubelet process, two separate sets of reserved CPUs
may be specified. With this change CPUs reserved via
'--system-reserved=cpu' will be used for infrastructure pods while the
isolated CPUs should be reserved via '--kube-reserved=cpu' to cause
kubelet to skip over them for "normal" CPU resource tracking. The
kubelet code will double-check that the specified isolated CPUs match
what the kernel exposes in "/sys/devices/system/cpu/isolated".
A plugin (outside the scope of this commit) will expose the isolated
CPUs to kubelet via the device plugin API.
If a pod specifies some number of "isolcpus" resources, the device
manager will allocate them. In this code we check whether such
resources have been allocated, and if so we set the container cpuset to
the isolated CPUs. This does mean that it really only makes sense to
specify "isolcpus" resources for best-effort or burstable pods, not for
guaranteed ones since that would throw off the accounting code. In
order to ensure the accounting still works as designed, if "isolcpus"
are specified for guaranteed pods, the affinity will be set to the
non-isolated CPUs.
This patch was refactored in 1.21.3 due to upstream API change
node: podresources: make GetDevices() consistent
(commit ad68f9588c72d6477b5a290c548a9031063ac659).
The routine podIsolCPUs() was refactored in 1.21.3 since the API
p.deviceManager.GetDevices() is returning multiple devices with
a device per cpu. The resultant cpuset needs to be the aggregate.
Signed-off-by: Jim Gauld <james.gauld@windriver.com>
Co-authored-by: Chris Friesen <chris.friesen@windriver.com>
---
pkg/kubelet/cm/container_manager_linux.go | 1 +
pkg/kubelet/cm/cpumanager/cpu_manager.go | 31 ++++++-
pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 13 ++-
pkg/kubelet/cm/cpumanager/policy_static.go | 85 +++++++++++++++++--
.../cm/cpumanager/policy_static_test.go | 44 ++++++++--
5 files changed, 156 insertions(+), 18 deletions(-)
diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go
index e7bed73fc69..d74a6339939 100644
--- a/pkg/kubelet/cm/container_manager_linux.go
+++ b/pkg/kubelet/cm/container_manager_linux.go
@@ -337,6 +337,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
cm.GetNodeAllocatableReservation(),
nodeConfig.KubeletRootDir,
cm.topologyManager,
+ cm.deviceManager,
)
if err != nil {
klog.ErrorS(err, "Failed to initialize cpu manager")
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 5b82e5b02ac..2563f61e7b5 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -21,6 +21,8 @@ import (
"math"
"sync"
"time"
+ "strings"
+ "io/ioutil"
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
@@ -34,6 +36,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status"
)
@@ -50,6 +53,25 @@ type policyName string
// cpuManagerStateFileName is the file name where cpu manager stores its state
const cpuManagerStateFileName = "cpu_manager_state"
+// get the system-level isolated CPUs
+func getIsolcpus() cpuset.CPUSet {
+	dat, err := ioutil.ReadFile("/sys/devices/system/cpu/isolated")
+	if err != nil {
+		klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir")
+		return cpuset.NewCPUSet()
+	}
+
+	// The file content ends in a newline; strip it before parsing.
+	cpustring := strings.TrimSuffix(string(dat), "\n")
+	cset, err := cpuset.Parse(cpustring)
+	if err != nil {
+		// Report the offending string and the cause; fall back to "no isolated CPUs".
+		klog.Errorf("[cpumanager] unable to parse sysfs isolcpus string %q to cpuset: %v", cpustring, err)
+		return cpuset.NewCPUSet()
+	}
+	return cset
+}
+
// Manager interface provides methods for Kubelet to manage pod cpus.
type Manager interface {
// Start is called during Kubelet initialization.
@@ -143,7 +165,7 @@ func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool { return true }
// NewManager creates new cpu manager based on provided policy
-func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) {
+func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store, deviceManager devicemanager.Manager) (Manager, error) {
var topo *topology.CPUTopology
var policy Policy
@@ -181,8 +203,11 @@ func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo
// NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset.
// This variable is primarily to make testing easier.
excludeReserved := true
- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, excludeReserved)
-
+ // isolCPUs is the set of kernel-isolated CPUs. They should be a subset of specificCPUs or
+ // of the CPUs that NewStaticPolicy() will pick if numReservedCPUs is set. It's only in the
+ // argument list here for ease of testing, it's really internal to the policy.
+ isolCPUs := getIsolcpus()
+ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, isolCPUs, affinity, deviceManager, excludeReserved)
if err != nil {
return nil, fmt.Errorf("new static policy error: %v", err)
}
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
index a2e73de6875..4c021634d6f 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
@@ -38,6 +38,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
)
type mockState struct {
@@ -215,6 +216,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string })
}
func TestCPUManagerAdd(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
@@ -230,7 +232,9 @@ func TestCPUManagerAdd(t *testing.T) {
},
0,
cpuset.NewCPUSet(),
+ cpuset.NewCPUSet(),
topologymanager.NewFakeManager(),
+ testDM,
testExcl)
testCases := []struct {
description string
@@ -480,8 +484,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
}
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testExcl)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testExcl)
mockState := &mockState{
assignments: testCase.stAssignments,
@@ -635,7 +640,8 @@ func TestCPUManagerGenerate(t *testing.T) {
}
defer os.RemoveAll(sDir)
- mgr, err := NewManager(testCase.cpuPolicyName, 5*time.Second, machineInfo, cpuset.NewCPUSet(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager())
+ testDM, err := devicemanager.NewManagerStub()
+ mgr, err := NewManager(testCase.cpuPolicyName, 5*time.Second, machineInfo, cpuset.NewCPUSet(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM)
if testCase.expectedError != nil {
if !strings.Contains(err.Error(), testCase.expectedError.Error()) {
t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), testCase.expectedError.Error())
@@ -990,6 +996,7 @@ func TestReconcileState(t *testing.T) {
// the following tests are with --reserved-cpus configured
func TestCPUManagerAddWithResvList(t *testing.T) {
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -1004,7 +1011,9 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
},
1,
cpuset.NewCPUSet(0),
+ cpuset.NewCPUSet(),
topologymanager.NewFakeManager(),
+ testDM,
testExcl)
testCases := []struct {
description string
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index b48a5b997a3..2ad14a98911 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
+ "strconv"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
@@ -27,6 +28,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
)
// PolicyStatic is the name of the static policy
@@ -80,6 +82,10 @@ type staticPolicy struct {
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reserved cpuset.CPUSet
+ // subset of reserved CPUs with isolcpus attribute
+ isolcpus cpuset.CPUSet
+ // parent containerManager, used to get device list
+ deviceManager devicemanager.Manager
// If true, default CPUSet should exclude reserved CPUs
excludeReserved bool
// topology manager reference to get container Topology affinity
@@ -94,7 +100,7 @@ var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, excludeReserved bool) (Policy, error) {
+func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, isolCPUs cpuset.CPUSet, affinity topologymanager.Store, deviceManager devicemanager.Manager, excludeReserved bool) (Policy, error) {
allCPUs := topology.CPUDetails.CPUs()
var reserved cpuset.CPUSet
if reservedCPUs.Size() > 0 {
@@ -115,9 +121,17 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
klog.InfoS("Reserved CPUs not available for exclusive assignment", "reservedSize", reserved.Size(), "reserved", reserved)
+ if !isolCPUs.IsSubsetOf(reserved) {
+ klog.Errorf("[cpumanager] isolCPUs %v is not a subset of reserved %v", isolCPUs, reserved)
+ reserved = reserved.Union(isolCPUs)
+ klog.Warningf("[cpumanager] mismatch isolCPUs %v, force reserved %v", isolCPUs, reserved)
+ }
+
return &staticPolicy{
topology: topology,
reserved: reserved,
+ isolcpus: isolCPUs,
+ deviceManager: deviceManager,
excludeReserved: excludeReserved,
affinity: affinity,
cpusToReuse: make(map[string]cpuset.CPUSet),
@@ -154,8 +168,8 @@ func (p *staticPolicy) validateState(s state.State) error {
} else {
s.SetDefaultCPUSet(allCPUs)
}
- klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n",
- allCPUs, p.reserved, s.GetDefaultCPUSet())
+ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, isolcpus:%v, default:%v\n",
+ allCPUs, p.reserved, p.isolcpus, s.GetDefaultCPUSet())
return nil
}
@@ -248,12 +262,12 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
return nil
}
- cpuset := p.reserved
+ cpuset := p.reserved.Clone().Difference(p.isolcpus)
if cpuset.IsEmpty() {
// If this happens then someone messed up.
return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus " +
- "(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v",
- pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved)
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v, isolcpus:%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved, p.isolcpus)
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
klog.Infof("[cpumanager] static policy: reserved: AddContainer " +
@@ -284,8 +298,37 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
p.updateCPUsToReuse(pod, container, cpuset)
+ klog.Infof("[cpumanager] guaranteed: AddContainer " +
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); numCPUS=%d, cpuset=%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, numCPUs, cpuset)
+ return nil
+ }
+	if isolcpus := p.podIsolCPUs(pod, container); isolcpus.Size() > 0 {
+		// container has requested isolated CPUs
+		if set, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
+			if set.Equals(isolcpus) {
+				klog.Infof("[cpumanager] isolcpus container already present in state, skipping " +
+					"(namespace: %s, pod UID: %s, pod: %s, container: %s)",
+					pod.Namespace, string(pod.UID), pod.Name, container.Name)
+				return nil
+			}
+			// State disagrees with the device allocation: log current vs. expected, then overwrite below.
+			klog.Infof("[cpumanager] isolcpus container state has cpus %v, should be %v " +
+				"(namespace: %s, pod UID: %s, pod: %s, container: %s)",
+				set, isolcpus, pod.Namespace, string(pod.UID), pod.Name, container.Name)
+		}
+		// Note that we do not do anything about init containers here.
+		// It looks like devices are allocated per-pod based on effective requests/limits
+		// and extra devices from initContainers are not freed up when the regular containers start.
+		// TODO: confirm this is still true for 1.21
+		s.SetCPUSet(string(pod.UID), container.Name, isolcpus)
+		klog.Infof("[cpumanager] isolcpus: AddContainer " +
+			"(namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v",
+			pod.Namespace, string(pod.UID), pod.Name, container.Name, isolcpus)
+		return nil
+	}
+
// container belongs in the shared pool (nothing to do; use default cpuset)
return nil
}
@@ -566,4 +609,34 @@ func isKubeInfra(pod *v1.Pod) bool {
}
}
return false
+}
+
+// podIsolCPUs returns the isolated CPUs (if any) that the device manager has
+// allocated to the given container, aggregated into a single cpuset.
+//
+// The isolcpus device plugin reports one device per isolated CPU; each device
+// ID is parsed as a decimal CPU number. An ID that does not parse is logged
+// and skipped instead of panicking, so a misbehaving device plugin cannot
+// crash the kubelet.
+func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet {
+	// NOTE: This is required for TestStaticPolicyAdd() since makePod() does
+	// not create UID. We also need a way to properly stub devicemanager.
+	if len(pod.UID) == 0 {
+		return cpuset.NewCPUSet()
+	}
+	isolCPUs := cpuset.NewCPUSet()
+	devices := p.deviceManager.GetDevices(string(pod.UID), container.Name)
+	// This resource name needs to match the isolcpus device plugin. A missing
+	// entry simply yields an empty range.
+	for devID := range devices["windriver.com/isolcpus"] {
+		cpu, err := strconv.Atoi(devID)
+		if err != nil {
+			klog.Errorf("[cpumanager] isolcpus: ignoring invalid device ID %q " +
+				"(namespace: %s, pod UID: %s, pod: %s, container: %s): %v",
+				devID, pod.Namespace, string(pod.UID), pod.Name, container.Name, err)
+			continue
+		}
+		isolCPUs = isolCPUs.Union(cpuset.NewCPUSet(cpu))
+	}
+	return isolCPUs
+}
\ No newline at end of file
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index 90ad98e71bc..8511d5635bd 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -27,6 +27,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
)
type staticPolicyTest struct {
@@ -45,8 +46,9 @@ type staticPolicyTest struct {
}
func TestStaticPolicyName(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testExcl := false
- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testExcl)
+ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testExcl)
policyName := policy.Name()
if policyName != "static" {
@@ -56,6 +58,7 @@ func TestStaticPolicyName(t *testing.T) {
}
func TestStaticPolicyStart(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []staticPolicyTest{
{
description: "non-corrupted state",
@@ -131,7 +134,7 @@ func TestStaticPolicyStart(t *testing.T) {
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testCase.excludeReserved)
+ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testCase.excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: testCase.stAssignments,
@@ -179,6 +182,7 @@ func TestStaticPolicyAdd(t *testing.T) {
largeTopoSock0CPUSet := largeTopoSock0Builder.Result()
largeTopoSock1CPUSet := largeTopoSock1Builder.Result()
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []staticPolicyTest{
{
description: "GuPodSingleCore, SingleSocketHT, ExpectError",
@@ -447,7 +451,7 @@ func TestStaticPolicyAdd(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testCase.excludeReserved)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testCase.excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -490,6 +494,7 @@ func TestStaticPolicyAdd(t *testing.T) {
}
func TestStaticPolicyRemove(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
excludeReserved := false
testCases := []staticPolicyTest{
{
@@ -549,7 +554,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), excludeReserved)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -571,6 +576,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
func TestTopologyAwareAllocateCPUs(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
excludeReserved := false
testCases := []struct {
description string
@@ -640,7 +646,7 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
},
}
for _, tc := range testCases {
- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), excludeReserved)
+ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: tc.stAssignments,
@@ -673,6 +679,7 @@ type staticPolicyTestWithResvList struct {
topo *topology.CPUTopology
numReservedCPUs int
reserved cpuset.CPUSet
+ isolcpus cpuset.CPUSet
stAssignments state.ContainerCPUAssignments
stDefaultCPUSet cpuset.CPUSet
pod *v1.Pod
@@ -713,9 +720,10 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
},
}
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), testExcl)
+ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testDM, testExcl)
if !reflect.DeepEqual(err, testCase.expNewErr) {
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
testCase.description, testCase.expNewErr, err)
@@ -755,6 +763,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 1,
reserved: cpuset.NewCPUSet(0),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
@@ -767,6 +776,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
@@ -779,6 +789,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{
"fakePod": map[string]cpuset.CPUSet{
"fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
@@ -795,6 +806,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{
"fakePod": map[string]cpuset.CPUSet{
"fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
@@ -806,11 +818,29 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(0, 1),
},
+ {
+ description: "InfraPod, SingleSocketHT, Isolcpus, ExpectAllocReserved",
+ topo: topoSingleSocketHT,
+ numReservedCPUs: 2,
+ reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(1),
+ stAssignments: state.ContainerCPUAssignments{
+ "fakePod": map[string]cpuset.CPUSet{
+ "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
+ },
+ },
+ stDefaultCPUSet: cpuset.NewCPUSet(4, 5),
+ pod: infraPod,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.NewCPUSet(0),
+ },
}
testExcl := true
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), testExcl)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, testCase.isolcpus, topologymanager.NewFakeManager(), testDM, testExcl)
st := &mockState{
assignments: testCase.stAssignments,
--
2.17.1

View File

@ -0,0 +1,313 @@
From 38ecddc618a62bc1b73bafd530b45e9bd8cac19e Mon Sep 17 00:00:00 2001
From: Jim Gauld <james.gauld@windriver.com>
Date: Tue, 7 Sep 2021 13:22:41 -0400
Subject: [PATCH 2/7] kubelet cpumanager keep normal containers off reserved
CPUs
When starting the kubelet process, two separate sets of reserved CPUs
may be specified. With this change CPUs reserved via '--system-reserved=cpu'
or '--kube-reserved=cpu' will be ignored by kubernetes itself. A small
tweak to the default CPU affinity ensures that "normal" Kubernetes
pods won't run on the reserved CPUs.
Signed-off-by: Jim Gauld <james.gauld@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 6 +++-
pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 11 ++++--
pkg/kubelet/cm/cpumanager/policy_static.go | 29 ++++++++++++---
.../cm/cpumanager/policy_static_test.go | 35 +++++++++++++------
4 files changed, 62 insertions(+), 19 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index f7b9c8d07bf..5b82e5b02ac 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -178,7 +178,11 @@ func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo
// exclusively allocated.
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity)
+ // NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset.
+ // It is passed as a parameter primarily so that unit tests can disable the exclusion.
+ excludeReserved := true
+ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, excludeReserved)
+
if err != nil {
return nil, fmt.Errorf("new static policy error: %v", err)
}
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
index 51c6ad99251..a2e73de6875 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
@@ -215,6 +215,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string })
}
func TestCPUManagerAdd(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -229,7 +230,8 @@ func TestCPUManagerAdd(t *testing.T) {
},
0,
cpuset.NewCPUSet(),
- topologymanager.NewFakeManager())
+ topologymanager.NewFakeManager(),
+ testExcl)
testCases := []struct {
description string
updateErr error
@@ -477,8 +479,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
},
}
+ testExcl := false
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testExcl)
mockState := &mockState{
assignments: testCase.stAssignments,
@@ -986,6 +989,7 @@ func TestReconcileState(t *testing.T) {
// above test cases are without kubelet --reserved-cpus cmd option
// the following tests are with --reserved-cpus configured
func TestCPUManagerAddWithResvList(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -1000,7 +1004,8 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
},
1,
cpuset.NewCPUSet(0),
- topologymanager.NewFakeManager())
+ topologymanager.NewFakeManager(),
+ testExcl)
testCases := []struct {
description string
updateErr error
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index ec25a15a3c2..94d81796132 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -75,6 +75,8 @@ type staticPolicy struct {
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reserved cpuset.CPUSet
+ // If true, default CPUSet should exclude reserved CPUs
+ excludeReserved bool
// topology manager reference to get container Topology affinity
affinity topologymanager.Store
// set of CPUs to reuse across allocations in a pod
@@ -87,7 +89,7 @@ var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store) (Policy, error) {
+func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, excludeReserved bool) (Policy, error) {
allCPUs := topology.CPUDetails.CPUs()
var reserved cpuset.CPUSet
if reservedCPUs.Size() > 0 {
@@ -111,6 +113,7 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
return &staticPolicy{
topology: topology,
reserved: reserved,
+ excludeReserved: excludeReserved,
affinity: affinity,
cpusToReuse: make(map[string]cpuset.CPUSet),
}, nil
@@ -139,7 +142,15 @@ func (p *staticPolicy) validateState(s state.State) error {
}
// state is empty initialize
allCPUs := p.topology.CPUDetails.CPUs()
- s.SetDefaultCPUSet(allCPUs)
+ if p.excludeReserved {
+ // Exclude reserved CPUs from the default CPUSet to keep containers off them
+ // unless explicitly affined.
+ s.SetDefaultCPUSet(allCPUs.Difference(p.reserved))
+ } else {
+ s.SetDefaultCPUSet(allCPUs)
+ }
+ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n",
+ allCPUs, p.reserved, s.GetDefaultCPUSet())
return nil
}
@@ -147,9 +158,11 @@ func (p *staticPolicy) validateState(s state.State) error {
// 1. Check if the reserved cpuset is not part of default cpuset because:
// - kube/system reserved have changed (increased) - may lead to some containers not being able to start
// - user tampered with file
- if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
- p.reserved.String(), tmpDefaultCPUset.String())
+ if !p.excludeReserved {
+ if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
+ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
+ p.reserved.String(), tmpDefaultCPUset.String())
+ }
}
// 2. Check if state for static policy is consistent
@@ -178,6 +191,9 @@ func (p *staticPolicy) validateState(s state.State) error {
}
}
totalKnownCPUs = totalKnownCPUs.UnionAll(tmpCPUSets)
+ if p.excludeReserved {
+ totalKnownCPUs = totalKnownCPUs.Union(p.reserved)
+ }
if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) {
return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"",
p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String())
@@ -248,6 +264,9 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa
klog.InfoS("Static policy: RemoveContainer", "podUID", podUID, "containerName", containerName)
if toRelease, ok := s.GetCPUSet(podUID, containerName); ok {
s.Delete(podUID, containerName)
+ if p.excludeReserved {
+ toRelease = toRelease.Difference(p.reserved)
+ }
// Mutate the shared pool, adding released cpus.
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
}
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index c54997787b4..93a2870532a 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -33,6 +33,7 @@ type staticPolicyTest struct {
description string
topo *topology.CPUTopology
numReservedCPUs int
+ excludeReserved bool
podUID string
containerName string
stAssignments state.ContainerCPUAssignments
@@ -44,7 +45,8 @@ type staticPolicyTest struct {
}
func TestStaticPolicyName(t *testing.T) {
- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ testExcl := false
+ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testExcl)
policyName := policy.Name()
if policyName != "static" {
@@ -74,6 +76,15 @@ func TestStaticPolicyStart(t *testing.T) {
stDefaultCPUSet: cpuset.NewCPUSet(),
expCSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
},
+ {
+ description: "empty cpuset exclude reserved",
+ topo: topoDualSocketHT,
+ numReservedCPUs: 2,
+ excludeReserved: true,
+ stAssignments: state.ContainerCPUAssignments{},
+ stDefaultCPUSet: cpuset.NewCPUSet(),
+ expCSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 7, 8, 9, 10, 11),
+ },
{
description: "reserved cores 0 & 6 are not present in available cpuset",
topo: topoDualSocketHT,
@@ -120,7 +131,7 @@ func TestStaticPolicyStart(t *testing.T) {
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testCase.excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: testCase.stAssignments,
@@ -436,7 +447,7 @@ func TestStaticPolicyAdd(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), testCase.excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -479,6 +490,7 @@ func TestStaticPolicyAdd(t *testing.T) {
}
func TestStaticPolicyRemove(t *testing.T) {
+ excludeReserved := false
testCases := []staticPolicyTest{
{
description: "SingleSocketHT, DeAllocOneContainer",
@@ -537,7 +549,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -559,6 +571,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
func TestTopologyAwareAllocateCPUs(t *testing.T) {
+ excludeReserved := false
testCases := []struct {
description string
topo *topology.CPUTopology
@@ -627,7 +640,7 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
},
}
for _, tc := range testCases {
- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
+ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: tc.stAssignments,
@@ -699,9 +712,10 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"),
},
}
+ testExcl := false
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager())
+ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), testExcl)
if !reflect.DeepEqual(err, testCase.expNewErr) {
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
testCase.description, testCase.expNewErr, err)
@@ -741,7 +755,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
numReservedCPUs: 1,
reserved: cpuset.NewCPUSet(0),
stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
+ stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
expErr: fmt.Errorf("not enough cpus available to satisfy request"),
expCPUAlloc: false,
@@ -753,7 +767,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
+ stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
expErr: nil,
expCPUAlloc: true,
@@ -769,7 +783,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
"fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
},
},
- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 4, 5),
+ stDefaultCPUSet: cpuset.NewCPUSet(4, 5),
pod: makePod("fakePod", "fakeContainer3", "2000m", "2000m"),
expErr: nil,
expCPUAlloc: true,
@@ -777,8 +791,9 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
},
}
+ testExcl := true
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager())
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), testExcl)
st := &mockState{
assignments: testCase.stAssignments,
--
2.17.1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff