From bc28897dc55f9212b433c0f79fa1df363976c7c8 Mon Sep 17 00:00:00 2001 From: Saba Touheed Mujawar Date: Wed, 6 Dec 2023 05:10:27 -0500 Subject: [PATCH] Add kubernetes 1.28.4 patches This change ports the following kubernetes 1.28.4 patches which were refactored slightly to allow for upstream changes The following patches were applied cleanly: kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch kubelet-cpumanager-disable-CFS-quota-throttling.patch kubelet-cpumanager-keep-normal-containers-off-reserv.patch kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch Affinity-of-guaranteed-pod-to-non-isolated-CPUs.patch kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch The following patches were refactored: kubeadm-create-platform-pods-with-zero-CPU-resources.patch kubernetes-make-isolcpus-allocation-SMT-aware.patch kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch enable-support-for-kubernetes-to-ignore-isolcpus.patch Note: Revert-use-subpath-for-coredns-only-for-default-repo.patch is removed as this change that updates the dns imageRepository is taken care in ansible playbook https://review.opendev.org/c/starlingx/ansible-playbooks/+/903499/1/playbookconfig/src/playbooks/roles/common/files/kubeadm.yaml.j2 Test Plan: PASS: Kubernetes package 1.28.4 builds properly. PASS: Run all Kubelet, kubeadm, kubectl make tests for affected code. PASS: build-iso successful with multiple kubernetes versions PASS: Install iso with k8s 1.28 default and test all patches. Story: 2010878 Task: 49209 Change-Id: I7693ad2fcc93d146eeae882d44f83b60589565db Signed-off-by: Saba Touheed Mujawar --- ...-guaranteed-pod-to-non-isolated-CPUs.patch | 34 + ...rt-for-kubernetes-to-ignore-isolcpus.patch | 74 ++ ...latform-pods-with-zero-CPU-resources.patch | 109 +++ ...-throttling-for-non-integer-cpulimit.patch | 30 + ...manager-disable-CFS-quota-throttling.patch | 256 ++++++ ...er-infra-pods-use-system-reserved-CP.patch | 167 ++++ ...er-introduce-concept-of-isolated-CPU.patch | 755 ++++++++++++++++++ ...er-keep-normal-containers-off-reserv.patch | 357 +++++++++ ...isolcpus-allocation-when-SMT-enabled.patch | 50 ++ ...s-make-isolcpus-allocation-SMT-aware.patch | 148 ++++ .../debian/deb_folder/patches/series | 10 + 11 files changed, 1990 insertions(+) create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/Affinity-of-guaranteed-pod-to-non-isolated-CPUs.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/enable-support-for-kubernetes-to-ignore-isolcpus.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch create mode 100644 kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/series diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/Affinity-of-guaranteed-pod-to-non-isolated-CPUs.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/Affinity-of-guaranteed-pod-to-non-isolated-CPUs.patch new file mode 100644 index 000000000..6d3f026fe --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/Affinity-of-guaranteed-pod-to-non-isolated-CPUs.patch @@ -0,0 +1,34 @@ +From 904157cdf02bf9dd0d90af4372ceadd16717e806 Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Mon, 6 Nov 2023 02:22:16 -0500 +Subject: [PATCH] Affinity of guaranteed pod to non-isolated CPUs + +This corrects kubelet cpumanager static cpuset tracking for isolcpus +for versions 1.26.1 and 1.27.5. This ensures that pods specified with +isolcpus + guaranteed QoS + integer cpu requests, are affined to +exclusive cpuset and tracked as non-isolated cpus. + +Signed-off-by: Boovan Rajendran +--- + pkg/kubelet/cm/cpumanager/policy_static.go | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index e9a2defd848..c76a6edbc20 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -343,7 +343,10 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai + return nil + } + +- if isolcpus := p.podIsolCPUs(pod, container); isolcpus.Size() > 0 { ++ cpuQuantity := container.Resources.Requests[v1.ResourceCPU] ++ fractionalCpuQuantity := cpuQuantity.MilliValue() % 1000 ++ if isolcpus := p.podIsolCPUs(pod, container); isolcpus.Size() > 0 && ++ v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed && fractionalCpuQuantity == 0{ + // container has requested isolated CPUs + if set, ok := s.GetCPUSet(string(pod.UID), container.Name); ok { + if set.Equals(isolcpus) { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/enable-support-for-kubernetes-to-ignore-isolcpus.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/enable-support-for-kubernetes-to-ignore-isolcpus.patch new file mode 100644 index 000000000..ae1e13f77 --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/enable-support-for-kubernetes-to-ignore-isolcpus.patch @@ -0,0 +1,74 @@ +From 4714c7f51561af27f7aec64efe8dc128ee0fb8bc Mon Sep 17 00:00:00 2001 +From: Saba Touheed Mujawar +Date: Tue, 12 Dec 2023 08:57:04 -0500 +Subject: [PATCH] enable support for kubernetes to ignore isolcpus + +The normal mechanisms for allocating isolated CPUs do not allow +a mix of isolated and exclusive CPUs in the same container. In +order to allow this in *very* limited cases where the pod spec +is known in advance we will add the ability to disable the normal +isolcpus behaviour. + +If the file "/etc/kubernetes/ignore_isolcpus" exists, then kubelet +will basically forget everything it knows about isolcpus and just +treat them like regular CPUs. + +The admin user can then rely on the fact that CPU allocation is +deterministic to ensure that the isolcpus they configure end up being +allocated to the correct pods. + +Signed-off-by: Daniel Safta +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Boovan Rajendran +Signed-off-by: Saba Touheed Mujawar +--- + pkg/kubelet/cm/cpumanager/cpu_manager.go | 7 +++++++ + pkg/kubelet/cm/cpumanager/policy_static.go | 7 +++++++ + 2 files changed, 14 insertions(+) + +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index 396d9358a04..ebd6e2917ab 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -56,6 +56,13 @@ const cpuManagerStateFileName = "cpu_manager_state" + + // get the system-level isolated CPUs + func getIsolcpus() cpuset.CPUSet { ++ // This is a gross hack to basically turn off awareness of isolcpus to enable ++ // isolated cpus to be allocated to pods the same way as non-isolated CPUs. ++ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil { ++ klog.Infof("[cpumanager] turning off isolcpus awareness") ++ return cpuset.New() ++ } ++ + dat, err := os.ReadFile("/sys/devices/system/cpu/isolated") + if err != nil { + klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir") +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index c76a6edbc20..9425f5b27fb 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -18,6 +18,7 @@ package cpumanager + + import ( + "fmt" ++ "os" + "strconv" + + v1 "k8s.io/api/core/v1" +@@ -755,6 +756,12 @@ func isKubeInfra(pod *v1.Pod) bool { + + // get the isolated CPUs (if any) from the devices associated with a specific container + func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet { ++ // This is a gross hack to basically turn off awareness of isolcpus to enable ++ // isolated cpus to be allocated to pods the same way as non-isolated CPUs. ++ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil { ++ return cpuset.New() ++ } ++ + // NOTE: This is required for TestStaticPolicyAdd() since makePod() does + // not create UID. We also need a way to properly stub devicemanager. + if len(string(pod.UID)) == 0 { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch new file mode 100644 index 000000000..193b6e303 --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch @@ -0,0 +1,109 @@ +From 570a2290ff1a1ef4a0c93fd0ebca0aba7054f394 Mon Sep 17 00:00:00 2001 +From: Saba Touheed Mujawar +Date: Tue, 28 Nov 2023 09:16:45 -0500 +Subject: [PATCH] kubeadm: create platform pods with zero CPU resources + +We want to specify zero CPU resources when creating the manifests +for the static platform pods, as a workaround for the lack of +separate resource tracking for platform resources. + +We also specify zero CPU resources for the coredns deployment. +manifests.go appears to be the main file for this, not sure if the +others are used but I changed them just in case. + +Signed-off-by: Daniel Safta +Signed-off-by: Saba Touheed Mujawar +--- + cluster/addons/dns/coredns/coredns.yaml.base | 2 +- + cluster/addons/dns/coredns/coredns.yaml.in | 2 +- + cluster/addons/dns/coredns/coredns.yaml.sed | 2 +- + cmd/kubeadm/app/phases/addons/dns/manifests.go | 2 +- + cmd/kubeadm/app/phases/controlplane/manifests.go | 6 +++--- + 5 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base +index 69c0f456591..29e181e186b 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.base ++++ b/cluster/addons/dns/coredns/coredns.yaml.base +@@ -139,7 +139,7 @@ spec: + limits: + memory: __DNS__MEMORY__LIMIT__ + requests: +- cpu: 100m ++ cpu: 0 + memory: 70Mi + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: +diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in +index 98edc4e1a54..d01df09835c 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.in ++++ b/cluster/addons/dns/coredns/coredns.yaml.in +@@ -139,7 +139,7 @@ spec: + limits: + memory: 'dns_memory_limit' + requests: +- cpu: 100m ++ cpu: 0 + memory: 70Mi + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: +diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed +index 021b35d0ba4..04971150617 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.sed ++++ b/cluster/addons/dns/coredns/coredns.yaml.sed +@@ -139,7 +139,7 @@ spec: + limits: + memory: $DNS_MEMORY_LIMIT + requests: +- cpu: 100m ++ cpu: 0 + memory: 70Mi + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: +diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go +index 931897b16e2..e6f92f31127 100644 +--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go ++++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go +@@ -104,7 +104,7 @@ spec: + limits: + memory: 170Mi + requests: +- cpu: 100m ++ cpu: 0 + memory: 70Mi + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: +diff --git a/cmd/kubeadm/app/phases/controlplane/manifests.go b/cmd/kubeadm/app/phases/controlplane/manifests.go +index baa8ab6a965..e2c469a6e2f 100644 +--- a/cmd/kubeadm/app/phases/controlplane/manifests.go ++++ b/cmd/kubeadm/app/phases/controlplane/manifests.go +@@ -66,7 +66,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap + LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", int(endpoint.BindPort), v1.URISchemeHTTPS), + ReadinessProbe: staticpodutil.ReadinessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/readyz", int(endpoint.BindPort), v1.URISchemeHTTPS), + StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", int(endpoint.BindPort), v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane), +- Resources: staticpodutil.ComponentResources("250m"), ++ Resources: staticpodutil.ComponentResources("0"), + Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.APIServer.ExtraEnvs), + }, mounts.GetVolumes(kubeadmconstants.KubeAPIServer), + map[string]string{kubeadmconstants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey: endpoint.String()}), +@@ -78,7 +78,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap + VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeControllerManager)), + LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS), + StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane), +- Resources: staticpodutil.ComponentResources("200m"), ++ Resources: staticpodutil.ComponentResources("0"), + Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.ControllerManager.ExtraEnvs), + }, mounts.GetVolumes(kubeadmconstants.KubeControllerManager), nil), + kubeadmconstants.KubeScheduler: staticpodutil.ComponentPod(v1.Container{ +@@ -89,7 +89,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap + VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeScheduler)), + LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS), + StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane), +- Resources: staticpodutil.ComponentResources("100m"), ++ Resources: staticpodutil.ComponentResources("0"), + Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.Scheduler.ExtraEnvs), + }, mounts.GetVolumes(kubeadmconstants.KubeScheduler), nil), + } +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch new file mode 100644 index 000000000..fe3d60754 --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch @@ -0,0 +1,30 @@ +From e2055fb98c3773562b45909f0dcbc32216f08f11 Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Mon, 4 Sep 2023 08:05:29 -0400 +Subject: [PATCH] kubelet CFS quota throttling for non integer cpulimit + +Signed-off-by: Boovan Rajendran +--- + pkg/kubelet/cm/internal_container_lifecycle_linux.go | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +index a99d01f8884..e5e25cd56de 100644 +--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go ++++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +@@ -39,7 +39,11 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain + // Disable cgroup CFS throttle at the container level. + // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_quota_us + // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_period_us +- if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed { ++ // We can only set CpuQuota to -1 if we're allocating the entire CPU. ++ // For fractional CPUs the CpuQuota is needed to enforce the limit. ++ cpuQuantity := container.Resources.Requests[v1.ResourceCPU] ++ fractionalCpuQuantity := cpuQuantity.MilliValue()%1000 ++ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed && fractionalCpuQuantity == 0 { + containerConfig.Linux.Resources.CpuPeriod = int64(100000) + containerConfig.Linux.Resources.CpuQuota = int64(-1) + } +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch new file mode 100644 index 000000000..28c81d5ef --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch @@ -0,0 +1,256 @@ +From 752e3e88d162aada55282aea7544e458e610c947 Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Wed, 30 Aug 2023 06:01:30 -0400 +Subject: [PATCH] kubelet cpumanager disable CFS quota throttling + +This disables CFS CPU quota to avoid performance degradation due to +Linux kernel CFS quota implementation. Note that 4.18 kernel attempts +to solve the CFS throttling problem, but there are reports that it is +not completely effective. + +This disables CFS quota throttling for Guaranteed pods for both +parent and container cgroups by writing -1 to cgroup cpu.cfs_quota_us. +Disabling has a dramatic latency improvement for HTTP response times. + +This patch is refactored in 1.22.5 due to new internal_container_lifecycle +framework. We leverage the same mechanism to set Linux resources as: +cpu manager: specify the container CPU set during the creation + +Co-authored-by: Jim Gauld +Signed-off-by: Sachin Gopala Krishna +Signed-off-by: Boovan Rajendran +--- + pkg/kubelet/cm/cpumanager/cpu_manager.go | 7 +++ + pkg/kubelet/cm/cpumanager/fake_cpu_manager.go | 10 ++++- + pkg/kubelet/cm/helpers_linux.go | 10 +++++ + pkg/kubelet/cm/helpers_linux_test.go | 43 ++++++++++--------- + .../cm/internal_container_lifecycle_linux.go | 9 ++++ + 5 files changed, 57 insertions(+), 22 deletions(-) + +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index 8b5049d7d74..1d8901f4b36 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -73,6 +73,9 @@ type Manager interface { + // State returns a read-only interface to the internal CPU manager state. + State() state.Reader + ++ // GetCPUPolicy returns the assigned CPU manager policy ++ GetCPUPolicy() string ++ + // GetTopologyHints implements the topologymanager.HintProvider Interface + // and is consulted to achieve NUMA aware resource alignment among this + // and other resource controllers. +@@ -315,6 +318,10 @@ func (m *manager) State() state.Reader { + return m.state + } + ++func (m *manager) GetCPUPolicy() string { ++ return m.policy.Name() ++} ++ + func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint { + // The pod is during the admission phase. We need to save the pod to avoid it + // being cleaned before the admission ended +diff --git a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go +index 4a03f3dd23f..b36fb0da3b4 100644 +--- a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go +@@ -28,7 +28,8 @@ import ( + ) + + type fakeManager struct { +- state state.State ++ policy Policy ++ state state.State + } + + func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error { +@@ -70,6 +71,10 @@ func (m *fakeManager) State() state.Reader { + return m.state + } + ++func (m *fakeManager) GetCPUPolicy() string { ++ return m.policy.Name() ++} ++ + func (m *fakeManager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet { + klog.InfoS("GetExclusiveCPUs", "podUID", podUID, "containerName", containerName) + return cpuset.CPUSet{} +@@ -88,6 +93,7 @@ func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet + // NewFakeManager creates empty/fake cpu manager + func NewFakeManager() Manager { + return &fakeManager{ +- state: state.NewMemoryState(), ++ policy: &nonePolicy{}, ++ state: state.NewMemoryState(), + } + } +diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go +index 8a144e7a73c..008b955ee98 100644 +--- a/pkg/kubelet/cm/helpers_linux.go ++++ b/pkg/kubelet/cm/helpers_linux.go +@@ -170,6 +170,16 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, + // build the result + result := &ResourceConfig{} + if qosClass == v1.PodQOSGuaranteed { ++ // Disable CFS CPU quota to avoid performance degradation due to ++ // Linux kernel CFS throttle implementation. ++ // NOTE: 4.18 kernel attempts to solve CFS throttling problem, ++ // but there are reports that it is not completely effective. ++ // This will configure cgroup CFS parameters at pod level: ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods//cpu.cfs_quota_us ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods//cpu.cfs_period_us ++ cpuQuota = int64(-1) ++ cpuPeriod = uint64(100000) ++ + result.CPUShares = &cpuShares + result.CPUQuota = &cpuQuota + result.CPUPeriod = &cpuPeriod +diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go +index fba41fd49be..60609394659 100644 +--- a/pkg/kubelet/cm/helpers_linux_test.go ++++ b/pkg/kubelet/cm/helpers_linux_test.go +@@ -64,8 +64,9 @@ func TestResourceConfigForPod(t *testing.T) { + burstablePartialShares := MilliCPUToShares(200) + burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod)) + guaranteedShares := MilliCPUToShares(100) +- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod)) +- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod)) ++ guaranteedQuotaPeriod := uint64(100000) ++ guaranteedQuota := int64(-1) ++ guaranteedTunedQuota := int64(-1) + memoryQuantity = resource.MustParse("100Mi") + cpuNoLimit := int64(-1) + guaranteedMemory := memoryQuantity.Value() +@@ -204,8 +205,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement": { + pod: &v1.Pod{ +@@ -218,8 +219,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-with-tuned-quota": { + pod: &v1.Pod{ +@@ -232,8 +233,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement-with-tuned-quota": { + pod: &v1.Pod{ +@@ -246,8 +247,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "burstable-partial-limits-with-init-containers": { + pod: &v1.Pod{ +@@ -309,8 +310,10 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + burstablePartialShares := MilliCPUToShares(200) + burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod)) + guaranteedShares := MilliCPUToShares(100) +- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod)) +- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod)) ++ guaranteedQuotaPeriod := uint64(100000) ++ guaranteedQuota := int64(-1) ++ guaranteedTunedQuota := int64(-1) ++ + memoryQuantity = resource.MustParse("100Mi") + cpuNoLimit := int64(-1) + guaranteedMemory := memoryQuantity.Value() +@@ -449,8 +452,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement": { + pod: &v1.Pod{ +@@ -463,8 +466,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-with-tuned-quota": { + pod: &v1.Pod{ +@@ -477,8 +480,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement-with-tuned-quota": { + pod: &v1.Pod{ +@@ -491,8 +494,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + } + +diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +index cb7c0cfa543..a99d01f8884 100644 +--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go ++++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +@@ -25,6 +25,7 @@ import ( + + "k8s.io/api/core/v1" + runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" ++ v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" + ) + + func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error { +@@ -35,6 +36,14 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain + } + } + ++ // Disable cgroup CFS throttle at the container level. ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_quota_us ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_period_us ++ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed { ++ containerConfig.Linux.Resources.CpuPeriod = int64(100000) ++ containerConfig.Linux.Resources.CpuQuota = int64(-1) ++ } ++ + if i.memoryManager != nil { + numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container) + if numaNodes.Len() > 0 { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch new file mode 100644 index 000000000..760364e5b --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch @@ -0,0 +1,167 @@ +From 000e637a3298cf488f3d9dc144ab835ce7e93068 Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Tue, 5 Sep 2023 06:27:39 -0400 +Subject: [PATCH] kubelet cpumanager infra pods use system reserved CPUs + +This assigns system infrastructure pods to the "reserved" cpuset +to isolate them from the shared pool of CPUs. + +Infrastructure pods include any pods that belong to the kube-system, +armada, cert-manager, vault, platform-deployment-manager, portieris, +notification, flux-helm or metrics-server namespaces. + +The implementation is a bit simplistic, it is assumed that the +"reserved" cpuset is large enough to handle all infrastructure pods +CPU allocations. + +This also prevents infrastucture pods from using Guaranteed resources. + +Co-authored-by: Jim Gauld +Signed-off-by: Gleb Aronsky +Signed-off-by: Thiago Miranda +Signed-off-by: Kaustubh Dhokte +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Sachin Gopala Krishna +Signed-off-by: Boovan Rajendran +--- + pkg/kubelet/cm/cpumanager/policy_static.go | 50 ++++++++++++++++--- + .../cm/cpumanager/policy_static_test.go | 19 ++++++- + 2 files changed, 62 insertions(+), 7 deletions(-) + +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index 895c707600d..9b7545c2207 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -62,6 +62,11 @@ func (e SMTAlignmentError) Type() string { + return ErrorSMTAlignment + } + ++// Define namespaces used by platform infrastructure pods ++var infraNamespaces = [...]string{ ++ "kube-system", "armada", "cert-manager", "platform-deployment-manager", "portieris", "vault", "notification", "flux-helm", "metrics-server", ++} ++ + // staticPolicy is a CPU manager policy that does not change CPU + // assignments for exclusively pinned guaranteed containers after the main + // container process starts. +@@ -140,11 +145,11 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + klog.InfoS("Static policy created with configuration", "options", opts) + + policy := &staticPolicy{ +- topology: topology, +- affinity: affinity, ++ topology: topology, ++ affinity: affinity, + excludeReserved: excludeReserved, +- cpusToReuse: make(map[string]cpuset.CPUSet), +- options: opts, ++ cpusToReuse: make(map[string]cpuset.CPUSet), ++ options: opts, + } + + allCPUs := topology.CPUDetails.CPUs() +@@ -222,8 +227,8 @@ func (p *staticPolicy) validateState(s state.State) error { + // - user tampered with file + if !p.excludeReserved { + if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) { +- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", +- p.reservedCPUs.String(), tmpDefaultCPUset.String()) ++ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", ++ p.reservedCPUs.String(), tmpDefaultCPUset.String()) + } + } + // 2. Check if state for static policy is consistent +@@ -302,6 +307,25 @@ func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, c + } + + func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) { ++ // Process infra pods before guaranteed pods ++ if isKubeInfra(pod) { ++ // Container belongs in reserved pool. ++ // We don't want to fall through to the p.guaranteedCPUs() clause below so return either nil or error. ++ if _, ok := s.GetCPUSet(string(pod.UID), container.Name); ok { ++ klog.Infof("[cpumanager] static policy: reserved container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)", pod.Namespace, string(pod.UID), pod.Name, container.Name) ++ return nil ++ } ++ ++ cpuset := p.reservedCPUs ++ if cpuset.IsEmpty() { ++ // If this happens then someone messed up. ++ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reservedCPUs) ++ } ++ s.SetCPUSet(string(pod.UID), container.Name, cpuset) ++ klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset) ++ return nil ++ } ++ + numCPUs := p.guaranteedCPUs(pod, container) + if numCPUs == 0 { + // container belongs in the shared pool (nothing to do; use default cpuset) +@@ -453,6 +477,10 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int + if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() { + return 0 + } ++ // Infrastructure pods use reserved CPUs even if they're in the Guaranteed QoS class ++ if isKubeInfra(pod) { ++ return 0 ++ } + // Safe downcast to do for all systems with < 2.1 billion CPUs. + // Per the language spec, `int` is guaranteed to be at least 32 bits wide. + // https://golang.org/ref/spec#Numeric_types +@@ -671,6 +699,16 @@ func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, reu + return hints + } + ++// check if a given pod is in a platform infrastructure namespace ++func isKubeInfra(pod *v1.Pod) bool { ++ for _, namespace := range infraNamespaces { ++ if namespace == pod.Namespace { ++ return true ++ } ++ } ++ return false ++} ++ + // isHintSocketAligned function return true if numa nodes in hint are socket aligned. + func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool { + numaNodesBitMask := hint.NUMANodeAffinity.GetBits() +diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go +index deb2f4c7982..b864c6c57c6 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go +@@ -988,7 +988,8 @@ func TestStaticPolicyStartWithResvList(t *testing.T) { + } + + func TestStaticPolicyAddWithResvList(t *testing.T) { +- ++ infraPod := makePod("fakePod", "fakeContainer2", "200m", "200m") ++ infraPod.Namespace = "kube-system" + testCases := []staticPolicyTestWithResvList{ + { + description: "GuPodSingleCore, SingleSocketHT, ExpectError", +@@ -1030,6 +1031,22 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + expCPUAlloc: true, + expCSet: cpuset.New(4, 5), + }, ++ { ++ description: "InfraPod, SingleSocketHT, ExpectAllocReserved", ++ topo: topoSingleSocketHT, ++ numReservedCPUs: 2, ++ reserved: cpuset.New(0, 1), ++ stAssignments: state.ContainerCPUAssignments{ ++ "fakePod": map[string]cpuset.CPUSet{ ++ "fakeContainer100": cpuset.New(2, 3, 6, 7), ++ }, ++ }, ++ stDefaultCPUSet: cpuset.New(4, 5), ++ pod: infraPod, ++ expErr: nil, ++ expCPUAlloc: true, ++ expCSet: cpuset.New(0, 1), ++ }, + } + + testExcl := true +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch new file mode 100644 index 000000000..675d624dd --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch @@ -0,0 +1,755 @@ +From 856dfbe0960418618262998ebce65d0ae070c1bb Mon Sep 17 00:00:00 2001 +From: Saba Touheed Mujawar +Date: Fri, 1 Dec 2023 07:42:14 -0500 +Subject: [PATCH] kubelet cpumanager introduce concept of isolated CPUs + +This introduces the concept of "isolated CPUs", which are CPUs that +have been isolated at the kernel level via the "isolcpus" kernel boot +parameter. + +When starting the kubelet process, two separate sets of reserved CPUs +may be specified. With this change CPUs reserved via +'--system-reserved=cpu' will be used for infrastructure pods while the +isolated CPUs should be reserved via '--kube-reserved=cpu' to cause +kubelet to skip over them for "normal" CPU resource tracking. The +kubelet code will double-check that the specified isolated CPUs match +what the kernel exposes in "/sys/devices/system/cpu/isolated". + +A plugin (outside the scope of this commit) will expose the isolated +CPUs to kubelet via the device plugin API. + +If a pod specifies some number of "isolcpus" resources, the device +manager will allocate them. In this code we check whether such +resources have been allocated, and if so we set the container cpuset to +the isolated CPUs. This does mean that it really only makes sense to +specify "isolcpus" resources for best-effort or burstable pods, not for +guaranteed ones since that would throw off the accounting code. In +order to ensure the accounting still works as designed, if "isolcpus" +are specified for guaranteed pods, the affinity will be set to the +non-isolated CPUs. + +This patch was refactored in 1.21.3 due to upstream API change +node: podresources: make GetDevices() consistent +(commit ad68f9588c72d6477b5a290c548a9031063ac659). + +The routine podIsolCPUs() was refactored in 1.21.3 since the API +p.deviceManager.GetDevices() is returning multiple devices with +a device per cpu. The resultant cpuset needs to be the aggregate. + +The routine NewStaticPolicy was refactored in 1.22.5, adding a new argument +in its signature: cpuPolicyOptions map[string]string. This change is implies +shifting the new arguments(deviceManager, excludeReserved) with one position +to the right. + +Co-authored-by: Jim Gauld +Co-authored-by: Chris Friesen +Signed-off-by: Gleb Aronsky +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Sachin Gopala Krishna +Signed-off-by: Boovan Rajendran +Signed-off-by: Saba Touheed Mujawar +--- + pkg/kubelet/cm/container_manager_linux.go | 1 + + pkg/kubelet/cm/cpumanager/cpu_manager.go | 35 +++++- + pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 23 +++- + pkg/kubelet/cm/cpumanager/policy_static.go | 83 ++++++++++++- + .../cm/cpumanager/policy_static_test.go | 53 +++++++-- + pkg/kubelet/cm/devicemanager/manager_stub.go | 110 ++++++++++++++++++ + 6 files changed, 284 insertions(+), 21 deletions(-) + create mode 100644 pkg/kubelet/cm/devicemanager/manager_stub.go + +diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go +index 18001ed61f6..927310f0d92 100644 +--- a/pkg/kubelet/cm/container_manager_linux.go ++++ b/pkg/kubelet/cm/container_manager_linux.go +@@ -324,6 +324,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I + cm.GetNodeAllocatableReservation(), + nodeConfig.KubeletRootDir, + cm.topologyManager, ++ cm.deviceManager, + ) + if err != nil { + klog.ErrorS(err, "Failed to initialize cpu manager") +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index 29458efb05c..396d9358a04 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -19,7 +19,9 @@ package cpumanager + import ( + "context" + "fmt" ++ "os" + "math" ++ "strings" + "sync" + "time" + +@@ -32,6 +34,7 @@ import ( + "k8s.io/kubernetes/pkg/kubelet/cm/containermap" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/config" + kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" +@@ -51,6 +54,25 @@ type policyName string + // cpuManagerStateFileName is the file name where cpu manager stores its state + const cpuManagerStateFileName = "cpu_manager_state" + ++// get the system-level isolated CPUs ++func getIsolcpus() cpuset.CPUSet { ++ dat, err := os.ReadFile("/sys/devices/system/cpu/isolated") ++ if err != nil { ++ klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir") ++ return cpuset.New() ++ } ++ ++ // The isolated cpus string ends in a newline ++ cpustring := strings.TrimSuffix(string(dat), "\n") ++ cset, err := cpuset.Parse(cpustring) ++ if err != nil { ++ klog.Errorf("[cpumanager] unable to parse sysfs isolcpus string to cpuset") ++ return cpuset.New() ++ } ++ ++ return cset ++} ++ + // Manager interface provides methods for Kubelet to manage pod cpus. + type Manager interface { + // Start is called during Kubelet initialization. +@@ -154,7 +176,8 @@ func (s *sourcesReadyStub) AddSource(source string) {} + func (s *sourcesReadyStub) AllReady() bool { return true } + + // NewManager creates new cpu manager based on provided policy +-func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) { ++func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store, deviceManager devicemanager.Manager) (Manager, error) { ++ + var topo *topology.CPUTopology + var policy Policy + var err error +@@ -195,7 +218,15 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc + // NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset. + // This variable is primarily to make testing easier. + excludeReserved := true +- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved) ++ ++ // isolCPUs is the set of kernel-isolated CPUs. They should be a subset of specificCPUs or ++ // of the CPUs that NewStaticPolicy() will pick if numReservedCPUs is set. It's only in the ++ // argument list here for ease of testing, it's really internal to the policy. ++ isolCPUs := getIsolcpus() ++ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, isolCPUs, affinity, cpuPolicyOptions, deviceManager, excludeReserved) ++ if err != nil { ++ return nil, fmt.Errorf("new static policy error: %v", err) ++ } + + if err != nil { + return nil, fmt.Errorf("new static policy error: %w", err) +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +index daecd35f67b..2298cc037fe 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +@@ -36,6 +36,7 @@ import ( + "k8s.io/kubernetes/pkg/kubelet/cm/containermap" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/utils/cpuset" + ) +@@ -215,6 +216,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string }) + } + + func TestCPUManagerAdd(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() + testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ +@@ -230,8 +232,10 @@ func TestCPUManagerAdd(t *testing.T) { + }, + 0, + cpuset.New(), ++ cpuset.New(), + topologymanager.NewFakeManager(), + nil, ++ testDM, + testExcl) + testCases := []struct { + description string +@@ -482,8 +486,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) { + } + + testExcl := false ++ testDM, _ := devicemanager.NewManagerStub() + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testExcl) + + mockState := &mockState{ + assignments: testCase.stAssignments, +@@ -638,7 +643,9 @@ func TestCPUManagerGenerate(t *testing.T) { + } + defer os.RemoveAll(sDir) + +- mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.New(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager()) ++ testDM, err := devicemanager.NewManagerStub() ++ mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.New(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM) ++ + if testCase.expectedError != nil { + if !strings.Contains(err.Error(), testCase.expectedError.Error()) { + t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), testCase.expectedError.Error()) +@@ -709,6 +716,7 @@ func TestCPUManagerRemove(t *testing.T) { + + func TestReconcileState(t *testing.T) { + testExcl := false ++ testDM, _ := devicemanager.NewManagerStub() + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 8, +@@ -727,8 +735,10 @@ func TestReconcileState(t *testing.T) { + }, + 0, + cpuset.New(), ++ cpuset.New(), + topologymanager.NewFakeManager(), + nil, ++ testDM, + testExcl) + + testCases := []struct { +@@ -1234,6 +1244,7 @@ func TestReconcileState(t *testing.T) { + // the following tests are with --reserved-cpus configured + func TestCPUManagerAddWithResvList(t *testing.T) { + testExcl := false ++ testDM, _ := devicemanager.NewManagerStub() + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 4, +@@ -1248,8 +1259,10 @@ func TestCPUManagerAddWithResvList(t *testing.T) { + }, + 1, + cpuset.New(0), ++ cpuset.New(), + topologymanager.NewFakeManager(), + nil, ++ testDM, + testExcl) + testCases := []struct { + description string +@@ -1362,7 +1375,8 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) { + } + defer os.RemoveAll(sDir) + +- _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.New(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager()) ++ testDM, err := devicemanager.NewManagerStub() ++ _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.New(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM) + if err == nil { + t.Errorf("Expected error, but NewManager succeeded") + } +@@ -1376,6 +1390,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) { + + func TestCPUManagerGetAllocatableCPUs(t *testing.T) { + testExcl := false ++ testDm, _ := devicemanager.NewManagerStub() + nonePolicy, _ := NewNonePolicy(nil) + staticPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ +@@ -1391,8 +1406,10 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) { + }, + 1, + cpuset.New(0), ++ cpuset.New(), + topologymanager.NewFakeManager(), + nil, ++ testDm, + testExcl) + + testCases := []struct { +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index 9b7545c2207..e9a2defd848 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -18,6 +18,7 @@ package cpumanager + + import ( + "fmt" ++ "strconv" + + v1 "k8s.io/api/core/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" +@@ -27,6 +28,7 @@ import ( + "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" + "k8s.io/kubernetes/pkg/kubelet/metrics" +@@ -110,6 +112,10 @@ type staticPolicy struct { + topology *topology.CPUTopology + // set of CPUs that is not available for exclusive assignment + reservedCPUs cpuset.CPUSet ++ // subset of reserved CPUs with isolcpus attribute ++ isolcpus cpuset.CPUSet ++ // parent containerManager, used to get device list ++ deviceManager devicemanager.Manager + // If true, default CPUSet should exclude reserved CPUs + excludeReserved bool + // Superset of reservedCPUs. It includes not just the reservedCPUs themselves, +@@ -132,7 +138,8 @@ var _ Policy = &staticPolicy{} + // NewStaticPolicy returns a CPU manager policy that does not change CPU + // assignments for exclusively pinned guaranteed containers after the main + // container process starts. +-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) { ++func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, isolCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, deviceManager devicemanager.Manager, excludeReserved bool) (Policy, error) { ++ + opts, err := NewStaticPolicyOptions(cpuPolicyOptions) + if err != nil { + return nil, err +@@ -147,6 +154,8 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + policy := &staticPolicy{ + topology: topology, + affinity: affinity, ++ isolcpus: isolCPUs, ++ deviceManager: deviceManager, + excludeReserved: excludeReserved, + cpusToReuse: make(map[string]cpuset.CPUSet), + options: opts, +@@ -183,6 +192,12 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + policy.reservedCPUs = reserved + policy.reservedPhysicalCPUs = reservedPhysicalCPUs + ++ if !isolCPUs.IsSubsetOf(reserved) { ++ klog.Errorf("[cpumanager] isolCPUs %v is not a subset of reserved %v", isolCPUs, reserved) ++ reserved = reserved.Union(isolCPUs) ++ klog.Warningf("[cpumanager] mismatch isolCPUs %v, force reserved %v", isolCPUs, reserved) ++ } ++ + return policy, nil + } + +@@ -216,8 +231,9 @@ func (p *staticPolicy) validateState(s state.State) error { + } else { + s.SetDefaultCPUSet(allCPUs) + } +- klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n", +- allCPUs, p.reservedCPUs, s.GetDefaultCPUSet()) ++ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, isolcpus:%v, default:%v\n", ++ allCPUs, p.reservedCPUs, p.isolcpus, s.GetDefaultCPUSet()) ++ + return nil + } + +@@ -316,16 +332,39 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai + return nil + } + +- cpuset := p.reservedCPUs ++ cpuset := p.reservedCPUs.Clone().Difference(p.isolcpus) + if cpuset.IsEmpty() { + // If this happens then someone messed up. +- return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reservedCPUs) ++ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v, isolcpus:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reservedCPUs, p.isolcpus) ++ + } + s.SetCPUSet(string(pod.UID), container.Name, cpuset) + klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset) + return nil + } + ++ if isolcpus := p.podIsolCPUs(pod, container); isolcpus.Size() > 0 { ++ // container has requested isolated CPUs ++ if set, ok := s.GetCPUSet(string(pod.UID), container.Name); ok { ++ if set.Equals(isolcpus) { ++ klog.Infof("[cpumanager] isolcpus container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)", ++ pod.Namespace, string(pod.UID), pod.Name, container.Name) ++ return nil ++ } else { ++ klog.Infof("[cpumanager] isolcpus container state has cpus %v, should be %v (namespace: %s, pod UID: %s, pod: %s, container: %s)", ++ isolcpus, set, pod.Namespace, string(pod.UID), pod.Name, container.Name) ++ } ++ } ++ // Note that we do not do anything about init containers here. ++ // It looks like devices are allocated per-pod based on effective requests/limits ++ // and extra devices from initContainers are not freed up when the regular containers start. ++ // TODO: confirm this is still true for 1.20 ++ s.SetCPUSet(string(pod.UID), container.Name, isolcpus) ++ klog.Infof("[cpumanager] isolcpus: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", ++ pod.Namespace, string(pod.UID), pod.Name, container.Name, isolcpus) ++ return nil ++ } ++ + numCPUs := p.guaranteedCPUs(pod, container) + if numCPUs == 0 { + // container belongs in the shared pool (nothing to do; use default cpuset) +@@ -391,7 +430,9 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai + } + s.SetCPUSet(string(pod.UID), container.Name, cpuset) + p.updateCPUsToReuse(pod, container, cpuset) +- ++ klog.Infof("[cpumanager] guaranteed: AddContainer "+ ++ "(namespace: %s, pod UID: %s, pod: %s, container: %s); numCPUS=%d, cpuset=%v", ++ pod.Namespace, string(pod.UID), pod.Name, container.Name, numCPUs, cpuset) + return nil + } + +@@ -709,6 +750,36 @@ func isKubeInfra(pod *v1.Pod) bool { + return false + } + ++// get the isolated CPUs (if any) from the devices associated with a specific container ++func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet { ++ // NOTE: This is required for TestStaticPolicyAdd() since makePod() does ++ // not create UID. We also need a way to properly stub devicemanager. ++ if len(string(pod.UID)) == 0 { ++ return cpuset.New() ++ } ++ resContDevices := p.deviceManager.GetDevices(string(pod.UID), container.Name) ++ cpuSet := cpuset.New() ++ for resourceName, resourceDevs := range resContDevices { ++ // this resource name needs to match the isolcpus device plugin ++ if resourceName == "windriver.com/isolcpus" { ++ for devID, _ := range resourceDevs { ++ cpuStrList := []string{devID} ++ if len(cpuStrList) > 0 { ++ // loop over the list of strings, convert each one to int, add to cpuset ++ for _, cpuStr := range cpuStrList { ++ cpu, err := strconv.Atoi(cpuStr) ++ if err != nil { ++ panic(err) ++ } ++ cpuSet = cpuSet.Union(cpuset.New(cpu)) ++ } ++ } ++ } ++ } ++ } ++ return cpuSet ++} ++ + // isHintSocketAligned function return true if numa nodes in hint are socket aligned. + func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool { + numaNodesBitMask := hint.NUMANodeAffinity.GetBits() +diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go +index b864c6c57c6..cb363bb29ab 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go +@@ -27,6 +27,7 @@ import ( + pkgfeatures "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" + "k8s.io/utils/cpuset" +@@ -70,8 +71,9 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest { + } + + func TestStaticPolicyName(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() + testExcl := false +- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl) ++ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testExcl) + + policyName := policy.Name() + if policyName != "static" { +@@ -81,6 +83,7 @@ func TestStaticPolicyName(t *testing.T) { + } + + func TestStaticPolicyStart(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []staticPolicyTest{ + { + description: "non-corrupted state", +@@ -156,7 +159,7 @@ func TestStaticPolicyStart(t *testing.T) { + } + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testCase.excludeReserved) + + policy := p.(*staticPolicy) + st := &mockState{ +@@ -204,7 +207,6 @@ func TestStaticPolicyAdd(t *testing.T) { + largeTopoCPUSet := cpuset.New(largeTopoCPUids...) + largeTopoSock0CPUSet := cpuset.New(largeTopoSock0CPUids...) + largeTopoSock1CPUSet := cpuset.New(largeTopoSock1CPUids...) +- + // these are the cases which must behave the same regardless the policy options. + // So we will permutate the options to ensure this holds true. + +@@ -627,7 +629,9 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) { + cpus = testCase.reservedCPUs.Clone() + } + testExcl := false +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, tm, testCase.options, testExcl) ++ testDM, _ := devicemanager.NewManagerStub() ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, cpus, tm, testCase.options, testDM, testExcl) ++ + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -674,6 +678,8 @@ func runStaticPolicyTestCaseWithFeatureGate(t *testing.T, testCase staticPolicyT + } + + func TestStaticPolicyReuseCPUs(t *testing.T) { ++ excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []struct { + staticPolicyTest + expCSetAfterAlloc cpuset.CPUSet +@@ -698,7 +704,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -731,6 +737,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + + func TestStaticPolicyRemove(t *testing.T) { + excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []staticPolicyTest{ + { + description: "SingleSocketHT, DeAllocOneContainer", +@@ -789,7 +796,7 @@ func TestStaticPolicyRemove(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -812,6 +819,7 @@ func TestStaticPolicyRemove(t *testing.T) { + + func TestTopologyAwareAllocateCPUs(t *testing.T) { + excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []struct { + description string + topo *topology.CPUTopology +@@ -880,7 +888,8 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) { + }, + } + for _, tc := range testCases { +- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved) ++ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) ++ + policy := p.(*staticPolicy) + st := &mockState{ + assignments: tc.stAssignments, +@@ -913,6 +922,7 @@ type staticPolicyTestWithResvList struct { + topo *topology.CPUTopology + numReservedCPUs int + reserved cpuset.CPUSet ++ isolcpus cpuset.CPUSet + stAssignments state.ContainerCPUAssignments + stDefaultCPUSet cpuset.CPUSet + pod *v1.Pod +@@ -923,6 +933,8 @@ type staticPolicyTestWithResvList struct { + } + + func TestStaticPolicyStartWithResvList(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() ++ testExcl := false + testCases := []staticPolicyTestWithResvList{ + { + description: "empty cpuset", +@@ -952,10 +964,9 @@ func TestStaticPolicyStartWithResvList(t *testing.T) { + expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"), + }, + } +- testExcl := false + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) ++ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testExcl) + + if !reflect.DeepEqual(err, testCase.expNewErr) { + t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v", +@@ -996,6 +1007,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 1, + reserved: cpuset.New(0), ++ isolcpus: cpuset.New(), + stAssignments: state.ContainerCPUAssignments{}, + stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"), +@@ -1008,6 +1020,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), ++ isolcpus: cpuset.New(), + stAssignments: state.ContainerCPUAssignments{}, + stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"), +@@ -1020,6 +1033,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), ++ isolcpus: cpuset.New(), + stAssignments: state.ContainerCPUAssignments{ + "fakePod": map[string]cpuset.CPUSet{ + "fakeContainer100": cpuset.New(2, 3, 6, 7), +@@ -1036,6 +1050,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), ++ isolcpus: cpuset.New(), + stAssignments: state.ContainerCPUAssignments{ + "fakePod": map[string]cpuset.CPUSet{ + "fakeContainer100": cpuset.New(2, 3, 6, 7), +@@ -1047,11 +1062,29 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + expCPUAlloc: true, + expCSet: cpuset.New(0, 1), + }, ++ { ++ description: "InfraPod, SingleSocketHT, Isolcpus, ExpectAllocReserved", ++ topo: topoSingleSocketHT, ++ numReservedCPUs: 2, ++ reserved: cpuset.New(0, 1), ++ isolcpus: cpuset.New(1), ++ stAssignments: state.ContainerCPUAssignments{ ++ "fakePod": map[string]cpuset.CPUSet{ ++ "fakeContainer100": cpuset.New(2, 3, 6, 7), ++ }, ++ }, ++ stDefaultCPUSet: cpuset.New(4, 5), ++ pod: infraPod, ++ expErr: nil, ++ expCPUAlloc: true, ++ expCSet: cpuset.New(0), ++ }, + } + + testExcl := true ++ testDM, _ := devicemanager.NewManagerStub() + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, testCase.isolcpus, topologymanager.NewFakeManager(), nil, testDM, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +diff --git a/pkg/kubelet/cm/devicemanager/manager_stub.go b/pkg/kubelet/cm/devicemanager/manager_stub.go +new file mode 100644 +index 00000000000..98abcde2519 +--- /dev/null ++++ b/pkg/kubelet/cm/devicemanager/manager_stub.go +@@ -0,0 +1,110 @@ ++/* ++Copyright 2017 The Kubernetes Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++*/ ++ ++package devicemanager ++ ++import ( ++ v1 "k8s.io/api/core/v1" ++ "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" ++ "k8s.io/kubernetes/pkg/kubelet/config" ++ "k8s.io/kubernetes/pkg/kubelet/lifecycle" ++ "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache" ++ schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" ++ "k8s.io/apimachinery/pkg/util/sets" ++ "k8s.io/kubernetes/pkg/kubelet/cm/containermap" ++) ++ ++// ManagerStub provides a simple stub implementation for the Device Manager. ++type ManagerStub struct{ ++ // containerMap provides a mapping from (pod, container) -> containerID ++ // for all containers in a pod. Used to detect pods running across a restart ++ containerMap containermap.ContainerMap ++ ++ // containerRunningSet identifies which container among those present in `containerMap` ++ // was reported running by the container runtime when `containerMap` was computed. ++ // Used to detect pods running across a restart ++ containerRunningSet sets.String ++} ++ ++// NewManagerStub creates a ManagerStub. ++func NewManagerStub() (*ManagerStub, error) { ++ return &ManagerStub{}, nil ++} ++ ++// Start simply returns nil. ++func (h *ManagerStub) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, initialContainers containermap.ContainerMap, initialContainerRunningSet sets.String) error { ++ return nil ++} ++ ++// Stop simply returns nil. ++func (h *ManagerStub) Stop() error { ++ return nil ++} ++ ++// Allocate simply returns nil. ++func (h *ManagerStub) Allocate(pod *v1.Pod, container *v1.Container) error { ++ return nil ++} ++ ++// UpdatePluginResources simply returns nil. ++func (h *ManagerStub) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error { ++ return nil ++} ++ ++// GetDeviceRunContainerOptions simply returns nil. ++func (h *ManagerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) { ++ return nil, nil ++} ++ ++// GetCapacity simply returns nil capacity and empty removed resource list. ++func (h *ManagerStub) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) { ++ return nil, nil, []string{} ++} ++ ++// GetWatcherHandler returns plugin watcher interface ++func (h *ManagerStub) GetWatcherHandler() cache.PluginHandler { ++ return nil ++} ++ ++// GetTopologyHints returns an empty TopologyHint map ++func (h *ManagerStub) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint { ++ return map[string][]topologymanager.TopologyHint{} ++} ++ ++// GetPodTopologyHints returns an empty TopologyHint map ++func (h *ManagerStub) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint { ++ return map[string][]topologymanager.TopologyHint{} ++} ++ ++// GetDevices returns nil ++func (h *ManagerStub) GetDevices(_, _ string) ResourceDeviceInstances { ++ return nil ++} ++ ++// GetAllocatableDevices returns nothing ++func (h *ManagerStub) GetAllocatableDevices() ResourceDeviceInstances { ++ return nil ++} ++ ++// ShouldResetExtendedResourceCapacity returns false ++func (h *ManagerStub) ShouldResetExtendedResourceCapacity() bool { ++ return false ++} ++ ++// UpdateAllocatedDevices returns nothing ++func (h *ManagerStub) UpdateAllocatedDevices() { ++ return ++} +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch new file mode 100644 index 000000000..8f9c30130 --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch @@ -0,0 +1,357 @@ +From 60bd00132deb5fe20a8a5adc8dce990cd02757bc Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Fri, 1 Sep 2023 07:15:25 -0400 +Subject: [PATCH] kubelet cpumanager keep normal containers off reserved CPUs + +When starting the kubelet process, two separate sets of reserved CPUs +may be specified. With this change CPUs reserved via +'--system-reserved=cpu' +or '--kube-reserved=cpu' will be ignored by kubernetes itself. A small +tweak to the default CPU affinity ensures that "normal" Kubernetes +pods won't run on the reserved CPUs. + +Co-authored-by: Jim Gauld +Signed-off-by: Sachin Gopala Krishna +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Boovan Rajendran +--- + pkg/kubelet/cm/cpumanager/cpu_manager.go | 6 ++- + pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 19 +++++++--- + pkg/kubelet/cm/cpumanager/policy_static.go | 30 ++++++++++++--- + .../cm/cpumanager/policy_static_test.go | 38 ++++++++++++++----- + 4 files changed, 71 insertions(+), 22 deletions(-) + +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index 1d8901f4b36..29458efb05c 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -192,7 +192,11 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc + // exclusively allocated. + reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000 + numReservedCPUs := int(math.Ceil(reservedCPUsFloat)) +- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions) ++ // NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset. ++ // This variable is primarily to make testing easier. ++ excludeReserved := true ++ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved) ++ + if err != nil { + return nil, fmt.Errorf("new static policy error: %w", err) + } +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +index 65bb88bf0f0..daecd35f67b 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +@@ -215,6 +215,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string }) + } + + func TestCPUManagerAdd(t *testing.T) { ++ testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 4, +@@ -230,7 +231,8 @@ func TestCPUManagerAdd(t *testing.T) { + 0, + cpuset.New(), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + testCases := []struct { + description string + updateErr error +@@ -479,8 +481,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) { + }, + } + ++ testExcl := false + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl) + + mockState := &mockState{ + assignments: testCase.stAssignments, +@@ -705,6 +708,7 @@ func TestCPUManagerRemove(t *testing.T) { + } + + func TestReconcileState(t *testing.T) { ++ testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 8, +@@ -724,7 +728,8 @@ func TestReconcileState(t *testing.T) { + 0, + cpuset.New(), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + + testCases := []struct { + description string +@@ -1228,6 +1233,7 @@ func TestReconcileState(t *testing.T) { + // above test cases are without kubelet --reserved-cpus cmd option + // the following tests are with --reserved-cpus configured + func TestCPUManagerAddWithResvList(t *testing.T) { ++ testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 4, +@@ -1243,7 +1249,8 @@ func TestCPUManagerAddWithResvList(t *testing.T) { + 1, + cpuset.New(0), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + testCases := []struct { + description string + updateErr error +@@ -1368,6 +1375,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) { + } + + func TestCPUManagerGetAllocatableCPUs(t *testing.T) { ++ testExcl := false + nonePolicy, _ := NewNonePolicy(nil) + staticPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ +@@ -1384,7 +1392,8 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) { + 1, + cpuset.New(0), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + + testCases := []struct { + description string +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index 0f72e64dbc8..895c707600d 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -105,6 +105,8 @@ type staticPolicy struct { + topology *topology.CPUTopology + // set of CPUs that is not available for exclusive assignment + reservedCPUs cpuset.CPUSet ++ // If true, default CPUSet should exclude reserved CPUs ++ excludeReserved bool + // Superset of reservedCPUs. It includes not just the reservedCPUs themselves, + // but also any siblings of those reservedCPUs on the same physical die. + // NOTE: If the reserved set includes full physical CPUs from the beginning +@@ -125,7 +127,7 @@ var _ Policy = &staticPolicy{} + // NewStaticPolicy returns a CPU manager policy that does not change CPU + // assignments for exclusively pinned guaranteed containers after the main + // container process starts. +-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string) (Policy, error) { ++func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) { + opts, err := NewStaticPolicyOptions(cpuPolicyOptions) + if err != nil { + return nil, err +@@ -140,6 +142,7 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + policy := &staticPolicy{ + topology: topology, + affinity: affinity, ++ excludeReserved: excludeReserved, + cpusToReuse: make(map[string]cpuset.CPUSet), + options: opts, + } +@@ -201,7 +204,15 @@ func (p *staticPolicy) validateState(s state.State) error { + } + // state is empty initialize + allCPUs := p.topology.CPUDetails.CPUs() +- s.SetDefaultCPUSet(allCPUs) ++ if p.excludeReserved { ++ // Exclude reserved CPUs from the default CPUSet to keep containers off them ++ // unless explicitly affined. ++ s.SetDefaultCPUSet(allCPUs.Difference(p.reservedCPUs)) ++ } else { ++ s.SetDefaultCPUSet(allCPUs) ++ } ++ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n", ++ allCPUs, p.reservedCPUs, s.GetDefaultCPUSet()) + return nil + } + +@@ -209,11 +220,12 @@ func (p *staticPolicy) validateState(s state.State) error { + // 1. Check if the reserved cpuset is not part of default cpuset because: + // - kube/system reserved have changed (increased) - may lead to some containers not being able to start + // - user tampered with file +- if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) { +- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", +- p.reservedCPUs.String(), tmpDefaultCPUset.String()) ++ if !p.excludeReserved { ++ if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) { ++ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", ++ p.reservedCPUs.String(), tmpDefaultCPUset.String()) ++ } + } +- + // 2. Check if state for static policy is consistent + for pod := range tmpAssignments { + for container, cset := range tmpAssignments[pod] { +@@ -240,6 +252,9 @@ func (p *staticPolicy) validateState(s state.State) error { + } + } + totalKnownCPUs = totalKnownCPUs.Union(tmpCPUSets...) ++ if p.excludeReserved { ++ totalKnownCPUs = totalKnownCPUs.Union(p.reservedCPUs) ++ } + if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) { + return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"", + p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String()) +@@ -374,6 +389,9 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa + cpusInUse := getAssignedCPUsOfSiblings(s, podUID, containerName) + if toRelease, ok := s.GetCPUSet(podUID, containerName); ok { + s.Delete(podUID, containerName) ++ if p.excludeReserved { ++ toRelease = toRelease.Difference(p.reservedCPUs) ++ } + // Mutate the shared pool, adding released cpus. + toRelease = toRelease.Difference(cpusInUse) + s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease)) +diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go +index 0dcf78d49dc..deb2f4c7982 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go +@@ -36,6 +36,7 @@ type staticPolicyTest struct { + description string + topo *topology.CPUTopology + numReservedCPUs int ++ excludeReserved bool + reservedCPUs *cpuset.CPUSet + podUID string + options map[string]string +@@ -69,7 +70,8 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest { + } + + func TestStaticPolicyName(t *testing.T) { +- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ testExcl := false ++ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl) + + policyName := policy.Name() + if policyName != "static" { +@@ -99,6 +101,15 @@ func TestStaticPolicyStart(t *testing.T) { + stDefaultCPUSet: cpuset.New(), + expCSet: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), + }, ++ { ++ description: "empty cpuset exclude reserved", ++ topo: topoDualSocketHT, ++ numReservedCPUs: 2, ++ excludeReserved: true, ++ stAssignments: state.ContainerCPUAssignments{}, ++ stDefaultCPUSet: cpuset.New(), ++ expCSet: cpuset.New(1, 2, 3, 4, 5, 7, 8, 9, 10, 11), ++ }, + { + description: "reserved cores 0 & 6 are not present in available cpuset", + topo: topoDualSocketHT, +@@ -145,7 +156,8 @@ func TestStaticPolicyStart(t *testing.T) { + } + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ + policy := p.(*staticPolicy) + st := &mockState{ + assignments: testCase.stAssignments, +@@ -614,7 +626,8 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) { + if testCase.reservedCPUs != nil { + cpus = testCase.reservedCPUs.Clone() + } +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, tm, testCase.options) ++ testExcl := false ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, tm, testCase.options, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -685,7 +698,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -717,6 +730,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + } + + func TestStaticPolicyRemove(t *testing.T) { ++ excludeReserved := false + testCases := []staticPolicyTest{ + { + description: "SingleSocketHT, DeAllocOneContainer", +@@ -775,7 +789,7 @@ func TestStaticPolicyRemove(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -797,6 +811,7 @@ func TestStaticPolicyRemove(t *testing.T) { + } + + func TestTopologyAwareAllocateCPUs(t *testing.T) { ++ excludeReserved := false + testCases := []struct { + description string + topo *topology.CPUTopology +@@ -865,7 +880,7 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) { + }, + } + for _, tc := range testCases { +- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), topologymanager.NewFakeManager(), nil) ++ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved) + policy := p.(*staticPolicy) + st := &mockState{ + assignments: tc.stAssignments, +@@ -937,9 +952,11 @@ func TestStaticPolicyStartWithResvList(t *testing.T) { + expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"), + }, + } ++ testExcl := false + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil) ++ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) ++ + if !reflect.DeepEqual(err, testCase.expNewErr) { + t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v", + testCase.description, testCase.expNewErr, err) +@@ -979,7 +996,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + numReservedCPUs: 1, + reserved: cpuset.New(0), + stAssignments: state.ContainerCPUAssignments{}, +- stDefaultCPUSet: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7), ++ stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"), + expErr: fmt.Errorf("not enough cpus available to satisfy request"), + expCPUAlloc: false, +@@ -991,7 +1008,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + numReservedCPUs: 2, + reserved: cpuset.New(0, 1), + stAssignments: state.ContainerCPUAssignments{}, +- stDefaultCPUSet: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7), ++ stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"), + expErr: nil, + expCPUAlloc: true, +@@ -1015,8 +1032,9 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + }, + } + ++ testExcl := true + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch new file mode 100644 index 000000000..9515f9101 --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch @@ -0,0 +1,50 @@ +From a145d85ad716f7dc654727a3d0b2f998686bd273 Mon Sep 17 00:00:00 2001 +From: Jim Gauld +Date: Fri, 11 Feb 2022 11:06:35 -0500 +Subject: [PATCH] kubelet: sort isolcpus allocation when SMT enabled + +The existing device manager code returns CPUs as devices in unsorted +order. This numerically sorts isolcpus allocations when SMT/HT is +enabled on the host. This logs SMT pairs, singletons, and algorithm +order details to make the algorithm understandable. + +Signed-off-by: Jim Gauld +--- + pkg/kubelet/cm/devicemanager/manager.go | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go +index 5d0c16ee51e..2447aa88114 100644 +--- a/pkg/kubelet/cm/devicemanager/manager.go ++++ b/pkg/kubelet/cm/devicemanager/manager.go +@@ -561,7 +561,16 @@ func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) + return cpu_lst[0] + } + } ++ //Make post-analysis of selection algorithm obvious by numerical sorting ++ //the available isolated cpu_id. ++ cpu_ids := make([]int, 0, int(devices.Len())) + for cpu_id := range devices { ++ cpu_id_, _ := strconv.Atoi(cpu_id) ++ cpu_ids = append(cpu_ids, cpu_id_) ++ } ++ sort.Ints(cpu_ids) ++ for _, _cpu_id := range cpu_ids { ++ cpu_id := strconv.Itoa(_cpu_id) + // If we've already found cpu_id as a sibling, skip it. + if _, ok := _iterated_cpu[cpu_id]; ok { + continue +@@ -603,7 +612,9 @@ func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) + } + } + } +- //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst) ++ //This algorithm will get some attention. Show minimal details. ++ klog.Infof("order_devices_by_sibling: needed=%d, smtpairs=%v, singletons=%v, order=%v", ++ needed, sibling_lst, single_lst, dev_lst) + return dev_lst, nil + } + func smt_enabled() bool { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch new file mode 100644 index 000000000..ba8aa4cdc --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch @@ -0,0 +1,148 @@ +From 2ae8d69bc49fcbf6fe95209fef0e256b10ad7a0f Mon Sep 17 00:00:00 2001 +From: Saba Touheed Mujawar +Date: Fri, 1 Dec 2023 05:27:16 -0500 +Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware + +Enhance isolcpus support in Kubernetes to allocate isolated SMT +siblings to the same container when SMT/HT is enabled on the host. + +As it stands, the device manager code in Kubernetes is not SMT-aware +(since normally it doesn't deal with CPUs). However, StarlingX +exposes isolated CPUs as devices and if possible we want to allocate +all SMT siblings from a CPU core to the same container in order to +minimize cross- container interference due to resource contention +within the CPU core. + +The solution is basically to take the list of isolated CPUs and +re-order it so that the SMT siblings are next to each other. That +way the existing resource selection code will allocate the siblings +together. As an optimization, if it is known that an odd number +of isolated CPUs are desired, a singleton SMT sibling will be +inserted into the list to avoid breaking up sibling pairs. + +Signed-off-by: Tao Wang +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Boovan Rajendran +Signed-off-by: Saba Touheed Mujawar +--- + pkg/kubelet/cm/devicemanager/manager.go | 83 ++++++++++++++++++++++++- + 1 file changed, 82 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go +index d780ee801bd..a9966290ab5 100644 +--- a/pkg/kubelet/cm/devicemanager/manager.go ++++ b/pkg/kubelet/cm/devicemanager/manager.go +@@ -23,6 +23,8 @@ import ( + "path/filepath" + "runtime" + "sort" ++ "strconv" ++ "strings" + "sync" + "time" + +@@ -36,6 +38,7 @@ import ( + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" ++ "k8s.io/utils/cpuset" + "k8s.io/kubernetes/pkg/kubelet/cm/containermap" + "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint" + plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1" +@@ -542,6 +545,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() { + m.allocatedDevices = m.podDevices.devices() + } + ++//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed', ++//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list ++//contain as many hyperthread sibling pairs as possible. ++func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) { ++ var dev_lst []string ++ var single_lst []string ++ sibling_lst := make([]string, 0, int(devices.Len())) ++ _iterated_cpu := make(map[string]string) ++ get_sibling := func(cpu string, cpu_lst []string) string { ++ if cpu_lst[0] == cpu { ++ return cpu_lst[1] ++ } else { ++ return cpu_lst[0] ++ } ++ } ++ for cpu_id := range devices { ++ // If we've already found cpu_id as a sibling, skip it. ++ if _, ok := _iterated_cpu[cpu_id]; ok { ++ continue ++ } ++ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id) ++ dat, err := os.ReadFile(devPath) ++ if err != nil { ++ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id) ++ } ++ cpustring := strings.TrimSuffix(string(dat), "\n") ++ cpu_pair_set, err := cpuset.Parse(cpustring) ++ if err != nil { ++ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring) ++ } ++ var cpu_pair_lst []string ++ for _, v := range cpu_pair_set.List() { ++ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v)) ++ } ++ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst) ++ if _, ok := devices[sibling_cpu_id]; ok { ++ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id) ++ _iterated_cpu[sibling_cpu_id] = "" ++ } else { ++ single_lst = append(single_lst, cpu_id) ++ } ++ _iterated_cpu[cpu_id] = "" ++ } ++ if needed%2 == 0 { ++ dev_lst = append(sibling_lst, single_lst...) ++ } else { ++ if len(single_lst) > 1 { ++ _tmp_list := append(sibling_lst, single_lst[1:]...) ++ dev_lst = append(single_lst[0:1], _tmp_list...) ++ } else { ++ if len(single_lst) == 0 { ++ dev_lst = sibling_lst ++ } else { ++ dev_lst = append(single_lst, sibling_lst...) ++ } ++ } ++ } ++ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst) ++ return dev_lst, nil ++} ++func smt_enabled() bool { ++ dat, _ := os.ReadFile("/sys/devices/system/cpu/smt/active") ++ state := strings.TrimSuffix(string(dat), "\n") ++ if state == "0" { ++ return false ++ } ++ return true ++} ++ + // Returns list of device Ids we need to allocate with Allocate rpc call. + // Returns empty list in case we don't need to issue the Allocate rpc call. + func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) { +@@ -615,7 +687,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi + // Create a closure to help with device allocation + // Returns 'true' once no more devices need to be allocated. + allocateRemainingFrom := func(devices sets.String) bool { +- for device := range devices.Difference(allocated) { ++ availableDevices := devices.Difference(allocated).List() ++ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together. ++ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() { ++ var err error ++ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed) ++ if err != nil { ++ klog.Errorf("error in order_devices_by_sibling: %v", err) ++ } ++ } ++ for _, device := range availableDevices { + m.allocatedDevices[resource].Insert(device) + allocated.Insert(device) + needed-- +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/series b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/series new file mode 100644 index 000000000..a0cf7c144 --- /dev/null +++ b/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/series @@ -0,0 +1,10 @@ +kubeadm-create-platform-pods-with-zero-CPU-resources.patch +kubernetes-make-isolcpus-allocation-SMT-aware.patch +kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch +kubelet-cpumanager-disable-CFS-quota-throttling.patch +kubelet-cpumanager-keep-normal-containers-off-reserv.patch +kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch +kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch +Affinity-of-guaranteed-pod-to-non-isolated-CPUs.patch +enable-support-for-kubernetes-to-ignore-isolcpus.patch +kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch