From 8e843096242d0f21ad9c4226491d13e2d5543a55 Mon Sep 17 00:00:00 2001 From: Cole Walker Date: Wed, 9 Jun 2021 17:04:56 -0400 Subject: [PATCH] Add check to avoid restarting running device plugin pod This script was set to always restart the local sriov device plugin pod which could result in sriov pods not starting properly. Originally, this sequence of commands would not work properly if the device plugin was running kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s Result when device plugin is running: pod "kube-sriov-device-plugin-amd64-rbjpw" deleted pod/kube-sriov-device-plugin-amd64-rbjpw condition met The wait command succeeds against the deleted pod and the script continues. It then deletes labeled pods without having confirmed that the device plugin is running and can result in sriov pods not starting properly. Ensuring that we are only restarting a not-running device plugin pod prevents the wait condition from immediately passing. Closes-Bug: 1928965 Signed-off-by: Cole Walker Change-Id: I1cc576b26a4bba4eba4a088d33f918bb07ef3b0d --- .../k8s-pod-recovery/centos/files/k8s-pod-recovery | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery index 3c69d5a49..3c9b05096 100755 --- a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery +++ b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery @@ -167,11 +167,18 @@ function _labeled_pods { # Don't have to restart device-plugin if no labeled pods are present. System may not be configured for SRIOV. if [ ! -z "${PODS}" ]; then LOG "Waiting for SRIOV device plugin pod to become available" - kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false - kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s + # Check if device-plugin is ready, but do not wait + kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=0s + + # If device plugin is not ready, restart it and wait if [ "$?" -ne 0 ]; then - ERROR "SRIOV device plugin timed out on ready wait. Continuing anyway. SRIOV pods may not recover." + kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false + kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s + + if [ "$?" -ne 0 ]; then + ERROR "SRIOV device plugin timed out on ready wait. Continuing anyway. SRIOV pods may not recover." + fi fi fi