diff --git a/centos_iso_image.inc b/centos_iso_image.inc index 54b69c016..3defd49b7 100644 --- a/centos_iso_image.inc +++ b/centos_iso_image.inc @@ -170,6 +170,7 @@ kubernetes-1.21.3-kubeadm kubernetes-1.21.3-client containerd k8s-pod-recovery +k8s-cni-cache-cleanup containernetworking-plugins # resource-agents diff --git a/centos_pkg_dirs b/centos_pkg_dirs index 32c3e4608..2cd913ab5 100644 --- a/centos_pkg_dirs +++ b/centos_pkg_dirs @@ -66,6 +66,7 @@ kubernetes/chartmuseum kubernetes/armada-helm-toolkit kubernetes/armada kubernetes/k8s-pod-recovery +kubernetes/k8s-cni-cache-cleanup kubernetes/plugins/isolcpus-device-plugin python/python-kubernetes grub/grubby diff --git a/kubernetes/k8s-cni-cache-cleanup/centos/build_srpm.data b/kubernetes/k8s-cni-cache-cleanup/centos/build_srpm.data new file mode 100644 index 000000000..2f3c17bc1 --- /dev/null +++ b/kubernetes/k8s-cni-cache-cleanup/centos/build_srpm.data @@ -0,0 +1,4 @@ +SRC_DIR="." +COPY_LIST="$FILES_BASE/*" + +TIS_PATCH_VER=PKG_GITREVCOUNT diff --git a/kubernetes/k8s-cni-cache-cleanup/centos/files/k8s-cni-cache-cleanup b/kubernetes/k8s-cni-cache-cleanup/centos/files/k8s-cni-cache-cleanup new file mode 100644 index 000000000..e1435228d --- /dev/null +++ b/kubernetes/k8s-cni-cache-cleanup/centos/files/k8s-cni-cache-cleanup @@ -0,0 +1,214 @@ +#!/bin/bash +# +# Copyright (c) 2021 Wind River Systems, Inc. 
#
# SPDX-License-Identifier: Apache-2.0
#

# Script name; used as the syslog tag by LOG and ERROR.
NAME=$(basename -- "$0")
# Directories scanned for CNI cache files (plugin results and multus).
RESULTSDIR="/var/lib/cni/results"
MULTUSDIR="/var/lib/cni/multus"

# Snapshot of the "PodID" and "pod.name" lines printed by "crictl ps -v".
# crictl's stderr is discarded, so if crictl is missing or fails the snapshot
# is empty and both arrays below end up with no entries.
PODS=$(crictl ps -v 2> /dev/null | grep -w -E 'PodID|pod.name')

# Parallel arrays: get_pod pairs PODIDS[i] with PODNAMES[i] by index.
# mapfile is used instead of array=($(...)) so the values are never subject
# to pathname expansion; empty fields are dropped as they were before.
PODIDS=()
PODNAMES=()
mapfile -t PODIDS < <(awk '/PodID/ && NF >= 2 { print $2 }' <<< "${PODS}")
mapfile -t PODNAMES < <(grep -w 'pod.name' <<< "${PODS}" | awk 'NF >= 3 { print $3 }')

# Minimum number of minutes kubelet must have been up before cleanup runs.
KUBELET_UPTIME_MINUTES=5
# Length of a pod id as it appears in CNI cache file names.
POD_ID_LENGTH=64
# "yes" removes stale cache files, "no" only reports them (set by -d).
DELETE="no"
# Minimum age in hours before a stale cache file is processed (set by -o).
OLDERTHANHOURS=1

# Log an info message through syslog (facility "daemon"; expected to land in
# /var/log/daemon.log).
# Arguments: the message words, handed to logger unchanged.
function LOG {
    logger -p daemon.info -t "${NAME}($$): " "${@}"
}

# Log an error message through syslog (facility "daemon"; expected to land in
# /var/log/daemon.log).
# Arguments: the message words, handed to logger unchanged.
function ERROR {
    logger -p daemon.error -t "${NAME}($$): " "${@}"
}

# Determine the age of a file in hours.
# Arguments: $1 - path of the file
# Outputs:   whole hours since the file's last status change (stat %Z)
# Returns:   0 on success; 1 (and prints nothing) if the file cannot be
#            stat'ed
function file_age {
    local file=${1}
    local -r seconds_per_hour=3600
    local now changed

    # The file may have vanished since it was listed. Bail out here instead
    # of evaluating the arithmetic below with an empty operand.
    changed=$(stat -c %Z -- "${file}") || return 1
    now=$(date +%s)

    printf '%s\n' "$(( (now - changed) / seconds_per_hour ))"
}

# Determine the pod id associated with a result CNI cache file.
# Arguments: $1 - path of the cache file
# Outputs:   the 64 character pod id, or an empty line if the file name does
#            not look like a results cache file
function results_cni_cache_file_to_pod_id {
    local path=${1}
    local ret=""
    local file

    # A valid CNI cache results file looks like:
    #   type-pod_id-interface_name
    local -r results_regex='^.*-([0-9a-zA-Z]{64})-[0-9a-zA-Z]+$'

    file=$(basename -- "${path}")
    if [[ ${file} =~ ${results_regex} ]]; then
        ret=${BASH_REMATCH[1]}
    fi

    printf '%s\n' "${ret}"
}

# Determine the pod id associated with a multus CNI cache file.
# Arguments: $1 - path of the cache file
# Outputs:   the 64 character pod id, or an empty line if the file name is
#            not a bare pod id
function multus_cni_cache_file_to_pod_id {
    local path=${1}
    local ret=""
    local file

    # A valid CNI cache multus file is simply the pod id
    local -r multus_regex='^([0-9a-zA-Z]{64})$'

    file=$(basename -- "${path}")
    if [[ ${file} =~ ${multus_regex} ]]; then
        ret=${BASH_REMATCH[1]}
    fi

    printf '%s\n' "${ret}"
}

# Determine the pod id associated with a CNI cache file.
function cni_cache_file_to_pod_id {
    # Arguments: $1 - path of a file under RESULTSDIR or MULTUSDIR
    # Outputs:   the pod id parsed from the file name by the helper matching
    #            the file's directory, or an empty line for any other
    #            directory or an unrecognized name
    local path=${1}
    local ret=""
    local dir

    dir=$(dirname -- "${path}")

    if [[ "${dir}" == "${RESULTSDIR}" ]]; then
        ret=$(results_cni_cache_file_to_pod_id "${path}")
    elif [[ "${dir}" == "${MULTUSDIR}" ]]; then
        ret=$(multus_cni_cache_file_to_pod_id "${path}")
    fi

    printf '%s\n' "${ret}"
}

# Determine the original pod name from a CNI cache file (if any).
# Arguments: $1 - path of the cache file
# Outputs:   the quoted string that follows K8S_POD_NAME"," in the file
#            (first occurrence), or "unknown" if there is none
function cache_file_to_pod_name {
    local path=${1}
    local ret

    # One sed pass replaces the former grep + cat + sed + cut chain. Only the
    # first matching line is used so the result is always a single name.
    ret=$(sed -n 's/.*K8S_POD_NAME","\([^"]*\).*/\1/p' -- "${path}" | head -n 1)

    printf '%s\n' "${ret:-unknown}"
}

# Given a CNI cache id, return the existing pod name (if any).
# Arguments: $1 - pod id taken from a cache file name
# Outputs:   a non-empty string if the id is listed in PODIDS, otherwise an
#            empty line
function get_pod {
    local cacheid=${1}
    local ret=""
    local i

    for i in "${!PODIDS[@]}"; do
        if [[ "${PODIDS[i]}" == "${cacheid}" ]]; then
            # The caller treats any non-empty answer as "pod exists". Fall
            # back to the id itself if no name is recorded at this index, so
            # a matching pod always protects its cache files.
            ret=${PODNAMES[i]:-${cacheid}}
        fi
    done

    printf '%s\n' "${ret}"
}

# Determine if the CNI cache file is old enough to process.
# Arguments: $1 - file age in hours
# Outputs:   the age if it is at least OLDERTHANHOURS, otherwise an empty
#            line
function check_cache_file_age {
    local age=${1}
    local ret=""

    # The threshold is quoted: the former unquoted "[ -n ... ]" test was true
    # even when the threshold was empty.
    if [[ -n "${OLDERTHANHOURS}" && "${age}" -ge "${OLDERTHANHOURS}" ]]; then
        ret=${age}
    fi

    printf '%s\n' "${ret}"
}

# Determine how long kubelet has been up in minutes
# Outputs: whole minutes since the kubelet unit's WatchdogTimestamp, or 0
#          (after logging an error) if the timestamp is missing or cannot be
#          parsed
function kubelet_uptime {
    local -r seconds_per_minute=60
    local minutes=0
    local started started_epoch now

    started=$(systemctl show kubelet --property WatchdogTimestamp | awk -F= '{print $2}')

    # date's own diagnostic is dropped; the failure is reported through ERROR.
    if [[ -z "${started}" ]] ||
        ! started_epoch=$(date --date="${started}" +%s 2> /dev/null); then
        ERROR "Failed to get kubelet uptime."
    else
        now=$(date +%s)
        minutes=$(( (now - started_epoch) / seconds_per_minute ))
    fi

    printf '%s\n' "${minutes}"
}

# Wait for kubelet to be up for long enough to process CNI cache files.
# Polls kubelet_uptime up to 31 times, sleeping 30 seconds after each miss.
# Returns: 0 once the uptime reaches KUBELET_UPTIME_MINUTES, 1 if it never
#          does
function check_kubelet {
    local retries=0
    local uptime remaining

    while (( retries <= 30 )); do
        uptime=$(kubelet_uptime)
        if (( uptime >= KUBELET_UPTIME_MINUTES )); then
            return 0
        fi
        remaining=$(( KUBELET_UPTIME_MINUTES - uptime ))
        LOG "Waiting for kubelet to be up for ${remaining} minutes ..."
        retries=$(( retries + 1 ))
        sleep 30
    done

    return 1
}

# Print the usage line and exit with status 2.
function usage {
    echo "usage: ${0##*/} [-d] [-o older_than_hours]"
    exit 2
}

while getopts :o:d OPT; do
    case "${OPT}" in
        o)
            # The value is compared numerically later on; reject anything
            # that is not a non-negative integer. 10# strips leading zeros so
            # the value is never read as octal.
            [[ "${OPTARG}" =~ ^[0-9]+$ ]] || usage
            OLDERTHANHOURS=$(( 10#${OPTARG} ))
            ;;
        d)
            DELETE="yes"
            ;;
        *)
            usage
            ;;
    esac
done

if ! check_kubelet; then
    LOG "Kubelet must be up for a minimum of ${KUBELET_UPTIME_MINUTES} minutes. Not running CNI cache cleanup."
    exit 1
fi

for f in "${RESULTSDIR}"/* "${MULTUSDIR}"/*; do
    # An unmatched glob is left as the literal pattern; skip it, along with
    # any file that disappeared after the glob was expanded.
    [[ -e "${f}" ]] || continue

    cacheid=$(cni_cache_file_to_pod_id "${f}")
    if [[ ${#cacheid} -ne ${POD_ID_LENGTH} ]]; then
        # Unrecognized file pattern, skip.
        continue
    fi

    existing_podname=$(get_pod "${cacheid}")
    if [[ -n "${existing_podname}" ]]; then
        LOG "Pod ${existing_podname} exists. Not cleaning up CNI cache file(s)."
        continue
    fi

    age=$(file_age "${f}")
    if [[ -z "${age}" ]]; then
        # No age could be determined (e.g. the file is already gone).
        continue
    fi

    if [[ ! $(check_cache_file_age "${age}") ]]; then
        LOG "Stale CNI cache file ${f} detected. Cleanup to occur after $((OLDERTHANHOURS - age)) hour(s)."
        continue
    fi

    # Read the pod name before the file is removed. Looking it up after
    # "rm -f" always produced "unknown" in delete mode.
    cache_podname=$(cache_file_to_pod_name "${f}")

    if [[ "${DELETE}" == "yes" ]]; then
        rm -f -- "${f}"
        action="Deleted"
    else
        action="Detected"
    fi

    LOG "${action} stale CNI cache file ${f}: [age: ${age} hours old, podname: ${cache_podname}]."
done
+done + diff --git a/kubernetes/k8s-cni-cache-cleanup/centos/k8s-cni-cache-cleanup.spec b/kubernetes/k8s-cni-cache-cleanup/centos/k8s-cni-cache-cleanup.spec new file mode 100644 index 000000000..280d43cd4 --- /dev/null +++ b/kubernetes/k8s-cni-cache-cleanup/centos/k8s-cni-cache-cleanup.spec @@ -0,0 +1,27 @@ +Name: k8s-cni-cache-cleanup +Version: 1.0 +Release: 0%{?_tis_dist}.%{tis_patch_ver} +Summary: Kubernetes CNI Cache Cleanup Utility +License: Apache-2.0 +Group: base +Packager: Wind River +URL: unknown +Source0: k8s-cni-cache-cleanup + +Requires: /bin/bash + +%description +%{summary} + +%define local_dir /usr/local +%define local_sbindir %{local_dir}/sbin + +%prep + +%install +install -d %{buildroot}%{local_sbindir} +install -m 755 %{SOURCE0} %{buildroot}%{local_sbindir}/k8s-cni-cache-cleanup + +%files +%defattr(-,root,root,-) +%{local_sbindir}/k8s-cni-cache-cleanup diff --git a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery index 3050927f6..2f80d219e 100755 --- a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery +++ b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery @@ -19,7 +19,7 @@ . /etc/platform/platform.conf -export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin +export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin:/usr/local/sbin export KUBECONFIG=/etc/kubernetes/admin.conf CONF_DIR=/etc/k8s-post-recovery.d SLEEP_DELAY_SEC=15 @@ -74,6 +74,16 @@ function _wait_for_systemd { done } +function _do_cni_cache_cleanup { + # Cleanup any stale CNI cache files (not associated with any running pod) + # that are older than 1 hour old + LOG "Starting CNI cache cleanup..." + k8s-cni-cache-cleanup -o 1 -d + if [[ ${?} -ne 0 ]]; then + ERROR "Failed to run CNI cache cleanup." 
+ fi +} + function _wait_for_pod_stabilization { local extra_args=$1 @@ -298,6 +308,7 @@ function start { _wait_for_systemd _examine_pods 'recover' _examine_pods 'verify' + _do_cni_cache_cleanup } function stop {