diff --git a/centos_iso_image.inc b/centos_iso_image.inc
index 048d001e0..af18217fb 100644
--- a/centos_iso_image.inc
+++ b/centos_iso_image.inc
@@ -174,6 +174,7 @@ kubernetes-node
 kubernetes-kubeadm
 kubernetes-client
 containerd
+k8s-pod-recovery
 
 # resource-agents
 resource-agents
diff --git a/centos_pkg_dirs b/centos_pkg_dirs
index eadeb773f..0c9f26a17 100644
--- a/centos_pkg_dirs
+++ b/centos_pkg_dirs
@@ -61,6 +61,7 @@ kubernetes/helm
 kubernetes/chartmuseum
 kubernetes/armada-helm-toolkit
 kubernetes/armada
+kubernetes/k8s-pod-recovery
 grub/grubby
 base/dpkg
 base/cluster-resource-agents
diff --git a/kubernetes/k8s-pod-recovery/centos/build_srpm.data b/kubernetes/k8s-pod-recovery/centos/build_srpm.data
new file mode 100644
index 000000000..2f3c17bc1
--- /dev/null
+++ b/kubernetes/k8s-pod-recovery/centos/build_srpm.data
@@ -0,0 +1,4 @@
+SRC_DIR="."
+COPY_LIST="$FILES_BASE/*"
+
+TIS_PATCH_VER=PKG_GITREVCOUNT
diff --git a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery
new file mode 100755
index 000000000..68b965cd0
--- /dev/null
+++ b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery
@@ -0,0 +1,261 @@
+#!/bin/bash
+#
+# Copyright (c) 2020 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+#
+# chkconfig: 2345 76 25
+#
+### BEGIN INIT INFO
+# Provides:          k8s-pod-recovery
+# Default-Start:     3 5
+# Required-Start:
+# Required-Stop:
+# Default-Stop:      0 1 2 6
+# Short-Description: Service to recover pods after host boot
+### END INIT INFO
+
+. /etc/platform/platform.conf
+
+export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin
+export KUBECONFIG=/etc/kubernetes/admin.conf
+SLEEP_DELAY_SEC=15
+
+NAME=$(basename "$0")
+PIDFILE=/var/run/${NAME}.pid
+HOST=$(hostname)
+
+# Log info message to /var/log/daemon.log
+function LOG {
+    logger -p daemon.info -t "${NAME}($$): " "$@"
+}
+
+# Log error message to /var/log/daemon.log
+function ERROR {
+    logger -p daemon.error -t "${NAME}($$): " "$@"
+}
+
+# Exit successfully when kubernetes has not been configured on this node
+function _check_for_k8s_config {
+    # If this node has not been configured, then there is nothing to recover
+    if [ ! -f "${KUBECONFIG}" ]; then
+        LOG "${KUBECONFIG} does not exist. No pods to recover."
+        exit 0
+    fi
+}
+
+# Exit with an error if another instance is running, else write the pidfile
+function _check_for_existing_process {
+    # Abort if another instantiation is already running
+    if [ -e "${PIDFILE}" ]; then
+        PID=$(cat "${PIDFILE}" 2>/dev/null)
+        # The kernel truncates /proc/<pid>/comm to 15 characters, so compare
+        # against the truncated script name. A stale or empty pidfile leaves
+        # PROCESS empty and the pidfile is simply replaced.
+        PROCESS=$(cat "/proc/${PID:-0}/comm" 2>/dev/null)
+        if [[ -n "${PID}" && -e "/proc/${PID}" && "${PROCESS}" == "${NAME:0:15}" ]]; then
+            ERROR "Aborting, ${PID} already running: ${PIDFILE}."
+            exit 1
+        else
+            OUT=$(rm -v -f "${PIDFILE}")
+            LOG "${OUT}"
+        fi
+    fi
+
+    # Create pidfile to indicate the script is running
+    echo $$ > "${PIDFILE}"
+}
+
+# Block until systemd reports the system as running or degraded
+function _wait_for_systemd {
+    while true; do
+        if systemctl is-system-running | grep -q -e running -e degraded; then
+            break
+        fi
+        LOG "Waiting for systemd to finish booting..."
+        sleep ${SLEEP_DELAY_SEC}
+    done
+}
+
+# Block until the count of pods that are not Running/Completed has stayed
+# unchanged for 90 seconds
+function _wait_for_pod_stabilization {
+    last_count=0
+    stability_count=0
+    NINETY_SEC_COUNT=$((90/SLEEP_DELAY_SEC))
+    while true ; do
+        pods_in_flux=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pods --no-headers --all-namespaces | grep -v -e Running -e Completed | wc -l)
+        if [[ $pods_in_flux -ne $last_count ]]; then
+            LOG "Waiting on pod transitions to stabilize... $pods_in_flux pods are not Running/Completed"
+            last_count=$pods_in_flux
+            stability_count=0
+        else
+            LOG "Pods transitions are stable... for $((stability_count*${SLEEP_DELAY_SEC})) seconds."
+            if [[ $stability_count -eq $NINETY_SEC_COUNT ]]; then
+                break
+            fi
+            stability_count=$((stability_count+1))
+        fi
+        sleep ${SLEEP_DELAY_SEC}
+    done
+}
+
+# Delete ('recover') or report ('verify') pods on this host in Unknown state
+function _unknown_pods {
+    # $1: actions
+
+    # Target specific namespaces and pods on this host
+    SUPPORTED_NAMESPACES=('openstack' 'monitor')
+
+    if [ "$1" == 'recover' ]; then
+        # Recovers pods that are: Running/Unknown and Pending/Init:Unknown
+        for ns in "${SUPPORTED_NAMESPACES[@]}"; do
+            PODS=$(kubectl get pods -n "$ns" --field-selector "spec.nodeName=${HOST}" 2>/dev/null | awk '/Unknown/ {print $1}')
+            for pod in $PODS ; do
+                LOG "Unknown pods: Recovering: $ns/$pod"
+                kubectl delete pods -n "$ns" "$pod" --wait=false
+            done
+        done
+    elif [ "$1" == 'verify' ]; then
+        for ns in "${SUPPORTED_NAMESPACES[@]}"; do
+            PODS=$(kubectl get pods -n "$ns" --field-selector "spec.nodeName=${HOST}" 2>/dev/null | awk '/Unknown/ {print $1}')
+            if [ -z "${PODS}" ]; then
+                LOG "Unknown pods: None present for namespace: $ns"
+            else
+                ERROR "Unknown pods: still present for namespace: $ns"
+            fi
+        done
+    else
+        ERROR "Unknown action: $1"
+    fi
+}
+
+# Delete ('recover') or report ('verify') Failed NodeAffinity pods on this host
+function _node_affinity_pods {
+    # $1: actions
+
+    if [ "$1" == 'recover' ]; then
+        PODS=$(kubectl get pods --all-namespaces --field-selector "status.phase=Failed,spec.nodeName=${HOST}" 2>/dev/null | awk '/NodeAffinity/ {print $1"/"$2}')
+        for pod in $PODS ; do
+            LOG "NodeAffinity pods: Recovering: $pod"
+            # ${pod//// } deliberately splits "namespace/name" into two arguments
+            kubectl delete pods -n ${pod//// } --wait=false
+        done
+    elif [ "$1" == 'verify' ]; then
+        PODS=$(kubectl get pods --all-namespaces --field-selector "status.phase=Failed,spec.nodeName=${HOST}" 2>/dev/null | awk '/NodeAffinity/ {print $1"/"$2}')
+        if [ -z "${PODS}" ]; then
+            LOG "NodeAffinity pods: None present."
+        else
+            ERROR "NodeAffinity pods: still present"
+        fi
+    else
+        ERROR "Unknown action: $1"
+    fi
+
+}
+
+# Delete ('recover') or report ('verify') openstack libvirt pods on this host
+# that are Running without all conditions True
+function _force_reset_pods {
+    # $1: actions
+
+    # Handle resetting openstack libvirt pod as it sometimes is in a Running but
+    # unusable state
+    if kubectl get namespace openstack > /dev/null 2>&1; then
+
+        # Get the libvirt pods on this host that are Running without all
+        # conditions True
+        #
+        # Conditions:
+        #   Initialized       True
+        #   Ready             True
+        #   ContainersReady   True
+        #   PodScheduled      True
+        #
+        # NAME                                        STATUS   CONDITIONS           NODE
+        # libvirt-libvirt-controller-0-937646f6-xst4r Running  True,True,True,True  controller-0
+        #
+        CUSTOM_COLUMNS='custom-columns=NAME:.metadata.name,STATUS:status.phase,CONDITIONS:status.conditions[*].status,NODE:spec.nodeName'
+        FIELD_SELECTOR="spec.nodeName=${HOST}"
+        PODS=$(kubectl get pods -n openstack -l application=libvirt --field-selector "${FIELD_SELECTOR}" -o "${CUSTOM_COLUMNS}" | grep -v NAME | grep -v 'True,True,True,True' | awk '{print $1}')
+
+        if [ "$1" == 'recover' ]; then
+            for pod in $PODS ; do
+                LOG "Recovering libvirt pod: $pod"
+                kubectl delete pods -n openstack "$pod" --wait=false
+            done
+        elif [ "$1" == 'verify' ]; then
+            if [ -z "${PODS}" ]; then
+                LOG "Openstack libvirt pod on ${HOST} is running."
+            else
+                ERROR "Openstack libvirt pod on ${HOST} has not been recovered."
+            fi
+        else
+            ERROR "Unknown action: $1"
+        fi
+    fi
+}
+
+# Wait for pods to settle, then run every recovery check with the given action
+function _examine_pods {
+    # $1: actions
+
+    # Wait for pods transitions to stop
+    _wait_for_pod_stabilization
+
+    # Check for recovery actions
+    _unknown_pods "$1"
+    _node_affinity_pods "$1"
+    _force_reset_pods "$1"
+}
+
+
+function start {
+    _check_for_k8s_config
+    _check_for_existing_process
+
+    LOG "Starting."
+
+    _wait_for_systemd
+    _examine_pods 'recover'
+    _examine_pods 'verify'
+}
+
+function stop {
+    LOG "Stopping."
+}
+
+function status {
+    :
+}
+
+function reset {
+    :
+}
+
+case "$1" in
+    start)
+        start
+        ;;
+    stop)
+        stop
+        ;;
+    restart|force-reload|reload)
+        stop
+        start
+        ;;
+    status)
+        status
+        ;;
+    reset)
+        reset
+        ;;
+    *)
+        echo "Usage: $0 {start|stop|force-reload|restart|reload|status|reset}"
+        exit 1
+        ;;
+esac
+
+exit 0
diff --git a/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery.service b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery.service
new file mode 100644
index 000000000..113d0efd4
--- /dev/null
+++ b/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery.service
@@ -0,0 +1,14 @@
+[Unit]
+Description=Kubernetes Pods Recovery Service
+After=sw-patch.service
+After=kubelet.service
+Requires=kubelet.service
+
+[Service]
+Type=simple
+ExecStart=/usr/local/sbin/k8s-pod-recovery start
+ExecStop=/usr/local/sbin/k8s-pod-recovery stop
+PIDFile=/var/run/k8s-pod-recovery.pid
+
+[Install]
+WantedBy=multi-user.target
diff --git a/kubernetes/k8s-pod-recovery/centos/k8s-pod-recovery.spec b/kubernetes/k8s-pod-recovery/centos/k8s-pod-recovery.spec
new file mode 100644
index 000000000..082dd6cc6
--- /dev/null
+++ b/kubernetes/k8s-pod-recovery/centos/k8s-pod-recovery.spec
@@ -0,0 +1,52 @@
+Name: k8s-pod-recovery
+Version: 1.0
+Release: 0%{?_tis_dist}.%{tis_patch_ver}
+Summary: Kubernetes Pod Recovery Service
+License: Apache-2.0
+Group: base
+Packager: Wind River
+URL: unknown
+Source0: k8s-pod-recovery
+Source1: k8s-pod-recovery.service
+
+Requires: /bin/bash
+Requires: systemd
+
+%description
+%{summary}
+
+%define local_dir /usr/local
+%define local_sbindir %{local_dir}/sbin
+
+%prep
+
+%install
+install -d %{buildroot}%{local_sbindir}
+install -m 755 %{SOURCE0} %{buildroot}%{local_sbindir}/k8s-pod-recovery
+install -p -D -m 644 %{SOURCE1} %{buildroot}%{_unitdir}/k8s-pod-recovery.service
+
+%post
+if [ $1 -eq 1 ]; then
+    # Package install: enable and start it
+    /usr/bin/systemctl enable k8s-pod-recovery.service > /dev/null 2>&1 || :
+    /usr/bin/systemctl start k8s-pod-recovery.service > /dev/null 2>&1 || :
+else
+    # Package upgrade: reenable in case [Install] changes and restart to pick up
+    # new actions
+    if /usr/bin/systemctl --quiet is-enabled k8s-pod-recovery.service ; then
+        /usr/bin/systemctl reenable k8s-pod-recovery.service > /dev/null 2>&1 || :
+        /usr/bin/systemctl restart k8s-pod-recovery.service > /dev/null 2>&1 || :
+    fi
+fi
+
+%preun
+if [ $1 -eq 0 ]; then
+    /usr/bin/systemctl stop k8s-pod-recovery.service > /dev/null 2>&1 || :
+    /usr/bin/systemctl disable k8s-pod-recovery.service > /dev/null 2>&1 || :
+fi
+
+
+%files
+%defattr(-,root,root,-)
+%{local_sbindir}/k8s-pod-recovery
+%{_unitdir}/k8s-pod-recovery.service