integ/kubernetes/k8s-pod-recovery/centos/files/k8s-pod-recovery

248 lines
7.0 KiB
Bash
Executable File

#!/bin/bash
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# chkconfig: 2345 76 25
#
### BEGIN INIT INFO
# Provides: k8s-pod-recovery
# Default-Start: 3 5
# Required-Start:
# Required-Stop:
# Default-Stop: 0 1 2 6
# Short-Description: Service to recover pods after host boot
### END INIT INFO
# Pull in platform-wide settings.
# NOTE(review): nothing visible in this script reads a variable from this
# file - confirm whether sourcing it is still required.
. /etc/platform/platform.conf

# Fixed search path so external tools resolve the same way regardless of the
# caller's environment.
export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin
# Exported so kubectl picks up the admin config without extra flags.
export KUBECONFIG=/etc/kubernetes/admin.conf

# Polling interval, in seconds, used by the wait loops.
SLEEP_DELAY_SEC=15
# Script name: used as the syslog tag and to build the pidfile path.
# "$0" is quoted (and preceded by --) so a path containing whitespace or a
# leading dash is handled correctly.
NAME=$(basename -- "$0")
PIDFILE=/var/run/${NAME}.pid
# Used as the spec.nodeName value in kubectl field selectors.
HOST=$(hostname)
# Emit an informational message through syslog (facility "daemon", level
# "info"), tagged with the script name and PID. The original note states
# these messages end up in /var/log/daemon.log.
# Arguments: the message words, passed through to logger unchanged.
function LOG {
    local tag="${NAME}($$): "
    logger -p daemon.info -t "${tag}" "$@"
}
# Emit an error message through syslog (facility "daemon", level "error"),
# tagged with the script name and PID. The original note states these
# messages end up in /var/log/daemon.log.
# Arguments: the message words, passed through to logger unchanged.
function ERROR {
    local tag="${NAME}($$): "
    logger -p daemon.error -t "${tag}" "$@"
}
# Exit the whole script with status 0 when the file named by KUBECONFIG does
# not exist: a node that has not been configured has no pods to recover.
# Globals: KUBECONFIG (read)
# Returns: normally only when the file exists; otherwise the script exits.
function _check_for_k8s_config {
    # The expansion is quoted so that an empty value or a path containing
    # whitespace is tested as a single operand instead of producing a
    # malformed test expression (which would be treated as "file present").
    if [ ! -f "${KUBECONFIG}" ]; then
        LOG "${KUBECONFIG} does not exist. No pods to recover."
        exit 0
    fi
}
# Abort (exit 1) if another instance of this script is already running,
# otherwise (re)create the pidfile for the current process.
# Globals: PIDFILE (read), NAME (read), OUT (written)
# Returns: only when no live instance was found; the pidfile then holds $$.
function _check_for_existing_process {
    local pid='' process=''

    if [ -e "${PIDFILE}" ]; then
        pid=$(cat "${PIDFILE}" 2>/dev/null)
        # Only consult /proc for a well-formed PID whose process still
        # exists; a stale or corrupt pidfile must not produce error noise.
        if [[ "${pid}" =~ ^[0-9]+$ && -r "/proc/${pid}/comm" ]]; then
            process=$(cat "/proc/${pid}/comm" 2>/dev/null)
        fi
        # The kernel truncates the comm value to 15 characters, so compare
        # against the truncated script name; comparing with the full name
        # can never match when the name is longer than that.
        if [[ -n "${process}" && "${process}" == "${NAME:0:15}" ]]; then
            ERROR "Aborting, ${pid} already running: ${PIDFILE}."
            exit 1
        else
            # Stale pidfile: remove it and log what rm reported.
            OUT=$(rm -v -f "${PIDFILE}")
            LOG "${OUT}"
        fi
    fi

    # Create pidfile to indicate the script is running
    echo $$ > "${PIDFILE}"
}
# Block until "systemctl is-system-running" reports a state containing
# "running" or "degraded", logging and sleeping SLEEP_DELAY_SEC seconds
# between polls.
# Globals: SLEEP_DELAY_SEC (read)
function _wait_for_systemd {
    until systemctl is-system-running | grep -q -e running -e degraded; do
        LOG "Waiting for systemd to finish booting..."
        sleep "${SLEEP_DELAY_SEC}"
    done
}
# Poll the cluster until the number of pods that are neither Running nor
# Completed has stayed unchanged for 90/SLEEP_DELAY_SEC consecutive polls
# after the first stable observation.
# Globals: SLEEP_DELAY_SEC (read); last_count, stability_count,
#          NINETY_SEC_COUNT, pods_in_flux (written)
# NOTE(review): if kubectl prints nothing (e.g. the API is unreachable) the
# count is 0, which is indistinguishable from "all pods settled".
function _wait_for_pod_stabilization {
    last_count=0
    stability_count=0
    # Number of polling intervals that make up ninety seconds.
    NINETY_SEC_COUNT=$((90/SLEEP_DELAY_SEC))

    while :; do
        # Count every listed pod whose line mentions neither state.
        pods_in_flux=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pods --no-headers --all-namespaces \
            | grep -c -v -e Running -e Completed)

        if (( pods_in_flux == last_count )); then
            LOG "Pods transitions are stable... for $((stability_count * SLEEP_DELAY_SEC)) seconds."
            if (( stability_count == NINETY_SEC_COUNT )); then
                break
            fi
            stability_count=$((stability_count + 1))
        else
            # The count moved: remember it and restart the stability window.
            LOG "Waiting on pod transitions to stabilize... $pods_in_flux pods are not Running/Completed"
            last_count=$pods_in_flux
            stability_count=0
        fi

        sleep "${SLEEP_DELAY_SEC}"
    done
}
# Handle pods on this host whose kubectl listing line contains "Unknown",
# limited to the supported namespaces.
# Arguments: $1 - action: 'recover' deletes each such pod (without waiting),
#                 'verify' logs whether any are still listed.
# Globals: HOST (read); SUPPORTED_NAMESPACES, PODS, ns, pod (written)
function _unknown_pods {
    # Target specific namespaces and pods on this host
    SUPPORTED_NAMESPACES=('openstack' 'monitor')

    case "$1" in
        recover|verify)
            ;;
        *)
            ERROR "Unknown action: $1"
            return
            ;;
    esac

    for ns in "${SUPPORTED_NAMESPACES[@]}"; do
        # First column of every listing line that mentions Unknown; this
        # covers both Running/Unknown and Pending/Init:Unknown pods.
        PODS=$(kubectl get pods -n "$ns" --field-selector "spec.nodeName=${HOST}" 2>/dev/null \
            | awk '/Unknown/ {print $1}')

        if [ "$1" == 'recover' ]; then
            for pod in $PODS; do
                LOG "Unknown pods: Recovering: $ns/$pod"
                kubectl delete pods -n "$ns" "$pod" --wait=false
            done
        elif [ -z "${PODS}" ]; then
            LOG "Unknown pods: None present for namespace: $ns"
        else
            ERROR "Unknown pods: still present for namespace: $ns"
        fi
    done
}
# Handle Failed pods on this host whose kubectl listing line contains
# "NodeAffinity", across all namespaces.
# Arguments: $1 - action: 'recover' deletes each such pod (without waiting),
#                 'verify' logs whether any are still listed.
# Globals: HOST (read)
function _node_affinity_pods {
    local pods pod

    case "$1" in
        recover|verify)
            ;;
        *)
            ERROR "Unknown action: $1"
            return
            ;;
    esac

    # One "<namespace>/<name>" entry per matching pod. Both actions use the
    # same query; the verify branch previously matched a misspelled pattern
    # and therefore could never report a remaining pod.
    pods=$(kubectl get pods --all-namespaces \
            --field-selector "status.phase=Failed,spec.nodeName=${HOST}" 2>/dev/null \
        | awk '/NodeAffinity/ {print $1"/"$2}')

    if [ "$1" == 'recover' ]; then
        for pod in $pods; do
            LOG "NodeAffinity pods: Recovering: $pod"
            # Split the entry back into namespace and pod name.
            kubectl delete pods -n "${pod%%/*}" "${pod#*/}" --wait=false
        done
    elif [ -z "${pods}" ]; then
        LOG "NodeAffinity pods: None present."
    else
        ERROR "NodeAffinity pods: still present"
    fi
}
# Handle resetting the openstack libvirt pod on this host, as it is
# sometimes in a Running but unusable state. Does nothing when the
# openstack namespace does not exist.
# Arguments: $1 - action: 'recover' deletes each affected pod (without
#                 waiting), 'verify' logs whether any are still listed.
# Globals: HOST (read)
function _force_reset_pods {
    local custom_columns field_selector pods pod

    if ! kubectl get namespace openstack > /dev/null 2>&1; then
        return 0
    fi

    # Get the libvirt pods on this host that are Running without all
    # conditions True
    #
    # Conditions:
    #   Initialized       True
    #   Ready             True
    #   ContainersReady   True
    #   PodScheduled      True
    #
    # NAME                                         STATUS   CONDITIONS           NODE
    # libvirt-libvirt-controller-0-937646f6-xst4r  Running  True,True,True,True  controller-0
    #
    custom_columns='custom-columns=NAME:.metadata.name,STATUS:status.phase,CONDITIONS:status.conditions[*].status,NODE:spec.nodeName'
    field_selector="spec.nodeName=${HOST}"
    # The column spec contains "[*]", which the shell would treat as a glob
    # pattern if expanded unquoted, so every expansion here is quoted.
    # The filters drop the header line and every pod with all four
    # conditions True; what remains is the name column of the others.
    pods=$(kubectl get pods -n openstack -l application=libvirt \
            --field-selector "${field_selector}" -o "${custom_columns}" \
        | grep -v NAME | grep -v 'True,True,True,True' | awk '{print $1}')

    if [ "$1" == 'recover' ]; then
        for pod in $pods; do
            LOG "Recovering libvirt pod: $pod"
            kubectl delete pods -n openstack "$pod" --wait=false
        done
    elif [ "$1" == 'verify' ]; then
        if [ -z "${pods}" ]; then
            LOG "Openstack libvirt pod on ${HOST} is running."
        else
            ERROR "Openstack libvirt pod on ${HOST} has not been recovered."
        fi
    else
        ERROR "Unknown action: $1"
    fi
}
# Run one pass over the pods: first wait for pod transitions to settle,
# then apply the requested action with each of the three pod checks, in
# order.
# Arguments: $1 - action <recover|verify>, forwarded to every check.
function _examine_pods {
    local check

    # Wait for pods transitions to stop
    _wait_for_pod_stabilization

    # Check for recovery actions
    for check in _unknown_pods _node_affinity_pods _force_reset_pods; do
        "${check}" "$1"
    done
}
# Entry point for the "start" action: run the two precondition checks
# (either of which may exit the script), wait for systemd to finish
# booting, then examine the pods twice - a recover pass followed by a
# verify pass.
function start {
    local phase

    _check_for_k8s_config
    _check_for_existing_process

    LOG "Starting."

    _wait_for_systemd
    for phase in 'recover' 'verify'; do
        _examine_pods "${phase}"
    done
}
# Entry point for the "stop" action. There is nothing to tear down; the
# request is only recorded in the log.
function stop {
    LOG 'Stopping.'
}
# Entry point for the "status" action: intentionally a no-op that succeeds.
function status {
    return 0
}
# Entry point for the "reset" action: intentionally a no-op that succeeds.
function reset {
    return 0
}
# Dispatch on the init action given as the first argument. Anything that is
# not a recognised action prints the usage line and exits with status 1;
# every recognised action falls through to the final "exit 0".
case "$1" in
    start|stop|status|reset)
        # Each of these actions is implemented by the function of the
        # same name.
        "$1"
        ;;
    restart|reload|force-reload)
        stop
        start
        ;;
    *)
        echo "Usage: $0 {start|stop|force-reload|restart|reload|status|reset}"
        exit 1
        ;;
esac

exit 0