3554 lines
114 KiB
YAML
3554 lines
114 KiB
YAML
apiVersion: v1
|
|
data:
|
|
init.sh: |
|
|
#!/bin/bash

# vault-manager init script: configuration used by the functions that
# initialize, unseal, monitor and rekey the vault statefulset.

# Get the CA path from environment vars
CERT=$CA_CERT

# Store cert as a one-liner for curl purposes
CA_ONELINE=$(awk '{printf "%s\\n", $0}' $CERT)

# Template values from helm
VAULT_NS={{ .Release.Namespace }}
VAULT_NAME={{ .Values.vault.name }}
VAULT_FN={{ .Values.vault.fullname }}
HA_REPLICAS={{ .Values.server.ha.replicas }}

# Set the domain for resolving pod names
DOMAIN="${VAULT_NS}.pod.cluster.local"
SVCDOMAIN="${VAULT_NS}.svc.cluster.local"

# define host targets and port
POD_TARGET_BASE="$DOMAIN" # requires 'DNS NAME' of pod
ACTIVE_TARGET="${VAULT_FN}-active.${SVCDOMAIN}" # only the active
TARGET_PORT=8200

# impermanent location to store files while running
WORKDIR=/workdir
mkdir -p $WORKDIR

# Selection of kubectl version from helm override
KUBECTL=kubectl
KUBECTL_HELM_OVERRIDE={{ .Values.manager.k8s.client_version }}

# Trap and trap notification file. When SIGTERM is sent to this pod
# we want to exit promptly and gracefully.
TRAPFILE=$WORKDIR/exit_on_trap
trap "touch $TRAPFILE" SIGTERM

# when specifying a trap for debug, remember it with this variable
# reserve trap '0' for disabling a debugging trap request
DEBUGGING_TRAP=0

# Pause notification file. A debugging option to permit
# vault-manager to be paused at any of the exit_on_trap code points.
# Use case may include permitting time for a developer to setup
# conditions for debug and test.
PAUSEFILE=$WORKDIR/pause_on_trap
PAUSE_RATE=1 # rate at which to test for unpause
EARLY_PAUSE={{ .Values.manager.pause }}

# set the default manager mode; modes include
# VAULT_MANAGER (default)
# MOUNT_HELPER
# INTERACTIVE (i.e., when this script is sourced by an author)
if [ -z "$MANAGER_MODE" ]; then
    MANAGER_MODE="VAULT_MANAGER"
fi
# sourced rather than executed: switch to interactive mode
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
    MANAGER_MODE="INTERACTIVE"
fi

# Maximum sleep seconds for mount-helper before exiting
MOUNT_HELPER_MAX_TIME=60

# Maximum seconds to wait for mount-helper pod to start
MAX_POD_RUN_TRIES=10

# Maximum seconds to wait for vault-manager pod to exit
# Vault-manager is not responding to SIGTERM, so will take 30
# seconds
TERMINATE_TRIES_MAX={{ .Values.manager.waitTermination.maxTries }}
TERMINATE_TRIES_SLEEP={{ .Values.manager.waitTermination.sleepTime }}

# Vault key share configuration
KEY_SECRET_SHARES=5
KEY_REQUIRED_THRESHOLD=3

# Enable vault rekey upon conversion of storage from PVC to k8s
# secrets
AUTO_REKEY_CONVERT={{ .Values.manager.rekey.enableOnPVCConversion }}

# Keep track of vault-manager restarting the rekey procedure; if
# this variable is not true (0) and a rekey procedure is in
# progress, then vault-manager was restarted
REKEY_STARTED=1

# Vault manager will rekey the vault at a time when the vault
# servers are stable for a period of time specified by
# REKEY_STABLE_TIME seconds
REKEY_STABLE_TIME=300

# Global variable to share rekey status
REKEY_STATUS_JSON=''

# Keep track of shards that were last successful
SHARDS_LAST_SUCCESSFUL="cluster-key"

# Records for seal status state machine:
PODREC_F="$WORKDIR/previous_pods_status.txt"
PODREC_TMP_F="$WORKDIR/new_pods_status.txt"

# Vault server health query timeout during HA recovery scenario
QUERY_TMOUT={{ .Values.manager.api.healthQueryTimeout }}

# Default curl timeout for REST API commands to vault server.
# This value is what testing shows is the default timeout.
# Specifying it explicitly for clarity.
API_TMOUT=120

# API timeout for unseal operations
API_UNSEAL_OP_TMOUT={{ .Values.manager.api.unsealOpTimeout }}

# API timeout values for rekey operations
API_REKEY_QUERY_TMOUT={{ .Values.manager.api.rekeyStatusTimeout }}
API_REKEY_OP_TMOUT={{ .Values.manager.api.rekeyOpTimeout }}

# polling rates and post-operation settle delays (seconds)
STATEFULSET_RATE=5
INIT_CONVERGE_TIME=10
JOIN_RATE=5
JOIN_CONVERGE_TIME=1
UNSEAL_RATE=10
UNSEAL_CONVERGE_TIME=3
STATUS_RATE={{ .Values.manager.statusCheckRate }}
if [ -z "$STATUS_RATE" ]; then
    STATUS_RATE=5
fi

# with STATUS_RATE, the period to delay unseal
# STATUS_RATE * STATEMACH_START seconds
STATEMACH_START={{ .Values.manager.unsealWaitIntervals }}
if [ -z "$STATEMACH_START" ]; then
    STATEMACH_START=3
fi

# Log levels
DEBUG=1
INFO=2
WARNING=3
ERROR=4
FATAL=5

# Default log level and the set log level (Initially set as default).
# If the log function detects an override file, then it will switch
# the set log level and then delete it.
DEFAULT_LOG_LEVEL=$INFO
LOG_LEVEL={{ .Values.manager.log.defaultLogLevel }}
LOG_OVERRIDE_FILE="$WORKDIR/log_level"
|
|
|
|
# FUNCTIONS
|
|
|
|
# takes major/minor version of k8s and compares
|
|
# for example: v1.28 > v1.27 > v1.26
|
|
#
|
|
# Returns:
|
|
# 0 left is larger
|
|
# 1 equal
|
|
# 2 right is larger
|
|
function compareK8sVersion {
    # Compare two k8s major.minor versions, e.g. v1.28 vs v1.27.
    #
    # Returns:
    #   0  left is larger
    #   1  equal
    #   2  right is larger
    local lhs="${1#v}"
    local rhs="${2#v}"
    local lmaj rmaj

    # identical strings are trivially equal
    if [ "$lhs" = "$rhs" ]; then
        return 1
    fi

    # decide on the major component when it differs
    lmaj="${lhs%.*}"
    rmaj="${rhs%.*}"
    if [ "$lmaj" -ne "$rmaj" ]; then
        if [ "$lmaj" -gt "$rmaj" ]; then
            return 0
        fi
        return 2
    fi

    # majors are equal: decide on the minor component
    if [ "${lhs#*.}" -gt "${rhs#*.}" ]; then
        return 0
    fi
    return 2
}
|
|
|
|
# Give kubectl an opportunity to express complaints in the log
|
|
function k8sComplain {
    # Give kubectl an opportunity to express complaints in the log:
    # run a harmless version query, discard stdout, and log any
    # stderr output as a WARNING.
    local complaints

    complaints="$( $KUBECTL version -o json 2>&1 >/dev/null )"
    if [ -n "$complaints" ]; then
        log $WARNING "kubectl: $complaints"
    fi
}
|
|
|
|
# Double-check that the binary exists before setting the specified
|
|
# value of KUBECTL
|
|
function switchK8sVersion {
    # Switch the active kubectl client to the requested version.
    #
    # $1 - version selector, e.g. "v1.28"; resolves to a binary
    #      named "kubectl.<selector>" under KUBECTL_INSTALL_PATH
    #
    # Returns 0 on success (KUBECTL updated), 1 when the versioned
    # binary is missing from PATH or the install directory.
    local select="$1"
    local fname="kubectl.$select"
    local newbin="${KUBECTL_INSTALL_PATH}/$fname"

    # require the binary both on PATH and at the install location
    which "$fname" >/dev/null
    if [ $? -ne 0 -o ! -f "$newbin" ]; then
        log $ERROR "Missing kubectl version: $select"
        k8sComplain
        return 1
    fi

    # only log when the selection actually changes
    if [ "$KUBECTL" != "$fname" ]; then
        KUBECTL="$fname"
        log $INFO "Switching to use kubectl version $select"
    fi

    k8sComplain
    return 0
}
|
|
|
|
# Select the version of kubectl matching the running server
|
|
function pickK8sVersion {
    # Pick and activate the kubectl client binary that best matches
    # the running k8s server.  Precedence:
    #   1. helm override (.Values.manager.k8s.client_version)
    #   2. exact server major.minor match from KUBE_VERSIONS
    #   3. nearest available version at either end of KUBE_VERSIONS
    # On any failure the current value of KUBECTL is left unchanged.
    local result
    local serverver
    local majorver
    local minorver
    local select=""
    local majmin=""
    local maxver
    local minver

    # omit this code if the image does not support kubectl versions
    if [ -z "$KUBE_VERSIONS" ]; then
        k8sComplain
        return
    fi

    if [ -n "$KUBECTL_HELM_OVERRIDE" ]; then
        # pick the binary requested, if it exists
        switchK8sVersion "$KUBECTL_HELM_OVERRIDE"
        if [ $? -eq 0 ]; then
            return
        fi
        log $ERROR "kubectl version from helm-override not" \
            "available: $KUBECTL_HELM_OVERRIDE"
    fi

    # use -o json for consistent usage, as opposed to --short
    result="$( $KUBECTL version -o json 2>/dev/null )"
    if [ $? -ne 0 ]; then
        log $ERROR "Unable to get k8s server version"
        # no change in value of KUBECTL
        k8sComplain
        return
    fi

    # grep "[0-9]" discards jq's "null" output for missing fields
    serverver="$( jq -r '.serverVersion.gitVersion' <<<"$result" \
        | grep "[0-9]" )"
    majorver="$( jq -r '.serverVersion.major' <<<"$result" \
        | grep "[0-9]" )"
    minorver="$( jq -r '.serverVersion.minor' <<<"$result" \
        | grep "[0-9]" )"
    if [ -z "$serverver" -o -z "$majorver" -o -z "$minorver" ]; then
        log $ERROR "Unable to detect K8s server version:" \
            "["$result"]"
        # no change in value of KUBECTL
        k8sComplain
        return
    fi

    # pick matching client major/minor version; the sentinel
    # 'noverhere' marks exhaustion of the list
    for select in $KUBE_VERSIONS noverhere; do
        majmin="v${majorver}.${minorver}"
        if [[ "$select" =~ ^$majmin ]]; then
            break
        fi
    done

    if [ "$select" == noverhere ]; then
        # Try to pick a near version. We really shouldn't be in
        # this situation, but here is a compromise. This algorithm
        # assumes that there are no omitted versions in the series
        # of KUBE_VERSIONS, and that they are sorted largest to
        # smallest in that list
        maxver="$( awk '{print $1}' <<<"$KUBE_VERSIONS" )"
        minver="$( awk '{print $NF}' <<<"$KUBE_VERSIONS" )"

        # server >= newest available client: use the newest
        compareK8sVersion ${serverver%.*} ${maxver%.*}
        if [ "$?" -le 1 ]; then
            select="$maxver"
        else
            # oldest available client >= server: use the oldest
            compareK8sVersion ${minver%.*} ${serverver%.*}
            if [ "$?" -le 1 ]; then
                select="$minver"
            else
                log $ERROR "Could not pick nearest version for kubectl"
                k8sComplain
                return
            fi
        fi
    fi

    # strip the patch component: binaries are named by major.minor
    switchK8sVersion "${select%.*}"
}
|
|
|
|
# Convert log level to text for log message
|
|
function log_to_str {
    # Map a numeric log level (DEBUG..FATAL) to its display string.
    # Echoes an empty string for any unrecognized level.
    local lvl="$1"
    local name

    case "$lvl" in
        $DEBUG)   name="DEBUG" ;;
        $INFO)    name="INFO" ;;
        $WARNING) name="WARNING" ;;
        $ERROR)   name="ERROR" ;;
        $FATAL)   name="FATAL" ;;
    esac
    echo "$name"
}
|
|
|
|
# Print the specified message to stdout if the call's specified
|
|
# level is at least the configured log level
|
|
function log {
    # Print a timestamped log message to stdout when the call's
    # level ($1) is at least the configured LOG_LEVEL; the
    # remaining arguments form the message text.
    #
    # A digit 1..5 written to LOG_OVERRIDE_FILE changes LOG_LEVEL
    # on the next call; the file is consumed (removed) either way.
    # Overrides are ignored in INTERACTIVE mode.
    local lvl="$1"
    local logStr
    local newLogLevel

    # check if log override file "Exists"
    if [ -f $LOG_OVERRIDE_FILE ] \
        && [ "$MANAGER_MODE" != "INTERACTIVE" ]; then
        newLogLevel=$(cat $LOG_OVERRIDE_FILE)
        # validation for newLogLevel: a single digit 1..5
        if [[ "$newLogLevel" =~ ^[1-5]$ ]]; then
            LOG_LEVEL=$newLogLevel
            logStr="$( log_to_str "$LOG_LEVEL" )"
            echo "$(date +%Y-%m-%dT%H-%M-%S) DEBUG" \
                "Log level set to $logStr"
        else
            echo "$(date +%Y-%m-%dT%H-%M-%S) DEBUG" \
                "Invalid log level read from $LOG_OVERRIDE_FILE."
        fi
        # consume the override request whether valid or not
        rm $LOG_OVERRIDE_FILE
    fi

    # validate LOG_LEVEL. If it is not valid, then use
    # DEFAULT_LOG_LEVEL instead.
    if [[ ! "$LOG_LEVEL" =~ ^[1-5]$ ]]; then
        echo "$(date +%Y-%m-%dT%H-%M-%S) DEBUG" \
            "Invalid log level detected, will be set to" \
            "$( log_to_str "$DEFAULT_LOG_LEVEL" )"
        LOG_LEVEL=$DEFAULT_LOG_LEVEL
    fi

    # check if the log level for this call is equal to or higher
    # than the set log level
    if [ "$lvl" -ge "$LOG_LEVEL" ]; then
        # print log: timestamp, level string, then message words
        logStr="$( log_to_str "$lvl" )"
        echo "$(date +%Y-%m-%dT%H-%M-%S) $logStr ${@:2}"
    fi
}
|
|
|
|
# Validate the helm-supplied health query timeout; when it is not a
# whole number, fall back to curl's default by leaving it empty.
# (Fixed: the warning previously named .Values.manager.healthQueryTimeout,
# but the value is templated from .Values.manager.api.healthQueryTimeout.)
if ! [[ "$QUERY_TMOUT" =~ ^[0-9]+$ ]]; then
    log $WARNING ".Values.manager.api.healthQueryTimeout not an integer"
    QUERY_TMOUT=""
fi
|
|
|
|
function pause_on_trap {
    # Debugging aid: hold vault-manager in a loop while PAUSEFILE
    # exists.  An empty pause file pauses at every trap point; a
    # file containing a trap number pauses only at that point.
    #
    # $1 - the current trap point's number
    local trapid="$1"
    local want

    # nothing to do when no pause was requested
    [ -e "$PAUSEFILE" ] || return

    want="$( cat "$PAUSEFILE" )"
    if [ -n "$want" ] && [ "$want" != "$trapid" ]; then
        # a different trap point was requested
        return
    fi

    log $INFO "Vault manager is paused ($trapid)"
    # Remain paused while the file exists and is either empty or
    # matches this trap point.  Replacing a numbered pause file
    # with an empty one keeps the pause in effect; writing a
    # different number releases it.
    while [ -e "$PAUSEFILE" ]; do
        want="$( cat "$PAUSEFILE" )"
        if [ -n "$want" ] && [ "$trapid" != "$want" ]; then
            break
        fi
        sleep "$PAUSE_RATE"
    done
    log $INFO "Vault manager is unpaused"
}
|
|
|
|
function exit_on_trap {
    # Cooperative termination point.  Called with a unique number
    # ($1) at each call site; exits the script when SIGTERM has
    # been received (the trap handler creates TRAPFILE), or when a
    # developer-specified debug trap number matches this site.
    #
    # NOTE(review): mountHelper calls exit_on_trap with no
    # argument; with $1 empty the numeric comparisons below fail
    # and fall through to the else branch — confirm intended.
    local trap="$1"
    local tfnum=""

    if [ "$MANAGER_MODE" == "INTERACTIVE" ]; then
        # do not interfere with exit_on_trap intended for
        # vault-manager pod
        return
    fi

    # Debug option pause_on_trap
    pause_on_trap "$trap"

    if [ -e "$TRAPFILE" ]; then
        tfnum=$(cat $TRAPFILE)
        log $DEBUG "exit_on_trap: removing $TRAPFILE"
        rm "$TRAPFILE" # for workdir on PVC
        if [ -z "$tfnum" ]; then
            # an empty trap file is the default expected behaviour
            log $INFO "exit_on_trap: ($trap)"
            exit
        # handle trap debugging feature - a developer specifies the
        # trap number to target a specific exit_on_trap call.
        # Setting a value of 0 (zero) disables the debugging trap
        elif [ "$tfnum" -eq 0 ]; then
            log $DEBUG "exit_on_trap: ($trap):" \
                "disable debug trap ($DEBUGGING_TRAP)"
            DEBUGGING_TRAP=0
            # there is no trap with value zero
            return
        else
            # remember the requested trap for later call sites
            DEBUGGING_TRAP="$tfnum"
            log $DEBUG "exit_on_trap: ($trap): " \
                "enable debug trap ($DEBUGGING_TRAP)"
            # check now just in case it matches
            if [ "$DEBUGGING_TRAP" -eq "$trap" ]; then
                log $INFO "exit_on_trap: ($trap): matching"
                exit
            fi
        fi
    # check if there is a matching debug trap set
    elif [ "$DEBUGGING_TRAP" -eq "$trap" ]; then
        log $INFO "exit_on_trap: ($trap): matching"
        exit
    else
        log $DEBUG "exit_on_trap: ($trap): no trap file, no exit"
    fi
}
|
|
|
|
# splits keys into separate files. Each file contains the key and the base64 encoded version.
|
|
# root token will be stored separately
|
|
|
|
function splitShard {
    # Emit a json document containing only the key shard at the
    # given index, preserving the vault init-response structure.
    #
    # $1 - zero-based shard index
    #
    # Reads the full init response on stdin; writes the
    # single-shard document on stdout.
    #
    # The index is passed with --argjson instead of being spliced
    # into the program text, so an unexpected value cannot alter
    # the jq program.
    local index="$1"
    jq --argjson i "$index" \
        '{"keys": [.keys[$i]], "keys_base64": [.keys_base64[$i]]}'
}
|
|
|
|
# merges two split keys
|
|
function mergeKeyJson {
    # Merge two shard json documents into one by concatenating
    # their .keys and .keys_base64 arrays; result on stdout.
    #
    # The two parameters are NAMES of shell variables holding the
    # json text (read via ${!name} indirection below), not the
    # values themselves.
    local jstr1="$1"
    local jstr2="$2"

    # feed both documents to jq through named pipes so neither
    # value appears on a command line
    mkfifo "$WORKDIR"/s1
    mkfifo "$WORKDIR"/s2

    # jq is backgrounded: fifo opens block until both ends are
    # connected, so the writes below must run concurrently.
    # NOTE(review): --argfile is deprecated in newer jq releases —
    # confirm the image's jq version still supports it.
    (
    jq -Mn --argfile file1 $WORKDIR/s1 --argfile file2 $WORKDIR/s2 '
        def mergek: ($file1, $file2) | .keys as $k | $k;
        def mergeb: ($file1, $file2) | .keys_base64 as $b | $b;
        {keys: (reduce mergek as $x ([]; . + $x)),
        keys_base64: (reduce mergeb as $x ([]; . + $x))}
    ' & ) 2>/dev/null

    # ${!var}: indirect expansion of the caller-named variables
    echo -n "${!jstr1}" > "$WORKDIR"/s1
    echo -n "${!jstr2}" > "$WORKDIR"/s2

    rm -f "$WORKDIR"/s1 "$WORKDIR"/s2
}
|
|
|
|
# Prepare a json document from the k8s secrets prefixed with
|
|
# prefix, and the root token
|
|
#
|
|
# Required parameter: The prefix of the k8s secrets containing
|
|
# the shards
|
|
#
|
|
# Outputs the json document which is comparable to the original
|
|
# response for vault initialization. The calling function is
|
|
# responsible for validating the document content.
|
|
#
|
|
function reconstructInitResponse {
    # Rebuild the original vault init REST response from the k8s
    # secrets "<prefix>-0" .. "<prefix>-(KEY_SECRET_SHARES-1)" plus
    # the "cluster-key-root" token secret.
    #
    # $1 - prefix of the k8s secrets containing the shards
    #
    # Outputs the json document which is comparable to the original
    # response for vault initialization. The calling function is
    # responsible for validating the document content.
    local prefix="$1"
    local index
    local keys
    local mkeys

    # pull secrets from k8s and merge into one json file.
    for index in $( seq 0 $(( KEY_SECRET_SHARES - 1 )) ); do
        keys="$( get_secret "${prefix}-$index" )"
        if [ "$index" -eq 0 ]; then
            # the first shard seeds the merge accumulator
            mkeys="$keys"
            continue
        fi
        # mergeKeyJson takes variable NAMES, not values
        mkeys=$( mergeKeyJson mkeys keys )
    done

    # append the root secret and echo the document
    echo "$mkeys" | jq -c '{keys: .keys,
        keys_base64: .keys_base64,
        root_token: "'$( get_secret "cluster-key-root" )'"}'
}
|
|
|
|
# Check the structure of json data and confirm equivalence of
|
|
# the stdin with stored secrets
|
|
#
|
|
# Required parameter: The prefix of the k8s secrets containing
|
|
# the shards in stored secrets
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
function validateSecrets {
    # Check the structure of the json init-response document on
    # stdin and confirm it is byte-identical to what is stored in
    # the k8s secrets with the given prefix.
    #
    # $1 - prefix of the k8s secrets containing the shards
    #
    # Returns the normal linux success=0, failure!=0
    local keyprefix="$1"
    local text
    local keys
    local keys_base64
    local root_token
    local count
    local saved
    local shaA
    local shaB

    # slurp the whole document from stdin
    text=$( cat )
    keys=$( echo "$text" | jq '.keys' )
    keys_base64=$( echo "$text" | jq '.keys_base64' )
    root_token=$( echo "$text" | jq -r '.root_token' )
    # response is 'null' if the dict key is missing
    # response is empty (-z) if the source document is empty
    if [ -z "$keys" -o "$keys" == "null" \
        -o -z "$keys_base64" -o "$keys_base64" == "null" \
        -o -z "$root_token" -o "$root_token" == "null" ]; then
        log $ERROR "one or more missing keys"
        return 1
    fi

    # both arrays must hold exactly KEY_SECRET_SHARES entries
    count=$( echo "$keys" | jq '. | length' )
    if [ $? -ne 0 ]; then
        log $ERROR "jq did not parse keys length"
        return 1
    fi
    if [ -z "$count" ] || [ "$count" -ne "$KEY_SECRET_SHARES" ]; then
        log $ERROR "Incorrect array length for keys:" \
            "$count instead of $KEY_SECRET_SHARES"
        return 1
    fi
    count=$( echo "$keys_base64" | jq '. | length' )
    if [ $? -ne 0 ]; then
        log $ERROR "jq did not parse keys_base64 length"
        return 1
    fi
    if [ -z "$count" ] || [ "$count" -ne "$KEY_SECRET_SHARES" ]; then
        log $ERROR "Incorrect array length for keys_base64:" \
            "$count instead of $KEY_SECRET_SHARES"
        return 1
    fi

    # rebuild the document from the stored k8s secrets
    saved="$( reconstructInitResponse "${keyprefix}" )"

    # finally ensure that the saved secrets are the same as the
    # supplied text
    shaA=$( echo "$text" | sha256sum )
    shaB=$( echo "$saved" | sha256sum )
    if [ "$shaA" != "$shaB" ]; then
        log $ERROR "saved data differs from source data"
        return 1
    fi

    log $INFO "Verified stored secrets are the same as supplied data"
    return 0
}
|
|
|
|
# Creates a list of all k8s vault pods and stores in text file.
|
|
# Converts ips from X.X.X.X or a:b:c::d to X-X-X-X for use as pod
|
|
# dns names
|
|
#
|
|
# Optional parameter:
|
|
# --ha : append vault server active/standby status (boolean)
|
|
#
|
|
# Example output with --ha
|
|
# sva-vault-0 172-16-226-97 true
|
|
function getVaultPods {
    # List vault server pods, one per line, as "<name>\t<ip>" with
    # dots/colons in the ip rewritten to dashes so the result can
    # be used as a pod dns label.  With --ha a third column carries
    # the vault-active label (e.g. "true" for the active server).
    local ha="$1"
    local name_expr='{.metadata.name}'
    local ip_expr='{.status.podIPs[].ip}'
    local active_expr='{.metadata.labels.vault-active}'
    local columns
    local jpath

    columns=${name_expr}'{"\t"}'${ip_expr}
    if [ "$ha" == "--ha" ]; then
        columns=${columns}'{"\t"}'${active_expr}
    fi
    jpath='{range .items[*]}'"$columns"'{"\n"}{end}'

    $KUBECTL get pods \
        -n "$VAULT_NS" \
        -l component=server,app.kubernetes.io/name=vault \
        -o=jsonpath="$jpath" \
        | sed 's/\.\|:/-/g'
}
|
|
|
|
# Wait for the vault servers in the stateful set to be
|
|
# created before initializing
|
|
function waitForPods {
    # Block until at least $1 vault server pods report phase
    # Running (polling every STATEFULSET_RATE seconds).
    #
    # NOTE(review): unlike getVaultPods these queries omit
    # -n "$VAULT_NS" and use the context's default namespace —
    # confirm that is intentional.
    local jsonPath='{range .items[*]}{.metadata.name}{"\t"} \
{.status.podIPs[].ip}{"\t"}{.status.phase}{"\n"} \
{end}'

    # count pods currently in Running phase
    CURRENT_PODS=$($KUBECTL get pods \
        -l component=server,app.kubernetes.io/name=vault \
        -o=jsonpath="$jsonPath" \
        | grep Running \
        | wc -l)
    DESIRED_PODS=$1

    # guard against non-numeric output (e.g. kubectl errors)
    if ! [[ "$CURRENT_PODS" =~ ^[0-9]+$ ]]; then
        log $ERROR "Invalid Running pod number ($CURRENT_PODS) from kubectl get pods"
        CURRENT_PODS=0
    fi

    # NOTE(review): the recount inside the loop is not re-checked
    # against the numeric pattern; garbage output here would abort
    # the -lt comparison — confirm acceptable.
    while [ $CURRENT_PODS -lt $DESIRED_PODS ]; do
        sleep "$STATEFULSET_RATE"
        log $INFO "Waiting for ${VAULT_FN}" \
            "statefulset running pods ($CURRENT_PODS) to equal" \
            "desired pods ($DESIRED_PODS)"
        CURRENT_PODS=$($KUBECTL get pods \
            -l component=server,app.kubernetes.io/name=vault \
            -o=jsonpath="$jsonPath" \
            | grep Running \
            | wc -l)
    done
}
|
|
|
|
# Takes the json document output from vault initialization
|
|
# and stores it into secrets for key shards and the root token
|
|
#
|
|
# Required parameter: The prefix of the k8s secrets into which to
|
|
# store the shards
|
|
#
|
|
# This only works if the secrets are not pre-existing. An error
|
|
# is printed by set_secrets.
|
|
function storeVaultInitSecrets {
    # Persist the vault init response (read from stdin) into k8s
    # secrets: one secret per key shard ("<prefix>-<index>") plus,
    # when present, the root token ("<prefix>-root").
    #
    # $1 - prefix of the k8s secrets to create
    #
    # Creation fails for pre-existing secrets; set_secret logs the
    # error in that case.
    local keyprefix="$1"
    local payload
    local i
    local fragment

    payload=$( cat )

    # one secret per key shard
    for i in $(seq 0 $((KEY_SECRET_SHARES - 1 ))); do
        fragment=$( echo -n "$payload" | splitShard "$i" )
        set_secret "${keyprefix}-$i" /dev/stdin <<< "$fragment"
    done

    # if the data contains root_token, save it as well
    fragment=$( echo "$payload" | jq -r '.root_token' )
    if [ -n "$fragment" -a "$fragment" != 'null' ]; then
        set_secret "${keyprefix}-root" /dev/stdin <<< "$fragment"
    fi
}
|
|
|
|
# Address a vault server with REST API request. Capture stderr,
|
|
# stdout and result of curl commands. Print error and debug logs
|
|
#
|
|
# Required positional parameters, in order:
|
|
# Response variable : variable in which to store the response
|
|
# from vault
|
|
# http request type : GET, POST, DELETE
|
|
# vault server : FQDN
|
|
# vault REST API path : e.g., /sys/health
|
|
#
|
|
# Optional final parameter : a quoted string of data
|
|
#
|
|
# Examples:
|
|
# # get health status query for the active vault status
|
|
# vaultAPI myvar GET $ACTIVE_TARGET /sys/health
|
|
#
|
|
# # post rekey initialization with shares 5 and threshold 3
|
|
# data='{"secret_shares": 5,"secret_threshold": 3}'
|
|
# vaultAPI myvar POST $ACTIVE_TARGET /sys/rekey/init "$data"
|
|
#
|
|
# Overridable ENV variables:
|
|
# API_TMOUT: the curl timeout
|
|
# NO_HEADER: omit header (the root token) if not empty
|
|
#
|
|
# Output:
|
|
# Return the stdout and command result code
|
|
#
|
|
# Print log messages for errors. The responses from vault are
|
|
# restricted to DEBUG level log in case there's secret information
|
|
# in them. But a non-specific ERROR message is printed in all
|
|
# cases of errors.
|
|
function vaultAPI {
    # Issue a REST API request to a vault server via curl and
    # return the response through the caller's variable.
    #
    # $1 - name of the variable to receive the response body
    # $2 - HTTP method: GET, POST, DELETE
    # $3 - vault server FQDN
    # $4 - REST path, e.g. /sys/health
    # $5 - optional request body (json string)
    #
    # Env overrides: API_TMOUT (curl connect timeout), NO_HEADER
    # (omit the root-token header when non-empty).
    #
    # Returns curl's result code, or 1 when the REST API reported
    # errors.  Response details are logged at DEBUG only, since
    # they may contain secrets.
    local answer="$1"
    local reqarg="$2"
    local server="$3"
    local apipath="$4"
    local data="$5"
    local cmderr=""
    local cmdout=""
    local cmdres=1
    local header=""
    local errors=""

    # default: authenticate with the root token from k8s secrets
    if [ -z "$NO_HEADER" ]; then
        header="X-Vault-Token:$( get_secret cluster-key-root )"
    fi

    # log the command shape only - token and payload are masked
    log $DEBUG "Executing: [curl -s -S --cacert \""$CERT"\"" \
        ${API_TMOUT:+"--connect-timeout" "$API_TMOUT"} \
        ${header:+"--header" "xxxx"} \
        "--request \"$reqarg\"" \
        ${data:+"--data" "xxxx"} \
        "\"https://${server}:${TARGET_PORT}/v1${apipath}\"]"

    # Capture stderr and stdout copied from google search example
    # on stack overflow. Add capture of the command result code.
    # The inner printf emits NUL-terminated stderr then stdout; the
    # two reads split them apart, and the trailing echo of $?
    # becomes the last stdout line (curl's result code).
    {
        IFS=$'\n' read -r -d '' cmderr;
        IFS=$'\n' read -r -d '' cmdout;
        cmdres="$( echo "$cmdout" | tail -n1 )"
        cmdout="$( echo "$cmdout" | head -n-1 )"
    } < <((printf '\0%s\0' "$(
        curl -s -S --cacert "$CERT" \
            ${API_TMOUT:+"--connect-timeout" "$API_TMOUT"} \
            ${header:+"--header" "$header"} \
            --request "$reqarg" \
            ${data:+"--data" "$data"} \
            "https://${server}:${TARGET_PORT}/v1${apipath}"
        echo "$?"
    )" 1>&2) 2>&1)

    if [ "$cmdres" -ne 0 ]; then
        log $ERROR "curl returns non-zero result: $cmdres"
    fi
    if [ -n "$cmderr" ]; then
        # details only at DEBUG: the output may contain secrets
        log $ERROR "curl returns stderr"
        log $DEBUG "curl returns stderr: [$cmderr]"
    fi

    if [ -n "$cmdout" ]; then
        # errors from the REST API
        errors=$( echo "$cmdout" | jq -cr '.errors' )
        if [[ "$errors" != 'null' ]] && [ -n "$errors" ]; then
            log $ERROR "vault REST API error"
            log $DEBUG "vault REST API error: $errors"
            if [ "$cmdres" -eq 0 ]; then
                # this code wants to know if there was an error
                cmdres=1
            fi
        fi
    fi
    # hand the response body back through the caller's variable
    eval "$answer"='$cmdout'
    return $cmdres
}
|
|
|
|
# Initializes the first vault pod, only needs to be performed once
|
|
# after deploying the helm chart
|
|
# Stores the root token and master key shards in k8s secrets
|
|
function initVault {
    # One-time vault initialization after helm deploy: request
    # /sys/init on the first server with the configured share
    # count and threshold, store the resulting shards and root
    # token as k8s secrets, then validate the stored copies.
    local V0 # the zeroth vault pod
    local keys
    local key_error
    local shares
    local threshold

    # field 2 of the first pods.txt record is the pod dns label
    V0=$(awk 'NR==1{print $2}' $WORKDIR/pods.txt)
    log $INFO "Initializing $V0"
    shares='"secret_shares": '$KEY_SECRET_SHARES
    threshold='"secret_threshold": '$KEY_REQUIRED_THRESHOLD

    # no root token exists yet: the init request is unauthenticated
    NO_HEADER=true \
    vaultAPI keys POST $V0.$POD_TARGET_BASE \
        /sys/init "{$shares, $threshold}"

    key_error=$(echo -n "$keys"| jq -r '.errors[]?')
    if [ -n "$key_error" ]; then
        log $ERROR "vault init request failed: $key_error"
    fi

    echo "$keys" | storeVaultInitSecrets cluster-key

    # check if the secrets match vault's REST API response
    echo "$keys" | validateSecrets cluster-key
}
|
|
|
|
# Uses the master key shards to unseal vault
|
|
function unsealVault {
    # Unseal vault server $1 using the key shards stored in k8s
    # secrets "<prefix>-0" .. "<prefix>-N" (prefix $2, default
    # "cluster-key").
    #
    # Returns:
    #   0  server reports sealed=false
    #   2  a shard failed authentication (wrong key set)
    #   1  any other failure
    local server="$1"
    local prefix="$2"
    local index
    local b64key
    local data
    local response
    local value
    local autherror

    if [ -z "$prefix" ]; then
        prefix='cluster-key'
    fi

    # always abort an unseal in progress
    data='{"reset": true}'
    NO_HEADER=true \
    API_TMOUT=$API_UNSEAL_OP_TMOUT \
    vaultAPI response POST $server.$POD_TARGET_BASE \
        /sys/unseal "$data"
    if [ $? -ne 0 ]; then
        # error is already printed
        # Including if vault is already unsealed.
        if [[ "$response" == *"vault is unsealed"* ]]; then
            log $WARNING "unsealVault: server $server is" \
                "already unsealed"
        fi
        return 1
    fi

    # submit shards one at a time until the server unseals
    for index in $(seq 0 $((KEY_SECRET_SHARES - 1 ))); do
        b64key=$( get_secret "${prefix}-$index" \
            | jq -r '.keys_base64[]' )
        data="{\"key\": \"$b64key\"}"

        NO_HEADER=true \
        API_TMOUT=$API_UNSEAL_OP_TMOUT \
        vaultAPI response POST $server.$POD_TARGET_BASE \
            /sys/unseal "$data"
        if [ $? -ne 0 ]; then
            # error is already printed, including errors from the
            # vault REST API; but for debugging purposes, highlight
            # the authentication error
            autherror="cipher: message authentication failed"
            if [[ "$response" == *"$autherror"* ]]; then
                log $ERROR "Failed to authenticate /sys/unseal" \
                    "with $prefix"
                # perhaps use this info in the future
                return 2
            fi
            log $DEBUG "Unknown failure authenticating unseal" \
                "$response"
            return 1
        fi

        # when the unseal completes with KEY_REQUIRED_THRESHOLD then
        # the response will indicate sealed=false
        value="$( echo "$response" | jq -r ".sealed" )"
        if [ "$value" == "false" ]; then
            log $DEBUG "Success authenticating unseal"
            return 0
        fi

        # still sealed: report progress toward the threshold
        value="$( echo "$response" | jq -r ".progress" )"
        log $DEBUG "Success authenticating unseal" \
            "(${value}/${KEY_REQUIRED_THRESHOLD})"
        # Some sleep is required to allow Raft convergence
        sleep "$UNSEAL_CONVERGE_TIME"
    done

    # all shards submitted yet the server never reported unsealed
    log $ERROR "unsealVault completes without unseal or error"
    return 1
}
|
|
|
|
# Unseal a vault server under conditions of recovery,
|
|
# including selecting and remembering alternate shard
|
|
# secrets.
|
|
#
|
|
# This algorithm remembers the last shards used to unseal the vault,
|
|
# to prioritize using those again the next time.
|
|
function unsealVaultRecover {
    # Unseal server $1 during recovery, trying shard secret sets in
    # priority order: the set that worked last time first (if its
    # secrets still exist), then every other available set.  The
    # working set is remembered in SHARDS_LAST_SUCCESSFUL.
    #
    # Returns 0 on success, 1 on failure.
    local server="$1"
    local candidate
    local candidates=""

    if [ -n "$SHARDS_LAST_SUCCESSFUL" ]; then
        # double check the keys we were using are not deleted
        if assertShardSecrets "$SHARDS_LAST_SUCCESSFUL"; then
            candidates="$SHARDS_LAST_SUCCESSFUL"
        fi
    fi

    # then any remaining shard secret sets
    candidates="$candidates $( \
        getOtherShardSecrets "$SHARDS_LAST_SUCCESSFUL" )"

    for candidate in $candidates; do
        log $INFO "Attempt unseal with $candidate"
        unsealVault "$server" "$candidate"
        case $? in
            0)
                # remember the working set for next time
                SHARDS_LAST_SUCCESSFUL="$candidate"
                return 0
                ;;
            2)
                # authentication failed with this set (error
                # already printed); try a different set of shards
                continue
                ;;
            *)
                # failure is not clear, try again later
                log $ERROR "Fail to unseal $server with" \
                    "$candidate; try later"
                return 1
                ;;
        esac
    done

    log $ERROR "No set of shards unseal the server $server:" \
        "attempted: $candidates"
    return 1
}
|
|
|
|
# Takes the address of vault-0 as the cluster leader and
|
|
# joins other nodes to raft
|
|
function joinRaft {
    # Join vault server pod $1 (its dns label) to the raft cluster
    # led by the active server, retrying until the API reports
    # joined=true.
    #
    # RAFT_STATUS is intentionally not declared local.
    # NOTE(review): loops forever if the join never succeeds, and
    # JOIN_RATE is not used here — confirm both are intentional.
    local dnsname="$1"
    local activeLink="https://${ACTIVE_TARGET}:${TARGET_PORT}"
    local dataJson="{\"leader_api_addr\": \"$activeLink\", \"leader_ca_cert\": \"$CA_ONELINE\"}"

    RAFT_STATUS=""
    while [ "$RAFT_STATUS" != "true" ]; do

        vaultAPI RAFT_STATUS POST $dnsname.$POD_TARGET_BASE \
            /sys/storage/raft/join "$dataJson"

        log $INFO "$dnsname $RAFT_STATUS"
        # reduce the response to the boolean 'joined' field
        RAFT_STATUS=$(echo $RAFT_STATUS | jq -r .joined)
        sleep "$JOIN_CONVERGE_TIME"
    done
}
|
|
|
|
function runStateMachine {
    # One iteration of the per-pod seal-status state machine.
    #
    # $1 - pod hostname
    # $2 - pod dns label
    # $3 - sealed status string ("true", "false", or other)
    #
    # A sealed pod is unsealed only after staying sealed for
    # STATEMACH_START consecutive iterations; the countdown lives
    # in field 5 of the status record.  Records for this iteration
    # are appended to PODREC_TMP_F; the previous iteration's are
    # read from PODREC_F.
    local host="$1"
    local dns_name="$2"
    local sealed="$3"
    local status_rec
    local old_rec
    local counter

    # record format: /<host>/<dns name>/<sealed>/[counter]
    status_rec="/$host/$dns_name/$sealed/"

    # log compression: do not print logs when status is unchanged
    # omit counter when checking vault server state change
    old_rec="$( grep "$status_rec" "$PODREC_F" )"
    if [ $? -ne 0 ]; then
        log $DEBUG "$( grep "$dns_name" $WORKDIR/pods.txt )"
        log $INFO "Sealed status of $dns_name is now: $sealed"

        # reread the record by hostname only
        old_rec="$( grep "^/$host/" "$PODREC_F" )"
    else
        log $DEBUG "There is no change in pod seal status"
    fi

    if [ "$sealed" != "true" ]; then
        # There is nothing more to do: the vault is unsealed
        # or the sealed status is unclear
        echo "$status_rec" >> "$PODREC_TMP_F"
        return
    fi

    # The vault is sealed
    #
    # Check if there is a countdown in progress
    #
    # else -z old_rec: "the pod didn't have an IP address the last
    # iteration, but now it does" - treat the same as "sealed
    # without a countdown"
    counter=""
    if [ -n "$old_rec" ]; then
        counter="$( echo "$old_rec" | awk -F/ '{print $5}' )"
    fi

    if [ -z "$counter" ]; then
        # sealed without a countdown: start counting
        log $DEBUG "Sealed vault $host: begin unseal delay:" \
            "$( expr "$STATUS_RATE" \* "$STATEMACH_START" )s"
        echo "${status_rec}${STATEMACH_START}" >> "$PODREC_TMP_F"
        return
    fi

    # Check for end of period: 1 means "zero at this interval"
    # "less than 1" for resilience
    if [ "$counter" -le 1 -o "$STATEMACH_START" -eq 0 ]; then
        # We've waited (STATUS_RATE * STATEMACH_START) seconds
        # Or, STATEMACH_START == 0 means do not delay
        log $INFO "Unsealing $dns_name"
        unsealVaultRecover "$dns_name"
        echo "$status_rec" >> "$PODREC_TMP_F"
        return
    fi

    # finally, continue to countdown
    counter="$( expr "$counter" - 1 )"
    echo "${status_rec}${counter}" >> "$PODREC_TMP_F"
}
|
|
|
|
function vaultInitialized {
    # Poll the first vault server until it answers /sys/health,
    # then report its initialization status.  The health response
    # is saved to $WORKDIR/healthcheck.txt as a side effect.
    #
    # Returns 1 when vault reports initialized=false, 0 otherwise
    # (including when the status cannot be determined).
    local response
    local dnsname
    local initialized
    local text

    # Wait for the pod to respond with a positive vault API response
    # (i.e., not just a curl failure, and not a vault API failure)
    while true; do
        dnsname=$(awk 'NR==1{print $2}' $WORKDIR/pods.txt)
        if [ -z "$dnsname" ]; then
            log $INFO "waiting..."
            sleep $STATUS_RATE
            getVaultPods > $WORKDIR/pods.txt
            continue
        fi

        log $INFO "Query server $dnsname for initialization status"
        NO_HEADER=true \
        API_TMOUT=$QUERY_TMOUT \
        vaultAPI response GET $dnsname.$POD_TARGET_BASE /sys/health
        if [ $? -ne 0 ]; then
            log $INFO "waiting..."
            sleep $STATUS_RATE
            getVaultPods > $WORKDIR/pods.txt
            continue
        fi
        break
    done

    echo -n "$response" > $WORKDIR/healthcheck.txt
    initialized=$( echo "$response" | jq -r .initialized )

    # quoted to keep grep's pattern a single argument
    text="$( grep "$dnsname" $WORKDIR/pods.txt )"
    if [ $? -eq 0 ]; then
        log $DEBUG "$text"
        log $DEBUG "Initialized status is $initialized"
    fi

    # The empty check is here as an extra safety net.  The
    # expansions are quoted so an empty or multi-word value cannot
    # break the test expression (they were previously unquoted).
    if [ -n "$initialized" ] && [ "$initialized" = "false" ]; then
        return 1
    else
        return 0
    fi
}
|
|
|
|
function set_secret {
    # Create k8s secret $1 in the vault namespace holding the
    # content of file $2 under the key 'strdata'.  Fails when the
    # secret already exists.
    #
    # Returns kubectl's result code; on failure the error is
    # logged and the command output kept to DEBUG (it may contain
    # sensitive data).
    local secret="$1"
    local contentf="$2"
    local cmdout
    local rc

    cmdout="$( $KUBECTL create secret generic -n "$VAULT_NS" \
        "$secret" "--from-file=strdata=$contentf" 2>&1 )"
    rc=$?
    if [ "$rc" -ne 0 ]; then
        log $ERROR "Failed to create secret $secret"
        log $DEBUG "Output: [$cmdout]"
    fi
    return $rc
}
|
|
|
|
# Print the decoded 'strdata' payload of a k8s secret.
#
# $1 - name of the secret to read
function get_secret {
    local name="$1"
    local jpath='{.data.strdata}'

    $KUBECTL get secrets -n "$VAULT_NS" "$name" -o jsonpath="$jpath" \
        | base64 -d
}
|
|
|
|
# When vault-manager is run in "MOUNT_HELPER" mode, this function
|
|
# will not return. Instead the function will exit_on_trap or exit
|
|
# when it times-out.
|
|
#
|
|
# Basically: this function doesn't do anything except wait to be
|
|
# terminated.
|
|
#
|
|
# Vault-manager in MOUNT_HELPER has PVC mounted, allowing the real
|
|
# vault-manager to read secrets from cluster_keys.json
|
|
# Idle loop for MOUNT_HELPER mode; does not return in that mode.
#
# A mount-helper pod exists only so the real vault-manager can read
# secrets from the PVC-mounted cluster_keys.json. It waits up to
# MOUNT_HELPER_MAX_TIME seconds to be terminated (exit_on_trap
# handles SIGTERM), then exits 0 on timeout.
#
# In any other mode the function returns immediately.
function mountHelper {
    local count

    # omit this function if this pod is not the mount helper
    # (an empty MANAGER_MODE also fails the equality test)
    if [ "$MANAGER_MODE" != "MOUNT_HELPER" ]; then
        log $INFO "Mode is VAULT_MANAGER"
        return
    fi

    # When vault-manager is running in this mode, it should be
    # deleted by vault-manager running in the default mode, which
    # is using this pod to read secrets from mounted PVC
    log $INFO "Mode is $MANAGER_MODE"

    # start with some debug/error logs
    if [ -f "$PVC_DIR/cluster_keys.json" ]; then
        log $DEBUG "Successfully mounted secrets file"
    else
        log $WARNING "Secrets file not found"
    fi

    # sleep for MOUNT_HELPER_MAX_TIME, expecting SIGTERM signal
    log $INFO "Waiting for termination request via SIGTERM"
    count=0
    until [ "$count" -ge "$MOUNT_HELPER_MAX_TIME" ]; do
        exit_on_trap
        count=$((count+1))
        sleep 1
    done

    # Normally should exit by exit_on_trap, but here we timeout
    # waiting for the real vault-manager to delete this job/pod.
    log $INFO "Exiting without receiving SIGTERM request"
    exit 0
}
|
|
|
|
# Check if a secret exists
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Prints the name of the secret
|
|
# Check if a k8s secret exists.
#
# $1 - secret name
#
# Prints the name of the secret when found.
# Returns the normal linux success=0, failure!=0
function secretExists {
    local secret="$1"
    local jpath='{.metadata.name}'

    $KUBECTL get secrets -n "$VAULT_NS" "$secret" \
        -o jsonpath="$jpath" 2>/dev/null \
        | grep "$secret"
}
|
|
|
|
# Return linux success=0 if any of the secrets exist
|
|
# Return linux success=0 if at least one of the named secrets exists.
#
# $@ - secret names to probe
function secretsExistAny {
    local candidate

    for candidate in "$@"; do
        if secretExists "$candidate" >/dev/null; then
            return 0
        fi
    done

    return 1
}
|
|
|
|
# Assert that the shard secrets starting with prefix exist
|
|
#
|
|
# Parameter: prefix for k8s secrets, such as 'cluster-key'
|
|
#
|
|
# Optional second parameter:
|
|
# --nokeys : failed if at least one exists
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
#
|
|
# When --nokeys is selected, the failure return code is the number
|
|
# of secrets found. Zero secrets were expected.
|
|
#
|
|
# When --nokeys is omitted, the failure return code is either the
|
|
# number of secrets found or if the number of secrets found was
|
|
# zero, KEY_SECRET_SHARES is returned as error code
|
|
# Assert that the shard secrets "<prefix>-0".."<prefix>-(N-1)" exist,
# where N is KEY_SECRET_SHARES.
#
# $1 - prefix for k8s secrets, such as 'cluster-key'
# $2 - optional '--nokeys': succeed only when none exist
#
# With --nokeys the return code is the number of secrets found
# (so 0 secrets == success). Without it, 0 means all shares exist;
# otherwise the count found is returned, or KEY_SECRET_SHARES when
# zero were found (keeping the code a non-zero error value).
function assertShardSecrets {
    local prefix="$1"
    local mode="$2"
    local idx
    local found=0

    for idx in $( seq 0 $((KEY_SECRET_SHARES-1)) ); do
        if secretExists "${prefix}-${idx}" >/dev/null; then
            found=$((found+1))
        fi
    done

    if [ "$mode" == "--nokeys" ]; then
        # 0 secrets == true (0); else return the number of secrets
        return $found
    fi

    if [ "$found" -eq "$KEY_SECRET_SHARES" ]; then
        return 0
    fi
    if [ "$found" -eq 0 ]; then
        return "$KEY_SECRET_SHARES" # an error result
    fi
    return "$found"
}
|
|
|
|
# Return a list of other existing Shard secrets other than the set
|
|
# specified
|
|
#
|
|
# Sort by priority order:
|
|
# cluster-key
|
|
# cluster-rekey
|
|
# cluster-key-bk
|
|
#
|
|
# Print the shard-secret prefixes that exist, excluding the one
# specified.
#
# Scanned in priority order: cluster-key, cluster-rekey,
# cluster-key-bk
#
# $1 - prefix to omit from the scan
function getOtherShardSecrets {
    local skip="$1"
    local candidate
    local found=""

    for candidate in cluster-key cluster-rekey cluster-key-bk; do
        if [ "$candidate" == "$skip" ]; then
            continue
        fi
        if assertShardSecrets "$candidate"; then
            found="$found $candidate"
        fi
    done

    # unquoted on purpose: collapses the leading separator
    echo $found
}
|
|
|
|
# Delete the specified list of secrets
|
|
#
|
|
# Uses a single kubectl command
|
|
# Delete the specified list of secrets
#
# $@ - secret names, deleted with a single kubectl command
#
# Returns 0 on success, 1 on kubectl failure (output logged at ERROR)
function deleteSecrets {
    local secrets="$@"
    local text

    text="$( $KUBECTL delete secrets -n "$VAULT_NS" \
        $secrets 2>&1 )"
    if [ $? -ne 0 ]; then
        # $text is now quoted; previously it sat outside the quotes
        # and was word-split and glob-expanded before logging
        log $ERROR "Error deleting secrets: [$text]"
        return 1
    fi
    log $INFO "$text"
    return 0
}
|
|
|
|
# Check if the PVC resource exists
|
|
#
|
|
# Returns 0 if pvc does not exist
|
|
# Returns 1 if pvc exists but is terminating
|
|
# Returns 2 if pvc exists and is not terminating
|
|
# Prints the name of the PVC resource
|
|
# Check if the manager PVC resource exists
#
# Prints the name of the PVC resource when found.
#
# Returns 0 if pvc does not exist
# Returns 1 if pvc exists but is terminating
# Returns 2 if pvc exists and is not terminating
function pvcRemoved {
    local text
    local jqscript
    local pvcName
    local pvcStatus

    jqscript='.items
        | map(select(.metadata.name | test("^manager-pvc")))
        | "\(.[0].metadata.name) \(.[0].status.phase)"'

    # using jq since kubernetes does not support regex
    # the grep makes sure the result contains the 'manager-pvc'
    # string (as opposed to 'null' for example)
    text="$(
        $KUBECTL get persistentvolumeclaims -n "$VAULT_NS" -o json \
        | jq -r "$jqscript" 2>/dev/null \
        | grep manager-pvc )"

    if [ -n "$text" ]; then
        # 'read' strips the here-string's trailing newline; the
        # previous readarray -d " " left that newline attached to
        # the status field, so the "Terminating" comparison below
        # could never match
        read -r pvcName pvcStatus <<< "$text"
        echo "$pvcName"
        if [ "$pvcStatus" = "Terminating" ]; then
            return 1
        else
            return 2
        fi
    fi

    return 0
}
|
|
|
|
# Check if the PVC is mounted to any pod in vault namespace
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Prints the name of the PVC resource
|
|
# Check if the PVC is mounted to any pod in vault namespace
#
# Returns the normal linux success=0, failure!=0
function testPVCMount {
    local mounted
    local containers=".items[*].spec.containers[*]"
    local volumes="volumeMounts[?(@.name=='manager-pvc')].name"

    # this kubectl query returns zero whether manager-pvc is
    # found or not; the output is either empty or 'manager-pvc'
    mounted="$( $KUBECTL get pods -n "$VAULT_NS" \
        -o jsonpath="{${containers}.${volumes}}" )"

    if [ -z "$mounted" ]; then
        return 1 # assertion 'fails'
    fi
    return 0
}
|
|
|
|
# This function prints a DEBUG log of kubectl delete
|
|
# Delete the mount-helper job, waiting for completion.
#
# The kubectl output is printed to the DEBUG log.
# Returns the kubectl delete exit status.
function deleteMountHelper {
    local out
    local rc

    log $DEBUG "Waiting for delete of mount-helper job"
    out="$( $KUBECTL delete --ignore-not-found=true --wait=true \
        -f /opt/yaml/pvc-attach.yaml 2>&1 )"
    rc=$?
    log $DEBUG "Output of deleting mount-helper: [$out]"

    return $rc
}
|
|
|
|
# Run shred on the file content of PVC
|
|
#
|
|
# All files are shredded, and the result is an error if
|
|
# - command return code is non-zero
|
|
# - file comparison shows unchanged file(s)
|
|
#
|
|
# A warning is issued if shred/kubectl command has any stdout or
|
|
# stderr
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Run shred on the file content of PVC
#
# $1 - name of the helper pod that has the PVC mounted at /mnt/data
#
# All files are shredded; the result is an error if
# - command return code is non-zero
# - file comparison shows unchanged file(s)
#
# A warning is issued if the shred/kubectl command has any
# stdout or stderr
#
# Returns the normal linux success=0, failure!=0
function securelyWipePVC {
    local helper="$1"
    # declared local: these previously leaked into global scope
    local text
    local result
    local difftext
    local unchanged

    if [ -z "$helper" ]; then
        log $ERROR "No pod specified for shredding"
        return 1
    fi

    # get profile of the files before shredding
    $KUBECTL exec -n "$VAULT_NS" "$helper" -- \
        bash -c 'find /mnt/data -type f \
                 | sort | xargs wc | head -n-1' \
        >/tmp/shred_before.txt 2>&1
    log $DEBUG "Original files: [$( cat /tmp/shred_before.txt )]"

    # run the shred command
    #
    # Shred all the files in mounted /mnt/data/
    #
    # The shred by default has three randomized passes, and with -z
    # option will finalize with zeros. -f prompts shred to work
    # around any unexpected file permissions
    text="$( $KUBECTL exec -n "$VAULT_NS" "$helper" -- \
        bash -c '\
            result=0; \
            while read fname; do \
                shred -f -z "$fname"; \
                [ $? -ne 0 ] && result=1; \
            done <<<"$(find /mnt/data -type f )"; \
            exit $result' 2>&1 )"
    result=$?

    # get profile of the files after shredding
    $KUBECTL exec -n "$VAULT_NS" "$helper" -- \
        bash -c 'find /mnt/data -type f \
                 | sort | xargs wc | head -n-1' \
        >/tmp/shred_after.txt 2>&1
    log $DEBUG "Shredded files: [$( cat /tmp/shred_after.txt )]"

    # compare the profiles for error reporting
    #
    # If the file lists, pushed through wc, have files with the same
    # character, word, and line counts then report an error: a file
    # has not been shred
    #
    # Ignore files that were empty
    difftext="$( diff -wuU100000 /tmp/shred_before.txt \
        /tmp/shred_after.txt )"
    unchanged="$( echo "$difftext" | grep "^ " \
        | grep -v "^\([ ]\{1,\}0\)\{3\} /" )"

    # Report the errors/success
    if [ "$result" -ne 0 ]; then
        log $ERROR "Error on shred: [$text]"
        if [ -n "$unchanged" ]; then
            log $ERROR "Unchanged: [$unchanged]"
        fi
        return 1
    fi
    if [ -n "$text" ]; then
        log $WARNING "Output of shred is not empty: [$text]"
    fi
    if [ -n "$unchanged" ]; then
        log $ERROR "Shred did not shred some files"
        log $ERROR "Unchanged: [$unchanged]"
        return 1
    fi

    log $INFO "Shredding of PVC data verified"

    return 0
}
|
|
|
|
# Delete the PVC resource
|
|
#
|
|
# The delete will succeed even if attached to a pod, such as a
|
|
# terminating vault-manager or mount-helper - the PVC remains
|
|
# in terminating status until the pod is also terminated.
|
|
# Delete the PVC resource
#
# The delete will succeed even if attached to a pod, such as a
# terminating vault-manager or mount-helper - the PVC remains
# in terminating status until the pod is also terminated.
function deletePVC {
    local out
    local pvc

    pvc="$( pvcRemoved )"
    if [ $? -eq 2 ] && [[ "$pvc" =~ ^manager-pvc ]]; then
        out="$( $KUBECTL delete persistentvolumeclaims \
            -n "$VAULT_NS" "$pvc" 2>&1 )"
        if [ $? -eq 0 ]; then
            log $INFO "$out"
        else
            log $ERROR "Error deleting PVC: [$out]"
        fi
    else
        log $WARNING "Request to delete PVC but PVC not found"
    fi
}
|
|
|
|
# Run a job/pod, to mount the PVC resource, and retrieve the secrets
|
|
# from PVC.
|
|
#
|
|
# See also the function mountHelper and the ConfigMap named:
|
|
# {{ .Values.vault.name }}-mount-helper
|
|
#
|
|
# This function does not support overwriting an existing
|
|
# cluster-key-* secret, but it does support validating those secrets
|
|
# if they exist
|
|
function convertPVC {
    local output
    local pod
    local count
    local text
    local PVCtext
    local result
    local waitPVCterm

    # refuse to run while another pod (vault-manager or a previous
    # mount-helper) still has the PVC mounted
    if testPVCMount; then
        log $ERROR "Cannot mount PVC already mounted"
        return 1
    fi

    # run the pod
    output="$( $KUBECTL apply -f /opt/yaml/pvc-attach.yaml 2>&1 )"
    if [ $? -ne 0 ]; then
        log $ERROR "Failed to apply mount-helper"
        log $DEBUG "Output: [$output]"
        deleteMountHelper
        return 1
    fi

    # wait for pod to reach Running, up to MAX_POD_RUN_TRIES seconds
    pod=''
    count=0
    log $INFO "Waiting for mount-helper pod to run"
    while [ -z "$pod" -a "$count" -le "$MAX_POD_RUN_TRIES" ]; do
        count=$((count+1))
        text="$( $KUBECTL get pods -n "$VAULT_NS" \
            | grep "mount-helper" )"
        pod="$( echo "$text" | grep "Running" | awk '{print $1}' )"
        if [ -z "$pod" ]; then
            sleep 1
        fi
    done

    if [ -z "$pod" ]; then
        log $ERROR "Failed to run mount-helper pod"
        log $DEBUG "Pod state: [$( echo $text )]"
        deleteMountHelper
        return 1
    fi

    # get the pvc data
    PVCtext="$( $KUBECTL exec -n "$VAULT_NS" "$pod" \
        -- cat /mnt/data/cluster_keys.json )"
    if [ $? -ne 0 -o -z "$PVCtext" ]; then
        log $ERROR "Failed to read cluster_keys.json"
        deleteMountHelper
        return 1
    fi
    log $INFO "Data retrieved from PVC"

    # if the Root secret is pre-existing, compare the existing
    # shard secrets and root secret before deleting the PVC
    $KUBECTL get secrets -n "$VAULT_NS" \
        cluster-key-root >/dev/null 2>&1
    if [ $? -eq 0 ]; then
        log $INFO "Cluster secrets exist:" \
            "validating"
    else
        # create a secret from the data
        echo "$PVCtext" | storeVaultInitSecrets cluster-key
    fi

    # verify the data stored versus text from PVC
    echo "$PVCtext" | validateSecrets cluster-key
    result=$?
    if [ "$result" -eq 0 ]; then
        securelyWipePVC "$pod"
        # omit deleting the PVC for manual analysis and shred
        # when the wipe fails
        if [ $? -eq 0 ]; then
            deletePVC
        fi
    fi

    # clean up but do not care about the result
    deleteMountHelper

    # Sleep before finishing conversion, so that pvc termination process has started
    waitPVCterm=5
    sleep $waitPVCterm

    # 0 only when the stored secrets validated against the PVC text
    return $result
}
|
|
|
|
# Convert the single cluster-key-bootstrap secret (provided by the
# application-update lifecycle code) into the per-shard cluster-key-*
# secrets, then validate and clean up, finishing with the PVC
# conversion path.
#
# Returns 1 when validation of the split secrets fails; otherwise
# returns the result of convertPVC.
function convertBootstrapSecrets {
    local text
    local count

    # split the bootstrap document into individual shard secrets
    text="$( get_secret cluster-key-bootstrap )"
    echo "$text" | storeVaultInitSecrets cluster-key

    # verify the split secrets versus the bootstrap text
    echo "$text" | validateSecrets cluster-key
    if [ $? -ne 0 ]; then
        # an error is already printed
        return 1
    fi

    deleteSecrets cluster-key-bootstrap

    # Also validate and delete the PVC resource
    # This procedure depends on waiting for the old version
    # of vault-manager pod to exit
    count="$TERMINATE_TRIES_MAX"
    log $INFO "Waiting for vault-manager pod to exit"
    while testPVCMount && [ "$count" -gt 0 ]; do
        sleep "$TERMINATE_TRIES_SLEEP"
        count=$((count-1))
    done

    if [ $count -eq 0 ]; then
        log $WARNING "Maximum time reached waiting" \
            "for the previous pod to be terminated."
    fi

    convertPVC
}
|
|
|
|
# When enabled, after conversion of storage from PVC to k8s secrets,
|
|
# Vault-manager will prompt itself to rekey the vault server
|
|
# storage.
|
|
# After conversion of storage from PVC to k8s secrets, prompt
# vault-manager to rekey the vault server storage by creating the
# cluster-rekey-request secret.
#
# No-op unless AUTO_REKEY_CONVERT is "true"; warns and returns if a
# request secret already exists.
function requestRekey {
    local token

    if [ "$AUTO_REKEY_CONVERT" != "true" ]; then
        return
    fi
    log $INFO "Auto rekey enabled: [$AUTO_REKEY_CONVERT]"

    if secretExists cluster-rekey-request >/dev/null; then
        token="$( get_secret cluster-rekey-request )"
        log $WARNING "Auto rekey: rekey request exists: $token"
        return
    fi

    # a fresh UUID uniquely identifies this rekey request
    token=$( uuidgen )
    if set_secret cluster-rekey-request /dev/stdin <<<"$token"; then
        log $INFO "Rekey requested: $token"
    else
        log $ERROR "Failed to request rekey: $token"
    fi
    return
}
|
|
|
|
# Decide which storage-conversion path applies, if any:
# - cluster secrets already exist: nothing to do
# - bootstrap secret exists (normal application update): convert it
# - a PVC exists (update outside application-update): convert it
# - otherwise: fresh install, nothing to convert
function runConversion {
    if [ -n "$K8S_SECRETS_PREEXIST" ]; then
        log $INFO "Cluster secrets exist"
        return
    fi

    if [ -n "$BOOTSTRAP_PREEXISTS" ]; then
        # this is the normal application update procedure; the
        # lifecycle code retrieved the secrets from previous version
        # of the application.
        log $INFO "Using secrets provided in $BOOTSTRAP_PREEXISTS"
        convertBootstrapSecrets
        requestRekey
        return
    fi

    if [ -z "$PVC_PREEXISTS" ]; then
        log $INFO "No pre-existing secrets exist"
        return
    fi

    # Finally, read the pre-existing PVC. This occurs if the
    # application updates outside of application-update. For
    # example if the old application is removed and deleted, and the
    # new application is uploaded and applied.
    convertPVC
    requestRekey
}
|
|
|
|
# Test whether the specified vault server(s) agree with the
|
|
# specified status of the specified endpoint
|
|
#
|
|
# Print DEBUG logs when status is non-conforming (the function will
|
|
# be used to wait for conformance).
|
|
#
|
|
# The first parameter is the vault API endpoint to check status
|
|
# of, either /sys/rekey/init or /sys/rekey/verify
|
|
# The second parameter is the quoted string of json data returned
|
|
# from vault REST API call. The data should include these fields,
|
|
# which are tested for conformance:
|
|
# {"nonce": "S", "started": B, "progress": N,
|
|
# "verification_required": B}
|
|
#
|
|
# The other parameters are the servers to test, specified as
|
|
# dash-separated IP address output of getVaultPods (XX-XX-XX-XX)
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Test whether the specified vault server(s) agree with the
# specified status of the specified endpoint
#
# Print DEBUG logs when status is non-conforming (the function will
# be used to wait for conformance).
#
# $1 - vault API endpoint, /sys/rekey/init or /sys/rekey/verify
# $2 - quoted json reference data; must include the fields tested
#      for conformance: {"nonce": "S", "started": B, "progress": N,
#      "verification_required": B}
# $@ - remaining: servers to test, as dash-separated IP address
#      output of getVaultPods (XX-XX-XX-XX)
#
# Returns 0 when all servers conform, 1 on non-conformance or bad
# reference data, 2 when a server could not be queried
function assertRekeyStatus {
    local endpoint="$1"
    local data="$2"
    shift 2
    local -a servers=($@)
    local -a key_arr
    local required
    local jscript
    local key
    local index
    local error
    local server
    local response
    local record

    # /sys/rekey/verify responses omit verification_required
    required="nonce progress started verification_required"
    jscript=".nonce, .progress, .started, .verification_required"
    if [ "$endpoint" == "/sys/rekey/verify" ]; then
        required="nonce progress started"
        jscript=".nonce, .progress, .started"
    fi

    # quick check to assure the data parameter is sane
    key_arr=($(echo "$data" | jq -r 'keys[]' | sort))
    for key in $required; do
        if [[ " ${key_arr[*]} " != *" $key "* ]]; then
            log $ERROR "assertRekeyStatus requires: [$required]," \
                "received: ${key_arr[*]}"
            return 1
        fi
    done

    # reduce the reference data to the compared field values
    required="$( echo "$data" | jq -r "$jscript" )"

    index=0
    error=0
    while [ "$index" -lt "${#servers[@]}" ]; do
        server="${servers[$index]}"
        index=$((index+1))
        server="${server}.$POD_TARGET_BASE"
        NO_HEADER=true \
        API_TMOUT=$API_REKEY_QUERY_TMOUT \
        vaultAPI response GET "$server" "$endpoint"
        if [ $? -ne 0 -o -z "$response" ]; then
            # failing the REST API call is not the same
            # as non-conformance
            # (an unreachable 'continue' that followed this
            # return has been removed)
            return 2
        fi

        record="$( echo "$response" | jq -r "$jscript" )"
        if [ "$record" != "$required" ]; then
            log $ERROR "$server does not conform to:" \
                "$( echo "$data" | jq -c '.' )"
            log $DEBUG "$server does not confirm: $response"
            error=1
            continue
        fi
        log $DEBUG "$server conforms: $response"
    done

    return $error
}
|
|
|
|
# Test whether the vault server(s) agree about rekey status
|
|
#
|
|
# The parameter is the quoted string of json data to pass to
|
|
# assertRekeyStatus
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Test whether the vault server(s) agree about rekey status
#
# $1 - quoted json reference passed to assertRekeyStatus
#
# Returns the normal linux success=0, failure!=0
function assertServerStatus {
    local reference="$1"
    local pod_list
    local pod_count

    pod_list="$( getVaultPods | awk '{print $2}' )"
    pod_count="$( echo $pod_list | wc -w )"
    if [ "$pod_count" -ne "$HA_REPLICAS" ]; then
        log $ERROR "server without IP does not conform"
        return 1
    fi

    assertRekeyStatus "/sys/rekey/init" "$reference" $pod_list
}
|
|
|
|
# Test whether the vault server(s) agree about rekey validation
|
|
# status. Warn when the active vault server changes
|
|
#
|
|
# The parameter is the quoted string of json data to pass to
|
|
# assertRekeyStatus
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Test whether the vault server(s) agree about rekey verification
# status.
#
# $1 - quoted json reference passed to assertRekeyStatus
#
# Returns the normal linux success=0, failure!=0
function assertVerifyStatus {
    local reference="$1"
    local response
    local pod_list
    local rc
    local pod_count

    # first assert the rekey status; /sys/rekey/verify returns
    # error if a server does not have rekey in progress
    NO_HEADER=true \
    API_TMOUT=$API_REKEY_QUERY_TMOUT \
    vaultAPI response GET $ACTIVE_TARGET /sys/rekey/init
    rc=$?
    if [ "$rc" -ne 0 ]; then
        return $rc
    fi

    assertServerStatus "$response"
    rc=$?
    if [ "$rc" -ne 0 ]; then
        return $rc
    fi

    pod_list="$( getVaultPods | awk '{print $2}' )"
    pod_count="$( echo $pod_list | wc -w )"
    if [ "$pod_count" -ne "$HA_REPLICAS" ]; then
        log $ERROR "server without IP does not conform"
        return 1
    fi

    assertRekeyStatus "/sys/rekey/verify" "$reference" $pod_list
}
|
|
|
|
# Assert that the /sys/rekey/init endpoint reports no
|
|
# rekey procedure in progress on any server
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Assert that the /sys/rekey/init endpoint reports no rekey
# procedure in progress on any server.
#
# Returns the normal linux success=0, failure!=0
function assertNoRekey {
    local idle_status

    # reference json describing the idle (no-rekey) state
    idle_status='{"nonce": "", "started": false, "progress": 0'
    idle_status="$idle_status"', "verification_required": false}'

    assertServerStatus "$idle_status"
}
|
|
|
|
# Retrieve the rekey status from active vault server
|
|
# and assert that all server conform to the status
|
|
#
|
|
# Returns the normal linux success=0, failure!=0
|
|
# Retrieve the rekey status from the active vault server and assert
# that all servers conform to it; when a verification nonce is
# present, also assert conformance to the verification status.
#
# Returns 0 on conformance, 1 on non-conformance, 2 when the active
# server cannot be queried.
function assertServersConform {
    local response
    local value
    local result
    local pods
    local count

    NO_HEADER=true \
    API_TMOUT=$API_REKEY_QUERY_TMOUT \
    vaultAPI response GET $ACTIVE_TARGET /sys/rekey/init
    if [ $? -ne 0 ]; then
        # cannot check conformance
        log $ERROR "Cannot check server conformance to" \
            "/sys/rekey/init"
        return 2
    fi

    assertServerStatus "$response"
    result="$?"
    if [ "$result" -ne 0 ]; then
        return $result
    fi

    # no verification nonce means there is nothing further to check
    value="$( echo "$response" | jq -r '.verification_nonce' )"
    if [ -z "$value" -o "$value" == "null" ]; then
        return 0
    fi

    NO_HEADER=true \
    API_TMOUT=$API_REKEY_QUERY_TMOUT \
    vaultAPI response GET $ACTIVE_TARGET /sys/rekey/verify
    if [ $? -ne 0 ]; then
        # cannot check conformance
        log $ERROR "Cannot check server conformance to" \
            "/sys/rekey/verify"
        return 2
    fi

    # every replica must have an IP before conformance can be judged
    pods="$( getVaultPods | awk '{print $2}' )"
    count="$( echo $pods | wc -w )"
    if [ "$count" -ne "$HA_REPLICAS" ]; then
        log $ERROR "server without IP does not conform"
        return 1
    fi
    assertRekeyStatus "/sys/rekey/verify" "$response" $pods
}
|
|
|
|
# This function is used during the pre-rekey assertions
|
|
# Testing if the main loop (via PODREC_F) indicates a server
|
|
# is not running.
|
|
# Pre-rekey assertion: succeed when the main loop's PODREC_F records
# show all HA_REPLICAS vault server pods running.
function allServersRunning {
    local recs
    local num

    recs="$( grep "^/$VAULT_FN" "$PODREC_F" )"
    num="$( awk -F/ '{print $2}' <<<"$recs" | wc -w )"

    # test result is the function result
    [ "$num" -eq "$HA_REPLICAS" ]
}
|
|
|
|
# This function is used during the pre-rekey assertions
|
|
# Testing if the main loop (via PODREC_F) indicates a server
|
|
# is sealed
|
|
# Pre-rekey assertion: succeed when the main loop's PODREC_F records
# show all HA_REPLICAS vault server pods unsealed (sealed == false).
function allServersUnsealed {
    local recs
    local num

    recs="$( grep "^/$VAULT_FN" "$PODREC_F" )"
    num="$( grep "/false/" <<<"$recs" \
        | awk -F/ '{print $2}' | wc -w )"

    # test result is the function result
    [ "$num" -eq "$HA_REPLICAS" ]
}
|
|
|
|
# This function is used during the pre-rekey assertions
|
|
# Testing if the main loop (via PODREC_F) indicates a server
|
|
# omits IP address
|
|
# Pre-rekey assertion: succeed when the main loop's PODREC_F records
# show an IP address field for all HA_REPLICAS vault server pods.
function allServersHaveIP {
    local recs
    local num

    recs="$( grep "^/$VAULT_FN" "$PODREC_F" )"
    num="$( echo "$recs" | awk -F/ '{print $3}' | wc -w )"

    # test result is the function result
    [ "$num" -eq "$HA_REPLICAS" ]
}
|
|
|
|
# Test the status of rekey procedure 'started' during pre-rekey
|
|
# tests for procedure progress selection (sharing a single vaultAPI
|
|
# call to GET /sys/rekey/init
|
|
#
|
|
# Return linux true (0) if the status of /sys/rekey/init includes
|
|
# started == true
|
|
#
|
|
# Optional argument --not inverts the logic, but maintains
|
|
# error response 2
|
|
# Test the 'started' flag of the cached GET /sys/rekey/init response
# held in REKEY_STATUS_JSON.
#
# $1 - optional '--not': inverts the 0/1 results, but keeps the
#      error response 2
#
# Returns 0 when started == true, 1 when started == false, 2 when
# the status is unclear.
function assertRekeyStarted {
    local invert="$1"
    local started

    # assert that a rekey is in progress
    started="$( echo "$REKEY_STATUS_JSON" | jq -r '.started' )"
    case "$started" in
    true)
        if [ "$invert" == "--not" ]; then
            return 1
        fi
        return 0
        ;;
    false)
        if [ "$invert" == "--not" ]; then
            return 0
        fi
        return 1
        ;;
    *)
        # the rekey status is unclear
        # an error is probably printed
        log $DEBUG "unclear response for /sys/rekey/init:" \
            "$( jq -c <<<"$REKEY_STATUS_JSON" )"
        return 2
        ;;
    esac
}
|
|
|
|
# Delete the shard secrets with specified prefix
|
|
#
|
|
# The secrets are deleting on a single kubectl command
|
|
# Delete the shard secrets with the specified prefix.
#
# $1 - secret name prefix
#
# All existing shards are deleted with a single kubectl command;
# returns the delete result, or 0 when nothing needed deleting.
function deleteShardSecrets {
    local prefix="$1"
    local idx
    local found=''

    for idx in $( seq 0 $((KEY_SECRET_SHARES-1)) ); do
        if [ -n "$( secretExists "${prefix}-$idx" )" ]; then
            found="$found ${prefix}-$idx"
        fi
    done

    if [ -z "$found" ]; then
        return 0
    fi
    deleteSecrets $found
}
|
|
|
|
# Make a copy of the shard secrets with specified prefix
|
|
#
|
|
# The calling function needs to verify the result
|
|
# Make a copy of the shard secrets with specified prefix.
#
# $1 - source secret prefix
# $2 - destination secret prefix
#
# Stops at the first failure. The calling function needs to verify
# the result.
function copyShardSecrets {
    local src="$1"
    local dst="$2"
    local idx

    for idx in $( seq 0 $((KEY_SECRET_SHARES-1))); do
        if ! get_secret "${src}-$idx" \
            | set_secret "${dst}-$idx" /dev/stdin; then
            # don't try anything else
            log $ERROR "Failed to copy ${src}-$idx to ${dst}-$idx"
            break
        fi
    done
}
|
|
|
|
# Just log the content of cluster-rekey-request again
|
|
#
|
|
# Keeps track of whether vault-manager has been restarted
|
|
# with REKEY_STARTED variable, so that the rekey procedure
|
|
# status is documented in log
|
|
# Log the content of cluster-rekey-request once after a restart.
#
# REKEY_STARTED tracks whether vault-manager has been restarted, so
# that the rekey procedure status is documented in the log exactly
# once per process.
function rekeyResuming {
    if [ "$REKEY_STARTED" -eq 0 ]; then
        return
    fi
    log $INFO "Resuming rekey:" \
        "$( get_secret cluster-rekey-request )"
    REKEY_STARTED=0
}
|
|
|
|
# Return linux true (0) if a rekey is requested and the vault
|
|
# server pods are in a stable condition
|
|
#
|
|
# If the vault servers are not "stable" then the rekey operation
|
|
# needs that stability first. vault-manager's main runStateMachine
|
|
# will monitor pods and restore unsealed status.
|
|
# Return linux true (0) if a rekey is requested and the vault
# server pods are in a stable condition (all running, unsealed,
# with IP addresses, and conforming to the active server's rekey
# status).
#
# If the vault servers are not "stable" then the rekey operation
# needs that stability first; the main runStateMachine monitors
# pods and restores unsealed status.
function needsRekey {
    local pods
    local pod # previously undeclared; leaked into global scope
    local sealed
    local response

    # the first milestone to be created is cluster-rekey-request;
    # the last milestone to be deleted is cluster-rekey-audit;
    # proceed if any exists
    secretsExistAny cluster-rekey-request \
        cluster-rekey-verified \
        cluster-rekey-shuffle \
        cluster-rekey-audit
    if [ $? -ne 0 ]; then
        # rekey is not requested
        return 1
    fi

    # progress the rekey procedure only if the servers are all
    # running
    if ! allServersRunning; then
        log $INFO "Rekey: wait for vault servers to equal" \
            "$HA_REPLICAS"
        return 1
    fi

    # progress the rekey procedure only if the servers were
    # previously unsealed.
    if ! allServersUnsealed; then
        log $INFO "Rekey: wait for unsealed vault servers to" \
            "equal $HA_REPLICAS"
        return 1
    fi

    # progress the rekey procedure only if the servers all have
    # DNS names (IP addresses) provided by k8s
    if ! allServersHaveIP; then
        log $INFO "Rekey: wait for $HA_REPLICAS vault servers" \
            "to have IP addresses"
        return 1
    fi

    # The above three tests are based on output of kubectl get pods
    # command. Doublecheck with REST API call to each server
    pods="$( getVaultPods | grep "^$VAULT_FN" | awk '{print $2}' )"
    for pod in $pods; do
        NO_HEADER=true \
        API_TMOUT=$QUERY_TMOUT \
        vaultAPI response GET ${pod}.$POD_TARGET_BASE /sys/health
        if [ $? -ne 0 ]; then
            log $ERROR "$pod fails health check during rekey"
            return 1
        fi
        sealed="$( echo "$response" | jq -r '.sealed' )"
        if [ "$sealed" != "false" ]; then
            log $ERROR "$pod is sealed during rekey"
            return 1
        fi
    done

    assertServersConform
    return $?
}
|
|
|
|
# Return linux true (0) if the current step of the rekey procedure
|
|
# is to send initialize request to /sys/rekey/init
|
|
#
|
|
# Initialize is the first step
|
|
#
|
|
# Will not begin initialization if there are stale cluster-rekey or
|
|
# cluster-key-bk secrets
|
|
# Return linux true (0) if the current step of the rekey procedure
# is to send the initialize request to /sys/rekey/init.
#
# Initialize is the first step. Will not begin initialization if
# there are stale cluster-rekey or cluster-key-bk secrets, or if a
# later-stage milestone secret already exists (recovery path).
#
# Returns 1 when initialization is not the current step, 2 on an
# unclear/error condition.
function needsInitialization {
    local progress
    local count
    # (removed an unused 'local error=0' declaration)

    # a rekey must NOT already be started; propagate 2 when the
    # cached status is unclear
    assertRekeyStarted --not
    progress=$?
    if [ "$progress" -ne 0 ]; then
        return "$progress"
    fi

    # skip if this represents a recovery path
    secretsExistAny cluster-rekey-verified \
        cluster-rekey-shuffle \
        cluster-rekey-audit
    if [ $? -eq 0 ]; then
        return 1
    fi

    # make assertions about the artifacts left behind by previous
    # rekey procedure attempts
    # assert that there are no stale keys before starting rekey
    assertShardSecrets cluster-rekey --nokeys
    count=$?
    if [ "$count" -ne 0 ]; then
        log $ERROR "Stale cluster-rekey secrets ($count) present"
        # there was a possibility that vault had cancelled the rekey
        # due to active server failure, so fall through to
        # rekeyRecovery
        return 1
    fi

    assertShardSecrets cluster-key-bk --nokeys
    count=$?
    if [ "$count" -ne 0 ]; then
        log $ERROR "cluster-key-bk secrets ($count) present"
        return 2
    fi

    return 0
}
|
|
|
|
# Start the rekey procedure
|
|
#
|
|
# Send initialize request to /sys/rekey/init
|
|
#
|
|
# Initialize is the first step
|
|
#
|
|
# Will not begin initialization if there are stale cluster-rekey or
|
|
# cluster-key-bk secrets
|
|
# Start the rekey procedure by POSTing an initialize request to
# /sys/rekey/init on the active vault server, requesting
# KEY_SECRET_SHARES shares with KEY_REQUIRED_THRESHOLD threshold
# and mandatory verification.
#
# Side effect: resets REKEY_STARTED to 0.
#
# Returns 0 when the rekey started and all servers conform to the
# new status; 1 on API failure, when vault reports the rekey did
# not start, or when verification_required was unexpectedly false
# (in which case the rekey is cancelled).
function rekeyInitialize {
    local shares
    local threshold
    local verify
    local data
    local response
    local value

    log $INFO "Initializing vault rekey"

    REKEY_STARTED=0

    # build the JSON request body for /sys/rekey/init
    shares='"secret_shares": '$KEY_SECRET_SHARES
    threshold='"secret_threshold": '$KEY_REQUIRED_THRESHOLD
    verify='"require_verification": true'
    data="{$shares,$threshold,$verify}"

    NO_HEADER=true \
    API_TMOUT=$API_REKEY_OP_TMOUT \
    vaultAPI response POST $ACTIVE_TARGET /sys/rekey/init "$data"
    if [ $? -ne 0 ]; then
        return 1
    fi

    value="$( echo "$response" | jq -r ".started" )"
    if [ 'false' == "$value" ]; then
        log $ERROR "Rekey not started"
        return 1
    fi

    # log the nonce
    value="$( echo "$response" | jq -r ".nonce" )"
    verify="$( echo "$response" | jq -r ".verification_required" )"
    log $INFO "Rekey started: $value" \
        "(verification_required==$verify)"

    # just a sanity check; abort (DELETE cancels the rekey) if vault
    # did not honor require_verification
    if [ 'true' != "$verify" ]; then
        log $ERROR "Rekey started without verification_required:" \
            "aborting"
        NO_HEADER=true \
        API_TMOUT=$API_REKEY_OP_TMOUT \
        vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
        return 1
    fi

    # confirm every server agrees with the new rekey status
    assertServerStatus "$response"
    return $?
}
|
|
|
|
# The rekey authentication should happen when
|
|
# - there is a rekey in progress
|
|
# - there is a verification_nonce
|
|
#
|
|
# Authentication of the rekey request is the second step
|
|
#
|
|
# Omit rekey verification if:
|
|
# - there are existing cluster-rekey secrets
|
|
# - Verification is complete: cluster-rekey-verified or any later
|
|
# stage is complete
|
|
#
|
|
# Return linux true (0) if the current stage of rekey
|
|
# is to complete the rekey verification
|
|
# Return linux true (0) if the current stage of rekey
|
|
# is to authenticate the rekey request
|
|
# Return linux true (0) if the current stage of the rekey procedure
# is to authenticate the rekey request (submit key shards).
#
# Conditions: a rekey is started, no verification_nonce has been
# issued yet, no cluster-rekey shard secrets exist, and no
# later-stage milestone secret exists (those indicate recovery
# paths). Returns 2 when the cached rekey status is unclear.
function needsAuthentication {
    local progress

    assertRekeyStarted
    progress=$?
    if [ "$progress" -ne 0 ]; then
        return "$progress"
    fi

    progress="$( echo "$REKEY_STATUS_JSON" \
        | jq -r '.verification_nonce' )"
    if ! [ -z "$progress" -o "$progress" == "null" ]; then
        # There is a rekey in progress with a verification nonce
        # pass through to recovery
        return 1
    fi

    # this represents a recovery path
    assertShardSecrets cluster-rekey --nokeys
    if [ $? -ne 0 ]; then
        # There are already cluster-rekey secrets
        return 1
    fi

    # skip if this represents a recovery path
    secretsExistAny cluster-rekey-verified \
        cluster-rekey-shuffle \
        cluster-rekey-audit
    if [ $? -eq 0 ]; then
        return 1
    fi

    # explicit success: the previous 'return $?' depended on the
    # subtle rule that a non-taken 'if' statement exits with 0
    return 0
}
|
|
|
|
# Submits a keyshard for the rekey procedure
|
|
# Returns 0 on success
|
|
# Returns 1 on failure
|
|
# Returns KEY_SECRET_SHARES when authentication completes
|
|
function rekeySubmitShard {
    # $1: nonce from /sys/rekey/init for this rekey session
    local nonce="$1"
    # $2: zero-based shard index; selects the k8s secret to read
    local index="$2"
    # $3: "--verify-auth" when auditing (do not store new shards)
    local verifyauth="$3"
    # $4: secret name prefix; defaults to cluster-key below
    local prefix="$4"
    local shard
    local dnonce
    local key
    local data
    local response
    local progress
    local root_token
    local new_doc

    if [ -z "$prefix" ]; then
        prefix=cluster-key
    fi

    # read shard $index from its k8s secret and build the JSON
    # request body: {"nonce": "...", "key": "..."}
    shard="$( get_secret "${prefix}-$index" | jq -r .keys[0] )"
    dnonce='"nonce": "'$nonce'"'
    key='"key": "'$shard'"'
    data="{$dnonce,$key}"

    # submit the shard to the active vault server
    NO_HEADER=true \
    API_TMOUT=$API_REKEY_OP_TMOUT \
    vaultAPI response POST $ACTIVE_TARGET /sys/rekey/update "$data"
    if [ $? -ne 0 ]; then
        return 1
    fi

    # Check the response for verification_nonce, which
    # indicates completion
    progress="$( echo "$response" | jq -r '.verification_nonce' )"
    if [ -n "$progress" -a "$progress" != 'null' ]; then
        log $INFO "Success authenticating:" \
            "$((index+1)) of $KEY_REQUIRED_THRESHOLD"

        if [ "$verifyauth" == "--verify-auth" ]; then
            # delete the rekey and return success
            NO_HEADER=true \
            API_TMOUT=$API_REKEY_OP_TMOUT \
            vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
            return "$KEY_SECRET_SHARES"
        fi

        # Procedure to ensure that the old and new shards are
        # secured in k8s secrets. Deletion of old shards will only
        # occur when verification is successful.
        root_token="$( get_secret cluster-key-root )"
        # new_doc mirrors vault's init response shape so that
        # validateSecrets can compare it against stored secrets
        new_doc="$( echo "$response" \
            | jq -c '{"keys": .keys,
                "keys_base64": .keys_base64,
                "root_token": "'"$root_token"'"}' )"
        # store the new shards
        echo "$response" \
            | jq -c '{"keys": .keys, "keys_base64": .keys_base64}' \
            | storeVaultInitSecrets cluster-rekey

        # check that the secrets match vault's rekey response
        echo "$new_doc" | validateSecrets cluster-rekey
        if [ $? -ne 0 ]; then
            # calling function will abort the rekey
            # and any cluster-rekey secrets
            log $ERROR "Failed to store and verify shards" \
                "after rekey authentication complete"
            return 1
        fi

        # authentication of the rekey request is completed
        # successfully
        log $INFO "Rekey authentication successful"
        return "$KEY_SECRET_SHARES"
    fi

    # Otherwise verify the response
    progress="$( echo "$response" | jq -r '.progress' )"
    index="$((index+1))"
    if [ "$progress" -ne "$index" ]; then
        log $ERROR "Authentication sequence mismatching" \
            "($progress, $index)"
        return 1
    fi

    # assert that the servers agree
    assertServerStatus "$response"
    if [ $? -ne 0 ]; then
        log $ERROR "Vault server rekey status fails during" \
            "authentication at $index of $KEY_REQUIRED_THRESHOLD"
        return 1
    fi

    log $INFO "Success authenticating:" \
        "$index of $KEY_REQUIRED_THRESHOLD"
    return 0
}
|
|
|
|
# Return linux true (0) if the current step of the rekey procedure
|
|
# is to authenticate the request
|
|
#
|
|
# Authentication of the rekey request is the second step
|
|
#
|
|
function rekeyAuthenticate {
    # $1: "--verify-auth" when called from rekeyAudit (audit mode)
    local verifyauth="$1"
    # $2: shard secret prefix forwarded to rekeySubmitShard
    local prefix="$2"
    local response
    local index
    local value
    local nonce
    local progress
    local result

    # read the current rekey state from the active server
    NO_HEADER=true \
    API_TMOUT=$API_REKEY_QUERY_TMOUT \
    vaultAPI response GET $ACTIVE_TARGET /sys/rekey/init
    if [ $? -ne 0 ]; then
        # an error is already printed
        return 1
    fi

    # the rekey must be started before shards can be submitted
    value="$( echo "$response" | jq -r '.started' )"
    if [ 'true' != "$value" ]; then
        log $ERROR "Rekey authentication, but rekey not in progress"
        return 1
    fi

    # sanity check progress: a non-negative integer below the
    # total number of shares
    nonce="$( echo "$response" | jq -r '.nonce' )"
    progress="$( echo "$response" | jq -r '.progress' )"
    if ! [[ "$progress" =~ ^[0-9]{1,}$ ]]; then
        log $ERROR "Rekey authentication progress not integer:" \
            "$response"
        return 1
    elif [ "$progress" -ge "$KEY_SECRET_SHARES" ]; then
        log $ERROR "Rekey authentication progress out of range:" \
            "$response"
        return 1
    fi

    if [ "$progress" -ne 0 ]; then
        log $WARNING "Continue authenticating rekey at: $progress"
    fi

    # authenticate and store the new keys
    for index in $( seq $progress $((KEY_SECRET_SHARES-1)) ); do
        rekeySubmitShard "$nonce" "$index" $verifyauth $prefix
        result="$?"
        # rekeySubmitShard returns KEY_SECRET_SHARES when vault
        # reports authentication complete
        if [ "$result" -eq "$KEY_SECRET_SHARES" ]; then
            # start the verify procedure now
            if [ "$verifyauth" != "--verify-auth" ]; then
                log $INFO "Starting rekey verify"
            fi
            break
        elif [ "$result" -ne 0 ]; then
            return $result
        fi
    done
    return 0
}
|
|
|
|
# The rekey verification should happen when
|
|
# - there is a rekey in progress
|
|
# - there is a verification_nonce
|
|
#
|
|
# Omit rekey verification if:
|
|
# - there are existing cluster-rekey secrets
|
|
# - Verification is complete: cluster-rekey-verified or any later
|
|
# stage is complete
|
|
#
|
|
# Return linux true (0) if the current stage of rekey
|
|
# is to complete the rekey verification
|
|
# Decide whether the next rekey step is shard verification
# (/sys/rekey/verify).
#
# Returns 0 when verification is the current step.
# Returns 1 to skip (no verification nonce, missing cluster-rekey
# secrets, or a later milestone already recorded).
# Returns 2 on a malformed verification nonce.
function needsVerify {
    local rc
    local nonce

    # a rekey must be in progress on the vault servers
    assertRekeyStarted
    rc=$?
    [ "$rc" -eq 0 ] || return "$rc"

    # without a verification nonce, authentication (not verify)
    # is the pending step
    nonce="$( jq -r '.verification_nonce' <<<"$REKEY_STATUS_JSON" )"
    if [ -z "$nonce" ] || [ "$nonce" == "null" ]; then
        return 1
    fi

    # Assert that the nonce is UUID-ish
    if ! [[ "$nonce" =~ ^[a-f0-9-]{36}$ ]]; then
        log $ERROR "The verification_nonce is not UUID-ish:" \
            "$REKEY_STATUS_JSON"
        return 2
    fi

    # verification replays the stored cluster-rekey shards
    if ! assertShardSecrets cluster-rekey; then
        # this should not happen: verify in progress but no
        # cluster-rekey secrets
        log $ERROR "rekey verify in progress but no cluster-rekey"
        return 1
    fi

    # skip if this represents a recovery path
    if secretsExistAny cluster-rekey-verified \
            cluster-rekey-shuffle \
            cluster-rekey-audit; then
        return 1
    fi

    return 0
}
|
|
|
|
# Submits a keyshard for the rekey verification procedure
|
|
# Returns 0 on success
|
|
# Returns 1 on failure
|
|
# Returns KEY_REQUIRED_THRESHOLD when authentication completes
|
|
function rekeyVerifySubmitShard {
    # $1: verification nonce for this verify session
    local nonce="$1"
    # $2: zero-based shard index into the cluster-rekey secrets
    local index="$2"
    local shard
    local dnonce
    local key
    local data
    local response
    local progress

    # read the new shard from its k8s secret and build the JSON
    # request body: {"nonce": "...", "key": "..."}
    shard="$( get_secret cluster-rekey-$index \
        | jq -r .keys[0] )"
    dnonce='"nonce": "'$nonce'"'
    key='"key": "'$shard'"'
    data="{$dnonce,$key}"

    NO_HEADER=true \
    API_TMOUT=$API_REKEY_OP_TMOUT \
    vaultAPI response POST $ACTIVE_TARGET \
        /sys/rekey/verify "$data"
    if [ $? -ne 0 ]; then
        # an error is printed
        return 1
    fi

    # .complete == true means the new shards are now in effect;
    # record the cluster-rekey-verified milestone
    progress="$( echo "$response" | jq -r ".complete" )"
    if [ "$progress" == 'true' ]; then
        log $INFO "Success verifying: using new shards"
        set_secret cluster-rekey-verified /dev/stdin \
            <<<"$( get_secret cluster-rekey-request )"
        return $KEY_REQUIRED_THRESHOLD
    fi

    # not complete: the response must report numeric progress
    progress="$( echo "$response" | jq -r ".progress" )"
    if [ -z "$progress" -o "$progress" == "null" ]; then
        log $ERROR "Expecting rekey verify progress" \
            "[$((index+1))] instead of [$progress]"
        return 1
    fi

    # Print the progress of rekey verify.
    if [ "$((index+1))" -eq "$progress" ]; then
        log $INFO "Success verifying:" \
            "$progress of $KEY_REQUIRED_THRESHOLD"
    elif [ "$((index+1))" -gt "$progress" ]; then
        # A sanity check only
        log $WARNING "Verify progress [$progress] less" \
            "than expected [$((index+1))]"
    else
        # A sanity check only
        log $WARNING "Verify progress [$progress]" \
            "greater than expected [$((index+1))]"
    fi

    # confirm all vault servers agree on the verify status;
    # on success the function falls through with status 0
    assertVerifyStatus "$response"
    if [ $? -ne 0 ]; then
        log $ERROR "Vault server verify status fails during" \
            "authentication at" \
            "$index of $KEY_REQUIRED_THRESHOLD"
        return 1
    fi
}
|
|
|
|
# Return linux true (0) if the current step of the rekey procedure
|
|
# is to verify shard secrets
|
|
#
|
|
# This step confirms that vault manager has correctly stored the
|
|
# shards received from the vault server. This allows failures of
|
|
# the procedure to be recovered:
|
|
# - receive the shards from vault
|
|
# - store the shards in k8s secrets
|
|
# - play the shards back to vault
|
|
# - upon successful verification the new shards are effective
|
|
#
|
|
# Verification of the rekey request is the Third step
|
|
#
|
|
# Drive the verify step of the rekey procedure: replay the stored
# cluster-rekey shards through /sys/rekey/verify until vault
# reports completion.
#
# Returns 0 when verification completes, 1 on any failure.
#
# Fixes relative to the original:
# - 'result' is now declared local (it was assigned without a
#   declaration, leaking a global variable; the sibling
#   rekeyAuthenticate declares it local)
# - the two progress sanity-check log messages said "Rekey
#   authentication", copy-pasted from rekeyAuthenticate; they now
#   say "Rekey verify"
# - removed unused locals shard/dnonce/key/data (only used by
#   rekeyVerifySubmitShard)
function rekeyVerify {
    local value
    local nonce
    local progress
    local response
    local index
    local result

    # read the current verify state from the active server
    NO_HEADER=true \
    API_TMOUT=$API_REKEY_QUERY_TMOUT \
    vaultAPI response GET $ACTIVE_TARGET /sys/rekey/verify
    if [ $? -ne 0 ]; then
        # an error is already printed
        return 1
    fi

    value="$( echo "$response" | jq -r '.started' )"
    if [ 'true' != "$value" ]; then
        log $ERROR "Rekey verify, but rekey not in progress"
        return 1
    fi

    # sanity check progress: a non-negative integer below the
    # total number of shares
    nonce="$( echo "$response" | jq -r '.nonce' )"
    progress="$( echo "$response" | jq -r '.progress' )"
    if ! [[ "$progress" =~ ^[0-9]{1,}$ ]]; then
        log $ERROR "Rekey verify progress not integer:" \
            "$response"
        return 1
    elif [ "$progress" -ge "$KEY_SECRET_SHARES" ]; then
        log $ERROR "Rekey verify progress out of range:" \
            "$response"
        return 1
    fi
    if [ "$progress" -ne 0 ]; then
        log $WARNING "Continue verifying rekey at: $progress"
    fi

    # assert that the servers agree on verify status
    assertVerifyStatus "$response"
    if [ $? -ne 0 ]; then
        return 1
    fi

    # authenticate the verify procedure
    for index in $( seq $progress $((KEY_SECRET_SHARES-1)) ); do
        rekeyVerifySubmitShard "$nonce" "$index"
        result=$?
        if [ "$result" -eq "$KEY_REQUIRED_THRESHOLD" ]; then
            # rekeyVerifySubmitShard returns KEY_REQUIRED_THRESHOLD
            # when .complete == true was received
            return 0
        elif [ "$result" -ne 0 ]; then
            # any other non-zero result is a failure
            return 1
        fi
    done

    log $ERROR "Verify procedure ended without completion"
    return 1
}
|
|
|
|
# The shuffling of keys shards in k8s secrets should happen when
|
|
# th cluster-rekey-verified procedure step is completed.
|
|
#
|
|
# Omit shuffling if:
|
|
# - vault server reports rekey in progress (unclear status)
|
|
# - shuffling is already complete: cluster-rekey-shuffle or later
|
|
# stage is complete
|
|
# - there are no cluster-rekey secrets
|
|
# - there are cluster-key-bk secrets
|
|
#
|
|
# Return linux true (0) if the current stage of rekey
|
|
# is to complete the swapping of validated shards
|
|
# Decide whether the next rekey step is to shuffle the shard
# secrets (cluster-rekey -> cluster-key -> cluster-key-bk).
#
# Returns 0 when the cluster-rekey-verified milestone exists, no
# later milestone is recorded, no rekey is in progress on the
# servers, and a complete set of cluster-rekey shards is stored.
# Returns non-zero otherwise.
function needsShuffle {
    local rc

    # a rekey must NOT be in progress on the servers
    assertRekeyStarted --not
    rc=$?
    # 1 - maintain the status of rekey in progress
    # 2 - api error, try again later
    [ "$rc" -eq 0 ] || return "$rc"

    # the verify milestone must be recorded first
    secretExists cluster-rekey-verified >/dev/null || return 1

    # skip if this represents a recovery path
    if secretsExistAny cluster-rekey-shuffle \
            cluster-rekey-audit; then
        return 1
    fi

    assertShardSecrets cluster-rekey
    rc=$?
    if [ "$rc" -eq 0 ]; then
        # no rekey in progress and a full set of cluster-rekey
        # shards recorded: shuffling is the next step
        :
    elif [ "$rc" -eq "$KEY_SECRET_SHARES" ]; then
        # no cluster-rekey shards recorded at all
        return 1
    else
        # with cluster-rekey-verified, an incomplete set of
        # cluster-rekey indicates partial deletion after copying
        # to cluster-key; audit the cluster-key secrets before
        # deleting cluster-rekey
        log $WARNING "The number key shard secrets for" \
            "cluster-rekey is not complete"
        return 1
    fi

    # rekeyShuffleKeys is re-entrant with respect to the presence
    # of cluster-key and cluster-key-bk; cluster-rekey is only
    # deleted once confirmed copied to cluster-key
    return 0
}
|
|
|
|
# This procedure shuffles the shard secrets from cluster-rekey to
|
|
# cluster-key to cluster-bk
|
|
#
|
|
# The function intends to be resolve failures of the vault manager
|
|
# process where it is interrupted abruptly such as with kill -9.
|
|
# In combination with needsShuffle it can be re-run until it
|
|
# completes the shuffle:
|
|
# - cluster-key shards are copied to cluster-key-bk
|
|
# - cluster-key shards are delete
|
|
# - cluster-rekey is copied to cluster-key
|
|
# - cluster-rekey is delete
|
|
#
|
|
# A subsequent step audits the new keys before deleting the
|
|
# cluster-key-bk secrets
|
|
# Rotate the shard secrets after a successful verify:
#   cluster-key  -> cluster-key-bk   (backup the old shards)
#   cluster-rekey -> cluster-key     (promote the new shards)
# then record the cluster-rekey-shuffle milestone.
#
# Designed (with needsShuffle) to be re-run until it completes, so
# an abrupt kill of vault-manager at any point is recoverable.
#
# Fixes relative to the original:
# - log level typo $ERRROR -> $ERROR (the typo expanded empty,
#   corrupting the log call)
# - [ -n "key_doc" ] tested a literal string and was always true;
#   it now tests "$key_doc" as the surrounding logic intends
function rekeyShuffleKeys {
    local key_exists
    local rekey_exists
    local bk_exists
    local key_doc=""
    local rekey_doc=""

    # snapshot which shard secret sets currently exist
    assertShardSecrets cluster-key
    key_exists=$?
    assertShardSecrets cluster-rekey
    rekey_exists=$?
    assertShardSecrets cluster-key-bk
    bk_exists=$?

    if [ "$key_exists" -eq 0 ]; then
        key_doc="$( reconstructInitResponse cluster-key )"
        echo "$key_doc" | validateSecrets cluster-key
        if [ $? -ne 0 ]; then
            log $ERROR "Failed to read cluster-key"
            return 1
        fi
    fi

    if [ "$rekey_exists" -eq 0 ]; then
        rekey_doc="$( reconstructInitResponse cluster-rekey )"
        echo "$rekey_doc" | validateSecrets cluster-rekey
        if [ $? -ne 0 ]; then
            log $ERROR "Failed to read cluster-rekey"
            return 1
        fi
    else
        # this is recovery path
        if [ -n "$key_doc" ]; then
            # cluster-rekey already consumed: record the milestone
            log $WARNING "Progress cluster-rekey-shuffle without" \
                "cluster-rekey"
            set_secret cluster-rekey-shuffle /dev/stdin \
                <<<"$( get_secret cluster-rekey-request )"
            return
        fi
        log $ERROR "No cluster-key or cluster-rekey"
        return 1
    fi

    if [ "$bk_exists" -lt "$KEY_SECRET_SHARES" \
            -a "$bk_exists" -ne 0 ]; then
        # this is a recovery path
        # an incomplete copy of cluster-key secrets
        if [ -n "$key_doc" ]; then
            deleteShardSecrets cluster-key-bk
            assertShardSecrets cluster-key-bk
            bk_exists=$?
            if [ "$bk_exists" -lt "$KEY_SECRET_SHARES" ]; then
                log $ERROR "Failed to delete incomplete" \
                    "cluster-key-bk"
                return 1
            fi
        else
            # this shouldn't happen;
            # either not both failures is anticipated
            log $ERROR "Sanity: incomplete both cluster-key-bk" \
                "and missing/incomplete cluster-key secrets"
            return 1
        fi
    fi
    if [ "$bk_exists" -eq 0 ]; then
        # this is a recovery path
        if [ -n "$key_doc" ]; then
            # Assert that cluster-key and cluster-key-bk are the
            # same
            log $INFO "Recovering from pre-existing cluster-key-bk"
            echo "$key_doc" | validateSecrets cluster-key-bk
            if [ $? -eq 0 ]; then
                # cluster-key-bk == cluster-key
                deleteShardSecrets cluster-key
                assertShardSecrets cluster-key
                key_exists=$?
                key_doc=""
            else
                echo "$key_doc" | validateSecrets cluster-rekey
                if [ $? -eq 0 ]; then
                    # Recovering cluster-key == cluster-rekey
                    log $INFO "Recovering with cluster-key"
                    deleteShardSecrets cluster-rekey
                    set_secret cluster-rekey-shuffle /dev/stdin \
                        <<<"$( get_secret cluster-rekey-request )"
                    return 0
                else
                    log $ERROR "Three different sets of keys" \
                        "in k8s secrets"
                    return 1
                fi
            fi
        fi
        # else: there is no cluster-key to backup
    else
        # this is the normal procedure path
        log $INFO "Copying cluster-key secrets to cluster-key-bk"
        copyShardSecrets cluster-key cluster-key-bk
        echo "$key_doc" | validateSecrets cluster-key-bk
        if [ $? -ne 0 ]; then
            log $ERROR "Failed to copy cluster-key to cluster-key-bk"
            deleteShardSecrets cluster-key-bk
            return 1
        fi
        deleteShardSecrets cluster-key
        if [ $? -ne 0 ]; then
            log $ERROR "Failed to delete cluster-key secrets"
            return 1
        fi
        assertShardSecrets cluster-key
        key_exists=$?
        key_doc=""
    fi

    # cluster-key-bk exists here
    # cluster-rekey rekey_doc is valid here

    # if cluster-key exists, such as number of secrets less than
    # KEY_SECRET_SHARES, then delete them; deleteShardSecrets is a
    # no-op if there are none there
    deleteShardSecrets cluster-key
    if [ $? -ne 0 ]; then
        log $ERROR "Failed to delete cluster-key"
        return 1
        # try again later
    fi

    log $INFO "Copying cluster-rekey secrets to cluster-key"
    copyShardSecrets cluster-rekey cluster-key
    echo "$rekey_doc" | validateSecrets cluster-key
    if [ $? -ne 0 ]; then
        log $ERROR "Failed to copy cluster-rekey to cluster-key"
        return 1
    fi

    deleteShardSecrets cluster-rekey
    set_secret cluster-rekey-shuffle /dev/stdin \
        <<<"$( get_secret cluster-rekey-request )"

    return 0
}
|
|
|
|
# The audit of cluster-key should happen when these other procedure
|
|
# steps are completed:
|
|
# - cluster-rekey-verified
|
|
# - cluster-rekey-shuffle
|
|
#
|
|
# Omit audit if:
|
|
# - vault server reports rekey in progress (failed previous audit?)
|
|
# - audit is already complete: cluster-rekey-audit exists
|
|
#
|
|
# Return linux true (0) if the current stage of rekey
|
|
# is to run the audit
|
|
# Decide whether the next rekey step is to audit the promoted
# cluster-key shards against the active vault server.
#
# Returns 0 when the audit should run (request/verified/shuffle
# milestones all present, cluster-key shards stored, no rekey in
# progress).
# Returns 3 when cluster-rekey-audit already exists, meaning the
# audit is done but finalizeRekey did not finish.
# Returns 1 (or an assertRekeyStarted error code) otherwise.
function needsAudit {
    local rc

    # assert that a rekey is not in progress
    assertRekeyStarted --not
    rc=$?
    [ "$rc" -eq 0 ] || return "$rc"

    # Select recovery path with response '3'
    if secretExists cluster-rekey-audit >/dev/null; then
        # this path indicates a failure to complete
        # finalizeRekey. cluster-rekey-audit is the last
        # milestone to be deleted
        log $INFO "rekey audit already completed"
        return 3
    fi

    # all earlier milestones must be recorded before auditing
    secretExists cluster-rekey-request >/dev/null || return 1
    secretExists cluster-rekey-verified >/dev/null || return 1
    secretExists cluster-rekey-shuffle >/dev/null || return 1

    # the shuffled shards must actually be present
    if ! assertShardSecrets cluster-key; then
        log $ERROR "rekey audit requested but cluster-keys absent"
        return 1
    fi

    return 0
}
|
|
|
|
# Audit that the active vault server authenticates with the cluster
|
|
# keys specified by prefix
|
|
#
|
|
# Returns 0 on success
|
|
# Returns 1 if the audit failes
|
|
# Returns 2 if there was a failure unrelated to authentication
|
|
# Audit that the active vault server authenticates with the cluster
# keys specified by prefix (default: cluster-key). Starts a throwaway
# rekey, replays the stored shards in --verify-auth mode, and cancels
# the rekey afterwards.
#
# Returns 0 on success
# Returns 1 if the audit fails
# Returns 2 if there was a failure unrelated to authentication
#
# Fix relative to the original: 'result' is now declared local (it
# was assigned without a declaration, leaking a global variable).
function rekeyAudit {
    local prefix="$1"
    local value
    local response
    local result

    if [ -z "$prefix" ]; then
        prefix="cluster-key"
    fi

    log $INFO "Auditing the shards in $prefix secrets"
    # the audit drives its own rekey; none may already be running
    assertNoRekey
    if [ $? -ne 0 ]; then
        log $ERROR "Cannot audit with rekey in progress"
        return 2
    fi

    assertShardSecrets "$prefix"
    if [ $? -ne 0 ]; then
        log $ERROR "Audit fails with absent $prefix secrets"
        return 1
    fi

    # start a rekey purely so the shards can be authenticated;
    # it is cancelled (DELETE) on every exit path below
    rekeyInitialize
    if [ $? -ne 0 ]; then
        NO_HEADER=true \
        API_TMOUT=$API_REKEY_OP_TMOUT \
        vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
        return 2
    fi

    NO_HEADER=true \
    API_TMOUT=$API_REKEY_QUERY_TMOUT \
    vaultAPI response GET $ACTIVE_TARGET /sys/rekey/init
    if [ $? -ne 0 ]; then
        # There's no reason to believe this one will succeed where
        # the other hadn't
        NO_HEADER=true \
        API_TMOUT=$API_REKEY_OP_TMOUT \
        vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
        return 2
    fi

    value="$( echo "$response" | jq -r ".verification_required" )"
    if [ "$value" != "true" ]; then
        log $ERROR "Audit sanity: verification_required not set:" \
            "$response"
        NO_HEADER=true \
        API_TMOUT=$API_REKEY_OP_TMOUT \
        vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
        return 1
    fi

    # --verify-auth: submit shards then cancel instead of storing
    rekeyAuthenticate --verify-auth "$prefix"
    result="$?"
    if [ "$result" -eq 0 ]; then
        log $INFO "Audit of cluster-key secrets passes"
    else
        NO_HEADER=true \
        API_TMOUT=$API_REKEY_OP_TMOUT \
        vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
    fi

    return $result
}
|
|
|
|
# clean up the artifacts from rekey procedure
|
|
# The audit procedure proves the shards in cluster-key
|
|
# secrets will unseal the vault.
|
|
#
|
|
# If vault-manager is killed during this procedure step it should
|
|
# continue to try to delete the artifacts until finally deleting
|
|
# cluster-rekey-audit
|
|
# Remove the artifacts of a completed rekey procedure. The audit
# step has already proven that the shards in cluster-key secrets
# unseal the vault.
#
# Safe to re-run after an interruption: it keeps deleting artifacts
# until cluster-rekey-audit — deleted last — is finally gone.
function finalizeRekey {
    local audit_text
    local artifact

    audit_text="$( get_secret cluster-rekey-audit )"

    log $INFO "removing artifacts of the rekey procedure:" \
        "$audit_text"

    # stray cluster-rekey shards should not survive the audit
    if ! assertShardSecrets cluster-rekey --nokeys; then
        log $WARNING "removing cluster-rekey secrets" \
            "after audit"
        deleteShardSecrets cluster-rekey
    fi

    # drop the backup shards, then each milestone secret;
    # cluster-rekey-audit is intentionally deleted last
    deleteShardSecrets cluster-key-bk
    for artifact in cluster-rekey-verified \
                    cluster-rekey-shuffle \
                    cluster-rekey-request \
                    cluster-rekey-audit; do
        deleteSecrets "$artifact"
    done

    log $INFO "Rekey request complete: $audit_text"
}
|
|
|
|
# This procedure handle a few cases where the vault active server or
|
|
# vault-manager were killed.
|
|
#
|
|
# - rekey authentication completed by vault-manager was killed
|
|
# before the shards could be stored
|
|
# - rekey verification may be cancelled by the failure of the active
|
|
# vault server
|
|
#
|
|
# Recover the rekey procedure after the active vault server or
# vault-manager was killed mid-step:
# - rekey authentication completed but vault-manager was killed
#   before the shards could be stored
# - rekey verification may be cancelled by the failure of the
#   active vault server
#
# Fixes relative to the original:
# - the cluster-rekey-verified milestone was written with
#   'set_secret ... /dev/null', which discards the payload; every
#   sibling call uses /dev/stdin, and /dev/stdin is used here now
# - the magic number 5 is replaced with "$KEY_SECRET_SHARES":
#   assertShardSecrets returns KEY_SECRET_SHARES when no shard
#   secrets exist (see needsShuffle), which is what this branch
#   tests for
function rekeyRecovery {
    local key_exists
    local rekey_exists
    local bk_exists
    local verified_exists
    local shuffle_exists
    local audit_exists
    local inprogress
    local verifyprogress

    log $INFO "Recovering the rekey procedure"

    # assert that the vault server are all up and agree
    # about the rekey status
    allServersRunning \
        && allServersHaveIP \
        && allServersUnsealed \
        || return 1

    NO_HEADER=true \
    API_TMOUT=$API_REKEY_QUERY_TMOUT \
    vaultAPI REKEY_STATUS_JSON GET $ACTIVE_TARGET /sys/rekey/init
    if [ $? -ne 0 ]; then
        # an error is printed
        # wait for recovery
        REKEY_STATUS_JSON=''
        return 1
    fi
    assertServerStatus "$REKEY_STATUS_JSON"
    if [ $? -ne 0 ]; then
        # wait for the vault servers to sync
        return 1
    fi

    inprogress="$( echo "$REKEY_STATUS_JSON" | jq -r '.started' )"
    verifyprogress="$( echo "$REKEY_STATUS_JSON" \
        | jq -r '.verification_nonce' )"
    if [ "$inprogress" == "true" ]; then
        # If a rekey is in progress, then cancel it
        # - an authentication will reinitialize
        # - a verification will reinitialize
        # - a rekeyAudit will retry
        log $INFO "Cancelling rekey in progress"
        NO_HEADER=true \
        API_TMOUT=$API_REKEY_OP_TMOUT \
        vaultAPI response DELETE $ACTIVE_TARGET /sys/rekey/init
        if [ $? -ne 0 ]; then
            # retry later
            return 1
        fi
    fi

    # snapshot shard sets and milestones to discern failure point
    assertShardSecrets cluster-key
    key_exists=$?
    assertShardSecrets cluster-rekey
    rekey_exists=$?
    assertShardSecrets cluster-key-bk
    bk_exists=$?

    secretExists cluster-rekey-verified >/dev/null
    verified_exists=$?
    secretExists cluster-rekey-shuffle >/dev/null
    shuffle_exists=$?
    secretExists cluster-rekey-audit >/dev/null
    audit_exists=$?

    # review each of the milestones to discern the failure point
    if [ "$audit_exists" -eq 0 ]; then
        true
        # no recovery options here
        # pass through
    elif [ "$shuffle_exists" -eq 0 ]; then
        true
        # no recovery options here
        # pass through
    elif [ "$verified_exists" -eq 0 ]; then
        if [ "$rekey_exists" -gt 0 ]; then
            if [ "$rekey_exists" -lt "$KEY_SECRET_SHARES" ]; then
                # with verified_exists, indicates partial deletion
                # of the cluster-rekey secrets after copying to
                # cluster-key. Audit the cluster-key secrets before
                # deleting rekey
                rekeyAudit cluster-key
                if [ $? -ne 0 ]; then
                    log $ERROR "Audit cluster-key fails with a" \
                        "partial set of cluster-rekey"
                    return 1
                fi

                deleteShardSecrets cluster-rekey
            fi

            # Handle condition where secrets were shuffled but
            # vault-manager failed before recording the
            # milestone cluster-rekey-shuffle

            # auditRekey will double-check that cluster-key is
            # in use
            set_secret cluster-rekey-shuffle /dev/stdin \
                <<<"$( get_secret cluster-rekey-request )"
            log $INFO "Continuing rekey procedure with audit" \
                "of cluster-key"
            return 0
        fi
        # else: pass through
    else
        if [ "$rekey_exists" -eq 0 ]; then
            # Handle condition where an active server fails during
            # verification: vault may have cancelled the rekey
            # procedure

            # This question is: which shards are the vault servers
            # using?
            log $INFO "Recovering from mismatch of cluster-rekey" \
                "and verified status"

            # Audit the existing shards to see which ones the
            # vault servers are keyed for.
            # Most likely that the verification failed due to
            # active server failing, start with cluster-key
            rekeyAudit cluster-key
            if [ $? -eq 0 ]; then
                # The rekey verification did not complete
                # remove cluster-rekey secrets
                # The rekey procedure should restart
                deleteShardSecrets cluster-rekey
                log $INFO "Restart rekey procedure"
                return 0
            fi

            # this happens when vault-manager process is killed
            rekeyAudit cluster-rekey
            if [ $? -eq 0 ]; then
                # was /dev/null, which stored an empty milestone
                set_secret cluster-rekey-verified /dev/stdin \
                    <<<$( get_secret cluster-rekey-request )
                log $INFO "Continue rekey procedure with cluster-rekey"
                return 0
            fi
            # else: pass through
        elif [ "$rekey_exists" -eq "$KEY_SECRET_SHARES" ]; then
            # There are no cluster-rekey secrets; and the rekey is
            # cancelled: the rekey procedure will restart
            log $INFO "Continue rekey procedure with initialization"
            return 0
        else # cluster-rekey secrets are incomplete
            # Handle condition where verification is needed but
            # vault-manager did not store shards. The rekey was
            # canceled above

            # assert cluster-key before deleting rekey
            rekeyAudit cluster-key
            if [ $? -eq 0 ]; then
                # the rekey procedure will restart
                log $INFO "Deleting partial set of" \
                    "cluster-rekey secrets"
                deleteShardSecrets cluster-rekey
                return 0
            fi
            # else: pass through
        fi
    fi

    log $ERROR "Did not recover from current rekey status"
}
|
|
|
|
# The state machine for rekeying the vault server
|
|
#
|
|
# The overall procedure for rekey request includes:
|
|
# - wait for stability of vault servers
|
|
# - initialize the procedure
|
|
# - authenticate the rekey procedure by supplying shards
|
|
# - store the new shards
|
|
# - verify the rekey with the new shards read from k8s secrets
|
|
# - rotate the shard secrets:
|
|
# cluster-rekey - cluster-key - cluster-key-bk
|
|
# - Audit the new shards with active vault server
|
|
# - Remove artifacts of rekey procedure:
|
|
# cluster-key-bk, milestone secrets
|
|
#
|
|
# The state machine for rekeying the vault server.
#
# Steps are tested in reverse order (audit, shuffle, verify,
# authenticate, initialize) so the latest incomplete milestone is
# resumed first; falling through every test triggers recovery.
#
# Fix relative to the original: removed unused locals 'records',
# 'count' and 'result' (never referenced in this function).
function vaultRekey {
    local secrettext

    if ! needsRekey; then
        return
    fi

    # Retrieve and record the rekey status once for the tests that
    # follow
    NO_HEADER=true \
    API_TMOUT=$API_REKEY_QUERY_TMOUT \
    vaultAPI REKEY_STATUS_JSON GET $ACTIVE_TARGET /sys/rekey/init
    if [ $? -ne 0 ]; then
        # an error is printed
        REKEY_STATUS_JSON=''
        return
    fi

    needsAudit
    case $? in
        0)
            rekeyResuming
            rekeyAudit
            if [ $? -eq 0 ]; then
                # record the audit milestone, then clean up
                set_secret cluster-rekey-audit /dev/stdin \
                    <<<$( get_secret cluster-rekey-request )

                finalizeRekey
            fi
            return
            ;;
        1) # continue to procedure step
            ;;
        3) # audit is already completed
            secretExists cluster-rekey-audit >/dev/null
            if [ $? -eq 0 ]; then
                # the cluster-key secrets were audited, but vault
                # manager didn't get a chance to run finalizeRekey
                # after setting the cluster-rekey-audit milestone
                finalizeRekey
                return
            fi
            log $ERROR "Discrepancy between needsAudit and" \
                "rekeyVault"
            return
            ;;
        *)
            # an error occurs for which the procedure should not
            # continue
            return
            ;;
    esac

    needsShuffle
    case $? in
        0)
            rekeyResuming
            rekeyShuffleKeys
            return
            ;;
        1) # continue to procedure step
            ;;
        *)
            # an error occurs for which the procedure should not
            # continue
            return
            ;;
    esac

    needsVerify
    case $? in
        0)
            rekeyResuming
            rekeyVerify
            return
            ;;
        1) # continue to procedure step
            ;;
        *)
            # an error occurs for which the procedure should not
            # continue
            return
            ;;
    esac

    needsAuthentication
    case $? in
        0)
            rekeyResuming
            rekeyAuthenticate
            return
            ;;
        1) # continue to procedure step
            ;;
        *)
            # an error occurs for which the procedure should not
            # continue
            return
            ;;
    esac

    needsInitialization
    case $? in
        0)
            secrettext="$( get_secret cluster-rekey-request )"
            log $INFO "Rekey request started: $secrettext"
            rekeyInitialize
            return
            ;;
        1) # continue to failure
            ;;
        *)
            # an error occurs for which the procedure should not
            # continue
            return
            ;;
    esac

    # falling through the case statements requires remediation
    rekeyResuming
    rekeyRecovery
}
|
|
|
|
|
|
#
# LOGIC
#
# Main entry: runs only when executed directly; sourcing the script
# (e.g. for interactive debugging) stops here.
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
    # This script was sourced
    return 0
fi

# honor the helm-supplied debug pause request, if any
if [ -n "$EARLY_PAUSE" ]; then
    echo -n "$EARLY_PAUSE" > $PAUSEFILE
fi

exit_on_trap 1

# Match kubectl version to server version (or etc)
pickK8sVersion

# check if this pod is helping to convert storage from pvc to k8s
# secrets
mountHelper
exit_on_trap 15

# check if there are existing key shard secrets, boot strap secret,
# or pre-existing resource
K8S_SECRETS_PREEXIST="$( secretExists cluster-key-root )"
exit_on_trap 16
BOOTSTRAP_PREEXISTS="$( secretExists cluster-key-bootstrap )"
exit_on_trap 17
PVC_PREEXISTS="$( pvcRemoved )"
exit_on_trap 18

runConversion
exit_on_trap 19

# check if PVC still persisted after conversion, and if so issue a warning.
PVC_PREEXISTS="$( pvcRemoved )"
PVC_STATUS=$?
if [ $PVC_STATUS -eq 1 ]; then
    log $DEBUG "PVC storage $PVC_PREEXISTS is currently terminating"
elif [ $PVC_STATUS -eq 2 ]; then
    log $WARNING "PVC storage $PVC_PREEXISTS deletion has failed during conversion"
fi

# Waiting for at least one vault server, to check initialization
waitForPods 1
exit_on_trap 2

log $DEBUG "Putting a list of vault pods and ip in $WORKDIR/pods.txt"
getVaultPods > $WORKDIR/pods.txt
exit_on_trap 3

# non-zero from vaultInitialized means the cluster is not yet
# initialized and must be bootstrapped below
vaultInitialized
IS_VAULT_INITIALIZED=$?
if [ $IS_VAULT_INITIALIZED -eq 1 ]; then
    exit_on_trap 4
    desired_pods=$HA_REPLICAS

    # Waiting for vault servers to come up
    waitForPods $desired_pods
    exit_on_trap 5

    log $INFO "Putting a list of vault pods and IPs in $WORKDIR/pods.txt"
    getVaultPods > $WORKDIR/pods.txt
    exit_on_trap 6

    log $DEBUG "Initializing the vault on vault-0 and" \
        "storing keys in k8s secrets"
    initVault

    #Some sleep required to allow convergence"
    sleep "$INIT_CONVERGE_TIME"

    # pods.txt: column 2 holds each pod's address; row 1 is vault-0
    log $DEBUG "Unsealing vault-0 using the init shards"
    for row in $(awk 'NR==1{print $2}' $WORKDIR/pods.txt); do
        unsealVault "$row"
    done

    log $DEBUG "Joining other vault servers to the HA Raft cluster"
    for row in $(awk 'NR>1{print $2}' $WORKDIR/pods.txt); do
        log $DEBUG "$( grep $row $WORKDIR/pods.txt )"
        joinRaft "$row"
        sleep "$JOIN_RATE"
    done

    exit_on_trap 7
    log $INFO "Unsealing the remaining vaults"
    for row in $(awk 'NR>1{print $2}' $WORKDIR/pods.txt); do
        log $DEBUG "$( grep $row $WORKDIR/pods.txt )"
        unsealVault "$row"
        sleep "$UNSEAL_RATE"
        exit_on_trap 8
    done
else
    log $INFO "Vault is initialized"
fi

exit_on_trap 9
# initialize the state machine - vault server status records
# (one "/host/dns_name//" record per pod)
echo "" > "$PODREC_F"
while read host dns_name; do
    if [ -z "$host" ]; then
        continue
    fi
    status_rec="/$host/$dns_name//"
    echo "$status_rec" >> "$PODREC_F"
done <$WORKDIR/pods.txt

# Loop forever to check the seal status of vaults and
# unseal if required
log $INFO "Checking vault pods seal status in perpetuity..."
|
while true; do
|
|
exit_on_trap 10
|
|
sleep "$STATUS_RATE"
|
|
exit_on_trap 20
|
|
pickK8sVersion # check if the k8s server version is changed
|
|
|
|
count=$( kubectl get pods -n "${VAULT_NS}" \
|
|
-o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
|
|
| grep "^${VAULT_FN}-manager" | wc -w )
|
|
if [ "$count" -gt 1 ]; then
|
|
log $ERROR "Multiple instances of vault manager detected. Waiting until one left"
|
|
exit_on_trap 21
|
|
continue
|
|
fi
|
|
|
|
rm $WORKDIR/pods.txt
|
|
echo "" > "$PODREC_TMP_F"
|
|
exit_on_trap 11
|
|
getVaultPods > $WORKDIR/pods.txt
|
|
exit_on_trap 12
|
|
|
|
while read host dnsname; do
|
|
if [ -z "$dnsname" ]; then
|
|
# probably a recovering pod waiting for an IP address
|
|
log $DEBUG "pod list has empty data: [$host] [$dnsname]"
|
|
continue
|
|
fi
|
|
|
|
NO_HEADER=true \
|
|
API_TMOUT=$QUERY_TMOUT \
|
|
vaultAPI server_status GET $dnsname.$POD_TARGET_BASE \
|
|
/sys/health
|
|
echo -n "$server_status" > $WORKDIR/healthcheck.txt
|
|
|
|
TEMP=$( echo "$server_status" | jq -r .sealed )
|
|
|
|
exit_on_trap 13
|
|
# Decide when to unseal the vault server; includes
|
|
# Adding records to new_pods_status.txt
|
|
runStateMachine "$host" "$dnsname" "$TEMP"
|
|
exit_on_trap 14
|
|
done <$WORKDIR/pods.txt
|
|
mv "$PODREC_TMP_F" "$PODREC_F"
|
|
|
|
vaultRekey
|
|
done
|
|
kind: ConfigMap
|
|
metadata:
|
|
managedFields:
|
|
- apiVersion: v1
|
|
fieldsType: FieldsV1
|
|
fieldsV1:
|
|
f:data:
|
|
.: {}
|
|
f:init.sh: {}
|
|
manager: vault-init-unseal
|
|
name: vault-init-unseal-3
|
|
namespace: {{ .Release.Namespace }}
|
|
---
apiVersion: v1
kind: ConfigMap
metadata:
  # NOTE(review): managedFields is normally a server-managed section and looks
  # carried over from a `kubectl get -o yaml` export; confirm whether it can
  # be dropped from this template.
  managedFields:
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:data:
        .: {}
        f:pvc-attach.yaml: {}
    manager: {{ .Values.vault.name }}-mount-helper
  name: {{ .Values.vault.name }}-mount-helper
  namespace: {{ .Release.Namespace }}
data:
  # Job manifest mounted into the manager pod at /opt/yaml; runs this chart's
  # init.sh in MOUNT_HELPER mode with the legacy PVC attached at /mnt/data.
  pvc-attach.yaml: |
    ---
    apiVersion: batch/v1
    kind: Job
    metadata:
      name: {{ .Values.vault.fullname }}-mount-helper
      # Use the release namespace (was hard-coded to "vault") so the Job is
      # created alongside the rest of the chart's resources.
      namespace: {{ .Release.Namespace }}
    spec:
      activeDeadlineSeconds: 600
      completions: 1
      parallelism: 1
      ttlSecondsAfterFinished: 0
      template:
        spec:
          restartPolicy: Never
          serviceAccountName: "{{ .Values.vault.fullname }}-manager-1"
          {{- if .Values.manager.imagePullSecrets }}
          imagePullSecrets:
          {{- toYaml .Values.manager.imagePullSecrets | nindent 12 }}
          {{- end }}
          {{- if .Values.manager.tolerations }}
          tolerations:
          {{- tpl .Values.manager.tolerations . | nindent 12 }}
          {{- end }}
          containers:
          - name: mount
            image: "{{ .Values.manager.image.repository }}:{{ .Values.manager.image.tag }}"
            imagePullPolicy: "{{ .Values.manager.image.pullPolicy }}"
            args:
            - bash
            - /opt/script/init.sh
            env:
            - name: MANAGER_MODE
              value: MOUNT_HELPER
            - name: PVC_DIR
              value: /mnt/data
            volumeMounts:
            - name: mount-helper
              mountPath: /opt/script
              readOnly: true
            - name: manager-pvc
              mountPath: /mnt/data
              readOnly: false
          volumes:
          - name: mount-helper
            configMap:
              name: vault-init-unseal-3
          - name: manager-pvc
            persistentVolumeClaim:
              claimName: manager-pvc-sva-vault-manager-0
|
|
---
# Namespaced permissions for the vault-manager service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ .Values.vault.fullname }}-manager-1
rules:
# The manager script polls vault pods (getVaultPods, the status loop's
# `get pods`), so it needs read access to pods.
- apiGroups: [""] # "" indicates the core API group
  resources: ["pods"]
  verbs: ["get", "watch", "list"]
# NOTE(review): pods/exec create — presumably used by the storage-conversion
# or mount-helper flow; confirm against the script functions not shown here.
- apiGroups: [""] # "" indicates the core API group
  resources: ["pods/exec"]
  verbs: ["create"]
# Key-shard / bootstrap secrets (e.g. cluster-key-root, cluster-key-bootstrap)
# are read and managed by the script (secretExists, initVault).
- apiGroups: [""] # "" indicates the core API group
  resources: ["secrets"]
  verbs: ["get", "create", "delete"]
# The mount-helper Job (pvc-attach.yaml) is created and cleaned up by the
# manager.
- apiGroups: ["batch"]
  resources: ["jobs"]
  verbs: ["get", "create", "delete"]
# PVC-to-secret conversion checks for and removes the legacy PVC
# (pvcRemoved / runConversion).
- apiGroups: [""] # "" indicates the core API group
  resources: ["persistentvolumeclaims"]
  verbs: ["list", "delete"]
|
|
---
|
|
apiVersion: v1
|
|
kind: ServiceAccount
|
|
metadata:
|
|
name: {{ .Values.vault.fullname }}-manager-1
|
|
namespace: {{ .Release.Namespace }}
|
|
labels:
|
|
helm.sh/chart: {{ .Values.manager.chart }}
|
|
app.kubernetes.io/name: {{ .Values.vault.name }}-manager
|
|
app.kubernetes.io/instance: {{ .Release.Name }}
|
|
app.kubernetes.io/managed-by: {{ .Release.Service }}
|
|
---
# Grants the manager Role to the manager ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: {{ .Values.vault.fullname }}-manager-1
  namespace: {{ .Release.Namespace }}
subjects:
- kind: ServiceAccount
  name: {{ .Values.vault.fullname }}-manager-1
  # ServiceAccount subjects require an explicit namespace; without it the API
  # server rejects the binding ("subjects[0].namespace: Required value").
  namespace: {{ .Release.Namespace }}
roleRef:
  kind: Role
  name: {{ .Values.vault.fullname }}-manager-1
  apiGroup: rbac.authorization.k8s.io
|
|
---
|
|
apiVersion: apps/v1
|
|
kind: StatefulSet
|
|
metadata:
|
|
name: {{ .Values.vault.fullname }}-manager-3
|
|
namespace: {{ .Release.Namespace }}
|
|
labels:
|
|
app.kubernetes.io/name: {{ .Values.vault.name }}-manager
|
|
app.kubernetes.io/instance: {{ .Release.Name }}
|
|
app.kubernetes.io/managed-by: {{ .Release.Service }}
|
|
component: webhook
|
|
spec:
|
|
serviceName: {{ .Values.vault.fullname }}
|
|
replicas: 1
|
|
selector:
|
|
matchLabels:
|
|
app.kubernetes.io/instance: {{ .Release.Name }}
|
|
component: webhook
|
|
template:
|
|
metadata:
|
|
labels:
|
|
app.kubernetes.io/name: {{ .Values.vault.name }}-manager
|
|
app.kubernetes.io/instance: {{ .Release.Name }}
|
|
component: webhook
|
|
{{- if .Values.manager.extraLabels }}
|
|
{{- toYaml .Values.manager.extraLabels | nindent 8 -}}
|
|
{{- end }}
|
|
spec:
|
|
serviceAccountName: "{{ .Values.vault.fullname }}-manager-1"
|
|
{{- if .Values.manager.imagePullSecrets }}
|
|
imagePullSecrets:
|
|
{{- toYaml .Values.manager.imagePullSecrets | nindent 8 }}
|
|
{{- end }}
|
|
{{- if .Values.manager.tolerations }}
|
|
tolerations:
|
|
{{- tpl .Values.manager.tolerations . | nindent 8 }}
|
|
{{- end }}
|
|
containers:
|
|
- name: manager
|
|
image: "{{ .Values.manager.image.repository }}:{{ .Values.manager.image.tag }}"
|
|
imagePullPolicy: "{{ .Values.manager.image.pullPolicy }}"
|
|
args:
|
|
- bash
|
|
- /opt/script/init.sh
|
|
env:
|
|
- name: CA_CERT
|
|
value: /mnt/data/ca/tls.crt
|
|
volumeMounts:
|
|
- name: vault-init-unseal-3
|
|
mountPath: /opt/script
|
|
readOnly: false
|
|
- name: mount-helper-yaml
|
|
mountPath: /opt/yaml
|
|
readOnly: true
|
|
- name: vault-ca
|
|
mountPath: /mnt/data/ca
|
|
readOnly: true
|
|
volumes:
|
|
- name: vault-init-unseal-3
|
|
configMap:
|
|
name: vault-init-unseal-3
|
|
- name: mount-helper-yaml
|
|
configMap:
|
|
name: {{ .Values.vault.name }}-mount-helper
|
|
- name: vault-ca
|
|
secret:
|
|
secretName: vault-ca
|