config/workerconfig/workerconfig/worker_config

393 lines
13 KiB
Bash

#!/bin/bash
#
# Copyright (c) 2013-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# chkconfig: 2345 80 80
#
### BEGIN INIT INFO
# Provides: worker_config
# Short-Description: Worker node config agent
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
### END INIT INFO
# Pull in platform-wide settings. Names used in this script but never
# assigned here (CONFIG_PATH, VOLATILE_PATH, nodetype, ...) are presumably
# provided by these two files.
source /usr/bin/tsconfig
source /etc/platform/platform.conf

# Platform configuration locations
PLATFORM_DIR="/opt/platform"
CONFIG_DIR="$CONFIG_PATH"

# Flag files recording the outcome of the last configuration attempt
VOLATILE_CONFIG_PASS="/var/run/.config_pass"
VOLATILE_CONFIG_FAIL="/var/run/.config_fail"

LOGFILE="/var/log/worker_config.log"
IMA_POLICY="/etc/ima.policy"

# Copy of /opt/platform required for worker_services
VOLATILE_PLATFORM_PATH="$VOLATILE_PATH/cpe_upgrade_opt_platform"

# Maximum time (seconds) to wait for name resolution / controller services.
# On a controller the limit is raised to give active services time to
# recover from a reboot or DOR.
case "$nodetype" in
    controller)
        DELAY_SEC=900
        ;;
    *)
        DELAY_SEC=600
        ;;
esac
# Report an unrecoverable configuration error and terminate the script.
#
# Arguments: $1 - message to display and log
# Globals:   VOLATILE_CONFIG_FAIL (read) - flag file created to record the failure
# Outputs:   banner containing the message, then a pause notice, on stdout;
#            the message is also sent to syslog via logger
# Returns:   never returns; exits the shell with status 1 after 5 seconds
fatal_error()
{
    local message=$1
    local border='*****************************************************'

    # Message framed by two border lines above and two below.
    printf '%s\n' "$border" "$border" "$message" "$border" "$border"

    touch "$VOLATILE_CONFIG_FAIL"
    logger "Error: $message"

    echo "Pausing for 5 seconds..."
    sleep 5
    exit 1
}
# Resolve a hostname to an IP address and print it on stdout.
#
# /etc/hosts is consulted first; if the host is not listed there, DNS is
# polled with dig every 5 seconds for up to DELAY_SEC seconds.
#
# Arguments: $1 - hostname to resolve
# Globals:   DELAY_SEC (read) - DNS retry window in seconds
# Outputs:   the address on stdout; nothing when resolution fails
# Returns:   0 when an address was printed, 1 when DNS never produced one
get_ip()
{
    local host=$1
    local ipaddr
    local start deadline duration

    # Check /etc/hosts for the hostname
    ipaddr=$(awk -v host="$host" '$2 == host {print $1}' /etc/hosts)
    if [ -n "$ipaddr" ]
    then
        # Left unquoted on purpose: when several lines match, the addresses
        # are joined with single spaces exactly as before.
        echo $ipaddr
        return 0
    fi

    # Use arithmetic expansion rather than "let -i ...": let has no -i
    # option, so "-i" was evaluated as an expression on variable "i" and a
    # non-numeric value there aborted the assignment of the deadline.
    start=$SECONDS
    deadline=$(( SECONDS + DELAY_SEC ))
    while [ "$deadline" -ge "$SECONDS" ]
    do
        # dnsmasq can resolve a hostname to both an IPv4 and an IPv6 address
        # in certain situations. The last address returned is the IPv6
        # (management) one, which is preferred over the IPv4 pxeboot address,
        # so only the last line of the answer is used.
        ipaddr=$(dig +short ANY "$host" | tail -1)

        # Accept the answer only if it looks like an IPv4 or IPv6 address.
        if [[ "$ipaddr" =~ ^[0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*$ ]] ||
           [[ "$ipaddr" =~ ^[0-9a-z]*\:[0-9a-z\:]*$ ]]
        then
            duration=$(( SECONDS - start ))
            logger -t "$0" -p info "DNS query resolved to $ipaddr (took ${duration} secs)"
            echo "$ipaddr"
            return 0
        fi

        logger -t "$0" -p warn "DNS query failed for $host"
        sleep 5
    done

    duration=$(( SECONDS - start ))
    logger -t "$0" -p warn "DNS query failed after max retries for $host (${duration} secs)"
    return 1
}
# Poll sm-query until the cloud-services service group reports active.
#
# The loop condition compares bash's SECONDS (time since this shell
# started) with DELAY_SEC, so the limit is measured from script start, not
# from the moment this function is called.
#
# Globals: DELAY_SEC (read), OUTPUT (written - last sm-query answer)
# Returns: 0 once the group is active, 1 if the time limit passes first
wait_for_controller_services()
{
    local active_state="cloud-services active"

    while [ "$SECONDS" -le "$DELAY_SEC" ]; do
        # Check to make sure the cloud-services group is enabled
        OUTPUT=$(sm-query service-group cloud-services)
        [ "$OUTPUT" == "$active_state" ] && return 0

        # Not active yet; wait a couple of seconds and check again
        sleep 2
    done

    return 1
}
# Configure this host as a worker node (or as the worker subfunction of a
# controller). Runs for the init "start" action.
#
# Order of operations: guard checks, optional IMA policy load, resolve this
# host's IP, wait for the active controller, mount/copy platform
# configuration, verify the installed load, controller-1 restore/upgrade
# handling, puppet manifest apply, nbd module load, NFS mounts.
#
# Globals read but not assigned in this file (presumably supplied by the
# sourced tsconfig / platform.conf): nodetype, subfunction,
# security_profile, PLATFORM_SIMPLEX_FLAG, http_port, SW_VERSION,
# INSTALL_UUID, PUPPET_PATH, VOLATILE_PATH, VOLATILE_DISABLE_WORKER_SERVICES.
#
# Exits 0 early when there is nothing to do, exits 1 (directly or through
# fatal_error) on failure, and on success touches $VOLATILE_CONFIG_PASS and
# returns to the caller.
start()
{
# Refuse to configure a host whose installation was flagged as failed.
if [ -f /etc/platform/installation_failed ] ; then
fatal_error "/etc/platform/installation_failed flag is set. Aborting."
fi
# Second comma-separated field of $subfunction. cut passes a line without a
# comma through unchanged, so a single-valued subfunction is compared as-is.
function=`echo "$subfunction" | cut -f 2 -d','`
# Nothing to do unless the node type or that subfunction field is "worker".
if [ "$nodetype" != "worker" -a "$function" != "worker" ] ; then
logger -t $0 -p warn "exiting because this is not worker node"
exit 0
fi
# If we're on a controller, ensure we only run if the controller config is complete
if [ "$nodetype" = "controller" -a ! -f /etc/platform/.initial_controller_config_complete ]
then
logger -t $0 -p warn "exiting because this is controller that has not completed initial config"
exit 0
fi
# Exit in error if called while the fail flag file is present
if [ -e $VOLATILE_CONFIG_FAIL ] ; then
logger -t $0 -p warn "exiting due to presence of $VOLATILE_CONFIG_FAIL file"
exit 1
fi
# remove previous pass flag file so that if this fails we don't
# end up with both pass and fail flag files present
rm -f $VOLATILE_CONFIG_PASS
# Compare device:inode of our root directory with that of PID 1's root;
# a mismatch means this script is running inside a chroot.
if [ "$(stat -c %d:%i /)" != "$(stat -c %d:%i /proc/1/root/.)" ]; then
# we are in chroot installer environment
exit 0
fi
echo "Configuring worker node..."
###### SECURITY PROFILE (EXTENDED) #################
# If we are in Extended Security Profile mode, #
# then before anything else, we need to load the #
# IMA Policy so that all configuration operations #
# can be measured and appraised #
# #
# N.B: Only run for worker nodetype since for AIO #
# controllerconfig would have already enabled IMA #
# policy #
#####################################################
if [ "$nodetype" = "worker" -a "${security_profile}" = "extended" ]
then
IMA_LOAD_PATH=/sys/kernel/security/ima/policy
if [ -f ${IMA_LOAD_PATH} ]; then
echo "Loading IMA Policy"
# Best effort operation only, if policy is
# malformed then audit logs will indicate this,
# and customer will need to load policy manually
cat $IMA_POLICY > ${IMA_LOAD_PATH}
[ $? -eq 0 ] || logger -t $0 -p warn "IMA Policy could not be loaded, see audit.log"
else
# the securityfs mount should have been
# created had the IMA module loaded properly.
# This is therefore a fatal error
fatal_error "${IMA_LOAD_PATH} not available. Aborting."
fi
fi
# A usable (non-empty, non-"localhost") hostname is required from here on.
HOST=$(hostname)
if [ -z "$HOST" -o "$HOST" = "localhost" ]
then
fatal_error "Host undefined. Unable to perform config"
fi
# Start a fresh log: this overwrites $LOGFILE with the current timestamp.
date "+%FT%T.%3N" > $LOGFILE
# get_ip prints nothing when the host cannot be resolved.
IPADDR=$(get_ip $HOST)
if [ -z "$IPADDR" ]
then
fatal_error "Unable to get IP from host: $HOST"
fi
# wait for controller services to be ready if it is an AIO system
# since ping the loopback interface always returns ok
if [ -e "${PLATFORM_SIMPLEX_FLAG}" ]
then
echo "Wait for the controller services"
wait_for_controller_services
if [ $? -ne 0 ]
then
fatal_error "Controller services are not ready"
fi
else
# Not simplex: verify the active controller is reachable from our address.
/usr/local/bin/connectivity_test -t ${DELAY_SEC} -i ${IPADDR} controller-platform-nfs
if [ $? -ne 0 ]
then
# 'controller-platform-nfs' is not available from management address
fatal_error "Unable to contact active controller (controller-platform-nfs) from management address"
fi
fi
# Write the hostname to file so it's persistent
echo $HOST > /etc/hostname
if ! [ -e "${PLATFORM_SIMPLEX_FLAG}" ]
then
# Mount the platform filesystem (if necessary - could be auto-mounted by now)
mkdir -p $PLATFORM_DIR
# The presence of $CONFIG_DIR/hosts is used as the "already mounted" test.
if [ ! -f $CONFIG_DIR/hosts ]
then
nfs-mount controller-platform-nfs:$PLATFORM_DIR $PLATFORM_DIR > /dev/null 2>&1
RC=$?
if [ $RC -ne 0 ]
then
fatal_error "Unable to mount $PLATFORM_DIR (RC:$RC)"
fi
fi
# Copy over external_ceph config files
if [ -e $CONFIG_DIR/ceph-config ]
then
cp $CONFIG_DIR/ceph-config/*.conf /etc/ceph/
if [ $? -ne 0 ]
then
fatal_error "Unable to copy ceph-external config files"
fi
fi
fi
if [ "$nodetype" = "worker" ]
then
# Check whether our installed load matches the active controller
# (-f makes curl return non-zero on an HTTP error, -s keeps it quiet)
CONTROLLER_UUID=`curl -sf http://controller:${http_port}/feed/rel-${SW_VERSION}/install_uuid`
if [ $? -ne 0 ]
then
fatal_error "Unable to retrieve installation uuid from active controller"
fi
if [ "$INSTALL_UUID" != "$CONTROLLER_UUID" ]
then
fatal_error "This node is running a different load than the active controller and must be reinstalled"
fi
fi
# banner customization always returns 0, success:
/usr/sbin/install_banner_customization
# Replace the local hosts file with the one from the platform configuration.
cp $CONFIG_DIR/hosts /etc/hosts
if [ $? -ne 0 ]
then
fatal_error "Unable to copy $CONFIG_DIR/hosts"
fi
if [ "$nodetype" = "controller" -a "$HOST" = "controller-1" ]
then
# In a small system restore, there may be instance data that we want to
# restore. Copy it and delete it.
MATE_INSTANCES_DIR="$CONFIG_DIR/controller-1_nova_instances"
if [ -d "$MATE_INSTANCES_DIR" ]
then
echo "Restoring instance data from mate controller"
# NOTE(review): the result of cp is not checked, so the source directory
# is removed below even if the copy failed - confirm this is intended.
cp -Rp $MATE_INSTANCES_DIR/* /etc/nova/instances/
rm -rf $MATE_INSTANCES_DIR
fi
fi
# Upgrade related checks for controller-1 in combined controller/worker
if [ "$nodetype" = "controller" -a "$HOST" = "controller-1" ]
then
# Check controller activity.
# Prior to the final compile of R5 the service check below had been
# against platform-nfs-ip. However, there was a worker
# subfunction configuration failure when an AIO-DX system controller
# booted up while there was no pingable backup controller. Seems the
# platform-nfs-ip service was not always reaching the enabled-active
# state when this check was performed under this particular failure.
# Seems an earlier launched service of like functionality, namely
# 'platform-export-fs' is reliably enabled at this point there-by
# resolving the issue.
sm-query service platform-export-fs | grep enabled-active > /dev/null 2>&1
if [ $? -ne 0 ]
then
# This controller is not active so it is safe to check the version
# of the mate controller.
VOLATILE_ETC_PLATFORM_MOUNT=$VOLATILE_PATH/etc_platform
mkdir $VOLATILE_ETC_PLATFORM_MOUNT
nfs-mount controller-0:/etc/platform $VOLATILE_ETC_PLATFORM_MOUNT
if [ $? -eq 0 ]
then
# Check whether software versions match on the two controllers
# The mate's platform.conf is sourced inside the command substitution
# (a subshell), so none of its settings leak into this shell.
MATE_SW_VERSION=$(source $VOLATILE_ETC_PLATFORM_MOUNT/platform.conf && echo $sw_version)
# NOTE(review): both operands are unquoted; if either is empty the test
# errors out and is treated as false, i.e. as "versions match" - verify.
if [ $SW_VERSION != $MATE_SW_VERSION ]
then
echo "Controllers are running different software versions"
echo "SW_VERSION: $SW_VERSION MATE_SW_VERSION: $MATE_SW_VERSION"
# Since controller-1 is always upgraded first (and downgraded
# last), we know that controller-1 is running a higher release
# than controller-0.
# This controller is not active and is running a higher
# release than the mate controller, so do not launch
# any of the worker services (they will not work with
# a lower version of the controller services).
echo "Disabling worker services until controller activated"
touch $VOLATILE_DISABLE_WORKER_SERVICES
# Copy $PLATFORM_DIR into a temporary location for the worker_services script to
# access. This is only required for CPE upgrades
rm -rf $VOLATILE_PLATFORM_PATH
mkdir -p $VOLATILE_PLATFORM_PATH
cp -Rp $PLATFORM_DIR/* $VOLATILE_PLATFORM_PATH/
fi
# Done with the mate's /etc/platform: unmount and remove the mount point.
umount $VOLATILE_ETC_PLATFORM_MOUNT
rmdir $VOLATILE_ETC_PLATFORM_MOUNT
else
# Mount failed: remove the (empty) mount point before aborting.
rmdir $VOLATILE_ETC_PLATFORM_MOUNT
fatal_error "Unable to mount /etc/platform"
fi
else
# Controller-1 (CPE) is active and is rebooting. This is probably a DOR. Since this
# could happen during an upgrade, we will copy $PLATFORM_DIR into a temporary
# location for the worker_services script to access in case of a future swact.
rm -rf $VOLATILE_PLATFORM_PATH
mkdir -p $VOLATILE_PLATFORM_PATH
cp -Rp $PLATFORM_DIR/* $VOLATILE_PLATFORM_PATH/
fi
fi
# Apply the puppet manifest
# The per-host hiera file is named after the IP address resolved earlier;
# its absence means the host has no configuration available yet.
HOST_HIERA=${PUPPET_PATH}/hieradata/${IPADDR}.yaml
if [ -f ${HOST_HIERA} ]; then
echo "$0: Running puppet manifest apply"
puppet-manifest-apply.sh ${PUPPET_PATH}/hieradata ${IPADDR} worker
RC=$?
if [ $RC -ne 0 ];
then
fatal_error "Failed to run the puppet manifest (RC:$RC)"
fi
else
fatal_error "Host configuration not yet available for this node ($(hostname)=${IPADDR}); aborting configuration."
fi
# Load Network Block Device
# (failure is only reported as a warning; configuration continues)
modprobe nbd
if [ $? -ne 0 ]
then
echo "WARNING: Unable to load kernel module: nbd."
logger "WARNING: Unable to load kernel module: nbd."
fi
#Run mount command to mount any NFS filesystems that required network access
/bin/mount -a -t nfs
RC=$?
if [ $RC -ne 0 ]
then
fatal_error "Unable to mount NFS filesystems (RC:$RC)"
fi
# Every step succeeded: record the pass flag.
touch $VOLATILE_CONFIG_PASS
}
# Handler for the init "stop" action.
#
# This script only performs one-shot configuration at start, so there is
# nothing to undo; the function simply returns to the caller.
stop()
{
    return
}
# Dispatch on the init action passed as the first argument.
case "$1" in
    start|stop)
        # The action name is also the name of its handler function.
        "$1"
        ;;
    *)
        echo "Usage: $0 {start|stop}"
        exit 1
        ;;
esac

exit 0