config/workerconfig/workerconfig/worker_config

#!/bin/bash
#
# Copyright (c) 2013-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

#
# chkconfig: 2345 80 80
#

### BEGIN INIT INFO
# Provides:		worker_config
# Short-Description: 	Worker node config agent
# Default-Start:	2 3 4 5
# Default-Stop:		0 1 6
### END INIT INFO

# Shared settings come from these two files. Names used later in this script
# but never assigned here (e.g. nodetype, CONFIG_PATH, VOLATILE_PATH) are
# presumably defined by them -- TODO confirm against those files.
. /usr/bin/tsconfig
. /etc/platform/platform.conf

# Flag files recording the outcome of the last configuration attempt.
VOLATILE_CONFIG_PASS="/var/run/.config_pass"
VOLATILE_CONFIG_FAIL="/var/run/.config_fail"

# Fixed locations used by this script.
LOGFILE="/var/log/worker_config.log"
IMA_POLICY="/etc/ima.policy"
PLATFORM_DIR="/opt/platform"
CONFIG_DIR="${CONFIG_PATH}"

# Copy of /opt/platform required for worker_services
VOLATILE_PLATFORM_PATH="${VOLATILE_PATH}/cpe_upgrade_opt_platform"

# Upper bound, in seconds, on how long we wait for the controller.
# A controller gets a larger value to allow for active services to
# recover from a reboot or DOR.
case "$nodetype" in
    controller)
        DELAY_SEC=900
        ;;
    *)
        DELAY_SEC=600
        ;;
esac

# Report an unrecoverable configuration error and terminate the script.
#
# Arguments: $1 - message describing the failure
# Outputs:   the message framed by a banner, then a pause notice, on stdout;
#            the message is also handed to logger prefixed with "Error: "
# Effects:   touches $VOLATILE_CONFIG_FAIL, sleeps 5 seconds, exits with 1
fatal_error()
{
    local banner="*****************************************************"

    printf '%s\n' "$banner" "$banner" "$1" "$banner" "$banner"

    # Leave the fail flag behind before logging and pausing
    touch $VOLATILE_CONFIG_FAIL
    logger "Error: $1"

    echo "Pausing for 5 seconds..."
    sleep 5
    exit 1
}

# Print the IP address of a host on stdout.
#
# /etc/hosts is consulted first (second column must equal the host name).
# If there is no entry, DNS is polled with dig every 5 seconds for up to
# DELAY_SEC seconds. Nothing is printed if the lookup never succeeds, so
# callers must treat empty output as failure.
#
# Arguments: $1 - host name to resolve
# Globals:   DELAY_SEC (read) - DNS retry budget in seconds
# Outputs:   the address on stdout; progress and failures via logger
get_ip()
{
    local host=$1
    local ipaddr
    local began deadline elapsed

    # Check /etc/hosts for the hostname
    ipaddr=$(awk -v host="$host" '$2 == host {print $1}' /etc/hosts)
    if [ -n "$ipaddr" ]
    then
        # Deliberately unquoted: several matching lines are collapsed onto
        # one space-separated line, which is what callers have always seen.
        echo $ipaddr
        return
    fi

    began=$SECONDS
    deadline=$(( SECONDS + DELAY_SEC ))
    while [ "$deadline" -ge "$SECONDS" ]
    do

        # dnsmasq can, in certain situations, resolve a hostname to both an
        # IPv4 and an IPv6 address. The IPv6 address is listed last and is
        # the management address, which is preferred over the IPv4 pxeboot
        # address, so take the last address only.
        ipaddr=$(dig +short ANY "$host" | tail -1)

        # Accept either a dotted-quad IPv4 address or something that looks
        # like an IPv6 address; anything else counts as a failed query.
        if [[ "$ipaddr" =~ ^[0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*$ ]] ||
           [[ "$ipaddr" =~ ^[0-9a-z]*\:[0-9a-z\:]*$ ]]
        then
            elapsed=$(( SECONDS - began ))
            logger -t "$0" -p info "DNS query resolved to $ipaddr (took ${elapsed} secs)"
            echo "$ipaddr"
            return
        fi

        logger -t "$0" -p warn "DNS query failed for $host"
        sleep 5
    done

    elapsed=$(( SECONDS - began ))
    logger -t "$0" -p warn "DNS query failed after max retries for $host (${elapsed} secs)"
}

# Poll sm-query until the cloud-services service group reports active.
#
# Globals: DELAY_SEC (read), OUTPUT (written with the last sm-query reply)
# Returns: 0 as soon as the group is active, 1 once the time budget is spent
wait_for_controller_services()
{
    while true
    do
        # SECONDS counts from shell start-up, so the DELAY_SEC budget is
        # measured from the start of this script, not from this call.
        [ "$SECONDS" -le "$DELAY_SEC" ] || break

        # Check to make sure the cloud-services group is enabled
        OUTPUT=$(sm-query service-group cloud-services)
        case "$OUTPUT" in
            "cloud-services active")
                return 0
                ;;
        esac

        # Not active yet; wait a couple of seconds and check again
        sleep 2
    done

    return 1
}

# Configure this node as a worker (or as the worker subfunction of a
# controller) and record the outcome in the volatile pass/fail flag files.
#
# Sequence, as implemented below:
#   1. Bail out early: installation failed, not a worker, controller not yet
#      configured, previous fail flag present, or running inside a chroot.
#   2. Load the IMA policy (worker nodetype with extended security profile).
#   3. Resolve this host's IP and wait for the controller to be reachable.
#   4. Mount the platform filesystem, copy ceph config, verify installed load.
#   5. On controller-1: restore instance data and run upgrade version checks.
#   6. Apply the puppet manifest, load nbd, mount NFS filesystems.
#
# Globals read but not set in this file (presumably provided by the files
# sourced at the top -- TODO confirm): nodetype, subfunction,
# security_profile, http_port, SW_VERSION, INSTALL_UUID, PUPPET_PATH,
# VOLATILE_PATH, PLATFORM_SIMPLEX_FLAG, VOLATILE_DISABLE_WORKER_SERVICES.
#
# Exits 0 when there is nothing to do and 1 (directly or through
# fatal_error) on failure. On success touches $VOLATILE_CONFIG_PASS and
# returns to the caller.
start()
{
    if [ -f /etc/platform/installation_failed ] ; then
        fatal_error "/etc/platform/installation_failed flag is set. Aborting."
    fi

    # Second comma-separated field of $subfunction
    function=`echo "$subfunction" | cut -f 2 -d','`

    if [ "$nodetype" != "worker" -a "$function" != "worker" ] ; then
        logger -t $0 -p warn "exiting because this is not worker node"
        exit 0
    fi

    # If we're on a controller, ensure we only run if the controller config is complete
    if [ "$nodetype" = "controller" -a ! -f /etc/platform/.initial_controller_config_complete ]
    then
        logger -t $0 -p warn "exiting because this is controller that has not completed initial config"
        exit 0
    fi

    # Exit in error if called while the fail flag file is present
    if [ -e $VOLATILE_CONFIG_FAIL ] ; then
        logger -t $0 -p warn "exiting due to presence of $VOLATILE_CONFIG_FAIL file"
        exit 1
    fi

    # remove previous pass flag file so that if this fails we don't
    # end up with both pass and fail flag files present
    rm -f $VOLATILE_CONFIG_PASS


    # Compare device:inode of our root with that of PID 1's root; a
    # difference means this process has a different root directory.
    if [ "$(stat -c %d:%i /)" != "$(stat -c %d:%i /proc/1/root/.)" ]; then
        # we are in chroot installer environment
        exit 0
    fi
    echo "Configuring worker node..."

    ######  SECURITY PROFILE (EXTENDED) #################
    # If we are in Extended Security Profile mode,      #
    # then before anything else, we need to load the    #
    # IMA Policy so that all configuration operations   #
    # can be measured and appraised                     #
    #                                                   #
    # N.B: Only run for worker nodetype since for AIO  #
    # controllerconfig would have already enabled IMA   #
    # policy                                            #
    #####################################################
    if [ "$nodetype" = "worker" -a "${security_profile}" = "extended" ]
    then
        IMA_LOAD_PATH=/sys/kernel/security/ima/policy
        if [ -f ${IMA_LOAD_PATH} ]; then
            echo "Loading IMA Policy"
            # Best effort operation only, if policy is
            # malformed then audit logs will indicate this,
            # and customer will need to load policy manually
            cat $IMA_POLICY > ${IMA_LOAD_PATH}
            [ $? -eq 0 ] || logger -t $0 -p warn "IMA Policy could not be loaded, see audit.log"
        else
            # the securityfs mount should have been
            # created had the IMA module loaded properly.
            # This is therefore a fatal error
            fatal_error "${IMA_LOAD_PATH} not available. Aborting."
        fi
    fi

    HOST=$(hostname)
    if [ -z "$HOST" -o "$HOST" = "localhost" ]
    then
        fatal_error "Host undefined. Unable to perform config"
    fi

    # Start a fresh log file containing only a timestamp
    date "+%FT%T.%3N" > $LOGFILE
    IPADDR=$(get_ip $HOST)
    if [ -z "$IPADDR" ]
    then
        fatal_error "Unable to get IP from host: $HOST"
    fi

    # wait for controller services to be ready if it is an AIO system
    # since ping the loopback interface always returns ok
    if [ -e "${PLATFORM_SIMPLEX_FLAG}" ]
    then
         echo "Wait for the controller services"
         wait_for_controller_services
         if [ $? -ne 0 ]
         then
              fatal_error "Controller services are not ready"
         fi
    else
         /usr/local/bin/connectivity_test -t ${DELAY_SEC} -i ${IPADDR} controller-platform-nfs
         if [ $? -ne 0 ]
         then
              # 'controller-platform-nfs' is not available from management address
              fatal_error "Unable to contact active controller (controller-platform-nfs) from management address"
         fi
    fi
    # Write the hostname to file so it's persistent
    echo $HOST > /etc/hostname

    if ! [ -e "${PLATFORM_SIMPLEX_FLAG}" ]
    then
        # Mount the platform filesystem (if necessary - could be auto-mounted by now)
        mkdir -p $PLATFORM_DIR
        # A visible hosts file under $CONFIG_DIR is taken as evidence that
        # the filesystem is already mounted.
        if [ ! -f $CONFIG_DIR/hosts ]
        then
            nfs-mount controller-platform-nfs:$PLATFORM_DIR $PLATFORM_DIR > /dev/null 2>&1
            RC=$?
            if [ $RC -ne 0 ]
            then
                fatal_error "Unable to mount $PLATFORM_DIR (RC:$RC)"
            fi
        fi

        # Copy over external_ceph config files
        if [ -e $CONFIG_DIR/ceph-config ]
        then
            cp $CONFIG_DIR/ceph-config/*.conf /etc/ceph/
            if [ $? -ne 0 ]
            then
                fatal_error "Unable to copy ceph-external config files"
            fi
        fi
    fi

    if [ "$nodetype" = "worker" ]
    then
        # Check whether our installed load matches the active controller
        CONTROLLER_UUID=`curl -sf http://controller:${http_port}/feed/rel-${SW_VERSION}/install_uuid`
        if [ $? -ne 0 ]
        then
            fatal_error "Unable to retrieve installation uuid from active controller"
        fi

        if [ "$INSTALL_UUID" != "$CONTROLLER_UUID" ]
        then
            fatal_error "This node is running a different load than the active controller and must be reinstalled"
        fi
    fi

    # banner customization always returns 0, success:
    /usr/sbin/install_banner_customization

    cp $CONFIG_DIR/hosts /etc/hosts
    if [ $? -ne 0 ]
    then
        fatal_error "Unable to copy $CONFIG_DIR/hosts"
    fi

    if [ "$nodetype" = "controller" -a "$HOST" = "controller-1" ]
    then
        # In a small system restore, there may be instance data that we want to
        # restore. Copy it and delete it.
        MATE_INSTANCES_DIR="$CONFIG_DIR/controller-1_nova_instances"
        if [ -d "$MATE_INSTANCES_DIR" ]
        then
            echo "Restoring instance data from mate controller"
            # NOTE(review): the source directory is removed even when the
            # copy fails -- confirm this is intended.
            cp -Rp $MATE_INSTANCES_DIR/* /etc/nova/instances/
            rm -rf $MATE_INSTANCES_DIR
        fi
    fi

    # Upgrade related checks for controller-1 in combined controller/worker
    if [ "$nodetype" = "controller" -a "$HOST" = "controller-1" ]
    then
        # Check controller activity.
        # Prior to the final compile of R5 the service check below had been
        # against platform-nfs-ip. However, there was a worker
        # subfunction configuration failure when an AIO-DX system controller
        # booted up while there was no pingable backup controller. Seems the
        # platform-nfs-ip service was not always reaching the enabled-active
        # state when this check was performed under this particular failure.
        # Seems an earlier launched service of like functionality, namely
        # 'platform-export-fs' is reliably enabled at this point there-by
        # resolving the issue.
        sm-query service platform-export-fs | grep enabled-active > /dev/null 2>&1
        if [ $? -ne 0 ]
        then
            # This controller is not active so it is safe to check the version
            # of the mate controller.
            VOLATILE_ETC_PLATFORM_MOUNT=$VOLATILE_PATH/etc_platform
            # -p: tolerate a directory left over from an earlier attempt
            mkdir -p $VOLATILE_ETC_PLATFORM_MOUNT
            nfs-mount controller-0:/etc/platform $VOLATILE_ETC_PLATFORM_MOUNT
            if [ $? -eq 0 ]
            then
                # Check whether software versions match on the two controllers.
                # The mate's platform.conf is sourced in a subshell so its
                # settings do not overwrite ours.
                MATE_SW_VERSION=$(source $VOLATILE_ETC_PLATFORM_MOUNT/platform.conf && echo $sw_version)
                if [ -z "$MATE_SW_VERSION" ]
                then
                    # Without a mate version there is nothing to compare.
                    # Carry on as for matching versions, but say so in the
                    # log instead of failing the comparison silently.
                    logger -t $0 -p warn "Unable to determine software version of mate controller; skipping version check"
                elif [ "$SW_VERSION" != "$MATE_SW_VERSION" ]
                then
                    echo "Controllers are running different software versions"
                    echo "SW_VERSION: $SW_VERSION  MATE_SW_VERSION: $MATE_SW_VERSION"

                    # Since controller-1 is always upgraded first (and downgraded
                    # last), we know that controller-1 is running a higher release
                    # than controller-0.
                    # This controller is not active and is running a higher
                    # release than the mate controller, so do not launch
                    # any of the worker services (they will not work with
                    # a lower version of the controller services).
                    echo "Disabling worker services until controller activated"
                    touch $VOLATILE_DISABLE_WORKER_SERVICES

                    # Copy $PLATFORM_DIR into a temporary location for the worker_services script to
                    # access. This is only required for CPE upgrades
                    rm -rf $VOLATILE_PLATFORM_PATH
                    mkdir -p $VOLATILE_PLATFORM_PATH
                    cp -Rp $PLATFORM_DIR/* $VOLATILE_PLATFORM_PATH/

                fi
                umount $VOLATILE_ETC_PLATFORM_MOUNT
                rmdir $VOLATILE_ETC_PLATFORM_MOUNT
            else
                rmdir $VOLATILE_ETC_PLATFORM_MOUNT
                fatal_error "Unable to mount /etc/platform"
            fi
        else
            # Controller-1 (CPE) is active and is rebooting. This is probably a DOR. Since this
            # could happen during an upgrade, we will copy $PLATFORM_DIR into a temporary
            # location for the worker_services script to access in case of a future swact.
            rm -rf $VOLATILE_PLATFORM_PATH
            mkdir -p $VOLATILE_PLATFORM_PATH
            cp -Rp $PLATFORM_DIR/* $VOLATILE_PLATFORM_PATH/
        fi
    fi

    # Apply the puppet manifest; the per-host hiera file is named after
    # the IP address resolved above.
    HOST_HIERA=${PUPPET_PATH}/hieradata/${IPADDR}.yaml
    if [ -f ${HOST_HIERA} ]; then
        echo "$0: Running puppet manifest apply"
        puppet-manifest-apply.sh ${PUPPET_PATH}/hieradata ${IPADDR} worker
        RC=$?
        if [ $RC -ne 0 ];
        then
            fatal_error "Failed to run the puppet manifest (RC:$RC)"
        fi
    else
        fatal_error "Host configuration not yet available for this node ($(hostname)=${IPADDR}); aborting configuration."
    fi

    # Load Network Block Device; failure is reported but not fatal
    modprobe nbd
    if [ $? -ne 0 ]
    then
        echo "WARNING: Unable to load kernel module: nbd."
        logger "WARNING: Unable to load kernel module: nbd."
    fi

    # Run mount command to mount any NFS filesystems that required network access
    /bin/mount -a -t nfs
    RC=$?
    if [ $RC -ne 0 ]
    then
        fatal_error "Unable to mount NFS filesystems (RC:$RC)"
    fi

    # Every step succeeded: leave the pass flag behind
    touch $VOLATILE_CONFIG_PASS
}

# Handler for the "stop" action. This script has nothing to tear down, so
# the function only hands the caller's current exit status straight back.
stop()
{
    return $?
}

# Dispatch on the requested init action; anything other than start or
# stop prints the usage line and exits with status 1.
if [ "$1" = "start" ]
then
    start
elif [ "$1" = "stop" ]
then
    stop
else
    echo "Usage: $0 {start|stop}"
    exit 1
fi

exit 0