Merge "Ensure agent is ready before issuing runtime config"

This commit is contained in:
Zuul 2021-03-01 15:14:07 +00:00 committed by Gerrit Code Review
commit 53c6bed5b2
5 changed files with 412 additions and 72 deletions

View File

@ -1,6 +1,6 @@
#! /bin/sh
#
# Copyright (c) 2013-2014 Wind River Systems, Inc.
# Copyright (c) 2013-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -29,6 +29,7 @@ SYSINV_CONF_DIR="/etc/sysinv"
SYSINV_CONF_FILE="${SYSINV_CONF_DIR}/sysinv.conf"
SYSINV_CONF_DEFAULT_FILE="/opt/platform/sysinv/${SW_VERSION}/sysinv.conf.default"
SYSINV_READY_FLAG=/var/run/.sysinv_ready
SYSINV_REPORTED_FLAG=/var/run/sysinv/.sysinv_reported
DELAY_SEC=20
@ -62,11 +63,11 @@ function mount_and_copy_config_file()
if [ ${RETVAL} -ne 0 ] ; then
logger "$0: Warn: nfs-mount controller:/opt/platform/sysinv/${SW_VERSION} /mnt/sysinv"
else
mkdir -p $SYSINV_CONF_DIR
mkdir -p $SYSINV_CONF_DIR
cp /mnt/sysinv/sysinv.conf.default ${SYSINV_CONF_FILE}
RETVAL=$?
RETVAL=$?
if [ $? -ne 0 ] ; then
logger "$0: Warn: cp /mnt/sysinv/sysinv.conf.default ${SYSINV_CONF_FILE}"
logger "$0: Warn: cp /mnt/sysinv/sysinv.conf.default ${SYSINV_CONF_FILE}"
fi
timeout 5s umount /mnt/sysinv
rmdir /mnt/sysinv
@ -84,18 +85,20 @@ case "$1" in
exit 1
fi
# if [ "$NODETYPE" = "compute" ] ; then
# if [ "$NODETYPE" = "compute" ] || [ "$NODETYPE" = "controller" ] ; then
echo -n "Setting up config for sysinv-agent: "
if [ -e ${SYSINV_READY_FLAG} ] ; then
echo -n "Setting up config for sysinv-agent: "
if [ -e ${SYSINV_READY_FLAG} ] ; then
# clear it on every restart, so agent can update it
rm -f ${SYSINV_READY_FLAG}
fi
if [ -f ${SYSINV_CONF_FILE} ] ; then
logger "$0: ${SYSINV_CONF_FILE} already exists"
RETVAL=0
else
if [ -e ${SYSINV_REPORTED_FLAG} ] ; then
# clear it on every restart, so agent can update it
rm -f ${SYSINV_REPORTED_FLAG}
fi
if [ -f ${SYSINV_CONF_FILE} ] ; then
logger "$0: ${SYSINV_CONF_FILE} already exists"
RETVAL=0
else
# Avoid self-mount due to potential nfs issues
echo "Checking for controller-platform-nfs "
@ -120,20 +123,20 @@ case "$1" in
echo "controller-platform-nfs is not available"
else
# Only required if conf file does not already exist
if [ -f ${SYSINV_CONF_DEFAULT_FILE} ]
then
echo "Copying self sysinv.conf without mount"
mkdir -p $SYSINV_CONF_DIR
cp ${SYSINV_CONF_DEFAULT_FILE} ${SYSINV_CONF_FILE}
RETVAL=$?
if [ $? -ne 0 ] ; then
if [ -f ${SYSINV_CONF_DEFAULT_FILE} ]
then
echo "Copying self sysinv.conf without mount"
mkdir -p $SYSINV_CONF_DIR
cp ${SYSINV_CONF_DEFAULT_FILE} ${SYSINV_CONF_FILE}
RETVAL=$?
if [ $? -ne 0 ] ; then
logger "$0: Warn: cp /mnt/sysinv/sysinv.conf.default ${SYSINV_CONF_FILE} failed. Try mount."
else
CONF_COPIED=1
fi
fi
fi
if [ ${CONF_COPIED} -eq 0 ]
then
then
CONF_COPY_COUNT=0
while [ $CONF_COPY_COUNT -lt 3 ]; do
if mount_and_copy_config_file ;
@ -144,52 +147,48 @@ case "$1" in
let CONF_COPY_COUNT=CONF_COPY_COUNT+1
logger "$0: Warn: Mount and copy config file failed. Attempt: ${CONF_COPY_COUNT}"
done
fi
fi
fi
fi
fi
echo -n "Installing virtio_net driver: "
timeout 5s modprobe virtio_net
RETVAL=$?
if [ ${RETVAL} -eq 0 ] ; then
echo "OK"
else
echo "FAIL"
fi
echo -n "Installing virtio_net driver: "
timeout 5s modprobe virtio_net
RETVAL=$?
if [ ${RETVAL} -eq 0 ] ; then
echo "OK"
else
echo "FAIL"
fi
if [ -e ${daemon_pidfile} ] ; then
echo "Killing existing process before starting new"
pid=`cat ${daemon_pidfile}`
kill -TERM $pid
rm -f ${daemon_pidfile}
fi
if [ -e ${daemon_pidfile} ] ; then
echo "Killing existing process before starting new"
pid=`cat ${daemon_pidfile}`
kill -TERM $pid
rm -f ${daemon_pidfile}
fi
echo -n "Starting sysinv-agent: "
/bin/sh -c "${SYSINVAGENT}"' >> /dev/null 2>&1 & echo $!' > ${daemon_pidfile}
RETVAL=$?
if [ $RETVAL -eq 0 ] ; then
echo "OK"
touch /var/lock/subsys/${DAEMON_NAME}
else
echo "FAIL"
fi
# fi
echo -n "Starting sysinv-agent: "
/bin/sh -c "${SYSINVAGENT}"' >> /dev/null 2>&1 & echo $!' > ${daemon_pidfile}
RETVAL=$?
if [ $RETVAL -eq 0 ] ; then
echo "OK"
touch /var/lock/subsys/${DAEMON_NAME}
else
echo "FAIL"
fi
;;
stop)
# if [ "$NODETYPE" = "compute" ] ; then
# if [ "$NODETYPE" = "compute" ] || [ "$NODETYPE" = "controller" ] ; then
echo -n "Stopping sysinv-agent: "
if [ -e ${daemon_pidfile} ] ; then
pid=`cat ${daemon_pidfile}`
kill -TERM $pid
rm -f ${daemon_pidfile}
rm -f /var/lock/subsys/${DAEMON_NAME}
echo "OK"
else
echo "FAIL"
fi
# fi
echo -n "Stopping sysinv-agent: "
if [ -e ${daemon_pidfile} ] ; then
pid=`cat ${daemon_pidfile}`
kill -TERM $pid
rm -f ${daemon_pidfile}
rm -f /var/lock/subsys/${DAEMON_NAME}
echo "OK"
else
echo "FAIL"
fi
;;
restart)

View File

@ -94,8 +94,6 @@ CONF.register_opts(agent_opts, 'agent')
MAXSLEEP = 300 # 5 minutes
SYSINV_READY_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_ready")
SYSINV_FIRST_REPORT_FLAG = os.path.join(tsc.VOLATILE_PATH,
".sysinv_agent_report_sent")
CONFIG_APPLIED_FILE = os.path.join(tsc.PLATFORM_CONF_PATH, ".config_applied")
CONFIG_APPLIED_DEFAULT = "install"
@ -209,6 +207,10 @@ class AgentManager(service.PeriodicService):
initial_reports_required = \
self.INVENTORY_REPORTS_REQUIRED - self._inventory_reported
initial_reports_required.discard(self.HOST_FILESYSTEMS)
if self._inventory_reported:
utils.touch(constants.SYSINV_REPORTED)
if initial_reports_required:
LOG.info("_report_to_conductor initial_reports_required=%s" %
initial_reports_required)
@ -218,7 +220,7 @@ class AgentManager(service.PeriodicService):
def _report_to_conductor_iplatform_avail(self):
# First report sent to conductor since boot
utils.touch(SYSINV_FIRST_REPORT_FLAG)
utils.touch(constants.SYSINV_FIRST_REPORT_FLAG)
# Sysinv-agent ready; used also by the init script.
utils.touch(SYSINV_READY_FLAG)
time.sleep(1) # give time for conductor to process
@ -553,7 +555,7 @@ class AgentManager(service.PeriodicService):
# Is this the first time since boot we are reporting to conductor?
msg_dict.update({constants.SYSINV_AGENT_FIRST_REPORT:
not os.path.exists(SYSINV_FIRST_REPORT_FLAG)})
not os.path.exists(constants.SYSINV_FIRST_REPORT_FLAG)})
try:
rpcapi.iplatform_update_by_ihost(context,

View File

@ -1329,6 +1329,11 @@ if tox_work_dir:
else:
SYSINV_VOLATILE_PATH = os.path.join(tsc.VOLATILE_PATH, "sysinv")
SYSINV_FIRST_REPORT_FLAG = os.path.join(SYSINV_VOLATILE_PATH,
".sysinv_agent_first_report_sent")
SYSINV_REPORTED = os.path.join(SYSINV_VOLATILE_PATH,
".sysinv_reported")
NETWORK_CONFIG_LOCK_FILE = os.path.join(
tsc.VOLATILE_PATH, "apply_network_config.lock")

View File

@ -169,6 +169,10 @@ ACTIVE_CONFIG_REBOOT_REQUIRED = os.path.join(
# configuration UUID reboot required flag (bit)
CONFIG_REBOOT_REQUIRED = (1 << 127)
# Types of runtime configuration applies
CONFIG_APPLY_RUNTIME_MANIFEST = 'config_apply_runtime_manifest'
CONFIG_UPDATE_FILE = 'config_update_file'
LOCK_NAME_UPDATE_CONFIG = 'update_config_'
LOCK_AUTO_APPLY = 'AutoApplyLock'
@ -212,6 +216,9 @@ class ConductorManager(service.PeriodicService):
# this will track the config w/ reboot request to apply
self._host_reboot_config_uuid = {}
# track deferred runtime config which need to be applied
self._host_deferred_runtime_config = []
def start(self):
self._start()
# accept API calls and run periodic tasks after
@ -5263,6 +5270,68 @@ class ConductorManager(service.PeriodicService):
'install_state_info':
host.install_state_info})
def _ready_to_apply_runtime_config(
        self, context, personalities=None, host_uuids=None):
    """Return True when a runtime config may be applied right now.

    Only the active controller is gated here: we do not want to block a
    runtime manifest apply because of other hosts.  The config target
    still tracks any missed config on hosts that are unavailable.

    :param context: request context (unused here, kept for API symmetry)
    :param personalities: personalities targeted by the config
    :param host_uuids: host uuids targeted by the config
    """
    personalities = [] if personalities is None else personalities
    host_uuids = [] if host_uuids is None else host_uuids

    # The readiness gate only matters when this host is in the target
    # set of the config being applied.
    gate_this_host = False
    if constants.CONTROLLER in personalities:
        gate_this_host = True
    if constants.WORKER in personalities and cutils.is_aio_system(self.dbapi):
        # AIO controllers also carry the worker personality.
        gate_this_host = True
    if host_uuids and self.host_uuid not in host_uuids:
        gate_this_host = False

    if not gate_this_host:
        return True

    # The agent touches this flag once it has reported its inventory;
    # until then, hold off issuing runtime configuration.
    if os.path.exists(constants.SYSINV_REPORTED):
        return True
    LOG.info("_ready_to_apply_runtime_config path does not exist: %s" %
             constants.SYSINV_REPORTED)
    return False
def _audit_deferred_runtime_config(self, context):
    """Re-issue deferred runtime config requests once the agent is ready.

    Each queued entry is replayed through the appropriate apply path and
    then dropped from the queue; unsupported entry types are logged and
    dropped as well.
    """
    LOG.debug("_audit_deferred_runtime_config %s" %
              self._host_deferred_runtime_config)

    if not self._ready_to_apply_runtime_config(context):
        # Agent has not reported yet; leave the queue untouched.
        return
    if not self._host_deferred_runtime_config:
        return

    # Walk a snapshot so entries can be removed while iterating.
    for deferred in list(self._host_deferred_runtime_config):
        kind = deferred.get('config_type')
        LOG.info("found _audit_deferred_runtime_config request apply %s" %
                 deferred)
        if kind == CONFIG_APPLY_RUNTIME_MANIFEST:
            self._config_apply_runtime_manifest(
                context,
                deferred['config_uuid'],
                deferred['config_dict'],
                force=deferred.get('force', False))
        elif kind == CONFIG_UPDATE_FILE:
            self._config_update_file(
                context,
                deferred['config_uuid'],
                deferred['config_dict'])
        else:
            LOG.error("Removing unsupported deferred config_type %s" %
                      kind)
        # Drop the processed (or unsupported) entry from the queue.
        self._host_deferred_runtime_config.remove(deferred)
@periodic_task.periodic_task(spacing=CONF.conductor.audit_interval)
def _kubernetes_local_secrets_audit(self, context):
# Audit kubernetes local registry secrets info
@ -5275,6 +5344,9 @@ class ConductorManager(service.PeriodicService):
# periodically, perform audit of inventory
LOG.debug("Sysinv Conductor running periodic audit task.")
# check whether there are deferred runtime manifests to apply
self._audit_deferred_runtime_config(context)
# check whether we may have just become active with target config
self._controller_config_active_apply(context)
@ -9168,6 +9240,21 @@ class ConductorManager(service.PeriodicService):
: action_key: match key (for patch only)
: }
"""
if not self._ready_to_apply_runtime_config(
context,
config_dict.get('personalities'),
config_dict.get('host_uuids')):
# append to deferred for audit
self._host_deferred_runtime_config.append(
{'config_type': CONFIG_UPDATE_FILE,
'config_uuid': config_uuid,
'config_dict': config_dict,
})
LOG.info("defer update file to _host_deferred_runtime_config %s" %
self._host_deferred_runtime_config)
return
# Ensure hiera data is updated prior to active apply.
self._config_update_puppet(config_uuid, config_dict)
@ -9225,6 +9312,23 @@ class ConductorManager(service.PeriodicService):
else:
LOG.info("applying runtime manifest config_uuid=%s" % config_uuid)
# only apply runtime manifests to active controller if agent ready,
# otherwise will append to the list of outstanding runtime manifests
if not self._ready_to_apply_runtime_config(
context,
config_dict.get('personalities'),
config_dict.get('host_uuids')):
# append to deferred for audit
self._host_deferred_runtime_config.append(
{'config_type': CONFIG_APPLY_RUNTIME_MANIFEST,
'config_uuid': config_uuid,
'config_dict': config_dict,
'force': force,
})
LOG.info("defer apply runtime manifest %s" %
self._host_deferred_runtime_config)
return
# Update hiera data for all hosts prior to runtime apply if host_uuid
# is not set. If host_uuids is set only update hiera data for those hosts.
self._config_update_puppet(config_uuid,

View File

@ -24,8 +24,10 @@
import mock
import os.path
import tsconfig.tsconfig as tsc
import uuid
from sysinv.agent import rpcapi as agent_rpcapi
from sysinv.common import constants
from sysinv.common import device as dconstants
from sysinv.common import exception
@ -114,8 +116,19 @@ class ManagerTestCase(base.DbTestCase):
self.fail_config_apply_runtime_manifest = False
def mock_config_apply_runtime_manifest(obj, context, config_uuid,
config_dict, force=False):
# Mock ready to apply runtime config
self._ready_to_apply_runtime_config = True
self.ready_to_apply_runtime_config_patcher = mock.patch.object(
manager.ConductorManager, '_ready_to_apply_runtime_config')
self.mock_ready_to_apply_runtime_config = \
self.ready_to_apply_runtime_config_patcher.start()
self.mock_ready_to_apply_runtime_config.return_value = \
self._ready_to_apply_runtime_config
self.addCleanup(self.ready_to_apply_runtime_config_patcher.stop)
# Mock agent config_apply_runtime_manifest
def mock_agent_config_apply_runtime_manifest(obj, context, config_uuid,
config_dict):
if not self.fail_config_apply_runtime_manifest:
# Pretend the config was applied
if 'host_uuids' in config_dict:
@ -129,11 +142,40 @@ class ManagerTestCase(base.DbTestCase):
self.dbapi.ihost_update(
host.uuid, {'config_applied': config_uuid})
self.mocked_config_apply_runtime_manifest = mock.patch.object(
manager.ConductorManager, '_config_apply_runtime_manifest',
mock_config_apply_runtime_manifest)
self.mocked_config_apply_runtime_manifest.start()
self.addCleanup(self.mocked_config_apply_runtime_manifest.stop)
self.mocked_rpcapi_config_apply_runtime_manifest = mock.patch.object(
agent_rpcapi.AgentAPI, 'config_apply_runtime_manifest',
mock_agent_config_apply_runtime_manifest)
self.mocked_rpcapi_config_apply_runtime_manifest.start()
self.addCleanup(self.mocked_rpcapi_config_apply_runtime_manifest.stop)
self.fail_config_apply_runtime_manifest = False
# Mock agent iconfig_update_file
def mock_agent_iconfig_update_file(obj, context, iconfig_uuid, iconfig_dict):
if not self.fail_config_apply_runtime_manifest:
# Simulate the config was applied
if 'host_uuids' in iconfig_dict:
for host_uuid in iconfig_dict['host_uuids']:
self.dbapi.ihost_update(host_uuid,
{'config_applied': iconfig_uuid})
else:
for personality in iconfig_dict['personalities']:
hosts = self.dbapi.ihost_get_by_personality(personality)
for host in hosts:
self.dbapi.ihost_update(
host.uuid, {'config_applied': iconfig_uuid})
self.mocked_rpcapi_iconfig_update_file = mock.patch.object(
agent_rpcapi.AgentAPI, 'iconfig_update_file',
mock_agent_iconfig_update_file)
self.mocked_rpcapi_iconfig_update_file.start()
self.addCleanup(self.mocked_rpcapi_iconfig_update_file.stop)
self.mocked_is_initial_config_complete = mock.patch.object(
cutils, 'is_initial_config_complete')
self.mocked_is_initial_config_complete.start()
self.mocked_is_initial_config_complete.return_value = True
self.addCleanup(self.mocked_is_initial_config_complete.stop)
# Mock subprocess popen
self.fake_subprocess_popen = FakePopen()
@ -1333,6 +1375,194 @@ class ManagerTestCase(base.DbTestCase):
self.service.iconfig_update_by_ihost(self.context, ihost['uuid'], imsg_dict)
self.assertEqual(self.alarm_raised, True)
def fake_rename(self, old, new):
self.executes.append(('mv', old, new))
@staticmethod
def scope_open(*args, **kwargs):
    """Open stub: fake resolv.conf under CONFIG_PATH, real open otherwise."""
    filename = args[0]
    if filename.startswith(
            os.path.join(tsc.CONFIG_PATH, 'resolv.conf')):
        # Hand back a context manager whose file yields canned contents.
        stub_file = mock.Mock()
        stub_file.read.return_value = "lorem ipsum"
        stub_cm = mock.MagicMock()
        stub_cm.__enter__.return_value = stub_file
        stub_cm.__exit__.return_value = None
        return stub_cm
    # Anything else goes to the real builtin open.
    return open(*args, **kwargs)
def test_deferred_runtime_config_file(self):
"""A config-file update is deferred while the agent is not ready, then applied by the audit."""
# Create controller-0
config_uuid = str(uuid.uuid4())
chost = self._create_test_ihost(
personality=constants.CONTROLLER,
hostname='controller-0',
uuid=str(uuid.uuid4()),
config_status=None,
config_applied=config_uuid,
config_target=config_uuid,
invprovision=constants.PROVISIONED,
administrative=constants.ADMIN_UNLOCKED,
operational=constants.OPERATIONAL_ENABLED,
availability=constants.AVAILABILITY_ONLINE,
)
# create test dns nameservers config
utils.create_test_dns(forisystemid=self.system.id,
nameservers='8.8.8.8,8.8.4.4')
cutils.gethostbyname = mock.Mock(return_value='192.168.204.2')
self.executes = []
self.stub_out('os.rename', self.fake_rename)
# These mocks for builtin open are needed for py27 and py3 compatibility
mock_trace_caller = mock.MagicMock()
p = mock.patch(
'traceback.format_stack',
mock_trace_caller)
p.start()
# NOTE(review): this sets return_value on the patcher object `p`, not on
# mock_trace_caller — looks unintentional; confirm whether
# mock_trace_caller.return_value was meant.
p.return_value = ['one', 'two', 'three']
self.addCleanup(p.stop)
mock_open = mock.mock_open()
with mock.patch('six.moves.builtins.open', mock_open):
mock_open.side_effect = self.scope_open
# Agent not ready: update_dns_config must defer its config update.
self.mock_ready_to_apply_runtime_config.return_value = False
self.service.update_dns_config(self.context)
chost_updated = self.dbapi.ihost_get(chost.uuid)
# Verify that the config is updated and alarm is raised
self.assertNotEqual(chost_updated.config_applied,
chost_updated.config_target)
self.assertEqual(self.alarm_raised, True)
# Agent ready: the audit replays the deferred config update.
self.mock_ready_to_apply_runtime_config.return_value = True
self.service._audit_deferred_runtime_config(self.context)
# Simulate agent update
chost_updated = self.dbapi.ihost_get(chost.uuid)
self.service._update_host_config_applied(
self.context, chost_updated, chost_updated.config_applied)
# Verify the config is up to date.
self.assertEqual(chost_updated.config_target,
chost_updated.config_applied)
self.assertEqual(self.alarm_raised, False)
def test_deferred_runtime_config_manifest(self):
    """Runtime manifest is deferred while agent not ready, applied on audit."""
    # Provision an unlocked/enabled controller-0 with config in sync.
    in_sync_uuid = str(uuid.uuid4())
    controller = self._create_test_ihost(
        personality=constants.CONTROLLER,
        hostname='controller-0',
        uuid=str(uuid.uuid4()),
        config_status=None,
        config_applied=in_sync_uuid,
        config_target=in_sync_uuid,
        invprovision=constants.PROVISIONED,
        administrative=constants.ADMIN_UNLOCKED,
        operational=constants.OPERATIONAL_ENABLED,
        availability=constants.AVAILABILITY_ONLINE,
    )

    # Agent not ready: the user-config runtime manifest must be deferred.
    self.mock_ready_to_apply_runtime_config.return_value = False
    self.service.update_user_config(self.context)

    refreshed = self.dbapi.ihost_get(controller.uuid)
    # Target moved ahead of applied and the out-of-date alarm fired.
    self.assertNotEqual(refreshed.config_applied,
                        refreshed.config_target)
    self.assertEqual(self.alarm_raised, True)

    # Agent ready: the audit replays the deferred manifest.
    self.mock_ready_to_apply_runtime_config.return_value = True
    self.service._audit_deferred_runtime_config(self.context)

    # Simulate the agent reporting the applied config back.
    refreshed = self.dbapi.ihost_get(controller.uuid)
    self.service._update_host_config_applied(
        self.context, refreshed, refreshed.config_applied)

    # Config is back in sync and the alarm is cleared.
    self.assertEqual(refreshed.config_target,
                     refreshed.config_applied)
    self.assertEqual(self.alarm_raised, False)
def test_deferred_multiple_runtime_config(self):
"""Two runtime configs deferred back-to-back are both applied by one audit pass."""
# Create controller-0
config_uuid = str(uuid.uuid4())
chost = self._create_test_ihost(
personality=constants.CONTROLLER,
hostname='controller-0',
uuid=str(uuid.uuid4()),
config_status=None,
config_applied=config_uuid,
config_target=config_uuid,
invprovision=constants.PROVISIONED,
administrative=constants.ADMIN_UNLOCKED,
operational=constants.OPERATIONAL_ENABLED,
availability=constants.AVAILABILITY_ONLINE,
)
# create test dns nameservers config
utils.create_test_dns(forisystemid=self.system.id,
nameservers='8.8.8.8,8.8.4.4')
cutils.gethostbyname = mock.Mock(return_value='192.168.204.2')
self.executes = []
self.stub_out('os.rename', self.fake_rename)
# These mocks for builtin open are needed for py27 and py3 compatibility
mock_trace_caller = mock.MagicMock()
p = mock.patch(
'traceback.format_stack',
mock_trace_caller)
p.start()
# NOTE(review): sets return_value on the patcher `p`, not on
# mock_trace_caller — confirm whether mock_trace_caller.return_value
# was intended.
p.return_value = ['one', 'two', 'three']
self.addCleanup(p.stop)
mock_open = mock.mock_open()
with mock.patch('six.moves.builtins.open', mock_open):
mock_open.side_effect = self.scope_open
# Attempt to apply a runtime config, which is deferred
self.mock_ready_to_apply_runtime_config.return_value = False
self.service.update_dns_config(self.context)
c1host_updated = self.dbapi.ihost_get(chost.uuid)
# Verify that the config is updated and alarm is raised
self.assertNotEqual(c1host_updated.config_applied,
c1host_updated.config_target)
self.assertEqual(self.alarm_raised, True)
# Attempt another runtime config, which is also deferred
self.service.update_user_config(self.context)
c2host_updated = self.dbapi.ihost_get(chost.uuid)
# Verify that the target is updated and alarm is still raised
self.assertNotEqual(c1host_updated.config_target,
c2host_updated.config_target)
# NOTE(review): this compares a value to itself and can never fail;
# presumably one side was meant to be c2host_updated.config_applied.
self.assertEqual(c1host_updated.config_applied,
c1host_updated.config_applied)
self.assertNotEqual(c2host_updated.config_applied,
c2host_updated.config_target)
self.assertEqual(self.alarm_raised, True)
# Run the audit for deferred runtime config
self.mock_ready_to_apply_runtime_config.return_value = True
self.service._audit_deferred_runtime_config(self.context)
# Simulate agent update
chost_updated = self.dbapi.ihost_get(chost.uuid)
self.service._update_host_config_applied(
self.context, chost_updated, chost_updated.config_applied)
# Verify the config is up to date.
self.assertEqual(chost_updated.config_target,
chost_updated.config_applied)
self.assertEqual(self.alarm_raised, False)
def _raise_alarm(self, fault):
# Test hook standing in for the FM API: records that an alarm was
# raised; the fault payload itself is ignored here.
self.alarm_raised = True