Report port and device inventory after the worker manifest

The SR-IOV configuration of a device is not retained across reboots,
until puppet manifests bind/enable completes. The sysinv-agent should
not report device inventory at any time after it is started, it should
wait until puppet worker manifest completes. Though during bootstrap
(fresh install), restore, network-boot and subsequent reboots in case
of non-worker roles (controller, storage) sysinv-agent can report at
any time it is started.

Upon reboot, SR-IOV configuration (of ACC100) (sriov_numvfs=0) is
updated to intended configuration by puppet worker manifest. In this
case, there is a small chance that the sysinv-agent audit (every 60
seconds) will run before the driver configuration. Since the agent will
only actually report the port and device inventory once, the SR-IOV
configuration data is not accurately reflected in the db, thus
requiring additional lock/unlock(s) to force correction.

After fresh-install/restore/network-boot and reboot, there was no
/etc/platform/.initial_worker_config_complete and
/var/run/.worker_config_complete files until puppet worker manifest
completes. sysinv-agent audit happened to read device inventory before
the driver configuration (i.e. before worker manifest completed), thus
not accurately reflected in the db.

This commit fixes such that port and device configuration are only
reported after the worker manifest has completed, in case the host is
being configured as worker subfunction.

TEST PLAN:
   PASS: Fresh install node (that has ACC100 device) AIO, check
         host-device-list/show (before config/unlock) to see
         ACC100 device config:: driver:None, vf-driver:None, N:0.

   PASS: After above, update config (ACC100 device config::
         driver:igb_uio, vf-driver:igb_uio, N:1) and also use
         host-label-assign as sriovdp=enabled and unlock, for
         subsequent reboots validate device config as
         (driver:igb_uio, vf-driver:igb_uio, N:1) and validate
         content of /etc/pcidp/config.json.

   PASS: Restore node from backup (ACC100 device config::
         driver:igb_uio, vf-driver:igb_uio, N:1 and also
         host-label-assing as sriovdp=enabled), once node
         come back up, check host-device-list/show for after-boot
         update time and num_vfs = 1. Also validate content of
         /etc/pcidp/config.json.

    PASS: In AIO-DX setup, ports and devices can be listed and
         and second worker node can be unlocked, after the
         network-boot.

Closes-Bug: 2053149
Change-Id: I69d483041bd75ea0abbd68cedccfbc5f10062c75
Signed-off-by: Tara Nath Subedi <tara.subedi@windriver.com>
This commit is contained in:
Tara Subedi 2024-02-08 14:51:50 -05:00
parent 1a2fea6846
commit 9c3bf050cd
3 changed files with 126 additions and 13 deletions

View File

@ -115,8 +115,7 @@ SYSINV_READY_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_ready")
CONFIG_APPLIED_FILE = os.path.join(tsc.PLATFORM_CONF_PATH, ".config_applied")
CONFIG_APPLIED_DEFAULT = "install"
FIRST_BOOT_FLAG = os.path.join(
tsc.PLATFORM_CONF_PATH, ".first_boot")
FIRST_BOOT_FLAG = constants.FIRST_BOOT_FLAG
PUPPET_HIERADATA_PATH = os.path.join(tsc.PUPPET_PATH, 'hieradata')
PUPPET_HIERADATA_CACHE_PATH = '/etc/puppet/cache/hieradata'
@ -225,6 +224,7 @@ class AgentManager(service.PeriodicService):
self._first_grub_update = False
self._inventoried_initial = False
self._inventory_reported = set()
self._first_boot_flag = os.path.exists(FIRST_BOOT_FLAG)
def start(self):
super(AgentManager, self).start()
@ -611,20 +611,29 @@ class AgentManager(service.PeriodicService):
def _get_ports_inventory(self):
"""Collect ports inventory for this host"""
# During bootstrap (fresh install and also restore), and network first-boot
# send report unconditionally
# During host configuration (subsequent reboots):
# worker in subfunction : do not send report until worker manifest applied
# otherwise: send report unconditionally
port_list = []
pci_device_list = []
host_macs = []
initial_worker_config_completed = \
os.path.exists(tsc.INITIAL_WORKER_CONFIG_COMPLETE)
ansible_bootstrap_flag = \
os.path.exists(constants.ANSIBLE_BOOTSTRAP_FLAG)
worker_config_completed = \
os.path.exists(tsc.VOLATILE_WORKER_CONFIG_COMPLETE)
# do not send report if the initial worker config is completed and
# worker config has not finished, i.e.during subsequent
# reboot before the manifest enables and binds any SR-IOV devices
if (initial_worker_config_completed and
not worker_config_completed):
# do not send report if it is neither first boot nor
# ansible-bootstrap, and worker config has not finished,
# i.e. during subsequent reboot before the worker manifest
# enables and binds any SR-IOV devices
if (not self._first_boot_flag
and not ansible_bootstrap_flag
and constants.WORKER in self.subfunctions_list_get()
and not worker_config_completed):
return port_list, pci_device_list, host_macs
# find list of network related inics for this host

View File

@ -2109,6 +2109,9 @@ KUBE_POWER_MANAGER_VALUE = 'enabled'
# Default DNS service domain
DEFAULT_DNS_SERVICE_DOMAIN = 'cluster.local'
# First boot
FIRST_BOOT_FLAG = os.path.join(tsc.PLATFORM_CONF_PATH, ".first_boot")
# Ansible bootstrap
ANSIBLE_BOOTSTRAP_FLAG = os.path.join(tsc.VOLATILE_PATH, ".ansible_bootstrap")
ANSIBLE_BOOTSTRAP_COMPLETED_FLAG = os.path.join(tsc.PLATFORM_CONF_PATH,

View File

@ -27,6 +27,7 @@ from sysinv.agent.pci import PCIOperator
from sysinv.agent.pci import PCI
from sysinv.agent.manager import AgentManager
from sysinv.tests import base
from sysinv.common import constants
from sysinv.common import fpga_constants
import tsconfig.tsconfig as tsc
@ -170,10 +171,11 @@ class TestAgentOperator(base.TestCase):
mock.patch.object(PCIOperator, 'pci_get_device_attrs'),
mock.patch.object(PCIOperator, 'inics_get'),
mock.patch.object(PCIOperator, 'pci_devices_get'),
mock.patch.object(AgentManager, 'subfunctions_list_get'),
mock.patch.object(AgentManager, '_acquire_network_config_lock'),
mock.patch.object(AgentManager, '_release_network_config_lock')) as (
mock_net_attrs, mock_device_attrs, mock_nics, mock_devices,
aquire_lock, release_lock):
mock_subfunctions, aquire_lock, release_lock):
def fake_get_net_attrs(a):
return FAKE_PORT_ATTRIBUTES
@ -187,26 +189,119 @@ class TestAgentOperator(base.TestCase):
def fake_get_devices():
return FAKE_DEVICES
def fake_subfunctions_list_get():
return self.subfunctions_list
mock_net_attrs.side_effect = fake_get_net_attrs
mock_device_attrs.side_effect = fake_get_device_attrs
mock_nics.side_effect = fake_get_nics
mock_devices.side_effect = fake_get_devices
mock_subfunctions.side_effect = fake_subfunctions_list_get
ports, devices, macs = self.agent_manager._get_ports_inventory()
return ports, devices, macs
@mock.patch('os.path.exists')
def test_get_pci_inventory_before_worker_initial_config_complete(self, mock_exists):
def test_get_pci_inventory_during_network_boot_install(self, mock_exists):
def file_exists_side_effect(filename):
# Neither the initial nor volatile worker config complete flags are set
return False
if filename == constants.FIRST_BOOT_FLAG:
# first boot flag is set.
return True
else:
# ansible bootstrap flag is not set.
# Neither the initial nor volatile worker config complete flags are set
return False
self.agent_manager._first_boot_flag = True
mock_exists.side_effect = file_exists_side_effect
self.subfunctions_list = [constants.WORKER]
ports, devices, macs = self._get_ports_inventory()
assert len(ports) == 1
assert len(devices) == 1
assert len(macs) == 1
@mock.patch('os.path.exists')
def test_get_pci_inventory_during_bootstrap_fresh_install(self, mock_exists):
def file_exists_side_effect(filename):
if filename == constants.ANSIBLE_BOOTSTRAP_FLAG:
# ansible bootstrap flag is set.
return True
else:
# Neither the initial nor volatile worker config complete flags are set
return False
mock_exists.side_effect = file_exists_side_effect
self.subfunctions_list = [constants.WORKER]
ports, devices, macs = self._get_ports_inventory()
assert len(ports) == 1
assert len(devices) == 1
assert len(macs) == 1
@mock.patch('os.path.exists')
def test_get_pci_inventory_during_restore(self, mock_exists):
def file_exists_side_effect(filename):
if filename in [constants.ANSIBLE_BOOTSTRAP_FLAG,
tsc.RESTORE_IN_PROGRESS_FLAG]:
# Both ansible bootstrap and restore flag are set.
return True
else:
# Neither the initial nor volatile worker config complete flags are set
return False
mock_exists.side_effect = file_exists_side_effect
self.subfunctions_list = [constants.WORKER]
ports, devices, macs = self._get_ports_inventory()
assert len(ports) == 1
assert len(devices) == 1
assert len(macs) == 1
@mock.patch('os.path.exists')
def test_get_pci_inventory_controller_before_config_complete(self, mock_exists):
def file_exists_side_effect(filename):
# Neither the initial nor volatile controller config complete flags are set
# Neither first boot nor ansible bootstrap flag are not set.
return False
mock_exists.side_effect = file_exists_side_effect
self.subfunctions_list = [constants.CONTROLLER]
ports, devices, macs = self._get_ports_inventory()
assert len(ports) == 1
assert len(devices) == 1
assert len(macs) == 1
@mock.patch('os.path.exists')
def test_get_pci_inventory_storage_before_config_complete(self, mock_exists):
def file_exists_side_effect(filename):
# Neither the initial nor volatile storage config complete flags are set
# Neither first boot nor ansible bootstrap flag are not set.
return False
mock_exists.side_effect = file_exists_side_effect
self.subfunctions_list = [constants.STORAGE]
ports, devices, macs = self._get_ports_inventory()
assert len(ports) == 1
assert len(devices) == 1
assert len(macs) == 1
@mock.patch('os.path.exists')
def test_get_pci_inventory_before_worker_initial_config_complete(self, mock_exists):
def file_exists_side_effect(filename):
# Neither the initial nor volatile worker config complete flags are set
# Neither first boot nor ansible bootstrap flag are not set.
return False
mock_exists.side_effect = file_exists_side_effect
self.subfunctions_list = [constants.WORKER]
ports, devices, macs = self._get_ports_inventory()
assert len(ports) == 0
assert len(devices) == 0
assert len(macs) == 0
@mock.patch('os.path.exists')
def test_get_pci_inventory_before_worker_config_complete(self, mock_exists):
def file_exists_side_effect(filename):
@ -214,8 +309,11 @@ class TestAgentOperator(base.TestCase):
# Only the initial worker config complete flag is set
return True
else:
# worker config complete is not set.
# Neither first boot nor ansible bootstrap flag are set.
return False
mock_exists.side_effect = file_exists_side_effect
self.subfunctions_list = [constants.WORKER]
ports, devices, macs = self._get_ports_inventory()
assert len(ports) == 0
@ -230,8 +328,10 @@ class TestAgentOperator(base.TestCase):
# Both of the initial and volatile worker config complete flags are set
return True
else:
# Neither first boot nor ansible bootstrap flag are set.
return False
mock_exists.side_effect = file_exists_side_effect
self.subfunctions_list = [constants.WORKER]
ports, devices, macs = self._get_ports_inventory()
for dev in devices:
@ -250,6 +350,7 @@ class TestAgentOperator(base.TestCase):
else:
return False
mock_exists.side_effect = file_exists_side_effect
self.subfunctions_list = [constants.WORKER]
ports, devices, macs = self._get_ports_inventory()
for dev in devices: