From 9c3bf050cd57916325a3e7218a4816ba575b63e4 Mon Sep 17 00:00:00 2001 From: Tara Subedi Date: Thu, 8 Feb 2024 14:51:50 -0500 Subject: [PATCH] Report port and device inventory after the worker manifest The SR-IOV configuration of a device is not retained across reboots, until puppet manifests bind/enable completes. The sysinv-agent should not report device inventory at any time after it is started, it should wait until puppet worker manifest completes. Though during bootstrap (fresh install), restore, network-boot and subsequent reboots in case of non-worker roles (controller, storage) sysinv-agent can report at any time it is started. Upon reboot, SR-IOV configuration (of ACC100) (sriov_numvfs=0) is updated to intended configuration by puppet worker manifest. In this case, there is a small chance that the sysinv-agent audit (every 60 seconds) will run before the driver configuration. Since the agent will only actually report the port and device inventory once, the SR-IOV configuration data is not accurately reflected in the db, thus requiring additional lock/unlock(s) to force correction. After fresh-install/restore/network-boot and reboot, there was no /etc/platform/.initial_worker_config_complete and /var/run/.worker_config_complete files until puppet worker manifest completes. sysinv-agent audit happened to read device inventory before the driver configuration (i.e. before worker manifest completed), thus not accurately reflected in the db. This commit fixes such that port and device configuration are only reported after the worker manifest has completed, in case the host is being configured as worker subfunction. TEST PLAN: PASS: Fresh install node (that has ACC100 device) AIO, check host-device-list/show (before config/unlock) to see ACC100 device config:: driver:None, vf-driver:None, N:0. PASS: After above, update config (ACC100 device config:: driver:igb_uio, vf-driver:igb_uio, N:1) and also use host-label-assign as sriovdp=enabled and unlock, for subsequent reboots validate device config as (driver:igb_uio, vf-driver:igb_uio, N:1) and validate content of /etc/pcidp/config.json. PASS: Restore node from backup (ACC100 device config:: driver:igb_uio, vf-driver:igb_uio, N:1 and also host-label-assing as sriovdp=enabled), once node come back up, check host-device-list/show for after-boot update time and num_vfs = 1. Also validate content of /etc/pcidp/config.json. PASS: In AIO-DX setup, ports and devices can be listed and and second worker node can be unlocked, after the network-boot. Closes-Bug: 2053149 Change-Id: I69d483041bd75ea0abbd68cedccfbc5f10062c75 Signed-off-by: Tara Nath Subedi --- sysinv/sysinv/sysinv/sysinv/agent/manager.py | 27 +++-- .../sysinv/sysinv/sysinv/common/constants.py | 3 + .../sysinv/sysinv/tests/agent/test_pci.py | 109 +++++++++++++++++- 3 files changed, 126 insertions(+), 13 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/agent/manager.py b/sysinv/sysinv/sysinv/sysinv/agent/manager.py index 3b4fd41a4d..b14301baa2 100644 --- a/sysinv/sysinv/sysinv/sysinv/agent/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/agent/manager.py @@ -115,8 +115,7 @@ SYSINV_READY_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_ready") CONFIG_APPLIED_FILE = os.path.join(tsc.PLATFORM_CONF_PATH, ".config_applied") CONFIG_APPLIED_DEFAULT = "install" -FIRST_BOOT_FLAG = os.path.join( - tsc.PLATFORM_CONF_PATH, ".first_boot") +FIRST_BOOT_FLAG = constants.FIRST_BOOT_FLAG PUPPET_HIERADATA_PATH = os.path.join(tsc.PUPPET_PATH, 'hieradata') PUPPET_HIERADATA_CACHE_PATH = '/etc/puppet/cache/hieradata' @@ -225,6 +224,7 @@ class AgentManager(service.PeriodicService): self._first_grub_update = False self._inventoried_initial = False self._inventory_reported = set() + self._first_boot_flag = os.path.exists(FIRST_BOOT_FLAG) def start(self): super(AgentManager, self).start() @@ -611,20 +611,29 @@ class AgentManager(service.PeriodicService): def _get_ports_inventory(self): """Collect ports inventory for this host""" + # During bootstrap (fresh install and also restore), and network first-boot + # send report unconditionally + # During host configuration (subsequent reboots): + # worker in subfunction : do not send report until worker manifest applied + # otherwise: send report unconditionally + port_list = [] pci_device_list = [] host_macs = [] - initial_worker_config_completed = \ - os.path.exists(tsc.INITIAL_WORKER_CONFIG_COMPLETE) + ansible_bootstrap_flag = \ + os.path.exists(constants.ANSIBLE_BOOTSTRAP_FLAG) worker_config_completed = \ os.path.exists(tsc.VOLATILE_WORKER_CONFIG_COMPLETE) - # do not send report if the initial worker config is completed and - # worker config has not finished, i.e.during subsequent - # reboot before the manifest enables and binds any SR-IOV devices - if (initial_worker_config_completed and - not worker_config_completed): + # do not send report if it is neither first boot nor + # ansible-bootstrap, and worker config has not finished, + # i.e. during subsequent reboot before the worker manifest + # enables and binds any SR-IOV devices + if (not self._first_boot_flag + and not ansible_bootstrap_flag + and constants.WORKER in self.subfunctions_list_get() + and not worker_config_completed): return port_list, pci_device_list, host_macs # find list of network related inics for this host diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index 4d9f76825c..84ca1a52b0 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -2109,6 +2109,9 @@ KUBE_POWER_MANAGER_VALUE = 'enabled' # Default DNS service domain DEFAULT_DNS_SERVICE_DOMAIN = 'cluster.local' +# First boot +FIRST_BOOT_FLAG = os.path.join(tsc.PLATFORM_CONF_PATH, ".first_boot") + # Ansible bootstrap ANSIBLE_BOOTSTRAP_FLAG = os.path.join(tsc.VOLATILE_PATH, ".ansible_bootstrap") ANSIBLE_BOOTSTRAP_COMPLETED_FLAG = os.path.join(tsc.PLATFORM_CONF_PATH, diff --git a/sysinv/sysinv/sysinv/sysinv/tests/agent/test_pci.py b/sysinv/sysinv/sysinv/sysinv/tests/agent/test_pci.py index e167834b98..66ead09d56 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/agent/test_pci.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/agent/test_pci.py @@ -27,6 +27,7 @@ from sysinv.agent.pci import PCIOperator from sysinv.agent.pci import PCI from sysinv.agent.manager import AgentManager from sysinv.tests import base +from sysinv.common import constants from sysinv.common import fpga_constants import tsconfig.tsconfig as tsc @@ -170,10 +171,11 @@ class TestAgentOperator(base.TestCase): mock.patch.object(PCIOperator, 'pci_get_device_attrs'), mock.patch.object(PCIOperator, 'inics_get'), mock.patch.object(PCIOperator, 'pci_devices_get'), + mock.patch.object(AgentManager, 'subfunctions_list_get'), mock.patch.object(AgentManager, '_acquire_network_config_lock'), mock.patch.object(AgentManager, '_release_network_config_lock')) as ( mock_net_attrs, mock_device_attrs, mock_nics, mock_devices, - aquire_lock, release_lock): + mock_subfunctions, aquire_lock, release_lock): def fake_get_net_attrs(a): return FAKE_PORT_ATTRIBUTES @@ -187,26 +189,119 @@ class TestAgentOperator(base.TestCase): def fake_get_devices(): return FAKE_DEVICES + def fake_subfunctions_list_get(): + return self.subfunctions_list + mock_net_attrs.side_effect = fake_get_net_attrs mock_device_attrs.side_effect = fake_get_device_attrs mock_nics.side_effect = fake_get_nics mock_devices.side_effect = fake_get_devices + mock_subfunctions.side_effect = fake_subfunctions_list_get ports, devices, macs = self.agent_manager._get_ports_inventory() return ports, devices, macs @mock.patch('os.path.exists') - def test_get_pci_inventory_before_worker_initial_config_complete(self, mock_exists): + def test_get_pci_inventory_during_network_boot_install(self, mock_exists): def file_exists_side_effect(filename): - # Neither the initial nor volatile worker config complete flags are set - return False + if filename == constants.FIRST_BOOT_FLAG: + # first boot flag is set. + return True + else: + # ansible bootstrap flag is not set. + # Neither the initial nor volatile worker config complete flags are set + return False + + self.agent_manager._first_boot_flag = True mock_exists.side_effect = file_exists_side_effect + self.subfunctions_list = [constants.WORKER] ports, devices, macs = self._get_ports_inventory() assert len(ports) == 1 assert len(devices) == 1 assert len(macs) == 1 + @mock.patch('os.path.exists') + def test_get_pci_inventory_during_bootstrap_fresh_install(self, mock_exists): + def file_exists_side_effect(filename): + if filename == constants.ANSIBLE_BOOTSTRAP_FLAG: + # ansible bootstrap flag is set. + return True + else: + # Neither the initial nor volatile worker config complete flags are set + return False + + mock_exists.side_effect = file_exists_side_effect + self.subfunctions_list = [constants.WORKER] + + ports, devices, macs = self._get_ports_inventory() + assert len(ports) == 1 + assert len(devices) == 1 + assert len(macs) == 1 + + @mock.patch('os.path.exists') + def test_get_pci_inventory_during_restore(self, mock_exists): + def file_exists_side_effect(filename): + if filename in [constants.ANSIBLE_BOOTSTRAP_FLAG, + tsc.RESTORE_IN_PROGRESS_FLAG]: + # Both ansible bootstrap and restore flag are set. + return True + else: + # Neither the initial nor volatile worker config complete flags are set + return False + + mock_exists.side_effect = file_exists_side_effect + self.subfunctions_list = [constants.WORKER] + + ports, devices, macs = self._get_ports_inventory() + assert len(ports) == 1 + assert len(devices) == 1 + assert len(macs) == 1 + + @mock.patch('os.path.exists') + def test_get_pci_inventory_controller_before_config_complete(self, mock_exists): + def file_exists_side_effect(filename): + # Neither the initial nor volatile controller config complete flags are set + # Neither first boot nor ansible bootstrap flag are not set. + return False + + mock_exists.side_effect = file_exists_side_effect + self.subfunctions_list = [constants.CONTROLLER] + + ports, devices, macs = self._get_ports_inventory() + assert len(ports) == 1 + assert len(devices) == 1 + assert len(macs) == 1 + + @mock.patch('os.path.exists') + def test_get_pci_inventory_storage_before_config_complete(self, mock_exists): + def file_exists_side_effect(filename): + # Neither the initial nor volatile storage config complete flags are set + # Neither first boot nor ansible bootstrap flag are not set. + return False + + mock_exists.side_effect = file_exists_side_effect + self.subfunctions_list = [constants.STORAGE] + + ports, devices, macs = self._get_ports_inventory() + assert len(ports) == 1 + assert len(devices) == 1 + assert len(macs) == 1 + + @mock.patch('os.path.exists') + def test_get_pci_inventory_before_worker_initial_config_complete(self, mock_exists): + def file_exists_side_effect(filename): + # Neither the initial nor volatile worker config complete flags are set + # Neither first boot nor ansible bootstrap flag are not set. + return False + mock_exists.side_effect = file_exists_side_effect + self.subfunctions_list = [constants.WORKER] + + ports, devices, macs = self._get_ports_inventory() + assert len(ports) == 0 + assert len(devices) == 0 + assert len(macs) == 0 + @mock.patch('os.path.exists') def test_get_pci_inventory_before_worker_config_complete(self, mock_exists): def file_exists_side_effect(filename): @@ -214,8 +309,11 @@ class TestAgentOperator(base.TestCase): # Only the initial worker config complete flag is set return True else: + # worker config complete is not set. + # Neither first boot nor ansible bootstrap flag are set. return False mock_exists.side_effect = file_exists_side_effect + self.subfunctions_list = [constants.WORKER] ports, devices, macs = self._get_ports_inventory() assert len(ports) == 0 @@ -230,8 +328,10 @@ class TestAgentOperator(base.TestCase): # Both of the initial and volatile worker config complete flags are set return True else: + # Neither first boot nor ansible bootstrap flag are set. return False mock_exists.side_effect = file_exists_side_effect + self.subfunctions_list = [constants.WORKER] ports, devices, macs = self._get_ports_inventory() for dev in devices: @@ -250,6 +350,7 @@ class TestAgentOperator(base.TestCase): else: return False mock_exists.side_effect = file_exists_side_effect + self.subfunctions_list = [constants.WORKER] ports, devices, macs = self._get_ports_inventory() for dev in devices: