From 933d3a3a73e923efc86d7ac8b8a059a598e6fbe1 Mon Sep 17 00:00:00 2001 From: Tara Subedi Date: Mon, 25 Mar 2024 13:51:26 -0400 Subject: [PATCH] Report port and device inventory after the worker manifest This is an incremental fix for bug 2053149. Upon network boot (first boot) of a worker node, the agent manager is supposed to report ports/devices without waiting for the worker manifest, as that would never run on first boot. Without this, after a system restore, the compute node cannot be unlocked due to the sriov config update. Kickstart records first boot as "/etc/platform/.first_boot". The agent manager deletes this file. If the agent manager crashes, it will start again. This time, the agent manager does not see the .first_boot file, does not know this is still the first boot, and won't report inventory for the worker node. This commit fixes this issue by creating the volatile file "/var/run/.first_boot" before deleting "/etc/platform/.first_boot"; the agent relies on both files to determine whether it is the first boot. The same logic applies across multiple crashes/restarts of the agent manager. TEST PLAN: PASS: AIO-DX bootstrap has no issues. lock/unlock has no issues. PASS: Network-boot a worker node; before unlocking, restart the agent manager (sysinv-agent) and check sysinv.log to see that ports are reported. 
Closes-Bug: 2053149 Change-Id: Iace5576575388a6ed3403590dbeec545c25fc0e0 Signed-off-by: Tara Nath Subedi --- sysinv/sysinv/sysinv/sysinv/agent/manager.py | 11 ++++++++++- sysinv/sysinv/sysinv/sysinv/common/constants.py | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sysinv/sysinv/sysinv/sysinv/agent/manager.py b/sysinv/sysinv/sysinv/sysinv/agent/manager.py index b3a14868dc..3a61621fd3 100644 --- a/sysinv/sysinv/sysinv/sysinv/agent/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/agent/manager.py @@ -224,7 +224,8 @@ class AgentManager(service.PeriodicService): self._first_grub_update = False self._inventoried_initial = False self._inventory_reported = set() - self._first_boot_flag = os.path.exists(FIRST_BOOT_FLAG) + self._first_boot_flag = os.path.exists(FIRST_BOOT_FLAG) or \ + os.path.exists(constants.VOLATILE_FIRST_BOOT_FLAG) def start(self): super(AgentManager, self).start() @@ -579,6 +580,14 @@ class AgentManager(service.PeriodicService): host_uuid, msg_dict) if os.path.exists(FIRST_BOOT_FLAG): + # Create volatile first_boot file, that will be checked by agent manager + # when it get crashed and restarted, so that it will know this boot is still + # first boot. + try: + os.mknod(constants.VOLATILE_FIRST_BOOT_FLAG) + except OSError: + LOG.error("%s could not be created." 
% constants.VOLATILE_FIRST_BOOT_FLAG) + os.remove(FIRST_BOOT_FLAG) LOG.info("Removed %s" % FIRST_BOOT_FLAG) except exception.SysinvException: diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index babe094440..e6aad8cb51 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -2120,6 +2120,7 @@ DEFAULT_DNS_SERVICE_DOMAIN = 'cluster.local' # First boot FIRST_BOOT_FLAG = os.path.join(tsc.PLATFORM_CONF_PATH, ".first_boot") +VOLATILE_FIRST_BOOT_FLAG = os.path.join(tsc.VOLATILE_PATH, ".first_boot") # Ansible bootstrap ANSIBLE_BOOTSTRAP_FLAG = os.path.join(tsc.VOLATILE_PATH, ".ansible_bootstrap")