Fix the condition to delete a stuck partition in the database

The review [1] introduced a condition that deletes a partition
from the database when it does not exist in the agent report and,
at the same time, no puppet manifest from the
'platform::partitions::runtime' class is running.

A partition with the status "Creating on unlock" satisfies both
of these conditions: the agent does not report it yet, and the
puppet manifest that will create it is not running. This commit
changes the behavior so that a partition with this status is not
deleted, since it will still be created during the unlock.

Additionally, a failure was identified in the check that detects
whether puppet is running, which could cause a partition to be
deleted incorrectly. To fix this, a per-host flag file was
introduced to mark that a partition puppet manifest is being
applied (see the sketch below).

[1] https://review.opendev.org/c/starlingx/config/+/889090
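
Illustrative sketch of the resulting cleanup decision. This is not
the actual sysinv code; the names only mirror the changes below:

    # Status value quoted above; defined here only to keep the
    # sketch self-contained.
    PARTITION_CREATE_ON_UNLOCK_STATUS = "Creating on unlock"

    def should_delete_stuck_partition(db_part, reported_device_paths,
                                      flag_exists, puppet_running):
        # Skip the cleanup while a partition manifest is pending or
        # running: the flag file covers the window between the API
        # request and the moment the runtime class is registered as
        # "in progress".
        if flag_exists or puppet_running:
            return False
        # A partition waiting for the unlock is not reported by the
        # agent and has no manifest running, but it is not stuck.
        if db_part['status'] == PARTITION_CREATE_ON_UNLOCK_STATUS:
            return False
        # Otherwise, a DB partition missing from the agent report is
        # considered stuck.
        return db_part['device_path'] not in reported_device_paths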

Test-Plan:
  PASS: AIO-SX fresh install
  PASS: AIO-DX fresh install
  PASS: create/modify/delete a partition on controller-0|1,
        followed by a reboot, and check the status with
        'system host-disk-partition-list'.
  PASS: Restart of the sysinv-conductor and/or sysinv-agent
        services while a puppet manifest is being applied.
  PASS: AIO-SX upgrade stx 7.0 to stx 8.0
  PASS: AIO-SX Backup and Restore

Closes-Bug: 2028254

Change-Id: I2024ab841ca3edbcc140de9b4ea0fbea12044791
Signed-off-by: Gabriel de Araújo Cabral <gabriel.cabral@windriver.com>
Signed-off-by: Erickson Silva <Erickson.SilvadeOliveira@windriver.com>
Gabriel de Araújo Cabral 2023-10-27 17:25:01 -03:00 committed by Erickson Silva
parent 1d2eb82cad
commit 2a39372b51
4 changed files with 47 additions and 10 deletions


@@ -691,6 +691,8 @@ def _create(partition):
        idiskid,
        {'available_mib': new_available_mib})

    partition_config_flag = constants.PARTITION_CONFIG_FLAG % (forihostid)
    cutils.touch(partition_config_flag)
    try:
        # Update the database
        new_partition = pecan.request.dbapi.partition_create(forihostid,
@@ -707,11 +709,13 @@ def _create(partition):
        # Instruct puppet to implement the change
        pecan.request.rpcapi.update_partition_config(pecan.request.context,
                                                     partition)
    except exception.HTTPNotFound:
        msg = _("Creating partition failed for host %s ") % (ihost['hostname'])
        raise wsme.exc.ClientSideError(msg)
    except exception.PartitionAlreadyExists:
        msg = _("Disk partition %s already exists." % partition.get('device_path'))
        cutils.remove(partition_config_flag)
        raise wsme.exc.ClientSideError(msg)
    except Exception:
        msg = _("Creating partition failed for host %s ") % (ihost['hostname'])
        cutils.remove(partition_config_flag)
        raise wsme.exc.ClientSideError(msg)

    return new_partition


@@ -1594,6 +1594,9 @@ SYSINV_FIRST_REPORT_FLAG = os.path.join(SYSINV_VOLATILE_PATH,
SYSINV_REPORTED = os.path.join(SYSINV_VOLATILE_PATH,
                               ".sysinv_reported")

PARTITION_CONFIG_FLAG = os.path.join(
    SYSINV_VOLATILE_PATH, ".sysinv_partition_config_%s")

NETWORK_CONFIG_LOCK_FILE = os.path.join(
    tsc.VOLATILE_PATH, "apply_network_config.lock")
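
For illustration only, assuming SYSINV_VOLATILE_PATH resolves to
/var/run/sysinv (an assumption of this example), the template
yields one flag file per host id, and the "*" form is the glob
pattern the conductor uses to clear all of them at startup:

    import glob
    import os

    SYSINV_VOLATILE_PATH = "/var/run/sysinv"  # assumed value
    PARTITION_CONFIG_FLAG = os.path.join(
        SYSINV_VOLATILE_PATH, ".sysinv_partition_config_%s")

    PARTITION_CONFIG_FLAG % (3)
    # -> '/var/run/sysinv/.sysinv_partition_config_3'
    glob.glob(PARTITION_CONFIG_FLAG % ("*"))
    # -> every host's flag file currently present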


@@ -928,6 +928,11 @@ def touch(fname):
    os.utime(fname, None)


def remove(fname):
    if os.path.exists(fname):
        os.remove(fname)


def symlink_force(source, link_name):
    """ Force creation of a symlink
    Params:
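
For reference, the behavior of these two helpers can be
approximated as below. This is a simplified stand-in, assuming the
in-tree touch() also creates the file when it is missing; remove()
is idempotent, which lets the partition config flag be cleared
from several places without raising errors:

    import os

    def touch(fname):
        # Create the file if it does not exist, then refresh its
        # timestamp.
        open(fname, 'a').close()
        os.utime(fname, None)

    def remove(fname):
        # Idempotent: a missing file is simply ignored.
        if os.path.exists(fname):
            os.remove(fname)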


@@ -399,6 +399,8 @@ class ConductorManager(service.PeriodicService):
            self._sx_to_dx_post_migration_actions(system)

        self._clear_partition_config_flags()

        LOG.info("sysinv-conductor start committed system=%s" %
                 system.as_dict())
@@ -676,6 +678,11 @@ class ConductorManager(service.PeriodicService):
                constants.CINDER_BACKEND_CEPH,
                task=constants.SB_TASK_RESTORE)

    def _clear_partition_config_flags(self):
        files = constants.PARTITION_CONFIG_FLAG % ("*")
        for fname in glob.glob(files):
            cutils.remove(fname)

    def _clear_stuck_loads(self):
        load_stuck_states = [constants.IMPORTING_LOAD_STATE]
@@ -4968,6 +4975,7 @@ class ConductorManager(service.PeriodicService):
        LOG.debug("PART conductor-manager partition: %s" % str(partition))
        # Get host.
        host_uuid = partition.get('ihost_uuid')
        forihostid = partition.get('forihostid')
        try:
            db_host = self.dbapi.ihost_get(host_uuid)
        except exception.ServerNotFound:
@@ -4998,6 +5006,10 @@ class ConductorManager(service.PeriodicService):
            config_dict,
            force=force_apply,
            filter_classes=[self.PUPPET_RUNTIME_CLASS_PARTITIONS])
        # The flag is cleared because the manifest class has already been added
        # using the _add_runtime_class_apply_in_progress() method
        # within _config_apply_runtime_manifest().
        cutils.remove(constants.PARTITION_CONFIG_FLAG % (forihostid))

    def ipartition_update_by_ihost(self, context,
                                   ihost_uuid, ipart_dict_array, first_report=False):
@@ -5014,8 +5026,10 @@ class ConductorManager(service.PeriodicService):
            LOG.exception("Invalid ihost_uuid %s" % ihost_uuid)
            return

        upgrade_in_progress = False
        try:
            self.dbapi.software_upgrade_get_one()
            upgrade_in_progress = True
        except exception.NotFound:
            # No upgrade in progress
            pass
@@ -5032,14 +5046,20 @@ class ConductorManager(service.PeriodicService):
                     db_host.hostname)
            return

        if first_report and self._check_runtime_class_apply_in_progress([self.PUPPET_RUNTIME_CLASS_PARTITIONS],
                                                                         host_uuids=ihost_uuid):
            self._clear_runtime_class_apply_in_progress(classes_list=[self.PUPPET_RUNTIME_CLASS_PARTITIONS],
                                                        host_uuids=ihost_uuid)

        # Get the id of the host.
        forihostid = db_host['id']
        partition_config_flag = constants.PARTITION_CONFIG_FLAG % (forihostid)

        # Receiving first_report=True means the sysinv-agent on that host has
        # just started. This means that if there were any puppet manifests
        # running, they have been terminated, so we need to clear the list of
        # runtime manifests in progress below and also remove the partition
        # config flag from that host, to avoid a false positive.
        if first_report:
            self._clear_runtime_class_apply_in_progress(classes_list=[self.PUPPET_RUNTIME_CLASS_PARTITIONS],
                                                        host_uuids=ihost_uuid)
            cutils.remove(partition_config_flag)

        # Obtain the partitions, disks and physical volumes that are currently
        # present in the DB.
        db_parts = self.dbapi.partition_get_by_ihost(ihost_uuid)
@@ -5085,8 +5105,13 @@ class ConductorManager(service.PeriodicService):
            # Handle database to fix partitions with the status 'stuck'
            # in creating/deleting/modifying.
            if not self._check_runtime_class_apply_in_progress([self.PUPPET_RUNTIME_CLASS_PARTITIONS]):
                if db_part.device_path not in ipart_device_paths:
            if not os.path.exists(partition_config_flag) and \
                    not self._check_runtime_class_apply_in_progress([self.PUPPET_RUNTIME_CLASS_PARTITIONS],
                                                                    host_uuids=ihost_uuid):
                if db_part.device_path not in ipart_device_paths and \
                        not upgrade_in_progress and \
                        not os.path.exists(tsc.RESTORE_IN_PROGRESS_FLAG) and \
                        db_part.status != constants.PARTITION_CREATE_ON_UNLOCK_STATUS:
                    self.dbapi.partition_destroy(db_part.uuid)
                    LOG.info("Delete DB partition stuck: %s" % str(db_part.items()))
                elif db_part.status == constants.PARTITION_MODIFYING_STATUS: