Fix partition status stuck in creating/modifying/deleting

After using one of the following partition operation commands:

        - system host-disk-partition-add
        - system host-disk-partition-modify
        - system host-disk-partition-delete

If there is a reboot, or a restart of sysinv-conductor, on the active
controller before the operation completes, the partition status gets
stuck in creating/modifying/deleting. Once stuck, it is no longer
possible to create/modify/delete partitions or to lock/unlock that
host.

To fix this bug, after the reboot sysinv-conductor takes the report
sent by sysinv-agent, which contains the partition information, and
uses it to bring the database partition table back in sync with
reality as seen by the agent: updating sizes and states, adding new
entries, and deleting entries that are no longer present. A
simplified model of this reconciliation is sketched below.
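
The following is a minimal, self-contained model of the resync rule,
not the actual sysinv code; the real logic lives in
ipartition_update_by_ihost() (see the diff below), which additionally
clears the partition runtime-manifest in-progress flag on the agent's
first report and performs the updates through the dbapi. The status
strings here stand in for the sysinv status constants.

    MODIFYING, DELETING, READY = 'Modifying', 'Deleting', 'Ready'

    def resync(db_parts, agent_parts):
        """Reconcile DB partition rows against an agent report.

        db_parts: dicts with 'uuid', 'device_path', 'status' keys.
        agent_parts: dicts with 'uuid' and 'device_path' keys.
        Returns (rows to keep, uuids whose delete must be re-driven).
        """
        agent_by_path = {p['device_path']: p['uuid']
                         for p in agent_parts}
        agent_uuids = set(agent_by_path.values())
        kept, reissue_deletes = [], []
        for row in db_parts:
            row = dict(row)
            if row['uuid'] not in agent_uuids:
                if row['device_path'] in agent_by_path:
                    # On disk under another UUID (e.g. the create
                    # finished after the restart): adopt the UUID
                    # reported by the agent.
                    row['uuid'] = agent_by_path[row['device_path']]
                else:
                    # No longer present on disk: drop the stale row.
                    continue
            elif row['status'] == MODIFYING:
                # Present on disk but stuck in 'Modifying': Ready.
                row['status'] = READY
            elif row['status'] == DELETING:
                # Stuck in 'Deleting': re-apply the delete manifest.
                reissue_deletes.append(row['uuid'])
            kept.append(row)
        return kept, reissue_deletes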

Also, other bugs were fixed:
- In the _semantic_checks() method, the 'delete' operation allowed
  deleting a partition that was not the last one on the disk,
  depending on that partition's state. In addition, the 'create'
  operation now refuses to create a new partition while the last one
  on the disk is in an error state (a condensed sketch of both guards
  follows after this list).

- In the _are_partition_operations_simultaneous() method, the error
  message was inconsistent: it always assumed the other in-progress
  operation was the same as the requested one (a sketch of the
  corrected check also follows after this list).

  For example, consider a scenario where a user tries to delete a
  partition on a host while another partition is being created on the
  same host.

  The old message would be:
  "Cannot delete a partition while another partition is being
  deleted. Wait for all other partitions to finish deleting"

  The new message will be:
  "Cannot delete a partition while the current status of another
  partition is 'Creating'. Wait for the operation to finish."

- In the _config_apply_runtime_manifest() method,
  config_dict['classes'] was treated as a string, but it is actually
  a list. The condition check has been fixed to test membership per
  element (see the last sketch after this list).
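
A condensed sketch of the tightened _semantic_checks() guards, using
stand-in status strings for the sysinv constants (the real code uses
PARTITION_STATUS_NOT_OK_TO_CREATE and PARTITION_STATUS_OK_TO_DELETE,
shown in the diff below; the latter contains a few more states than
this model):

    # Stand-ins for the sysinv partition status constants.
    ERROR_STATUSES = {'Error', 'Error: Internal script error',
                      'Error: Missing GPT Table'}
    OK_TO_DELETE = {'Ready'} | ERROR_STATUSES  # real list has more

    def check_create(disk_partitions):
        # disk_partitions: the disk's partitions, sorted by device
        # node, each a dict with a 'status' key.
        if disk_partitions and \
                disk_partitions[-1]['status'] in ERROR_STATUSES:
            raise ValueError(
                "Cannot create partition on a disk where the last "
                "partition has an error status. Delete it before "
                "creating a new partition.")

    def check_delete(disk_partitions, target):
        if target['status'] not in OK_TO_DELETE:
            raise ValueError(
                "Can not delete partition. Only partitions in a "
                "deletable state can be deleted.")
        # The last-partition rule now applies to every deletable
        # state, not only to 'Ready' as before.
        if disk_partitions and disk_partitions[-1] is not target:
            raise ValueError(
                "Can not delete partition. Only the last partition "
                "on disk can be deleted.")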
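
A minimal sketch of the corrected simultaneous-operation check: the
message now reports the other partition's actual status instead of
echoing the requested operation (the provisioning-state conditions of
the real method are omitted here):

    IN_PROGRESS = {'Creating', 'Modifying', 'Deleting'}  # stand-ins

    def check_simultaneous(operation, host_partitions):
        # operation: the requested action, e.g. 'delete'.
        for part in host_partitions:
            if part['status'] in IN_PROGRESS:
                raise ValueError(
                    "Cannot %s a partition while the current status "
                    "of another partition is '%s'. Wait for the "
                    "operation to finish."
                    % (operation, part['status']))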
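
And a sketch of the fixed filter in _config_apply_runtime_manifest():
since config_dict['classes'] is a list, membership must be tested per
element; the old check compared the whole list against
filter_classes and therefore never matched:

    def classes_in_filter(config_dict, filter_classes):
        # Old, buggy form: config_dict['classes'] in filter_classes
        if not filter_classes:
            return []
        return [c for c in config_dict['classes']
                if c in filter_classes]

For example, with config_dict['classes'] ==
['platform::partitions::runtime'] and that same class in
filter_classes, the fixed check yields a match, so the
apply-in-progress tracking is actually armed.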

Test-Plan:
  PASS: AIO-SX -> create/modify/delete a partition on controller-0,
        followed by a reboot, and check the status with
        'system host-disk-partition-list'.
  PASS: AIO-DX -> create/modify/delete a partition on
        controller-0/controller-1, followed by a reboot of the
        active controller, and check the status with
        'system host-disk-partition-list'.
  PASS: AIO-DX+worker -> create/modify/delete a partition on
        controller-0/controller-1/compute-0, followed by a reboot
        of the active controller, and check the status with
        'system host-disk-partition-list'.

Closes-Bug: 2028254

Change-Id: Ib626b547068162ff41dbbbebf3531ca3dafd27b0
Signed-off-by: Gabriel de Araújo Cabral <gabriel.cabral@windriver.com>
Signed-off-by: Erickson Silva de Oliveira <Erickson.SilvadeOliveira@windriver.com>
commit 7045c9d545 (parent a7a09945f2)
Gabriel de Araújo Cabral, 2023-07-20 09:05:08 -03:00, committed by Erickson Silva
9 changed files with 132 additions and 74 deletions

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2013-2021 Wind River Systems, Inc.
# Copyright (c) 2013-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -97,8 +97,9 @@ PARTITION_STATUS_MSG = {
PARTITION_READY_STATUS: "Ready",
PARTITION_DELETED_STATUS: "Deleted",
PARTITION_ERROR_STATUS: "Error",
PARTITION_ERROR_STATUS_INTERNAL: "Error: Internal script error.",
PARTITION_ERROR_STATUS_GPT: "Error:Missing GPT Table."}
PARTITION_ERROR_STATUS_INTERNAL: "Error: Internal script error",
PARTITION_ERROR_STATUS_GPT: "Error: Missing GPT Table"
}
# Partition table types.
PARTITION_TABLE_GPT = "gpt"

View File

@@ -1011,7 +1011,7 @@ class AgentManager(service.PeriodicService):
pass
self._update_disk_partitions(rpcapi, icontext,
ihost['uuid'], force_update=True)
ihost['uuid'], force_update=True, first_report=True)
ipv = self._ipv_operator.ipv_get()
try:
@@ -1432,7 +1432,7 @@ class AgentManager(service.PeriodicService):
@utils.synchronized(constants.PARTITION_MANAGE_LOCK)
def _update_disk_partitions(self, rpcapi, icontext,
host_uuid, force_update=False):
host_uuid, force_update=False, first_report=False):
ipartition = self._ipartition_operator.ipartition_get()
if not force_update:
if self._prev_partition == ipartition:
@@ -1440,7 +1440,7 @@ class AgentManager(service.PeriodicService):
self._prev_partition = ipartition
try:
rpcapi.ipartition_update_by_ihost(
icontext, host_uuid, ipartition)
icontext, host_uuid, ipartition, first_report)
except AttributeError:
# safe to ignore during upgrades
LOG.warn("Skip updating ipartition conductor. "

View File

@@ -438,6 +438,16 @@ def _enough_avail_space_on_disk(partition_size_mib, idisk):
return idisk.available_mib >= partition_size_mib
def _last_partition_is_ok(dbapi, partition):
"""Checks that the last partition is not in error status.
"""
idisk_uuid = partition.get('idisk_uuid')
partitions = dbapi.partition_get_by_idisk(idisk_uuid, sort_key='device_node')
is_ok = True if not partitions else partitions[-1].status not in \
constants.PARTITION_STATUS_NOT_OK_TO_CREATE
return is_ok
def _check_partition_type(partition):
"""Checks that a partition is a user created partition and raises Client
Error if not.
@@ -458,7 +468,7 @@ def _check_for_outstanding_requests(partition, idisk):
def _are_partition_operations_simultaneous(ihost, partition, operation):
"""Check that Create and Delete requests are serialized per host.
"""Check that create, modify and delete requests are serialized per host.
:param ihost the ihost object
:param partition dict partition request
:param operation Delete/Create
@@ -469,17 +479,15 @@ def _are_partition_operations_simultaneous(ihost, partition, operation):
if (ihost.invprovision in
[constants.UPGRADING, constants.PROVISIONED, constants.PROVISIONING]):
if not (all(host_partition.get('status') in
[constants.PARTITION_READY_STATUS,
constants.PARTITION_IN_USE_STATUS,
constants.PARTITION_CREATE_ON_UNLOCK_STATUS,
constants.PARTITION_ERROR_STATUS,
constants.PARTITION_ERROR_STATUS_INTERNAL]
for host_partition in host_partitions)):
raise wsme.exc.ClientSideError(
"Cannot %s a partition while another partition "
"is being %sd. Wait for all other partitions to "
"finish %sing." % (operation, operation, operation[:-1]))
states = [constants.PARTITION_CREATE_IN_SVC_STATUS,
constants.PARTITION_DELETING_STATUS,
constants.PARTITION_MODIFYING_STATUS]
for host_partition in host_partitions:
if host_partition.get('status') in states:
raise wsme.exc.ClientSideError(
"Cannot %s a partition while the current status of another "
"partition is '%s'. Wait for the operation to finish." %
(operation, constants.PARTITION_STATUS_MSG[host_partition.get('status')]))
def _semantic_checks(operation, partition):
@@ -509,6 +517,13 @@ def _semantic_checks(operation, partition):
############
# CREATING #
############
if not _last_partition_is_ok(pecan.request.dbapi,
partition):
raise wsme.exc.ClientSideError(
_("Cannot create partition on a disk where the last partition has an error status. "
"Delete it before creating a new partition."))
if int(partition['size_mib']) <= 0:
raise wsme.exc.ClientSideError(
_("Partition size must be greater than 0."))
@@ -619,14 +634,14 @@ def _semantic_checks(operation, partition):
constants.PARTITION_CMD_DELETE)
status = partition.get('status')
if status == constants.PARTITION_READY_STATUS:
if status in constants.PARTITION_STATUS_OK_TO_DELETE:
# Check that the partition to delete is the last partition.
if not cutils.is_partition_the_last(pecan.request.dbapi,
partition):
raise wsme.exc.ClientSideError(
_("Can not delete partition. Only the last partition on "
"disk can be deleted."))
elif status not in constants.PARTITION_STATUS_OK_TO_DELETE:
else:
raise wsme.exc.ClientSideError(
_("Can not delete partition. Only partitions in one of these "
"states can be deleted: %s") % ", ".join(

View File

@@ -1508,8 +1508,8 @@ PARTITION_STATUS_MSG = {
PARTITION_READY_STATUS: "Ready",
PARTITION_DELETED_STATUS: "Deleted",
PARTITION_ERROR_STATUS: "Error",
PARTITION_ERROR_STATUS_INTERNAL: "Error: Internal script error.",
PARTITION_ERROR_STATUS_GPT: "Error:Missing GPT Table."}
PARTITION_ERROR_STATUS_INTERNAL: "Error: Internal script error",
PARTITION_ERROR_STATUS_GPT: "Error: Missing GPT Table"}
PARTITION_STATUS_OK_TO_DELETE = [
PARTITION_READY_STATUS,
@@ -1518,6 +1518,11 @@ PARTITION_STATUS_OK_TO_DELETE = [
PARTITION_ERROR_STATUS_INTERNAL,
PARTITION_ERROR_STATUS_GPT]
PARTITION_STATUS_NOT_OK_TO_CREATE = [
PARTITION_ERROR_STATUS,
PARTITION_ERROR_STATUS_INTERNAL,
PARTITION_ERROR_STATUS_GPT]
PARTITION_STATUS_SEND_DELETE_RPC = [
PARTITION_READY_STATUS,
PARTITION_ERROR_STATUS,

View File

@@ -4904,7 +4904,7 @@ class ConductorManager(service.PeriodicService):
config_dict = {
"host_uuids": [host_uuid],
'personalities': personalities,
"classes": ['platform::partitions::runtime'],
"classes": [self.PUPPET_RUNTIME_CLASS_PARTITIONS],
"idisk_uuid": partition.get('idisk_uuid'),
"partition_uuid": partition.get('uuid'),
puppet_common.REPORT_STATUS_CFG: puppet_common.REPORT_DISK_PARTITON_CONFIG
@@ -4918,10 +4918,11 @@ class ConductorManager(service.PeriodicService):
self._config_apply_runtime_manifest(context,
config_uuid,
config_dict,
force=force_apply)
force=force_apply,
filter_classes=[self.PUPPET_RUNTIME_CLASS_PARTITIONS])
def ipartition_update_by_ihost(self, context,
ihost_uuid, ipart_dict_array):
ihost_uuid, ipart_dict_array, first_report):
"""Update existing partition information based on information received
from the agent."""
LOG.debug("PART ipartition_update_by_ihost %s ihost_uuid "
@@ -4953,6 +4954,11 @@ class ConductorManager(service.PeriodicService):
db_host.hostname)
return
if first_report and self._check_runtime_class_apply_in_progress([self.PUPPET_RUNTIME_CLASS_PARTITIONS],
host_uuids=ihost_uuid):
self._clear_runtime_class_apply_in_progress(classes_list=[self.PUPPET_RUNTIME_CLASS_PARTITIONS],
host_uuids=ihost_uuid)
# Get the id of the host.
forihostid = db_host['id']
@@ -4961,6 +4967,9 @@ class ConductorManager(service.PeriodicService):
db_parts = self.dbapi.partition_get_by_ihost(ihost_uuid)
db_disks = self.dbapi.idisk_get_by_ihost(ihost_uuid)
# Get the partitions device paths and UUIDs received from the agent
ipart_dps_uuids_dict = dict([(ipart['device_path'], ipart['uuid']) for ipart in ipart_dict_array])
# Check that the DB partitions are in sync with the DB disks and PVs.
for db_part in db_parts:
if not db_part.device_path:
@@ -4996,21 +5005,45 @@ class ConductorManager(service.PeriodicService):
partition_dict = {'forihostid': forihostid}
partition_update_needed = False
if part_disk.uuid != db_part['idisk_uuid']:
# Handle database to fix partitions with the status 'stuck'
# in creating/deleting/modifying.
if not self._check_runtime_class_apply_in_progress([self.PUPPET_RUNTIME_CLASS_PARTITIONS]):
# Check and update the partition uuid if different than reported by agent
if db_part.uuid not in ipart_dps_uuids_dict.values():
if db_part.device_path in ipart_dps_uuids_dict.keys():
partition_dict['uuid'] = ipart_dps_uuids_dict[db_part.device_path]
partition_update_needed = True
LOG.info("Update DB partition UUID according to agent report: %s to %s" %
(db_part.uuid, partition_dict['uuid']))
else:
self.dbapi.partition_destroy(db_part.uuid)
LOG.info("Delete DB partition stuck")
elif db_part.status == constants.PARTITION_MODIFYING_STATUS:
partition_dict['status'] = constants.PARTITION_READY_STATUS
partition_update_needed = True
LOG.info("Update DB partition stuck in %s" %
constants.PARTITION_STATUS_MSG[db_part.status].lower())
elif db_part.status == constants.PARTITION_DELETING_STATUS:
self.update_partition_config(context, db_part)
LOG.info("Delete partition stuck")
if part_disk.uuid != db_part.idisk_uuid:
# TO DO: What happens when a disk is replaced
partition_update_needed = True
partition_dict.update({'idisk_uuid': part_disk.uuid})
partition_dict['idisk_uuid'] = part_disk.uuid
LOG.info("Disk for partition %s has changed." %
db_part['uuid'])
db_part.uuid)
if partition_update_needed:
self.dbapi.partition_update(db_part['uuid'],
partition_dict)
LOG.debug("PART conductor - partition needs to be "
"updated.")
try:
self.dbapi.partition_update(db_part.uuid, partition_dict)
except TypeError:
pass
LOG.debug("PART conductor - partition needs to be updated.")
# Go through the partitions reported by the agent and make needed
# modifications.
db_parts = self.dbapi.partition_get_by_ihost(ihost_uuid)
for ipart in ipart_dict_array:
# Not to add ceph osd related partitions
if (ipart['type_guid'] in constants.CEPH_PARTITIONS):
@@ -5030,27 +5063,27 @@ class ConductorManager(service.PeriodicService):
if ipart['device_path'] == db_part.device_path:
found = True
part_update_dict = {}
# On CentOS to Debian upgrade partitions may differ
# ipart 'start_mib' and 'end_mib' values are strings
# whereas db_part are integers.
if (str(ipart['start_mib']) != str(db_part['start_mib']) or
str(ipart['end_mib']) != str(db_part['end_mib']) or
ipart['type_guid'] != db_part['type_guid']):
LOG.info("PART update part start/end/size/type/name")
self.dbapi.partition_update(
db_part.uuid,
{'start_mib': ipart['start_mib'],
'end_mib': ipart['end_mib'],
'size_mib': ipart['size_mib'],
'type_guid': ipart['type_guid'],
'type_name': ipart['type_name']}
)
if (ipart['start_mib'] != str(db_part.start_mib)):
part_update_dict['start_mib'] = ipart['start_mib']
if (ipart['end_mib'] != str(db_part.end_mib)):
part_update_dict['end_mib'] = ipart['end_mib']
if (ipart['size_mib'] != str(db_part.size_mib)):
part_update_dict['size_mib'] = ipart['size_mib']
if (ipart['type_guid'] != db_part.type_guid):
part_update_dict['type_guid'] = ipart['type_guid']
if (ipart['type_name'] != db_part.type_name):
part_update_dict['type_name'] = ipart['type_name']
if (ipart['device_node'] != db_part.device_node):
part_update_dict['device_node'] = ipart['device_node']
if part_update_dict:
LOG.info("PART update part: %s" % str(list(part_update_dict.keys())))
self.dbapi.partition_update(db_part.uuid, part_update_dict)
if ipart['device_node'] != db_part.device_node:
LOG.info("PART update part device node")
self.dbapi.partition_update(
db_part.uuid,
{'device_node': ipart['device_node']})
LOG.debug("PART conductor - found partition: %s" %
db_part.device_path)
@@ -6524,7 +6557,9 @@ class ConductorManager(service.PeriodicService):
{'install_state': host.install_state,
'install_state_info':
host.install_state_info})
PUPPET_RUNTIME_CLASS_ROUTES = 'platform::network::routes::runtime'
PUPPET_RUNTIME_CLASS_PARTITIONS = 'platform::partitions::runtime'
PUPPET_RUNTIME_CLASS_DOCKERDISTRIBUTION = 'platform::dockerdistribution::runtime'
PUPPET_RUNTIME_CLASS_USERS = 'platform::users::runtime'
PUPPET_RUNTIME_FILES_DOCKER_REGISTRY_KEY_FILE = constants.DOCKER_REGISTRY_KEY_FILE
@@ -6584,7 +6619,6 @@ class ConductorManager(service.PeriodicService):
check_required = True
if host_uuids and self.host_uuid not in host_uuids:
check_required = False
if not check_required and not filter_classes:
return True
@@ -8338,7 +8372,7 @@ class ConductorManager(service.PeriodicService):
config_dict = {
"personalities": personalities,
'host_uuids': [host.uuid],
"classes": 'platform::network::routes::runtime',
"classes": ['platform::network::routes::runtime'],
puppet_common.REPORT_STATUS_CFG:
puppet_common.REPORT_ROUTE_CONFIG,
}
@@ -8814,7 +8848,7 @@ class ConductorManager(service.PeriodicService):
# Update service table
self.update_service_table_for_cinder()
classes = ['platform::partitions::runtime',
classes = [self.PUPPET_RUNTIME_CLASS_PARTITIONS,
'platform::lvm::controller::runtime',
'platform::haproxy::runtime',
'platform::drbd::runtime',
@@ -8922,7 +8956,7 @@ class ConductorManager(service.PeriodicService):
ctrl.administrative == constants.ADMIN_UNLOCKED and
ctrl.availability in [constants.AVAILABILITY_AVAILABLE,
constants.AVAILABILITY_DEGRADED]]
classes = ['platform::partitions::runtime',
classes = [self.PUPPET_RUNTIME_CLASS_PARTITIONS,
'platform::lvm::controller::runtime',
'platform::haproxy::runtime',
'openstack::keystone::endpoint::runtime',
@@ -9021,7 +9055,7 @@ class ConductorManager(service.PeriodicService):
(ctrl.administrative == constants.ADMIN_UNLOCKED and
ctrl.operational == constants.OPERATIONAL_ENABLED)]
classes = ['platform::partitions::runtime',
classes = [self.PUPPET_RUNTIME_CLASS_PARTITIONS,
'platform::lvm::controller::runtime',
'platform::haproxy::runtime',
'openstack::keystone::endpoint::runtime',
@@ -9200,7 +9234,7 @@ class ConductorManager(service.PeriodicService):
# Apply the runtime manifest to revert the k8s upgrade process
config_dict = {
"personalities": personalities,
"classes": 'platform::kubernetes::upgrade_abort',
"classes": ['platform::kubernetes::upgrade_abort'],
puppet_common.REPORT_STATUS_CFG:
puppet_common.REPORT_UPGRADE_ABORT
}
@@ -9227,7 +9261,7 @@ class ConductorManager(service.PeriodicService):
# Apply the runtime manifest to revert the k8s upgrade process
config_dict = {
"personalities": personalities,
"classes": 'platform::kubernetes::upgrade_abort_recovery',
"classes": ['platform::kubernetes::upgrade_abort_recovery'],
}
self._config_apply_runtime_manifest(
context, config_uuid=active_controller.config_target,
@@ -12084,10 +12118,9 @@ class ConductorManager(service.PeriodicService):
for c, h in self._runtime_class_apply_in_progress:
if c in classes_list:
if host_uuids and host_uuids in h:
LOG.info("config runtime host_uuids=%s from %s" %
(host_uuids, h))
return True
if not host_uuids or next((host for host in host_uuids if host in h)):
LOG.info("config runtime in progress (%s, %s)" % (c, h))
return True
return False
@cutils.synchronized(LOCK_RUNTIME_CONFIG_CHECK)
@@ -12262,11 +12295,12 @@ class ConductorManager(service.PeriodicService):
if config_uuid not in self._host_runtime_config_history:
self._host_runtime_config_history[config_uuid] = config_dict
if filter_classes and config_dict['classes'] in filter_classes:
LOG.info("config runtime filter_classes add %s %s" %
(filter_classes, config_dict))
self._add_runtime_class_apply_in_progress(filter_classes,
host_uuids=config_dict.get('host_uuids', None))
if filter_classes:
classes = [config_class for config_class in config_dict['classes'] if config_class in filter_classes]
if classes:
LOG.info("config runtime filter_classes add %s" % (classes))
self._add_runtime_class_apply_in_progress(classes,
host_uuids=config_dict.get('host_uuids', None))
def _update_ipv_device_path(self, idisk, ipv):
if not idisk.device_path:
@@ -15616,7 +15650,7 @@ class ConductorManager(service.PeriodicService):
# each controller.
config_dict = {
"personalities": personalities,
"classes": 'platform::kubernetes::pre_pull_control_plane_images'
"classes": ['platform::kubernetes::pre_pull_control_plane_images']
}
self._config_apply_runtime_manifest(context, config_uuid, config_dict)
@@ -16048,7 +16082,7 @@ class ConductorManager(service.PeriodicService):
config_dict = {
"personalities": personalities,
"classes": 'platform::kubernetes::upgrade_abort',
"classes": ['platform::kubernetes::upgrade_abort'],
puppet_common.REPORT_STATUS_CFG:
puppet_common.REPORT_UPGRADE_ABORT
}

View File

@@ -436,7 +436,7 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
version='1.1')
def ipartition_update_by_ihost(self, context,
ihost_uuid, ipart_dict_array):
ihost_uuid, ipart_dict_array, first_report=False):
"""Create or update partitions for an ihost with the supplied data.
@@ -446,13 +446,15 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
:param context: an admin context
:param ihost_uuid: ihost uuid unique id
:param ipart_dict_array: initial values for partition objects
:param first_report: first report from agent
:returns: pass or fail
"""
return self.call(context,
self.make_msg('ipartition_update_by_ihost',
ihost_uuid=ihost_uuid,
ipart_dict_array=ipart_dict_array))
ipart_dict_array=ipart_dict_array,
first_report=first_report))
def update_partition_config(self, context, partition):
"""Asynchronously, have a conductor configure the physical volume

View File

@@ -2937,7 +2937,7 @@ class Connection(api.Connection):
count = query.update(values, synchronize_session='fetch')
if count != 1:
raise exception.DiskPartitionNotFound(partition_id=partition_id)
return query.one()
return query.first()
def partition_destroy(self, partition_id):
with _session_for_write():

View File

@@ -206,7 +206,8 @@ class TestDeletePartition(TestPartition):
forihostid=self.ihost.id,
idisk_id=self.disk.id,
idisk_uuid=self.disk.uuid,
size_mib=128)
size_mib=128,
device_path=self.partition_device_path)
def test_delete_partition(self):
# Delete the partition

View File

@@ -2299,7 +2299,7 @@ class ManagerTestCase(base.DbTestCase):
personalities = [constants.CONTROLLER]
config_dict = {
"personalities": personalities,
"classes": 'platform::kubernetes::upgrade_abort',
"classes": ['platform::kubernetes::upgrade_abort'],
puppet_common.REPORT_STATUS_CFG:
puppet_common.REPORT_UPGRADE_ABORT
}
@@ -2338,7 +2338,7 @@ class ManagerTestCase(base.DbTestCase):
personalities = [constants.CONTROLLER]
config_dict = {
"personalities": personalities,
"classes": 'platform::kubernetes::upgrade_abort',
"classes": ['platform::kubernetes::upgrade_abort'],
puppet_common.REPORT_STATUS_CFG:
puppet_common.REPORT_UPGRADE_ABORT
}
@@ -2359,7 +2359,7 @@ class ManagerTestCase(base.DbTestCase):
personalities = [constants.CONTROLLER]
config_dict = {
"personalities": personalities,
"classes": 'platform::kubernetes::upgrade_abort_recovery',
"classes": ['platform::kubernetes::upgrade_abort_recovery'],
}
mock_config_apply_runtime_manifest.assert_called_with(mock.ANY,
config_uuid='4e93a1c4-44c0-4cb8-839b-e50d166514d0',