From 604b4a5ea0c2f8fa78f204032acfbf0a72f8a50c Mon Sep 17 00:00:00 2001 From: Stefan Dinescu Date: Thu, 11 Oct 2018 12:43:55 +0300 Subject: [PATCH] Standby controller filesystem sizes check While installing a system with two controllers, you can assign new PVs to the cgts-vg volume group and resize the filesystem on controller-0, before provisioning controller-1. If these new sizes are above the default provisioned space for cgts-vg on controller-1, the unlock is allowed, but the node goes into a reboot loop due to not having enough space to assign to all the partitions. Now, before unlocking the standby controller for the first time, we check if the provisioned space on the node is equal to or larger than the used space on the active controller and, if it is not, reject the unlock. Change-Id: I3fce3430abbb81d08272f35915cc50c761754733 Closes-bug: 1797108 Signed-off-by: Stefan Dinescu --- .../api/controllers/v1/controller_fs.py | 3 +- .../sysinv/sysinv/api/controllers/v1/host.py | 82 +++++++++++++++++++ .../sysinv/sysinv/sysinv/common/constants.py | 3 + 3 files changed, 86 insertions(+), 2 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/controller_fs.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/controller_fs.py index a024391600..17d104d67c 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/controller_fs.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/controller_fs.py @@ -399,7 +399,6 @@ def _get_controller_fs_limit(device_path_ctrl0, device_path_ctrl1): """ reserved_space = constants.CONTROLLER_ROOTFS_RESERVED - CFS_RESIZE_BUFFER_GIB = 2 # reserve space and ensure no rounding errors max_disk_size_controller0 = 0 max_disk_size_controller1 = 0 @@ -516,7 +515,7 @@ def _get_controller_fs_limit(device_path_ctrl0, device_path_ctrl1): else: cgtsvg_max_free_GiB = cgtsvg0_free_mib / 1024 - cgtsvg_max_free_GiB -= CFS_RESIZE_BUFFER_GIB + cgtsvg_max_free_GiB -= constants.CFS_RESIZE_BUFFER_GIB LOG.info("SYS_I filesystem limits 
cgtsvg0_free_mib=%s, "
              "cgtsvg1_free_mib=%s, cgtsvg_max_free_GiB=%s"
diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
index d785b5432f..c1f9e122c4 100644
--- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
+++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
@@ -3785,6 +3785,113 @@ class HostController(rest.RestController):
             else:
                 raise wsme.exc.ClientSideError(msg)
 
+    @staticmethod
+    def _semantic_check_filesystem_sizes(ihost):
+        """
+        Perform checks for filesystem consistency across controllers
+
+        Rejects the first unlock of the standby controller when the space
+        allocated to cgts-vg on it is smaller than the space used in
+        cgts-vg on the active controller.
+
+        :param ihost: host information of host with controller functionality
+        :raises wsme.exc.ClientSideError: if the active controller cgts-vg
+            is not inventoried or the standby controller lacks space
+        """
+        # Unlocking the active controller happens only after running
+        # config_controller or on a one-node system, so this check isn't
+        # needed in such scenarios
+        if (utils.is_host_active_controller(ihost) or
+                utils.is_host_simplex_controller(ihost)):
+            return
+
+        # The check should only happen the first time the standby controller
+        # is unlocked, so we check if the node has already been provisioned
+        # or is in a provisioning state (meaning the unlock is in progress)
+        # After the second controller is provisioned, the filesystem resize
+        # consistency checks prevent any inconsistencies between nodes
+        if (ihost['invprovision'] and
+                ihost['invprovision'] != constants.UNPROVISIONED):
+            LOG.info("Controller host %s provisioning or already provisioned. "
+                     "Skipping filesystem checks.", ihost['hostname'])
+            return
+
+        active_controller = utils.HostHelper.get_active_controller()
+        active_lvgs = pecan.request.dbapi.ilvg_get_by_ihost(
+            active_controller.uuid)
+
+        # Stays None when the active controller has no usable cgts-vg entry;
+        # a missing entry is reported like an incomplete inventory instead of
+        # failing later on an unbound variable
+        active_controller_used = None
+        for lvg in active_lvgs:
+            if lvg.lvm_vg_name != constants.LVG_CGTS_VG:
+                continue
+            if (lvg.lvm_vg_size and lvg.lvm_vg_total_pe and
+                    lvg.lvm_vg_free_pe is not None):
+                lvm_vg_used_pe = (int(lvg.lvm_vg_total_pe) -
+                                  int(lvg.lvm_vg_free_pe))
+                active_controller_used = (
+                    int(lvg.lvm_vg_size) * lvm_vg_used_pe //
+                    int(lvg.lvm_vg_total_pe))
+            break
+
+        if active_controller_used is None:
+            # Should not happen for active controller, but we should check
+            # this anyway.
+            raise wsme.exc.ClientSideError(
+                _("Active controller %s volume group not yet inventoried.") %
+                constants.LVG_CGTS_VG)
+
+        # For the standby controller the PVs are not yet allocated to the volume
+        # group, so we can't get the size directly from volume-group info
+        # For the standby controller the allocated space is the sum between:
+        # - cgts-vg space allocated by kickstarts
+        # - partition PVs assigned to cgts-vg
+        # - disk PVs assigned to cgts-vg
+        standby_controller_allocated_space = 0
+        standby_pvs = pecan.request.dbapi.ipv_get_by_ihost(ihost['uuid'])
+        for pv in standby_pvs:
+            if pv.lvm_vg_name != constants.LVG_CGTS_VG:
+                continue
+            if pv.lvm_pv_size:
+                standby_controller_allocated_space += int(pv.lvm_pv_size)
+                continue
+            if pv.pv_type == constants.PV_TYPE_PARTITION:
+                backing = pecan.request.dbapi.partition_get_by_ipv(pv['uuid'])
+            elif pv.pv_type == constants.PV_TYPE_DISK:
+                backing = pecan.request.dbapi.idisk_get_by_ipv(pv['uuid'])
+            else:
+                continue
+            if not backing:
+                # No partition/disk is recorded for this PV; count it as zero
+                # rather than failing with an IndexError on backing[0]
+                LOG.warning("No backing device found for PV %s in %s",
+                            pv['uuid'], constants.LVG_CGTS_VG)
+                continue
+            standby_controller_allocated_space += (
+                int(backing[0].size_mib) * (1024 ** 2))
+
+        LOG.info("Active controller filesystem space used: %s",
+                 active_controller_used)
+        LOG.info("Standby controller filesystem allocated space: %s",
+                 standby_controller_allocated_space)
+
+        if active_controller_used > standby_controller_allocated_space:
+            # Since we allocate space that is measured in GiB, the human
+            # readable information shown in case of an error should also
+            # be in GiB. We add a 2 GiB buffer (the same used when changing
+            # filesystem sizes) to ensure no rounding errors
+            needed_space = (float(
+                active_controller_used -
+                standby_controller_allocated_space) / (1024 ** 3) +
+                constants.CFS_RESIZE_BUFFER_GIB)
+            msg = _("Standby controller does not have enough space allocated to "
+                    "%(vg_name)s volume-group in order to create all filesystems. "
+                    "Please assign an extra %(needed).2f GB to the volume group.") % {
+                        'vg_name': constants.LVG_CGTS_VG, 'needed': needed_space}
+            raise wsme.exc.ClientSideError(msg)
+
     @staticmethod
     def _semantic_check_storage_backend(ihost):
         """
@@ -5005,6 +5112,7 @@ class HostController(rest.RestController):
         self._semantic_check_unlock_upgrade(hostupdate.ihost_orig, force_unlock)
         self._semantic_check_oam_interface(hostupdate.ihost_orig)
         self._semantic_check_cinder_volumes(hostupdate.ihost_orig)
+        self._semantic_check_filesystem_sizes(hostupdate.ihost_orig)
         self._semantic_check_storage_backend(hostupdate.ihost_orig)
         # If HTTPS is enabled then we may be in TPM configuration mode
         if utils.get_https_enabled():
diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py
index 8b73357bdf..59c8d9bd8f 100644
--- a/sysinv/sysinv/sysinv/sysinv/common/constants.py
+++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py
@@ -433,6 +433,9 @@ MIN_STOR_MONITORS = 2
 # Storage: reserved space for calculating controller rootfs limit
 CONTROLLER_ROOTFS_RESERVED = 38
 
+# Controller filesystem reserved space to ensure no rounding errors
+CFS_RESIZE_BUFFER_GIB = 2
+
 BACKUP_OVERHEAD = 20
 
 # Suffix used in LVM volume name to indicate that the