From 747595fb0b3fff8ce87da194cd2d692d535098b5 Mon Sep 17 00:00:00 2001
From: Ovidiu Poncea
Date: Tue, 8 Jan 2019 16:01:02 +0200
Subject: [PATCH] Ceph for standard: Semantic checks

To enable OpenStack's Helm charts on StarlingX we need distributed
persistent storage for Kubernetes that leverages our existing storage
configurations. For this stage we will enable Ceph RBD to work with the
Kubernetes RBD provisioner through a new Helm chart. Since RBD will be
the persistent storage solution, Ceph support has to be extended to the
1-node, 2-node and Standard configurations.

This commit adds multiple semantic checks to make sure that Ceph works
as expected on a Standard deployment.

Change-Id: I31786fa78e8c926a57077bb0a25f1e4cbf543cab
Co-Authored-By: Stefan Dinescu
Implements: containerization-2002844-CEPH-persistent-storage-backend-for-Kubernetes
Story: 2002844
Task: 28723
Depends-On: https://review.openstack.org/629512
Signed-off-by: Ovidiu Poncea
---
 .../sysinv/api/controllers/v1/ceph_mon.py     |  68 +++++-
 .../sysinv/sysinv/api/controllers/v1/host.py  | 206 +++++++++++-------
 .../sysinv/api/controllers/v1/storage.py      |  32 ++-
 .../sysinv/api/controllers/v1/storage_ceph.py |   9 +
 sysinv/sysinv/sysinv/sysinv/common/ceph.py    |  90 ++++++--
 .../sysinv/sysinv/sysinv/common/constants.py  |  10 +
 .../sysinv/common/storage_backend_conf.py     |   2 +-
 .../sysinv/sysinv/sysinv/conductor/manager.py |   2 -
 .../sysinv/tests/api/test_storage_tier.py     |  14 ++
 sysinv/sysinv/sysinv/sysinv/tests/db/utils.py |  17 ++
 10 files changed, 338 insertions(+), 112 deletions(-)

diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/ceph_mon.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/ceph_mon.py
index 6b3a529e4d..e07a0c97b8 100644
--- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/ceph_mon.py
+++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/ceph_mon.py
@@ -35,6 +35,7 @@ from sysinv.api.controllers.v1 import controller_fs as controller_fs_utils
 from sysinv.api.controllers.v1 import link
 from sysinv.api.controllers.v1 import types
 from sysinv.api.controllers.v1 import utils
+from sysinv.common import ceph
 from sysinv.common import constants
 from sysinv.common import exception
 from sysinv.common import utils as cutils
@@ -406,13 +407,57 @@ def _set_defaults(ceph_mon):


 def _create(ceph_mon):
+    # validate host
+    try:
+        chost = pecan.request.dbapi.ihost_get(ceph_mon['ihost_uuid'])
+    except exception.ServerNotFound:
+        raise wsme.exc.ClientSideError(
+            _("Host not found, uuid: %s." % ceph_mon['ihost_uuid']))
+
+    ceph_mon['forihostid'] = chost['id']
+
+    # check if ceph monitor is already configured
+    if pecan.request.dbapi.ceph_mon_get_by_ihost(ceph_mon['forihostid']):
+        raise wsme.exc.ClientSideError(
+            _("Ceph monitor already configured for host '%s'." % chost['hostname']))
+
+    # only one instance of the 3rd ceph monitor is allowed
+    ceph_mons = pecan.request.dbapi.ceph_mon_get_list()
+    for mon in ceph_mons:
+        h = pecan.request.dbapi.ihost_get(mon['forihostid'])
+        if h.personality in [constants.STORAGE, constants.WORKER]:
+            raise wsme.exc.ClientSideError(
+                _("Ceph monitor already configured for host '%s'."
+                  % h['hostname']))
+
+    # Adding a ceph monitor to a worker selects Ceph's deployment model
+    if chost['personality'] == constants.WORKER:
+        # Only if replication model is CONTROLLER or not yet defined
+        stor_model = ceph.get_ceph_storage_model()
+        worker_stor_models = [constants.CEPH_CONTROLLER_MODEL, constants.CEPH_UNDEFINED_MODEL]
+        if stor_model not in worker_stor_models:
+            raise wsme.exc.ClientSideError(
+                _("Cannot add a storage monitor to a worker if "
+                  "Ceph's deployment model is already set to %s." % stor_model))
+
+        replication, min_replication = \
+            StorageBackendConfig.get_ceph_max_replication(pecan.request.dbapi)
+        supported_replication = constants.CEPH_CONTROLLER_MODEL_REPLICATION_SUPPORTED
+        if replication not in supported_replication:
+            raise wsme.exc.ClientSideError(
+                _("Ceph monitor can be added to a worker only if "
+                  "replication is set to %s. Please update replication "
+                  "before configuring a monitor on a worker node." % supported_replication))
+
+    # host must be locked and online
+    if (chost['availability'] != constants.AVAILABILITY_ONLINE or
+            chost['administrative'] != constants.ADMIN_LOCKED):
+        raise wsme.exc.ClientSideError(
+            _("Host %s must be locked and online." % chost['hostname']))
+
     ceph_mon = _set_defaults(ceph_mon)

     _check_ceph_mon(ceph_mon)

-    chost = pecan.request.dbapi.ihost_get(ceph_mon['ihost_uuid'])
-    ceph_mon['forihostid'] = chost['id']
-
     controller_fs_utils._check_controller_fs(
         ceph_mon_gib_new=ceph_mon['ceph_mon_gib'])

@@ -452,12 +497,17 @@ def _create(ceph_mon):
         # At this moment the only possibility to add a dynamic monitor
         # is on a worker node, so we check for that.
         if chost.personality == constants.WORKER:
-            # Storage nodes are not supported on a controller based
-            # storage model.
-            personalities = [constants.CONTROLLER, constants.WORKER]
-            pecan.request.rpcapi.update_ceph_base_config(
-                pecan.request.context,
-                personalities)
+            try:
+                # Storage nodes are not supported on a controller based
+                # storage model.
+                personalities = [constants.CONTROLLER, constants.WORKER]
+                pecan.request.rpcapi.update_ceph_base_config(
+                    pecan.request.context,
+                    personalities)
+            except Exception:
+                values = {'state': constants.SB_STATE_CONFIG_ERR, 'task': None}
+                pecan.request.dbapi.ceph_mon_update(new_ceph_mon['uuid'], values)
+                raise

     # The return value needs to be iterable, so make it a list.
     return [new_ceph_mon]
diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
index 2216e02692..3526299f0e 100644
--- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
+++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
@@ -2282,11 +2282,8 @@ class HostController(rest.RestController):
                     "monitor available. At least %s unlocked and "
                     "enabled hosts with monitors are required. Please"
                     " ensure hosts with monitors are unlocked and "
-                    "enabled - candidates: %s, %s, %s") %
-                    (num_monitors, constants.MIN_STOR_MONITORS,
-                     constants.CONTROLLER_0_HOSTNAME,
-                     constants.CONTROLLER_1_HOSTNAME,
-                     constants.STORAGE_0_HOSTNAME))
+                    "enabled.") %
+                    (num_monitors, constants.MIN_STOR_MONITORS))

         # If it is the last storage node to delete, we need to delete
         # ceph osd pools and update additional tier status to "defined"
@@ -3025,15 +3022,23 @@ class HostController(rest.RestController):
                 patched_ihost['subfunctions'] = subfunctions

         elif patched_ihost['personality'] == constants.STORAGE:
-            # Storage nodes are only allowed if we are configured to use
-            # ceph for the cinder backend.
+ # Storage nodes are only allowed if we are configured to use ceph. if not StorageBackendConfig.has_backend_configured( pecan.request.dbapi, - constants.CINDER_BACKEND_CEPH + constants.SB_TYPE_CEPH ): raise wsme.exc.ClientSideError( _("Storage nodes can only be configured if storage " - "cluster is configured for the cinder backend.")) + "cluster is configured for the Ceph backend.")) + + # Storage nodes are allowed when using the CEPH_STORAGE_MODEL model + stor_model = ceph.get_ceph_storage_model() + if stor_model not in [constants.CEPH_STORAGE_MODEL, constants.CEPH_UNDEFINED_MODEL]: + # Adding storage-0 when storage model is CEPH_UNDEFINED_MODEL will + # set it to CEPH_STORAGE_MODEL. + raise wsme.exc.ClientSideError( + _("Storage nodes can not be configured for " + "the '%s' storage model." % stor_model)) current_storage_ihosts = \ pecan.request.dbapi.ihost_get_by_personality(constants.STORAGE) @@ -4408,7 +4413,7 @@ class HostController(rest.RestController): api = pecan.request.dbapi backend = StorageBackendConfig.get_configuring_backend(api) - if backend and backend.backend == constants.CINDER_BACKEND_CEPH: + if backend and backend.backend == constants.SB_TYPE_CEPH: ihosts = api.ihost_get_by_personality( constants.CONTROLLER ) @@ -4419,9 +4424,14 @@ class HostController(rest.RestController): # check if customer needs to install storage nodes if backend.task == constants.SB_TASK_RECONFIG_CONTROLLER: - if HostController._check_provisioned_storage_hosts(): - # Storage nodes are provisioned. This means that - # this is not the first time Ceph is configured + stor_model = ceph.get_ceph_storage_model() + if (HostController._check_provisioned_storage_hosts() or + stor_model == constants.CEPH_CONTROLLER_MODEL): + # This means that either: + # 1. Storage nodes are already provisioned (this is not + # the first time Ceph is configured) or + # 2. We are on a standard config and we don't need to + # configure storage nodes at all. api.storage_backend_update(backend.uuid, { 'state': constants.SB_STATE_CONFIGURED, 'task': None @@ -4906,7 +4916,8 @@ class HostController(rest.RestController): subfunctions_set = \ set(hostupdate.ihost_patch[constants.SUBFUNCTIONS].split(',')) - if constants.WORKER in subfunctions_set: + if (personality == constants.WORKER or + constants.WORKER in subfunctions_set): self.check_lock_worker(hostupdate) hostupdate.notify_vim = True @@ -4938,20 +4949,38 @@ class HostController(rest.RestController): if StorageBackendConfig.has_backend_configured( pecan.request.dbapi, - constants.CINDER_BACKEND_CEPH): + constants.SB_TYPE_CEPH): + query_hosts = None + stor_model = ceph.get_ceph_storage_model() + if stor_model == constants.CEPH_STORAGE_MODEL: + query_hosts = constants.STORAGE + elif stor_model == constants.CEPH_CONTROLLER_MODEL: + query_hosts = constants.CONTROLLER + else: + # If backend type is still undefined it means no storage nodes + # have been configured and no worker monitor has been added, + # so it is safe to not check the quorum. + # Or we are dealing with an AIO-SX. + return try: - st_nodes = pecan.request.dbapi.ihost_get_by_personality(constants.STORAGE) + st_nodes = pecan.request.dbapi.ihost_get_by_personality(query_hosts) except exception.NodeNotFound: # If we don't have any storage nodes we don't need to # check for quorum. We'll allow the node to be locked. + # We will always have at least one controller, so for + # controllers that also act as storage nodes this should + # never happen. 
return + # TODO(oponcea) remove once SM supports in-service config reload # Allow locking controllers when all storage nodes are locked. - for node in st_nodes: - if (node['administrative'] == constants.ADMIN_UNLOCKED): - break - else: - return + if stor_model == constants.CEPH_STORAGE_MODEL: + for node in st_nodes: + if (node['administrative'] == constants.ADMIN_UNLOCKED): + break + else: + return + if (hostupdate.ihost_orig['administrative'] == constants.ADMIN_UNLOCKED and hostupdate.ihost_orig['operational'] == @@ -4986,11 +5015,8 @@ class HostController(rest.RestController): "monitor available. At least %s unlocked and " "enabled hosts with monitors are required. Please" " ensure hosts with monitors are unlocked and " - "enabled - candidates: %s, %s, %s") % - (num_monitors, constants.MIN_STOR_MONITORS, - constants.CONTROLLER_0_HOSTNAME, - constants.CONTROLLER_1_HOSTNAME, - constants.STORAGE_0_HOSTNAME)) + "enabled.") % + (num_monitors, constants.MIN_STOR_MONITORS)) if not force: # sm-lock-pre-check @@ -5185,9 +5211,9 @@ class HostController(rest.RestController): storage_nodes = pecan.request.dbapi.ihost_get_by_personality( personality=constants.STORAGE) except Exception: - raise wsme.exc.ClientSideError( - _("Can not unlock a worker node until at " - "least one storage node is unlocked and enabled.")) + # We are unlocking worker node when no storage nodes are + # defined. This is ok in CEPH_CONTROLLER_MODEL. + pass is_storage_host_unlocked = False if storage_nodes: for node in storage_nodes: @@ -5197,8 +5223,9 @@ class HostController(rest.RestController): is_storage_host_unlocked = True break - - if not is_storage_host_unlocked: + stor_model = ceph.get_ceph_storage_model() + if (not is_storage_host_unlocked and + not stor_model == constants.CEPH_CONTROLLER_MODEL): raise wsme.exc.ClientSideError( _("Can not unlock a worker node until at " "least one storage node is unlocked and enabled.")) @@ -5229,11 +5256,8 @@ class HostController(rest.RestController): "monitor available. At least %s unlocked and " "enabled hosts with monitors are required. Please" " ensure hosts with monitors are unlocked and " - "enabled - candidates: %s, %s, %s") % - (num_monitors, constants.MIN_STOR_MONITORS, - constants.CONTROLLER_0_HOSTNAME, - constants.CONTROLLER_1_HOSTNAME, - constants.STORAGE_0_HOSTNAME)) + "enabled.") % + (num_monitors, constants.MIN_STOR_MONITORS)) # Check Ceph configuration, if it is wiped out (in the Backup & Restore # process) then restore the configuration. @@ -5528,11 +5552,8 @@ class HostController(rest.RestController): "monitor available. At least %s unlocked and " "enabled hosts with monitors are required. 
Please" " ensure hosts with monitors are unlocked and " - "enabled - candidates: %s, %s, %s") % - (num_monitors, constants.MIN_STOR_MONITORS, - constants.CONTROLLER_0_HOSTNAME, - constants.CONTROLLER_1_HOSTNAME, - constants.STORAGE_0_HOSTNAME)) + "enabled.") % + (num_monitors, constants.MIN_STOR_MONITORS)) storage_nodes = pecan.request.dbapi.ihost_get_by_personality( constants.STORAGE) @@ -5607,50 +5628,79 @@ class HostController(rest.RestController): def check_lock_worker(self, hostupdate, force=False): """Pre lock semantic checks for worker""" + hostname = hostupdate.ihost_patch.get('hostname') LOG.info("%s host check_lock_worker" % hostupdate.displayid) if force: + LOG.info("Forced lock of host: %s" % hostname) return - upgrade = None - try: - upgrade = pecan.request.dbapi.software_upgrade_get_one() - except exception.NotFound: - return - - upgrade_state = upgrade.state system = pecan.request.dbapi.isystem_get_one() system_mode = system.system_mode system_type = system.system_type - hostname = hostupdate.ihost_patch.get('hostname') if system_mode == constants.SYSTEM_MODE_SIMPLEX: return - if upgrade_state in [ - constants.UPGRADE_STARTING, - constants.UPGRADE_STARTED, - constants.UPGRADE_DATA_MIGRATION, - constants.UPGRADE_DATA_MIGRATION_COMPLETE, - constants.UPGRADE_DATA_MIGRATION_FAILED]: - if system_type == constants.TIS_AIO_BUILD: - if hostname == constants.CONTROLLER_1_HOSTNAME: - # Allow AIO-DX lock of controller-1 - return - raise wsme.exc.ClientSideError( - _("Rejected: Can not lock %s with worker function " - "at this upgrade stage '%s'.") % - (hostupdate.displayid, upgrade_state)) + # Check upgrade state for controllers with worker subfunction + subfunctions_set = \ + set(hostupdate.ihost_patch[constants.SUBFUNCTIONS].split(',')) + if (hostupdate.ihost_orig['personality'] == constants.CONTROLLER and + constants.WORKER in subfunctions_set): + upgrade = None + try: + upgrade = pecan.request.dbapi.software_upgrade_get_one() + upgrade_state = upgrade.state + except exception.NotFound: + upgrade_state = None - if upgrade_state in [constants.UPGRADE_UPGRADING_CONTROLLERS]: - if system_type == constants.TIS_AIO_BUILD: - # Allow lock for AIO-DX controller-0 after upgrading - # controller-1. Allow lock for AIO-DX controllers. - if hostname == constants.CONTROLLER_0_HOSTNAME: - return - raise wsme.exc.ClientSideError( - _("Rejected: Can not lock %s with worker function " - "at this upgrade stage '%s'.") % - (hostupdate.displayid, upgrade_state)) + if upgrade_state in [ + constants.UPGRADE_STARTING, + constants.UPGRADE_STARTED, + constants.UPGRADE_DATA_MIGRATION, + constants.UPGRADE_DATA_MIGRATION_COMPLETE, + constants.UPGRADE_DATA_MIGRATION_FAILED]: + if system_type == constants.TIS_AIO_BUILD: + if hostname == constants.CONTROLLER_1_HOSTNAME: + # Allow AIO-DX lock of controller-1 + return + raise wsme.exc.ClientSideError( + _("Rejected: Can not lock %s with worker function " + "at this upgrade stage '%s'.") % + (hostupdate.displayid, upgrade_state)) + + if upgrade_state in [constants.UPGRADE_UPGRADING_CONTROLLERS]: + if system_type == constants.TIS_AIO_BUILD: + # Allow lock for AIO-DX controller-0 after upgrading + # controller-1. Allow lock for AIO-DX controllers. + if hostname == constants.CONTROLLER_0_HOSTNAME: + return + raise wsme.exc.ClientSideError( + _("Rejected: Can not lock %s with worker function " + "at this upgrade stage '%s'.") % + (hostupdate.displayid, upgrade_state)) + + # Worker node with a Ceph Monitor service? 
Make sure at least + # two monitors will remain up after lock. + host_id = hostupdate.ihost_orig.get('id') + ceph_mon = pecan.request.dbapi.ceph_mon_get_by_ihost(host_id) + if ceph_mon: + if (hostupdate.ihost_orig['personality'] == + constants.WORKER and + hostupdate.ihost_orig['administrative'] == + constants.ADMIN_UNLOCKED and + hostupdate.ihost_orig['operational'] == + constants.OPERATIONAL_ENABLED): + num_monitors, required_monitors, quorum_names = \ + self._ceph.get_monitors_status(pecan.request.dbapi) + if (hostname in quorum_names and + num_monitors - 1 < required_monitors): + raise wsme.exc.ClientSideError(_( + "Only %d Ceph " + "monitors available. At least %s unlocked and " + "enabled hosts with monitors are required. " + "Please ensure hosts with monitors are " + "unlocked and enabled.") % + (num_monitors, constants.MIN_STOR_MONITORS)) def check_unlock_interfaces(self, hostupdate): """Semantic check for interfaces on host-unlock.""" @@ -6112,6 +6162,8 @@ class HostController(rest.RestController): LOG.info("%s _handle_unlock_action" % hostupdate.displayid) if hostupdate.ihost_patch.get('personality') == constants.STORAGE: self._handle_unlock_storage_host(hostupdate) + elif hostupdate.ihost_patch.get('personality') == constants.WORKER: + self._handle_unlock_worker_host(hostupdate) hostupdate.notify_vim_action = False hostupdate.notify_mtce = True val = {'ihost_action': constants.UNLOCK_ACTION} @@ -6121,6 +6173,14 @@ class HostController(rest.RestController): def _handle_unlock_storage_host(self, hostupdate): self._ceph.update_crushmap(hostupdate) + def _handle_unlock_worker_host(self, hostupdate): + # Update crushmap if we unlocked the worker with a ceph monitor. + monitor_list = pecan.request.dbapi.ceph_mon_get_list() + for mon in monitor_list: + ihost = pecan.request.dbapi.ihost_get(mon['forihostid']) + if ihost.id == hostupdate.ihost_orig['id']: + self._ceph.update_crushmap(hostupdate) + @staticmethod def _handle_lock_action(hostupdate): """Handle host-lock action.""" diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/storage.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/storage.py index 9fffa8f7e4..3bea50d834 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/storage.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/storage.py @@ -497,13 +497,11 @@ def _check_host(stor): if ihost['administrative'] != constants.ADMIN_LOCKED: raise wsme.exc.ClientSideError(_("Host must be locked")) - # semantic check: whether personality == storage or we have k8s AIO SX - is_k8s_aio = (utils.is_aio_system(pecan.request.dbapi) and - utils.is_kubernetes_config(pecan.request.dbapi)) - if not is_k8s_aio and ihost['personality'] != constants.STORAGE: - msg = ("Host personality must be 'storage' or kubernetes enabled " - "1 or 2 node system") - raise wsme.exc.ClientSideError(_(msg)) + # semantic check: only storage nodes are allowed without k8s + if (not utils.is_kubernetes_config(pecan.request.dbapi) and + ihost['personality'] != constants.STORAGE): + msg = ("Host personality must be 'storage' or kubernetes enabled.") + raise wsme.exc.ClientSideError(_(msg)) # semantic check: whether system has a ceph backend if not StorageBackendConfig.has_backend_configured( @@ -526,8 +524,24 @@ def _check_host(stor): "Only %d storage monitor available. " "At least %s unlocked and enabled hosts with monitors are " "required. 
Please ensure hosts with monitors are unlocked " - "and enabled - candidates: controller-0, controller-1, " - "storage-0") % (num_monitors, required_monitors)) + "and enabled.") % (num_monitors, required_monitors)) + + # semantic check: whether OSD can be added to this host. + stor_model = ceph.get_ceph_storage_model() + if stor_model == constants.CEPH_STORAGE_MODEL: + if ihost.personality != constants.STORAGE: + msg = ("Storage model is '%s'. Storage devices can only be added " + "to storage nodes." % stor_model) + raise wsme.exc.ClientSideError(_(msg)) + elif stor_model == constants.CEPH_CONTROLLER_MODEL: + if ihost.personality != constants.CONTROLLER: + msg = ("Storage model is '%s'. Storage devices can only be added " + "to controller nodes." % stor_model) + raise wsme.exc.ClientSideError(_(msg)) + elif stor_model == constants.CEPH_UNDEFINED_MODEL: + msg = ("Please install storage-0 or configure a Ceph monitor " + "on a worker node.") + raise wsme.exc.ClientSideError(_(msg)) def _check_disk(stor): diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/storage_ceph.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/storage_ceph.py index c24ab6313d..813864a530 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/storage_ceph.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/storage_ceph.py @@ -41,6 +41,7 @@ from sysinv.api.controllers.v1 import link from sysinv.api.controllers.v1 import types from sysinv.api.controllers.v1 import utils from sysinv.api.controllers.v1.utils import SBApiHelper as api_helper +from sysinv.common import ceph from sysinv.common import constants from sysinv.common import exception from sysinv.common import utils as cutils @@ -974,6 +975,14 @@ def _check_replication_number(new_cap, orig_cap): raise wsme.exc.ClientSideError( _("Can not modify ceph replication factor on " "two node configuration.")) + + if ceph.get_ceph_storage_model() == constants.CEPH_CONTROLLER_MODEL: + # Replication change is not allowed when storage OSDs + # are enabled on controllers. + raise wsme.exc.ClientSideError( + _("Can not modify replication factor on " + "'%s' ceph deployment model." % constants.CEPH_CONTROLLER_MODEL)) + # On a standard install we allow modifications of ceph storage # backend parameters after the manifests have been applied and # before first storage node has been configured. 
diff --git a/sysinv/sysinv/sysinv/sysinv/common/ceph.py b/sysinv/sysinv/sysinv/sysinv/common/ceph.py index 42641de1e6..da71c620a7 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/ceph.py +++ b/sysinv/sysinv/sysinv/sysinv/common/ceph.py @@ -645,21 +645,18 @@ class CephApiOperator(object): required_monitors = constants.MIN_STOR_MONITORS quorum_names = [] inventory_monitor_names = [] - ihosts = db_api.ihost_get_list() - for ihost in ihosts: - if ihost['personality'] == constants.WORKER: - continue - capabilities = ihost['capabilities'] - if 'stor_function' in capabilities: - host_action = ihost['ihost_action'] or "" - locking = (host_action.startswith(constants.LOCK_ACTION) or - host_action.startswith(constants.FORCE_LOCK_ACTION)) - if (capabilities['stor_function'] == constants.STOR_FUNCTION_MONITOR and - ihost['administrative'] == constants.ADMIN_UNLOCKED and - ihost['operational'] == constants.OPERATIONAL_ENABLED and - not locking): - num_inv_monitors += 1 - inventory_monitor_names.append(ihost['hostname']) + + monitor_list = db_api.ceph_mon_get_list() + for mon in monitor_list: + ihost = db_api.ihost_get(mon['forihostid']) + host_action = ihost['ihost_action'] or "" + locking = (host_action.startswith(constants.LOCK_ACTION) or + host_action.startswith(constants.FORCE_LOCK_ACTION)) + if (ihost['administrative'] == constants.ADMIN_UNLOCKED and + ihost['operational'] == constants.OPERATIONAL_ENABLED and + not locking): + num_inv_monitors += 1 + inventory_monitor_names.append(ihost['hostname']) LOG.info("Active ceph monitors in inventory = %s" % str(inventory_monitor_names)) @@ -702,15 +699,16 @@ class CephApiOperator(object): def fix_crushmap(dbapi=None): - # Crush Map: Replication of PGs across storage node pairs + """ Set Ceph's CRUSH Map based on storage model """ if not dbapi: dbapi = pecan.request.dbapi crushmap_flag_file = os.path.join(constants.SYSINV_CONFIG_PATH, constants.CEPH_CRUSH_MAP_APPLIED) if not os.path.isfile(crushmap_flag_file): - if utils.is_aio_simplex_system(dbapi): + stor_model = get_ceph_storage_model(dbapi) + if stor_model == constants.CEPH_AIO_SX_MODEL: crushmap_txt = "/etc/sysinv/crushmap-aio-sx.txt" - elif utils.is_aio_duplex_system(dbapi): + elif stor_model == constants.CEPH_CONTROLLER_MODEL: crushmap_txt = "/etc/sysinv/crushmap-controller-model.txt" else: crushmap_txt = "/etc/sysinv/crushmap-storage-model.txt" @@ -736,3 +734,59 @@ def fix_crushmap(dbapi=None): 'Reason: {}').format(crushmap_flag_file, e)) return True + + +def get_ceph_storage_model(dbapi=None): + + if not dbapi: + dbapi = pecan.request.dbapi + + if utils.is_aio_simplex_system(dbapi): + return constants.CEPH_AIO_SX_MODEL + + if utils.is_aio_duplex_system(dbapi): + return constants.CEPH_CONTROLLER_MODEL + + is_storage_model = False + is_controller_model = False + + monitor_list = dbapi.ceph_mon_get_list() + for mon in monitor_list: + ihost = dbapi.ihost_get(mon['forihostid']) + if ihost.personality == constants.WORKER: + # 3rd monitor is on a compute node, so OSDs are on controller + is_controller_model = True + elif ihost.personality == constants.STORAGE: + # 3rd monitor is on storage-0, so OSDs are also on storage nodes + is_storage_model = True + + # There are cases where we delete the monitor on worker node and have not + # yet assigned it to another worker. In this case check if any OSDs have + # been configured on controller nodes. 
+ + if not is_storage_model: + controller_hosts = dbapi.ihost_get_by_personality(constants.CONTROLLER) + for chost in controller_hosts: + istors = dbapi.istor_get_by_ihost(chost['uuid']) + if len(istors): + LOG.info("Controller host %s has OSDs configured. System has ceph " + "controller storage." % chost['hostname']) + is_controller_model = True + break + + if is_storage_model and is_controller_model: + # Both types should not be true at the same time, but we should log a + # message for debug purposes + # TODO(sdinescu): Improve error message + LOG.error("Wrong ceph storage type. Bad configuration.") + return constants.CEPH_STORAGE_MODEL + elif is_storage_model: + return constants.CEPH_STORAGE_MODEL + elif is_controller_model: + return constants.CEPH_CONTROLLER_MODEL + else: + # This case is for the install stage where the decision + # to configure OSDs on controller or storage nodes is not + # clear (before adding a monitor on a compute or before + # configuring the first storage node) + return constants.CEPH_UNDEFINED_MODEL diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index 8b2c9d9f0e..87e072f82f 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -434,6 +434,15 @@ SB_CEPH_MON_GIB_MAX = 40 SB_CONFIGURATION_TIMEOUT = 1200 +# Ceph storage deployment model +# Controller model: OSDs are on controllers, no storage nodes can +# be defined. +# Storage model: OSDs are on dedicated storage nodes. +CEPH_STORAGE_MODEL = 'storage' +CEPH_CONTROLLER_MODEL = 'controller' +CEPH_AIO_SX_MODEL = 'aio-sx' +CEPH_UNDEFINED_MODEL = 'undefined' + # Storage: Minimum number of monitors MIN_STOR_MONITORS = 2 @@ -779,6 +788,7 @@ CEPH_TARGET_PGS_PER_OSD = 200 # Dual node and Storage CEPH_REPLICATION_FACTOR_DEFAULT = 2 CEPH_REPLICATION_FACTOR_SUPPORTED = [2, 3] +CEPH_CONTROLLER_MODEL_REPLICATION_SUPPORTED = [2] # Single node AIO_SX_CEPH_REPLICATION_FACTOR_DEFAULT = 1 diff --git a/sysinv/sysinv/sysinv/sysinv/common/storage_backend_conf.py b/sysinv/sysinv/sysinv/sysinv/common/storage_backend_conf.py index 3a72a49376..877251028e 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/storage_backend_conf.py +++ b/sysinv/sysinv/sysinv/sysinv/common/storage_backend_conf.py @@ -142,7 +142,7 @@ class StorageBackendConfig(object): @staticmethod def has_backend_configured(dbapi, target, service=None, check_only_defaults=True, rpcapi=None): - """ Check is a backend is configured. """ + """ Check if a backend is configured. 
""" # If cinder is a shared service on another region and # we want to know if the ceph backend is configured, # send a rpc to conductor which sends a query to the primary diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index 091b062f0f..8fdbff9467 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -5664,7 +5664,6 @@ class ConductorManager(service.PeriodicService): # Update service table self.update_service_table_for_cinder() - # TODO(oponcea): Uncomment when SM supports in-service config reload ctrls = self.dbapi.ihost_get_by_personality(constants.CONTROLLER) valid_ctrls = [ctrl for ctrl in ctrls if ctrl.administrative == constants.ADMIN_UNLOCKED and @@ -5741,7 +5740,6 @@ class ConductorManager(service.PeriodicService): (node.administrative == constants.ADMIN_UNLOCKED and node.operational == constants.OPERATIONAL_ENABLED)] - # TODO: check what other puppet class need to be called config_dict = { "personalities": personalities, "host_uuids": [node.uuid for node in valid_nodes], diff --git a/sysinv/sysinv/sysinv/sysinv/tests/api/test_storage_tier.py b/sysinv/sysinv/sysinv/sysinv/tests/api/test_storage_tier.py index 016d320358..2aa0273f3d 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/api/test_storage_tier.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/api/test_storage_tier.py @@ -530,6 +530,7 @@ class StorageTierDependentTCs(base.FunctionalTest): self.system = dbutils.create_test_isystem() self.load = dbutils.create_test_load() self.host_index = -1 + self.mon_index = -1 def tearDown(self): super(StorageTierDependentTCs, self).tearDown() @@ -554,6 +555,17 @@ class StorageTierDependentTCs(base.FunctionalTest): invprovision='unprovisioned') return self.dbapi.ihost_create(ihost_dict) + def _create_storage_mon(self, hostname, ihost_id): + self.mon_index += 1 + ceph_mon_dict = dbutils.get_test_mon( + id=self.mon_index, + uuid=uuidutils.generate_uuid(), + state=constants.SB_STATE_CONFIGURED, + task=constants.SB_TASK_NONE, + forihostid=ihost_id, + hostname=hostname) + return self.dbapi.ceph_mon_create(ceph_mon_dict) + # # StorageTier with stors # @@ -567,6 +579,8 @@ class StorageTierDependentTCs(base.FunctionalTest): device_path='/dev/disk/by-path/pci-0000:00:0d.0-ata-2.0', forihostid=storage_0.id) + self._create_storage_mon('storage-0', storage_0['id']) + # Mock the fsid call so that we don't have to wait for the timeout with mock.patch.object(ceph.CephWrapper, 'fsid') as mock_fsid: mock_fsid.return_value = (mock.MagicMock(ok=False), None) diff --git a/sysinv/sysinv/sysinv/sysinv/tests/db/utils.py b/sysinv/sysinv/sysinv/sysinv/tests/db/utils.py index d77e4c98a9..5a33ef01ff 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/db/utils.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/db/utils.py @@ -439,6 +439,23 @@ def get_test_stor(**kw): return stor +def get_test_mon(**kw): + mon = { + 'id': kw.get('id', 2), + 'uuid': kw.get('uuid'), + + 'device_path': kw.get('device_path', ''), + 'ceph_mon_gib': kw.get('ceph_mon_gib', 20), + 'state': kw.get('state', 'configured'), + 'task': kw.get('task', None), + + 'forihostid': kw.get('forihostid', 0), + 'ihost_uuid': kw.get('ihost_uuid', '1be26c0b-03f2-4d2e-ae87-c02d7f33c781'), + 'hostname': kw.get('hostname', 'controller-0'), + } + return mon + + def get_test_lvg(**kw): lvg = { 'id': kw.get('id', 2),