Platform restore for AIO-DX and Standard no-storage configuration

This commit adds support for platform restore of the AIO-DX and Standard
no-storage configurations using the restore_platform playbook:
 - For AIO-DX, the restored ceph crushmap is loaded through puppet
   when controller-0 is unlocked for the first time. OSDs are
   created on controller nodes during controller unlock.
 - For Standard no-storage configuration, the restored ceph crushmap
   is loaded through sysinv when ceph quorum is formed. OSDs are
   created on controller nodes by applying ceph osd runtime manifests.
 - The .restore_in_progress flag file is removed as part of the first
   unlock of controller-0.

Change-Id: I65bfc67cf90e894d125eb6c860139b26d17b562e
Story: 2004761
Task: 35965
Signed-off-by: Wei Zhou <wei.zhou@windriver.com>
Wei Zhou 2019-07-23 16:23:35 -04:00
parent 524c62c426
commit edb8206bf7
5 changed files with 107 additions and 34 deletions
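
For orientation, the host.py hunks below clear the ceph backend's RESTORE
task under a different condition for each system type. A minimal sketch of
those rules, with plain values standing in for what sysinv actually derives
from its database and from the ceph monitors (the names here are simplified
stand-ins, not sysinv APIs):

    # Condensed sketch of the task-clearing rules in the host.py hunks
    # below; all inputs are simplified stand-ins, not sysinv objects.
    def restore_task_can_be_cleared(system_type, initial_config_complete,
                                    enabled_controllers, total_controllers,
                                    active_mons, required_mons):
        if system_type == "aio-sx":
            # Puppet restores the ceph config on first unlock; wait for
            # initial configuration to complete.
            return initial_config_complete
        if system_type == "aio-dx":
            # The second OSD is created when controller-1 is unlocked;
            # wait until both controllers are enabled.
            return enabled_controllers == total_controllers
        # Standard no-storage: wait for enough ceph monitors; OSDs are
        # then created by applying runtime manifests.
        return active_mons >= required_mons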


@@ -1,5 +1,6 @@
 class platform::ceph::params(
   $service_enabled = false,
+  $skip_osds_during_restore = false,
   $cluster_uuid = undef,
   $cluster_name = 'ceph',
   $authentication_type = 'none',
@@ -375,31 +376,36 @@ class platform::ceph::osds(
   $journal_config = {},
 ) inherits ::platform::ceph::params {
-  file { '/var/lib/ceph/osd':
-    ensure => 'directory',
-    path   => '/var/lib/ceph/osd',
-    owner  => 'root',
-    group  => 'root',
-    mode   => '0755',
+  # skip_osds_during_restore is set to true when the default primary
+  # ceph backend "ceph-store" has "restore" as its task and it is
+  # not an AIO system.
+  if ! $skip_osds_during_restore {
+    file { '/var/lib/ceph/osd':
+      ensure => 'directory',
+      path   => '/var/lib/ceph/osd',
+      owner  => 'root',
+      group  => 'root',
+      mode   => '0755',
+    }
+    # Ensure ceph.conf is complete before configuring OSDs
+    Class['::ceph'] -> Platform_ceph_osd <| |>
+    # Journal disks need to be prepared before the OSDs are configured
+    Platform_ceph_journal <| |> -> Platform_ceph_osd <| |>
+    # Crush locations in ceph.conf need to be set before the OSDs are configured
+    Osd_crush_location <| |> -> Platform_ceph_osd <| |>
+    # default configuration for all ceph object resources
+    Ceph::Osd {
+      cluster => $cluster_name,
+      cluster_uuid => $cluster_uuid,
+    }
+    create_resources('osd_crush_location', $osd_config)
+    create_resources('platform_ceph_osd', $osd_config)
+    create_resources('platform_ceph_journal', $journal_config)
   }
-  # Ensure ceph.conf is complete before configuring OSDs
-  Class['::ceph'] -> Platform_ceph_osd <| |>
-  # Journal disks need to be prepared before the OSDs are configured
-  Platform_ceph_journal <| |> -> Platform_ceph_osd <| |>
-  # Crush locations in ceph.conf need to be set before the OSDs are configured
-  Osd_crush_location <| |> -> Platform_ceph_osd <| |>
-  # default configuration for all ceph object resources
-  Ceph::Osd {
-    cluster => $cluster_name,
-    cluster_uuid => $cluster_uuid,
-  }
-  create_resources('osd_crush_location', $osd_config)
-  create_resources('platform_ceph_osd', $osd_config)
-  create_resources('platform_ceph_journal', $journal_config)
 }
 
 class platform::ceph::haproxy


@@ -2202,6 +2202,15 @@ class HostController(rest.RestController):
         LOG.info("Update host memory for (%s)" % ihost_obj['hostname'])
         pecan.request.rpcapi.update_host_memory(pecan.request.context,
                                                 ihost_obj['uuid'])
+
+        # The restore_in_progress flag file is needed to bypass vim and
+        # application re-apply when issuing the first unlock command during
+        # restore. Once the command is accepted by mtce, it can be removed.
+        if (os.path.isfile(tsc.RESTORE_IN_PROGRESS_FLAG) and
+                patched_ihost.get('action') in
+                [constants.UNLOCK_ACTION, constants.FORCE_UNLOCK_ACTION]):
+            os.remove(tsc.RESTORE_IN_PROGRESS_FLAG)
+
         return Host.convert_with_links(ihost_obj)

     def _vim_host_add(self, ihost):
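
Stripped of the pecan plumbing, the flag handling added above reduces to the
following sketch. The flag location and action strings are illustrative
assumptions; the real code uses tsc.RESTORE_IN_PROGRESS_FLAG and the sysinv
action constants:

    import os

    # Sketch only: remove the restore flag once an unlock is accepted,
    # so vim and application re-apply are no longer bypassed.
    def clear_restore_flag(action, flag="/etc/platform/.restore_in_progress"):
        # The flag path above is an assumed example location for the
        # .restore_in_progress file named in the commit message.
        if action in ("unlock", "force-unlock") and os.path.isfile(flag):
            os.remove(flag)
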
@@ -4448,8 +4457,7 @@
         else:
             return False

-    @staticmethod
-    def _update_add_ceph_state():
+    def _update_add_ceph_state(self):

         api = pecan.request.dbapi
         backend = StorageBackendConfig.get_configuring_backend(api)
@@ -4556,18 +4564,63 @@
                 except Exception as e:
                     raise wsme.exc.ClientSideError(
                         _("Restore Ceph config failed: %s" % e))
-            elif cutils.is_aio_system(pecan.request.dbapi):
-                # For AIO, ceph config restore is done in puppet when ceph
+            elif cutils.is_aio_simplex_system(pecan.request.dbapi):
+                # For AIO-SX, ceph config restore is done in puppet when ceph
                 # manifest is applied on first unlock. The
                 # initial_config_complete flag is set after first unlock.
-                # Once one controller is up, ceph cluster should be operational.
+                # Once one controller is up, ceph cluster should be fully
+                # operational.
+                LOG.info("This is AIO-SX... Ceph backend task is RESTORE")
                 if cutils.is_initial_config_complete():
+                    LOG.info("This is AIO-SX... clear ceph backend task to None")
                     api.storage_backend_update(backend.uuid, {'task': None})
+            elif cutils.is_aio_duplex_system(pecan.request.dbapi):
+                # For AIO-DX, ceph config restore is done in puppet when ceph
+                # manifest is applied on first unlock. The 2nd osd is created
+                # in puppet when controller-1 is unlocked. Once both
+                # controllers are up, Ceph cluster should be fully operational.
+                LOG.info("This is AIO-DX... Ceph backend task is RESTORE")
+                c_hosts = api.ihost_get_by_personality(
+                    constants.CONTROLLER
+                )
+                ctlr_enabled = 0
+                for c_host in c_hosts:
+                    if c_host.operational == constants.OPERATIONAL_ENABLED:
+                        ctlr_enabled = ctlr_enabled + 1
+                if ctlr_enabled == len(c_hosts):
+                    LOG.info("This is AIO-DX... clear ceph backend task to None")
+                    api.storage_backend_update(backend.uuid, {'task': None})
             else:
-                # TODO(Wei): Need more work to restore ceph for 2+2
-                pass
+                # This is ceph restore for standard non-storage configuration.
+                # Ceph config restore is done via sysinv after both ceph
+                # monitors are available.
+                LOG.info("This is 2+2... Ceph backend task is RESTORE")
+                active_mons, required_mons, __ = \
+                    self._ceph.get_monitors_status(pecan.request.dbapi)
+                if required_mons > active_mons:
+                    LOG.info("Not enough monitors yet to restore ceph config.")
+                else:
+                    # By clearing ceph backend task to None osds will be
+                    # created thru applying runtime manifests.
+                    LOG.info("This is 2+2... clear ceph backend task to None")
+                    api.storage_backend_update(backend.uuid, {'task': None})
+                    # Apply runtime manifests to create OSDs on two controller
+                    # nodes.
+                    c_hosts = api.ihost_get_by_personality(
+                        constants.CONTROLLER)
+                    runtime_manifests = True
+                    for c_host in c_hosts:
+                        istors = pecan.request.dbapi.istor_get_by_ihost(c_host.uuid)
+                        for stor in istors:
+                            pecan.request.rpcapi.update_ceph_osd_config(
+                                pecan.request.context,
+                                c_host,
+                                stor.uuid,
+                                runtime_manifests)

     @staticmethod
     def update_ihost_action(action, hostupdate):
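
The standard no-storage branch above is the densest part of the change: it
waits for the ceph monitor quorum, clears the RESTORE task, then asks the
conductor to apply OSD runtime manifests for every stor on both controllers.
A condensed sketch with the RPC call replaced by a plain callback (the names
here are simplified stand-ins, not the sysinv API):

    # Sketch of the 2+2 restore path; apply_osd_manifest stands in for
    # rpcapi.update_ceph_osd_config(context, host, stor_uuid, True).
    def restore_standard_osds(active_mons, required_mons,
                              stors_by_controller, apply_osd_manifest):
        if required_mons > active_mons:
            # Not enough ceph monitors yet; retry on a later host update.
            return False
        # With the backend task cleared, OSDs are created through
        # runtime manifests applied per stor on each controller.
        for host, stor_uuids in stors_by_controller.items():
            for stor_uuid in stor_uuids:
                apply_osd_manifest(host, stor_uuid)
        return True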


@@ -735,9 +735,12 @@ def fix_crushmap(dbapi=None):
     # Check if a backup crushmap exists. If it does, that means
     # it is during restore. We need to restore the backup crushmap
-    # instead of generating it. For AIO system, the backup crushmap
-    # is stored in /etc/sysinv. For non-AIO system, it is stored in
-    # /opt/platform/sysinv.
+    # instead of generating it. For non-AIO systems it is stored in
+    # /opt/platform/sysinv, which is a drbd fs. For AIO systems the
+    # crushmap is set thru ceph puppet when controller-0 is unlocked
+    # for the first time, before /opt/platform is mounted, so we
+    # store the crushmap in /etc/sysinv.
     if cutils.is_aio_system(dbapi):
         backup = os.path.join(constants.CEPH_CRUSH_MAP_BACKUP_DIR_FOR_AIO,
                               constants.CEPH_CRUSH_MAP_BACKUP)
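
The comment above explains the split: non-AIO systems keep the backup
crushmap under /opt/platform/sysinv (a DRBD filesystem), while AIO systems
keep it in /etc/sysinv because /opt/platform is not yet mounted when puppet
applies the ceph manifest at first unlock. A minimal sketch of that path
selection; the backup file name and the exact non-AIO directory are
assumptions for illustration, since only CEPH_CRUSH_MAP_BACKUP_DIR_FOR_AIO
and CEPH_CRUSH_MAP_BACKUP appear in the hunk:

    import os

    # Sketch of the backup crushmap location choice described above.
    def crushmap_backup_path(is_aio_system,
                             backup_name="crushmap.bin.backup"):
        if is_aio_system:
            # /opt/platform is not mounted yet at first unlock, so AIO
            # keeps the backup on the local root filesystem.
            return os.path.join("/etc/sysinv", backup_name)
        # Non-AIO: the drbd-backed platform filesystem is available.
        return os.path.join("/opt/platform/sysinv", backup_name)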


@@ -2037,6 +2037,11 @@ def is_inventory_config_complete(dbapi, forihostid):
     return False


+def is_std_system(dbapi):
+    system = dbapi.isystem_get_one()
+    return system.system_type == constants.TIS_STD_BUILD
+
+
 def is_aio_system(dbapi):
     system = dbapi.isystem_get_one()
     return system.system_type == constants.TIS_AIO_BUILD
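
The new is_std_system() helper mirrors is_aio_system() and is what lets the
puppet plugin (last two hunks below) request that OSD creation be skipped
during a standard-configuration restore. The producer side reduces roughly
to this sketch; the hiera key is taken from the hunk, everything else is
simplified:

    # Sketch of how the helper feeds the puppet hiera data below.
    def ceph_restore_hiera(std_system, backend_task):
        skip_osds_during_restore = (std_system and
                                    backend_task == "restore")
        return {
            'platform::ceph::params::skip_osds_during_restore':
                skip_osds_during_restore,
        }

When this flag is true, the platform::ceph::osds class in the first hunk
skips the /var/lib/ceph/osd directory and all OSD resources, leaving OSD
creation to the runtime manifests applied by sysinv.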


@@ -83,6 +83,10 @@ class CephPuppet(openstack.OpenstackBasePuppet):
         ksuser = self._get_service_user_name(self.SERVICE_NAME_RGW)

+        skip_osds_during_restore = \
+            (utils.is_std_system(self.dbapi) and
+             ceph_backend.task == constants.SB_TASK_RESTORE)
+
         config = {
             'ceph::ms_bind_ipv6': ms_bind_ipv6,
@@ -112,6 +116,8 @@ class CephPuppet(openstack.OpenstackBasePuppet):
                 self._get_service_user_domain_name(),
             'platform::ceph::params::rgw_admin_project':
                 self._get_service_tenant_name(),
+            'platform::ceph::params::skip_osds_during_restore':
+                skip_osds_during_restore,
         }

         if utils.is_openstack_applied(self.dbapi):