From 6ef1e829fd2b1e5f5688c41376f0fad4c9240db6 Mon Sep 17 00:00:00 2001
From: Wei Zhou
Date: Fri, 21 Jun 2019 18:23:27 -0400
Subject: [PATCH] Restore containerized platform using Ansible restore_platform playbook

This commit supports platform restore for AIO-SX using the
restore_platform playbook:
1. During AIO-SX restore, the restored ceph crushmap is loaded
   through puppet.
2. Bypass vim when unlocking controller-0 for the first time.
3. When unlocking controller-0 for the first time, app_reapply is
   skipped for the stx-openstack application.
4. After controller-0 is unlocked, the ceph backend task is set to None.

Change-Id: I36d27b162334e5a2f0371793243f2301b5fec1eb
Story: 2004761
Task: 33645
Signed-off-by: Wei Zhou
---
 .../src/modules/platform/manifests/ceph.pp    |  5 ++
 .../src/modules/platform/manifests/config.pp  |  3 +-
 .../sysinv/sysinv/api/controllers/v1/host.py  | 35 ++++++------
 sysinv/sysinv/sysinv/sysinv/common/ceph.py    | 55 ++++++++++++++-----
 .../sysinv/sysinv/sysinv/common/constants.py  |  1 +
 sysinv/sysinv/sysinv/sysinv/conductor/ceph.py |  2 +
 6 files changed, 68 insertions(+), 33 deletions(-)

diff --git a/puppet-manifests/src/modules/platform/manifests/ceph.pp b/puppet-manifests/src/modules/platform/manifests/ceph.pp
index 256e3f24c0..769bd554ee 100644
--- a/puppet-manifests/src/modules/platform/manifests/ceph.pp
+++ b/puppet-manifests/src/modules/platform/manifests/ceph.pp
@@ -250,7 +250,12 @@ class platform::ceph::monitor
     $crushmap_txt = '/etc/sysinv/crushmap-aio-sx.txt'
   }
   $crushmap_bin = '/etc/sysinv/crushmap.bin'
+  $crushmap_bin_backup = '/etc/sysinv/crushmap.bin.backup'
   Ceph::Mon <| |>
+  -> exec { 'Copy crushmap if backup exists':
+    command => "mv -f ${crushmap_bin_backup} ${crushmap_bin}",
+    onlyif  => "test -f ${crushmap_bin_backup}",
+  }
   -> exec { 'Compile crushmap':
     command => "crushtool -c ${crushmap_txt} -o ${crushmap_bin}",
     onlyif  => "test ! -f ${crushmap_bin}",

diff --git a/puppet-manifests/src/modules/platform/manifests/config.pp b/puppet-manifests/src/modules/platform/manifests/config.pp
index 7c00ae4da1..623ed828f2 100644
--- a/puppet-manifests/src/modules/platform/manifests/config.pp
+++ b/puppet-manifests/src/modules/platform/manifests/config.pp
@@ -354,8 +354,7 @@ class platform::config::controller::post
     }
   }

-  if ! $::platform::params::controller_upgrade and
-      ! str2bool($::is_restore_in_progress) {
+  if ! $::platform::params::controller_upgrade {
     file { '/etc/platform/.initial_config_complete':
       ensure => present,
     }
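For orientation, the sketch below mirrors what the two puppet execs added above do on an AIO-SX node at first unlock: if the restore left a binary crushmap backup in /etc/sysinv, move it into place; otherwise compile the text crushmap once. The paths come from the manifest; the standalone-script form and the prepare_crushmap() name are illustrative only and are not part of this change.

#!/usr/bin/env python
"""Illustrative sketch of the 'Copy crushmap if backup exists' and
'Compile crushmap' execs in platform::ceph::monitor; not shipped code."""

import os
import shutil
import subprocess

CRUSHMAP_TXT = '/etc/sysinv/crushmap-aio-sx.txt'
CRUSHMAP_BIN = '/etc/sysinv/crushmap.bin'
CRUSHMAP_BIN_BACKUP = '/etc/sysinv/crushmap.bin.backup'


def prepare_crushmap():
    # Restore case: a backup binary crushmap was left behind, so reuse it
    # instead of compiling a fresh one (mirrors the "mv -f" exec).
    if os.path.isfile(CRUSHMAP_BIN_BACKUP):
        shutil.move(CRUSHMAP_BIN_BACKUP, CRUSHMAP_BIN)
    # Fresh-install case: compile the text crushmap once, matching the
    # "test ! -f" guard on the 'Compile crushmap' exec.
    if not os.path.isfile(CRUSHMAP_BIN):
        subprocess.check_call(
            ['crushtool', '-c', CRUSHMAP_TXT, '-o', CRUSHMAP_BIN])


if __name__ == '__main__':
    prepare_crushmap()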
diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
index 09df915aa3..d801a8b7ff 100644
--- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
+++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
@@ -2134,8 +2134,10 @@ class HostController(rest.RestController):
                 ihost_obj['uuid'],
                 ibm_msg_dict)

-        # Trigger a system app reapply if the host has been unlocked
-        if (patched_ihost.get('action') in
+        # Trigger a system app reapply if the host has been unlocked.
+        # Only trigger the reapply if a restore is not in progress.
+        if (not os.path.isfile(tsc.RESTORE_IN_PROGRESS_FLAG) and
+                patched_ihost.get('action') in
                 [constants.UNLOCK_ACTION, constants.FORCE_UNLOCK_ACTION]):
            self._reapply_system_app()

@@ -4681,8 +4683,8 @@ class HostController(rest.RestController):
        )

        if ihosts:
-            # For storage setup, no change is required.
-            LOG.info("This is a storage setup. No change.")
+            # TODO(Wei): Need to revisit storage setup.
+            LOG.info("This is a storage setup. Will need to revisit.")
            storage_enabled = 0
            for ihost in ihosts:
                if ihost.operational == constants.OPERATIONAL_ENABLED:
@@ -4699,18 +4701,16 @@ class HostController(rest.RestController):
                raise wsme.exc.ClientSideError(
                    _("Restore Ceph config failed: %s" % e))
        elif cutils.is_aio_system(pecan.request.dbapi):
-            # TODO(wz): Need more work to restore ceph for AIO
-            LOG.info("For an AIO system, Restore crushmap...")
-            try:
-                if not pecan.request.rpcapi.restore_ceph_config(
-                        pecan.request.context, after_storage_enabled=True):
-                    raise Exception("restore_ceph_config returned false")
-            except Exception as e:
-                raise wsme.exc.ClientSideError(
-                    _("Restore Ceph config failed: %s" % e))
-
+            # For AIO, the ceph config restore is done in puppet when the
+            # ceph manifest is applied on the first unlock. The
+            # initial_config_complete flag is set after the first unlock.
+            # Once one controller is up, the ceph cluster should be operational.
+            LOG.info("This is AIO-SX... Ceph backend task is RESTORE")
+            if cutils.is_initial_config_complete():
+                LOG.info("This is AIO-SX... clear ceph backend task to None")
+                api.storage_backend_update(backend.uuid, {'task': None})
        else:
-            # TODO(wz): Need more work to restore ceph for 2+2
+            # TODO(Wei): Need more work to restore ceph for 2+2
            pass

    @staticmethod
@@ -5057,11 +5057,12 @@ class HostController(rest.RestController):
            self.check_unlock_patching(hostupdate, force_unlock)

        hostupdate.configure_required = True
-        if (os.path.isfile(constants.ANSIBLE_BOOTSTRAP_FLAG) and
+        if ((os.path.isfile(constants.ANSIBLE_BOOTSTRAP_FLAG) or
+                os.path.isfile(tsc.RESTORE_IN_PROGRESS_FLAG)) and
                hostupdate.ihost_patch['hostname'] ==
                constants.CONTROLLER_0_HOSTNAME):
            # For the first unlock of the initial controller bootstrapped by
-            # Ansible, don't notify vim.
+            # Ansible or the first unlock during a restore, don't notify vim.
            hostupdate.notify_vim = False
        else:
            hostupdate.notify_vim = True
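As a rough summary of the host.py hunks above, the sketch below shows the unlock-time gating in isolation: either the Ansible bootstrap flag or the new restore-in-progress flag suppresses the vim notification for controller-0's first unlock, and the restore flag also suppresses the stx-openstack reapply. The flag paths, action strings, and should_* helper names are simplified stand-ins for constants.ANSIBLE_BOOTSTRAP_FLAG, tsc.RESTORE_IN_PROGRESS_FLAG, and the actual controller methods; they are not part of this change.

# Illustrative sketch only; the flag paths and helper names are hypothetical.
import os

ANSIBLE_BOOTSTRAP_FLAG = '/var/run/.ansible_bootstrap'        # stand-in path
RESTORE_IN_PROGRESS_FLAG = '/etc/platform/.restore_in_progress'  # stand-in path
CONTROLLER_0_HOSTNAME = 'controller-0'


def should_notify_vim(hostname):
    # The first unlock of controller-0 after an Ansible bootstrap *or*
    # during a platform restore bypasses vim (mirrors the hunk near 5057).
    first_unlock = (os.path.isfile(ANSIBLE_BOOTSTRAP_FLAG) or
                    os.path.isfile(RESTORE_IN_PROGRESS_FLAG))
    return not (first_unlock and hostname == CONTROLLER_0_HOSTNAME)


def should_reapply_system_app(action):
    # An unlock normally triggers a system app reapply, but never while a
    # restore is in progress (mirrors the hunk near 2134).
    if os.path.isfile(RESTORE_IN_PROGRESS_FLAG):
        return False
    return action in ('unlock', 'force-unlock')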
diff --git a/sysinv/sysinv/sysinv/sysinv/common/ceph.py b/sysinv/sysinv/sysinv/sysinv/common/ceph.py
index 00d3720996..b3c7f71c9a 100644
--- a/sysinv/sysinv/sysinv/sysinv/common/ceph.py
+++ b/sysinv/sysinv/sysinv/sysinv/common/ceph.py
@@ -12,6 +12,7 @@

 from __future__ import absolute_import

+import shutil
 import subprocess
 import os
 import pecan
@@ -709,7 +710,8 @@ def fix_crushmap(dbapi=None):
            LOG.info("Not enough monitors yet available to fix crushmap.")
            return False

-    # Crushmap may be already loaded thorough puppet, avoid doing it twice.
+    # For an AIO system, the crushmap should already be loaded through puppet.
+    # If it was loaded, set the crushmap flag to avoid loading it twice.
    default_ceph_tier_name = constants.SB_TIER_DEFAULT_NAMES[
        constants.SB_TIER_TYPE_CEPH] + constants.CEPH_CRUSH_TIER_SUFFIX
    rule_is_present, __, __ = _operator._crush_rule_status(default_ceph_tier_name)
@@ -717,28 +719,53 @@
        _create_crushmap_flag_file()
        return False

-    stor_model = get_ceph_storage_model(dbapi)
-    if stor_model == constants.CEPH_AIO_SX_MODEL:
-        crushmap_txt = "/etc/sysinv/crushmap-aio-sx.txt"
-    elif stor_model == constants.CEPH_CONTROLLER_MODEL:
-        crushmap_txt = "/etc/sysinv/crushmap-controller-model.txt"
-    else:
-        crushmap_txt = "/etc/sysinv/crushmap-storage-model.txt"
-    LOG.info("Updating crushmap with: %s" % crushmap_txt)
-    try:
-        # Compile crushmap
+    try:
+        # For an AIO system, the crushmap should already be loaded
+        # through puppet. If for any reason it is not, set it here
+        # as a precaution.
+
+        # Check if a backup crushmap exists. If it does, a restore
+        # is in progress; restore the backup crushmap instead of
+        # generating a new one. For an AIO system the backup is stored
+        # in /etc/sysinv; for a non-AIO system it is stored in
+        # /opt/platform/sysinv.
+        if cutils.is_aio_system(dbapi):
+            backup = os.path.join(constants.CEPH_CRUSH_MAP_BACKUP_DIR_FOR_AIO,
+                                  constants.CEPH_CRUSH_MAP_BACKUP)
+        else:
+            backup = os.path.join(constants.SYSINV_CONFIG_PATH,
+                                  constants.CEPH_CRUSH_MAP_BACKUP)
        crushmap_bin = "/etc/sysinv/crushmap.bin"
-        subprocess.check_output("crushtool -c %s "
-                                "-o %s" % (crushmap_txt, crushmap_bin),
+        if os.path.exists(backup):
+            shutil.copyfile(backup, crushmap_bin)
+        else:
+            stor_model = get_ceph_storage_model(dbapi)
+            if stor_model == constants.CEPH_AIO_SX_MODEL:
+                crushmap_txt = "/etc/sysinv/crushmap-aio-sx.txt"
+            elif stor_model == constants.CEPH_CONTROLLER_MODEL:
+                crushmap_txt = "/etc/sysinv/crushmap-controller-model.txt"
+            elif stor_model == constants.CEPH_STORAGE_MODEL:
+                crushmap_txt = "/etc/sysinv/crushmap-storage-model.txt"
+            else:
+                reason = "Error: Undefined ceph storage model %s" % stor_model
+                raise exception.CephCrushMapNotApplied(reason=reason)
+            LOG.info("Updating crushmap with: %s" % crushmap_txt)
+
+            # Compile crushmap
+            subprocess.check_output("crushtool -c %s "
+                                    "-o %s" % (crushmap_txt, crushmap_bin),
                                stderr=subprocess.STDOUT, shell=True)
        # Set crushmap
        subprocess.check_output("ceph osd setcrushmap -i %s" % crushmap_bin,
                                stderr=subprocess.STDOUT, shell=True)
-    except subprocess.CalledProcessError as e:
+
+        if os.path.exists(backup):
+            os.remove(backup)
+    except (IOError, subprocess.CalledProcessError) as e:
        # May not be critical, depends on where this is called.
        reason = "Error: %s Output: %s" % (str(e), e.output)
        raise exception.CephCrushMapNotApplied(reason=reason)
+
    _create_crushmap_flag_file()
    return True

diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py
index bc23793e31..0e733c8fa5 100644
--- a/sysinv/sysinv/sysinv/sysinv/common/constants.py
+++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py
@@ -856,6 +856,7 @@ CEPH_REPLICATION_GROUP0_HOSTS = {
 CEPH_MANAGER_RPC_TOPIC = "sysinv.ceph_manager"
 CEPH_MANAGER_RPC_VERSION = "1.0"

+CEPH_CRUSH_MAP_BACKUP_DIR_FOR_AIO = '/etc/sysinv'
 CEPH_CRUSH_MAP_BACKUP = 'crushmap.bin.backup'
 CEPH_CRUSH_MAP_APPLIED = '.crushmap_applied'
 CEPH_CRUSH_MAP_DEPTH = 3

diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/ceph.py b/sysinv/sysinv/sysinv/sysinv/conductor/ceph.py
index 9c51301760..7c128e8b6e 100644
--- a/sysinv/sysinv/sysinv/sysinv/conductor/ceph.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/ceph.py
@@ -348,6 +348,8 @@ class CephOperator(object):
                     'recognized as operational.')
            return False

+        # TODO(Wei): This function is not invoked during an AIO system restore.
+        # It will be revisited in the non-AIO system restore tasks.
        try:
            backup = os.path.join(constants.SYSINV_CONFIG_PATH,
                                  constants.CEPH_CRUSH_MAP_BACKUP)
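Finally, a condensed view of the restore path through fix_crushmap() as modified above, with the sysinv plumbing stripped out. This is the conductor-side safety net for the same decision the puppet manifest makes at first unlock: prefer a backed-up binary crushmap, otherwise compile the model-specific text crushmap. The directory values mirror CEPH_CRUSH_MAP_BACKUP_DIR_FOR_AIO and a simplified SYSINV_CONFIG_PATH; restore_or_build_crushmap() and the is_aio argument are illustrative names, and the real function additionally logs, raises CephCrushMapNotApplied on failure, and creates the .crushmap_applied flag.

# Condensed, illustrative sketch; names and simplified paths are not the real API.
import os
import shutil
import subprocess

CEPH_CRUSH_MAP_BACKUP = 'crushmap.bin.backup'
AIO_BACKUP_DIR = '/etc/sysinv'               # CEPH_CRUSH_MAP_BACKUP_DIR_FOR_AIO
NON_AIO_BACKUP_DIR = '/opt/platform/sysinv'  # simplified SYSINV_CONFIG_PATH
CRUSHMAP_BIN = '/etc/sysinv/crushmap.bin'


def restore_or_build_crushmap(is_aio, crushmap_txt):
    backup_dir = AIO_BACKUP_DIR if is_aio else NON_AIO_BACKUP_DIR
    backup = os.path.join(backup_dir, CEPH_CRUSH_MAP_BACKUP)

    if os.path.exists(backup):
        # Restore in progress: reuse the backed-up binary crushmap.
        shutil.copyfile(backup, CRUSHMAP_BIN)
    else:
        # Normal path: compile the model-specific text crushmap.
        subprocess.check_call(
            ['crushtool', '-c', crushmap_txt, '-o', CRUSHMAP_BIN])

    # Load the map into Ceph, then drop the backup so this happens only once.
    subprocess.check_call(['ceph', 'osd', 'setcrushmap', '-i', CRUSHMAP_BIN])
    if os.path.exists(backup):
        os.remove(backup)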