#
# Copyright (c) 2014-2017 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
"""
Backup & Restore
"""
from __future__ import print_function
import copy
import filecmp
import fileinput
import os
import glob
import shutil
import stat
import subprocess
import tarfile
import tempfile
import textwrap
import time
from fm_api import constants as fm_constants
from fm_api import fm_api
from sysinv.common import constants as sysinv_constants
from controllerconfig.common import log
from controllerconfig.common import constants
from controllerconfig.common.exceptions import BackupFail
from controllerconfig.common.exceptions import RestoreFail
from controllerconfig.common.exceptions import KeystoneFail
from controllerconfig.common.exceptions import SysInvFail
from controllerconfig import openstack
import tsconfig.tsconfig as tsconfig
from controllerconfig import utils
from controllerconfig import sysinv_api as sysinv
from six.moves import input
LOG = log.get_logger(__name__)
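# Shared sink for discarding subprocess output throughout this module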
DEVNULL = open(os.devnull, 'w')
RESTORE_COMPLETE = "restore-complete"
RESTORE_RERUN_REQUIRED = "restore-rerun-required"
# Backup/restore related constants
backup_in_progress = tsconfig.BACKUP_IN_PROGRESS_FLAG
restore_in_progress = tsconfig.RESTORE_IN_PROGRESS_FLAG
restore_system_ready = tsconfig.RESTORE_SYSTEM_FLAG
restore_patching_complete = '/etc/platform/.restore_patching_complete'
node_is_patched = '/var/run/node_is_patched'
keyring_permdir = os.path.join('/opt/platform/.keyring', tsconfig.SW_VERSION)
ceph_permdir = os.path.join(tsconfig.CONFIG_PATH, 'ceph-config')
ldap_permdir = '/var/lib/openldap-data'
ceilometer_permdir = '/opt/cgcs/ceilometer/' + tsconfig.SW_VERSION
glance_permdir = '/opt/cgcs/glance'
patching_permdir = '/opt/patching'
patching_repo_permdir = '/www/pages/updates'
home_permdir = '/home'
cinder_permdir = '/opt/cgcs/cinder'
extension_permdir = '/opt/extension'
patch_vault_permdir = '/opt/patch-vault'
def get_backup_databases(cinder_config=False):
"""
Retrieve database lists for backup.
:return: backup_databases and backup_database_skip_tables
"""
# Databases common to all configurations
REGION_LOCAL_DATABASES = ('postgres', 'template1', 'nova', 'sysinv',
'neutron', 'heat', 'nova_api',
'aodh', 'murano', 'magnum', 'panko', 'ironic',
'nova_cell0', 'gnocchi', 'fm', 'barbican')
REGION_SHARED_DATABASES = ('glance', 'keystone')
if cinder_config:
REGION_SHARED_DATABASES += ('cinder', )
# Indicates which tables have to be dropped for a certain database.
DB_TABLE_SKIP_MAPPING = {
'fm': ('alarm',),
'gnocchi': ('metric', 'resource'),
'dcorch': ('orch_job',
'orch_request',
'resource',
'subcloud_resource'), }
if tsconfig.region_config == 'yes':
BACKUP_DATABASES = REGION_LOCAL_DATABASES
        # Add databases which are optional in secondary regions (and
        # subclouds)
shared_services = sysinv.get_shared_services()
for service_type in ["image", "volume"]:
if service_type not in shared_services:
service = 'glance' if service_type == "image" else 'cinder'
BACKUP_DATABASES += (service, )
else:
# Add additional databases for non-region configuration and for the
# primary region in region deployments.
BACKUP_DATABASES = REGION_LOCAL_DATABASES + REGION_SHARED_DATABASES
# Add distributed cloud databases
if tsconfig.distributed_cloud_role == \
sysinv_constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
BACKUP_DATABASES += ('dcmanager', 'dcorch')
    # Generate the set of tables to be skipped for each database in
    # BACKUP_DATABASES; skip tables are listed explicitly in
    # DB_TABLE_SKIP_MAPPING.
BACKUP_DB_SKIP_TABLES = dict(
[[x, DB_TABLE_SKIP_MAPPING.get(x, ())] for x in BACKUP_DATABASES])
return BACKUP_DATABASES, BACKUP_DB_SKIP_TABLES
def check_load_versions(archive, staging_dir):
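    """
    Verify that the build version in the backup archive matches the
    installed load, by comparing etc/build.info from the archive
    against /etc/build.info on disk.
    """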
match = False
try:
member = archive.getmember('etc/build.info')
archive.extract(member, path=staging_dir)
match = filecmp.cmp('/etc/build.info', staging_dir + '/etc/build.info')
shutil.rmtree(staging_dir + '/etc')
except Exception as e:
LOG.exception(e)
raise RestoreFail("Unable to verify load version in backup file. "
"Invalid backup file.")
if not match:
LOG.error("Load version mismatch.")
raise RestoreFail("Load version of backup does not match the "
"version of the installed load.")
def get_subfunctions(filename):
"""
Retrieves the subfunctions from a platform.conf file.
:param filename: file to retrieve subfunctions from
:return: a list of the subfunctions or None if no subfunctions exist
"""
matchstr = 'subfunction='
with open(filename, 'r') as f:
for line in f:
if matchstr in line:
parsed = line.split('=')
return parsed[1].rstrip().split(",")
return
def check_load_subfunctions(archive, staging_dir):
"""
Verify that the subfunctions in the backup match the installed load.
:param archive: backup archive
:param staging_dir: staging directory
    :raises RestoreFail: if the subfunctions do not match
"""
match = False
backup_subfunctions = None
try:
member = archive.getmember('etc/platform/platform.conf')
archive.extract(member, path=staging_dir)
backup_subfunctions = get_subfunctions(staging_dir +
'/etc/platform/platform.conf')
shutil.rmtree(staging_dir + '/etc')
if set(backup_subfunctions) ^ set(tsconfig.subfunctions):
            # The sets of subfunctions do not match
match = False
else:
match = True
except Exception:
LOG.exception("Unable to verify subfunctions in backup file")
raise RestoreFail("Unable to verify subfunctions in backup file. "
"Invalid backup file.")
if not match:
LOG.error("Subfunction mismatch - backup: %s, installed: %s" %
(str(backup_subfunctions), str(tsconfig.subfunctions)))
raise RestoreFail("Subfunctions in backup load (%s) do not match the "
"subfunctions of the installed load (%s)." %
(str(backup_subfunctions),
str(tsconfig.subfunctions)))
def file_exists_in_archive(archive, file_path):
""" Check if file exists in archive """
try:
archive.getmember(file_path)
return True
except KeyError:
LOG.info("File %s is not in archive." % file_path)
return False
def filter_directory(archive, directory):
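    """ Yield archive members located under the given top-level directory.
        Used as the members argument to TarFile.extractall(). """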
for tarinfo in archive:
if tarinfo.name.split('/')[0] == directory:
yield tarinfo
def backup_etc_size():
""" Backup etc size estimate """
try:
total_size = utils.directory_get_size('/etc')
return total_size
except OSError:
LOG.error("Failed to estimate backup etc size.")
raise BackupFail("Failed to estimate backup etc size")
def backup_etc(archive):
""" Backup etc """
try:
archive.add('/etc', arcname='etc')
except tarfile.TarError:
LOG.error("Failed to backup etc.")
raise BackupFail("Failed to backup etc")
def restore_etc_file(archive, dest_dir, etc_file):
""" Restore etc file """
try:
# Change the name of this file to remove the leading path
member = archive.getmember('etc/' + etc_file)
# Copy the member to avoid changing the name for future operations on
# this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member, path=dest_dir)
except tarfile.TarError:
LOG.error("Failed to restore etc file.")
raise RestoreFail("Failed to restore etc file")
def restore_etc_ssl_dir(archive, configpath=constants.CONFIG_WORKDIR):
""" Restore the etc SSL dir """
def filter_etc_ssl_private(members):
for tarinfo in members:
if 'etc/ssl/private' in tarinfo.name:
yield tarinfo
if file_exists_in_archive(archive, 'config/server-cert.pem'):
restore_config_file(
archive, configpath, 'server-cert.pem')
if file_exists_in_archive(archive, 'etc/ssl/private'):
# NOTE: This will include all TPM certificate files if TPM was
# enabled on the backed up system. However in that case, this
# restoration is only done for the first controller and TPM
# will need to be reconfigured once duplex controller (if any)
# is restored.
archive.extractall(path='/',
members=filter_etc_ssl_private(archive))
def restore_ceph_external_config_files(archive, staging_dir):
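    """ Restore the external ceph configuration files from the archive """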
# Restore ceph-config.
if file_exists_in_archive(archive, "config/ceph-config"):
restore_config_dir(archive, staging_dir, 'ceph-config', ceph_permdir)
# Copy the file to /etc/ceph.
# There might be no files to copy, so don't check the return code.
cp_command = ('cp -Rp ' + os.path.join(ceph_permdir, '*') +
' /etc/ceph/')
subprocess.call(cp_command, shell=True)
def backup_config_size(config_permdir):
""" Backup configuration size estimate """
try:
        return utils.directory_get_size(config_permdir)
except OSError:
LOG.error("Failed to estimate backup configuration size.")
raise BackupFail("Failed to estimate backup configuration size")
def backup_config(archive, config_permdir):
""" Backup configuration """
try:
# The config dir is versioned, but we're only grabbing the current
# release
archive.add(config_permdir, arcname='config')
except tarfile.TarError:
LOG.error("Failed to backup config.")
raise BackupFail("Failed to backup configuration")
def restore_config_file(archive, dest_dir, config_file):
""" Restore configuration file """
try:
# Change the name of this file to remove the leading path
member = archive.getmember('config/' + config_file)
# Copy the member to avoid changing the name for future operations on
# this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member, path=dest_dir)
except tarfile.TarError:
LOG.error("Failed to restore config file %s." % config_file)
raise RestoreFail("Failed to restore configuration")
def restore_configuration(archive, staging_dir):
""" Restore configuration """
try:
os.makedirs(constants.CONFIG_WORKDIR, stat.S_IRWXU | stat.S_IRGRP |
stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
except OSError:
LOG.error("Failed to create config directory: %s",
constants.CONFIG_WORKDIR)
raise RestoreFail("Failed to restore configuration files")
# Restore cgcs_config file from original installation for historical
# purposes. Not used to restore the system as the information in this
# file is out of date (not updated after original installation).
restore_config_file(archive, constants.CONFIG_WORKDIR, 'cgcs_config')
# Restore platform.conf file and update as necessary. The file will be
# created in a temporary location and then moved into place when it is
# complete to prevent access to a partially created file.
restore_etc_file(archive, staging_dir, 'platform/platform.conf')
temp_platform_conf_file = os.path.join(tsconfig.PLATFORM_CONF_PATH,
'platform.conf.temp')
shutil.copyfile(os.path.join(staging_dir, 'platform.conf'),
temp_platform_conf_file)
install_uuid = utils.get_install_uuid()
for line in fileinput.FileInput(temp_platform_conf_file, inplace=1):
if line.startswith("INSTALL_UUID="):
# The INSTALL_UUID must be updated to match the new INSTALL_UUID
# which was generated when this controller was installed prior to
# doing the restore.
print("INSTALL_UUID=%s" % install_uuid)
elif line.startswith("management_interface=") or \
line.startswith("oam_interface=") or \
line.startswith("infrastructure_interface=") or \
line.startswith("UUID="):
# Strip out any entries that are host specific as the backup can
# be done on either controller. The application of the
# platform_conf manifest will add these back in.
pass
else:
print(line, end='')
fileinput.close()
# Move updated platform.conf file into place.
os.rename(temp_platform_conf_file, tsconfig.PLATFORM_CONF_FILE)
# Kick tsconfig to reload the platform.conf file
tsconfig._load()
# Restore branding
restore_config_dir(archive, staging_dir, 'branding', '/opt/branding/')
# Restore banner customization
restore_config_dir(archive, staging_dir, 'banner/etc', '/opt/banner')
# Restore ssh configuration
restore_config_dir(archive, staging_dir, 'ssh_config',
constants.CONFIG_WORKDIR + '/ssh_config')
# Configure hostname
utils.configure_hostname('controller-0')
# Restore hosts file
restore_etc_file(archive, '/etc', 'hosts')
restore_etc_file(archive, constants.CONFIG_WORKDIR, 'hosts')
# Restore certificate files
restore_etc_ssl_dir(archive)
# Restore firewall rules file if it is in the archive
if file_exists_in_archive(archive, 'config/iptables.rules'):
restore_config_file(
archive, constants.CONFIG_WORKDIR, 'iptables.rules')
restore_etc_file(archive, tsconfig.PLATFORM_CONF_PATH,
'platform/iptables.rules')
def filter_pxelinux(archive):
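    """ Yield archive members located under config/pxelinux.cfg """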
for tarinfo in archive:
if tarinfo.name.find('config/pxelinux.cfg') == 0:
yield tarinfo
def restore_dnsmasq(archive, config_permdir):
""" Restore dnsmasq """
try:
etc_files = ['hosts']
perm_files = ['hosts',
'dnsmasq.hosts', 'dnsmasq.leases',
'dnsmasq.addn_hosts']
for etc_file in etc_files:
restore_config_file(archive, '/etc', etc_file)
for perm_file in perm_files:
restore_config_file(archive, config_permdir, perm_file)
# Extract distributed cloud addn_hosts file if present in archive.
if file_exists_in_archive(
archive, 'config/dnsmasq.addn_hosts_dc'):
restore_config_file(archive, config_permdir,
'dnsmasq.addn_hosts_dc')
tmpdir = tempfile.mkdtemp(prefix="pxerestore_")
archive.extractall(tmpdir,
members=filter_pxelinux(archive))
if os.path.exists(tmpdir + '/config/pxelinux.cfg'):
shutil.rmtree(config_permdir + 'pxelinux.cfg', ignore_errors=True)
shutil.move(tmpdir + '/config/pxelinux.cfg', config_permdir)
shutil.rmtree(tmpdir, ignore_errors=True)
except (shutil.Error, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to restore dnsmasq config.")
raise RestoreFail("Failed to restore dnsmasq files")
def backup_puppet_data_size(puppet_permdir):
""" Backup puppet data size estimate """
try:
        return utils.directory_get_size(puppet_permdir)
except OSError:
LOG.error("Failed to estimate backup puppet data size.")
raise BackupFail("Failed to estimate backup puppet data size")
def backup_puppet_data(archive, puppet_permdir):
""" Backup puppet data """
try:
# The puppet dir is versioned, but we're only grabbing the current
# release
archive.add(puppet_permdir, arcname='hieradata')
except tarfile.TarError:
LOG.error("Failed to backup puppet data.")
raise BackupFail("Failed to backup puppet data")
def restore_static_puppet_data(archive, puppet_workdir):
""" Restore static puppet data """
try:
member = archive.getmember('hieradata/static.yaml')
archive.extract(member, path=os.path.dirname(puppet_workdir))
member = archive.getmember('hieradata/secure_static.yaml')
archive.extract(member, path=os.path.dirname(puppet_workdir))
except tarfile.TarError:
LOG.error("Failed to restore static puppet data.")
raise RestoreFail("Failed to restore static puppet data")
except OSError:
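        # Assumption: an OSError here is benign (e.g. the files are
        # already in place) and should not abort the restore.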
pass
def restore_puppet_data(archive, puppet_workdir):
""" Restore puppet data """
try:
archive.extractall(
path=os.path.dirname(puppet_workdir),
members=filter_directory(archive,
os.path.basename(puppet_workdir)))
except tarfile.TarError:
LOG.error("Failed to restore puppet data.")
raise RestoreFail("Failed to restore puppet data")
except OSError:
pass
def backup_cinder_config(archive):
""" Backup cinder configuration """
# If the iscsi target config file exists, add it to the archive
# On setups without LVM backends this file is absent
if os.path.exists(cinder_permdir + '/iscsi-target/saveconfig.json'):
archive.add(
cinder_permdir + '/iscsi-target/saveconfig.json',
arcname='cinder/saveconfig.json')
def restore_cinder_file(archive, dest_dir, cinder_file):
""" Restore cinder file """
try:
# Change the name of this file to remove the leading path
member = archive.getmember('cinder/' + cinder_file)
# Copy the member to avoid changing the name for future operations on
# this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member, path=dest_dir)
except tarfile.TarError:
LOG.error("Failed to restore cinder file %s." % cinder_file)
raise RestoreFail("Failed to restore configuration")
def restore_cinder_config(archive):
"""Restore cinder config files"""
# If the iscsi target config file is present in the archive,
# restore it.
if file_exists_in_archive(archive, 'cinder/saveconfig.json'):
restore_cinder_file(
archive, cinder_permdir + '/iscsi-target',
'saveconfig.json')
# Also create a copy of the original file as the volume
# restore procedure changes this file and breaks the
# valid nova settings.
shutil.copyfile(
cinder_permdir + '/iscsi-target/saveconfig.json',
cinder_permdir + '/iscsi-target/saveconfig.json.bck')
def backup_cinder_size(cinder_permdir):
""" Backup cinder size estimate """
try:
if not os.path.exists(
cinder_permdir + '/iscsi-target/saveconfig.json'):
return 0
statinfo = os.stat(cinder_permdir + '/iscsi-target/saveconfig.json')
return statinfo.st_size
except OSError:
LOG.error("Failed to estimate backup cinder size.")
raise BackupFail("Failed to estimate backup cinder size")
def backup_keyring_size(keyring_permdir):
""" Backup keyring size estimate """
try:
        return utils.directory_get_size(keyring_permdir)
except OSError:
LOG.error("Failed to estimate backup keyring size.")
raise BackupFail("Failed to estimate backup keyring size")
def backup_keyring(archive, keyring_permdir):
""" Backup keyring configuration """
try:
archive.add(keyring_permdir, arcname='.keyring')
except tarfile.TarError:
LOG.error("Failed to backup keyring.")
raise BackupFail("Failed to backup keyring configuration")
def restore_keyring(archive, keyring_permdir):
""" Restore keyring configuration """
try:
shutil.rmtree(keyring_permdir, ignore_errors=False)
members = filter_directory(archive, '.keyring')
temp_members = list()
        # Strip the leading '.keyring/' prefix from each member path, since
        # members are extracted directly into keyring_permdir:
        # /opt/platform/.keyring/<release>
for m in members:
temp_member = copy.copy(m)
lst = temp_member.name.split('.keyring/')
if len(lst) > 1:
temp_member.name = lst[1]
temp_members.append(temp_member)
archive.extractall(path=keyring_permdir, members=temp_members)
except (tarfile.TarError, shutil.Error):
LOG.error("Failed to restore keyring.")
shutil.rmtree(keyring_permdir, ignore_errors=True)
raise RestoreFail("Failed to restore keyring configuration")
def prefetch_keyring(archive):
""" Prefetch keyring configuration for manifest use """
keyring_tmpdir = '/tmp/.keyring'
python_keyring_tmpdir = '/tmp/python_keyring'
try:
shutil.rmtree(keyring_tmpdir, ignore_errors=True)
shutil.rmtree(python_keyring_tmpdir, ignore_errors=True)
archive.extractall(
path=os.path.dirname(keyring_tmpdir),
members=filter_directory(archive,
os.path.basename(keyring_tmpdir)))
shutil.move(keyring_tmpdir + '/python_keyring', python_keyring_tmpdir)
except (tarfile.TarError, shutil.Error):
LOG.error("Failed to restore keyring.")
shutil.rmtree(keyring_tmpdir, ignore_errors=True)
shutil.rmtree(python_keyring_tmpdir, ignore_errors=True)
raise RestoreFail("Failed to restore keyring configuration")
def cleanup_prefetched_keyring():
""" Cleanup fetched keyring """
try:
keyring_tmpdir = '/tmp/.keyring'
python_keyring_tmpdir = '/tmp/python_keyring'
shutil.rmtree(keyring_tmpdir, ignore_errors=True)
shutil.rmtree(python_keyring_tmpdir, ignore_errors=True)
except shutil.Error:
LOG.error("Failed to cleanup keyring.")
raise RestoreFail("Failed to cleanup fetched keyring")
def backup_ldap_size():
""" Backup ldap size estimate """
try:
total_size = 0
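        # Estimate the dump size by piping a slapcat dump of the ldap
        # database through 'wc -c' instead of writing it to disk.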
proc = subprocess.Popen(
['slapcat -d 0 -F /etc/openldap/schema | wc -c'],
shell=True, stdout=subprocess.PIPE)
for line in proc.stdout:
total_size = int(line)
break
proc.communicate()
return total_size
except subprocess.CalledProcessError:
LOG.error("Failed to estimate backup ldap size.")
raise BackupFail("Failed to estimate backup ldap size")
def backup_ldap(archive, staging_dir):
""" Backup ldap configuration """
try:
ldap_staging_dir = staging_dir + '/ldap'
os.mkdir(ldap_staging_dir, 0o655)
subprocess.check_call([
'slapcat', '-d', '0', '-F', '/etc/openldap/schema',
'-l', (ldap_staging_dir + '/ldap.db')], stdout=DEVNULL)
archive.add(ldap_staging_dir + '/ldap.db', arcname='ldap.db')
except (OSError, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to backup ldap database.")
raise BackupFail("Failed to backup ldap configuration")
def restore_ldap(archive, ldap_permdir, staging_dir):
""" Restore ldap configuration """
try:
ldap_staging_dir = staging_dir + '/ldap'
archive.extract('ldap.db', path=ldap_staging_dir)
utils.stop_lsb_service('openldap')
subprocess.call(['rm', '-rf', ldap_permdir], stdout=DEVNULL)
os.mkdir(ldap_permdir, 0o755)
subprocess.check_call(['slapadd', '-F', '/etc/openldap/schema',
'-l', ldap_staging_dir + '/ldap.db'],
stdout=DEVNULL, stderr=DEVNULL)
except (subprocess.CalledProcessError, OSError, tarfile.TarError):
LOG.error("Failed to restore ldap database.")
raise RestoreFail("Failed to restore ldap configuration")
finally:
utils.start_lsb_service('openldap')
def backup_postgres_size(cinder_config=False):
""" Backup postgres size estimate """
try:
total_size = 0
# Backup roles, table spaces and schemas for databases.
proc = subprocess.Popen([('sudo -u postgres pg_dumpall --clean ' +
'--schema-only | wc -c')], shell=True,
stdout=subprocess.PIPE, stderr=DEVNULL)
for line in proc.stdout:
total_size = int(line)
break
proc.communicate()
# get backup database
backup_databases, backup_db_skip_tables = get_backup_databases(
cinder_config)
# Backup data for databases.
for _, db_elem in enumerate(backup_databases):
db_cmd = 'sudo -u postgres pg_dump --format=plain --inserts '
db_cmd += '--disable-triggers --data-only %s ' % db_elem
for _, table_elem in enumerate(backup_db_skip_tables[db_elem]):
db_cmd += '--exclude-table=%s ' % table_elem
db_cmd += '| wc -c'
proc = subprocess.Popen([db_cmd], shell=True,
stdout=subprocess.PIPE, stderr=DEVNULL)
for line in proc.stdout:
total_size += int(line)
break
proc.communicate()
return total_size
except subprocess.CalledProcessError:
LOG.error("Failed to estimate backup database size.")
raise BackupFail("Failed to estimate backup database size")
def backup_postgres(archive, staging_dir, cinder_config=False):
""" Backup postgres configuration """
try:
postgres_staging_dir = staging_dir + '/postgres'
os.mkdir(postgres_staging_dir, 0o655)
# Backup roles, table spaces and schemas for databases.
subprocess.check_call([('sudo -u postgres pg_dumpall --clean ' +
'--schema-only' +
'> %s/%s' % (postgres_staging_dir,
'postgres.sql.config'))],
shell=True, stderr=DEVNULL)
# get backup database
backup_databases, backup_db_skip_tables = get_backup_databases(
cinder_config)
# Backup data for databases.
for _, db_elem in enumerate(backup_databases):
db_cmd = 'sudo -u postgres pg_dump --format=plain --inserts '
db_cmd += '--disable-triggers --data-only %s ' % db_elem
for _, table_elem in enumerate(backup_db_skip_tables[db_elem]):
db_cmd += '--exclude-table=%s ' % table_elem
db_cmd += '> %s/%s.sql.data' % (postgres_staging_dir, db_elem)
subprocess.check_call([db_cmd], shell=True, stderr=DEVNULL)
archive.add(postgres_staging_dir, arcname='postgres')
except (OSError, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to backup postgres databases.")
raise BackupFail("Failed to backup database configuration")
def restore_postgres(archive, staging_dir):
""" Restore postgres configuration """
try:
postgres_staging_dir = staging_dir + '/postgres'
archive.extractall(path=staging_dir,
members=filter_directory(archive, 'postgres'))
utils.start_service("postgresql")
# Restore roles, table spaces and schemas for databases.
subprocess.check_call(["sudo", "-u", "postgres", "psql", "-f",
postgres_staging_dir +
'/postgres.sql.config', "postgres"],
stdout=DEVNULL, stderr=DEVNULL)
# Restore data for databases.
for data in glob.glob(postgres_staging_dir + '/*.sql.data'):
db_elem = data.split('/')[-1].split('.')[0]
subprocess.check_call(["sudo", "-u", "postgres", "psql", "-f",
data, db_elem],
stdout=DEVNULL)
except (OSError, subprocess.CalledProcessError, tarfile.TarError) as e:
LOG.error("Failed to restore postgres databases. Error: %s", e)
raise RestoreFail("Failed to restore database configuration")
finally:
utils.stop_service('postgresql')
def backup_ceilometer_size(ceilometer_permdir):
""" Backup ceilometer size estimate """
try:
statinfo = os.stat(ceilometer_permdir + '/pipeline.yaml')
return statinfo.st_size
except OSError:
LOG.error("Failed to estimate backup ceilometer size.")
raise BackupFail("Failed to estimate backup ceilometer size")
def backup_ceilometer(archive, ceilometer_permdir):
""" Backup ceilometer """
try:
archive.add(ceilometer_permdir + '/pipeline.yaml',
arcname='pipeline.yaml')
except tarfile.TarError:
LOG.error("Failed to backup ceilometer.")
raise BackupFail("Failed to backup ceilometer")
def restore_ceilometer(archive, ceilometer_permdir):
""" Restore ceilometer """
try:
archive.extract('pipeline.yaml', path=ceilometer_permdir)
except tarfile.TarError:
LOG.error("Failed to restore ceilometer")
raise RestoreFail("Failed to restore ceilometer")
def filter_config_dir(archive, directory):
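    """ Yield archive members located under config/<directory> """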
for tarinfo in archive:
if tarinfo.name.find('config/' + directory) == 0:
yield tarinfo
def restore_config_dir(archive, staging_dir, config_dir, dest_dir):
""" Restore configuration directory if it exists """
try:
archive.extractall(staging_dir,
members=filter_config_dir(archive, config_dir))
# Copy files from backup to dest dir
if (os.path.exists(staging_dir + '/config/' + config_dir) and
os.listdir(staging_dir + '/config/' + config_dir)):
subprocess.call(["mkdir", "-p", dest_dir])
try:
for f in glob.glob(
staging_dir + '/config/' + config_dir + '/*'):
subprocess.check_call(["cp", "-p", f, dest_dir])
except IOError:
LOG.warning("Failed to copy %s files" % config_dir)
except (subprocess.CalledProcessError, tarfile.TarError):
LOG.info("No custom %s config was found during restore." % config_dir)
def backup_std_dir_size(directory):
""" Backup standard directory size estimate """
try:
return utils.directory_get_size(directory)
except OSError:
LOG.error("Failed to estimate backup size for %s" % directory)
raise BackupFail("Failed to estimate backup size for %s" % directory)
def backup_std_dir(archive, directory):
""" Backup standard directory """
try:
archive.add(directory, arcname=os.path.basename(directory))
except tarfile.TarError:
LOG.error("Failed to backup %s" % directory)
raise BackupFail("Failed to backup %s" % directory)
def restore_std_dir(archive, directory):
""" Restore standard directory """
try:
shutil.rmtree(directory, ignore_errors=True)
# Verify that archive contains this directory
try:
archive.getmember(os.path.basename(directory))
except KeyError:
LOG.error("Archive does not contain directory %s" % directory)
raise RestoreFail("Invalid backup file - missing directory %s" %
directory)
archive.extractall(
path=os.path.dirname(directory),
members=filter_directory(archive, os.path.basename(directory)))
except (shutil.Error, tarfile.TarError):
LOG.error("Failed to restore %s" % directory)
raise RestoreFail("Failed to restore %s" % directory)
def configure_loopback_interface(archive):
""" Restore and apply configuration for loopback interface """
utils.remove_interface_config_files()
restore_etc_file(
archive, utils.NETWORK_SCRIPTS_PATH,
'sysconfig/network-scripts/' + utils.NETWORK_SCRIPTS_LOOPBACK)
utils.restart_networking()
def backup_ceph_crush_map(archive, staging_dir):
""" Backup ceph crush map """
try:
ceph_staging_dir = os.path.join(staging_dir, 'ceph')
os.mkdir(ceph_staging_dir, 0o655)
crushmap_file = os.path.join(ceph_staging_dir,
sysinv_constants.CEPH_CRUSH_MAP_BACKUP)
subprocess.check_call(['ceph', 'osd', 'getcrushmap',
'-o', crushmap_file], stdout=DEVNULL,
stderr=DEVNULL)
archive.add(crushmap_file, arcname='ceph/' +
sysinv_constants.CEPH_CRUSH_MAP_BACKUP)
except Exception as e:
LOG.error('Failed to backup ceph crush map. Reason: {}'.format(e))
raise BackupFail('Failed to backup ceph crush map')
def restore_ceph_crush_map(archive):
""" Restore ceph crush map """
if not file_exists_in_archive(archive, 'ceph/' +
sysinv_constants.CEPH_CRUSH_MAP_BACKUP):
return
try:
crush_map_file = 'ceph/' + sysinv_constants.CEPH_CRUSH_MAP_BACKUP
if file_exists_in_archive(archive, crush_map_file):
member = archive.getmember(crush_map_file)
# Copy the member to avoid changing the name for future
# operations on this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member,
path=sysinv_constants.SYSINV_CONFIG_PATH)
except tarfile.TarError as e:
LOG.error('Failed to restore crush map file. Reason: {}'.format(e))
raise RestoreFail('Failed to restore crush map file')
def check_size(archive_dir, cinder_config):
"""Check if there is enough space to create backup."""
backup_overhead_bytes = 1024 ** 3 # extra GB for staging directory
# backup_cinder_size() will return 0 if cinder/lvm is not configured,
# So no need to add extra check here.
backup_size = (backup_overhead_bytes +
backup_etc_size() +
backup_config_size(tsconfig.CONFIG_PATH) +
backup_puppet_data_size(constants.HIERADATA_PERMDIR) +
backup_keyring_size(keyring_permdir) +
backup_ldap_size() +
backup_postgres_size(cinder_config) +
backup_ceilometer_size(ceilometer_permdir) +
backup_std_dir_size(glance_permdir) +
backup_std_dir_size(home_permdir) +
backup_std_dir_size(patching_permdir) +
backup_std_dir_size(patching_repo_permdir) +
backup_std_dir_size(extension_permdir) +
backup_std_dir_size(patch_vault_permdir) +
backup_cinder_size(cinder_permdir)
)
archive_dir_free_space = \
utils.filesystem_get_free_space(archive_dir)
if backup_size > archive_dir_free_space:
print("Archive directory (%s) does not have enough free "
"space (%s), estimated backup size is %s." %
(archive_dir, utils.print_bytes(archive_dir_free_space),
utils.print_bytes(backup_size)))
raise BackupFail("Not enough free space for backup.")
def backup(backup_name, archive_dir, clone=False):
"""Backup configuration."""
if not os.path.isdir(archive_dir):
raise BackupFail("Archive directory (%s) not found." % archive_dir)
if not utils.is_active("management-ip"):
raise BackupFail(
"Backups can only be performed from the active controller.")
if os.path.isfile(backup_in_progress):
raise BackupFail("Backup already in progress.")
else:
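        # Create the backup-in-progress flag; it is removed in the
        # finally block once the backup finishes.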
open(backup_in_progress, 'w')
fmApi = fm_api.FaultAPIs()
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
sysinv_constants.CONTROLLER_HOSTNAME)
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
reason_text=("System Backup in progress."),
# operational
alarm_type=fm_constants.FM_ALARM_TYPE_7,
# congestion
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_8,
proposed_repair_action=("No action required."),
service_affecting=False)
fmApi.set_fault(fault)
cinder_config = False
backend_services = sysinv.get_storage_backend_services()
for services in backend_services.values():
if (services is not None and
services.find(sysinv_constants.SB_SVC_CINDER) != -1):
cinder_config = True
break
staging_dir = None
system_tar_path = None
images_tar_path = None
warnings = ''
try:
os.chdir('/')
if not clone:
check_size(archive_dir, cinder_config)
        print("\nPerforming backup (this might take several minutes):")
staging_dir = tempfile.mkdtemp(dir=archive_dir)
system_tar_path = os.path.join(archive_dir,
backup_name + '_system.tgz')
system_archive = tarfile.open(system_tar_path, "w:gz")
images_tar_path = os.path.join(archive_dir,
backup_name + '_images.tgz')
step = 1
total_steps = 15
if sysinv_constants.SB_TYPE_CEPH in backend_services.keys():
total_steps += 1
if tsconfig.region_config == "yes":
# We don't run the glance backup step
total_steps -= 1
# Step 1: Backup etc
backup_etc(system_archive)
utils.progress(total_steps, step, 'backup etc', 'DONE')
step += 1
# Step 2: Backup configuration
backup_config(system_archive, tsconfig.CONFIG_PATH)
utils.progress(total_steps, step, 'backup configuration', 'DONE')
step += 1
# Step 3: Backup puppet data
backup_puppet_data(system_archive, constants.HIERADATA_PERMDIR)
utils.progress(total_steps, step, 'backup puppet data', 'DONE')
step += 1
# Step 4: Backup keyring
backup_keyring(system_archive, keyring_permdir)
utils.progress(total_steps, step, 'backup keyring', 'DONE')
step += 1
# Step 5: Backup ldap
backup_ldap(system_archive, staging_dir)
utils.progress(total_steps, step, 'backup ldap', 'DONE')
step += 1
# Step 6: Backup postgres
backup_postgres(system_archive, staging_dir, cinder_config)
utils.progress(total_steps, step, 'backup postgres', 'DONE')
step += 1
# Step 7: Backup ceilometer
backup_ceilometer(system_archive, ceilometer_permdir)
utils.progress(total_steps, step, 'backup ceilometer', 'DONE')
step += 1
if tsconfig.region_config != "yes":
# Step 8: Backup glance
images_archive = tarfile.open(images_tar_path, "w:gz")
backup_std_dir(images_archive, glance_permdir)
images_archive.close()
utils.progress(total_steps, step, 'backup glance', 'DONE')
step += 1
# Step 9: Backup home
backup_std_dir(system_archive, home_permdir)
utils.progress(total_steps, step, 'backup home directory', 'DONE')
step += 1
# Step 10: Backup patching
if not clone:
backup_std_dir(system_archive, patching_permdir)
utils.progress(total_steps, step, 'backup patching', 'DONE')
step += 1
# Step 11: Backup patching repo
if not clone:
backup_std_dir(system_archive, patching_repo_permdir)
utils.progress(total_steps, step, 'backup patching repo', 'DONE')
step += 1
# Step 12: Backup extension filesystem
backup_std_dir(system_archive, extension_permdir)
utils.progress(total_steps, step, 'backup extension filesystem '
'directory', 'DONE')
step += 1
# Step 13: Backup patch-vault filesystem
if os.path.exists(patch_vault_permdir):
backup_std_dir(system_archive, patch_vault_permdir)
utils.progress(total_steps, step, 'backup patch-vault filesystem '
'directory', 'DONE')
step += 1
# Step 14: Backup cinder config/LVM config
# No need to add extra check here as if cinder/LVM is not configured,
# ../iscsi-target/saveconfig.json will be absent, so this function will
# do nothing.
backup_cinder_config(system_archive)
utils.progress(total_steps, step, 'backup cinder/LVM config', 'DONE')
step += 1
# Step 15: Backup ceph crush map
if sysinv_constants.SB_TYPE_CEPH in backend_services.keys():
backup_ceph_crush_map(system_archive, staging_dir)
utils.progress(total_steps, step, 'backup ceph crush map', 'DONE')
step += 1
# Step 16: Create archive
system_archive.close()
utils.progress(total_steps, step, 'create archive', 'DONE')
step += 1
except Exception:
if system_tar_path and os.path.isfile(system_tar_path):
os.remove(system_tar_path)
if images_tar_path and os.path.isfile(images_tar_path):
os.remove(images_tar_path)
raise
finally:
fmApi.clear_fault(fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
entity_instance_id)
os.remove(backup_in_progress)
if staging_dir:
shutil.rmtree(staging_dir, ignore_errors=True)
system_msg = "System backup file created"
images_msg = "Images backup file created"
if not clone:
system_msg += ": " + system_tar_path
images_msg += ": " + images_tar_path
print(system_msg)
if tsconfig.region_config != "yes":
print(images_msg)
if warnings != '':
print("WARNING: The following problems occurred:")
print(textwrap.fill(warnings, 80))
def create_restore_runtime_config(filename):
""" Create any runtime parameters needed for Restore."""
config = {}
# We need to re-enable Openstack password rules, which
# were previously disabled while the controller manifests
# were applying during a Restore
config['classes'] = ['keystone::security_compliance']
utils.create_manifest_runtime_config(filename, config)
def overwrite_iscsi_target_config():
"""
Overwrite the current iscsi target config file with the one
from the backup archive.
"""
if not os.path.exists(
cinder_permdir + '/iscsi-target/saveconfig.json'):
LOG.info("Restore: Missing current saveconfig.json file")
return
if not os.path.exists(
cinder_permdir + '/iscsi-target/saveconfig.json.bck'):
LOG.info("Restore: Missing backup saveconfig.json file")
return
os.remove(cinder_permdir + '/iscsi-target/saveconfig.json')
shutil.copyfile(
cinder_permdir + '/iscsi-target/saveconfig.json.bck',
cinder_permdir + '/iscsi-target/saveconfig.json')
os.remove(cinder_permdir + '/iscsi-target/saveconfig.json.bck')
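    # Re-apply the restored configuration to the iscsi target subsystem.
    # (Assumption: targetctl reads the saveconfig.json restored above.)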
subprocess.call(["targetctl", "restore"], stdout=DEVNULL, stderr=DEVNULL)
def restore_complete():
"""
Restore proper ISCSI configuration file after cinder restore.
Enable worker functionality for AIO system.
:return: True if worker-config-complete is executed
"""
if utils.get_system_type() == sysinv_constants.TIS_AIO_BUILD:
if not os.path.isfile(restore_system_ready):
print(textwrap.fill(
"--restore-complete can only be run "
"after restore-system has completed "
"successfully", 80
))
return False
        # The iscsi target config file must be overwritten with the
        # original file from the backup archive. The cinder restore
        # process modifies this file, and those changes prevent VMs
        # that were present at backup time from booting up properly.
        # The original iscsi config file has the proper settings, so
        # we use that.
overwrite_iscsi_target_config()
print("\nApplying worker manifests for %s. " %
(utils.get_controller_hostname()))
print("Node will reboot on completion.")
sysinv.do_worker_config_complete(utils.get_controller_hostname())
# show in-progress log on console every 30 seconds
# until self reboot or timeout
os.remove(restore_system_ready)
time.sleep(30)
for i in range(1, 10):
print("worker manifest apply in progress ... ")
time.sleep(30)
raise RestoreFail("Timeout running worker manifests, "
"reboot did not occur")
else:
if not os.path.isfile(restore_system_ready):
print(textwrap.fill(
"--restore-complete can only be run "
"after restore-system has completed "
"successfully", 80
))
return False
overwrite_iscsi_target_config()
os.remove(restore_system_ready)
return True
def restore_system(backup_file, include_storage_reinstall=False, clone=False):
"""Restoring system configuration."""
if (os.path.exists(constants.CGCS_CONFIG_FILE) or
os.path.exists(tsconfig.CONFIG_PATH) or
os.path.exists(constants.INITIAL_CONFIG_COMPLETE_FILE)):
print(textwrap.fill(
"Configuration has already been done. "
"A system restore operation can only be done "
"immediately after the load has been installed.", 80))
print('')
raise RestoreFail("System configuration already completed")
if not os.path.isabs(backup_file):
raise RestoreFail("Backup file (%s) not found. Full path is "
"required." % backup_file)
if os.path.isfile(restore_in_progress):
raise RestoreFail("Restore already in progress.")
else:
open(restore_in_progress, 'w')
# Add newline to console log for install-clone scenario
newline = clone
staging_dir = None
try:
try:
with open(os.devnull, "w") as fnull:
subprocess.check_call(["vgdisplay", "cgts-vg"],
stdout=fnull,
stderr=fnull)
except subprocess.CalledProcessError:
LOG.error("The cgts-vg volume group was not found")
raise RestoreFail("Volume groups not configured")
print("\nRestoring system (this will take several minutes):")
# Use /scratch for the staging dir for now,
# until /opt/backups is available
staging_dir = tempfile.mkdtemp(dir='/scratch')
# Permission change required or postgres restore fails
subprocess.call(['chmod', 'a+rx', staging_dir], stdout=DEVNULL)
os.chdir('/')
step = 1
total_steps = 24
# Step 1: Open archive and verify installed load matches backup
try:
archive = tarfile.open(backup_file)
except tarfile.TarError as e:
LOG.exception(e)
raise RestoreFail("Error opening backup file. Invalid backup "
"file.")
check_load_versions(archive, staging_dir)
check_load_subfunctions(archive, staging_dir)
utils.progress(total_steps, step, 'open archive', 'DONE', newline)
step += 1
# Patching is potentially a multi-phase step.
# If the controller is impacted by patches from the backup,
# it must be rebooted before continuing the restore.
# If this is the second pass through, we can skip over this.
if not os.path.isfile(restore_patching_complete) and not clone:
# Step 2: Restore patching
restore_std_dir(archive, patching_permdir)
utils.progress(total_steps, step, 'restore patching', 'DONE',
newline)
step += 1
# Step 3: Restore patching repo
restore_std_dir(archive, patching_repo_permdir)
utils.progress(total_steps, step, 'restore patching repo', 'DONE',
newline)
step += 1
# Step 4: Apply patches
try:
subprocess.check_output(["sw-patch", "install-local"])
except subprocess.CalledProcessError:
LOG.error("Failed to install patches")
raise RestoreFail("Failed to install patches")
utils.progress(total_steps, step, 'install patches', 'DONE',
newline)
step += 1
open(restore_patching_complete, 'w')
# If the controller was impacted by patches, we need to reboot.
if os.path.isfile(node_is_patched):
if not clone:
print("\nThis controller has been patched. " +
"A reboot is required.")
print("After the reboot is complete, " +
"re-execute the restore command.")
while True:
user_input = input(
"Enter 'reboot' to reboot controller: ")
if user_input == 'reboot':
break
LOG.info("This controller has been patched. Rebooting now")
print("\nThis controller has been patched. Rebooting now\n\n")
time.sleep(5)
os.remove(restore_in_progress)
if staging_dir:
shutil.rmtree(staging_dir, ignore_errors=True)
subprocess.call("reboot")
else:
# We need to restart the patch controller and agent, since
# we setup the repo and patch store outside its control
with open(os.devnull, "w") as devnull:
subprocess.call(
["systemctl",
"restart",
"sw-patch-controller-daemon.service"],
stdout=devnull, stderr=devnull)
subprocess.call(
["systemctl",
"restart",
"sw-patch-agent.service"],
stdout=devnull, stderr=devnull)
if clone:
# No patches were applied, return to cloning code
# to run validation code.
return RESTORE_RERUN_REQUIRED
else:
# Add the skipped steps
step += 3
if os.path.isfile(node_is_patched):
# If we get here, it means the node was patched by the user
# AFTER the restore applied patches and rebooted, but didn't
# reboot.
# This means the patch lineup no longer matches what's in the
# backup, but we can't (and probably shouldn't) prevent that.
# However, since this will ultimately cause the node to fail
# the goenabled step, we can fail immediately and force the
# user to reboot.
                print("\nThis controller has been patched, but not rebooted.")
                print("Please reboot before continuing the restore process.")
raise RestoreFail("Controller node patched without rebooting")
# Flag can now be cleared
if os.path.exists(restore_patching_complete):
os.remove(restore_patching_complete)
# Prefetch keyring
prefetch_keyring(archive)
# Step 5: Restore configuration
restore_configuration(archive, staging_dir)
# In AIO SX systems, the loopback interface is used as the management
# interface. However, the application of the interface manifest will
# not configure the necessary addresses on the loopback interface (see
# apply_network_config.sh for details). So, we need to configure the
# loopback interface here.
if tsconfig.system_mode == sysinv_constants.SYSTEM_MODE_SIMPLEX:
configure_loopback_interface(archive)
# Write the simplex flag
utils.write_simplex_flag()
utils.progress(total_steps, step, 'restore configuration', 'DONE',
newline)
step += 1
# Step 6: Apply restore bootstrap manifest
controller_0_address = utils.get_address_from_hosts_file(
'controller-0')
restore_static_puppet_data(archive, constants.HIERADATA_WORKDIR)
try:
utils.apply_manifest(controller_0_address,
sysinv_constants.CONTROLLER,
'bootstrap',
constants.HIERADATA_WORKDIR)
except Exception as e:
LOG.exception(e)
raise RestoreFail(
'Failed to apply bootstrap manifest. '
'See /var/log/puppet/latest/puppet.log for details.')
utils.progress(total_steps, step, 'apply bootstrap manifest', 'DONE',
newline)
step += 1
# Step 7: Restore puppet data
restore_puppet_data(archive, constants.HIERADATA_WORKDIR)
utils.progress(total_steps, step, 'restore puppet data', 'DONE',
newline)
step += 1
# Step 8: Persist configuration
utils.persist_config()
utils.progress(total_steps, step, 'persist configuration', 'DONE',
newline)
step += 1
# Step 9: Apply controller manifest
try:
utils.apply_manifest(controller_0_address,
sysinv_constants.CONTROLLER,
'controller',
constants.HIERADATA_PERMDIR)
except Exception as e:
LOG.exception(e)
raise RestoreFail(
'Failed to apply controller manifest. '
'See /var/log/puppet/latest/puppet.log for details.')
utils.progress(total_steps, step, 'apply controller manifest', 'DONE',
newline)
step += 1
# Step 10: Apply runtime controller manifests
restore_filename = os.path.join(staging_dir, 'restore.yaml')
create_restore_runtime_config(restore_filename)
try:
utils.apply_manifest(controller_0_address,
sysinv_constants.CONTROLLER,
'runtime',
constants.HIERADATA_PERMDIR,
runtime_filename=restore_filename)
except Exception as e:
LOG.exception(e)
raise RestoreFail(
'Failed to apply runtime controller manifest. '
'See /var/log/puppet/latest/puppet.log for details.')
utils.progress(total_steps, step,
'apply runtime controller manifest', 'DONE',
newline)
step += 1
# Move the staging dir under /opt/backups, now that it's setup
shutil.rmtree(staging_dir, ignore_errors=True)
staging_dir = tempfile.mkdtemp(dir=constants.BACKUPS_PATH)
# Permission change required or postgres restore fails
subprocess.call(['chmod', 'a+rx', staging_dir], stdout=DEVNULL)
# Step 11: Restore cinder config file
restore_cinder_config(archive)
utils.progress(total_steps, step, 'restore cinder config', 'DONE',
newline)
step += 1
# Step 12: Apply banner customization
utils.apply_banner_customization()
utils.progress(total_steps, step, 'apply banner customization', 'DONE',
newline)
step += 1
# Step 13: Restore dnsmasq and pxeboot config
restore_dnsmasq(archive, tsconfig.CONFIG_PATH)
utils.progress(total_steps, step, 'restore dnsmasq', 'DONE', newline)
step += 1
# Step 14: Restore keyring
restore_keyring(archive, keyring_permdir)
utils.progress(total_steps, step, 'restore keyring', 'DONE', newline)
step += 1
# Step 15: Restore ldap
restore_ldap(archive, ldap_permdir, staging_dir)
utils.progress(total_steps, step, 'restore ldap', 'DONE', newline)
step += 1
# Step 16: Restore postgres
restore_postgres(archive, staging_dir)
utils.progress(total_steps, step, 'restore postgres', 'DONE', newline)
step += 1
# Step 17: Restore ceilometer
restore_ceilometer(archive, ceilometer_permdir)
utils.progress(total_steps, step, 'restore ceilometer', 'DONE',
newline)
step += 1
# Step 18: Restore ceph crush map
restore_ceph_crush_map(archive)
utils.progress(total_steps, step, 'restore ceph crush map', 'DONE',
newline)
step += 1
# Step 19: Restore home
restore_std_dir(archive, home_permdir)
utils.progress(total_steps, step, 'restore home directory', 'DONE',
newline)
step += 1
# Step 20: Restore extension filesystem
restore_std_dir(archive, extension_permdir)
utils.progress(total_steps, step, 'restore extension filesystem '
'directory', 'DONE', newline)
step += 1
# Step 21: Restore patch-vault filesystem
if file_exists_in_archive(archive,
os.path.basename(patch_vault_permdir)):
restore_std_dir(archive, patch_vault_permdir)
utils.progress(total_steps, step, 'restore patch-vault filesystem '
'directory', 'DONE', newline)
step += 1
# Step 22: Restore external ceph configuration files.
restore_ceph_external_config_files(archive, staging_dir)
utils.progress(total_steps, step, 'restore CEPH external config',
'DONE', newline)
step += 1
# Step 23: Shutdown file systems
archive.close()
shutil.rmtree(staging_dir, ignore_errors=True)
utils.shutdown_file_systems()
utils.progress(total_steps, step, 'shutdown file systems', 'DONE',
newline)
step += 1
# Step 24: Recover services
utils.mtce_restart()
utils.mark_config_complete()
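        # Allow time for services to come up before polling them below.
        # (Assumption: the 120 second delay is an empirically chosen value.)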
time.sleep(120)
for service in ['sysinv-conductor', 'sysinv-inv']:
if not utils.wait_sm_service(service):
raise RestoreFail("Services have failed to initialize.")
utils.progress(total_steps, step, 'recover services', 'DONE', newline)
step += 1
if tsconfig.system_mode != sysinv_constants.SYSTEM_MODE_SIMPLEX:
print("\nRestoring node states (this will take several minutes):")
backend_services = sysinv.get_storage_backend_services()
with openstack.OpenStack() as client:
                # On ceph setups storage nodes take about 90 seconds to
                # become locked, so the timeout is raised to 120 seconds
                # for such setups.
lock_timeout = 60
if sysinv_constants.SB_TYPE_CEPH in backend_services.keys():
lock_timeout = 120
failed_lock_host = False
skip_hosts = ['controller-0']
if not include_storage_reinstall:
storage_hosts = \
sysinv.get_hosts(client.admin_token,
client.conf['region_name'],
personality='storage')
if storage_hosts:
install_uuid = utils.get_install_uuid()
for h in storage_hosts:
skip_hosts.append(h.name)
# Update install_uuid on the storage node
client.sysinv.ihost.update_install_uuid(
h.uuid,
install_uuid)
skip_hosts_count = len(skip_hosts)
                # Wait for nodes to be identified as disabled before
                # attempting to lock hosts. Even if nodes are still not
                # identified as disabled after 3 minutes, we continue the
                # restore anyway.
                if not client.wait_for_hosts_disabled(
                        exempt_hostnames=skip_hosts,
                        timeout=180):
                    LOG.info("At least one node is not yet disabled. "
                             "Continuing.")
print("\nLocking nodes:")
try:
failed_hosts = client.lock_hosts(skip_hosts,
utils.progress,
timeout=lock_timeout)
                    # Don't power off nodes that could not be locked.
                    # Use extend() so individual hostnames, not a nested
                    # list, are added to skip_hosts.
                    if len(failed_hosts) > 0:
                        skip_hosts.extend(failed_hosts)
except (KeystoneFail, SysInvFail) as e:
LOG.exception(e)
failed_lock_host = True
if not failed_lock_host:
print("\nPowering-off nodes:")
try:
client.power_off_hosts(skip_hosts,
utils.progress,
timeout=60)
except (KeystoneFail, SysInvFail) as e:
LOG.exception(e)
                        # Power-off failures are tolerated here; the user
                        # is advised below to power off nodes manually.
if failed_lock_host or len(skip_hosts) > skip_hosts_count:
if include_storage_reinstall:
print(textwrap.fill(
"Failed to lock at least one node. " +
"Please lock the unlocked nodes manually.", 80
))
else:
print(textwrap.fill(
"Failed to lock at least one node. " +
"Please lock the unlocked controller-1 or " +
"worker nodes manually.", 80
))
if not clone:
print(textwrap.fill(
"Before continuing to the next step in the restore, " +
"please ensure all nodes other than controller-0 " +
"and storage nodes, if they are not being " +
"reinstalled, are powered off. Please refer to the " +
"system administration guide for more details.", 80
))
finally:
os.remove(restore_in_progress)
if staging_dir:
shutil.rmtree(staging_dir, ignore_errors=True)
cleanup_prefetched_keyring()
fmApi = fm_api.FaultAPIs()
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
sysinv_constants.CONTROLLER_HOSTNAME)
fault = fm_api.Fault(
alarm_id=fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
alarm_state=fm_constants.FM_ALARM_STATE_MSG,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
reason_text=("System Restore complete."),
# other
alarm_type=fm_constants.FM_ALARM_TYPE_0,
# unknown
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN,
proposed_repair_action=(""),
service_affecting=False)
fmApi.set_fault(fault)
# Mark system restore as complete
if (utils.get_controller_hostname() ==
sysinv_constants.CONTROLLER_0_HOSTNAME):
# Create the flag file that permits the
# restore_complete command option.
utils.touch(restore_system_ready)
return RESTORE_COMPLETE
def restore_images(backup_file, clone=False):
"""Restoring images."""
if not os.path.exists(constants.INITIAL_CONFIG_COMPLETE_FILE):
print(textwrap.fill(
"System restore has not been done. "
"An image restore operation can only be done after "
"the system restore has been completed.", 80))
print('')
raise RestoreFail("System restore required")
if not os.path.isabs(backup_file):
raise RestoreFail("Backup file (%s) not found. Full path is "
"required." % backup_file)
if os.path.isfile(restore_in_progress):
raise RestoreFail("Restore already in progress.")
else:
open(restore_in_progress, 'w')
# Add newline to console log for install-clone scenario
newline = clone
try:
print("\nRestoring images (this will take several minutes):")
os.chdir('/')
step = 1
total_steps = 2
# Step 1: Open archive
try:
archive = tarfile.open(backup_file)
except tarfile.TarError as e:
LOG.exception(e)
raise RestoreFail("Error opening backup file. Invalid backup "
"file.")
utils.progress(total_steps, step, 'open archive', 'DONE', newline)
step += 1
# Step 2: Restore glance
restore_std_dir(archive, glance_permdir)
utils.progress(total_steps, step, 'restore glance', 'DONE',
newline)
step += 1
archive.close()
finally:
os.remove(restore_in_progress)