#
# Copyright (c) 2014-2017 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
"""
Backup & Restore
"""
from __future__ import print_function
import copy
import filecmp
import fileinput
import os
import glob
import shutil
import stat
import subprocess
import tarfile
import tempfile
import textwrap
import time
from fm_api import constants as fm_constants
from fm_api import fm_api
from sysinv.common import constants as sysinv_constants
from controllerconfig.common import log
from controllerconfig.common import constants
from controllerconfig.common.exceptions import BackupFail
from controllerconfig.common.exceptions import RestoreFail
from controllerconfig.common.exceptions import KeystoneFail
from controllerconfig.common.exceptions import SysInvFail
from controllerconfig import openstack
import tsconfig.tsconfig as tsconfig
from controllerconfig import utils
from controllerconfig import sysinv_api as sysinv
from six.moves import input
LOG = log.get_logger(__name__)
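# Shared sink for discarding subprocess output throughout this module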
DEVNULL = open(os.devnull, 'w')
RESTORE_COMPLETE = "restore-complete"
RESTORE_RERUN_REQUIRED = "restore-rerun-required"
# Backup/restore related constants
backup_in_progress = tsconfig.BACKUP_IN_PROGRESS_FLAG
restore_in_progress = tsconfig.RESTORE_IN_PROGRESS_FLAG
restore_system_ready = tsconfig.RESTORE_SYSTEM_FLAG
restore_patching_complete = '/etc/platform/.restore_patching_complete'
node_is_patched = '/var/run/node_is_patched'
keyring_permdir = os.path.join('/opt/platform/.keyring', tsconfig.SW_VERSION)
ceph_permdir = os.path.join(tsconfig.CONFIG_PATH, 'ceph-config')
ldap_permdir = '/var/lib/openldap-data'
ceilometer_permdir = '/opt/cgcs/ceilometer/' + tsconfig.SW_VERSION
glance_permdir = '/opt/cgcs/glance'
patching_permdir = '/opt/patching'
patching_repo_permdir = '/www/pages/updates'
home_permdir = '/home'
cinder_permdir = '/opt/cgcs/cinder'
extension_permdir = '/opt/extension'
patch_vault_permdir = '/opt/patch-vault'
def get_backup_databases(cinder_config=False):
"""
Retrieve database lists for backup.
:return: backup_databases and backup_database_skip_tables
"""
# Databases common to all configurations
REGION_LOCAL_DATABASES = ('postgres', 'template1', 'nova', 'sysinv',
'neutron', 'heat', 'nova_api',
'aodh', 'murano', 'magnum', 'panko', 'ironic',
'nova_cell0', 'gnocchi', 'fm', 'barbican')
REGION_SHARED_DATABASES = ('glance', 'keystone')
if cinder_config:
REGION_SHARED_DATABASES += ('cinder', )
# Indicates which tables have to be dropped for a certain database.
DB_TABLE_SKIP_MAPPING = {
'fm': ('alarm',),
'gnocchi': ('metric', 'resource'),
'dcorch': ('orch_job',
'orch_request',
'resource',
'subcloud_resource'), }
if tsconfig.region_config == 'yes':
BACKUP_DATABASES = REGION_LOCAL_DATABASES
        # Add databases which are optional in secondary regions (and
        # subclouds)
shared_services = sysinv.get_shared_services()
for service_type in ["image", "volume"]:
if service_type not in shared_services:
service = 'glance' if service_type == "image" else 'cinder'
BACKUP_DATABASES += (service, )
else:
# Add additional databases for non-region configuration and for the
# primary region in region deployments.
BACKUP_DATABASES = REGION_LOCAL_DATABASES + REGION_SHARED_DATABASES
# Add distributed cloud databases
if tsconfig.distributed_cloud_role == \
sysinv_constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
BACKUP_DATABASES += ('dcmanager', 'dcorch')
    # Generate the set of tables to be skipped for each database in
    # BACKUP_DATABASES; skip tables are listed explicitly in
    # DB_TABLE_SKIP_MAPPING.
BACKUP_DB_SKIP_TABLES = dict(
[[x, DB_TABLE_SKIP_MAPPING.get(x, ())] for x in BACKUP_DATABASES])
return BACKUP_DATABASES, BACKUP_DB_SKIP_TABLES
def check_load_versions(archive, staging_dir):
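    """
    Verify that the build version in the backup archive matches the
    installed load, by comparing etc/build.info from the archive
    against /etc/build.info on disk.
    """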
match = False
try:
member = archive.getmember('etc/build.info')
archive.extract(member, path=staging_dir)
match = filecmp.cmp('/etc/build.info', staging_dir + '/etc/build.info')
shutil.rmtree(staging_dir + '/etc')
except Exception as e:
LOG.exception(e)
raise RestoreFail("Unable to verify load version in backup file. "
"Invalid backup file.")
if not match:
LOG.error("Load version mismatch.")
raise RestoreFail("Load version of backup does not match the "
"version of the installed load.")
def get_subfunctions(filename):
"""
Retrieves the subfunctions from a platform.conf file.
:param filename: file to retrieve subfunctions from
:return: a list of the subfunctions or None if no subfunctions exist
"""
matchstr = 'subfunction='
with open(filename, 'r') as f:
for line in f:
if matchstr in line:
parsed = line.split('=')
return parsed[1].rstrip().split(",")
return
def check_load_subfunctions(archive, staging_dir):
"""
Verify that the subfunctions in the backup match the installed load.
:param archive: backup archive
:param staging_dir: staging directory
    :raises RestoreFail: if the subfunctions do not match
"""
match = False
backup_subfunctions = None
try:
member = archive.getmember('etc/platform/platform.conf')
archive.extract(member, path=staging_dir)
backup_subfunctions = get_subfunctions(staging_dir +
'/etc/platform/platform.conf')
shutil.rmtree(staging_dir + '/etc')
if set(backup_subfunctions) ^ set(tsconfig.subfunctions):
            # The sets of subfunctions do not match
match = False
else:
match = True
except Exception:
LOG.exception("Unable to verify subfunctions in backup file")
raise RestoreFail("Unable to verify subfunctions in backup file. "
"Invalid backup file.")
if not match:
LOG.error("Subfunction mismatch - backup: %s, installed: %s" %
(str(backup_subfunctions), str(tsconfig.subfunctions)))
raise RestoreFail("Subfunctions in backup load (%s) do not match the "
"subfunctions of the installed load (%s)." %
(str(backup_subfunctions),
str(tsconfig.subfunctions)))
def file_exists_in_archive(archive, file_path):
""" Check if file exists in archive """
try:
archive.getmember(file_path)
return True
except KeyError:
LOG.info("File %s is not in archive." % file_path)
return False
def filter_directory(archive, directory):
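    """ Yield archive members located under the given top-level directory.
        Used as the members argument to TarFile.extractall(). """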
for tarinfo in archive:
if tarinfo.name.split('/')[0] == directory:
yield tarinfo
def backup_etc_size():
""" Backup etc size estimate """
try:
total_size = utils.directory_get_size('/etc')
return total_size
except OSError:
LOG.error("Failed to estimate backup etc size.")
raise BackupFail("Failed to estimate backup etc size")
def backup_etc(archive):
""" Backup etc """
try:
archive.add('/etc', arcname='etc')
except tarfile.TarError:
LOG.error("Failed to backup etc.")
raise BackupFail("Failed to backup etc")
def restore_etc_file(archive, dest_dir, etc_file):
""" Restore etc file """
try:
# Change the name of this file to remove the leading path
member = archive.getmember('etc/' + etc_file)
# Copy the member to avoid changing the name for future operations on
# this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member, path=dest_dir)
except tarfile.TarError:
LOG.error("Failed to restore etc file.")
raise RestoreFail("Failed to restore etc file")
def restore_etc_ssl_dir(archive, configpath=constants.CONFIG_WORKDIR):
""" Restore the etc SSL dir """
def filter_etc_ssl_private(members):
for tarinfo in members:
if 'etc/ssl/private' in tarinfo.name:
yield tarinfo
if file_exists_in_archive(archive, 'config/server-cert.pem'):
restore_config_file(
archive, configpath, 'server-cert.pem')
if file_exists_in_archive(archive, 'etc/ssl/private'):
# NOTE: This will include all TPM certificate files if TPM was
# enabled on the backed up system. However in that case, this
# restoration is only done for the first controller and TPM
# will need to be reconfigured once duplex controller (if any)
# is restored.
archive.extractall(path='/',
members=filter_etc_ssl_private(archive))
def restore_ceph_external_config_files(archive, staging_dir):
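    """ Restore the external ceph configuration files from the archive """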
# Restore ceph-config.
if file_exists_in_archive(archive, "config/ceph-config"):
restore_config_dir(archive, staging_dir, 'ceph-config', ceph_permdir)
# Copy the file to /etc/ceph.
# There might be no files to copy, so don't check the return code.
cp_command = ('cp -Rp ' + os.path.join(ceph_permdir, '*') +
' /etc/ceph/')
subprocess.call(cp_command, shell=True)
def backup_config_size(config_permdir):
""" Backup configuration size estimate """
try:
        return utils.directory_get_size(config_permdir)
except OSError:
LOG.error("Failed to estimate backup configuration size.")
raise BackupFail("Failed to estimate backup configuration size")
def backup_config(archive, config_permdir):
""" Backup configuration """
try:
# The config dir is versioned, but we're only grabbing the current
# release
archive.add(config_permdir, arcname='config')
except tarfile.TarError:
LOG.error("Failed to backup config.")
raise BackupFail("Failed to backup configuration")
def restore_config_file(archive, dest_dir, config_file):
""" Restore configuration file """
try:
# Change the name of this file to remove the leading path
member = archive.getmember('config/' + config_file)
# Copy the member to avoid changing the name for future operations on
# this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member, path=dest_dir)
except tarfile.TarError:
LOG.error("Failed to restore config file %s." % config_file)
raise RestoreFail("Failed to restore configuration")
def restore_configuration(archive, staging_dir):
""" Restore configuration """
try:
os.makedirs(constants.CONFIG_WORKDIR, stat.S_IRWXU | stat.S_IRGRP |
stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
except OSError:
LOG.error("Failed to create config directory: %s",
constants.CONFIG_WORKDIR)
raise RestoreFail("Failed to restore configuration files")
# Restore cgcs_config file from original installation for historical
# purposes. Not used to restore the system as the information in this
# file is out of date (not updated after original installation).
restore_config_file(archive, constants.CONFIG_WORKDIR, 'cgcs_config')
# Restore platform.conf file and update as necessary. The file will be
# created in a temporary location and then moved into place when it is
# complete to prevent access to a partially created file.
restore_etc_file(archive, staging_dir, 'platform/platform.conf')
temp_platform_conf_file = os.path.join(tsconfig.PLATFORM_CONF_PATH,
'platform.conf.temp')
shutil.copyfile(os.path.join(staging_dir, 'platform.conf'),
temp_platform_conf_file)
install_uuid = utils.get_install_uuid()
for line in fileinput.FileInput(temp_platform_conf_file, inplace=1):
if line.startswith("INSTALL_UUID="):
# The INSTALL_UUID must be updated to match the new INSTALL_UUID
# which was generated when this controller was installed prior to
# doing the restore.
print("INSTALL_UUID=%s" % install_uuid)
elif line.startswith("management_interface=") or \
line.startswith("oam_interface=") or \
line.startswith("infrastructure_interface=") or \
line.startswith("UUID="):
# Strip out any entries that are host specific as the backup can
# be done on either controller. The application of the
# platform_conf manifest will add these back in.
pass
else:
print(line, end='')
fileinput.close()
# Move updated platform.conf file into place.
os.rename(temp_platform_conf_file, tsconfig.PLATFORM_CONF_FILE)
# Kick tsconfig to reload the platform.conf file
tsconfig._load()
# Restore branding
restore_config_dir(archive, staging_dir, 'branding', '/opt/branding/')
# Restore banner customization
restore_config_dir(archive, staging_dir, 'banner/etc', '/opt/banner')
# Restore ssh configuration
restore_config_dir(archive, staging_dir, 'ssh_config',
constants.CONFIG_WORKDIR + '/ssh_config')
# Configure hostname
utils.configure_hostname('controller-0')
# Restore hosts file
restore_etc_file(archive, '/etc', 'hosts')
restore_etc_file(archive, constants.CONFIG_WORKDIR, 'hosts')
# Restore certificate files
restore_etc_ssl_dir(archive)
# Restore firewall rules file if it is in the archive
if file_exists_in_archive(archive, 'config/iptables.rules'):
restore_config_file(
archive, constants.CONFIG_WORKDIR, 'iptables.rules')
restore_etc_file(archive, tsconfig.PLATFORM_CONF_PATH,
'platform/iptables.rules')
def filter_pxelinux(archive):
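    """ Yield archive members located under config/pxelinux.cfg """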
for tarinfo in archive:
if tarinfo.name.find('config/pxelinux.cfg') == 0:
yield tarinfo
def restore_dnsmasq(archive, config_permdir):
""" Restore dnsmasq """
try:
etc_files = ['hosts']
perm_files = ['hosts',
'dnsmasq.hosts', 'dnsmasq.leases',
'dnsmasq.addn_hosts']
for etc_file in etc_files:
restore_config_file(archive, '/etc', etc_file)
for perm_file in perm_files:
restore_config_file(archive, config_permdir, perm_file)
# Extract distributed cloud addn_hosts file if present in archive.
if file_exists_in_archive(
archive, 'config/dnsmasq.addn_hosts_dc'):
restore_config_file(archive, config_permdir,
'dnsmasq.addn_hosts_dc')
tmpdir = tempfile.mkdtemp(prefix="pxerestore_")
archive.extractall(tmpdir,
members=filter_pxelinux(archive))
if os.path.exists(tmpdir + '/config/pxelinux.cfg'):
shutil.rmtree(config_permdir + 'pxelinux.cfg', ignore_errors=True)
shutil.move(tmpdir + '/config/pxelinux.cfg', config_permdir)
shutil.rmtree(tmpdir, ignore_errors=True)
except (shutil.Error, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to restore dnsmasq config.")
raise RestoreFail("Failed to restore dnsmasq files")
def backup_puppet_data_size(puppet_permdir):
""" Backup puppet data size estimate """
try:
        return utils.directory_get_size(puppet_permdir)
except OSError:
LOG.error("Failed to estimate backup puppet data size.")
raise BackupFail("Failed to estimate backup puppet data size")
def backup_puppet_data(archive, puppet_permdir):
""" Backup puppet data """
try:
# The puppet dir is versioned, but we're only grabbing the current
# release
archive.add(puppet_permdir, arcname='hieradata')
except tarfile.TarError:
LOG.error("Failed to backup puppet data.")
raise BackupFail("Failed to backup puppet data")
def restore_static_puppet_data(archive, puppet_workdir):
""" Restore static puppet data """
try:
member = archive.getmember('hieradata/static.yaml')
archive.extract(member, path=os.path.dirname(puppet_workdir))
member = archive.getmember('hieradata/secure_static.yaml')
archive.extract(member, path=os.path.dirname(puppet_workdir))
except tarfile.TarError:
LOG.error("Failed to restore static puppet data.")
raise RestoreFail("Failed to restore static puppet data")
except OSError:
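        # Assumption: an OSError here is benign (e.g. the files are
        # already in place) and should not abort the restore.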
pass
def restore_puppet_data(archive, puppet_workdir):
""" Restore puppet data """
try:
archive.extractall(
path=os.path.dirname(puppet_workdir),
members=filter_directory(archive,
os.path.basename(puppet_workdir)))
except tarfile.TarError:
LOG.error("Failed to restore puppet data.")
raise RestoreFail("Failed to restore puppet data")
except OSError:
pass
def backup_cinder_config(archive):
""" Backup cinder configuration """
# If the iscsi target config file exists, add it to the archive
# On setups without LVM backends this file is absent
if os.path.exists(cinder_permdir + '/iscsi-target/saveconfig.json'):
archive.add(
cinder_permdir + '/iscsi-target/saveconfig.json',
arcname='cinder/saveconfig.json')
def restore_cinder_file(archive, dest_dir, cinder_file):
""" Restore cinder file """
try:
# Change the name of this file to remove the leading path
member = archive.getmember('cinder/' + cinder_file)
# Copy the member to avoid changing the name for future operations on
# this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member, path=dest_dir)
except tarfile.TarError:
LOG.error("Failed to restore cinder file %s." % cinder_file)
raise RestoreFail("Failed to restore configuration")
def restore_cinder_config(archive):
"""Restore cinder config files"""
# If the iscsi target config file is present in the archive,
# restore it.
if file_exists_in_archive(archive, 'cinder/saveconfig.json'):
restore_cinder_file(
archive, cinder_permdir + '/iscsi-target',
'saveconfig.json')
# Also create a copy of the original file as the volume
# restore procedure changes this file and breaks the
# valid nova settings.
shutil.copyfile(
cinder_permdir + '/iscsi-target/saveconfig.json',
cinder_permdir + '/iscsi-target/saveconfig.json.bck')
def backup_cinder_size(cinder_permdir):
""" Backup cinder size estimate """
try:
if not os.path.exists(
cinder_permdir + '/iscsi-target/saveconfig.json'):
return 0
statinfo = os.stat(cinder_permdir + '/iscsi-target/saveconfig.json')
return statinfo.st_size
except OSError:
LOG.error("Failed to estimate backup cinder size.")
raise BackupFail("Failed to estimate backup cinder size")
def backup_keyring_size(keyring_permdir):
""" Backup keyring size estimate """
try:
        return utils.directory_get_size(keyring_permdir)
except OSError:
LOG.error("Failed to estimate backup keyring size.")
raise BackupFail("Failed to estimate backup keyring size")
def backup_keyring(archive, keyring_permdir):
""" Backup keyring configuration """
try:
archive.add(keyring_permdir, arcname='.keyring')
except tarfile.TarError:
LOG.error("Failed to backup keyring.")
raise BackupFail("Failed to backup keyring configuration")
def restore_keyring(archive, keyring_permdir):
""" Restore keyring configuration """
try:
shutil.rmtree(keyring_permdir, ignore_errors=False)
members = filter_directory(archive, '.keyring')
temp_members = list()
        # Strip the leading '.keyring/' prefix from each member path, since
        # members are extracted directly into keyring_permdir:
        # /opt/platform/.keyring/<release>
for m in members:
temp_member = copy.copy(m)
lst = temp_member.name.split('.keyring/')
if len(lst) > 1:
temp_member.name = lst[1]
temp_members.append(temp_member)
archive.extractall(path=keyring_permdir, members=temp_members)
except (tarfile.TarError, shutil.Error):
LOG.error("Failed to restore keyring.")
shutil.rmtree(keyring_permdir, ignore_errors=True)
raise RestoreFail("Failed to restore keyring configuration")
def prefetch_keyring(archive):
""" Prefetch keyring configuration for manifest use """
keyring_tmpdir = '/tmp/.keyring'
python_keyring_tmpdir = '/tmp/python_keyring'
try:
shutil.rmtree(keyring_tmpdir, ignore_errors=True)
shutil.rmtree(python_keyring_tmpdir, ignore_errors=True)
archive.extractall(
path=os.path.dirname(keyring_tmpdir),
members=filter_directory(archive,
os.path.basename(keyring_tmpdir)))
shutil.move(keyring_tmpdir + '/python_keyring', python_keyring_tmpdir)
except (tarfile.TarError, shutil.Error):
LOG.error("Failed to restore keyring.")
shutil.rmtree(keyring_tmpdir, ignore_errors=True)
shutil.rmtree(python_keyring_tmpdir, ignore_errors=True)
raise RestoreFail("Failed to restore keyring configuration")
def cleanup_prefetched_keyring():
""" Cleanup fetched keyring """
try:
keyring_tmpdir = '/tmp/.keyring'
python_keyring_tmpdir = '/tmp/python_keyring'
shutil.rmtree(keyring_tmpdir, ignore_errors=True)
shutil.rmtree(python_keyring_tmpdir, ignore_errors=True)
except shutil.Error:
LOG.error("Failed to cleanup keyring.")
raise RestoreFail("Failed to cleanup fetched keyring")
def backup_ldap_size():
""" Backup ldap size estimate """
try:
total_size = 0
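        # Estimate the dump size by piping a slapcat dump of the ldap
        # database through 'wc -c' instead of writing it to disk.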
proc = subprocess.Popen(
['slapcat -d 0 -F /etc/openldap/schema | wc -c'],
shell=True, stdout=subprocess.PIPE)
for line in proc.stdout:
total_size = int(line)
break
proc.communicate()
return total_size
except subprocess.CalledProcessError:
LOG.error("Failed to estimate backup ldap size.")
raise BackupFail("Failed to estimate backup ldap size")
def backup_ldap(archive, staging_dir):
""" Backup ldap configuration """
try:
ldap_staging_dir = staging_dir + '/ldap'
os.mkdir(ldap_staging_dir, 0o655)
subprocess.check_call([
'slapcat', '-d', '0', '-F', '/etc/openldap/schema',
'-l', (ldap_staging_dir + '/ldap.db')], stdout=DEVNULL)
archive.add(ldap_staging_dir + '/ldap.db', arcname='ldap.db')
except (OSError, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to backup ldap database.")
raise BackupFail("Failed to backup ldap configuration")
def restore_ldap(archive, ldap_permdir, staging_dir):
""" Restore ldap configuration """
try:
ldap_staging_dir = staging_dir + '/ldap'
archive.extract('ldap.db', path=ldap_staging_dir)
utils.stop_lsb_service('openldap')
subprocess.call(['rm', '-rf', ldap_permdir], stdout=DEVNULL)
os.mkdir(ldap_permdir, 0o755)
subprocess.check_call(['slapadd', '-F', '/etc/openldap/schema',
'-l', ldap_staging_dir + '/ldap.db'],
stdout=DEVNULL, stderr=DEVNULL)
except (subprocess.CalledProcessError, OSError, tarfile.TarError):
LOG.error("Failed to restore ldap database.")
raise RestoreFail("Failed to restore ldap configuration")
finally:
utils.start_lsb_service('openldap')
def backup_postgres_size(cinder_config=False):
""" Backup postgres size estimate """
try:
total_size = 0
# Backup roles, table spaces and schemas for databases.
proc = subprocess.Popen([('sudo -u postgres pg_dumpall --clean ' +
'--schema-only | wc -c')], shell=True,
stdout=subprocess.PIPE, stderr=DEVNULL)
for line in proc.stdout:
total_size = int(line)
break
proc.communicate()
# get backup database
backup_databases, backup_db_skip_tables = get_backup_databases(
cinder_config)
# Backup data for databases.
for _, db_elem in enumerate(backup_databases):
db_cmd = 'sudo -u postgres pg_dump --format=plain --inserts '
db_cmd += '--disable-triggers --data-only %s ' % db_elem
for _, table_elem in enumerate(backup_db_skip_tables[db_elem]):
db_cmd += '--exclude-table=%s ' % table_elem
db_cmd += '| wc -c'
proc = subprocess.Popen([db_cmd], shell=True,
stdout=subprocess.PIPE, stderr=DEVNULL)
for line in proc.stdout:
total_size += int(line)
break
proc.communicate()
return total_size
except subprocess.CalledProcessError:
LOG.error("Failed to estimate backup database size.")
raise BackupFail("Failed to estimate backup database size")
def backup_postgres(archive, staging_dir, cinder_config=False):
""" Backup postgres configuration """
try:
postgres_staging_dir = staging_dir + '/postgres'
os.mkdir(postgres_staging_dir, 0o655)
# Backup roles, table spaces and schemas for databases.
subprocess.check_call([('sudo -u postgres pg_dumpall --clean ' +
'--schema-only' +
'> %s/%s' % (postgres_staging_dir,
'postgres.sql.config'))],
shell=True, stderr=DEVNULL)
# get backup database
backup_databases, backup_db_skip_tables = get_backup_databases(
cinder_config)
# Backup data for databases.
for _, db_elem in enumerate(backup_databases):
db_cmd = 'sudo -u postgres pg_dump --format=plain --inserts '
db_cmd += '--disable-triggers --data-only %s ' % db_elem
for _, table_elem in enumerate(backup_db_skip_tables[db_elem]):
db_cmd += '--exclude-table=%s ' % table_elem
db_cmd += '> %s/%s.sql.data' % (postgres_staging_dir, db_elem)
subprocess.check_call([db_cmd], shell=True, stderr=DEVNULL)
archive.add(postgres_staging_dir, arcname='postgres')
except (OSError, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to backup postgres databases.")
raise BackupFail("Failed to backup database configuration")
def restore_postgres(archive, staging_dir):
""" Restore postgres configuration """
try:
postgres_staging_dir = staging_dir + '/postgres'
archive.extractall(path=staging_dir,
members=filter_directory(archive, 'postgres'))
utils.start_service("postgresql")
# Restore roles, table spaces and schemas for databases.
subprocess.check_call(["sudo", "-u", "postgres", "psql", "-f",
postgres_staging_dir +
'/postgres.sql.config', "postgres"],
stdout=DEVNULL, stderr=DEVNULL)
# Restore data for databases.
for data in glob.glob(postgres_staging_dir + '/*.sql.data'):
db_elem = data.split('/')[-1].split('.')[0]
subprocess.check_call(["sudo", "-u", "postgres", "psql", "-f",
data, db_elem],
stdout=DEVNULL)
except (OSError, subprocess.CalledProcessError, tarfile.TarError) as e:
LOG.error("Failed to restore postgres databases. Error: %s", e)
raise RestoreFail("Failed to restore database configuration")
finally:
utils.stop_service('postgresql')
def backup_ceilometer_size(ceilometer_permdir):
""" Backup ceilometer size estimate """
try:
statinfo = os.stat(ceilometer_permdir + '/pipeline.yaml')
return statinfo.st_size
except OSError:
LOG.error("Failed to estimate backup ceilometer size.")
raise BackupFail("Failed to estimate backup ceilometer size")
def backup_ceilometer(archive, ceilometer_permdir):
""" Backup ceilometer """
try:
archive.add(ceilometer_permdir + '/pipeline.yaml',
arcname='pipeline.yaml')
except tarfile.TarError:
LOG.error("Failed to backup ceilometer.")
raise BackupFail("Failed to backup ceilometer")
def restore_ceilometer(archive, ceilometer_permdir):
""" Restore ceilometer """
try:
archive.extract('pipeline.yaml', path=ceilometer_permdir)
except tarfile.TarError:
LOG.error("Failed to restore ceilometer")
raise RestoreFail("Failed to restore ceilometer")
def filter_config_dir(archive, directory):
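    """ Yield archive members located under config/<directory> """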
for tarinfo in archive:
if tarinfo.name.find('config/' + directory) == 0:
yield tarinfo
def restore_config_dir(archive, staging_dir, config_dir, dest_dir):
""" Restore configuration directory if it exists """
try:
archive.extractall(staging_dir,
members=filter_config_dir(archive, config_dir))
# Copy files from backup to dest dir
if (os.path.exists(staging_dir + '/config/' + config_dir) and
os.listdir(staging_dir + '/config/' + config_dir)):
subprocess.call(["mkdir", "-p", dest_dir])
try:
for f in glob.glob(
staging_dir + '/config/' + config_dir + '/*'):
subprocess.check_call(["cp", "-p", f, dest_dir])
except IOError:
LOG.warning("Failed to copy %s files" % config_dir)
except (subprocess.CalledProcessError, tarfile.TarError):
LOG.info("No custom %s config was found during restore." % config_dir)
def backup_std_dir_size(directory):
""" Backup standard directory size estimate """
try:
return utils.directory_get_size(directory)
except OSError:
LOG.error("Failed to estimate backup size for %s" % directory)
raise BackupFail("Failed to estimate backup size for %s" % directory)
def backup_std_dir(archive, directory):
""" Backup standard directory """
try:
archive.add(directory, arcname=os.path.basename(directory))
except tarfile.TarError:
LOG.error("Failed to backup %s" % directory)
raise BackupFail("Failed to backup %s" % directory)
def restore_std_dir(archive, directory):
""" Restore standard directory """
try:
shutil.rmtree(directory, ignore_errors=True)
# Verify that archive contains this directory
try:
archive.getmember(os.path.basename(directory))
except KeyError:
LOG.error("Archive does not contain directory %s" % directory)
raise RestoreFail("Invalid backup file - missing directory %s" %
directory)
archive.extractall(
path=os.path.dirname(directory),
members=filter_directory(archive, os.path.basename(directory)))
except (shutil.Error, tarfile.TarError):
LOG.error("Failed to restore %s" % directory)
raise RestoreFail("Failed to restore %s" % directory)
def configure_loopback_interface(archive):
""" Restore and apply configuration for loopback interface """
utils.remove_interface_config_files()
restore_etc_file(
archive, utils.NETWORK_SCRIPTS_PATH,
'sysconfig/network-scripts/' + utils.NETWORK_SCRIPTS_LOOPBACK)
utils.restart_networking()
def backup_ceph_crush_map(archive, staging_dir):
""" Backup ceph crush map """
try:
ceph_staging_dir = os.path.join(staging_dir, 'ceph')
os.mkdir(ceph_staging_dir, 0o655)
crushmap_file = os.path.join(ceph_staging_dir,
sysinv_constants.CEPH_CRUSH_MAP_BACKUP)
subprocess.check_call(['ceph', 'osd', 'getcrushmap',
'-o', crushmap_file], stdout=DEVNULL,
stderr=DEVNULL)
archive.add(crushmap_file, arcname='ceph/' +
sysinv_constants.CEPH_CRUSH_MAP_BACKUP)
except Exception as e:
LOG.error('Failed to backup ceph crush map. Reason: {}'.format(e))
raise BackupFail('Failed to backup ceph crush map')
def restore_ceph_crush_map(archive):
""" Restore ceph crush map """
if not file_exists_in_archive(archive, 'ceph/' +
sysinv_constants.CEPH_CRUSH_MAP_BACKUP):
return
try:
crush_map_file = 'ceph/' + sysinv_constants.CEPH_CRUSH_MAP_BACKUP
if file_exists_in_archive(archive, crush_map_file):
member = archive.getmember(crush_map_file)
# Copy the member to avoid changing the name for future
# operations on this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member,
path=sysinv_constants.SYSINV_CONFIG_PATH)
except tarfile.TarError as e:
LOG.error('Failed to restore crush map file. Reason: {}'.format(e))
raise RestoreFail('Failed to restore crush map file')
def check_size(archive_dir, cinder_config):
"""Check if there is enough space to create backup."""
backup_overhead_bytes = 1024 ** 3 # extra GB for staging directory
# backup_cinder_size() will return 0 if cinder/lvm is not configured,
# So no need to add extra check here.
backup_size = (backup_overhead_bytes +
backup_etc_size() +
backup_config_size(tsconfig.CONFIG_PATH) +
backup_puppet_data_size(constants.HIERADATA_PERMDIR) +
backup_keyring_size(keyring_permdir) +
backup_ldap_size() +
backup_postgres_size(cinder_config) +
backup_ceilometer_size(ceilometer_permdir) +
backup_std_dir_size(glance_permdir) +
backup_std_dir_size(home_permdir) +
backup_std_dir_size(patching_permdir) +
backup_std_dir_size(patching_repo_permdir) +
backup_std_dir_size(extension_permdir) +
backup_std_dir_size(patch_vault_permdir) +
backup_cinder_size(cinder_permdir)
)
archive_dir_free_space = \
utils.filesystem_get_free_space(archive_dir)
if backup_size > archive_dir_free_space:
print("Archive directory (%s) does not have enough free "
"space (%s), estimated backup size is %s." %
(archive_dir, utils.print_bytes(archive_dir_free_space),
utils.print_bytes(backup_size)))
raise BackupFail("Not enough free space for backup.")
def backup(backup_name, archive_dir, clone=False):
"""Backup configuration."""
if not os.path.isdir(archive_dir):
raise BackupFail("Archive directory (%s) not found." % archive_dir)
if not utils.is_active("management-ip"):
raise BackupFail(
"Backups can only be performed from the active controller.")
if os.path.isfile(backup_in_progress):
raise BackupFail("Backup already in progress.")
else:
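        # Create the backup-in-progress flag; it is removed in the
        # finally block once the backup finishes.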
open(backup_in_progress, 'w')
fmApi = fm_api.FaultAPIs()
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
sysinv_constants.CONTROLLER_HOSTNAME)
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
reason_text=("System Backup in progress."),
# operational
alarm_type=fm_constants.FM_ALARM_TYPE_7,
# congestion
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_8,
proposed_repair_action=("No action required."),
service_affecting=False)
fmApi.set_fault(fault)
cinder_config = False
backend_services = sysinv.get_storage_backend_services()
for services in backend_services.values():
if (services is not None and
services.find(sysinv_constants.SB_SVC_CINDER) != -1):
cinder_config = True
break
staging_dir = None
system_tar_path = None
images_tar_path = None
warnings = ''
try:
os.chdir('/')
if not clone:
check_size(archive_dir, cinder_config)
        print("\nPerforming backup (this might take several minutes):")
staging_dir = tempfile.mkdtemp(dir=archive_dir)
system_tar_path = os.path.join(archive_dir,
backup_name + '_system.tgz')
system_archive = tarfile.open(system_tar_path, "w:gz")
images_tar_path = os.path.join(archive_dir,
backup_name + '_images.tgz')
step = 1
total_steps = 15
if sysinv_constants.SB_TYPE_CEPH in backend_services.keys():
total_steps += 1
if tsconfig.region_config == "yes":
# We don't run the glance backup step
total_steps -= 1
# Step 1: Backup etc
backup_etc(system_archive)
utils.progress(total_steps, step, 'backup etc', 'DONE')
step += 1
# Step 2: Backup configuration
backup_config(system_archive, tsconfig.CONFIG_PATH)
utils.progress(total_steps, step, 'backup configuration', 'DONE')
step += 1
# Step 3: Backup puppet data
backup_puppet_data(system_archive, constants.HIERADATA_PERMDIR)
utils.progress(total_steps, step, 'backup puppet data', 'DONE')
step += 1
# Step 4: Backup keyring
backup_keyring(system_archive, keyring_permdir)
utils.progress(total_steps, step, 'backup keyring', 'DONE')
step += 1
# Step 5: Backup ldap
backup_ldap(system_archive, staging_dir)
utils.progress(total_steps, step, 'backup ldap', 'DONE')
step += 1
# Step 6: Backup postgres
backup_postgres(system_archive, staging_dir, cinder_config)
utils.progress(total_steps, step, 'backup postgres', 'DONE')
step += 1
# Step 7: Backup ceilometer
backup_ceilometer(system_archive, ceilometer_permdir)
utils.progress(total_steps, step, 'backup ceilometer', 'DONE')
step += 1
if tsconfig.region_config != "yes":
# Step 8: Backup glance
images_archive = tarfile.open(images_tar_path, "w:gz")
backup_std_dir(images_archive, glance_permdir)
images_archive.close()
utils.progress(total_steps, step, 'backup glance', 'DONE')
step += 1
# Step 9: Backup home
backup_std_dir(system_archive, home_permdir)
utils.progress(total_steps, step, 'backup home directory', 'DONE')
step += 1
# Step 10: Backup patching
if not clone:
backup_std_dir(system_archive, patching_permdir)
utils.progress(total_steps, step, 'backup patching', 'DONE')
step += 1
# Step 11: Backup patching repo
if not clone:
backup_std_dir(system_archive, patching_repo_permdir)
utils.progress(total_steps, step, 'backup patching repo', 'DONE')
step += 1
# Step 12: Backup extension filesystem
backup_std_dir(system_archive, extension_permdir)
utils.progress(total_steps, step, 'backup extension filesystem '
'directory', 'DONE')
step += 1
# Step 13: Backup patch-vault filesystem
if os.path.exists(patch_vault_permdir):
backup_std_dir(system_archive, patch_vault_permdir)
utils.progress(total_steps, step, 'backup patch-vault filesystem '
'directory', 'DONE')
step += 1
# Step 14: Backup cinder config/LVM config
# No need to add extra check here as if cinder/LVM is not configured,
# ../iscsi-target/saveconfig.json will be absent, so this function will
# do nothing.
backup_cinder_config(system_archive)
utils.progress(total_steps, step, 'backup cinder/LVM config', 'DONE')
step += 1
# Step 15: Backup ceph crush map
if sysinv_constants.SB_TYPE_CEPH in backend_services.keys():
backup_ceph_crush_map(system_archive, staging_dir)
utils.progress(total_steps, step, 'backup ceph crush map', 'DONE')
step += 1
# Step 16: Create archive
system_archive.close()
utils.progress(total_steps, step, 'create archive', 'DONE')
step += 1
except Exception:
if system_tar_path and os.path.isfile(system_tar_path):
os.remove(system_tar_path)
if images_tar_path and os.path.isfile(images_tar_path):
os.remove(images_tar_path)
raise
finally:
fmApi.clear_fault(fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
entity_instance_id)
os.remove(backup_in_progress)
if staging_dir:
shutil.rmtree(staging_dir, ignore_errors=True)
system_msg = "System backup file created"
images_msg = "Images backup file created"
if not clone:
system_msg += ": " + system_tar_path
images_msg += ": " + images_tar_path
print(system_msg)
if tsconfig.region_config != "yes":
print(images_msg)
if warnings != '':
print("WARNING: The following problems occurred:")
print(textwrap.fill(warnings, 80))
def create_restore_runtime_config(filename):
""" Create any runtime parameters needed for Restore."""
config = {}
# We need to re-enable Openstack password rules, which
# were previously disabled while the controller manifests
# were applying during a Restore
config['classes'] = ['keystone::security_compliance']
utils.create_manifest_runtime_config(filename, config)
def overwrite_iscsi_target_config():
"""
Overwrite the current iscsi target config file with the one
from the backup archive.
"""
if not os.path.exists(
cinder_permdir + '/iscsi-target/saveconfig.json'):
LOG.info("Restore: Missing current saveconfig.json file")
return
if not os.path.exists(
cinder_permdir + '/iscsi-target/saveconfig.json.bck'):
LOG.info("Restore: Missing backup saveconfig.json file")
return
os.remove(cinder_permdir + '/iscsi-target/saveconfig.json')
shutil.copyfile(
cinder_permdir + '/iscsi-target/saveconfig.json.bck',
cinder_permdir + '/iscsi-target/saveconfig.json')
os.remove(cinder_permdir + '/iscsi-target/saveconfig.json.bck')
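    # Re-apply the restored configuration to the iscsi target subsystem.
    # (Assumption: targetctl reads the saveconfig.json restored above.)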
subprocess.call(["targetctl", "restore"], stdout=DEVNULL, stderr=DEVNULL)
def restore_complete():
"""
Restore proper ISCSI configuration file after cinder restore.
Enable worker functionality for AIO system.
:return: True if worker-config-complete is executed
"""
if utils.get_system_type() == sysinv_constants.TIS_AIO_BUILD:
if not os.path.isfile(restore_system_ready):
print(textwrap.fill(
"--restore-complete can only be run "
"after restore-system has completed "
"successfully", 80
))
return False
        # The iscsi target config file must be overwritten with the
        # original file from the backup archive. The cinder restore
        # process modifies this file, and those changes prevent VMs
        # that were present at backup time from booting up properly.
        # The original iscsi config file has the proper settings, so
        # we use that.
overwrite_iscsi_target_config()
print("\nApplying worker manifests for %s. " %
(utils.get_controller_hostname()))
print("Node will reboot on completion.")
sysinv.do_worker_config_complete(utils.get_controller_hostname())
# show in-progress log on console every 30 seconds
# until self reboot or timeout
os.remove(restore_system_ready)
time.sleep(30)
for i in range(1, 10):
print("worker manifest apply in progress ... ")
time.sleep(30)
raise RestoreFail("Timeout running worker manifests, "
"reboot did not occur")
else:
if not os.path.isfile(restore_system_ready):
print(textwrap.fill(
"--restore-complete can only be run "
"after restore-system has completed "
"successfully", 80
))
return False
overwrite_iscsi_target_config()
os.remove(restore_system_ready)
return True
def restore_system(backup_file, include_storage_reinstall=False, clone=False):
"""Restoring system configuration."""
if (os.path.exists(constants.CGCS_CONFIG_FILE) or
os.path.exists(tsconfig.CONFIG_PATH) or
os.path.exists(constants.INITIAL_CONFIG_COMPLETE_FILE)):
print(textwrap.fill(
"Configuration has already been done. "
"A system restore operation can only be done "
"immediately after the load has been installed.", 80))
print('')
raise RestoreFail("System configuration already completed")
if not os.path.isabs(backup_file):
raise RestoreFail("Backup file (%s) not found. Full path is "
"required." % backup_file)
if os.path.isfile(restore_in_progress):
raise RestoreFail("Restore already in progress.")
else:
open(restore_in_progress, 'w')
# Add newline to console log for install-clone scenario
newline = clone
staging_dir = None
try:
try:
with open(os.devnull, "w") as fnull:
subprocess.check_call(["vgdisplay", "cgts-vg"],
stdout=fnull,
stderr=fnull)
except subprocess.CalledProcessError:
LOG.error("The cgts-vg volume group was not found")
raise RestoreFail("Volume groups not configured")
print("\nRestoring system (this will take several minutes):")
# Use /scratch for the staging dir for now,
# until /opt/backups is available
staging_dir = tempfile.mkdtemp(dir='/scratch')
# Permission change required or postgres restore fails
subprocess.call(['chmod', 'a+rx', staging_dir], stdout=DEVNULL)
os.chdir('/')
step = 1
total_steps = 24
# Step 1: Open archive and verify installed load matches backup
try:
archive = tarfile.open(backup_file)
except tarfile.TarError as e:
LOG.exception(e)
raise RestoreFail("Error opening backup file. Invalid backup "
"file.")
check_load_versions(archive, staging_dir)
check_load_subfunctions(archive, staging_dir)
utils.progress(total_steps, step, 'open archive', 'DONE', newline)
step += 1
# Patching is potentially a multi-phase step.
# If the controller is impacted by patches from the backup,
# it must be rebooted before continuing the restore.
# If this is the second pass through, we can skip over this.
if not os.path.isfile(restore_patching_complete) and not clone:
# Step 2: Restore patching
restore_std_dir(archive, patching_permdir)
utils.progress(total_steps, step, 'restore patching', 'DONE',
newline)
step += 1
# Step 3: Restore patching repo
restore_std_dir(archive, patching_repo_permdir)
utils.progress(total_steps, step, 'restore patching repo', 'DONE',
newline)
step += 1
# Step 4: Apply patches
try:
subprocess.check_output(["sw-patch", "install-local"])
except subprocess.CalledProcessError:
LOG.error("Failed to install patches")
raise RestoreFail("Failed to install patches")
utils.progress(total_steps, step, 'install patches', 'DONE',
newline)
step += 1
open(restore_patching_complete, 'w')
# If the controller was impacted by patches, we need to reboot.
if os.path.isfile(node_is_patched):
if not clone:
print("\nThis controller has been patched. " +
"A reboot is required.")
print("After the reboot is complete, " +
"re-execute the restore command.")
while True:
user_input = input(
"Enter 'reboot' to reboot controller: ")
if user_input == 'reboot':
break
LOG.info("This controller has been patched. Rebooting now")
print("\nThis controller has been patched. Rebooting now\n\n")
time.sleep(5)
os.remove(restore_in_progress)
if staging_dir:
shutil.rmtree(staging_dir, ignore_errors=True)
subprocess.call("reboot")
else:
# We need to restart the patch controller and agent, since
# we setup the repo and patch store outside its control
with open(os.devnull, "w") as devnull:
subprocess.call(
["systemctl",
"restart",
"sw-patch-controller-daemon.service"],
stdout=devnull, stderr=devnull)
subprocess.call(
["systemctl",
"restart",
"sw-patch-agent.service"],
stdout=devnull, stderr=devnull)
if clone:
# No patches were applied, return to cloning code
# to run validation code.
return RESTORE_RERUN_REQUIRED
else:
# Add the skipped steps
step += 3
if os.path.isfile(node_is_patched):
# If we get here, it means the node was patched by the user
# AFTER the restore applied patches and rebooted, but didn't
# reboot.
# This means the patch lineup no longer matches what's in the
# backup, but we can't (and probably shouldn't) prevent that.
# However, since this will ultimately cause the node to fail
# the goenabled step, we can fail immediately and force the
# user to reboot.
                print("\nThis controller has been patched, but not rebooted.")
                print("Please reboot before continuing the restore process.")
raise RestoreFail("Controller node patched without rebooting")
# Flag can now be cleared
if os.path.exists(restore_patching_complete):
os.remove(restore_patching_complete)
# Prefetch keyring
prefetch_keyring(archive)
# Step 5: Restore configuration
restore_configuration(archive, staging_dir)
# In AIO SX systems, the loopback interface is used as the management
# interface. However, the application of the interface manifest will
# not configure the necessary addresses on the loopback interface (see
# apply_network_config.sh for details). So, we need to configure the
# loopback interface here.
if tsconfig.system_mode == sysinv_constants.SYSTEM_MODE_SIMPLEX:
configure_loopback_interface(archive)
# Write the simplex flag
utils.write_simplex_flag()
utils.progress(total_steps, step, 'restore configuration', 'DONE',
newline)
step += 1
# Step 6: Apply restore bootstrap manifest
controller_0_address = utils.get_address_from_hosts_file(
'controller-0')
restore_static_puppet_data(archive, constants.HIERADATA_WORKDIR)
try:
utils.apply_manifest(controller_0_address,
sysinv_constants.CONTROLLER,
'bootstrap',
constants.HIERADATA_WORKDIR)
except Exception as e:
LOG.exception(e)
raise RestoreFail(
'Failed to apply bootstrap manifest. '
'See /var/log/puppet/latest/puppet.log for details.')
utils.progress(total_steps, step, 'apply bootstrap manifest', 'DONE',
newline)
step += 1
# Step 7: Restore puppet data
restore_puppet_data(archive, constants.HIERADATA_WORKDIR)
utils.progress(total_steps, step, 'restore puppet data', 'DONE',
newline)
step += 1
# Step 8: Persist configuration
utils.persist_config()
utils.progress(total_steps, step, 'persist configuration', 'DONE',
newline)
step += 1
# Step 9: Apply controller manifest
try:
utils.apply_manifest(controller_0_address,
sysinv_constants.CONTROLLER,
'controller',
constants.HIERADATA_PERMDIR)
except Exception as e:
LOG.exception(e)
raise RestoreFail(
'Failed to apply controller manifest. '
'See /var/log/puppet/latest/puppet.log for details.')
utils.progress(total_steps, step, 'apply controller manifest', 'DONE',
newline)
step += 1
# Step 10: Apply runtime controller manifests
restore_filename = os.path.join(staging_dir, 'restore.yaml')
create_restore_runtime_config(restore_filename)
try:
utils.apply_manifest(controller_0_address,
sysinv_constants.CONTROLLER,
'runtime',
constants.HIERADATA_PERMDIR,
runtime_filename=restore_filename)
except Exception as e:
LOG.exception(e)
raise RestoreFail(
'Failed to apply runtime controller manifest. '
'See /var/log/puppet/latest/puppet.log for details.')
utils.progress(total_steps, step,
'apply runtime controller manifest', 'DONE',
newline)
step += 1
# Move the staging dir under /opt/backups, now that it's setup
shutil.rmtree(staging_dir, ignore_errors=True)
staging_dir = tempfile.mkdtemp(dir=constants.BACKUPS_PATH)
# Permission change required or postgres restore fails
subprocess.call(['chmod', 'a+rx', staging_dir], stdout=DEVNULL)
# Step 11: Restore cinder config file
restore_cinder_config(archive)
utils.progress(total_steps, step, 'restore cinder config', 'DONE',
newline)
step += 1
# Step 12: Apply banner customization
utils.apply_banner_customization()
utils.progress(total_steps, step, 'apply banner customization', 'DONE',
newline)
step += 1
# Step 13: Restore dnsmasq and pxeboot config
restore_dnsmasq(archive, tsconfig.CONFIG_PATH)
utils.progress(total_steps, step, 'restore dnsmasq', 'DONE', newline)
step += 1
# Step 14: Restore keyring
restore_keyring(archive, keyring_permdir)
utils.progress(total_steps, step, 'restore keyring', 'DONE', newline)
step += 1
# Step 15: Restore ldap
restore_ldap(archive, ldap_permdir, staging_dir)
utils.progress(total_steps, step, 'restore ldap', 'DONE', newline)
step += 1
# Step 16: Restore postgres
restore_postgres(archive, staging_dir)
utils.progress(total_steps, step, 'restore postgres', 'DONE', newline)
step += 1
# Step 17: Restore ceilometer
restore_ceilometer(archive, ceilometer_permdir)
utils.progress(total_steps, step, 'restore ceilometer', 'DONE',
newline)
step += 1
# Step 18: Restore ceph crush map
restore_ceph_crush_map(archive)
utils.progress(total_steps, step, 'restore ceph crush map', 'DONE',
newline)
step += 1
# Step 19: Restore home
restore_std_dir(archive, home_permdir)
utils.progress(total_steps, step, 'restore home directory', 'DONE',
newline)
step += 1
# Step 20: Restore extension filesystem
restore_std_dir(archive, extension_permdir)
utils.progress(total_steps, step, 'restore extension filesystem '
'directory', 'DONE', newline)
step += 1
# Step 21: Restore patch-vault filesystem
if file_exists_in_archive(archive,
os.path.basename(patch_vault_permdir)):
restore_std_dir(archive, patch_vault_permdir)
utils.progress(total_steps, step, 'restore patch-vault filesystem '
'directory', 'DONE', newline)
step += 1
# Step 22: Restore external ceph configuration files.
restore_ceph_external_config_files(archive, staging_dir)
utils.progress(total_steps, step, 'restore CEPH external config',
'DONE', newline)
step += 1
# Step 23: Shutdown file systems
archive.close()
shutil.rmtree(staging_dir, ignore_errors=True)
utils.shutdown_file_systems()
utils.progress(total_steps, step, 'shutdown file systems', 'DONE',
newline)
step += 1
# Step 24: Recover services
utils.mtce_restart()
utils.mark_config_complete()
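        # Allow time for services to come up before polling them below.
        # (Assumption: the 120 second delay is an empirically chosen value.)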
time.sleep(120)
for service in ['sysinv-conductor', 'sysinv-inv']:
if not utils.wait_sm_service(service):
raise RestoreFail("Services have failed to initialize.")
utils.progress(total_steps, step, 'recover services', 'DONE', newline)
step += 1
if tsconfig.system_mode != sysinv_constants.SYSTEM_MODE_SIMPLEX:
print("\nRestoring node states (this will take several minutes):")
backend_services = sysinv.get_storage_backend_services()
with openstack.OpenStack() as client:
                # On ceph setups storage nodes take about 90 seconds to
                # become locked, so the timeout is raised to 120 seconds
                # for such setups.
lock_timeout = 60
if sysinv_constants.SB_TYPE_CEPH in backend_services.keys():
lock_timeout = 120
failed_lock_host = False
skip_hosts = ['controller-0']
if not include_storage_reinstall:
storage_hosts = \
sysinv.get_hosts(client.admin_token,
client.conf['region_name'],
personality='storage')
if storage_hosts:
install_uuid = utils.get_install_uuid()
for h in storage_hosts:
skip_hosts.append(h.name)
# Update install_uuid on the storage node
client.sysinv.ihost.update_install_uuid(
h.uuid,
install_uuid)
skip_hosts_count = len(skip_hosts)
                # Wait for nodes to be identified as disabled before
                # attempting to lock hosts. Even if nodes are still not
                # identified as disabled after 3 minutes, we continue the
                # restore anyway.
                if not client.wait_for_hosts_disabled(
                        exempt_hostnames=skip_hosts,
                        timeout=180):
                    LOG.info("At least one node is not yet disabled. "
                             "Continuing.")
print("\nLocking nodes:")
try:
failed_hosts = client.lock_hosts(skip_hosts,
utils.progress,
timeout=lock_timeout)
                    # Don't power off nodes that could not be locked.
                    # Use extend() so individual hostnames, not a nested
                    # list, are added to skip_hosts.
                    if len(failed_hosts) > 0:
                        skip_hosts.extend(failed_hosts)
except (KeystoneFail, SysInvFail) as e:
LOG.exception(e)
failed_lock_host = True
if not failed_lock_host:
print("\nPowering-off nodes:")
try:
client.power_off_hosts(skip_hosts,
utils.progress,
timeout=60)
except (KeystoneFail, SysInvFail) as e:
LOG.exception(e)
                        # Power-off failures are tolerated here; the user
                        # is advised below to power off nodes manually.
if failed_lock_host or len(skip_hosts) > skip_hosts_count:
if include_storage_reinstall:
print(textwrap.fill(
"Failed to lock at least one node. " +
"Please lock the unlocked nodes manually.", 80
))
else:
print(textwrap.fill(
"Failed to lock at least one node. " +
"Please lock the unlocked controller-1 or " +
"worker nodes manually.", 80
))
if not clone:
print(textwrap.fill(
"Before continuing to the next step in the restore, " +
"please ensure all nodes other than controller-0 " +
"and storage nodes, if they are not being " +
"reinstalled, are powered off. Please refer to the " +
"system administration guide for more details.", 80
))
finally:
os.remove(restore_in_progress)
if staging_dir:
shutil.rmtree(staging_dir, ignore_errors=True)
cleanup_prefetched_keyring()
fmApi = fm_api.FaultAPIs()
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
sysinv_constants.CONTROLLER_HOSTNAME)
fault = fm_api.Fault(
alarm_id=fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
alarm_state=fm_constants.FM_ALARM_STATE_MSG,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
reason_text=("System Restore complete."),
# other
alarm_type=fm_constants.FM_ALARM_TYPE_0,
# unknown
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN,
proposed_repair_action=(""),
service_affecting=False)
fmApi.set_fault(fault)
# Mark system restore as complete
if (utils.get_controller_hostname() ==
sysinv_constants.CONTROLLER_0_HOSTNAME):
# Create the flag file that permits the
# restore_complete command option.
utils.touch(restore_system_ready)
return RESTORE_COMPLETE
def restore_images(backup_file, clone=False):
"""Restoring images."""
if not os.path.exists(constants.INITIAL_CONFIG_COMPLETE_FILE):
print(textwrap.fill(
"System restore has not been done. "
"An image restore operation can only be done after "
"the system restore has been completed.", 80))
print('')
raise RestoreFail("System restore required")
if not os.path.isabs(backup_file):
raise RestoreFail("Backup file (%s) not found. Full path is "
"required." % backup_file)
if os.path.isfile(restore_in_progress):
raise RestoreFail("Restore already in progress.")
else:
open(restore_in_progress, 'w')
# Add newline to console log for install-clone scenario
newline = clone
try:
print("\nRestoring images (this will take several minutes):")
os.chdir('/')
step = 1
total_steps = 2
# Step 1: Open archive
try:
archive = tarfile.open(backup_file)
except tarfile.TarError as e:
LOG.exception(e)
raise RestoreFail("Error opening backup file. Invalid backup "
"file.")
utils.progress(total_steps, step, 'open archive', 'DONE', newline)
step += 1
# Step 2: Restore glance
restore_std_dir(archive, glance_permdir)
utils.progress(total_steps, step, 'restore glance', 'DONE',
newline)
step += 1
archive.close()
finally:
os.remove(restore_in_progress)