Restore proper iscsi target config file

Using the upstream implementation of cinder backup and restore
breaks bringing up any instance that was booted from volume
and still running at the time of the backup.

This is due to cinder restore process, writing data through iscsi
back to the volumes, thus adding new iscsi options for the
connection and breaking the existing connections to VMs.

Changes done to fix this:
- at restore, restore the iscsi config file, but also make a copy
  of it
- after volumes are restored, use 'config_controller --restore-complete'
  to overwrite the broken config file, with the original one from
  the above copy
- 'config_controller --restore-complete' replaces 'config_controller
  --restore-compute' on AIO setups
- 'config_controller --restore-complete' must now be also run on
  non-AIO setups.
- as part of the restore procedure, compute hosts cannot be unlocked
  until 'config_controller --restore-complete' is run

Story: 2002824
Task: 22741
Depends-On: https://review.openstack.org/584492

Change-Id: I441997019987d593872c82dd62baafbb397a97b1
Signed-off-by: Jack Ding <jack.ding@windriver.com>
This commit is contained in:
Stefan Dinescu 2018-06-07 11:08:30 +00:00 committed by Jack Ding
parent 946afa3cfd
commit 7bc4528e9d
3 changed files with 85 additions and 25 deletions

View File

@ -45,7 +45,7 @@ RESTORE_RERUN_REQUIRED = "restore-rerun-required"
# Backup/restore related constants
backup_in_progress = tsconfig.BACKUP_IN_PROGRESS_FLAG
restore_in_progress = tsconfig.RESTORE_IN_PROGRESS_FLAG
restore_compute_ready = '/var/run/.restore_compute_ready'
restore_system_ready = tsconfig.RESTORE_SYSTEM_FLAG
restore_patching_complete = '/etc/platform/.restore_patching_complete'
node_is_patched = '/var/run/node_is_patched'
keyring_permdir = os.path.join('/opt/platform/.keyring', tsconfig.SW_VERSION)
@ -691,6 +691,12 @@ def restore_cinder_config(archive):
restore_cinder_file(
archive, cinder_permdir + '/iscsi-target',
'saveconfig.json')
# Also create a copy of the original file as the volume
# restore procedure changes this file and breaks the
# valid nova settings.
shutil.copyfile(
cinder_permdir + '/iscsi-target/saveconfig.json',
cinder_permdir + '/iscsi-target/saveconfig.json.bck')
def backup_cinder_size(cinder_permdir):
@ -1378,20 +1384,55 @@ def create_restore_runtime_config(filename):
utils.create_manifest_runtime_config(filename, config)
def restore_compute():
def overwrite_iscsi_target_config():
    """
    Overwrite the current iscsi target config file with the one
    from the backup archive.

    The cinder volume restore rewrites saveconfig.json with new
    connection options, which breaks existing iscsi connections of
    VMs that were running at backup time. restore_cinder_config()
    stashed a pristine copy as saveconfig.json.bck; move it back over
    the broken file, then reload the target configuration.

    Logs and returns without action if either file is missing.
    """
    config_file = cinder_permdir + '/iscsi-target/saveconfig.json'
    backup_file = config_file + '.bck'

    if not os.path.exists(config_file):
        LOG.info("Restore: Missing current saveconfig.json file")
        return
    if not os.path.exists(backup_file):
        LOG.info("Restore: Missing backup saveconfig.json file")
        return

    # A single move replaces the broken config atomically on the same
    # filesystem. The previous remove + copyfile + remove sequence was
    # redundant (copyfile truncates the destination anyway) and left a
    # window with no config file on disk at all.
    shutil.move(backup_file, config_file)

    # Reload the restored configuration into the iscsi target.
    subprocess.call(["targetctl", "restore"], stdout=DEVNULL, stderr=DEVNULL)
def restore_complete():
"""
Restore proper ISCSI configuration file after cinder restore.
Enable compute functionality for AIO system.
:return: True if compute-config-complete is executed
"""
if utils.get_system_type() == sysinv_constants.TIS_AIO_BUILD:
if not os.path.isfile(restore_compute_ready):
if not os.path.isfile(restore_system_ready):
print textwrap.fill(
"--restore-compute can only be run "
"--restore-complete can only be run "
"after restore-system has completed "
"successfully", 80
)
return False
# The iscsi target config file must be overwritten with the
# original file from the backup archive.
# This is due to the cinder restore process actually changing
# this file. These changes cause VMs that were present at
# backup time to not boot up properly anymore.
# The original iscsi config file has the proper settings so
# we use that.
overwrite_iscsi_target_config()
print ("\nApplying compute manifests for %s. " %
(utils.get_controller_hostname()))
print ("Node will reboot on completion.")
@ -1400,6 +1441,7 @@ def restore_compute():
# show in-progress log on console every 30 seconds
# until self reboot or timeout
os.remove(restore_system_ready)
time.sleep(30)
for i in range(1, 10):
print("compute manifest apply in progress ... ")
@ -1407,14 +1449,18 @@ def restore_compute():
raise RestoreFail("Timeout running compute manifests, "
"reboot did not occur")
return True
else:
print textwrap.fill(
"--restore-compute option is only applicable to "
"the All-In-One system type. Command not executed", 80
)
return False
if not os.path.isfile(restore_system_ready):
print textwrap.fill(
"--restore-complete can only be run "
"after restore-system has completed "
"successfully", 80
)
return False
overwrite_iscsi_target_config()
os.remove(restore_system_ready)
return True
def restore_system(backup_file, clone=False):
@ -1833,13 +1879,12 @@ def restore_system(backup_file, clone=False):
fmApi.set_fault(fault)
# Operational check for controller-0 in AIO system.
if (utils.get_system_type() == sysinv_constants.TIS_AIO_BUILD and
utils.get_controller_hostname() ==
# Mark system restore as complete
if (utils.get_controller_hostname() ==
sysinv_constants.CONTROLLER_0_HOSTNAME):
# Create the flag file that permits the
# restore_compute command option.
utils.touch(restore_compute_ready)
# restore_complete command option.
utils.touch(restore_system_ready)
return RESTORE_COMPLETE

View File

@ -286,9 +286,7 @@ def show_help():
"--restore-images <name> Restore images from backup file with the "
"given name,\n"
" full path required\n"
"--restore-compute Restore controller-0 compute function for"
" All-In-One\n"
" system, controller-0 will reboot\n"
"--restore-complete Complete restore of controller-0"
"--allow-ssh Allow configuration to be executed in "
"ssh\n"
% sys.argv[0])
@ -324,7 +322,7 @@ def main():
do_backup = False
do_system_restore = False
do_images_restore = False
do_compute_restore = False
do_complete_restore = False
do_clone = False
do_non_interactive = False
do_provision = False
@ -374,8 +372,8 @@ def main():
print "--restore-images requires the filename of the backup"
exit(1)
do_images_restore = True
elif sys.argv[arg] == "--restore-compute":
do_compute_restore = True
elif sys.argv[arg] == "--restore-complete":
do_complete_restore = True
elif sys.argv[arg] == "--archive-dir":
arg += 1
if arg < len(sys.argv):
@ -426,7 +424,7 @@ def main():
if [do_backup,
do_system_restore,
do_images_restore,
do_compute_restore,
do_complete_restore,
do_clone,
do_default_config,
do_non_interactive].count(True) > 1:
@ -436,7 +434,7 @@ def main():
if answerfile and [do_backup,
do_system_restore,
do_images_restore,
do_compute_restore,
do_complete_restore,
do_clone,
do_default_config,
do_non_interactive].count(True) > 0:
@ -474,8 +472,8 @@ def main():
elif do_images_restore:
backup_restore.restore_images(backup_name)
print "\nImages restore complete"
elif do_compute_restore:
backup_restore.restore_compute()
elif do_complete_restore:
backup_restore.restore_complete()
elif do_clone:
clone.clone(backup_name, archive_dir)
print "\nCloning complete"

View File

@ -3932,6 +3932,20 @@ class HostController(rest.RestController):
raise wsme.exc.ClientSideError('%s' % msg)
@staticmethod
def _semantic_check_restore_complete(ihost):
    """
    During a restore procedure, checks compute nodes can be unlocked
    only after running "config_controller --restore-complete"

    Rejects the unlock (ClientSideError) for as long as the
    restore-in-progress flag file is still present on disk.
    """
    restore_pending = os.path.isfile(tsc.RESTORE_SYSTEM_FLAG)
    if not restore_pending:
        # Restore already completed (or never started); unlock allowed.
        return

    raise wsme.exc.ClientSideError(
        _("Cannot unlock host %s. Please restore any volumes "
          "and then complete the restore procedure by running "
          "'config_controller --restore-complete' first. "
          "Please refer to system admin guide for more details.") %
        (ihost['hostname']))
@staticmethod
def _handle_ttys_dcd_change(ihost, ttys_dcd):
"""
@ -4830,6 +4844,9 @@ class HostController(rest.RestController):
"configure host and wait for Availability State "
"'online' prior to unlock." % hostupdate.displayid))
# Check whether a restore was properly completed
self._semantic_check_restore_complete(ihost)
# sdn configuration check
self._semantic_check_sdn_attributes(ihost)