Restore proper iscsi target config file
Using the upstream implementation of cinder backup and restore breaks bringing up any instance that was booted from volume and still running at the time of the backup. This is due to the cinder restore process writing data through iscsi back to the volumes, thus adding new iscsi options for the connection and breaking the existing connections to VMs. Changes done to fix this: - at restore, restore the iscsi config file, but also make a copy of it - after volumes are restored, use 'config_controller --restore-complete' to overwrite the broken config file with the original one from the above copy - 'config_controller --restore-complete' replaces 'config_controller --restore-compute' on AIO setups - 'config_controller --restore-complete' must now also be run on non-AIO setups - as part of the restore procedure, compute hosts cannot be unlocked until 'config_controller --restore-complete' is run Story: 2002824 Task: 22741 Depends-On: https://review.openstack.org/584492 Change-Id: I441997019987d593872c82dd62baafbb397a97b1 Signed-off-by: Jack Ding <jack.ding@windriver.com>
This commit is contained in:
parent
946afa3cfd
commit
7bc4528e9d
|
@ -45,7 +45,7 @@ RESTORE_RERUN_REQUIRED = "restore-rerun-required"
|
|||
# Backup/restore related constants
|
||||
backup_in_progress = tsconfig.BACKUP_IN_PROGRESS_FLAG
|
||||
restore_in_progress = tsconfig.RESTORE_IN_PROGRESS_FLAG
|
||||
restore_compute_ready = '/var/run/.restore_compute_ready'
|
||||
restore_system_ready = tsconfig.RESTORE_SYSTEM_FLAG
|
||||
restore_patching_complete = '/etc/platform/.restore_patching_complete'
|
||||
node_is_patched = '/var/run/node_is_patched'
|
||||
keyring_permdir = os.path.join('/opt/platform/.keyring', tsconfig.SW_VERSION)
|
||||
|
@ -691,6 +691,12 @@ def restore_cinder_config(archive):
|
|||
restore_cinder_file(
|
||||
archive, cinder_permdir + '/iscsi-target',
|
||||
'saveconfig.json')
|
||||
# Also create a copy of the original file as the volume
|
||||
# restore procedure changes this file and breaks the
|
||||
# valid nova settings.
|
||||
shutil.copyfile(
|
||||
cinder_permdir + '/iscsi-target/saveconfig.json',
|
||||
cinder_permdir + '/iscsi-target/saveconfig.json.bck')
|
||||
|
||||
|
||||
def backup_cinder_size(cinder_permdir):
|
||||
|
@ -1378,20 +1384,55 @@ def create_restore_runtime_config(filename):
|
|||
utils.create_manifest_runtime_config(filename, config)
|
||||
|
||||
|
||||
def restore_compute():
|
||||
def overwrite_iscsi_target_config():
    """
    Overwrite the current iscsi target config file with the one
    from the backup archive.

    The saved copy (saveconfig.json.bck) is moved over the live
    saveconfig.json and the target configuration is then reloaded with
    'targetctl restore'. If either file is missing, a message is logged
    and nothing is changed.
    """
    current_config = cinder_permdir + '/iscsi-target/saveconfig.json'
    backup_config = current_config + '.bck'

    if not os.path.exists(current_config):
        LOG.info("Restore: Missing current saveconfig.json file")
        return

    if not os.path.exists(backup_config):
        LOG.info("Restore: Missing backup saveconfig.json file")
        return

    # Move the backup over the live file in a single step. Removing the
    # live file first and then copying would leave the system without any
    # saveconfig.json if the copy failed part way through. The end state
    # is the same: the live file holds the backup content and the .bck
    # file no longer exists.
    os.rename(backup_config, current_config)

    # Reload the target configuration from the restored file. A failure is
    # logged rather than raised so the rest of the restore can continue.
    rc = subprocess.call(["targetctl", "restore"],
                         stdout=DEVNULL, stderr=DEVNULL)
    if rc != 0:
        LOG.error("Restore: 'targetctl restore' failed with "
                  "return code %s" % rc)
|
||||
|
||||
|
||||
def restore_complete():
|
||||
"""
|
||||
Restore proper ISCSI configuration file after cinder restore.
|
||||
Enable compute functionality for AIO system.
|
||||
:return: True if compute-config-complete is executed
|
||||
"""
|
||||
if utils.get_system_type() == sysinv_constants.TIS_AIO_BUILD:
|
||||
if not os.path.isfile(restore_compute_ready):
|
||||
if not os.path.isfile(restore_system_ready):
|
||||
print textwrap.fill(
|
||||
"--restore-compute can only be run "
|
||||
"--restore-complete can only be run "
|
||||
"after restore-system has completed "
|
||||
"successfully", 80
|
||||
)
|
||||
return False
|
||||
|
||||
# The iscsi target config file must be overwritten with the
|
||||
# original file from the backup archive.
|
||||
# This is due to the cinder restore process actually changing
|
||||
# this file. These changes cause VMs that were present at
|
||||
# backup time to not boot up properly anymore.
|
||||
# The original iscsi config file has the proper settings so
|
||||
# we use that.
|
||||
overwrite_iscsi_target_config()
|
||||
|
||||
print ("\nApplying compute manifests for %s. " %
|
||||
(utils.get_controller_hostname()))
|
||||
print ("Node will reboot on completion.")
|
||||
|
@ -1400,6 +1441,7 @@ def restore_compute():
|
|||
|
||||
# show in-progress log on console every 30 seconds
|
||||
# until self reboot or timeout
|
||||
os.remove(restore_system_ready)
|
||||
time.sleep(30)
|
||||
for i in range(1, 10):
|
||||
print("compute manifest apply in progress ... ")
|
||||
|
@ -1407,14 +1449,18 @@ def restore_compute():
|
|||
|
||||
raise RestoreFail("Timeout running compute manifests, "
|
||||
"reboot did not occur")
|
||||
return True
|
||||
|
||||
else:
|
||||
print textwrap.fill(
|
||||
"--restore-compute option is only applicable to "
|
||||
"the All-In-One system type. Command not executed", 80
|
||||
)
|
||||
return False
|
||||
if not os.path.isfile(restore_system_ready):
|
||||
print textwrap.fill(
|
||||
"--restore-complete can only be run "
|
||||
"after restore-system has completed "
|
||||
"successfully", 80
|
||||
)
|
||||
return False
|
||||
overwrite_iscsi_target_config()
|
||||
os.remove(restore_system_ready)
|
||||
return True
|
||||
|
||||
|
||||
def restore_system(backup_file, clone=False):
|
||||
|
@ -1833,13 +1879,12 @@ def restore_system(backup_file, clone=False):
|
|||
|
||||
fmApi.set_fault(fault)
|
||||
|
||||
# Operational check for controller-0 in AIO system.
|
||||
if (utils.get_system_type() == sysinv_constants.TIS_AIO_BUILD and
|
||||
utils.get_controller_hostname() ==
|
||||
# Mark system restore as complete
|
||||
if (utils.get_controller_hostname() ==
|
||||
sysinv_constants.CONTROLLER_0_HOSTNAME):
|
||||
# Create the flag file that permits the
|
||||
# restore_compute command option.
|
||||
utils.touch(restore_compute_ready)
|
||||
# restore_complete command option.
|
||||
utils.touch(restore_system_ready)
|
||||
|
||||
return RESTORE_COMPLETE
|
||||
|
||||
|
|
|
@ -286,9 +286,7 @@ def show_help():
|
|||
"--restore-images <name> Restore images from backup file with the "
|
||||
"given name,\n"
|
||||
" full path required\n"
|
||||
"--restore-compute Restore controller-0 compute function for"
|
||||
" All-In-One\n"
|
||||
" system, controller-0 will reboot\n"
|
||||
"--restore-complete Complete restore of controller-0"
|
||||
"--allow-ssh Allow configuration to be executed in "
|
||||
"ssh\n"
|
||||
% sys.argv[0])
|
||||
|
@ -324,7 +322,7 @@ def main():
|
|||
do_backup = False
|
||||
do_system_restore = False
|
||||
do_images_restore = False
|
||||
do_compute_restore = False
|
||||
do_complete_restore = False
|
||||
do_clone = False
|
||||
do_non_interactive = False
|
||||
do_provision = False
|
||||
|
@ -374,8 +372,8 @@ def main():
|
|||
print "--restore-images requires the filename of the backup"
|
||||
exit(1)
|
||||
do_images_restore = True
|
||||
elif sys.argv[arg] == "--restore-compute":
|
||||
do_compute_restore = True
|
||||
elif sys.argv[arg] == "--restore-complete":
|
||||
do_complete_restore = True
|
||||
elif sys.argv[arg] == "--archive-dir":
|
||||
arg += 1
|
||||
if arg < len(sys.argv):
|
||||
|
@ -426,7 +424,7 @@ def main():
|
|||
if [do_backup,
|
||||
do_system_restore,
|
||||
do_images_restore,
|
||||
do_compute_restore,
|
||||
do_complete_restore,
|
||||
do_clone,
|
||||
do_default_config,
|
||||
do_non_interactive].count(True) > 1:
|
||||
|
@ -436,7 +434,7 @@ def main():
|
|||
if answerfile and [do_backup,
|
||||
do_system_restore,
|
||||
do_images_restore,
|
||||
do_compute_restore,
|
||||
do_complete_restore,
|
||||
do_clone,
|
||||
do_default_config,
|
||||
do_non_interactive].count(True) > 0:
|
||||
|
@ -474,8 +472,8 @@ def main():
|
|||
elif do_images_restore:
|
||||
backup_restore.restore_images(backup_name)
|
||||
print "\nImages restore complete"
|
||||
elif do_compute_restore:
|
||||
backup_restore.restore_compute()
|
||||
elif do_complete_restore:
|
||||
backup_restore.restore_complete()
|
||||
elif do_clone:
|
||||
clone.clone(backup_name, archive_dir)
|
||||
print "\nCloning complete"
|
||||
|
|
|
@ -3932,6 +3932,20 @@ class HostController(rest.RestController):
|
|||
|
||||
raise wsme.exc.ClientSideError('%s' % msg)
|
||||
|
||||
@staticmethod
def _semantic_check_restore_complete(ihost):
    """
    Reject the operation while a system restore is still pending.

    The restore is considered pending for as long as the flag file
    tsc.RESTORE_SYSTEM_FLAG exists; the error message tells the operator
    to run "config_controller --restore-complete" first.

    :param ihost: host record; its 'hostname' entry is used in the message
    :raises wsme.exc.ClientSideError: if the restore flag file exists
    """
    restore_pending = os.path.isfile(tsc.RESTORE_SYSTEM_FLAG)
    if not restore_pending:
        return

    reason = _("Cannot unlock host %s. Please restore any volumes "
               "and then complete the restore procedure by running "
               "'config_controller --restore-complete' first. "
               "Please refer to system admin guide for more details.")
    raise wsme.exc.ClientSideError(reason % (ihost['hostname']))
|
||||
|
||||
@staticmethod
|
||||
def _handle_ttys_dcd_change(ihost, ttys_dcd):
|
||||
"""
|
||||
|
@ -4830,6 +4844,9 @@ class HostController(rest.RestController):
|
|||
"configure host and wait for Availability State "
|
||||
"'online' prior to unlock." % hostupdate.displayid))
|
||||
|
||||
# Check whether a restore was properly completed
|
||||
self._semantic_check_restore_complete(ihost)
|
||||
|
||||
# sdn configuration check
|
||||
self._semantic_check_sdn_attributes(ihost)
|
||||
|
||||
|
|
Loading…
Reference in New Issue