# # Copyright (c) 2015-2016 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # import os import six from nfv_common import config from nfv_common import debug from nfv_common import timers from nfv_common.helpers import coroutine from nfv_common.helpers import Singleton from nfv_vim import dor from nfv_vim import nfvi from nfv_vim import objects from nfv_vim import tables from nfv_vim.directors._directors_defs import Operation from nfv_vim.directors._directors_defs import OPERATION_STATE from nfv_vim.directors._directors_defs import OPERATION_TYPE DLOG = debug.debug_get_logger('nfv_vim.instance_director') _instance_director = None NFV_VIM_UNLOCK_COMPLETE_FILE = '/var/run/.nfv-vim.unlock_complete' @six.add_metaclass(Singleton) class InstanceDirector(object): """ Instance Director """ def __init__(self, max_concurrent_recovering_instances, max_concurrent_migrates_per_host, max_concurrent_evacuates_per_host, recovery_audit_interval, recovery_audit_cooldown, recovery_audit_batch_interval, recovery_cooldown, rebuild_timeout, reboot_timeout, migrate_timeout, single_hypervisor, recovery_threshold, max_throttled_recovering_instances): self._max_concurrent_recovering_instances \ = max_concurrent_recovering_instances self._max_concurrent_migrates_per_host \ = max_concurrent_migrates_per_host self._max_concurrent_evacuates_per_host \ = max_concurrent_evacuates_per_host self._recovery_audit_interval = recovery_audit_interval self._recovery_audit_cooldown = recovery_audit_cooldown self._recovery_audit_batch_interval = recovery_audit_batch_interval self._recovery_cooldown = recovery_cooldown self._rebuild_timeout = rebuild_timeout self._reboot_timeout = reboot_timeout self._migrate_timeout = migrate_timeout self._single_hypervisor = single_hypervisor self._recovery_threshold = recovery_threshold self._max_throttled_recovering_instances \ = max_throttled_recovering_instances self._host_operations = dict() self._reboot_count = dict() self._instance_recovery_list = 
list() self._instance_failed_list = list() self._instance_rebuilding_list = list() self._instance_migrating_list = list() self._instance_rebooting_list = list() self._instance_cleanup_list = list() self._next_audit_interval = recovery_audit_interval if not nfvi.nfvi_compute_plugin_disabled(): # Do not launch audit if compute plugin not enabled. self._timer_audit_instances = timers.timers_create_timer( "audit-instances", recovery_audit_cooldown, recovery_audit_interval, self.audit_instances) else: self._timer_audit_instances = None self._timer_cleanup_instances = None @staticmethod def _is_host_enabled(host_name): """ Returns true if the hypervisor is enabled """ host_table = tables.tables_get_host_table() host = host_table.get(host_name, None) if host is not None: if host.nfvi_host_is_enabled(): return True return False @staticmethod def _is_hypervisor_enabled(host_name): """ Returns true if the hypervisor is enabled """ hypervisor_table = tables.tables_get_hypervisor_table() hypervisor = hypervisor_table.get_by_host_name(host_name) if hypervisor is not None: if hypervisor.is_enabled(): return True return False @staticmethod def _hypervisors_available(min_count=1, excluded_hosts=None): """ Returns true if at least given count of hosts and hypervisors are enabled """ if excluded_hosts is None: excluded_hosts = list() available_count = 0 host_table = tables.tables_get_host_table() hypervisor_table = tables.tables_get_hypervisor_table() for host_name in host_table.keys(): if host_name in excluded_hosts: continue host = host_table.get(host_name, None) if host.nfvi_host_is_enabled(): hypervisor = hypervisor_table.get_by_host_name(host_name) if hypervisor is not None: if hypervisor.is_enabled(): available_count += 1 return available_count >= min_count @staticmethod def upgrade_inprogress(): """ Returns true if the system is going through an upgrade """ host_table = tables.tables_get_host_table() for host_name in host_table.keys(): host = host_table.get(host_name, None) 
if host is not None: if host.upgrade_inprogress and not host.recover_instances: return True return False @staticmethod def instance_action_allowed(instance, action_type): """ Returns true if instance action is allowed """ DLOG.info("Instance action allowed for %s, action_type=%s" % (instance.name, action_type)) return not InstanceDirector.upgrade_inprogress() def _instance_recovery_allowed(self, instance): """ Returns true if instance recovery is allowed """ recovery_allowed = False if instance.is_rebuilding(): if instance.elapsed_time_in_state >= self._rebuild_timeout: recovery_allowed = True # We only recover failed live migrations - not failed cold migrations # or failed resize operations (instance.is_resizing). elif instance.is_migrating(): if instance.elapsed_time_in_state >= self._migrate_timeout: recovery_allowed = True elif instance.is_rebooting(): if instance.elapsed_time_in_state >= self._reboot_timeout: recovery_allowed = True else: if instance.elapsed_time_in_state >= self._recovery_cooldown: recovery_allowed = True return recovery_allowed def _get_instance_recovery_list(self): """ Get instance recovery list after the previous list is exhausted """ next_audit_interval = self._recovery_audit_interval # Get all instances that are to be considered for recovery. instances_recover = list() instances_failed = list() instances_rebuilding = list() instances_migrating = list() instances_rebooting = list() instance_tracking_uuids = list() instance_table = tables.tables_get_instance_table() # Check for failed instances; exclude instances that are part of a # host operation or have recently failed. Also check for failed # instances stuck recovering. for instance_uuid in instance_table: instance = instance_table[instance_uuid] host_operation = self._host_operations.get(instance.host_name, None) if host_operation is not None: if host_operation.is_inprogress(): DLOG.debug("Skip recovery of instance %s, host %s operation " "inprogress." 
% (instance.name, instance.host_name)) next_audit_interval = self._recovery_audit_cooldown continue if instance.host_name is None: DLOG.info("Can't recover instance %s, host is not valid." % instance.name) continue if not (instance.is_deleting() or instance.is_deleted() or instance.is_locked()) and instance.is_failed(): next_audit_interval = self._recovery_audit_cooldown instance_tracking_uuids.append(instance.uuid) if self._instance_recovery_allowed(instance): instances_recover.append(instance) else: if instance.is_rebuilding(): instances_rebuilding.append(instance) elif instance.is_migrating(): instances_migrating.append(instance) elif instance.is_rebooting(): instances_rebooting.append(instance) else: instances_failed.append(instance) # Remove reboot counts for instances that recovered # Make a copy of the keys for this to work in python3 # since _reboot_count is changing while iterating reboot_tracking_instance_uuids = list(self._reboot_count.keys()) for instance_uuid in reboot_tracking_instance_uuids: if instance_uuid not in instance_tracking_uuids: del self._reboot_count[instance_uuid] # Initialize reboot counts for new instances for instance_uuid in instance_tracking_uuids: if instance_uuid not in self._reboot_count: self._reboot_count[instance_uuid] = 0 # Order instances based on recovery priority instances_recover.sort(key=objects.Instance.recovery_sort_key, reverse=True) return (next_audit_interval, instances_recover, instances_failed, instances_rebuilding, instances_migrating, instances_rebooting) def _host_migrate_instances(self, host, host_operation): """ Host Migrate Instances """ if host_operation.operation_type not in [OPERATION_TYPE.HOST_LOCK_FORCE, OPERATION_TYPE.HOST_LOCK]: if not dor.dor_is_complete(): DLOG.info("DOR is not complete, can't migrate instances off of " "host %s." 
% host.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) return if self.upgrade_inprogress(): DLOG.info("Upgrade inprogress, can't migrate instances off of " "host %s." % host.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) return if not self._hypervisors_available(min_count=1): DLOG.info("No hypervisors available, can't migrate instances " "off of host %s." % host.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) return if OPERATION_TYPE.HOST_LOCK_FORCE == host_operation.operation_type: DLOG.info("Force-Lock issued, can't migrate instances off of " "host %s." % host.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) return initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR if OPERATION_TYPE.HOST_LOCK_FORCE == host_operation.operation_type: reason = "host force lock command issued" elif OPERATION_TYPE.HOST_LOCK == host_operation.operation_type: reason = "host lock command issued" elif OPERATION_TYPE.HOST_DISABLE == host_operation.operation_type: reason = "host disabled" elif OPERATION_TYPE.HOST_FAILED == host_operation.operation_type: if host.is_component_failure(): reason = "host component failure" else: reason = "host failed" elif OPERATION_TYPE.MIGRATE_INSTANCES == host_operation.operation_type: reason = "migrate instances requested" else: reason = None migrates_inprogress = host_operation.total_inprogress() instance_table = tables.tables_get_instance_table() for instance in instance_table.on_host(host.name): if host_operation.instance_exists(instance.uuid): continue if instance.is_deleting() or instance.is_deleted() or \ instance.is_locked() or instance.is_failed(): continue method = objects.INSTANCE_ACTION_TYPE.COLD_MIGRATE if instance.is_enabled() or instance.is_paused(): if instance.supports_live_migration(): method = objects.INSTANCE_ACTION_TYPE.LIVE_MIGRATE if host_operation.operation_type in [OPERATION_TYPE.HOST_LOCK, OPERATION_TYPE.MIGRATE_INSTANCES]: if 
OPERATION_TYPE.HOST_LOCK == host_operation.operation_type: preamble = "Lock of host" else: preamble = "Migrate instances from host" if instance.is_paused() and \ objects.INSTANCE_ACTION_TYPE.COLD_MIGRATE == method: reason = ("%s %s failed because instance %s is paused." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_suspended(): reason = ("%s %s failed because instance %s is suspended." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_migrating() or instance.is_cold_migrating(): reason = ("%s %s failed because instance %s is migrating." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_rebuilding(): reason = ("%s %s failed because instance %s is rebuilding." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_action_running(): # Nova will not migrate an instance if an action is already # running. reason = ( "%s %s failed because instance %s action in progress." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_resized(): # Nova will not migrate an instance if is resized and # waiting for confirmation. reason = ( "%s %s failed because instance %s is resizing." 
% (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif not self._hypervisors_available(min_count=1, excluded_hosts=[host.name]): reason = ("%s %s failed because there are no other " "hypervisors available." % (preamble, host.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return else: if objects.INSTANCE_ACTION_TYPE.LIVE_MIGRATE == method: if not instance.can_live_migrate(system_initiated=True): reason = ("%s %s failed because instance %s " "can't be live-migrated by the system. " "Manually move the instance off of host %s." % (preamble, host.name, instance.name, host.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return else: if not instance.can_cold_migrate(system_initiated=True): reason = ("%s %s failed because instance %s " "can't be cold-migrated by the system. " "Manually move the instance off of host %s." % (preamble, host.name, instance.name, host.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return else: if instance.is_paused() and \ objects.INSTANCE_ACTION_TYPE.COLD_MIGRATE == method: DLOG.info("Instance %s set as failed on host %s " "because it is paused." % (instance.name, host.name)) instance.fail(reason + " and instance is paused") continue elif instance.is_suspended(): DLOG.info("Instance %s set as failed on host %s because " "it is suspended." % (instance.name, host.name)) instance.fail(reason + " and instance is suspended") continue elif instance.is_migrating() or instance.is_cold_migrating(): # Allow current migrations to continue DLOG.info("Instance %s on host %s is already migrating." 
% (instance.name, host.name)) elif instance.is_rebuilding(): DLOG.info("Instance %s set as failed on host %s because " "it is rebuilding." % (instance.name, host.name)) instance.fail(reason + " and instance is rebuilding") continue elif instance.is_action_running(): DLOG.info("Instance %s set as failed on host %s because " "an action is in progress." % (instance.name, host.name)) instance.fail(reason + " and instance has action in progress") continue elif instance.is_resized(): DLOG.info("Instance %s set as failed on host %s because " "it is resized." % (instance.name, host.name)) instance.fail(reason + " and instance is resized") continue else: if objects.INSTANCE_ACTION_TYPE.LIVE_MIGRATE == method: if not instance.can_live_migrate(system_initiated=True): DLOG.info("Instance %s set as failed on host %s " "because the system can't live-migrate " "instance." % (instance.name, host.name)) instance.fail(reason + " and the system can't " "live-migrate instance") continue else: if not instance.can_cold_migrate(system_initiated=True): DLOG.info("Instance %s set as failed on host %s " "because the system can't cold-migrate " "instance." % (instance.name, host.name)) instance.fail(reason + " and the system can't " "cold-migrate instance") continue host_operation.add_instance(instance.uuid, OPERATION_STATE.INPROGRESS) if not (instance.is_migrating() or instance.is_cold_migrating()): instance.do_action(method, initiated_by=initiated_by, reason=reason) migrates_inprogress += 1 if migrates_inprogress >= self._max_concurrent_migrates_per_host: break def _host_evacuate_instances(self, host, host_operation): """ Host Evacuate Instances """ do_evacuates = True if host_operation.operation_type not in [OPERATION_TYPE.HOST_LOCK_FORCE, OPERATION_TYPE.HOST_LOCK]: if not dor.dor_is_complete(): DLOG.info("DOR is not complete, can't evacuate instances off of " "host %s." 
% host.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) do_evacuates = False if do_evacuates and self.upgrade_inprogress(): DLOG.info("Upgrade inprogress, can't evacuate instances off of " "host %s." % host.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) do_evacuates = False if do_evacuates and not self._hypervisors_available(min_count=1): DLOG.info("No hypervisors available, can't evacuate instances " "off of host %s." % host.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) do_evacuates = False if do_evacuates and \ OPERATION_TYPE.HOST_LOCK_FORCE == host_operation.operation_type: DLOG.info("Force-Lock issued, can't evacuate instances off of " "host %s until it is rebooted." % host.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) do_evacuates = False initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR if OPERATION_TYPE.HOST_LOCK_FORCE == host_operation.operation_type: reason = "host force lock command issued" elif OPERATION_TYPE.HOST_LOCK == host_operation.operation_type: reason = "host lock command issued" elif OPERATION_TYPE.HOST_DISABLE == host_operation.operation_type: reason = "host disable action" elif OPERATION_TYPE.HOST_FAILED == host_operation.operation_type: if host.is_component_failure(): reason = "host component failure" else: reason = "host failed" else: reason = None evacuates_inprogress = host_operation.total_inprogress() instance_table = tables.tables_get_instance_table() # Sort the instances on this host based on their recovery priority evacuate_priority_list = list() for instance in instance_table.on_host(host.name): evacuate_priority_list.append(instance) evacuate_priority_list.sort( key=objects.Instance.recovery_sort_key, reverse=True) for instance in evacuate_priority_list: if host_operation.instance_exists(instance.uuid): continue if instance.is_deleting() or instance.is_deleted() or \ instance.is_locked(): continue if OPERATION_TYPE.HOST_LOCK == 
host_operation.operation_type: if instance.is_paused(): reason = ("Lock of host %s failed because instance %s " "is paused." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_suspended(): reason = ("Lock of host %s failed because instance %s " "is suspended." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_migrating() or instance.is_cold_migrating(): reason = ("Lock of host %s failed because instance %s " "is migrating." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_rebuilding(): reason = ("Lock of host %s failed because instance %s " "is rebuilding." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_action_running(): # Nova will not evacuate an instance if an action is already # running. reason = ("Lock of host %s failed because instance %s " "action in progress." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_resized(): # Nova will not migrate an instance if is resized and # waiting for confirmation. reason = ("Lock of host %s failed because instance %s " "is resizing." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif not self._hypervisors_available(min_count=1, excluded_hosts=[host.name]): reason = ("Lock of host %s failed because there are no " "other hypervisors available." 
% host.name) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif not instance.can_evacuate(system_initiated=True): reason = ("Lock of host %s failed because instance %s " "can't be evacuated by the system. Manually " "move the instance off of host %s." % (host.name, instance.name, host.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return else: if not instance.is_failed(): instance.fail(reason) if do_evacuates: if evacuates_inprogress < self._max_concurrent_evacuates_per_host: if instance.auto_recovery and instance.recoverable and \ instance.can_evacuate(system_initiated=True): host_operation.add_instance(instance.uuid, OPERATION_STATE.INPROGRESS) instance.do_action(objects.INSTANCE_ACTION_TYPE.EVACUATE, initiated_by=initiated_by, reason=reason) evacuates_inprogress += 1 @staticmethod def _host_stop_instances(host, host_operation, instance_uuids): """ Host Stop Instances """ initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR if OPERATION_TYPE.STOP_INSTANCES == host_operation.operation_type: reason = "stop instances issued" elif OPERATION_TYPE.HOST_LOCK_FORCE == host_operation.operation_type: reason = "host force lock command issued" elif OPERATION_TYPE.HOST_LOCK == host_operation.operation_type: reason = "host lock command issued" else: reason = ("Unsupported operation (%s) against host %s." 
% (host_operation.operation_type, host.name)) DLOG.info(reason) host_operation.set_failed(reason) return instance_table = tables.tables_get_instance_table() for instance in instance_table.on_host(host.name): if instance.uuid not in instance_uuids: # We were not asked to stop this instance DLOG.info("Ignoring instance %s while stopping instances on " "host %s" % (instance.name, host.name)) continue if host_operation.instance_exists(instance.uuid): continue if instance.is_deleting() or instance.is_deleted() or \ instance.is_locked(): continue if host_operation.operation_type in [OPERATION_TYPE.HOST_LOCK, OPERATION_TYPE.STOP_INSTANCES]: # Fail the operation if an instance cannot be stopped if OPERATION_TYPE.HOST_LOCK == host_operation.operation_type: preamble = "Lock of host" else: preamble = "Stop instances on host" if instance.is_paused(): reason = ("%s %s failed because instance %s is paused." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_suspended(): reason = ("%s %s failed because instance %s is suspended." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_migrating() or instance.is_cold_migrating(): reason = ("%s %s failed because instance %s is migrating." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_rebuilding(): reason = ("%s %s failed because instance %s is rebuilding." 
% (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_action_running(): # Nova will not stop an instance if an action is already # running. reason = ( "%s %s failed because instance %s action in progress." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_resized(): # Nova will not stop an instance if is resized and # waiting for confirmation. reason = ( "%s %s failed because instance %s is resizing." % (preamble, host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return else: # Force lock - fail instances that cannot be stopped. if instance.is_paused(): DLOG.info("Instance %s set as failed on host %s " "because it is paused." % (instance.name, host.name)) instance.fail(reason + " and instance is paused") continue elif instance.is_suspended(): DLOG.info("Instance %s set as failed on host %s because " "it is suspended." % (instance.name, host.name)) instance.fail(reason + " and instance is suspended") continue elif instance.is_migrating() or instance.is_cold_migrating(): DLOG.info("Instance %s set as failed on host %s because " "it is migrating." % (instance.name, host.name)) instance.fail(reason + " and instance is migrating") continue elif instance.is_rebuilding(): DLOG.info("Instance %s set as failed on host %s because " "it is rebuilding." % (instance.name, host.name)) instance.fail(reason + " and instance is rebuilding") continue elif instance.is_action_running(): DLOG.info("Instance %s set as failed on host %s because " "an action is in progress." 
% (instance.name, host.name)) instance.fail(reason + " and instance has action in progress") continue elif instance.is_resized(): DLOG.info("Instance %s set as failed on host %s because " "it is resized." % (instance.name, host.name)) instance.fail(reason + " and instance is resized") continue host_operation.add_instance(instance.uuid, OPERATION_STATE.INPROGRESS) instance.do_action(objects.INSTANCE_ACTION_TYPE.STOP, initiated_by=initiated_by, reason=reason) @staticmethod def _host_start_instances(host, host_operation, instance_uuids): """ Host Start Instances """ if host_operation.operation_type not in [ OPERATION_TYPE.START_INSTANCES, OPERATION_TYPE.START_INSTANCES_SERIAL]: reason = ("Unsupported operation (%s) against host %s." % (host_operation.operation_type, host.name)) DLOG.info(reason) host_operation.set_failed(reason) return initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR if OPERATION_TYPE.START_INSTANCES == host_operation.operation_type: reason = "start instances issued" elif OPERATION_TYPE.START_INSTANCES_SERIAL == \ host_operation.operation_type: reason = "start instances serial issued" else: reason = None starts_inprogress = 0 instance_table = tables.tables_get_instance_table() for instance in instance_table.on_host(host.name): if instance.uuid not in instance_uuids: # We were not asked to start this instance DLOG.info("Ignoring instance %s while starting instances on " "host %s" % (instance.name, host.name)) continue if host_operation.instance_exists(instance.uuid): continue if instance.is_deleting() or instance.is_deleted() or \ instance.is_failed(): continue if instance.is_paused(): reason = ("Start instances on host %s failed because instance %s " "is paused." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_suspended(): reason = ("Start instances on host %s failed because instance %s " "is suspended." 
% (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_migrating() or instance.is_cold_migrating(): reason = ("Start instances on host %s failed because instance %s " "is migrating." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_rebuilding(): reason = ("Start instances on host %s failed because instance %s " "is rebuilding." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_action_running(): # Nova will not start an instance if an action is already # running. reason = ( "Start instances on host %s failed because instance %s " "action in progress." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif instance.is_resized(): # Nova will not start an instance if is resized and # waiting for confirmation. reason = ( "Start instances on host %s failed because instance %s " "is resizing." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return elif not instance.is_locked(): reason = ("Start instances on host %s failed because instance %s " "is not locked." % (host.name, instance.name)) DLOG.info(reason) host_operation.add_instance(instance.uuid, OPERATION_STATE.FAILED) host_operation.update_failure_reason(reason) return if OPERATION_TYPE.START_INSTANCES_SERIAL == \ host_operation.operation_type and starts_inprogress >= 1: # When starting instances in serial, the first instance is # started and the rest are set to the READY state, to be # started later. 
host_operation.add_instance(instance.uuid, OPERATION_STATE.READY) else: host_operation.add_instance(instance.uuid, OPERATION_STATE.INPROGRESS) instance.do_action(objects.INSTANCE_ACTION_TYPE.START, initiated_by=initiated_by, reason=reason) starts_inprogress += 1 def instance_migrate_complete(self, instance, from_host_name, failed=False, timed_out=False, cancelled=False): """ Instance Migrate Complete """ from nfv_vim import directors host_director = directors.get_host_director() host_operation = self._host_operations.get(from_host_name, None) if host_operation is None: DLOG.verbose("No host %s operation inprogress." % from_host_name) return if host_operation.operation_type not in [OPERATION_TYPE.HOST_LOCK_FORCE, OPERATION_TYPE.HOST_LOCK, OPERATION_TYPE.HOST_DISABLE, OPERATION_TYPE.HOST_FAILED, OPERATION_TYPE.MIGRATE_INSTANCES]: DLOG.verbose("Unexpected host %s operation %s, ignoring." % (from_host_name, host_operation.operation_type)) return host_table = tables.tables_get_host_table() from_host = host_table.get(from_host_name, None) if from_host is None: DLOG.verbose("Host %s does not exist." % from_host_name) return if failed: reason = ("Migrate of instance %s from host %s failed." % (instance.name, from_host_name)) host_operation_state = OPERATION_STATE.FAILED DLOG.info(reason) elif timed_out: reason = ("Migrate of instance %s from host %s timed out." % (instance.name, from_host_name)) host_operation_state = OPERATION_STATE.TIMED_OUT DLOG.info(reason) elif cancelled: reason = ("Migrate of instance %s on host %s cancelled." % (instance.name, from_host_name)) host_operation_state = OPERATION_STATE.CANCELLED DLOG.info(reason) else: reason = ("Migrate of instance %s from host %s succeeded." 
% (instance.name, from_host_name)) host_operation_state = OPERATION_STATE.COMPLETED DLOG.info(reason) host_operation.update_instance(instance.uuid, host_operation_state) if host_operation.operation_type in [OPERATION_TYPE.HOST_LOCK, OPERATION_TYPE.MIGRATE_INSTANCES]: if OPERATION_STATE.COMPLETED != host_operation_state: host_operation.update_failure_reason(reason) host_director.host_instances_moved(from_host, host_operation) if OPERATION_TYPE.MIGRATE_INSTANCES == \ host_operation.operation_type: sw_mgmt_director = directors.get_sw_mgmt_director() sw_mgmt_director.migrate_instances_failed(reason) host_operation = self._host_operations.get(from_host.name, None) if host_operation is not None: del self._host_operations[from_host.name] return else: if OPERATION_STATE.COMPLETED != host_operation_state: if not instance.is_failed(): instance.fail() # Continue with the next batch of instances if self._is_hypervisor_enabled(from_host_name): self._host_migrate_instances(from_host, host_operation) else: self._host_evacuate_instances(from_host, host_operation) # Check if host operation is complete if host_operation.is_inprogress(): from_host.notify_instance_moved() else: host_director.host_instances_moved(from_host, host_operation) host_operation = self._host_operations.get(from_host.name, None) if host_operation is not None: del self._host_operations[from_host.name] def instance_evacuate_complete(self, instance, from_host_name, failed=False, timed_out=False, cancelled=False): """ Instance Evacuate Complete """ from nfv_vim import directors host_director = directors.get_host_director() host_operation = self._host_operations.get(from_host_name, None) if host_operation is None: DLOG.verbose("No host %s operation inprogress." % from_host_name) return if host_operation.operation_type not in [OPERATION_TYPE.HOST_LOCK_FORCE, OPERATION_TYPE.HOST_LOCK, OPERATION_TYPE.HOST_DISABLE, OPERATION_TYPE.HOST_FAILED]: DLOG.verbose("Unexpected host %s operation %s, ignoring." 
def instance_stop_complete(self, instance, on_host_name, failed=False,
                           timed_out=False, cancelled=False):
    """
    Instance Stop Complete

    Records the result of stopping one instance against the operation
    tracked for its host, then either ends that operation or notifies the
    host that an instance has stopped.

    The result flags are checked in the order failed, timed_out,
    cancelled; if none is set the stop is recorded as completed.  Nothing
    is done when no operation is tracked for the host, when the tracked
    operation is not one of the types handled here, or when the host is
    missing from the host table.
    """
    from nfv_vim import directors

    host_director = directors.get_host_director()

    tracked = self._host_operations.get(on_host_name, None)
    if tracked is None:
        DLOG.verbose("No host %s operation inprogress." % on_host_name)
        return

    handled_types = [OPERATION_TYPE.STOP_INSTANCES,
                     OPERATION_TYPE.HOST_LOCK_FORCE,
                     OPERATION_TYPE.HOST_LOCK]
    if tracked.operation_type not in handled_types:
        DLOG.verbose("Unexpected host %s operation %s, ignoring."
                     % (on_host_name, tracked.operation_type))
        return

    host = tables.tables_get_host_table().get(on_host_name, None)
    if host is None:
        DLOG.verbose("Host %s does not exist." % on_host_name)
        return

    # Pick the message and the state to record for this instance.
    if failed:
        template = "Stop of instance %s on host %s failed."
        outcome = OPERATION_STATE.FAILED
    elif timed_out:
        template = "Stop of instance %s on host %s timed out."
        outcome = OPERATION_STATE.TIMED_OUT
    elif cancelled:
        template = "Stop of instance %s on host %s cancelled."
        outcome = OPERATION_STATE.CANCELLED
    else:
        template = "Stop of instance %s on host %s succeeded."
        outcome = OPERATION_STATE.COMPLETED

    reason = template % (instance.name, on_host_name)
    DLOG.info(reason)

    tracked.update_instance(instance.uuid, outcome)
    unsuccessful = OPERATION_STATE.COMPLETED != outcome

    if tracked.operation_type in [OPERATION_TYPE.STOP_INSTANCES,
                                  OPERATION_TYPE.HOST_LOCK]:
        if unsuccessful:
            # For these operation types one unsuccessful stop ends the
            # whole host operation: report it and stop tracking it.
            tracked.update_failure_reason(reason)
            host_director.host_instances_stopped(host, tracked)
            if self._host_operations.get(host.name, None) is not None:
                del self._host_operations[host.name]
            return
    elif unsuccessful:
        # Fail the instance because we are going to proceed with the
        # operation regardless.  Don't fail the instance if the stop
        # operation was cancelled - this is a force lock and we
        # fail instances that have operations in progress.  Doing
        # another fail here will cause a loop because the
        # instance.is_failed() will not be true until nova has
        # reported the updated state to us.
        if not instance.is_failed() and not cancelled:
            instance.fail()

    # Check if host operation is complete
    if tracked.is_inprogress():
        host.notify_instance_stopped()
        return

    host_director.host_instances_stopped(host, tracked)
    if self._host_operations.get(host.name, None) is not None:
        del self._host_operations[host.name]
def _host_disabling_okay(self, host, host_operation):
    """
    Host Disabling Semantic checks

    Only a HOST_LOCK operation is checked; every other operation type is
    accepted immediately.  For a lock, each instance on the host that is
    not deleting, deleted, locked or failed is examined, and the first one
    that blocks the lock causes a rejection: the reason is logged, the
    instance is added to host_operation in the FAILED state, the reason is
    stored as the operation's failure reason and False is returned.
    Returns True when no instance blocks the lock.
    """
    if OPERATION_TYPE.HOST_LOCK != host_operation.operation_type:
        return True

    instance_table = tables.tables_get_instance_table()
    for instance in instance_table.on_host(host.name):
        if instance.is_deleting() or instance.is_deleted() or \
                instance.is_locked() or instance.is_failed():
            continue

        reason = self._host_disabling_rejection(host, instance)
        if reason is not None:
            # Single rejection site shared by every check in the helper.
            DLOG.info(reason)
            host_operation.add_instance(instance.uuid,
                                        OPERATION_STATE.FAILED)
            host_operation.update_failure_reason(reason)
            return False

    return True


def _host_disabling_rejection(self, host, instance):
    """
    Returns the reason a lock of the host has to be rejected because of
    the given instance, or None when the instance does not block the lock.
    The checks run in a fixed order and the first match wins.
    """
    action_type = objects.INSTANCE_ACTION_TYPE

    # Work out how the instance would be taken off the host.
    # NOTE(review): the source was whitespace-flattened; the live-migrate
    # override is taken to apply only to the multi-hypervisor branch -
    # confirm against the original indentation.
    if self._is_hypervisor_enabled(host.name):
        if self._single_hypervisor:
            # Only one hypervisor so instances will be stopped
            operation = action_type.STOP
        else:
            # Default behaviour is to cold migrate instance
            operation = action_type.COLD_MIGRATE

            # Unless live migration is supported...
            if instance.is_enabled() or instance.is_paused():
                if instance.supports_live_migration():
                    operation = action_type.LIVE_MIGRATE
    else:
        operation = action_type.EVACUATE

    if instance.is_paused() and action_type.LIVE_MIGRATE != operation:
        return ("Lock of host %s rejected because instance %s "
                "is paused." % (host.name, instance.name))

    if instance.is_suspended():
        return ("Lock of host %s rejected because instance %s "
                "is suspended." % (host.name, instance.name))

    if instance.is_migrating() or instance.is_cold_migrating():
        return ("Lock of host %s rejected because instance %s "
                "is migrating." % (host.name, instance.name))

    if instance.is_rebuilding():
        return ("Lock of host %s rejected because instance %s "
                "is rebuilding." % (host.name, instance.name))

    if instance.is_action_running():
        # Nova will not migrate or evacuate an instance if an action
        # is already running.
        return ("Lock of host %s rejected because instance %s "
                "action in progress." % (host.name, instance.name))

    if instance.is_resized():
        # Nova will not migrate or evacuate an instance if is resized
        # and waiting for confirmation.
        return ("Lock of host %s rejected because instance %s "
                "is resizing." % (host.name, instance.name))

    if not self._single_hypervisor and \
            not self._hypervisors_available(min_count=1,
                                            excluded_hosts=[host.name]):
        return ("Lock of host %s rejected because there are no "
                "other hypervisors available." % host.name)

    # Finally make sure the system may perform the chosen operation.
    if action_type.LIVE_MIGRATE == operation:
        if not instance.can_live_migrate(system_initiated=True):
            return ("Lock of host %s rejected because instance %s "
                    "can't be live-migrated by the system. "
                    "Manually move the instance off of host %s."
                    % (host.name, instance.name, host.name))

    elif action_type.COLD_MIGRATE == operation:
        if not instance.can_cold_migrate(system_initiated=True):
            return ("Lock of host %s rejected because instance %s "
                    "can't be cold-migrated by the system. "
                    "Manually move the instance off of host %s."
                    % (host.name, instance.name, host.name))

    elif action_type.EVACUATE == operation:
        if not instance.can_evacuate(system_initiated=True):
            return ("Lock of host %s rejected because instance %s "
                    "can't be evacuated by the system. Manually "
                    "move the instance off of host %s."
                    % (host.name, instance.name, host.name))

    return None
def host_services_disabled(self, host):
    """
    Host Services Disabled

    Replaces whatever operation is tracked for the host with a fresh one
    whose type is derived from the host state (force lock, locking,
    failed, otherwise disable), starts evacuating the host's instances
    unless this is a single hypervisor configuration, and returns the new
    operation.
    """
    DLOG.info("Host %s services disabled." % host.name)

    if host.is_force_lock():
        new_type = OPERATION_TYPE.HOST_LOCK_FORCE
    elif host.is_locking():
        new_type = OPERATION_TYPE.HOST_LOCK
    elif host.is_failed():
        new_type = OPERATION_TYPE.HOST_FAILED
    else:
        new_type = OPERATION_TYPE.HOST_DISABLE

    superseded = self._host_operations.get(host.name, None)
    if superseded is not None:
        DLOG.debug("Canceling previous host operation %s, before "
                   "continuing with host operation %s for %s."
                   % (superseded.operation_type, new_type, host.name))
        del self._host_operations[host.name]

    replacement = Operation(new_type)

    DLOG.verbose("Host %s operation %s inprogress."
                 % (host.name, new_type))
    self._host_operations[host.name] = replacement

    # Do not evacuate instances from this host if we are in a single
    # hypervisor configuration.
    if not self._single_hypervisor:
        self._host_evacuate_instances(host, replacement)

    return replacement
@staticmethod
def host_has_instances(host, skip_stopped=False):
    """
    Returns true if the given host has instances

    Instances that are deleting or deleted are never counted.  When
    skip_stopped is set, locked instances are left out of the count as
    well.  The resulting count is logged before the answer is returned.
    """
    def _is_counted(candidate):
        # Same short-circuit order as a chain of guard checks.
        if candidate.is_deleting() or candidate.is_deleted():
            return False
        return not (skip_stopped and candidate.is_locked())

    instance_table = tables.tables_get_instance_table()
    total = sum(1 for candidate in instance_table.on_host(host.name)
                if _is_counted(candidate))

    DLOG.info("Host %s has %s instances." % (host.name, total))
    return total > 0
@coroutine
def _instance_type_create_callback(self, instance_name, instance_type_uuid,
                                   image_uuid, block_devices, networks,
                                   callback):
    """
    Instance-Type Create Callback

    Receives the response to an instance-type create request.  When the
    request did not complete, the caller's callback is invoked with no
    instance uuid.  Otherwise the returned instance type is stored in the
    instance type table and creation of the instance itself is requested;
    the caller's callback is then left to the instance create callback.
    """
    response = (yield)
    DLOG.verbose("Instance-Type-Create callback response=%s." % response)

    if not response['completed']:
        callback(response['completed'], instance_name, None)
        return

    created_type = response['result-data']
    type_table = tables.tables_get_instance_type_table()
    type_table[created_type.uuid] = created_type

    on_instance_created = self._instance_create_callback(instance_name,
                                                         callback)
    nfvi.nfvi_create_instance(instance_name, instance_type_uuid,
                              image_uuid, block_devices, networks,
                              on_instance_created)
""" Delete an instance """ DLOG.info("Instance %s delete requested." % instance.uuid) instance.do_action(objects.INSTANCE_ACTION_TYPE.DELETE) @staticmethod def instance_audit(instance): """ Notifies the instance director that an instance audit is inprogress """ from nfv_vim import directors DLOG.verbose("Notify other directors that an instance %s audit is " "inprogress." % instance.name) sw_mgmt_director = directors.get_sw_mgmt_director() sw_mgmt_director.host_audit(instance) @staticmethod def instance_state_change_notify(instance): """ Notifies the instance director that a instance has changed state """ from nfv_vim import directors DLOG.info("Instance %s state change notification." % instance.name) sw_mgmt_director = directors.get_sw_mgmt_director() sw_mgmt_director.instance_state_change(instance) def instance_recovered(self, instance): """ Instance has signalled that it has recovered """ DLOG.info("Instance %s has recovered on host %s." % (instance.name, instance.host_name)) self._reboot_count[instance.uuid] = 0 def recover_instance(self, instance, recovery_method=None, force_fail=False, fail_reason=None): """ Recover an instance """ if not dor.system_is_stabilized(): DLOG.info("DOR system stabilization is not complete, can't " "recover instance %s." % instance.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) return False if self.upgrade_inprogress(): DLOG.info("Upgrade inprogress, can't recover instance %s." 
% instance.name) self.reschedule_audit_instances(self._recovery_audit_cooldown) return False if instance.uuid not in self._reboot_count: self._reboot_count[instance.uuid] = 0 if not dor.dor_is_complete(): self._reboot_count[instance.uuid] = 0 method = recovery_method if method is None: if self._is_host_enabled(instance.host_name) and \ self._is_hypervisor_enabled(instance.host_name): # Evacuates are indicated by the instance is rebuilding state if instance.is_rebuilding(): force_fail = True instance.cancel_action(objects.INSTANCE_ACTION_TYPE.REBUILD) if instance.image_uuid is not None: method = objects.INSTANCE_ACTION_TYPE.REBUILD else: method = objects.INSTANCE_ACTION_TYPE.REBOOT elif instance.is_migrating(): force_fail = True instance.cancel_action( objects.INSTANCE_ACTION_TYPE.LIVE_MIGRATE) if instance.image_uuid is not None: method = objects.INSTANCE_ACTION_TYPE.REBUILD else: method = objects.INSTANCE_ACTION_TYPE.REBOOT else: if instance.is_rebooting(): force_fail = True instance.cancel_action(objects.INSTANCE_ACTION_TYPE.REBOOT) # Escalate to rebuild if last reboot didn't work if self._reboot_count[instance.uuid] > 0: if instance.image_uuid is not None: method = objects.INSTANCE_ACTION_TYPE.REBUILD else: method = objects.INSTANCE_ACTION_TYPE.REBOOT else: method = objects.INSTANCE_ACTION_TYPE.REBOOT elif instance.can_evacuate(system_initiated=True): if dor.dor_is_complete(): method = objects.INSTANCE_ACTION_TYPE.EVACUATE if instance.is_rebooting(): force_fail = True instance.cancel_action(objects.INSTANCE_ACTION_TYPE.REBOOT) else: host_table = tables.tables_get_host_table() host = host_table.get(instance.host_name, None) if host is None: method = objects.INSTANCE_ACTION_TYPE.EVACUATE if instance.is_rebooting(): force_fail = True instance.cancel_action( objects.INSTANCE_ACTION_TYPE.REBOOT) else: DLOG.info("Instance %s can't be evacuated by the system." 
def reschedule_audit_instances(self, interval=None):
    """
    Reschedule audit instances

    Reschedules the audit timer to the given interval, falling back to
    the stored next audit interval when none is supplied.  Does nothing
    when no audit timer exists.
    """
    delay = self._next_audit_interval if interval is None else interval

    audit_timer = self._timer_audit_instances
    if audit_timer is None:
        return

    timers.timers_reschedule_timer(audit_timer, delay)
    DLOG.verbose("Recovery audit is rescheduled to %s second "
                 "intervals." % delay)
% len(self._instance_recovery_list)) # Attempt to recover instances, whether resources are available or not instance_table = tables.tables_get_instance_table() count = 0 # Use a lower cutoff when there are a large number of instances to recover cutoff = self._max_concurrent_recovering_instances if len(self._instance_recovery_list) > self._recovery_threshold: cutoff = self._max_throttled_recovering_instances for instance_recover in list(self._instance_recovery_list): instance = instance_table.get(instance_recover.uuid, None) if instance is not None: host_operation_inprogress = False host_operation = self._host_operations.get(instance.host_name, None) if host_operation is not None: if host_operation.is_inprogress(): DLOG.debug("Skip recovery of instance %s, host %s " "operation inprogress." % (instance.name, instance.host_name)) host_operation_inprogress = True if not host_operation_inprogress: if not (instance.is_deleting() or instance.is_deleted() or instance.is_locked()) and instance.is_failed(): if self._instance_recovery_allowed(instance): if self.recover_instance(instance): count += 1 self._instance_recovery_list.remove(instance_recover) if count >= cutoff: break if 0 == len(self._instance_recovery_list): self.unlock_instances() self.reschedule_audit_instances() DLOG.info("Completed recovery audit cycle.") else: self.reschedule_audit_instances(self._recovery_audit_batch_interval) DLOG.info("Completed recovery audit batch.") def unlock_instances(self): """ Unlock (start) instances that were locked (stopped) when a single hypervisor configuration had its hypervisor disabled. This should only be done after all unlocked instances have been recovered. 
""" if not os.path.exists(NFV_VIM_UNLOCK_COMPLETE_FILE): if self._single_hypervisor: DLOG.info("Unlocking instances after hypervisor enabled") instance_table = tables.tables_get_instance_table() instance_uuids = list() for instance in instance_table.values(): if instance.unlock_to_recover and instance.is_locked(): instance_uuids.append(instance.uuid) instance.unlock_to_recover = False if instance_uuids: self.start_instances(instance_uuids, serial=True) # Do not attempt to do the unlock again. open(NFV_VIM_UNLOCK_COMPLETE_FILE, 'w').close() @coroutine def audit_instances(self): """ Audit Instances """ while True: (yield) self.recover_instances(audit=True) def cleanup_instance(self, instance_uuid): """ Cleanup an instance """ if instance_uuid not in self._instance_cleanup_list: self._instance_cleanup_list.append(instance_uuid) if self._timer_cleanup_instances is None: self._timer_cleanup_instances = timers.timers_create_timer( "cleanup-instances", 1, 1, self.cleanup_instances) @coroutine def cleanup_instances(self): """ Cleanup Instances """ (yield) trigger_recovery = False instance_table = tables.tables_get_instance_table() for instance_uuid in self._instance_cleanup_list: instance = instance_table.get(instance_uuid, None) if instance is not None: if instance.is_deleted(): DLOG.info("Cleaned up instance %s" % instance.name) del instance_table[instance_uuid] trigger_recovery = True self._instance_cleanup_list[:] = list() self._timer_cleanup_instances = None if trigger_recovery: DLOG.info("Recover-Instances-Audit triggered by instance deletion.") self.recover_instances() def migrate_instances(self, instance_uuids): """ Migrate Instances """ DLOG.info("Migrate instances uuids=%s." 
def stop_instances(self, instance_uuids):
    """
    Stop Instances

    Builds one STOP_INSTANCES operation per host that holds any of the
    given instances, registers it as that host's operation, asks for the
    instances to be stopped and folds each host's state into the overall
    operation that is returned.

    The overall operation is marked failed and returned straight away when
    an instance or its host is unknown, or when a host already has an
    operation in progress.  Hosts stop being processed after the first one
    whose operation is failed or timed out.
    """
    DLOG.info("Stop instances uuids=%s." % instance_uuids)

    host_table = tables.tables_get_host_table()
    instance_table = tables.tables_get_instance_table()

    overall_operation = Operation(OPERATION_TYPE.STOP_INSTANCES)
    per_host = dict()

    for uuid in instance_uuids:
        instance = instance_table.get(uuid, None)
        if instance is None:
            reason = "Instance %s does not exist." % uuid
            DLOG.info(reason)
            overall_operation.set_failed(reason)
            return overall_operation

        if host_table.get(instance.host_name, None) is None:
            reason = "Host %s does not exist." % instance.host_name
            DLOG.info(reason)
            overall_operation.set_failed(reason)
            return overall_operation

        existing = self._host_operations.get(instance.host_name, None)
        if existing is not None:
            if existing.is_inprogress():
                reason = ("Another host operation %s is already inprogress "
                          "for host %s." % (existing.operation_type,
                                            instance.host_name))
                DLOG.info(reason)
                overall_operation.set_failed(reason)
                return overall_operation

            # A finished operation is dropped to make room for this one.
            del self._host_operations[instance.host_name]

        if per_host.get(instance.host_name, None) is None:
            per_host[instance.host_name] = \
                Operation(OPERATION_TYPE.STOP_INSTANCES)

    for host_name, operation in per_host.items():
        self._host_operations[host_name] = operation
        # The full uuid list is handed over for every host.
        self._host_stop_instances(host_table[host_name], operation,
                                  instance_uuids)

        if operation.is_inprogress():
            overall_operation.add_host(host_name,
                                       OPERATION_STATE.INPROGRESS)
            continue

        if operation.is_failed():
            ended_state = OPERATION_STATE.FAILED
        elif operation.is_timed_out():
            ended_state = OPERATION_STATE.TIMED_OUT
        else:
            overall_operation.add_host(host_name,
                                       OPERATION_STATE.COMPLETED)
            continue

        overall_operation.add_host(host_name, ended_state)
        overall_operation.update_failure_reason(operation.reason)
        break

    return overall_operation
% (host_operation.operation_type, instance.host_name)) DLOG.info(reason) overall_operation.set_failed(reason) return overall_operation else: del self._host_operations[instance.host_name] host_operation = host_operations.get(instance.host_name, None) if host_operation is None: host_operation = Operation(operation_type) host_operations[instance.host_name] = host_operation for host_name, host_operation in host_operations.items(): self._host_operations[host_name] = host_operation self._host_start_instances(host_table[host_name], host_operation, instance_uuids) if host_operation.is_inprogress(): overall_operation.add_host(host_name, OPERATION_STATE.INPROGRESS) elif host_operation.is_failed(): overall_operation.add_host(host_name, OPERATION_STATE.FAILED) overall_operation.update_failure_reason(host_operation.reason) break elif host_operation.is_timed_out(): overall_operation.add_host(host_name, OPERATION_STATE.TIMED_OUT) overall_operation.update_failure_reason(host_operation.reason) break else: overall_operation.add_host(host_name, OPERATION_STATE.COMPLETED) return overall_operation def get_instance_director(): """ Returns the Instance Director """ return _instance_director def instance_director_initialize(): """ Initialize Instance Director """ global _instance_director if config.section_exists('instance-configuration'): section = config.CONF['instance-configuration'] max_concurrent_recovering_instances \ = int(section.get('max_concurrent_recovering_instances', 4)) max_concurrent_migrates_per_host \ = int(section.get('max_concurrent_migrates_per_host', 1)) max_concurrent_evacuates_per_host \ = int(section.get('max_concurrent_evacuates_per_host', 1)) recovery_audit_interval \ = int(section.get('recovery_audit_interval', 330)) recovery_audit_cooldown \ = int(section.get('recovery_audit_cooldown', 30)) recovery_audit_batch_interval \ = int(section.get('recovery_audit_batch_interval', 2)) recovery_cooldown \ = int(section.get('recovery_cooldown', 30)) rebuild_timeout \ = 
int(section.get('rebuild_timeout', 900)) reboot_timeout \ = int(section.get('reboot_timeout', 300)) migrate_timeout \ = int(section.get('migrate_timeout', 960)) single_hypervisor \ = (section.get('single_hypervisor', 'false').lower() == 'true') recovery_threshold = int(section.get('recovery_threshold', 250)) max_throttled_recovering_instances \ = int(section.get('max_throttled_recovering_instances', 2)) else: max_concurrent_recovering_instances = 4 max_concurrent_migrates_per_host = 1 max_concurrent_evacuates_per_host = 1 recovery_audit_interval = 330 recovery_audit_cooldown = 30 recovery_audit_batch_interval = 2 recovery_cooldown = 30 rebuild_timeout = 900 reboot_timeout = 300 migrate_timeout = 960 single_hypervisor = False recovery_threshold = 250 max_throttled_recovering_instances = 2 _instance_director = InstanceDirector( max_concurrent_recovering_instances, max_concurrent_migrates_per_host, max_concurrent_evacuates_per_host, recovery_audit_interval, recovery_audit_cooldown, recovery_audit_batch_interval, recovery_cooldown, rebuild_timeout, reboot_timeout, migrate_timeout, single_hypervisor, recovery_threshold, max_throttled_recovering_instances) def instance_director_finalize(): """ Finalize Instance Director """ pass