nfv/nfv/nfv-vim/nfv_vim/directors/_instance_director.py

2163 lines
97 KiB
Python
Executable File

#
# Copyright (c) 2015-2016 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import os
import six
from nfv_common import config
from nfv_common import debug
from nfv_common import timers
from nfv_common.helpers import coroutine
from nfv_common.helpers import Singleton
from nfv_vim import dor
from nfv_vim import nfvi
from nfv_vim import objects
from nfv_vim import tables
from nfv_vim.directors._directors_defs import Operation
from nfv_vim.directors._directors_defs import OPERATION_STATE
from nfv_vim.directors._directors_defs import OPERATION_TYPE
# Debug logger shared by all of the code in this module.
DLOG = debug.debug_get_logger('nfv_vim.instance_director')
# Module-level handle for the instance director; unset at import time.
# NOTE(review): presumably assigned by an initialize routine further down in
# this file -- confirm against the rest of the module.
_instance_director = None
# Path of a marker file related to unlock completion.
# NOTE(review): the code that creates or checks this file is not visible in
# this part of the module.
NFV_VIM_UNLOCK_COMPLETE_FILE = '/var/run/.nfv-vim.unlock_complete'
@six.add_metaclass(Singleton)
class InstanceDirector(object):
"""
Instance Director
"""
def __init__(self, max_concurrent_recovering_instances,
             max_concurrent_migrates_per_host,
             max_concurrent_evacuates_per_host, recovery_audit_interval,
             recovery_audit_cooldown, recovery_audit_batch_interval,
             recovery_cooldown, rebuild_timeout, reboot_timeout,
             migrate_timeout, single_hypervisor,
             recovery_threshold, max_throttled_recovering_instances):
    """
    Record the director's tuning parameters, create its empty
    bookkeeping containers and set up the instance audit timer.

    Every constructor argument is stored unchanged on a private
    attribute of the same name (prefixed with an underscore).
    """
    # Concurrency limits and recovery throttling.
    self._max_concurrent_recovering_instances = (
        max_concurrent_recovering_instances)
    self._max_concurrent_migrates_per_host = (
        max_concurrent_migrates_per_host)
    self._max_concurrent_evacuates_per_host = (
        max_concurrent_evacuates_per_host)
    self._max_throttled_recovering_instances = (
        max_throttled_recovering_instances)
    self._recovery_threshold = recovery_threshold
    self._single_hypervisor = single_hypervisor

    # Audit pacing.
    self._recovery_audit_interval = recovery_audit_interval
    self._recovery_audit_cooldown = recovery_audit_cooldown
    self._recovery_audit_batch_interval = recovery_audit_batch_interval
    self._next_audit_interval = recovery_audit_interval

    # How long an instance has to sit in a state before recovery of it
    # is allowed (see _instance_recovery_allowed).
    self._recovery_cooldown = recovery_cooldown
    self._rebuild_timeout = rebuild_timeout
    self._reboot_timeout = reboot_timeout
    self._migrate_timeout = migrate_timeout

    # Bookkeeping, all empty to begin with.
    self._host_operations = {}
    self._reboot_count = {}
    self._instance_recovery_list = []
    self._instance_failed_list = []
    self._instance_rebuilding_list = []
    self._instance_migrating_list = []
    self._instance_rebooting_list = []
    self._instance_cleanup_list = []

    if nfvi.nfvi_compute_plugin_disabled():
        # Do not launch audit if compute plugin not enabled.
        self._timer_audit_instances = None
    else:
        self._timer_audit_instances = timers.timers_create_timer(
            "audit-instances", recovery_audit_cooldown,
            recovery_audit_interval, self.audit_instances)

    self._timer_cleanup_instances = None
@staticmethod
def _is_host_enabled(host_name):
    """
    Returns true if the named host is present in the host table and
    reports itself as enabled
    """
    host = tables.tables_get_host_table().get(host_name, None)
    if host is None:
        # Unknown hosts are never considered enabled.
        return False
    return bool(host.nfvi_host_is_enabled())
@staticmethod
def _is_hypervisor_enabled(host_name):
    """
    Returns true if a hypervisor is known for the named host and that
    hypervisor is enabled
    """
    hypervisor = tables.tables_get_hypervisor_table().get_by_host_name(
        host_name)
    if hypervisor is None:
        # No hypervisor record for this host.
        return False
    return bool(hypervisor.is_enabled())
@staticmethod
def _hypervisors_available(min_count=1, excluded_hosts=None):
    """
    Returns true if at least given count of hosts and hypervisors are enabled

    A host is counted when it is not listed in excluded_hosts, the host
    itself is enabled and a hypervisor exists for it that is enabled too.

    :param min_count: number of usable hosts required
    :param excluded_hosts: optional collection of host names to ignore
    """
    if excluded_hosts is None:
        excluded_hosts = list()

    host_table = tables.tables_get_host_table()
    hypervisor_table = tables.tables_get_hypervisor_table()

    available_count = 0
    for host_name in host_table.keys():
        if host_name in excluded_hosts:
            continue

        host = host_table.get(host_name, None)
        # Guard against a missing host entry, the same way the other
        # host table lookups in this class do, instead of failing with
        # an AttributeError on None.
        if host is None or not host.nfvi_host_is_enabled():
            continue

        hypervisor = hypervisor_table.get_by_host_name(host_name)
        if hypervisor is not None and hypervisor.is_enabled():
            available_count += 1

    return available_count >= min_count
@staticmethod
def upgrade_inprogress():
    """
    Returns true if the system is going through an upgrade

    That is the case as soon as one host in the host table has an
    upgrade in progress and is not flagged to recover instances.
    """
    host_table = tables.tables_get_host_table()
    hosts = (host_table.get(host_name, None)
             for host_name in host_table.keys())
    return any(host is not None and
               host.upgrade_inprogress and not host.recover_instances
               for host in hosts)
@staticmethod
def instance_action_allowed(instance, action_type):
    """
    Returns true if instance action is allowed

    The request is logged first, unconditionally; the action is then
    allowed exactly when no upgrade is in progress.
    """
    message = ("Instance action allowed for %s, action_type=%s"
               % (instance.name, action_type))
    DLOG.info(message)
    upgrading = InstanceDirector.upgrade_inprogress()
    return not upgrading
def _instance_recovery_allowed(self, instance):
"""
Returns true if instance recovery is allowed
"""
recovery_allowed = False
if instance.is_rebuilding():
if instance.elapsed_time_in_state >= self._rebuild_timeout:
recovery_allowed = True
# We only recover failed live migrations - not failed cold migrations
# or failed resize operations (instance.is_resizing).
elif instance.is_migrating():
if instance.elapsed_time_in_state >= self._migrate_timeout:
recovery_allowed = True
elif instance.is_rebooting():
if instance.elapsed_time_in_state >= self._reboot_timeout:
recovery_allowed = True
else:
if instance.elapsed_time_in_state >= self._recovery_cooldown:
recovery_allowed = True
return recovery_allowed
def _get_instance_recovery_list(self):
    """
    Get instance recovery list after the previous list is exhausted

    Walks the instance table and sorts every failed instance (that is
    not deleting, deleted or locked) into one of five lists: ready to be
    recovered, or still waiting while rebuilding, migrating, rebooting
    or simply failed.  Instances on a host with an operation in progress
    and instances without a host are left out.  The reboot counters are
    brought in line with the set of failed instances found.

    :returns: tuple of (next_audit_interval, instances_recover,
              instances_failed, instances_rebuilding,
              instances_migrating, instances_rebooting); instances_recover
              is ordered by recovery priority
    """
    next_audit_interval = self._recovery_audit_interval

    # Get all instances that are to be considered for recovery.
    instances_recover = list()
    instances_failed = list()
    instances_rebuilding = list()
    instances_migrating = list()
    instances_rebooting = list()
    instance_tracking_uuids = list()

    instance_table = tables.tables_get_instance_table()

    # Check for failed instances; exclude instances that are part of a
    # host operation or have recently failed. Also check for failed
    # instances stuck recovering.
    for instance_uuid in instance_table:
        instance = instance_table[instance_uuid]

        host_operation = self._host_operations.get(instance.host_name, None)
        if host_operation is not None and host_operation.is_inprogress():
            DLOG.debug("Skip recovery of instance %s, host %s operation "
                       "inprogress." % (instance.name, instance.host_name))
            next_audit_interval = self._recovery_audit_cooldown
            continue

        if instance.host_name is None:
            DLOG.info("Can't recover instance %s, host is not valid."
                      % instance.name)
            continue

        if instance.is_deleting() or instance.is_deleted() or \
                instance.is_locked():
            continue

        if not instance.is_failed():
            continue

        # A failed instance was found, audit again after the cooldown.
        next_audit_interval = self._recovery_audit_cooldown
        instance_tracking_uuids.append(instance.uuid)

        if self._instance_recovery_allowed(instance):
            instances_recover.append(instance)
        elif instance.is_rebuilding():
            instances_rebuilding.append(instance)
        elif instance.is_migrating():
            instances_migrating.append(instance)
        elif instance.is_rebooting():
            instances_rebooting.append(instance)
        else:
            instances_failed.append(instance)

    # Membership is tested once per reboot counter below; a set keeps
    # that linear instead of scanning the list for every counter.
    tracked_uuids = set(instance_tracking_uuids)

    # Remove reboot counts for instances that recovered
    # Make a copy of the keys for this to work in python3
    # since _reboot_count is changing while iterating
    for instance_uuid in list(self._reboot_count.keys()):
        if instance_uuid not in tracked_uuids:
            del self._reboot_count[instance_uuid]

    # Initialize reboot counts for new instances (walk the list, not the
    # set, so the counters are created in instance table order)
    for instance_uuid in instance_tracking_uuids:
        self._reboot_count.setdefault(instance_uuid, 0)

    # Order instances based on recovery priority
    instances_recover.sort(key=objects.Instance.recovery_sort_key,
                           reverse=True)

    return (next_audit_interval, instances_recover, instances_failed,
            instances_rebuilding, instances_migrating, instances_rebooting)
def _host_migrate_instances(self, host, host_operation):
"""
Host Migrate Instances

Start migrating instances off of the given host on behalf of the given
host operation.

Before any instance is looked at a set of pre-conditions is checked
(DOR complete, no upgrade in progress, a hypervisor available, not a
force-lock). When one of them is not met the reason is logged, the
instance audit is rescheduled with the recovery audit cooldown and
nothing is migrated.

Instances that are already part of the host operation, or that are
deleting, deleted, locked or failed, are skipped. Enabled or paused
instances that support live migration are live-migrated, every other
instance is cold-migrated.

For HOST_LOCK and MIGRATE_INSTANCES operations the first instance that
can't be migrated is added to the host operation as FAILED together with
a failure reason, and processing stops. For the other operation types
such an instance is marked failed and skipped, except for an instance
that is already migrating, which is added to the host operation without
a new action being issued.

The loop ends once the number of in-progress migrations reaches the
per-host limit.

NOTE(review): block indentation is not visible in this copy of the file,
so which of the pre-condition checks are nested under the operation type
test can't be confirmed from here.
"""
# Pre-condition checks; each one logs, reschedules the audit and returns.
if host_operation.operation_type not in [OPERATION_TYPE.HOST_LOCK_FORCE,
OPERATION_TYPE.HOST_LOCK]:
if not dor.dor_is_complete():
DLOG.info("DOR is not complete, can't migrate instances off of "
"host %s." % host.name)
self.reschedule_audit_instances(self._recovery_audit_cooldown)
return
if self.upgrade_inprogress():
DLOG.info("Upgrade inprogress, can't migrate instances off of "
"host %s." % host.name)
self.reschedule_audit_instances(self._recovery_audit_cooldown)
return
if not self._hypervisors_available(min_count=1):
DLOG.info("No hypervisors available, can't migrate instances "
"off of host %s." % host.name)
self.reschedule_audit_instances(self._recovery_audit_cooldown)
return
if OPERATION_TYPE.HOST_LOCK_FORCE == host_operation.operation_type:
DLOG.info("Force-Lock issued, can't migrate instances off of "
"host %s." % host.name)
self.reschedule_audit_instances(self._recovery_audit_cooldown)
return
# Every action issued below is attributed to the director; the reason
# text depends on the kind of host operation being carried out.
initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR
if OPERATION_TYPE.HOST_LOCK_FORCE == host_operation.operation_type:
reason = "host force lock command issued"
elif OPERATION_TYPE.HOST_LOCK == host_operation.operation_type:
reason = "host lock command issued"
elif OPERATION_TYPE.HOST_DISABLE == host_operation.operation_type:
reason = "host disabled"
elif OPERATION_TYPE.HOST_FAILED == host_operation.operation_type:
if host.is_component_failure():
reason = "host component failure"
else:
reason = "host failed"
elif OPERATION_TYPE.MIGRATE_INSTANCES == host_operation.operation_type:
reason = "migrate instances requested"
else:
# NOTE(review): with reason left as None the "reason + ..." expressions
# further down would raise a TypeError -- confirm this operation type
# can't get here.
reason = None
# Seed the counter with what the host operation already reports as in
# progress (presumably a count of instances -- confirm).
migrates_inprogress = host_operation.total_inprogress()
instance_table = tables.tables_get_instance_table()
for instance in instance_table.on_host(host.name):
# Skip instances already handled by this host operation.
if host_operation.instance_exists(instance.uuid):
continue
if instance.is_deleting() or instance.is_deleted() or \
instance.is_locked() or instance.is_failed():
continue
# Cold migrate by default; live migrate an enabled or paused instance
# that supports it.
method = objects.INSTANCE_ACTION_TYPE.COLD_MIGRATE
if instance.is_enabled() or instance.is_paused():
if instance.supports_live_migration():
method = objects.INSTANCE_ACTION_TYPE.LIVE_MIGRATE
# Lock / migrate-instances: an instance that can't be migrated fails the
# host operation. Note that "reason" is reused for the failure text in
# this branch, which is harmless because the function returns right after.
if host_operation.operation_type in [OPERATION_TYPE.HOST_LOCK,
OPERATION_TYPE.MIGRATE_INSTANCES]:
if OPERATION_TYPE.HOST_LOCK == host_operation.operation_type:
preamble = "Lock of host"
else:
preamble = "Migrate instances from host"
if instance.is_paused() and \
objects.INSTANCE_ACTION_TYPE.COLD_MIGRATE == method:
reason = ("%s %s failed because instance %s is paused."
% (preamble, host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_suspended():
reason = ("%s %s failed because instance %s is suspended."
% (preamble, host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_migrating() or instance.is_cold_migrating():
reason = ("%s %s failed because instance %s is migrating."
% (preamble, host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_rebuilding():
reason = ("%s %s failed because instance %s is rebuilding."
% (preamble, host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_action_running():
# Nova will not migrate an instance if an action is already
# running.
reason = (
"%s %s failed because instance %s action in progress."
% (preamble, host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_resized():
# Nova will not migrate an instance if is resized and
# waiting for confirmation.
reason = (
"%s %s failed because instance %s is resizing."
% (preamble, host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
# The instance has to go somewhere: require an enabled hypervisor other
# than the host being emptied.
elif not self._hypervisors_available(min_count=1,
excluded_hosts=[host.name]):
reason = ("%s %s failed because there are no other "
"hypervisors available." % (preamble, host.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
else:
if objects.INSTANCE_ACTION_TYPE.LIVE_MIGRATE == method:
if not instance.can_live_migrate(system_initiated=True):
reason = ("%s %s failed because instance %s "
"can't be live-migrated by the system. "
"Manually move the instance off of host %s."
% (preamble, host.name, instance.name,
host.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
else:
if not instance.can_cold_migrate(system_initiated=True):
reason = ("%s %s failed because instance %s "
"can't be cold-migrated by the system. "
"Manually move the instance off of host %s."
% (preamble, host.name, instance.name,
host.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
# Any other operation type: an instance that can't be migrated is marked
# failed and the loop moves on to the next instance.
else:
if instance.is_paused() and \
objects.INSTANCE_ACTION_TYPE.COLD_MIGRATE == method:
DLOG.info("Instance %s set as failed on host %s "
"because it is paused." % (instance.name,
host.name))
instance.fail(reason + " and instance is paused")
continue
elif instance.is_suspended():
DLOG.info("Instance %s set as failed on host %s because "
"it is suspended." % (instance.name, host.name))
instance.fail(reason + " and instance is suspended")
continue
elif instance.is_migrating() or instance.is_cold_migrating():
# Allow current migrations to continue
DLOG.info("Instance %s on host %s is already migrating."
% (instance.name, host.name))
elif instance.is_rebuilding():
DLOG.info("Instance %s set as failed on host %s because "
"it is rebuilding." % (instance.name, host.name))
instance.fail(reason + " and instance is rebuilding")
continue
elif instance.is_action_running():
DLOG.info("Instance %s set as failed on host %s because "
"an action is in progress." % (instance.name,
host.name))
instance.fail(reason +
" and instance has action in progress")
continue
elif instance.is_resized():
DLOG.info("Instance %s set as failed on host %s because "
"it is resized." % (instance.name, host.name))
instance.fail(reason + " and instance is resized")
continue
else:
if objects.INSTANCE_ACTION_TYPE.LIVE_MIGRATE == method:
if not instance.can_live_migrate(system_initiated=True):
DLOG.info("Instance %s set as failed on host %s "
"because the system can't live-migrate "
"instance." % (instance.name, host.name))
instance.fail(reason + " and the system can't "
"live-migrate instance")
continue
else:
if not instance.can_cold_migrate(system_initiated=True):
DLOG.info("Instance %s set as failed on host %s "
"because the system can't cold-migrate "
"instance." % (instance.name, host.name))
instance.fail(reason + " and the system can't "
"cold-migrate instance")
continue
# Track the instance in the host operation; an instance that is already
# migrating is tracked without a new migrate action being issued.
host_operation.add_instance(instance.uuid, OPERATION_STATE.INPROGRESS)
if not (instance.is_migrating() or instance.is_cold_migrating()):
instance.do_action(method, initiated_by=initiated_by, reason=reason)
migrates_inprogress += 1
# Stop once the per-host limit of concurrent migrations is reached.
if migrates_inprogress >= self._max_concurrent_migrates_per_host:
break
def _host_evacuate_instances(self, host, host_operation):
"""
Host Evacuate Instances

Fail and, where possible, evacuate the instances on the given host on
behalf of the given host operation.

A set of pre-conditions is checked first (DOR complete, no upgrade in
progress, a hypervisor available, not a force-lock). Unlike the migrate
path a failed pre-condition does not end the call: the reason is logged,
the instance audit is rescheduled with the recovery audit cooldown and
do_evacuates is cleared, so the instances are still walked but no
evacuate action is issued.

Instances are processed in recovery priority order. Instances already
part of the host operation, or deleting, deleted or locked, are skipped.

For a HOST_LOCK operation the first instance that can't be evacuated is
added to the host operation as FAILED together with a failure reason,
and processing stops. For the other operation types the instance is
marked failed if it is not failed already.

An evacuate is issued only while fewer evacuations than the per-host
limit are in progress, and only for an instance with auto_recovery and
recoverable set that the system can evacuate.

NOTE(review): block indentation is not visible in this copy of the file,
so the nesting of the pre-condition checks and of the final
"if do_evacuates" block can't be confirmed from here.
"""
do_evacuates = True
# Pre-condition checks; each failure only disables the evacuations.
if host_operation.operation_type not in [OPERATION_TYPE.HOST_LOCK_FORCE,
OPERATION_TYPE.HOST_LOCK]:
if not dor.dor_is_complete():
DLOG.info("DOR is not complete, can't evacuate instances off of "
"host %s." % host.name)
self.reschedule_audit_instances(self._recovery_audit_cooldown)
do_evacuates = False
if do_evacuates and self.upgrade_inprogress():
DLOG.info("Upgrade inprogress, can't evacuate instances off of "
"host %s." % host.name)
self.reschedule_audit_instances(self._recovery_audit_cooldown)
do_evacuates = False
if do_evacuates and not self._hypervisors_available(min_count=1):
DLOG.info("No hypervisors available, can't evacuate instances "
"off of host %s." % host.name)
self.reschedule_audit_instances(self._recovery_audit_cooldown)
do_evacuates = False
if do_evacuates and \
OPERATION_TYPE.HOST_LOCK_FORCE == host_operation.operation_type:
DLOG.info("Force-Lock issued, can't evacuate instances off of "
"host %s until it is rebooted." % host.name)
self.reschedule_audit_instances(self._recovery_audit_cooldown)
do_evacuates = False
# Every action issued below is attributed to the director; the reason
# text depends on the kind of host operation being carried out.
initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR
if OPERATION_TYPE.HOST_LOCK_FORCE == host_operation.operation_type:
reason = "host force lock command issued"
elif OPERATION_TYPE.HOST_LOCK == host_operation.operation_type:
reason = "host lock command issued"
elif OPERATION_TYPE.HOST_DISABLE == host_operation.operation_type:
reason = "host disable action"
elif OPERATION_TYPE.HOST_FAILED == host_operation.operation_type:
if host.is_component_failure():
reason = "host component failure"
else:
reason = "host failed"
else:
reason = None
# Seed the counter with what the host operation already reports as in
# progress (presumably a count of instances -- confirm).
evacuates_inprogress = host_operation.total_inprogress()
instance_table = tables.tables_get_instance_table()
# Sort the instances on this host based on their recovery priority
evacuate_priority_list = list()
for instance in instance_table.on_host(host.name):
evacuate_priority_list.append(instance)
evacuate_priority_list.sort(
key=objects.Instance.recovery_sort_key, reverse=True)
for instance in evacuate_priority_list:
# Skip instances already handled by this host operation.
if host_operation.instance_exists(instance.uuid):
continue
if instance.is_deleting() or instance.is_deleted() or \
instance.is_locked():
continue
# Host lock: an instance that can't be evacuated fails the host
# operation. Note that "reason" is reused for the failure text in this
# branch, which is harmless because the function returns right after.
if OPERATION_TYPE.HOST_LOCK == host_operation.operation_type:
if instance.is_paused():
reason = ("Lock of host %s failed because instance %s "
"is paused." % (host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_suspended():
reason = ("Lock of host %s failed because instance %s "
"is suspended." % (host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_migrating() or instance.is_cold_migrating():
reason = ("Lock of host %s failed because instance %s "
"is migrating." % (host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_rebuilding():
reason = ("Lock of host %s failed because instance %s "
"is rebuilding." % (host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_action_running():
# Nova will not evacuate an instance if an action is already
# running.
reason = ("Lock of host %s failed because instance %s "
"action in progress." % (host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif instance.is_resized():
# Nova will not migrate an instance if is resized and
# waiting for confirmation.
reason = ("Lock of host %s failed because instance %s "
"is resizing." % (host.name, instance.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
# The instance has to go somewhere: require an enabled hypervisor other
# than the host being emptied.
elif not self._hypervisors_available(min_count=1,
excluded_hosts=[host.name]):
reason = ("Lock of host %s failed because there are no "
"other hypervisors available." % host.name)
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
elif not instance.can_evacuate(system_initiated=True):
reason = ("Lock of host %s failed because instance %s "
"can't be evacuated by the system. Manually "
"move the instance off of host %s."
% (host.name, instance.name, host.name))
DLOG.info(reason)
host_operation.add_instance(instance.uuid,
OPERATION_STATE.FAILED)
host_operation.update_failure_reason(reason)
return
# Any other operation type: make sure the instance is marked failed.
else:
if not instance.is_failed():
instance.fail(reason)
# Issue the evacuate only when the pre-conditions held, the per-host
# limit has room and the instance is eligible.
if do_evacuates:
if evacuates_inprogress < self._max_concurrent_evacuates_per_host:
if instance.auto_recovery and instance.recoverable and \
instance.can_evacuate(system_initiated=True):
host_operation.add_instance(instance.uuid,
OPERATION_STATE.INPROGRESS)
instance.do_action(objects.INSTANCE_ACTION_TYPE.EVACUATE,
initiated_by=initiated_by, reason=reason)
evacuates_inprogress += 1
@staticmethod
def _host_stop_instances(host, host_operation, instance_uuids):
    """
    Host Stop Instances

    Stop the instances on the host whose uuid is listed in
    instance_uuids.  Only stop-instances, host lock and host force lock
    operations are supported; any other operation type is failed.

    For a host lock or stop-instances operation the first instance that
    can't be stopped is added to the host operation as FAILED together
    with a failure reason, and processing stops.  For a force lock such
    an instance is marked failed and skipped instead.
    """
    operation_type = host_operation.operation_type
    if OPERATION_TYPE.STOP_INSTANCES == operation_type:
        reason = "stop instances issued"
    elif OPERATION_TYPE.HOST_LOCK_FORCE == operation_type:
        reason = "host force lock command issued"
    elif OPERATION_TYPE.HOST_LOCK == operation_type:
        reason = "host lock command issued"
    else:
        unsupported = ("Unsupported operation (%s) against host %s."
                       % (host_operation.operation_type, host.name))
        DLOG.info(unsupported)
        host_operation.set_failed(unsupported)
        return

    initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR

    # Fail the operation if an instance cannot be stopped, unless this
    # is a force lock, in which case only the instance is failed.
    abort_on_blocker = operation_type in [OPERATION_TYPE.HOST_LOCK,
                                          OPERATION_TYPE.STOP_INSTANCES]
    if OPERATION_TYPE.HOST_LOCK == operation_type:
        preamble = "Lock of host"
    else:
        preamble = "Stop instances on host"

    # Conditions that prevent a stop, evaluated in this order.  Each
    # entry holds: the check, the operation failure text, the log text
    # used when only the instance is failed, and the text appended to
    # the reason the instance is failed with.
    stop_blockers = (
        (lambda inst: inst.is_paused(),
         "%s %s failed because instance %s is paused.",
         "Instance %s set as failed on host %s because it is paused.",
         " and instance is paused"),
        (lambda inst: inst.is_suspended(),
         "%s %s failed because instance %s is suspended.",
         "Instance %s set as failed on host %s because it is suspended.",
         " and instance is suspended"),
        (lambda inst: inst.is_migrating() or inst.is_cold_migrating(),
         "%s %s failed because instance %s is migrating.",
         "Instance %s set as failed on host %s because it is migrating.",
         " and instance is migrating"),
        (lambda inst: inst.is_rebuilding(),
         "%s %s failed because instance %s is rebuilding.",
         "Instance %s set as failed on host %s because it is rebuilding.",
         " and instance is rebuilding"),
        # Nova will not stop an instance if an action is already
        # running.
        (lambda inst: inst.is_action_running(),
         "%s %s failed because instance %s action in progress.",
         "Instance %s set as failed on host %s because "
         "an action is in progress.",
         " and instance has action in progress"),
        # Nova will not stop an instance if is resized and waiting for
        # confirmation.
        (lambda inst: inst.is_resized(),
         "%s %s failed because instance %s is resizing.",
         "Instance %s set as failed on host %s because it is resized.",
         " and instance is resized"),
    )

    instance_table = tables.tables_get_instance_table()
    for instance in instance_table.on_host(host.name):
        if instance.uuid not in instance_uuids:
            # We were not asked to stop this instance
            DLOG.info("Ignoring instance %s while stopping instances on "
                      "host %s" % (instance.name, host.name))
            continue

        if host_operation.instance_exists(instance.uuid):
            continue

        if instance.is_deleting() or instance.is_deleted() or \
                instance.is_locked():
            continue

        blocker = next((entry for entry in stop_blockers
                        if entry[0](instance)), None)

        if blocker is not None:
            _, operation_text, instance_text, reason_suffix = blocker

            if abort_on_blocker:
                failure = operation_text % (preamble, host.name,
                                            instance.name)
                DLOG.info(failure)
                host_operation.add_instance(instance.uuid,
                                            OPERATION_STATE.FAILED)
                host_operation.update_failure_reason(failure)
                return

            # Force lock - fail instances that cannot be stopped.
            DLOG.info(instance_text % (instance.name, host.name))
            instance.fail(reason + reason_suffix)
            continue

        host_operation.add_instance(instance.uuid,
                                    OPERATION_STATE.INPROGRESS)
        instance.do_action(objects.INSTANCE_ACTION_TYPE.STOP,
                           initiated_by=initiated_by, reason=reason)
@staticmethod
def _host_start_instances(host, host_operation, instance_uuids):
    """
    Host Start Instances

    Start the instances on the host whose uuid is listed in
    instance_uuids.  Only the start-instances and start-instances-serial
    operations are supported; any other operation type is failed.

    The first instance that can't be started is added to the host
    operation as FAILED together with a failure reason, and processing
    stops.  In serial mode only the first startable instance is started,
    the rest are added to the host operation in the READY state.
    """
    operation_type = host_operation.operation_type
    if operation_type not in [OPERATION_TYPE.START_INSTANCES,
                              OPERATION_TYPE.START_INSTANCES_SERIAL]:
        unsupported = ("Unsupported operation (%s) against host %s."
                       % (host_operation.operation_type, host.name))
        DLOG.info(unsupported)
        host_operation.set_failed(unsupported)
        return

    initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR

    if OPERATION_TYPE.START_INSTANCES == operation_type:
        reason = "start instances issued"
    elif OPERATION_TYPE.START_INSTANCES_SERIAL == operation_type:
        reason = "start instances serial issued"
    else:
        reason = None

    serial_start = OPERATION_TYPE.START_INSTANCES_SERIAL == operation_type

    # Conditions that prevent a start, evaluated in this order, each
    # with the failure text recorded against the host operation.
    start_blockers = (
        (lambda inst: inst.is_paused(),
         "Start instances on host %s failed because instance %s "
         "is paused."),
        (lambda inst: inst.is_suspended(),
         "Start instances on host %s failed because instance %s "
         "is suspended."),
        (lambda inst: inst.is_migrating() or inst.is_cold_migrating(),
         "Start instances on host %s failed because instance %s "
         "is migrating."),
        (lambda inst: inst.is_rebuilding(),
         "Start instances on host %s failed because instance %s "
         "is rebuilding."),
        # Nova will not start an instance if an action is already
        # running.
        (lambda inst: inst.is_action_running(),
         "Start instances on host %s failed because instance %s "
         "action in progress."),
        # Nova will not start an instance if is resized and waiting for
        # confirmation.
        (lambda inst: inst.is_resized(),
         "Start instances on host %s failed because instance %s "
         "is resizing."),
        (lambda inst: not inst.is_locked(),
         "Start instances on host %s failed because instance %s "
         "is not locked."),
    )

    starts_inprogress = 0
    instance_table = tables.tables_get_instance_table()
    for instance in instance_table.on_host(host.name):
        if instance.uuid not in instance_uuids:
            # We were not asked to start this instance
            DLOG.info("Ignoring instance %s while starting instances on "
                      "host %s" % (instance.name, host.name))
            continue

        if host_operation.instance_exists(instance.uuid):
            continue

        if instance.is_deleting() or instance.is_deleted() or \
                instance.is_failed():
            continue

        blocker = next((entry for entry in start_blockers
                        if entry[0](instance)), None)

        if blocker is not None:
            failure = blocker[1] % (host.name, instance.name)
            DLOG.info(failure)
            host_operation.add_instance(instance.uuid,
                                        OPERATION_STATE.FAILED)
            host_operation.update_failure_reason(failure)
            return

        if serial_start and starts_inprogress >= 1:
            # When starting instances in serial, the first instance is
            # started and the rest are set to the READY state, to be
            # started later.
            host_operation.add_instance(instance.uuid,
                                        OPERATION_STATE.READY)
        else:
            host_operation.add_instance(instance.uuid,
                                        OPERATION_STATE.INPROGRESS)
            instance.do_action(objects.INSTANCE_ACTION_TYPE.START,
                               initiated_by=initiated_by, reason=reason)
            starts_inprogress += 1
def instance_migrate_complete(self, instance, from_host_name, failed=False,
timed_out=False, cancelled=False):
"""
Instance Migrate Complete

Handle the end of a migrate of the given instance away from the host
named from_host_name, as part of a host operation on that host.

The call is ignored when no host operation is recorded for the host,
when the host operation is not a force lock, lock, disable, failed or
migrate-instances operation, or when the host is not in the host table.

The outcome flags are mapped to a host operation state, with failed
taking precedence over timed_out, timed_out over cancelled, and
COMPLETED used when none of them is set. That state is recorded against
the instance in the host operation.

For a lock or migrate-instances operation an outcome other than
COMPLETED ends the host operation: the failure reason is recorded, the
host director is told the instances have moved, the software management
director is told about the failure (migrate-instances only) and the host
operation is removed. For the other operation types the instance is
marked failed if it is not failed already.

Otherwise the next batch of instances is migrated (hypervisor of the
host enabled) or evacuated (hypervisor not enabled). If the host
operation is then still in progress the host is notified that an
instance moved, otherwise the host director is told the instances have
moved and the host operation is removed.

NOTE(review): block indentation is not visible in this copy of the file,
so the exact nesting of the clean-up statements can't be confirmed from
here.
"""
# Imported inside the method rather than at the top of the module.
# NOTE(review): presumably to avoid a circular import -- confirm.
from nfv_vim import directors
host_director = directors.get_host_director()
host_operation = self._host_operations.get(from_host_name, None)
if host_operation is None:
DLOG.verbose("No host %s operation inprogress." % from_host_name)
return
if host_operation.operation_type not in [OPERATION_TYPE.HOST_LOCK_FORCE,
OPERATION_TYPE.HOST_LOCK,
OPERATION_TYPE.HOST_DISABLE,
OPERATION_TYPE.HOST_FAILED,
OPERATION_TYPE.MIGRATE_INSTANCES]:
DLOG.verbose("Unexpected host %s operation %s, ignoring."
% (from_host_name, host_operation.operation_type))
return
host_table = tables.tables_get_host_table()
from_host = host_table.get(from_host_name, None)
if from_host is None:
DLOG.verbose("Host %s does not exist." % from_host_name)
return
# Translate the outcome flags into a host operation state and a log line.
if failed:
reason = ("Migrate of instance %s from host %s failed."
% (instance.name, from_host_name))
host_operation_state = OPERATION_STATE.FAILED
DLOG.info(reason)
elif timed_out:
reason = ("Migrate of instance %s from host %s timed out."
% (instance.name, from_host_name))
host_operation_state = OPERATION_STATE.TIMED_OUT
DLOG.info(reason)
elif cancelled:
reason = ("Migrate of instance %s on host %s cancelled."
% (instance.name, from_host_name))
host_operation_state = OPERATION_STATE.CANCELLED
DLOG.info(reason)
else:
reason = ("Migrate of instance %s from host %s succeeded."
% (instance.name, from_host_name))
host_operation_state = OPERATION_STATE.COMPLETED
DLOG.info(reason)
host_operation.update_instance(instance.uuid, host_operation_state)
# Lock / migrate-instances: one unsuccessful migrate ends the operation.
if host_operation.operation_type in [OPERATION_TYPE.HOST_LOCK,
OPERATION_TYPE.MIGRATE_INSTANCES]:
if OPERATION_STATE.COMPLETED != host_operation_state:
host_operation.update_failure_reason(reason)
host_director.host_instances_moved(from_host, host_operation)
if OPERATION_TYPE.MIGRATE_INSTANCES == \
host_operation.operation_type:
sw_mgmt_director = directors.get_sw_mgmt_director()
sw_mgmt_director.migrate_instances_failed(reason)
# Look the host operation up again before deleting it; the entry is only
# removed when it is still present at this point.
host_operation = self._host_operations.get(from_host.name, None)
if host_operation is not None:
del self._host_operations[from_host.name]
return
# Other operation types: just make sure the instance is marked failed.
else:
if OPERATION_STATE.COMPLETED != host_operation_state:
if not instance.is_failed():
instance.fail()
# Continue with the next batch of instances
if self._is_hypervisor_enabled(from_host_name):
self._host_migrate_instances(from_host, host_operation)
else:
self._host_evacuate_instances(from_host, host_operation)
# Check if host operation is complete
if host_operation.is_inprogress():
from_host.notify_instance_moved()
else:
host_director.host_instances_moved(from_host, host_operation)
host_operation = self._host_operations.get(from_host.name, None)
if host_operation is not None:
del self._host_operations[from_host.name]
def instance_evacuate_complete(self, instance, from_host_name, failed=False,
                               timed_out=False, cancelled=False):
    """
    Instance Evacuate Complete

    Records the result of evacuating an instance against the operation
    tracked for the source host.  The failed, timed_out and cancelled
    flags are examined in that order; when none is set the evacuate is
    treated as successful.  An unsuccessful evacuate ends a host-lock
    operation right away; for the other accepted operation types the
    instance is failed and evacuation carries on with the next batch.
    """
    from nfv_vim import directors

    host_director = directors.get_host_director()

    tracked_operation = self._host_operations.get(from_host_name, None)
    if tracked_operation is None:
        DLOG.verbose("No host %s operation inprogress." % from_host_name)
        return

    if tracked_operation.operation_type not in (
            OPERATION_TYPE.HOST_LOCK_FORCE, OPERATION_TYPE.HOST_LOCK,
            OPERATION_TYPE.HOST_DISABLE, OPERATION_TYPE.HOST_FAILED):
        DLOG.verbose("Unexpected host %s operation %s, ignoring."
                     % (from_host_name, tracked_operation.operation_type))
        return

    from_host = tables.tables_get_host_table().get(from_host_name, None)
    if from_host is None:
        DLOG.verbose("Host %s does not exist." % from_host_name)
        return

    # Map the outcome flags onto an operation state and a log message.
    if failed:
        outcome = OPERATION_STATE.FAILED
        template = "Evacuate of instance %s from host %s failed."
    elif timed_out:
        outcome = OPERATION_STATE.TIMED_OUT
        template = "Evacuate of instance %s from host %s timed out."
    elif cancelled:
        outcome = OPERATION_STATE.CANCELLED
        template = "Evacuate of instance %s on host %s cancelled."
    else:
        outcome = OPERATION_STATE.COMPLETED
        template = "Evacuate of instance %s from host %s succeeded."

    reason = template % (instance.name, from_host_name)
    DLOG.info(reason)

    tracked_operation.update_instance(instance.uuid, outcome)

    unsuccessful = OPERATION_STATE.COMPLETED != outcome
    if OPERATION_TYPE.HOST_LOCK == tracked_operation.operation_type:
        if unsuccessful:
            # The lock cannot proceed: report the failure to the host
            # director and stop tracking the operation.
            tracked_operation.update_failure_reason(reason)
            host_director.host_instances_moved(from_host, tracked_operation)
            if self._host_operations.get(from_host.name, None) is not None:
                del self._host_operations[from_host.name]
            return
    elif unsuccessful and not instance.is_failed():
        instance.fail()

    # Continue with the next batch of instances
    self._host_evacuate_instances(from_host, tracked_operation)

    # Check if host operation is complete
    if tracked_operation.is_inprogress():
        from_host.notify_instance_moved()
        return

    host_director.host_instances_moved(from_host, tracked_operation)
    if self._host_operations.get(from_host.name, None) is not None:
        del self._host_operations[from_host.name]
def instance_stop_complete(self, instance, on_host_name, failed=False,
                           timed_out=False, cancelled=False):
    """
    Instance Stop Complete

    Records the result of stopping an instance against the operation
    tracked for its host.  The failed, timed_out and cancelled flags are
    examined in that order; when none is set the stop is treated as
    successful.  An unsuccessful stop ends a stop-instances or host-lock
    operation right away; for a force lock the operation proceeds.
    """
    from nfv_vim import directors

    host_director = directors.get_host_director()

    tracked_operation = self._host_operations.get(on_host_name, None)
    if tracked_operation is None:
        DLOG.verbose("No host %s operation inprogress." % on_host_name)
        return

    if tracked_operation.operation_type not in (
            OPERATION_TYPE.STOP_INSTANCES, OPERATION_TYPE.HOST_LOCK_FORCE,
            OPERATION_TYPE.HOST_LOCK):
        DLOG.verbose("Unexpected host %s operation %s, ignoring."
                     % (on_host_name, tracked_operation.operation_type))
        return

    host = tables.tables_get_host_table().get(on_host_name, None)
    if host is None:
        DLOG.verbose("Host %s does not exist." % on_host_name)
        return

    # Map the outcome flags onto an operation state and a log message.
    if failed:
        outcome = OPERATION_STATE.FAILED
        template = "Stop of instance %s on host %s failed."
    elif timed_out:
        outcome = OPERATION_STATE.TIMED_OUT
        template = "Stop of instance %s on host %s timed out."
    elif cancelled:
        outcome = OPERATION_STATE.CANCELLED
        template = "Stop of instance %s on host %s cancelled."
    else:
        outcome = OPERATION_STATE.COMPLETED
        template = "Stop of instance %s on host %s succeeded."

    reason = template % (instance.name, on_host_name)
    DLOG.info(reason)

    tracked_operation.update_instance(instance.uuid, outcome)

    unsuccessful = OPERATION_STATE.COMPLETED != outcome
    if tracked_operation.operation_type in (OPERATION_TYPE.STOP_INSTANCES,
                                            OPERATION_TYPE.HOST_LOCK):
        if unsuccessful:
            # The operation cannot proceed: report the failure to the
            # host director and stop tracking the operation.
            tracked_operation.update_failure_reason(reason)
            host_director.host_instances_stopped(host, tracked_operation)
            if self._host_operations.get(host.name, None) is not None:
                del self._host_operations[host.name]
            return
    elif unsuccessful:
        # Fail the instance because we are going to proceed with the
        # operation regardless. Don't fail the instance if the stop
        # operation was cancelled - this is a force lock and we
        # fail instances that have operations in progress. Doing
        # another fail here will cause a loop because the
        # instance.is_failed() will not be true until nova has
        # reported the updated state to us.
        if not (instance.is_failed() or cancelled):
            instance.fail()

    # Check if host operation is complete
    if tracked_operation.is_inprogress():
        host.notify_instance_stopped()
        return

    host_director.host_instances_stopped(host, tracked_operation)
    if self._host_operations.get(host.name, None) is not None:
        del self._host_operations[host.name]
def instance_start_complete(self, instance, on_host_name, failed=False,
                            timed_out=False, cancelled=False):
    """
    Instance Start Complete

    Records the result of starting an instance against the start
    operation tracked for its host.  The failed, timed_out and cancelled
    flags are examined in that order; when none is set the start is
    treated as successful.  For a serial start operation the next ready
    instance on the host is started before returning.
    """
    tracked_operation = self._host_operations.get(on_host_name, None)
    if tracked_operation is None:
        DLOG.verbose("No host %s operation inprogress." % on_host_name)
        return

    if tracked_operation.operation_type not in (
            OPERATION_TYPE.START_INSTANCES,
            OPERATION_TYPE.START_INSTANCES_SERIAL):
        DLOG.verbose("Unexpected host %s operation %s, ignoring."
                     % (on_host_name, tracked_operation.operation_type))
        return

    host = tables.tables_get_host_table().get(on_host_name, None)
    if host is None:
        DLOG.verbose("Host %s does not exist." % on_host_name)
        return

    # Map the outcome flags onto an operation state and a log message.
    if failed:
        outcome = OPERATION_STATE.FAILED
        template = "Start of instance %s on host %s failed."
    elif timed_out:
        outcome = OPERATION_STATE.TIMED_OUT
        template = "Start of instance %s on host %s timed out."
    elif cancelled:
        outcome = OPERATION_STATE.CANCELLED
        template = "Start of instance %s on host %s cancelled."
    else:
        outcome = OPERATION_STATE.COMPLETED
        template = "Start of instance %s on host %s succeeded."

    reason = template % (instance.name, on_host_name)
    DLOG.info(reason)

    tracked_operation.update_instance(instance.uuid, outcome)
    if OPERATION_STATE.COMPLETED != outcome:
        tracked_operation.update_failure_reason(reason)

    if OPERATION_TYPE.START_INSTANCES_SERIAL == \
            tracked_operation.operation_type:
        # Check if there is another instance on this host ready to start.
        # We continue starting instances even if the previous instance
        # failed to start.
        instance_table = tables.tables_get_instance_table()
        for candidate in instance_table.on_host(host.name):
            if not tracked_operation.instance_ready(candidate.uuid):
                continue
            tracked_operation.update_instance(candidate.uuid,
                                              OPERATION_STATE.INPROGRESS)
            candidate.do_action(
                objects.INSTANCE_ACTION_TYPE.START,
                initiated_by=objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR,
                reason="start instances serial issued")
            return

    # Check if host operation is complete
    if tracked_operation.is_inprogress():
        return

    if self._host_operations.get(host.name, None) is not None:
        del self._host_operations[host.name]
def _host_disabling_okay(self, host, host_operation):
    """
    Host Disabling Semantic checks

    Only host-lock operations are checked; every other operation type is
    accepted.  Each instance on the host that is not deleting, deleted,
    locked or failed is examined, and the first one that blocks the lock
    is recorded as failed on the host operation together with the reason.
    Returns False when the lock is rejected, True otherwise.
    """
    if OPERATION_TYPE.HOST_LOCK != host_operation.operation_type:
        return True

    action_type = objects.INSTANCE_ACTION_TYPE

    for instance in tables.tables_get_instance_table().on_host(host.name):
        if (instance.is_deleting() or instance.is_deleted() or
                instance.is_locked() or instance.is_failed()):
            continue

        # Work out how this instance would be taken off the host.
        if not self._is_hypervisor_enabled(host.name):
            operation = action_type.EVACUATE
        elif self._single_hypervisor:
            # Only one hypervisor so instances will be stopped
            operation = action_type.STOP
        elif ((instance.is_enabled() or instance.is_paused()) and
                instance.supports_live_migration()):
            operation = action_type.LIVE_MIGRATE
        else:
            # Default behaviour is to cold migrate instance
            operation = action_type.COLD_MIGRATE

        # Find the first reason (if any) this instance blocks the lock.
        reason = None
        if instance.is_paused() and action_type.LIVE_MIGRATE != operation:
            reason = ("Lock of host %s rejected because instance %s "
                      "is paused." % (host.name, instance.name))

        elif instance.is_suspended():
            reason = ("Lock of host %s rejected because instance %s "
                      "is suspended." % (host.name, instance.name))

        elif instance.is_migrating() or instance.is_cold_migrating():
            reason = ("Lock of host %s rejected because instance %s "
                      "is migrating." % (host.name, instance.name))

        elif instance.is_rebuilding():
            reason = ("Lock of host %s rejected because instance %s "
                      "is rebuilding." % (host.name, instance.name))

        elif instance.is_action_running():
            # Nova will not migrate or evacuate an instance if an action
            # is already running.
            reason = ("Lock of host %s rejected because instance %s "
                      "action in progress." % (host.name, instance.name))

        elif instance.is_resized():
            # Nova will not migrate or evacuate an instance if is resized
            # and waiting for confirmation.
            reason = ("Lock of host %s rejected because instance %s "
                      "is resizing." % (host.name, instance.name))

        elif not self._single_hypervisor and \
                not self._hypervisors_available(min_count=1,
                                                excluded_hosts=[host.name]):
            reason = ("Lock of host %s rejected because there are no "
                      "other hypervisors available." % host.name)

        elif action_type.LIVE_MIGRATE == operation:
            if not instance.can_live_migrate(system_initiated=True):
                reason = ("Lock of host %s rejected because instance %s "
                          "can't be live-migrated by the system. "
                          "Manually move the instance off of host %s."
                          % (host.name, instance.name, host.name))

        elif action_type.COLD_MIGRATE == operation:
            if not instance.can_cold_migrate(system_initiated=True):
                reason = ("Lock of host %s rejected because instance %s "
                          "can't be cold-migrated by the system. "
                          "Manually move the instance off of host %s."
                          % (host.name, instance.name, host.name))

        elif action_type.EVACUATE == operation:
            if not instance.can_evacuate(system_initiated=True):
                reason = ("Lock of host %s rejected because instance %s "
                          "can't be evacuated by the system. Manually "
                          "move the instance off of host %s."
                          % (host.name, instance.name, host.name))

        if reason is not None:
            DLOG.info(reason)
            host_operation.add_instance(instance.uuid,
                                        OPERATION_STATE.FAILED)
            host_operation.update_failure_reason(reason)
            return False

    return True
def host_operation_cancel(self, host_name):
    """
    Host Operation Cancel

    Stops tracking the operation recorded for the named host, if there
    is one.  Nothing happens when no operation is recorded.
    """
    pending = self._host_operations.get(host_name, None)
    if pending is None:
        return

    DLOG.info("Canceling host operation %s for host %s."
              % (pending.operation_type, host_name))
    del self._host_operations[host_name]
def host_services_disabling(self, host):
    """
    Host Services Disabling

    Replaces any operation tracked for the host with a new one whose type
    reflects the host state (force lock, lock, failed, otherwise
    disable).  When the disabling checks pass, the operation is tracked
    and the instances on the host are stopped, migrated or evacuated.
    Returns the new host operation in every case.
    """
    DLOG.info("Host %s services disabling." % host.name)

    if host.is_force_lock():
        operation_type = OPERATION_TYPE.HOST_LOCK_FORCE
    elif host.is_locking():
        operation_type = OPERATION_TYPE.HOST_LOCK
    elif host.is_failed():
        operation_type = OPERATION_TYPE.HOST_FAILED
    else:
        operation_type = OPERATION_TYPE.HOST_DISABLE

    previous = self._host_operations.get(host.name, None)
    if previous is not None:
        DLOG.debug("Canceling previous host operation %s, before "
                   "continuing with host operation %s for %s."
                   % (previous.operation_type, operation_type,
                      host.name))
        del self._host_operations[host.name]

    host_operation = Operation(operation_type)

    DLOG.verbose("Host %s operation %s inprogress."
                 % (host.name, operation_type))

    if not self._host_disabling_okay(host, host_operation):
        return host_operation

    self._host_operations[host.name] = host_operation

    if self._single_hypervisor:
        # In single hypervisor configurations, we stop the instances
        # before disabling the host services.
        uuids_to_stop = []
        for instance in tables.tables_get_instance_table().on_host(host.name):
            # Stop any running instances.
            if instance.is_locked():
                continue
            uuids_to_stop.append(instance.uuid)
            # Instance should be unlocked when hypervisor recovers
            instance.unlock_to_recover = True

        if uuids_to_stop:
            self._host_stop_instances(host, host_operation, uuids_to_stop)

    elif self._is_hypervisor_enabled(host.name):
        # Migrate instances from this host before stopping host services.
        self._host_migrate_instances(host, host_operation)

    else:
        # Evacuate instances from this host before stopping host services.
        self._host_evacuate_instances(host, host_operation)

    return host_operation
def host_services_disabled(self, host):
    """
    Host Services Disabled

    Replaces any operation tracked for the host with a new one whose type
    reflects the host state (force lock, lock, failed, otherwise
    disable), evacuates the host's instances unless this is a single
    hypervisor configuration, and returns the new host operation.
    """
    DLOG.info("Host %s services disabled." % host.name)

    if host.is_force_lock():
        operation_type = OPERATION_TYPE.HOST_LOCK_FORCE
    elif host.is_locking():
        operation_type = OPERATION_TYPE.HOST_LOCK
    elif host.is_failed():
        operation_type = OPERATION_TYPE.HOST_FAILED
    else:
        operation_type = OPERATION_TYPE.HOST_DISABLE

    previous = self._host_operations.get(host.name, None)
    if previous is not None:
        DLOG.debug("Canceling previous host operation %s, before "
                   "continuing with host operation %s for %s."
                   % (previous.operation_type, operation_type,
                      host.name))
        del self._host_operations[host.name]

    replacement = Operation(operation_type)

    DLOG.verbose("Host %s operation %s inprogress."
                 % (host.name, operation_type))

    self._host_operations[host.name] = replacement

    # Do not evacuate instances from this host if we are in a single
    # hypervisor configuration.
    if not self._single_hypervisor:
        self._host_evacuate_instances(host, replacement)

    return replacement
def host_disabled(self, host):
    """
    Host Disabled

    Logs the event and stops tracking any operation recorded for the
    host.
    """
    DLOG.info("Host %s disabled." % host.name)

    pending = self._host_operations.get(host.name, None)
    if pending is None:
        return

    DLOG.debug("Canceling host operation %s for %s."
               % (pending.operation_type, host.name))
    del self._host_operations[host.name]
@staticmethod
def host_offline(host):
    """
    Host Offline

    Notifies every instance on the host, other than those deleting or
    deleted, that the host is offline.
    """
    for instance in tables.tables_get_instance_table().on_host(host.name):
        if not (instance.is_deleting() or instance.is_deleted()):
            DLOG.info("Host %s is offline, notifying instance %s."
                      % (host.name, instance.name))
            instance.host_offline()
def host_audit(self, host):
    """
    Host Audit

    Fails the instances on a host that is not enabled, is failed or is
    offline.  Instances that are deleting, deleted, locked or already
    failed are left alone.  The audit is skipped while DOR system
    stabilization is incomplete, while an upgrade is inprogress, or while
    an operation is tracked for the host.
    """
    if not dor.system_is_stabilized():
        DLOG.info("DOR system stabilization is not complete, can't audit "
                  "instances on host %s." % host.name)
        return

    if self.upgrade_inprogress():
        DLOG.info("Upgrade inprogress, can't audit instances on host %s."
                  % host.name)
        return

    pending = self._host_operations.get(host.name, None)
    if pending is not None:
        DLOG.debug("Host operation %s for %s inprogress, can't audit "
                   "instances." % (pending.operation_type,
                                   host.name))
        return

    # Nothing to do for a host that is enabled, not failed and not offline.
    if (host.nfvi_host_is_enabled() and not host.is_failed() and
            not host.is_offline()):
        return

    for instance in tables.tables_get_instance_table().on_host(host.name):
        if (instance.is_deleting() or instance.is_deleted() or
                instance.is_locked() or instance.is_failed()):
            continue

        DLOG.info("Host %s is failed or offline, setting instance %s "
                  "to failed, host audit." % (host.name, instance.name))
        instance.fail()
@staticmethod
def host_has_instances(host, skip_stopped=False):
    """
    Returns true if the given host has instances

    Instances that are deleting or deleted are never counted; locked
    instances are also left out when skip_stopped is set.
    """
    instance_table = tables.tables_get_instance_table()
    count = sum(
        1 for instance in instance_table.on_host(host.name)
        if not (instance.is_deleting() or instance.is_deleted())
        and not (skip_stopped and instance.is_locked()))

    DLOG.info("Host %s has %s instances." % (host.name, count))
    return count > 0
@coroutine
def _instance_create_callback(self, instance_name, callback):
    """
    Instance Create Callback

    Waits for the create response.  On completion the instance table is
    updated from the returned instance data and the callback receives the
    instance uuid; otherwise the callback receives None as the uuid.
    """
    response = (yield)
    DLOG.verbose("Instance-Create callback response=%s." % response)

    if not response['completed']:
        callback(response['completed'], instance_name, None)
        return

    created = response['result-data']
    instance_table = tables.tables_get_instance_table()
    instance = instance_table.get(created.uuid, None)
    if instance is None:
        # First time this instance is seen: add it to the table.
        instance = objects.Instance(created)
        instance_table[instance.uuid] = instance

    instance.nfvi_instance_update(created)
    callback(response['completed'], instance_name, instance.uuid)
@coroutine
def _instance_type_create_callback(self, instance_name, instance_type_uuid,
                                   image_uuid, block_devices, networks,
                                   callback):
    """
    Instance-Type Create Callback

    Waits for the instance-type create response.  On completion the
    instance type is stored in the instance-type table and creation of
    the instance itself is requested; otherwise the callback is invoked
    with None as the instance uuid.
    """
    response = (yield)
    DLOG.verbose("Instance-Type-Create callback response=%s." % response)

    if not response['completed']:
        callback(response['completed'], instance_name, None)
        return

    created_type = response['result-data']
    type_table = tables.tables_get_instance_type_table()
    type_table[created_type.uuid] = created_type

    # Chain on to the instance creation, reporting through the same callback.
    on_instance_created = self._instance_create_callback(instance_name,
                                                         callback)
    nfvi.nfvi_create_instance(instance_name, instance_type_uuid,
                              image_uuid, block_devices, networks,
                              on_instance_created)
def create_instance(self, instance_name, instance_type_uuid, vcpus,
                    mem_mb, disk_gb, ephemeral_gb, swap_gb, image_uuid,
                    block_devices, networks, auto_recovery,
                    live_migration_timeout, live_migration_max_downtime,
                    callback):
    """
    Create an instance

    Requests creation of an instance type named "<instance_name>-type"
    with the given attributes.  The instance itself is created from the
    instance-type create callback, and the outcome is reported through
    callback.
    """
    on_type_created = self._instance_type_create_callback(
        instance_name, instance_type_uuid, image_uuid, block_devices,
        networks, callback)

    type_name = "%s-type" % instance_name

    type_attributes = nfvi.objects.v1.InstanceTypeAttributes(
        vcpus, mem_mb, disk_gb, ephemeral_gb, swap_gb, None, auto_recovery,
        live_migration_timeout, live_migration_max_downtime)

    nfvi.nfvi_create_instance_type(instance_type_uuid, type_name,
                                   type_attributes, on_type_created)
@staticmethod
def delete_instance(instance):
    """
    Delete an instance

    Logs the request and issues the delete action on the instance.
    """
    DLOG.info("Instance %s delete requested." % instance.uuid)
    delete_action = objects.INSTANCE_ACTION_TYPE.DELETE
    instance.do_action(delete_action)
@staticmethod
def instance_audit(instance):
    """
    Notifies the instance director that an instance audit is inprogress

    The notification is passed on to the software management director.
    """
    from nfv_vim import directors

    DLOG.verbose("Notify other directors that an instance %s audit is "
                 "inprogress." % instance.name)

    # NOTE(review): the sw-mgmt director's host_audit is called with an
    # instance here - confirm that is the intended entry point.
    directors.get_sw_mgmt_director().host_audit(instance)
@staticmethod
def instance_state_change_notify(instance):
    """
    Notifies the instance director that a instance has changed state

    The notification is passed on to the software management director.
    """
    from nfv_vim import directors

    DLOG.info("Instance %s state change notification." % instance.name)

    directors.get_sw_mgmt_director().instance_state_change(instance)
def instance_recovered(self, instance):
    """
    Instance has signalled that it has recovered

    Resets the reboot count kept for the instance to zero.
    """
    log_args = (instance.name, instance.host_name)
    DLOG.info("Instance %s has recovered on host %s." % log_args)

    self._reboot_count[instance.uuid] = 0
def recover_instance(self, instance, recovery_method=None, force_fail=False,
                     fail_reason=None):
    """
    Recover an instance

    Selects a recovery action for the instance (unless recovery_method is
    given), marks the instance failed and issues the action.  Recovery is
    deferred, with the recovery audit rescheduled, while DOR system
    stabilization is incomplete or an upgrade is inprogress.

    Returns False when recovery is deferred, or when it is skipped because
    auto-recovery is off or the instance is not recoverable; otherwise
    returns whether a recovery method was selected.
    """
    if not dor.system_is_stabilized():
        DLOG.info("DOR system stabilization is not complete, can't "
                  "recover instance %s." % instance.name)
        self.reschedule_audit_instances(self._recovery_audit_cooldown)
        return False

    if self.upgrade_inprogress():
        DLOG.info("Upgrade inprogress, can't recover instance %s."
                  % instance.name)
        self.reschedule_audit_instances(self._recovery_audit_cooldown)
        return False

    # Start tracking reboots for instances seen for the first time.
    if instance.uuid not in self._reboot_count:
        self._reboot_count[instance.uuid] = 0

    # The reboot count is held at zero until DOR is complete.
    if not dor.dor_is_complete():
        self._reboot_count[instance.uuid] = 0

    method = recovery_method
    if method is None:
        if self._is_host_enabled(instance.host_name) and \
                self._is_hypervisor_enabled(instance.host_name):
            # Host and hypervisor are enabled: recover the instance in
            # place, cancelling whatever action it is stuck in.

            # Evacuates are indicated by the instance is rebuilding state
            if instance.is_rebuilding():
                force_fail = True
                instance.cancel_action(objects.INSTANCE_ACTION_TYPE.REBUILD)
                # A rebuild is only chosen when the instance has an image.
                if instance.image_uuid is not None:
                    method = objects.INSTANCE_ACTION_TYPE.REBUILD
                else:
                    method = objects.INSTANCE_ACTION_TYPE.REBOOT

            elif instance.is_migrating():
                force_fail = True
                instance.cancel_action(
                    objects.INSTANCE_ACTION_TYPE.LIVE_MIGRATE)
                if instance.image_uuid is not None:
                    method = objects.INSTANCE_ACTION_TYPE.REBUILD
                else:
                    method = objects.INSTANCE_ACTION_TYPE.REBOOT

            else:
                if instance.is_rebooting():
                    force_fail = True
                    instance.cancel_action(objects.INSTANCE_ACTION_TYPE.REBOOT)

                # Escalate to rebuild if last reboot didn't work
                if self._reboot_count[instance.uuid] > 0:
                    if instance.image_uuid is not None:
                        method = objects.INSTANCE_ACTION_TYPE.REBUILD
                    else:
                        method = objects.INSTANCE_ACTION_TYPE.REBOOT
                else:
                    method = objects.INSTANCE_ACTION_TYPE.REBOOT

        elif instance.can_evacuate(system_initiated=True):
            if dor.dor_is_complete():
                method = objects.INSTANCE_ACTION_TYPE.EVACUATE
                if instance.is_rebooting():
                    force_fail = True
                    instance.cancel_action(objects.INSTANCE_ACTION_TYPE.REBOOT)
            else:
                # Before DOR completes, evacuate only when the instance's
                # host is not in the host table; otherwise no method is
                # selected on this pass.
                host_table = tables.tables_get_host_table()
                host = host_table.get(instance.host_name, None)
                if host is None:
                    method = objects.INSTANCE_ACTION_TYPE.EVACUATE
                    if instance.is_rebooting():
                        force_fail = True
                        instance.cancel_action(
                            objects.INSTANCE_ACTION_TYPE.REBOOT)
        else:
            DLOG.info("Instance %s can't be evacuated by the system."
                      % instance.name)
            if not instance.is_failed() or force_fail:
                instance.fail(fail_reason)

    if method is not None:
        DLOG.info("Attempt recovery of instance %s by %s, "
                  "uuid=%s, host_name=%s, admin_state=%s, "
                  "oper_state=%s, avail_status=%s, action=%s, "
                  "elapse_time_in_state=%s secs."
                  % (instance.name, method, instance.uuid,
                     instance.host_name, instance.admin_state,
                     instance.oper_state, instance.avail_status,
                     instance.action, instance.elapsed_time_in_state))

        # The instance is marked failed before the skip checks below, so
        # it is failed even when no action ends up being issued.
        if not instance.is_failed() or force_fail:
            instance.fail(fail_reason)

        if not instance.auto_recovery:
            DLOG.info("Recovery of instance %s by %s is skipped since "
                      "auto-recovery is turned off." % (instance.name, method))
            return False

        if not instance.recoverable:
            DLOG.info("Instance %s by %s is skipped since instance is not "
                      "recoverable." % (instance.name, method))
            return False

        initiated_by = objects.INSTANCE_ACTION_INITIATED_BY.DIRECTOR

        # Only a reboot increments the reboot count; every other handled
        # method resets it.  A method outside these four issues no action.
        if objects.INSTANCE_ACTION_TYPE.REBOOT == method:
            self._reboot_count[instance.uuid] += 1
            instance.do_action(method, initiated_by=initiated_by)

        elif objects.INSTANCE_ACTION_TYPE.REBUILD == method:
            self._reboot_count[instance.uuid] = 0
            instance.do_action(method, initiated_by=initiated_by)

        elif objects.INSTANCE_ACTION_TYPE.EVACUATE == method:
            self._reboot_count[instance.uuid] = 0
            instance.do_action(method, initiated_by=initiated_by)

        elif objects.INSTANCE_ACTION_TYPE.STOP == method:
            self._reboot_count[instance.uuid] = 0
            instance.do_action(method, initiated_by=initiated_by)

    return method is not None
def reschedule_audit_instances(self, interval=None):
    """
    Reschedule audit instances

    Reschedules the instance audit timer, when one exists, to the given
    interval.  When no interval is given the next audit interval held by
    the director is used.
    """
    effective_interval = interval
    if effective_interval is None:
        effective_interval = self._next_audit_interval

    if self._timer_audit_instances is None:
        return

    timers.timers_reschedule_timer(self._timer_audit_instances,
                                   effective_interval)
    DLOG.verbose("Recovery audit is rescheduled to %s second "
                 "intervals." % effective_interval)
def recover_instances(self, audit=False):
    """
    Recover instances that were previously launched but are currently
    failed or executing an action for a very long time

    When audit is False no recovery is attempted; the audit timer is only
    moved to the cooldown interval if the recovery list is empty and the
    regular audit interval is in effect.  When audit is True a batch of
    instances from the recovery list is processed and the audit is
    rescheduled for the next batch or the next cycle.
    """
    if not dor.system_is_stabilized():
        DLOG.info("DOR system stabilization is not complete, can't recover "
                  "instances.")
        self.reschedule_audit_instances(self._recovery_audit_cooldown)
        return

    if self.upgrade_inprogress():
        DLOG.info("Upgrade inprogress, can't recover instances.")
        self.reschedule_audit_instances(self._recovery_audit_cooldown)
        return

    if not self._hypervisors_available(min_count=1):
        DLOG.info("No hypervisors available, can't recover instances.")
        self.reschedule_audit_instances(self._recovery_audit_cooldown)
        return

    if not audit:
        # Not running from the audit timer: at most bring the next audit
        # forward, then leave the recovery itself to the audit.
        interval = self._recovery_audit_cooldown
        if 0 == len(self._instance_recovery_list):
            if self._next_audit_interval == self._recovery_audit_interval:
                self.reschedule_audit_instances(interval)
                self._next_audit_interval = interval
        return

    if 0 == len(self._instance_recovery_list):
        # Previous cycle is finished: build a fresh recovery list.
        (self._next_audit_interval, self._instance_recovery_list,
         self._instance_failed_list, self._instance_rebuilding_list,
         self._instance_migrating_list, self._instance_rebooting_list) \
            = self._get_instance_recovery_list()

        DLOG.info("Running recovery audit, instances_to_recover=%s, "
                  "instances_failed=%s, instances_rebuilding=%s, "
                  "instances_migrating=%s, instances_rebooting=%s."
                  % (len(self._instance_recovery_list),
                     len(self._instance_failed_list),
                     len(self._instance_rebuilding_list),
                     len(self._instance_migrating_list),
                     len(self._instance_rebooting_list)))
    else:
        DLOG.info("Running recovery audit, instances_remaining=%s."
                  % len(self._instance_recovery_list))

    # Attempt to recover instances, whether resources are available or not
    instance_table = tables.tables_get_instance_table()
    count = 0

    # Use a lower cutoff when there are a large number of instances to recover
    cutoff = self._max_concurrent_recovering_instances
    if len(self._instance_recovery_list) > self._recovery_threshold:
        cutoff = self._max_throttled_recovering_instances

    # Iterate over a copy because entries are removed from the recovery
    # list as they are processed.
    for instance_recover in list(self._instance_recovery_list):
        instance = instance_table.get(instance_recover.uuid, None)
        if instance is not None:
            host_operation_inprogress = False
            host_operation = self._host_operations.get(instance.host_name, None)
            if host_operation is not None:
                if host_operation.is_inprogress():
                    DLOG.debug("Skip recovery of instance %s, host %s "
                               "operation inprogress." % (instance.name,
                                                          instance.host_name))
                    host_operation_inprogress = True

            if not host_operation_inprogress:
                if not (instance.is_deleting() or instance.is_deleted() or
                        instance.is_locked()) and instance.is_failed():
                    if self._instance_recovery_allowed(instance):
                        if self.recover_instance(instance):
                            count += 1

        # The entry is dropped whether or not a recovery was started.
        self._instance_recovery_list.remove(instance_recover)
        if count >= cutoff:
            break

    if 0 == len(self._instance_recovery_list):
        self.unlock_instances()
        self.reschedule_audit_instances()
        DLOG.info("Completed recovery audit cycle.")
    else:
        self.reschedule_audit_instances(self._recovery_audit_batch_interval)
        DLOG.info("Completed recovery audit batch.")
def unlock_instances(self):
    """
    Unlock (start) instances that were locked (stopped) when a single
    hypervisor configuration had its hypervisor disabled. This should only
    be done after all unlocked instances have been recovered.

    The unlock-complete marker file is created afterwards so that the
    unlock is not attempted again while that file exists.
    """
    if os.path.exists(NFV_VIM_UNLOCK_COMPLETE_FILE):
        return

    if self._single_hypervisor:
        DLOG.info("Unlocking instances after hypervisor enabled")

        uuids_to_start = []
        for instance in tables.tables_get_instance_table().values():
            if instance.unlock_to_recover and instance.is_locked():
                uuids_to_start.append(instance.uuid)
            instance.unlock_to_recover = False

        if uuids_to_start:
            self.start_instances(uuids_to_start, serial=True)

    # Do not attempt to do the unlock again.
    open(NFV_VIM_UNLOCK_COMPLETE_FILE, 'w').close()
@coroutine
def audit_instances(self):
    """
    Audit Instances

    Runs an instance recovery pass, in audit mode, each time the
    coroutine is resumed.
    """
    while True:
        _ = (yield)
        self.recover_instances(audit=True)
def cleanup_instance(self, instance_uuid):
    """
    Cleanup an instance

    Queues the instance uuid for cleanup (once only) and creates the
    cleanup timer when none exists yet.
    """
    pending = self._instance_cleanup_list
    if instance_uuid not in pending:
        pending.append(instance_uuid)

    if self._timer_cleanup_instances is not None:
        return

    self._timer_cleanup_instances = timers.timers_create_timer(
        "cleanup-instances", 1, 1, self.cleanup_instances)
@coroutine
def cleanup_instances(self):
    """
    Cleanup Instances

    Removes the queued instances that are deleted from the instance
    table, empties the cleanup queue, clears the cleanup timer reference
    and, when at least one instance was removed, triggers instance
    recovery.
    """
    (yield)

    instance_table = tables.tables_get_instance_table()
    removed_any = False

    for queued_uuid in self._instance_cleanup_list:
        instance = instance_table.get(queued_uuid, None)
        if instance is not None and instance.is_deleted():
            DLOG.info("Cleaned up instance %s" % instance.name)
            del instance_table[queued_uuid]
            removed_any = True

    # Empty the queue in place so existing references see the change.
    self._instance_cleanup_list[:] = []
    self._timer_cleanup_instances = None

    if removed_any:
        DLOG.info("Recover-Instances-Audit triggered by instance deletion.")
        self.recover_instances()
def migrate_instances(self, instance_uuids):
    """
    Migrate Instances

    Validates every requested instance, creates one migrate operation per
    host involved, and starts migrating on each host.  Returns an overall
    operation: it is set failed when an instance or host does not exist
    or another operation is inprogress on a host, otherwise it records
    the state of each host operation that was started.  Processing stops
    at the first host operation that failed or timed out.
    """
    DLOG.info("Migrate instances uuids=%s." % instance_uuids)

    host_table = tables.tables_get_host_table()
    instance_table = tables.tables_get_instance_table()

    overall_operation = Operation(OPERATION_TYPE.MIGRATE_INSTANCES)
    per_host_operations = dict()

    # First pass: validate the request and create one operation per host.
    for uuid in instance_uuids:
        instance = instance_table.get(uuid, None)
        if instance is None:
            reason = "Instance %s does not exist." % uuid
            DLOG.info(reason)
            overall_operation.set_failed(reason)
            return overall_operation

        if host_table.get(instance.host_name, None) is None:
            reason = "Host %s does not exist." % instance.host_name
            DLOG.info(reason)
            overall_operation.set_failed(reason)
            return overall_operation

        existing = self._host_operations.get(instance.host_name, None)
        if existing is not None:
            if existing.is_inprogress():
                reason = ("Another host operation %s is already inprogress "
                          "for host %s." % (existing.operation_type,
                                            instance.host_name))
                DLOG.info(reason)
                overall_operation.set_failed(reason)
                return overall_operation

            # A finished operation is no longer of interest.
            del self._host_operations[instance.host_name]

        if per_host_operations.get(instance.host_name, None) is None:
            per_host_operations[instance.host_name] = \
                Operation(OPERATION_TYPE.MIGRATE_INSTANCES)

    # Second pass: start migrating on each host and collect the results.
    for name, operation in per_host_operations.items():
        self._host_operations[name] = operation
        self._host_migrate_instances(host_table[name], operation)

        if operation.is_inprogress():
            overall_operation.add_host(name, OPERATION_STATE.INPROGRESS)
            continue

        if operation.is_failed():
            terminal_state = OPERATION_STATE.FAILED
        elif operation.is_timed_out():
            terminal_state = OPERATION_STATE.TIMED_OUT
        else:
            overall_operation.add_host(name, OPERATION_STATE.COMPLETED)
            continue

        overall_operation.add_host(name, terminal_state)
        overall_operation.update_failure_reason(operation.reason)
        break

    return overall_operation
def stop_instances(self, instance_uuids):
    """
    Stop Instances

    Checks every requested instance, builds one STOP_INSTANCES operation
    per host that owns a requested instance, starts each of those host
    operations and returns an overall operation that records the state
    reported for each host.

    The overall operation is returned as failed, before any host
    operation is started, when an instance or its host cannot be found
    or when the host already has an operation in progress.
    """
    DLOG.info("Stop instances uuids=%s." % instance_uuids)

    host_table = tables.tables_get_host_table()
    instance_table = tables.tables_get_instance_table()

    overall_operation = Operation(OPERATION_TYPE.STOP_INSTANCES)

    def _fail(reason):
        # Log the reason, mark the overall operation failed and hand it
        # back so the caller can return it directly.
        DLOG.info(reason)
        overall_operation.set_failed(reason)
        return overall_operation

    # Validation pass: one new operation per distinct host.
    new_operations = dict()
    for uuid in instance_uuids:
        instance = instance_table.get(uuid, None)
        if instance is None:
            return _fail("Instance %s does not exist." % uuid)

        host_name = instance.host_name
        if host_table.get(host_name, None) is None:
            return _fail("Host %s does not exist." % host_name)

        existing = self._host_operations.get(host_name, None)
        if existing is not None:
            if existing.is_inprogress():
                return _fail(
                    "Another host operation %s is already inprogress "
                    "for host %s." % (existing.operation_type, host_name))
            # The previous operation is no longer in progress; discard it
            # so the new one can take its place.
            del self._host_operations[host_name]

        if new_operations.get(host_name, None) is None:
            new_operations[host_name] = Operation(
                OPERATION_TYPE.STOP_INSTANCES)

    # Execution pass: register and start each host operation in turn.
    for host_name, operation in new_operations.items():
        self._host_operations[host_name] = operation
        self._host_stop_instances(host_table[host_name], operation,
                                  instance_uuids)

        if operation.is_inprogress():
            overall_operation.add_host(host_name,
                                       OPERATION_STATE.INPROGRESS)
            continue

        if operation.is_failed():
            # Stop at the first failure; hosts not yet visited are
            # neither started nor reported.
            overall_operation.add_host(host_name, OPERATION_STATE.FAILED)
            overall_operation.update_failure_reason(operation.reason)
            break

        if operation.is_timed_out():
            # Same early exit for a timeout.
            overall_operation.add_host(host_name,
                                       OPERATION_STATE.TIMED_OUT)
            overall_operation.update_failure_reason(operation.reason)
            break

        overall_operation.add_host(host_name, OPERATION_STATE.COMPLETED)

    return overall_operation
def start_instances(self, instance_uuids, serial=False):
    """
    Start Instances

    Checks every requested instance, builds one start operation per host
    that owns a requested instance, starts each of those host operations
    and returns an overall operation that records the state reported for
    each host.  The operation type is START_INSTANCES_SERIAL when serial
    is true, otherwise START_INSTANCES.

    The overall operation is returned as failed, before any host
    operation is started, when an instance or its host cannot be found
    or when the host already has an operation in progress.
    """
    DLOG.info("Start instances uuids=%s." % instance_uuids)

    host_table = tables.tables_get_host_table()
    instance_table = tables.tables_get_instance_table()

    operation_type = (OPERATION_TYPE.START_INSTANCES_SERIAL if serial
                      else OPERATION_TYPE.START_INSTANCES)
    overall_operation = Operation(operation_type)

    def _fail(reason):
        # Log the reason, mark the overall operation failed and hand it
        # back so the caller can return it directly.
        DLOG.info(reason)
        overall_operation.set_failed(reason)
        return overall_operation

    # Validation pass: one new operation per distinct host.
    new_operations = dict()
    for uuid in instance_uuids:
        instance = instance_table.get(uuid, None)
        if instance is None:
            return _fail("Instance %s does not exist." % uuid)

        host_name = instance.host_name
        if host_table.get(host_name, None) is None:
            return _fail("Host %s does not exist." % host_name)

        existing = self._host_operations.get(host_name, None)
        if existing is not None:
            if existing.is_inprogress():
                return _fail(
                    "Another host operation %s is already inprogress "
                    "for host %s." % (existing.operation_type, host_name))
            # The previous operation is no longer in progress; discard it
            # so the new one can take its place.
            del self._host_operations[host_name]

        if new_operations.get(host_name, None) is None:
            new_operations[host_name] = Operation(operation_type)

    # Execution pass: register and start each host operation in turn.
    for host_name, operation in new_operations.items():
        self._host_operations[host_name] = operation
        self._host_start_instances(host_table[host_name], operation,
                                   instance_uuids)

        if operation.is_inprogress():
            overall_operation.add_host(host_name,
                                       OPERATION_STATE.INPROGRESS)
            continue

        if operation.is_failed():
            # Stop at the first failure; hosts not yet visited are
            # neither started nor reported.
            overall_operation.add_host(host_name, OPERATION_STATE.FAILED)
            overall_operation.update_failure_reason(operation.reason)
            break

        if operation.is_timed_out():
            # Same early exit for a timeout.
            overall_operation.add_host(host_name,
                                       OPERATION_STATE.TIMED_OUT)
            overall_operation.update_failure_reason(operation.reason)
            break

        overall_operation.add_host(host_name, OPERATION_STATE.COMPLETED)

    return overall_operation
def get_instance_director():
    """
    Returns the module-level Instance Director, which is None until
    instance_director_initialize has assigned it.
    """
    director = _instance_director
    return director
def instance_director_initialize():
    """
    Initialize Instance Director

    Creates the module-level InstanceDirector from the
    'instance-configuration' section of the configuration.  Each option
    that is not present falls back to its built-in default; when the
    whole section is absent every option takes its default.
    """
    global _instance_director

    if config.section_exists('instance-configuration'):
        settings = config.CONF['instance-configuration']
    else:
        # No section configured: every lookup below yields its default.
        settings = dict()

    def _int_option(name, default):
        # Read an option and coerce it to int (configured values may be
        # strings; the defaults are already ints).
        return int(settings.get(name, default))

    # Evaluated in the order of the InstanceDirector constructor
    # parameters, which the option names mirror.
    director_args = [
        _int_option('max_concurrent_recovering_instances', 4),
        _int_option('max_concurrent_migrates_per_host', 1),
        _int_option('max_concurrent_evacuates_per_host', 1),
        _int_option('recovery_audit_interval', 330),
        _int_option('recovery_audit_cooldown', 30),
        _int_option('recovery_audit_batch_interval', 2),
        _int_option('recovery_cooldown', 30),
        _int_option('rebuild_timeout', 900),
        _int_option('reboot_timeout', 300),
        _int_option('migrate_timeout', 960),
        # Boolean option: only the text 'true' (any case) enables it.
        settings.get('single_hypervisor', 'false').lower() == 'true',
        _int_option('recovery_threshold', 250),
        _int_option('max_throttled_recovering_instances', 2),
    ]

    _instance_director = InstanceDirector(*director_args)
def instance_director_finalize():
    """
    Finalize Instance Director

    Currently a no-op: nothing is torn down here.
    """
    return None