Merge "Apply NoExecute taint to locked nodes"

This commit is contained in:
Zuul 2019-02-05 22:12:16 +00:00 committed by Gerrit Code Review
commit 97524fee3a
4 changed files with 81 additions and 70 deletions

View File

@@ -893,34 +893,21 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
future.set_timeouts(config.CONF.get('nfvi-timeouts', None))
if self._host_supports_kubernetes(host_personality):
if True:
# For now, we do not want to apply the NoExecute taint.
# When the VIM detects that a service is failed on a host,
# it goes through a disable/enable cycle. This would cause
# the NoExecute taint to be applied/removed which causes
# most pods to be stopped/started. If the pods don't come
# back quickly enough the VIM will attempt another
# disable/enable, which can go on forever. For now,
# we will just avoid tainting hosts.
# TODO(bwensley): Rework when support for pure k8s hosts is
# added.
pass
else:
response['reason'] = 'failed to disable kubernetes services'
response['reason'] = 'failed to disable kubernetes services'
# To disable kubernetes we add the NoExecute taint to the
# node. This removes pods that can be scheduled elsewhere
# and prevents new pods from scheduling on the node.
future.work(kubernetes_client.taint_node,
host_name, "NoExecute", "services", "disabled")
# To disable kubernetes we add the NoExecute taint to the
# node. This removes pods that can be scheduled elsewhere
# and prevents new pods from scheduling on the node.
future.work(kubernetes_client.taint_node,
host_name, "NoExecute", "services", "disabled")
future.result = (yield)
future.result = (yield)
if not future.result.is_complete():
DLOG.error("Kubernetes taint_node failed, operation "
"did not complete, host_uuid=%s, host_name=%s."
% (host_uuid, host_name))
return
if not future.result.is_complete():
DLOG.error("Kubernetes taint_node failed, operation "
"did not complete, host_uuid=%s, host_name=%s."
% (host_uuid, host_name))
return
response['completed'] = True
response['reason'] = ''

View File

@@ -669,49 +669,45 @@ def query_network_agents(token, host_name, check_fully_up):
Input parameter check_fully_up set to True will check for
both alive and admin_state_up, otherwise only alive is checked.
"""
try:
url, api_cmd, api_cmd_headers, result_data = get_network_agents(
token, host_name)
url, api_cmd, api_cmd_headers, result_data = get_network_agents(
token, host_name)
agent_state = 'up'
supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
for supported_agent in supported_agents:
found = False
for agent in result_data:
agent_type = agent.get('agent_type', '')
host = agent.get('host', '')
if (agent_type == supported_agent) and (host == host_name):
DLOG.verbose("found agent %s for host %s" %
(supported_agent, host_name))
alive = agent.get('alive', False)
admin_state_up = agent.get('admin_state_up', False)
# found the agent of interest.
found = True
break
if found:
if check_fully_up:
if not (alive and admin_state_up):
DLOG.verbose("host %s agent %s not fully up. alive: %s,"
" admin_state_up: %s" %
(host_name, supported_agent,
alive, admin_state_up))
agent_state = 'down'
break
else:
if not alive:
DLOG.verbose("host %s agent %s not alive" %
(host_name, supported_agent))
agent_state = 'down'
break
else:
DLOG.error("host %s agent %s not present" %
(host_name, supported_agent))
agent_state = 'down'
agent_state = 'up'
alive = False
admin_state_up = False
supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
for supported_agent in supported_agents:
found = False
for agent in result_data:
agent_type = agent.get('agent_type', '')
host = agent.get('host', '')
if (agent_type == supported_agent) and (host == host_name):
DLOG.verbose("found agent %s for host %s" %
(supported_agent, host_name))
alive = agent.get('alive', False)
admin_state_up = agent.get('admin_state_up', False)
# found the agent of interest.
found = True
break
except Exception as e:
DLOG.exception("Caught exception trying to query host %s "
"agent states: %s" % (host_name, e))
agent_state = 'down'
if found:
if check_fully_up:
if not (alive and admin_state_up):
DLOG.verbose("host %s agent %s not fully up. alive: %s,"
" admin_state_up: %s" %
(host_name, supported_agent,
alive, admin_state_up))
agent_state = 'down'
break
else:
if not alive:
DLOG.verbose("host %s agent %s not alive" %
(host_name, supported_agent))
agent_state = 'down'
break
else:
DLOG.error("host %s agent %s not present" %
(host_name, supported_agent))
agent_state = 'down'
break
return agent_state

View File

@@ -38,6 +38,13 @@ class SwMgmtDirector(object):
"""
return self._sw_update
@property
def single_controller(self):
    """
    Return whether this is a single controller configuration.

    Simple read-only accessor for the ``_single_controller`` flag.
    Host tasks consult this to decide whether container services may
    be disabled when a host is locked; in a single controller
    configuration the container services are kept running.
    """
    return self._single_controller
def create_sw_patch_strategy(self, controller_apply_type, storage_apply_type,
swift_apply_type, worker_apply_type,
max_parallel_worker_hosts,

View File

@@ -234,9 +234,6 @@ class DisableHostTask(state_machine.StateTask):
if host.host_service_configured(objects.HOST_SERVICES.GUEST):
task_work_list.append(DisableHostServicesTaskWork(
self, host, objects.HOST_SERVICES.GUEST))
if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
task_work_list.append(DisableHostServicesTaskWork(
self, host, objects.HOST_SERVICES.CONTAINER))
if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
task_work_list.append(QueryHypervisorTaskWork(
self, host, force_pass=True))
@@ -248,6 +245,17 @@ class DisableHostTask(state_machine.StateTask):
task_work_list.append(NotifyHostDisabledTaskWork(
self, host, objects.HOST_SERVICES.NETWORK))
task_work_list.append(NotifyInstancesHostDisabledTaskWork(self, host))
if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
# Only disable the container services if the host is being locked
# and we are not running in a single controller configuration. In
# a single controller configuration we keep the container services
# running.
if self._host.is_locking():
from nfv_vim import directors
sw_mgmt_director = directors.get_sw_mgmt_director()
if not sw_mgmt_director.single_controller:
task_work_list.append(DisableHostServicesTaskWork(
self, host, objects.HOST_SERVICES.CONTAINER))
task_work_list.append(notify_host_services_task(
self, host, force_pass=True))
if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
@@ -443,8 +451,21 @@ class NotifyDisabledHostTask(state_machine.StateTask):
Notify Disabled Host Task
"""
def __init__(self, host):
from nfv_vim import objects
self._host_reference = weakref.ref(host)
task_work_list = list()
if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
# Only disable the container services if the host is being locked
# and we are not running in a single controller configuration. In
# a single controller configuration we keep the container services
# running.
if self._host.is_locking():
from nfv_vim import directors
sw_mgmt_director = directors.get_sw_mgmt_director()
if not sw_mgmt_director.single_controller:
task_work_list.append(DisableHostServicesTaskWork(
self, host, objects.HOST_SERVICES.CONTAINER))
task_work_list.append(NotifyHostServicesDisabledTaskWork(
self, host, force_pass=True))
super(NotifyDisabledHostTask, self).__init__(