Apply NoExecute taint to locked nodes

This change applies the NoExecute taint to AIO-DX and worker nodes
when they are locked. This causes pods to be evicted from the node
and prevents new pods from being scheduled on that node. When the
node is unlocked and has rebooted, the taint is removed.

Change-Id: I2a7c8843a68661e15224260c53fd171920473161
Story: 2002843
Task: 29359
Signed-off-by: Bart Wensley <barton.wensley@windriver.com>
Author: Bart Wensley, 2019-02-05 14:09:24 -06:00
Parent: a6732cbe47
Commit: 3e9d13ec85
4 changed files with 81 additions and 70 deletions


@@ -884,34 +884,21 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
             future.set_timeouts(config.CONF.get('nfvi-timeouts', None))
 
             if self._host_supports_kubernetes(host_personality):
-                if True:
-                    # For now, we do not want to apply the NoExecute taint.
-                    # When the VIM detects that a service is failed on a host,
-                    # it goes through a disable/enable cycle. This would cause
-                    # the NoExecute taint to be applied/removed which causes
-                    # most pods to be stopped/started. If the pods don't come
-                    # back quickly enough the VIM will attempt another
-                    # disable/enable, which can go on forever. For now,
-                    # we will just avoid tainting hosts.
-                    # TODO(bwensley): Rework when support for pure k8s hosts is
-                    # added.
-                    pass
-                else:
-                    response['reason'] = 'failed to disable kubernetes services'
-
-                    # To disable kubernetes we add the NoExecute taint to the
-                    # node. This removes pods that can be scheduled elsewhere
-                    # and prevents new pods from scheduling on the node.
-                    future.work(kubernetes_client.taint_node,
-                                host_name, "NoExecute", "services", "disabled")
-
-                    future.result = (yield)
-
-                    if not future.result.is_complete():
-                        DLOG.error("Kubernetes taint_node failed, operation "
-                                   "did not complete, host_uuid=%s, host_name=%s."
-                                   % (host_uuid, host_name))
-                        return
+                response['reason'] = 'failed to disable kubernetes services'
+
+                # To disable kubernetes we add the NoExecute taint to the
+                # node. This removes pods that can be scheduled elsewhere
+                # and prevents new pods from scheduling on the node.
+                future.work(kubernetes_client.taint_node,
+                            host_name, "NoExecute", "services", "disabled")
+
+                future.result = (yield)
+
+                if not future.result.is_complete():
+                    DLOG.error("Kubernetes taint_node failed, operation "
+                               "did not complete, host_uuid=%s, host_name=%s."
+                               % (host_uuid, host_name))
+                    return
 
             response['completed'] = True
             response['reason'] = ''
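
Note (illustrative, not part of this change): kubernetes_client.taint_node above is the plugin's own wrapper and its implementation is not shown in this diff. The sketch below is roughly what applying and clearing a NoExecute taint looks like with the standard kubernetes Python client; the function names, the (node_name, effect, key, value) signature inferred from the call above, and the kubeconfig loading are assumptions, not project code.

from kubernetes import client, config


def taint_node(node_name, effect, key, value):
    # Hypothetical stand-in for the plugin's kubernetes_client.taint_node.
    config.load_kube_config()
    core_v1 = client.CoreV1Api()
    node = core_v1.read_node(node_name)
    taints = node.spec.taints or []
    # Only add the taint if an equivalent one is not already present.
    if not any(t.key == key and t.effect == effect for t in taints):
        taints.append(client.V1Taint(key=key, value=value, effect=effect))
    core_v1.patch_node(node_name, {"spec": {"taints": taints}})


def untaint_node(node_name, effect, key):
    # Hypothetical inverse, e.g. run once the node is unlocked and has
    # rebooted, as described in the commit message.
    config.load_kube_config()
    core_v1 = client.CoreV1Api()
    node = core_v1.read_node(node_name)
    # Keep every taint except the one being removed.
    remaining = [t for t in (node.spec.taints or [])
                 if not (t.key == key and t.effect == effect)]
    core_v1.patch_node(node_name, {"spec": {"taints": remaining}})

The call in the hunk, taint_node(host_name, "NoExecute", "services", "disabled"), would then correspond to the kubectl form "kubectl taint nodes <host> services=disabled:NoExecute".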


@@ -669,49 +669,45 @@ def query_network_agents(token, host_name, check_fully_up):
     Input parameter check_fully_up set to True will check for
     both alive and admin_state_up, otherwise only alive is checked.
     """
-    try:
-        url, api_cmd, api_cmd_headers, result_data = get_network_agents(
-            token, host_name)
-
-        agent_state = 'up'
-        supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
-        for supported_agent in supported_agents:
-            found = False
-            for agent in result_data:
-                agent_type = agent.get('agent_type', '')
-                host = agent.get('host', '')
-                if (agent_type == supported_agent) and (host == host_name):
-                    DLOG.verbose("found agent %s for host %s" %
-                                 (supported_agent, host_name))
-                    alive = agent.get('alive', False)
-                    admin_state_up = agent.get('admin_state_up', False)
-                    # found the agent of interest.
-                    found = True
-                    break
-
-            if found:
-                if check_fully_up:
-                    if not (alive and admin_state_up):
-                        DLOG.verbose("host %s agent %s not fully up. alive: %s,"
-                                     " admin_state_up: %s" %
-                                     (host_name, supported_agent,
-                                      alive, admin_state_up))
-                        agent_state = 'down'
-                        break
-                else:
-                    if not alive:
-                        DLOG.verbose("host %s agent %s not alive" %
-                                     (host_name, supported_agent))
-                        agent_state = 'down'
-                        break
-            else:
-                DLOG.error("host %s agent %s not present" %
-                           (host_name, supported_agent))
-                agent_state = 'down'
-
-    except Exception as e:
-        DLOG.exception("Caught exception trying to query host %s "
-                       "agent states: %s" % (host_name, e))
-        agent_state = 'down'
+    url, api_cmd, api_cmd_headers, result_data = get_network_agents(
+        token, host_name)
+
+    agent_state = 'up'
+    alive = False
+    admin_state_up = False
+    supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
+    for supported_agent in supported_agents:
+        found = False
+        for agent in result_data:
+            agent_type = agent.get('agent_type', '')
+            host = agent.get('host', '')
+            if (agent_type == supported_agent) and (host == host_name):
+                DLOG.verbose("found agent %s for host %s" %
+                             (supported_agent, host_name))
+                alive = agent.get('alive', False)
+                admin_state_up = agent.get('admin_state_up', False)
+                # found the agent of interest.
+                found = True
+                break
+
+        if found:
+            if check_fully_up:
+                if not (alive and admin_state_up):
+                    DLOG.verbose("host %s agent %s not fully up. alive: %s,"
+                                 " admin_state_up: %s" %
+                                 (host_name, supported_agent,
+                                  alive, admin_state_up))
+                    agent_state = 'down'
+                    break
+            else:
+                if not alive:
+                    DLOG.verbose("host %s agent %s not alive" %
+                                 (host_name, supported_agent))
+                    agent_state = 'down'
+                    break
+        else:
+            DLOG.error("host %s agent %s not present" %
+                       (host_name, supported_agent))
+            agent_state = 'down'
+            break
     return agent_state
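
Note (illustrative, not part of this change): the records iterated above come from Neutron's agents API. An invented example of the fields the loop reads, assuming AGENT_TYPE.L3 and AGENT_TYPE.DHCP map to the usual Neutron agent-type strings:

# Invented sample data shaped like a Neutron agents listing; only the
# fields query_network_agents() actually reads are shown.
result_data = [
    {"agent_type": "L3 agent", "host": "compute-0",
     "alive": True, "admin_state_up": False},
    {"agent_type": "DHCP agent", "host": "compute-0",
     "alive": True, "admin_state_up": True},
]

# With check_fully_up=True the L3 agent above forces agent_state to 'down'
# (alive, but administratively down); with check_fully_up=False only
# 'alive' is consulted, so the result stays 'up'.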


@@ -38,6 +38,13 @@ class SwMgmtDirector(object):
         """
         return self._sw_update
 
+    @property
+    def single_controller(self):
+        """
+        Returns whether this is a single controller configuration
+        """
+        return self._single_controller
+
     def create_sw_patch_strategy(self, controller_apply_type, storage_apply_type,
                                  swift_apply_type, worker_apply_type,
                                  max_parallel_worker_hosts,


@@ -234,9 +234,6 @@ class DisableHostTask(state_machine.StateTask):
         if host.host_service_configured(objects.HOST_SERVICES.GUEST):
             task_work_list.append(DisableHostServicesTaskWork(
                 self, host, objects.HOST_SERVICES.GUEST))
-        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
-            task_work_list.append(DisableHostServicesTaskWork(
-                self, host, objects.HOST_SERVICES.CONTAINER))
         if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
             task_work_list.append(QueryHypervisorTaskWork(
                 self, host, force_pass=True))
@@ -248,6 +245,17 @@ class DisableHostTask(state_machine.StateTask):
             task_work_list.append(NotifyHostDisabledTaskWork(
                 self, host, objects.HOST_SERVICES.NETWORK))
         task_work_list.append(NotifyInstancesHostDisabledTaskWork(self, host))
+        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
+            # Only disable the container services if the host is being locked
+            # and we are not running in a single controller configuration. In
+            # a single controller configuration we keep the container services
+            # running.
+            if self._host.is_locking():
+                from nfv_vim import directors
+                sw_mgmt_director = directors.get_sw_mgmt_director()
+                if not sw_mgmt_director.single_controller:
+                    task_work_list.append(DisableHostServicesTaskWork(
+                        self, host, objects.HOST_SERVICES.CONTAINER))
         task_work_list.append(notify_host_services_task(
             self, host, force_pass=True))
         if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
@@ -443,8 +451,21 @@ class NotifyDisabledHostTask(state_machine.StateTask):
     Notify Disabled Host Task
     """
     def __init__(self, host):
+        from nfv_vim import objects
+
         self._host_reference = weakref.ref(host)
         task_work_list = list()
+        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
+            # Only disable the container services if the host is being locked
+            # and we are not running in a single controller configuration. In
+            # a single controller configuration we keep the container services
+            # running.
+            if self._host.is_locking():
+                from nfv_vim import directors
+                sw_mgmt_director = directors.get_sw_mgmt_director()
+                if not sw_mgmt_director.single_controller:
+                    task_work_list.append(DisableHostServicesTaskWork(
+                        self, host, objects.HOST_SERVICES.CONTAINER))
         task_work_list.append(NotifyHostServicesDisabledTaskWork(
             self, host, force_pass=True))
         super(NotifyDisabledHostTask, self).__init__(
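
Note (illustrative, not part of this change): both constructors above now apply the same guard before queueing DisableHostServicesTaskWork for the CONTAINER service. A minimal standalone sketch of that shared decision, using only the interfaces visible in the hunks (the helper name is hypothetical):

from nfv_vim import directors, objects


def should_disable_container_services(host):
    # Container services must be configured on the host ...
    if not host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
        return False
    # ... the host must be in the process of being locked ...
    if not host.is_locking():
        return False
    # ... and this must not be a single controller configuration, where the
    # container services are kept running.
    sw_mgmt_director = directors.get_sw_mgmt_director()
    return not sw_mgmt_director.single_controller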