Apply NoExecute taint to locked nodes

This change applies the NoExecute taint to AIO-DX and worker nodes when
they are locked. This causes pods to be evicted from the node and
prevents new pods from being scheduled on that node. When the node is
unlocked and has rebooted, the taint is removed.

Change-Id: I2a7c8843a68661e15224260c53fd171920473161
Story: 2002843
Task: 29359
Signed-off-by: Bart Wensley <barton.wensley@windriver.com>
commit 3e9d13ec85
parent a6732cbe47
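A minimal sketch of how a NoExecute taint can be applied to and removed from a node with the official `kubernetes` Python client. The plugin's own kubernetes_client.taint_node helper (used in the diff below) wraps equivalent logic inside the VIM; the function names and the kubeconfig loading here are illustrative assumptions, not the plugin's API.

# Sketch only: applying/removing a NoExecute taint with the kubernetes client.
from kubernetes import client, config


def taint_node(node_name, effect="NoExecute", key="services", value="disabled"):
    """Add the taint to the node if it is not already present."""
    config.load_kube_config()  # in-cluster code would use load_incluster_config()
    core_v1 = client.CoreV1Api()

    node = core_v1.read_node(node_name)
    taints = node.spec.taints or []
    if any(t.key == key and t.effect == effect for t in taints):
        return  # node is already tainted; nothing to do

    taints.append(client.V1Taint(key=key, value=value, effect=effect))
    # Patch the node spec with the updated taint list. Pods that do not
    # tolerate NoExecute are evicted and new pods are not scheduled.
    core_v1.patch_node(node_name, {"spec": {"taints": taints}})


def untaint_node(node_name, effect="NoExecute", key="services"):
    """Remove the taint again, e.g. once the host is unlocked and rebooted."""
    config.load_kube_config()
    core_v1 = client.CoreV1Api()

    node = core_v1.read_node(node_name)
    taints = node.spec.taints or []
    remaining = [t for t in taints
                 if not (t.key == key and t.effect == effect)]
    core_v1.patch_node(node_name, {"spec": {"taints": remaining}})

NoExecute both evicts pods already running on the node and prevents new ones from being scheduled, which matches the behaviour described in the commit message; NoSchedule alone would only block new pods.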
@@ -884,34 +884,21 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
             future.set_timeouts(config.CONF.get('nfvi-timeouts', None))
 
             if self._host_supports_kubernetes(host_personality):
-                if True:
-                    # For now, we do not want to apply the NoExecute taint.
-                    # When the VIM detects that a service is failed on a host,
-                    # it goes through a disable/enable cycle. This would cause
-                    # the NoExecute taint to be applied/removed which causes
-                    # most pods to be stopped/started. If the pods don't come
-                    # back quickly enough the VIM will attempt another
-                    # disable/enable, which can go on forever. For now,
-                    # we will just avoid tainting hosts.
-                    # TODO(bwensley): Rework when support for pure k8s hosts is
-                    # added.
-                    pass
-                else:
-                    response['reason'] = 'failed to disable kubernetes services'
+                response['reason'] = 'failed to disable kubernetes services'
 
-                    # To disable kubernetes we add the NoExecute taint to the
-                    # node. This removes pods that can be scheduled elsewhere
-                    # and prevents new pods from scheduling on the node.
-                    future.work(kubernetes_client.taint_node,
-                                host_name, "NoExecute", "services", "disabled")
+                # To disable kubernetes we add the NoExecute taint to the
+                # node. This removes pods that can be scheduled elsewhere
+                # and prevents new pods from scheduling on the node.
+                future.work(kubernetes_client.taint_node,
+                            host_name, "NoExecute", "services", "disabled")
 
-                    future.result = (yield)
+                future.result = (yield)
 
-                    if not future.result.is_complete():
-                        DLOG.error("Kubernetes taint_node failed, operation "
-                                   "did not complete, host_uuid=%s, host_name=%s."
-                                   % (host_uuid, host_name))
-                        return
+                if not future.result.is_complete():
+                    DLOG.error("Kubernetes taint_node failed, operation "
+                               "did not complete, host_uuid=%s, host_name=%s."
+                               % (host_uuid, host_name))
+                    return
 
             response['completed'] = True
             response['reason'] = ''

@@ -669,49 +669,45 @@ def query_network_agents(token, host_name, check_fully_up):
    Input parameter check_fully_up set to True will check for
    both alive and admin_state_up, otherwise only alive is checked.
    """
    try:
        url, api_cmd, api_cmd_headers, result_data = get_network_agents(
            token, host_name)

        agent_state = 'up'
        alive = False
        admin_state_up = False
        supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
        for supported_agent in supported_agents:
            found = False
            for agent in result_data:
                agent_type = agent.get('agent_type', '')
                host = agent.get('host', '')
                if (agent_type == supported_agent) and (host == host_name):
                    DLOG.verbose("found agent %s for host %s" %
                                 (supported_agent, host_name))
                    alive = agent.get('alive', False)
                    admin_state_up = agent.get('admin_state_up', False)
                    # found the agent of interest.
                    found = True
                    break
            if found:
                if check_fully_up:
                    if not (alive and admin_state_up):
                        DLOG.verbose("host %s agent %s not fully up. alive: %s,"
                                     " admin_state_up: %s" %
                                     (host_name, supported_agent,
                                      alive, admin_state_up))
                        agent_state = 'down'
                        break
                else:
                    if not alive:
                        DLOG.verbose("host %s agent %s not alive" %
                                     (host_name, supported_agent))
                        agent_state = 'down'
                        break
            else:
                DLOG.error("host %s agent %s not present" %
                           (host_name, supported_agent))
                agent_state = 'down'
                break

    except Exception as e:
        DLOG.exception("Caught exception trying to query host %s "
                       "agent states: %s" % (host_name, e))
        agent_state = 'down'

    return agent_state

@@ -38,6 +38,13 @@ class SwMgmtDirector(object):
         """
         return self._sw_update
 
+    @property
+    def single_controller(self):
+        """
+        Returns whether this is a single controller configuration
+        """
+        return self._single_controller
+
     def create_sw_patch_strategy(self, controller_apply_type, storage_apply_type,
                                  swift_apply_type, worker_apply_type,
                                  max_parallel_worker_hosts,

@@ -234,9 +234,6 @@ class DisableHostTask(state_machine.StateTask):
         if host.host_service_configured(objects.HOST_SERVICES.GUEST):
             task_work_list.append(DisableHostServicesTaskWork(
                 self, host, objects.HOST_SERVICES.GUEST))
-        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
-            task_work_list.append(DisableHostServicesTaskWork(
-                self, host, objects.HOST_SERVICES.CONTAINER))
         if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
             task_work_list.append(QueryHypervisorTaskWork(
                 self, host, force_pass=True))

@@ -248,6 +245,17 @@ class DisableHostTask(state_machine.StateTask):
             task_work_list.append(NotifyHostDisabledTaskWork(
                 self, host, objects.HOST_SERVICES.NETWORK))
         task_work_list.append(NotifyInstancesHostDisabledTaskWork(self, host))
+        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
+            # Only disable the container services if the host is being locked
+            # and we are not running in a single controller configuration. In
+            # a single controller configuration we keep the container services
+            # running.
+            if self._host.is_locking():
+                from nfv_vim import directors
+                sw_mgmt_director = directors.get_sw_mgmt_director()
+                if not sw_mgmt_director.single_controller:
+                    task_work_list.append(DisableHostServicesTaskWork(
+                        self, host, objects.HOST_SERVICES.CONTAINER))
         task_work_list.append(notify_host_services_task(
             self, host, force_pass=True))
         if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):

@@ -443,8 +451,21 @@ class NotifyDisabledHostTask(state_machine.StateTask):
     Notify Disabled Host Task
     """
     def __init__(self, host):
         from nfv_vim import objects
 
         self._host_reference = weakref.ref(host)
         task_work_list = list()
+        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
+            # Only disable the container services if the host is being locked
+            # and we are not running in a single controller configuration. In
+            # a single controller configuration we keep the container services
+            # running.
+            if self._host.is_locking():
+                from nfv_vim import directors
+                sw_mgmt_director = directors.get_sw_mgmt_director()
+                if not sw_mgmt_director.single_controller:
+                    task_work_list.append(DisableHostServicesTaskWork(
+                        self, host, objects.HOST_SERVICES.CONTAINER))
+        task_work_list.append(NotifyHostServicesDisabledTaskWork(
+            self, host, force_pass=True))
         super(NotifyDisabledHostTask, self).__init__(