Merge "Apply NoExecute taint to locked nodes"

This commit is contained in:
Zuul 2019-02-05 22:12:16 +00:00 committed by Gerrit Code Review
commit 97524fee3a
4 changed files with 81 additions and 70 deletions

View File

@ -893,34 +893,21 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
future.set_timeouts(config.CONF.get('nfvi-timeouts', None)) future.set_timeouts(config.CONF.get('nfvi-timeouts', None))
if self._host_supports_kubernetes(host_personality): if self._host_supports_kubernetes(host_personality):
if True: response['reason'] = 'failed to disable kubernetes services'
# For now, we do not want to apply the NoExecute taint.
# When the VIM detects that a service is failed on a host,
# it goes through a disable/enable cycle. This would cause
# the NoExecute taint to be applied/removed which causes
# most pods to be stopped/started. If the pods don't come
# back quickly enough the VIM will attempt another
# disable/enable, which can go on forever. For now,
# we will just avoid tainting hosts.
# TODO(bwensley): Rework when support for pure k8s hosts is
# added.
pass
else:
response['reason'] = 'failed to disable kubernetes services'
# To disable kubernetes we add the NoExecute taint to the # To disable kubernetes we add the NoExecute taint to the
# node. This removes pods that can be scheduled elsewhere # node. This removes pods that can be scheduled elsewhere
# and prevents new pods from scheduling on the node. # and prevents new pods from scheduling on the node.
future.work(kubernetes_client.taint_node, future.work(kubernetes_client.taint_node,
host_name, "NoExecute", "services", "disabled") host_name, "NoExecute", "services", "disabled")
future.result = (yield) future.result = (yield)
if not future.result.is_complete(): if not future.result.is_complete():
DLOG.error("Kubernetes taint_node failed, operation " DLOG.error("Kubernetes taint_node failed, operation "
"did not complete, host_uuid=%s, host_name=%s." "did not complete, host_uuid=%s, host_name=%s."
% (host_uuid, host_name)) % (host_uuid, host_name))
return return
response['completed'] = True response['completed'] = True
response['reason'] = '' response['reason'] = ''

View File

@ -669,49 +669,45 @@ def query_network_agents(token, host_name, check_fully_up):
Input parameter check_fully_up set to True will check for Input parameter check_fully_up set to True will check for
both alive and admin_state_up, otherwise only alive is checked. both alive and admin_state_up, otherwise only alive is checked.
""" """
try: url, api_cmd, api_cmd_headers, result_data = get_network_agents(
url, api_cmd, api_cmd_headers, result_data = get_network_agents( token, host_name)
token, host_name)
agent_state = 'up' agent_state = 'up'
supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP] alive = False
for supported_agent in supported_agents: admin_state_up = False
found = False supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
for agent in result_data: for supported_agent in supported_agents:
agent_type = agent.get('agent_type', '') found = False
host = agent.get('host', '') for agent in result_data:
if (agent_type == supported_agent) and (host == host_name): agent_type = agent.get('agent_type', '')
DLOG.verbose("found agent %s for host %s" % host = agent.get('host', '')
(supported_agent, host_name)) if (agent_type == supported_agent) and (host == host_name):
alive = agent.get('alive', False) DLOG.verbose("found agent %s for host %s" %
admin_state_up = agent.get('admin_state_up', False) (supported_agent, host_name))
# found the agent of interest. alive = agent.get('alive', False)
found = True admin_state_up = agent.get('admin_state_up', False)
break # found the agent of interest.
if found: found = True
if check_fully_up:
if not (alive and admin_state_up):
DLOG.verbose("host %s agent %s not fully up. alive: %s,"
" admin_state_up: %s" %
(host_name, supported_agent,
alive, admin_state_up))
agent_state = 'down'
break
else:
if not alive:
DLOG.verbose("host %s agent %s not alive" %
(host_name, supported_agent))
agent_state = 'down'
break
else:
DLOG.error("host %s agent %s not present" %
(host_name, supported_agent))
agent_state = 'down'
break break
if found:
except Exception as e: if check_fully_up:
DLOG.exception("Caught exception trying to query host %s " if not (alive and admin_state_up):
"agent states: %s" % (host_name, e)) DLOG.verbose("host %s agent %s not fully up. alive: %s,"
agent_state = 'down' " admin_state_up: %s" %
(host_name, supported_agent,
alive, admin_state_up))
agent_state = 'down'
break
else:
if not alive:
DLOG.verbose("host %s agent %s not alive" %
(host_name, supported_agent))
agent_state = 'down'
break
else:
DLOG.error("host %s agent %s not present" %
(host_name, supported_agent))
agent_state = 'down'
break
return agent_state return agent_state

View File

@ -38,6 +38,13 @@ class SwMgmtDirector(object):
""" """
return self._sw_update return self._sw_update
@property
def single_controller(self):
"""
Returns whether this is a single controller configuration, as held
in the director's _single_controller attribute (read-only accessor)
"""
return self._single_controller
def create_sw_patch_strategy(self, controller_apply_type, storage_apply_type, def create_sw_patch_strategy(self, controller_apply_type, storage_apply_type,
swift_apply_type, worker_apply_type, swift_apply_type, worker_apply_type,
max_parallel_worker_hosts, max_parallel_worker_hosts,

View File

@ -234,9 +234,6 @@ class DisableHostTask(state_machine.StateTask):
if host.host_service_configured(objects.HOST_SERVICES.GUEST): if host.host_service_configured(objects.HOST_SERVICES.GUEST):
task_work_list.append(DisableHostServicesTaskWork( task_work_list.append(DisableHostServicesTaskWork(
self, host, objects.HOST_SERVICES.GUEST)) self, host, objects.HOST_SERVICES.GUEST))
if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
task_work_list.append(DisableHostServicesTaskWork(
self, host, objects.HOST_SERVICES.CONTAINER))
if host.host_service_configured(objects.HOST_SERVICES.COMPUTE): if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
task_work_list.append(QueryHypervisorTaskWork( task_work_list.append(QueryHypervisorTaskWork(
self, host, force_pass=True)) self, host, force_pass=True))
@ -248,6 +245,17 @@ class DisableHostTask(state_machine.StateTask):
task_work_list.append(NotifyHostDisabledTaskWork( task_work_list.append(NotifyHostDisabledTaskWork(
self, host, objects.HOST_SERVICES.NETWORK)) self, host, objects.HOST_SERVICES.NETWORK))
task_work_list.append(NotifyInstancesHostDisabledTaskWork(self, host)) task_work_list.append(NotifyInstancesHostDisabledTaskWork(self, host))
if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
# Only disable the container services if the host is being locked
# and we are not running in a single controller configuration. In
# a single controller configuration we keep the container services
# running.
if self._host.is_locking():
from nfv_vim import directors
sw_mgmt_director = directors.get_sw_mgmt_director()
if not sw_mgmt_director.single_controller:
task_work_list.append(DisableHostServicesTaskWork(
self, host, objects.HOST_SERVICES.CONTAINER))
task_work_list.append(notify_host_services_task( task_work_list.append(notify_host_services_task(
self, host, force_pass=True)) self, host, force_pass=True))
if host.host_service_configured(objects.HOST_SERVICES.COMPUTE): if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
@ -443,8 +451,21 @@ class NotifyDisabledHostTask(state_machine.StateTask):
Notify Disabled Host Task Notify Disabled Host Task
""" """
def __init__(self, host): def __init__(self, host):
from nfv_vim import objects
self._host_reference = weakref.ref(host) self._host_reference = weakref.ref(host)
task_work_list = list() task_work_list = list()
if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
# Only disable the container services if the host is being locked
# and we are not running in a single controller configuration. In
# a single controller configuration we keep the container services
# running.
if self._host.is_locking():
from nfv_vim import directors
sw_mgmt_director = directors.get_sw_mgmt_director()
if not sw_mgmt_director.single_controller:
task_work_list.append(DisableHostServicesTaskWork(
self, host, objects.HOST_SERVICES.CONTAINER))
task_work_list.append(NotifyHostServicesDisabledTaskWork( task_work_list.append(NotifyHostServicesDisabledTaskWork(
self, host, force_pass=True)) self, host, force_pass=True))
super(NotifyDisabledHostTask, self).__init__( super(NotifyDisabledHostTask, self).__init__(