From 3e9d13ec85424a234f815d5db7e5357a68a4c5f9 Mon Sep 17 00:00:00 2001
From: Bart Wensley
Date: Tue, 5 Feb 2019 14:09:24 -0600
Subject: [PATCH] Apply NoExecute taint to locked nodes

This change applies the NoExecute taint to AIO-DX and worker nodes
when they are locked. This causes pods to be evicted from the node
and prevents new pods from being scheduled on that node. When the
node is unlocked and has rebooted, the taint is removed.

Change-Id: I2a7c8843a68661e15224260c53fd171920473161
story: 2002843
task: 29359
Signed-off-by: Bart Wensley
---
 .../nfvi_plugins/nfvi_infrastructure_api.py  | 37 +++------
 .../nfvi_plugins/openstack/neutron.py        | 80 +++++++++----------
 .../nfv_vim/directors/_sw_mgmt_director.py   |  7 ++
 nfv/nfv-vim/nfv_vim/host_fsm/_host_tasks.py  | 27 ++++++-
 4 files changed, 81 insertions(+), 70 deletions(-)

diff --git a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py
index 7b627d2d..7b834e9c 100755
--- a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py
+++ b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py
@@ -884,34 +884,21 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
             future.set_timeouts(config.CONF.get('nfvi-timeouts', None))
 
             if self._host_supports_kubernetes(host_personality):
-                if True:
-                    # For now, we do not want to apply the NoExecute taint.
-                    # When the VIM detects that a service is failed on a host,
-                    # it goes through a disable/enable cycle. This would cause
-                    # the NoExecute taint to be applied/removed which causes
-                    # most pods to be stopped/started. If the pods don't come
-                    # back quickly enough the VIM will attempt another
-                    # disable/enable, which can go on forever. For now,
-                    # we will just avoid tainting hosts.
-                    # TODO(bwensley): Rework when support for pure k8s hosts is
-                    # added.
-                    pass
-                else:
-                    response['reason'] = 'failed to disable kubernetes services'
+                response['reason'] = 'failed to disable kubernetes services'
 
-                    # To disable kubernetes we add the NoExecute taint to the
-                    # node. This removes pods that can be scheduled elsewhere
-                    # and prevents new pods from scheduling on the node.
-                    future.work(kubernetes_client.taint_node,
-                                host_name, "NoExecute", "services", "disabled")
+                # To disable kubernetes we add the NoExecute taint to the
+                # node. This removes pods that can be scheduled elsewhere
+                # and prevents new pods from scheduling on the node.
+                future.work(kubernetes_client.taint_node,
+                            host_name, "NoExecute", "services", "disabled")
 
-                    future.result = (yield)
+                future.result = (yield)
 
-                    if not future.result.is_complete():
-                        DLOG.error("Kubernetes taint_node failed, operation "
-                                   "did not complete, host_uuid=%s, host_name=%s."
-                                   % (host_uuid, host_name))
-                        return
+                if not future.result.is_complete():
+                    DLOG.error("Kubernetes taint_node failed, operation "
+                               "did not complete, host_uuid=%s, host_name=%s."
+                               % (host_uuid, host_name))
+                    return
 
             response['completed'] = True
             response['reason'] = ''
diff --git a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/openstack/neutron.py b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/openstack/neutron.py
index 5efeba68..61d3b7de 100755
--- a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/openstack/neutron.py
+++ b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/openstack/neutron.py
@@ -669,49 +669,45 @@ def query_network_agents(token, host_name, check_fully_up):
     Input parameter check_fully_up set to True will check for
     both alive and admin_state_up, otherwise only alive is checked.
     """
-    try:
-        url, api_cmd, api_cmd_headers, result_data = get_network_agents(
-            token, host_name)
+    url, api_cmd, api_cmd_headers, result_data = get_network_agents(
+        token, host_name)
 
-        agent_state = 'up'
-        supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
-        for supported_agent in supported_agents:
-            found = False
-            for agent in result_data:
-                agent_type = agent.get('agent_type', '')
-                host = agent.get('host', '')
-                if (agent_type == supported_agent) and (host == host_name):
-                    DLOG.verbose("found agent %s for host %s" %
-                                 (supported_agent, host_name))
-                    alive = agent.get('alive', False)
-                    admin_state_up = agent.get('admin_state_up', False)
-                    # found the agent of interest.
-                    found = True
-                    break
-            if found:
-                if check_fully_up:
-                    if not (alive and admin_state_up):
-                        DLOG.verbose("host %s agent %s not fully up. alive: %s,"
-                                     " admin_state_up: %s" %
-                                     (host_name, supported_agent,
-                                      alive, admin_state_up))
-                        agent_state = 'down'
-                        break
-                else:
-                    if not alive:
-                        DLOG.verbose("host %s agent %s not alive" %
-                                     (host_name, supported_agent))
-                        agent_state = 'down'
-                        break
-            else:
-                DLOG.error("host %s agent %s not present" %
-                           (host_name, supported_agent))
-                agent_state = 'down'
+    agent_state = 'up'
+    alive = False
+    admin_state_up = False
+    supported_agents = [AGENT_TYPE.L3, AGENT_TYPE.DHCP]
+    for supported_agent in supported_agents:
+        found = False
+        for agent in result_data:
+            agent_type = agent.get('agent_type', '')
+            host = agent.get('host', '')
+            if (agent_type == supported_agent) and (host == host_name):
+                DLOG.verbose("found agent %s for host %s" %
+                             (supported_agent, host_name))
+                alive = agent.get('alive', False)
+                admin_state_up = agent.get('admin_state_up', False)
+                # found the agent of interest.
+                found = True
                 break
-
-    except Exception as e:
-        DLOG.exception("Caught exception trying to query host %s "
-                       "agent states: %s" % (host_name, e))
-        agent_state = 'down'
+        if found:
+            if check_fully_up:
+                if not (alive and admin_state_up):
+                    DLOG.verbose("host %s agent %s not fully up. alive: %s,"
+                                 " admin_state_up: %s" %
+                                 (host_name, supported_agent,
+                                  alive, admin_state_up))
+                    agent_state = 'down'
+                    break
+            else:
+                if not alive:
+                    DLOG.verbose("host %s agent %s not alive" %
+                                 (host_name, supported_agent))
+                    agent_state = 'down'
+                    break
+        else:
+            DLOG.error("host %s agent %s not present" %
+                       (host_name, supported_agent))
+            agent_state = 'down'
+            break
 
     return agent_state
diff --git a/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py b/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py
index 1e822ac2..f14f5444 100755
--- a/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py
+++ b/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py
@@ -38,6 +38,13 @@ class SwMgmtDirector(object):
         """
         return self._sw_update
 
+    @property
+    def single_controller(self):
+        """
+        Returns whether this is a single controller configuration
+        """
+        return self._single_controller
+
     def create_sw_patch_strategy(self, controller_apply_type, storage_apply_type,
                                  swift_apply_type, worker_apply_type,
                                  max_parallel_worker_hosts,
diff --git a/nfv/nfv-vim/nfv_vim/host_fsm/_host_tasks.py b/nfv/nfv-vim/nfv_vim/host_fsm/_host_tasks.py
index 159bd802..82e1fa79 100755
--- a/nfv/nfv-vim/nfv_vim/host_fsm/_host_tasks.py
+++ b/nfv/nfv-vim/nfv_vim/host_fsm/_host_tasks.py
@@ -234,9 +234,6 @@ class DisableHostTask(state_machine.StateTask):
         if host.host_service_configured(objects.HOST_SERVICES.GUEST):
             task_work_list.append(DisableHostServicesTaskWork(
                 self, host, objects.HOST_SERVICES.GUEST))
-        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
-            task_work_list.append(DisableHostServicesTaskWork(
-                self, host, objects.HOST_SERVICES.CONTAINER))
         if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
             task_work_list.append(QueryHypervisorTaskWork(
                 self, host, force_pass=True))
@@ -248,6 +245,17 @@ class DisableHostTask(state_machine.StateTask):
         task_work_list.append(NotifyHostDisabledTaskWork(
             self, host, objects.HOST_SERVICES.NETWORK))
         task_work_list.append(NotifyInstancesHostDisabledTaskWork(self, host))
+        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
+            # Only disable the container services if the host is being locked
+            # and we are not running in a single controller configuration. In
+            # a single controller configuration we keep the container services
+            # running.
+            if self._host.is_locking():
+                from nfv_vim import directors
+                sw_mgmt_director = directors.get_sw_mgmt_director()
+                if not sw_mgmt_director.single_controller:
+                    task_work_list.append(DisableHostServicesTaskWork(
+                        self, host, objects.HOST_SERVICES.CONTAINER))
         task_work_list.append(notify_host_services_task(
             self, host, force_pass=True))
         if host.host_service_configured(objects.HOST_SERVICES.COMPUTE):
@@ -443,8 +451,21 @@ class NotifyDisabledHostTask(state_machine.StateTask):
     Notify Disabled Host Task
     """
     def __init__(self, host):
+        from nfv_vim import objects
+
         self._host_reference = weakref.ref(host)
         task_work_list = list()
+        if host.host_service_configured(objects.HOST_SERVICES.CONTAINER):
+            # Only disable the container services if the host is being locked
+            # and we are not running in a single controller configuration. In
+            # a single controller configuration we keep the container services
+            # running.
+            if self._host.is_locking():
+                from nfv_vim import directors
+                sw_mgmt_director = directors.get_sw_mgmt_director()
+                if not sw_mgmt_director.single_controller:
+                    task_work_list.append(DisableHostServicesTaskWork(
+                        self, host, objects.HOST_SERVICES.CONTAINER))
         task_work_list.append(NotifyHostServicesDisabledTaskWork(
             self, host, force_pass=True))
         super(NotifyDisabledHostTask, self).__init__(