Initial HA kubernetes deployment

Provides initial HA support for our kubernetes deployment, including:
- Tune the kubelet and kube-controller-manager configuration for faster
  detection of and response to node failures. Pods on failed nodes are
  now evicted in approximately 50s, versus the default of about 5m40s
  (the kubernetes defaults amount to a 40s node-monitor-grace-period
  plus a 5m pod-eviction-timeout).
- Configure the kube-dns and tiller-deploy pods with a node selector to
  ensure they only run on master nodes (sketched below).
- Update the VIM to apply a NoExecute taint to a kubernetes node when
  host services are disabled (e.g. due to a host lock or host failure).
  Kubernetes then evicts any dynamic pods from the node, but not static
  pods or daemonset-managed pods; see the sketch below.
- Update the VIM to remove the NoExecute taint from a kubernetes node
  when host services are enabled.
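
For orientation, the taint described above can be pictured as the
strategic-merge patch body that the new kubernetes_client module (in
this change) builds and sends via patch_node. This is an illustrative
sketch, not code from the commit; only the shape of the body and the
services=disabled:NoExecute values come from the diff below:

    # Applied by the VIM on host disable; removing the entry from the
    # node's taint list reverses it on host enable.
    taint_patch = {
        "spec": {
            "taints": [
                {"key": "services", "value": "disabled",
                 "effect": "NoExecute"}
            ]
        }
    }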
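Similarly, the node selector on the kube-dns and tiller-deploy pods
would take roughly this form in the deployments' pod templates
(illustrative only; the master-node label key is an assumption and is
not shown in this diff):

    # Hypothetical patch body restricting a deployment to master nodes.
    node_selector_patch = {
        "spec": {
            "template": {
                "spec": {
                    "nodeSelector": {"node-role.kubernetes.io/master": ""}
                }
            }
        }
    }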

Change-Id: Ib1bb4265e2b947772d2ec5972bb7271f230f6f4f
Story: 2002843
Task: 22790
Depends-On: https://review.openstack.org/#/c/597123/
Signed-off-by: David Sullivan <david.sullivan@windriver.com>
Author: Bart Wensley, 2018-07-19 08:23:02 -05:00 (committed by David Sullivan)
commit 47eefce50e (parent 8b370e8060)
5 changed files with 221 additions and 65 deletions


@@ -0,0 +1,5 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#


@@ -0,0 +1,110 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import httplib

import kubernetes
from kubernetes.client.rest import ApiException

from nfv_common import debug
from nfv_common.helpers import Result

DLOG = debug.debug_get_logger('nfv_plugins.nfvi_plugins.clients.kubernetes_client')


def get_client():
    kubernetes.config.load_kube_config('/etc/kubernetes/admin.conf')

    # Workaround: Turn off SSL/TLS verification
    c = kubernetes.client.Configuration()
    c.verify_ssl = False
    kubernetes.client.Configuration.set_default(c)

    return kubernetes.client.CoreV1Api()


def taint_node(node_name, effect, key, value):
    """
    Apply a taint to a node
    """
    # Get the client.
    kube_client = get_client()

    # Retrieve the node to access any existing taints.
    try:
        response = kube_client.read_node(node_name)
    except ApiException as e:
        if e.status == httplib.NOT_FOUND:
            # In some cases we may attempt to taint a node that exists in
            # the VIM, but not yet in kubernetes (e.g. when the node is first
            # being configured). Ignore the failure.
            DLOG.info("Not tainting node %s because it doesn't exist" %
                      node_name)
            return
        else:
            raise

    add_taint = True
    taints = response.spec.taints
    if taints is not None:
        for taint in taints:
            # Taints must be unique by key and effect
            if taint.key == key and taint.effect == effect:
                add_taint = False
                if taint.value != value:
                    msg = ("Duplicate value - key: %s effect: %s "
                           "value: %s new value %s" % (key, effect,
                                                       taint.value, value))
                    DLOG.error(msg)
                    raise Exception(msg)
                else:
                    # This taint already exists
                    break

    if add_taint:
        DLOG.info("Adding %s=%s:%s taint to node %s" % (key, value, effect,
                                                        node_name))
        # Preserve any existing taints
        if taints is not None:
            body = {"spec": {"taints": taints}}
        else:
            body = {"spec": {"taints": []}}
        # Add our new taint
        new_taint = {"key": key, "value": value, "effect": effect}
        body["spec"]["taints"].append(new_taint)
        response = kube_client.patch_node(node_name, body)

    return Result(response)


def untaint_node(node_name, effect, key):
    """
    Remove a taint from a node
    """
    # Get the client.
    kube_client = get_client()

    # Retrieve the node to access any existing taints.
    response = kube_client.read_node(node_name)

    remove_taint = False
    taints = response.spec.taints
    if taints is not None:
        for taint in taints:
            # Taints must be unique by key and effect
            if taint.key == key and taint.effect == effect:
                remove_taint = True
                break

    if remove_taint:
        DLOG.info("Removing %s:%s taint from node %s" % (key, effect,
                                                         node_name))
        # Preserve any existing taints
        updated_taints = [taint for taint in taints if taint.key != key or
                          taint.effect != effect]
        body = {"spec": {"taints": updated_taints}}
        response = kube_client.patch_node(node_name, body)

    return Result(response)
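
Hypothetical usage of the module above, mirroring how the VIM plugin in
this change calls it on host disable and enable (the host name is
illustrative):

    from clients import kubernetes_client

    # Host services disabled: taint the node so dynamic pods are evicted.
    kubernetes_client.taint_node('compute-0', 'NoExecute', 'services',
                                 'disabled')

    # Host services enabled: remove the taint so pods can run on the
    # node again.
    kubernetes_client.untaint_node('compute-0', 'NoExecute', 'services')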


@@ -5,6 +5,7 @@
 #
 import json
 import httplib
+import os
 
 from nfv_common import debug
 from nfv_common import tcp
@@ -12,6 +13,7 @@ from nfv_common import tcp
 from nfv_vim import nfvi
 
 import config
+from clients import kubernetes_client
 from openstack import rest_api
 from openstack import exceptions
 from openstack import openstack
@@ -125,6 +127,14 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
     def _host_supports_nova_compute(personality):
         return ('compute' in personality)
 
+    @staticmethod
+    def _host_supports_kubernetes(personality):
+        # TODO: This check will disappear once kubernetes is the default
+        if os.path.isfile('/etc/kubernetes/admin.conf'):
+            return ('compute' in personality or 'controller' in personality)
+        else:
+            return False
+
     def __init__(self):
         super(NFVIInfrastructureAPI, self).__init__()
         self._token = None
@@ -1035,8 +1045,8 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
     def enable_host_services(self, future, host_uuid, host_name,
                              host_personality, callback):
         """
-        Enable Host Services, notifies Nova, Neutron and Guest to enable their
-        services for the specified host
+        Enable Host Services, notifies Nova, Neutron, Guest and Kubernetes to
+        enable their services for the specified host
         """
         response = dict()
         response['completed'] = False
@@ -1121,9 +1131,9 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
                                % (host_uuid, host_name))
                     return
 
+            if self._host_supports_nova_compute(host_personality):
                 response['reason'] = 'failed to enable guest services'
-            if self._host_supports_nova_compute(host_personality):
 
                 # Send the Enable request to Guest
                 future.work(guest.host_services_enable, self._token, host_uuid,
                             host_name)
@@ -1135,6 +1145,21 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
                                "did not complete, host_uuid=%s, host_name=%s."
                                % (host_uuid, host_name))
 
+            if self._host_supports_kubernetes(host_personality):
+                response['reason'] = 'failed to enable kubernetes services'
+
+                # To enable kubernetes we remove the NoExecute taint from the
+                # node. This allows new pods to be scheduled on the node.
+                future.work(kubernetes_client.untaint_node,
+                            host_name, "NoExecute", "services")
+
+                future.result = (yield)
+                if not future.result.is_complete():
+                    DLOG.error("Kubernetes untaint_node failed, operation "
+                               "did not complete, host_uuid=%s, host_name=%s."
+                               % (host_uuid, host_name))
+                    return
+
             response['completed'] = True
             response['reason'] = ''
@@ -1150,7 +1175,7 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
 
         except Exception as e:
             DLOG.exception("Caught exception while trying to enable %s "
-                           "nova or neutron openstack services, error=%s."
+                           "host services, error=%s."
                            % (host_name, e))
 
         finally:
@@ -1160,8 +1185,8 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
     def disable_host_services(self, future, host_uuid, host_name,
                               host_personality, callback):
         """
-        Disable Host Services, notifies Nova and Guest to disable their
-        services for the specified host
+        Disable Host Services, notifies Nova, Guest and Kubernetes to disable
+        their services for the specified host (as applicable)
         """
         response = dict()
         response['completed'] = False
@@ -1170,72 +1195,85 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
         try:
             future.set_timeouts(config.CONF.get('nfvi-timeouts', None))
 
-            # Only applies to compute hosts
-            if not self._host_supports_nova_compute(host_personality):
-                response['completed'] = True
-                response['reason'] = ''
-                return
-
-            response['reason'] = 'failed to get token from keystone'
-            if self._token is None or self._token.is_expired():
-                future.work(openstack.get_token, self._directory)
-                future.result = (yield)
-                if not future.result.is_complete():
-                    DLOG.error("OpenStack get-token did not complete, "
-                               "host_uuid=%s, host_name=%s." % (host_uuid,
-                                                                host_name))
-                    return
-                self._token = future.result.data
-
-            response['reason'] = 'failed to disable nova services'
-
-            # Send the Disable request to Nova.
-            future.work(nova.disable_host_services, self._token, host_name)
-
-            try:
-                future.result = (yield)
-                if not future.result.is_complete():
-                    DLOG.error("Nova disable-host-services failed, operation "
-                               "did not complete, host_uuid=%s, host_name=%s."
-                               % (host_uuid, host_name))
-                    return
-
-                result_data = future.result.data['service']
-                if not ('disabled' == result_data['status'] and
-                        host_name == result_data['host'] and
-                        'nova-compute' == result_data['binary']):
-                    DLOG.error("Nova disable-host-services failed, operation "
-                               "did not complete, host_uuid=%s, host_name=%s."
-                               % (host_uuid, host_name))
-                    return
-            except exceptions.OpenStackRestAPIException as e:
-                if httplib.NOT_FOUND != e.http_status_code:
-                    raise
-
-            response['reason'] = 'failed to disable guest services'
-
-            # Send the Disable request to Guest.
-            future.work(guest.host_services_disable, self._token, host_uuid,
-                        host_name)
-
-            try:
-                future.result = (yield)
-                if not future.result.is_complete():
-                    # Do not return since the disable will be retried by audit
-                    DLOG.error("Guest host-services-disable failed, operation "
-                               "did not complete, host_uuid=%s, host_name=%s."
-                               % (host_uuid, host_name))
-            except exceptions.OpenStackRestAPIException as e:
-                if httplib.NOT_FOUND != e.http_status_code:
-                    raise
+            # The following only applies to compute hosts
+            if self._host_supports_nova_compute(host_personality):
+                response['reason'] = 'failed to get token from keystone'
+                if self._token is None or self._token.is_expired():
+                    future.work(openstack.get_token, self._directory)
+                    future.result = (yield)
+                    if not future.result.is_complete():
+                        DLOG.error("OpenStack get-token did not complete, "
+                                   "host_uuid=%s, host_name=%s." % (host_uuid,
+                                                                    host_name))
+                        return
+                    self._token = future.result.data
+
+                response['reason'] = 'failed to disable nova services'
+
+                # Send the Disable request to Nova.
+                future.work(nova.disable_host_services, self._token,
+                            host_name)
+
+                try:
+                    future.result = (yield)
+                    if not future.result.is_complete():
+                        DLOG.error("Nova disable-host-services failed, "
+                                   "operation did not complete, "
+                                   "host_uuid=%s, host_name=%s."
+                                   % (host_uuid, host_name))
+                        return
+
+                    result_data = future.result.data['service']
+                    if not ('disabled' == result_data['status'] and
+                            host_name == result_data['host'] and
+                            'nova-compute' == result_data['binary']):
+                        DLOG.error("Nova disable-host-services failed, "
+                                   "operation did not complete, "
+                                   "host_uuid=%s, host_name=%s."
+                                   % (host_uuid, host_name))
+                        return
+                except exceptions.OpenStackRestAPIException as e:
+                    if httplib.NOT_FOUND != e.http_status_code:
+                        raise
+
+                response['reason'] = 'failed to disable guest services'
+
+                # Send the Disable request to Guest.
+                future.work(guest.host_services_disable, self._token,
+                            host_uuid, host_name)
+
+                try:
+                    future.result = (yield)
+                    if not future.result.is_complete():
+                        # Do not return since the disable will be retried
+                        # by audit
+                        DLOG.error("Guest host-services-disable failed, "
+                                   "operation did not complete, "
+                                   "host_uuid=%s, host_name=%s."
+                                   % (host_uuid, host_name))
+                except exceptions.OpenStackRestAPIException as e:
+                    if httplib.NOT_FOUND != e.http_status_code:
+                        raise
+
+            if self._host_supports_kubernetes(host_personality):
+                response['reason'] = 'failed to disable kubernetes services'
+
+                # To disable kubernetes we add the NoExecute taint to the
+                # node. This removes pods that can be scheduled elsewhere
+                # and prevents new pods from scheduling on the node.
+                future.work(kubernetes_client.taint_node,
+                            host_name, "NoExecute", "services", "disabled")
+
+                future.result = (yield)
+                if not future.result.is_complete():
+                    DLOG.error("Kubernetes taint_node failed, operation "
+                               "did not complete, host_uuid=%s, host_name=%s."
+                               % (host_uuid, host_name))
+                    return
 
             response['completed'] = True
             response['reason'] = ''
@@ -1251,7 +1289,7 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
 
         except Exception as e:
             DLOG.exception("Caught exception while trying to disable %s "
-                           "nova or neutron openstack services, error=%s."
+                           "host services, error=%s."
                            % (host_name, e))
 
         finally:


@@ -27,6 +27,7 @@ deps = {[nfv]nfv_client_dir}
        iso8601
        keyring
        kombu
+       kubernetes
        passlib
        pecan
        pyparsing
@@ -45,7 +46,7 @@ envlist = pep8,py27-sqlite
 
 [testenv]
 recreate = True
-install_command = pip install -c{env:UPPER_CONSTRAINTS_FILE:https://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt?h=stable/pike} {opts} {packages}
+install_command = pip install -c{env:UPPER_CONSTRAINTS_FILE:https://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt} {opts} {packages}
 
 [testenv:pep8]


@@ -169,6 +169,8 @@ nfv_vim.webserver.webserver: debug.level.verbose
 nfv_plugins.alarm_handlers.fm: debug.level.info
 nfv_plugins.event_log_handlers.fm: debug.level.info
 # ----------------------------------------------------------------------------
+nfv_plugins.nfvi_plugins.clients: debug.level.info
+nfv_plugins.nfvi_plugins.clients.kubernetes_client: debug.level.info
 nfv_plugins.nfvi_plugins.openstack: debug.level.info
 nfv_plugins.nfvi_plugins.openstack.patching: debug.level.info
 nfv_plugins.nfvi_plugins.openstack.keystone: debug.level.info