Initial HA kubernetes deployment
Provides initial HA support for our kubernetes deployment, including:
- Tune kubelet and kube-controller-manager configuration to provide faster detection/response to node failures. Pods on failed nodes will now be evicted in approximately 50s vs. the default of about 5m40s.
- Configure kube-dns and tiller-deploy pods with a node selector to ensure they only run on master nodes.
- Update VIM to apply NoExecute taint to any kubernetes node when host services are disabled (e.g. due to host lock or host failure). Kubernetes will then evict any dynamic pods from the node (but not static pods or daemonset managed pods).
- Update VIM to remove NoExecute taint from any kubernetes node when host services are enabled.

Change-Id: Ib1bb4265e2b947772d2ec5972bb7271f230f6f4f
Story: 2002843
Task: 22790
Depends-On: https://review.openstack.org/#/c/597123/
Signed-off-by: David Sullivan <david.sullivan@windriver.com>
This commit is contained in:
parent
8b370e8060
commit
47eefce50e
|
@ -0,0 +1,5 @@
|
|||
#
|
||||
# Copyright (c) 2018 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
|
@ -0,0 +1,110 @@
|
|||
#
|
||||
# Copyright (c) 2018 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
import httplib
|
||||
import kubernetes
|
||||
from kubernetes.client.rest import ApiException
|
||||
|
||||
from nfv_common import debug
|
||||
from nfv_common.helpers import Result
|
||||
|
||||
DLOG = debug.debug_get_logger('nfv_plugins.nfvi_plugins.clients.kubernetes_client')
|
||||
|
||||
|
||||
def get_client():
    """
    Build a kubernetes CoreV1Api client

    Loads the kube config from /etc/kubernetes/admin.conf, installs a default
    client configuration that has SSL/TLS verification switched off and
    returns a newly created CoreV1Api instance.
    """
    kubernetes.config.load_kube_config('/etc/kubernetes/admin.conf')

    # Workaround: Turn off SSL/TLS verification
    # NOTE(review): whether this Configuration object carries over the
    # settings loaded from the kube config above depends on the kubernetes
    # client version in use - confirm before upgrading the library.
    client_config = kubernetes.client.Configuration()
    client_config.verify_ssl = False
    kubernetes.client.Configuration.set_default(client_config)

    return kubernetes.client.CoreV1Api()
|
||||
|
||||
|
||||
def taint_node(node_name, effect, key, value):
    """
    Apply a taint to a node

    Reads the node and, when it does not already carry a taint with the
    given key and effect, patches it with its existing taints plus the new
    one.

    Returns a Result wrapping the patch_node response when the taint was
    added, or wrapping the read_node response when an identical taint was
    already present. Returns None when the node is not found in kubernetes.

    Re-raises any ApiException from read_node other than NOT_FOUND, and
    raises Exception when a taint with the same key and effect but a
    different value is already on the node.
    """
    api = get_client()

    # Fetch the node so that its current taints can be inspected.
    try:
        node = api.read_node(node_name)
    except ApiException as e:
        if e.status != httplib.NOT_FOUND:
            raise
        # In some cases we may attempt to taint a node that exists in
        # the VIM, but not yet in kubernetes (e.g. when the node is first
        # being configured). Ignore the failure.
        DLOG.info("Not tainting node %s because it doesn't exist" %
                  node_name)
        return

    current_taints = node.spec.taints

    # Taints must be unique by key and effect, so only the first taint that
    # matches on both needs to be examined.
    existing = None
    if current_taints is not None:
        existing = next((t for t in current_taints
                         if t.key == key and t.effect == effect), None)

    if existing is not None:
        if existing.value != value:
            msg = ("Duplicate value - key: %s effect: %s "
                   "value: %s new value %s" % (key, effect,
                                               existing.value, value))
            DLOG.error(msg)
            raise Exception(msg)
        # This taint already exists, nothing to patch.
        return Result(node)

    DLOG.info("Adding %s=%s:%s taint to node %s" % (key, value, effect,
                                                    node_name))

    # Preserve any existing taints and add our new one to the end.
    patched_taints = current_taints if current_taints is not None else []
    patched_taints.append({"key": key, "value": value, "effect": effect})
    response = api.patch_node(node_name,
                              {"spec": {"taints": patched_taints}})

    return Result(response)
|
||||
|
||||
|
||||
def untaint_node(node_name, effect, key):
    """
    Remove a taint from a node

    Reads the node and, when it carries a taint with the given key and
    effect, patches it with every other taint it currently has.

    Returns a Result wrapping the patch_node response when a taint was
    removed, otherwise a Result wrapping the read_node response.
    """
    api = get_client()

    # Fetch the node so that its current taints can be inspected.
    node = api.read_node(node_name)
    current_taints = node.spec.taints

    # Taints must be unique by key and effect
    has_taint = current_taints is not None and any(
        t.key == key and t.effect == effect for t in current_taints)
    if not has_taint:
        return Result(node)

    DLOG.info("Removing %s:%s taint from node %s" % (key, effect,
                                                     node_name))

    # Preserve any existing taints that are not the one being removed.
    remaining = [t for t in current_taints
                 if t.key != key or t.effect != effect]
    response = api.patch_node(node_name, {"spec": {"taints": remaining}})

    return Result(response)
|
|
@ -5,6 +5,7 @@
|
|||
#
|
||||
import json
|
||||
import httplib
|
||||
import os
|
||||
|
||||
from nfv_common import debug
|
||||
from nfv_common import tcp
|
||||
|
@ -12,6 +13,7 @@ from nfv_common import tcp
|
|||
from nfv_vim import nfvi
|
||||
|
||||
import config
|
||||
from clients import kubernetes_client
|
||||
from openstack import rest_api
|
||||
from openstack import exceptions
|
||||
from openstack import openstack
|
||||
|
@ -125,6 +127,14 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
|
|||
def _host_supports_nova_compute(personality):
|
||||
return ('compute' in personality)
|
||||
|
||||
@staticmethod
|
||||
def _host_supports_kubernetes(personality):
|
||||
# TODO: This check will disappear once kubernetes is the default
|
||||
if os.path.isfile('/etc/kubernetes/admin.conf'):
|
||||
return ('compute' in personality or 'controller' in personality)
|
||||
else:
|
||||
return False
|
||||
|
||||
def __init__(self):
|
||||
super(NFVIInfrastructureAPI, self).__init__()
|
||||
self._token = None
|
||||
|
@ -1035,8 +1045,8 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
|
|||
def enable_host_services(self, future, host_uuid, host_name,
|
||||
host_personality, callback):
|
||||
"""
|
||||
Enable Host Services, notifies Nova, Neutron and Guest to enable their
|
||||
services for the specified host
|
||||
Enable Host Services, notifies Nova, Neutron, Guest and Kubernetes to
|
||||
enable their services for the specified host
|
||||
"""
|
||||
response = dict()
|
||||
response['completed'] = False
|
||||
|
@ -1121,9 +1131,9 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
|
|||
% (host_uuid, host_name))
|
||||
return
|
||||
|
||||
if self._host_supports_nova_compute(host_personality):
|
||||
response['reason'] = 'failed to enable guest services'
|
||||
|
||||
if self._host_supports_nova_compute(host_personality):
|
||||
# Send the Enable request to Guest
|
||||
future.work(guest.host_services_enable, self._token, host_uuid,
|
||||
host_name)
|
||||
|
@ -1135,6 +1145,21 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
|
|||
"did not complete, host_uuid=%s, host_name=%s."
|
||||
% (host_uuid, host_name))
|
||||
|
||||
if self._host_supports_kubernetes(host_personality):
|
||||
response['reason'] = 'failed to enable kubernetes services'
|
||||
|
||||
# To enable kubernetes we remove the NoExecute taint from the
|
||||
# node. This allows new pods to be scheduled on the node.
|
||||
future.work(kubernetes_client.untaint_node,
|
||||
host_name, "NoExecute", "services")
|
||||
future.result = (yield)
|
||||
|
||||
if not future.result.is_complete():
|
||||
DLOG.error("Kubernetes untaint_node failed, operation "
|
||||
"did not complete, host_uuid=%s, host_name=%s."
|
||||
% (host_uuid, host_name))
|
||||
return
|
||||
|
||||
response['completed'] = True
|
||||
response['reason'] = ''
|
||||
|
||||
|
@ -1150,7 +1175,7 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
|
|||
|
||||
except Exception as e:
|
||||
DLOG.exception("Caught exception while trying to enable %s "
|
||||
"nova or neutron openstack services, error=%s."
|
||||
"host services, error=%s."
|
||||
% (host_name, e))
|
||||
|
||||
finally:
|
||||
|
@ -1160,8 +1185,8 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
|
|||
def disable_host_services(self, future, host_uuid, host_name,
|
||||
host_personality, callback):
|
||||
"""
|
||||
Disable Host Services, notifies Nova and Guest to disable their
|
||||
services for the specified host
|
||||
Disable Host Services, notifies Nova, Guest and Kubernetes to disable
|
||||
their services for the specified host (as applicable)
|
||||
"""
|
||||
response = dict()
|
||||
response['completed'] = False
|
||||
|
@ -1170,72 +1195,85 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
|
|||
try:
|
||||
future.set_timeouts(config.CONF.get('nfvi-timeouts', None))
|
||||
|
||||
# Only applies to compute hosts
|
||||
if not self._host_supports_nova_compute(host_personality):
|
||||
response['completed'] = True
|
||||
response['reason'] = ''
|
||||
return
|
||||
# The following only applies to compute hosts
|
||||
if self._host_supports_nova_compute(host_personality):
|
||||
response['reason'] = 'failed to get token from keystone'
|
||||
|
||||
response['reason'] = 'failed to get token from keystone'
|
||||
if self._token is None or self._token.is_expired():
|
||||
future.work(openstack.get_token, self._directory)
|
||||
future.result = (yield)
|
||||
|
||||
if not future.result.is_complete():
|
||||
DLOG.error("OpenStack get-token did not complete, "
|
||||
"host_uuid=%s, host_name=%s." % (host_uuid,
|
||||
host_name))
|
||||
return
|
||||
|
||||
self._token = future.result.data
|
||||
|
||||
response['reason'] = 'failed to disable nova services'
|
||||
|
||||
# Send the Disable request to Nova.
|
||||
future.work(nova.disable_host_services, self._token, host_name)
|
||||
|
||||
try:
|
||||
future.result = (yield)
|
||||
|
||||
if not future.result.is_complete():
|
||||
DLOG.error("Nova disable-host-services failed, operation "
|
||||
"did not complete, host_uuid=%s, host_name=%s."
|
||||
% (host_uuid, host_name))
|
||||
return
|
||||
|
||||
result_data = future.result.data['service']
|
||||
if not ('disabled' == result_data['status'] and
|
||||
host_name == result_data['host'] and
|
||||
'nova-compute' == result_data['binary']):
|
||||
DLOG.error("Nova disable-host-services failed, operation "
|
||||
"did not complete, host_uuid=%s, host_name=%s."
|
||||
% (host_uuid, host_name))
|
||||
return
|
||||
|
||||
except exceptions.OpenStackRestAPIException as e:
|
||||
if httplib.NOT_FOUND != e.http_status_code:
|
||||
raise
|
||||
|
||||
response['reason'] = 'failed to disable guest services'
|
||||
|
||||
# Send the Disable request to Guest.
|
||||
future.work(guest.host_services_disable, self._token, host_uuid,
|
||||
host_name)
|
||||
|
||||
try:
|
||||
future.result = (yield)
|
||||
|
||||
if not future.result.is_complete():
|
||||
# Do not return since the disable will be retried by audit
|
||||
DLOG.error("Guest host-services-disable failed, operation "
|
||||
"did not complete, host_uuid=%s, host_name=%s."
|
||||
% (host_uuid, host_name))
|
||||
|
||||
except exceptions.OpenStackRestAPIException as e:
|
||||
if httplib.NOT_FOUND != e.http_status_code:
|
||||
raise
|
||||
|
||||
if self._host_supports_kubernetes(host_personality):
|
||||
response['reason'] = 'failed to disable kubernetes services'
|
||||
|
||||
# To disable kubernetes we add the NoExecute taint to the
|
||||
# node. This removes pods that can be scheduled elsewhere
|
||||
# and prevents new pods from scheduling on the node.
|
||||
future.work(kubernetes_client.taint_node,
|
||||
host_name, "NoExecute", "services", "disabled")
|
||||
|
||||
if self._token is None or self._token.is_expired():
|
||||
future.work(openstack.get_token, self._directory)
|
||||
future.result = (yield)
|
||||
|
||||
if not future.result.is_complete():
|
||||
DLOG.error("OpenStack get-token did not complete, "
|
||||
"host_uuid=%s, host_name=%s." % (host_uuid,
|
||||
host_name))
|
||||
return
|
||||
|
||||
self._token = future.result.data
|
||||
|
||||
response['reason'] = 'failed to disable nova services'
|
||||
|
||||
# Send the Disable request to Nova.
|
||||
future.work(nova.disable_host_services, self._token, host_name)
|
||||
|
||||
try:
|
||||
future.result = (yield)
|
||||
|
||||
if not future.result.is_complete():
|
||||
DLOG.error("Nova disable-host-services failed, operation "
|
||||
DLOG.error("Kubernetes taint_node failed, operation "
|
||||
"did not complete, host_uuid=%s, host_name=%s."
|
||||
% (host_uuid, host_name))
|
||||
return
|
||||
|
||||
result_data = future.result.data['service']
|
||||
if not ('disabled' == result_data['status'] and
|
||||
host_name == result_data['host'] and
|
||||
'nova-compute' == result_data['binary']):
|
||||
DLOG.error("Nova disable-host-services failed, operation "
|
||||
"did not complete, host_uuid=%s, host_name=%s."
|
||||
% (host_uuid, host_name))
|
||||
return
|
||||
|
||||
except exceptions.OpenStackRestAPIException as e:
|
||||
if httplib.NOT_FOUND != e.http_status_code:
|
||||
raise
|
||||
|
||||
response['reason'] = 'failed to disable guest services'
|
||||
|
||||
# Send the Disable request to Guest.
|
||||
future.work(guest.host_services_disable, self._token, host_uuid,
|
||||
host_name)
|
||||
|
||||
try:
|
||||
future.result = (yield)
|
||||
|
||||
if not future.result.is_complete():
|
||||
# Do not return since the disable will be retried by audit
|
||||
DLOG.error("Guest host-services-disable failed, operation "
|
||||
"did not complete, host_uuid=%s, host_name=%s."
|
||||
% (host_uuid, host_name))
|
||||
|
||||
except exceptions.OpenStackRestAPIException as e:
|
||||
if httplib.NOT_FOUND != e.http_status_code:
|
||||
raise
|
||||
|
||||
response['completed'] = True
|
||||
response['reason'] = ''
|
||||
|
||||
|
@ -1251,7 +1289,7 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
|
|||
|
||||
except Exception as e:
|
||||
DLOG.exception("Caught exception while trying to disable %s "
|
||||
"nova or neutron openstack services, error=%s."
|
||||
"host services, error=%s."
|
||||
% (host_name, e))
|
||||
|
||||
finally:
|
||||
|
|
|
@ -27,6 +27,7 @@ deps = {[nfv]nfv_client_dir}
|
|||
iso8601
|
||||
keyring
|
||||
kombu
|
||||
kubernetes
|
||||
passlib
|
||||
pecan
|
||||
pyparsing
|
||||
|
@ -45,7 +46,7 @@ envlist = pep8,py27-sqlite
|
|||
|
||||
[testenv]
|
||||
recreate = True
|
||||
install_command = pip install -c{env:UPPER_CONSTRAINTS_FILE:https://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt?h=stable/pike} {opts} {packages}
|
||||
install_command = pip install -c{env:UPPER_CONSTRAINTS_FILE:https://git.openstack.org/cgit/openstack/requirements/plain/upper-constraints.txt} {opts} {packages}
|
||||
|
||||
|
||||
[testenv:pep8]
|
||||
|
|
|
@ -169,6 +169,8 @@ nfv_vim.webserver.webserver: debug.level.verbose
|
|||
nfv_plugins.alarm_handlers.fm: debug.level.info
|
||||
nfv_plugins.event_log_handlers.fm: debug.level.info
|
||||
# ----------------------------------------------------------------------------
|
||||
nfv_plugins.nfvi_plugins.clients: debug.level.info
|
||||
nfv_plugins.nfvi_plugins.clients.kubernetes_client: debug.level.info
|
||||
nfv_plugins.nfvi_plugins.openstack: debug.level.info
|
||||
nfv_plugins.nfvi_plugins.openstack.patching: debug.level.info
|
||||
nfv_plugins.nfvi_plugins.openstack.keystone: debug.level.info
|
||||
|
|
Loading…
Reference in New Issue