442 lines
15 KiB
Python
442 lines
15 KiB
Python
#
|
|
# Copyright (c) 2018-2023 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
|
|
from fm_api import constants as fm_constants
|
|
from fm_api import fm_api
|
|
import kubernetes
|
|
|
|
from kubernetes import __version__ as K8S_MODULE_VERSION
|
|
from kubernetes.client.models.v1_container_image import V1ContainerImage
|
|
from kubernetes.client.rest import ApiException
|
|
from six.moves import http_client as httplib
|
|
|
|
from nfv_common import debug
|
|
from nfv_common.helpers import Result
|
|
|
|
K8S_MODULE_MAJOR_VERSION = int(K8S_MODULE_VERSION.split('.', maxsplit=1)[0])
|
|
|
|
fmapi = fm_api.FaultAPIs()
|
|
|
|
DLOG = debug.debug_get_logger('nfv_plugins.nfvi_plugins.clients.kubernetes_client')
|
|
|
|
|
|
# https://github.com/kubernetes-client/python/issues/895
|
|
# If a container image contains no tag or digest, node
|
|
# related requests sent via python Kubernetes client will be
|
|
# returned with exception because python Kubernetes client
|
|
# deserializes the ContainerImage response from kube-apiserver
|
|
# and it fails the validation due to the empty image name.
|
|
#
|
|
# Implement this workaround to replace the V1ContainerImage.names
|
|
# in the python Kubernetes client to bypass the "none image"
|
|
# check because the error is not from kubernetes.
|
|
#
|
|
# This workaround should be removed when we update to
|
|
# kubernetes client v22
|
|
def names(self, names):
|
|
"""Monkey patch V1ContainerImage with this to set the names."""
|
|
self._names = names
|
|
|
|
|
|
# Replacing address of "names" in V1ContainerImage
|
|
# with the "names" defined above
|
|
V1ContainerImage.names = V1ContainerImage.names.setter(names) # pylint: disable=assignment-from-no-return
|
|
|
|
|
|
def get_client():
|
|
kubernetes.config.load_kube_config('/etc/kubernetes/admin.conf')
|
|
|
|
# Workaround: Turn off SSL/TLS verification
|
|
if K8S_MODULE_MAJOR_VERSION < 12:
|
|
c = kubernetes.client.Configuration()
|
|
else:
|
|
c = kubernetes.client.Configuration().get_default_copy()
|
|
c.verify_ssl = False
|
|
kubernetes.client.Configuration.set_default(c)
|
|
|
|
return kubernetes.client.CoreV1Api()
|
|
|
|
|
|
def get_kubertnetes_https_client():
|
|
"""
|
|
Get Kubernetes client with HTTPS enabled
|
|
"""
|
|
kubernetes.config.load_kube_config('/etc/kubernetes/admin.conf')
|
|
|
|
if K8S_MODULE_MAJOR_VERSION < 12:
|
|
c = kubernetes.client.Configuration()
|
|
else:
|
|
c = kubernetes.client.Configuration().get_default_copy()
|
|
kubernetes.client.Configuration.set_default(c)
|
|
return kubernetes.client
|
|
|
|
|
|
def get_customobjects_api_instance():
|
|
"""
|
|
Get a custom objects API instance
|
|
"""
|
|
client = get_kubertnetes_https_client()
|
|
return client.CustomObjectsApi()
|
|
|
|
|
|
def raise_alarm(node_name):
|
|
|
|
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
|
|
node_name)
|
|
fault = fm_api.Fault(
|
|
alarm_id=fm_constants.FM_ALARM_ID_USM_NODE_TAINTED,
|
|
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
|
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
|
entity_instance_id=entity_instance_id,
|
|
severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
|
|
reason_text=("Node tainted."),
|
|
alarm_type=fm_constants.FM_ALARM_TYPE_7,
|
|
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_8,
|
|
proposed_repair_action=("Execute 'kubectl taint nodes %s services=disabled:NoExecute-'. "
|
|
"If it fails, Execute 'system host-lock %s' followed by 'system host-unlock %s'. "
|
|
"If issue still persists, contact next level of support."
|
|
% (node_name, node_name, node_name)),
|
|
service_affecting=True)
|
|
DLOG.info("Raising alarm %s on %s " % (fm_constants.FM_ALARM_ID_USM_NODE_TAINTED, node_name))
|
|
fmapi.set_fault(fault)
|
|
|
|
|
|
def clear_alarm(node_name):
|
|
|
|
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
|
|
node_name)
|
|
DLOG.info("Clearing alarm %s on %s " % (fm_constants.FM_ALARM_ID_USM_NODE_TAINTED, node_name))
|
|
fmapi.clear_fault(fm_constants.FM_ALARM_ID_USM_NODE_TAINTED, entity_instance_id)
|
|
|
|
|
|
def taint_node(node_name, effect, key, value):
|
|
"""
|
|
Apply a taint to a node
|
|
"""
|
|
# Get the client.
|
|
kube_client = get_client()
|
|
# Retrieve the node to access any existing taints.
|
|
try:
|
|
response = kube_client.read_node(node_name)
|
|
except ApiException as e:
|
|
if e.status == httplib.NOT_FOUND:
|
|
# In some cases we may attempt to taint a node that exists in
|
|
# the VIM, but not yet in kubernetes (e.g. when the node is first
|
|
# being configured). Ignore the failure.
|
|
DLOG.info("Not tainting node %s because it doesn't exist" %
|
|
node_name)
|
|
return
|
|
else:
|
|
raise
|
|
|
|
add_taint = True
|
|
taints = response.spec.taints
|
|
if taints is not None:
|
|
for taint in taints:
|
|
# Taints must be unique by key and effect
|
|
if taint.key == key and taint.effect == effect:
|
|
add_taint = False
|
|
if taint.value != value:
|
|
msg = ("Duplicate value - key: %s effect: %s "
|
|
"value: %s new value %s" % (key, effect,
|
|
taint.value, value))
|
|
DLOG.error(msg)
|
|
raise Exception(msg)
|
|
else:
|
|
# This taint already exists
|
|
break
|
|
|
|
if add_taint:
|
|
DLOG.info("Adding %s=%s:%s taint to node %s" % (key, value, effect,
|
|
node_name))
|
|
# Preserve any existing taints
|
|
if taints is not None:
|
|
body = {"spec": {"taints": taints}}
|
|
else:
|
|
body = {"spec": {"taints": []}}
|
|
# Add our new taint
|
|
new_taint = {"key": key, "value": value, "effect": effect}
|
|
body["spec"]["taints"].append(new_taint)
|
|
response = kube_client.patch_node(node_name, body)
|
|
# Clear taint node alarm if tainting is successful.
|
|
# Alarm not cleared if taint is already present in the system
|
|
# or the node is under configuration.
|
|
clear_alarm(node_name)
|
|
|
|
return Result(response)
|
|
|
|
|
|
def untaint_node(node_name, effect, key):
|
|
"""
|
|
Remove a taint from a node
|
|
"""
|
|
# Get the client.
|
|
kube_client = get_client()
|
|
|
|
# Retrieve the node to access any existing taints.
|
|
response = kube_client.read_node(node_name)
|
|
|
|
remove_taint = False
|
|
taints = response.spec.taints
|
|
if taints is not None:
|
|
for taint in taints:
|
|
# Taints must be unique by key and effect
|
|
if taint.key == key and taint.effect == effect:
|
|
remove_taint = True
|
|
break
|
|
|
|
if remove_taint:
|
|
DLOG.info("Removing %s:%s taint from node %s" % (key, effect,
|
|
node_name))
|
|
# Preserve any existing taints
|
|
updated_taints = [taint for taint in taints if taint.key != key or
|
|
taint.effect != effect]
|
|
DLOG.info("Updated taints %s" % (updated_taints))
|
|
body = {"spec": {"taints": updated_taints}}
|
|
response = kube_client.patch_node(node_name, body)
|
|
check_taints = kube_client.read_node(node_name)
|
|
taints = check_taints.spec.taints
|
|
DLOG.info("Existing taint %s" % (taints))
|
|
if taints is not None:
|
|
for taint in taints:
|
|
if (taint.key == key and taint.effect == effect):
|
|
DLOG.info("Removing %s:%s taint from node %s failed" % (key,
|
|
effect, node_name))
|
|
raise_alarm(node_name)
|
|
break
|
|
else:
|
|
# Taint removed successfully. If there are multiple taints
|
|
# on the system, removing the 'services' taint will clear the alarm.
|
|
clear_alarm(node_name)
|
|
else:
|
|
# If there is only 'services' taint on the system , then removing the taint
|
|
# should clear the alarm.
|
|
clear_alarm(node_name)
|
|
|
|
return Result(response)
|
|
|
|
|
|
def delete_node(node_name):
|
|
"""
|
|
Delete a node
|
|
"""
|
|
# Get the client.
|
|
kube_client = get_client()
|
|
|
|
# Delete the node
|
|
body = kubernetes.client.V1DeleteOptions()
|
|
|
|
try:
|
|
if K8S_MODULE_MAJOR_VERSION < 12:
|
|
response = kube_client.delete_node(node_name, body)
|
|
else:
|
|
response = kube_client.delete_node(node_name, body=body)
|
|
except ApiException as e:
|
|
if e.status == httplib.NOT_FOUND:
|
|
# In some cases we may attempt to delete a node that exists in
|
|
# the VIM, but not yet in kubernetes (e.g. when the node is first
|
|
# being configured). Ignore the failure.
|
|
DLOG.info("Not deleting node %s because it doesn't exist" %
|
|
node_name)
|
|
return
|
|
else:
|
|
raise
|
|
|
|
return Result(response)
|
|
|
|
|
|
def mark_all_pods_not_ready(node_name, reason):
|
|
"""
|
|
Mark all pods on a node as not ready
|
|
Note: It would be preferable to mark the node as not ready and have
|
|
kubernetes then mark the pods as not ready, but this is not supported.
|
|
"""
|
|
# Get the client.
|
|
kube_client = get_client()
|
|
|
|
# Retrieve the pods on the specified node.
|
|
response = kube_client.list_namespaced_pod(
|
|
"", field_selector="spec.nodeName=%s" % node_name)
|
|
|
|
pods = response.items
|
|
if pods is not None:
|
|
for pod in pods:
|
|
for condition in pod.status.conditions:
|
|
if condition.type == "Ready":
|
|
if condition.status != "False":
|
|
# Update the Ready status to False
|
|
body = {"status":
|
|
{"conditions":
|
|
[{"type": "Ready",
|
|
"status": "False",
|
|
"reason": reason,
|
|
}]}}
|
|
try:
|
|
DLOG.debug(
|
|
"Marking pod %s in namespace %s not ready" %
|
|
(pod.metadata.name, pod.metadata.namespace))
|
|
kube_client.patch_namespaced_pod_status(
|
|
pod.metadata.name, pod.metadata.namespace, body)
|
|
except ApiException:
|
|
DLOG.exception(
|
|
"Failed to update status for pod %s in "
|
|
"namespace %s" % (pod.metadata.name,
|
|
pod.metadata.namespace))
|
|
break
|
|
return
|
|
|
|
|
|
def get_terminating_pods(node_name):
|
|
"""
|
|
Get all pods on a node that are terminating
|
|
"""
|
|
# Get the client.
|
|
kube_client = get_client()
|
|
|
|
# Retrieve the pods on the specified node.
|
|
response = kube_client.list_namespaced_pod(
|
|
"", field_selector="spec.nodeName=%s" % node_name)
|
|
|
|
terminating_pods = list()
|
|
pods = response.items
|
|
if pods is not None:
|
|
for pod in pods:
|
|
# The presence of the deletion_timestamp indicates the pod is
|
|
# terminating.
|
|
if pod.metadata.deletion_timestamp is not None:
|
|
terminating_pods.append(pod.metadata.name)
|
|
|
|
return Result(','.join(terminating_pods))
|
|
|
|
|
|
def get_namespaced_custom_object(name, plural, group, version, namespace):
|
|
"""
|
|
Get a custom resource object in a namespace
|
|
"""
|
|
# Get a CustomObjectsApi instance
|
|
api_instance = get_customobjects_api_instance()
|
|
|
|
try:
|
|
resource = api_instance.get_namespaced_custom_object(
|
|
group=group,
|
|
version=version,
|
|
name=name,
|
|
namespace=namespace,
|
|
plural=plural
|
|
)
|
|
return Result(resource)
|
|
except ApiException as e:
|
|
DLOG.exception(
|
|
"Failed to get object %s from namespace %s, "
|
|
"reason: %s" % (name, namespace, e.reason))
|
|
return None
|
|
|
|
|
|
def get_deployment_host(name):
|
|
"""
|
|
Get a host in the deployment namespace
|
|
"""
|
|
# Get a CustomObjectsApi instance
|
|
api_instance = get_customobjects_api_instance()
|
|
|
|
try:
|
|
resource = api_instance.get_namespaced_custom_object(
|
|
group='starlingx.windriver.com',
|
|
version='v1',
|
|
name=name,
|
|
namespace='deployment',
|
|
plural='hosts'
|
|
)
|
|
unlock_request = resource.get('status').get('strategyRequired')
|
|
result = {'name': name, 'unlock_request': unlock_request}
|
|
return Result(result)
|
|
except ApiException as e:
|
|
DLOG.exception(
|
|
"Failed to get object %s from namespace deployment, "
|
|
"reason: %s" % (name, e.reason))
|
|
return None
|
|
|
|
|
|
def list_namespaced_custom_objects(plural, group, version, namespace):
|
|
"""
|
|
List custom resource objects in a namespace
|
|
"""
|
|
# Get a CustomObjectsApi instance
|
|
api_instance = get_customobjects_api_instance()
|
|
|
|
try:
|
|
resources = api_instance.list_namespaced_custom_object(
|
|
group=group,
|
|
version=version,
|
|
namespace=namespace,
|
|
plural=plural
|
|
)
|
|
return Result(resources)
|
|
except ApiException as e:
|
|
DLOG.exception(
|
|
"Failed to list objects %s from namespace %s, "
|
|
"reason: %s" % (plural, namespace, e.reason))
|
|
return None
|
|
|
|
|
|
def list_deployment_hosts():
|
|
"""
|
|
List hosts in a deployment namespace
|
|
"""
|
|
# Get a CustomObjectsApi instance
|
|
api_instance = get_customobjects_api_instance()
|
|
|
|
try:
|
|
resources = api_instance.list_namespaced_custom_object(
|
|
group='starlingx.windriver.com',
|
|
version='v1',
|
|
namespace='deployment',
|
|
plural='hosts'
|
|
)
|
|
|
|
if not resources:
|
|
return None
|
|
|
|
results = list()
|
|
for resource in resources.get('items'):
|
|
name = resource.get('metadata').get('name')
|
|
unlock_request = resource.get('status').get('strategyRequired')
|
|
results.append({'name': name,
|
|
'unlock_request': unlock_request})
|
|
|
|
return Result(results)
|
|
except ApiException as e:
|
|
DLOG.exception(
|
|
"Failed to list hosts from deployment namespace, "
|
|
"reason: %s" % e.reason)
|
|
return None
|
|
|
|
|
|
def get_namespaced_running_pods(namespace, name):
|
|
"""
|
|
Get running pods in a namespace
|
|
"""
|
|
api_instance = get_client()
|
|
|
|
try:
|
|
response = api_instance.list_namespaced_pod(
|
|
namespace=namespace,
|
|
field_selector="status.phase=Running",)
|
|
except ApiException as e:
|
|
DLOG.exception(
|
|
"Failed to list pods from namespace %s, "
|
|
"reason: %s" % (namespace, e.reason))
|
|
return None
|
|
|
|
pods = response.items
|
|
found = list()
|
|
if pods is not None:
|
|
for pod in pods:
|
|
if name in pod.metadata.name:
|
|
found.append(pod.metadata.name)
|
|
|
|
return Result(','.join(found))
|