381 lines
14 KiB
Python
Executable File
381 lines
14 KiB
Python
Executable File
#
|
|
# Copyright (c) 2018-2019 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
#############################################################################
|
|
#
|
|
# This file is the collectd 'Maintenance' Notifier.
|
|
#
|
|
# Collectd provides information about each event as an object passed to the
|
|
# notification handler ; the notification object.
|
|
#
|
|
# object.host - the hostname
|
|
#
|
|
# object.plugin - the name of the plugin aka resource
|
|
# object.plugin_instance - plugin instance string i.e. say mountpoint
|
|
# for df plugin
|
|
# object.type, - the unit i.e. percent or absolute
|
|
# object.type_instance - the attribute i.e. free, used, etc
|
|
#
|
|
# object.severity - an integer value 1=failure, 2=warning, 4=okay
|
|
# object.message - a log-able message containing the above along
|
|
# with the value
|
|
#
|
|
# This notifier manages requesting mtce to assert or clear its collectd
|
|
# host-degrade-cause flag based on notification messages sent from collectd.
|
|
#
|
|
# Messages to maintenance are throttled ONE_EVERY while this state is the
|
|
# same as last state.
|
|
#
|
|
# Message is sent on every state change
|
|
# from clear to assert or
|
|
# from assert to clear
|
|
#
|
|
# See code comments for details.
|
|
#
|
|
############################################################################
|
|
#
|
|
# Import list
|
|
|
|
import os
|
|
import socket
|
|
import collectd
|
|
import tsconfig.tsconfig as tsc
|
|
|
|
# This plugin name ; used as the prefix of this plugin's log messages.
PLUGIN = 'degrade notifier'

# collectd severity definitions ;
# Note: can't seem to pull them in symbolically with a header
# These are the values notifier_func compares nObject.severity against.
NOTIF_FAILURE = 1
NOTIF_WARNING = 2
NOTIF_OKAY = 4

# default mtce port.
# ... with configuration override ; see the 'port' key in config_func.
MTCE_CMD_RX_PORT = 2101

# same state message throttle count.
# ... only send the degrade message every 'this' number
#     while the state of assert or clear remains the same.
ONE_EVERY = 10

# Plugin names referenced by the notifier object's per-severity
# degrade lists.
PLUGIN__DF = 'df'
PLUGIN__MEM = 'memory'
PLUGIN__CPU = 'cpu'

PLUGIN__VSWITCH_MEM = 'vswitch_mem'
PLUGIN__VSWITCH_CPU = 'vswitch_cpu'
PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"


PLUGIN_INTERFACE = 'interface'
PLUGIN__EXAMPLE = 'example'
|
|
|
|
|
|
# The collectd Maintenance Notifier Object
class collectdMtceNotifierObject:
    """State carried by the maintenance degrade notifier between calls."""

    def __init__(self, port):
        """Build the notifier state.

        port - the maintenance port degrade messages are sent to.
        """
        # Destination of the degrade messages. The address starts out
        # unknown (None) ; the port is the one handed in by the caller.
        self.addr = None
        self.port = port

        # Protocol family used for the maintenance socket. Starts as
        # IPV4 and is switched to AF_INET6 once the system is learned
        # to be IPV6.
        self.protocol = socket.AF_INET

        # Running list of resources that currently require degrade.
        #  - empty list     -> a degrade clear message is sent.
        #  - non-empty list -> a degrade assert message is sent.
        self.degrade_list = []

        # Bookkeeping used to throttle duplicate assert/clear messages.
        self.last_state = "undef"
        self.msg_throttle = 0

        # Plugin names that require degrade, per notification severity.
        resource_plugins = [PLUGIN__DF, PLUGIN__MEM, PLUGIN__CPU]
        vswitch_plugins = [PLUGIN__VSWITCH_MEM,
                           PLUGIN__VSWITCH_CPU,
                           PLUGIN__VSWITCH_PORT,
                           PLUGIN__VSWITCH_IFACE]
        other_plugins = [PLUGIN_INTERFACE, PLUGIN__EXAMPLE]

        self.degrade_list__warning = [PLUGIN_INTERFACE]
        self.degrade_list__failure = (resource_plugins +
                                      vswitch_plugins +
                                      other_plugins)
|
|
|
|
|
|
# Instantiate the mtce_notifier object
# This object persists from notification to notification
obj = collectdMtceNotifierObject(MTCE_CMD_RX_PORT)
|
|
|
|
|
|
def _get_active_controller_ip():
    """Resolve 'controller' and cache its IP address in obj.addr.

    obj.addr is set to the address of the first getaddrinfo result.
    On any failure the error is logged and obj.addr is left as None.
    Always returns 0.
    """

    try:
        lookup = socket.getaddrinfo('controller', None)
        # first result ; its sockaddr is field 4 and the
        # address is the first member of that sockaddr.
        first_sockaddr = lookup[0][4]
        obj.addr = first_sockaddr[0]
        collectd.info("%s controller ip: %s" % (PLUGIN, obj.addr))
    except Exception as err:
        obj.addr = None
        collectd.error("%s failed to get controller ip ; %s" %
                       (PLUGIN, str(err)))

    return 0
|
|
|
|
|
|
def _df_instance_to_path(df_inst):
|
|
"""Convert a df instance name to a mountpoint"""
|
|
|
|
# df_root is not a dynamic file system. Ignore that one.
|
|
if df_inst == 'df_root':
|
|
return '/'
|
|
else:
|
|
# For all others replace all '-' with '/'
|
|
return('/' + df_inst[3:].replace('-', '/'))
|
|
|
|
|
|
# This function removes degraded file systems that are no longer present.
def _clear_degrade_for_missing_filesystems():
    """Remove degraded file systems that are no longer mounted or present.

    Every entry of obj.degrade_list whose name starts with 'df_' is
    converted to a path with _df_instance_to_path ; if that path is not
    a mount point the entry is logged and removed from the list.

    Always returns 0.
    """

    # Walk a snapshot of the list. Removing entries from the very list
    # being iterated shifts the remaining items down and makes the loop
    # skip the entry that follows each removal, which could leave a
    # second missing file system stuck in the degrade list.
    for df_inst in list(obj.degrade_list):

        # Only file system plugins are looked at.
        # File system plugin instance names are prefixed with 'df_'
        # as the first 3 chars in the instance name.
        if not df_inst.startswith('df_'):
            continue

        path = _df_instance_to_path(df_inst)

        # check the mount point.
        # if the mount point no longer exists then remove
        # this instance from the degrade list.
        if not os.path.ismount(path):
            collectd.info("%s clearing degrade for missing %s ; %s" %
                          (PLUGIN, path, obj.degrade_list))
            obj.degrade_list.remove(df_inst)

    return 0
|
|
|
|
|
|
# The collectd configuration interface
#
# Used to configure the maintenance port.
#    key = 'port'
#    val = port number
#
def config_func(config):
    """Apply the plugin configuration ; only the 'port' key is handled.

    The children of 'config' are scanned in order. The first one whose
    lower-cased key is 'port' sets obj.port (as an int) and the function
    returns 0. If no child provides a port, obj.port is set to
    MTCE_CMD_RX_PORT, an error is logged and the function ends without
    an explicit return value.
    """

    collectd.debug('%s config function' % PLUGIN)

    for child in config.children:
        name = child.key.lower()
        value = child.values[0]

        # not the setting we are looking for ; try the next one
        if name != 'port':
            continue

        obj.port = int(value)
        collectd.info("%s configured mtce port: %d" %
                      (PLUGIN, obj.port))
        return 0

    # no port setting found in the configuration
    obj.port = MTCE_CMD_RX_PORT
    collectd.error("%s no mtce port provided ; defaulting to %d" %
                   (PLUGIN, obj.port))
|
|
|
|
|
|
# Collectd calls this function on startup.
def init_func():
    """Record this host's name on the notifier object and log startup.

    obj.host is set from the node name reported by os.uname().
    """

    hostname = os.uname()[1]
    obj.host = hostname

    collectd.info("%s %s:%s sending to mtce port %d" %
                  (PLUGIN, tsc.nodetype, hostname, obj.port))
    collectd.debug("%s init function" % PLUGIN)
|
|
|
|
|
|
def _manage_degrade_for_severity(nObject, resource, degrade_plugins):
    """Apply a failure or warning notification to the degrade list.

    nObject         - the collectd notification object.
    resource        - the resource name built from the notification.
    degrade_plugins - the plugin names that require degrade at the
                      severity of this notification.

    The resource is appended to obj.degrade_list when its plugin is in
    'degrade_plugins' and it is not already listed. Nothing is returned.
    """

    if not degrade_plugins:
        # No plugin causes degrade at this severity, so make sure this
        # resource is not in the degrade list. Should never occur.
        if resource in obj.degrade_list:
            obj.degrade_list.remove(resource)
        return

    if nObject.plugin not in degrade_plugins:
        return

    if resource in obj.degrade_list:
        return

    # handle dynamic filesystems going missing over a swact
    # or unmount and being reported as a transient error by
    # the df plugin. Don't add it to the failed list if the
    # mountpoint is gone.
    if nObject.plugin == PLUGIN__DF:
        path = _df_instance_to_path(resource)
        if not os.path.ismount(path):
            return

    collectd.info("%s %s added to degrade list" % (PLUGIN, resource))
    obj.degrade_list.append(resource)


def _send_degrade_message(nObject, resource, state, resources):
    """Send the degrade state message to mtcAgent.

    nObject   - the collectd notification object ; supplies the hostname.
    resource  - the resource of this notification ; used in logs only.
    state     - "assert" or "clear".
    resources - comma separated degraded resource names for the message.

    The controller address is looked up first when it is not known ; if
    it cannot be learned nothing is sent. A socket error is logged and
    handled here ; nothing is returned.
    """

    if obj.addr is None:
        _get_active_controller_ip()
        if obj.addr is None:
            return

    # Create the Maintenance message.
    message = "{\"service\":\"collectd_notifier\","
    message += "\"hostname\":\"" + nObject.host + "\","
    message += "\"degrade\":\"" + state + "\","
    message += "\"resource\":\"" + resources + "\"}"
    collectd.debug("%s: %s" % (PLUGIN, message))

    # sendto requires a bytes payload on python 3 ; a python 2 str is
    # already bytes and is passed through untouched.
    payload = message
    if not isinstance(payload, bytes):
        payload = payload.encode('utf-8')

    # If we get a send failure then log it and set the addr to None
    # so it forces us to refresh the controller address on the next
    # notification
    mtce_socket = None
    try:
        mtce_socket = socket.socket(obj.protocol, socket.SOCK_DGRAM)
        mtce_socket.settimeout(1.0)
        mtce_socket.sendto(payload, (obj.addr, obj.port))

    except socket.error as e:
        if e.args and e.args[0] == socket.EAI_ADDRFAMILY:
            # Handle IPV4 to IPV6 switchover:
            obj.protocol = socket.AF_INET6
            collectd.info("%s %s ipv6 addressing (%s)" %
                          (PLUGIN, resource, obj.addr))
        else:
            collectd.error("%s %s socket error (%s) ; %s" %
                           (PLUGIN, resource, obj.addr, str(e)))
            # try self correction
            obj.addr = None
            obj.protocol = socket.AF_INET

    finally:
        # always release the socket, including on a send failure
        if mtce_socket is not None:
            mtce_socket.close()


# This is the Notifier function that is called by collectd.
#
# Handling steps are
#
#  1. build resource name from notification object.
#  2. check resource against severity lists.
#  3. manage this instance's degrade state.
#  4. send mtcAgent the degrade state message.
#
def notifier_func(nObject):
    """Collectd Mtce Notifier Handler Function.

    nObject - the collectd notification object ; its plugin,
              plugin_instance, severity and host members are read.

    Updates obj.degrade_list from the notification and sends the
    resulting degrade "assert" or "clear" state to mtcAgent. A message
    is sent when the state differs from the last one sent, or once
    ONE_EVERY notifications have been counted while it stays the same.

    Always returns 0.
    """

    # Create the resource name from the notifier object.
    # format: <plugin name>_<plugin_instance_name>
    resource = nObject.plugin
    if nObject.plugin_instance:
        resource += "_" + nObject.plugin_instance

    # This block looks at the current notification severity
    # and manages the degrade_list.
    # If the specified plugin name exists in each of the warnings
    # or failure lists and there is a current severity match then
    # add that resource instance to the degrade list.
    # Conversely if this notification is OKAY then make sure this
    # resource instance is not in the degrade list (remove it if it is)
    #
    # Severities are compared by value ; identity comparison of
    # integers only works by accident of interpreter caching.
    if nObject.severity == NOTIF_OKAY:
        if resource in obj.degrade_list:
            obj.degrade_list.remove(resource)

    elif nObject.severity == NOTIF_FAILURE:
        _manage_degrade_for_severity(nObject, resource,
                                     obj.degrade_list__failure)

    elif nObject.severity == NOTIF_WARNING:
        _manage_degrade_for_severity(nObject, resource,
                                     obj.degrade_list__warning)

    else:
        collectd.info("%s unsupported severity %d" %
                      (PLUGIN, nObject.severity))
        return 0

    # running counter of notifications.
    obj.msg_throttle += 1

    # Support for Dynamic File Systems
    # --------------------------------
    # Some active controller mounted filesystems can become
    # unmounted under the watch of collectd. This can occur
    # as a result of a Swact. If a 'degrade' is raised at the
    # time an fs disappears then that state can become stuck
    # active until the next Swact. This call handles this case.
    #
    # Audit file system presence every time we get the
    # notification for the root file system.
    # Depending on the root filesystem always being there.
    if nObject.plugin == PLUGIN__DF \
            and nObject.plugin_instance == 'root' \
            and obj.degrade_list:
        _clear_degrade_for_missing_filesystems()

    # If degrade list is empty then a clear state is sent to maintenance.
    # If degrade list is NOT empty then an assert state is sent to
    # maintenance. For logging and to ease debug the degraded resource
    # instances are included in the message to maintenance for mtcAgent
    # to optionally log it.
    if obj.degrade_list:
        # limit the degraded resource list being sent to mtce to 5
        resources = ','.join(obj.degrade_list[:5])
        state = "assert"
    else:
        resources = ""
        state = "clear"

    # Message throttling ....

    # Avoid sending the same last state message for up to ONE_EVERY count.
    # Just reduce load on mtcAgent
    if obj.last_state == state and obj.msg_throttle < ONE_EVERY:
        return 0

    # if the degrade state has changed then log it and proceed
    if obj.last_state != state and obj.last_state != "undef":
        collectd.info("%s degrade %s %s" %
                      (PLUGIN, state, obj.degrade_list))

    # Save state for next time
    obj.last_state = state

    # Clear the message throttle counter
    obj.msg_throttle = 0

    # Send the degrade state ; assert or clear message to mtcAgent.
    _send_degrade_message(nObject, resource, state, resources)

    return 0
|
|
|
|
|
|
# Register this plugin's config, init and notification callbacks
# with collectd.
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_notification(notifier_func)
|