Add numa node and huge page memory monitoring

This update adds numa node and huge page memory monitoring
support to the existing Platform Memory monitor collectd plugin.

Instance Mapping

Plugin Refinements                      Instance Name
-------------------------------------   ----------
Platform Memory                         platform
Platform Memory Numa Node 0             node0
Platform Memory Numa Node 1             node1
Platform Memory Numa Node 0 Huge Pages  node0_hugepages
Platform Memory Numa Node 1 Huge Pages  node1_hugepages

New Alarm Entity IDs added to existing 100.103 alarm ID

host=<hostname>.numa=node0
host=<hostname>.numa=node1
host=<hostname>.numa=node0_hugepages
host=<hostname>.numa=node1_hugepages

Modified memory plugin thresholds and added alarm notifier
to support collectd requiring samples to be 'gt' rather
than 'ge' the specified thresholds for a severity change.

This update also corrects a few subtle pep8 warnings in
a few of the existing python plugins.

There is no need for an rmond update because numa and
huge page monitoring were never enabled in rmond.

Story: 2002823
Task: 29369

PASS: Verify logging of all memory instance types
PASS: Verify monitoring of new numa node memory
PASS: Verify monitoring of new numa node huge page memory
PASS: Verify memory instance alarm handling in fm notifier
PASS: Verify memory instance alarm load on startup
PASS: Verify memory instance alarm clear ; runtime condition gone
PASS: Verify memory instance alarm clear ; startup condition gone

Regression:
PASS: Verify End-To-End Sample Collection for all monitored resources.
Corner Case:
PASS: Verify alarm reporting with threshold of zero
PROG: Verify memory alarm raised at threshold value
PASS: Verify memory alarm cleared 1 below threshold value
PASS: Verify above case for both major and critical thresholds

Change-Id: I4e2612ac7b3d906be4b0a140286dbbb095ce7e1b
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2019-02-06 16:01:47 -05:00
parent fab989b5bc
commit 4dadf61bea
6 changed files with 190 additions and 101 deletions

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -179,7 +179,7 @@ def read_func():
_schedstat))
else:
collectd.error('%s unsupported schedstat version [%d]' %
(PLUGIN, c.version))
(PLUGIN, c.version))
return 0
f.close()

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -101,6 +101,7 @@ api = fm_api.FaultAPIs()
debug = False
debug_lists = False
want_state_audit = False
want_vswitch = False
# number of notifier loops before the state is object dumped
DEBUG_AUDIT = 2
@ -122,6 +123,8 @@ DATABASE_NAME = 'collectd samples'
READING_TYPE__PERCENT_USAGE = '% usage'
# Default invalid threshold value
INVALID_THRESHOLD = float(-1)
# collectd severity definitions ;
# Note: can't seem to pull them in symbolically with a header
@ -230,8 +233,10 @@ class PluginObject:
# [ 'float value string','float threshold string]
self.values = []
self.threshold = float(0) # float value of threshold
self.value = float(0) # float value of reading
self.value = float(0) # float value of reading
# float value of threshold
self.threshold = float(INVALID_THRESHOLD)
# Common static class members.
self.reason_warning = ""
@ -332,7 +337,8 @@ class PluginObject:
# Purpose : Manage sample value change.
#
# Handle no sample update case.
# Parse the notification log
# Parse the notification log.
# Handle base object instances.
# Generate a log entry if the sample value changes more than
# step value.
#
@ -384,10 +390,22 @@ class PluginObject:
# get the threshold if its there.
if len(self.values) > 1:
self.threshold = float(self.values[1])
if nObject.plugin == PLUGIN__MEM:
if self.reading_type == READING_TYPE__PERCENT_USAGE:
# Note: add one to % usage reading types so that it
# matches how rmond did it. In collectd an
# overage is over the specified threshold
# whereas in rmon an overage is at threshold
# or above.
self.threshold = float(self.values[1]) + 1
else:
self.threshold = float(self.values[1])
else:
self.threshold = float(INVALID_THRESHOLD) # invalid value
except ValueError as ex:
collectd.error("%s %s value not integer or float (%s) (%s)" %
(PLUGIN, self.entity_id, self.value, str(ex)))
(PLUGIN, self.entity_id, self.value, str(ex)))
return "done"
except TypeError as ex:
collectd.info("%s %s value has no type (%s)" %
@ -428,6 +446,11 @@ class PluginObject:
if self.plugin == PLUGIN__DF:
resource = self.instance
elif self.plugin == PLUGIN__MEM:
if self.instance_name:
if self.instance_name != 'platform':
resource += ' ' + self.instance_name
# setup resource name for vswitch process instance name
elif self.plugin == PLUGIN__VSWITCH_MEM:
resource += ' Processor '
@ -696,7 +719,7 @@ class PluginObject:
self.instance_objects[eid] = obj
except:
collectd.error("%s failed to add instance to %s object list" %
(PLUGIN, self.plugin))
(PLUGIN, self.plugin))
finally:
collectd.debug("%s %s Add UnLock ..." % (PLUGIN, self.plugin))
@ -750,14 +773,14 @@ class PluginObject:
self._add_instance_object(inst_obj, inst_obj.entity_id)
collectd.debug("%s created %s instance (%s) object %s" %
(PLUGIN, inst_obj.resource_name,
inst_obj.entity_id, inst_obj))
(PLUGIN, inst_obj.resource_name,
inst_obj.entity_id, inst_obj))
collectd.debug("%s monitoring %s %s %s" %
(PLUGIN,
inst_obj.resource_name,
inst_obj.instance_name,
inst_obj.reading_type))
collectd.info("%s monitoring %s %s %s" %
(PLUGIN,
inst_obj.resource_name,
inst_obj.instance_name,
inst_obj.reading_type))
return inst_obj
@ -887,7 +910,11 @@ def _build_entity_id(plugin, plugin_instance):
entity_id = 'host='
entity_id += PluginObject.host
if plugin == PLUGIN__VSWITCH_MEM:
if plugin == PLUGIN__MEM:
if plugin_instance != 'platform':
entity_id += '.numa=' + plugin_instance
elif plugin == PLUGIN__VSWITCH_MEM:
# host=<hostname>.processor=<socket-id>
if plugin_instance:
@ -929,15 +956,6 @@ def _build_entity_id(plugin, plugin_instance):
instance = instance.replace('-', '/')
entity_id += instance
# Will be uncommented when the numa memory monitor is added
# to the platform memory plugin.
#
#elif plugin == PLUGIN__MEM:
# if plugin_instance is not 'platform':
# # host=controller-0.numa=node0
# entity_id += '.numa='
# entity_id += plugin_instance
if inst_error is True:
collectd.error("%s eid build failed ; missing instance" % plugin)
return None
@ -953,7 +971,7 @@ def _get_df_mountpoints():
if not os.path.exists(conf_file):
collectd.error("%s cannot create filesystem "
"instance objects ; missing : %s" %
(PLUGIN, conf_file))
(PLUGIN, conf_file))
return FAIL
mountpoints = []
@ -1158,7 +1176,7 @@ def _clear_alarm_for_missing_filesystems():
df_base_obj._manage_alarm(obj.entity_id, "okay")
else:
collectd.debug("%s maintaining alarm for %s" %
(PLUGIN, path))
(PLUGIN, path))
# Collectd calls this function on startup.
@ -1207,7 +1225,9 @@ def init_func():
obj._create_instance_objects()
# ntp query is for controllers only
if tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:
if want_vswitch is False:
collectd.debug("%s vSwitch monitoring disabled" % PLUGIN)
elif tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:
#######################################################################
@ -1406,13 +1426,13 @@ def notifier_func(nObject):
collectd.debug("%s %s lock" % (PLUGIN, nObject.plugin))
PluginObject.lock.acquire()
#collectd.info("%s Object Search eid: %s" %
# (nObject.plugin, eid))
# collectd.info("%s Object Search eid: %s" %
# (nObject.plugin, eid))
#for o in base_obj.instance_objects:
# collectd.error("%s %s inst object dict item %s : %s" %
# (PLUGIN, nObject.plugin, o,
# base_obj.instance_objects[o]))
# for o in base_obj.instance_objects:
# collectd.error("%s %s inst object dict item %s : %s" %
# (PLUGIN, nObject.plugin, o,
# base_obj.instance_objects[o]))
# we will take an exception if this object is not in the list.
# the exception handling code below will create and add this
@ -1434,14 +1454,14 @@ def notifier_func(nObject):
inst_obj = base_obj._get_instance_object(eid)
if inst_obj:
collectd.debug("%s %s:%s inst object created" %
(PLUGIN,
inst_obj.plugin,
inst_obj.instance))
(PLUGIN,
inst_obj.plugin,
inst_obj.instance))
else:
collectd.error("%s %s:%s inst object create failed" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
return 0
# re-assign the object
@ -1457,7 +1477,7 @@ def notifier_func(nObject):
else:
collectd.debug("%s notification for unknown plugin: %s %s" %
(PLUGIN, nObject.plugin, nObject.plugin_instance))
(PLUGIN, nObject.plugin, nObject.plugin_instance))
return 0
# if obj.warnings or obj.failures:
@ -1503,11 +1523,11 @@ def notifier_func(nObject):
# if this is a threshold alarm then build the reason text that
# includes the threahold and the reading that caused the assertion.
reason = obj.resource_name
reason += " threshold exceeded"
if obj.threshold:
reason += "; threshold {:2.0f} ".format(obj.threshold) + "%, "
reason += " threshold exceeded ;"
if obj.threshold != INVALID_THRESHOLD:
reason += " threshold {:2.0f}".format(obj.threshold) + "%,"
if obj.value:
reason += "actual {:2.0f}".format(obj.value) + "%"
reason += " actual {:2.0f}".format(obj.value) + "%"
elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
reason = obj.reason_failure
@ -1538,14 +1558,13 @@ def notifier_func(nObject):
# update the lists now that
base_obj._manage_alarm(obj.entity_id, severity_str)
collectd.info("%s %s alarm %s:%s %s:%s thld:%2.2f value:%2.2f" % (
collectd.info("%s %s alarm %s:%s %s:%s value:%2.2f" % (
PLUGIN,
_alarm_state,
base_obj.id,
severity_str,
obj.instance,
obj.entity_id,
obj.threshold,
obj.value))
# Debug only: comment out for production code.

View File

@ -12,8 +12,8 @@
Instance "used"
Persist true
PersistOK true
WarningMax 80.00
FailureMax 90.00
WarningMax 79.00
FailureMax 89.00
Hits 2
Invert false
</Type>

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -19,11 +19,9 @@ import collectd
debug = False
# general return codes
PASS = 0
FAIL = 1
PLUGIN = 'platform memory usage'
PLUGIN_NUMA = 'numa memory usage'
PLUGIN_HUGE = 'hugepage memory usage'
# CPU Control class
@ -41,8 +39,10 @@ class MEM:
CommitLimit = 0
Committed_AS = 0
HugePages_Total = 0
HugePages_Free = 0
Hugepagesize = 0
AnonPages = 0
FilePages = 0
# derived values
avail = 0
@ -54,6 +54,27 @@ class MEM:
obj = MEM()
def log_meminfo(plugin, name, meminfo):
    """Write the supplied meminfo counters to the collectd info log.

    Nothing is logged unless the module-level 'debug' flag is enabled.

    plugin  -- prefix string placed at the start of every log line
    name    -- label logged as the heading for this set of counters
    meminfo -- object whose counter attributes listed below are read
    """
    if debug is False:
        return

    # One entry per log line, in output order:
    #   (format string, name of the meminfo attribute it reports)
    counters = (
        ("%s memTotal_kB : %f", "memTotal_kB"),
        ("%s memFree_kB : %f", "memFree_kB"),
        ("%s Buffers : %f", "buffers"),
        ("%s Cached : %f", "cached"),
        ("%s SReclaimable : %f", "SReclaimable"),
        ("%s CommitLimit : %f", "CommitLimit"),
        ("%s Committed_AS : %f", "Committed_AS"),
        ("%s HugePages_Total: %f", "HugePages_Total"),
        ("%s HugePages_Free : %f", "HugePages_Free"),
        ("%s Hugepagesize : %f", "Hugepagesize"),
        ("%s AnonPages : %f", "AnonPages"),
    )

    # heading followed by a separator line
    collectd.info("%s %s" % (plugin, name))
    collectd.info("%s ---------------------------" % plugin)

    # attributes are read lazily, one per line, just before logging
    for line_format, attribute in counters:
        collectd.info(line_format % (plugin, getattr(meminfo, attribute)))
def config_func(config):
"""
Configure the memory usage plugin
@ -110,7 +131,12 @@ def read_func():
except EnvironmentError as e:
collectd.error("%s unable to read from %s ; str(e)" %
(PLUGIN, str(e)))
return FAIL
return 0
# setup the sample structure
val = collectd.Values(host=obj.hostname)
val.type = 'percent'
val.type_instance = 'used'
# remove the 'unit' (kB) suffix that might be on some of the lines
for line in meminfo:
@ -130,20 +156,11 @@ def read_func():
obj.CommitLimit = float(meminfo['CommitLimit'])
obj.Committed_AS = float(meminfo['Committed_AS'])
obj.HugePages_Total = float(meminfo['HugePages_Total'])
obj.HugePages_Free = float(meminfo['HugePages_Free'])
obj.Hugepagesize = float(meminfo['Hugepagesize'])
obj.AnonPages = float(meminfo['AnonPages'])
# collectd.info("%s /proc/meminfo: %s" % (PLUGIN, meminfo))
# collectd.info("%s ---------------------------" % PLUGIN)
# collectd.info("%s memTotal_kB : %f" % (PLUGIN, obj.memTotal_kB))
# collectd.info("%s memFree_kB : %f" % (PLUGIN, obj.memFree_kB))
# collectd.info("%s Buffers : %f" % (PLUGIN, obj.buffers))
# collectd.info("%s Cached : %f" % (PLUGIN, obj.cached))
# collectd.info("%s SReclaimable : %f" % (PLUGIN, obj.SReclaimable))
# collectd.info("%s CommitLimit : %f" % (PLUGIN, obj.CommitLimit))
# collectd.info("%s Committed_AS : %f" % (PLUGIN, obj.Committed_AS))
# collectd.info("%s HugePages_Total: %f" % (PLUGIN, obj.HugePages_Total))
# collectd.info("%s AnonPages : %f" % (PLUGIN, obj.AnonPages))
log_meminfo(PLUGIN, "/proc/meminfo", obj)
obj.avail = float(float(obj.memFree_kB) +
float(obj.buffers) +
@ -152,38 +169,93 @@ def read_func():
obj.total = float(float(obj.avail) +
float(obj.AnonPages))
# collectd.info("%s ---------------------------" % PLUGIN)
# collectd.info("%s memTotal: %d" % (PLUGIN, obj.avail))
# collectd.info("%s memAvail: %d" % (PLUGIN, obj.total))
if obj.strict == 1:
obj.value = float(float(obj.Committed_AS) / float(obj.CommitLimit))
else:
obj.value = float(float(obj.AnonPages) / float(obj.total))
obj.value = float(float(obj.value) * 100)
# get numa node memory
# numa_node_files = []
# fn = "/sys/devices/system/node/"
# files = os.listdir(fn)
# for file in files:
# if 'node' in file:
# numa_node_files.append(fn + file)
# collectd.info("%s numa node files: %s" %
# (PLUGIN, numa_node_files))
collectd.debug('%s reports %.2f %% usage' %
(PLUGIN, obj.value))
if debug is True:
collectd.info("%s ---------------------------" % PLUGIN)
collectd.info("%s memAvail: %d" % (PLUGIN, obj.avail))
collectd.info("%s memTotal: %d" % (PLUGIN, obj.total))
collectd.info('%s reports %.2f %% usage' % (PLUGIN, obj.value))
# Dispatch usage value to collectd
val = collectd.Values(host=obj.hostname)
val.plugin = 'memory'
val.type = 'percent'
val.type_instance = 'used'
val.plugin_instance = 'platform'
val.dispatch(values=[obj.value])
return PASS
#####################################################################
# Now get the Numa Node Memory Usage
#####################################################################
numa_node_files = []
fn = "/sys/devices/system/node/"
files = os.listdir(fn)
for file in files:
if 'node' in file:
numa_node_files.append(fn + file + '/meminfo')
for numa_node in numa_node_files:
meminfo = {}
try:
with open(numa_node) as fd:
for line in fd:
meminfo[line.split()[2][0:-1]] = line.split()[3].strip()
obj.memFree_kB = float(meminfo['MemFree'])
obj.FilePages = float(meminfo['FilePages'])
obj.SReclaimable = float(meminfo['SReclaimable'])
obj.AnonPages = float(meminfo['AnonPages'])
obj.HugePages_Total = float(meminfo['HugePages_Total'])
obj.HugePages_Free = float(meminfo['HugePages_Free'])
log_meminfo(PLUGIN, numa_node, obj)
avail = float(float(obj.memFree_kB) +
float(obj.FilePages) +
float(obj.SReclaimable))
total = float(float(avail) +
float(obj.AnonPages))
obj.value = float(float(obj.AnonPages)) / float(total)
obj.value = float(float(obj.value) * 100)
# Dispatch usage value to collectd for this numa node
val.plugin_instance = numa_node.split('/')[5]
val.dispatch(values=[obj.value])
collectd.debug('%s reports %s at %.2f %% usage (%s)' %
(PLUGIN_NUMA,
val.plugin,
obj.value,
val.plugin_instance))
# Numa Node Huge Page Memory Monitoring
#
# Only monitor if there is Huge Page Memory
if obj.HugePages_Total > 0:
obj.value = \
float(float(obj.HugePages_Total -
obj.HugePages_Free)) / \
float(obj.HugePages_Total)
obj.value = float(float(obj.value) * 100)
# Dispatch huge page memory usage value
# to collectd for this numa node.
val.plugin_instance = numa_node.split('/')[5] + '_hugepages'
val.dispatch(values=[obj.value])
collectd.debug('%s reports %s at %.2f %% usage (%s)' %
(PLUGIN_HUGE,
val.plugin,
obj.value,
val.plugin_instance))
except EnvironmentError as e:
collectd.error("%s unable to read from %s ; str(e)" %
(PLUGIN_NUMA, str(e)))
return 0
collectd.register_config(config_func)

View File

@ -1,8 +1,10 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#############################################################################
#
# This file is the collectd 'Maintenance' Notifier.
#
# Collectd provides information about each event as an object passed to the
@ -50,10 +52,6 @@ NOTIF_FAILURE = 1
NOTIF_WARNING = 2
NOTIF_OKAY = 4
# generic return codes
PASS = 0
FAIL = 1
# default mtce port.
# ... with configuration override
MTCE_CMD_RX_PORT = 2101
@ -292,7 +290,7 @@ def notifier_func(nObject):
else:
collectd.info("%s unsupported severity %d" %
(PLUGIN, nObject.severity))
return FAIL
return 0
# running counter of notifications.
obj.msg_throttle += 1
@ -374,7 +372,7 @@ def notifier_func(nObject):
mtce_socket.close()
else:
collectd.error("%s %s failed to open socket (%s)" %
(PLUGIN, resource, obj.addr))
(PLUGIN, resource, obj.addr))
except socket.error as e:
if e.args[0] == socket.EAI_ADDRFAMILY:
# Handle IPV4 to IPV6 switchover:
@ -383,7 +381,7 @@ def notifier_func(nObject):
(PLUGIN, resource, obj.addr))
else:
collectd.error("%s %s socket error (%s) ; %s" %
(PLUGIN, resource, obj.addr, str(e)))
(PLUGIN, resource, obj.addr, str(e)))
# try self correction
obj.addr = None
obj.protocol = socket.AF_INET

View File

@ -118,15 +118,15 @@ def _add_unreachable_server(ip=None):
if ip:
if ip not in obj.unreachable_servers:
collectd.debug("%s adding '%s' to unreachable servers list: %s" %
(PLUGIN, ip, obj.unreachable_servers))
(PLUGIN, ip, obj.unreachable_servers))
obj.unreachable_servers.append(ip)
collectd.info("%s added '%s' to unreachable servers list: %s" %
(PLUGIN, ip, obj.unreachable_servers))
(PLUGIN, ip, obj.unreachable_servers))
else:
collectd.debug("%s ip '%s' already in unreachable_servers list" %
(PLUGIN, ip))
(PLUGIN, ip))
else:
collectd.error("%s _add_unreachable_server called with no IP" % PLUGIN)
@ -323,7 +323,7 @@ def _cleanup_stale_servers():
""" Cleanup the server IP tracking lists """
collectd.debug("%s CLEANUP REACHABLE: %s %s" %
(PLUGIN, obj.server_list_ntpq, obj.reachable_servers))
(PLUGIN, obj.server_list_ntpq, obj.reachable_servers))
for ip in obj.reachable_servers:
if ip not in obj.server_list_ntpq:
collectd.info("%s removing missing '%s' server from reachable "
@ -506,7 +506,7 @@ def init_func():
obj.base_eid = 'host=' + obj.hostname + '.ntp'
collectd.debug("%s on %s with entity id '%s'" %
(PLUGIN, obj.hostname, obj.base_eid))
(PLUGIN, obj.hostname, obj.base_eid))
# get a list of provisioned ntp servers
_get_ntp_servers()
@ -686,7 +686,7 @@ def read_func():
# update the selected server list
obj.selected_server = ip
collectd.debug("%s selected server is '%s'" %
(PLUGIN, obj.selected_server))
(PLUGIN, obj.selected_server))
else:
collectd.debug("%s local controller '%s' marked "
"as selected server ; ignoring" %