Add numa node and huge page memory monitoring
This update adds NUMA node and huge page memory monitoring to the existing
Platform Memory monitor collectd plugin.

Instance Mapping

Plugin Refinements                      Instance Name
-------------------------------------   ----------
Platform Memory                         platform
Platform Memory Numa Node 0             node0
Platform Memory Numa Node 1             node1
Platform Memory Numa Node 0 Huge Pages  node0_hugepages
Platform Memory Numa Node 1 Huge Pages  node1_hugepages

New Alarm Entity IDs added to existing 100.103 alarm ID

host=<hostname>.numa=node0
host=<hostname>.numa=node1
host=<hostname>.numa=node0_hugepages
host=<hostname>.numa=node1_hugepages

Modified the memory plugin thresholds and updated the alarm notifier to
account for collectd requiring samples to be greater than ('gt'), rather
than greater than or equal to ('ge'), the specified thresholds for a
severity change.

This update also corrects a few subtle pep8 warnings in a few of the
existing python plugins.

There is no need for an rmond update because numa and huge page monitoring
was never enabled in rmond.

Story: 2002823
Task: 29369

PASS: Verify logging of all memory instance types
PASS: Verify monitoring of new numa node memory
PASS: Verify monitoring of new numa node huge page memory
PASS: Verify memory instance alarm handling in fm notifier
PASS: Verify memory instance alarm load on startup
PASS: Verify memory instance alarm clear ; runtime condition gone
PASS: Verify memory instance alarm clear ; startup condition gone

Regression:

PASS: Verify End-To-End Sample Collection for all monitored resources.

Corner Case:

PASS: Verify alarm reporting with threshold of zero
PROG: Verify memory alarm raised at threshold value
PASS: Verify memory alarm cleared 1 below threshold value
PASS: Verify above case for both major and critical thresholds

Change-Id: I4e2612ac7b3d906be4b0a140286dbbb095ce7e1b
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
fab989b5bc
commit
4dadf61bea
|
@ -1,5 +1,5 @@
|
||||||
#
|
#
|
||||||
# Copyright (c) 2018 Wind River Systems, Inc.
|
# Copyright (c) 2018-2019 Wind River Systems, Inc.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
#
|
||||||
|
@ -179,7 +179,7 @@ def read_func():
|
||||||
_schedstat))
|
_schedstat))
|
||||||
else:
|
else:
|
||||||
collectd.error('%s unsupported schedstat version [%d]' %
|
collectd.error('%s unsupported schedstat version [%d]' %
|
||||||
(PLUGIN, c.version))
|
(PLUGIN, c.version))
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
f.close()
|
f.close()
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#
|
#
|
||||||
# Copyright (c) 2018 Wind River Systems, Inc.
|
# Copyright (c) 2018-2019 Wind River Systems, Inc.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
#
|
||||||
|
@ -101,6 +101,7 @@ api = fm_api.FaultAPIs()
|
||||||
debug = False
|
debug = False
|
||||||
debug_lists = False
|
debug_lists = False
|
||||||
want_state_audit = False
|
want_state_audit = False
|
||||||
|
want_vswitch = False
|
||||||
|
|
||||||
# number of notifier loops before the state is object dumped
|
# number of notifier loops before the state is object dumped
|
||||||
DEBUG_AUDIT = 2
|
DEBUG_AUDIT = 2
|
||||||
|
@ -122,6 +123,8 @@ DATABASE_NAME = 'collectd samples'
|
||||||
|
|
||||||
READING_TYPE__PERCENT_USAGE = '% usage'
|
READING_TYPE__PERCENT_USAGE = '% usage'
|
||||||
|
|
||||||
|
# Default invalid threshold value
|
||||||
|
INVALID_THRESHOLD = float(-1)
|
||||||
|
|
||||||
# collectd severity definitions ;
|
# collectd severity definitions ;
|
||||||
# Note: can't seem to pull then in symbolically with a header
|
# Note: can't seem to pull then in symbolically with a header
|
||||||
|
@ -230,8 +233,10 @@ class PluginObject:
|
||||||
|
|
||||||
# [ 'float value string','float threshold string]
|
# [ 'float value string','float threshold string]
|
||||||
self.values = []
|
self.values = []
|
||||||
self.threshold = float(0) # float value of threshold
|
self.value = float(0) # float value of reading
|
||||||
self.value = float(0) # float value of reading
|
|
||||||
|
# float value of threshold
|
||||||
|
self.threshold = float(INVALID_THRESHOLD)
|
||||||
|
|
||||||
# Common static class members.
|
# Common static class members.
|
||||||
self.reason_warning = ""
|
self.reason_warning = ""
|
||||||
|
@ -332,7 +337,8 @@ class PluginObject:
|
||||||
# Purpose : Manage sample value change.
|
# Purpose : Manage sample value change.
|
||||||
#
|
#
|
||||||
# Handle no sample update case.
|
# Handle no sample update case.
|
||||||
# Parse the notification log
|
# Parse the notification log.
|
||||||
|
# Handle base object instances.
|
||||||
# Generate a log entry if the sample value changes more than
|
# Generate a log entry if the sample value changes more than
|
||||||
# step value.
|
# step value.
|
||||||
#
|
#
|
||||||
|
@ -384,10 +390,22 @@ class PluginObject:
|
||||||
# get the threshold if its there.
|
# get the threshold if its there.
|
||||||
if len(self.values) > 1:
|
if len(self.values) > 1:
|
||||||
self.threshold = float(self.values[1])
|
self.threshold = float(self.values[1])
|
||||||
|
if nObject.plugin == PLUGIN__MEM:
|
||||||
|
if self.reading_type == READING_TYPE__PERCENT_USAGE:
|
||||||
|
# Note: add one to % usage reading types so that it
|
||||||
|
# matches how rmond did it. In collectd an
|
||||||
|
# overage is over the specified threshold
|
||||||
|
# whereas in rmon an overage is at threshold
|
||||||
|
# or above.
|
||||||
|
self.threshold = float(self.values[1]) + 1
|
||||||
|
else:
|
||||||
|
self.threshold = float(self.values[1])
|
||||||
|
else:
|
||||||
|
self.threshold = float(INVALID_THRESHOLD) # invalid value
|
||||||
|
|
||||||
except ValueError as ex:
|
except ValueError as ex:
|
||||||
collectd.error("%s %s value not integer or float (%s) (%s)" %
|
collectd.error("%s %s value not integer or float (%s) (%s)" %
|
||||||
(PLUGIN, self.entity_id, self.value, str(ex)))
|
(PLUGIN, self.entity_id, self.value, str(ex)))
|
||||||
return "done"
|
return "done"
|
||||||
except TypeError as ex:
|
except TypeError as ex:
|
||||||
collectd.info("%s %s value has no type (%s)" %
|
collectd.info("%s %s value has no type (%s)" %
|
||||||
|
@ -428,6 +446,11 @@ class PluginObject:
|
||||||
if self.plugin == PLUGIN__DF:
|
if self.plugin == PLUGIN__DF:
|
||||||
resource = self.instance
|
resource = self.instance
|
||||||
|
|
||||||
|
elif self.plugin == PLUGIN__MEM:
|
||||||
|
if self.instance_name:
|
||||||
|
if self.instance_name != 'platform':
|
||||||
|
resource += ' ' + self.instance_name
|
||||||
|
|
||||||
# setup resource name for vswitch process instance name
|
# setup resource name for vswitch process instance name
|
||||||
elif self.plugin == PLUGIN__VSWITCH_MEM:
|
elif self.plugin == PLUGIN__VSWITCH_MEM:
|
||||||
resource += ' Processor '
|
resource += ' Processor '
|
||||||
|
@ -696,7 +719,7 @@ class PluginObject:
|
||||||
self.instance_objects[eid] = obj
|
self.instance_objects[eid] = obj
|
||||||
except:
|
except:
|
||||||
collectd.error("%s failed to add instance to %s object list" %
|
collectd.error("%s failed to add instance to %s object list" %
|
||||||
(PLUGIN, self.plugin))
|
(PLUGIN, self.plugin))
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
collectd.debug("%s %s Add UnLock ..." % (PLUGIN, self.plugin))
|
collectd.debug("%s %s Add UnLock ..." % (PLUGIN, self.plugin))
|
||||||
|
@ -750,14 +773,14 @@ class PluginObject:
|
||||||
self._add_instance_object(inst_obj, inst_obj.entity_id)
|
self._add_instance_object(inst_obj, inst_obj.entity_id)
|
||||||
|
|
||||||
collectd.debug("%s created %s instance (%s) object %s" %
|
collectd.debug("%s created %s instance (%s) object %s" %
|
||||||
(PLUGIN, inst_obj.resource_name,
|
(PLUGIN, inst_obj.resource_name,
|
||||||
inst_obj.entity_id, inst_obj))
|
inst_obj.entity_id, inst_obj))
|
||||||
|
|
||||||
collectd.debug("%s monitoring %s %s %s" %
|
collectd.info("%s monitoring %s %s %s" %
|
||||||
(PLUGIN,
|
(PLUGIN,
|
||||||
inst_obj.resource_name,
|
inst_obj.resource_name,
|
||||||
inst_obj.instance_name,
|
inst_obj.instance_name,
|
||||||
inst_obj.reading_type))
|
inst_obj.reading_type))
|
||||||
|
|
||||||
return inst_obj
|
return inst_obj
|
||||||
|
|
||||||
|
@ -887,7 +910,11 @@ def _build_entity_id(plugin, plugin_instance):
|
||||||
entity_id = 'host='
|
entity_id = 'host='
|
||||||
entity_id += PluginObject.host
|
entity_id += PluginObject.host
|
||||||
|
|
||||||
if plugin == PLUGIN__VSWITCH_MEM:
|
if plugin == PLUGIN__MEM:
|
||||||
|
if plugin_instance != 'platform':
|
||||||
|
entity_id += '.numa=' + plugin_instance
|
||||||
|
|
||||||
|
elif plugin == PLUGIN__VSWITCH_MEM:
|
||||||
|
|
||||||
# host=<hostname>.processor=<socket-id>
|
# host=<hostname>.processor=<socket-id>
|
||||||
if plugin_instance:
|
if plugin_instance:
|
||||||
|
@ -929,15 +956,6 @@ def _build_entity_id(plugin, plugin_instance):
|
||||||
instance = instance.replace('-', '/')
|
instance = instance.replace('-', '/')
|
||||||
entity_id += instance
|
entity_id += instance
|
||||||
|
|
||||||
# Will be uncommented when the numa memory monitor is added
|
|
||||||
# to the platform memory plugin.
|
|
||||||
#
|
|
||||||
#elif plugin == PLUGIN__MEM:
|
|
||||||
# if plugin_instance is not 'platform':
|
|
||||||
# # host=controller-0.numa=node0
|
|
||||||
# entity_id += '.numa='
|
|
||||||
# entity_id += plugin_instance
|
|
||||||
|
|
||||||
if inst_error is True:
|
if inst_error is True:
|
||||||
collectd.error("%s eid build failed ; missing instance" % plugin)
|
collectd.error("%s eid build failed ; missing instance" % plugin)
|
||||||
return None
|
return None
|
||||||
|
@ -953,7 +971,7 @@ def _get_df_mountpoints():
|
||||||
if not os.path.exists(conf_file):
|
if not os.path.exists(conf_file):
|
||||||
collectd.error("%s cannot create filesystem "
|
collectd.error("%s cannot create filesystem "
|
||||||
"instance objects ; missing : %s" %
|
"instance objects ; missing : %s" %
|
||||||
(PLUGIN, conf_file))
|
(PLUGIN, conf_file))
|
||||||
return FAIL
|
return FAIL
|
||||||
|
|
||||||
mountpoints = []
|
mountpoints = []
|
||||||
|
@ -1158,7 +1176,7 @@ def _clear_alarm_for_missing_filesystems():
|
||||||
df_base_obj._manage_alarm(obj.entity_id, "okay")
|
df_base_obj._manage_alarm(obj.entity_id, "okay")
|
||||||
else:
|
else:
|
||||||
collectd.debug("%s maintaining alarm for %s" %
|
collectd.debug("%s maintaining alarm for %s" %
|
||||||
(PLUGIN, path))
|
(PLUGIN, path))
|
||||||
|
|
||||||
|
|
||||||
# Collectd calls this function on startup.
|
# Collectd calls this function on startup.
|
||||||
|
@ -1207,7 +1225,9 @@ def init_func():
|
||||||
obj._create_instance_objects()
|
obj._create_instance_objects()
|
||||||
|
|
||||||
# ntp query is for controllers only
|
# ntp query is for controllers only
|
||||||
if tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:
|
if want_vswitch is False:
|
||||||
|
collectd.debug("%s vSwitch monitoring disabled" % PLUGIN)
|
||||||
|
elif tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:
|
||||||
|
|
||||||
#######################################################################
|
#######################################################################
|
||||||
|
|
||||||
|
@ -1406,13 +1426,13 @@ def notifier_func(nObject):
|
||||||
collectd.debug("%s %s lock" % (PLUGIN, nObject.plugin))
|
collectd.debug("%s %s lock" % (PLUGIN, nObject.plugin))
|
||||||
PluginObject.lock.acquire()
|
PluginObject.lock.acquire()
|
||||||
|
|
||||||
#collectd.info("%s Object Search eid: %s" %
|
# collectd.info("%s Object Search eid: %s" %
|
||||||
# (nObject.plugin, eid))
|
# (nObject.plugin, eid))
|
||||||
|
|
||||||
#for o in base_obj.instance_objects:
|
# for o in base_obj.instance_objects:
|
||||||
# collectd.error("%s %s inst object dict item %s : %s" %
|
# collectd.error("%s %s inst object dict item %s : %s" %
|
||||||
# (PLUGIN, nObject.plugin, o,
|
# (PLUGIN, nObject.plugin, o,
|
||||||
# base_obj.instance_objects[o]))
|
# base_obj.instance_objects[o]))
|
||||||
|
|
||||||
# we will take an exception if this object is not in the list.
|
# we will take an exception if this object is not in the list.
|
||||||
# the exception handling code below will create and add this
|
# the exception handling code below will create and add this
|
||||||
|
@ -1434,14 +1454,14 @@ def notifier_func(nObject):
|
||||||
inst_obj = base_obj._get_instance_object(eid)
|
inst_obj = base_obj._get_instance_object(eid)
|
||||||
if inst_obj:
|
if inst_obj:
|
||||||
collectd.debug("%s %s:%s inst object created" %
|
collectd.debug("%s %s:%s inst object created" %
|
||||||
(PLUGIN,
|
(PLUGIN,
|
||||||
inst_obj.plugin,
|
inst_obj.plugin,
|
||||||
inst_obj.instance))
|
inst_obj.instance))
|
||||||
else:
|
else:
|
||||||
collectd.error("%s %s:%s inst object create failed" %
|
collectd.error("%s %s:%s inst object create failed" %
|
||||||
(PLUGIN,
|
(PLUGIN,
|
||||||
nObject.plugin,
|
nObject.plugin,
|
||||||
nObject.plugin_instance))
|
nObject.plugin_instance))
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# re-assign the object
|
# re-assign the object
|
||||||
|
@ -1457,7 +1477,7 @@ def notifier_func(nObject):
|
||||||
|
|
||||||
else:
|
else:
|
||||||
collectd.debug("%s notification for unknown plugin: %s %s" %
|
collectd.debug("%s notification for unknown plugin: %s %s" %
|
||||||
(PLUGIN, nObject.plugin, nObject.plugin_instance))
|
(PLUGIN, nObject.plugin, nObject.plugin_instance))
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# if obj.warnings or obj.failures:
|
# if obj.warnings or obj.failures:
|
||||||
|
@ -1503,11 +1523,11 @@ def notifier_func(nObject):
|
||||||
# if this is a threshold alarm then build the reason text that
|
# if this is a threshold alarm then build the reason text that
|
||||||
# includes the threahold and the reading that caused the assertion.
|
# includes the threahold and the reading that caused the assertion.
|
||||||
reason = obj.resource_name
|
reason = obj.resource_name
|
||||||
reason += " threshold exceeded"
|
reason += " threshold exceeded ;"
|
||||||
if obj.threshold:
|
if obj.threshold != INVALID_THRESHOLD:
|
||||||
reason += "; threshold {:2.0f} ".format(obj.threshold) + "%, "
|
reason += " threshold {:2.0f}".format(obj.threshold) + "%,"
|
||||||
if obj.value:
|
if obj.value:
|
||||||
reason += "actual {:2.0f}".format(obj.value) + "%"
|
reason += " actual {:2.0f}".format(obj.value) + "%"
|
||||||
|
|
||||||
elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
|
elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
|
||||||
reason = obj.reason_failure
|
reason = obj.reason_failure
|
||||||
|
@ -1538,14 +1558,13 @@ def notifier_func(nObject):
|
||||||
# update the lists now that
|
# update the lists now that
|
||||||
base_obj._manage_alarm(obj.entity_id, severity_str)
|
base_obj._manage_alarm(obj.entity_id, severity_str)
|
||||||
|
|
||||||
collectd.info("%s %s alarm %s:%s %s:%s thld:%2.2f value:%2.2f" % (
|
collectd.info("%s %s alarm %s:%s %s:%s value:%2.2f" % (
|
||||||
PLUGIN,
|
PLUGIN,
|
||||||
_alarm_state,
|
_alarm_state,
|
||||||
base_obj.id,
|
base_obj.id,
|
||||||
severity_str,
|
severity_str,
|
||||||
obj.instance,
|
obj.instance,
|
||||||
obj.entity_id,
|
obj.entity_id,
|
||||||
obj.threshold,
|
|
||||||
obj.value))
|
obj.value))
|
||||||
|
|
||||||
# Debug only: comment out for production code.
|
# Debug only: comment out for production code.
|
||||||
|
|
|
@ -12,8 +12,8 @@
|
||||||
Instance "used"
|
Instance "used"
|
||||||
Persist true
|
Persist true
|
||||||
PersistOK true
|
PersistOK true
|
||||||
WarningMax 80.00
|
WarningMax 79.00
|
||||||
FailureMax 90.00
|
FailureMax 89.00
|
||||||
Hits 2
|
Hits 2
|
||||||
Invert false
|
Invert false
|
||||||
</Type>
|
</Type>
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#
|
#
|
||||||
# Copyright (c) 2018 Wind River Systems, Inc.
|
# Copyright (c) 2018-2019 Wind River Systems, Inc.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
#
|
||||||
|
@ -19,11 +19,9 @@ import collectd
|
||||||
|
|
||||||
debug = False
|
debug = False
|
||||||
|
|
||||||
# general return codes
|
|
||||||
PASS = 0
|
|
||||||
FAIL = 1
|
|
||||||
|
|
||||||
PLUGIN = 'platform memory usage'
|
PLUGIN = 'platform memory usage'
|
||||||
|
PLUGIN_NUMA = 'numa memory usage'
|
||||||
|
PLUGIN_HUGE = 'hugepage memory usage'
|
||||||
|
|
||||||
|
|
||||||
# CPU Control class
|
# CPU Control class
|
||||||
|
@ -41,8 +39,10 @@ class MEM:
|
||||||
CommitLimit = 0
|
CommitLimit = 0
|
||||||
Committed_AS = 0
|
Committed_AS = 0
|
||||||
HugePages_Total = 0
|
HugePages_Total = 0
|
||||||
|
HugePages_Free = 0
|
||||||
Hugepagesize = 0
|
Hugepagesize = 0
|
||||||
AnonPages = 0
|
AnonPages = 0
|
||||||
|
FilePages = 0
|
||||||
|
|
||||||
# derived values
|
# derived values
|
||||||
avail = 0
|
avail = 0
|
||||||
|
@ -54,6 +54,27 @@ class MEM:
|
||||||
obj = MEM()
|
obj = MEM()
|
||||||
|
|
||||||
|
|
||||||
|
def log_meminfo(plugin, name, meminfo):
    """Log the readings held by a meminfo object ; debug aid only.

    Nothing is logged while the module-level ``debug`` flag is False.

    :param plugin: plugin label that prefixes every log line
    :param name: label logged on the header line ahead of the readings
    :param meminfo: object exposing the memory reading attributes
                    named in the table below
    """

    if debug is False:
        return

    # (format string, meminfo attribute name) in the order they are logged
    readings = (
        ("%s memTotal_kB : %f", "memTotal_kB"),
        ("%s memFree_kB : %f", "memFree_kB"),
        ("%s Buffers : %f", "buffers"),
        ("%s Cached : %f", "cached"),
        ("%s SReclaimable : %f", "SReclaimable"),
        ("%s CommitLimit : %f", "CommitLimit"),
        ("%s Committed_AS : %f", "Committed_AS"),
        ("%s HugePages_Total: %f", "HugePages_Total"),
        ("%s HugePages_Free : %f", "HugePages_Free"),
        ("%s Hugepagesize : %f", "Hugepagesize"),
        ("%s AnonPages : %f", "AnonPages"),
    )

    # header ; source label followed by a separator line
    collectd.info("%s %s" % (plugin, name))
    collectd.info("%s ---------------------------" % plugin)

    # each attribute is only looked up when its own line is about to be logged
    for fmt, attr in readings:
        collectd.info(fmt % (plugin, getattr(meminfo, attr)))
|
||||||
|
|
||||||
|
|
||||||
def config_func(config):
|
def config_func(config):
|
||||||
"""
|
"""
|
||||||
Configure the memory usage plugin
|
Configure the memory usage plugin
|
||||||
|
@ -110,7 +131,12 @@ def read_func():
|
||||||
except EnvironmentError as e:
|
except EnvironmentError as e:
|
||||||
collectd.error("%s unable to read from %s ; str(e)" %
|
collectd.error("%s unable to read from %s ; str(e)" %
|
||||||
(PLUGIN, str(e)))
|
(PLUGIN, str(e)))
|
||||||
return FAIL
|
return 0
|
||||||
|
|
||||||
|
# setup the sample structure
|
||||||
|
val = collectd.Values(host=obj.hostname)
|
||||||
|
val.type = 'percent'
|
||||||
|
val.type_instance = 'used'
|
||||||
|
|
||||||
# remove the 'unit' (kB) suffix that might be on some of the lines
|
# remove the 'unit' (kB) suffix that might be on some of the lines
|
||||||
for line in meminfo:
|
for line in meminfo:
|
||||||
|
@ -130,20 +156,11 @@ def read_func():
|
||||||
obj.CommitLimit = float(meminfo['CommitLimit'])
|
obj.CommitLimit = float(meminfo['CommitLimit'])
|
||||||
obj.Committed_AS = float(meminfo['Committed_AS'])
|
obj.Committed_AS = float(meminfo['Committed_AS'])
|
||||||
obj.HugePages_Total = float(meminfo['HugePages_Total'])
|
obj.HugePages_Total = float(meminfo['HugePages_Total'])
|
||||||
|
obj.HugePages_Free = float(meminfo['HugePages_Free'])
|
||||||
obj.Hugepagesize = float(meminfo['Hugepagesize'])
|
obj.Hugepagesize = float(meminfo['Hugepagesize'])
|
||||||
obj.AnonPages = float(meminfo['AnonPages'])
|
obj.AnonPages = float(meminfo['AnonPages'])
|
||||||
|
|
||||||
# collectd.info("%s /proc/meminfo: %s" % (PLUGIN, meminfo))
|
log_meminfo(PLUGIN, "/proc/meminfo", obj)
|
||||||
# collectd.info("%s ---------------------------" % PLUGIN)
|
|
||||||
# collectd.info("%s memTotal_kB : %f" % (PLUGIN, obj.memTotal_kB))
|
|
||||||
# collectd.info("%s memFree_kB : %f" % (PLUGIN, obj.memFree_kB))
|
|
||||||
# collectd.info("%s Buffers : %f" % (PLUGIN, obj.buffers))
|
|
||||||
# collectd.info("%s Cached : %f" % (PLUGIN, obj.cached))
|
|
||||||
# collectd.info("%s SReclaimable : %f" % (PLUGIN, obj.SReclaimable))
|
|
||||||
# collectd.info("%s CommitLimit : %f" % (PLUGIN, obj.CommitLimit))
|
|
||||||
# collectd.info("%s Committed_AS : %f" % (PLUGIN, obj.Committed_AS))
|
|
||||||
# collectd.info("%s HugePages_Total: %f" % (PLUGIN, obj.HugePages_Total))
|
|
||||||
# collectd.info("%s AnonPages : %f" % (PLUGIN, obj.AnonPages))
|
|
||||||
|
|
||||||
obj.avail = float(float(obj.memFree_kB) +
|
obj.avail = float(float(obj.memFree_kB) +
|
||||||
float(obj.buffers) +
|
float(obj.buffers) +
|
||||||
|
@ -152,38 +169,93 @@ def read_func():
|
||||||
obj.total = float(float(obj.avail) +
|
obj.total = float(float(obj.avail) +
|
||||||
float(obj.AnonPages))
|
float(obj.AnonPages))
|
||||||
|
|
||||||
# collectd.info("%s ---------------------------" % PLUGIN)
|
|
||||||
# collectd.info("%s memTotal: %d" % (PLUGIN, obj.avail))
|
|
||||||
# collectd.info("%s memAvail: %d" % (PLUGIN, obj.total))
|
|
||||||
|
|
||||||
if obj.strict == 1:
|
if obj.strict == 1:
|
||||||
obj.value = float(float(obj.Committed_AS) / float(obj.CommitLimit))
|
obj.value = float(float(obj.Committed_AS) / float(obj.CommitLimit))
|
||||||
else:
|
else:
|
||||||
obj.value = float(float(obj.AnonPages) / float(obj.total))
|
obj.value = float(float(obj.AnonPages) / float(obj.total))
|
||||||
|
|
||||||
obj.value = float(float(obj.value) * 100)
|
obj.value = float(float(obj.value) * 100)
|
||||||
|
|
||||||
# get numa node memory
|
if debug is True:
|
||||||
# numa_node_files = []
|
collectd.info("%s ---------------------------" % PLUGIN)
|
||||||
# fn = "/sys/devices/system/node/"
|
collectd.info("%s memAvail: %d" % (PLUGIN, obj.avail))
|
||||||
# files = os.listdir(fn)
|
collectd.info("%s memTotal: %d" % (PLUGIN, obj.total))
|
||||||
# for file in files:
|
collectd.info('%s reports %.2f %% usage' % (PLUGIN, obj.value))
|
||||||
# if 'node' in file:
|
|
||||||
# numa_node_files.append(fn + file)
|
|
||||||
# collectd.info("%s numa node files: %s" %
|
|
||||||
# (PLUGIN, numa_node_files))
|
|
||||||
|
|
||||||
collectd.debug('%s reports %.2f %% usage' %
|
|
||||||
(PLUGIN, obj.value))
|
|
||||||
|
|
||||||
# Dispatch usage value to collectd
|
# Dispatch usage value to collectd
|
||||||
val = collectd.Values(host=obj.hostname)
|
|
||||||
val.plugin = 'memory'
|
val.plugin = 'memory'
|
||||||
val.type = 'percent'
|
val.plugin_instance = 'platform'
|
||||||
val.type_instance = 'used'
|
|
||||||
val.dispatch(values=[obj.value])
|
val.dispatch(values=[obj.value])
|
||||||
|
|
||||||
return PASS
|
#####################################################################
|
||||||
|
# Now get the Numa Node Memory Usage
|
||||||
|
#####################################################################
|
||||||
|
numa_node_files = []
|
||||||
|
fn = "/sys/devices/system/node/"
|
||||||
|
files = os.listdir(fn)
|
||||||
|
for file in files:
|
||||||
|
if 'node' in file:
|
||||||
|
numa_node_files.append(fn + file + '/meminfo')
|
||||||
|
|
||||||
|
for numa_node in numa_node_files:
|
||||||
|
meminfo = {}
|
||||||
|
try:
|
||||||
|
with open(numa_node) as fd:
|
||||||
|
for line in fd:
|
||||||
|
meminfo[line.split()[2][0:-1]] = line.split()[3].strip()
|
||||||
|
|
||||||
|
obj.memFree_kB = float(meminfo['MemFree'])
|
||||||
|
obj.FilePages = float(meminfo['FilePages'])
|
||||||
|
obj.SReclaimable = float(meminfo['SReclaimable'])
|
||||||
|
obj.AnonPages = float(meminfo['AnonPages'])
|
||||||
|
obj.HugePages_Total = float(meminfo['HugePages_Total'])
|
||||||
|
obj.HugePages_Free = float(meminfo['HugePages_Free'])
|
||||||
|
|
||||||
|
log_meminfo(PLUGIN, numa_node, obj)
|
||||||
|
|
||||||
|
avail = float(float(obj.memFree_kB) +
|
||||||
|
float(obj.FilePages) +
|
||||||
|
float(obj.SReclaimable))
|
||||||
|
total = float(float(avail) +
|
||||||
|
float(obj.AnonPages))
|
||||||
|
obj.value = float(float(obj.AnonPages)) / float(total)
|
||||||
|
obj.value = float(float(obj.value) * 100)
|
||||||
|
|
||||||
|
# Dispatch usage value to collectd for this numa node
|
||||||
|
val.plugin_instance = numa_node.split('/')[5]
|
||||||
|
val.dispatch(values=[obj.value])
|
||||||
|
|
||||||
|
collectd.debug('%s reports %s at %.2f %% usage (%s)' %
|
||||||
|
(PLUGIN_NUMA,
|
||||||
|
val.plugin,
|
||||||
|
obj.value,
|
||||||
|
val.plugin_instance))
|
||||||
|
|
||||||
|
# Numa Node Huge Page Memory Monitoring
|
||||||
|
#
|
||||||
|
# Only monitor if there is Huge Page Memory
|
||||||
|
if obj.HugePages_Total > 0:
|
||||||
|
obj.value = \
|
||||||
|
float(float(obj.HugePages_Total -
|
||||||
|
obj.HugePages_Free)) / \
|
||||||
|
float(obj.HugePages_Total)
|
||||||
|
obj.value = float(float(obj.value) * 100)
|
||||||
|
|
||||||
|
# Dispatch huge page memory usage value
|
||||||
|
# to collectd for this numa node.
|
||||||
|
val.plugin_instance = numa_node.split('/')[5] + '_hugepages'
|
||||||
|
val.dispatch(values=[obj.value])
|
||||||
|
|
||||||
|
collectd.debug('%s reports %s at %.2f %% usage (%s)' %
|
||||||
|
(PLUGIN_HUGE,
|
||||||
|
val.plugin,
|
||||||
|
obj.value,
|
||||||
|
val.plugin_instance))
|
||||||
|
|
||||||
|
except EnvironmentError as e:
|
||||||
|
collectd.error("%s unable to read from %s ; str(e)" %
|
||||||
|
(PLUGIN_NUMA, str(e)))
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
collectd.register_config(config_func)
|
collectd.register_config(config_func)
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
#
|
#
|
||||||
# Copyright (c) 2018 Wind River Systems, Inc.
|
# Copyright (c) 2018-2019 Wind River Systems, Inc.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
#
|
||||||
|
#############################################################################
|
||||||
|
#
|
||||||
# This file is the collectd 'Maintenance' Notifier.
|
# This file is the collectd 'Maintenance' Notifier.
|
||||||
#
|
#
|
||||||
# Collects provides information about each event as an object passed to the
|
# Collects provides information about each event as an object passed to the
|
||||||
|
@ -50,10 +52,6 @@ NOTIF_FAILURE = 1
|
||||||
NOTIF_WARNING = 2
|
NOTIF_WARNING = 2
|
||||||
NOTIF_OKAY = 4
|
NOTIF_OKAY = 4
|
||||||
|
|
||||||
# generic return codes
|
|
||||||
PASS = 0
|
|
||||||
FAIL = 1
|
|
||||||
|
|
||||||
# default mtce port.
|
# default mtce port.
|
||||||
# ... with configuration override
|
# ... with configuration override
|
||||||
MTCE_CMD_RX_PORT = 2101
|
MTCE_CMD_RX_PORT = 2101
|
||||||
|
@ -292,7 +290,7 @@ def notifier_func(nObject):
|
||||||
else:
|
else:
|
||||||
collectd.info("%s unsupported severity %d" %
|
collectd.info("%s unsupported severity %d" %
|
||||||
(PLUGIN, nObject.severity))
|
(PLUGIN, nObject.severity))
|
||||||
return FAIL
|
return 0
|
||||||
|
|
||||||
# running counter of notifications.
|
# running counter of notifications.
|
||||||
obj.msg_throttle += 1
|
obj.msg_throttle += 1
|
||||||
|
@ -374,7 +372,7 @@ def notifier_func(nObject):
|
||||||
mtce_socket.close()
|
mtce_socket.close()
|
||||||
else:
|
else:
|
||||||
collectd.error("%s %s failed to open socket (%s)" %
|
collectd.error("%s %s failed to open socket (%s)" %
|
||||||
(PLUGIN, resource, obj.addr))
|
(PLUGIN, resource, obj.addr))
|
||||||
except socket.error as e:
|
except socket.error as e:
|
||||||
if e.args[0] == socket.EAI_ADDRFAMILY:
|
if e.args[0] == socket.EAI_ADDRFAMILY:
|
||||||
# Handle IPV4 to IPV6 switchover:
|
# Handle IPV4 to IPV6 switchover:
|
||||||
|
@ -383,7 +381,7 @@ def notifier_func(nObject):
|
||||||
(PLUGIN, resource, obj.addr))
|
(PLUGIN, resource, obj.addr))
|
||||||
else:
|
else:
|
||||||
collectd.error("%s %s socket error (%s) ; %s" %
|
collectd.error("%s %s socket error (%s) ; %s" %
|
||||||
(PLUGIN, resource, obj.addr, str(e)))
|
(PLUGIN, resource, obj.addr, str(e)))
|
||||||
# try self correction
|
# try self correction
|
||||||
obj.addr = None
|
obj.addr = None
|
||||||
obj.protocol = socket.AF_INET
|
obj.protocol = socket.AF_INET
|
||||||
|
|
|
@ -118,15 +118,15 @@ def _add_unreachable_server(ip=None):
|
||||||
if ip:
|
if ip:
|
||||||
if ip not in obj.unreachable_servers:
|
if ip not in obj.unreachable_servers:
|
||||||
collectd.debug("%s adding '%s' to unreachable servers list: %s" %
|
collectd.debug("%s adding '%s' to unreachable servers list: %s" %
|
||||||
(PLUGIN, ip, obj.unreachable_servers))
|
(PLUGIN, ip, obj.unreachable_servers))
|
||||||
|
|
||||||
obj.unreachable_servers.append(ip)
|
obj.unreachable_servers.append(ip)
|
||||||
|
|
||||||
collectd.info("%s added '%s' to unreachable servers list: %s" %
|
collectd.info("%s added '%s' to unreachable servers list: %s" %
|
||||||
(PLUGIN, ip, obj.unreachable_servers))
|
(PLUGIN, ip, obj.unreachable_servers))
|
||||||
else:
|
else:
|
||||||
collectd.debug("%s ip '%s' already in unreachable_servers list" %
|
collectd.debug("%s ip '%s' already in unreachable_servers list" %
|
||||||
(PLUGIN, ip))
|
(PLUGIN, ip))
|
||||||
else:
|
else:
|
||||||
collectd.error("%s _add_unreachable_server called with no IP" % PLUGIN)
|
collectd.error("%s _add_unreachable_server called with no IP" % PLUGIN)
|
||||||
|
|
||||||
|
@ -323,7 +323,7 @@ def _cleanup_stale_servers():
|
||||||
""" Cleanup the server IP tracking lists """
|
""" Cleanup the server IP tracking lists """
|
||||||
|
|
||||||
collectd.debug("%s CLEANUP REACHABLE: %s %s" %
|
collectd.debug("%s CLEANUP REACHABLE: %s %s" %
|
||||||
(PLUGIN, obj.server_list_ntpq, obj.reachable_servers))
|
(PLUGIN, obj.server_list_ntpq, obj.reachable_servers))
|
||||||
for ip in obj.reachable_servers:
|
for ip in obj.reachable_servers:
|
||||||
if ip not in obj.server_list_ntpq:
|
if ip not in obj.server_list_ntpq:
|
||||||
collectd.info("%s removing missing '%s' server from reachable "
|
collectd.info("%s removing missing '%s' server from reachable "
|
||||||
|
@ -506,7 +506,7 @@ def init_func():
|
||||||
|
|
||||||
obj.base_eid = 'host=' + obj.hostname + '.ntp'
|
obj.base_eid = 'host=' + obj.hostname + '.ntp'
|
||||||
collectd.debug("%s on %s with entity id '%s'" %
|
collectd.debug("%s on %s with entity id '%s'" %
|
||||||
(PLUGIN, obj.hostname, obj.base_eid))
|
(PLUGIN, obj.hostname, obj.base_eid))
|
||||||
|
|
||||||
# get a list of provisioned ntp servers
|
# get a list of provisioned ntp servers
|
||||||
_get_ntp_servers()
|
_get_ntp_servers()
|
||||||
|
@ -686,7 +686,7 @@ def read_func():
|
||||||
# update the selected server list
|
# update the selected server list
|
||||||
obj.selected_server = ip
|
obj.selected_server = ip
|
||||||
collectd.debug("%s selected server is '%s'" %
|
collectd.debug("%s selected server is '%s'" %
|
||||||
(PLUGIN, obj.selected_server))
|
(PLUGIN, obj.selected_server))
|
||||||
else:
|
else:
|
||||||
collectd.debug("%s local controller '%s' marked "
|
collectd.debug("%s local controller '%s' marked "
|
||||||
"as selected server ; ignoring" %
|
"as selected server ; ignoring" %
|
||||||
|
|
Loading…
Reference in New Issue