diff --git a/monitoring/collectd-extensions/src/cpu.py b/monitoring/collectd-extensions/src/cpu.py index 79ffe97bf..09832556c 100755 --- a/monitoring/collectd-extensions/src/cpu.py +++ b/monitoring/collectd-extensions/src/cpu.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2018 Wind River Systems, Inc. +# Copyright (c) 2018-2019 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -179,7 +179,7 @@ def read_func(): _schedstat)) else: collectd.error('%s unsupported schedstat version [%d]' % - (PLUGIN, c.version)) + (PLUGIN, c.version)) return 0 f.close() diff --git a/monitoring/collectd-extensions/src/fm_notifier.py b/monitoring/collectd-extensions/src/fm_notifier.py index ba458dc2e..60341abf0 100755 --- a/monitoring/collectd-extensions/src/fm_notifier.py +++ b/monitoring/collectd-extensions/src/fm_notifier.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2018 Wind River Systems, Inc. +# Copyright (c) 2018-2019 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -101,6 +101,7 @@ api = fm_api.FaultAPIs() debug = False debug_lists = False want_state_audit = False +want_vswitch = False # number of notifier loops before the state is object dumped DEBUG_AUDIT = 2 @@ -122,6 +123,8 @@ DATABASE_NAME = 'collectd samples' READING_TYPE__PERCENT_USAGE = '% usage' +# Default invalid threshold value +INVALID_THRESHOLD = float(-1) # collectd severity definitions ; # Note: can't seem to pull then in symbolically with a header @@ -230,8 +233,10 @@ class PluginObject: # [ 'float value string','float threshold string] self.values = [] - self.threshold = float(0) # float value of threshold - self.value = float(0) # float value of reading + self.value = float(0) # float value of reading + + # float value of threshold + self.threshold = float(INVALID_THRESHOLD) # Common static class members. self.reason_warning = "" @@ -332,7 +337,8 @@ class PluginObject: # Purpose : Manage sample value change. # # Handle no sample update case. - # Parse the notification log + # Parse the notification log. + # Handle base object instances. # Generate a log entry if the sample value changes more than # step value. # @@ -384,10 +390,22 @@ class PluginObject: # get the threshold if its there. if len(self.values) > 1: self.threshold = float(self.values[1]) + if nObject.plugin == PLUGIN__MEM: + if self.reading_type == READING_TYPE__PERCENT_USAGE: + # Note: add one to % usage reading types so that it + # matches how rmond did it. In collectd an + # overage is over the specified threshold + # whereas in rmon an overage is at threshold + # or above. + self.threshold = float(self.values[1]) + 1 + else: + self.threshold = float(self.values[1]) + else: + self.threshold = float(INVALID_THRESHOLD) # invalid value except ValueError as ex: collectd.error("%s %s value not integer or float (%s) (%s)" % - (PLUGIN, self.entity_id, self.value, str(ex))) + (PLUGIN, self.entity_id, self.value, str(ex))) return "done" except TypeError as ex: collectd.info("%s %s value has no type (%s)" % @@ -428,6 +446,11 @@ class PluginObject: if self.plugin == PLUGIN__DF: resource = self.instance + elif self.plugin == PLUGIN__MEM: + if self.instance_name: + if self.instance_name != 'platform': + resource += ' ' + self.instance_name + # setup resource name for vswitch process instance name elif self.plugin == PLUGIN__VSWITCH_MEM: resource += ' Processor ' @@ -696,7 +719,7 @@ class PluginObject: self.instance_objects[eid] = obj except: collectd.error("%s failed to add instance to %s object list" % - (PLUGIN, self.plugin)) + (PLUGIN, self.plugin)) finally: collectd.debug("%s %s Add UnLock ..." % (PLUGIN, self.plugin)) @@ -750,14 +773,14 @@ class PluginObject: self._add_instance_object(inst_obj, inst_obj.entity_id) collectd.debug("%s created %s instance (%s) object %s" % - (PLUGIN, inst_obj.resource_name, - inst_obj.entity_id, inst_obj)) + (PLUGIN, inst_obj.resource_name, + inst_obj.entity_id, inst_obj)) - collectd.debug("%s monitoring %s %s %s" % - (PLUGIN, - inst_obj.resource_name, - inst_obj.instance_name, - inst_obj.reading_type)) + collectd.info("%s monitoring %s %s %s" % + (PLUGIN, + inst_obj.resource_name, + inst_obj.instance_name, + inst_obj.reading_type)) return inst_obj @@ -887,7 +910,11 @@ def _build_entity_id(plugin, plugin_instance): entity_id = 'host=' entity_id += PluginObject.host - if plugin == PLUGIN__VSWITCH_MEM: + if plugin == PLUGIN__MEM: + if plugin_instance != 'platform': + entity_id += '.numa=' + plugin_instance + + elif plugin == PLUGIN__VSWITCH_MEM: # host=.processor= if plugin_instance: @@ -929,15 +956,6 @@ def _build_entity_id(plugin, plugin_instance): instance = instance.replace('-', '/') entity_id += instance - # Will be uncommented when the numa memory monitor is added - # to the platform memory plugin. - # - #elif plugin == PLUGIN__MEM: - # if plugin_instance is not 'platform': - # # host=controller-0.numa=node0 - # entity_id += '.numa=' - # entity_id += plugin_instance - if inst_error is True: collectd.error("%s eid build failed ; missing instance" % plugin) return None @@ -953,7 +971,7 @@ def _get_df_mountpoints(): if not os.path.exists(conf_file): collectd.error("%s cannot create filesystem " "instance objects ; missing : %s" % - (PLUGIN, conf_file)) + (PLUGIN, conf_file)) return FAIL mountpoints = [] @@ -1158,7 +1176,7 @@ def _clear_alarm_for_missing_filesystems(): df_base_obj._manage_alarm(obj.entity_id, "okay") else: collectd.debug("%s maintaining alarm for %s" % - (PLUGIN, path)) + (PLUGIN, path)) # Collectd calls this function on startup. @@ -1207,7 +1225,9 @@ def init_func(): obj._create_instance_objects() # ntp query is for controllers only - if tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions: + if want_vswitch is False: + collectd.debug("%s vSwitch monitoring disabled" % PLUGIN) + elif tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions: ####################################################################### @@ -1406,13 +1426,13 @@ def notifier_func(nObject): collectd.debug("%s %s lock" % (PLUGIN, nObject.plugin)) PluginObject.lock.acquire() - #collectd.info("%s Object Search eid: %s" % - # (nObject.plugin, eid)) + # collectd.info("%s Object Search eid: %s" % + # (nObject.plugin, eid)) - #for o in base_obj.instance_objects: - # collectd.error("%s %s inst object dict item %s : %s" % - # (PLUGIN, nObject.plugin, o, - # base_obj.instance_objects[o])) + # for o in base_obj.instance_objects: + # collectd.error("%s %s inst object dict item %s : %s" % + # (PLUGIN, nObject.plugin, o, + # base_obj.instance_objects[o])) # we will take an exception if this object is not in the list. # the exception handling code below will create and add this @@ -1434,14 +1454,14 @@ def notifier_func(nObject): inst_obj = base_obj._get_instance_object(eid) if inst_obj: collectd.debug("%s %s:%s inst object created" % - (PLUGIN, - inst_obj.plugin, - inst_obj.instance)) + (PLUGIN, + inst_obj.plugin, + inst_obj.instance)) else: collectd.error("%s %s:%s inst object create failed" % - (PLUGIN, - nObject.plugin, - nObject.plugin_instance)) + (PLUGIN, + nObject.plugin, + nObject.plugin_instance)) return 0 # re-assign the object @@ -1457,7 +1477,7 @@ def notifier_func(nObject): else: collectd.debug("%s notification for unknown plugin: %s %s" % - (PLUGIN, nObject.plugin, nObject.plugin_instance)) + (PLUGIN, nObject.plugin, nObject.plugin_instance)) return 0 # if obj.warnings or obj.failures: @@ -1503,11 +1523,11 @@ def notifier_func(nObject): # if this is a threshold alarm then build the reason text that # includes the threahold and the reading that caused the assertion. reason = obj.resource_name - reason += " threshold exceeded" - if obj.threshold: - reason += "; threshold {:2.0f} ".format(obj.threshold) + "%, " + reason += " threshold exceeded ;" + if obj.threshold != INVALID_THRESHOLD: + reason += " threshold {:2.0f}".format(obj.threshold) + "%," if obj.value: - reason += "actual {:2.0f}".format(obj.value) + "%" + reason += " actual {:2.0f}".format(obj.value) + "%" elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL: reason = obj.reason_failure @@ -1538,14 +1558,13 @@ def notifier_func(nObject): # update the lists now that base_obj._manage_alarm(obj.entity_id, severity_str) - collectd.info("%s %s alarm %s:%s %s:%s thld:%2.2f value:%2.2f" % ( + collectd.info("%s %s alarm %s:%s %s:%s value:%2.2f" % ( PLUGIN, _alarm_state, base_obj.id, severity_str, obj.instance, obj.entity_id, - obj.threshold, obj.value)) # Debug only: comment out for production code. diff --git a/monitoring/collectd-extensions/src/memory.conf b/monitoring/collectd-extensions/src/memory.conf index 5e5195f09..997bf2d48 100644 --- a/monitoring/collectd-extensions/src/memory.conf +++ b/monitoring/collectd-extensions/src/memory.conf @@ -12,8 +12,8 @@ Instance "used" Persist true PersistOK true - WarningMax 80.00 - FailureMax 90.00 + WarningMax 79.00 + FailureMax 89.00 Hits 2 Invert false diff --git a/monitoring/collectd-extensions/src/memory.py b/monitoring/collectd-extensions/src/memory.py index cf4d1e7cd..b9a8e1f8d 100755 --- a/monitoring/collectd-extensions/src/memory.py +++ b/monitoring/collectd-extensions/src/memory.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2018 Wind River Systems, Inc. +# Copyright (c) 2018-2019 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -19,11 +19,9 @@ import collectd debug = False -# general return codes -PASS = 0 -FAIL = 1 - PLUGIN = 'platform memory usage' +PLUGIN_NUMA = 'numa memory usage' +PLUGIN_HUGE = 'hugepage memory usage' # CPU Control class @@ -41,8 +39,10 @@ class MEM: CommitLimit = 0 Committed_AS = 0 HugePages_Total = 0 + HugePages_Free = 0 Hugepagesize = 0 AnonPages = 0 + FilePages = 0 # derived values avail = 0 @@ -54,6 +54,27 @@ class MEM: obj = MEM() +def log_meminfo(plugin, name, meminfo): + """ Log the supplied meminfo """ + + if debug is False: + return + + collectd.info("%s %s" % (plugin, name)) + collectd.info("%s ---------------------------" % plugin) + collectd.info("%s memTotal_kB : %f" % (plugin, meminfo.memTotal_kB)) + collectd.info("%s memFree_kB : %f" % (plugin, meminfo.memFree_kB)) + collectd.info("%s Buffers : %f" % (plugin, meminfo.buffers)) + collectd.info("%s Cached : %f" % (plugin, meminfo.cached)) + collectd.info("%s SReclaimable : %f" % (plugin, meminfo.SReclaimable)) + collectd.info("%s CommitLimit : %f" % (plugin, meminfo.CommitLimit)) + collectd.info("%s Committed_AS : %f" % (plugin, meminfo.Committed_AS)) + collectd.info("%s HugePages_Total: %f" % (plugin, meminfo.HugePages_Total)) + collectd.info("%s HugePages_Free : %f" % (plugin, meminfo.HugePages_Free)) + collectd.info("%s Hugepagesize : %f" % (plugin, meminfo.Hugepagesize)) + collectd.info("%s AnonPages : %f" % (plugin, meminfo.AnonPages)) + + def config_func(config): """ Configure the memory usage plugin @@ -110,7 +131,12 @@ def read_func(): except EnvironmentError as e: collectd.error("%s unable to read from %s ; str(e)" % (PLUGIN, str(e))) - return FAIL + return 0 + + # setup the sample structure + val = collectd.Values(host=obj.hostname) + val.type = 'percent' + val.type_instance = 'used' # remove the 'unit' (kB) suffix that might be on some of the lines for line in meminfo: @@ -130,20 +156,11 @@ def read_func(): obj.CommitLimit = float(meminfo['CommitLimit']) obj.Committed_AS = float(meminfo['Committed_AS']) obj.HugePages_Total = float(meminfo['HugePages_Total']) + obj.HugePages_Free = float(meminfo['HugePages_Free']) obj.Hugepagesize = float(meminfo['Hugepagesize']) obj.AnonPages = float(meminfo['AnonPages']) - # collectd.info("%s /proc/meminfo: %s" % (PLUGIN, meminfo)) - # collectd.info("%s ---------------------------" % PLUGIN) - # collectd.info("%s memTotal_kB : %f" % (PLUGIN, obj.memTotal_kB)) - # collectd.info("%s memFree_kB : %f" % (PLUGIN, obj.memFree_kB)) - # collectd.info("%s Buffers : %f" % (PLUGIN, obj.buffers)) - # collectd.info("%s Cached : %f" % (PLUGIN, obj.cached)) - # collectd.info("%s SReclaimable : %f" % (PLUGIN, obj.SReclaimable)) - # collectd.info("%s CommitLimit : %f" % (PLUGIN, obj.CommitLimit)) - # collectd.info("%s Committed_AS : %f" % (PLUGIN, obj.Committed_AS)) - # collectd.info("%s HugePages_Total: %f" % (PLUGIN, obj.HugePages_Total)) - # collectd.info("%s AnonPages : %f" % (PLUGIN, obj.AnonPages)) + log_meminfo(PLUGIN, "/proc/meminfo", obj) obj.avail = float(float(obj.memFree_kB) + float(obj.buffers) + @@ -152,38 +169,93 @@ def read_func(): obj.total = float(float(obj.avail) + float(obj.AnonPages)) - # collectd.info("%s ---------------------------" % PLUGIN) - # collectd.info("%s memTotal: %d" % (PLUGIN, obj.avail)) - # collectd.info("%s memAvail: %d" % (PLUGIN, obj.total)) - if obj.strict == 1: obj.value = float(float(obj.Committed_AS) / float(obj.CommitLimit)) else: obj.value = float(float(obj.AnonPages) / float(obj.total)) - obj.value = float(float(obj.value) * 100) - # get numa node memory - # numa_node_files = [] - # fn = "/sys/devices/system/node/" - # files = os.listdir(fn) - # for file in files: - # if 'node' in file: - # numa_node_files.append(fn + file) - # collectd.info("%s numa node files: %s" % - # (PLUGIN, numa_node_files)) - - collectd.debug('%s reports %.2f %% usage' % - (PLUGIN, obj.value)) + if debug is True: + collectd.info("%s ---------------------------" % PLUGIN) + collectd.info("%s memAvail: %d" % (PLUGIN, obj.avail)) + collectd.info("%s memTotal: %d" % (PLUGIN, obj.total)) + collectd.info('%s reports %.2f %% usage' % (PLUGIN, obj.value)) # Dispatch usage value to collectd - val = collectd.Values(host=obj.hostname) val.plugin = 'memory' - val.type = 'percent' - val.type_instance = 'used' + val.plugin_instance = 'platform' val.dispatch(values=[obj.value]) - return PASS + ##################################################################### + # Now get the Numa Node Memory Usage + ##################################################################### + numa_node_files = [] + fn = "/sys/devices/system/node/" + files = os.listdir(fn) + for file in files: + if 'node' in file: + numa_node_files.append(fn + file + '/meminfo') + + for numa_node in numa_node_files: + meminfo = {} + try: + with open(numa_node) as fd: + for line in fd: + meminfo[line.split()[2][0:-1]] = line.split()[3].strip() + + obj.memFree_kB = float(meminfo['MemFree']) + obj.FilePages = float(meminfo['FilePages']) + obj.SReclaimable = float(meminfo['SReclaimable']) + obj.AnonPages = float(meminfo['AnonPages']) + obj.HugePages_Total = float(meminfo['HugePages_Total']) + obj.HugePages_Free = float(meminfo['HugePages_Free']) + + log_meminfo(PLUGIN, numa_node, obj) + + avail = float(float(obj.memFree_kB) + + float(obj.FilePages) + + float(obj.SReclaimable)) + total = float(float(avail) + + float(obj.AnonPages)) + obj.value = float(float(obj.AnonPages)) / float(total) + obj.value = float(float(obj.value) * 100) + + # Dispatch usage value to collectd for this numa node + val.plugin_instance = numa_node.split('/')[5] + val.dispatch(values=[obj.value]) + + collectd.debug('%s reports %s at %.2f %% usage (%s)' % + (PLUGIN_NUMA, + val.plugin, + obj.value, + val.plugin_instance)) + + # Numa Node Huge Page Memory Monitoring + # + # Only monitor if there is Huge Page Memory + if obj.HugePages_Total > 0: + obj.value = \ + float(float(obj.HugePages_Total - + obj.HugePages_Free)) / \ + float(obj.HugePages_Total) + obj.value = float(float(obj.value) * 100) + + # Dispatch huge page memory usage value + # to collectd for this numa node. + val.plugin_instance = numa_node.split('/')[5] + '_hugepages' + val.dispatch(values=[obj.value]) + + collectd.debug('%s reports %s at %.2f %% usage (%s)' % + (PLUGIN_HUGE, + val.plugin, + obj.value, + val.plugin_instance)) + + except EnvironmentError as e: + collectd.error("%s unable to read from %s ; str(e)" % + (PLUGIN_NUMA, str(e))) + + return 0 collectd.register_config(config_func) diff --git a/monitoring/collectd-extensions/src/mtce_notifier.py b/monitoring/collectd-extensions/src/mtce_notifier.py index 98de81cf3..c97d8491b 100755 --- a/monitoring/collectd-extensions/src/mtce_notifier.py +++ b/monitoring/collectd-extensions/src/mtce_notifier.py @@ -1,8 +1,10 @@ # -# Copyright (c) 2018 Wind River Systems, Inc. +# Copyright (c) 2018-2019 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # +############################################################################# +# # This file is the collectd 'Maintenance' Notifier. # # Collects provides information about each event as an object passed to the @@ -50,10 +52,6 @@ NOTIF_FAILURE = 1 NOTIF_WARNING = 2 NOTIF_OKAY = 4 -# generic return codes -PASS = 0 -FAIL = 1 - # default mtce port. # ... with configuration override MTCE_CMD_RX_PORT = 2101 @@ -292,7 +290,7 @@ def notifier_func(nObject): else: collectd.info("%s unsupported severity %d" % (PLUGIN, nObject.severity)) - return FAIL + return 0 # running counter of notifications. obj.msg_throttle += 1 @@ -374,7 +372,7 @@ def notifier_func(nObject): mtce_socket.close() else: collectd.error("%s %s failed to open socket (%s)" % - (PLUGIN, resource, obj.addr)) + (PLUGIN, resource, obj.addr)) except socket.error as e: if e.args[0] == socket.EAI_ADDRFAMILY: # Handle IPV4 to IPV6 switchover: @@ -383,7 +381,7 @@ def notifier_func(nObject): (PLUGIN, resource, obj.addr)) else: collectd.error("%s %s socket error (%s) ; %s" % - (PLUGIN, resource, obj.addr, str(e))) + (PLUGIN, resource, obj.addr, str(e))) # try self correction obj.addr = None obj.protocol = socket.AF_INET diff --git a/monitoring/collectd-extensions/src/ntpq.py b/monitoring/collectd-extensions/src/ntpq.py index 3f7964656..b470d7c01 100755 --- a/monitoring/collectd-extensions/src/ntpq.py +++ b/monitoring/collectd-extensions/src/ntpq.py @@ -118,15 +118,15 @@ def _add_unreachable_server(ip=None): if ip: if ip not in obj.unreachable_servers: collectd.debug("%s adding '%s' to unreachable servers list: %s" % - (PLUGIN, ip, obj.unreachable_servers)) + (PLUGIN, ip, obj.unreachable_servers)) obj.unreachable_servers.append(ip) collectd.info("%s added '%s' to unreachable servers list: %s" % - (PLUGIN, ip, obj.unreachable_servers)) + (PLUGIN, ip, obj.unreachable_servers)) else: collectd.debug("%s ip '%s' already in unreachable_servers list" % - (PLUGIN, ip)) + (PLUGIN, ip)) else: collectd.error("%s _add_unreachable_server called with no IP" % PLUGIN) @@ -323,7 +323,7 @@ def _cleanup_stale_servers(): """ Cleanup the server IP tracking lists """ collectd.debug("%s CLEANUP REACHABLE: %s %s" % - (PLUGIN, obj.server_list_ntpq, obj.reachable_servers)) + (PLUGIN, obj.server_list_ntpq, obj.reachable_servers)) for ip in obj.reachable_servers: if ip not in obj.server_list_ntpq: collectd.info("%s removing missing '%s' server from reachable " @@ -506,7 +506,7 @@ def init_func(): obj.base_eid = 'host=' + obj.hostname + '.ntp' collectd.debug("%s on %s with entity id '%s'" % - (PLUGIN, obj.hostname, obj.base_eid)) + (PLUGIN, obj.hostname, obj.base_eid)) # get a list of provisioned ntp servers _get_ntp_servers() @@ -686,7 +686,7 @@ def read_func(): # update the selected server list obj.selected_server = ip collectd.debug("%s selected server is '%s'" % - (PLUGIN, obj.selected_server)) + (PLUGIN, obj.selected_server)) else: collectd.debug("%s local controller '%s' marked " "as selected server ; ignoring" %