From 892489acd71c3aae7a0280f1dd90c873201bbe73 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Mon, 14 May 2018 16:12:16 -0400 Subject: [PATCH] Collectd+InfluxDb-RMON Replacement(ALL METRICS) P1 This is the primary update that introduces collectd monitoring and sample storage into the influxdb database. Two new packages are introduced by this update - collectd-extensions package which includes - newly developed collectd platform memory, cpu and filesystem plugins - note that the example, ntpq and interface plugins are not complete and are not enabled by this update. - pmond process monitoring / recovery support for collectd - updated service file for pidfile management ; needed by pmond - influxdb-extensions package which includes - pmond process monitoring / recovery support for influxdb - updated service file for pidfile management ; needed by pmond - log rotate support for influxdb Change-Id: I06511fecb781781ed5491c926ad4b1273a1bc23b Signed-off-by: Jack Ding --- centos_pkg_dirs | 2 + monitoring/collectd-extensions/PKG-INFO | 10 + .../centos/build_srpm.data | 19 + .../centos/collectd-extensions.spec | 90 ++ monitoring/collectd-extensions/src/LICENSE | 202 +++ .../src/collectd.conf.pmon | 18 + .../collectd-extensions/src/collectd.service | 14 + monitoring/collectd-extensions/src/cpu.conf | 22 + monitoring/collectd-extensions/src/cpu.py | 253 ++++ monitoring/collectd-extensions/src/df.conf | 38 + .../collectd-extensions/src/example.conf | 13 + monitoring/collectd-extensions/src/example.py | 75 ++ .../collectd-extensions/src/fm_notifier.py | 1191 +++++++++++++++++ .../collectd-extensions/src/interface.conf | 13 + .../collectd-extensions/src/interface.py | 129 ++ .../collectd-extensions/src/memory.conf | 21 + monitoring/collectd-extensions/src/memory.py | 181 +++ .../collectd-extensions/src/mtce_notifier.py | 379 ++++++ monitoring/collectd-extensions/src/ntpq.conf | 17 + monitoring/collectd-extensions/src/ntpq.py | 195 +++ .../src/python_plugins.conf | 20 + 
monitoring/influxdb-extensions/PKG-INFO | 10 + .../centos/build_srpm.data | 7 + .../centos/influxdb-extensions.spec | 46 + monitoring/influxdb-extensions/src/LICENSE | 202 +++ .../influxdb-extensions/src/influxdb.conf | 322 +++++ .../src/influxdb.conf.pmon | 17 + .../src/influxdb.logrotate | 16 + .../influxdb-extensions/src/influxdb.service | 20 + 29 files changed, 3542 insertions(+) create mode 100644 monitoring/collectd-extensions/PKG-INFO create mode 100644 monitoring/collectd-extensions/centos/build_srpm.data create mode 100644 monitoring/collectd-extensions/centos/collectd-extensions.spec create mode 100644 monitoring/collectd-extensions/src/LICENSE create mode 100644 monitoring/collectd-extensions/src/collectd.conf.pmon create mode 100644 monitoring/collectd-extensions/src/collectd.service create mode 100644 monitoring/collectd-extensions/src/cpu.conf create mode 100755 monitoring/collectd-extensions/src/cpu.py create mode 100644 monitoring/collectd-extensions/src/df.conf create mode 100644 monitoring/collectd-extensions/src/example.conf create mode 100755 monitoring/collectd-extensions/src/example.py create mode 100755 monitoring/collectd-extensions/src/fm_notifier.py create mode 100644 monitoring/collectd-extensions/src/interface.conf create mode 100755 monitoring/collectd-extensions/src/interface.py create mode 100644 monitoring/collectd-extensions/src/memory.conf create mode 100755 monitoring/collectd-extensions/src/memory.py create mode 100755 monitoring/collectd-extensions/src/mtce_notifier.py create mode 100644 monitoring/collectd-extensions/src/ntpq.conf create mode 100755 monitoring/collectd-extensions/src/ntpq.py create mode 100644 monitoring/collectd-extensions/src/python_plugins.conf create mode 100644 monitoring/influxdb-extensions/PKG-INFO create mode 100644 monitoring/influxdb-extensions/centos/build_srpm.data create mode 100644 monitoring/influxdb-extensions/centos/influxdb-extensions.spec create mode 100644 
monitoring/influxdb-extensions/src/LICENSE create mode 100644 monitoring/influxdb-extensions/src/influxdb.conf create mode 100644 monitoring/influxdb-extensions/src/influxdb.conf.pmon create mode 100644 monitoring/influxdb-extensions/src/influxdb.logrotate create mode 100644 monitoring/influxdb-extensions/src/influxdb.service diff --git a/centos_pkg_dirs b/centos_pkg_dirs index d33331278..f51a9423e 100644 --- a/centos_pkg_dirs +++ b/centos_pkg_dirs @@ -101,4 +101,6 @@ extended/memcached devtools/puppet-modules/openstack/puppet-memcached-3.0.2 devtools/puppet-modules/openstack/puppet-horizon-9.5.0 devtools/puppet-modules/openstack/puppet-swift-11.3.0 +monitoring/collectd-extensions +monitoring/influxdb-extensions kubernetes/kubernetes diff --git a/monitoring/collectd-extensions/PKG-INFO b/monitoring/collectd-extensions/PKG-INFO new file mode 100644 index 000000000..857dce240 --- /dev/null +++ b/monitoring/collectd-extensions/PKG-INFO @@ -0,0 +1,10 @@ +Metadata-Version: 1.1 +Name: collectd-extensions +Version: 1.0 +Summary: collectd-extensions +Home-page: +Author: Windriver +Author-email: info@windriver.com +License: windriver +Description: Titanium Cloud collectd extensions +Platform: UNKNOWN diff --git a/monitoring/collectd-extensions/centos/build_srpm.data b/monitoring/collectd-extensions/centos/build_srpm.data new file mode 100644 index 000000000..fc2b9df86 --- /dev/null +++ b/monitoring/collectd-extensions/centos/build_srpm.data @@ -0,0 +1,19 @@ +SRC_DIR="$PKG_BASE" + +COPY_LIST="$PKG_BASE/src/LICENSE \ + $PKG_BASE/src/collectd.conf.pmon \ + $PKG_BASE/src/collectd.service \ + $PKG_BASE/src/fm_notifier.py \ + $PKG_BASE/src/mtce_notifier.py \ + $PKG_BASE/src/python_plugins.conf \ + $PKG_BASE/src/cpu.py \ + $PKG_BASE/src/cpu.conf \ + $PKG_BASE/src/memory.py \ + $PKG_BASE/src/memory.conf \ + $PKG_BASE/src/df.conf \ + $PKG_BASE/src/ntpq.py \ + $PKG_BASE/src/ntpq.conf \ + $PKG_BASE/src/example.py \ + $PKG_BASE/src/example.conf" + +TIS_PATCH_VER=1 diff --git 
a/monitoring/collectd-extensions/centos/collectd-extensions.spec b/monitoring/collectd-extensions/centos/collectd-extensions.spec new file mode 100644 index 000000000..532c06720 --- /dev/null +++ b/monitoring/collectd-extensions/centos/collectd-extensions.spec @@ -0,0 +1,90 @@ +Summary: Titanium Server collectd Package +Name: collectd-extensions +Version: 1.0 +Release: 0%{?_tis_dist}.%{tis_patch_ver} +License: windriver +Group: base +Packager: Wind River +URL: unknown + +# create the files tarball +Source0: %{name}-%{version}.tar.gz +Source1: collectd.service +Source2: collectd.conf.pmon + +# collectd python plugin files - notifiers +Source3: fm_notifier.py +Source4: mtce_notifier.py + +# collectd python plugin files - resource plugins +Source11: cpu.py +Source12: memory.py +Source14: example.py +Source15: ntpq.py + +# collectd plugin conf files into /etc/collectd.d +Source100: python_plugins.conf +Source101: cpu.conf +Source102: memory.conf +Source103: df.conf +Source104: example.conf +Source105: ntpq.conf + +BuildRequires: systemd-devel + +Requires: systemd +Requires: collectd +Requires: /bin/systemctl + +%description +Titanium Cloud collectd extensions + +%define debug_package %{nil} +%define local_unit_dir %{_sysconfdir}/systemd/system +%define local_plugin_dir %{_sysconfdir}/collectd.d +%define local_python_extensions_dir /opt/collectd/extensions/python +%define local_config_extensions_dir /opt/collectd/extensions/config + +%prep +%setup + +%build + +%install +install -m 755 -d %{buildroot}%{_sysconfdir} +install -m 755 -d %{buildroot}%{local_unit_dir} +install -m 755 -d %{buildroot}%{local_plugin_dir} +install -m 755 -d %{buildroot}%{local_config_extensions_dir} +install -m 755 -d %{buildroot}%{local_python_extensions_dir} + +# support files ; service and pmon conf +install -m 644 %{SOURCE1} %{buildroot}%{local_unit_dir} +install -m 600 %{SOURCE2} %{buildroot}%{local_config_extensions_dir} + +# collectd python plugin files - notifiers +install -m 700 
%{SOURCE3} %{buildroot}%{local_python_extensions_dir} +install -m 700 %{SOURCE4} %{buildroot}%{local_python_extensions_dir} + +# collectd python plugin files - resource plugins +install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir} +install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir} +install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir} +install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir} + +# collectd plugin conf files into /etc/collectd.d +install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir} +install -m 600 %{SOURCE101} %{buildroot}%{local_plugin_dir} +install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir} +install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir} +install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir} +install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir} + +%clean +rm -rf $RPM_BUILD_ROOT + +%files +%defattr(-,root,root,-) +%config(noreplace) %{local_unit_dir}/collectd.service +%{local_plugin_dir}/* +%{local_config_extensions_dir}/* +%{local_python_extensions_dir}/* diff --git a/monitoring/collectd-extensions/src/LICENSE b/monitoring/collectd-extensions/src/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/monitoring/collectd-extensions/src/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/monitoring/collectd-extensions/src/collectd.conf.pmon b/monitoring/collectd-extensions/src/collectd.conf.pmon new file mode 100644 index 000000000..8d905d432 --- /dev/null +++ b/monitoring/collectd-extensions/src/collectd.conf.pmon @@ -0,0 +1,18 @@ +[process] +process = collectd +service = collectd +style = lsb +pidfile = /var/run/collectd.pid +severity = major ; minor, major, critical +restarts = 3 ; restart retries before error assertion +interval = 5 ; number of seconds to wait between restarts +debounce = 10 ; number of seconds that a process needs to remain + ; running before degrade is removed and retry count + ; is cleared. +startuptime = 3 ; Seconds to wait after process start before starting the debounce monitor +mode = passive ; Monitoring mode: passive (default) or active + ; passive: process death monitoring (default: always) + ; active : heartbeat monitoring, i.e. 
request / response messaging + ; ignore : do not monitor or stop monitoring +quorum = 0 ; process is in the host watchdog quorum + diff --git a/monitoring/collectd-extensions/src/collectd.service b/monitoring/collectd-extensions/src/collectd.service new file mode 100644 index 000000000..14d12c327 --- /dev/null +++ b/monitoring/collectd-extensions/src/collectd.service @@ -0,0 +1,14 @@ +[Unit] +Description=Collectd statistics daemon and extension services +Documentation=man:collectd(1) man:collectd.conf(5) +After=local-fs.target network-online.target +Requires=local-fs.target network-online.target + +[Service] +Type=notify +ExecStart=/usr/sbin/collectd +ExecStartPost=/bin/bash -c 'echo $MAINPID > /var/run/collectd.pid' +ExecStopPost=/bin/rm -f /var/run/collectd.pid + +[Install] +WantedBy=multi-user.target diff --git a/monitoring/collectd-extensions/src/cpu.conf b/monitoring/collectd-extensions/src/cpu.conf new file mode 100644 index 000000000..75394cdb2 --- /dev/null +++ b/monitoring/collectd-extensions/src/cpu.conf @@ -0,0 +1,22 @@ +# For stock plugin only +# Uncomment to compare stock to tiS plugin readings +# --------------------- +# +# ReportByCpu false +# ReportByState false +# ValuesPercentage true +# + + + + + Instance "used" + Persist true + PersistOK true + WarningMax 90.00 + FailureMax 95.00 + Hits 2 + Invert false + + + diff --git a/monitoring/collectd-extensions/src/cpu.py b/monitoring/collectd-extensions/src/cpu.py new file mode 100755 index 000000000..6f0fbf539 --- /dev/null +++ b/monitoring/collectd-extensions/src/cpu.py @@ -0,0 +1,253 @@ +# +# Copyright (c) 2018 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +############################################################################ +# +# This file is the collectd 'Platform CPU Usage' Monitor. +# +# The Platform CPU Usage is calculated as an averaged percentage of +# platform core usable since the previous sample. 
+# +# Init Function: +# - if 'compute_reserved.conf exists then query/store PLATFORM_CPU_LIST +# +############################################################################ +import os +import time +import collectd + +debug = False + +PASS = 0 +FAIL = 1 + +PATH = '/proc/cpuinfo' +COMPUTE_RESERVED_CONF = '/etc/nova/compute_reserved.conf' + +PLUGIN = 'platform cpu usage plugin' + + +# CPU Control class +class CPU: + hostname = "" # hostname for sample notification message + usage = float(0.0) # float value of cpu usage + + processors = int(0) # number of processors for all cpus case + cpu_list = [] # list of CPUs to calculate combined usage for + cpu_time = [] # schedstat time for each CPU + cpu_time_last = [] # last schedstat time for each CPU + time_last = float(0.0) # float of the time the last sample was taken + + def log_error(self, err_str): + """ Print an error log with plugin name prefixing the log """ + + collectd.error("%s %s" % (PLUGIN, err_str)) + +# Instantiate the class +c = CPU() + + +# The collectd configuration interface +# collectd needs this defined ; but not used/needed. +def config_func(config): + collectd.info('%s config function' % PLUGIN) + + +# Get the platform cpu list and number of cpus reported by /proc/cpuinfo +def init_func(): + # get current hostname + c.hostname = os.uname()[1] + + collectd.info('%s init function for %s' % (PLUGIN, c.hostname)) + + raw_list = "" + if os.path.exists(COMPUTE_RESERVED_CONF): + with open(COMPUTE_RESERVED_CONF, 'r') as infile: + for line in infile: + if 'PLATFORM_CPU_LIST' in line: + val = line.split("=") + raw_list = val[1].strip('\n')[1:-1].strip('"') + break + if raw_list: + + # Convert the cpu list fetched from the compute + # reserved file into an integer list. 
+ # Handle mix of number list #,# and number range #-# + split_list = raw_list.split(',') + if debug: + collectd.info('%s split list: %s' % (PLUGIN, split_list)) + for cpu in split_list: + if cpu.find('-') == -1: + # add individual cpu # with assumed ',' delimiter + c.cpu_list.append(int(cpu)) + else: + # add all in range #-# + cpu_range = cpu.split('-') + if len(cpu_range) == 2: + first = int(cpu_range[0]) + last = int(cpu_range[1]) + 1 + # add each + for i in list(range(first, last)): + c.cpu_list.append(i) + + # with the full CPU list in hand we can now just read their samples + if debug: + collectd.info('%s full cpu list: %s' % + (PLUGIN, c.cpu_list)) + + try: + f = open('/proc/cpuinfo') + except EnvironmentError as e: + collectd.error(str(e), UserWarning) + else: + + if len(c.cpu_list) == 0: + _want_all_cpus = True + else: + _want_all_cpus = False + + c.processors = 0 + for line in f: + name_value = [s.strip() for s in line.split(':', 1)] + if len(name_value) != 2: + continue + + name, value = name_value + if 'rocessor' in name: + if _want_all_cpus is True: + c.cpu_list.append(int(c.processors)) + c.processors += 1 + + collectd.info('%s has found %d cpus total' % + (PLUGIN, c.processors)) + collectd.info('%s monitoring %d cpus %s' % + (PLUGIN, len(c.cpu_list), c.cpu_list)) + f.close() + + +# Calculate the CPU usage sample +def read_func(): + try: + f = open('/proc/schedstat') + except EnvironmentError as e: + c.log_error('file open failed ; ' + str(e)) + return FAIL + else: + # schedstat time for each CPU + c.cpu_time = [] + + # Loop over each line ... + # get the output version ; only 15 is supported + # get the cpu time from each line staring with 'cpux ....' + for line in f: + + # break each line into name/value pairs + line_split = [s.strip() for s in line.split(' ', 1)] + name, value = line_split + + # get the output version. 
+ if 'ersion' in name: + try: + c.version = int(value) + except ValueError as e: + c.log_error('got invalid schedstat version ; ' + str(e)) + + # TODO: Consider exiting here and raising alarm. + # Calling this type of exit will stop the plugin. + # sys._exit() + return FAIL + + # only version 15 is supported + if c.version == 15: + if 'cpu' in name: + # get the cpu number for each line + if int(name.replace('cpu', '')) in c.cpu_list: + _in_list = True + else: + _in_list = False + + # get cpu time for each cpu that is valid + if len(c.cpu_list) == 0 or _in_list is True: + _schedstat = value + value_split = value.split(' ') + c.cpu_time.append(float(value_split[6])) + if debug: + collectd.info('%s %s schedstat is %s [%s]' % + (PLUGIN, name, value_split[6], + _schedstat)) + else: + collectd.error('%s unsupported schedstat version [%d]' % + (PLUGIN, c.version)) + return 0 + + f.close() + + # Now that we have the cpu time recorded for each cpu + _time_delta = float(0) + _cpu_count = int(0) + if len(c.cpu_time_last) == 0: + c.time_last = time.time() + if c.cpu_list: + # This is a compute node. + # Do not include vswitch or pinned cpus in calculation. + for cpu in c.cpu_list: + c.cpu_time_last.append(float(c.cpu_time[_cpu_count])) + _cpu_count += 1 + if debug: + collectd.info('%s cpu time ; first pass ; %s' % + (PLUGIN, c.cpu_time)) + return PASS + else: + _time_this = time.time() + _time_delta = _time_this - c.time_last + c.total_avg_cpu = 0 + cpu_occupancy = [] + if debug: + collectd.info('%s cpu time ; this pass ; %s -> %s' % + (PLUGIN, c.cpu_time_last, c.cpu_time)) + + if c.cpu_list: + # This is a compute node. + # Do not include vswitch or pinned cpus in calculation. 
+ for cpu in c.cpu_list: + if cpu >= c.processors: + c.log_error(' got out of range cpu number') + else: + _delta = (c.cpu_time[_cpu_count] - c.cpu_time_last[_cpu_count]) + _delta = _delta / 1000000 / _time_delta + cpu_occupancy.append(float((100*(_delta))/1000)) + c.total_avg_cpu += cpu_occupancy[_cpu_count] + if debug: + collectd.info('%s cpu %d - count:%d [%s]' % + (PLUGIN, cpu, _cpu_count, cpu_occupancy)) + _cpu_count += 1 + + else: + collectd.info('%s no cpus to monitor' % PLUGIN) + return 0 + + c.usage = c.total_avg_cpu / _cpu_count + if debug: + collectd.info('%s reports %.2f %% usage (averaged)' % + (PLUGIN, c.usage)) + + # Prepare for next audit ; mode now to last + # c.cpu_time_last = [] + c.cpu_time_last = c.cpu_time + c.time_last = _time_this + + # Dispatch usage value to collectd + val = collectd.Values(host=c.hostname) + val.plugin = 'cpu' + val.type = 'percent' + val.type_instance = 'used' + val.dispatch(values=[c.usage]) + + return 0 + + +collectd.register_config(config_func) +collectd.register_init(init_func) +collectd.register_read(read_func) diff --git a/monitoring/collectd-extensions/src/df.conf b/monitoring/collectd-extensions/src/df.conf new file mode 100644 index 000000000..5df943b8b --- /dev/null +++ b/monitoring/collectd-extensions/src/df.conf @@ -0,0 +1,38 @@ + + ValuesPercentage true + IgnoreSelected false + ReportByDevice false + ReportInodes false + ValuesAbsolute false + MountPoint "/" + MountPoint "/tmp" + MountPoint "/dev" + MountPoint "/dev/shm" + MountPoint "/var/run" + MountPoint "/var/log" + MountPoint "/var/lock" + MountPoint "/boot" + MountPoint "/scratch" + MountPoint "/opt/cgcs" + MountPoint "/opt/platform" + MountPoint "/opt/extension" + MountPoint "/etc/nova/instances" + MountPoint "/var/lib/rabbitmq" + MountPoint "/var/lib/postgresql" + MountPoint "/var/lib/ceph/mon" + MountPoint "/opt/backups" + + + + + + Instance "used" + WarningMax 80.00 + FailureMax 90.00 + Persist true + PersistOK true + Hits 2 + Invert false + + + 
diff --git a/monitoring/collectd-extensions/src/example.conf b/monitoring/collectd-extensions/src/example.conf new file mode 100644 index 000000000..fbcf5d4f9 --- /dev/null +++ b/monitoring/collectd-extensions/src/example.conf @@ -0,0 +1,13 @@ + + + + Instance "used" + Persist true + PersistOK true + WarningMax 51.00 + FailureMax 75.00 + Hits 1 + Invert false + + + diff --git a/monitoring/collectd-extensions/src/example.py b/monitoring/collectd-extensions/src/example.py new file mode 100755 index 000000000..dacc745e6 --- /dev/null +++ b/monitoring/collectd-extensions/src/example.py @@ -0,0 +1,75 @@ +# +# Copyright (c) 2018 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +import os +import random +import collectd + +PLUGIN = 'random number plugin' + +# static variables + + +# define a class here that will persist over read calls +class ExampleObject: + hostname = "" + plugin_data = ['1', '100'] + + +obj = ExampleObject() + + +# The config function - called once on collectd process startup +def config_func(config): + """ + Configure the plugin + """ + + for node in config.children: + key = node.key.lower() + val = node.values[0] + + if key == 'data': + obj.plugin_data = str(val).split(' ') + collectd.info("%s configured data '%d:%d'" % + (PLUGIN, + int(obj.plugin_data[0]), + int(obj.plugin_data[1]))) + return 0 + + collectd.info('%s config function' % PLUGIN) + return 0 + + +# The init function - called once on collectd process startup +def init_func(): + + # get current hostname + obj.hostname = os.uname()[1] + return 0 + + +# The sample read function - called on every audit interval +def read_func(): + + # do the work to create the sample + low = int(obj.plugin_data[0]) + high = int(obj.plugin_data[1]) + sample = random.randint(low, high) + + # Dispatch usage value to collectd + val = collectd.Values(host=obj.hostname) + val.plugin = 'example' + val.type = 'percent' + val.type_instance = 'used' + val.dispatch(values=[sample]) + return 0 + + 
+# register the config, init and read functions
+collectd.register_config(config_func)
+collectd.register_init(init_func)
+collectd.register_read(read_func)
diff --git a/monitoring/collectd-extensions/src/fm_notifier.py b/monitoring/collectd-extensions/src/fm_notifier.py
new file mode 100755
index 000000000..e7710a670
--- /dev/null
+++ b/monitoring/collectd-extensions/src/fm_notifier.py
@@ -0,0 +1,1191 @@
+#
+# Copyright (c) 2018 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Version 1.0
+#
+############################################################################
+#
+# This file is the collectd 'FM Alarm' Notifier.
+#
+# This notifier manages raising and clearing alarms based on collectd
+# notifications ; i.e. automatic collectd calls to this handler/notifier.
+#
+# Collectd process startup automatically calls this module's init_func which
+# declares and initializes a plugObject class per plugin type in preparation
+# for periodic ongoing monitoring where collectd calls notify_func for each
+# plugin and instance of that plugin.
+#
+# All other class or common member functions implemented herein exist in
+# support of that aforementioned initialization and periodic monitoring.
+#
+# Collectd provides information about each event as an object passed to the
+# notification handler ; the notification object.
+#
+#    object.host              - the hostname
+#
+#    object.plugin            - the name of the plugin aka resource
+#    object.plugin_instance   - plugin instance string i.e. say mountpoint
+#                               for df plugin
+#    object.type,             - the unit i.e. percent or absolute
+#    object.type_instance     - the attribute i.e. free, used, etc
+#
+#    object.severity          - an integer value 0=OK , 1=warning, 2=failure
+#    object.message           - a log-able message containing the above along
+#                               with the value
+#
+# This notifier uses the notification object to manage plugin/instance alarms.
+#
+# To avoid stuck alarms or missing alarms the plugin thresholds should be
+# configured with Persist = true and persistOK = true. These controls tell
+# collectd to always send notifications regardless of state change ; which
+# would be the case with these controls set to false.
+#
+# Persist = false ; only send notifications on 'okay' to 'not okay' change.
+# PersistOK = false ; only send notifications on 'not okay' to 'okay' change.
+#
+# With these both set to true in the threshold spec for the plugin then
+# collectd will call this notifier for each audit plugin/instance audit.
+#
+# Collectd supports only 2 threshold severities ; warning and failure.
+# The 'failure' maps to 'critical' while 'warning' maps to 'major' in FM.
+#
+# To avoid unnecessary load on FM, this notifier maintains current alarm
+# state and only makes an FM call on alarm state changes. Current alarm state
+# is queried by the init function called by collectd on process startup.
+#
+# Current alarm state is maintained by two severity lists for each plugin,
+# a warnings list and a failures list.
+#
+# When a failure is reported against a specific plugin then that resource's
+# entity_id is added to that plugin's alarm object's failures list. Similarly,
+# warning assertions get their entity id added to plugin's alarm object's
+# warnings list. Any entity id should only exist in one of the lists at one
+# time or in none at all if the notification condition is 'okay' and the alarm
+# is cleared.
+#
+# Adding Plugins:
+#
+# To add new plugin support just search for ADD_NEW_PLUGIN and add the data
+# requested in that area.
+# +# Example commands to read samples from the influx database +# +# SELECT * FROM df_value WHERE instance='root' AND type='percent_bytes' AND +# type_instance='used' +# SELECT * FROM cpu_value WHERE type='percent' AND type_instance='used' +# SELECT * FROM memory_value WHERE type='percent' AND type_instance='used' +# +############################################################################ +# +# Import list + +# UT imports +import os +import re +import uuid +import collectd +from fm_api import constants as fm_constants +from fm_api import fm_api +import tsconfig.tsconfig as tsc + +# only load influxdb on the controller +if tsc.nodetype == 'controller': + from influxdb import InfluxDBClient + +api = fm_api.FaultAPIs() + +# Debug control +debug = False +debug_lists = False +want_state_audit = False + +# number of notifier loops before the state is object dumped +DEBUG_AUDIT = 2 + +# write a 'value' log on a the resource sample change of more than this amount +LOG_STEP = 10 + +# Number of back to back database update misses +MAX_NO_UPDATE_B4_ALARM = 5 + +# This plugin name +PLUGIN = 'alarm notifier' + +# Path to the plugin's drop dir +PLUGIN_PATH = '/etc/collectd.d/' + +# collectd severity definitions ; +# Note: can't seem to pull then in symbolically with a header +NOTIF_FAILURE = 1 +NOTIF_WARNING = 2 +NOTIF_OKAY = 4 + +PASS = 0 +FAIL = 1 + +# Some plugin_instances are mangled by collectd. +# The filesystem plugin is especially bad for this. +# For instance the "/var/log" MountPoint instance is +# reported as "var-log". +# The following is a list of mangled instances list +# that need the '-' replaced with '/'. 
+# +# ADD_NEW_PLUGIN if there are new file systems being added that +# have subdirectories in the name then they will need to be added +# to the mangled list +mangled_list = {"dev-shm", + "var-log", + "var-run", + "var-lock", + "var-lib-rabbitmq", + "var-lib-postgresql", + "var-lib-ceph-mon", + "etc-nova-instances", + "opt-platform", + "opt-cgcs", + "opt-extension", + "opt-backups"} + +# ADD_NEW_PLUGIN: add new alarm id definition +ALARM_ID__CPU = "100.101" +ALARM_ID__MEM = "100.103" +ALARM_ID__DF = "100.104" +ALARM_ID__EXAMPLE = "100.113" + +# ADD_NEW_PLUGIN: add new alarm id to the list +ALARM_ID_LIST = [ALARM_ID__CPU, + ALARM_ID__MEM, + ALARM_ID__DF, + ALARM_ID__EXAMPLE] + +# ADD_NEW_PLUGIN: add plugin name definition +# WARNING: This must line up exactly with the plugin +# filename without the extension. +PLUGIN__DF = "df" +PLUGIN__CPU = "cpu" +PLUGIN__MEM = "memory" +PLUGIN__INTERFACE = "interface" +PLUGIN__NTP_QUERY = "ntpq" +PLUGIN__VSWITCH_PORT = "vswitch-port" +PLUGIN__VSWITCH_CPU = "vswitch-cpu" +PLUGIN__VSWITCH_MEM = "vswitch-memory" +PLUGIN__VSWITCH_OVSDB = "vswitch-ovsdb" +PLUGIN__VSWITCH_OPENFLOW = "vswitch-openflow" +PLUGIN__VSWITCH_LACP_IFACE = "vswitch-lacp-iface" +PLUGIN__VSWITCH_IFACE = "vswitch-iface" +PLUGIN__NOVA_THINPOOL_LVM = "nova-thinpool-lvm" +PLUGIN__CINDER_THINPOOL_LVM = "cinder-thinpool-lvm" +PLUGIN__CINDER_THINPOOL_LVM_META = "cinder-thinpool-lvm-meta" +PLUGIN__EXAMPLE = "example" + +# ADD_NEW_PLUGIN: add plugin name to list +PLUGIN_NAME_LIST = [PLUGIN__CPU, + PLUGIN__MEM, + PLUGIN__DF, + PLUGIN__EXAMPLE] + + +# ADD_NEW_PLUGIN: add alarm id and plugin to dictionary +# ALARM_ID_TO_PLUGIN_DICT = {} +# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__CPU] = PLUGIN__CPU +# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__MEM] = PLUGIN__MEM +# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__DF] = PLUGIN__DF +# ALARM_ID_TO_PLUGIN_DICT[ALARM_ID__EXAMPLE] = PLUGIN__EXAMPLE + + +# PluginObject Class +class PluginObject: + + dbObj = None # shared database connection obj + host = None # 
saved hostname + database_setup = False # state of database setup + database_setup_in_progress = False # connection mutex + + def __init__(self, id, plugin): + """ + PluginObject Class constructor + """ + + # plugin specific static class members. + self.id = id # alarm id ; 100.1?? + self.plugin = plugin # name of the plugin ; df, cpu, memory ... + self.plugin_instance = "" # the instance name for the plugin + self.resource_name = "" # The top level name of the resource + self.instance_name = "" # The instanhce name + + # Instance specific learned static class members. + self.entity_id = "" # fm entity id host=. + self.instance = "" # _ + + # [ 'float value string','float threshold string] + self.values = [] + self.threshold = float(0) # float value of threshold + self.value = float(0) # float value of reading + + # Common static class members. + self.repair = "" + self.alarm_type = fm_constants.FM_ALARM_TYPE_7 + self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 + self.suppression = True + self.service_affecting = False + + # Severity tracking lists. + # Maintains severity state between notifications. + # Each is a list of entity ids for severity asserted alarms. + # As alarms are cleared so is the entry in these lists. + # The entity id should only be in one lists for any given raised alarm. + self.warnings = [] + self.failures = [] + + # total notification count + self.count = 0 + + # Debug: state audit controls + self.audit_threshold = 0 + self.audit_count = 0 + + # This member is used to help log change values using the + # LOG_STEP threshold consant + self.last_value = "" + + # For plugins that have multiple instances like df (filesystem plugin) + # we need to create an instance of this object for each one. + # This dictionary is used to associate a instance with its object. + self.instance_objects = {} + + def _ilog(self, string): + """ + Create a collectd notifier info log with the specified string. 
+ """ + collectd.info('%s %s : %s' % (PLUGIN, self.plugin, string)) + + def _llog(self, string): + """ + Create a collectd notifier info log with the specified string + if debug_lists is True. + """ + if debug_lists: + collectd.info('%s %s : %s' % (PLUGIN, self.plugin, string)) + + def _elog(self, string): + """ + Create a collectd notifier error log with the specified string. + """ + collectd.error('%s %s : %s' % (PLUGIN, self.plugin, string)) + + ########################################################################## + # + # Name : _state_audit + # + # Purpose : Debug Tool to log plugin object info. + # + # Not called in production code. + # + # Only the severity lists are dumped for now. + # Other info can be added as needed. + # Can be run as an audit or called directly. + # + ########################################################################## + + def _state_audit(self, location): + """ Log the state of the specified object. """ + + if self.id == ALARM_ID__CPU: + _print_state() + + self.audit_count += 1 + if self.warnings: + collectd.info("%s AUDIT %d: %s warning list %s:%s" % + (PLUGIN, + self.audit_count, + self.plugin, + location, + self.warnings)) + if self.failures: + collectd.info("%s AUDIT %d: %s failure list %s:%s" % + (PLUGIN, + self.audit_count, + self.plugin, + location, + self.failures)) + + ########################################################################## + # + # Name : _manage_change + # + # Purpose : Manage sample value change. + # + # Handle no sample update case. + # Parse the notification log + # Generate a log entry if the sample value changes more than + # step value. + # + ########################################################################## + + def _manage_change(self, nObject): + """ Log resource instance value on step state change. 
""" + + # filter out messages to ignore ; notifications that have no value + if "has not been updated for" in nObject.message: + collectd.debug("%s NOT UPDATED: %s" % (PLUGIN, self.entity_id)) + return "done" + + # Get the value from the notification message. + # The location in the message is different based on the message type ; + # normal reading or overage reading + # + # message: Host controller-0, plugin memory type percent ... [snip] + # All data sources are within range again. + # Current value of "value" is 51.412038. <------ + # + # message: Host controller-0, plugin df (instance scratch) ... [snip] + # Data source "value" is currently 97.464027. <------ + # That is above the failure threshold of 90.000000. <------ + + # recognized strings - value only value and threshold + # ------------ ------------------- + value_sig_list = ['Current value of', 'is currently'] + + # list of parsed 'string version' float values ['value','threshold'] + self.values = [] + for sig in value_sig_list: + index = nObject.message.find(sig) + if index != -1: + self.values = \ + re.findall(r"[-+]?\d*\.\d+|\d+", nObject.message[index:-1]) + + # contains string versions of the float values extracted from + # the notification message. The threshold value is included for + # readings that are out of threshold. 
+ if len(self.values): + # validate the reading + try: + self.value = float(self.values[0]) + # get the threshold if its there + if len(self.values) == 2: + self.threshold = float(self.values[1]) + + except ValueError as ex: + collectd.error("%s %s value not integer or float (%s) (%s)" % + (PLUGIN, self.entity_id, self.value, str(ex))) + return "done" + except TypeError as ex: + collectd.info("%s %s value has no type (%s)" % + (PLUGIN, self.entity_id, str(ex))) + return "done" + else: + collectd.info("%s %s reported no value (%s)" % + (PLUGIN, self.entity_id, nObject.message)) + return "done" + + # get the last reading + if self.last_value: + last = float(self.last_value) + else: + last = float(0) + + # Determine if the change is large enough to log and save the new value + logit = False + if self.count == 0 or LOG_STEP == 0: + logit = True + elif self.value > last: + if (last + LOG_STEP) < self.value: + logit = True + elif last > self.value: + if (self.value + LOG_STEP) < last: + logit = True + + # Case on types. + # + # Note: only usage type so far + if logit: + reading_type = "% usage" + tmp = str(self.value).split('.') + if len(tmp[0]) == 1: + pre = ': ' + else: + pre = ': ' + collectd.info("%s reading%s%2.2f %s - %s" % + (PLUGIN, + pre, + self.value, + reading_type, + self.instance_name)) + self.last_value = float(self.value) + + ########################################################################## + # + # Name : _severity_change + # + # Purpose : Compare current severity to instance severity lists to + # facilitate early 'do nothing' exit from a notification. 
+ # + # Returns : True if the severity changed + # False if severity is the same + # + ########################################################################## + + def _severity_change(self, entity_id, severity): + """ + Check for a severity change + """ + + if entity_id in self.warnings: + self._llog(entity_id + " is already in warnings list") + current_severity_str = "warning" + elif entity_id in self.failures: + self._llog(entity_id + " is already in failures list") + current_severity_str = "failure" + else: + self._llog(entity_id + " is already OK") + current_severity_str = "okay" + + # Compare to current state to previous state. + # If they are the same then return done. + if severity == current_severity_str: + return False + else: + return True + + ######################################################################## + # + # Name : _manage_alarm + # + # Putpose : Alarm Severity Tracking + # + # This class member function accepts a severity level and entity id. + # It manages the content of the current alarm object's 'failures' and + # 'warnings' lists ; aka Severity Lists. + # + # These Severity Lists are used to record current alarmed state for + # each instance of a plugin. + # If an alarm is raised then its entity id is added to the appropriate + # severity list. + # + # A failure notification or critical alarm goes in the failures list. + # A warning notification or major alarm goes into the warnings list. + # + # These lists are used to avoid making unnecessary calls to FM. + # + # Startup Behavior: + # + # The collectd daemon runs the init function of every plugin on startup. + # That includes this notifier plugin. The init function queries the FM + # database for any active alarms. + # + # This member function is called for any active alarms that are found. + # The entity id for active alarms is added to the appropriate + # Severity List. This way existing alarms are maintained over collectd + # process startup. 
+ # + # Runtime Behavior: + # + # The current severity state is first queried and compared to the + # newly reported severity level. If they are the same then a "done" + # is returned telling the caller that there is no further work to do. + # Otherwise, the lists are managed in a way that has the entity id + # of a raised alarm in the corresponding severity list. + # + # See inline comments below for each specific severity and state + # transition case. + # + ######################################################################### + + def _manage_alarm(self, entity_id, severity): + """ + Manage the alarm severity lists and report state change. + """ + + collectd.debug("%s manage alarm %s %s %s" % + (PLUGIN, + self.id, + severity, + entity_id)) + + # Get the instance's current state + if entity_id in self.warnings: + current_severity_str = "warning" + elif entity_id in self.failures: + current_severity_str = "failure" + else: + current_severity_str = "okay" + + # Compare to current state to previous state. + # If they are the same then return done. + if severity == current_severity_str: + return "done" + + # Otherwise, manage the severity lists ; case by case. + warnings_list_change = False + failures_list_change = False + + # Case 1: Handle warning to failure severity change. + if severity == "warning" and current_severity_str == "failure": + + if entity_id in self.failures: + self.failures.remove(entity_id) + failures_list_change = True + self._llog(entity_id + " is removed from failures list") + else: + self._elog(entity_id + " UNEXPECTEDLY not in failures list") + + # Error detection + if entity_id in self.warnings: + self.warnings.remove(entity_id) + self._elog(entity_id + " UNEXPECTEDLY in warnings list") + + self.warnings.append(entity_id) + warnings_list_change = True + self._llog(entity_id + " is added to warnings list") + + # Case 2: Handle failure to warning alarm severity change. 
+ elif severity == "failure" and current_severity_str == "warning": + + if entity_id in self.warnings: + self.warnings.remove(entity_id) + warnings_list_change = True + self._llog(entity_id + " is removed from warnings list") + else: + self._elog(entity_id + " UNEXPECTEDLY not in warnings list") + + # Error detection + if entity_id in self.failures: + self.failures.remove(entity_id) + self._elog(entity_id + " UNEXPECTEDLY in failures list") + + self.failures.append(entity_id) + failures_list_change = True + self._llog(entity_id + " is added to failures list") + + # Case 3: Handle new alarm. + elif severity != "okay" and current_severity_str == "okay": + if severity == "warning": + self.warnings.append(entity_id) + warnings_list_change = True + self._llog(entity_id + " added to warnings list") + elif severity == "failure": + self.failures.append(entity_id) + failures_list_change = True + self._llog(entity_id + " added to failures list") + + # Case 4: Handle alarm clear. + else: + # plugin is okay, ensure this plugin's entity id + # is not in either list + if entity_id in self.warnings: + self.warnings.remove(entity_id) + warnings_list_change = True + self._llog(entity_id + " removed from warnings list") + if entity_id in self.failures: + self.failures.remove(entity_id) + failures_list_change = True + self._llog(entity_id + " removed from failures list") + + if warnings_list_change is True: + if self.warnings: + collectd.info("%s %s warnings %s" % + (PLUGIN, self.plugin, self.warnings)) + else: + collectd.info("%s %s no warnings" % + (PLUGIN, self.plugin)) + + if failures_list_change is True: + if self.failures: + collectd.info("%s %s failures %s" % + (PLUGIN, self.plugin, self.failures)) + else: + collectd.info("%s %s no failures" % + (PLUGIN, self.plugin)) + + ########################################################################## + # + # Name : _create_instance_objects + # + # Purpose : Create a list of instance objects for 'self' type plugin and + # add those 
objects to the parnet's instance_objects dictionary. + # + ########################################################################## + def _create_instance_objects(self): + """ + Create, initialize and add an instance object to this/self plugin + """ + + # ADD_NEW_PLUGIN: for plugins that have instances you need to + # add support for creating those instances and adding + # those instances to the parent instance_objects list. + + # Currently only the DF plugin has subordinate instance objects. + if self.id == ALARM_ID__DF: + + # read the df.conf file and return/get a list of mount points + conf_file = PLUGIN_PATH + 'df.conf' + if not os.path.exists(conf_file): + collectd.error("%s cannot create filesystem " + "instance objects ; missing : %s" % + (PLUGIN, conf_file)) + return FAIL + + mountpoints = [] + with open(conf_file, 'r') as infile: + for line in infile: + if 'MountPoint ' in line: + + # get the mountpoint path from the line + try: + mountpoint = line.split('MountPoint ')[1][1:-2] + mountpoints.append(mountpoint) + except: + collectd.error("%s skipping invalid '%s' " + "mountpoint line: %s" % + (PLUGIN, conf_file, line)) + + collectd.debug("%s MountPoints: %s" % (PLUGIN, mountpoints)) + + # loop over the mount points + for mp in mountpoints: + # create a new plugin object + inst_obj = PluginObject(ALARM_ID__DF, PLUGIN__DF) + + # initialize the object with instance specific data + inst_obj.resource_name = self.resource_name + inst_obj.instance_name = mp + # build the plugin instance name from the mount point + if mp == '/': + inst_obj.plugin_instance = 'root' + else: + inst_obj.plugin_instance = mp[1:].replace('/', '-') + + inst_obj.entity_id = _build_entity_id(PLUGIN__DF, + inst_obj.plugin_instance) + + # add this subordinate object to the parent's + # instance object list + self.instance_objects[inst_obj.entity_id] = inst_obj + + collectd.info("%s monitoring %s usage" % + (PLUGIN, mp)) + + +PluginObject.host = os.uname()[1] + + +# ADD_NEW_PLUGIN: add 
plugin to this table +# This instanciates the plugin objects +PLUGINS = {PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU), + PLUGIN__MEM: PluginObject(ALARM_ID__MEM, PLUGIN__MEM), + PLUGIN__DF: PluginObject(ALARM_ID__DF, PLUGIN__DF), + PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)} + + +def _get_base_object(alarm_id): + """ + Get the alarm object for the specified alarm id. + """ + for plugin in PLUGIN_NAME_LIST: + if PLUGINS[plugin].id == alarm_id: + return PLUGINS[plugin] + return None + + +def _get_object(alarm_id, eid): + """ + Get the plugin object for the specified alarm id and eid + """ + + base_obj = _get_base_object(alarm_id) + if len(base_obj.instance_objects): + try: + return(base_obj.instance_objects[eid]) + except: + collectd.debug("%s %s has no instance objects" % + (PLUGIN, base_obj.plugin)) + return base_obj + + +def is_uuid_like(val): + """Returns validation of a value as a UUID. + + For our purposes, a UUID is a canonical form string: + aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa + """ + try: + return str(uuid.UUID(val)) == val + except (TypeError, ValueError, AttributeError): + return False + + +def _build_entity_id(plugin, plugin_instance): + """ + Builds an entity id string based on the collectd notification object. 
+ """ + + entity_id = 'host=' + entity_id += PluginObject.host + + if plugin == PLUGIN__DF: + if plugin_instance: + instance = plugin_instance + + # build the entity_id for this plugin + entity_id += '.filesystem=/' + + # collectd replaces the instance '/' with the word 'root' + # So skip over "root" as '/' is already part of the + # entity_id + if instance != 'root': + # Look for other instances that are in the mangled list + if instance in mangled_list: + instance = instance.replace('-', '/') + entity_id += instance + + # collectd.info("%s entity_id : %s" % (PLUGIN, entity_id)) + + return entity_id + + +def _get_df_mountpoints(): + """ + """ + + conf_file = PLUGIN_PATH + 'df.conf' + if not os.path.exists(conf_file): + collectd.error("%s cannot create filesystem " + "instance objects ; missing : %s" % + (PLUGIN, conf_file)) + return FAIL + + mountpoints = [] + with open(conf_file, 'r') as infile: + for line in infile: + if 'MountPoint ' in line: + + # get the mountpoint path from the line + try: + mountpoint = line.split('MountPoint ')[1][1:-2] + mountpoints.append(mountpoint) + except: + collectd.error("%s skipping invalid '%s' " + "mountpoint line: %s" % + (PLUGIN, conf_file, line)) + + return(mountpoints) + + +def _print_state(obj=None): + """ + Print the current object state + """ + objs = [] + if obj is None: + objs.append(_get_base_object(ALARM_ID__CPU)) + objs.append(_get_base_object(ALARM_ID__MEM)) + objs.append(_get_base_object(ALARM_ID__DF)) + else: + objs.append(obj) + for o in objs: + collectd.info("%s PLUGIN %2d [%6s:%2.2f:%s] [w:%s f:%s] %d" % + (PLUGIN, + len(o.instance_objects), + o.plugin, + o.value, + o.entity_id, + o.warnings, + o.failures, + o.count)) + if len(o.instance_objects): + for inst_obj in o.instance_objects: + collectd.info("%s INSTANCE [%6s:%2.2f:%s] [w:%s f:%s] %d" % + (PLUGIN, + inst_obj.plugin, + inst_obj.value, + inst_obj.entity_id, + inst_obj.warnings, + inst_obj.failures, + inst_obj.count)) + + +def _database_setup(database): + 
""" + Setup the influx database for collectd resource samples + """ + + collectd.info("%s setting up influxdb:%s database" % + (PLUGIN, database)) + + error_str = "" + + # http://influxdb-python.readthedocs.io/en/latest/examples.html + # http://influxdb-python.readthedocs.io/en/latest/api-documentation.html + PluginObject.dbObj = InfluxDBClient('127.0.0.1', '8086', database) + if PluginObject.dbObj: + try: + PluginObject.dbObj.create_database('collectd') + + ############################################################ + # + # TODO: Read current retention period from service parameter + # Make it a puppet implementation. + # + # Create a 1 month samples retention policy + # ----------------------------------------- + # name = 'collectd samples' + # duration = set retention period in time + # xm - minutes + # xh - hours + # xd - days + # xw - weeks + # xy - years + # database = 'collectd' + # default = True ; make it the default + # + ############################################################ + + PluginObject.dbObj.create_retention_policy( + 'collectd samples', '4w', 1, database, True) + except Exception as ex: + if str(ex) == 'database already exists': + try: + collectd.info("%s influxdb:collectd %s" % + (PLUGIN, str(ex))) + PluginObject.dbObj.create_retention_policy( + 'collectd samples', '4w', 1, database, True) + except Exception as ex: + if str(ex) == 'retention policy already exists': + collectd.info("%s influxdb:collectd %s" % + (PLUGIN, str(ex))) + else: + error_str = "failure from influxdb ; " + error_str += str(ex) + else: + error_str = "failed to create influxdb:" + database + else: + error_str = "failed to connect to influxdb:" + database + + if not error_str: + retention = \ + PluginObject.dbObj.get_list_retention_policies(database) + collectd.info("%s influxdb:%s samples retention policy: %s" % + (PLUGIN, database, retention)) + collectd.info("%s influxdb:%s is setup" % (PLUGIN, database)) + PluginObject.database_setup = True + + +def 
_clear_alarm_for_missing_filesystems(): + """ + Clear alarmed file systems that are no longer mounted or present + """ + + # get the DF (filesystem plugin) base object. + df_base_obj = PLUGINS[PLUGIN__DF] + # create a single alarm list from both wranings and failures list + # to avoid having to duplicate the code below for each. + # At this point we don't care about severity, we just need to + # determine if an any-severity' alarmed filesystem no longer exists + # so we can cleanup by clearing its alarm. + # Note: the 2 lists shpould always contain unique data between them + alarm_list = df_base_obj.warnings + df_base_obj.failures + if len(alarm_list): + for eid in alarm_list: + # search for any of them that might be alarmed. + obj = df_base_obj.instance_objects[eid] + + # only care about df (file system plugins) + if obj.plugin == PLUGIN__DF and \ + obj.entity_id == eid and \ + obj.plugin_instance != 'root': + + # For all others replace all '-' with '/' + path = '/' + obj.plugin_instance.replace('-', '/') + if os.path.ismount(path) is False: + if api.clear_fault(df_base_obj.id, obj.entity_id) is False: + collectd.error("%s %s:%s clear failed ; will retry" % + (PLUGIN, df_base_obj.id, obj.entity_id)) + else: + collectd.info("%s cleared alarm for missing %s" % + (PLUGIN, path)) + df_base_obj._manage_alarm(obj.entity_id, "okay") + else: + collectd.debug("%s maintaining alarm for %s" % + (PLUGIN, path)) + return 0 + + +# Collectd calls this function on startup. +# Initialize each plugin object with plugin specific data. +# Query FM for existing alarms and run with that starting state. 
def init_func():
    """Collectd FM Notifier Initialization Function.

    Initializes the static plugin objects (cpu, memory, df, example),
    runs the influxdb database setup on controller nodes and then loads
    any already-raised FM alarms for this host into the plugin severity
    lists so alarm state survives a collectd restart.
    """

    PluginObject.host = os.uname()[1]
    collectd.info("%s %s:%s init function" %
                  (PLUGIN, tsc.nodetype, PluginObject.host))

    # Constant CPU Plugin Object Settings
    obj = PLUGINS[PLUGIN__CPU]
    obj.resource_name = "Platform CPU"
    obj.instance_name = PLUGIN__CPU
    obj.repair = "Monitor and if condition persists, "
    obj.repair += "contact next level of support."
    collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

    # Constant Memory Plugin Object settings
    obj = PLUGINS[PLUGIN__MEM]
    obj.resource_name = "Memory"
    obj.instance_name = PLUGIN__MEM
    obj.repair = "Monitor and if condition persists, "
    obj.repair += "contact next level of support; "
    obj.repair += "may require additional memory on Host."
    collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

    # Constant FileSystem Plugin Object settings
    obj = PLUGINS[PLUGIN__DF]
    obj.resource_name = "File System"
    obj.instance_name = PLUGIN__DF
    obj.repair = "Monitor and if condition persists, "
    obj.repair += "contact next level of support."

    # The FileSystem (DF) plugin has multiple instances ;
    # one per file system mount point being monitored.
    # Create one DF instance object per mount point.
    obj._create_instance_objects()

    obj = PLUGINS[PLUGIN__EXAMPLE]
    obj.resource_name = "Example"
    obj.instance_name = PLUGIN__EXAMPLE
    obj.repair = "Not Applicable"
    collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

    if tsc.nodetype == 'controller':
        PluginObject.database_setup_in_progress = True
        _database_setup('collectd')
        PluginObject.database_setup_in_progress = False

    # ...
    # ADD_NEW_PLUGIN: Add new plugin object initialization here ...
    # ...

    ######################################################################
    #
    # With plugin objects initialized ...
    # Query FM for any resource alarms that may already be raised.
    # Load the queried severity state into the appropriate
    # severity list for those that are.
    for alarm_id in ALARM_ID_LIST:
        collectd.debug("%s searching for all '%s' alarms " %
                       (PLUGIN, alarm_id))
        alarms = api.get_faults_by_id(alarm_id)
        if alarms:
            for alarm in alarms:
                eid = alarm.entity_instance_id
                # ignore alarms not for this host
                if PluginObject.host not in eid:
                    continue

                base_obj = _get_base_object(alarm_id)
                if base_obj is None:
                    # Handle an unrecognized alarm by clearing it ;
                    # should never happen since we are iterating
                    # over an internal alarm_id list.
                    if api.clear_fault(alarm_id, eid) is False:
                        collectd.error("%s %s:%s not found ; clear failed" %
                                       (PLUGIN,
                                        alarm_id,
                                        eid))
                    else:
                        # the clear succeeded ; log as info, not error
                        collectd.info("%s %s:%s not found ; cleared" %
                                      (PLUGIN,
                                       alarm_id,
                                       eid))
                    continue

                collectd.info('%s found %s alarm with %s severity [%s:%s:%s]' %
                              (PLUGIN,
                               base_obj.id,
                               alarm.severity,
                               base_obj.plugin,
                               alarm_id,
                               eid))
                if alarm.severity == "critical":
                    sev = "failure"
                elif alarm.severity == "major":
                    sev = "warning"
                else:
                    # nothing to load for a cleared alarm ; skip it
                    continue

                # Load the alarm severity by doing a plugin/instance lookup.
                base_obj._manage_alarm(eid, sev)


# The notifier function inspects the collectd notification and determines if
# the representative alarm needs to be asserted, severity changed, or cleared.
def notifier_func(nObject):
    """Collectd FM Notifier Handler Function.

    Maps the collectd notification severity to an FM alarm severity and
    asserts, re-severities or clears the representative alarm via the
    FM API.
    """

    collectd.debug('%s notification: %s %s:%s - %s %s %s [%s]' % (
        PLUGIN,
        nObject.host,
        nObject.plugin,
        nObject.plugin_instance,
        nObject.type,
        nObject.type_instance,
        nObject.severity,
        nObject.message))

    # Load up severity variables and alarm actions based on
    # this notification's severity level.
    if nObject.severity == NOTIF_OKAY:
        severity_str = "okay"
        _severity_num = fm_constants.FM_ALARM_SEVERITY_CLEAR
        _alarm_state = fm_constants.FM_ALARM_STATE_CLEAR
    elif nObject.severity == NOTIF_FAILURE:
        severity_str = "failure"
        _severity_num = fm_constants.FM_ALARM_SEVERITY_CRITICAL
        _alarm_state = fm_constants.FM_ALARM_STATE_SET
    elif nObject.severity == NOTIF_WARNING:
        severity_str = "warning"
        _severity_num = fm_constants.FM_ALARM_SEVERITY_MAJOR
        _alarm_state = fm_constants.FM_ALARM_STATE_SET
    else:
        collectd.debug('%s with unsupported severity %d' %
                       (PLUGIN, nObject.severity))
        return 0

    # Lazy database setup on controllers ; deferred until the first
    # notification if the init-time setup did not complete.
    if tsc.nodetype == 'controller':
        if PluginObject.database_setup is False:
            if PluginObject.database_setup_in_progress is False:
                PluginObject.database_setup_in_progress = True
                _database_setup('collectd')
                PluginObject.database_setup_in_progress = False

    # get plugin object
    if nObject.plugin in PLUGINS:
        base_obj = obj = PLUGINS[nObject.plugin]

        # if this notification is for a plugin instance then get that
        # instance's object instead. A missing instance object is an
        # error ; the notification is dropped.
        eid = ''
        if nObject.plugin_instance:
            # Build the entity_id from the parent object if needed
            eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
            try:
                inst_obj = base_obj.instance_objects[eid]
                if inst_obj is None:
                    collectd.error("%s %s:%s instance object is None" %
                                   (PLUGIN,
                                    nObject.plugin,
                                    nObject.plugin_instance))
                    return 0
            except KeyError:
                # not in the instance object list ; drop the notification
                collectd.error("%s %s:%s instance object not found" %
                               (PLUGIN,
                                nObject.plugin,
                                nObject.plugin_instance))
                return 0

            # re-assign the object
            obj = inst_obj
        else:
            if not len(base_obj.entity_id):
                # Build the entity_id from the parent object if needed
                eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)

        # TODO: Needed ?
        if not len(obj.instance):
            obj.instance = nObject.plugin
            if nObject.plugin_instance:
                obj.instance += '_' + nObject.plugin_instance

        # TODO: Needed ?
        # update the object with the eid if its not already set.
        if not len(obj.entity_id):
            obj.entity_id = eid

    else:
        collectd.debug("%s notification for unknown plugin: %s %s" %
                       (PLUGIN, nObject.plugin, nObject.plugin_instance))
        return 0

    # _print_state(obj)

    # If want_state_audit is True then run the audit.
    # Primarily used for debug
    # default state is False
    # TODO: comment out for production code.
    if want_state_audit:
        obj.audit_threshold += 1
        if obj.audit_threshold == DEBUG_AUDIT:
            obj.audit_threshold = 0
            obj._state_audit("audit")

    # manage reading value change ; store last and log if gt obj.step
    action = obj._manage_change(nObject)
    if action == "done":
        return 0

    # increment just before any possible return for a valid sample
    obj.count += 1

    # audit file system presence every time we get the
    # notification for the root file system ; which will
    # always be there.
    if obj.instance == 'df_root':
        _clear_alarm_for_missing_filesystems()

    # exit early if there is no severity change
    if base_obj._severity_change(obj.entity_id, severity_str) is False:
        return 0

    if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
        if api.clear_fault(base_obj.id, obj.entity_id) is False:
            collectd.error("%s %s:%s clear_fault failed" %
                           (PLUGIN, base_obj.id, obj.entity_id))
    else:
        reason = obj.resource_name
        reason += " threshold exceeded"
        if obj.threshold:
            reason += "; {:2.0f}".format(obj.threshold) + "%"
        if obj.value:
            reason += ", actual " + "{:2.0f}".format(obj.value) + "%"

        fault = fm_api.Fault(
            alarm_id=base_obj.id,
            alarm_state=_alarm_state,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
            entity_instance_id=obj.entity_id,
            severity=_severity_num,
            reason_text=reason,
            alarm_type=base_obj.alarm_type,
            probable_cause=base_obj.cause,
            proposed_repair_action=base_obj.repair,
            service_affecting=base_obj.service_affecting,
            suppression=base_obj.suppression)

        alarm_uuid = api.set_fault(fault)
        if is_uuid_like(alarm_uuid) is False:
            collectd.error("%s %s:%s set_fault failed:%s" %
                           (PLUGIN, base_obj.id, obj.entity_id, alarm_uuid))
            return 0

    # update the severity lists now that FM has been updated
    base_obj._manage_alarm(obj.entity_id, severity_str)

    collectd.info("%s %s alarm %s:%s %s:%s thld:%2.2f value:%2.2f" % (
        PLUGIN,
        _alarm_state,
        base_obj.id,
        severity_str,
        obj.instance,
        obj.entity_id,
        obj.threshold,
        obj.value))

    # Debug only: comment out for production code.
    # obj._state_audit("change")
+ # obj._state_audit("change") + +collectd.register_init(init_func) +collectd.register_notification(notifier_func) diff --git a/monitoring/collectd-extensions/src/interface.conf b/monitoring/collectd-extensions/src/interface.conf new file mode 100644 index 000000000..c7ef627f6 --- /dev/null +++ b/monitoring/collectd-extensions/src/interface.conf @@ -0,0 +1,13 @@ + + + + Instance "state" + Persist true + PersistOK true + WarningMin 50 + FailureMin 0 +# Hits 2 + Invert false + + + diff --git a/monitoring/collectd-extensions/src/interface.py b/monitoring/collectd-extensions/src/interface.py new file mode 100755 index 000000000..ae42a47d6 --- /dev/null +++ b/monitoring/collectd-extensions/src/interface.py @@ -0,0 +1,129 @@ +# +# Copyright (c) 2018 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +############################################################################ +# +# This is the Host Interface Monitor plugin for Collectd. +# +# Only mgmnt , infra and oam interfaces are supported with the following +# mapping specified in /etc/platform/platform.conf +# +# mgmnt - management_interface | all hosts | manditory +# infa - infrastructure_interface | any host | optional +# oam - oam_interface | controller | manditory +# +# This plugin reports link state inb the following way. 
PLUGIN = 'interface plugin'

# static variables

# Labels parsed out of /etc/platform/platform.conf
PLATFORM_CONF_MGMNT_LABEL = "management_interface="
PLATFORM_CONF_INFRA_LABEL = "infrastructure_interface="
PLATFORM_CONF_OAM_LABEL = "oam_interface="

NETWORK_MGMNT = 'mgmnt'
NETWORK_INFRA = 'infra'
NETWORK_OAM = 'oam'


class iface:
    """One monitored network interface (master record + slave records)."""

    def __init__(self, n, m, s):
        # master interface record ; link state defaults to 'down'
        self.master = {'network': n, 'name': m, 'state': 'down', 'slaves': s}
        self.slave1 = {}
        self.slave2 = {}
        # simulated usage value dispatched by read_func (100 -> 0 cycle)
        self.state = int(100)


# Renamed from 'object' which shadowed the python builtin ;
# the class is only used to build the module-level 'obj' below.
class InterfaceMonitor:
    """Plugin state ; one iface entry per supported network."""

    hostname = ''

    def __init__(self):
        self.NETWORKS = {}
        self.NETWORKS[NETWORK_MGMNT] = None
        self.NETWORKS[NETWORK_INFRA] = None
        self.NETWORKS[NETWORK_OAM] = None


obj = InterfaceMonitor()


# The config function - called once on collectd process startup
def config_func(config):
    """Configure the plugin ; no configurable options yet."""

    collectd.debug('%s config function' % PLUGIN)
    return 0


# The init function - called once on collectd process startup
def init_func():
    """Learn the monitored interface names from platform.conf."""

    # get current hostname
    obj.hostname = os.uname()[1]

    # get the master interface names from /etc/platform/platform.conf
    with open(tsc.PLATFORM_CONF_FILE, 'r') as infile:
        for line in infile:

            # Management Interface
            if PLATFORM_CONF_MGMNT_LABEL in line:
                # NOTE(review): only the newline is stripped ; trailing
                # spaces in platform.conf would be kept - confirm format.
                name = line.split('=')[1].replace('\n', '')
                obj.NETWORKS[NETWORK_MGMNT] = iface(NETWORK_MGMNT, name, 0)
                collectd.info("%s monitoring mgmnt interface : %s" %
                              (PLUGIN,
                               obj.NETWORKS[NETWORK_MGMNT].master['name']))

            # Infrastructure Interface
            elif PLATFORM_CONF_INFRA_LABEL in line:
                name = line.split('=')[1].replace('\n', '')
                obj.NETWORKS[NETWORK_INFRA] = iface(NETWORK_INFRA, name, 0)
                collectd.info("%s monitoring infra interface : %s" %
                              (PLUGIN,
                               obj.NETWORKS[NETWORK_INFRA].master['name']))

            # OAM Interface
            elif PLATFORM_CONF_OAM_LABEL in line:
                name = line.split('=')[1].replace('\n', '')
                obj.NETWORKS[NETWORK_OAM] = iface(NETWORK_OAM, name, 0)
                collectd.info("%s monitoring oam interface: %s" %
                              (PLUGIN,
                               obj.NETWORKS[NETWORK_OAM].master['name']))

    return 0


# The sample read function - called on every audit interval
def read_func():
    """Dispatch a simulated mgmnt link usage sample on each audit."""

    mgmnt = obj.NETWORKS[NETWORK_MGMNT]
    if mgmnt is None:
        # platform.conf did not specify a management interface ;
        # nothing to sample (the original code raised AttributeError here).
        return 0

    # cycle the simulated value 100 -> 75 -> 50 -> 25 -> 0 -> 100 ...
    if mgmnt.state == 0:
        mgmnt.state = 100
    else:
        mgmnt.state -= 25

    # Dispatch usage value to collectd
    val = collectd.Values(host=obj.hostname)
    val.plugin = 'interface'
    val.plugin_instance = 'mgmnt'
    val.type = 'absolute'
    val.type_instance = 'used'
    val.dispatch(values=[mgmnt.state])
    return 0
debug = False

# general return codes
PASS = 0
FAIL = 1

PLUGIN = 'platform memory usage'


# Memory usage state class.
# (The original header comment said 'CPU Control class' ; this plugin
# samples platform memory, not cpu.)
class MEM:
    hostname = ""            # hostname for sample notification message
    cmd = '/proc/meminfo'    # the file queried for memory info
    value = float(0.0)       # computed memory usage in percent

    # meminfo values we care about
    memTotal_kB = 0
    memFree_kB = 0
    buffers = 0
    cached = 0
    SReclaimable = 0
    CommitLimit = 0
    Committed_AS = 0
    HugePages_Total = 0
    Hugepagesize = 0
    AnonPages = 0

    # derived values
    avail = 0
    total = 0
    strict = 0   # /proc/sys/vm/overcommit_memory ; 1 = strict accounting


# Instantiate the class
obj = MEM()


def config_func(config):
    """Configure the memory usage plugin.

    Supported key: 'path' - overrides the default /proc/meminfo source.
    """

    for node in config.children:
        key = node.key.lower()
        val = node.values[0]

        if key == 'path':
            obj.cmd = str(val)
            collectd.info("%s configured query command: '%s'" %
                          (PLUGIN, obj.cmd))
            return 0

    collectd.info("%s no config command provided ; "
                  "defaulting to '%s'" %
                  (PLUGIN, obj.cmd))


# Learn the overcommit accounting mode at process startup.
def init_func():
    # get current hostname
    obj.hostname = os.uname()[1]

    fn = '/proc/sys/vm/overcommit_memory'
    if os.path.exists(fn):
        with open(fn, 'r') as infile:
            for line in infile:
                obj.strict = int(line)
                break

    collectd.info("%s strict:%d" % (PLUGIN, obj.strict))


# Calculate the memory usage sample
def read_func():
    """Sample platform memory usage from /proc/meminfo and dispatch it."""

    meminfo = {}
    try:
        with open(obj.cmd) as fd:
            for line in fd:
                meminfo[line.split(':')[0]] = line.split(':')[1].strip()

    except EnvironmentError as e:
        # Fix: the original format string dropped the path and printed
        # a literal 'str(e)'.
        collectd.error("%s unable to read from %s ; %s" %
                       (PLUGIN, obj.cmd, str(e)))
        return FAIL

    # remove the 'unit' (kB) suffix that might be on some of the lines
    for key in meminfo:
        value_unit = [u.strip() for u in meminfo[key].split(' ', 1)]
        if len(value_unit) == 2:
            value, unit = value_unit
            meminfo[key] = float(value)
        else:
            meminfo[key] = float(meminfo[key])

    try:
        obj.memTotal_kB = float(meminfo['MemTotal'])
        obj.memFree_kB = float(meminfo['MemFree'])
        obj.buffers = float(meminfo['Buffers'])
        obj.cached = float(meminfo['Cached'])
        obj.SReclaimable = float(meminfo['SReclaimable'])
        obj.CommitLimit = float(meminfo['CommitLimit'])
        obj.Committed_AS = float(meminfo['Committed_AS'])
        obj.HugePages_Total = float(meminfo['HugePages_Total'])
        obj.Hugepagesize = float(meminfo['Hugepagesize'])
        obj.AnonPages = float(meminfo['AnonPages'])
    except KeyError as e:
        # a field this plugin depends on is missing from the source file ;
        # fail this sample rather than raising into collectd.
        collectd.error("%s %s is missing field %s" %
                       (PLUGIN, obj.cmd, str(e)))
        return FAIL

    # 'avail' approximates reclaimable+free memory ; 'total' adds the
    # anonymous pages in use.
    obj.avail = float(float(obj.memFree_kB) +
                      float(obj.buffers) +
                      float(obj.cached) +
                      float(obj.SReclaimable))
    obj.total = float(float(obj.avail) +
                      float(obj.AnonPages))

    try:
        if obj.strict == 1:
            # strict overcommit accounting ; usage is committed/limit
            obj.value = float(float(obj.Committed_AS) /
                              float(obj.CommitLimit))
        else:
            obj.value = float(float(obj.AnonPages) / float(obj.total))
    except ZeroDivisionError:
        # CommitLimit or computed total was zero ; fail this sample
        collectd.error("%s zero denominator reading %s" % (PLUGIN, obj.cmd))
        return FAIL

    obj.value = float(float(obj.value) * 100)

    collectd.debug('%s reports %.2f %% usage' %
                   (PLUGIN, obj.value))

    # Dispatch usage value to collectd
    val = collectd.Values(host=obj.hostname)
    val.plugin = 'memory'
    val.type = 'percent'
    val.type_instance = 'used'
    val.dispatch(values=[obj.value])

    return PASS
+# +# Messages to maintenance are throttled ONE_EVERY while this state is the +# same as last state. +# +# Message is sent on every state change +# from clear to assert or +# from assert to clear +# +# See code comments for details. +# +############################################################################ +# +# Import list + +import os +import socket +import collectd + +# This plugin name +PLUGIN = 'degrade notifier' + +# collectd severity definitions ; +# Note: can't seem to pull then in symbolically with a header +NOTIF_FAILURE = 1 +NOTIF_WARNING = 2 +NOTIF_OKAY = 4 + +# generic return codes +PASS = 0 +FAIL = 1 + +# default mtce port. +# ... with configuration override +MTCE_CMD_RX_PORT = 2101 + +# same state message throttle count. +# ... only send the the degrade message every 'this' number +# while the state of assert or clear remains the same. +ONE_EVERY = 10 + +PLUGIN__DF = 'df' +PLUGIN__MEM = 'memory' +PLUGIN__CPU = 'cpu' +PLUGIN_INTERFACE = 'interface' +PLUGIN__EXAMPLE = 'example' + + +# The collectd Maintenance Notifier Object +class collectdMtceNotifierObject: + + def __init__(self, port): + """ + collectdMtceNotifierObject Class constructor + """ + # default maintenance port + self.port = port + self.addr = None + + # specifies the protocol family to use when messaging maintenance. + # if system is IPV6, then that is learned and this 'protocol' is + # updated with AF_INET6 + self.protocol = socket.AF_INET + + # List of plugin names that require degrade for specified severity. + self.degrade_list__failure = [PLUGIN__DF, + PLUGIN__MEM, + PLUGIN__CPU, + PLUGIN_INTERFACE, + PLUGIN__EXAMPLE] + self.degrade_list__warning = [] + + # the running list of resources that require degrade. + # a degrade clear message is sent whenever this list is empty. + # a degrade assert message is sent whenever this list is not empty. 
+ self.degrade_list = [] + + # throttle down sending of duplicate degrade assert/clear messages + self.last_state = "undef" + self.msg_throttle = 0 + + +# Instantiate the mtce_notifier object +# This object persists from notificaiton to notification +obj = collectdMtceNotifierObject(MTCE_CMD_RX_PORT) + + +def _get_active_controller_ip(): + """ + Get the active controller host IP + """ + + try: + obj.addr = socket.getaddrinfo('controller', None)[0][4][0] + collectd.info("%s controller ip: %s" % (PLUGIN, obj.addr)) + except Exception as ex: + obj.addr = None + collectd.error("%s failed to get controller ip ; %s" % + (PLUGIN, str(ex))) + return 0 + + +def _df_instance_to_path(df_inst): + """ + Convert a df instance name to a mountpoint + """ + + # df_root is not a dynamic file system. Ignore that one. + if df_inst == 'df_root': + return '/' + else: + # For all others replace all '-' with '/' + return('/' + df_inst[3:].replace('-', '/')) + + +# This function removes degraded file systems that are no longer present. +def _clear_degrade_for_missing_filesystems(): + """ + Remove degraded file systems that are no longer mounted or present. + """ + + for df_inst in obj.degrade_list: + + # Only file system plugins are looked at. + # File system plugin instance names are prefixed with 'df_' + # as the first 3 chars in the instance name. + if df_inst[0:3] == 'df_': + path = _df_instance_to_path(df_inst) + + # check the mount point. + # if the mount point no longer exists then remove + # this instance from the degrade list. + if os.path.ismount(path) is False: + collectd.info("%s clearing degrade for missing %s ; %s" % + (PLUGIN, path, obj.degrade_list)) + obj.degrade_list.remove(df_inst) + + return 0 + + +# The collectd configuration interface +# +# Used to configure the maintenance port. +# key = 'port' +# val = port number +# +def config_func(config): + """ + Configure the maintenance degrade notifier plugin. 
+ """ + + collectd.info('%s config function' % PLUGIN) + for node in config.children: + key = node.key.lower() + val = node.values[0] + + if key == 'port': + obj.port = int(val) + collectd.info("%s configured mtce port: %d" % + (PLUGIN, obj.port)) + return 0 + + obj.port = MTCE_CMD_RX_PORT + collectd.error("%s no mtce port provided ; defaulting to %d" % + (PLUGIN, obj.port)) + + +# Collectd calls this function on startup. +def init_func(): + """ + Collectd Mtce Notifier Initialization Function + """ + + collectd.debug("%s init function" % PLUGIN) + + +# This is the Notifier function that is called by collectd. +# +# Handling steps are +# +# 1. build resource name from notification object. +# 2. check resource against severity lists. +# 3. manage this instance's degrade state. +# 4. send mtcAgent the degrade state message. +# +def notifier_func(nObject): + """ + Collectd Mtce Notifier Handler Function + """ + + # Create the resource name from the notifier object. + # format: _ + resource = nObject.plugin + if nObject.plugin_instance: + resource += "_" + nObject.plugin_instance + + # This block looks at the current notification severity + # and manages the degrade_list. + # If the specified plugin name exists in each of the warnings + # or failure lists and there is a current severity match then + # add that resource instance to the degrade list. + # Conversly if this notification is OKAY then make sure this + # resource instance is not in the degrade list (remove it if it is) + if nObject.severity is NOTIF_OKAY: + if obj.degrade_list and resource in obj.degrade_list: + obj.degrade_list.remove(resource) + + elif nObject.severity is NOTIF_FAILURE: + if obj.degrade_list__failure: + if nObject.plugin in obj.degrade_list__failure: + if resource not in obj.degrade_list: + # handle dynamic filesystems going missing over a swact + # or unmount and being reported as a transient error by + # the df plugin. Don't add it to the failed list if the + # mountpoint is gone. 
+ add = True + if nObject.plugin == PLUGIN__DF: + path = _df_instance_to_path(resource) + add = os.path.ismount(path) + if add is True: + collectd.debug("%s %s added to degrade list" % + (PLUGIN, resource)) + obj.degrade_list.append(resource) + else: + # If severity is failure and no failures cause degrade + # then make sure this plugin is not in the degrade list, + # Should never occur. + if resource in obj.degrade_list: + obj.degrade_list.remove(resource) + + elif nObject.severity is NOTIF_WARNING: + if obj.degrade_list__warning: + if nObject.plugin in obj.degrade_list__warning: + if resource not in obj.degrade_list: + # handle dynamic filesystems going missing over a swact + # or unmount and being reported as a transient error by + # the df plugin. Don't add it to the failed list if the + # mountpoint is gone. + add = True + if nObject.plugin == PLUGIN__DF: + path = _df_instance_to_path(resource) + add = os.path.ismount(path) + if add is True: + collectd.debug("%s %s added to degrade list" % + (PLUGIN, resource)) + obj.degrade_list.append(resource) + else: + # If severity is warning and no warnings cause degrade + # then make sure this plugin is not in the degrade list. + # Should never occur.. + if resource in obj.degrade_list: + obj.degrade_list.remove(resource) + else: + collectd.info("%s unsupported severity %d" % + (PLUGIN, nObject.severity)) + return FAIL + + # running counter of notifications. + obj.msg_throttle += 1 + + # Support for Dynamic File Systems + # -------------------------------- + # Some active controller mounted filesystems can become + # unmounted under the watch of collectd. This can occur + # as a result of a Swact. If an 'degrade' is raised at the + # time an fs disappears then that state can become stuck + # active until the next Swact. This call handles this case. + # + # Audit file system presence every time we get the + # notification for the root file system. + # Depending on the root filesystem always being there. 
+ if nObject.plugin == 'df' \ + and nObject.plugin_instance == 'root' \ + and len(obj.degrade_list): + _clear_degrade_for_missing_filesystems() + + # If degrade list is empty then a clear state is sent to maintenance. + # If degrade list is NOT empty then an assert state is sent to maintenance + # For logging and to ease debug the code below will create a list of + # degraded resource instances to be included in the message to maintenance + # for mtcAgent to optionally log it. + resources = "" + if obj.degrade_list: + # loop over the list, + # limit the degraded resource list being sent to mtce to 5 + for r in obj.degrade_list[0:1:5]: + resources += r + ',' + resources = resources[:-1] + state = "assert" + else: + state = "clear" + + # Message throttling .... + + # Avoid sending the same last state message for up to ONE_EVERY count. + # Just reduce load on mtcAgent + if obj.last_state == state and obj.msg_throttle < ONE_EVERY: + return 0 + + # if the degrade state has changed then log it and proceed + if obj.last_state != state: + if obj.last_state != "undef": + collectd.info("%s degrade %s %s" % + (PLUGIN, + state, + obj.degrade_list)) + + # Save state for next time + obj.last_state = state + + # Clear the message throttle counter + obj.msg_throttle = 0 + + # Send the degrade state ; assert or clear message to mtcAgent. + # If we get a send failure then log it and set the addr to None + # so it forces us to refresh the controller address on the next + # notification + try: + mtce_socket = socket.socket(obj.protocol, socket.SOCK_DGRAM) + if mtce_socket: + if obj.addr is None: + _get_active_controller_ip() + if obj.addr is None: + return 0 + + # Create the Maintenance message. 
+ message = "{\"service\":\"collectd_notifier\"," + message += "\"hostname\":\"" + nObject.host + "\"," + message += "\"degrade\":\"" + state + "\"," + message += "\"resource\":\"" + resources + "\"}" + collectd.debug("%s: %s" % (PLUGIN, message)) + + mtce_socket.settimeout(1.0) + mtce_socket.sendto(message, (obj.addr, obj.port)) + mtce_socket.close() + else: + collectd.error("%s %s failed to open socket (%s)" % + (PLUGIN, resource, obj.addr)) + except socket.error as e: + if e.args[0] == socket.EAI_ADDRFAMILY: + # Handle IPV4 to IPV6 switchover: + obj.protocol = socket.AF_INET6 + collectd.info("%s %s ipv6 addressing (%s)" % + (PLUGIN, resource, obj.addr)) + else: + collectd.error("%s %s socket error (%s) ; %s" % + (PLUGIN, resource, obj.addr, str(e))) + # try self correction + obj.addr = None + obj.protocol = socket.AF_INET + + return 0 + +collectd.register_config(config_func) +collectd.register_init(init_func) +collectd.register_notification(notifier_func) diff --git a/monitoring/collectd-extensions/src/ntpq.conf b/monitoring/collectd-extensions/src/ntpq.conf new file mode 100644 index 000000000..f7e3c26ce --- /dev/null +++ b/monitoring/collectd-extensions/src/ntpq.conf @@ -0,0 +1,17 @@ +# +# Interval 60 +# + + + + + Instance "state" + Persist true + PersistOK true + WarningMin 1 + FailureMin 0 +# Hits 2 + Invert false + + + diff --git a/monitoring/collectd-extensions/src/ntpq.py b/monitoring/collectd-extensions/src/ntpq.py new file mode 100755 index 000000000..7a984304e --- /dev/null +++ b/monitoring/collectd-extensions/src/ntpq.py @@ -0,0 +1,195 @@ + +# Copyright (c) 2018 Wind River Systems, Inc. 
PLUGIN = 'NTP query plugin'

PLUGIN_SCRIPT = '/etc/rmonfiles.d/query_ntp_servers.sh'
PLUGIN_RESULT = '/tmp/ntpq_server_info'

# static variables
ALARM_ID__NTPQ = "100.114"


# Plugin state object ; persists over read calls.
class NtpqObject:
    hostname = ''         # this host's name
    base_eid = ''         # alarm entity id prefix: host=<host>.ntp
    severity = 'clear'    # current alarm severity
    suppression = True
    service_affecting = False
    status = 0            # last query script exit status
    last_result = ''
    this_result = ''
    id = ALARM_ID__NTPQ
    name = "NTP"
    alarm_type = fm_constants.FM_ALARM_TYPE_1
    cause = fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN
    repair = "Monitor and if condition persists, "
    repair += "contact next level of support."


obj = NtpqObject()


def is_uuid_like(val):
    """Returns validation of a value as a UUID."""
    try:
        return str(uuid.UUID(val)) == val
    except (TypeError, ValueError, AttributeError):
        return False


# The config function - called once on collectd process startup
def config_func(config):
    """Configure the plugin ; no configurable options yet."""

    collectd.debug('%s config function' % PLUGIN)
    return 0


# The init function - called once on collectd process startup
def init_func():
    """Initialize hostname and alarm entity id (controllers only)."""

    # ntp query is for controllers only
    if tsc.nodetype != 'controller':
        return 0

    # get current hostname
    obj.hostname = os.uname()[1]
    obj.base_eid = 'host=' + obj.hostname + '.ntp'
    # Fix: the original applied '%' to PLUGIN alone (3 specifiers, one
    # argument) and passed the rest as extra collectd.info arguments,
    # raising TypeError at runtime.
    collectd.info("%s on %s with entity id '%s'" %
                  (PLUGIN, obj.hostname, obj.base_eid))
    return 0


# The sample read function - called on every audit interval
def read_func():
    """Run the ntp query script and manage the NTP alarm accordingly."""

    # ntp query is for controllers only
    if tsc.nodetype != 'controller':
        return 0

    result = int(0)
    # Query ntp
    try:
        result = os.system(PLUGIN_SCRIPT)
    except Exception as e:
        # Fix: the original format string had 3 specifiers but only
        # 2 arguments, raising TypeError while reporting the error.
        collectd.error("%s Could not run '%s' (%s)" %
                       (PLUGIN, PLUGIN_SCRIPT, str(e)))
        return 0

    # os.system returns the raw wait status ; the script's exit code is
    # in the high byte. Floor division keeps this correct on python3 too.
    obj.status = int(result) // 0x100

    collectd.info("%s Query Result: %s" % (PLUGIN, obj.status))

    if os.path.exists(PLUGIN_RESULT) is False:
        collectd.error("%s produced no result file '%s'" %
                       (PLUGIN, PLUGIN_RESULT))
        return 0

    # read the query result file.
    # format is in the PLUGIN_SCRIPT file.
    # This code only wants the second line.
    # It contains list of unreachable ntp servers that need alarm management.
    count = 0
    with open(PLUGIN_RESULT, 'r') as infile:
        for line in infile:
            count += 1
            collectd.info("%s Query Result: %s" % (PLUGIN, line))
    if count == 0:
        collectd.error("%s produced empty result file '%s'" %
                       (PLUGIN, PLUGIN_RESULT))
        return 0

    sample = 1

    # Dispatch usage value to collectd
    val = collectd.Values(host=obj.hostname)
    val.plugin = 'ntpq'
    val.plugin_instance = 'some.ntp.server.ip'
    val.type = 'absolute'
    val.type_instance = 'state'
    val.dispatch(values=[sample])

    severity = 'clear'
    obj.severity = 'clear'

    # if there is no severity change then consider exiting
    if obj.severity == severity:

        # unless the current severity is 'minor'
        if severity == 'minor':
            # TODO: check to see if the failing IP address is changed
            collectd.info("%s NEED TO CHECK IP ADDRESSES" % (PLUGIN))
        else:
            return 0

    # if current severity is clear but previous severity is not then
    # prepare to clear the alarms
    if severity == 'clear':
        _alarm_state = fm_constants.FM_ALARM_STATE_CLEAR

        # TODO: loop over all raised alarms and clear them
        collectd.info("%s NEED CLEAR ALL ALARMS" % (PLUGIN))
        if api.clear_fault(obj.id, obj.base_eid) is False:
            collectd.error("%s %s:%s clear_fault failed" %
                           (PLUGIN, obj.id, obj.base_eid))
        return 0

    elif severity == 'major':
        reason = "NTP configuration does not contain any valid "
        reason += "or reachable NTP servers."
        eid = obj.base_eid
        fm_severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
    else:
        # TODO: There can be up to 3 inaccessible servers
        ip = 'some.server.ip.addr'
        reason = "NTP address "
        reason += ip
        reason += " is not a valid or a reachable NTP server."
        eid = obj.base_eid + '=' + ip
        fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR

    fault = fm_api.Fault(
        alarm_id=obj.id,
        alarm_state=fm_constants.FM_ALARM_STATE_SET,
        entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
        entity_instance_id=eid,
        severity=fm_severity,
        reason_text=reason,
        alarm_type=obj.alarm_type,
        probable_cause=obj.cause,
        proposed_repair_action=obj.repair,
        service_affecting=obj.service_affecting,
        suppression=obj.suppression)

    alarm_uuid = api.set_fault(fault)
    if is_uuid_like(alarm_uuid) is False:
        collectd.error("%s %s:%s set_fault failed:%s" %
                       (PLUGIN, obj.id, eid, alarm_uuid))
        return 0

    # TODO: clear the object alarm state

    return 0
Windriver +Author-email: info@windriver.com +License: windriver +Description: Titanium Cloud influxdb extensions. +Platform: UNKNOWN diff --git a/monitoring/influxdb-extensions/centos/build_srpm.data b/monitoring/influxdb-extensions/centos/build_srpm.data new file mode 100644 index 000000000..2be8701c6 --- /dev/null +++ b/monitoring/influxdb-extensions/centos/build_srpm.data @@ -0,0 +1,7 @@ +SRC_DIR="$PKG_BASE" + +COPY_LIST="$PKG_BASE/src/LICENSE \ + $PKG_BASE/src/influxdb.conf.pmon \ + $PKG_BASE/src/influxdb.service" + +TIS_PATCH_VER=1 diff --git a/monitoring/influxdb-extensions/centos/influxdb-extensions.spec b/monitoring/influxdb-extensions/centos/influxdb-extensions.spec new file mode 100644 index 000000000..d58ad5b6f --- /dev/null +++ b/monitoring/influxdb-extensions/centos/influxdb-extensions.spec @@ -0,0 +1,46 @@ +Summary: Titanium Server influxdb Extensions Package +Name: influxdb-extensions +Version: 1.0 +Release: 0%{?_tis_dist}.%{tis_patch_ver} +License: windriver +Group: base +Packager: Wind River +URL: unknown + +# create the files tarball +Source0: %{name}-%{version}.tar.gz + +Source1: influxdb.service +Source2: influxdb.conf.pmon + +Requires: systemd +Requires: influxdb +Requires: /bin/systemctl + +%description +Titanium Cloud influxdb extensions + +%define debug_package %{nil} +%define local_unit_dir %{_sysconfdir}/systemd/system + +%prep +%setup + +%build + +%install +install -m 755 -d %{buildroot}%{_sysconfdir} +install -m 755 -d %{buildroot}%{_sysconfdir}/influxdb +install -m 755 -d %{buildroot}%{local_unit_dir} + +install -m 644 %{SOURCE1} %{buildroot}%{local_unit_dir} +install -m 600 %{SOURCE2} %{buildroot}%{_sysconfdir}/influxdb + + +%clean +rm -rf $RPM_BUILD_ROOT + +%files +%defattr(-,root,root,-) +%config(noreplace) %{local_unit_dir}/influxdb.service +%{_sysconfdir}/influxdb/* diff --git a/monitoring/influxdb-extensions/src/LICENSE b/monitoring/influxdb-extensions/src/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ 
b/monitoring/influxdb-extensions/src/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/monitoring/influxdb-extensions/src/influxdb.conf b/monitoring/influxdb-extensions/src/influxdb.conf new file mode 100644 index 000000000..b0a5f000f --- /dev/null +++ b/monitoring/influxdb-extensions/src/influxdb.conf @@ -0,0 +1,322 @@ +### Welcome to the InfluxDB configuration file. + +# Once every 24 hours InfluxDB will report anonymous data to m.influxdb.com +# The data includes raft id (random 8 bytes), os, arch, version, and metadata. +# We don't track ip addresses of servers reporting. This is only used +# to track the number of instances running and the versions, which +# is very helpful for us. +# Change this option to true to disable reporting. 
+reporting-disabled = false + +### +### Enterprise registration control +### + +[registration] +# enabled = true +# url = "https://enterprise.influxdata.com" # The Enterprise server URL +# token = "" # Registration token for Enterprise server + +### +### [meta] +### +### Controls the parameters for the Raft consensus group that stores metadata +### about the InfluxDB cluster. +### + +[meta] + dir = "/var/lib/influxdb/meta" + hostname = "localhost" + bind-address = ":8088" + retention-autocreate = true + election-timeout = "1s" + heartbeat-timeout = "1s" + leader-lease-timeout = "500ms" + commit-timeout = "50ms" + cluster-tracing = false + + # If enabled, when a Raft cluster loses a peer due to a `DROP SERVER` command, + # the leader will automatically ask a non-raft peer node to promote to a raft + # peer. This only happens if there is a non-raft peer node available to promote. + # This setting only affects the local node, so to ensure if operates correctly, be sure to set + # it in the config of every node. + raft-promotion-enabled = true + +### +### [data] +### +### Controls where the actual shard data for InfluxDB lives and how it is +### flushed from the WAL. "dir" may need to be changed to a suitable place +### for your system, but the WAL settings are an advanced configuration. The +### defaults should work for most systems. +### + +[data] + dir = "/var/lib/influxdb/data" + + # Controls the engine type for new shards. Options are b1, bz1, or tsm1. + # b1 is the 0.9.2 storage engine, bz1 is the 0.9.3 and 0.9.4 engine. + # tsm1 is the 0.9.5 engine and is currenly EXPERIMENTAL. Until 0.9.5 is + # actually released data written into a tsm1 engine may be need to be wiped + # between upgrades. + # engine ="bz1" + + # The following WAL settings are for the b1 storage engine used in 0.9.2. They won't + # apply to any new shards created after upgrading to a version > 0.9.3. + max-wal-size = 104857600 # Maximum size the WAL can reach before a flush. Defaults to 100MB. 
+ wal-flush-interval = "10m" # Maximum time data can sit in WAL before a flush. + wal-partition-flush-delay = "2s" # The delay time between each WAL partition being flushed. + + # These are the WAL settings for the storage engine >= 0.9.3 + wal-dir = "/var/lib/influxdb/wal" + wal-enable-logging = true + + # When a series in the WAL in-memory cache reaches this size in bytes it is marked as ready to + # flush to the index + # wal-ready-series-size = 25600 + + # Flush and compact a partition once this ratio of series are over the ready size + # wal-compaction-threshold = 0.6 + + # Force a flush and compaction if any series in a partition gets above this size in bytes + # wal-max-series-size = 2097152 + + # Force a flush of all series and full compaction if there have been no writes in this + # amount of time. This is useful for ensuring that shards that are cold for writes don't + # keep a bunch of data cached in memory and in the WAL. + # wal-flush-cold-interval = "10m" + + # Force a partition to flush its largest series if it reaches this approximate size in + # bytes. Remember there are 5 partitions so you'll need at least 5x this amount of memory. + # The more memory you have, the bigger this can be. + # wal-partition-size-threshold = 20971520 + + # Whether queries should be logged before execution. Very useful for troubleshooting, but will + # log any sensitive data contained within a query. + # query-log-enabled = true + +### +### [hinted-handoff] +### +### Controls the hinted handoff feature, which allows nodes to temporarily +### store queued data when one node of a cluster is down for a short period +### of time. +### + +[hinted-handoff] + enabled = true + dir = "/var/lib/influxdb/hh" + max-size = 1073741824 + max-age = "168h" + retry-rate-limit = 0 + + # Hinted handoff will start retrying writes to down nodes at a rate of once per second. + # If any error occurs, it will backoff in an exponential manner, until the interval + # reaches retry-max-interval. 
Once writes to all nodes are successfully completed the + # interval will reset to retry-interval. + retry-interval = "1s" + retry-max-interval = "1m" + + # Interval between running checks for data that should be purged. Data is purged from + # hinted-handoff queues for two reasons. 1) The data is older than the max age, or + # 2) the target node has been dropped from the cluster. Data is never dropped until + # it has reached max-age however, for a dropped node or not. + purge-interval = "1h" + +### +### [cluster] +### +### Controls non-Raft cluster behavior, which generally includes how data is +### shared across shards. +### + +[cluster] + shard-writer-timeout = "10s" # The time within which a shard must respond to write. + write-timeout = "5s" # The time within which a write operation must complete on the cluster. + +### +### [retention] +### +### Controls the enforcement of retention policies for evicting old data. +### + +[retention] + enabled = true + check-interval = "30m" + +### +### [shard-precreation] +### +### Controls the precreation of shards, so they are created before data arrives. +### Only shards that will exist in the future, at time of creation, are precreated. + +[shard-precreation] + enabled = true + check-interval = "10m" + advance-period = "30m" + +### +### Controls the system self-monitoring, statistics and diagnostics. +### +### The internal database for monitoring data is created automatically if +### if it does not already exist. The target retention within this database +### is called 'monitor' and is also created with a retention period of 7 days +### and a replication factor of 1, if it does not exist. In all cases the +### this retention policy is configured as the default for the database. + +[monitor] + store-enabled = true # Whether to record statistics internally. 
+ store-database = "_internal" # The destination database for recorded statistics + store-interval = "10s" # The interval at which to record statistics + +### +### [admin] +### +### Controls the availability of the built-in, web-based admin interface. If HTTPS is +### enabled for the admin interface, HTTPS must also be enabled on the [http] service. +### + +[admin] + enabled = true + bind-address = ":8083" + https-enabled = false + https-certificate = "/etc/ssl/influxdb.pem" + +### +### [http] +### +### Controls how the HTTP endpoints are configured. These are the primary +### mechanism for getting data into and out of InfluxDB. +### + +[http] + enabled = true + bind-address = ":8086" + auth-enabled = false + log-enabled = true + write-tracing = false + pprof-enabled = false + https-enabled = false + https-certificate = "/etc/ssl/influxdb.pem" + +### +### [[graphite]] +### +### Controls one or many listeners for Graphite data. +### + +[[graphite]] + enabled = false + # database = "graphite" + # bind-address = ":2003" + # protocol = "tcp" + # consistency-level = "one" + # name-separator = "." + + # These next lines control how batching works. You should have this enabled + # otherwise you could get dropped metrics or poor performance. Batching + # will buffer points in memory if you have many coming in. + + # batch-size = 1000 # will flush if this many points get buffered + # batch-pending = 5 # number of batches that may be pending in memory + # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit + # udp-read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. + + ## "name-schema" configures tag names for parsing the metric name from graphite protocol; + ## separated by `name-separator`. + ## The "measurement" tag is special and the corresponding field will become + ## the name of the metric. + ## e.g. 
"type.host.measurement.device" will parse "server.localhost.cpu.cpu0" as + ## { + ## measurement: "cpu", + ## tags: { + ## "type": "server", + ## "host": "localhost, + ## "device": "cpu0" + ## } + ## } + # name-schema = "type.host.measurement.device" + + ## If set to true, when the input metric name has more fields than `name-schema` specified, + ## the extra fields will be ignored. + ## Otherwise an error will be logged and the metric rejected. + # ignore-unnamed = true + +### +### [collectd] +### +### Controls the listener for collectd data. +### + +[collectd] + enabled = true + bind-address = "127.0.0.1:25826" + database = "collectd" + typesdb = "/usr/share/collectd/types.db" + + # These next lines control how batching works. You should have this enabled + # otherwise you could get dropped metrics or poor performance. Batching + # will buffer points in memory if you have many coming in. + + # batch-size = 1000 # will flush if this many points get buffered + # batch-pending = 5 # number of batches that may be pending in memory + # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit + # read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. + +### +### [opentsdb] +### +### Controls the listener for OpenTSDB data. +### + +[opentsdb] + enabled = false + # bind-address = ":4242" + # database = "opentsdb" + # retention-policy = "" + # consistency-level = "one" + # tls-enabled = false + # certificate= "" + + # These next lines control how batching works. You should have this enabled + # otherwise you could get dropped metrics or poor performance. Only points + # metrics received over the telnet protocol undergo batching. 
+ + # batch-size = 1000 # will flush if this many points get buffered + # batch-pending = 5 # number of batches that may be pending in memory + # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit + +### +### [[udp]] +### +### Controls the listeners for InfluxDB line protocol data via UDP. +### + +[[udp]] + enabled = false + # bind-address = "" + # database = "udp" + # retention-policy = "" + + # These next lines control how batching works. You should have this enabled + # otherwise you could get dropped metrics or poor performance. Batching + # will buffer points in memory if you have many coming in. + + # batch-size = 1000 # will flush if this many points get buffered + # batch-pending = 5 # number of batches that may be pending in memory + # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit + # read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. + +### +### [continuous_queries] +### +### Controls how continuous queries are run within InfluxDB. +### + +[continuous_queries] + log-enabled = true + enabled = true + recompute-previous-n = 2 + recompute-no-older-than = "10m" + compute-runs-per-interval = 10 + compute-no-more-than = "2m" diff --git a/monitoring/influxdb-extensions/src/influxdb.conf.pmon b/monitoring/influxdb-extensions/src/influxdb.conf.pmon new file mode 100644 index 000000000..0556f2ac6 --- /dev/null +++ b/monitoring/influxdb-extensions/src/influxdb.conf.pmon @@ -0,0 +1,17 @@ +[process] +process = influxdb +service = influxdb +style = lsb +pidfile = /var/run/influxdb/influxdb.pid +severity = major ; minor, major, critical +restarts = 3 ; restart retries before error assertion +interval = 5 ; number of seconds to wait between restarts +debounce = 10 ; number of seconds that a process needs to remain + ; running before degrade is removed and retry count + ; is cleared. 
+startuptime = 3 ; Seconds to wait after process start before starting the debounce monitor +mode = passive ; Monitoring mode: passive (default) or active + ; passive: process death monitoring (default: always) + ; active : heartbeat monitoring, i.e. request / response messaging + ; ignore : do not monitor or stop monitoring +quorum = 0 ; process is in the host watchdog quorum diff --git a/monitoring/influxdb-extensions/src/influxdb.logrotate b/monitoring/influxdb-extensions/src/influxdb.logrotate new file mode 100644 index 000000000..b2bef9a22 --- /dev/null +++ b/monitoring/influxdb-extensions/src/influxdb.logrotate @@ -0,0 +1,16 @@ +#daily +nodateext + +/var/log/influxdb/influxdb.log +{ + size 20M + start 1 + missingok + rotate 20 + compress + sharedscripts + postrotate + systemctl reload syslog-ng > /dev/null 2>&1 || true + endscript +} + diff --git a/monitoring/influxdb-extensions/src/influxdb.service b/monitoring/influxdb-extensions/src/influxdb.service new file mode 100644 index 000000000..6e61df465 --- /dev/null +++ b/monitoring/influxdb-extensions/src/influxdb.service @@ -0,0 +1,20 @@ +[Unit] +Description=InfluxDB open-source, distributed, time series database +Documentation=https://influxdb.com/docs/ +After=local-fs.target network.target +Before=collectd.service + +[Service] +User=influxdb +Group=influxdb +LimitNOFILE=65536 +Environment='STDOUT=/dev/null' +Environment='STDERR=/var/log/influxdb/influxd.log' +EnvironmentFile=-/etc/default/influxdb +ExecStart=/bin/sh -c "/usr/bin/influxd -config /etc/influxdb/influxdb.conf -pidfile /var/run/influxdb/influxdb.pid ${INFLUXD_OPTS} >> ${STDOUT} 2>> ${STDERR}" +ExecStopPost=/bin/bash -c 'rm /var/run/influxdb/influxdb.pid' +KillMode=control-group + +[Install] +WantedBy=multi-user.target +Alias=influxd.service