Collectd+InfluxDB - RMON Replacement (ALL METRICS) P1

This is the primary update that introduces collectd monitoring and
sample storage into the InfluxDB database.
Two new packages are introduced by this update:
 - collectd-extensions package which includes
   - newly developed collectd platform memory, cpu and filesystem
     plugins
     - note that the example, ntpq and interface plugins are not
       complete and are not enabled by this update.
   - pmond process monitoring / recovery support for collectd
   - updated service file for pidfile management ; needed by pmond
 - influxdb-extensions package which includes
   - pmond process monitoring / recovery support for influxdb
   - updated service file for pidfile management ; needed by pmond
   - log rotate support for influxdb

Change-Id: I06511fecb781781ed5491c926ad4b1273a1bc23b
Signed-off-by: Jack Ding <jack.ding@windriver.com>
Eric MacDonald 2018-05-14 16:12:16 -04:00 committed by Jack Ding
parent 7e0cf4b205
commit 892489acd7
29 changed files with 3542 additions and 0 deletions

View File

@@ -101,4 +101,6 @@ extended/memcached
devtools/puppet-modules/openstack/puppet-memcached-3.0.2
devtools/puppet-modules/openstack/puppet-horizon-9.5.0
devtools/puppet-modules/openstack/puppet-swift-11.3.0
monitoring/collectd-extensions
monitoring/influxdb-extensions
kubernetes/kubernetes

View File

@@ -0,0 +1,10 @@
Metadata-Version: 1.1
Name: collectd-extensions
Version: 1.0
Summary: collectd-extensions
Home-page:
Author: Windriver
Author-email: info@windriver.com
License: windriver
Description: Titanium Cloud collectd extensions
Platform: UNKNOWN

View File

@@ -0,0 +1,19 @@
SRC_DIR="$PKG_BASE"
COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/collectd.conf.pmon \
$PKG_BASE/src/collectd.service \
$PKG_BASE/src/fm_notifier.py \
$PKG_BASE/src/mtce_notifier.py \
$PKG_BASE/src/python_plugins.conf \
$PKG_BASE/src/cpu.py \
$PKG_BASE/src/cpu.conf \
$PKG_BASE/src/memory.py \
$PKG_BASE/src/memory.conf \
$PKG_BASE/src/df.conf \
$PKG_BASE/src/ntpq.py \
$PKG_BASE/src/ntpq.conf \
$PKG_BASE/src/example.py \
$PKG_BASE/src/example.conf"
TIS_PATCH_VER=1

View File

@@ -0,0 +1,90 @@
Summary: Titanium Server collectd Package
Name: collectd-extensions
Version: 1.0
Release: 0%{?_tis_dist}.%{tis_patch_ver}
License: windriver
Group: base
Packager: Wind River <info@windriver.com>
URL: unknown
# create the files tarball
Source0: %{name}-%{version}.tar.gz
Source1: collectd.service
Source2: collectd.conf.pmon
# collectd python plugin files - notifiers
Source3: fm_notifier.py
Source4: mtce_notifier.py
# collectd python plugin files - resource plugins
Source11: cpu.py
Source12: memory.py
Source14: example.py
Source15: ntpq.py
# collectd plugin conf files into /etc/collectd.d
Source100: python_plugins.conf
Source101: cpu.conf
Source102: memory.conf
Source103: df.conf
Source104: example.conf
Source105: ntpq.conf
BuildRequires: systemd-devel
Requires: systemd
Requires: collectd
Requires: /bin/systemctl
%description
Titanium Cloud collectd extensions
%define debug_package %{nil}
%define local_unit_dir %{_sysconfdir}/systemd/system
%define local_plugin_dir %{_sysconfdir}/collectd.d
%define local_python_extensions_dir /opt/collectd/extensions/python
%define local_config_extensions_dir /opt/collectd/extensions/config
%prep
%setup
%build
%install
install -m 755 -d %{buildroot}%{_sysconfdir}
install -m 755 -d %{buildroot}%{local_unit_dir}
install -m 755 -d %{buildroot}%{local_plugin_dir}
install -m 755 -d %{buildroot}%{local_config_extensions_dir}
install -m 755 -d %{buildroot}%{local_python_extensions_dir}
# support files ; service and pmon conf
install -m 644 %{SOURCE1} %{buildroot}%{local_unit_dir}
install -m 600 %{SOURCE2} %{buildroot}%{local_config_extensions_dir}
# collectd python plugin files - notifiers
install -m 700 %{SOURCE3} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE4} %{buildroot}%{local_python_extensions_dir}
# collectd python plugin files - resource plugins
install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir}
# collectd plugin conf files into /etc/collectd.d
install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE101} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir}
%clean
rm -rf $RPM_BUILD_ROOT
%files
%defattr(-,root,root,-)
%config(noreplace) %{local_unit_dir}/collectd.service
%{local_plugin_dir}/*
%{local_config_extensions_dir}/*
%{local_python_extensions_dir}/*

View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -0,0 +1,18 @@
[process]
process = collectd
service = collectd
style = lsb
pidfile = /var/run/collectd.pid
severity = major ; minor, major, critical
restarts = 3 ; restart retries before error assertion
interval = 5 ; number of seconds to wait between restarts
debounce = 10 ; number of seconds that a process needs to remain
; running before degrade is removed and retry count
; is cleared.
startuptime = 3 ; Seconds to wait after process start before starting the debounce monitor
mode = passive ; Monitoring mode: passive (default) or active
; passive: process death monitoring (default: always)
; active : heartbeat monitoring, i.e. request / response messaging
; ignore : do not monitor or stop monitoring
quorum = 0 ; process is in the host watchdog quorum

View File

@@ -0,0 +1,14 @@
[Unit]
Description=Collectd statistics daemon and extension services
Documentation=man:collectd(1) man:collectd.conf(5)
After=local-fs.target network-online.target
Requires=local-fs.target network-online.target
[Service]
Type=notify
ExecStart=/usr/sbin/collectd
ExecStartPost=/bin/bash -c 'echo $MAINPID > /var/run/collectd.pid'
ExecStopPost=/bin/rm -f /var/run/collectd.pid
[Install]
WantedBy=multi-user.target
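
The ExecStartPost/ExecStopPost lines above maintain /var/run/collectd.pid
so that pmond can supervise the daemon. A minimal sketch of the kind of
liveness check this pidfile enables (illustrative only ; pmond's actual
implementation is not part of this commit):

import os

PIDFILE = '/var/run/collectd.pid'

def process_is_running(pidfile=PIDFILE):
    """Return True if the pid recorded in pidfile is a live process."""
    try:
        with open(pidfile) as f:
            pid = int(f.read().strip())
        # signal 0 probes for existence without affecting the process
        os.kill(pid, 0)
        return True
    except (IOError, ValueError, OSError):
        return False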

View File

@@ -0,0 +1,22 @@
# For stock plugin only
# Uncomment to compare stock to TiS plugin readings
# ---------------------
# <Plugin cpu>
# ReportByCpu false
# ReportByState false
# ValuesPercentage true
# </Plugin>
<Plugin "threshold">
<Plugin "cpu">
<Type "percent">
Instance "used"
Persist true
PersistOK true
WarningMax 90.00
FailureMax 95.00
Hits 2
Invert false
</Type>
</Plugin>
</Plugin>
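
When a threshold like the one above is crossed for Hits consecutive
samples, collectd dispatches a notification: severity FAILURE on
FailureMax, WARNING on WarningMax, and OKAY on return to range (because
PersistOK is set). A minimal sketch of a Python handler receiving it,
using the same severity values as the notifiers in this commit:

import collectd

# collectd severity values as seen by Python notification handlers
NOTIF_FAILURE = 1
NOTIF_WARNING = 2
NOTIF_OKAY = 4

def notifier_func(n):
    # e.g. n.plugin='cpu', n.type='percent', n.type_instance='used'
    collectd.info('%s_%s severity:%d ; %s' %
                  (n.plugin, n.plugin_instance, n.severity, n.message))

collectd.register_notification(notifier_func)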

View File

@@ -0,0 +1,253 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
############################################################################
#
# This file is the collectd 'Platform CPU Usage' Monitor.
#
# The Platform CPU Usage is calculated as an averaged percentage of
# platform core usage since the previous sample.
#
# Init Function:
# - if 'compute_reserved.conf' exists then query/store PLATFORM_CPU_LIST
#
############################################################################
import os
import time
import collectd
debug = False
PASS = 0
FAIL = 1
PATH = '/proc/cpuinfo'
COMPUTE_RESERVED_CONF = '/etc/nova/compute_reserved.conf'
PLUGIN = 'platform cpu usage plugin'
# CPU Control class
class CPU:
hostname = "" # hostname for sample notification message
usage = float(0.0) # float value of cpu usage
processors = int(0) # number of processors for all cpus case
cpu_list = [] # list of CPUs to calculate combined usage for
cpu_time = [] # schedstat time for each CPU
cpu_time_last = [] # last schedstat time for each CPU
time_last = float(0.0) # float of the time the last sample was taken
def log_error(self, err_str):
""" Print an error log with plugin name prefixing the log """
collectd.error("%s %s" % (PLUGIN, err_str))
# Instantiate the class
c = CPU()
# The collectd configuration interface
# collectd needs this defined ; but not used/needed.
def config_func(config):
collectd.info('%s config function' % PLUGIN)
# Get the platform cpu list and number of cpus reported by /proc/cpuinfo
def init_func():
# get current hostname
c.hostname = os.uname()[1]
collectd.info('%s init function for %s' % (PLUGIN, c.hostname))
raw_list = ""
if os.path.exists(COMPUTE_RESERVED_CONF):
with open(COMPUTE_RESERVED_CONF, 'r') as infile:
for line in infile:
if 'PLATFORM_CPU_LIST' in line:
val = line.split("=")
raw_list = val[1].strip('\n')[1:-1].strip('"')
break
if raw_list:
# Convert the cpu list fetched from the compute
# reserved file into an integer list.
# Handle mix of number list #,# and number range #-#
split_list = raw_list.split(',')
if debug:
collectd.info('%s split list: %s' % (PLUGIN, split_list))
for cpu in split_list:
if cpu.find('-') == -1:
# add individual cpu # with assumed ',' delimiter
c.cpu_list.append(int(cpu))
else:
# add all in range #-#
cpu_range = cpu.split('-')
if len(cpu_range) == 2:
first = int(cpu_range[0])
last = int(cpu_range[1]) + 1
# add each
for i in list(range(first, last)):
c.cpu_list.append(i)
# with the full CPU list in hand we can now just read their samples
if debug:
collectd.info('%s full cpu list: %s' %
(PLUGIN, c.cpu_list))
try:
f = open('/proc/cpuinfo')
except EnvironmentError as e:
c.log_error('file open failed ; ' + str(e))
else:
if len(c.cpu_list) == 0:
_want_all_cpus = True
else:
_want_all_cpus = False
c.processors = 0
for line in f:
name_value = [s.strip() for s in line.split(':', 1)]
if len(name_value) != 2:
continue
name, value = name_value
if 'rocessor' in name:
if _want_all_cpus is True:
c.cpu_list.append(int(c.processors))
c.processors += 1
collectd.info('%s has found %d cpus total' %
(PLUGIN, c.processors))
collectd.info('%s monitoring %d cpus %s' %
(PLUGIN, len(c.cpu_list), c.cpu_list))
f.close()
# Calculate the CPU usage sample
def read_func():
try:
f = open('/proc/schedstat')
except EnvironmentError as e:
c.log_error('file open failed ; ' + str(e))
return FAIL
else:
# schedstat time for each CPU
c.cpu_time = []
# Loop over each line ...
# get the output version ; only 15 is supported
# get the cpu time from each line starting with 'cpux ....'
for line in f:
# break each line into name/value pairs
line_split = [s.strip() for s in line.split(' ', 1)]
name, value = line_split
# get the output version.
if 'ersion' in name:
try:
c.version = int(value)
except ValueError as e:
c.log_error('got invalid schedstat version ; ' + str(e))
# TODO: Consider exiting here and raising alarm.
# Calling this type of exit will stop the plugin.
# sys._exit()
return FAIL
# only version 15 is supported
if c.version == 15:
if 'cpu' in name:
# get the cpu number for each line
if int(name.replace('cpu', '')) in c.cpu_list:
_in_list = True
else:
_in_list = False
# get cpu time for each cpu that is valid
if len(c.cpu_list) == 0 or _in_list is True:
_schedstat = value
value_split = value.split(' ')
c.cpu_time.append(float(value_split[6]))
if debug:
collectd.info('%s %s schedstat is %s [%s]' %
(PLUGIN, name, value_split[6],
_schedstat))
else:
collectd.error('%s unsupported schedstat version [%d]' %
(PLUGIN, c.version))
return FAIL
f.close()
# Now that we have the cpu time recorded for each cpu
_time_delta = float(0)
_cpu_count = int(0)
if len(c.cpu_time_last) == 0:
c.time_last = time.time()
if c.cpu_list:
# This is a compute node.
# Do not include vswitch or pinned cpus in calculation.
for cpu in c.cpu_list:
c.cpu_time_last.append(float(c.cpu_time[_cpu_count]))
_cpu_count += 1
if debug:
collectd.info('%s cpu time ; first pass ; %s' %
(PLUGIN, c.cpu_time))
return PASS
else:
_time_this = time.time()
_time_delta = _time_this - c.time_last
c.total_avg_cpu = 0
cpu_occupancy = []
if debug:
collectd.info('%s cpu time ; this pass ; %s -> %s' %
(PLUGIN, c.cpu_time_last, c.cpu_time))
if c.cpu_list:
# This is a compute node.
# Do not include vswitch or pinned cpus in calculation.
for cpu in c.cpu_list:
if cpu >= c.processors:
c.log_error(' got out of range cpu number')
else:
_delta = (c.cpu_time[_cpu_count] - c.cpu_time_last[_cpu_count])
_delta = _delta / 1000000 / _time_delta
cpu_occupancy.append(float((100*(_delta))/1000))
c.total_avg_cpu += cpu_occupancy[_cpu_count]
if debug:
collectd.info('%s cpu %d - count:%d [%s]' %
(PLUGIN, cpu, _cpu_count, cpu_occupancy))
_cpu_count += 1
else:
collectd.info('%s no cpus to monitor' % PLUGIN)
return 0
c.usage = c.total_avg_cpu / _cpu_count
if debug:
collectd.info('%s reports %.2f %% usage (averaged)' %
(PLUGIN, c.usage))
# Prepare for next audit ; move now to last
# c.cpu_time_last = []
c.cpu_time_last = c.cpu_time
c.time_last = _time_this
# Dispatch usage value to collectd
val = collectd.Values(host=c.hostname)
val.plugin = 'cpu'
val.type = 'percent'
val.type_instance = 'used'
val.dispatch(values=[c.usage])
return 0
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func)
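
For reference, the PLATFORM_CPU_LIST handling in init_func above expands
a mix of single cpu numbers and '#-#' ranges. The same logic as a
self-contained sketch (the helper name is illustrative):

def expand_cpu_list(raw_list):
    """Expand a string like '0-3,8,10-11' into [0, 1, 2, 3, 8, 10, 11]."""
    cpus = []
    for item in raw_list.split(','):
        if '-' in item:
            # add all cpus in the inclusive range #-#
            first, last = item.split('-')
            cpus.extend(range(int(first), int(last) + 1))
        else:
            # add the individual cpu number
            cpus.append(int(item))
    return cpus

# expand_cpu_list('0-3,8') -> [0, 1, 2, 3, 8]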

View File

@@ -0,0 +1,38 @@
<Plugin df>
ValuesPercentage true
IgnoreSelected false
ReportByDevice false
ReportInodes false
ValuesAbsolute false
MountPoint "/"
MountPoint "/tmp"
MountPoint "/dev"
MountPoint "/dev/shm"
MountPoint "/var/run"
MountPoint "/var/log"
MountPoint "/var/lock"
MountPoint "/boot"
MountPoint "/scratch"
MountPoint "/opt/cgcs"
MountPoint "/opt/platform"
MountPoint "/opt/extension"
MountPoint "/etc/nova/instances"
MountPoint "/var/lib/rabbitmq"
MountPoint "/var/lib/postgresql"
MountPoint "/var/lib/ceph/mon"
MountPoint "/opt/backups"
</Plugin>
<Plugin "threshold">
<Plugin "df">
<Type "percent_bytes">
Instance "used"
WarningMax 80.00
FailureMax 90.00
Persist true
PersistOK true
Hits 2
Invert false
</Type>
</Plugin>
</Plugin>

View File

@@ -0,0 +1,13 @@
<Plugin "threshold">
<Plugin "example">
<Type "percent">
Instance "used"
Persist true
PersistOK true
WarningMax 51.00
FailureMax 75.00
Hits 1
Invert false
</Type>
</Plugin>
</Plugin>

View File

@@ -0,0 +1,75 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import os
import random
import collectd
PLUGIN = 'random number plugin'
# static variables
# define a class here that will persist over read calls
class ExampleObject:
hostname = ""
plugin_data = ['1', '100']
obj = ExampleObject()
# The config function - called once on collectd process startup
def config_func(config):
"""
Configure the plugin
"""
for node in config.children:
key = node.key.lower()
val = node.values[0]
if key == 'data':
obj.plugin_data = str(val).split(' ')
collectd.info("%s configured data '%d:%d'" %
(PLUGIN,
int(obj.plugin_data[0]),
int(obj.plugin_data[1])))
return 0
collectd.info('%s config function' % PLUGIN)
return 0
# The init function - called once on collectd process startup
def init_func():
# get current hostname
obj.hostname = os.uname()[1]
return 0
# The sample read function - called on every audit interval
def read_func():
# do the work to create the sample
low = int(obj.plugin_data[0])
high = int(obj.plugin_data[1])
sample = random.randint(low, high)
# Dispatch usage value to collectd
val = collectd.Values(host=obj.hostname)
val.plugin = 'example'
val.type = 'percent'
val.type_instance = 'used'
val.dispatch(values=[sample])
return 0
# register the config, init and read functions
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func)

File diff suppressed because it is too large

View File

@@ -0,0 +1,13 @@
<Plugin "threshold">
<Plugin "interface">
<Type "absolute">
Instance "state"
Persist true
PersistOK true
WarningMin 50
FailureMin 0
# Hits 2
Invert false
</Type>
</Plugin>
</Plugin>

View File

@@ -0,0 +1,129 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
############################################################################
#
# This is the Host Interface Monitor plugin for Collectd.
#
# Only mgmnt, infra and oam interfaces are supported, with the following
# mapping specified in /etc/platform/platform.conf
#
# mgmnt - management_interface     | all hosts  | mandatory
# infra - infrastructure_interface | any host   | optional
# oam   - oam_interface            | controller | mandatory
#
# This plugin reports link state in the following way.
#
# The plugin init function learns interface names from platform.conf
#
#
############################################################################
import os
import random
import collectd
import tsconfig.tsconfig as tsc
PLUGIN = 'interface plugin'
# static variables
PLATFORM_CONF_MGMNT_LABEL = "management_interface="
PLATFORM_CONF_INFRA_LABEL = "infrastructure_interface="
PLATFORM_CONF_OAM_LABEL = "oam_interface="
NETWORK_MGMNT = 'mgmnt'
NETWORK_INFRA = 'infra'
NETWORK_OAM = 'oam'
class iface:
def __init__(self, n, m, s):
self.master = {'network': n, 'name': m, 'state': 'down', 'slaves': s}
self.slave1 = {}
self.slave2 = {}
self.state = int(100)
class object:
hostname = ''
def __init__(self):
self.NETWORKS = {}
self.NETWORKS[NETWORK_MGMNT] = None
self.NETWORKS[NETWORK_INFRA] = None
self.NETWORKS[NETWORK_OAM] = None
obj = object()
# The config function - called once on collectd process startup
def config_func(config):
"""
Configure the plugin
"""
collectd.debug('%s config function' % PLUGIN)
return 0
# The init function - called once on collectd process startup
def init_func():
# get current hostname
obj.hostname = os.uname()[1]
# get the master interface names from /etc/platform/platform.conf
with open(tsc.PLATFORM_CONF_FILE, 'r') as infile:
for line in infile:
# Management Interface
if PLATFORM_CONF_MGMNT_LABEL in line:
name = line.split('=')[1].replace('\n', '')
obj.NETWORKS[NETWORK_MGMNT] = iface(NETWORK_MGMNT, name, 0)
collectd.info("%s monitoring mgmnt interface : %s" %
(PLUGIN,
obj.NETWORKS[NETWORK_MGMNT].master['name']))
# Infrastructure Interface
elif PLATFORM_CONF_INFRA_LABEL in line:
name = line.split('=')[1].replace('\n', '')
obj.NETWORKS[NETWORK_INFRA] = iface(NETWORK_INFRA, name, 0)
collectd.info("%s monitoring infra interface : %s" %
(PLUGIN,
obj.NETWORKS[NETWORK_INFRA].master['name']))
# OAM Interface
elif PLATFORM_CONF_OAM_LABEL in line:
name = line.split('=')[1].replace('\n', '')
obj.NETWORKS[NETWORK_OAM] = iface(NETWORK_OAM, name, 0)
collectd.info("%s monitoring oam interface: %s" %
(PLUGIN,
obj.NETWORKS[NETWORK_OAM].master['name']))
return 0
# The sample read function - called on every audit interval
def read_func():
if obj.NETWORKS[NETWORK_MGMNT].state == 0:
obj.NETWORKS[NETWORK_MGMNT].state = 100
else:
obj.NETWORKS[NETWORK_MGMNT].state -= 25
# Dispatch usage value to collectd
val = collectd.Values(host=obj.hostname)
val.plugin = 'interface'
val.plugin_instance = 'mgmnt'
val.type = 'absolute'
val.type_instance = 'used'
val.dispatch(values=[obj.NETWORKS[NETWORK_MGMNT].state])
return 0
# register the config, init and read functions
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func)

View File

@@ -0,0 +1,21 @@
# For stock plugin only
# Uncomment to compare stock to TiS plugin readings
# ---------------------
# <Plugin memory>
# ValuesAbsolute false
# ValuesPercentage true
# </Plugin>
<Plugin "threshold">
<Plugin "memory">
<Type "percent">
Instance "used"
Persist true
PersistOK true
WarningMax 80.00
FailureMax 90.00
Hits 2
Invert false
</Type>
</Plugin>
</Plugin>

View File

@@ -0,0 +1,181 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
############################################################################
#
# This file is the collectd 'Platform Memory Usage' Monitor.
#
# The Platform Memory Usage is calculated as a percentage of used
# platform memory based on the current values in /proc/meminfo.
#
# Init Function:
# - if '/proc/sys/vm/overcommit_memory' exists then query/store the
#   strict overcommit setting
#
############################################################################
import os
import collectd
debug = False
# general return codes
PASS = 0
FAIL = 1
PLUGIN = 'platform memory usage'
# Memory usage class
class MEM:
hostname = "" # hostname for sample notification message
cmd = '/proc/meminfo' # the file to query
value = float(0.0) # float value of memory usage
# meminfo values we care about
memTotal_kB = 0
memFree_kB = 0
buffers = 0
cached = 0
SReclaimable = 0
CommitLimit = 0
Committed_AS = 0
HugePages_Total = 0
Hugepagesize = 0
AnonPages = 0
# derived values
avail = 0
total = 0
strict = 0
# Instantiate the class
obj = MEM()
def config_func(config):
"""
Configure the memory usage plugin
"""
for node in config.children:
key = node.key.lower()
val = node.values[0]
if key == 'path':
obj.cmd = str(val)
collectd.info("%s configured query command: '%s'" %
(PLUGIN, obj.cmd))
return 0
collectd.info("%s no config command provided ; "
"defaulting to '%s'" %
(PLUGIN, obj.cmd))
# Get the memory overcommit setting from /proc/sys/vm/overcommit_memory
def init_func():
# get current hostname
obj.hostname = os.uname()[1]
fn = '/proc/sys/vm/overcommit_memory'
if os.path.exists(fn):
with open(fn, 'r') as infile:
for line in infile:
obj.strict = int(line)
break
collectd.info("%s strict:%d" % (PLUGIN, obj.strict))
# Calculate the memory usage sample
def read_func():
meminfo = {}
try:
with open(obj.cmd) as fd:
for line in fd:
meminfo[line.split(':')[0]] = line.split(':')[1].strip()
except EnvironmentError as e:
collectd.error("%s unable to read from %s ; str(e)" %
(PLUGIN, str(e)))
return FAIL
# remove the 'unit' (kB) suffix that might be on some of the lines
for line in meminfo:
# remove the units from the value read
value_unit = [u.strip() for u in meminfo[line].split(' ', 1)]
if len(value_unit) == 2:
value, unit = value_unit
meminfo[line] = float(value)
else:
meminfo[line] = float(meminfo[line])
obj.memTotal_kB = float(meminfo['MemTotal'])
obj.memFree_kB = float(meminfo['MemFree'])
obj.buffers = float(meminfo['Buffers'])
obj.cached = float(meminfo['Cached'])
obj.SReclaimable = float(meminfo['SReclaimable'])
obj.CommitLimit = float(meminfo['CommitLimit'])
obj.Committed_AS = float(meminfo['Committed_AS'])
obj.HugePages_Total = float(meminfo['HugePages_Total'])
obj.Hugepagesize = float(meminfo['Hugepagesize'])
obj.AnonPages = float(meminfo['AnonPages'])
# collectd.info("%s /proc/meminfo: %s" % (PLUGIN, meminfo))
# collectd.info("%s ---------------------------" % PLUGIN)
# collectd.info("%s memTotal_kB : %f" % (PLUGIN, obj.memTotal_kB))
# collectd.info("%s memFree_kB : %f" % (PLUGIN, obj.memFree_kB))
# collectd.info("%s Buffers : %f" % (PLUGIN, obj.buffers))
# collectd.info("%s Cached : %f" % (PLUGIN, obj.cached))
# collectd.info("%s SReclaimable : %f" % (PLUGIN, obj.SReclaimable))
# collectd.info("%s CommitLimit : %f" % (PLUGIN, obj.CommitLimit))
# collectd.info("%s Committed_AS : %f" % (PLUGIN, obj.Committed_AS))
# collectd.info("%s HugePages_Total: %f" % (PLUGIN, obj.HugePages_Total))
# collectd.info("%s AnonPages : %f" % (PLUGIN, obj.AnonPages))
obj.avail = float(float(obj.memFree_kB) +
float(obj.buffers) +
float(obj.cached) +
float(obj.SReclaimable))
obj.total = float(float(obj.avail) +
float(obj.AnonPages))
# collectd.info("%s ---------------------------" % PLUGIN)
# collectd.info("%s memTotal: %d" % (PLUGIN, obj.avail))
# collectd.info("%s memAvail: %d" % (PLUGIN, obj.total))
if obj.strict == 1:
obj.value = float(float(obj.Committed_AS) / float(obj.CommitLimit))
else:
obj.value = float(float(obj.AnonPages) / float(obj.total))
obj.value = float(float(obj.value) * 100)
# get numa node memory
# numa_node_files = []
# fn = "/sys/devices/system/node/"
# files = os.listdir(fn)
# for file in files:
# if 'node' in file:
# numa_node_files.append(fn + file)
# collectd.info("%s numa node files: %s" %
# (PLUGIN, numa_node_files))
collectd.debug('%s reports %.2f %% usage' %
(PLUGIN, obj.value))
# Dispatch usage value to collectd
val = collectd.Values(host=obj.hostname)
val.plugin = 'memory'
val.type = 'percent'
val.type_instance = 'used'
val.dispatch(values=[obj.value])
return PASS
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func)
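
Condensed, the arithmetic performed by read_func above is as follows
(a sketch ; field values are /proc/meminfo entries in kB and the sample
numbers are made up):

def platform_memory_percent(m, strict=0):
    """Platform memory usage in percent from a dict of /proc/meminfo fields."""
    if strict:
        # strict overcommit: measure committed memory against the commit limit
        return 100.0 * m['Committed_AS'] / m['CommitLimit']
    avail = m['MemFree'] + m['Buffers'] + m['Cached'] + m['SReclaimable']
    total = avail + m['AnonPages']
    return 100.0 * m['AnonPages'] / total

# platform_memory_percent({'MemFree': 800000, 'Buffers': 50000,
#                          'Cached': 150000, 'SReclaimable': 25000,
#                          'AnonPages': 300000}) -> ~22.6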

View File

@@ -0,0 +1,379 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This file is the collectd 'Maintenance' Notifier.
#
# Collectd provides information about each event as an object passed to the
# notification handler ; the notification object.
#
# object.host - the hostname
#
# object.plugin - the name of the plugin aka resource
# object.plugin_instance - plugin instance string i.e. say mountpoint
# for df plugin
# object.type, - the unit i.e. percent or absolute
# object.type_instance - the attribute i.e. free, used, etc
#
# object.severity - an integer value ; 1=failure, 2=warning, 4=okay
# object.message - a log-able message containing the above along
# with the value
#
# This notifier manages requesting mtce to assert or clear its collectd
# host-degrade-cause flag based on notification messages sent from collectd.
#
# Messages to maintenance are throttled to one every ONE_EVERY while the
# state remains the same as the last state.
#
# Message is sent on every state change
# from clear to assert or
# from assert to clear
#
# See code comments for details.
#
############################################################################
#
# Import list
import os
import socket
import collectd
# This plugin name
PLUGIN = 'degrade notifier'
# collectd severity definitions ;
# Note: can't seem to pull them in symbolically with a header
NOTIF_FAILURE = 1
NOTIF_WARNING = 2
NOTIF_OKAY = 4
# generic return codes
PASS = 0
FAIL = 1
# default mtce port.
# ... with configuration override
MTCE_CMD_RX_PORT = 2101
# same state message throttle count.
# ... only send the degrade message every 'this' number
# while the state of assert or clear remains the same.
ONE_EVERY = 10
PLUGIN__DF = 'df'
PLUGIN__MEM = 'memory'
PLUGIN__CPU = 'cpu'
PLUGIN_INTERFACE = 'interface'
PLUGIN__EXAMPLE = 'example'
# The collectd Maintenance Notifier Object
class collectdMtceNotifierObject:
def __init__(self, port):
"""
collectdMtceNotifierObject Class constructor
"""
# default maintenance port
self.port = port
self.addr = None
# specifies the protocol family to use when messaging maintenance.
# if system is IPV6, then that is learned and this 'protocol' is
# updated with AF_INET6
self.protocol = socket.AF_INET
# List of plugin names that require degrade for specified severity.
self.degrade_list__failure = [PLUGIN__DF,
PLUGIN__MEM,
PLUGIN__CPU,
PLUGIN_INTERFACE,
PLUGIN__EXAMPLE]
self.degrade_list__warning = []
# the running list of resources that require degrade.
# a degrade clear message is sent whenever this list is empty.
# a degrade assert message is sent whenever this list is not empty.
self.degrade_list = []
# throttle down sending of duplicate degrade assert/clear messages
self.last_state = "undef"
self.msg_throttle = 0
# Instantiate the mtce_notifier object
# This object persists from notification to notification
obj = collectdMtceNotifierObject(MTCE_CMD_RX_PORT)
def _get_active_controller_ip():
"""
Get the active controller host IP
"""
try:
obj.addr = socket.getaddrinfo('controller', None)[0][4][0]
collectd.info("%s controller ip: %s" % (PLUGIN, obj.addr))
except Exception as ex:
obj.addr = None
collectd.error("%s failed to get controller ip ; %s" %
(PLUGIN, str(ex)))
return 0
def _df_instance_to_path(df_inst):
"""
Convert a df instance name to a mountpoint
"""
# df_root is not a dynamic file system. Ignore that one.
if df_inst == 'df_root':
return '/'
else:
# For all others replace all '-' with '/'
return('/' + df_inst[3:].replace('-', '/'))
# This function removes degraded file systems that are no longer present.
def _clear_degrade_for_missing_filesystems():
"""
Remove degraded file systems that are no longer mounted or present.
"""
for df_inst in obj.degrade_list:
# Only file system plugins are looked at.
# File system plugin instance names are prefixed with 'df_'
# as the first 3 chars in the instance name.
if df_inst[0:3] == 'df_':
path = _df_instance_to_path(df_inst)
# check the mount point.
# if the mount point no longer exists then remove
# this instance from the degrade list.
if os.path.ismount(path) is False:
collectd.info("%s clearing degrade for missing %s ; %s" %
(PLUGIN, path, obj.degrade_list))
obj.degrade_list.remove(df_inst)
return 0
# The collectd configuration interface
#
# Used to configure the maintenance port.
# key = 'port'
# val = port number
#
def config_func(config):
"""
Configure the maintenance degrade notifier plugin.
"""
collectd.info('%s config function' % PLUGIN)
for node in config.children:
key = node.key.lower()
val = node.values[0]
if key == 'port':
obj.port = int(val)
collectd.info("%s configured mtce port: %d" %
(PLUGIN, obj.port))
return 0
obj.port = MTCE_CMD_RX_PORT
collectd.error("%s no mtce port provided ; defaulting to %d" %
(PLUGIN, obj.port))
# Collectd calls this function on startup.
def init_func():
"""
Collectd Mtce Notifier Initialization Function
"""
collectd.debug("%s init function" % PLUGIN)
# This is the Notifier function that is called by collectd.
#
# Handling steps are
#
# 1. build resource name from notification object.
# 2. check resource against severity lists.
# 3. manage this instance's degrade state.
# 4. send mtcAgent the degrade state message.
#
def notifier_func(nObject):
"""
Collectd Mtce Notifier Handler Function
"""
# Create the resource name from the notifier object.
# format: <plugin name>_<plugin_instance_name>
resource = nObject.plugin
if nObject.plugin_instance:
resource += "_" + nObject.plugin_instance
# This block looks at the current notification severity
# and manages the degrade_list.
# If the specified plugin name exists in each of the warnings
# or failure lists and there is a current severity match then
# add that resource instance to the degrade list.
# Conversely if this notification is OKAY then make sure this
# resource instance is not in the degrade list (remove it if it is)
if nObject.severity is NOTIF_OKAY:
if obj.degrade_list and resource in obj.degrade_list:
obj.degrade_list.remove(resource)
elif nObject.severity is NOTIF_FAILURE:
if obj.degrade_list__failure:
if nObject.plugin in obj.degrade_list__failure:
if resource not in obj.degrade_list:
# handle dynamic filesystems going missing over a swact
# or unmount and being reported as a transient error by
# the df plugin. Don't add it to the failed list if the
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.debug("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:
# If severity is failure and no failures cause degrade
# then make sure this plugin is not in the degrade list,
# Should never occur.
if resource in obj.degrade_list:
obj.degrade_list.remove(resource)
elif nObject.severity is NOTIF_WARNING:
if obj.degrade_list__warning:
if nObject.plugin in obj.degrade_list__warning:
if resource not in obj.degrade_list:
# handle dynamic filesystems going missing over a swact
# or unmount and being reported as a transient error by
# the df plugin. Don't add it to the failed list if the
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.debug("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:
# If severity is warning and no warnings cause degrade
# then make sure this plugin is not in the degrade list.
# Should never occur..
if resource in obj.degrade_list:
obj.degrade_list.remove(resource)
else:
collectd.info("%s unsupported severity %d" %
(PLUGIN, nObject.severity))
return FAIL
# running counter of notifications.
obj.msg_throttle += 1
# Support for Dynamic File Systems
# --------------------------------
# Some active controller mounted filesystems can become
# unmounted under the watch of collectd. This can occur
# as a result of a Swact. If an 'degrade' is raised at the
# time an fs disappears then that state can become stuck
# active until the next Swact. This call handles this case.
#
# Audit file system presence every time we get the
# notification for the root file system.
# This relies on the root filesystem always being there.
if nObject.plugin == 'df' \
and nObject.plugin_instance == 'root' \
and len(obj.degrade_list):
_clear_degrade_for_missing_filesystems()
# If degrade list is empty then a clear state is sent to maintenance.
# If degrade list is NOT empty then an assert state is sent to maintenance
# For logging and to ease debug the code below will create a list of
# degraded resource instances to be included in the message to maintenance
# for mtcAgent to optionally log it.
resources = ""
if obj.degrade_list:
# loop over the list,
# limit the degraded resource list being sent to mtce to 5
for r in obj.degrade_list[0:5]:
resources += r + ','
resources = resources[:-1]
state = "assert"
else:
state = "clear"
# Message throttling ....
# Avoid sending the same last state message for up to ONE_EVERY count.
# Just reduce load on mtcAgent
if obj.last_state == state and obj.msg_throttle < ONE_EVERY:
return 0
# if the degrade state has changed then log it and proceed
if obj.last_state != state:
if obj.last_state != "undef":
collectd.info("%s degrade %s %s" %
(PLUGIN,
state,
obj.degrade_list))
# Save state for next time
obj.last_state = state
# Clear the message throttle counter
obj.msg_throttle = 0
# Send the degrade state ; assert or clear message to mtcAgent.
# If we get a send failure then log it and set the addr to None
# so it forces us to refresh the controller address on the next
# notification
try:
mtce_socket = socket.socket(obj.protocol, socket.SOCK_DGRAM)
if mtce_socket:
if obj.addr is None:
_get_active_controller_ip()
if obj.addr is None:
return 0
# Create the Maintenance message.
message = "{\"service\":\"collectd_notifier\","
message += "\"hostname\":\"" + nObject.host + "\","
message += "\"degrade\":\"" + state + "\","
message += "\"resource\":\"" + resources + "\"}"
collectd.debug("%s: %s" % (PLUGIN, message))
mtce_socket.settimeout(1.0)
mtce_socket.sendto(message, (obj.addr, obj.port))
mtce_socket.close()
else:
collectd.error("%s %s failed to open socket (%s)" %
(PLUGIN, resource, obj.addr))
except socket.error as e:
if e.args[0] == socket.EAI_ADDRFAMILY:
# Handle IPV4 to IPV6 switchover:
obj.protocol = socket.AF_INET6
collectd.info("%s %s ipv6 addressing (%s)" %
(PLUGIN, resource, obj.addr))
else:
collectd.error("%s %s socket error (%s) ; %s" %
(PLUGIN, resource, obj.addr, str(e)))
# try self correction
obj.addr = None
obj.protocol = socket.AF_INET
return 0
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_notification(notifier_func)
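
The datagram assembled by string concatenation in notifier_func is plain
JSON ; an equivalent sketch using json.dumps (the hostname and resource
names below are made up, and key order may differ). Note that the df
instance-to-mountpoint mapping used when pruning the degrade list works
as _df_instance_to_path shows: 'df_var_log' maps to '/var/log'.

import json

def degrade_message(hostname, state, resources):
    """Build the payload sent to mtcAgent."""
    return json.dumps({'service': 'collectd_notifier',
                       'hostname': hostname,
                       'degrade': state,        # 'assert' or 'clear'
                       'resource': resources})  # comma-separated instances

# degrade_message('controller-0', 'assert', 'df_var_log,memory')
# -> '{"service": "collectd_notifier", "hostname": "controller-0",
#     "degrade": "assert", "resource": "df_var_log,memory"}'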

View File

@@ -0,0 +1,17 @@
#<Plugin "ntpq">
# Interval 60
#</Plugin>
<Plugin "threshold">
<Plugin "ntpq">
<Type "absolute">
Instance "state"
Persist true
PersistOK true
WarningMin 1
FailureMin 0
# Hits 2
Invert false
</Type>
</Plugin>
</Plugin>

View File

@@ -0,0 +1,195 @@
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
import os
import subprocess
import uuid
import collectd
from fm_api import constants as fm_constants
from fm_api import fm_api
import tsconfig.tsconfig as tsc
api = fm_api.FaultAPIs()
PLUGIN = 'NTP query plugin'
PLUGIN_SCRIPT = '/etc/rmonfiles.d/query_ntp_servers.sh'
PLUGIN_RESULT = '/tmp/ntpq_server_info'
# static variables
ALARM_ID__NTPQ = "100.114"
# define a class here that will persist over read calls
class NtpqObject:
hostname = ''
base_eid = ''
severity = 'clear'
suppression = True
service_affecting = False
status = 0
last_result = ''
this_result = ''
id = ALARM_ID__NTPQ
name = "NTP"
alarm_type = fm_constants.FM_ALARM_TYPE_1
cause = fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN
repair = "Monitor and if condition persists, "
repair += "contact next level of support."
obj = NtpqObject()
def is_uuid_like(val):
"""Returns validation of a value as a UUID."""
try:
return str(uuid.UUID(val)) == val
except (TypeError, ValueError, AttributeError):
return False
# The config function - called once on collectd process startup
def config_func(config):
"""
Configure the plugin
"""
collectd.debug('%s config function' % PLUGIN)
return 0
# The init function - called once on collectd process startup
def init_func():
# ntp query is for controllers only
if tsc.nodetype != 'controller':
return 0
# get current hostname
obj.hostname = os.uname()[1]
obj.base_eid = 'host=' + obj.hostname + '.ntp'
collectd.info("%s on %s with entity id '%s'" % PLUGIN, obj.hostname, obj.base_eid)
return 0
# The sample read function - called on every audit interval
def read_func():
# ntp query is for controllers only
if tsc.nodetype != 'controller':
return 0
result = int(0)
# Query ntp
try:
result = os.system(PLUGIN_SCRIPT)
except Exception as e:
collectd.error("%s Could not run '%s' (%s)" %
(PLUGIN, e))
return 0
obj.status = int(result)/0x100
collectd.info("%s Query Result: %s" % (PLUGIN, obj.status))
if os.path.exists(PLUGIN_RESULT) is False:
collectd.error("%s produced no result file '%s'" %
(PLUGIN, PLUGIN_RESULT))
return 0
# read the query result file.
# format is in the PLUGIN_SCRIPT file.
# This code only wants the second line.
# It contains the list of unreachable ntp servers that need alarm management.
count = 0
with open(PLUGIN_RESULT, 'r') as infile:
for line in infile:
count += 1
collectd.info("%s Query Result: %s" % (PLUGIN, line))
if count == 0:
collectd.error("%s produced empty result file '%s'" %
(PLUGIN, PLUGIN_RESULT))
return 0
sample = 1
# Dispatch usage value to collectd
val = collectd.Values(host=obj.hostname)
val.plugin = 'ntpq'
val.plugin_instance = 'some.ntp.server.ip'
val.type = 'absolute'
val.type_instance = 'state'
val.dispatch(values=[sample])
severity = 'clear'
obj.severity = 'clear'
# if there is no severity change then consider exiting
if obj.severity == severity:
# unless the current severity is 'minor'
if severity == 'minor':
# TODO: check to see if the failing IP address is changed
collectd.info("%s NEED TO CHECK IP ADDRESSES" % (PLUGIN))
else:
return 0
# if current severity is clear but previous severity is not then
# prepare to clear the alarms
if severity == 'clear':
_alarm_state = fm_constants.FM_ALARM_STATE_CLEAR
# TODO: loop over all raised alarms and clear them
collectd.info("%s NEED CLEAR ALL ALARMS" % (PLUGIN))
if api.clear_fault(obj.id, obj.base_eid) is False:
collectd.error("%s %s:%s clear_fault failed" %
(PLUGIN, obj.id, obj.base_eid))
return 0
elif severity == 'major':
reason = "NTP configuration does not contain any valid "
reason += "or reachable NTP servers."
eid = obj.base_eid
fm_severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
else:
# TODO: There can be up to 3 inaccessible servers
ip = 'some.server.ip.addr'
reason = "NTP address "
reason += ip
reason += " is not a valid or a reachable NTP server."
eid = obj.base_eid + '=' + ip
fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR
fault = fm_api.Fault(
alarm_id=obj.id,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=eid,
severity=fm_severity,
reason_text=reason,
alarm_type=obj.alarm_type,
probable_cause=obj.cause,
proposed_repair_action=obj.repair,
service_affecting=obj.service_affecting,
suppression=obj.suppression)
alarm_uuid = api.set_fault(fault)
if is_uuid_like(alarm_uuid) is False:
collectd.error("%s %s:%s set_fault failed:%s" %
(PLUGIN, obj.id, eid, alarm_uuid))
return 0
# TODO: clear the object alarm state
return 0
# register the config, init and read functions
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func)
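
os.system returns the raw wait status, which is why read_func above
divides the result by 0x100 to recover the script's exit code ; the
standard-library equivalent (shown for clarity, not used by the plugin):

import os

result = os.system('/bin/true')      # raw wait status from the shell
exit_code = os.WEXITSTATUS(result)   # high byte ; same as result / 0x100
assert exit_code == 0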

View File

@@ -0,0 +1,20 @@
LoadPlugin python
<Plugin python>
ModulePath "/opt/collectd/extensions/python"
Import "cpu"
<Module "cpu">
Path "/proc/cpuinfo"
</Module>
Import "memory"
<Module "memory">
Path "/proc/meminfo"
</Module>
# Import "example"
# <Module "example">
# Data "1 50"
# </Module>
# Import "interface"
# Import "ntpq"
LogTraces = true
Encoding "utf-8"
</Plugin>
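
Each Import line above loads a module from ModulePath, and the matching
<Module> block is handed to that module's registered config callback. The
minimal shape such a module takes, following the pattern of the plugins
in this commit (sketch only ; plugin name is illustrative):

import collectd

def config_func(config):
    # receives the <Module ...> block, e.g. the Path "..." line
    for node in config.children:
        collectd.info('%s = %s' % (node.key, node.values[0]))

def init_func():
    # one-time setup when the collectd daemon starts
    return 0

def read_func():
    # called on every audit interval ; dispatch one sample
    val = collectd.Values(plugin='sample', type='percent',
                          type_instance='used')
    val.dispatch(values=[42.0])
    return 0

collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func)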

View File

@@ -0,0 +1,10 @@
Metadata-Version: 1.1
Name: influxdb-extensions
Version: 1.0
Summary: influxdb-extensions
Home-page:
Author: Windriver
Author-email: info@windriver.com
License: windriver
Description: Titanium Cloud influxdb extensions.
Platform: UNKNOWN

View File

@@ -0,0 +1,7 @@
SRC_DIR="$PKG_BASE"
COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/influxdb.conf.pmon \
$PKG_BASE/src/influxdb.service"
TIS_PATCH_VER=1

View File

@@ -0,0 +1,46 @@
Summary: Titanium Server influxdb Extensions Package
Name: influxdb-extensions
Version: 1.0
Release: 0%{?_tis_dist}.%{tis_patch_ver}
License: windriver
Group: base
Packager: Wind River <info@windriver.com>
URL: unknown
# create the files tarball
Source0: %{name}-%{version}.tar.gz
Source1: influxdb.service
Source2: influxdb.conf.pmon
Requires: systemd
Requires: influxdb
Requires: /bin/systemctl
%description
Titanium Cloud influxdb extensions
%define debug_package %{nil}
%define local_unit_dir %{_sysconfdir}/systemd/system
%prep
%setup
%build
%install
install -m 755 -d %{buildroot}%{_sysconfdir}
install -m 755 -d %{buildroot}%{_sysconfdir}/influxdb
install -m 755 -d %{buildroot}%{local_unit_dir}
install -m 644 %{SOURCE1} %{buildroot}%{local_unit_dir}
install -m 600 %{SOURCE2} %{buildroot}%{_sysconfdir}/influxdb
%clean
rm -rf $RPM_BUILD_ROOT
%files
%defattr(-,root,root,-)
%config(noreplace) %{local_unit_dir}/influxdb.service
%{_sysconfdir}/influxdb/*

View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,322 @@
### Welcome to the InfluxDB configuration file.
# Once every 24 hours InfluxDB will report anonymous data to m.influxdb.com
# The data includes raft id (random 8 bytes), os, arch, version, and metadata.
# We don't track ip addresses of servers reporting. This is only used
# to track the number of instances running and the versions, which
# is very helpful for us.
# Change this option to true to disable reporting.
reporting-disabled = false
###
### Enterprise registration control
###
[registration]
# enabled = true
# url = "https://enterprise.influxdata.com" # The Enterprise server URL
# token = "" # Registration token for Enterprise server
###
### [meta]
###
### Controls the parameters for the Raft consensus group that stores metadata
### about the InfluxDB cluster.
###
[meta]
dir = "/var/lib/influxdb/meta"
hostname = "localhost"
bind-address = ":8088"
retention-autocreate = true
election-timeout = "1s"
heartbeat-timeout = "1s"
leader-lease-timeout = "500ms"
commit-timeout = "50ms"
cluster-tracing = false
# If enabled, when a Raft cluster loses a peer due to a `DROP SERVER` command,
# the leader will automatically ask a non-raft peer node to promote to a raft
# peer. This only happens if there is a non-raft peer node available to promote.
# This setting only affects the local node, so to ensure it operates correctly, be sure to set
# it in the config of every node.
raft-promotion-enabled = true
###
### [data]
###
### Controls where the actual shard data for InfluxDB lives and how it is
### flushed from the WAL. "dir" may need to be changed to a suitable place
### for your system, but the WAL settings are an advanced configuration. The
### defaults should work for most systems.
###
[data]
dir = "/var/lib/influxdb/data"
# Controls the engine type for new shards. Options are b1, bz1, or tsm1.
# b1 is the 0.9.2 storage engine, bz1 is the 0.9.3 and 0.9.4 engine.
# tsm1 is the 0.9.5 engine and is currently EXPERIMENTAL. Until 0.9.5 is
# actually released, data written into a tsm1 engine may need to be wiped
# between upgrades.
# engine ="bz1"
# The following WAL settings are for the b1 storage engine used in 0.9.2. They won't
# apply to any new shards created after upgrading to a version > 0.9.3.
max-wal-size = 104857600 # Maximum size the WAL can reach before a flush. Defaults to 100MB.
wal-flush-interval = "10m" # Maximum time data can sit in WAL before a flush.
wal-partition-flush-delay = "2s" # The delay time between each WAL partition being flushed.
# These are the WAL settings for the storage engine >= 0.9.3
wal-dir = "/var/lib/influxdb/wal"
wal-enable-logging = true
# When a series in the WAL in-memory cache reaches this size in bytes it is marked as ready to
# flush to the index
# wal-ready-series-size = 25600
# Flush and compact a partition once this ratio of series are over the ready size
# wal-compaction-threshold = 0.6
# Force a flush and compaction if any series in a partition gets above this size in bytes
# wal-max-series-size = 2097152
# Force a flush of all series and full compaction if there have been no writes in this
# amount of time. This is useful for ensuring that shards that are cold for writes don't
# keep a bunch of data cached in memory and in the WAL.
# wal-flush-cold-interval = "10m"
# Force a partition to flush its largest series if it reaches this approximate size in
# bytes. Remember there are 5 partitions so you'll need at least 5x this amount of memory.
# The more memory you have, the bigger this can be.
# wal-partition-size-threshold = 20971520
# Whether queries should be logged before execution. Very useful for troubleshooting, but will
# log any sensitive data contained within a query.
# query-log-enabled = true
###
### [hinted-handoff]
###
### Controls the hinted handoff feature, which allows nodes to temporarily
### store queued data when one node of a cluster is down for a short period
### of time.
###
[hinted-handoff]
enabled = true
dir = "/var/lib/influxdb/hh"
max-size = 1073741824
max-age = "168h"
retry-rate-limit = 0
# Hinted handoff will start retrying writes to down nodes at a rate of once per second.
# If any error occurs, it will back off exponentially until the interval
# reaches retry-max-interval. Once writes to all nodes are successfully completed the
# interval will reset to retry-interval.
retry-interval = "1s"
retry-max-interval = "1m"
# Interval between running checks for data that should be purged. Data is purged from
# hinted-handoff queues for two reasons. 1) The data is older than the max age, or
# 2) the target node has been dropped from the cluster. Note, however, that data
# is never dropped until it has reached max-age, whether the node was dropped or not.
purge-interval = "1h"
###
### [cluster]
###
### Controls non-Raft cluster behavior, which generally includes how data is
### shared across shards.
###
[cluster]
shard-writer-timeout = "10s" # The time within which a shard must respond to a write.
write-timeout = "5s" # The time within which a write operation must complete on the cluster.
###
### [retention]
###
### Controls the enforcement of retention policies for evicting old data.
###
[retention]
enabled = true
check-interval = "30m"
###
### [shard-precreation]
###
### Controls the precreation of shards, so they are created before data arrives.
### Only shards that will exist in the future, at time of creation, are precreated.
###
[shard-precreation]
enabled = true
check-interval = "10m"
advance-period = "30m"
###
### [monitor]
###
### Controls the system self-monitoring, statistics and diagnostics.
###
### The internal database for monitoring data is created automatically if
### it does not already exist. The target retention within this database
### is called 'monitor' and is also created with a retention period of 7 days
### and a replication factor of 1, if it does not exist. In all cases
### this retention policy is configured as the default for the database.
[monitor]
store-enabled = true # Whether to record statistics internally.
store-database = "_internal" # The destination database for recorded statistics
store-interval = "10s" # The interval at which to record statistics
###
### [admin]
###
### Controls the availability of the built-in, web-based admin interface. If HTTPS is
### enabled for the admin interface, HTTPS must also be enabled on the [http] service.
###
[admin]
enabled = true
bind-address = ":8083"
https-enabled = false
https-certificate = "/etc/ssl/influxdb.pem"
###
### [http]
###
### Controls how the HTTP endpoints are configured. These are the primary
### mechanism for getting data into and out of InfluxDB.
###
[http]
enabled = true
bind-address = ":8086"
auth-enabled = false
log-enabled = true
write-tracing = false
pprof-enabled = false
https-enabled = false
https-certificate = "/etc/ssl/influxdb.pem"
###
### [[graphite]]
###
### Controls one or many listeners for Graphite data.
###
[[graphite]]
enabled = false
# database = "graphite"
# bind-address = ":2003"
# protocol = "tcp"
# consistency-level = "one"
# name-separator = "."
# These next lines control how batching works. You should have this enabled
# otherwise you could get dropped metrics or poor performance. Batching
# will buffer points in memory if you have many coming in.
# batch-size = 1000 # will flush if this many points get buffered
# batch-pending = 5 # number of batches that may be pending in memory
# batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit
# udp-read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max.
## "name-schema" configures tag names for parsing the metric name from graphite protocol;
## separated by `name-separator`.
## The "measurement" tag is special and the corresponding field will become
## the name of the metric.
## e.g. "type.host.measurement.device" will parse "server.localhost.cpu.cpu0" as
## {
## measurement: "cpu",
## tags: {
## "type": "server",
## "host": "localhost,
## "device": "cpu0"
## }
## }
# name-schema = "type.host.measurement.device"
## If set to true, when the input metric name has more fields than `name-schema` specified,
## the extra fields will be ignored.
## Otherwise an error will be logged and the metric rejected.
# ignore-unnamed = true
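## A short Python sketch of this name-schema parsing follows this file.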
###
### [collectd]
###
### Controls the listener for collectd data.
###
[collectd]
enabled = true
bind-address = "127.0.0.1:25826"
database = "collectd"
typesdb = "/usr/share/collectd/types.db"
# These next lines control how batching works. You should have this enabled
# otherwise you could get dropped metrics or poor performance. Batching
# will buffer points in memory if you have many coming in.
# batch-size = 1000 # will flush if this many points get buffered
# batch-pending = 5 # number of batches that may be pending in memory
# batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit
# read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max.
###
### [opentsdb]
###
### Controls the listener for OpenTSDB data.
###
[opentsdb]
enabled = false
# bind-address = ":4242"
# database = "opentsdb"
# retention-policy = ""
# consistency-level = "one"
# tls-enabled = false
# certificate= ""
# These next lines control how batching works. You should have this enabled
# otherwise you could get dropped metrics or poor performance. Only points
# received over the telnet protocol undergo batching.
# batch-size = 1000 # will flush if this many points get buffered
# batch-pending = 5 # number of batches that may be pending in memory
# batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit
###
### [[udp]]
###
### Controls the listeners for InfluxDB line protocol data via UDP.
###
[[udp]]
enabled = false
# bind-address = ""
# database = "udp"
# retention-policy = ""
# These next lines control how batching works. You should have this enabled
# otherwise you could get dropped metrics or poor performance. Batching
# will buffer points in memory if you have many coming in.
# batch-size = 1000 # will flush if this many points get buffered
# batch-pending = 5 # number of batches that may be pending in memory
# batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit
# read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max.
###
### [continuous_queries]
###
### Controls how continuous queries are run within InfluxDB.
###
[continuous_queries]
log-enabled = true
enabled = true
recompute-previous-n = 2
recompute-no-older-than = "10m"
compute-runs-per-interval = 10
compute-no-more-than = "2m"
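
The name-schema comments in the [[graphite]] section above describe how a
dotted Graphite metric name is split into a measurement and tags. The sketch
below (Python, illustrative only; InfluxDB's real parser is written in Go)
shows the same mapping, including the ignore-unnamed handling of extra
trailing fields:

def parse_graphite_name(name, schema="type.host.measurement.device", sep="."):
    """Split a Graphite metric name into (measurement, tags) per name-schema."""
    fields = schema.split(sep)
    parts = name.split(sep)
    # mirrors ignore-unnamed = true: extra trailing fields are dropped
    parts = parts[:len(fields)]
    measurement = None
    tags = {}
    for field, value in zip(fields, parts):
        if field == "measurement":
            measurement = value          # the special "measurement" field
        else:
            tags[field] = value
    return measurement, tags

print(parse_graphite_name("server.localhost.cpu.cpu0"))
# ('cpu', {'type': 'server', 'host': 'localhost', 'device': 'cpu0'})

With the [collectd] listener on 127.0.0.1:25826 and the [http] API on :8086
as configured above, collectd samples should accumulate in the "collectd"
database. A minimal check against the stock /query endpoint (assumes Python 3
on the host; database name and port come from this config, not hard
requirements):

import json
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({"db": "collectd", "q": "SHOW MEASUREMENTS"})
with urllib.request.urlopen("http://127.0.0.1:8086/query?" + params) as resp:
    print(json.dumps(json.loads(resp.read().decode()), indent=2))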

View File

@ -0,0 +1,17 @@
[process]
process = influxdb
service = influxdb
style = lsb
pidfile = /var/run/influxdb/influxdb.pid
severity = major ; minor, major, critical
restarts = 3 ; restart retries before error assertion
interval = 5 ; number of seconds to wait between restarts
debounce = 10 ; number of seconds that a process needs to remain
; running before degrade is removed and retry count
; is cleared.
startuptime = 3 ; Seconds to wait after process start before starting the debounce monitor
mode = passive ; Monitoring mode: passive (default) or active
; passive: process death monitoring (default: always)
; active : heartbeat monitoring, i.e. request / response messaging
; ignore : do not monitor or stop monitoring
quorum = 0 ; process is in the host watchdog quorum
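
Taken together, these fields describe passive process monitoring: pmond
watches the pid recorded in the pidfile, restarts the service when the
process dies, and clears the retry count once the process has stayed up for
the debounce period. A rough sketch of that loop (Python, illustrative only;
pmond itself is the mtce maintenance daemon, and the restart command here is
an assumption based on the service name above):

import os
import subprocess
import time

PIDFILE = "/var/run/influxdb/influxdb.pid"
RESTARTS = 3      # restart retries before error assertion
INTERVAL = 5      # seconds to wait between restarts
DEBOUNCE = 10     # seconds of stable running before retries clear

def pid_running(pidfile):
    # passive mode: alive means the pidfile names a live pid
    try:
        with open(pidfile) as f:
            pid = int(f.read().strip())
        os.kill(pid, 0)                  # signal 0 only tests existence
        return True
    except (OSError, ValueError):
        return False

retries = 0
started = time.time()
while True:
    if pid_running(PIDFILE):
        if time.time() - started >= DEBOUNCE:
            retries = 0                  # debounced: clear the retry count
    elif retries < RESTARTS:
        subprocess.call(["systemctl", "restart", "influxdb"])
        retries += 1
        started = time.time()
        time.sleep(INTERVAL)
    else:
        raise RuntimeError("influxdb exceeded %d restart retries" % RESTARTS)
    time.sleep(1)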

View File

@ -0,0 +1,16 @@
#daily
nodateext
/var/log/influxdb/influxdb.log
{
size 20M
start 1
missingok
rotate 20
compress
sharedscripts
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
}

View File

@ -0,0 +1,20 @@
[Unit]
Description=InfluxDB open-source, distributed, time series database
Documentation=https://influxdb.com/docs/
After=local-fs.target network.target
Before=collectd.service
[Service]
User=influxdb
Group=influxdb
LimitNOFILE=65536
Environment='STDOUT=/dev/null'
Environment='STDERR=/var/log/influxdb/influxd.log'
EnvironmentFile=-/etc/default/influxdb
ExecStart=/bin/sh -c "/usr/bin/influxd -config /etc/influxdb/influxdb.conf -pidfile /var/run/influxdb/influxdb.pid ${INFLUXD_OPTS} >> ${STDOUT} 2>> ${STDERR}"
ExecStopPost=/bin/bash -c 'rm /var/run/influxdb/influxdb.pid'
KillMode=control-group
[Install]
WantedBy=multi-user.target
Alias=influxd.service