Collectd+InfluxDb RMON Replacement (ALL METRICS) P1

This update introduces two new open source services to the Titanium
Cloud.

The first is 'collectd', a plugin-based resource monitoring
engine intended to obsolete the existing rmond process.
The second is 'influxdb', a lightweight time series
database intended to store the sample data.

Collectd and influxdb are configured through puppet and are started by
manifest.

Collectd runs on all hosts, while influxdb runs only on the controllers.
Collectd periodically monitors platform cpu, memory, and filesystem
resources in terms of percentage used. The sample value for each
monitored resource in each audit cycle is forwarded to the active
controller for storage in the 'influxdb' database.

This update develops several collectd 'support' plugins for managing
thresholds, alarms, and host degrade. Additionally, this update
implements home-grown plugins for platform cpu and memory monitoring
and combines the static and dynamic filesystem resource monitoring from
rmond into a single filesystem plugin based on collectd's stock 'df'
plugin.
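
For illustration, a minimal sketch of what a collectd Python read
plugin of this kind could look like. The plugin name, value source,
and arithmetic here are hypothetical; this is not the actual
fm_notifier or mtce_notifier source.

import collectd  # injected by collectd's python plugin at load time

PLUGIN = 'example_mem'  # hypothetical plugin name

def read_func():
    # Derive a percent-used value from /proc/meminfo (simplified).
    info = {}
    with open('/proc/meminfo') as f:
        for line in f:
            key, rest = line.split(':', 1)
            info[key] = int(rest.split()[0])  # values are in kB
    used_pct = 100.0 * (1.0 - float(info['MemAvailable']) / info['MemTotal'])

    # Dispatch one 'percent' sample; collectd's network plugin forwards
    # it to the active controller along with everything else.
    sample = collectd.Values(plugin=PLUGIN, type='percent')
    sample.dispatch(values=[used_pct])

collectd.register_read(read_func)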

Collectd's network plugin is configured to forward samples to the
active controller, while influxdb, which runs only on the controllers,
is configured to listen for collectd samples and store them in its
database.
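
Once stored, samples can be read back over influxdb's HTTP query API
(port 8086 per the [http] section of the influxdb config below). A
sketch, assuming collectd's usual measurement naming (for example
'memory_value'), which this commit does not itself confirm:

import json
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({
    'db': 'collectd',
    'q': 'SELECT * FROM "memory_value" LIMIT 5',  # measurement name assumed
})
with urllib.request.urlopen('http://controller:8086/query?' + params) as resp:
    print(json.dumps(json.load(resp), indent=2))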

Change-Id: I797f01ca09df3a7e8236153e4f0cb1f90be4c9b5
Eric MacDonald 2018-05-14 16:12:16 -04:00 committed by Jack Ding
parent 686d83b25b
commit 44e2f196d6
13 changed files with 642 additions and 25 deletions


@@ -70,6 +70,7 @@ Requires: puppet-nslcd
Requires: puppet-nssdb
Requires: puppet-puppi
Requires: puppet-vlan
Requires: puppet-collectd
%description
Platform puppet configuration files and manifests


@@ -52,6 +52,19 @@ platform::mtce::agent::params::heartbeat_period: 100
platform::mtce::agent::params::heartbeat_failure_threshold: 10
platform::mtce::agent::params::heartbeat_degrade_threshold: 6
# influxdb configuration for collectd
platform::influxdb::params::bind_address: ':25826'
platform::influxdb::params::database: 'collectd'
platform::influxdb::params::typesdb: '/usr/share/collectd/types.db'
platform::influxdb::params::batch_size: 1000
platform::influxdb::params::batch_pending: 5
platform::influxdb::params::batch_timeout: '2s'
platform::influxdb::params::read_buffer: 0
# influxdb log rotation file
platform::influxdb::logrotate::params::log_file_name: '/var/log/influxdb/influxdb.log'
platform::influxdb::logrotate::params::log_file_size: '20M'
platform::influxdb::logrotate::params::log_file_rotate: 10
# postgresql
postgresql::globals::needs_initdb: false


@@ -89,3 +89,22 @@ nova::placement::os_interface: 'internal'
ceilometer::telemetry_secret: ''
ceilometer::use_syslog: true
ceilometer::log_facility: 'local2'
# collectd: configuration
platform::collectd::params::interval: 30
platform::collectd::params::timeout: 2
platform::collectd::params::read_threads: 5
platform::collectd::params::write_threads: 5
platform::collectd::params::max_read_interval: 86400
platform::collectd::params::write_queue_limit_high: 1000000
platform::collectd::params::write_queue_limit_low: 800000
platform::collectd::params::server_addrs: ['controller']
platform::collectd::params::server_port: 25826
platform::collectd::params::collectd_d_dir: '/etc/collectd.d'
# collectd: module named plugins
platform::collectd::params::module_path: '/opt/collectd/extensions/python'
platform::collectd::params::plugins: ['fm_notifier','mtce_notifier']
platform::collectd::params::mtce_notifier_port: 2101
platform::collectd::params::log_traces: true
platform::collectd::params::encoding: "utf-8"
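
The 'mtce_notifier' entry above names a python plugin and the port
(2101) it uses to reach the maintenance daemon. The actual wire format
is not part of this hunk; the following is only a hypothetical sketch
of a notification callback relaying an event over localhost UDP:

import json
import socket

import collectd  # available when run inside collectd's python plugin

MTCE_PORT = 2101  # platform::collectd::params::mtce_notifier_port

def notify_func(notification):
    # Hypothetical payload; the real mtce_notifier message format is
    # not shown in this commit.
    payload = json.dumps({
        'host': notification.host,
        'plugin': notification.plugin,
        'severity': notification.severity,
    })
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        sock.sendto(payload.encode('utf-8'), ('localhost', MTCE_PORT))
    finally:
        sock.close()

collectd.register_notification(notify_func)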


@@ -28,7 +28,7 @@ include ::platform::sysinv
include ::platform::ceph
include ::platform::devices
include ::platform::grub
include ::platform::collectd
include ::openstack::client
include ::openstack::neutron


@@ -53,6 +53,10 @@ include ::platform::ceph
include ::platform::ceph::monitor
include ::platform::ceph::rgw
include ::platform::influxdb
include ::platform::influxdb::logrotate
include ::platform::collectd
include ::openstack::client
include ::openstack::keystone
include ::openstack::keystone::api


@@ -24,6 +24,7 @@ include ::platform::remotelogging
include ::platform::mtce
include ::platform::sysinv
include ::platform::grub
include ::platform::collectd
include ::platform::ceph
include ::platform::ceph::monitor


@@ -724,16 +724,6 @@ class openstack::cinder::post
require => Class['openstack::cinder'],
}
}
if $is_node_cinder_lvm {
exec { "Update cinder-volumes monitoring state to enabled":
command => "rmon_resource_notify --resource-name cinder-volumes --resource-type lvg --resource-state enabled --volume-group cinder-volume",
logoutput => true,
tries => 2,
try_sleep => 1,
returns => [ 0, 1 ],
}
}
}


@@ -491,20 +491,6 @@ class openstack::nova::storage (
exec { 'mount /dev/nova-local/instances_lv':
unless => 'mount | grep -q /etc/nova/instances',
command => 'mount -t ext4 /dev/nova-local/instances_lv /etc/nova/instances',
} ->
exec { "Update nova-local monitoring state to ${local_monitor_state}":
command => "rmon_resource_notify --resource-name nova-local --resource-type lvg --resource-state ${local_monitor_state} --volume-group nova-local",
logoutput => true,
tries => 2,
try_sleep => 1,
returns => [ 0, 1 ],
} ->
exec { 'Enable instance_lv monitoring':
command => "rmon_resource_notify --resource-name /etc/nova/instances --resource-type mount --resource-state enabled --device /dev/mapper/nova--local-instances_lv --mount-point /etc/nova/instances",
logoutput => true,
tries => 2,
try_sleep => 1,
returns => [ 0, 1 ],
}
}


@@ -0,0 +1,60 @@
class platform::collectd::params (
$interval = undef,
$timeout = undef,
$read_threads = undef,
$write_threads = undef,
$write_queue_limit_high = undef,
$write_queue_limit_low = undef,
$server_addrs = [],
$server_port = undef,
$max_read_interval = undef,
# python plugin controls
$module_path = undef,
$plugins = [],
$mtce_notifier_port = undef,
$log_traces = undef,
$encoding = undef,
$collectd_d_dir = undef,
) {}
class platform::collectd
inherits ::platform::collectd::params {
file { "/etc/collectd.conf":
ensure => 'present',
replace => true,
content => template('platform/collectd.conf.erb'),
} -> # now start collectd
# ensure that collectd is running
service { 'collectd':
ensure => running,
enable => true,
provider => 'systemd'
} -> # now get pmond to monitor the process
# ensure pmon soft link for process monitoring
file { "/etc/pmon.d/collectd.conf":
ensure => 'link',
target => "/opt/collectd/extensions/config/collectd.conf.pmon",
owner => 'root',
group => 'root',
mode => '0600',
}
}
class platform::collectd::runtime {
include ::platform::collectd
}
# restart target
class platform::collectd::restart {
include ::platform::collectd
exec { "collectd-restart":
command => '/usr/local/sbin/pmon-restart collectd'
}
}


@@ -0,0 +1,84 @@
class platform::influxdb::params (
$bind_address = undef,
$database = undef,
$typesdb = undef,
$batch_size = undef,
$batch_pending = undef,
$batch_timeout = undef,
$read_buffer = undef,
) {}
class platform::influxdb
inherits ::platform::influxdb::params {
user { 'influxdb': ensure => present, } ->
group { 'influxdb': ensure => present, } ->
# make a pid dir for influxdb username and group
file { "/var/run/influxdb":
ensure => 'directory',
owner => 'influxdb',
group => 'influxdb',
mode => '0755',
} ->
# make a log dir for influxdb username and group
file { "/var/log/influxdb":
ensure => 'directory',
owner => 'influxdb',
group => 'influxdb',
mode => '0755',
} ->
# make a lib dir for influxdb username and group
file { "/var/lib/influxdb":
ensure => 'directory',
owner => 'influxdb',
group => 'influxdb',
mode => '0755',
} -> # now configure influxdb
file { "/etc/influxdb/influxdb.conf":
ensure => 'present',
replace => true,
content => template('platform/influxdb.conf.erb'),
} -> # now make sure that influxdb is started
# ensure that influxdb is running
service { 'influxdb':
ensure => running,
enable => true,
provider => 'systemd'
} -> # now ask pmon to monitor the process
# ensure pmon soft link for process monitoring
file { "/etc/pmon.d/influxdb.conf":
ensure => 'link',
target => "/etc/influxdb/influxdb.conf.pmon",
owner => 'root',
group => 'root',
mode => '0600',
}
}
class platform::influxdb::runtime {
include ::platform::influxdb
}
class platform::influxdb::logrotate::params (
$log_file_name = undef,
$log_file_size = undef,
$log_file_rotate = undef,
) {}
class platform::influxdb::logrotate
inherits ::platform::influxdb::logrotate::params {
file { "/etc/logrotate.d/influxdb":
ensure => 'present',
replace => true,
content => template('platform/logrotate.erb'),
}
}
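
Note that influxdb releases of this vintage generally expect the
target database for the collectd listener to already exist; this is an
assumption, not something the manifest above enforces. A sketch of
creating it through the HTTP API, using the defaults configured above
(database 'collectd', HTTP on port 8086):

import urllib.parse
import urllib.request

q = urllib.parse.urlencode({'q': 'CREATE DATABASE "collectd"'})
urllib.request.urlopen('http://localhost:8086/query?' + q)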


@@ -0,0 +1,116 @@
#
# Config file for collectd(1).
# Please read collectd.conf(5) for a list of options.
# http://collectd.org/
#
##############################################################################
# Global #
#----------------------------------------------------------------------------#
# Global settings for the daemon. #
##############################################################################
FQDNLookup true
BaseDir "/var/lib/collectd"
PIDFile "/var/run/collectd.pid"
PluginDir "/usr/lib64/collectd"
TypesDB "/usr/share/collectd/types.db"
#----------------------------------------------------------------------------#
# When enabled, plugins are loaded automatically with the default options #
# when an appropriate <Plugin ...> block is encountered. #
# Disabled by default. #
#----------------------------------------------------------------------------#
AutoLoadPlugin true
#----------------------------------------------------------------------------#
# When enabled, internal statistics are collected, using "collectd" as the #
# plugin name. #
# Disabled by default. #
#----------------------------------------------------------------------------#
CollectInternalStats true
#----------------------------------------------------------------------------#
# Interval at which to query values. This may be overwritten on a per-plugin #
# basis by using the 'Interval' option of the LoadPlugin block: #
# <LoadPlugin foo> #
# Interval 60 #
# </LoadPlugin> #
#----------------------------------------------------------------------------#
Interval <%= @interval %>
MaxReadInterval <%= @max_read_interval %>
Timeout <%= @timeout %>
ReadThreads <%= @read_threads %>
WriteThreads <%= @write_threads %>
# Limit the size of the write queue. Default is no limit. Setting up a limit is
# recommended for servers handling a high volume of traffic.
<%- if @write_queue_limit_high -%>
WriteQueueLimitHigh <%= @write_queue_limit_high %>
<%- end -%>
<%- if @write_queue_limit_low -%>
WriteQueueLimitLow <%= @write_queue_limit_low %>
<%- end -%>
##############################################################################
# Logging #
#----------------------------------------------------------------------------#
# Plugins which provide logging functions should be loaded first, so log #
# messages generated when loading or configuring other plugins can be #
# accessed. #
##############################################################################
#LoadPlugin syslog
#LoadPlugin logfile
#LoadPlugin log_logstash
#<Plugin logfile>
# LogLevel info
# File "/var/log/collectd.log"
# Timestamp true
# PrintSeverity true
#</Plugin>
#<Plugin log_logstash>
# LogLevel info
# File "/var/log/collectd.json.log"
#</Plugin>
#<Plugin syslog>
# LogLevel info
#</Plugin>
# Have collectd send to these servers on server_port
<Plugin network>
<%- @server_addrs.each do |server_addr| -%>
Server "<%= server_addr %>" "<%= @server_port %>"
<%- end -%>
</Plugin>
LoadPlugin python
<Plugin python>
<%- if @module_path != "" -%>
ModulePath "<%= @module_path %>"
<%- end -%>
<%- @plugins.each do |plugin| -%>
Import "<%= plugin %>"
<%- if plugin == 'mtce_notifier' -%>
<Module "mtce_notifier">
Port <%= @mtce_notifier_port %>
</Module>
<%- end -%>
<%- end -%>
<%- if @log_traces -%>
LogTraces <%= @log_traces %>
<%- end -%>
<%- if @encoding -%>
Encoding "<%= @encoding %>"
<%- end -%>
Interactive false
</Plugin>
# The default plugin directory
<%- if @collectd_d_dir -%>
Include "<%= @collectd_d_dir %>"
<%- end -%>


@@ -0,0 +1,329 @@
### Welcome to the InfluxDB configuration file.
# Once every 24 hours InfluxDB will report anonymous data to m.influxdb.com
# The data includes raft id (random 8 bytes), os, arch, version, and metadata.
# We don't track ip addresses of servers reporting. This is only used
# to track the number of instances running and the versions, which
# is very helpful for us.
# Change this option to true to disable reporting.
reporting-disabled = false
###
### Enterprise registration control
###
[registration]
# enabled = true
# url = "https://enterprise.influxdata.com" # The Enterprise server URL
# token = "" # Registration token for Enterprise server
###
### [meta]
###
### Controls the parameters for the Raft consensus group that stores metadata
### about the InfluxDB cluster.
###
[meta]
dir = "/var/lib/influxdb/meta"
hostname = "localhost"
bind-address = ":8088"
retention-autocreate = true
election-timeout = "1s"
heartbeat-timeout = "1s"
leader-lease-timeout = "500ms"
commit-timeout = "50ms"
cluster-tracing = false
# If enabled, when a Raft cluster loses a peer due to a `DROP SERVER` command,
# the leader will automatically ask a non-raft peer node to promote to a raft
# peer. This only happens if there is a non-raft peer node available to promote.
# This setting only affects the local node, so to ensure it operates correctly, be sure to set
# it in the config of every node.
raft-promotion-enabled = true
###
### [data]
###
### Controls where the actual shard data for InfluxDB lives and how it is
### flushed from the WAL. "dir" may need to be changed to a suitable place
### for your system, but the WAL settings are an advanced configuration. The
### defaults should work for most systems.
###
[data]
dir = "/var/lib/influxdb/data"
# Controls the engine type for new shards. Options are b1, bz1, or tsm1.
# b1 is the 0.9.2 storage engine, bz1 is the 0.9.3 and 0.9.4 engine.
# tsm1 is the 0.9.5 engine and is currently EXPERIMENTAL. Until 0.9.5 is
# actually released, data written into a tsm1 engine may need to be wiped
# between upgrades.
# engine ="bz1"
# The following WAL settings are for the b1 storage engine used in 0.9.2. They won't
# apply to any new shards created after upgrading to a version > 0.9.3.
max-wal-size = 104857600 # Maximum size the WAL can reach before a flush. Defaults to 100MB.
wal-flush-interval = "10m" # Maximum time data can sit in WAL before a flush.
wal-partition-flush-delay = "2s" # The delay time between each WAL partition being flushed.
# These are the WAL settings for the storage engine >= 0.9.3
wal-dir = "/var/lib/influxdb/wal"
wal-enable-logging = true
# When a series in the WAL in-memory cache reaches this size in bytes it is marked as ready to
# flush to the index
# wal-ready-series-size = 25600
# Flush and compact a partition once this ratio of series are over the ready size
# wal-compaction-threshold = 0.6
# Force a flush and compaction if any series in a partition gets above this size in bytes
# wal-max-series-size = 2097152
# Force a flush of all series and full compaction if there have been no writes in this
# amount of time. This is useful for ensuring that shards that are cold for writes don't
# keep a bunch of data cached in memory and in the WAL.
# wal-flush-cold-interval = "10m"
# Force a partition to flush its largest series if it reaches this approximate size in
# bytes. Remember there are 5 partitions so you'll need at least 5x this amount of memory.
# The more memory you have, the bigger this can be.
# wal-partition-size-threshold = 20971520
# Whether queries should be logged before execution. Very useful for troubleshooting, but will
# log any sensitive data contained within a query.
# query-log-enabled = true
###
### [hinted-handoff]
###
### Controls the hinted handoff feature, which allows nodes to temporarily
### store queued data when one node of a cluster is down for a short period
### of time.
###
[hinted-handoff]
enabled = true
dir = "/var/lib/influxdb/hh"
max-size = 1073741824
max-age = "168h"
retry-rate-limit = 0
# Hinted handoff will start retrying writes to down nodes at a rate of once per second.
# If any error occurs, it will backoff in an exponential manner, until the interval
# reaches retry-max-interval. Once writes to all nodes are successfully completed the
# interval will reset to retry-interval.
retry-interval = "1s"
retry-max-interval = "1m"
# Interval between running checks for data that should be purged. Data is purged from
# hinted-handoff queues for two reasons. 1) The data is older than the max age, or
# 2) the target node has been dropped from the cluster. However, data is never
# dropped until it has reached max-age, whether the target node was dropped or not.
purge-interval = "1h"
###
### [cluster]
###
### Controls non-Raft cluster behavior, which generally includes how data is
### shared across shards.
###
[cluster]
shard-writer-timeout = "10s" # The time within which a shard must respond to write.
write-timeout = "5s" # The time within which a write operation must complete on the cluster.
###
### [retention]
###
### Controls the enforcement of retention policies for evicting old data.
###
[retention]
enabled = true
check-interval = "30m"
###
### [shard-precreation]
###
### Controls the precreation of shards, so they are created before data arrives.
### Only shards that will exist in the future, at time of creation, are precreated.
[shard-precreation]
enabled = true
check-interval = "10m"
advance-period = "30m"
###
### Controls the system self-monitoring, statistics and diagnostics.
###
### The internal database for monitoring data is created automatically
### if it does not already exist. The target retention within this database
### is called 'monitor' and is also created with a retention period of 7 days
### and a replication factor of 1, if it does not exist. In all cases
### this retention policy is configured as the default for the database.
[monitor]
store-enabled = true # Whether to record statistics internally.
store-database = "_internal" # The destination database for recorded statistics
store-interval = "10s" # The interval at which to record statistics
###
### [admin]
###
### Controls the availability of the built-in, web-based admin interface. If HTTPS is
### enabled for the admin interface, HTTPS must also be enabled on the [http] service.
###
[admin]
enabled = true
bind-address = ":8083"
https-enabled = false
https-certificate = "/etc/ssl/influxdb.pem"
###
### [http]
###
### Controls how the HTTP endpoints are configured. These are the primary
### mechanism for getting data into and out of InfluxDB.
###
[http]
enabled = true
bind-address = ":8086"
auth-enabled = false
log-enabled = true
write-tracing = false
pprof-enabled = false
https-enabled = false
https-certificate = "/etc/ssl/influxdb.pem"
###
### [[graphite]]
###
### Controls one or many listeners for Graphite data.
###
[[graphite]]
enabled = false
# database = "graphite"
# bind-address = ":2003"
# protocol = "tcp"
# consistency-level = "one"
# name-separator = "."
# These next lines control how batching works. You should have this enabled
# otherwise you could get dropped metrics or poor performance. Batching
# will buffer points in memory if you have many coming in.
# batch-size = 1000 # will flush if this many points get buffered
# batch-pending = 5 # number of batches that may be pending in memory
# batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit
# udp-read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max.
## "name-schema" configures tag names for parsing the metric name from graphite protocol;
## separated by `name-separator`.
## The "measurement" tag is special and the corresponding field will become
## the name of the metric.
## e.g. "type.host.measurement.device" will parse "server.localhost.cpu.cpu0" as
## {
## measurement: "cpu",
## tags: {
## "type": "server",
## "host": "localhost,
## "device": "cpu0"
## }
## }
# name-schema = "type.host.measurement.device"
## If set to true, when the input metric name has more fields than `name-schema` specified,
## the extra fields will be ignored.
## Otherwise an error will be logged and the metric rejected.
# ignore-unnamed = true
###
### [collectd]
###
### Controls the listener for collectd data.
###
[collectd]
enabled = true
bind-address = "<%= @bind_address %>"
database = "<%= @database %>"
typesdb = "<%= @typesdb %>"
# These next lines control how batching works. You should have this enabled
# otherwise you could get dropped metrics or poor performance. Batching
# will buffer points in memory if you have many coming in.
# will flush if this many points get buffered
batch-size = <%= @batch_size %>
# number of batches that may be pending in memory
batch-pending = <%= @batch_pending %>
# will flush at least this often even if we haven't hit buffer limit
batch-timeout = "<%= @batch_timeout %>"
# UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max.
read-buffer = <%= @read_buffer %>
###
### [opentsdb]
###
### Controls the listener for OpenTSDB data.
###
[opentsdb]
enabled = false
# bind-address = ":4242"
# database = "opentsdb"
# retention-policy = ""
# consistency-level = "one"
# tls-enabled = false
# certificate= ""
# These next lines control how batching works. You should have this enabled
# otherwise you could get dropped metrics or poor performance. Only points
# received over the telnet protocol undergo batching.
# batch-size = 1000 # will flush if this many points get buffered
# batch-pending = 5 # number of batches that may be pending in memory
# batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit
###
### [[udp]]
###
### Controls the listeners for InfluxDB line protocol data via UDP.
###
[[udp]]
enabled = false
# bind-address = ""
# database = "udp"
# retention-policy = ""
# These next lines control how batching works. You should have this enabled
# otherwise you could get dropped metrics or poor performance. Batching
# will buffer points in memory if you have many coming in.
# batch-size = 1000 # will flush if this many points get buffered
# batch-pending = 5 # number of batches that may be pending in memory
# batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit
# read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max.
###
### [continuous_queries]
###
### Controls how continuous queries are run within InfluxDB.
###
[continuous_queries]
log-enabled = true
enabled = true
recompute-previous-n = 2
recompute-no-older-than = "10m"
compute-runs-per-interval = 10
compute-no-more-than = "2m"


@@ -0,0 +1,14 @@
<%= @log_file_name %>
{
size <%= @log_file_size %>
rotate <%= @log_file_rotate %>
start 1
missingok
compress
sharedscripts
nodateext
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
}