254 lines
8.3 KiB
Python
Executable File
254 lines
8.3 KiB
Python
Executable File
#
|
|
# Copyright (c) 2018-2019 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
############################################################################
|
|
#
|
|
# This file is the collectd 'Platform CPU Usage' Monitor.
|
|
#
|
|
# The Platform CPU Usage is calculated as an averaged percentage of
|
|
# platform core usable since the previous sample.
|
|
#
|
|
# Init Function:
|
|
# - if 'worker_reserved.conf exists then query/store PLATFORM_CPU_LIST
|
|
#
|
|
############################################################################
|
|
import os
|
|
import time
|
|
import collectd
|
|
|
|
debug = False
|
|
|
|
PASS = 0
|
|
FAIL = 1
|
|
|
|
PATH = '/proc/cpuinfo'
|
|
WORKER_RESERVED_CONF = '/etc/platform/worker_reserved.conf'
|
|
|
|
PLUGIN = 'platform cpu usage plugin'
|
|
|
|
|
|
# CPU Control class
|
|
class CPU:
|
|
hostname = "" # hostname for sample notification message
|
|
usage = float(0.0) # float value of cpu usage
|
|
|
|
processors = int(0) # number of processors for all cpus case
|
|
cpu_list = [] # list of CPUs to calculate combined usage for
|
|
cpu_time = [] # schedstat time for each CPU
|
|
cpu_time_last = [] # last schedstat time for each CPU
|
|
time_last = float(0.0) # float of the time the last sample was taken
|
|
|
|
def log_error(self, err_str):
|
|
""" Print an error log with plugin name prefixing the log """
|
|
|
|
collectd.error("%s %s" % (PLUGIN, err_str))
|
|
|
|
# Instantiate the class
|
|
c = CPU()
|
|
|
|
|
|
# The collectd configuration interface
|
|
# collectd needs this defined ; but not used/needed.
|
|
def config_func(config):
|
|
collectd.info('%s config function' % PLUGIN)
|
|
|
|
|
|
# Get the platform cpu list and number of cpus reported by /proc/cpuinfo
|
|
def init_func():
|
|
# get current hostname
|
|
c.hostname = os.uname()[1]
|
|
|
|
collectd.info('%s init function for %s' % (PLUGIN, c.hostname))
|
|
|
|
raw_list = ""
|
|
if os.path.exists(WORKER_RESERVED_CONF):
|
|
with open(WORKER_RESERVED_CONF, 'r') as infile:
|
|
for line in infile:
|
|
if 'PLATFORM_CPU_LIST' in line:
|
|
val = line.split("=")
|
|
raw_list = val[1].strip('\n')[1:-1].strip('"')
|
|
break
|
|
if raw_list:
|
|
|
|
# Convert the cpu list fetched from the compute
|
|
# reserved file into an integer list.
|
|
# Handle mix of number list #,# and number range #-#
|
|
split_list = raw_list.split(',')
|
|
if debug:
|
|
collectd.info('%s split list: %s' % (PLUGIN, split_list))
|
|
for cpu in split_list:
|
|
if cpu.find('-') == -1:
|
|
# add individual cpu # with assumed ',' delimiter
|
|
c.cpu_list.append(int(cpu))
|
|
else:
|
|
# add all in range #-#
|
|
cpu_range = cpu.split('-')
|
|
if len(cpu_range) == 2:
|
|
first = int(cpu_range[0])
|
|
last = int(cpu_range[1]) + 1
|
|
# add each
|
|
for i in list(range(first, last)):
|
|
c.cpu_list.append(i)
|
|
|
|
# with the full CPU list in hand we can now just read their samples
|
|
if debug:
|
|
collectd.info('%s full cpu list: %s' %
|
|
(PLUGIN, c.cpu_list))
|
|
|
|
try:
|
|
f = open('/proc/cpuinfo')
|
|
except EnvironmentError as e:
|
|
collectd.error(str(e), UserWarning)
|
|
else:
|
|
|
|
if len(c.cpu_list) == 0:
|
|
_want_all_cpus = True
|
|
else:
|
|
_want_all_cpus = False
|
|
|
|
c.processors = 0
|
|
for line in f:
|
|
name_value = [s.strip() for s in line.split(':', 1)]
|
|
if len(name_value) != 2:
|
|
continue
|
|
|
|
name, value = name_value
|
|
if 'rocessor' in name:
|
|
if _want_all_cpus is True:
|
|
c.cpu_list.append(int(c.processors))
|
|
c.processors += 1
|
|
|
|
collectd.info('%s has found %d cpus total' %
|
|
(PLUGIN, c.processors))
|
|
collectd.info('%s monitoring %d cpus %s' %
|
|
(PLUGIN, len(c.cpu_list), c.cpu_list))
|
|
f.close()
|
|
|
|
|
|
# Calculate the CPU usage sample
|
|
def read_func():
|
|
try:
|
|
f = open('/proc/schedstat')
|
|
except EnvironmentError as e:
|
|
c.log_error('file open failed ; ' + str(e))
|
|
return FAIL
|
|
else:
|
|
# schedstat time for each CPU
|
|
c.cpu_time = []
|
|
|
|
# Loop over each line ...
|
|
# get the output version ; only 15 is supported
|
|
# get the cpu time from each line staring with 'cpux ....'
|
|
for line in f:
|
|
|
|
# break each line into name/value pairs
|
|
line_split = [s.strip() for s in line.split(' ', 1)]
|
|
name, value = line_split
|
|
|
|
# get the output version.
|
|
if 'ersion' in name:
|
|
try:
|
|
c.version = int(value)
|
|
except ValueError as e:
|
|
c.log_error('got invalid schedstat version ; ' + str(e))
|
|
|
|
# TODO: Consider exiting here and raising alarm.
|
|
# Calling this type of exit will stop the plugin.
|
|
# sys._exit()
|
|
return FAIL
|
|
|
|
# only version 15 is supported
|
|
if c.version == 15:
|
|
if 'cpu' in name:
|
|
# get the cpu number for each line
|
|
if int(name.replace('cpu', '')) in c.cpu_list:
|
|
_in_list = True
|
|
else:
|
|
_in_list = False
|
|
|
|
# get cpu time for each cpu that is valid
|
|
if len(c.cpu_list) == 0 or _in_list is True:
|
|
_schedstat = value
|
|
value_split = value.split(' ')
|
|
c.cpu_time.append(float(value_split[6]))
|
|
if debug:
|
|
collectd.info('%s %s schedstat is %s [%s]' %
|
|
(PLUGIN, name, value_split[6],
|
|
_schedstat))
|
|
else:
|
|
collectd.error('%s unsupported schedstat version [%d]' %
|
|
(PLUGIN, c.version))
|
|
return 0
|
|
|
|
f.close()
|
|
|
|
# Now that we have the cpu time recorded for each cpu
|
|
_time_delta = float(0)
|
|
_cpu_count = int(0)
|
|
if len(c.cpu_time_last) == 0:
|
|
c.time_last = time.time()
|
|
if c.cpu_list:
|
|
# This is a compute node.
|
|
# Do not include vswitch or pinned cpus in calculation.
|
|
for cpu in c.cpu_list:
|
|
c.cpu_time_last.append(float(c.cpu_time[_cpu_count]))
|
|
_cpu_count += 1
|
|
if debug:
|
|
collectd.info('%s cpu time ; first pass ; %s' %
|
|
(PLUGIN, c.cpu_time))
|
|
return PASS
|
|
else:
|
|
_time_this = time.time()
|
|
_time_delta = _time_this - c.time_last
|
|
c.total_avg_cpu = 0
|
|
cpu_occupancy = []
|
|
if debug:
|
|
collectd.info('%s cpu time ; this pass ; %s -> %s' %
|
|
(PLUGIN, c.cpu_time_last, c.cpu_time))
|
|
|
|
if c.cpu_list:
|
|
# This is a compute node.
|
|
# Do not include vswitch or pinned cpus in calculation.
|
|
for cpu in c.cpu_list:
|
|
if cpu >= c.processors:
|
|
c.log_error(' got out of range cpu number')
|
|
else:
|
|
_delta = (c.cpu_time[_cpu_count] - c.cpu_time_last[_cpu_count])
|
|
_delta = _delta / 1000000 / _time_delta
|
|
cpu_occupancy.append(float((100*(_delta))/1000))
|
|
c.total_avg_cpu += cpu_occupancy[_cpu_count]
|
|
if debug:
|
|
collectd.info('%s cpu %d - count:%d [%s]' %
|
|
(PLUGIN, cpu, _cpu_count, cpu_occupancy))
|
|
_cpu_count += 1
|
|
|
|
else:
|
|
collectd.info('%s no cpus to monitor' % PLUGIN)
|
|
return 0
|
|
|
|
c.usage = c.total_avg_cpu / _cpu_count
|
|
if debug:
|
|
collectd.info('%s reports %.2f %% usage (averaged)' %
|
|
(PLUGIN, c.usage))
|
|
|
|
# Prepare for next audit ; mode now to last
|
|
# c.cpu_time_last = []
|
|
c.cpu_time_last = c.cpu_time
|
|
c.time_last = _time_this
|
|
|
|
# Dispatch usage value to collectd
|
|
val = collectd.Values(host=c.hostname)
|
|
val.plugin = 'cpu'
|
|
val.type = 'percent'
|
|
val.type_instance = 'used'
|
|
val.dispatch(values=[c.usage])
|
|
|
|
return 0
|
|
|
|
|
|
collectd.register_config(config_func)
|
|
collectd.register_init(init_func)
|
|
collectd.register_read(read_func)
|