integ/monitoring/collectd-extensions/src/ntpq.py

196 lines
5.4 KiB
Python
Executable File

# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
import os
import subprocess
import uuid
import collectd
from fm_api import constants as fm_constants
from fm_api import fm_api
import tsconfig.tsconfig as tsc
# Fault management API handle; read_func uses it to set and clear alarms.
api = fm_api.FaultAPIs()

# Name of this plugin; used as the prefix of every log message it emits.
PLUGIN = 'NTP query plugin'

# Script executed on every read to query the NTP servers.
PLUGIN_SCRIPT = '/etc/rmonfiles.d/query_ntp_servers.sh'

# File read_func expects to exist and be non-empty after the script has run.
PLUGIN_RESULT = '/tmp/ntpq_server_info'

# Alarm identifier used for every NTP fault this plugin raises or clears.
ALARM_ID__NTPQ = "100.114"
# State holder for the plugin; a single instance lives for the life of the
# collectd process so values survive from one read call to the next.
class NtpqObject:
    """Persistent NTP plugin state plus the fixed attributes of its alarm."""

    # --- host identity, filled in by init_func ---
    hostname = ''
    base_eid = ''

    # --- query / alarm state tracked across reads ---
    status = 0
    severity = 'clear'
    last_result = ''
    this_result = ''

    # --- fixed alarm attributes handed to fm_api.Fault ---
    id = ALARM_ID__NTPQ
    name = "NTP"
    alarm_type = fm_constants.FM_ALARM_TYPE_1
    cause = fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN
    service_affecting = False
    suppression = True
    repair = ("Monitor and if condition persists, "
              "contact next level of support.")


# The one shared instance used by init_func and read_func.
obj = NtpqObject()
def is_uuid_like(val):
    """Tell whether val is a UUID written in canonical string form.

    The value is parsed with uuid.UUID and rendered back to text; only a
    value equal to that rendering (lower case, hyphenated) is accepted.
    Anything uuid.UUID refuses to parse yields False rather than an error.
    """
    try:
        parsed = uuid.UUID(val)
        canonical = str(parsed)
        return canonical == val
    except (TypeError, ValueError, AttributeError):
        return False
# Config callback - collectd calls this once while the process starts up
def config_func(config):
    """Accept the plugin configuration.

    Nothing is read from ``config``; the function only emits a debug log
    line and returns 0.
    """
    message = '%s config function' % PLUGIN
    collectd.debug(message)
    return 0
# The init function - called once on collectd process startup
def init_func():
    """Record this host's identity in the shared plugin object.

    On controller nodes the hostname is stored in ``obj.hostname`` and the
    base alarm entity id ('host=<hostname>.ntp') in ``obj.base_eid``.
    On any other node type nothing is done.

    Returns:
        0 in every case.
    """
    # ntp query is for controllers only
    if tsc.nodetype != 'controller':
        return 0

    # get current hostname
    obj.hostname = os.uname()[1]
    obj.base_eid = 'host=' + obj.hostname + '.ntp'

    # The format arguments must be passed as one tuple; written bare
    # ("fmt % a, b, c") the % operator binds to the first operand only and
    # raises "not enough arguments for format string".
    collectd.info("%s on %s with entity id '%s'" %
                  (PLUGIN, obj.hostname, obj.base_eid))
    return 0
# The sample read function - called on every audit interval
def read_func():
    """Run the NTP query script, dispatch a sample and manage the NTP alarm.

    Steps, on controller nodes only:
      1. run PLUGIN_SCRIPT and store its exit code in ``obj.status``
      2. log every line of PLUGIN_RESULT (missing or empty file is an error)
      3. dispatch a sample value to collectd
      4. compare the current severity with the previous one and clear or
         raise the alarm through the fault management API

    NOTE(review): the current severity is still hard-coded to 'clear' and
    ``obj.severity`` is reset to 'clear' just before the comparison, so the
    function presently returns right after step 3; the alarm handling below
    that point is scaffolding for the TODO items.

    Returns:
        0 in every case; failures are reported through collectd.error.
    """
    # ntp query is for controllers only
    if tsc.nodetype != 'controller':
        return 0

    # Query ntp
    try:
        result = os.system(PLUGIN_SCRIPT)
    except Exception as e:
        # three placeholders need three arguments; the script path was
        # missing, which made this handler itself raise TypeError
        collectd.error("%s Could not run '%s' (%s)" %
                       (PLUGIN, PLUGIN_SCRIPT, e))
        return 0

    # On Unix os.system() returns a wait status with the exit code in the
    # high byte. Floor division keeps the result an integer on Python 3 as
    # well as Python 2.
    obj.status = int(result) // 0x100

    collectd.info("%s Query Result: %s" % (PLUGIN, obj.status))

    if not os.path.exists(PLUGIN_RESULT):
        collectd.error("%s produced no result file '%s'" %
                       (PLUGIN, PLUGIN_RESULT))
        return 0

    # read the query result file.
    # format is in the PLUGIN_SCRIPT file.
    # This code only wants the second line.
    # It contains list of unreachable ntp servers that need alarm management.
    count = 0
    with open(PLUGIN_RESULT, 'r') as infile:
        for line in infile:
            count += 1
            collectd.info("%s Query Result: %s" % (PLUGIN, line))

    if count == 0:
        collectd.error("%s produced empty result file '%s'" %
                       (PLUGIN, PLUGIN_RESULT))
        return 0

    sample = 1

    # Dispatch usage value to collectd
    val = collectd.Values(host=obj.hostname)
    val.plugin = 'ntpq'
    val.plugin_instance = 'some.ntp.server.ip'
    val.type = 'absolute'
    val.type_instance = 'state'
    val.dispatch(values=[sample])

    # NOTE(review): placeholder values - see the docstring. Resetting
    # obj.severity here discards the previous severity before it is compared.
    severity = 'clear'
    obj.severity = 'clear'

    # if there is no severity change then consider exiting
    if obj.severity == severity:

        # unless the current severity is 'minor'
        if severity == 'minor':
            # TODO: check to see if the failing IP address is changed
            collectd.info("%s NEED TO CHECK IP ADDRESSES" % PLUGIN)
        else:
            return 0

    # if current severity is clear but previous severity is not then
    # prepare to clear the alarms
    if severity == 'clear':
        # TODO: loop over all raised alarms and clear them
        collectd.info("%s NEED CLEAR ALL ALARMS" % PLUGIN)
        # compared against False explicitly so that only an explicit
        # False result is reported as a failure
        if api.clear_fault(obj.id, obj.base_eid) is False:
            collectd.error("%s %s:%s clear_fault failed" %
                           (PLUGIN, obj.id, obj.base_eid))
        return 0

    elif severity == 'major':
        reason = "NTP configuration does not contain any valid "
        reason += "or reachable NTP servers."
        eid = obj.base_eid
        fm_severity = fm_constants.FM_ALARM_SEVERITY_MAJOR

    else:
        # TODO: There can be up to 3 inaccessible servers
        ip = 'some.server.ip.addr'
        reason = "NTP address "
        reason += ip
        reason += " is not a valid or a reachable NTP server."
        eid = obj.base_eid + '=' + ip
        fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR

    fault = fm_api.Fault(
        alarm_id=obj.id,
        alarm_state=fm_constants.FM_ALARM_STATE_SET,
        entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
        entity_instance_id=eid,
        severity=fm_severity,
        reason_text=reason,
        alarm_type=obj.alarm_type,
        probable_cause=obj.cause,
        proposed_repair_action=obj.repair,
        service_affecting=obj.service_affecting,
        suppression=obj.suppression)

    # set_fault is expected to hand back the alarm's UUID; anything else is
    # treated as a failure
    alarm_uuid = api.set_fault(fault)
    if not is_uuid_like(alarm_uuid):
        collectd.error("%s %s:%s set_fault failed:%s" %
                       (PLUGIN, obj.id, eid, alarm_uuid))
        return 0

    # TODO: clear the object alarm state
    return 0
# Hook the plugin into collectd: config_func and init_func are each called
# once on process startup, read_func on every audit interval.
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func)