Make collectd plugins use FM API V2
Using FM API V2 allows the collectd plugins to distinguish between FM connection failures and alarm queries that find no existing alarms on process startup, as well as to detect failures to clear or assert alarms during runtime, so that such actions can be retried on the next audit interval. This allows the plugins to be more robust in their alarm management and avoids leaving stuck alarms, which fixes the following three reported stuck alarm bugs. Closes-Bug: https://bugs.launchpad.net/starlingx/+bug/1802535 Closes-Bug: https://bugs.launchpad.net/starlingx/+bug/1813974 Closes-Bug: https://bugs.launchpad.net/starlingx/+bug/1814944 Additional improvements were made to each plugin to handle failure paths better with the V2 API. Additional changes made by this update include: 1. fixed stale unmounted filesystems alarm handling 2. percent usage alarm actual readings are updated on change 3. fix of threshold values 4. add 2 decimal point resolution to % usage alarm text 5. added commented FIT code to mem, cpu and df plugins 6. reversed True/False return polarity in interface plugin functions Test Plan: Regression: PASS: normal alarm handling with FM V2 API ; process startup PASS: normal alarm handling with FM V2 API ; runtime alarm assert PASS: normal alarm handling with FM V2 API ; runtime alarm clear PASS: Verify alarms of unmounted fs get automatically cleared PASS: Verify interface alarm/clear operation Robustness: PASS: Verify general startup behavior of all plugins while FM is not running only to see it start at some later time. PASS: Verify alarm handling over process startup with existing cpu alarms while FM not running. PASS: Verify alarm handling over process startup with existing mem alarms while FM not running. PASS: Verify alarm handling over process startup with existing df alarms while FM not running. 
PASS: Verify runtime cpu plugin alarm assertion retry handling PASS: Verify runtime cpu plugin alarm clear retry handling PASS: Verify runtime cpu plugin handling over process restart PASS: Verify alarm handling over process startup with existing cpu alarms while FM initially not running and then started. PASS: Verify runtime mem plugin alarm assertion retry handling PASS: Verify runtime mem plugin alarm clear retry handling PASS: Verify runtime mem plugin handling over process restart PASS: Verify alarm handling over process startup with existing mem alarms while FM initially not running and then started. PASS: Verify runtime df plugin alarm assertion retry handling PASS: Verify runtime df plugin alarm clear retry handling PASS: Verify runtime df plugin handling over process restart PASS: Verify alarm handling over process startup with existing df alarms while FM initially not running and then started. PASS: Verify alarm set/clear threshold boundaries for cpu plugin PASS: Verify alarm set/clear threshold boundaries for memory plugin PASS: Verify alarm set/clear threshold boundaries for df plugin New Features: ... threshold exceeded ; threshold 80.00%, actual 80.33% PASS: Verify percent usage alarms are refreshed with current value PASS: Verify percent usage alarms show two decimal points Change-Id: Ibe173617d11c17bdc4b41115e25bd8c18b49807e Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
c0f3e873be
commit
8841bceb80
|
@ -22,4 +22,4 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
|
||||||
$PKG_BASE/src/ptp.conf \
|
$PKG_BASE/src/ptp.conf \
|
||||||
$PKG_BASE/src/example.py \
|
$PKG_BASE/src/example.py \
|
||||||
$PKG_BASE/src/example.conf"
|
$PKG_BASE/src/example.conf"
|
||||||
TIS_PATCH_VER=9
|
TIS_PATCH_VER=10
|
||||||
|
|
|
@ -13,8 +13,8 @@
|
||||||
Instance "used"
|
Instance "used"
|
||||||
Persist true
|
Persist true
|
||||||
PersistOK true
|
PersistOK true
|
||||||
WarningMax 89.00
|
WarningMax 90.00
|
||||||
FailureMax 94.00
|
FailureMax 95.00
|
||||||
Hits 2
|
Hits 2
|
||||||
Invert false
|
Invert false
|
||||||
</Type>
|
</Type>
|
||||||
|
|
|
@ -239,6 +239,14 @@ def read_func():
|
||||||
c.cpu_time_last = c.cpu_time
|
c.cpu_time_last = c.cpu_time
|
||||||
c.time_last = _time_this
|
c.time_last = _time_this
|
||||||
|
|
||||||
|
# if os.path.exists('/var/run/fit/cpu_data'):
|
||||||
|
# with open('/var/run/fit/cpu_data', 'r') as infile:
|
||||||
|
# for line in infile:
|
||||||
|
# c.usage = float(line)
|
||||||
|
# collectd.info("%s using FIT data:%.2f" %
|
||||||
|
# (PLUGIN, c.usage))
|
||||||
|
# break
|
||||||
|
|
||||||
# Dispatch usage value to collectd
|
# Dispatch usage value to collectd
|
||||||
val = collectd.Values(host=c.hostname)
|
val = collectd.Values(host=c.hostname)
|
||||||
val.plugin = 'cpu'
|
val.plugin = 'cpu'
|
||||||
|
|
|
@ -28,8 +28,8 @@
|
||||||
<Plugin "df">
|
<Plugin "df">
|
||||||
<Type "percent_bytes">
|
<Type "percent_bytes">
|
||||||
Instance "used"
|
Instance "used"
|
||||||
WarningMax 79.00
|
WarningMax 80.00
|
||||||
FailureMax 89.00
|
FailureMax 90.00
|
||||||
Persist true
|
Persist true
|
||||||
PersistOK true
|
PersistOK true
|
||||||
Hits 2
|
Hits 2
|
||||||
|
|
|
@ -96,7 +96,7 @@ import plugin_common as pc
|
||||||
if tsc.nodetype == 'controller':
|
if tsc.nodetype == 'controller':
|
||||||
from influxdb import InfluxDBClient
|
from influxdb import InfluxDBClient
|
||||||
|
|
||||||
api = fm_api.FaultAPIs()
|
api = fm_api.FaultAPIsV2()
|
||||||
|
|
||||||
# Debug control
|
# Debug control
|
||||||
debug = False
|
debug = False
|
||||||
|
@ -136,6 +136,7 @@ NOTIF_OKAY = 4
|
||||||
PASS = 0
|
PASS = 0
|
||||||
FAIL = 1
|
FAIL = 1
|
||||||
|
|
||||||
|
|
||||||
# Some plugin_instances are mangled by collectd.
|
# Some plugin_instances are mangled by collectd.
|
||||||
# The filesystem plugin is especially bad for this.
|
# The filesystem plugin is especially bad for this.
|
||||||
# For instance the "/var/log" MountPoint instance is
|
# For instance the "/var/log" MountPoint instance is
|
||||||
|
@ -216,6 +217,10 @@ class PluginObject:
|
||||||
database_setup = False # state of database setup
|
database_setup = False # state of database setup
|
||||||
database_setup_in_progress = False # connection mutex
|
database_setup_in_progress = False # connection mutex
|
||||||
|
|
||||||
|
# Set to True once FM connectivity is verified
|
||||||
|
# Used to ensure alarms are queried on startup
|
||||||
|
fm_connectivity = False
|
||||||
|
|
||||||
def __init__(self, id, plugin):
|
def __init__(self, id, plugin):
|
||||||
"""PluginObject Class constructor"""
|
"""PluginObject Class constructor"""
|
||||||
|
|
||||||
|
@ -234,6 +239,10 @@ class PluginObject:
|
||||||
self.values = []
|
self.values = []
|
||||||
self.value = float(0) # float value of reading
|
self.value = float(0) # float value of reading
|
||||||
|
|
||||||
|
# This member is used to help log change values using the
|
||||||
|
# LOG_STEP threshold constant
|
||||||
|
self.last_value = float(0)
|
||||||
|
|
||||||
# float value of threshold
|
# float value of threshold
|
||||||
self.threshold = float(INVALID_THRESHOLD)
|
self.threshold = float(INVALID_THRESHOLD)
|
||||||
|
|
||||||
|
@ -264,10 +273,6 @@ class PluginObject:
|
||||||
self.audit_threshold = 0
|
self.audit_threshold = 0
|
||||||
self.audit_count = 0
|
self.audit_count = 0
|
||||||
|
|
||||||
# This member is used to help log change values using the
|
|
||||||
# LOG_STEP threshold consant
|
|
||||||
self.last_value = ""
|
|
||||||
|
|
||||||
# For plugins that have multiple instances like df (filesystem plugin)
|
# For plugins that have multiple instances like df (filesystem plugin)
|
||||||
# we need to create an instance of this object for each one.
|
# we need to create an instance of this object for each one.
|
||||||
# This dictionary is used to associate an instance with its object.
|
# This dictionary is used to associate an instance with its object.
|
||||||
|
@ -378,20 +383,10 @@ class PluginObject:
|
||||||
if len(self.values):
|
if len(self.values):
|
||||||
# validate the reading
|
# validate the reading
|
||||||
try:
|
try:
|
||||||
self.value = float(self.values[0])
|
self.value = round(float(self.values[0]), 2)
|
||||||
# get the threshold if its there.
|
# get the threshold if its there.
|
||||||
if len(self.values) > 1:
|
if len(self.values) > 1:
|
||||||
self.threshold = float(self.values[1])
|
self.threshold = float(self.values[1])
|
||||||
if nObject.plugin == PLUGIN__MEM:
|
|
||||||
if self.reading_type == READING_TYPE__PERCENT_USAGE:
|
|
||||||
# Note: add one to % usage reading types so that it
|
|
||||||
# matches how rmond did it. In collectd an
|
|
||||||
# overage is over the specified threshold
|
|
||||||
# whereas in rmon an overage is at threshold
|
|
||||||
# or above.
|
|
||||||
self.threshold = float(self.values[1]) + 1
|
|
||||||
else:
|
|
||||||
self.threshold = float(self.values[1])
|
|
||||||
else:
|
else:
|
||||||
self.threshold = float(INVALID_THRESHOLD) # invalid value
|
self.threshold = float(INVALID_THRESHOLD) # invalid value
|
||||||
|
|
||||||
|
@ -471,22 +466,21 @@ class PluginObject:
|
||||||
self.reading_type,
|
self.reading_type,
|
||||||
resource))
|
resource))
|
||||||
|
|
||||||
self.last_value = float(self.value)
|
|
||||||
|
|
||||||
##########################################################################
|
##########################################################################
|
||||||
#
|
#
|
||||||
# Name : _severity_change
|
# Name : _update_alarm
|
||||||
#
|
#
|
||||||
# Purpose : Compare current severity to instance severity lists to
|
# Purpose : Compare current severity to instance severity lists to
|
||||||
# facilitate early 'do nothing' exit from a notification.
|
# facilitate early 'do nothing' exit from a notification.
|
||||||
#
|
#
|
||||||
# Returns : True if the severity changed
|
# Description: Avoid clearing an already cleared alarm.
|
||||||
# False if severity is the same
|
# Refresh asserted alarm data for usage reading type alarms
|
||||||
|
#
|
||||||
|
# Returns : True if the alarm needs refresh, otherwise false.
|
||||||
#
|
#
|
||||||
##########################################################################
|
##########################################################################
|
||||||
|
def _update_alarm(self, entity_id, severity, this_value, last_value):
|
||||||
def _severity_change(self, entity_id, severity):
|
"""Check for need to update alarm data"""
|
||||||
"""Check for a severity change"""
|
|
||||||
|
|
||||||
if entity_id in self.warnings:
|
if entity_id in self.warnings:
|
||||||
self._llog(entity_id + " is already in warnings list")
|
self._llog(entity_id + " is already in warnings list")
|
||||||
|
@ -501,9 +495,13 @@ class PluginObject:
|
||||||
# Compare current state to previous state.
|
# Compare current state to previous state.
|
||||||
# If they are the same then return done.
|
# If they are the same then return done.
|
||||||
if severity == current_severity_str:
|
if severity == current_severity_str:
|
||||||
return False
|
if severity == "okay":
|
||||||
else:
|
return False
|
||||||
return True
|
if self.reading_type != READING_TYPE__PERCENT_USAGE:
|
||||||
|
return False
|
||||||
|
elif round(last_value, 2) == round(this_value, 2):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
########################################################################
|
########################################################################
|
||||||
#
|
#
|
||||||
|
@ -670,19 +668,14 @@ class PluginObject:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin))
|
collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin))
|
||||||
PluginObject.lock.acquire()
|
with PluginObject.lock:
|
||||||
|
obj = self.instance_objects[eid]
|
||||||
obj = self.instance_objects[eid]
|
return obj
|
||||||
return obj
|
|
||||||
except:
|
except:
|
||||||
collectd.error("%s failed to get instance from %s object list" %
|
collectd.error("%s failed to get instance from %s object list" %
|
||||||
(PLUGIN, self.plugin))
|
(PLUGIN, self.plugin))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
finally:
|
|
||||||
collectd.debug("%s %s Get UnLock ..." % (PLUGIN, self.plugin))
|
|
||||||
PluginObject.lock.release()
|
|
||||||
|
|
||||||
##########################################################################
|
##########################################################################
|
||||||
#
|
#
|
||||||
# Name : _add_instance_object
|
# Name : _add_instance_object
|
||||||
|
@ -701,17 +694,12 @@ class PluginObject:
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
collectd.debug("%s %s Add Lock ..." % (PLUGIN, self.plugin))
|
collectd.debug("%s %s Add Lock ..." % (PLUGIN, self.plugin))
|
||||||
PluginObject.lock.acquire()
|
with PluginObject.lock:
|
||||||
|
self.instance_objects[eid] = obj
|
||||||
self.instance_objects[eid] = obj
|
|
||||||
except:
|
except:
|
||||||
collectd.error("%s failed to add instance to %s object list" %
|
collectd.error("%s failed to add instance to %s object list" %
|
||||||
(PLUGIN, self.plugin))
|
(PLUGIN, self.plugin))
|
||||||
|
|
||||||
finally:
|
|
||||||
collectd.debug("%s %s Add UnLock ..." % (PLUGIN, self.plugin))
|
|
||||||
PluginObject.lock.release()
|
|
||||||
|
|
||||||
##########################################################################
|
##########################################################################
|
||||||
#
|
#
|
||||||
# Name : _copy_instance_object
|
# Name : _copy_instance_object
|
||||||
|
@ -861,6 +849,36 @@ PLUGINS = {
|
||||||
PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)}
|
PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)}
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
#
|
||||||
|
# Name : clear_alarm
|
||||||
|
#
|
||||||
|
# Description: Clear the specified alarm with the specified entity ID.
|
||||||
|
#
|
||||||
|
# Returns : True if operation succeeded
|
||||||
|
# False if there was an error exception.
|
||||||
|
#
|
||||||
|
# Assumptions: Caller can decide to retry based on return status.
|
||||||
|
#
|
||||||
|
#####################################################################
|
||||||
|
def clear_alarm(alarm_id, eid):
|
||||||
|
"""Clear the specified alarm:eid"""
|
||||||
|
|
||||||
|
try:
|
||||||
|
if api.clear_fault(alarm_id, eid) is True:
|
||||||
|
collectd.info("%s %s:%s alarm cleared" %
|
||||||
|
(PLUGIN, alarm_id, eid))
|
||||||
|
else:
|
||||||
|
collectd.info("%s %s:%s alarm already cleared" %
|
||||||
|
(PLUGIN, alarm_id, eid))
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
|
||||||
|
(PLUGIN, alarm_id, eid, ex))
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _get_base_object(alarm_id):
|
def _get_base_object(alarm_id):
|
||||||
"""Get the alarm object for the specified alarm id"""
|
"""Get the alarm object for the specified alarm id"""
|
||||||
for plugin in PLUGIN_NAME_LIST:
|
for plugin in PLUGIN_NAME_LIST:
|
||||||
|
@ -1027,15 +1045,16 @@ def _print_state(obj=None):
|
||||||
objs.append(obj)
|
objs.append(obj)
|
||||||
|
|
||||||
collectd.debug("%s _print_state Lock ..." % PLUGIN)
|
collectd.debug("%s _print_state Lock ..." % PLUGIN)
|
||||||
PluginObject.lock.acquire()
|
with PluginObject.lock:
|
||||||
for o in objs:
|
for o in objs:
|
||||||
_print_obj(o)
|
_print_obj(o)
|
||||||
if len(o.instance_objects):
|
if len(o.instance_objects):
|
||||||
for inst_obj in o.instance_objects:
|
for inst_obj in o.instance_objects:
|
||||||
_print_obj(o.instance_objects[inst_obj])
|
_print_obj(o.instance_objects[inst_obj])
|
||||||
finally:
|
|
||||||
collectd.debug("%s _print_state UnLock ..." % PLUGIN)
|
except Exception as ex:
|
||||||
PluginObject.lock.release()
|
collectd.error("%s _print_state exception ; %s" %
|
||||||
|
(PLUGIN, ex))
|
||||||
|
|
||||||
|
|
||||||
def _database_setup(database):
|
def _database_setup(database):
|
||||||
|
@ -1137,10 +1156,7 @@ def _clear_alarm_for_missing_filesystems():
|
||||||
# For all others replace all '-' with '/'
|
# For all others replace all '-' with '/'
|
||||||
path = '/' + obj.plugin_instance.replace('-', '/')
|
path = '/' + obj.plugin_instance.replace('-', '/')
|
||||||
if os.path.ismount(path) is False:
|
if os.path.ismount(path) is False:
|
||||||
if api.clear_fault(df_base_obj.id, obj.entity_id) is False:
|
if clear_alarm(df_base_obj.id, obj.entity_id) is True:
|
||||||
collectd.error("%s %s:%s clear failed ; will retry" %
|
|
||||||
(PLUGIN, df_base_obj.id, obj.entity_id))
|
|
||||||
else:
|
|
||||||
collectd.info("%s cleared alarm for missing %s" %
|
collectd.info("%s cleared alarm for missing %s" %
|
||||||
(PLUGIN, path))
|
(PLUGIN, path))
|
||||||
df_base_obj._manage_alarm(obj.entity_id, "okay")
|
df_base_obj._manage_alarm(obj.entity_id, "okay")
|
||||||
|
@ -1259,77 +1275,92 @@ def init_func():
|
||||||
obj.repair = "Not Applicable"
|
obj.repair = "Not Applicable"
|
||||||
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
|
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
|
||||||
|
|
||||||
|
# ...
|
||||||
|
# ADD_NEW_PLUGIN: Add new plugin object initialization here ...
|
||||||
|
# ...
|
||||||
|
|
||||||
if tsc.nodetype == 'controller':
|
if tsc.nodetype == 'controller':
|
||||||
PluginObject.database_setup_in_progress = True
|
PluginObject.database_setup_in_progress = True
|
||||||
_database_setup('collectd')
|
_database_setup('collectd')
|
||||||
PluginObject.database_setup_in_progress = False
|
PluginObject.database_setup_in_progress = False
|
||||||
|
|
||||||
# ...
|
|
||||||
# ADD_NEW_PLUGIN: Add new plugin object initialization here ...
|
|
||||||
# ...
|
|
||||||
|
|
||||||
######################################################################
|
|
||||||
#
|
|
||||||
# With plugin objects initialized ...
|
|
||||||
# Query FM for any resource alarms that may already be raised
|
|
||||||
# Load the queries severity state into the appropriate
|
|
||||||
# severity list for those that are.
|
|
||||||
for alarm_id in ALARM_ID_LIST:
|
|
||||||
collectd.debug("%s searching for all '%s' alarms " %
|
|
||||||
(PLUGIN, alarm_id))
|
|
||||||
alarms = api.get_faults_by_id(alarm_id)
|
|
||||||
if alarms:
|
|
||||||
for alarm in alarms:
|
|
||||||
want_alarm_clear = False
|
|
||||||
eid = alarm.entity_instance_id
|
|
||||||
# ignore alarms not for this host
|
|
||||||
if PluginObject.host not in eid:
|
|
||||||
continue
|
|
||||||
|
|
||||||
base_obj = _get_base_object(alarm_id)
|
|
||||||
if base_obj is None:
|
|
||||||
|
|
||||||
# might be a plugin instance - clear it
|
|
||||||
want_alarm_clear = True
|
|
||||||
|
|
||||||
collectd.info('%s found %s %s alarm [%s]' %
|
|
||||||
(PLUGIN,
|
|
||||||
alarm.severity,
|
|
||||||
alarm_id,
|
|
||||||
eid))
|
|
||||||
|
|
||||||
if want_alarm_clear is True:
|
|
||||||
|
|
||||||
if api.clear_fault(alarm_id, eid) is False:
|
|
||||||
collectd.error("%s %s:%s clear failed" %
|
|
||||||
(PLUGIN,
|
|
||||||
alarm_id,
|
|
||||||
eid))
|
|
||||||
else:
|
|
||||||
collectd.info("%s clear %s %s alarm %s" %
|
|
||||||
(PLUGIN,
|
|
||||||
alarm.severity,
|
|
||||||
alarm_id,
|
|
||||||
eid))
|
|
||||||
continue
|
|
||||||
|
|
||||||
if alarm.severity == "critical":
|
|
||||||
sev = "failure"
|
|
||||||
elif alarm.severity == "major":
|
|
||||||
sev = "warning"
|
|
||||||
else:
|
|
||||||
sev = "okay"
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Load the alarm severity by doing a plugin/instance lookup.
|
|
||||||
if base_obj is not None:
|
|
||||||
base_obj._manage_alarm(eid, sev)
|
|
||||||
|
|
||||||
|
|
||||||
# The notifier function inspects the collectd notification and determines if
|
# The notifier function inspects the collectd notification and determines if
|
||||||
# the representative alarm needs to be asserted, severity changed, or cleared.
|
# the representative alarm needs to be asserted, severity changed, or cleared.
|
||||||
def notifier_func(nObject):
|
def notifier_func(nObject):
|
||||||
|
|
||||||
|
if PluginObject.fm_connectivity is False:
|
||||||
|
|
||||||
|
# handle multi threading startup
|
||||||
|
with PluginObject.lock:
|
||||||
|
if PluginObject.fm_connectivity is True:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
#
|
||||||
|
# With plugin objects initialized ...
|
||||||
|
# Query FM for any resource alarms that may already be raised
|
||||||
|
# Load the queried severity state into the appropriate
|
||||||
|
# severity list for those that are.
|
||||||
|
for alarm_id in ALARM_ID_LIST:
|
||||||
|
collectd.debug("%s searching for all '%s' alarms " %
|
||||||
|
(PLUGIN, alarm_id))
|
||||||
|
try:
|
||||||
|
alarms = api.get_faults_by_id(alarm_id)
|
||||||
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'get_faults_by_id' exception ; %s" %
|
||||||
|
(PLUGIN, ex))
|
||||||
|
return 0
|
||||||
|
|
||||||
|
if alarms:
|
||||||
|
for alarm in alarms:
|
||||||
|
want_alarm_clear = False
|
||||||
|
eid = alarm.entity_instance_id
|
||||||
|
# ignore alarms not for this host
|
||||||
|
if PluginObject.host not in eid:
|
||||||
|
continue
|
||||||
|
|
||||||
|
base_obj = _get_base_object(alarm_id)
|
||||||
|
if base_obj is None:
|
||||||
|
# might be a plugin instance - clear it
|
||||||
|
want_alarm_clear = True
|
||||||
|
|
||||||
|
collectd.info('%s found %s %s alarm [%s]' %
|
||||||
|
(PLUGIN,
|
||||||
|
alarm.severity,
|
||||||
|
alarm_id,
|
||||||
|
eid))
|
||||||
|
|
||||||
|
if want_alarm_clear is True:
|
||||||
|
|
||||||
|
if clear_alarm(alarm_id, eid) is False:
|
||||||
|
collectd.error("%s %s:%s clear failed" %
|
||||||
|
(PLUGIN,
|
||||||
|
alarm_id,
|
||||||
|
eid))
|
||||||
|
else:
|
||||||
|
collectd.info("%s clear %s %s alarm %s" %
|
||||||
|
(PLUGIN,
|
||||||
|
alarm.severity,
|
||||||
|
alarm_id,
|
||||||
|
eid))
|
||||||
|
continue
|
||||||
|
|
||||||
|
if alarm.severity == "critical":
|
||||||
|
sev = "failure"
|
||||||
|
elif alarm.severity == "major":
|
||||||
|
sev = "warning"
|
||||||
|
else:
|
||||||
|
sev = "okay"
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Load the alarm severity by plugin/instance lookup.
|
||||||
|
if base_obj is not None:
|
||||||
|
base_obj._manage_alarm(eid, sev)
|
||||||
|
|
||||||
|
PluginObject.fm_connectivity = True
|
||||||
|
collectd.info("%s initialization complete" % PLUGIN)
|
||||||
|
|
||||||
collectd.debug('%s notification: %s %s:%s - %s %s %s [%s]' % (
|
collectd.debug('%s notification: %s %s:%s - %s %s %s [%s]' % (
|
||||||
PLUGIN,
|
PLUGIN,
|
||||||
nObject.host,
|
nObject.host,
|
||||||
|
@ -1393,31 +1424,20 @@ def notifier_func(nObject):
|
||||||
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
|
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
|
||||||
try:
|
try:
|
||||||
# Need lock when reading/writing any obj.instance_objects list
|
# Need lock when reading/writing any obj.instance_objects list
|
||||||
collectd.debug("%s %s lock" % (PLUGIN, nObject.plugin))
|
with PluginObject.lock:
|
||||||
PluginObject.lock.acquire()
|
|
||||||
|
|
||||||
# collectd.info("%s Object Search eid: %s" %
|
# we will take an exception if this object is not
|
||||||
# (nObject.plugin, eid))
|
# in the list. The exception handling code below will
|
||||||
|
# create and add this object for success path the next
|
||||||
|
# time around.
|
||||||
|
inst_obj = base_obj.instance_objects[eid]
|
||||||
|
|
||||||
# for o in base_obj.instance_objects:
|
collectd.debug("%s %s instance %s already exists %s" %
|
||||||
# collectd.error("%s %s inst object dict item %s : %s" %
|
(PLUGIN, nObject.plugin, eid, inst_obj))
|
||||||
# (PLUGIN, nObject.plugin, o,
|
# _print_state(inst_obj)
|
||||||
# base_obj.instance_objects[o]))
|
|
||||||
|
|
||||||
# we will take an exception if this object is not in the list.
|
|
||||||
# the exception handling code below will create and add this
|
|
||||||
# object for success path the next time around.
|
|
||||||
inst_obj = base_obj.instance_objects[eid]
|
|
||||||
|
|
||||||
collectd.debug("%s %s instance %s already exists %s" %
|
|
||||||
(PLUGIN, nObject.plugin, eid, inst_obj))
|
|
||||||
# _print_state(inst_obj)
|
|
||||||
|
|
||||||
except:
|
except:
|
||||||
need_instance_object_create = True
|
need_instance_object_create = True
|
||||||
finally:
|
|
||||||
collectd.debug("%s %s unlock" % (PLUGIN, nObject.plugin))
|
|
||||||
PluginObject.lock.release()
|
|
||||||
|
|
||||||
if need_instance_object_create is True:
|
if need_instance_object_create is True:
|
||||||
base_obj._create_instance_object(nObject.plugin_instance)
|
base_obj._create_instance_object(nObject.plugin_instance)
|
||||||
|
@ -1474,30 +1494,33 @@ def notifier_func(nObject):
|
||||||
# audit file system presence every time we get the
|
# audit file system presence every time we get the
|
||||||
# notification for the root file system ; which will
|
# notification for the root file system ; which will
|
||||||
# always be there.
|
# always be there.
|
||||||
if obj.instance == 'df_root':
|
if obj.instance == '/':
|
||||||
_clear_alarm_for_missing_filesystems()
|
_clear_alarm_for_missing_filesystems()
|
||||||
|
|
||||||
# exit early if there is no severity change
|
# exit early if there is no alarm update to be made
|
||||||
if base_obj._severity_change(obj.entity_id, severity_str) is False:
|
if base_obj._update_alarm(obj.entity_id,
|
||||||
|
severity_str,
|
||||||
|
obj.value,
|
||||||
|
obj.last_value) is False:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
obj.last_value = round(obj.value, 2)
|
||||||
|
|
||||||
if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
|
if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
|
||||||
if api.clear_fault(obj.id, obj.entity_id) is False:
|
if clear_alarm(obj.id, obj.entity_id) is False:
|
||||||
collectd.error("%s %s:%s clear_fault failed" %
|
|
||||||
(PLUGIN, base_obj.id, obj.entity_id))
|
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
|
|
||||||
# manage addition of the failure reason text
|
# manage addition of the failure reason text
|
||||||
if obj.cause == fm_constants.ALARM_PROBABLE_CAUSE_50:
|
if obj.cause == fm_constants.ALARM_PROBABLE_CAUSE_50:
|
||||||
# if this is a threshold alarm then build the reason text that
|
# if this is a threshold alarm then build the reason text that
|
||||||
# includes the threahold and the reading that caused the assertion.
|
# includes the threshold and the reading that caused the assertion.
|
||||||
reason = obj.resource_name
|
reason = obj.resource_name
|
||||||
reason += " threshold exceeded ;"
|
reason += " threshold exceeded ;"
|
||||||
if obj.threshold != INVALID_THRESHOLD:
|
if obj.threshold != INVALID_THRESHOLD:
|
||||||
reason += " threshold {:2.0f}".format(obj.threshold) + "%,"
|
reason += " threshold {:2.2f}".format(obj.threshold) + "%,"
|
||||||
if obj.value:
|
if obj.value:
|
||||||
reason += " actual {:2.0f}".format(obj.value) + "%"
|
reason += " actual {:2.2f}".format(obj.value) + "%"
|
||||||
|
|
||||||
elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
|
elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
|
||||||
reason = obj.reason_failure
|
reason = obj.reason_failure
|
||||||
|
@ -1519,10 +1542,23 @@ def notifier_func(nObject):
|
||||||
service_affecting=base_obj.service_affecting,
|
service_affecting=base_obj.service_affecting,
|
||||||
suppression=base_obj.suppression)
|
suppression=base_obj.suppression)
|
||||||
|
|
||||||
alarm_uuid = api.set_fault(fault)
|
try:
|
||||||
if pc.is_uuid_like(alarm_uuid) is False:
|
alarm_uuid = api.set_fault(fault)
|
||||||
collectd.error("%s %s:%s set_fault failed:%s" %
|
if pc.is_uuid_like(alarm_uuid) is False:
|
||||||
(PLUGIN, base_obj.id, obj.entity_id, alarm_uuid))
|
collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
|
||||||
|
(PLUGIN,
|
||||||
|
base_obj.id,
|
||||||
|
obj.entity_id,
|
||||||
|
alarm_uuid))
|
||||||
|
return 0
|
||||||
|
|
||||||
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'set_fault' exception ; %s:%s:%s ; %s" %
|
||||||
|
(PLUGIN,
|
||||||
|
obj.id,
|
||||||
|
obj.entity_id,
|
||||||
|
_severity_num,
|
||||||
|
ex))
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# update the lists now that
|
# update the lists now that
|
||||||
|
|
|
@ -78,7 +78,7 @@ from fm_api import constants as fm_constants
|
||||||
from fm_api import fm_api
|
from fm_api import fm_api
|
||||||
|
|
||||||
# Fault manager API Object
|
# Fault manager API Object
|
||||||
api = fm_api.FaultAPIs()
|
api = fm_api.FaultAPIsV2()
|
||||||
|
|
||||||
# name of the plugin - all logs produced by this plugin are prefixed with this
|
# name of the plugin - all logs produced by this plugin are prefixed with this
|
||||||
PLUGIN = 'interface plugin'
|
PLUGIN = 'interface plugin'
|
||||||
|
@ -137,6 +137,11 @@ ALARM_ACTION_CLEAR = 'clear'
|
||||||
LEVEL_PORT = 'port'
|
LEVEL_PORT = 'port'
|
||||||
LEVEL_IFACE = 'interface'
|
LEVEL_IFACE = 'interface'
|
||||||
|
|
||||||
|
# Run phases
|
||||||
|
RUN_PHASE__INIT = 0
|
||||||
|
RUN_PHASE__ALARMS_CLEARED = 1
|
||||||
|
RUN_PHASE__HTTP_REQUEST_PASS = 2
|
||||||
|
|
||||||
|
|
||||||
# Link Object (aka Port or Physical interface) Structure
|
# Link Object (aka Port or Physical interface) Structure
|
||||||
# and member functions.
|
# and member functions.
|
||||||
|
@ -163,7 +168,8 @@ class LinkObject:
|
||||||
#
|
#
|
||||||
# Parameters : Network the link is part of.
|
# Parameters : Network the link is part of.
|
||||||
#
|
#
|
||||||
# Returns : True on failure and False on success.
|
# Returns : False on failure
|
||||||
|
# True on success
|
||||||
#
|
#
|
||||||
##################################################################
|
##################################################################
|
||||||
def raise_port_alarm(self, network):
|
def raise_port_alarm(self, network):
|
||||||
|
@ -177,16 +183,16 @@ class LinkObject:
|
||||||
ALARM_ACTION_RAISE,
|
ALARM_ACTION_RAISE,
|
||||||
fm_constants.FM_ALARM_SEVERITY_MAJOR,
|
fm_constants.FM_ALARM_SEVERITY_MAJOR,
|
||||||
self.alarm_id,
|
self.alarm_id,
|
||||||
self.timestamp) is False:
|
self.timestamp) is True:
|
||||||
|
|
||||||
self.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
|
self.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
|
||||||
collectd.info("%s %s %s port alarm raised" %
|
collectd.info("%s %s %s port alarm raised" %
|
||||||
(PLUGIN, self.name, self.alarm_id))
|
(PLUGIN, self.name, self.alarm_id))
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return True
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
else:
|
else:
|
||||||
return False
|
return True
|
||||||
|
|
||||||
##################################################################
|
##################################################################
|
||||||
#
|
#
|
||||||
|
@ -197,7 +203,8 @@ class LinkObject:
|
||||||
#
|
#
|
||||||
# Parameters : Network the link is part of.
|
# Parameters : Network the link is part of.
|
||||||
#
|
#
|
||||||
# Returns : True on failure and False on success.
|
# Returns : False on failure
|
||||||
|
# True on success.
|
||||||
#
|
#
|
||||||
##################################################################
|
##################################################################
|
||||||
def clear_port_alarm(self, network):
|
def clear_port_alarm(self, network):
|
||||||
|
@ -210,16 +217,16 @@ class LinkObject:
|
||||||
ALARM_ACTION_CLEAR,
|
ALARM_ACTION_CLEAR,
|
||||||
fm_constants.FM_ALARM_SEVERITY_CLEAR,
|
fm_constants.FM_ALARM_SEVERITY_CLEAR,
|
||||||
self.alarm_id,
|
self.alarm_id,
|
||||||
self.timestamp) is False:
|
self.timestamp) is True:
|
||||||
|
|
||||||
collectd.info("%s %s %s port alarm cleared" %
|
collectd.info("%s %s %s port alarm cleared" %
|
||||||
(PLUGIN, self.name, self.alarm_id))
|
(PLUGIN, self.name, self.alarm_id))
|
||||||
self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
|
self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return True
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
else:
|
else:
|
||||||
return False
|
return True
|
||||||
|
|
||||||
|
|
||||||
# Interface (aka Network) Level Object Structure and member functions
|
# Interface (aka Network) Level Object Structure and member functions
|
||||||
|
@ -265,7 +272,8 @@ class NetworkObject:
|
||||||
#
|
#
|
||||||
# Parameters : None
|
# Parameters : None
|
||||||
#
|
#
|
||||||
# Returns : True on failure and False on success.
|
# Returns : False on failure
|
||||||
|
# True on success
|
||||||
#
|
#
|
||||||
##################################################################
|
##################################################################
|
||||||
def raise_iface_alarm(self, severity):
|
def raise_iface_alarm(self, severity):
|
||||||
|
@ -283,7 +291,7 @@ class NetworkObject:
|
||||||
ALARM_ACTION_RAISE,
|
ALARM_ACTION_RAISE,
|
||||||
severity,
|
severity,
|
||||||
self.alarm_id,
|
self.alarm_id,
|
||||||
self.timestamp) is False:
|
self.timestamp) is True:
|
||||||
|
|
||||||
self.severity = severity
|
self.severity = severity
|
||||||
collectd.info("%s %s %s %s interface alarm raised" %
|
collectd.info("%s %s %s %s interface alarm raised" %
|
||||||
|
@ -291,11 +299,11 @@ class NetworkObject:
|
||||||
self.name,
|
self.name,
|
||||||
self.alarm_id,
|
self.alarm_id,
|
||||||
pc.get_severity_str(severity)))
|
pc.get_severity_str(severity)))
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return True
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
else:
|
else:
|
||||||
return False
|
return True
|
||||||
|
|
||||||
##################################################################
|
##################################################################
|
||||||
#
|
#
|
||||||
|
@ -306,7 +314,8 @@ class NetworkObject:
|
||||||
#
|
#
|
||||||
# Parameters : None
|
# Parameters : None
|
||||||
#
|
#
|
||||||
# Returns : True on failure and False on success.
|
# Returns : False on failure
|
||||||
|
# True on success.
|
||||||
#
|
#
|
||||||
##################################################################
|
##################################################################
|
||||||
def clear_iface_alarm(self):
|
def clear_iface_alarm(self):
|
||||||
|
@ -319,7 +328,7 @@ class NetworkObject:
|
||||||
ALARM_ACTION_CLEAR,
|
ALARM_ACTION_CLEAR,
|
||||||
fm_constants.FM_ALARM_SEVERITY_CLEAR,
|
fm_constants.FM_ALARM_SEVERITY_CLEAR,
|
||||||
self.alarm_id,
|
self.alarm_id,
|
||||||
self.timestamp) is False:
|
self.timestamp) is True:
|
||||||
|
|
||||||
collectd.info("%s %s %s %s interface alarm cleared" %
|
collectd.info("%s %s %s %s interface alarm cleared" %
|
||||||
(PLUGIN,
|
(PLUGIN,
|
||||||
|
@ -327,11 +336,11 @@ class NetworkObject:
|
||||||
self.alarm_id,
|
self.alarm_id,
|
||||||
pc.get_severity_str(self.severity)))
|
pc.get_severity_str(self.severity)))
|
||||||
self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
|
self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return True
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
else:
|
else:
|
||||||
return False
|
return True
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
#
|
#
|
||||||
|
@ -522,14 +531,23 @@ def this_hosts_alarm(hostname, eid):
|
||||||
#
|
#
|
||||||
# Parameters : A list of this plugin's alarm ids
|
# Parameters : A list of this plugin's alarm ids
|
||||||
#
|
#
|
||||||
# Returns : True on failure and False on success
|
# Returns : True on Success
|
||||||
|
# False on Failure
|
||||||
#
|
#
|
||||||
##########################################################################
|
##########################################################################
|
||||||
def clear_alarms(alarm_id_list):
|
def clear_alarms(alarm_id_list):
|
||||||
"""Clear alarm state of all plugin alarms"""
|
"""Clear alarm state of all plugin alarms"""
|
||||||
found = False
|
found = False
|
||||||
for alarm_id in alarm_id_list:
|
for alarm_id in alarm_id_list:
|
||||||
alarms = api.get_faults_by_id(alarm_id)
|
|
||||||
|
try:
|
||||||
|
alarms = api.get_faults_by_id(alarm_id)
|
||||||
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'get_faults_by_id' exception ;"
|
||||||
|
" %s ; %s" %
|
||||||
|
(PLUGIN, alarm_id, ex))
|
||||||
|
return False
|
||||||
|
|
||||||
if alarms:
|
if alarms:
|
||||||
for alarm in alarms:
|
for alarm in alarms:
|
||||||
eid = alarm.entity_instance_id
|
eid = alarm.entity_instance_id
|
||||||
|
@ -543,24 +561,30 @@ def clear_alarms(alarm_id_list):
|
||||||
alarm_id == PLUGIN_MGMT_IFACE_ALARMID or \
|
alarm_id == PLUGIN_MGMT_IFACE_ALARMID or \
|
||||||
alarm_id == PLUGIN_CLSTR_PORT_ALARMID or \
|
alarm_id == PLUGIN_CLSTR_PORT_ALARMID or \
|
||||||
alarm_id == PLUGIN_CLSTR_IFACE_ALARMID:
|
alarm_id == PLUGIN_CLSTR_IFACE_ALARMID:
|
||||||
eid = alarm.entity_instance_id
|
|
||||||
if api.clear_fault(alarm_id, eid) is False:
|
|
||||||
collectd.error("%s %s:%s clear_fault failed" %
|
|
||||||
(PLUGIN, alarm_id, eid))
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
found = True
|
|
||||||
collectd.info("%s %s clearing %s alarm %s:%s" %
|
|
||||||
(PLUGIN,
|
|
||||||
NETWORK_CLSTR,
|
|
||||||
alarm.severity,
|
|
||||||
alarm_id,
|
|
||||||
alarm.entity_instance_id))
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
if api.clear_fault(alarm_id, eid) is False:
|
||||||
|
collectd.info("%s %s:%s:%s alarm already cleared" %
|
||||||
|
(PLUGIN,
|
||||||
|
alarm.severity,
|
||||||
|
alarm_id,
|
||||||
|
eid))
|
||||||
|
else:
|
||||||
|
found = True
|
||||||
|
collectd.info("%s %s:%s:%s alarm cleared" %
|
||||||
|
(PLUGIN,
|
||||||
|
alarm.severity,
|
||||||
|
alarm_id,
|
||||||
|
eid))
|
||||||
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'clear_fault' exception ; "
|
||||||
|
"%s:%s ; %s" %
|
||||||
|
(PLUGIN, alarm_id, eid, ex))
|
||||||
|
return False
|
||||||
if found is False:
|
if found is False:
|
||||||
collectd.info("%s found no startup alarms" % PLUGIN)
|
collectd.info("%s found no startup alarms" % PLUGIN)
|
||||||
|
|
||||||
return False
|
return True
|
||||||
|
|
||||||
|
|
||||||
##########################################################################
|
##########################################################################
|
||||||
|
@ -570,7 +594,8 @@ def clear_alarms(alarm_id_list):
|
||||||
# Purpose : Raises or clears port and interface alarms based on
|
# Purpose : Raises or clears port and interface alarms based on
|
||||||
# calling parameters.
|
# calling parameters.
|
||||||
#
|
#
|
||||||
# Returns : True on failure and False on success
|
# Returns : True on success
|
||||||
|
# False on failure
|
||||||
#
|
#
|
||||||
##########################################################################
|
##########################################################################
|
||||||
def manage_alarm(name, network, level, action, severity, alarm_id, timestamp):
|
def manage_alarm(name, network, level, action, severity, alarm_id, timestamp):
|
||||||
|
@ -604,12 +629,20 @@ def manage_alarm(name, network, level, action, severity, alarm_id, timestamp):
|
||||||
reason += " failed"
|
reason += " failed"
|
||||||
|
|
||||||
if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
|
if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
|
||||||
if api.clear_fault(alarm_id, eid) is False:
|
try:
|
||||||
collectd.error("%s %s:%s clear_fault failed" %
|
if api.clear_fault(alarm_id, eid) is False:
|
||||||
(PLUGIN, alarm_id, eid))
|
collectd.info("%s %s:%s alarm already cleared" %
|
||||||
|
(PLUGIN, alarm_id, eid))
|
||||||
|
else:
|
||||||
|
collectd.info("%s %s:%s alarm cleared" %
|
||||||
|
(PLUGIN, alarm_id, eid))
|
||||||
return True
|
return True
|
||||||
else:
|
|
||||||
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'clear_fault' failed ; %s:%s ; %s" %
|
||||||
|
(PLUGIN, alarm_id, eid, ex))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
else:
|
else:
|
||||||
fault = fm_api.Fault(
|
fault = fm_api.Fault(
|
||||||
uuid="",
|
uuid="",
|
||||||
|
@ -626,14 +659,20 @@ def manage_alarm(name, network, level, action, severity, alarm_id, timestamp):
|
||||||
timestamp=ts,
|
timestamp=ts,
|
||||||
suppression=True)
|
suppression=True)
|
||||||
|
|
||||||
alarm_uuid = api.set_fault(fault)
|
try:
|
||||||
if pc.is_uuid_like(alarm_uuid) is False:
|
alarm_uuid = api.set_fault(fault)
|
||||||
collectd.error("%s %s:%s set_fault failed:%s" %
|
except Exception as ex:
|
||||||
(PLUGIN, alarm_id, eid, alarm_uuid))
|
collectd.error("%s 'set_fault' exception ; %s:%s ; %s" %
|
||||||
return True
|
(PLUGIN, alarm_id, eid, ex))
|
||||||
else:
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
if pc.is_uuid_like(alarm_uuid) is False:
|
||||||
|
collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
|
||||||
|
(PLUGIN, alarm_id, eid, alarm_uuid))
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
# The config function - called once on collectd process startup
|
# The config function - called once on collectd process startup
|
||||||
def config_func(config):
|
def config_func(config):
|
||||||
|
@ -704,13 +743,13 @@ def init_func():
|
||||||
|
|
||||||
if obj.init_done is False:
|
if obj.init_done is False:
|
||||||
if obj.init_ready() is False:
|
if obj.init_ready() is False:
|
||||||
return False
|
return 0
|
||||||
|
|
||||||
obj.hostname = obj.gethostname()
|
obj.hostname = obj.gethostname()
|
||||||
obj.init_done = True
|
obj.init_done = True
|
||||||
collectd.info("%s initialization complete" % PLUGIN)
|
collectd.info("%s initialization complete" % PLUGIN)
|
||||||
|
|
||||||
return True
|
return 0
|
||||||
|
|
||||||
|
|
||||||
# The sample read function - called on every audit interval
|
# The sample read function - called on every audit interval
|
||||||
|
@ -721,208 +760,217 @@ def read_func():
|
||||||
init_func()
|
init_func()
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if obj.audits == 0:
|
if obj.phase < RUN_PHASE__ALARMS_CLEARED:
|
||||||
|
|
||||||
# clear all alarms on first audit
|
# clear all alarms on first audit
|
||||||
|
#
|
||||||
# block on fm availability
|
# block on fm availability
|
||||||
|
#
|
||||||
# If existing raised the alarms are still valid then
|
# If the existing raised alarms are still valid then
|
||||||
# they will be re-raised with the same timestamp the
|
# they will be re-raised with the same timestamp the
|
||||||
# original event occurred at once auditing resumes.
|
# original event occurred at once auditing resumes.
|
||||||
if clear_alarms(ALARM_ID_LIST) is True:
|
if clear_alarms(ALARM_ID_LIST) is False:
|
||||||
collectd.error("%s failed to clear existing alarms ; "
|
collectd.error("%s failed to clear existing alarms ; "
|
||||||
"retry next audit" % PLUGIN)
|
"retry next audit" % PLUGIN)
|
||||||
|
|
||||||
# Don't proceed till we can communicate with FM and
|
# Don't proceed till we can communicate with FM and
|
||||||
# clear all existing interface and port alarms.
|
# clear all existing interface and port alarms.
|
||||||
return 0
|
return 0
|
||||||
|
else:
|
||||||
|
obj.phase = RUN_PHASE__ALARMS_CLEARED
|
||||||
|
|
||||||
|
# Throttle HTTP request error retries
|
||||||
|
if obj.http_retry_count != 0:
|
||||||
|
obj.http_retry_count += 1
|
||||||
|
if obj.http_retry_count > obj.HTTP_RETRY_THROTTLE:
|
||||||
|
obj.http_retry_count = 0
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# Issue query and construct the monitoring object
|
||||||
|
success = obj.make_http_request(to=PLUGIN_HTTP_TIMEOUT)
|
||||||
|
|
||||||
|
if success is False:
|
||||||
|
obj.http_retry_count += 1
|
||||||
|
return 0
|
||||||
|
|
||||||
|
if len(obj.jresp) == 0:
|
||||||
|
collectd.error("%s no json response from http request" % PLUGIN)
|
||||||
|
obj.http_retry_count += 1
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# Check query status
|
||||||
try:
|
try:
|
||||||
# Issue query and construct the monitoring object
|
if obj.jresp['status'] != 'pass':
|
||||||
error = obj.make_http_request(to=PLUGIN_HTTP_TIMEOUT)
|
collectd.error("%s link monitor query %s" %
|
||||||
|
(PLUGIN, obj.jresp['status']))
|
||||||
if len(obj.jresp) == 0:
|
obj.http_retry_count += 1
|
||||||
collectd.error("%s no json response from http request" % PLUGIN)
|
return 0
|
||||||
return 1
|
|
||||||
|
|
||||||
if error:
|
|
||||||
return 1
|
|
||||||
|
|
||||||
# Check query status
|
|
||||||
try:
|
|
||||||
if obj.jresp['status'] != 'pass':
|
|
||||||
collectd.error("%s link monitor query %s" %
|
|
||||||
(PLUGIN, obj.jresp['status']))
|
|
||||||
return 0
|
|
||||||
|
|
||||||
except Exception as ex:
|
|
||||||
collectd.error("%s http request get reason failed ; %s" %
|
|
||||||
(PLUGIN, str(ex)))
|
|
||||||
collectd.info("%s resp:%d:%s" %
|
|
||||||
(PLUGIN, len(obj.jresp), obj.jresp))
|
|
||||||
return 1
|
|
||||||
|
|
||||||
# log the first query response
|
|
||||||
if obj.audits == 0:
|
|
||||||
collectd.info("%s Link Status Query Response:%d:\n%s" %
|
|
||||||
(PLUGIN, len(obj.jresp), obj.jresp))
|
|
||||||
|
|
||||||
# uncomment below for debug purposes
|
|
||||||
#
|
|
||||||
# for network in NETWORKS:
|
|
||||||
# dump_network_info(network)
|
|
||||||
|
|
||||||
try:
|
|
||||||
link_info = obj.jresp['link_info']
|
|
||||||
for network_link_info in link_info:
|
|
||||||
collectd.debug("%s parse link info:%s" %
|
|
||||||
(PLUGIN, network_link_info))
|
|
||||||
for network in NETWORKS:
|
|
||||||
if network.name == network_link_info['network']:
|
|
||||||
links = network_link_info['links']
|
|
||||||
nname = network.name
|
|
||||||
if len(links) > 0:
|
|
||||||
link_one = links[0]
|
|
||||||
|
|
||||||
# get initial link one name
|
|
||||||
if network.link_one.name is None:
|
|
||||||
network.link_one.name = link_one['name']
|
|
||||||
|
|
||||||
network.link_one.timestamp =\
|
|
||||||
float(get_timestamp(link_one['time']))
|
|
||||||
|
|
||||||
# load link one state
|
|
||||||
if link_one['state'] == LINK_UP:
|
|
||||||
collectd.debug("%s %s IS Up [%s]" %
|
|
||||||
(PLUGIN, network.link_one.name,
|
|
||||||
network.link_one.state))
|
|
||||||
if network.link_one.state != LINK_UP:
|
|
||||||
network.link_one.state_change = True
|
|
||||||
network.link_one.clear_port_alarm(nname)
|
|
||||||
network.link_one.state = LINK_UP
|
|
||||||
else:
|
|
||||||
collectd.debug("%s %s IS Down [%s]" %
|
|
||||||
(PLUGIN, network.link_one.name,
|
|
||||||
network.link_one.state))
|
|
||||||
if network.link_one.state == LINK_UP:
|
|
||||||
network.link_one.state_change = True
|
|
||||||
network.link_one.raise_port_alarm(nname)
|
|
||||||
network.link_one.state = LINK_DOWN
|
|
||||||
|
|
||||||
if len(links) > 1:
|
|
||||||
link_two = links[1]
|
|
||||||
|
|
||||||
# get initial link two name
|
|
||||||
if network.link_two.name is None:
|
|
||||||
network.link_two.name = link_two['name']
|
|
||||||
|
|
||||||
network.link_two.timestamp =\
|
|
||||||
float(get_timestamp(link_two['time']))
|
|
||||||
|
|
||||||
# load link two state
|
|
||||||
if link_two['state'] == LINK_UP:
|
|
||||||
collectd.debug("%s %s IS Up [%s]" %
|
|
||||||
(PLUGIN, network.link_two.name,
|
|
||||||
network.link_two.state))
|
|
||||||
if network.link_two.state != LINK_UP:
|
|
||||||
network.link_two.state_change = True
|
|
||||||
network.link_two.clear_port_alarm(nname)
|
|
||||||
network.link_two.state = LINK_UP
|
|
||||||
else:
|
|
||||||
collectd.debug("%s %s IS Down [%s]" %
|
|
||||||
(PLUGIN, network.link_two.name,
|
|
||||||
network.link_two.state))
|
|
||||||
if network.link_two.state == LINK_UP:
|
|
||||||
network.link_two.state_change = True
|
|
||||||
network.link_two.raise_port_alarm(nname)
|
|
||||||
network.link_two.state = LINK_DOWN
|
|
||||||
|
|
||||||
# manage interface alarms
|
|
||||||
network.manage_iface_alarm()
|
|
||||||
|
|
||||||
except Exception as ex:
|
|
||||||
collectd.error("%s link monitor query parse error: %s " %
|
|
||||||
(PLUGIN, obj.resp))
|
|
||||||
|
|
||||||
# handle state changes
|
|
||||||
for network in NETWORKS:
|
|
||||||
if network.link_two.name is not None and \
|
|
||||||
network.link_one.state_change is True:
|
|
||||||
|
|
||||||
if network.link_one.state == LINK_UP:
|
|
||||||
collectd.info("%s %s link one '%s' is Up" %
|
|
||||||
(PLUGIN,
|
|
||||||
network.name,
|
|
||||||
network.link_one.name))
|
|
||||||
else:
|
|
||||||
collectd.info("%s %s link one '%s' is Down" %
|
|
||||||
(PLUGIN,
|
|
||||||
network.name,
|
|
||||||
network.link_one.name))
|
|
||||||
|
|
||||||
if network.link_two.name is not None and \
|
|
||||||
network.link_two.state_change is True:
|
|
||||||
|
|
||||||
if network.link_two.state == LINK_UP:
|
|
||||||
collectd.info("%s %s link two '%s' is Up" %
|
|
||||||
(PLUGIN,
|
|
||||||
network.name,
|
|
||||||
network.link_two.name))
|
|
||||||
else:
|
|
||||||
collectd.info("%s %s link two %s 'is' Down" %
|
|
||||||
(PLUGIN,
|
|
||||||
network.name,
|
|
||||||
network.link_two.name))
|
|
||||||
|
|
||||||
# Dispatch usage value to collectd
|
|
||||||
val = collectd.Values(host=obj.hostname)
|
|
||||||
val.plugin = 'interface'
|
|
||||||
val.type = 'percent'
|
|
||||||
val.type_instance = 'used'
|
|
||||||
|
|
||||||
# For each interface [ mgmt, oam, cluster-host ]
|
|
||||||
# calculate the percentage used sample
|
|
||||||
# sample = 100 % when all its links are up
|
|
||||||
# sample = 0 % when all its links are down
|
|
||||||
# sample = 50 % when one of a lagged group is down
|
|
||||||
for network in NETWORKS:
|
|
||||||
|
|
||||||
if network.link_one.name is not None:
|
|
||||||
|
|
||||||
val.plugin_instance = network.name
|
|
||||||
|
|
||||||
network.sample = 0
|
|
||||||
|
|
||||||
if network.link_two.name is not None:
|
|
||||||
# lagged
|
|
||||||
|
|
||||||
if network.link_one.state == LINK_UP:
|
|
||||||
network.sample = 50
|
|
||||||
if network.link_two.state == LINK_UP:
|
|
||||||
network.sample += 50
|
|
||||||
else:
|
|
||||||
if network.link_one.state == LINK_UP:
|
|
||||||
network.sample = 100
|
|
||||||
val.dispatch(values=[network.sample])
|
|
||||||
|
|
||||||
if network.link_one.state_change is True or \
|
|
||||||
network.link_two.state_change is True:
|
|
||||||
|
|
||||||
dump_network_info(network)
|
|
||||||
|
|
||||||
network.link_one.state_change = False
|
|
||||||
network.link_two.state_change = False
|
|
||||||
|
|
||||||
network.sample_last = network.sample
|
|
||||||
|
|
||||||
else:
|
|
||||||
collectd.debug("%s %s network not provisioned" %
|
|
||||||
(PLUGIN, network.name))
|
|
||||||
obj.audits += 1
|
|
||||||
|
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
collectd.info("%s http request failed: %s" % (PLUGIN, str(ex)))
|
collectd.error("%s http request get reason failed ; %s" %
|
||||||
|
(PLUGIN, str(ex)))
|
||||||
|
collectd.info("%s resp:%d:%s" %
|
||||||
|
(PLUGIN, len(obj.jresp), obj.jresp))
|
||||||
|
obj.http_retry_count += 1
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# log the first query response
|
||||||
|
if obj.audits == 0:
|
||||||
|
collectd.info("%s Link Status Query Response:%d:\n%s" %
|
||||||
|
(PLUGIN, len(obj.jresp), obj.jresp))
|
||||||
|
|
||||||
|
# uncomment below for debug purposes
|
||||||
|
#
|
||||||
|
# for network in NETWORKS:
|
||||||
|
# dump_network_info(network)
|
||||||
|
|
||||||
|
try:
|
||||||
|
link_info = obj.jresp['link_info']
|
||||||
|
for network_link_info in link_info:
|
||||||
|
collectd.debug("%s parse link info:%s" %
|
||||||
|
(PLUGIN, network_link_info))
|
||||||
|
for network in NETWORKS:
|
||||||
|
if network.name == network_link_info['network']:
|
||||||
|
links = network_link_info['links']
|
||||||
|
nname = network.name
|
||||||
|
if len(links) > 0:
|
||||||
|
link_one = links[0]
|
||||||
|
|
||||||
|
# get initial link one name
|
||||||
|
if network.link_one.name is None:
|
||||||
|
network.link_one.name = link_one['name']
|
||||||
|
|
||||||
|
network.link_one.timestamp =\
|
||||||
|
float(get_timestamp(link_one['time']))
|
||||||
|
|
||||||
|
# load link one state
|
||||||
|
if link_one['state'] == LINK_UP:
|
||||||
|
collectd.debug("%s %s IS Up [%s]" %
|
||||||
|
(PLUGIN, network.link_one.name,
|
||||||
|
network.link_one.state))
|
||||||
|
if network.link_one.state != LINK_UP:
|
||||||
|
network.link_one.state_change = True
|
||||||
|
network.link_one.clear_port_alarm(nname)
|
||||||
|
network.link_one.state = LINK_UP
|
||||||
|
else:
|
||||||
|
collectd.debug("%s %s IS Down [%s]" %
|
||||||
|
(PLUGIN, network.link_one.name,
|
||||||
|
network.link_one.state))
|
||||||
|
if network.link_one.state == LINK_UP:
|
||||||
|
network.link_one.state_change = True
|
||||||
|
network.link_one.raise_port_alarm(nname)
|
||||||
|
network.link_one.state = LINK_DOWN
|
||||||
|
|
||||||
|
if len(links) > 1:
|
||||||
|
link_two = links[1]
|
||||||
|
|
||||||
|
# get initial link two name
|
||||||
|
if network.link_two.name is None:
|
||||||
|
network.link_two.name = link_two['name']
|
||||||
|
|
||||||
|
network.link_two.timestamp =\
|
||||||
|
float(get_timestamp(link_two['time']))
|
||||||
|
|
||||||
|
# load link two state
|
||||||
|
if link_two['state'] == LINK_UP:
|
||||||
|
collectd.debug("%s %s IS Up [%s]" %
|
||||||
|
(PLUGIN, network.link_two.name,
|
||||||
|
network.link_two.state))
|
||||||
|
if network.link_two.state != LINK_UP:
|
||||||
|
network.link_two.state_change = True
|
||||||
|
network.link_two.clear_port_alarm(nname)
|
||||||
|
network.link_two.state = LINK_UP
|
||||||
|
else:
|
||||||
|
collectd.debug("%s %s IS Down [%s]" %
|
||||||
|
(PLUGIN, network.link_two.name,
|
||||||
|
network.link_two.state))
|
||||||
|
if network.link_two.state == LINK_UP:
|
||||||
|
network.link_two.state_change = True
|
||||||
|
network.link_two.raise_port_alarm(nname)
|
||||||
|
network.link_two.state = LINK_DOWN
|
||||||
|
|
||||||
|
# manage interface alarms
|
||||||
|
network.manage_iface_alarm()
|
||||||
|
|
||||||
|
except Exception as ex:
|
||||||
|
collectd.error("%s link monitor query parse exception ; %s " %
|
||||||
|
(PLUGIN, obj.resp))
|
||||||
|
|
||||||
|
# handle state changes
|
||||||
|
for network in NETWORKS:
|
||||||
|
if network.link_two.name is not None and \
|
||||||
|
network.link_one.state_change is True:
|
||||||
|
|
||||||
|
if network.link_one.state == LINK_UP:
|
||||||
|
collectd.info("%s %s link one '%s' is Up" %
|
||||||
|
(PLUGIN,
|
||||||
|
network.name,
|
||||||
|
network.link_one.name))
|
||||||
|
else:
|
||||||
|
collectd.info("%s %s link one '%s' is Down" %
|
||||||
|
(PLUGIN,
|
||||||
|
network.name,
|
||||||
|
network.link_one.name))
|
||||||
|
|
||||||
|
if network.link_two.name is not None and \
|
||||||
|
network.link_two.state_change is True:
|
||||||
|
|
||||||
|
if network.link_two.state == LINK_UP:
|
||||||
|
collectd.info("%s %s link two '%s' is Up" %
|
||||||
|
(PLUGIN,
|
||||||
|
network.name,
|
||||||
|
network.link_two.name))
|
||||||
|
else:
|
||||||
|
collectd.info("%s %s link two %s 'is' Down" %
|
||||||
|
(PLUGIN,
|
||||||
|
network.name,
|
||||||
|
network.link_two.name))
|
||||||
|
|
||||||
|
# Dispatch usage value to collectd
|
||||||
|
val = collectd.Values(host=obj.hostname)
|
||||||
|
val.plugin = 'interface'
|
||||||
|
val.type = 'percent'
|
||||||
|
val.type_instance = 'used'
|
||||||
|
|
||||||
|
# For each interface [ mgmt, oam, infra ]
|
||||||
|
# calculate the percentage used sample
|
||||||
|
# sample = 100 % when all its links are up
|
||||||
|
# sample = 0 % when all its links are down
|
||||||
|
# sample = 50 % when one of a lagged group is down
|
||||||
|
for network in NETWORKS:
|
||||||
|
|
||||||
|
if network.link_one.name is not None:
|
||||||
|
|
||||||
|
val.plugin_instance = network.name
|
||||||
|
|
||||||
|
network.sample = 0
|
||||||
|
|
||||||
|
if network.link_two.name is not None:
|
||||||
|
# lagged
|
||||||
|
|
||||||
|
if network.link_one.state == LINK_UP:
|
||||||
|
network.sample = 50
|
||||||
|
if network.link_two.state == LINK_UP:
|
||||||
|
network.sample += 50
|
||||||
|
else:
|
||||||
|
if network.link_one.state == LINK_UP:
|
||||||
|
network.sample = 100
|
||||||
|
val.dispatch(values=[network.sample])
|
||||||
|
|
||||||
|
if network.link_one.state_change is True or \
|
||||||
|
network.link_two.state_change is True:
|
||||||
|
|
||||||
|
dump_network_info(network)
|
||||||
|
|
||||||
|
network.link_one.state_change = False
|
||||||
|
network.link_two.state_change = False
|
||||||
|
|
||||||
|
network.sample_last = network.sample
|
||||||
|
|
||||||
|
else:
|
||||||
|
collectd.debug("%s %s network not provisioned" %
|
||||||
|
(PLUGIN, network.name))
|
||||||
|
obj.audits += 1
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
|
@ -12,8 +12,8 @@
|
||||||
Instance "used"
|
Instance "used"
|
||||||
Persist true
|
Persist true
|
||||||
PersistOK true
|
PersistOK true
|
||||||
WarningMax 79.00
|
WarningMax 80.00
|
||||||
FailureMax 89.00
|
FailureMax 90.00
|
||||||
Hits 2
|
Hits 2
|
||||||
Invert false
|
Invert false
|
||||||
</Type>
|
</Type>
|
||||||
|
|
|
@ -136,6 +136,15 @@ def read_func():
|
||||||
val.type = 'percent'
|
val.type = 'percent'
|
||||||
val.type_instance = 'used'
|
val.type_instance = 'used'
|
||||||
|
|
||||||
|
# fit_value = 0
|
||||||
|
# if os.path.exists('/var/run/fit/mem_data'):
|
||||||
|
# with open('/var/run/fit/mem_data', 'r') as infile:
|
||||||
|
# for line in infile:
|
||||||
|
# fit_value = float(line)
|
||||||
|
# collectd.info("%s using FIT data:%.2f" %
|
||||||
|
# (PLUGIN, fit_value))
|
||||||
|
# break
|
||||||
|
|
||||||
# remove the 'unit' (kB) suffix that might be on some of the lines
|
# remove the 'unit' (kB) suffix that might be on some of the lines
|
||||||
for line in meminfo:
|
for line in meminfo:
|
||||||
# remove the units from the value read
|
# remove the units from the value read
|
||||||
|
@ -173,6 +182,9 @@ def read_func():
|
||||||
obj.value = float(float(obj.AnonPages) / float(obj.total))
|
obj.value = float(float(obj.AnonPages) / float(obj.total))
|
||||||
obj.value = float(float(obj.value) * 100)
|
obj.value = float(float(obj.value) * 100)
|
||||||
|
|
||||||
|
# if fit_value != 0:
|
||||||
|
# obj.value = fit_value
|
||||||
|
|
||||||
if debug is True:
|
if debug is True:
|
||||||
collectd.info("%s ---------------------------" % PLUGIN)
|
collectd.info("%s ---------------------------" % PLUGIN)
|
||||||
collectd.info("%s memAvail: %d" % (PLUGIN, obj.avail))
|
collectd.info("%s memAvail: %d" % (PLUGIN, obj.avail))
|
||||||
|
@ -218,6 +230,9 @@ def read_func():
|
||||||
obj.value = float(float(obj.AnonPages)) / float(total)
|
obj.value = float(float(obj.AnonPages)) / float(total)
|
||||||
obj.value = float(float(obj.value) * 100)
|
obj.value = float(float(obj.value) * 100)
|
||||||
|
|
||||||
|
# if fit_value != 0:
|
||||||
|
# obj.value = fit_value
|
||||||
|
|
||||||
# Dispatch usage value to collectd for this numa node
|
# Dispatch usage value to collectd for this numa node
|
||||||
val.plugin_instance = numa_node.split('/')[5]
|
val.plugin_instance = numa_node.split('/')[5]
|
||||||
val.dispatch(values=[obj.value])
|
val.dispatch(values=[obj.value])
|
||||||
|
@ -238,6 +253,9 @@ def read_func():
|
||||||
float(obj.HugePages_Total)
|
float(obj.HugePages_Total)
|
||||||
obj.value = float(float(obj.value) * 100)
|
obj.value = float(float(obj.value) * 100)
|
||||||
|
|
||||||
|
# if fit_value != 0:
|
||||||
|
# obj.value = fit_value
|
||||||
|
|
||||||
# Dispatch huge page memory usage value
|
# Dispatch huge page memory usage value
|
||||||
# to collectd for this numa node.
|
# to collectd for this numa node.
|
||||||
val.plugin_instance = numa_node.split('/')[5] + '_hugepages'
|
val.plugin_instance = numa_node.split('/')[5] + '_hugepages'
|
||||||
|
|
|
@ -62,7 +62,7 @@ from fm_api import constants as fm_constants
|
||||||
from fm_api import fm_api
|
from fm_api import fm_api
|
||||||
import tsconfig.tsconfig as tsc
|
import tsconfig.tsconfig as tsc
|
||||||
|
|
||||||
api = fm_api.FaultAPIs()
|
api = fm_api.FaultAPIsV2()
|
||||||
|
|
||||||
PLUGIN = 'NTP query plugin'
|
PLUGIN = 'NTP query plugin'
|
||||||
PLUGIN_INTERVAL = 600 # audit interval in secs
|
PLUGIN_INTERVAL = 600 # audit interval in secs
|
||||||
|
@ -78,7 +78,7 @@ class NtpqObject:
|
||||||
# static variables set in init
|
# static variables set in init
|
||||||
hostname = '' # the name of this host
|
hostname = '' # the name of this host
|
||||||
base_eid = '' # the eid for the major alarm
|
base_eid = '' # the eid for the major alarm
|
||||||
config_complete = False # set to true once config is complete
|
init_complete = False # set to true once config is complete
|
||||||
alarm_raised = False # True when the major alarm is asserted
|
alarm_raised = False # True when the major alarm is asserted
|
||||||
|
|
||||||
server_list_conf = [] # list of servers in the /etc/ntp.conf file
|
server_list_conf = [] # list of servers in the /etc/ntp.conf file
|
||||||
|
@ -172,35 +172,46 @@ def _raise_alarm(ip=None):
|
||||||
eid = obj.base_eid + '=' + ip
|
eid = obj.base_eid + '=' + ip
|
||||||
fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR
|
fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR
|
||||||
|
|
||||||
fault = fm_api.Fault(
|
try:
|
||||||
alarm_id=PLUGIN_ALARMID,
|
fault = fm_api.Fault(
|
||||||
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
alarm_id=PLUGIN_ALARMID,
|
||||||
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
||||||
entity_instance_id=eid,
|
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
||||||
severity=fm_severity,
|
entity_instance_id=eid,
|
||||||
reason_text=reason,
|
severity=fm_severity,
|
||||||
alarm_type=obj.alarm_type,
|
reason_text=reason,
|
||||||
probable_cause=obj.cause,
|
alarm_type=obj.alarm_type,
|
||||||
proposed_repair_action=obj.repair,
|
probable_cause=obj.cause,
|
||||||
service_affecting=obj.service_affecting,
|
proposed_repair_action=obj.repair,
|
||||||
suppression=obj.suppression)
|
service_affecting=obj.service_affecting,
|
||||||
|
suppression=obj.suppression)
|
||||||
|
|
||||||
alarm_uuid = api.set_fault(fault)
|
alarm_uuid = api.set_fault(fault)
|
||||||
if _is_uuid_like(alarm_uuid) is False:
|
if _is_uuid_like(alarm_uuid) is False:
|
||||||
|
|
||||||
# Don't _add_unreachable_server list if the fm call failed.
|
# Don't _add_unreachable_server list if the fm call failed.
|
||||||
# That way it will be retried at a later time.
|
# That way it will be retried at a later time.
|
||||||
collectd.error("%s %s:%s set_fault failed:%s" %
|
collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
|
||||||
(PLUGIN, PLUGIN_ALARMID, eid, alarm_uuid))
|
(PLUGIN, PLUGIN_ALARMID, eid, alarm_uuid))
|
||||||
return True
|
return 0
|
||||||
else:
|
|
||||||
collectd.info("%s raised alarm %s:%s" % (PLUGIN, PLUGIN_ALARMID, eid))
|
|
||||||
if ip:
|
|
||||||
_add_unreachable_server(ip)
|
|
||||||
else:
|
else:
|
||||||
obj.alarm_raised = True
|
collectd.info("%s raised alarm %s:%s" %
|
||||||
|
(PLUGIN,
|
||||||
|
PLUGIN_ALARMID,
|
||||||
|
eid))
|
||||||
|
if ip:
|
||||||
|
_add_unreachable_server(ip)
|
||||||
|
else:
|
||||||
|
obj.alarm_raised = True
|
||||||
|
|
||||||
return False
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'set_fault' exception ; %s:%s:%s ; %s" %
|
||||||
|
(PLUGIN,
|
||||||
|
PLUGIN_ALARMID,
|
||||||
|
eid,
|
||||||
|
fm_severity,
|
||||||
|
ex))
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
@ -213,26 +224,33 @@ def _raise_alarm(ip=None):
|
||||||
#
|
#
|
||||||
# Returns : Error indication.
|
# Returns : Error indication.
|
||||||
#
|
#
|
||||||
# True : is error. FM call failed to clear the
|
# False: is error. FM call failed to clear the
|
||||||
# alarm and needs to be retried.
|
# alarm and needs to be retried.
|
||||||
#
|
#
|
||||||
# False: no error. FM call succeeds
|
# True : no error. FM call succeeds
|
||||||
#
|
#
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
def _clear_base_alarm():
|
def _clear_base_alarm():
|
||||||
"""Clear the NTP base alarm"""
|
"""Clear the NTP base alarm"""
|
||||||
|
|
||||||
if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is False:
|
try:
|
||||||
collectd.error("%s failed to clear alarm %s:%s" %
|
if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is False:
|
||||||
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
collectd.info("%s %s:%s alarm already cleared" %
|
||||||
return True
|
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
||||||
else:
|
else:
|
||||||
collectd.info("%s cleared alarm %s:%s" %
|
collectd.info("%s %s:%s alarm cleared" %
|
||||||
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
||||||
obj.alarm_raised = False
|
obj.alarm_raised = False
|
||||||
|
return True
|
||||||
|
|
||||||
return False
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
|
||||||
|
(PLUGIN,
|
||||||
|
PLUGIN_ALARMID,
|
||||||
|
obj.base_eid,
|
||||||
|
ex))
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
@ -244,13 +262,6 @@ def _clear_base_alarm():
|
||||||
#
|
#
|
||||||
# Parameters : IP address
|
# Parameters : IP address
|
||||||
#
|
#
|
||||||
# Returns : Error indication.
|
|
||||||
#
|
|
||||||
# True : is error. FM call failed to clear the
|
|
||||||
# alarm and needs to be retried.
|
|
||||||
#
|
|
||||||
# False: no error. FM call succeeds
|
|
||||||
#
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
def _remove_ip_from_unreachable_list(ip):
|
def _remove_ip_from_unreachable_list(ip):
|
||||||
|
@ -258,24 +269,28 @@ def _remove_ip_from_unreachable_list(ip):
|
||||||
|
|
||||||
# remove from unreachable list if its there
|
# remove from unreachable list if its there
|
||||||
if ip and ip in obj.unreachable_servers:
|
if ip and ip in obj.unreachable_servers:
|
||||||
|
|
||||||
eid = obj.base_eid + '=' + ip
|
eid = obj.base_eid + '=' + ip
|
||||||
collectd.debug("%s trying to clear alarm %s" % (PLUGIN, eid))
|
collectd.debug("%s trying to clear alarm %s" % (PLUGIN, eid))
|
||||||
|
|
||||||
# clear the alarm if its asserted
|
try:
|
||||||
if api.clear_fault(PLUGIN_ALARMID, eid) is True:
|
# clear the alarm if its asserted
|
||||||
collectd.info("%s cleared %s:%s alarm" %
|
if api.clear_fault(PLUGIN_ALARMID, eid) is True:
|
||||||
(PLUGIN, PLUGIN_ALARMID, eid))
|
collectd.info("%s %s:%s alarm cleared " %
|
||||||
obj.unreachable_servers.remove(ip)
|
(PLUGIN, PLUGIN_ALARMID, eid))
|
||||||
else:
|
else:
|
||||||
# Handle clear failure by not removing the IP from the list.
|
# alarm does not exist
|
||||||
# It will retry on next audit.
|
collectd.info("%s %s:%s alarm clear" %
|
||||||
# Error should only occur if FM is not running at the time
|
(PLUGIN, PLUGIN_ALARMID, eid))
|
||||||
# this get or clear is called
|
|
||||||
collectd.error("%s failed alarm clear %s:%s" %
|
|
||||||
(PLUGIN, PLUGIN_ALARMID, eid))
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
obj.unreachable_servers.remove(ip)
|
||||||
|
|
||||||
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
|
||||||
|
(PLUGIN,
|
||||||
|
PLUGIN_ALARMID,
|
||||||
|
eid,
|
||||||
|
ex))
|
||||||
|
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
@ -373,7 +388,8 @@ def _get_ntp_servers():
|
||||||
# Clear any that may have been raised.
|
# Clear any that may have been raised.
|
||||||
#
|
#
|
||||||
##################################################################
|
##################################################################
|
||||||
collectd.info("%s No NTP servers are provisioned" % PLUGIN)
|
collectd.info("%s NTP Service Disabled ; no provisioned servers" %
|
||||||
|
PLUGIN)
|
||||||
|
|
||||||
# clear all alarms
|
# clear all alarms
|
||||||
if obj.alarm_raised:
|
if obj.alarm_raised:
|
||||||
|
@ -510,7 +526,14 @@ def init_func():
|
||||||
_get_ntp_servers()
|
_get_ntp_servers()
|
||||||
|
|
||||||
# manage existing alarms.
|
# manage existing alarms.
|
||||||
alarms = api.get_faults_by_id(PLUGIN_ALARMID)
|
try:
|
||||||
|
alarms = api.get_faults_by_id(PLUGIN_ALARMID)
|
||||||
|
|
||||||
|
except Exception as ex:
|
||||||
|
collectd.error("%s 'get_faults_by_id' exception ; %s ; %s" %
|
||||||
|
(PLUGIN, PLUGIN_ALARMID, ex))
|
||||||
|
return 0
|
||||||
|
|
||||||
if alarms:
|
if alarms:
|
||||||
for alarm in alarms:
|
for alarm in alarms:
|
||||||
eid = alarm.entity_instance_id
|
eid = alarm.entity_instance_id
|
||||||
|
@ -524,18 +547,16 @@ def init_func():
|
||||||
# this is done to avoid the potential for stuck ntp ip alarms
|
# this is done to avoid the potential for stuck ntp ip alarms
|
||||||
collectd.info("%s clearing found startup alarm '%s'" %
|
collectd.info("%s clearing found startup alarm '%s'" %
|
||||||
(PLUGIN, alarm.entity_instance_id))
|
(PLUGIN, alarm.entity_instance_id))
|
||||||
rc = api.clear_fault(PLUGIN_ALARMID, alarm.entity_instance_id)
|
try:
|
||||||
if rc is False:
|
api.clear_fault(PLUGIN_ALARMID, alarm.entity_instance_id)
|
||||||
# if we can't clear the alarm now then lets load it and
|
except Exception as ex:
|
||||||
# manage it like it just happened. When the server starts
|
collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
|
||||||
# responding then the alarm will get cleared at that time.
|
(PLUGIN,
|
||||||
collectd.error("%s failed to clear alarm %s:%s" %
|
PLUGIN_ALARMID,
|
||||||
(PLUGIN, PLUGIN_ALARMID,
|
alarm.entity_instance_id,
|
||||||
alarm.entity_instance_id))
|
ex))
|
||||||
|
return 0
|
||||||
|
|
||||||
ip = alarm.entity_instance_id.split('=')[2]
|
|
||||||
if ip and ip not in obj.unreachable_servers:
|
|
||||||
_add_unreachable_server(ip)
|
|
||||||
else:
|
else:
|
||||||
obj.alarm_raised = True
|
obj.alarm_raised = True
|
||||||
collectd.info("%s found alarm %s:%s" %
|
collectd.info("%s found alarm %s:%s" %
|
||||||
|
@ -551,7 +572,7 @@ def init_func():
|
||||||
else:
|
else:
|
||||||
collectd.info("%s no major startup alarms found" % PLUGIN)
|
collectd.info("%s no major startup alarms found" % PLUGIN)
|
||||||
|
|
||||||
obj.config_complete = True
|
obj.init_complete = True
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
@ -583,14 +604,11 @@ def read_func():
|
||||||
if tsc.nodetype != 'controller':
|
if tsc.nodetype != 'controller':
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if obj.config_complete is False:
|
if obj.init_complete is False:
|
||||||
if os.path.exists(tsc.VOLATILE_CONTROLLER_CONFIG_COMPLETE) is False:
|
if os.path.exists(tsc.VOLATILE_CONTROLLER_CONFIG_COMPLETE) is True:
|
||||||
return 0
|
collectd.info("%s re-running init" % PLUGIN)
|
||||||
else:
|
init_func()
|
||||||
collectd.info("%s controller config complete ; "
|
return 0
|
||||||
"invoking init_func" % PLUGIN)
|
|
||||||
if init_func() != 0:
|
|
||||||
return 1
|
|
||||||
|
|
||||||
# get a list if provisioned ntp servers
|
# get a list if provisioned ntp servers
|
||||||
_get_ntp_servers()
|
_get_ntp_servers()
|
||||||
|
@ -613,7 +631,7 @@ def read_func():
|
||||||
|
|
||||||
if not data:
|
if not data:
|
||||||
collectd.error("%s no data from query" % PLUGIN)
|
collectd.error("%s no data from query" % PLUGIN)
|
||||||
return 1
|
return 0
|
||||||
|
|
||||||
# Get the ntp query output into a list of lines
|
# Get the ntp query output into a list of lines
|
||||||
obj.ntpq = data.split('\n')
|
obj.ntpq = data.split('\n')
|
||||||
|
|
|
@ -70,6 +70,8 @@ class PluginObject(object):
|
||||||
self.error_logged = False # used to prevent log flooding
|
self.error_logged = False # used to prevent log flooding
|
||||||
self.log_throttle_count = 0 # used to count throttle logs
|
self.log_throttle_count = 0 # used to count throttle logs
|
||||||
self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold
|
self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold
|
||||||
|
self.http_retry_count = 0 # track http error cases
|
||||||
|
self.HTTP_RETRY_THROTTLE = 6 # http retry threshold
|
||||||
self.phase = 0 # tracks current phase; init, sampling
|
self.phase = 0 # tracks current phase; init, sampling
|
||||||
|
|
||||||
collectd.debug("%s Common PluginObject constructor [%s]" %
|
collectd.debug("%s Common PluginObject constructor [%s]" %
|
||||||
|
@ -236,8 +238,8 @@ class PluginObject(object):
|
||||||
# Updates : self.jresp with the json string response from the request.
|
# Updates : self.jresp with the json string response from the request.
|
||||||
#
|
#
|
||||||
# Returns : Error indication (True/False)
|
# Returns : Error indication (True/False)
|
||||||
# True on error
|
# True on success
|
||||||
# False on success
|
# False on error
|
||||||
#
|
#
|
||||||
###########################################################################
|
###########################################################################
|
||||||
def make_http_request(self, url=None, to=None, hdrs=None):
|
def make_http_request(self, url=None, to=None, hdrs=None):
|
||||||
|
@ -261,9 +263,9 @@ class PluginObject(object):
|
||||||
resp = http.request(url, headers=hdrs)
|
resp = http.request(url, headers=hdrs)
|
||||||
|
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
collectd.info("%s http request failure (%s)" %
|
collectd.info("%s http request exception ; %s" %
|
||||||
(self.plugin, str(ex)))
|
(self.plugin, str(ex)))
|
||||||
return True
|
return False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
collectd.debug("%s Resp: %s" %
|
collectd.debug("%s Resp: %s" %
|
||||||
|
@ -273,10 +275,13 @@ class PluginObject(object):
|
||||||
self.jresp = json.loads(resp[1])
|
self.jresp = json.loads(resp[1])
|
||||||
|
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
collectd.info("%s http request parse failure (%s) (%s)" %
|
collectd.error("%s http response parse exception ; %s" %
|
||||||
(self.plugin, str(ex), resp))
|
(self.plugin, str(ex)))
|
||||||
return True
|
if len(self.resp):
|
||||||
return False
|
collectd.error("%s response: %s" %
|
||||||
|
(self.plugin, self.resp))
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def is_uuid_like(val):
|
def is_uuid_like(val):
|
||||||
|
|
|
@ -336,7 +336,7 @@ def clear_alarm(eid):
|
||||||
collectd.info("%s %s:%s alarm cleared" %
|
collectd.info("%s %s:%s alarm cleared" %
|
||||||
(PLUGIN, PLUGIN_ALARMID, eid))
|
(PLUGIN, PLUGIN_ALARMID, eid))
|
||||||
else:
|
else:
|
||||||
collectd.info("%s %s:%s alarm clear ; None found" %
|
collectd.info("%s %s:%s alarm already cleared" %
|
||||||
(PLUGIN, PLUGIN_ALARMID, eid))
|
(PLUGIN, PLUGIN_ALARMID, eid))
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@ -433,7 +433,7 @@ def raise_alarm(alarm_cause, interface=None, data=0):
|
||||||
|
|
||||||
# Don't _add_unreachable_server list if the fm call failed.
|
# Don't _add_unreachable_server list if the fm call failed.
|
||||||
# That way it will be retried at a later time.
|
# That way it will be retried at a later time.
|
||||||
collectd.error("%s %s:%s set_fault failed:%s" %
|
collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
|
||||||
(PLUGIN, PLUGIN_ALARMID, alarm.eid, alarm_uuid))
|
(PLUGIN, PLUGIN_ALARMID, alarm.eid, alarm_uuid))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -671,8 +671,9 @@ def read_func():
|
||||||
# query FM for existing alarms.
|
# query FM for existing alarms.
|
||||||
alarms = api.get_faults_by_id(PLUGIN_ALARMID)
|
alarms = api.get_faults_by_id(PLUGIN_ALARMID)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
collectd.error("%s 'get_faults_by_id' exception ; %s" %
|
collectd.error("%s 'get_faults_by_id' exception ;"
|
||||||
(PLUGIN, ex))
|
" %s ; %s" %
|
||||||
|
(PLUGIN, PLUGIN_ALARMID, ex))
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if alarms:
|
if alarms:
|
||||||
|
|
|
@ -43,7 +43,7 @@ from oslo_concurrency import processutils
|
||||||
from fm_api import fm_api
|
from fm_api import fm_api
|
||||||
|
|
||||||
# Fault manager API Object
|
# Fault manager API Object
|
||||||
api = fm_api.FaultAPIs()
|
api = fm_api.FaultAPIsV2()
|
||||||
|
|
||||||
# name of the plugin
|
# name of the plugin
|
||||||
PLUGIN_NAME = 'remotels'
|
PLUGIN_NAME = 'remotels'
|
||||||
|
@ -95,7 +95,7 @@ def raise_alarm():
|
||||||
|
|
||||||
alarm_uuid = api.set_fault(fault)
|
alarm_uuid = api.set_fault(fault)
|
||||||
if pc.is_uuid_like(alarm_uuid) is False:
|
if pc.is_uuid_like(alarm_uuid) is False:
|
||||||
collectd.error("%s %s:%s set_fault failed:%s" %
|
collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
|
||||||
(PLUGIN, PLUGIN_ALARMID,
|
(PLUGIN, PLUGIN_ALARMID,
|
||||||
obj.base_eid, alarm_uuid))
|
obj.base_eid, alarm_uuid))
|
||||||
else:
|
else:
|
||||||
|
@ -103,9 +103,9 @@ def raise_alarm():
|
||||||
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
||||||
obj.alarmed = True
|
obj.alarmed = True
|
||||||
|
|
||||||
except:
|
except Exception as ex:
|
||||||
collectd.error("%s %s:%s set_fault exception" %
|
collectd.error("%s 'set_fault' exception ; %s:%s ; %s " %
|
||||||
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
(PLUGIN, PLUGIN_ALARMID, obj.base_eid, ex))
|
||||||
|
|
||||||
|
|
||||||
# Clear remote logging server alarm
|
# Clear remote logging server alarm
|
||||||
|
@ -114,13 +114,18 @@ def clear_alarm():
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is True:
|
if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is True:
|
||||||
collectd.info("%s alarm cleared" % PLUGIN)
|
collectd.info("%s %s:%s alarm cleared" %
|
||||||
|
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
||||||
|
else:
|
||||||
|
collectd.info("%s %s:%s alarm clear" %
|
||||||
|
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
||||||
|
|
||||||
obj.alarmed = False
|
obj.alarmed = False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
except:
|
except Exception as ex:
|
||||||
collectd.error("%s %s:%s clear failed ; will retry" %
|
collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
|
||||||
(PLUGIN, PLUGIN_ALARMID, obj.base_eid))
|
(PLUGIN, PLUGIN_ALARMID, obj.base_eid, ex))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue