Collectd+InfluxDb-RMON Replacement(ALL METRICS) P1

This update adds Maintenance support for receiving host degrade assert
and clear messages from collectd.
This update also disables platform memory, cpu and file system resource
monitoring in the maintenance resource monitor process rmon.
These disabled resources are now monitored by collectd and therefore
should not be monitored by rmond any longer.

Change-Id: I13fd033bb1d14f299dcb97fa80296641c958d0a9
Signed-off-by: Jack Ding <jack.ding@windriver.com>
This commit is contained in:
Eric MacDonald 2018-05-14 16:12:16 -04:00 committed by Jack Ding
parent 04055390fa
commit c038b1a9a7
14 changed files with 192 additions and 219 deletions

View File

@ -421,11 +421,6 @@ install -m 700 -p -D %{_buildsubdir}/pmon/scripts/pmon-restart %{buildroot}/%{lo
install -m 700 -p -D %{_buildsubdir}/pmon/scripts/pmon-start %{buildroot}/%{local_sbindir}/pmon-start
install -m 700 -p -D %{_buildsubdir}/pmon/scripts/pmon-stop %{buildroot}/%{local_sbindir}/pmon-stop
# test tools
install -m 755 %{_buildsubdir}/hwmon/scripts/show_hp360 %{buildroot}/%{_sbindir}/show_hp360
install -m 755 %{_buildsubdir}/hwmon/scripts/show_hp380 %{buildroot}/%{_sbindir}/show_hp380
install -m 755 %{_buildsubdir}/hwmon/scripts/show_quanta %{buildroot}/%{_sbindir}/show_quanta
# init script files
install -m 755 -p -D %{_buildsubdir}/scripts/mtcClient %{buildroot}%{_sysconfdir}/init.d/mtcClient
install -m 755 -p -D %{_buildsubdir}/scripts/hbsClient %{buildroot}%{_sysconfdir}/init.d/hbsClient
@ -498,9 +493,6 @@ install -m 755 -d %{buildroot}%{_sysconfdir}/rmonapi.d
install -m 755 -d %{buildroot}%{_sysconfdir}/rmonfiles.d
install -m 755 -d %{buildroot}%{_sysconfdir}/rmon_interfaces.d
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/remotelogging_resource.conf %{buildroot}%{local_etc_rmond}/remotelogging_resource.conf
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/cpu_resource.conf %{buildroot}%{local_etc_rmond}/cpu_resource.conf
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/memory_resource.conf %{buildroot}%{local_etc_rmond}/memory_resource.conf
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/filesystem_resource.conf %{buildroot}%{local_etc_rmond}/filesystem_resource.conf
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/cinder_virtual_resource.conf %{buildroot}%{local_etc_rmond}/cinder_virtual_resource.conf
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/nova_virtual_resource.conf %{buildroot}%{local_etc_rmond}/nova_virtual_resource.conf
install -m 644 -p -D %{_buildsubdir}/rmon/scripts/oam_resource.conf %{buildroot}%{_sysconfdir}/rmon_interfaces.d/oam_resource.conf
@ -676,10 +668,7 @@ install -m 755 -d %{buildroot}/var/run
%{local_etc_logrotated}/rmon.logrotate
%{_unitdir}/rmon.service
%{local_etc_rmond}/filesystem_resource.conf
%{local_etc_rmond}/cpu_resource.conf
%{local_etc_rmond}/remotelogging_resource.conf
%{local_etc_rmond}/memory_resource.conf
%{local_etc_rmond}/cinder_virtual_resource.conf
%{local_etc_rmond}/nova_virtual_resource.conf
@ -713,10 +702,6 @@ install -m 755 -d %{buildroot}/var/run
%{local_etc_logrotated}/hwmon.logrotate
%{ocf_resourced}/platform/hwmon
%{_sbindir}/show_hp380
%{_sbindir}/show_hp360
%{_sbindir}/show_quanta
%{_sysconfdir}/init.d/hwmon
%{local_bindir}/hwmond

View File

@ -255,12 +255,38 @@ const char * get_mtcNodeCommand_str ( int cmd )
}
void print_mtc_message ( string hostname, int direction, mtc_message_type & msg , const char * iface, bool force )
void print_mtc_message ( string hostname,
int direction,
mtc_message_type & msg,
const char * iface,
bool force )
{
/* Handle raw json string messages differently.
* Those messages just have a json string that starts at the header */
if ( msg.hdr[0] == '{' )
{
if ( force )
{
ilog ("%s %s (%s network) - %s\n",
hostname.c_str(),
direction ? "rx <-" : "tx ->" ,
iface,
msg.hdr);
}
else
{
mlog1 ("%s %s (%s network) - %s\n",
hostname.c_str(),
direction ? "rx <-" : "tx ->" ,
iface,
msg.hdr);
}
return ;
}
string str = "-" ;
if ( msg.buf[0] )
str = msg.buf ;
if ( force )
{
ilog ("%s %s %s (%s network) %d.%d %x:%x:%x.%x.%x.%x [%s] %s\n",

View File

@ -92,6 +92,9 @@ void daemon_exit ( void );
#define NODE_HEALTHY (1)
#define NODE_UNHEALTHY (2)
#define AUTO_RECOVERY_FILE_SUFFIX ((const char *)"_ar_count")
#define TMP_DIR_PATH ((const char *)"/etc/mtc/tmp/")
#define HOST_IS_VIRTUAL ((const char *)"/var/run/virtual.host")
/** Configuration Pass/Fail Flag File */
@ -146,10 +149,6 @@ void daemon_exit ( void );
#define BM_DNSMASQ_FILENAME ((const char *)"dnsmasq.bmc_hosts")
/* Added for Centos */
#define CENTOS_RELEASE_FILE ((const char *)"/etc/centos-release")
#define SYSTEMD_SERVICE_FILE_DIR ((const char *)"/usr/lib/systemd/system")
#define THREAD_NAME__IPMITOOL ((const char *)("ipmitool"))
#define IPMITOOL_PATH_AND_FILENAME ((const char *)("/usr/bin/ipmitool"))
@ -970,7 +969,7 @@ string get_configStages_str ( mtc_configStages_enum stage );
#define DEGRADE_MASK_SUBF 0x00000100
#define DEGRADE_MASK_SM 0x00000200
#define DEGRADE_MASK_CONFIG 0x00000400
#define DEGRADE_MASK_RES2 0x00000800
#define DEGRADE_MASK_COLLECTD 0x00000800
#define DEGRADE_MASK_ENABLE 0x00001000
#define DEGRADE_MASK_RES4 0x00002000
#define DEGRADE_MASK_RES5 0x00004000

View File

@ -662,8 +662,8 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->log_throttle = 0 ;
ptr->no_work_log_throttle = 0 ;
/* Clear the degrade control structs */
ptr->degrade_mask = DEGRADE_MASK_NONE ;
ptr->degrade_mask = ptr->degrade_mask_save = DEGRADE_MASK_NONE ;
ptr->degraded_resources_list.clear () ;
ptr->pmond_ready = false ;
ptr->rmond_ready = false ;
@ -4561,16 +4561,6 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
}
hbs_minor_clear ( node_ptr, iface );
/* Set the host available if the degrade mask is now
* cleared and we are degraded */
if ( node_ptr->degrade_mask == 0 )
{
if ( get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )
{
set_availStatus ( hostname, MTC_AVAIL_STATUS__AVAILABLE );
}
}
}
else if ( this->mtcTimer_dor.tid )
{
@ -4602,12 +4592,6 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
node_ptr->degrade_mask |= DEGRADE_MASK_HEARTBEAT_INFRA ;
}
}
/* No point in changing if we are already degraded */
if ( nodeLinkClass::get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE )
{
set_availStatus ( hostname, MTC_AVAIL_STATUS__DEGRADED );
}
}
}
}
@ -4621,7 +4605,7 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface,
wlog ("%s Unknown host\n", hostname.c_str());
return ;
}
/* is this a clear event ? */
if ( clear_event == true )
{
@ -4639,15 +4623,15 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface,
else if ( node_ptr->hbs_minor[iface] != true )
{
mnfa_add_host ( node_ptr, iface );
mnfa_add_host ( node_ptr, iface );
}
}
}
/** Interface to declare that a key service on the
/** Interface to declare that a key service on the
* specified host is up, running and ready */
int nodeLinkClass::declare_service_ready ( string & hostname,
int nodeLinkClass::declare_service_ready ( string & hostname,
unsigned int service )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
@ -4661,18 +4645,11 @@ int nodeLinkClass::declare_service_ready ( string & hostname,
node_ptr->pmond_ready = true ;
plog ("%s got pmond ready event\n", hostname.c_str());
/* A ready event means that pmond pocess has started.
* Any previous history is gone. Cleanup mtce.
/* A ready event means that pmond pocess has started.
* Any previous history is gone. Cleanup mtce.
* If there are still process issues on this host then
* they will be reported again.*/
node_ptr->degrade_mask &= ~DEGRADE_MASK_PMON ;
if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE )
{
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
}
}
return (PASS);
}
else if ( service == MTC_SERVICE_HWMOND )
@ -4719,14 +4696,6 @@ int nodeLinkClass::degrade_pmond_clear ( string & hostname )
if ( node_ptr->degrade_mask )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_PMON ;
if ( !node_ptr->degrade_mask )
{
if ( node_ptr->operState == MTC_OPER_STATE__ENABLED )
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
}
}
}
/* The only detectable inservice failures are process failures */
@ -4735,15 +4704,65 @@ int nodeLinkClass::degrade_pmond_clear ( string & hostname )
return (PASS);
}
/**
 * collectd degrade event handler.
 *
 * Sets or clears the DEGRADE_MASK_COLLECTD bit in the specified host's
 * degrade mask based on the 'state' string ; "assert" or "clear".
 *
 * The 'resource' string is only used in the logs ; the degrade mask
 * carries a single collectd bit that is not tracked per resource, so
 * a "clear" for any resource clears the bit.
 *
 * Returns PASS for a handled "assert" or "clear" request,
 *         FAIL_UNKNOWN_HOSTNAME if the host is not found or
 *         FAIL_OPERATION for any other state string.
 */
int nodeLinkClass::collectd_notify_handler ( string & hostname,
                                             string & resource,
                                             string & state )
{
    nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
    if ( node_ptr == NULL )
    {
        wlog ("%s Unknown Host\n", hostname.c_str());
        return (FAIL_UNKNOWN_HOSTNAME) ;
    }

    /* current state of the collectd degrade bit for this host */
    const bool collectd_degraded =
        (( node_ptr->degrade_mask & DEGRADE_MASK_COLLECTD ) != 0 ) ;

    if ( state == "clear" )
    {
        if ( collectd_degraded )
        {
            /* state change ; drop the collectd degrade reason */
            ilog("%s collectd degrade state change ; assert -> clear (%s)",
                     hostname.c_str(), resource.c_str());
            node_ptr->degrade_mask &= ~DEGRADE_MASK_COLLECTD ;
        }
        else
        {
            /* already clear ; nothing to change */
            mlog3("%s collectd degrade 'clear' request (%s)",
                      hostname.c_str(), resource.c_str());
        }
        return (PASS);
    }

    if ( state == "assert" )
    {
        if ( collectd_degraded )
        {
            /* already asserted ; nothing to change */
            mlog3("%s collectd degrade 'assert' request (%s)",
                      hostname.c_str(), resource.c_str());
        }
        else
        {
            /* state change ; add the collectd degrade reason */
            ilog("%s collectd degrade state change ; clear -> assert (due to %s)",
                     hostname.c_str(), resource.c_str());
            node_ptr->degrade_mask |= DEGRADE_MASK_COLLECTD ;
        }
        return (PASS);
    }

    /* neither "assert" nor "clear" */
    wlog ("%s collectd degrade state unknown (%s)\n",
              hostname.c_str(),
              state.c_str());
    return (FAIL_OPERATION);
}
/** Resource Monitor 'Clear' Event handler.
*
*
* The resource specified will be removed from the
* 'degraded_resources_list' for specified host.
* if there are no other degraded resources or other
* degraded services/reasons against that host then
* this handler will clear the degrade state for the
* specified host all together. */
int nodeLinkClass::degrade_resource_clear ( string & hostname,
int nodeLinkClass::degrade_resource_clear ( string & hostname,
string & resource )
{
/* lr - Log Prefix Rmon */
@ -4788,18 +4807,6 @@ int nodeLinkClass::degrade_resource_clear ( string & hostname,
if ( node_ptr->degraded_resources_list.empty() )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_RESMON ; ;
if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE )
{
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
}
}
else
{
wlog ("%s Remains Degraded - Reason Mask:0x%08x\n",
hostname.c_str(), node_ptr->degrade_mask );
}
}
else
{
@ -4874,30 +4881,6 @@ int nodeLinkClass::node_degrade_control ( string & hostname, int state, string s
/* clear the mask regardless of host state */
node_ptr->degrade_mask &= ~service_flag ;
/* only applies if host is unlocked-enabled-degraded and
* there are no other degrade flags in the degrade mask */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))
{
if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE )
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
}
else
{
/* TODO: convert lask to a sring or services and print that string */
wlog ("%s remains degraded - degrade mask:0x%08x\n",
hostname.c_str(),
node_ptr->degrade_mask );
}
}
else
{
dlog ("%s unexpected degrade clear for '%s' service\n",
hostname.c_str(), service.c_str() );
}
rc = PASS ;
break ;
}
@ -4910,13 +4893,6 @@ int nodeLinkClass::node_degrade_control ( string & hostname, int state, string s
wlog ("%s degrade 'assert' from '%s'\n", hostname.c_str(), service.c_str() );
node_ptr->degrade_mask |= service_flag ;
}
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ))
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
}
rc = PASS ;
break ;
}
@ -5232,10 +5208,6 @@ int nodeLinkClass::degrade_process_raise ( string & hostname,
{
node_ptr->degrade_mask |= DEGRADE_MASK_PMON ;
wlog ("%s is degraded due to '%s' process failure\n", hostname.c_str(), process.c_str());
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
}
}
}
return (PASS);
@ -5412,11 +5384,6 @@ int nodeLinkClass::degrade_resource_raise ( string & hostname,
{
dlog ("%s '%s' Degraded (again)\n", lr.c_str(), resource.c_str());
}
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
}
}
return (PASS);
}
@ -7039,9 +7006,6 @@ struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid )
*
*****************************************************************************/
#define TMP_DIR_PATH ((const char *)"/etc/mtc/tmp/")
#define AUTO_RECOVERY_FILE_SUFFIX ((const char *)"_ar_count")
void autorecovery_clear ( string hostname )
{
string ar_file = TMP_DIR_PATH + hostname + AUTO_RECOVERY_FILE_SUFFIX ;

View File

@ -585,6 +585,7 @@ private:
/* Bit mask of degrade reasons */
unsigned int degrade_mask ;
unsigned int degrade_mask_save ;
/** Process Monitor Daemon Flag Missing count */
int pmon_missing_count ;
@ -785,6 +786,7 @@ private:
int insv_test_handler ( struct nodeLinkClass::node * node_ptr );
int stress_handler ( struct nodeLinkClass::node * node_ptr );
int bm_handler ( struct nodeLinkClass::node * node_ptr );
int degrade_handler ( struct nodeLinkClass::node * node_ptr );
int uptime_handler ( void );
int host_services_handler ( struct nodeLinkClass::node * node_ptr );
@ -1731,6 +1733,11 @@ public:
/** Calculates and returns the mnfa threshold based on enabled hosts */
int mnfa_calculate_threshold ( string hostname );
/* collectd event handler */
int collectd_notify_handler ( string & hostname,
string & resource,
string & state );
/*****************************************
** Process Monitor Event Utilities API **
*****************************************/

View File

@ -68,9 +68,9 @@ string daemon_read_file ( const char * filename );
void daemon_logfile_close ( void );
void daemon_logfile_open ( void );
void daemon_log ( const char * filename , const char * str );
void daemon_log_value ( const char * filename , int val );
void daemon_log_value ( const char * filename , const char * str, int val );
int daemon_log ( const char * filename , const char * str );
int daemon_log_value ( const char * filename , int val );
int daemon_log_value ( const char * filename , const char * str, int val );
/* reads the first line of a file and if it contains a string
* that represents an integer value then return it */

View File

@ -103,7 +103,7 @@ void daemon_healthcheck ( const char * sig )
#define BUFFER 1024
void daemon_log_value ( const char * filename , const char * str, int val )
int daemon_log_value ( const char * filename , const char * str, int val )
{
FILE * file_stream = fopen (filename, "a" ) ;
if ( file_stream != NULL )
@ -111,10 +111,12 @@ void daemon_log_value ( const char * filename , const char * str, int val )
fprintf ( file_stream,"%s %d\n", str, val );
fflush (file_stream);
fclose (file_stream);
return (PASS);
}
return (FAIL_FILE_OPEN);
}
void daemon_log_value ( const char * filename , int val )
int daemon_log_value ( const char * filename , int val )
{
FILE * file_stream = fopen (filename, "w" ) ;
if ( file_stream != NULL )
@ -122,10 +124,12 @@ void daemon_log_value ( const char * filename , int val )
fprintf ( file_stream,"%d\n", val );
fflush (file_stream);
fclose (file_stream);
return (PASS);
}
return (FAIL_FILE_OPEN);
}
void daemon_log ( const char * filename , const char * str )
int daemon_log ( const char * filename , const char * str )
{
FILE * file_stream = fopen (filename, "a" ) ;
if ( file_stream != NULL )
@ -133,7 +137,9 @@ void daemon_log ( const char * filename , const char * str )
fprintf ( file_stream,"%s\n", str );
fflush (file_stream);
fclose (file_stream);
return (PASS);
}
return (FAIL_FILE_OPEN);
}
/* reads the first line of a file and if it contains a string

View File

@ -191,8 +191,50 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(iface), false );
if ( msg.hdr[0] == '{' )
{
int rc1 ;
string service ;
mlog1 ("%s\n", &msg.hdr[0] );
rc1 = jsonUtil_get_key_val(&msg.hdr[0],"service", service );
if ( rc1 == PASS )
{
if ( service == "collectd_notifier" )
{
int rc1,rc2,rc3 ;
string hostname,resource,state ;
rc1 = jsonUtil_get_key_val(&msg.hdr[0],"hostname", hostname );
rc2 = jsonUtil_get_key_val(&msg.hdr[0],"resource", resource );
rc3 = jsonUtil_get_key_val(&msg.hdr[0],"degrade", state );
if ( rc1|rc2|rc3 )
{
elog ("failed to parse '%s' message\n", service.c_str());
wlog ("... %s\n", &msg.hdr[0] );
}
else
{
obj_ptr->collectd_notify_handler ( hostname,
resource,
state );
}
}
/* future service requests */
else
{
wlog ("Unexpected service request: '%s'\n", service.c_str());
}
}
else
{
wlog("Unexpected json message: %s\n", &msg.hdr[0] );
}
}
/* Check for response messages */
if ( strstr ( &msg.hdr[0], get_cmd_rsp_msg_header() ) )
else if ( strstr ( &msg.hdr[0], get_cmd_rsp_msg_header() ) )
{
obj_ptr->set_cmd_resp ( hostname , msg ) ;
}

View File

@ -74,6 +74,9 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
/* manage the host connected state and board management alarms */
nodeLinkClass::bm_handler ( node_ptr );
/* manage host's degrade state */
nodeLinkClass::degrade_handler ( node_ptr );
/*
* Always run the offline handler
*

View File

@ -5599,15 +5599,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
/* handle coming out of the ADD in a degraded state */
if (( node_ptr->degrade_mask != 0 ) &&
(( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )))
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
}
node_ptr->mtcAlive_gate = false ;
node_ptr->addStage = MTC_ADD__DONE ;
break;
@ -6111,22 +6102,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
{
alarm_compute_clear ( node_ptr, false );
}
/************************************************************
* Manage host degrade based on degrade mask *
***********************************************************/
if (( node_ptr->degrade_mask == 0 ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
}
/* expected degrade audit */
else if (( node_ptr->degrade_mask ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ))
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
}
}
break ;
}
@ -6461,12 +6436,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->degrade_mask |= DEGRADE_MASK_SM ;
ilog ("%s sm degrade\n", node_ptr->hostname.c_str());
/* degrade the host if not already degraded */
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
}
}
/* Manage de-asserting degrade due to Software Management */
@ -6477,16 +6446,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->degrade_mask &= ~DEGRADE_MASK_SM ;
ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
/* if the degrade mask is now clear then consider clearing the degrade state */
if ( node_ptr->degrade_mask == 0 )
{
/* ... but only if we are degraded */
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
}
}
}
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
@ -6502,10 +6461,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
{
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
{
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
}
/* threshold is reached so raise the config alarm if it is not already raised */
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
@ -6554,6 +6509,30 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
return (PASS);
}
/************************************************************
 * Degrade Handler                                          *
 *                                                          *
 * Aligns the host's availability status with its degrade   *
 * mask ; but only while the host is unlocked-enabled.      *
 *                                                          *
 *  - degraded  with an empty mask     -> available         *
 *  - available with a non-empty mask  -> degraded          *
 *                                                          *
 * Always returns PASS.                                     *
 ***********************************************************/
int nodeLinkClass::degrade_handler ( struct nodeLinkClass::node * node_ptr )
{
    /* the degrade availability state does not apply
     * unless the host is unlocked-enabled */
    if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
        ( node_ptr->operState  != MTC_OPER_STATE__ENABLED ))
    {
        return (PASS);
    }

    if ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
    {
        /* no degrade reasons left ; recover to available */
        if ( node_ptr->degrade_mask == DEGRADE_MASK_NONE )
        {
            availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
        }
    }
    else if ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE )
    {
        /* at least one degrade reason is set ; go degraded */
        if ( node_ptr->degrade_mask )
        {
            availStatusChange ( node_ptr, MTC_AVAIL_STATUS__DEGRADED );
        }
    }
    return (PASS);
}
int nodeLinkClass::cfg_handler ( struct nodeLinkClass::node * node_ptr )
{

View File

@ -1127,6 +1127,7 @@ void read_fs_file ( vector<string> & dynamic_resources )
*****************************************************************************/
void add_dynamic_fs_resource ( bool send_response )
{
#ifdef WANT_FS_MONITORING
char resource[50];
char temp_resource[50];
char device [50];
@ -1206,10 +1207,14 @@ void add_dynamic_fs_resource ( bool send_response )
}
}
}
#endif
if (send_response)
{
#ifdef WANT_FS_MONITORING
ilog ("sending response to dynamic FS add, to the rmon client\n");
#else
ilog("dynamic filesystem monitoring moved to collectd\n");
#endif
/* let the rmon client know that we are done with the file */
rmon_resource_response(_rmon_ctrl_ptr->clients);
}
@ -4650,6 +4655,8 @@ void rmon_service (rmon_ctrl_type * ctrl_ptr)
ilog ("registered clients: %d\n", _rmon_ctrl_ptr->clients);
#ifdef WANT_FS_MONITORING
/* Initialize the resource specific configuration */
for (int j=0; j<_rmon_ctrl_ptr->resources; j++)
{
@ -4669,6 +4676,9 @@ void rmon_service (rmon_ctrl_type * ctrl_ptr)
/* add any dynamic resources from before */
add_dynamic_fs_resource(false);
#else
ilog("static filesystem monitoring moved to collectd\n");
#endif
/* Clear any stale dynamic alarms that can be caused by dynamic resources. */
/* An alarm become stale for example if it was raised against a local volumn group (lvg) and */

View File

@ -1,16 +0,0 @@
[resource]
resource = Platform CPU Usage
debounce = 20 ; number of seconds to wait before degrade clear
severity = critical ; minor, major, critical
minor_threshold = 80 ; minor cpu utilization threshold percentage
major_threshold = 90 ; major cpu utilization threshold percentage
critical_threshold = 95 ; critical cpu utilization threshold percentage (use 101 if unused)
minor_threshold_abs_node0 = 512 ; absolute minor threshold value MiB processor node 0
major_threshold_abs_node0 = 307 ; absolute major threshold value MiB processor node 0
critical_threshold_abs_node0 = 102 ; absolute critical threshold value MiB processor node 0
minor_threshold_abs_node1 = 0 ; absolute minor threshold value MiB processor node 1
major_threshold_abs_node1 = 0 ; absolute major threshold value MiB processor node 1
critical_threshold_abs_node1 = 0 ; absolute critical threshold value MiB processor node 1
num_tries = 2 ; number of tries before the alarm is raised
alarm_on = 1 ; 1 for alarm on, 0 for alarm off
percent = 1 ; Always use 1 for this resource (thresholds by percentage)

View File

@ -1,16 +0,0 @@
[resource]
resource = Platform Filesystem Usage
debounce = 20 ; number of seconds to wait before degrade clear
severity = critical ; minor, major, critical
minor_threshold = 70 ; minor filesystem utilization threshold percentage
major_threshold = 80 ; major filesystem utilization threshold percentage
critical_threshold = 90 ; critical filesystem utilization threshold percentage (use 101 if unused)
minor_threshold_abs_node0 = 512 ; absolute minor threshold value MiB processor node 0
major_threshold_abs_node0 = 307 ; absolute major threshold value MiB processor node 0
critical_threshold_abs_node0 = 102 ; absolute critical threshold value MiB processor node 0 (use 0 if unused)
minor_threshold_abs_node1 = 0 ; absolute minor threshold value MiB processor node 1
major_threshold_abs_node1 = 0 ; absolute major threshold value MiB processor node 1
critical_threshold_abs_node1 = 0 ; absolute critical threshold value MiB processor node 1
num_tries = 2 ; number of tries before the alarm is raised
alarm_on = 1 ; 1 for alarm on, 0 for alarm off
percent = 1 ; 1 for percentage used, 0 for absolute value (file system available in MiB) (default is 1)

View File

@ -1,16 +0,0 @@
[resource]
resource = Platform Memory Usage
debounce = 20 ; number of seconds to wait before degrade clear
severity = critical ; minor, major, critical
minor_threshold = 70 ; minor memory utilization threshold percentage
major_threshold = 80 ; major memory utilization threshold percentage
critical_threshold = 90 ; critical memory utilization threshold percentage (use 101 if unused)
minor_threshold_abs_node0 = 512 ; absolute minor threshold value MiB processor node 0
major_threshold_abs_node0 = 307 ; absolute major threshold value MiB processor node 0
critical_threshold_abs_node0 = 102 ; absolute critical threshold value MiB processor node 0 (use 0 if unused)
minor_threshold_abs_node1 = 0 ; absolute minor threshold value MiB processor node 1
major_threshold_abs_node1 = 0 ; absolute major threshold value MiB processor node 1
critical_threshold_abs_node1 = 0 ; absolute critical threshold value MiB processor node 1
num_tries = 2 ; number of tries before the alarm is raised
alarm_on = 1 ; 1 for alarm on, 0 for alarm off
percent = 1 ; 1 for percentage used, 0 for absolute value (memory available in MiB) (default is 1)