/*
 * Copyright (c) 2015-2017 Wind River Systems, Inc.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 *
 *
 * @file
 * Wind River Titanium Cloud Hardware Monitor "Sensor" Utilities
 */

#include "daemon_ini.h"   /* for ... parse_ini and MATCH          */
#include "nodeBase.h"     /* for ... mtce common definitions      */
#include "jsonUtil.h"     /* for ... json utilities               */
#include "nodeUtil.h"     /* for ... mtce common utilities        */
#include "hwmonUtil.h"    /* for ... get_severity                 */
#include "hwmonClass.h"   /* for ... service class definition     */
#include "hwmonHttp.h"    /* for ... hwmonHttp_load_sensors       */
#include "hwmonSensor.h"  /* for ... this module header           */
#include "hwmonGroup.h"   /* for ... ipmi_get_grouptype           */
#include "hwmonAlarm.h"   /* for ... hwmonAlarm                   */

/* Separator used by the comma delimited lists read from the profile file */
#define DELIMITER ((const char)',')

/****************************************************************************
 *
 * Name   : hwmonSensor_print
 *
 * Purpose: Dump the content of one sensor to syslog at LOG_INFO.
 *
 * Params : hostname   - only used to label the log emitted when
 *                       sensor_ptr is NULL
 *          sensor_ptr - the sensor to print ; NULL is logged and ignored
 *
 * Note   : The threshold and unit lines are only printed for sensors
 *          whose datatype is "analog".
 *
 *****************************************************************************/
void hwmonSensor_print ( string & hostname, sensor_type * sensor_ptr )
{
    if ( sensor_ptr == NULL )
    {
        slog ("%s cannot print a NULL sensor\n", hostname.c_str() );
        return ;
    }

    const char bar [] = {"+--------------------------------------------------\n" } ;
    const bool analog = ( sensor_ptr->datatype.compare("analog") == 0 ) ;

    syslog ( LOG_INFO, "%s", bar);
    syslog ( LOG_INFO, "| sensor info : %s '%s' sensor\n",
                       sensor_ptr->hostname.c_str(),
                       sensor_ptr->sensorname.c_str());
    syslog ( LOG_INFO, "| uuid : %s\n",       sensor_ptr->uuid.c_str());
    syslog ( LOG_INFO, "| group uuid : %s\n", sensor_ptr->group_uuid.c_str());
    syslog ( LOG_INFO, "| sensortype : %s\n", sensor_ptr->sensortype.c_str());
    syslog ( LOG_INFO, "| datatype : %s\n",   sensor_ptr->datatype.c_str());
    syslog ( LOG_INFO, "| minor : %s\n",      sensor_ptr->actions_minor.c_str());
    syslog ( LOG_INFO, "| major : %s\n",      sensor_ptr->actions_major.c_str());
    syslog ( LOG_INFO, "| critical : %s\n",   sensor_ptr->actions_critl.c_str());
    syslog ( LOG_INFO, "| state:status : %s-%s\n",
                       sensor_ptr->state.c_str(),
                       sensor_ptr->status.c_str());
    syslog ( LOG_INFO, "| command : %s\n",    sensor_ptr->path.c_str());
    syslog ( LOG_INFO, "| algorithm : %s\n",  sensor_ptr->algorithm.c_str());
    syslog ( LOG_INFO, "| suppress : %s\n",   sensor_ptr->suppress ? "True" : "False" );

    if ( analog == false )
        return ;

    /* analog only: thresholds and unit description */
    // syslog ( LOG_INFO, "%s",bar);
    syslog ( LOG_INFO, "| minor thld: %5.3f <-> %5.3f \n",
                       sensor_ptr->t_minor_lower,
                       sensor_ptr->t_minor_upper );
    syslog ( LOG_INFO, "| major thld: %5.3f <-> %5.3f \n",
                       sensor_ptr->t_major_lower,
                       sensor_ptr->t_major_upper );
    syslog ( LOG_INFO, "| critical thld: %5.3f <-> %5.3f \n",
                       sensor_ptr->t_critical_lower,
                       sensor_ptr->t_critical_upper );
    syslog ( LOG_INFO, "| unit info : [base:%s] [rate:%s] [modifier:%s]\n",
                       sensor_ptr->unit_base.c_str(),
                       sensor_ptr->unit_rate.c_str(),
                       sensor_ptr->unit_modifier.c_str());
}

/****************************************************************************
 *
 * Name   : hwmonGroup_print
 *
 * Purpose: Dump the content of one sensor group to syslog at LOG_INFO.
 *
 * Params : hostname  - host label used in the group info line and in
 *                      the log emitted when group_ptr is NULL
 *          group_ptr - the group to print ; NULL is logged and ignored
 *
 * Note   : The sensor label list is only printed when not empty, NULL
 *          sensor slots are skipped, and the threshold and unit lines
 *          are only printed for groups whose datatype is "analog".
 *
 *****************************************************************************/
void hwmonGroup_print ( string & hostname, struct sensor_group_type * group_ptr )
{
    if ( group_ptr == NULL )
    {
        slog ("%s cannot print a NULL group\n", hostname.c_str());
        return ;
    }

    const char bar [] = {"+--------------------------------------------------------\n" } ;

    syslog ( LOG_INFO, "%s", bar);
    syslog ( LOG_INFO, "| group info : %s '%s' group\n",
                       hostname.c_str(),
                       group_ptr->group_name.c_str());
    syslog ( LOG_INFO, "| group uuid : %s\n", group_ptr->group_uuid.c_str());
    syslog ( LOG_INFO, "| sensortype : %s\n", group_ptr->sensortype.c_str());
    syslog ( LOG_INFO, "| datatype : %s\n",   group_ptr->datatype.c_str());
    syslog ( LOG_INFO, "| group minor choices : %s\n",   group_ptr->actions_minor_choices.c_str());
    syslog ( LOG_INFO, "| group minor actions : %s\n",   group_ptr->actions_minor_group.c_str());
    syslog ( LOG_INFO, "| group major choices : %s\n",   group_ptr->actions_major_choices.c_str());
    syslog ( LOG_INFO, "| group major actions : %s\n",   group_ptr->actions_major_group.c_str());
    syslog ( LOG_INFO, "| group critical choices: %s\n", group_ptr->actions_critical_choices.c_str());
    syslog ( LOG_INFO, "| group critical actions: %s\n", group_ptr->actions_critl_group.c_str());
    syslog ( LOG_INFO, "| group state : %s\n", group_ptr->group_state.c_str());
    syslog ( LOG_INFO, "| algorithm : %s\n",   group_ptr->algorithm.c_str());
    syslog ( LOG_INFO, "| group audit period : %d secs\n", group_ptr->group_interval );
    syslog ( LOG_INFO, "| group suppress : %s\n", group_ptr->suppress ? "True" : "False" );
    syslog ( LOG_INFO, "| group sensor read cmd : %s\n", group_ptr->path.c_str());
    syslog ( LOG_INFO, "| group sensors (count) : %d\n", group_ptr->sensors);

    if ( group_ptr->sensor_labels.empty() == false )
    {
        syslog ( LOG_INFO, "| group sensor labels : %s\n", group_ptr->sensor_labels.c_str());
    }

    /* one line per populated sensor slot */
    for ( int index = 0 ; index < group_ptr->sensors ; ++index )
    {
        if ( group_ptr->sensor_ptr[index] == NULL )
            continue ;

        syslog ( LOG_INFO, "| group sensor %02d : %s\n",
                           index,
                           group_ptr->sensor_ptr[index]->sensorname.c_str());
    }

    if ( group_ptr->datatype.compare("analog") != 0 )
        return ;

    /* analog only: thresholds and unit description */
    // syslog ( LOG_INFO, "%s",bar);
    syslog ( LOG_INFO, "| minor thld: %5.3f <-> %5.3f \n",
                       group_ptr->t_minor_lower_group,
                       group_ptr->t_minor_upper_group );
    syslog ( LOG_INFO, "| major thld: %5.3f <-> %5.3f \n",
                       group_ptr->t_major_lower_group,
                       group_ptr->t_major_upper_group );
    syslog ( LOG_INFO, "| critical thld: %5.3f <-> %5.3f \n",
                       group_ptr->t_critical_lower_group,
                       group_ptr->t_critical_upper_group );
    syslog ( LOG_INFO, "| unit info : [base:%s] [rate:%s] [modifier:%s]\n",
                       group_ptr->unit_base_group.c_str(),
                       group_ptr->unit_rate_group.c_str(),
                       group_ptr->unit_modifier_group.c_str());
}

/****************************************************************************
 *
 * Name   : hwmonSensor_init
 *
 * Purpose: Initialize a sensor_type struct to default values
 *
*****************************************************************************/
/* Params : hostname   - stored in the sensor and used to label the log
 *                       emitted when sensor_ptr is NULL
 *          sensor_ptr - the sensor to reset ; NULL is logged and ignored
 */
void hwmonSensor_init ( string & hostname , sensor_type * sensor_ptr )
{
    if ( sensor_ptr == NULL )
    {
        slog ("%s cannot init a NULL sensor\n", hostname.c_str());
        return ;
    }

    /* identity */
    sensor_ptr->hostname = hostname ;
    sensor_ptr->bmc.clear();
    sensor_ptr->uuid.clear();
    sensor_ptr->host_uuid.clear();
    sensor_ptr->group_uuid.clear();
    sensor_ptr->sensorname.clear();
    sensor_ptr->sensortype.clear();
    sensor_ptr->datatype = DISCRETE ; /* should really be ANALOG */

    /* how the sensor is read */
    sensor_ptr->script.clear();
    sensor_ptr->path.clear() ;
    sensor_ptr->entity_path.clear();
    sensor_ptr->prot = BMC_PROTOCOL__IPMITOOL ;
    sensor_ptr->kind = SENSOR_KIND__NONE ;
    sensor_ptr->unit = SENSOR_UNIT__NONE ;
    sensor_ptr->unit_base.clear();
    sensor_ptr->unit_rate.clear();
    sensor_ptr->unit_modifier.clear();

    /* monitoring control */
    sensor_ptr->suppress  = false ;
    sensor_ptr->algorithm = "debounce-1.v1" ;
    sensor_ptr->debounce_count = 0 ;
    sensor_ptr->want_debounce_log_if_ok = false ;
    sensor_ptr->state  = "disabled" ;
    sensor_ptr->status = "offline" ;

    /* default action for each severity level */
    sensor_ptr->actions_minor = HWMON_ACTION_IGNORE ;
    sensor_ptr->actions_major = HWMON_ACTION_LOG ;
    sensor_ptr->actions_critl = HWMON_ACTION_ALARM ;

    /* default thresholds ; lower and upper start out equal */
    sensor_ptr->t_minor_lower    =  1.000 ;
    sensor_ptr->t_minor_upper    =  1.000 ;
    sensor_ptr->t_major_lower    =  5.000 ;
    sensor_ptr->t_major_upper    =  5.000 ;
    sensor_ptr->t_critical_lower = 10.000 ;
    sensor_ptr->t_critical_upper = 10.000 ;

    /* PATCHBACK - should patchback to REL3 and earlier */
    sensor_ptr->sample_severity = HWMON_SEVERITY_GOOD ;
    sensor_ptr->severity        = HWMON_SEVERITY_GOOD ;

    sensor_ptr->sample_status_last = "ok" ;
    sensor_ptr->sample_status      = "ok" ;

    /* run-time bookkeeping */
    sensor_ptr->updated  = false ;
    sensor_ptr->found    = false ;
    sensor_ptr->degraded = false ;
    sensor_ptr->alarmed  = false ;
    sensor_ptr->not_found_log_throttle = 0 ;
    sensor_ptr->not_updated_status_change_count = 0 ;

    clear_logged_state  ( sensor_ptr );
    clear_ignored_state ( sensor_ptr );
    clear_alarmed_state ( sensor_ptr );
}
/**************************************************************************** * * Name : hwmonGroup_init * * Purpose: Initialize a sensor_group_type struct to default values * *****************************************************************************/ void hwmonGroup_init ( string & hostname , struct sensor_group_type * group_ptr ) { if ( group_ptr ) { hwmonHostClass * obj_ptr = get_hwmonHostClass_ptr(); group_ptr->hostname = hostname ; group_ptr->group_name.clear(); group_ptr->host_uuid.clear(); group_ptr->group_uuid.clear(); group_ptr->sensortype.clear(); group_ptr->datatype = DISCRETE ; /* should really be ANALOG */ group_ptr->status.clear(); group_ptr->unit_base_group.clear(); group_ptr->unit_rate_group.clear(); group_ptr->unit_modifier_group.clear(); group_ptr->path = "na" ; group_ptr->sensor_labels.clear(); group_ptr->sensor_read_index = 0 ; group_ptr->group_interval = HWMON_DEFAULT_AUDIT_INTERVAL ; /* Number of sensors in this group followed by an * array of those sensor pointers * All this is initied here */ group_ptr->sensors = 0 ; for ( int i = 0 ; i < MAX_HOST_SENSORS ; i++ ) group_ptr->sensor_ptr[i] = NULL ; group_ptr->actions_critical_choices = HWMON_ACTION_IGNORE ; group_ptr->actions_critical_choices.append(","); group_ptr->actions_critical_choices.append(HWMON_ACTION_LOG); group_ptr->actions_critical_choices.append(","); group_ptr->actions_critical_choices.append(HWMON_ACTION_ALARM); /* Don't support reset and power cycle in AIO simplex mode */ if ( obj_ptr->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) { group_ptr->actions_critical_choices.append(","); group_ptr->actions_critical_choices.append(HWMON_ACTION_RESET); group_ptr->actions_critical_choices.append(","); group_ptr->actions_critical_choices.append(HWMON_ACTION_POWERCYCLE); } group_ptr->actions_major_choices = HWMON_ACTION_IGNORE ; group_ptr->actions_major_choices.append(","); group_ptr->actions_major_choices.append(HWMON_ACTION_LOG); group_ptr->actions_major_choices.append(","); 
group_ptr->actions_major_choices.append(HWMON_ACTION_ALARM); group_ptr->actions_minor_choices = HWMON_ACTION_IGNORE ; group_ptr->actions_minor_choices.append(","); group_ptr->actions_minor_choices.append(HWMON_ACTION_LOG); group_ptr->actions_minor_choices.append(","); group_ptr->actions_minor_choices.append(HWMON_ACTION_ALARM); group_ptr->suppress = false ; group_ptr->algorithm = "debounce-1.v1" ; group_ptr->group_state = "disabled" ; group_ptr->active = false ; group_ptr->timeout = false ; group_ptr->failed = false ; group_ptr->alarmed = false ; group_ptr->actions_minor_group = HWMON_ACTION_IGNORE ; group_ptr->actions_major_group = HWMON_ACTION_LOG ; group_ptr->actions_critl_group = HWMON_ACTION_ALARM ; group_ptr->t_minor_lower_group = 1.000 ; group_ptr->t_major_lower_group = 5.000 ; group_ptr->t_critical_lower_group = 10.000 ; group_ptr->t_minor_upper_group = 1.000 ; group_ptr->t_major_upper_group = 5.000 ; group_ptr->t_critical_upper_group = 10.000 ; } else { slog ("%s cannot print a NULL group\n", hostname.c_str()); } } /* ************************************************************************* * * Name : load_profile_groups * * Description: Load all the sensor groups from the profile file. * * Scope : private hwmonHostClass * * Parameters : host_ptr , a pointer to a group sensor array with * up to MAX_HOST_GROUPS elements * * Returns : The number of groups that were loaded * * *************************************************************************/ int hwmonHostClass::load_profile_groups ( struct hwmonHostClass::hwmon_host * host_ptr, struct sensor_group_type * group_array_ptr, int max , bool & error ) { int groups_found = 0 ; error = false ; if ( ( max <= MAX_HOST_GROUPS ) && ( group_array_ptr ) ) { int rc ; string sensor_group_types = "" ; std::list sensor_group_types_list ; std::list::iterator types_iter_ptr ; /* get the top level group types from the profile file * in the SERVER:group_types heading , i.e. 
TEMPERATURE */ rc = ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, "SERVER", "group_types", sensor_group_types , false ); if ( rc ) /* handle error case */ { elog ("%s failed to find '[SERVER] -> 'group_types' label in %s file\n", host_ptr->hostname.c_str(), QUANTA_SENSOR_PROFILE_FILE); return rc ; } /* Start with a fresh group type list and load the detected group_types into it */ sensor_group_types_list.clear(); rc = get_delimited_list ( sensor_group_types, DELIMITER , sensor_group_types_list, true ) ; /* Note: badly parsed group_types , sensors are skipped over with error messages logged * * TODO: keep track of an error and raise the SENSORCFG Alarm * **/ /* loop over each 'group_type' looking for the 'groups' */ for ( types_iter_ptr = sensor_group_types_list.begin(); types_iter_ptr != sensor_group_types_list.end() ; ++types_iter_ptr ) { string sensor_groups = "" ; std::list sensor_groups_list ; daemon_signal_hdlr (); dlog ("%s [%s]\n", host_ptr->hostname.c_str(), types_iter_ptr->c_str()); /* get the 'groups' within this 'group type' * in the [[SERVER]:group_type] -> 'groups' heading, i.e. TEMPERATURE1 */ rc = ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, *types_iter_ptr, "groups", sensor_groups, false ); if ( rc ) { elog ("%s '%s' group type parse error ... skipping (%d)\n", host_ptr->hostname.c_str(), types_iter_ptr->c_str(), rc ); error = true ; continue ; } /* get the list of groups from the []:groups label */ sensor_groups_list.clear(); rc = get_delimited_list ( sensor_groups, DELIMITER , sensor_groups_list, true ) ; if ( rc ) { elog ("%s '%s' failed to get group type list ... 
skipping (%d)\n", host_ptr->hostname.c_str(), types_iter_ptr->c_str(), rc); elog ("%s '%s' error string: %s\n", host_ptr->hostname.c_str(), types_iter_ptr->c_str(), sensor_groups.c_str()); error = true ; continue ; } /****************************************************************** * Look for the 'groups' *****************************************************************/ std::list::iterator groups_iter_ptr ; dlog2 ("%s groups list: %s\n", host_ptr->hostname.c_str(), sensor_groups.c_str()); for ( groups_iter_ptr = sensor_groups_list.begin(); groups_iter_ptr != sensor_groups_list.end() ; ++groups_iter_ptr ) { /* Start from a fresh default */ hwmonGroup_init ( host_ptr->hostname, group_array_ptr ); daemon_signal_hdlr (); /* Get the group name for each group */ group_array_ptr->group_name.clear() ; rc = ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, *groups_iter_ptr, "group", group_array_ptr->group_name, false ); if ( rc ) { elog ("%s '%s' group parse error ... skipping (%d)\n", host_ptr->hostname.c_str(), groups_iter_ptr->c_str(), rc ); error = true ; continue ; } /************************************************************************* * Read the sensor group attributes *************************************************************************/ /* sensortype */ rc = ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, *groups_iter_ptr, "sensortype", group_array_ptr->sensortype, false ); if ( rc ) { elog ("%s 'sensortype' for '%s' sensor group is unknown\n", host_ptr->hostname.c_str(), group_array_ptr->group_name.c_str()); error = true ; continue ; } /* datatype */ rc = ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, *groups_iter_ptr, "datatype", group_array_ptr->datatype, false ); if ( rc ) { elog ("%s 'datatype' for '%s' sensor group is unknown\n", host_ptr->hostname.c_str(), group_array_ptr->group_name.c_str()); error = true ; continue ; } /* interval */ string interval ; rc = ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, *groups_iter_ptr, "interval", 
interval , false ); if ( rc ) { elog ("%s 'interval' for '%s' sensor group is unknown\n", host_ptr->hostname.c_str(), group_array_ptr->group_name.c_str()); error = true ; continue ; } group_array_ptr->group_interval = atoi(interval.data()) ; /* seconds */ /* group read command (cmd) * This may be over ridden by a sensor specific command * TODO: add support for multiple commands **/ rc = ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, *groups_iter_ptr, "cmd", group_array_ptr->path, false ); if ( rc ) { elog ("%s 'sensor read command' for '%s' sensor group is unknown ; skipping group\n", host_ptr->hostname.c_str(), group_array_ptr->group_name.c_str()); error = true ; continue ; } /************************************************************************** * Get the list of sensors for each group from the profile file * * SERVER:group_types * :groups * :sensors <------- **/ rc = ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, *groups_iter_ptr, "sensors", group_array_ptr->sensor_labels, false ); if ( rc ) { elog ("%s failed to read 'sensor' list for '%s' sensor group\n", host_ptr->hostname.c_str(), group_array_ptr->group_name.c_str()); error = true ; continue ; } ilog ("%s '%s' group loaded from profile (%d) (cmd:%s)\n", host_ptr->hostname.c_str(), group_array_ptr->group_name.c_str(), groups_found, group_array_ptr->path.c_str()); group_array_ptr++ ; groups_found++ ; } } /* types_iter_ptr for loop end */ } return (groups_found); } /* ************************************************************************* * * Name : load_profile_sensors * * Description: Load all the sensors for each group. 
* * Scope : private hwmonHostClass * * Parameters : host_ptr , a pointer to a sensor array with * up to MAX_HOST_SENSORS elements * * Returns : The number of sensors that were loaded * * *************************************************************************/ int hwmonHostClass::load_profile_sensors ( struct hwmonHostClass::hwmon_host * host_ptr, sensor_type * sensor_array_ptr, int max, bool & error ) { int sensors_found = 0 ; if ( ( max <= MAX_HOST_SENSORS ) && ( sensor_array_ptr ) ) { std::list sensor_sensors_list ; std::list::iterator sensors_iter_ptr ; string sensor = "" ; string sensor_read_cmd = "" ; for ( int g = 0 ; g < host_ptr->groups ; ++g ) { sensor_sensors_list.clear(); int rc = get_delimited_list ( host_ptr->group[g].sensor_labels, DELIMITER , sensor_sensors_list, false ); if ( rc ) { elog ("%s '%s' failed to get sensor list ... skipping\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str()); elog ("%s error string: %s\n", host_ptr->hostname.c_str(), host_ptr->group[g].sensor_labels.c_str()); error = true ; continue ; } blog ("%s '%s' has %ld sensors (%s)\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str(), sensor_sensors_list.size(), host_ptr->group[g].sensor_labels.c_str()); for ( sensors_iter_ptr = sensor_sensors_list.begin(); sensors_iter_ptr != sensor_sensors_list.end() ; ++sensors_iter_ptr ) { sensor.clear(); dlog1 ("%s '%s' sensor group label: %s\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str(), sensors_iter_ptr->c_str()); rc = ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, *sensors_iter_ptr, "name", sensor, false ); if ( rc ) { elog ("%s %s '%s' sensor label 'name' parse error ... 
skipping\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str(), sensors_iter_ptr->c_str()); error = true ; continue ; } else { hwmonSensor_init ( host_ptr->hostname, sensor_array_ptr ); dlog ("%s '%s' group has '%s' sensor\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str(), sensor.c_str()); /* fetch the sensor specific read command if there is one (cmd) * maybe_missing parm is true so as to avoid ailing when there is * no sensor specific command ; which is most of he time */ sensor_read_cmd.clear(); ini_get_config_value ( QUANTA_SENSOR_PROFILE_FILE, *sensors_iter_ptr, "cmd", sensor_read_cmd, true ); if ( sensor_read_cmd.empty() ) { sensor_array_ptr->entity_path = sensor ; } else { sensor_array_ptr->path = sensor_read_cmd ; host_ptr->group[g].path.clear(); sensor_array_ptr->entity_path = sensor_read_cmd ; sensor_array_ptr->entity_path.append(ENTITY_DELIMITER); sensor_array_ptr->entity_path.append(sensor); } sensor_array_ptr->sensorname = sensor ; sensor_array_ptr->sensortype = host_ptr->group[g].sensortype ; sensor_array_ptr->datatype = host_ptr->group[g].datatype ; /* group uuid is in host_ptr at this point */ sensor_array_ptr->group_uuid = host_ptr->group[g].group_uuid ; if ( host_ptr->group[g].group_uuid.empty() ) { wlog ("%s '%s' had empty uuid ; grouping will fail\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str()); error = true ; continue ; } sensor_array_ptr++ ; sensors_found++ ; } } } } return (sensors_found); } int hwmonHostClass::delete_unwanted_sensors ( struct hwmonHostClass::hwmon_host * host_ptr ) { int rc = PASS ; for ( int s = 0 ; s < host_ptr->sensors ; s++ ) { if (( host_ptr->sensor[s].sensorname == "PCH Thermal Trip" ) || ( host_ptr->sensor[s].sensorname == "MB Thermal Trip" ) || ( host_ptr->sensor[s].sensorname == "Temp_OCP")) { ilog ("%s %s sensor is being deleted from sensor model\n", host_ptr->hostname.c_str(), host_ptr->sensor[s].sensorname.c_str() ); clear_asserted_alarm ( 
host_ptr->hostname, HWMON_ALARM_ID__SENSOR, &host_ptr->sensor[s], REASON_DEPROVISIONED ); hwmonHttp_del_sensor ( host_ptr->hostname, host_ptr->event, host_ptr->sensor[s]); rc = RETRY ; } } return (rc); } /* ************************************************************************* * * Name : hwmon_load_sensors * * Description: High level work horse procdure called by the Add FSM to * * 1. load all sensors and groups from the database. * - hwmonHttp_load_sensors * * 2. read all sensors and groups from the profile file. * * 3. ensure that all the sensors and groups in the profile * file are in the database and hardware monitor. * - hwmonHttp_add_sensor (sysinv) * - add_sensor (hwmon) * * 5. verify hat the sensors are read from hwmon correctly. * - get_sensor (hwmon) * * 4. group all sensors * - group_sensors (hwmon) and sysinv inside * * Scope : private hwmonHostClass * * Parameters : host_ptr * * Returns : TODO: handle modify errors better. * * *************************************************************************/ int hwmonHostClass::hwmon_load_sensors ( struct hwmonHostClass::hwmon_host * host_ptr, bool & error ) { /* * * This will load all already provisioned sensors from the sysinv * database into this host's host_ptr->sensor array. * */ int rc = hwmonHttp_load_sensors ( host_ptr->hostname, host_ptr->event ); if ( delete_unwanted_sensors ( host_ptr ) == RETRY ) { ilog ("%s reloading sensors list\n", host_ptr->hostname.c_str()); host_ptr->sensors = 0 ; rc = hwmonHttp_load_sensors ( host_ptr->hostname, host_ptr->event ); } if ( rc == PASS ) { sensor_type sensor_array[MAX_HOST_SENSORS] ; int profile_sensors = load_profile_sensors ( host_ptr, &sensor_array[0], MAX_HOST_SENSORS, error ); ilog ("%s has %d sensors in the profile file\n", host_ptr->hostname.c_str(), profile_sensors ); /** * Loop through each profile file sensor and ensure it * is in the database as well as in this host's control * structure. 
if already in the sysinv database then * don't try and reload it. **/ /** * TODO:ROBUST: Should have a check for sensors in the * database that are not in the profile file. * What to do ? **/ for ( int i = 0 ; i < profile_sensors ; i++ ) { rc = PASS ; bool found = false ; /* Loop over all the sensors in the database */ for ( int j = 0 ; j < host_ptr->sensors ; j++ ) { daemon_signal_hdlr (); if ( !sensor_array[i].entity_path.compare(host_ptr->sensor[j].entity_path) ) { found = true ; break ; } } /** * If hwmon does not have it after loading all the * sensors from the database then handle adding this * as a new sensor to the sysinv database here. **/ if ( found == false ) { ilog ( "%s '%s' sensor (add to sysinv)\n", host_ptr->hostname.c_str(), sensor_array[i].sensorname.c_str()); rc = hwmonHttp_add_sensor ( host_ptr->hostname, host_ptr->event, sensor_array[i] ); if ( rc != PASS ) { wlog ("%s '%s' sensor add failed (to sysinv) (rc:%d)\n", host_ptr->hostname.c_str(), sensor_array[i].sensorname.c_str(), rc); break ; } else { sensor_array[i].uuid = host_ptr->event.new_uuid ; blog1 ("%s '%s' sensor added (to sysinv)\n", host_ptr->hostname.c_str(), sensor_array[i].sensorname.c_str()); } } else { blog1 ("%s '%s' sensor already provisioned (in sysinv) (rc:%d)\n", host_ptr->hostname.c_str(), sensor_array[i].sensorname.c_str(), rc); } /** * Only add this sensor to hwmon if it was * successfully added to the sysinv database. 
**/ if (( rc == PASS ) && ( found == false )) { // ilog ( "%s '%s' is in %s (add to hwmond)\n", host_ptr->hostname.c_str(), sensor_array[i].sensorname.c_str(), sensor_array[i].group_uuid.c_str()); rc = add_sensor ( host_ptr->hostname, sensor_array[i] ); #ifdef WANT_FIT_TESTING if ( daemon_want_fit ( FIT_CODE__HWMON__ADD_SENSOR, host_ptr->hostname )) rc = FAIL ; #endif if ( rc != PASS ) { wlog ("%s '%s' sensor add failed (to hwmond) (rc:%d)\n", host_ptr->hostname.c_str(), sensor_array[i].sensorname.c_str(), rc); break ; } else { blog1 ("%s '%s' sensor added (to hwmon) uuid:%s\n", host_ptr->hostname.c_str(), sensor_array[i].sensorname.c_str(), sensor_array[i].uuid.c_str()); } } /* Associate it with its group */ /** * Verify that the sensor was loaded into hwmond * correctly and assign it to a group **/ if (( rc == PASS ) && ( i < host_ptr->sensors )) { sensor_type * sensor_ptr = get_sensor ( host_ptr->hostname, sensor_array[i].entity_path ) ; #ifdef WANT_FIT_TESTING if ( daemon_want_fit ( FIT_CODE__HWMON__GET_SENSOR, host_ptr->hostname )) sensor_ptr = NULL ; #endif /* Verify the sensor content */ if ( sensor_ptr != NULL ) { /* Load the sensor with values that are not carried through from the above add */ sensor_ptr->group_uuid = sensor_array[i].group_uuid ; sensor_ptr->degraded = false ; sensor_ptr->alarmed = false ; clear_ignored_state ( sensor_ptr ) ; clear_logged_state ( sensor_ptr ) ; dlog1 ( "%s '%s' is in group %s (hwmon)\n", host_ptr->hostname.c_str(), sensor_ptr->sensorname.c_str(), sensor_ptr->group_uuid.c_str()); if ( daemon_get_cfg_ptr()->debug_bmgmt > 1 ) { hwmonSensor_print ( host_ptr->hostname, sensor_ptr ); } } else { wlog ("%s '%s' sensor should have been but was not found (in hwmon)\n", host_ptr->hostname.c_str(), sensor_array[i].sensorname.c_str()); rc = FAIL ; break ; } } } if ( rc == PASS ) { /* Group all the sensors into the groups specified by the profile file */ rc = hwmonHostClass::hwmon_group_sensors ( host_ptr ); if ( rc == PASS ) { blog 
("%s sensors grouped\n", host_ptr->hostname.c_str()); } else { wlog ("%s sensor grouping failed (rc:%d)\n", host_ptr->hostname.c_str(), rc ); } } } return (rc); } /* ************************************************************************* * * Name : hwmon_load_groups * * Description: Read all the sensor groups from the database and profile file. * Those in the database that match the profile file are loaded * into hwmon directly. Those found in the profile file but * missing from the database are added to the database and * hardware monitor. Assign the each sensor its correct group * uuid and send sysinv the list of sensor uuids for each group. * * Scope : private hwmonHostClass * * Parameters : host_ptr * * Returns : * * *************************************************************************/ int hwmonHostClass::hwmon_load_groups ( struct hwmonHostClass::hwmon_host * host_ptr , bool & error ) { int rc = hwmonHttp_load_groups ( host_ptr->hostname, host_ptr->event ); if ( rc == PASS ) { struct sensor_group_type group_array[MAX_HOST_GROUPS] ; int profile_groups = load_profile_groups ( host_ptr, &group_array[0], MAX_HOST_GROUPS, error ); ilog ("%s has %d sensor groups in profile file (%s)\n", host_ptr->hostname.c_str(), profile_groups, QUANTA_SENSOR_PROFILE_FILE); for ( int i = 0 ; i < profile_groups ; i++ ) { rc = PASS ; bool found = false ; daemon_signal_hdlr (); for ( int j = 0 ; j < host_ptr->groups ; j++ ) { if ( !host_ptr->group[j].group_name.compare(group_array[i].group_name) ) { found = true ; break ; } } if ( found == false ) { dlog ("%s '%s' group not found, adding (to sysinv/hwmon)\n", host_ptr->hostname.c_str(), group_array[i].group_name.c_str()); group_array[i].hostname = host_ptr->hostname ; rc = hwmonHttp_add_group ( host_ptr->hostname, host_ptr->event, group_array[i] ); if ( rc ) { wlog ("%s '%s' sensor group add failed [%s:%s] (to sysinv/hwmon) (rc:%d)\n", group_array[i].hostname.c_str(), group_array[i].group_name.c_str(), 
group_array[i].datatype.c_str(), group_array[i].sensortype.c_str(), rc ); break ; } else { group_array[i].group_uuid = host_ptr->event.new_uuid ; ilog ("%s '%s' sensor group added [%s:%s] (to sysinv)\n", group_array[i].hostname.c_str(), group_array[i].group_name.c_str(), group_array[i].datatype.c_str(), group_array[i].sensortype.c_str()); } if ( rc == PASS ) { /** * Only add this sensor to hwmon if it was * successfully added to the sysinv database. **/ if (( rc == PASS ) && ( found == false )) { rc = hwmon_add_group ( host_ptr->hostname, group_array[i] ); #ifdef WANT_FIT_TESTING if ( daemon_want_fit ( FIT_CODE__HWMON__ADD_GROUP, host_ptr->hostname )) rc = FAIL ; #endif if ( rc != PASS ) { wlog ("%s '%s' sensor group add failed (to hwmon) (rc:%d)\n", host_ptr->hostname.c_str(), group_array[i].group_name.c_str(), rc); break ; } else { blog1 ("%s '%s' sensor group added (to hwmon) uuid:%s\n", host_ptr->hostname.c_str(), group_array[i].group_name.c_str(), group_array[i].group_uuid.c_str()); } } } /* error log is already printed */ } /* tack on a few important elements */ if (( rc == PASS ) && ( i < host_ptr->groups )) { struct sensor_group_type * group_ptr = hwmon_get_group ( host_ptr->hostname, group_array[i].group_name ) ; #ifdef WANT_FIT_TESTING if ( daemon_want_fit ( FIT_CODE__HWMON__GET_GROUP, host_ptr->hostname )) group_ptr = NULL ; #endif /* Verify the sensor content */ if ( group_ptr != NULL ) { /* Add the sensor label list for this group to the host_ptr group. * This list was fetched and attached to the group array in load_profile_groups. 
* Having it prevents the need to parse the profile file again to associate * the sensors to a group all over again inside load_profile_sensors */ group_ptr->sensor_labels = group_array[i].sensor_labels ; if ( daemon_get_cfg_ptr()->debug_bmgmt > 1 ) { hwmonGroup_print ( host_ptr->hostname, group_ptr ); } } else { wlog ("%s '%s' sensor group should have been but was not found (in hwmon)\n", host_ptr->hostname.c_str(), group_array[i].group_name.c_str()); rc = FAIL ; break ; } } } } return(rc); } /* ************************************************************************* * * Name : hwmon_group_sensors * * Description: Assign the each sensor its correct group uuid and send * sysinv the list of sensor uuids for each group. * * To do this we have to loop over the groups 3 times * 1. init the sensors in each group * 2. assigned the sensors to the groups ; might not be linear * 3. create sensor list * * Scope : private hwmonHostClass * * Parameters : host_ptr * * Returns : TODO: handle modify errors better. 
* * *************************************************************************/ /* TODO: make this a hardware independed implementation */ int hwmonHostClass::hwmon_group_sensors ( struct hwmonHostClass::hwmon_host * host_ptr ) { int rc = PASS ; string sensor_list = "" ; for ( int g = 0 ; g < host_ptr->groups ; ++g ) { host_ptr->group[g].sensors = 0 ; } ilog ("%s has %d sensors across %d groups\n", host_ptr->hostname.c_str(), host_ptr->sensors, host_ptr->groups); for ( int g = 0 ; g < host_ptr->groups ; ++g ) { /* search through all the sensors and put them in their correct group */ for ( int s = 0 ; s < host_ptr->sensors ; ++s ) { if ( !host_ptr->group[g].group_uuid.compare(host_ptr->sensor[s].group_uuid)) { /* Update this group with the pointers to is sensors */ host_ptr->group[g].sensor_ptr[host_ptr->group[g].sensors] = &host_ptr->sensor[s] ; host_ptr->group[g].sensors++ ; dlog ("%s '%s' is assigned '%s' (%d)\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str(), host_ptr->sensor[s].sensorname.c_str(), host_ptr->group[g].sensors); } } } /* Build up the sensor uuid list for each * group and send it to sysinv */ for ( int g = 0 ; g < host_ptr->groups ; ++g ) { int count = 0 ; bool first = true ; sensor_list.clear(); for ( int s = 0 ; s < host_ptr->sensors ; ++s ) { daemon_signal_hdlr (); /* only add it to the list got this group */ if ( !host_ptr->group[g].group_uuid.compare(host_ptr->sensor[s].group_uuid) ) { /* make sure there is a sensor read command at the group or sensor level */ if ( host_ptr->group[g].path.empty() && host_ptr->sensor[s].path.empty() ) { elog ("%s '%s:%s' no read command for this combo ; ignoring sensor\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str(), host_ptr->sensor[s].sensorname.c_str()); elog ("... 
group cmd:(%s) sensor cmd:(%s)\n", host_ptr->group[g].path.c_str(), host_ptr->sensor[s].path.c_str()); } else { /* Default each sensor to its groups action */ host_ptr->sensor[s].actions_minor = host_ptr->group[g].actions_minor_group ; host_ptr->sensor[s].actions_major = host_ptr->group[g].actions_major_group ; host_ptr->sensor[s].actions_critl = host_ptr->group[g].actions_critl_group ; count++ ; if ( first == false ) { sensor_list.append(","); } else { first = false ; } sensor_list.append(host_ptr->sensor[s].uuid); if ( count == host_ptr->group[g].sensors ) break ; } } } if ( sensor_list.empty() ) { wlog ("%s no sensors found for '%s' group ; should have %d\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str(), host_ptr->group[g].sensors); } else if ( host_ptr->group[g].sensors != count ) { wlog ("%s incorrect number of sensors found for '%s' group ; has %d but should have %d\n", host_ptr->hostname.c_str(), host_ptr->group[g].group_name.c_str(), count, host_ptr->group[g].sensors); } else { groupSensors_print ( &host_ptr->group[g] ); rc = hwmonHttp_group_sensors ( host_ptr->hostname, host_ptr->event, host_ptr->group[g].group_uuid, sensor_list ); } } return (rc); } void handle_new_suppression ( sensor_type * sensor_ptr ) { clear_asserted_alarm ( sensor_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr, REASON_SUPPRESSED ); } action_state_type * _get_action_state_ptr ( sensor_type * sensor_ptr, sensor_severity_enum hwmon_sev ) { if ( sensor_ptr ) { if ( hwmon_sev == HWMON_SEVERITY_CRITICAL ) return ( &sensor_ptr->critl ) ; else if ( hwmon_sev == HWMON_SEVERITY_MAJOR ) return ( &sensor_ptr->major ) ; else if ( hwmon_sev == HWMON_SEVERITY_MINOR ) return ( &sensor_ptr->minor ) ; } slog ("invalid parms (%p:%s)\n", sensor_ptr, get_severity(hwmon_sev).c_str()); return (NULL); } int _update_group_action ( struct sensor_group_type * group_ptr, sensor_severity_enum hwmon_sev, string new_action ) { if ( hwmon_sev == HWMON_SEVERITY_CRITICAL ) 
group_ptr->actions_critl_group = new_action ; else if ( hwmon_sev == HWMON_SEVERITY_MAJOR ) group_ptr->actions_major_group = new_action ; else group_ptr->actions_minor_group = new_action ; return (PASS); } int _update_sensor_action ( sensor_type * sensor_ptr, sensor_severity_enum hwmon_sev, string new_action ) { if ( hwmon_sev == HWMON_SEVERITY_CRITICAL ) { dlog ("%s %s critical action update '%s' -> '%s'\n", sensor_ptr->hostname.c_str(), sensor_ptr->sensorname.c_str(), sensor_ptr->actions_critl.c_str(), new_action.c_str()); sensor_ptr->actions_critl = new_action ; } else if ( hwmon_sev == HWMON_SEVERITY_MAJOR ) { dlog ("%s %s major action update '%s' -> '%s'\n", sensor_ptr->hostname.c_str(), sensor_ptr->sensorname.c_str(), sensor_ptr->actions_major.c_str(), new_action.c_str()); sensor_ptr->actions_major = new_action ; } else { dlog ("%s %s minor action update '%s' -> '%s'\n", sensor_ptr->hostname.c_str(), sensor_ptr->sensorname.c_str(), sensor_ptr->actions_minor.c_str(), new_action.c_str()); sensor_ptr->actions_minor = new_action ; } return (PASS); } bool severity_match ( EFmAlarmSeverityT fm_severity, sensor_severity_enum hwmon_sev ) { if ((( fm_severity == FM_ALARM_SEVERITY_MINOR ) && ( hwmon_sev == HWMON_SEVERITY_MINOR )) || (( fm_severity == FM_ALARM_SEVERITY_MAJOR ) && ( hwmon_sev == HWMON_SEVERITY_MAJOR )) || (( fm_severity == FM_ALARM_SEVERITY_CRITICAL ) && ( hwmon_sev == HWMON_SEVERITY_CRITICAL )) || (( fm_severity == FM_ALARM_SEVERITY_CRITICAL ) && ( hwmon_sev == HWMON_SEVERITY_NONRECOVERABLE ))) return (true); return (false); } int _manage_action_change ( string hostname, struct sensor_group_type * group_ptr, string cur_action, string new_action, sensor_severity_enum hwmon_sev, EFmAlarmSeverityT fm_severity ) { bool new_action__log = false ; bool new_action__alarm = false ; bool new_action__ignore = false ; bool new_action__reset = false ; bool new_action__powercycle = false ; bool cur_action__log = false ; bool cur_action__alarm = false ; bool 
cur_action__ignore = false ; bool cur_action__reset = false ; bool cur_action__powercycle = false ; if ( is_ignore_action ( new_action )) new_action__ignore = true ; if ( is_log_action ( new_action )) new_action__log = true ; if ( is_alarm_action ( new_action )) new_action__alarm = true ; if ( is_reset_action ( new_action )) new_action__reset = true ; if ( is_powercycle_action ( new_action )) new_action__powercycle = true ; if ( is_alarm_action ( cur_action )) cur_action__alarm = true ; if ( is_ignore_action ( cur_action )) cur_action__ignore = true ; if ( is_log_action ( cur_action )) cur_action__log = true ; if ( is_reset_action ( cur_action )) cur_action__reset = true ; if ( is_powercycle_action ( cur_action )) cur_action__powercycle = true ; /* TODO: change these return codes to PASS once we know we don't get them */ if (( new_action__log && cur_action__log ) || ( new_action__alarm && cur_action__alarm ) || ( new_action__ignore && cur_action__ignore ) || ( new_action__reset && cur_action__reset ) || ( new_action__powercycle && cur_action__powercycle )) { elog ("%s null '%s' sensor group action change for severity '%s' ; no action (%s to %s)\n", hostname.c_str(), group_ptr->group_name.c_str(), get_severity(hwmon_sev).c_str(), cur_action.c_str(), new_action.c_str()); return (FAIL_INVALID_OPERATION); } ilog ("%s modifying '%s' sensor group '%s' action from '%s' to '%s'\n", hostname.c_str(), group_ptr->group_name.c_str(), get_severity(hwmon_sev).c_str(), cur_action.c_str(), new_action.c_str()); /********************************************************************* * There are 5 possible actions * * - alarm * - log * - ignore * - reset * - powercycle * * Any action can be changed to any other action * Checks above ensure that no null action changes make it here * *********************************************************************/ string reason = get_severity(hwmon_sev) + " severity level action "; if ( new_action__alarm ) reason.append(REASON_SET_TO_ALARM); else 
if ( new_action__log ) reason.append(REASON_SET_TO_LOG); else if ( new_action__ignore ) reason.append(REASON_SET_TO_IGNORE); else if ( new_action__reset ) reason.append(REASON_SET_TO_RESET); else if ( new_action__powercycle ) reason.append(REASON_SET_TO_POWERCYCLE); /* ... now all the sensors in that group */ for ( int i = 0 ; i < group_ptr->sensors ; i++ ) { daemon_signal_hdlr(); sensor_type * sensor_ptr = group_ptr->sensor_ptr[i] ; if ( sensor_ptr->group_uuid != group_ptr->group_uuid ) { slog ("%s %s group:sensor uuid mismatch ; auto correcting\n", hostname.c_str(), sensor_ptr->sensorname.c_str() ); slog ("%s ... group:sensor [%s:%s]\n", hostname.c_str(), group_ptr->group_uuid.c_str(), sensor_ptr->group_uuid.c_str()); sensor_ptr->group_uuid = group_ptr->group_uuid ; } /* Only run change handling when the severity matches current status */ if ( !severity_match ( fm_severity, sensor_ptr->severity ) ) { /* but we still need to update the action for each sensor */ _update_sensor_action ( sensor_ptr, hwmon_sev, new_action ); continue ; } // string severity = get_severity(sensor_ptr->severity); /* get correct action state bools */ action_state_type * action_state_ptr = _get_action_state_ptr ( sensor_ptr, hwmon_sev ); if ( action_state_ptr == NULL ) { slog ("%s %s has invalid action state %d\n", hostname.c_str(), sensor_ptr->sensorname.c_str(), hwmon_sev ); return (FAIL_INVALID_DATA); } hwmonLog_clear ( hostname, HWMON_ALARM_ID__SENSORGROUP, group_ptr->group_name, reason ); /* Handle Action change away from ............. 
ALARM */ if ( cur_action__alarm ) { /************************************************************* * * From 'alarm' to 'log' case * -------------------------- * * If alarm is asserted then clear it in favor of a log * *************************************************************/ if ( new_action__log ) { dlog ("%s %s action change from 'alarm' to 'log'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } /* Produce a log if the sensor is reporting error status */ if (( sensor_ptr->suppress == false ) && ( sensor_ptr->status.compare("ok")) && ( sensor_ptr->status.compare("offline"))) { hwmonLog ( hostname, HWMON_ALARM_ID__SENSOR, fm_severity, sensor_ptr->sensorname, REASON_OOT ); set_logged_severity ( sensor_ptr , fm_severity ); } } /************************************************************* * * From 'alarm' to 'reset' * ---------------------------- * * If alarm is asserted then clear it and let it get * generated again in the handler if the severity condition * persists. 
* *************************************************************/ else if ( new_action__reset ) { dlog ("%s %s action change from 'alarm' to 'reset'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } } /************************************************************* * * From 'alarm' to 'powercycle' * ---------------------------- * * If alarm is asserted then clear it and let it get * generated again in the handler if the severity condition * persists. * *************************************************************/ else if ( new_action__powercycle ) { dlog ("%s %s action change from 'alarm' to 'powercycle'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } } /************************************************************* * * From 'alarm' to 'ignore' case * ----------------------------- * * If alarm is asserted then clear it. 
* *************************************************************/ else /* ignore as default case */ { dlog ("%s %s action change from 'alarm' to 'ignore'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason) ; set_ignored_severity ( sensor_ptr, fm_severity ); } } /* Clear alarm and degrade state */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); } /* Handle Action change away from ............. LOG */ else if ( cur_action__log ) { /* Do auto correction / garbage collection */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); clear_ignored_state ( sensor_ptr ); /************************************************************** * * From 'log' -> 'alarm' case * -------------------------- * * If it was a log action and was logged and is now alarm * action then send a log indicating that the current log is * cleared. 
* * Allow the alarm to get raised on the next status reading * **************************************************************/ if ( new_action__alarm ) { dlog ("%s %s action change from 'log' to 'alarm'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); if ( action_state_ptr->logged == true ) { hwmonLog_clear ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, reason ); } } /********************************************************** * * From 'log' to 'reset' case * -------------------------- * * Allow the alarm to get raised on the next status reading * *********************************************************/ else if ( new_action__reset ) { dlog ("%s %s action change from 'log' to 'reset'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); if ( action_state_ptr->logged == true ) { hwmonLog_clear ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, reason ); } } /********************************************************** * * From 'log' to 'powercycle' case * ------------------------------- * * Allow the alarm to get raised on the next status reading * *********************************************************/ else if ( new_action__powercycle ) { dlog ("%s %s action change from 'log' to 'powercycle'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); if ( action_state_ptr->logged == true ) { hwmonLog_clear ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, reason ); } } /********************************************************** * * From 'log -> 'ignore' case * --------------------------- * * If it was a log action and was logged and is now ignore * then send a log indicating that it is now ignored. 
* ***********************************************************/ else /* ignore as default case */ { dlog ("%s %s action change from 'log' to 'ignore'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); if ( action_state_ptr->logged == true ) { hwmonLog_clear ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, reason ); set_ignored_severity ( sensor_ptr, fm_severity ); } } clear_logged_state ( sensor_ptr ); } /* Handle Action change away from ............. IGNORE */ else if ( cur_action__ignore ) { /* Do auto correction / garbage collection */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); /************************************************************** * * From 'ignore' to 'alarm' case * ----------------------------- * * If was ignore and is now alarm then just take it out of * ignore and let the alarm get raised on the next audit. * **************************************************************/ if ( new_action__alarm ) { dlog ("%s %s action change from 'ignore' to 'alarm'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* additional garbage collection and cleanup */ clear_logged_state ( sensor_ptr ); /* take it out of ignore state */ clear_ignored_state ( sensor_ptr ); /* do nothing and allow the sensor event handler to act */ } /************************************************************** * * From 'ignore' to 'reset' case * ---------------------------------- * **************************************************************/ else if ( new_action__reset ) { dlog ("%s %s action change from 'ignore' to 'reset'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* additional garbage collection and cleanup */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* do nothing and allow the sensor event handler to act */ } /************************************************************** * * From 'ignore' to 'powercycle' case * ---------------------------------- * 
**************************************************************/ else if ( new_action__powercycle ) { dlog ("%s %s action change from 'ignore' to 'powercycle'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* additional garbage collection and cleanup */ clear_logged_state ( sensor_ptr ); /* take it out of ignore state */ clear_ignored_state ( sensor_ptr ); /* do nothing and allow the sensor event handler to act */ } /************************************************************** * * From 'ignore' to 'log' case * ----------------------------- * * If was ignore then raise log if status is not ok or offline * ***************************************************************/ else /* log as default case */ { dlog ("%s %s action change from 'ignore' to 'log'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); clear_ignored_state ( sensor_ptr ); /* take it out of logged state */ clear_logged_state ( sensor_ptr ); /* Produce a log if the sensor is reporting error status */ if (( sensor_ptr->suppress == false ) && ( sensor_ptr->status.compare("ok")) && ( sensor_ptr->status.compare("offline"))) { hwmonLog ( hostname, HWMON_ALARM_ID__SENSOR, fm_severity, sensor_ptr->sensorname, REASON_OOT ); set_logged_severity ( sensor_ptr , fm_severity ); } } } /* Handle Action change away from ............. RESET */ else if ( cur_action__reset ) { /************************************************************* * * From 'reset' to 'alarm' case * ------------------------------- * * If alarm is asserted then clear it only to allow the * it to be raised in the handler by failed severity status. 
* *************************************************************/ if ( new_action__alarm ) { dlog ("%s %s action change from 'reset' to 'alarm'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } /* Clear alarm and degrade state */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); } /************************************************************* * * From 'reset' to 'powercycle' case * --------------------------------- * * If alarm is asserted then clear it only to allow the * powercycle case alarm to be raised in the handler. * *************************************************************/ else if ( new_action__powercycle ) { dlog ("%s %s action change from 'reset' to 'powercycle'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } /* Clear alarm and degrade state */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); } /************************************************************* * * From 'reset' to 'log' case * -------------------------- * * Clear the reset alarm if it is raised and set the * corresponding log. 
* *************************************************************/ else if ( new_action__log ) { dlog ("%s %s action change from 'reset' to 'log'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* do garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } /* Produce a log if the sensor is reporting error status */ if (( sensor_ptr->suppress == false ) && ( sensor_ptr->status.compare("ok")) && ( sensor_ptr->status.compare("offline"))) { hwmonLog ( hostname, HWMON_ALARM_ID__SENSOR, fm_severity, sensor_ptr->sensorname, REASON_OOT ); set_logged_severity ( sensor_ptr , fm_severity ); } /* Clear alarm and degrade state if Do auto correction / garbage collection */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); } /************************************************************* * * From 'reset' to 'ignore' case * ---------------------------------- * * Clear the reset alarm if it is raised * *************************************************************/ else /* ignore as default case */ { dlog ("%s %s action change from 'reset' to 'ignore'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* do garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } if (( sensor_ptr->suppress == false ) && ( sensor_ptr->status.compare(HWMON_CRITICAL))) { set_ignored_severity ( sensor_ptr, fm_severity ); } /* Clear alarm and degrade state if Do auto correction / garbage collection */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); } } /* Handle Action change away from ............. 
POWERCYCLE */ else if ( cur_action__powercycle ) { /************************************************************* * * From 'powercycle' to 'alarm' case * --------------------------------- * * If alarm is asserted then clear it only to allow it to * be raised in the handler if the status persists. * *************************************************************/ if ( new_action__alarm ) { dlog ("%s %s action change from 'powercycle' to 'alarm'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } /* Clear alarm and degrade state */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); } /************************************************************* * * From 'powercycle' to 'log' case * ------------------------------- * * If alarm is asserted then clear it in favor of a log * *************************************************************/ else if ( new_action__log ) { dlog ("%s %s action change from 'powercycle' to 'log'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } /* Produce a log if the sensor is reporting error status */ if (( sensor_ptr->suppress == false ) && ( sensor_ptr->status.compare("ok")) && ( sensor_ptr->status.compare("offline"))) { hwmonLog ( hostname, HWMON_ALARM_ID__SENSOR, fm_severity, sensor_ptr->sensorname, REASON_OOT ); set_logged_severity ( sensor_ptr , fm_severity ); } /* Clear alarm and degrade state */ clear_alarmed_state ( 
sensor_ptr ); clear_degraded_state ( sensor_ptr ); } /************************************************************* * * From 'powercycle' to 'reset' case * ------------------------------- * * If alarm is asserted then clear it only to allow the reset * case alarm to be raised in the handler if the status * persists as critical. * *************************************************************/ else if ( new_action__reset ) { dlog ("%s %s action change from 'powercycle' to 'reset'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } /* Clear alarm and degrade state */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); } /************************************************************* * * From 'powercycle' to 'ignore' case * ---------------------------------- * * If alarm is asserted then clear it. 
* *************************************************************/ else /* ignore as default case */ { dlog ("%s %s action change from 'powercycle' to 'ignore'\n", hostname.c_str(), sensor_ptr->sensorname.c_str()); /* cleanup and garbage collection */ clear_ignored_state ( sensor_ptr ); clear_logged_state ( sensor_ptr ); /* clear the current alarm if it exists */ if ( action_state_ptr->alarmed == true ) { clear_severity_alarm ( hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, fm_severity, reason ) ; } if (( sensor_ptr->suppress == false ) && ( sensor_ptr->status.compare(HWMON_CRITICAL))) { set_ignored_severity ( sensor_ptr, fm_severity ); } /* Clear alarm and degrade state */ clear_alarmed_state ( sensor_ptr ); clear_degraded_state ( sensor_ptr ); } } else { elog ("%s no '%s' sensor group action change for severity '%s' ; no action (%s to %s)\n", hostname.c_str(), group_ptr->group_name.c_str(), get_severity(hwmon_sev).c_str(), cur_action.c_str(), new_action.c_str()); return (FAIL_INVALID_OPERATION) ; } _update_sensor_action ( sensor_ptr, hwmon_sev, new_action ); } _update_group_action ( group_ptr, hwmon_sev, new_action ); return (PASS); } /* ************************************************************************* * * Name : group_modify * * Description: For a limited number of attributes, modify each sensor * that is part of the specified group for that attribute. * * Modifiable attributes are: * * suppress -> suppress * audit_interval_group -> audit_interval * actions_minor_group -> actions_minor * actions_major_group -> actions_major * actions_critical_group -> actions_critical * * Scope : public hwmonHostClass * * Assumptions: action (as value) has already been verified by calling procedure * * Returns : PASS, FAIL or FAIL_... 
* * *************************************************************************/ /* TODO:FEATURE: manage alarms when the actions are changed */ int hwmonHostClass::group_modify ( string hostname, string group_uuid, string key, string value ) { int rc = PASS ; if ( ( !group_uuid.empty() ) && ( !hostname.empty()) ) { hwmonHostClass::hwmon_host * host_ptr ; host_ptr = hwmonHostClass::getHost ( hostname ); if ( host_ptr != NULL ) { sensor_group_type * group_ptr = NULL ; for ( int i = 0 ; i < host_ptr->groups ; i++ ) { if ( !host_ptr->group[i].group_uuid.compare(group_uuid) ) { group_ptr = &host_ptr->group[i] ; break ; } } if ( group_ptr == NULL ) { slog ("%s '%s' group not found value:%s (uuid:%s) \n", hostname.c_str(), key.c_str(), value.c_str(), group_uuid.substr(0,8).c_str()); return (FAIL_NOT_FOUND); } /* Look for Suppression Modify */ if ( !key.compare("suppress") ) { /* modify the group suppression */ if ( ( value.compare("True") ) && ( value.compare("true") )) { hwmonLog ( hostname, HWMON_ALARM_ID__SENSORGROUP, FM_ALARM_SEVERITY_CLEAR, group_ptr->group_name, REASON_UNSUPPRESSED ); group_ptr->suppress = false ; } else { hwmonLog ( hostname, HWMON_ALARM_ID__SENSORGROUP, FM_ALARM_SEVERITY_CLEAR, group_ptr->group_name, REASON_SUPPRESSED ); group_ptr->suppress = true ; } ilog ("%s '%s' sensor group is '%ssuppressed'\n", host_ptr->hostname.c_str(), group_ptr->group_name.c_str(), group_ptr->suppress ? "" : "un"); /* ... 
now all the sensors in that group */ for ( int s = 0 ; s < group_ptr->sensors ; s++ ) { /* modify all sensors to match the group suppression state */ sensor_type * sensor_ptr = group_ptr->sensor_ptr[s] ; if ( sensor_ptr->suppress != group_ptr->suppress ) { if ( group_ptr->suppress == true ) { handle_new_suppression ( sensor_ptr ); } else { manage_sensor_state ( hostname, sensor_ptr, get_severity(sensor_ptr->status)); } sensor_ptr->suppress = group_ptr->suppress ; } } } /* Look for Audit Interval Modify */ if ( !key.compare("audit_interval_group") ) { hwmonHostClass * obj_ptr = get_hwmonHostClass_ptr() ; int interval = atoi(value.data()); if ( interval < HWMON_MIN_AUDIT_INTERVAL ) { wlog ("%s invalid audit interval (%d:%s)\n", hostname.c_str(), interval, value.c_str()); return (FAIL_INVALID_DATA); } /* modify the group interval */ /* This just sets a flag so that the audit interval * group changes sent back to sysinv are done at base level * rather that inside this http request, which would create * a deadlock */ obj_ptr->modify_audit_interval ( hostname, interval ); } /* Look for Critical Action Group Modify */ if ( !key.compare("actions_critical_group") ) { sensor_severity_enum hwmon_sev = HWMON_SEVERITY_CRITICAL ; EFmAlarmSeverityT fm_severity = FM_ALARM_SEVERITY_CRITICAL ; string cur_action = group_ptr->actions_critl_group ; string new_action = value ; rc = _manage_action_change ( hostname, group_ptr, cur_action, new_action, hwmon_sev, fm_severity ); #ifdef WANT_MANAGE_SENSOR_STATE_ON_ACTION_CHANGE /* force evaluation of all sensors in this group */ for ( int i = 0 ; i < group_ptr->sensors ; i++ ) { sensor_severity_enum sev = get_severity(group_ptr->sensor_ptr[i]->status) ; if ( sev == HWMON_SEVERITY_CRITICAL ) { manage_sensor_state ( hostname, group_ptr->sensor_ptr[i], sev ); } } #endif } /* Look for Major Action Group Modify */ if ( !key.compare("actions_major_group") ) { sensor_severity_enum hwmon_sev = HWMON_SEVERITY_MAJOR ; EFmAlarmSeverityT fm_severity = 
FM_ALARM_SEVERITY_MAJOR ; string cur_action = group_ptr->actions_major_group ; string new_action = value ; rc = _manage_action_change ( hostname, group_ptr, cur_action, new_action, hwmon_sev, fm_severity ); #ifdef WANT_MANAGE_SENSOR_STATE_ON_ACTION_CHANGE /* force evaluation of all sensors in this group */ for ( int i = 0 ; i < group_ptr->sensors ; i++ ) { sensor_severity_enum sev = get_severity(group_ptr->sensor_ptr[i]->status) ; if ( sev == HWMON_SEVERITY_MAJOR ) { manage_sensor_state ( hostname, group_ptr->sensor_ptr[i], sev ); } } #endif } /* Look for Minor Action Group Modify */ if ( !key.compare("actions_minor_group") ) { sensor_severity_enum hwmon_sev = HWMON_SEVERITY_MINOR ; EFmAlarmSeverityT fm_severity = FM_ALARM_SEVERITY_MINOR ; string cur_action = group_ptr->actions_minor_group ; string new_action = value ; rc = _manage_action_change ( hostname, group_ptr, cur_action, new_action, hwmon_sev, fm_severity ); #ifdef WANT_MANAGE_SENSOR_STATE_ON_ACTION_CHANGE /* force evaluation of all sensors in this group */ for ( int i = 0 ; i < group_ptr->sensors ; i++ ) { sensor_severity_enum sev = get_severity(group_ptr->sensor_ptr[i]->status) ; if ( sev == HWMON_SEVERITY_MINOR ) { manage_sensor_state ( hostname, group_ptr->sensor_ptr[i], sev ); } } #endif } monitor_now ( host_ptr ); } else { elog ("%s hostname is unknown\n", hostname.c_str()); rc = FAIL_UNKNOWN_HOSTNAME ; } } else { slog ("empty hostname or group uuid\n"); rc = FAIL_STRING_EMPTY ; } return (rc); } /***************************************************************************** * * Name : ipmi_create_sensors * * Description: Add sample sensors to the sysinv database. 
* *****************************************************************************/ int hwmonHostClass::ipmi_create_sensors ( struct hwmonHostClass::hwmon_host * host_ptr ) { int rc = PASS ; int sensor_errors = 0 ; host_ptr->sensors = 0 ; for ( int s = 0 ; s < host_ptr->samples ; ++s ) { string sensortype = ipmi_get_grouptype ( host_ptr->hostname, host_ptr->sample[s].unit, host_ptr->sample[s].name); #ifdef WANT_FIT_TESTING /* sysinv does not allow adding a sensor with no type ; will reject with a 400 */ if ( daemon_want_fit ( FIT_CODE__HWMON__BAD_SENSOR, host_ptr->hostname, host_ptr->sample[s].name)) sensortype = "" ; #endif if ( sensortype.empty() ) { if ( ++sensor_errors > MAX_SENSOR_TYPE_ERRORS ) { rc = FAIL_STRING_EMPTY ; elog ("%s '%s' not added ; sample sensor create failed ; too many sensor type errors (rc:%d)\n", host_ptr->hostname.c_str(), host_ptr->sample[s].unit.c_str(), rc); } else { wlog ("%s %s %s %s%s not added ; empty or unsupported type classification\n", host_ptr->hostname.c_str(), host_ptr->sample[s].name.c_str(), host_ptr->sample[s].status.c_str(), host_ptr->sample[s].unit.c_str(), host_ptr->sample[s].ignore ? 
" ignored" : ""); } } else { /* add the sensor to hwmon */ hwmonSensor_init ( host_ptr->hostname, &host_ptr->sensor[host_ptr->sensors] ); clear_ignored_state (&host_ptr->sensor[host_ptr->sensors]); clear_alarmed_state (&host_ptr->sensor[host_ptr->sensors]); clear_logged_state (&host_ptr->sensor[host_ptr->sensors]); host_ptr->sensor[host_ptr->sensors].sensorname = host_ptr->sample[s].name ; host_ptr->sensor[host_ptr->sensors].sensortype = sensortype ; #ifdef WANT_FIT_TESTING /* sysinv does not allow adding a sensor with no type ; will reject with a 400 */ if ( daemon_want_fit ( FIT_CODE__HWMON__ADD_SENSOR, host_ptr->hostname, host_ptr->sample[s].name)) host_ptr->sensor[host_ptr->sensors].sensortype = "" ; #endif /* add it to to sysinv */ if ( ( rc = hwmonHttp_add_sensor ( host_ptr->hostname, host_ptr->event, host_ptr->sensor[host_ptr->sensors] )) == PASS ) { /* add the sysinv uuid for this sensor to the sensor in hwmon */ host_ptr->sensor[host_ptr->sensors].uuid = host_ptr->event.new_uuid ; host_ptr->sensor[host_ptr->sensors].group_enum = host_ptr->sample[s].group_enum ; // hwmonSensor_print ( host_ptr->hostname, &host_ptr->sensor[host_ptr->sensors] ); ilog ("%s '%s' sensor added\n", host_ptr->hostname.c_str(), host_ptr->sensor[host_ptr->sensors].sensorname.c_str()); host_ptr->sensors++ ; } else { elog ("%s '%s' sensor add failed (rc:%d)\n", host_ptr->hostname.c_str(), host_ptr->sensor[host_ptr->sensors].sensorname.c_str(), rc); hwmonSensor_print ( host_ptr->hostname, &host_ptr->sensor[s] ); } } if ( rc ) break ; } /* end for loop over sensor samples */ return (rc); } /***************************************************************************** * * Name : ipmi_disable_sensors * * Purpose : With the introduction of ipmi monitoring, all groups are * monitored at once. Therefore all should be in the same state. * * Description: Set all sensors to specified state. * If disabled then also set to offline. 
* ******************************************************************************/

/* Returns : FAIL_NULL_POINTER if host_ptr is NULL
 *           PASS if the host is being deleted or every sensor that needed
 *                disabling was disabled
 *           RETRY if at least one disable request failed */
int hwmonHostClass::ipmi_disable_sensors ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    if ( host_ptr == NULL )
        return (FAIL_NULL_POINTER);

    /* don't send requests to sysinv if we are in the middle of
     * deleting a host because sysinv has already gotten rid of the
     * sensor model */
    if ( host_ptr->host_delete == true )
        return (PASS);

    int rc = PASS ;
    for ( int s = 0 ; s < host_ptr->sensors ; ++s )
    {
        sensor_type * sensor_ptr = &host_ptr->sensor[s] ;

        /* nothing to do for a sensor that is already disabled and offline */
        if (( sensor_ptr->state == "disabled" ) && ( sensor_ptr->status == "offline" ))
            continue ;

        /* NOTE(review): the local state is updated before the request is
         * sent, so after a failure a repeat call finds this sensor already
         * 'disabled:offline' and does not resend ; confirm intended. */
        sensor_ptr->state  = "disabled" ;
        sensor_ptr->status = "offline" ;

        int http_rc = hwmonHttp_disable_sensor ( host_ptr->hostname,
                                                 host_ptr->event,
                                                 sensor_ptr->uuid );
        if ( http_rc )
        {
            elog ( "%s failed to disable '%s' sensor\n",
                       host_ptr->hostname.c_str(),
                       sensor_ptr->sensorname.c_str());

            /* a failure is reported as a retry request */
            if ( rc == PASS )
                rc = RETRY ;
        }
        clear_logged_state ( sensor_ptr ) ;
    }
    return (rc);
}