Merge "Mtce: Add heartbeat cluster information for SM query"

This commit is contained in:
Zuul 2018-10-16 18:51:26 +00:00 committed by Gerrit Code Review
commit 0362090b73
22 changed files with 2094 additions and 247 deletions

View File

@ -249,6 +249,44 @@ int jsonUtil_get_key_val ( char * json_str_ptr,
return (PASS);
}
/**
 * Tokenize a json string and fetch the integer value of the specified key.
 *
 * @param json_str_ptr  NUL terminated json string to tokenize
 * @param key           label of the integer value to fetch
 * @param value         updated with the result of the key lookup when the
 *                      string tokenizes ; left untouched otherwise
 *
 * @return PASS when the string was tokenized and the lookup was performed,
 *         FAIL when the string is NULL, empty, starts with "(null)" or
 *         cannot be tokenized.
 */
int jsonUtil_get_key_val_int ( char   * json_str_ptr,
                               string   key,
                               int    & value )
{
    /* A NULL pointer must never be handed to a %s conversion ;
     * handle it on its own before the string content is inspected. */
    if ( json_str_ptr == NULL )
    {
        elog ("Cannot tokenize a null json string\n");
        return (FAIL);
    }

    if (( *json_str_ptr == '\0' ) || ( ! strncmp ( json_str_ptr, "(null)" , 6 )))
    {
        elog ("Cannot tokenize a null json string\n");
        elog ("... json string: %s\n", json_str_ptr );
        return (FAIL);
    }

    /* Length is sampled before and after the parse so that the failure
     * log shows whether the string length changed across the call. */
    size_t len_before = strlen (json_str_ptr);

    jlog2 ("String: %s\n", json_str_ptr );

    struct json_object * raw_obj = json_tokener_parse( json_str_ptr );
    if ( raw_obj == NULL )
    {
        size_t len_after = strlen (json_str_ptr);

        /* %zu is the matching conversion for size_t */
        elog ("Unable to tokenize string (before:%zu after:%zu);\n", len_before, len_after);
        elog ("... json string: %s\n", json_str_ptr );

        /* Report the failure rather than PASS with 'value' never set */
        return (FAIL);
    }

    value = jsonUtil_get_key_value_int ( raw_obj, key.data() ) ;
    jlog1 ("%s:%d\n", key.c_str(), value);

    /* Release the reference obtained from json_tokener_parse */
    json_object_put(raw_obj);

    return (PASS);
}
/** This utility reads the passed in inventory GET request
* response json character string and performs the following

View File

@ -69,6 +69,10 @@ int jsonUtil_get_key_val ( char * json_str_ptr,
string key,
string & value );
int jsonUtil_get_key_val_int ( char * json_str_ptr,
string key,
int & value );
/** Submit a request to get an authorization token and nova URL */
int jsonApi_auth_request ( string & hostname, string & payload );

View File

@ -114,6 +114,8 @@ typedef struct
int event_port ; /**< daemon specific event tx port */
int cmd_port ; /**< daemon specific command rx port */
int sensor_port ; /**< sensor read value port */
int sm_server_port ; /**< port mtce uses to receive data from SM */
int sm_client_port ; /**< port mtce uses to send SM data */
int start_delay ; /**< startup delay, added for pmon */
int api_retries ; /**< api retries before failure */
int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */
@ -243,6 +245,19 @@ extern char *program_invocation_short_name;
else { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
}
/** Error logger macro with throttling.
 *
 *  cnt    - caller owned lvalue counter ; incremented on every invocation
 *  max    - the counter is reset to 0 once it reaches this value
 *  format - printf style format string followed by its arguments
 *
 *  The error is logged only when the incremented count equals 1, so one
 *  log is produced for every 'max' invocations. The cnt and max arguments
 *  are parenthesized so that expression arguments expand safely.
 */
#define elog_throttled(cnt,max,format,args...) { \
    if ( ++(cnt) == 1 ) \
    { \
        if (ltc()) { printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
        else { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
    } \
    if ( (cnt) >= (max) ) \
    { \
        (cnt) = 0 ; \
    } \
}
/** Warning logger macro */
#define wlog(format, args...) { \
if ( ltc() ) { printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Warn : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
@ -387,7 +402,9 @@ extern char *program_invocation_short_name;
#define flog(format, args...) { if(daemon_get_cfg_ptr()->debug_fsm) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: FSM : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define tlog(format, args...) { if(daemon_get_cfg_ptr()->debug_timer) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Timer: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define clog1(format, args...) { if(daemon_get_cfg_ptr()->debug_state&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define log_event(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Event: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define log_stress(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Stress: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }

View File

@ -23,7 +23,9 @@ using namespace std;
#include "returnCodes.h"
#include "nodeTimers.h"
#ifndef ALIGN_PACK
#define ALIGN_PACK(x) __attribute__((packed)) x
#endif
/* Out-Of-Service Stress tests */
#define WANT_SYSINV_API_STRESS 0x00000001
@ -359,8 +361,12 @@ void daemon_exit ( void );
#define CONTROLLER_0 ((const char *)"controller-0")
#define CONTROLLER_1 ((const char *)"controller-1")
#define CONTROLLER_2 ((const char *)"controller-2")
#define CONTROLLER ((const char *)"controller")
#define STORAGE_0 ((const char *)"storage-0")
#define STORAGE_1 ((const char *)"storage-1")
/* The infrastructure networking floating IP
*
* Note: If there is no infra then this label will resolve

View File

@ -267,7 +267,7 @@ bool is_goenabled ( int nodeType, bool pass )
return daemon_is_file_present ( file );
}
#define LOG_MEMORY(buf) ilog ("%s", buf ); \
#define LOG_MEMORY(buf) syslog ( LOG_INFO, "%s", buf ); \
buf_ptr = &buf[0]; \
MEMSET_ZERO ( buf );
@ -279,7 +279,7 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
char buf[0x1024] ;
char * buf_ptr = &buf[0];
MEMSET_ZERO ( buf );
ilog ("Dumping Memory:\n");
syslog ( LOG_INFO, "Dumping Memory: %ld bytes", bytes );
if ( format == 4 )
{
int loops = bytes/format ;
@ -294,7 +294,6 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
buf_ptr += sprintf ( buf_ptr, "%c", *byte_ptr) ;
else
buf_ptr += sprintf ( buf_ptr, "%c", '.');
byte_ptr++ ;
}
LOG_MEMORY(buf);
@ -315,7 +314,6 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
buf_ptr += sprintf ( buf_ptr , "%c", *byte_ptr) ;
else
buf_ptr += sprintf ( buf_ptr , "%c", '.');
byte_ptr++ ;
}
LOG_MEMORY(buf);
@ -336,21 +334,12 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
buf_ptr += sprintf ( buf_ptr , "%c", *byte_ptr) ;
else
buf_ptr += sprintf ( buf_ptr , "%c", '.');
byte_ptr++ ;
}
LOG_MEMORY(buf);
word_ptr += 4 ;
}
}
byte_ptr = (uint8_t*)raw_ptr ;
ilog ("Raw Hex Dump : %ld\n", bytes );
for ( unsigned int x = 0 ; x < bytes ; x++ )
{
buf_ptr += sprintf ( buf_ptr, " %02x", *byte_ptr );
byte_ptr++ ;
}
// printf ("\n\n");
}

View File

@ -93,7 +93,7 @@
#define FAIL_INVALID_DATA (71)
#define FAIL_BAD_STATE (72)
#define FAIL_KEY_VALUE_PARSE (73)
#define FAIL____UNUSED____74 (74)
#define FAIL_DATA_SIZE (74)
#define FAIL_NOT_FOUND (75)
#define FAIL_WORKQ_TIMEOUT (76)
#define FAIL_HTTP_DELETE (77)

View File

@ -207,7 +207,7 @@ int daemon_run_testhead ( void );
#define CONFIG_AGENT_INV_PORT 0x00000100 /**< Inventory Port Number */
#define CONFIG_AGENT_HA_PORT 0x00000200 /**< HA Framework Port Number */
#define CONFIG_CLIENT_MTCALARM_PORT 0x00000400 /**< Send alarm requests to */
#define CONFIG_RESERVED_800 0x00000800 /**< */
#define CONFIG_AGENT_SM_CLIENT_PORT 0x00000800 /**< Port to Send SM data on */
#define CONFIG_MTC_TO_HWMON_CMD_PORT 0x00001000 /**< HWmon Port Number */
#define CONFIG_AGENT_KEY_PORT 0x00002000 /**< Keystone HTTP port */
#define CONFIG_AGENT_HBS_MTC_PORT 0x00004000 /**< Heartbeat Service Port */
@ -217,8 +217,8 @@ int daemon_run_testhead ( void );
#define CONFIG_AGENT_MTC_MGMNT_PORT 0x00040000 /**< Agent Infr network port */
#define CONFIG_AGENT_TOKEN_REFRESH 0x00080000 /**< Token refresh rate mask */
#define CONFIG_CLIENT_MTC_INFRA_PORT 0x00100000 /**< Client Infra nwk mtc port */
#define CONFIG_CLIENT_MTC_MGMNT_PORT 0x00200000 /**< Client mgmnt nwk mtc port */
#define CONFIG_AGENT_VIM_CMD_PORT 0x00400000 /**< VIM Command Port Mask */
#define CONFIG_CLIENT_MTC_MGMNT_PORT 0x00200000 /**< Client mgmnt nwk mtc port */
#define CONFIG_AGENT_SM_SERVER_PORT 0x00400000 /**< Port to RX data from SM */
#define CONFIG_CLIENT_HBS_INFRA_PORT 0x00800000 /**< Infrastructure ntwk Port */
#define CONFIG_CLIENT_HBS_MGMNT_PORT 0x01000000 /**< Management network Port */
#define CONFIG_CLIENT_HBS_EVENT_PORT 0x02000000 /**< Heartbeat Event Messaging */

View File

@ -90,6 +90,15 @@ of spec operating conditions that can reduce outage time through automated
notification and recovery thereby improving overall platform availability
for the customer.
%package -n mtce-dev
Summary: Titanuim Server Maintenance Software Development Package
Group: base
Provides: mtce-dev = %{version}-%{release}
%description -n mtce-dev
Titanuim Cloud Maintenance. This package contains header files,
and related items necessary for software development.
%package -n mtce-pmon
Summary: Titanuim Server Maintenance Process Monitor Package
Group: base
@ -424,6 +433,9 @@ install -m 644 -p -D %{_buildsubdir}/fsmon/scripts/fsmon.logrotate %{buildroot}%
install -m 644 -p -D %{_buildsubdir}/hwmon/scripts/hwmon.logrotate %{buildroot}%{local_etc_logrotated}/hwmon.logrotate
install -m 644 -p -D %{_buildsubdir}/alarm/scripts/mtcalarm.logrotate %{buildroot}%{local_etc_logrotated}/mtcalarm.logrotate
# software development files
install -m 644 -p -D %{_buildsubdir}/heartbeat/mtceHbsCluster.h %{buildroot}/%{_includedir}/mtceHbsCluster.h
install -m 755 -p -D %{_buildsubdir}/public/libamon.so.$MAJOR %{buildroot}%{_libdir}/libamon.so.$MAJOR
cd %{buildroot}%{_libdir} ; ln -s libamon.so.$MAJOR libamon.so.$MAJOR.$MINOR
cd %{buildroot}%{_libdir} ; ln -s libamon.so.$MAJOR libamon.so
@ -621,3 +633,10 @@ install -m 755 -d %{buildroot}/var/run
%{_sysconfdir}/init.d/hostw
%{local_bindir}/hostwd
###############################
# Maintenance Software Development RPM
###############################
%files -n mtce-dev
%defattr(-,root,root,-)
%{_includedir}/mtceHbsCluster.h

View File

@ -269,7 +269,7 @@ nodeLinkClass::nodeLinkClass()
hbs_ready = false ;
hbs_state_change = false ;
hbs_disabled = true ;
hbs_pulse_period = hbs_pulse_period_save = 200 ;
hbs_pulse_period = hbs_pulse_period_save = 0 ;
hbs_minor_threshold = HBS_MINOR_THRESHOLD ;
hbs_degrade_threshold = HBS_DEGRADE_THRESHOLD ;
hbs_failure_threshold = HBS_FAILURE_THRESHOLD ;
@ -7325,18 +7325,40 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_
int send_event ( string & hostname, unsigned int cmd, iface_enum iface );
int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear )
int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool send_clear )
{
int rc = FAIL ;
if ( ! hostname.empty() )
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
bool want_log = true ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
node_ptr->monitor[iface] = true_false ;
if ( node_ptr->monitor[iface] == true_false )
continue ;
if ( iface == INFRA_IFACE )
{
if ( this->infra_network_provisioned == false )
continue ;
if ( node_ptr->monitor[MGMNT_IFACE] == true_false )
want_log = false ;
}
if ( send_clear == true )
{
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, (iface_enum)iface ) ;
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, (iface_enum)iface ) ;
}
if ( true_false == true )
{
if ( want_log )
{
ilog ("%s starting heartbeat service \n",
hostname.c_str());
}
node_ptr->no_work_log_throttle = 0 ;
node_ptr->b2b_misses_count[iface] = 0 ;
node_ptr->hbs_misses_count[iface] = 0 ;
@ -7345,16 +7367,20 @@ int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool tr
node_ptr->hbs_failure[iface] = false ;
node_ptr->hbs_minor[iface] = false ;
node_ptr->hbs_degrade[iface] = false ;
if ( send_clear == true )
}
else
{
if ( want_log )
{
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ;
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ;
ilog ("%s stopping heartbeat service\n",
hostname.c_str());
}
}
return PASS ;
node_ptr->monitor[iface] = true_false ;
}
return PASS ;
}
return ( rc );
return ( FAIL );
}
/* store the current hardware monitor monitoring state */
@ -7887,11 +7913,11 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
}
else
{
pulse_list[iface].head_ptr = pulse_list[iface].head_ptr->pulse_link[iface].next_ptr ;
pulse_list[iface].head_ptr->pulse_link[iface].prev_ptr = NULL ;
pulse_list[iface].head_ptr = pulse_list[iface].head_ptr->pulse_link[iface].next_ptr ;
pulse_list[iface].head_ptr->pulse_link[iface].prev_ptr = NULL ;
}
}
}
}
else if ( pulse_list[iface].tail_ptr == pulse_ptr )
{
qlog2 ("%s Pulse: Multiple Node -> Tail Case : %d of %d\n", node_ptr->hostname.c_str(), pulse_ptr->linknum[iface], pulses[iface] );
@ -7906,19 +7932,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
}
else
{
pulse_list[iface].tail_ptr = pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr ;
pulse_list[iface].tail_ptr->pulse_link[iface].next_ptr = NULL ;
}
pulse_list[iface].tail_ptr = pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr ;
pulse_list[iface].tail_ptr->pulse_link[iface].next_ptr = NULL ;
}
}
else
{
/* July 1 emacdona: Make failure path case more robust */
if ( pulse_ptr == NULL ) { slog ("Internal Err 1\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 2\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].prev_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 3\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 4\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].next_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 5\n"); rc = FAIL; }
if ( pulse_ptr == NULL ) { slog ("Internal Err 1\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 2\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 3\n"); rc = FAIL; }
if ( rc == FAIL )
{
slog ("%s Null pointer error splicing %s out of pulse list with %d pulses remaining (Monitoring:%s)\n",
@ -7935,7 +7958,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
}
if ( rc == PASS )
{
pulse_ptr->linknum[iface]-- ; // = 0 ;
pulse_ptr->linknum[iface]-- ;
}
pulses[iface]-- ;
}
@ -8082,14 +8105,26 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
int nodeLinkClass::lost_pulses ( iface_enum iface )
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
{
int rc = PASS ;
int lost = 0 ;
/*
* Assume storage-0 is responding until proven otherwise.
* Keep in mind that this interface counts nodes that have not responded ;
* not those that have.
*/
storage_0_responding = true ;
/*
* Loop over the pulse_list which now only contains a list of hosts
* that have not responded in this heartbeat period.
*/
for ( ; pulse_list[iface].head_ptr != NULL ; )
{
daemon_signal_hdlr ();
pulse_ptr = pulse_list[iface].head_ptr ;
lost++ ;
if ( active )
{
string flat = "Flat Line:" ;
@ -8098,6 +8133,15 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
pulse_ptr->b2b_pulses_count[iface] = 0 ;
// pulse_ptr->max_count[iface]++ ;
/*
* Update storage_0_responding reference to false if storage-0
* is found in the list of lost pulses.
*/
if ( pulse_ptr->hostname == STORAGE_0 )
{
storage_0_responding = false ;
}
/* Don't log single misses unless in debug mode */
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
{
@ -8156,8 +8200,9 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}
#ifdef WANT_HBS_MEM_LOGS
mem_log ( flat, pulse_ptr->b2b_misses_count[iface], pulse_ptr->hostname.c_str());
#endif
if ( iface == MGMNT_IFACE )
{
if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold )
@ -8252,8 +8297,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
}
rc = remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS );
if ( rc != PASS )
if ( remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS ))
{
elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface));
@ -8266,7 +8310,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
break ;
}
}
return (rc);
return (lost);
}
/* Return true if the specified interface is being monitored for this host */
@ -8301,7 +8345,7 @@ void nodeLinkClass::print_pulse_list ( iface_enum iface )
if ( pulse_list[iface].head_ptr != NULL )
{
for ( pulse_ptr = pulse_list[iface].head_ptr ;
for ( pulse_ptr = pulse_list[iface].head_ptr ;
pulse_ptr != NULL ;
pulse_ptr = pulse_ptr->pulse_link[iface].next_ptr )
{
@ -8310,12 +8354,15 @@ void nodeLinkClass::print_pulse_list ( iface_enum iface )
}
dlog ("Patients: %s\n", pulse_host_list.c_str());
}
#ifdef WANT_HBS_MEM_LOGS
if ( pulses[iface] && !pulse_host_list.empty() )
{
string temp = get_iface_name_str(iface) ;
temp.append(" Patients :") ;
mem_log ( temp, pulses[iface], pulse_host_list );
}
#endif
}

View File

@ -1940,7 +1940,7 @@ public:
void manage_pulse_flags ( string & hostname, unsigned int flags );
/** Control the heartbeat monitoring state of a host */
int mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear );
int mon_host ( const string & hostname, bool true_false, bool send_clear );
/** Return true if the pulse list is empty */
bool pulse_list_empty ( iface_enum iface );
@ -1956,7 +1956,7 @@ public:
* that exceed preset thresholds.
*
*/
int lost_pulses ( iface_enum iface );
int lost_pulses ( iface_enum iface, bool & storage_0_responding );
bool monitored_pulse ( string hostname , iface_enum iface );

View File

@ -4,10 +4,10 @@
# SPDX-License-Identifier: Apache-2.0
#
SRCS = hbsAlarm.cpp hbsClient.cpp hbsAgent.cpp hbsPmon.cpp hbsStubs.cpp
SRCS = hbsAlarm.cpp hbsClient.cpp hbsAgent.cpp hbsPmon.cpp hbsUtil.cpp hbsCluster.cpp hbsStubs.cpp
OBJS = $(SRCS:.cpp=.o)
LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -lrt -lamon -lcrypto -luuid
LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -lrt -lamon -lcrypto -luuid -ljson-c
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
INCLUDES += -I../common -I../alarm -I../maintenance -I../public
@ -31,8 +31,8 @@ endif
all: static_analysis common agent client
build: static_analysis $(OBJS)
$(CXX) $(CCFLAGS) hbsAlarm.o hbsAgent.o hbsStubs.o ../common/nodeClass.o -L../public -L../alarm $(LDLIBS) -o hbsAgent
$(CXX) $(CCFLAGS) hbsClient.o hbsPmon.o -L../public -L../alarm $(LDLIBS) -o hbsClient
$(CXX) $(CCFLAGS) hbsAlarm.o hbsAgent.o hbsUtil.o hbsCluster.o hbsStubs.o ../common/nodeClass.o -L../public -L../alarm $(LDLIBS) -o hbsAgent
$(CXX) $(CCFLAGS) hbsClient.o hbsPmon.o hbsUtil.o -L../public -L../alarm $(LDLIBS) -o hbsClient
common:
( cd ../common ; make clean ; make lib VER=$(VER) VER_MJR=$(VER_MJR))

View File

@ -41,6 +41,7 @@ using namespace std;
#include "hbsBase.h" /* Heartbeat Base Header File */
#include "hbsAlarm.h" /* for ... hbsAlarm_clear_all */
#include "alarm.h" /* for ... alarm send message to mtcalarmd */
#include "jsonUtil.h" /* for ... jsonUtil_get_key_val */
/**************************************************************
* Implementation Structure
@ -68,6 +69,8 @@ using namespace std;
/* Number of back to back interface errors before the interface is re-initialized. */
#define INTERFACE_ERRORS_FOR_REINIT (8)
#define MAX_LEN 1000
/* Historical String data for mem_logs */
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
@ -90,6 +93,8 @@ int module_init ( void )
return (PASS);
}
static unsigned int controller_number = 0 ;
void daemon_sigchld_hdlr ( void )
{
; /* dlog("Received SIGCHLD ... no action\n"); */
@ -184,14 +189,16 @@ void daemon_exit ( void )
CONFIG_AGENT_HBS_DEGRADE |\
CONFIG_AGENT_HBS_FAILURE |\
CONFIG_AGENT_MULTICAST |\
CONFIG_SCHED_PRIORITY |\
CONFIG_SCHED_PRIORITY |\
CONFIG_MTC_TO_HBS_CMD_PORT |\
CONFIG_HBS_TO_MTC_EVENT_PORT |\
CONFIG_AGENT_HBS_MGMNT_PORT |\
CONFIG_AGENT_HBS_INFRA_PORT |\
CONFIG_CLIENT_HBS_MGMNT_PORT |\
CONFIG_CLIENT_MTCALARM_PORT |\
CONFIG_CLIENT_HBS_INFRA_PORT )
CONFIG_CLIENT_HBS_INFRA_PORT |\
CONFIG_AGENT_SM_SERVER_PORT |\
CONFIG_AGENT_SM_CLIENT_PORT)
/* Startup config read */
static int hbs_config_handler ( void * user,
@ -203,6 +210,8 @@ static int hbs_config_handler ( void * user,
if (MATCH("agent", "heartbeat_period"))
{
int curr_period = hbsInv.hbs_pulse_period ;
config_ptr->hbs_pulse_period = atoi(value);
hbsInv.hbs_pulse_period = atoi(value);
hbsInv.hbs_state_change = true ;
@ -227,10 +236,14 @@ static int hbs_config_handler ( void * user,
}
}
}
hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ;
if ( curr_period != hbsInv.hbs_pulse_period )
{
/* initialize cluster info */
hbs_cluster_init ( hbsInv.hbs_pulse_period );
}
}
hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ;
if (MATCH("agent", "hbs_minor_threshold"))
{
config_ptr->hbs_minor_threshold =
@ -312,6 +325,16 @@ static int hbs_config_handler ( void * user,
config_ptr->hbs_agent_mgmnt_port = atoi(value);
config_ptr->mask |= CONFIG_AGENT_HBS_MGMNT_PORT ;
}
else if (MATCH("agent", "sm_server_port"))
{
config_ptr->sm_server_port = atoi(value);
config_ptr->mask |= CONFIG_AGENT_SM_SERVER_PORT ;
}
else if (MATCH("agent", "sm_client_port"))
{
config_ptr->sm_client_port = atoi(value);
config_ptr->mask |= CONFIG_AGENT_SM_CLIENT_PORT ;
}
else if (MATCH("client", "hbs_client_mgmnt_port"))
{
config_ptr->hbs_client_mgmnt_port = atoi(value);
@ -617,6 +640,34 @@ int alarm_port_init ( void )
return ( hbs_sock.alarm_sock->return_status ) ;
}
int hbs_sm_sockets_init ( void )
{
int rc = PASS ;
/* Create an UDP RX Message Socket for SM Requests; LO interface only */
hbs_sock.sm_server_sock = new msgClassRx(LOOPBACK_IP, hbs_config.sm_server_port, IPPROTO_UDP);
if ( ! hbs_sock.sm_server_sock )
{
elog ("Failed to setup SM receive socket");
rc = FAIL_SOCKET_CREATE ;
}
/* Create an UDP TX Message Socket for SM Requests; LO interface only */
hbs_sock.sm_client_sock = new msgClassTx(LOOPBACK_IP, hbs_config.sm_client_port,IPPROTO_UDP);
if ( ! hbs_sock.sm_client_sock )
{
elog ("Failed to setup SM transmit socket");
rc = FAIL_SOCKET_CREATE ;
}
if ( rc == PASS )
{
hbs_sock.sm_server_sock->sock_ok(true);
hbs_sock.sm_client_sock->sock_ok(true);
}
return (rc);
}
/* Init the internal/local sockets ; the ones that will not change.
* This way we don't miss add and start commands from maintenance. */
@ -654,6 +705,9 @@ int hbs_int_socket_init ( void )
{
elog ("Alarm port setup or registration failed (rc:%d)\n", rc );
}
rc = hbs_sm_sockets_init () ;
return (rc);
}
@ -697,26 +751,36 @@ int hbs_pulse_request ( iface_enum iface,
string hostname_clue,
unsigned int lookup_clue)
{
int rc = PASS ;
#define MAX_LEN 1000
#ifdef WANT_HBS_MEM_LOGS
char str[MAX_LEN] ;
/* Add the sequence number */
hbs_sock.tx_mesg[iface].s = seq_num ;
memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME );
if (( lookup_clue ) &&
( hostname_clue.length() <= MAX_CHARS_HOSTNAME ))
{
hbs_sock.tx_mesg[iface].c = lookup_clue ;
memcpy ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE],
hostname_clue.data(),
hostname_clue.length());
}
/* Message length is the size of the sequence number, the clue and the buffer */
int msg_len = (HBS_MAX_MSG+(sizeof(unsigned int)*2)) ;
#endif
int bytes = 0 ;
if ( hbs_sock.tx_sock[iface] )
{
// int unused_networks = 0 ;
memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME );
/* Add message version - 0 -> 1 with the addition of cluster information */
hbs_sock.tx_mesg[iface].v = HBS_MESSAGE_VERSION ;
/* Add the sequence number */
hbs_sock.tx_mesg[iface].s = seq_num ;
if (( lookup_clue ) &&
( hostname_clue.length() <= MAX_CHARS_HOSTNAME ))
{
hbs_sock.tx_mesg[iface].c = lookup_clue ;
memcpy ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE],
hostname_clue.data(),
hostname_clue.length());
}
/* Append the cluster info to the pulse request */
hbs_cluster_append(hbs_sock.tx_mesg[iface]) ;
/* Calculate the total message size */
bytes = sizeof(hbs_message_type)-hbs_cluster_unused_bytes();
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__NO_PULSE_REQUEST, "any" , get_iface_name_str(iface) ) )
{
@ -727,14 +791,15 @@ int hbs_pulse_request ( iface_enum iface,
goto hbs_pulse_request_out ;
}
#endif
if ( (rc = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], msg_len)) < 0 )
if ( (bytes = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], bytes)) < 0 )
{
elog("Failed to send Pulse request: %d:%s to %s.%d (rc:%i ; %d:%s)\n",
hbs_sock.tx_mesg[iface].s,
&hbs_sock.tx_mesg[iface].m[0],
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
rc, errno, strerror(errno) );
bytes, errno, strerror(errno) );
return (FAIL_SOCKET_SENDTO);
}
}
@ -748,16 +813,17 @@ int hbs_pulse_request ( iface_enum iface,
hbs_pulse_request_out:
#endif
mlog1("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%x:%s\n",
get_iface_name_str(iface), rc,
mlog("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%d:%x:%s\n",
get_iface_name_str(iface), bytes,
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.tx_mesg[iface].v,
hbs_sock.tx_mesg[iface].s,
hbs_sock.tx_mesg[iface].c,
hbs_sock.tx_mesg[iface].f,
hbs_sock.tx_mesg[iface].m);
#ifdef WANT_HBS_MEM_LOGS
snprintf ( &str[0], MAX_LEN, "%s Pulse Req: %17s:%5d: %u:%u:%s\n",
get_iface_name_str(iface),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
@ -766,6 +832,7 @@ hbs_pulse_request_out:
hbs_sock.tx_mesg[iface].c,
hbs_sock.tx_mesg[iface].m);
mem_log (&str[0]);
#endif
return (PASS);
}
@ -785,7 +852,7 @@ string get_hostname_from_pulse ( char * msg_ptr )
int _pulse_receive ( iface_enum iface , unsigned int seq_num )
{
int n = 0 ;
int bytes = 0 ;
int detected_pulses = 0 ;
@ -796,7 +863,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
do
{
/* Clean the receive buffer */
memset ( hbs_sock.rx_mesg[iface].m, 0, HBS_MAX_MSG );
memset ( hbs_sock.rx_mesg[iface].m, 0, sizeof(hbs_message_type) );
hbs_sock.rx_mesg[iface].s = 0 ;
hbs_sock.rx_mesg[iface].c = 0 ;
if ( hbs_sock.rx_sock[iface] == NULL )
@ -804,10 +871,10 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
elog ("%s cannot receive pulses - null object\n", get_iface_name_str(iface) );
return (0);
}
if ( (n = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 )
if ( (bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 )
{
mlog1 ("%s Pulse Rsp: (%5d): %17s:%5d: %d:%d:%x:%s\n",
get_iface_name_str(iface), n,
get_iface_name_str(iface), bytes,
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
@ -839,7 +906,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
}
#endif
mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str());
// mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str());
if ( !hostname.compare("localhost") )
{
mlog3 ("%s Pulse Rsp (local): %17s:%5d: %d:%d:%x:%s\n",
@ -868,7 +935,6 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
{
if ( hbsInv.monitored_pulse ( hostname , iface ) == true )
{
#define MAX_LEN 1000
char str[MAX_LEN] ;
string extra = "Rsp" ;
@ -880,25 +946,42 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
{
rc = hbsInv.remove_pulse ( hostname, iface, hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f ) ;
}
snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %17s:%5d: %u:%u:%x:%s\n",
get_iface_name_str(iface), extra.c_str(), n,
snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %s:%d: %u:%u:%x:%s\n",
get_iface_name_str(iface), extra.c_str(), bytes,
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].c,
hbs_sock.rx_mesg[iface].f,
hbs_sock.rx_mesg[iface].m);
mlog1 ("%s", &str[0]);
mlog ("%s", &str[0]);
#ifdef WANT_HBS_MEM_LOGS
mem_log (str);
#endif
if ( extra.empty())
{
detected_pulses++ ;
}
/* don't save data from self */
if ( hostname != hbsInv.my_hostname )
{
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
{
if ( iface == MGMNT_IFACE )
hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_MGMT , hbs_sock.rx_mesg[iface]);
else
hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_INFRA , hbs_sock.rx_mesg[iface]);
}
}
else
{
ilog ("skipping my hostname");
}
}
else
{
mlog3 ("%s Pulse Dis: (%5d): %17s:%5d: %d:%d:%x:%s\n",
get_iface_name_str(iface), n,
get_iface_name_str(iface), bytes,
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
@ -934,7 +1017,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
hbs_sock.rx_mesg[iface].m) ;
}
}
} while ( n > 0 ) ;
} while ( bytes > 0 ) ;
monitor_scheduling ( after_rx_time, before_rx_time, detected_pulses, SCHED_MONITOR__RECEIVER );
return (detected_pulses);
}
@ -951,6 +1034,8 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface )
if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS )
{
daemon_dump_membuf_banner ();
hbsInv.print_node_info ();
hbs_cluster_log( hbsInv.my_hostname, "event");
daemon_dump_membuf ();
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header());
}
@ -1038,6 +1123,9 @@ int daemon_init ( string iface, string nodetype )
/* Initialize the hbs control struct */
MEMSET_ZERO ( hbs_ctrl );
/* init the utility module */
hbs_utils_init ();
/* initialize the timer */
mtcTimer_init ( hbsTimer, "controller", "heartbeat" );
@ -1091,9 +1179,123 @@ int daemon_init ( string iface, string nodetype )
return (rc);
}
/*****************************************************************************
 *
 * Name       : hbs_sm_handler
 *
 * Description: Try and receive a Service Management request from sm_server_sock
 *
 *              Expecting request in the following form:
 *              ~66 bytes with moderate spacing
 *
 *              {
 *                 "origin" :"sm",
 *                 "service":"heartbeat",
 *                 "request":"cluster_info",
 *                 "reqid"  : number
 *              }
 *
 *              Successfully parsed request results in a call to
 *              hbs_cluster_send which sends the latest snapshot of
 *              the heartbeat cluster info to SM.
 *
 * Assumptions: log flooding is avoided.
 *
 * Returns    : Nothing
 *
 ****************************************************************************/
static int _hbs_sm_handler_log_throttle = 0 ;
void hbs_sm_handler ( void )
{
#define _MAX_MSG_LEN (80)
#define _MAX_LOG_CNT (1000)
#define PRIMARY_LABEL "origin"
#define SERVICE_LABEL "service"
#define REQUEST_LABEL "request"
#define REQID_LABEL "reqid"
#define SUPPORTED_ORIGIN "sm"
#define SUPPERTED_SERVICE "heartbeat"
#define SUPPORTED_REQUEST "cluster_info"

    /* One spare byte beyond the maximum read length guarantees that the
     * zeroed buffer stays NUL terminated even for a full length message ;
     * the json utilities treat it as a C string. */
    char sm_mesg[_MAX_MSG_LEN+1] ;
    MEMSET_ZERO(sm_mesg);

    int bytes = hbs_sock.sm_server_sock->read(&sm_mesg[0], _MAX_MSG_LEN);

    /* Only a positive byte count is a received message. Testing for
     * non-zero would also accept the -1 error return and make the
     * receive error branch below unreachable. */
    if ( bytes > 0 )
    {
        /* Expecting request in the following form:
         * { "origin":"sm" ... } */
        if ( sm_mesg[0] == '{' )
        {
            int reqid = 0 ;
            string origin = "" ;
            string service = "" ;
            string request = "" ;
            if ( jsonUtil_get_key_val ( sm_mesg, PRIMARY_LABEL, origin ) != PASS )
            {
                wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                                 "missing primary label 'origin' in request.");
            }
            else if (( origin == SUPPORTED_ORIGIN ) &&
                     ( jsonUtil_get_key_val ( sm_mesg, SERVICE_LABEL, service ) == PASS ) &&
                     ( jsonUtil_get_key_val ( sm_mesg, REQUEST_LABEL, request ) == PASS ) &&
                     ( jsonUtil_get_key_val_int ( sm_mesg, REQID_LABEL, reqid ) == PASS ))
            {
                if (( service == SUPPERTED_SERVICE ) &&
                    ( request == SUPPORTED_REQUEST ))
                {
                    /* success path ... */
                    hbs_cluster_send( hbs_sock.sm_client_sock, reqid );

                    /* reset log throttle */
                    _hbs_sm_handler_log_throttle = 0 ;
                }
                else
                {
                    /* labels were present but carry an unsupported
                     * service or request value */
                    wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                                     "missing service or request labels in request.");
                }
            }
            else
            {
                /* unsupported origin or one of the label lookups failed */
                wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                                 "failed to parse one or more request labels.");
            }
        }
        else
        {
            wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                             "improperly formatted json string request.");
        }
    }
    else if ( bytes == -1 )
    {
        wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                         "message receive error (%d:%s)",
                         errno, strerror(errno));
    }
    else
    {
        /* zero (or any other non-positive) byte count */
        wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                         "unknown error Error (rc:%d)", bytes );
    }
    dlog ("... %s", sm_mesg );
}
/****************************************************************************
*
* Name : daemon_service_run
*
* Description: Daemon's main loop
*
***************************************************************************/
void daemon_service_run ( void )
{
#ifdef WANT_HBS_MEM_LOGS
int exp_pulses[MAX_IFACES] ;
#endif
int rc = PASS ;
int counter = 0 ;
int goenabled_wait_log_throttle = 0 ;
@ -1154,6 +1356,8 @@ void daemon_service_run ( void )
daemon_exit ();
}
/* set this controller as provisioned */
hbs_manage_controller_state ( hbsInv.my_hostname , true );
/* CGTS 4114: Small Footprint: Alarm 200.005 remains active after connectivity restored
*
@ -1195,6 +1399,16 @@ void daemon_service_run ( void )
/* enable the base level signal handler latency monitor */
daemon_latency_monitor (true);
/* load this controller index number - used for cluster stuff */
if ( hbsInv.my_hostname == CONTROLLER_0 )
controller_number = 0 ;
else
controller_number = 1 ;
/* tell the cluster which controller this is and
* how many networks are being monitored */
hbs_cluster_nums (controller_number,hbsInv.infra_network_provisioned ?2:1);
/* Run heartbeat service forever or until stop condition */
for ( hbsTimer.ring = false ; ; )
{
@ -1315,6 +1529,14 @@ void daemon_service_run ( void )
FD_SET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds);
}
/* Add the sm request receiver to the select list */
if (( hbs_sock.sm_server_sock ) &&
( hbs_sock.sm_server_sock->getFD()))
{
socks.push_front (hbs_sock.sm_server_sock->getFD());
FD_SET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds);
}
/* Add the netlink event listener to the select list */
if ( hbs_sock.netlink_sock )
{
@ -1379,6 +1601,11 @@ void daemon_service_run ( void )
hbs_sock.fired[INFRA_INTERFACE] = true ;
}
if ((hbs_sock.sm_server_sock != NULL ) &&
( FD_ISSET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds)))
{
hbs_sm_handler();
}
if ((hbs_sock.mtc_to_hbs_sock != NULL ) &&
( FD_ISSET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds)))
{
@ -1404,7 +1631,7 @@ void daemon_service_run ( void )
inv.nodetype = msg.parm[0];
hbsInv.add_heartbeat_host ( inv ) ;
hostname_inventory.push_back ( hostname );
ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), inv.nodetype );
ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), msg.parm[0] );
/* clear any outstanding alarms on the ADD */
if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE )
@ -1415,10 +1642,7 @@ void daemon_service_run ( void )
}
else if ( msg.cmd == MTC_CMD_DEL_HOST )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
}
hbsInv.mon_host ( hostname, false, false );
hostname_inventory.remove ( hostname );
hbsInv.del_host ( hostname );
ilog ("%s deleted from heartbeat service\n", hostname.c_str());
@ -1432,27 +1656,24 @@ void daemon_service_run ( void )
}
else if ( msg.cmd == MTC_CMD_STOP_HOST )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false, true );
}
ilog ("%s stopping heartbeat service\n", hostname.c_str());
hbsInv.mon_host ( hostname, false, true );
hbs_cluster_del ( hostname );
ilog ("%s stopping heartbeat service\n",
hostname.c_str());
}
else if ( msg.cmd == MTC_CMD_START_HOST )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, true, true );
}
ilog ("%s starting heartbeat service\n", hostname.c_str());
hbsInv.mon_host ( hostname, true, true );
hbs_cluster_add ( hostname );
ilog ("%s starting heartbeat service\n",
hostname.c_str());
}
else if ( msg.cmd == MTC_RESTART_HBS )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
hbsInv.mon_host ( hostname, (iface_enum)iface, true, false );
}
hbsInv.mon_host ( hostname, false, false );
hbsInv.mon_host ( hostname, true, false );
ilog ("%s restarting heartbeat service\n", hostname.c_str());
hbsInv.print_node_info();
}
@ -1616,7 +1837,9 @@ void daemon_service_run ( void )
int rri = 0 ;
string lf = "\n" ;
#ifdef WANT_HBS_MEM_LOGS
mem_log ((char*)lf.data());
#endif
/* Get the next Resource Reference Identifier
* and its Resource Identifier. These values
@ -1630,7 +1853,9 @@ void daemon_service_run ( void )
if (( iface == INFRA_IFACE ) && ( hbsInv.infra_network_provisioned == false ))
continue ;
#ifdef WANT_HBS_MEM_LOGS
exp_pulses[iface] =
#endif
hbsInv.hbs_expected_pulses[iface] =
hbsInv.create_pulse_list((iface_enum)iface);
@ -1759,28 +1984,33 @@ void daemon_service_run ( void )
if (( iface == INFRA_IFACE ) && ( hbsInv.infra_network_provisioned != true ))
continue ;
#define MAX_LEN 1000
#ifdef WANT_HBS_MEM_LOGS
char str[MAX_LEN] ;
snprintf (&str[0], MAX_LEN, "%s Histogram: %d - %s\n",
get_iface_name_str(iface),
exp_pulses[iface],
arrival_histogram[iface].c_str());
mem_log (str);
if ( !unexpected_pulse_list[iface].empty() )
{
snprintf ( &str[0], MAX_LEN, "%s Others : %s\n",
get_iface_name_str(iface),
unexpected_pulse_list[iface].c_str());
mem_log(str);
}
hbsInv.lost_pulses ( (iface_enum)iface );
#endif
/*
* Assume storage-0 is responding until otherwise proven
* its not. Keep in mind that the 'lost_pulses' interface
* only counts nodes that have not responded.
*/
bool storage_0_responding = true ;
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
}
hbsTimer.ring = false ;
heartbeat_request = true ;
// hbs_cluster_log ( hbsInv.my_hostname, "->") ;
seq_num++ ;
}
daemon_load_fit ();
@ -1796,7 +2026,9 @@ void daemon_dump_info ( void )
hbsInv.print_node_info ();
hbsInv.memDumpAllState ();
#ifdef WANT_HBS_MEM_LOGS
daemon_dump_membuf (); /* write mem_logs to log file and clear log list */
#endif
}
const char MY_DATA [100] = { "eieio\n" } ;

View File

@ -27,6 +27,8 @@
#include <signal.h>
#include <list>
#include "msgClass.h"
#include "mtceHbsCluster.h"
#include "hbsCluster.h"
/**
* @addtogroup hbs_base
@ -38,6 +40,8 @@
#endif
#define __AREA__ "hbs"
// #define WANT_CLUSTER_DEBUG
#define ALIGN_PACK(x) __attribute__((packed)) x
/** Maximum service fail count before action */
@ -56,15 +60,18 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"};
#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME)
#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info
/* Heartbeat control structure.
 * Accessed through the pointer returned by get_hbs_ctrl_ptr(). */
struct hbs_ctrl_type
{
    unsigned int nodetype     ; /* node type value for this heartbeat service */
    bool         clear_alarms ; /* clear-alarms control flag                  */
} ;
hbs_ctrl_type * get_hbs_ctrl_ptr ( void );
/* A heartbeat service message
* if this structire is changed then
* if this structure is changed then
* hbs_pulse_request needs to be looked at
*/
typedef struct
@ -76,7 +83,7 @@ typedef struct
unsigned int s ;
/* Fast Lookup Clue Info */
unsigned int c ;
unsigned int c ;
/* Status Flags
* ------------
@ -89,6 +96,16 @@ typedef struct
/** message version number */
unsigned int v ;
/** Heartbeat cluster information that is put into heartbeat messages.
*
* Pulse Request : To hbsClient: Only 1 controller with up to 2 network types history.
* Pulse Response: From hbsClient: Can include up to 2 controllers with 2 networks each.
*
* This addition requires message version increment.
*
**/
mtce_hbs_cluster_type cluster ;
} ALIGN_PACK(hbs_message_type) ;
@ -104,6 +121,12 @@ typedef struct
/** Heartbeat Service Event Transmit Interface - hbsClient -> mtcAgent */
msgClassSock* hbs_ready_tx_sock;
/** Heartbeat Service SM Transmit Interface - hbsAgent -> sm */
msgClassSock* sm_client_sock;
/** Heartbeat Service SM Receive Interface - sm -> hbsAgent */
msgClassSock* sm_server_sock;
/** PMON Pulse Receive Interface - pmond -> hbsClient */
msgClassSock* pmon_pulse_sock;
@ -166,6 +189,9 @@ int hbs_refresh_pids ( std::list<procList> & proc_list );
int hbs_process_monitor ( std::list<procList> & pmon_list );
int hbs_self_recovery ( unsigned int cmd );
/* returns this controller's number ; 0 or 1 */
unsigned int hbs_get_controller_number ( void );
/* Setup the pulse messaging interfaces
* 'p' is a bool that indicates if the infrastructure network is provisioned
* 'p' = true means it is provisioned */
@ -184,6 +210,93 @@ int hbs_self_recovery ( unsigned int cmd );
} \
}
/*********** Common Heartbeat Utilities in hbsUtil.cpp ***************/
/* module init */
void hbs_utils_init ( void );
/* network enum to name lookup */
string hbs_cluster_network_name ( mtce_hbs_network_enum network );
/* Produce formatted clog's that characterize current and changing cluster
* history for a given network. Each log is controller/network specific. */
void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix );
/* Initialize the specified history array */
void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history );
/* Clear all history in the cluster vault */
void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster );
/******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/
/* Set the cluster vault to default state.
* Called upon daemon init or heartbeat period change. */
void hbs_cluster_init ( unsigned short period );
/* Calculate number of bytes that is unused in the cluster data structure.
* Primarily to know how many history elements are missing. */
unsigned short hbs_cluster_unused_bytes ( void );
/* Add and delete hosts from the monitored list.
* Automatically adjusts the numbers in the cluster vault. */
void hbs_cluster_add ( string & hostname );
void hbs_cluster_del ( string & hostname );
/* Report status of storage-0 */
void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
/* Look for and clog changes in cluster state */
int hbs_cluster_cmp ( hbs_message_type & msg );
/* Manage the enabled state of the controllers */
void hbs_manage_controller_state ( string & hostname, bool enabled );
/* Set the number of monitored hosts and this controller's
* number in the cluster vault. */
void hbs_cluster_nums ( unsigned short this_controller,
unsigned short monitored_networks );
/* Copy/Save the peer controller's cluster info from the hbsClient's
* pulse response into the cluster vault so its there and ready for
* an SM cluster_info request. */
int hbs_cluster_save ( string & hostname,
mtce_hbs_network_enum network,
hbs_message_type & msg );
/*
* Called by the hbsAgent pulse receiver to create a network specific
* history update entry consisting of
*
* 1. the number of monitored hosts
* 2. how many of those that responded in the last heartbeat period.
* 3. threshold storage-0 responding count and manage that state in that
* networks history header.
*/
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
bool storage_0_responding );
/* Called by the hbsAgent pulse transmitter to append this controllers
* running cluster view in the next multicast pulse request.
* The hbsClient is expected to loop this data and any other like data from
* the other controller back in its response. */
void hbs_cluster_append ( hbs_message_type & msg );
/* Produce formatted clog's that characterize current and changing cluster
* history for a given network. Each log is controller/network specific. */
void hbs_cluster_log ( string & hostname, string prefix );
/* Service SM cluster info request */
void hbs_sm_handler ( void );
/* send the cluster vault to SM */
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid );
/* print the contents of the vault */
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
/**
* @} hbs_base
*/

View File

@ -20,7 +20,6 @@
* daemon_files_init
* daemon_configure
* daemon_signal_init
* hbs_message_init
* hbs_socket_init
*
* daemon_service_run
@ -59,7 +58,7 @@ using namespace std;
#include "daemon_option.h" /* Common options for daemons */
#include "nodeTimers.h" /* for ... maintenance timers */
#include "nodeMacro.h" /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */
#include "nlEvent.h" /* for ... open_netlink_socket */
#include "nlEvent.h" /* for ... open_netlink_socket */
#include "hbsBase.h" /* Heartbeat Base Header File */
extern "C"
@ -95,8 +94,9 @@ typedef struct
std::list<procList>::iterator proc_ptr ;
} stallMon_type ;
static char pulse_resp_tx_hdr [HBS_MAX_MSG];
static char my_hostname [MAX_HOST_NAME_SIZE+1];
static char my_hostname_length ;
static string my_macaddr = "" ;
static string my_address = "" ;
static unsigned int my_nodetype= CGTS_NODE_NULL ;
@ -360,6 +360,12 @@ static int hbs_config_handler ( void * user,
config_ptr->pmon_pulse_port = atoi(value);
config_ptr->mask |= CONFIG_CLIENT_PULSE_PORT ;
}
#ifdef WANT_CLUSTER_DEBUG
else if (MATCH("agent", "sm_client_port"))
{
config_ptr->sm_client_port = atoi(value);
}
#endif
else
{
return (PASS);
@ -446,20 +452,6 @@ int daemon_configure ( void )
/* Initialization Utilities */
/****************************/
/* Initialize the unicast pulse response message */
/* One time thing ; tx same message all the time. */
/* For every interface index below MAX_IFACES the tx_mesg entry is zeroed and
 * its 'm' field is loaded with the response header (rsp_msg_header)
 * immediately followed by this host's name.
 * Always returns PASS. */
int hbs_message_init ( void )
{
/* Build the transmit pulse response message for each interface */
for ( int i = 0 ; i < MAX_IFACES ; i++ )
{
/* start from an all-zero message so unused bytes are null */
memset ( &hbs_sock.tx_mesg[i], 0, sizeof (hbs_message_type));
/* response header goes first ... */
memcpy ( &hbs_sock.tx_mesg[i].m[0], &rsp_msg_header[0], HBS_HEADER_SIZE );
/* ... followed by the hostname (copied without its null terminator).
 * NOTE(review): no bounds check here ; assumes my_hostname fits in the
 * space remaining in 'm' after the header - confirm. */
memcpy ( &hbs_sock.tx_mesg[i].m[HBS_HEADER_SIZE], my_hostname, strlen(my_hostname));
}
return (PASS);
}
/* Initialize pulse messaging for the specified interface
* This is called by a macro defined in hbsBase.h */
int _setup_pulse_messaging ( iface_enum i, int rmem )
@ -621,6 +613,11 @@ int hbs_socket_init ( void )
return (FAIL_SOCKET_NOBLOCK);
}
#ifdef WANT_CLUSTER_DEBUG
hbs_sock.sm_client_sock = new msgClassRx(LOOPBACK_IP,hbs_config.sm_client_port,IPPROTO_UDP);
if ( rc ) return (rc) ;
hbs_sock.sm_client_sock->sock_ok(true);
#endif
return (PASS);
}
@ -648,7 +645,7 @@ int get_pmon_pulses ( void )
if ( !strncmp ( &msg.hdr[0] , get_pmond_pulse_header(), MSG_HEADER_SIZE ))
{
pulses++ ;
mlog ("Pmon Pulse (%s) (%d)\n", msg.hdr, pulses );
mlog1 ("Pmon Pulse (%s) (%d)\n", msg.hdr, pulses );
}
else
{
@ -710,92 +707,87 @@ static unsigned int my_rri = 0 ;
static int rx_error_count[MAX_IFACES] = {0,0} ;
static int tx_error_count[MAX_IFACES] = {0,0} ;
#define ERROR_LOG_THRESHOLD (200)
int _service_pulse_request ( iface_enum iface , unsigned int flags )
{
unsigned int s = 0 ; /* Sequence number */
int n = 0 ; /* message size */
int rc = 0 ;
if (( iface != MGMNT_IFACE ) && ( iface != INFRA_IFACE ))
return (FAIL_BAD_CASE);
memset ( (char*) &hbs_sock.rx_mesg[iface], 0, sizeof(hbs_message_type));
if ( ! hbs_sock.rx_sock[iface] )
{
elog ("cannot receive from null rx_mesg[%s] socket\n", get_iface_name_str(iface) );
elog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"cannot receive from null rx_mesg[%s] socket\n",
get_iface_name_str(iface) );
return (FAIL_TO_RECEIVE);
}
else if ( hbs_sock.rx_sock[iface]->sock_ok() == false )
else if ( ! hbs_sock.tx_sock[iface] )
{
elog ("cannot receive from failed rx_mesg[%s] socket\n", get_iface_name_str(iface) );
elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
"cannot send to null mesg[%s] socket\n",
get_iface_name_str(iface) );
return (FAIL_TO_TRANSMIT);
}
else if ( ! hbs_sock.rx_sock[iface]->sock_ok() )
{
elog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"cannot receive from failed rx_mesg[%s] socket\n",
get_iface_name_str(iface) );
return (FAIL_TO_RECEIVE);
}
n = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type));
if( n < HBS_HEADER_SIZE )
else if ( ! hbs_sock.tx_sock[iface]->sock_ok() )
{
rx_error_count[iface]++ ;
elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
"cannot send to failed mesg[%s] socket\n",
get_iface_name_str(iface) );
return (FAIL_TO_TRANSMIT);
}
/* throtle the log so that if they come back-to-back we avoid flooding */
if ( n == -1 )
// MEMSET_ZERO(hbs_sock.rx_mesg[iface]);
int rx_bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type));
if ( rx_bytes < HBS_HEADER_SIZE )
{
if ( rx_bytes == -1 )
{
if ( rx_error_count[iface] > 1 )
{
wlog_throttled ( rx_error_count[iface], 500, "%s receive error (%d:%m)\n", get_iface_name_str(iface), errno );
}
wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"%s receive error (%d:%m)\n",
get_iface_name_str(iface), errno );
}
else
{
wlog_throttled ( rx_error_count[iface], 500, "%s message underrun (expected %ld but got %d)\n",
get_iface_name_str(iface), sizeof(hbs_message_type), n );
}
if ( rx_error_count[iface] == 100 )
{
wlog ( "%s is getting a lot of receive errors (%d:%m)\n", get_iface_name_str(iface), errno );
wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"%s message underrun (expected %ld but got %d)\n",
get_iface_name_str(iface),
sizeof(hbs_message_type), rx_bytes );
}
return (FAIL_TO_RECEIVE);
}
/* Clear the error count since we got a good receive */
rx_error_count[iface] = 0 ;
#ifdef WANT_NO_SELF_HEARTBEAT_REPLY
/* Don't reply to the heartbeat if the request came from myself */
if ( ! strncmp ( my_address.data(),
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
MAX_CHARS_IN_IP_ADDR ))
daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
if ( cfg_ptr->debug_msg )
{
ilog ("%s Refusing to send heartbeat response to self\n", hbs_sock.rx_sock[iface]->get_dst_addr()->toString());
return (PASS);
mlog ("\n");
mlog ("%s Pulse Req: %s:%5d: %d:%s RRI:%d\n",
get_iface_name_str(iface),
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].m,
hbs_sock.rx_mesg[iface].c);
}
#else
/* We use this to monitor pmond on active controller */
#endif
/* Save the sequence number */
s = hbs_sock.rx_mesg[iface].s ;
mlog ("\n");
mlog ("%s Pulse Req: %s:%5d: %d: :%s RRI:%d\n", get_iface_name_str(iface),
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].m,
hbs_sock.rx_mesg[iface].c);
/* verify the message header */
if ( strncmp ( (const char *)&hbs_sock.rx_mesg[iface].m, (const char *)&req_msg_header, HBS_HEADER_SIZE ))
{
wlog_throttled ( rx_error_count[iface], 200, "%s Invalid header (%d:%s)\n",
get_iface_name_str(iface),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].m );
mlog ("Detected: %d <%s>\n", HBS_HEADER_SIZE,hbs_sock.rx_mesg[iface].m);
mlog ("Expected: %d <%s>\n", HBS_HEADER_SIZE,req_msg_header);
wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"%s Invalid header (%d:%s)\n",
get_iface_name_str(iface),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].m );
return (FAIL_MSG_HEADER) ;
}
/* Manage the Resource Reference Index (RRI) "lookup clue" */
if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME ))
{
@ -807,32 +799,31 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
}
/* Add my RRI to the response message */
hbs_sock.tx_mesg[iface].c = my_rri ;
hbs_sock.rx_mesg[iface].c = my_rri ;
/* Clear struct */
hbs_sock.tx_mesg[iface].s = s ;
hbs_sock.tx_mesg[iface].f = flags ;
/* Manage OOB flags */
hbs_sock.rx_mesg[iface].f = flags ;
if ( pmonPulse_counter )
{
hbs_sock.tx_mesg[iface].f |= ( PMOND_FLAG ) ;
hbs_sock.rx_mesg[iface].f |= ( PMOND_FLAG ) ;
}
if ( infra_network_provisioned == true )
{
hbs_sock.tx_mesg[iface].f |= INFRA_FLAG ;
hbs_sock.rx_mesg[iface].f |= INFRA_FLAG ;
}
n = (int)sizeof(hbs_message_type) ;
if ( ! hbs_sock.tx_sock[iface] )
#define WANT_CLUSTER_INFO_LOG
#ifdef WANT_CLUSTER_INFO_LOG
/* Log the received cluster info */
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
{
elog ("cannot send to null tx_mesg[%s] socket\n", get_iface_name_str(iface) );
return (FAIL_TO_TRANSMIT);
}
else if ( hbs_sock.tx_sock[iface]->sock_ok() == false )
{
elog ("cannot send to failed tx_mesg[%s] socket\n", get_iface_name_str(iface) );
return (FAIL_TO_TRANSMIT);
char str[100] ;
// hbs_cluster_log (hbs_sock.rx_mesg[iface].cluster, hbs_sock.rx_mesg[iface].s );
snprintf ( &str[0], 100, " seq %6d with %d bytes from %s ", hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface));
string hostname = my_hostname ;
hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str );
}
#endif
#ifdef WANT_PULSE_RESPONSE_FIT
if (( iface == INFRA_IFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_INFRA_RSP )))
@ -848,44 +839,69 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
}
#endif
/* Send pulse response message with sequence number, flags and resource referecen index */
rc = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.tx_mesg[iface], n);
if ( rc == -1 )
int rc = PASS ;
/* replace the request header with the response header */
memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG );
/* Deal with the cluster info if it exists.
* ... Introduced in messaging version 1 */
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
{
elog ("Failed to sendto socket %d through %s:%d len:%d (%s) (%d:%s)\n",
hbs_sock.tx_sock[iface]->getFD(),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getSockLen(),
get_iface_name_str(iface), errno, strerror(errno));
if ( hbs_sock.rx_mesg[iface].cluster.version < MTCE_HBS_CLUSTER_VERSION )
{
ilog ("Bad cluster verison (%d)", hbs_sock.rx_mesg[iface].cluster.version);
}
// if ( hbs_sock.rx_mesg[iface].cluster.revision != MTCE_HBS_CLUSTER_REVISION )
// {
// ilog ("Bad cluster revision (%d)", hbs_sock.rx_mesg[iface].cluster.revision);
// }
/* Add peer controller cluster data to this controller's response */
// hbs_cluster_loop(hbs_sock.rx_mesg[iface]);
}
else if ( rc != n)
/* send pulse response message */
int tx_bytes = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.rx_mesg[iface], rx_bytes);
if ( tx_bytes == -1 )
{
/* Avoid log flooding
elog ("unicast send failed. (%d)\n", rc); */
wlog_throttled ( tx_error_count[iface], 200,
"%s Pulse Rsp: %d:%d bytes < %d:%s > to <%s>\n",
get_iface_name_str(iface), n, rc,
hbs_sock.tx_mesg[iface].s,
&hbs_sock.tx_mesg[iface].m[0],
elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
"pulse tx failed %d:%s:%d len:%d (%s) (%d:%s)\n",
hbs_sock.tx_sock[iface]->getFD(),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getSockLen(),
get_iface_name_str(iface), errno, strerror(errno));
}
else if ( tx_bytes != rx_bytes)
{
wlog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
"%s Pulse Rsp: %d:%d bytes < %d:%s >",
get_iface_name_str(iface), rx_bytes, tx_bytes,
hbs_sock.rx_mesg[iface].s,
&hbs_sock.rx_mesg[iface].m[0]);
return (rc);
rc = FAIL_DATA_SIZE ;
}
else
{
mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d)\n",
mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d:%d:%d)\n",
get_iface_name_str(iface),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.tx_mesg[iface].s,
hbs_sock.tx_mesg[iface].f,
hbs_sock.tx_mesg[iface].m,
hbs_sock.tx_mesg[iface].c,
pmonPulse_counter);
/* Clear the error count since we got a good transmit */
tx_error_count[iface] = 0 ;
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].f,
hbs_sock.rx_mesg[iface].m,
hbs_sock.rx_mesg[iface].c,
pmonPulse_counter, rx_bytes, tx_bytes);
}
return PASS;
/* Clear the error count since we got a good receive */
if ( rx_error_count[iface] )
rx_error_count[iface] = 0 ;
if ( tx_error_count[iface] )
tx_error_count[iface] = 0 ;
return rc ;
}
#ifdef WANT_FIT_TESTING
@ -968,6 +984,9 @@ int daemon_init ( string iface, string nodeType_str )
/* Initialize socket construct and pointer to it */
memset ( &hbs_sock, 0, sizeof(hbs_sock));
/* init the utility module */
hbs_utils_init ();
/* Defaults */
hbs_config.stall_pmon_thld = -1 ;
hbs_config.stall_mon_period = MTC_HRS_8 ;
@ -1025,12 +1044,6 @@ int daemon_init ( string iface, string nodeType_str )
rc = FAIL_DAEMON_CONFIG ;
}
/* Init the heartbeat transmit pulse response message */
else if ( hbs_message_init () != PASS )
{
elog ("Failed to initialize pulse response message\n");
rc = FAIL_MESSAGE_INIT ;
}
/* Setup the heartbeat service messaging sockets */
else if ( hbs_socket_init () != PASS )
{
@ -1119,6 +1132,11 @@ void daemon_service_run ( void )
ilog ("Sending Heartbeat Ready Event\n");
hbs_send_event ( MTC_EVENT_MONITOR_READY );
my_hostname_length = strlen(my_hostname) ;
memset ( &pulse_resp_tx_hdr[0], 0, HBS_MAX_MSG );
memcpy ( &pulse_resp_tx_hdr[0], &rsp_msg_header[0], HBS_HEADER_SIZE );
memcpy ( &pulse_resp_tx_hdr[HBS_HEADER_SIZE], my_hostname, my_hostname_length );
/* Run heartbeat service forever or until stop condition */
for ( ; ; )
{
@ -1153,7 +1171,9 @@ void daemon_service_run ( void )
FD_SET(hbs_sock.pmon_pulse_sock->getFD(),&hbs_sock.readfds);
FD_SET(hbs_sock.amon_socket, &hbs_sock.readfds);
FD_SET(hbs_sock.netlink_sock, &hbs_sock.readfds);
#ifdef WANT_CLUSTER_DEBUG
FD_SET(hbs_sock.sm_client_sock->getFD(), &hbs_sock.readfds);
#endif
rc = select( socks.back()+1,
&hbs_sock.readfds, NULL, NULL,
&hbs_sock.waitd);
@ -1176,6 +1196,19 @@ void daemon_service_run ( void )
/* Only service sockets for the rc > 0 case */
else if ( rc )
{
#ifdef WANT_CLUSTER_DEBUG
if ( hbs_sock.sm_client_sock && FD_ISSET(hbs_sock.sm_client_sock->getFD(), &hbs_sock.readfds ) )
{
mtce_hbs_cluster_type msg ;
/* Receive event messages */
memset ( &msg , 0, sizeof(mtce_hbs_cluster_type));
int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type));
if ( bytes )
{
hbs_cluster_dump (msg);
}
}
#endif
if (hbs_sock.rx_sock[MGMNT_IFACE]&&FD_ISSET(hbs_sock.rx_sock[MGMNT_IFACE]->getFD(), &hbs_sock.readfds))
{
/* Receive pulse request and send a response */

View File

@ -0,0 +1,748 @@
/*
* Copyright (c) 2018 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* @file Maintenance Heartbeat Agent Cluster Manager Module
*
*************************************************************************
*
* This module provides the heartbeat cluster implementation member
* functions that the hbsAgent service calls to collect, store and
* send heartbeat cluster information to SM upon request.
*
* See mtceHbsCluster.h for formal API between SM and Mtce.
*
*************************************************************************/
using namespace std;
#include "nodeBase.h" /* common maintenance constructs and definitions */
#include "daemon_common.h" /* common daemon constructs and definitions */
#include "hbsBase.h" /* mtce heartbeat constructs and definitions */
/* Error log throttle counter. */
#define THROTTLE_COUNT (500)
/* Module-private control data for the heartbeat cluster manager. */
struct hbs_cluster_ctrl_type
{
    /* Number (0 or 1) of the controller this process runs on. */
    unsigned short this_controller ;

    /* Per-controller enabled state. */
    bool controller_0_enabled ;
    bool controller_1_enabled ;
#ifdef THREE_CONTROLLER_SYSTEM
    bool controller_2_enabled ;
#endif

    /* Throttle counter that limits logging of back to back errors. */
    unsigned int log_throttle ;

    /* Per-network counter used to threshold the
     * storage-0 not responding state. */
    unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];

    /* Number of networks monitored in this system.
     *   Management only               = 1
     *   Management and Infrastructure = 2 */
    unsigned short monitored_networks ;

    /* Current number of heartbeat enabled hosts.
     *
     * This count goes into every history entry. It is kept as its own
     * variable, refreshed from monitored_hostname_list after each
     * add/del operation, so that the list size does not need to be
     * recalculated for every entry. */
    unsigned short monitored_hosts ;

    /* Names of the hosts currently being monitored. */
    std::list<string>monitored_hostname_list ;

    /* Working copy of the heartbeat cluster data vault. */
    mtce_hbs_cluster_type cluster ;
} ;

/* The single, file-scope instance of the cluster control structure. */
static hbs_cluster_ctrl_type ctrl ;
/****************************************************************************
 *
 * Name        : hbs_cluster_init
 *
 * Description : Initialize the cluster structure to default values.
 *
 *               Clears the monitored host list and count, loads the vault
 *               header (version, revision, magic number), resets the
 *               dynamic vault data, the per-network storage-0 counters,
 *               every history element and the log throttle.
 *
 *               The controller enabled flags, this controller's number and
 *               the monitored network count are not modified here.
 *
 * Parameters  : period - heartbeat period stored as the vault's period_msec
 *
 * Assumptions : Called by hbsAgent.cpp before entering the main loop.
 *
 ***************************************************************************/
void hbs_cluster_init ( unsigned short period )
{
    ctrl.monitored_hosts = 0;
    ctrl.monitored_hostname_list.clear();

    /* Init the cluster - header. */
    ctrl.cluster.version      = MTCE_HBS_CLUSTER_VERSION ;
    ctrl.cluster.revision     = MTCE_HBS_CLUSTER_REVISION ;
    ctrl.cluster.magic_number = MTCE_HBS_MAGIC_NUMBER ;

    /* Init the cluster - global / dynamic data. */
    ctrl.cluster.reqid            = 0 ;
    ctrl.cluster.period_msec      = period ;
    ctrl.cluster.storage0_enabled = false ;
    ctrl.cluster.histories        = 0 ;
    ctrl.cluster.bytes            = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);

    /* The storage-0 thresholding counter for each network. */
    for ( int n = 0 ; n < MTCE_HBS_NETWORKS ; n++ )
        ctrl.storage_0_not_responding_count[n] = 0 ;

    for ( int h = 0 ; h < MTCE_HBS_MAX_HISTORY_ELEMENTS ; h++ )
        hbs_cluster_history_init ( ctrl.cluster.history[h] );

    /* sizeof yields a size_t ; convert it explicitly so the argument
     * type matches the %ld conversion on every platform. */
    ilog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)",
           ctrl.cluster.version,
           ctrl.cluster.revision,
           ctrl.cluster.magic_number,
           ctrl.cluster.bytes,
           static_cast<long>(sizeof(mtce_hbs_cluster_history_type)));

    ctrl.log_throttle = 0 ;
}
/****************************************************************************
 *
 * Name        : hbs_cluster_nums
 *
 * Description : Record which controller this is and how many networks
 *               are being monitored in this system.
 *
 *               These values do not change without a process restart.
 *
 * Parameters  : this_controller    - this controller's number
 *               monitored_networks - number of monitored networks
 *
 * Assumptions : Called by hbsAgent.cpp before entering the main loop.
 *
 * Returns     : None
 *
 ***************************************************************************/
void hbs_cluster_nums ( unsigned short this_controller,
                        unsigned short monitored_networks )
{
    /* the two stores are independent of one another */
    ctrl.monitored_networks = monitored_networks ;
    ctrl.this_controller    = this_controller ;
}
/****************************************************************************
*
* Name : log_monitored_hosts_list
*
* Description : Log the list of monitored hosts.
* Typically done on a list change.
*
* Returns : None
*
***************************************************************************/
void log_monitored_hosts_list ( void )
{
std::list<string>::iterator iter_ptr ;
string list = "" ;
for ( iter_ptr = ctrl.monitored_hostname_list.begin() ;
iter_ptr != ctrl.monitored_hostname_list.end() ;
iter_ptr++ )
{
list.append (*(iter_ptr));
list.append (" ");
}
ilog ("cluster of %ld: %s",
ctrl.monitored_hostname_list.size(),
list.c_str());
}
/****************************************************************************
*
* Name : cluster_storage0_state
*
* Description : Record the heartbeat monitoring state of storage-0.
*
* Parameters : true if storage-0 heartbeating is in the 'started' state.
* false if storage-0 heartbeating is in the 'stopped' state.
*
* Returns : None
*
***************************************************************************/
void cluster_storage0_state ( bool enabled )
{
    /* Nothing to record or log unless the state actually changes. */
    if ( ctrl.cluster.storage0_enabled == enabled )
        return ;

    ctrl.cluster.storage0_enabled = enabled ;

    const char * state_str = enabled ? "enabled" : "disabled" ;
    ilog ("storage-0 heartbeat state changed to %s", state_str );
}
/****************************************************************************
*
* Name : hbs_manage_controller_state
*
* Description : Track the monitored enabled state of the controllers.
*
***************************************************************************/
void hbs_manage_controller_state ( string & hostname, bool enabled )
{
    /* Only controller hostnames are tracked here ;
     * a call for any other hostname changes nothing. */
    if ( hostname == CONTROLLER_0 )
    {
        ctrl.controller_0_enabled = enabled ;
        return ;
    }

    if ( hostname == CONTROLLER_1 )
    {
        ctrl.controller_1_enabled = enabled ;
        return ;
    }

#ifdef THREE_CONTROLLER_SYSTEM
    if ( hostname == CONTROLLER_2 )
    {
        ctrl.controller_2_enabled = enabled ;
        return ;
    }
#endif
}
/****************************************************************************
*
* Name : hbs_cluster_add
*
* Description : Add the specified hostname to the enabled hosts list.
*
* Updates : hostname is added to monitored_hostname_list
*
* If added host is storage-0 then update its enabled status.
* if added host is a controller then update controller state.
*
* Parameters : hostname string
*
* Updates : monitored_hostname_list
*
***************************************************************************/
void hbs_cluster_add ( string & hostname )
{
    /* Remove any existing instance before the add so that the
     * hostname is only ever in the list once. */
    ctrl.monitored_hostname_list.remove(hostname) ;
    ctrl.monitored_hostname_list.push_back(hostname) ;
    ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();

    /* NOTE: No 'start fresh at 0 monitored hosts' handling here.
     *       The list always holds at least the host that was just added,
     *       and a cluster init at this point would clear that host back
     *       out of the list. That handling belongs to the delete path. */

    /* Manage storage-0 state ; added means heartbeat enabled. */
    if ( hostname == STORAGE_0 )
    {
        cluster_storage0_state ( true );
    }

    /* Manage controller state ; true means enabled in this case. */
    hbs_manage_controller_state ( hostname, true );

    ilog ("%s added to cluster", hostname.c_str());
    log_monitored_hosts_list ();
}
/****************************************************************************
*
* Name : hbs_cluster_del
*
* Description : Delete the specified hostname from the enabled hosts list.
*
* Updates : hostname is removed from monitored_hostname_list
*
* If deleted host is storage-0 then update its enabled status.
* If deleted host is a controller then update controller state.
*
* Parameters : hostname string
*
* Updates : monitored_hostname_list
*
***************************************************************************/
void hbs_cluster_del ( string & hostname )
{
    /* Drop the host and refresh the monitored host count. */
    ctrl.monitored_hostname_list.remove(hostname) ;
    ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();

    const bool removed_storage0 = ( hostname == STORAGE_0 ) ;
    const bool cluster_is_empty = ( ctrl.monitored_hosts == 0 ) ;

    /* Deleted means storage-0 heartbeat is no longer enabled. */
    if ( removed_storage0 )
        cluster_storage0_state ( false );

    /* With nothing left to monitor the cluster is started fresh,
     * keeping the current heartbeat period. */
    if ( cluster_is_empty )
        hbs_cluster_init ( ctrl.cluster.period_msec );

    /* false means not enabled in this case. */
    hbs_manage_controller_state ( hostname , false );

    ilog ("%s deleted from cluster", hostname.c_str());
    log_monitored_hosts_list ();
}
/****************************************************************************
*
* Name : hbs_cluster_update
*
* Description : Update this controller's cluster info for the specified
* network with
*
* 1. The number of enabled hosts.
* 2. The number of responding hosts.
* 3. The oldest history index in the rotational history fifo.
* 4. Maintain a back to back non-responding count for storage-0.
* Once the count reaches the minimum threshold of
* STORAGE_0_NR_THRESHOLD then the specific network history
* is updated to indicate storage-0 is not responding. Once
* storage-0 starts responding again with a single response
* then that network history is updated to indicate storage-0
* is responding.
*
* Assumptions : Converts heartbeat interface number to cluster network number.
*
* Parameters : heartbeat interface number ( iface_enum )
* number of not responding hosts for this interval
* storage-0 responding status for this interval ( true or false )
*
* Updates : This and last history as well as storage-0 not responding
* count.
*
***************************************************************************/
/* Number of back to back storage-0 heartbeat misses on a network before
 * that network's history is marked as storage-0 not responding. */
#define STORAGE_0_NR_THRESHOLD (4)
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
bool storage_0_responding )
{
/* Nothing to record while no hosts are being monitored. */
if ( ctrl.monitored_hosts == 0 )
return ;
/* convert heartbeat iface enum to cluster network enum. */
mtce_hbs_network_enum n ;
if ( iface == MGMNT_IFACE )
n = MTCE_HBS_NETWORK_MGMT ;
else if ( iface == INFRA_IFACE )
n = MTCE_HBS_NETWORK_INFRA ;
#ifdef MONITORED_OAM_NETWORK
else if ( iface == OAM_IFACE )
n = MTCE_HBS_NETWORK_OAM ;
#endif
else
/* Any other interface has no cluster network ; ignore the update. */
return ;
if ( not_responding_hosts )
{
clog1 ("controller-%d %s enabled:%d not responding:%d",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
ctrl.monitored_hosts,
not_responding_hosts);
}
else
{
clog1 ("controller-%d %s has %d monitored hosts and all are responding",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
ctrl.monitored_hosts);
}
/* Look-up active history array for this network combination.
 * The macro assigns the local 'history_ptr' by name when a history for
 * this controller and network is already in the vault ; otherwise
 * history_ptr is left NULL. */
mtce_hbs_cluster_history_type * history_ptr = NULL ;
GET_CLUSTER_HISTORY_PTR(ctrl.cluster, ctrl.this_controller ,n);
if ( history_ptr == NULL )
{
if ( ctrl.cluster.histories >= MTCE_HBS_MAX_HISTORY_ELEMENTS )
{
/* Should never happen but if it does then log without flooding */
wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
"Unable to store history beyond %d ",
ctrl.cluster.histories );
return ;
}
else
{
/* Adding a new history slot ; the next unused element is taken
 * and the vault's history count and byte size are updated. */
history_ptr = &ctrl.cluster.history[ctrl.cluster.histories] ;
ctrl.cluster.histories++ ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
history_ptr->controller = ctrl.this_controller ;
history_ptr->network = n ;
/* Log new network history as its being started. */
ilog ("controller-%d %s network history add",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str());
}
}
/* Manage storage-0 status. */
if ( ctrl.cluster.storage0_enabled )
{
/* Handle storage-0 status change from not responding to responding. */
if ( storage_0_responding == true )
{
if (history_ptr->storage0_responding == false)
{
history_ptr->storage0_responding = true ;
ilog ("controller-%d %s heartbeat ; storage-0 is ok",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str());
}
/* A single response clears the back to back miss count. */
if (ctrl.storage_0_not_responding_count[n])
ctrl.storage_0_not_responding_count[n] = 0 ;
}
/* Count the storage-0 not responding case for this network. */
else
{
ctrl.storage_0_not_responding_count[n]++ ;
if ( ctrl.storage_0_not_responding_count[n] == 2 )
{
ilog ("controller-%d %s heartbeat ; storage-0 has 2 misses",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str() );
}
}
/* Handle storage-0 status change from responding to not responding. */
if (( history_ptr->storage0_responding == true ) &&
( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
{
history_ptr->storage0_responding = false ;
ilog ("controller-%d %s heartbeat ; storage-0 is not responding",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str() );
}
}
else
{
/* Typical path for storage-0 disabled or normal non-storage system case */
if ( history_ptr->storage0_responding == true )
history_ptr->storage0_responding = false ;
/* Handle clearing threshold count when storage-0 is not enabled. */
if ( ctrl.storage_0_not_responding_count[n] )
ctrl.storage_0_not_responding_count[n] = 0 ;
}
/*
* Manage the history entry index.
*
* Get the previous entry index ...
* ... which is the one before the oldest index.
* ... which is the index for the next entry.
*/
unsigned short last_entry_index ;
if ( history_ptr->oldest_entry_index == 0 )
{
/* Go to the end of the array. */
last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ;
}
else
{
/* Otherwise, the previous index in the array */
last_entry_index = history_ptr->oldest_entry_index - 1 ;
}
/* Update the history with this data ; the new entry overwrites the
 * entry at the oldest index.
 * NOTE(review): hosts_responding is an unsigned subtraction ; it looks
 * like the caller is expected to never report more not responding
 * hosts than monitored hosts - confirm, otherwise this wraps. */
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
/* Compare the entry just written against the entry before it. */
if (( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled !=
history_ptr->entry[ last_entry_index].hosts_enabled ) ||
( history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding !=
history_ptr->entry[ last_entry_index].hosts_responding))
{
/* Only log on change events. */
if ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled ==
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding )
{
ilog ("controller-%d %s cluster of %d is healthy",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled);
}
else
{
ilog ("controller-%d %s cluster of %d with %d responding",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled,
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding);
}
}
/* Increment the entries count till it reaches the max. */
if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
history_ptr->entries++ ;
/* Manage the next entry update index ; aka the oldest index.
 * The index wraps back to 0 after the last array element. */
if ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
history_ptr->oldest_entry_index = 0 ;
else
history_ptr->oldest_entry_index++ ;
/* clear the log throttle if we are updating history ok. */
ctrl.log_throttle = 0 ;
}
/****************************************************************************
*
* Name : hbs_cluster_append
*
* Description : Add this controller's cluster info to this pulse
* request message.
*
***************************************************************************/
void hbs_cluster_append ( hbs_message_type & msg )
{
    unsigned short c = ctrl.this_controller ;

    /* Returns from this function if the controller number or the
     * monitored network count is out of range. */
    CHECK_CTRL_NTWK_PARMS(c, ctrl.monitored_networks);

    /* Copy the vault header and control data into the message. */
    msg.cluster.version          = ctrl.cluster.version ;
    msg.cluster.revision         = ctrl.cluster.revision ;
    msg.cluster.magic_number     = ctrl.cluster.magic_number ;
    msg.cluster.period_msec      = ctrl.cluster.period_msec ;
    msg.cluster.storage0_enabled = ctrl.cluster.storage0_enabled ;
    msg.cluster.histories        = 0 ;

    /* Copy this controller's histories into the message, one element
     * at a time, packed from message history index 0.
     *
     * The controller number is not a history array index and the vault
     * byte count includes the vault header, so neither can be used to
     * do a single block copy out of the history array. */
    for ( int h = 0 ;
          ( h < ctrl.cluster.histories ) &&
          ( h < MTCE_HBS_MAX_HISTORY_ELEMENTS ) ; h++ )
    {
        if ( ctrl.cluster.history[h].controller != c )
            continue ;

        memcpy( &msg.cluster.history[msg.cluster.histories],
                &ctrl.cluster.history[h],
                sizeof(mtce_hbs_cluster_history_type));
        msg.cluster.histories++ ;
    }

    /* Keep the size self check consistent with what was copied. */
    int bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories);
    msg.cluster.bytes = (unsigned short)bytes ;

    clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
            c, ctrl.monitored_networks, msg.cluster.histories, bytes );
}
/****************************************************************************
*
* Name : hbs_cluster_unused_bytes
*
* Description : Used to set how much data to send in the heartbeat pulse
* requests.
*
* Returns : The number of bytes that are not used in the full
* history array cluster structure.
*
***************************************************************************/
unsigned short hbs_cluster_unused_bytes ( void )
{
    /* A history count beyond the array size leaves nothing unused. */
    if ( ctrl.cluster.histories > MTCE_HBS_MAX_HISTORY_ELEMENTS )
        return 0 ;

    /* Unused bytes is the size of the unpopulated history elements. */
    unsigned short unused_elements = MTCE_HBS_MAX_HISTORY_ELEMENTS - ctrl.cluster.histories ;
    return((unsigned short)(sizeof(mtce_hbs_cluster_history_type)*unused_elements)) ;
}
/****************************************************************************
*
* Name : hbs_cluster_send
*
* Description: Send the cluster vault to SM.
*
* Returns : Nothing
*
***************************************************************************/
/* NOTE: All code wrapped in this directive will be removed once
* active/active heartbeating is delivered in next update */
#define WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
{
#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
    /* To assist SM with duplex integration ...
     *
     * Heartbeat redundancy is emulated by temporarily duplicating this
     * controller's histories as peer controller histories for the
     * duration of the send. Done only if the peer controller is enabled. */
    const int peer_controller = ( ctrl.this_controller == 0 ) ? 1 : 0 ;

    bool copy_cluster = false ;
    if ( peer_controller == 1 )
    {
        if ( ctrl.controller_1_enabled )
            copy_cluster = true ;
    }
    else if ( ctrl.controller_0_enabled )
    {
        copy_cluster = true ;
    }

    /* The number of histories in the vault before any are duplicated. */
    const int networks = ctrl.cluster.histories ;
    if ( copy_cluster )
    {
        int src = 0 ;
        while ( src < networks )
        {
            /* Append a copy of this controller's history and relabel
             * the copy as the peer controller's. */
            mtce_hbs_cluster_history_type & peer_history =
                ctrl.cluster.history[ctrl.cluster.histories] ;
            peer_history = ctrl.cluster.history[src] ;
            peer_history.controller = peer_controller ;

            ctrl.cluster.histories++ ;
            ctrl.cluster.bytes += sizeof(mtce_hbs_cluster_history_type) ;
            src++ ;
        }
    }
#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS

    /* Return the request id that came with the request. */
    ctrl.cluster.reqid = (unsigned short)reqid ;

    if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
    {
        /* Only send the populated part of the vault. */
        int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
        int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
        if ( bytes > 0 )
        {
            ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
            hbs_cluster_dump ( ctrl.cluster );
        }
        else
        {
            elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
                   bytes , errno, strerror(errno));
        }
    }

#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
    if ( copy_cluster )
    {
        /* Take the duplicated peer histories back out of the vault,
         * last one first. */
        for ( int remaining = networks ; remaining > 0 ; --remaining )
        {
            ctrl.cluster.histories-- ;
            hbs_cluster_history_init(ctrl.cluster.history[ctrl.cluster.histories]);
            ctrl.cluster.bytes -= sizeof(mtce_hbs_cluster_history_type);
        }
    }
#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
}
/* Log this module's own cluster vault.
 * Forwards to the hbs_cluster_log overload that takes the cluster as a
 * parameter, passing ctrl.cluster along with the hostname and prefix. */
void hbs_cluster_log ( string & hostname, string prefix )
{
hbs_cluster_log ( hostname, ctrl.cluster, prefix );
}
/****************************************************************************
*
* Active Active Heartbeating and Debug Member Functions
*
***************************************************************************/
/****************************************************************************
*
* Name : hbs_cluster_cmp
*
* Description : Performs a sanity check over the cluster structure.
*
* Assumptions : Debug tool, not called at runtime.
*
* Returns : PASS or FAIL
*
***************************************************************************/
int hbs_cluster_cmp( hbs_message_type & msg )
{
    /* The message's cluster header and control data are checked against
     * this controller's, one field at a time. The first field that does
     * not check out is logged and fails the compare. */
    if ( msg.cluster.version < ctrl.cluster.version )
    {
        wlog ("Unexpected version (%d:%d)",
               msg.cluster.version, ctrl.cluster.version );
        return (FAIL);
    }

    if ( msg.cluster.revision != ctrl.cluster.revision )
    {
        wlog ("Unexpected revision (%d:%d)",
               msg.cluster.revision, ctrl.cluster.revision );
        return (FAIL);
    }

    if ( msg.cluster.magic_number != ctrl.cluster.magic_number )
    {
        wlog ("Unexpected magic number (%d:%d)",
               msg.cluster.magic_number, ctrl.cluster.magic_number );
        return (FAIL);
    }

    if ( msg.cluster.period_msec != ctrl.cluster.period_msec )
    {
        wlog ("Cluster Heartbeat period delta (%d:%d)",
               msg.cluster.period_msec, ctrl.cluster.period_msec );
        return (FAIL);
    }

    if ( msg.cluster.storage0_enabled != ctrl.cluster.storage0_enabled )
    {
        wlog ("Cluster storage0 enabled state delta (%d:%d)",
               msg.cluster.storage0_enabled, ctrl.cluster.storage0_enabled );
        return (FAIL);
    }

    return (PASS);
}
/****************************************************************************
*
* Name : hbs_cluster_save
*
* Description : Copies the other controllers information from msg into
* the cluster.
*
* NOTE: Does not do that right now.
*
* Assumptions : Place holder until active/active heartbeating is implemented.
*
* Returns : PASS or FAIL
*
***************************************************************************/
int hbs_cluster_save ( string & hostname,
                       mtce_hbs_network_enum network,
                       hbs_message_type & msg )
{
    /* Nothing to do while no hosts are being monitored. */
    if ( ctrl.monitored_hosts == 0 )
        return (PASS);

    /* The message content is not saved or compared yet ;
     * hbs_cluster_cmp( msg ) is not called here. */
    UNUSED(msg);

    /* Log this controller's vault with the network name as the prefix. */
    string network_name = hbs_cluster_network_name(network) ;
    hbs_cluster_log( hostname, ctrl.cluster, network_name );

    return (PASS);
}

View File

@ -0,0 +1,86 @@
/*
* Copyright (c) 2018 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* @file StarlingX Maintenance Heartbeat Cluster Manager Module
*
*************************************************************************
*
* This module provides API for the hbsAgent service to call to
* collect, store and send heartbeat cluster information to SM
* upon request. See hbsCluster.h for formal API.
*
*************************************************************************/
#ifndef __HBSCLUSTER_H__
#define __HBSCLUSTER_H__
using namespace std;
#include "mtceHbsCluster.h" /* for ... the public API */
/****************************************************************************
 *
 * Name        : BYTES_IN_CLUSTER_VAULT
 *
 * Description : Calculates the number of bytes in the cluster vault based on
 *               the number of valid history array elements included.
 *
 *               This is the full size of mtce_hbs_cluster_type minus the
 *               size of the history array elements that are not included.
 *
 * Parameters  : e - the number of valid history array elements.
 *                   Parenthesized in the expansion so that an expression
 *                   argument is evaluated as a whole.
 *
 ***************************************************************************/
#define BYTES_IN_CLUSTER_VAULT(e) \
    (sizeof(mtce_hbs_cluster_type)-(sizeof(mtce_hbs_cluster_history_type)*(MTCE_HBS_MAX_HISTORY_ELEMENTS-(e))))
/****************************************************************************
 *
 * Name        : CHECK_CTRL_NTWK_PARMS
 *
 * Description : Range check a controller number and a network count.
 *               If either is out of range then a log is produced and the
 *               calling function is returned from. For that reason this
 *               macro can only be used in functions that return void.
 *
 * Parameters  : c - controller number ; zero based so valid values are
 *                   0 .. MTCE_HBS_MAX_CONTROLLERS-1
 *               n - number of networks ; up to MTCE_HBS_NETWORKS
 *
 ***************************************************************************/
#define CHECK_CTRL_NTWK_PARMS(c,n) \
    do \
    { \
        if (( (c) >= MTCE_HBS_MAX_CONTROLLERS ) || \
            ( (n) >  MTCE_HBS_NETWORKS )) \
        { \
            slog ("Invalid parameter: %d:%d", (c), (n)); \
            return ; \
        } \
    } while (0)
/****************************************************************************
 *
 * Name        : GET_CLUSTER_HISTORY_PTR
 *
 * Description : Search the populated histories of a cluster for the one that
 *               belongs to the specified controller and network.
 *
 *               The result is assigned to a pointer named 'history_ptr'
 *               that the caller must have in scope. The pointer is left
 *               untouched when there is no match, so callers initialize
 *               it to NULL before the look-up.
 *
 * Parameters  : cluster - the cluster (vault) structure to search
 *               c       - controller number to match
 *               n       - network number to match
 *
 ***************************************************************************/
#define GET_CLUSTER_HISTORY_PTR(cluster, c,n) \
    do \
    { \
        for ( int h = 0 ; h < (cluster).histories ; h++ ) \
        { \
            if (( (cluster).history[h].controller == (c) ) && \
                ( (cluster).history[h].network    == (n) )) \
            { \
                history_ptr = &(cluster).history[h] ; \
            } \
        } \
    } while (0)
/* Convert a controller number into a controller hostname.
 *
 * The result is assigned to a variable named 'controller' that the
 * caller must have in scope. Controller numbers other than 0, 1 and 2
 * yield "unknown".
 *
 * The last line of the macro has no line continuation so that the
 * macro ends here and does not absorb the line that follows it. */
#define SET_CONTROLLER_HOSTNAME(c) \
    do \
    { \
        if ( (c) == 0 ) \
            controller = CONTROLLER_0 ; \
        else if ( (c) == 1 ) \
            controller = CONTROLLER_1 ; \
        else if ( (c) == 2 ) \
            controller = CONTROLLER_2 ; \
        else \
            controller = "unknown" ; \
    } while (0)
#endif // __HBSCLUSTER_H__

View File

@ -0,0 +1,346 @@
/*
* Copyright (c) 2018 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* @file Maintenance Heartbeat Utilities Module
*
*************************************************************************
*
* This module provides heartbeat utilities that are common to both
* hbsAgent and hbsClient.
*
*************************************************************************/
using namespace std;
#include "daemon_common.h" /* common daemon constructs and definitions */
#include "hbsBase.h" /* mtce heartbeat constructs and definitions */
/* hbs_cluster_log utility support. log control arrays.
 *
 * Both arrays hold one flag per cluster history element and are indexed
 * by the history index 'h' that hbs_cluster_log loops over.
 *
 * first_log[h] - set once hbs_cluster_log has forced its first log of
 *                history h.
 * was_diff[h]  - set when hbs_cluster_log found an entry in history h that
 *                differs from the entry before it ; used to get one more
 *                log out on a following call that finds no differences. */
bool first_log[MTCE_HBS_MAX_HISTORY_ELEMENTS]; /* has first history log out */
bool was_diff [MTCE_HBS_MAX_HISTORY_ELEMENTS]; /* was there a history diff */
/****************************************************************************
*
* Name : hbs_utils_init
*
* Description : Module Init function
*
***************************************************************************/
void hbs_utils_init ( void )
{
    /* Start with all the hbs_cluster_log control flags cleared. */
    MEMSET_ZERO ( was_diff );
    MEMSET_ZERO ( first_log );
}
/****************************************************************************
*
* Name : hbs_cluster_history_init
*
* Description : Initialize a cluster history element.
*
* Parameters : Reference to a mtce_hbs_cluster_history_type (history element)
*
* Returns : Nothing
*
***************************************************************************/
/* Put the history element into its initial state.
 * Everything in the element is zeroed and then entries_max is set to the
 * size of the element's entry array. */
void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history )
{
/* Zero first ; entries_max has to be set after the clear. */
MEMSET_ZERO(history);
history.entries_max = MTCE_HBS_HISTORY_ENTRIES ;
}
/****************************************************************************
*
* Name : hbs_cluster_history_clear
*
* Description : Clear all history in the cluster vault.
*
* Parameters : mtce_hbs_cluster_type instance : the vault.
*
* Returns : Nothing
*
***************************************************************************/
void hbs_cluster_history_clear ( mtce_hbs_cluster_type & cluster )
{
    /* Re-initialize each populated history element.
     * With no histories the loop body never runs. */
    int slot = 0 ;
    while ( slot < cluster.histories )
    {
        hbs_cluster_history_init ( cluster.history[slot] ) ;
        ++slot ;
    }
}
/****************************************************************************
*
* Name : hbs_cluster_network_name
*
* Description : converts what is a heartbeat cluster network id to
* network name.
*
* Parameters : network id
*
* Returns : network name as a string
*
***************************************************************************/
string hbs_cluster_network_name ( mtce_hbs_network_enum network )
{
    /* Each known network id maps to a fixed display name. */
    if ( network == MTCE_HBS_NETWORK_MGMT )
        return ("Mgmnt");

    if ( network == MTCE_HBS_NETWORK_INFRA )
        return ("Infra");

#ifdef MONITORED_OAM_NETWORK
    if ( network == MTCE_HBS_NETWORK_OAM )
        return ("Oam");
#endif

    /* Anything else is logged and reported as unknown. */
    slog ("invalid network enum (%d)", network );
    return ("unknown");
}
/****************************************************************************
*
* Name : hbs_cluster_log
*
* Description : logs changes to the heartbeat cluster
*
* Parameters : The heartbeat cluster structure
*
* Returns : Nothing
*
***************************************************************************/
/* Log the histories of the specified cluster, one history at a time.
 *
 * hostname   - used in the log text and compared against the hostname of
 *              the controller that owns each history to select the log
 *              format.
 * cluster    - the cluster whose histories are logged.
 * log_prefix - text included in each log.
 *
 * Only histories whose entry array is full are considered.
 * The first_log and was_diff module arrays carry log control state from
 * one call to the next for each history index. */
void hbs_cluster_log ( string & hostname,
mtce_hbs_cluster_type & cluster,
string log_prefix )
{
// bool want_log = false ;
clog1 ("log %d histories", cluster.histories );
for ( int h = 0 ; h < cluster.histories ; h++ )
{
/* Skip histories that have not filled their entry array yet. */
if ( cluster.history[h].entries == MTCE_HBS_HISTORY_ENTRIES )
{
#define MAX_CLUSTER_LINE_LEN 100
/* NOTE(review): this is used as the snprintf size below, which leaves
 * room for 9 characters plus the terminator ; "9999:9999 " is 10
 * characters so it looks like its trailing space would be cut. */
#define MAX_ENTRY_STR_LEN 10 /* "9999:9999 " */
/* e - copy of the entry handled by the previous loop iteration
 * str - scratch buffer for one formatted entry
 * line - entries accumulated for the current log line
 * start - first entry number of the current log line
 * stop - count of entries handled so far
 * newline - the current line is full and needs to be flushed
 * logit - something worth logging was seen for this history
 * first - at least one line was already flushed for this history */
mtce_hbs_cluster_entry_type e = { 0, 0 } ;
char str[MAX_CLUSTER_LINE_LEN] ;
string line = "";
int start = 0 ;
int stop = 0 ;
bool newline = false ;
bool logit = false ;
bool first = false ;
string controller = "" ;
mtce_hbs_cluster_history_type * history_ptr = &cluster.history[h] ;
clog1 ("%s %s has %d entries (controller-%d view from %s)", hostname.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
history_ptr->entries,
history_ptr->controller,
log_prefix.c_str());
/* Manage local this_index for log display.
* Display oldest to newest ; left to right
*
* */
int this_index = history_ptr->oldest_entry_index ;
for ( int count = 0 ; count < history_ptr->entries ; count++ )
{
/* Flush the line after this entry if one more entry might not fit. */
if (( line.length() + MAX_ENTRY_STR_LEN ) >=
MAX_CLUSTER_LINE_LEN )
{
newline = true ;
}
#ifdef WANT_MINIMAL_LOGS
/* TODO: enable in final update */
if (( first_log[h] == true ) && ( newline == false ) &&
( history_ptr->entry[this_index].hosts_enabled ==
history_ptr->entry[this_index].hosts_responding ))
{
line.append(". ");
continue ;
}
#endif
// want_log = true ;
/* The first (oldest) entry is always shown as 'enabled:responding'. */
if ( count == 0 )
{
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
history_ptr->entry[this_index].hosts_enabled,
history_ptr->entry[this_index].hosts_responding ); // , this_index );
line.append (str);
str[0] = '\0' ;
}
//#ifdef WANT_DOTS
/* An entry that is the same as the one before it is shown as a dot. */
else if (( history_ptr->entry[this_index].hosts_enabled ==
e.hosts_enabled ) &&
( history_ptr->entry[this_index].hosts_responding ==
e.hosts_responding ))
{
line.append(". ");
}
//#endif
/* An entry that differs from the one before it is shown in full
 * and makes this history worth logging. */
else
{
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
history_ptr->entry[this_index].hosts_enabled,
history_ptr->entry[this_index].hosts_responding ); // , this_index );
line.append (str);
str[0] = '\0' ;
logit = true ;
was_diff[h] = true ;
}
/* Force a log the first time this history index is handled. */
if (( logit == false ) && ( first_log[h] == false ))
{
first_log[h] = true ;
logit = true ;
}
stop++ ;
if ( newline == true )
{
if ( logit )
{
/* Sets 'controller' to the hostname of the history's controller. */
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
if ( hostname == controller )
{
clog ("%s view %s %s %02d..%02d: %s,",
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
else
{
clog ("%s view from %s %s %s %02d..%02d: %s,",
controller.c_str(),
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
}
/* Start a new line ; done whether the full line was logged or not. */
start = stop + 1 ;
line.clear();
first = true ;
newline = false ;
}
/* Remember this entry for the comparison in the next iteration. */
e = history_ptr->entry[this_index] ;
/* manage index tracking */
if ( this_index == (MTCE_HBS_HISTORY_ENTRIES-1))
this_index = 0 ;
else
this_index++ ;
}
/* Handle what is left in the line after the last entry. */
if (( newline == false ) && ( line.length() ))
{
/* No difference was seen on this pass but one was seen on an earlier
 * call ; log this history one more time and clear the indication. */
if (( logit == false ) && ( was_diff[h] == true ))
{
logit = true ;
was_diff[h] = false ;
}
if ( logit )
{
/* A line was already flushed for this history so the
 * remainder is logged as a continuation line. */
if ( first )
{
clog ("............ %s %s %02d..%02d: %s",
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
else
{
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
if ( hostname == controller )
{
clog ("%s view %s %s %02d..%02d: %s",
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
else
{
clog ("%s view from %s %s %s %02d..%02d: %s",
controller.c_str(),
hostname.c_str(),
log_prefix.c_str(), /* Infra <- */
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
}
}
else
{
was_diff[h] = false ;
}
}
}
}
}
/****************************************************************************
*
* name : hbs_cluster_dump
*
* Description: Formatted dump of the vault contents to the log file.
*
***************************************************************************/
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault )
{
#define MAX_LINE_LEN (500)

    syslog ( LOG_INFO, "Cluster Vault Dump: --------------------------------------------------------------------------------------------");
    syslog ( LOG_INFO, "Cluster Vault: v%d.%d %d msec period ; SM Reqid is %d with storage-0 %s and %d histories in %d bytes",
             vault.version,
             vault.revision,
             vault.period_msec,
             vault.reqid,
             vault.storage0_enabled ? "enabled" : "disabled",
             vault.histories,
             vault.bytes );

    /* One log line for each populated history ; the history count is
     * taken from the vault so it is bounded by the history array size. */
    for ( int h = 0 ;
          ( h < vault.histories ) &&
          ( h < MTCE_HBS_MAX_HISTORY_ELEMENTS ) ; h++ )
    {
        char   str[MAX_LINE_LEN] ;
        size_t used = 0 ;

        /* Start with an empty string so that a history with
         * no entries does not log an uninitialized buffer. */
        str[0] = '\0' ;

        /* entries_max is taken from the vault as well ;
         * never read beyond the end of the entry array. */
        int entries = vault.history[h].entries_max ;
        if ( entries > MTCE_HBS_HISTORY_ENTRIES )
            entries = MTCE_HBS_HISTORY_ENTRIES ;

        /* Format each entry as '[enabled:responding]' with a '>'
         * marking the entry that is at the oldest entry index. */
        for ( int e = 0 ; e < entries ; e++ )
        {
            /* Only the unused part of the buffer is available. */
            size_t room = sizeof(str) - used ;
            int    len  = snprintf ( &str[used], room, "%c[%d:%d]" ,
                                     vault.history[h].oldest_entry_index==e ? '>' : ' ',
                                     vault.history[h].entry[e].hosts_enabled,
                                     vault.history[h].entry[e].hosts_responding);
            if ( len < 0 )
            {
                /* Format error ; drop this entry and stop. */
                str[used] = '\0' ;
                break ;
            }
            if ( (size_t)len >= room )
            {
                /* Truncated ; the string is terminated but full. */
                break ;
            }
            used += (size_t)len ;
        }

        syslog ( LOG_INFO, "Cluster Vault: C%d %s S:%s:%s (%d:%d) %s",
                 vault.history[h].controller,
                 hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
                 vault.storage0_enabled ? "y" : "n",
                 vault.history[h].storage0_responding ? "y" : "n",
                 vault.history[h].entries_max,
                 vault.history[h].entries,
                 str);
    }
    // dump_memory ( &vault, 16, vault.bytes );
}

View File

@ -0,0 +1,109 @@
/*
* Copyright (c) 2018 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* @file StarlingX Maintenance Heartbeat Cluster Manager Module
*
*************************************************************************
*
* This module provides API for the hbsAgent service to call to
* collect, store and send heartbeat cluster information to SM
* upon request. See hbsCluster.h for formal API.
*
*************************************************************************/
#ifndef __MTCEHBSCLUSTER_H__
#define __MTCEHBSCLUSTER_H__
#include <sys/types.h>
/**************************************************************
* Implementation Structure
*************************************************************/
/* Values for the version, revision and magic_number fields that make
 * up the header of the mtce_hbs_cluster_type structure defined below. */
#define MTCE_HBS_CLUSTER_VERSION (1)
#define MTCE_HBS_CLUSTER_REVISION (0)
#define MTCE_HBS_MAGIC_NUMBER (0x5aa5)
/* Heartbeat cluster network identifiers.
 * The OAM network is only present when MONITORED_OAM_NETWORK is defined.
 * MTCE_HBS_NETWORKS is last, so its value is the number of networks. */
typedef enum
{
MTCE_HBS_NETWORK_MGMT = 0,
MTCE_HBS_NETWORK_INFRA = 1,
#ifdef MONITORED_OAM_NETWORK
MTCE_HBS_NETWORK_OAM,
#endif
MTCE_HBS_NETWORKS
} mtce_hbs_network_enum ;
/* maximum number of controllers ; 3 only if THREE_CONTROLLER_SYSTEM is defined */
#ifdef THREE_CONTROLLER_SYSTEM
#define MTCE_HBS_MAX_CONTROLLERS (3)
#else
#define MTCE_HBS_MAX_CONTROLLERS (2)
#endif
/* maximum number of networks ; 3 only if MONITORED_OAM_NETWORK is defined */
#ifdef MONITORED_OAM_NETWORK
#define MTCE_HBS_MAX_NETWORKS (3)
#else
#define MTCE_HBS_MAX_NETWORKS (2)
#endif
/* number of entries in each history's entry array ;
 * value of 20 at 100 msec period is 2 seconds of history */
#define MTCE_HBS_HISTORY_ENTRIES (20)
/* maximum number of history elements permitted in a cluster history summary ;
 * one element for each controller and network combination */
#define MTCE_HBS_MAX_HISTORY_ELEMENTS ((MTCE_HBS_MAX_CONTROLLERS)*(MTCE_HBS_NETWORKS))
/* Applies the packed attribute to the type name it wraps ;
 * can be overridden by defining ALIGN_PACK before this point. */
#ifndef ALIGN_PACK
#define ALIGN_PACK(x) __attribute__((packed)) x
#endif
/* A single element of Heartbeat Cluster History for one heartbeat period */
typedef struct
{
unsigned short hosts_enabled ; /* # of hosts being hb monitored */
unsigned short hosts_responding ; /* # of hosts that responded to hb */
} ALIGN_PACK(mtce_hbs_cluster_entry_type);
/* Heartbeat Cluster History for one monitored network of a Controller.
 * The first four members are bit fields that share one 16 bit value. */
typedef struct
{
unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */
unsigned short network :4 ; /* see mtce_hbs_network_enum */
unsigned short reserved_bits :7 ; /* future - initted to 0 */
unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */
unsigned short entries ; /* # of valid values in .entry */
unsigned short entries_max ; /* max size of the entry array */
unsigned short oldest_entry_index ; /* the oldest entry in the array */
/* historical array of entries for a specific network */
mtce_hbs_cluster_entry_type entry [MTCE_HBS_HISTORY_ENTRIES] ;
} ALIGN_PACK(mtce_hbs_cluster_history_type) ;
/* Heartbeat Cluster History for all monitored networks of all Controllers */
typedef struct
{
/* Header - Static Data - 4 bytes */
unsigned char version ; /* public API MTCE_HBS_CLUSTER_VERSION */
unsigned char revision ; /* public API MTCE_HBS_CLUSTER_REVISION */
unsigned short magic_number ; /* public API MTCE_HBS_MAGIC_NUMBER */
/* Control - Dynamic Data - 8 bytes */
unsigned short reqid ; /* added from SM cluster request */
unsigned short period_msec ; /* heartbeat period in milliseconds */
unsigned short bytes ; /* total struct size self check */
unsigned char storage0_enabled; /* bool containing true or false */
unsigned char histories ; /* How many history elements follow */
/* Array of Cluster History
*
* - histories above specifies how many
* elements of this array are populated.
*/
mtce_hbs_cluster_history_type history [MTCE_HBS_MAX_HISTORY_ELEMENTS] ;
} ALIGN_PACK(mtce_hbs_cluster_type) ;
#endif // __MTCEHBSCLUSTER_H__

View File

@ -23,6 +23,7 @@ SRCS += mtcKeyApi.cpp
SRCS += mtcCmdHdlr.cpp
SRCS += mtcNodeMnfa.cpp
SRCS += mtcVimApi.cpp
SRCS += mtcStubs.cpp
COMPUTE_OBJS = mtcNodeComp.o
COMPUTE_OBJS += mtcCompMsg.o

View File

@ -1935,8 +1935,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
* the host has not reset yet we have disabled services
* then now we need to reset the host to prevet VM duplication
* by forcing a full enable */
if (( node_ptr->uptime_save != 0 ) &&
( node_ptr->uptime >= node_ptr->uptime_save ))
if ((( node_ptr->uptime_save != 0 ) &&
( node_ptr->uptime >= node_ptr->uptime_save )) ||
(( node_ptr->uptime_save == 0 ) &&
( node_ptr->uptime > MTC_MINS_15 )))
{
ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n",
node_ptr->hostname.c_str(), node_ptr->uptime );

View File

@ -0,0 +1,17 @@
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
* @file
* Maintenance Agent Stubs
*/
using namespace std;
#include "nodeClass.h" /* The main link class */
/* Stub ; an hbs_cluster_log that takes no arguments and does nothing. */
void hbs_cluster_log ( void ) { }

40
mtce/src/scripts/hbs-query Executable file
View File

@ -0,0 +1,40 @@
#!/bin/bash
# Copyright (c) 2013-2016 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# This utility requests heartbeat cluster information.
#
# It sends a json string containing a 'cluster_info' request for the
# 'heartbeat' service, with 'sm' as the origin, over the loopback
# interface to the 'sm_server_port' that is read from /etc/mtc.ini
#
# Usage: hbs-query [reqid]
#
#    reqid - optional numeric request id ; defaults to 123
#

# Linux Standard Base (LSB) Error Codes
RETVAL=0
GENERIC_ERROR=1
INVALID_ARGS=2
UNSUPPORTED_FEATURE=3
NOT_INSTALLED=5
NOT_RUNNING=7

PROTOCOL="UDP4-DATAGRAM"
ADDRESS="127.0.0.1"
CONFIG_FILE="/etc/mtc.ini"

socat_exec=$(command -v socat 2> /dev/null)
if [ -z "${socat_exec}" ] ; then
    logger "Error: $0 cannot find socat exec"
    exit ${NOT_INSTALLED}
fi

reqid=123
if [ "${1}" != "" ] ; then
    reqid=${1}
fi

# The reqid goes into the json string unquoted so it has to be a number.
if ! [[ "${reqid}" =~ ^[0-9]+$ ]] ; then
    logger "Error: $0 reqid '${reqid}' is not a number"
    exit ${INVALID_ARGS}
fi

# Third field of the first line whose first field is 'sm_server_port'.
port=$(awk '$1 == "sm_server_port" { print $3 ; exit }' "${CONFIG_FILE}" 2> /dev/null)
if [ -z "${port}" ] ; then
    logger "Error: $0 cannot find sm_server_port in ${CONFIG_FILE}"
    exit ${GENERIC_ERROR}
fi

echo "{\"origin\":\"sm\", \"service\":\"heartbeat\", \"request\":\"cluster_info\", \"reqid\": $reqid }" | "${socat_exec}" - ${PROTOCOL}:${ADDRESS}:${port} || RETVAL=${GENERIC_ERROR}

exit ${RETVAL}