diff --git a/mtce-common/src/common/jsonUtil.cpp b/mtce-common/src/common/jsonUtil.cpp index 2a315b16..092c8387 100644 --- a/mtce-common/src/common/jsonUtil.cpp +++ b/mtce-common/src/common/jsonUtil.cpp @@ -249,6 +249,47 @@ int jsonUtil_get_key_val ( char * json_str_ptr, return (PASS); } +/* Parse json_str_ptr and load 'value' with what jsonUtil_get_key_value_int + * returns for 'key'. Returns FAIL, leaving 'value' untouched, when the string + * is NULL, empty, "(null)" or cannot be tokenized ; PASS otherwise. */ +int jsonUtil_get_key_val_int ( char * json_str_ptr, + string key, + int & value ) +{ + /* init to null to avoid trap on early cleanup call with + * bad non-null default pointer value */ + struct json_object *raw_obj = (struct json_object *)(NULL); + + if ((json_str_ptr == NULL) || ( *json_str_ptr == '\0' ) || ( ! strncmp ( json_str_ptr, "(null)" , 6 ))) + { + elog ("Cannot tokenize a null json string\n"); + elog ("... json string: %s\n", json_str_ptr ? json_str_ptr : "(null)" ); + return (FAIL); + } + + size_t len_before = strlen (json_str_ptr); + + jlog2 ("String: %s\n", json_str_ptr ); + + raw_obj = json_tokener_parse( json_str_ptr ); + if ( raw_obj ) + { + value = jsonUtil_get_key_value_int ( raw_obj, key.data() ) ; + jlog1 ("%s:%d\n", key.c_str(), value); + } + else + { + size_t len_after = strlen (json_str_ptr); + + elog ("Unable to tokenize string (before:%zu after:%zu);\n", len_before, len_after); + elog ("... 
json string: %s\n", json_str_ptr ); + /* 'value' was never loaded ; do not report PASS to the caller */ + return (FAIL); + } + + json_object_put(raw_obj); + return (PASS); +} /** This utility freads the passed in inventory GET request * response json character string and performes the following diff --git a/mtce-common/src/common/jsonUtil.h b/mtce-common/src/common/jsonUtil.h index 780a519b..79be85a4 100644 --- a/mtce-common/src/common/jsonUtil.h +++ b/mtce-common/src/common/jsonUtil.h @@ -69,6 +69,10 @@ int jsonUtil_get_key_val ( char * json_str_ptr, string key, string & value ); +int jsonUtil_get_key_val_int ( char * json_str_ptr, + string key, + int & value ); + /** Submit a request to get an authorization token and nova URL */ int jsonApi_auth_request ( string & hostname, string & payload ); diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index bef9221a..60c43bbf 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -114,6 +114,8 @@ typedef struct int event_port ; /**< daemon specific event tx port */ int cmd_port ; /**< daemon specific command rx port */ int sensor_port ; /**< sensor read value port */ + int sm_server_port ; /**< port mtce uses to receive data from SM */ + int sm_client_port ; /**< port mtce uses to send SM data */ int start_delay ; /**< startup delay, added for pmon */ int api_retries ; /**< api retries before failure */ int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */ @@ -243,6 +245,19 @@ extern char *program_invocation_short_name; else { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ } +/** Error logger macro with throttling */ +#define elog_throttled(cnt,max,format,args...) 
{ \ + if ( ++cnt == 1 ) \ + { \ + if (ltc()) { printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ + else { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ + } \ + if ( cnt >= max ) \ + { \ + cnt = 0 ; \ + } \ +} + /** Warning logger macro */ #define wlog(format, args...) { \ if ( ltc() ) { printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Warn : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ @@ -387,7 +402,9 @@ extern char *program_invocation_short_name; #define flog(format, args...) { if(daemon_get_cfg_ptr()->debug_fsm) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: FSM : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define tlog(format, args...) { if(daemon_get_cfg_ptr()->debug_timer) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Timer: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } + #define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define clog1(format, args...) { if(daemon_get_cfg_ptr()->debug_state&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define log_event(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Event: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define log_stress(format, args...) 
{ syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Stress: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 120ba0fa..3504acc6 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -23,7 +23,9 @@ using namespace std; #include "returnCodes.h" #include "nodeTimers.h" +#ifndef ALIGN_PACK #define ALIGN_PACK(x) __attribute__((packed)) x +#endif /* Out-Of-Service Stress tests */ #define WANT_SYSINV_API_STRESS 0x00000001 @@ -359,8 +361,12 @@ void daemon_exit ( void ); #define CONTROLLER_0 ((const char *)"controller-0") #define CONTROLLER_1 ((const char *)"controller-1") +#define CONTROLLER_2 ((const char *)"controller-2") #define CONTROLLER ((const char *)"controller") +#define STORAGE_0 ((const char *)"storage-0") +#define STORAGE_1 ((const char *)"storage-1") + /* The infrastructure networking floating IP * * Note: If there is no infra then this label will resolve diff --git a/mtce-common/src/common/nodeUtil.cpp b/mtce-common/src/common/nodeUtil.cpp index 73a75697..a0a89cbb 100755 --- a/mtce-common/src/common/nodeUtil.cpp +++ b/mtce-common/src/common/nodeUtil.cpp @@ -267,7 +267,7 @@ bool is_goenabled ( int nodeType, bool pass ) return daemon_is_file_present ( file ); } -#define LOG_MEMORY(buf) ilog ("%s", buf ); \ +#define LOG_MEMORY(buf) syslog ( LOG_INFO, "%s", buf ); \ buf_ptr = &buf[0]; \ MEMSET_ZERO ( buf ); @@ -279,7 +279,7 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes ) char buf[0x1024] ; char * buf_ptr = &buf[0]; MEMSET_ZERO ( buf ); - ilog ("Dumping Memory:\n"); + syslog ( LOG_INFO, "Dumping Memory: %ld bytes", bytes ); if ( format == 4 ) { int loops = bytes/format ; @@ -294,7 +294,6 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes ) buf_ptr += sprintf ( buf_ptr, "%c", *byte_ptr) ; else buf_ptr += sprintf ( buf_ptr, "%c", '.'); - byte_ptr++ ; } 
LOG_MEMORY(buf); @@ -315,7 +314,6 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes ) buf_ptr += sprintf ( buf_ptr , "%c", *byte_ptr) ; else buf_ptr += sprintf ( buf_ptr , "%c", '.'); - byte_ptr++ ; } LOG_MEMORY(buf); @@ -336,21 +334,12 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes ) buf_ptr += sprintf ( buf_ptr , "%c", *byte_ptr) ; else buf_ptr += sprintf ( buf_ptr , "%c", '.'); - byte_ptr++ ; } LOG_MEMORY(buf); word_ptr += 4 ; } } - byte_ptr = (uint8_t*)raw_ptr ; - ilog ("Raw Hex Dump : %ld\n", bytes ); - for ( unsigned int x = 0 ; x < bytes ; x++ ) - { - buf_ptr += sprintf ( buf_ptr, " %02x", *byte_ptr ); - byte_ptr++ ; - } - // printf ("\n\n"); } diff --git a/mtce-common/src/common/returnCodes.h b/mtce-common/src/common/returnCodes.h index e0c04454..cc2882ae 100644 --- a/mtce-common/src/common/returnCodes.h +++ b/mtce-common/src/common/returnCodes.h @@ -93,7 +93,7 @@ #define FAIL_INVALID_DATA (71) #define FAIL_BAD_STATE (72) #define FAIL_KEY_VALUE_PARSE (73) -#define FAIL____UNUSED____74 (74) +#define FAIL_DATA_SIZE (74) #define FAIL_NOT_FOUND (75) #define FAIL_WORKQ_TIMEOUT (76) #define FAIL_HTTP_DELETE (77) diff --git a/mtce-common/src/daemon/daemon_common.h b/mtce-common/src/daemon/daemon_common.h index 61abb0ab..6215dcb1 100755 --- a/mtce-common/src/daemon/daemon_common.h +++ b/mtce-common/src/daemon/daemon_common.h @@ -207,7 +207,7 @@ int daemon_run_testhead ( void ); #define CONFIG_AGENT_INV_PORT 0x00000100 /**< Inventory Port Number */ #define CONFIG_AGENT_HA_PORT 0x00000200 /**< HA Framework Port Number */ #define CONFIG_CLIENT_MTCALARM_PORT 0x00000400 /**< Send alarm requests to */ -#define CONFIG_RESERVED_800 0x00000800 /**< */ +#define CONFIG_AGENT_SM_CLIENT_PORT 0x00000800 /**< Port to Send SM data on */ #define CONFIG_MTC_TO_HWMON_CMD_PORT 0x00001000 /**< HWmon Port Number */ #define CONFIG_AGENT_KEY_PORT 0x00002000 /**< Keystone HTTP port */ #define CONFIG_AGENT_HBS_MTC_PORT 0x00004000 /**< Heartbeat Service Port */ 
@@ -217,8 +217,8 @@ int daemon_run_testhead ( void ); #define CONFIG_AGENT_MTC_MGMNT_PORT 0x00040000 /**< Agent Infr network port */ #define CONFIG_AGENT_TOKEN_REFRESH 0x00080000 /**< Token refresh rate mask */ #define CONFIG_CLIENT_MTC_INFRA_PORT 0x00100000 /**< Client Infra nwk mtc port */ -#define CONFIG_CLIENT_MTC_MGMNT_PORT 0x00200000 /**< Client mgmnt nwk mtc port */ -#define CONFIG_AGENT_VIM_CMD_PORT 0x00400000 /**< VIM Command Port Mask */ +#define CONFIG_CLIENT_MTC_MGMNT_PORT 0x00200000 /**< Client mgmnt nwk mtc port */ +#define CONFIG_AGENT_SM_SERVER_PORT 0x00400000 /**< Port to RX data from SM */ #define CONFIG_CLIENT_HBS_INFRA_PORT 0x00800000 /**< Infrastructure ntwk Port */ #define CONFIG_CLIENT_HBS_MGMNT_PORT 0x01000000 /**< Management network Port */ #define CONFIG_CLIENT_HBS_EVENT_PORT 0x02000000 /**< Heartbeat Event Messaging */ diff --git a/mtce/centos/mtce.spec b/mtce/centos/mtce.spec index 98f72d78..d69e34bc 100644 --- a/mtce/centos/mtce.spec +++ b/mtce/centos/mtce.spec @@ -90,6 +90,15 @@ of spec operating conditions that can reduce outage time through automated notification and recovery thereby improving overall platform availability for the customer. +%package -n mtce-dev +Summary: Titanuim Server Maintenance Software Development Package +Group: base +Provides: mtce-dev = %{version}-%{release} + +%description -n mtce-dev +Titanuim Cloud Maintenance. This package contains header files, +and related items necessary for software development. 
+ %package -n mtce-pmon Summary: Titanuim Server Maintenance Process Monitor Package Group: base @@ -424,6 +433,9 @@ install -m 644 -p -D %{_buildsubdir}/fsmon/scripts/fsmon.logrotate %{buildroot}% install -m 644 -p -D %{_buildsubdir}/hwmon/scripts/hwmon.logrotate %{buildroot}%{local_etc_logrotated}/hwmon.logrotate install -m 644 -p -D %{_buildsubdir}/alarm/scripts/mtcalarm.logrotate %{buildroot}%{local_etc_logrotated}/mtcalarm.logrotate +# software development files +install -m 644 -p -D %{_buildsubdir}/heartbeat/mtceHbsCluster.h %{buildroot}/%{_includedir}/mtceHbsCluster.h + install -m 755 -p -D %{_buildsubdir}/public/libamon.so.$MAJOR %{buildroot}%{_libdir}/libamon.so.$MAJOR cd %{buildroot}%{_libdir} ; ln -s libamon.so.$MAJOR libamon.so.$MAJOR.$MINOR cd %{buildroot}%{_libdir} ; ln -s libamon.so.$MAJOR libamon.so @@ -621,3 +633,10 @@ install -m 755 -d %{buildroot}/var/run %{_sysconfdir}/init.d/hostw %{local_bindir}/hostwd +############################### +# Maintenance Software Development RPM +############################### +%files -n mtce-dev +%defattr(-,root,root,-) + +%{_includedir}/mtceHbsCluster.h diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 4b54be53..825e60f7 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -269,7 +269,7 @@ nodeLinkClass::nodeLinkClass() hbs_ready = false ; hbs_state_change = false ; hbs_disabled = true ; - hbs_pulse_period = hbs_pulse_period_save = 200 ; + hbs_pulse_period = hbs_pulse_period_save = 0 ; hbs_minor_threshold = HBS_MINOR_THRESHOLD ; hbs_degrade_threshold = HBS_DEGRADE_THRESHOLD ; hbs_failure_threshold = HBS_FAILURE_THRESHOLD ; @@ -7325,18 +7325,40 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_ int send_event ( string & hostname, unsigned int cmd, iface_enum iface ); -int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear ) +int nodeLinkClass::mon_host ( const string & 
hostname, bool true_false, bool send_clear ) { - int rc = FAIL ; - if ( ! hostname.empty() ) + nodeLinkClass::node* node_ptr ; + node_ptr = nodeLinkClass::getNode ( hostname ); + if ( node_ptr != NULL ) { - nodeLinkClass::node* node_ptr ; - node_ptr = nodeLinkClass::getNode ( hostname ); - if ( node_ptr != NULL ) + bool want_log = true ; + for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - node_ptr->monitor[iface] = true_false ; + if ( node_ptr->monitor[iface] == true_false ) + continue ; + + if ( iface == INFRA_IFACE ) + { + if ( this->infra_network_provisioned == false ) + continue ; + + if ( node_ptr->monitor[MGMNT_IFACE] == true_false ) + want_log = false ; + } + + if ( send_clear == true ) + { + send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, (iface_enum)iface ) ; + send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, (iface_enum)iface ) ; + } + if ( true_false == true ) { + if ( want_log ) + { + ilog ("%s starting heartbeat service \n", + hostname.c_str()); + } node_ptr->no_work_log_throttle = 0 ; node_ptr->b2b_misses_count[iface] = 0 ; node_ptr->hbs_misses_count[iface] = 0 ; @@ -7345,16 +7367,20 @@ int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool tr node_ptr->hbs_failure[iface] = false ; node_ptr->hbs_minor[iface] = false ; node_ptr->hbs_degrade[iface] = false ; - if ( send_clear == true ) + } + else + { + if ( want_log ) { - send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ; - send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ; + ilog ("%s stopping heartbeat service\n", + hostname.c_str()); } } - return PASS ; + node_ptr->monitor[iface] = true_false ; } + return PASS ; } - return ( rc ); + return ( FAIL ); } /* store the current hardware monitor monitoring state */ @@ -7887,11 +7913,11 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle } else { - pulse_list[iface].head_ptr = 
pulse_list[iface].head_ptr->pulse_link[iface].next_ptr ; - pulse_list[iface].head_ptr->pulse_link[iface].prev_ptr = NULL ; + pulse_list[iface].head_ptr = pulse_list[iface].head_ptr->pulse_link[iface].next_ptr ; + pulse_list[iface].head_ptr->pulse_link[iface].prev_ptr = NULL ; + } } } - } else if ( pulse_list[iface].tail_ptr == pulse_ptr ) { qlog2 ("%s Pulse: Multiple Node -> Tail Case : %d of %d\n", node_ptr->hostname.c_str(), pulse_ptr->linknum[iface], pulses[iface] ); @@ -7906,19 +7932,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle } else { - pulse_list[iface].tail_ptr = pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr ; - pulse_list[iface].tail_ptr->pulse_link[iface].next_ptr = NULL ; - } + pulse_list[iface].tail_ptr = pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr ; + pulse_list[iface].tail_ptr->pulse_link[iface].next_ptr = NULL ; + } } else { /* July 1 emacdona: Make failure path case more robust */ - if ( pulse_ptr == NULL ) { slog ("Internal Err 1\n"); rc = FAIL; } - else if ( pulse_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 2\n"); rc = FAIL; } - else if ( pulse_ptr->pulse_link[iface].prev_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 3\n"); rc = FAIL; } - else if ( pulse_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 4\n"); rc = FAIL; } - else if ( pulse_ptr->pulse_link[iface].next_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 5\n"); rc = FAIL; } - + if ( pulse_ptr == NULL ) { slog ("Internal Err 1\n"); rc = FAIL; } + else if ( pulse_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 2\n"); rc = FAIL; } + else if ( pulse_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 3\n"); rc = FAIL; } if ( rc == FAIL ) { slog ("%s Null pointer error splicing %s out of pulse list with %d pulses remaining (Monitoring:%s)\n", @@ -7935,7 +7958,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, 
bool cle } if ( rc == PASS ) { - pulse_ptr->linknum[iface]-- ; // = 0 ; + pulse_ptr->linknum[iface]-- ; } pulses[iface]-- ; } @@ -8082,14 +8105,26 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p -int nodeLinkClass::lost_pulses ( iface_enum iface ) +int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) { - int rc = PASS ; + int lost = 0 ; + /* + * Assume storage-0 is responding until proven otherwise. + * Keep in mind that this interface counts nodes that have not responded ; + * not those that have. + */ + storage_0_responding = true ; + + /* + * Loop over the pulse_list which now only contains a list of hosts + * that have not responded in this heartbeat period. + */ for ( ; pulse_list[iface].head_ptr != NULL ; ) { daemon_signal_hdlr (); pulse_ptr = pulse_list[iface].head_ptr ; + lost++ ; if ( active ) { string flat = "Flat Line:" ; @@ -8098,6 +8133,15 @@ int nodeLinkClass::lost_pulses ( iface_enum iface ) pulse_ptr->b2b_pulses_count[iface] = 0 ; // pulse_ptr->max_count[iface]++ ; + /* + * Update storage_0_responding reference to false if storage-0 + * is found in the lost pulse list. 
+ */ + if ( pulse_ptr->hostname == STORAGE_0 ) + { + storage_0_responding = false ; + } + /* Don't log single misses unless in debug mode */ if ( pulse_ptr->b2b_misses_count[iface] > 1 ) { @@ -8156,8 +8200,9 @@ int nodeLinkClass::lost_pulses ( iface_enum iface ) get_iface_name_str(iface), pulse_ptr->b2b_misses_count[iface] ); } +#ifdef WANT_HBS_MEM_LOGS mem_log ( flat, pulse_ptr->b2b_misses_count[iface], pulse_ptr->hostname.c_str()); - +#endif if ( iface == MGMNT_IFACE ) { if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold ) @@ -8252,8 +8297,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface ) if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] ) pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ; } - rc = remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS ); - if ( rc != PASS ) + if ( remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS )) { elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface)); @@ -8266,7 +8310,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface ) break ; } } - return (rc); + return (lost); } /* Return true if the specified interface is being monitored for this host */ @@ -8301,7 +8345,7 @@ void nodeLinkClass::print_pulse_list ( iface_enum iface ) if ( pulse_list[iface].head_ptr != NULL ) { - for ( pulse_ptr = pulse_list[iface].head_ptr ; + for ( pulse_ptr = pulse_list[iface].head_ptr ; pulse_ptr != NULL ; pulse_ptr = pulse_ptr->pulse_link[iface].next_ptr ) { @@ -8310,12 +8354,15 @@ void nodeLinkClass::print_pulse_list ( iface_enum iface ) } dlog ("Patients: %s\n", pulse_host_list.c_str()); } + +#ifdef WANT_HBS_MEM_LOGS if ( pulses[iface] && !pulse_host_list.empty() ) { string temp = get_iface_name_str(iface) ; temp.append(" Patients :") ; mem_log ( temp, pulses[iface], pulse_host_list ); } +#endif } diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index d009ea3c..cb6d59ff 100755 --- 
a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1940,7 +1940,7 @@ public: void manage_pulse_flags ( string & hostname, unsigned int flags ); /** Control the heartbeat monitoring state of a host */ - int mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear ); + int mon_host ( const string & hostname, bool true_false, bool send_clear ); /** Return true if the pulse list is empty */ bool pulse_list_empty ( iface_enum iface ); @@ -1956,7 +1956,7 @@ public: * that exceed preset thresholds. * */ - int lost_pulses ( iface_enum iface ); + int lost_pulses ( iface_enum iface, bool & storage_0_responding ); bool monitored_pulse ( string hostname , iface_enum iface ); diff --git a/mtce/src/heartbeat/Makefile b/mtce/src/heartbeat/Makefile index 7c3d5f31..9fe77f3b 100755 --- a/mtce/src/heartbeat/Makefile +++ b/mtce/src/heartbeat/Makefile @@ -4,10 +4,10 @@ # SPDX-License-Identifier: Apache-2.0 # -SRCS = hbsAlarm.cpp hbsClient.cpp hbsAgent.cpp hbsPmon.cpp hbsStubs.cpp +SRCS = hbsAlarm.cpp hbsClient.cpp hbsAgent.cpp hbsPmon.cpp hbsUtil.cpp hbsCluster.cpp hbsStubs.cpp OBJS = $(SRCS:.cpp=.o) -LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -lrt -lamon -lcrypto -luuid +LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -lrt -lamon -lcrypto -luuid -ljson-c INCLUDES = -I. 
-I/usr/include/mtce-daemon -I/usr/include/mtce-common INCLUDES += -I../common -I../alarm -I../maintenance -I../public @@ -31,8 +31,8 @@ endif all: static_analysis common agent client build: static_analysis $(OBJS) - $(CXX) $(CCFLAGS) hbsAlarm.o hbsAgent.o hbsStubs.o ../common/nodeClass.o -L../public -L../alarm $(LDLIBS) -o hbsAgent - $(CXX) $(CCFLAGS) hbsClient.o hbsPmon.o -L../public -L../alarm $(LDLIBS) -o hbsClient + $(CXX) $(CCFLAGS) hbsAlarm.o hbsAgent.o hbsUtil.o hbsCluster.o hbsStubs.o ../common/nodeClass.o -L../public -L../alarm $(LDLIBS) -o hbsAgent + $(CXX) $(CCFLAGS) hbsClient.o hbsPmon.o hbsUtil.o -L../public -L../alarm $(LDLIBS) -o hbsClient common: ( cd ../common ; make clean ; make lib VER=$(VER) VER_MJR=$(VER_MJR)) diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index 0c7580e2..ad094ee5 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -41,6 +41,7 @@ using namespace std; #include "hbsBase.h" /* Heartbeat Base Header File */ #include "hbsAlarm.h" /* for ... hbsAlarm_clear_all */ #include "alarm.h" /* for ... alarm send message to mtcalarmd */ +#include "jsonUtil.h" /* for ... jsonUtil_get_key_val */ /************************************************************** * Implementation Structure @@ -68,6 +69,8 @@ using namespace std; /* Number of back to back interface errors before the interface is re-initialized. */ #define INTERFACE_ERRORS_FOR_REINIT (8) +#define MAX_LEN 1000 + /* Historical String data for mem_logs */ static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ; static string arrival_histogram[MAX_IFACES] = { "" , "" } ; @@ -90,6 +93,8 @@ int module_init ( void ) return (PASS); } +static unsigned int controller_number = 0 ; + void daemon_sigchld_hdlr ( void ) { ; /* dlog("Received SIGCHLD ... 
no action\n"); */ @@ -184,14 +189,16 @@ void daemon_exit ( void ) CONFIG_AGENT_HBS_DEGRADE |\ CONFIG_AGENT_HBS_FAILURE |\ CONFIG_AGENT_MULTICAST |\ - CONFIG_SCHED_PRIORITY |\ + CONFIG_SCHED_PRIORITY |\ CONFIG_MTC_TO_HBS_CMD_PORT |\ CONFIG_HBS_TO_MTC_EVENT_PORT |\ CONFIG_AGENT_HBS_MGMNT_PORT |\ CONFIG_AGENT_HBS_INFRA_PORT |\ CONFIG_CLIENT_HBS_MGMNT_PORT |\ CONFIG_CLIENT_MTCALARM_PORT |\ - CONFIG_CLIENT_HBS_INFRA_PORT ) + CONFIG_CLIENT_HBS_INFRA_PORT |\ + CONFIG_AGENT_SM_SERVER_PORT |\ + CONFIG_AGENT_SM_CLIENT_PORT) /* Startup config read */ static int hbs_config_handler ( void * user, @@ -203,6 +210,8 @@ static int hbs_config_handler ( void * user, if (MATCH("agent", "heartbeat_period")) { + int curr_period = hbsInv.hbs_pulse_period ; + config_ptr->hbs_pulse_period = atoi(value); hbsInv.hbs_pulse_period = atoi(value); hbsInv.hbs_state_change = true ; @@ -227,10 +236,14 @@ static int hbs_config_handler ( void * user, } } } + hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ; + if ( curr_period != hbsInv.hbs_pulse_period ) + { + /* initialize cluster info */ + hbs_cluster_init ( hbsInv.hbs_pulse_period ); + } } - hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ; - if (MATCH("agent", "hbs_minor_threshold")) { config_ptr->hbs_minor_threshold = @@ -312,6 +325,16 @@ static int hbs_config_handler ( void * user, config_ptr->hbs_agent_mgmnt_port = atoi(value); config_ptr->mask |= CONFIG_AGENT_HBS_MGMNT_PORT ; } + else if (MATCH("agent", "sm_server_port")) + { + config_ptr->sm_server_port = atoi(value); + config_ptr->mask |= CONFIG_AGENT_SM_SERVER_PORT ; + } + else if (MATCH("agent", "sm_client_port")) + { + config_ptr->sm_client_port = atoi(value); + config_ptr->mask |= CONFIG_AGENT_SM_CLIENT_PORT ; + } else if (MATCH("client", "hbs_client_mgmnt_port")) { config_ptr->hbs_client_mgmnt_port = atoi(value); @@ -617,6 +640,34 @@ int alarm_port_init ( void ) return ( hbs_sock.alarm_sock->return_status ) ; } +int hbs_sm_sockets_init ( void ) +{ + int rc = PASS ; + + /* 
Create an UDP RX Message Socket for SM Requests; LO interface only */ + hbs_sock.sm_server_sock = new msgClassRx(LOOPBACK_IP, hbs_config.sm_server_port, IPPROTO_UDP); + if ( ! hbs_sock.sm_server_sock ) + { + elog ("Failed to setup SM receive socket"); + rc = FAIL_SOCKET_CREATE ; + } + + /* Create an UDP TX Message Socket for SM Requests; LO interface only */ + hbs_sock.sm_client_sock = new msgClassTx(LOOPBACK_IP, hbs_config.sm_client_port,IPPROTO_UDP); + if ( ! hbs_sock.sm_client_sock ) + { + elog ("Failed to setup SM transmit socket"); + rc = FAIL_SOCKET_CREATE ; + } + + if ( rc == PASS ) + { + hbs_sock.sm_server_sock->sock_ok(true); + hbs_sock.sm_client_sock->sock_ok(true); + } + return (rc); +} + /* Init the internal/local sockets ; the ones that will no change. * This way we don't miss add and start commands from maintenance. */ @@ -654,6 +705,9 @@ int hbs_int_socket_init ( void ) { elog ("Alarm port setup or registration failed (rc:%d)\n", rc ); } + + rc = hbs_sm_sockets_init () ; + return (rc); } @@ -697,26 +751,36 @@ int hbs_pulse_request ( iface_enum iface, string hostname_clue, unsigned int lookup_clue) { - int rc = PASS ; - #define MAX_LEN 1000 +#ifdef WANT_HBS_MEM_LOGS char str[MAX_LEN] ; - - /* Add the sequence number */ - hbs_sock.tx_mesg[iface].s = seq_num ; - memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME ); - if (( lookup_clue ) && - ( hostname_clue.length() <= MAX_CHARS_HOSTNAME )) - { - hbs_sock.tx_mesg[iface].c = lookup_clue ; - memcpy ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], - hostname_clue.data(), - hostname_clue.length()); - } - /* Message length is the size of the sequence number, the clue and the buffer */ - - int msg_len = (HBS_MAX_MSG+(sizeof(unsigned int)*2)) ; +#endif + int bytes = 0 ; if ( hbs_sock.tx_sock[iface] ) { + // int unused_networks = 0 ; + memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME ); + + /* Add message version - 0 -> 1 with the acction of cluster information 
*/ + hbs_sock.tx_mesg[iface].v = HBS_MESSAGE_VERSION ; + + /* Add the sequence number */ + hbs_sock.tx_mesg[iface].s = seq_num ; + + if (( lookup_clue ) && + ( hostname_clue.length() <= MAX_CHARS_HOSTNAME )) + { + hbs_sock.tx_mesg[iface].c = lookup_clue ; + memcpy ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], + hostname_clue.data(), + hostname_clue.length()); + } + + /* Append the cluster info to the pulse request */ + hbs_cluster_append(hbs_sock.tx_mesg[iface]) ; + + /* Calculate the total message size */ + bytes = sizeof(hbs_message_type)-hbs_cluster_unused_bytes(); + #ifdef WANT_FIT_TESTING if ( daemon_want_fit ( FIT_CODE__NO_PULSE_REQUEST, "any" , get_iface_name_str(iface) ) ) { @@ -727,14 +791,15 @@ int hbs_pulse_request ( iface_enum iface, goto hbs_pulse_request_out ; } #endif - if ( (rc = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], msg_len)) < 0 ) + + if ( (bytes = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], bytes)) < 0 ) { elog("Failed to send Pulse request: %d:%s to %s.%d (rc:%i ; %d:%s)\n", hbs_sock.tx_mesg[iface].s, &hbs_sock.tx_mesg[iface].m[0], hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), - rc, errno, strerror(errno) ); + bytes, errno, strerror(errno) ); return (FAIL_SOCKET_SENDTO); } } @@ -748,16 +813,17 @@ int hbs_pulse_request ( iface_enum iface, hbs_pulse_request_out: #endif - mlog1("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%x:%s\n", - get_iface_name_str(iface), rc, + mlog("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%d:%x:%s\n", + get_iface_name_str(iface), bytes, hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), + hbs_sock.tx_mesg[iface].v, hbs_sock.tx_mesg[iface].s, hbs_sock.tx_mesg[iface].c, hbs_sock.tx_mesg[iface].f, hbs_sock.tx_mesg[iface].m); - +#ifdef WANT_HBS_MEM_LOGS snprintf ( &str[0], MAX_LEN, "%s Pulse Req: %17s:%5d: %u:%u:%s\n", get_iface_name_str(iface), 
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), @@ -766,6 +832,7 @@ hbs_pulse_request_out: hbs_sock.tx_mesg[iface].c, hbs_sock.tx_mesg[iface].m); mem_log (&str[0]); +#endif return (PASS); } @@ -785,7 +852,7 @@ string get_hostname_from_pulse ( char * msg_ptr ) int _pulse_receive ( iface_enum iface , unsigned int seq_num ) { - int n = 0 ; + int bytes = 0 ; int detected_pulses = 0 ; @@ -796,7 +863,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) do { /* Clean the receive buffer */ - memset ( hbs_sock.rx_mesg[iface].m, 0, HBS_MAX_MSG ); + memset ( hbs_sock.rx_mesg[iface].m, 0, sizeof(hbs_message_type) ); hbs_sock.rx_mesg[iface].s = 0 ; hbs_sock.rx_mesg[iface].c = 0 ; if ( hbs_sock.rx_sock[iface] == NULL ) @@ -804,10 +871,10 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) elog ("%s cannot receive pulses - null object\n", get_iface_name_str(iface) ); return (0); } - if ( (n = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 ) + if ( (bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 ) { mlog1 ("%s Pulse Rsp: (%5d): %17s:%5d: %d:%d:%x:%s\n", - get_iface_name_str(iface), n, + get_iface_name_str(iface), bytes, hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, @@ -839,7 +906,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) } #endif - mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str()); + // mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str()); if ( !hostname.compare("localhost") ) { mlog3 ("%s Pulse Rsp (local): %17s:%5d: %d:%d:%x:%s\n", @@ -868,7 +935,6 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) { if ( hbsInv.monitored_pulse ( hostname , iface ) == true ) { - #define MAX_LEN 1000 char str[MAX_LEN] ; string extra = "Rsp" ; @@ -880,25 +946,42 @@ int 
_pulse_receive ( iface_enum iface , unsigned int seq_num ) { rc = hbsInv.remove_pulse ( hostname, iface, hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f ) ; } - snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %17s:%5d: %u:%u:%x:%s\n", - get_iface_name_str(iface), extra.c_str(), n, + snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %s:%d: %u:%u:%x:%s\n", + get_iface_name_str(iface), extra.c_str(), bytes, hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f, hbs_sock.rx_mesg[iface].m); - mlog1 ("%s", &str[0]); + mlog ("%s", &str[0]); +#ifdef WANT_HBS_MEM_LOGS mem_log (str); +#endif if ( extra.empty()) { detected_pulses++ ; } + /* don't save data from self */ + if ( hostname != hbsInv.my_hostname ) + { + if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION ) + { + if ( iface == MGMNT_IFACE ) + hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_MGMT , hbs_sock.rx_mesg[iface]); + else + hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_INFRA , hbs_sock.rx_mesg[iface]); + } + } + else + { +ilog ("skipping my hostname"); + } } else { mlog3 ("%s Pulse Dis: (%5d): %17s:%5d: %d:%d:%x:%s\n", - get_iface_name_str(iface), n, + get_iface_name_str(iface), bytes, hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, @@ -934,7 +1017,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) hbs_sock.rx_mesg[iface].m) ; } } - } while ( n > 0 ) ; + } while ( bytes > 0 ) ; monitor_scheduling ( after_rx_time, before_rx_time, detected_pulses, SCHED_MONITOR__RECEIVER ); return (detected_pulses); } @@ -951,6 +1034,8 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface ) if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS ) { daemon_dump_membuf_banner (); + hbsInv.print_node_info (); + hbs_cluster_log( hbsInv.my_hostname, "event"); daemon_dump_membuf (); 
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header()); } @@ -1038,6 +1123,9 @@ int daemon_init ( string iface, string nodetype ) /* Initialize the hbs control struct */ MEMSET_ZERO ( hbs_ctrl ); + /* init the utility module */ + hbs_utils_init (); + /* initialize the timer */ mtcTimer_init ( hbsTimer, "controller", "heartbeat" ); @@ -1091,9 +1179,123 @@ int daemon_init ( string iface, string nodetype ) return (rc); } +/***************************************************************************** + * + * Name : hbs_sm_handler + * + * Description: Try and receive a Service Management request from sm_server_sock + * + * Expecting request in the following form: + * ~66 bytes with moderate spacing + * + * { + * "origin" :"sm", + * "service":"heartbeat", + * "request":"cluster_info" + * "req_id" : number + * } + * + * Successfully parsed request results in a call to + * hbs_cluser_send which sends the latest snapshot of + * the heartbeat cluser info to SM. + * + * Assumptions: log flooding is avoided. + * + * Returns : Nothing + * + ****************************************************************************/ +static int _hbs_sm_handler_log_throttle = 0 ; +void hbs_sm_handler ( void ) +{ + #define _MAX_MSG_LEN (80) + #define _MAX_LOG_CNT (1000) + + #define PRIMARY_LABEL "origin" + #define SERVICE_LABEL "service" + #define REQUEST_LABEL "request" + #define REQID_LABEL "reqid" + + #define SUPPORTED_ORIGIN "sm" + #define SUPPERTED_SERVICE "heartbeat" + #define SUPPORTED_REQUEST "cluster_info" + + char sm_mesg[_MAX_MSG_LEN] ; + MEMSET_ZERO(sm_mesg); + int bytes = hbs_sock.sm_server_sock->read((char*)&sm_mesg, _MAX_MSG_LEN); + if ( bytes ) + { + /* Expecting request in the following form: + * { "origin":"sm" ... 
} */ + if ( sm_mesg[0] == '{' ) + { + int reqid = 0 ; + string origin = "" ; + string service = "" ; + string request = "" ; + if ( jsonUtil_get_key_val ( sm_mesg, PRIMARY_LABEL, origin ) != PASS ) + { + wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT, + "missing primary label 'origin' in request."); + } + else if (( origin == SUPPORTED_ORIGIN ) && + ( jsonUtil_get_key_val ( sm_mesg, SERVICE_LABEL, service ) == PASS ) && + ( jsonUtil_get_key_val ( sm_mesg, REQUEST_LABEL, request ) == PASS ) && + ( jsonUtil_get_key_val_int ( sm_mesg, REQID_LABEL, reqid ) == PASS )) + { + if (( service == SUPPERTED_SERVICE ) && + ( request == SUPPORTED_REQUEST )) + { + /* success path ... */ + hbs_cluster_send( hbs_sock.sm_client_sock, reqid ); + + /* reset log throttle */ + _hbs_sm_handler_log_throttle = 0 ; + } + else + { + wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT, + "missing service or request labels in request."); + } + } + else + { + wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT, + "failed to parse one or more request labels."); + } + } + else + { + wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT, + "improperly formatted json string request."); + } + } + else if ( bytes == -1 ) + { + wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT, + "message receive error (%d:%s)", + errno, strerror(errno)); + } + else + { + wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT, + "unknown error Error (rc:%d)", bytes ); + } + dlog ("... 
%s", sm_mesg ); +} + +/**************************************************************************** + * + * Name : daemon_service_run + * + * Description: Daemon's main loop + * + ***************************************************************************/ + void daemon_service_run ( void ) { +#ifdef WANT_HBS_MEM_LOGS int exp_pulses[MAX_IFACES] ; +#endif int rc = PASS ; int counter = 0 ; int goenabled_wait_log_throttle = 0 ; @@ -1154,6 +1356,8 @@ void daemon_service_run ( void ) daemon_exit (); } + /* set this controller as provisioned */ + hbs_manage_controller_state ( hbsInv.my_hostname , true ); /* CGTS 4114: Small Footprint: Alarm 200.005 remains active after connectivity restored * @@ -1195,6 +1399,16 @@ void daemon_service_run ( void ) /* enable the base level signal handler latency monitor */ daemon_latency_monitor (true); + /* load this controller index number - used for cluster stuff */ + if ( hbsInv.my_hostname == CONTROLLER_0 ) + controller_number = 0 ; + else + controller_number = 1 ; + + /* tell the cluster which controller this is and + * how many networks are being monitored */ + hbs_cluster_nums (controller_number,hbsInv.infra_network_provisioned ?2:1); + /* Run heartbeat service forever or until stop condition */ for ( hbsTimer.ring = false ; ; ) { @@ -1315,6 +1529,14 @@ void daemon_service_run ( void ) FD_SET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds); } + /* Add the sm request receiver to the select list */ + if (( hbs_sock.sm_server_sock ) && + ( hbs_sock.sm_server_sock->getFD())) + { + socks.push_front (hbs_sock.sm_server_sock->getFD()); + FD_SET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds); + } + /* Add the netlink event listener to the select list */ if ( hbs_sock.netlink_sock ) { @@ -1379,6 +1601,11 @@ void daemon_service_run ( void ) hbs_sock.fired[INFRA_INTERFACE] = true ; } + if ((hbs_sock.sm_server_sock != NULL ) && + ( FD_ISSET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds))) + { + hbs_sm_handler(); + } if 
((hbs_sock.mtc_to_hbs_sock != NULL ) && ( FD_ISSET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds))) { @@ -1404,7 +1631,7 @@ void daemon_service_run ( void ) inv.nodetype = msg.parm[0]; hbsInv.add_heartbeat_host ( inv ) ; hostname_inventory.push_back ( hostname ); - ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), inv.nodetype ); + ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), msg.parm[0] ); /* clear any outstanding alarms on the ADD */ if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) @@ -1415,10 +1642,7 @@ void daemon_service_run ( void ) } else if ( msg.cmd == MTC_CMD_DEL_HOST ) { - for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) - { - hbsInv.mon_host ( hostname, (iface_enum)iface, false, false ); - } + hbsInv.mon_host ( hostname, false, false ); hostname_inventory.remove ( hostname ); hbsInv.del_host ( hostname ); ilog ("%s deleted from heartbeat service\n", hostname.c_str()); @@ -1432,27 +1656,24 @@ void daemon_service_run ( void ) } else if ( msg.cmd == MTC_CMD_STOP_HOST ) { - for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) - { - hbsInv.mon_host ( hostname, (iface_enum)iface, false, true ); - } - ilog ("%s stopping heartbeat service\n", hostname.c_str()); + hbsInv.mon_host ( hostname, false, true ); + hbs_cluster_del ( hostname ); + + ilog ("%s stopping heartbeat service\n", + hostname.c_str()); } else if ( msg.cmd == MTC_CMD_START_HOST ) { - for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) - { - hbsInv.mon_host ( hostname, (iface_enum)iface, true, true ); - } - ilog ("%s starting heartbeat service\n", hostname.c_str()); + hbsInv.mon_host ( hostname, true, true ); + hbs_cluster_add ( hostname ); + + ilog ("%s starting heartbeat service\n", + hostname.c_str()); } else if ( msg.cmd == MTC_RESTART_HBS ) { - for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) - { - hbsInv.mon_host ( hostname, (iface_enum)iface, false, false ); - hbsInv.mon_host ( hostname, (iface_enum)iface, true, false ); - } + 
hbsInv.mon_host ( hostname, false, false ); + hbsInv.mon_host ( hostname, true, false ); ilog ("%s restarting heartbeat service\n", hostname.c_str()); hbsInv.print_node_info(); } @@ -1616,7 +1837,9 @@ void daemon_service_run ( void ) int rri = 0 ; string lf = "\n" ; +#ifdef WANT_HBS_MEM_LOGS mem_log ((char*)lf.data()); +#endif /* Get the next Resource Reference Identifier * and its Resourvce Identifier. These values @@ -1630,7 +1853,9 @@ void daemon_service_run ( void ) if (( iface == INFRA_IFACE ) && ( hbsInv.infra_network_provisioned == false )) continue ; +#ifdef WANT_HBS_MEM_LOGS exp_pulses[iface] = +#endif hbsInv.hbs_expected_pulses[iface] = hbsInv.create_pulse_list((iface_enum)iface); @@ -1759,28 +1984,33 @@ void daemon_service_run ( void ) if (( iface == INFRA_IFACE ) && ( hbsInv.infra_network_provisioned != true )) continue ; - #define MAX_LEN 1000 +#ifdef WANT_HBS_MEM_LOGS char str[MAX_LEN] ; - snprintf (&str[0], MAX_LEN, "%s Histogram: %d - %s\n", get_iface_name_str(iface), exp_pulses[iface], arrival_histogram[iface].c_str()); - mem_log (str); - if ( !unexpected_pulse_list[iface].empty() ) { snprintf ( &str[0], MAX_LEN, "%s Others : %s\n", get_iface_name_str(iface), unexpected_pulse_list[iface].c_str()); - mem_log(str); } - hbsInv.lost_pulses ( (iface_enum)iface ); +#endif + /* + * Assume storage-0 is responding until otherwise proven + * its not. Keep in mind that the 'lost_pulses' interface + * only counts nodes that have not responded. 
+ */ + bool storage_0_responding = true ; + int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding); + hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding); } hbsTimer.ring = false ; heartbeat_request = true ; + // hbs_cluster_log ( hbsInv.my_hostname, "->") ; seq_num++ ; } daemon_load_fit (); @@ -1796,7 +2026,9 @@ void daemon_dump_info ( void ) hbsInv.print_node_info (); hbsInv.memDumpAllState (); +#ifdef WANT_HBS_MEM_LOGS daemon_dump_membuf (); /* write mem_logs to log file and clear log list */ +#endif } const char MY_DATA [100] = { "eieio\n" } ; diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index 8b5cf2d3..264eba57 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -27,6 +27,8 @@ #include #include #include "msgClass.h" +#include "mtceHbsCluster.h" +#include "hbsCluster.h" /** * @addtogroup hbs_base @@ -38,6 +40,8 @@ #endif #define __AREA__ "hbs" +// #define WANT_CLUSTER_DEBUG + #define ALIGN_PACK(x) __attribute__((packed)) x /** Maximum service fail count before action */ @@ -56,15 +60,18 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"}; #define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME) +#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info + /* Heartbeat control structure */ typedef struct { unsigned int nodetype ; bool clear_alarms ; } hbs_ctrl_type ; +hbs_ctrl_type * get_hbs_ctrl_ptr ( void ); /* A heartbeat service message - * if this structire is changed then + * if this structure is changed then * hbs_pulse_request needs to be looked at */ typedef struct @@ -76,7 +83,7 @@ typedef struct unsigned int s ; /* Fast Lookup Clue Info */ - unsigned int c ; + unsigned int c ; /* Status Flags * ------------ @@ -89,6 +96,16 @@ typedef struct /** message version number */ unsigned int v ; + /** Heartbeat cluster information that is put into heartbeat messages. 
+ * + * Pulse Request : To hbsClient: Only 1 controller with up to 2 network types history. + * Pulse Response: From hbsClient: Can include up to 2 controllers with 2 networks each. + * + * This addition requires message verison increment. + * + **/ + mtce_hbs_cluster_type cluster ; + } ALIGN_PACK(hbs_message_type) ; @@ -104,6 +121,12 @@ typedef struct /** Heartbeat Service Event Transmit Interface - hbsClient -> mtcAgent */ msgClassSock* hbs_ready_tx_sock; + /** Heartbeat Service SM Transmit Interface - hbsAgent -> sm */ + msgClassSock* sm_client_sock; + + /** Heartbeat Service SM Receive Interface - sm -> hbsAgent */ + msgClassSock* sm_server_sock; + /** PMON Pulse Receive Interface - pmond -> hbsClient */ msgClassSock* pmon_pulse_sock; @@ -166,6 +189,9 @@ int hbs_refresh_pids ( std::list & proc_list ); int hbs_process_monitor ( std::list & pmon_list ); int hbs_self_recovery ( unsigned int cmd ); +/* returns this controller's number ; 0 or 1 */ +unsigned int hbs_get_controller_number ( void ); + /* Setup the pulse messaging interfaces * 'p' is a boot that indicates if the infrastructure network is provisioned * 'p' = true means it is provisioned */ @@ -184,6 +210,93 @@ int hbs_self_recovery ( unsigned int cmd ); } \ } +/*********** Common Heartbeat Utilities in hbsUtil.cpp ***************/ + +/* module init */ +void hbs_utils_init ( void ); + +/* network enum to name lookup */ +string hbs_cluster_network_name ( mtce_hbs_network_enum network ); + +/* Produce formatted clog's that characterize current and changing cluster + * history for a given network. Each log is controller/network specific. 
*/ +void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix ); + +/* Initialize the specified history array */ +void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history ); + +/* Clear all history in the cluster vault */ +void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster ); + + +/******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/ + +/* Set the cluster vault to default state. + * Called upon daemon init or heartbeat period change. */ +void hbs_cluster_init ( unsigned short period ); + +/* Calculate number of bytes that is unused in the cluster data structure. + * Primarily to know how many history elements are missing. */ +unsigned short hbs_cluster_unused_bytes ( void ); + +/* Add and delete hosts from the monitored list. + * Automatically adjusts the numbers in the cluster vault. */ +void hbs_cluster_add ( string & hostname ); +void hbs_cluster_del ( string & hostname ); + +/* Report status of storgate-0 */ +void hbs_cluster_storage0_status ( iface_enum iface , bool responding ); + +/* Look for and clog changes in cluster state */ +int hbs_cluster_cmp ( hbs_message_type & msg ); + +/* Manage the enabled state of the controllers */ +void hbs_manage_controller_state ( string & hostname, bool enabled ); + +/* Set the number of monitored hosts and this controller's + * number in the cluster vault. */ +void hbs_cluster_nums ( unsigned short this_controller, + unsigned short monitored_networks ); + +/* Copy/Save the peer controller's cluster info from the hbsClient's + * pulse response into the cluster vault so its there and ready for + * an SM cluster_info request. */ +int hbs_cluster_save ( string & hostname, + mtce_hbs_network_enum network, + hbs_message_type & msg ); + +/* + * Called by the hbsAgent pulse receiver to create a network specific + * history update entry consisting of + * + * 1. the number of monitored hosts + * 2. 
how many of those that responded in the last heartbeat period. + * 3. threshold storage-0 responding count and manage that state in that + * networks history header. + */ +void hbs_cluster_update ( iface_enum iface, + unsigned short not_responding_hosts, + bool storage_0_responding ); + +/* Called by the hbsAgent pulse transmitter to append this controllers + * running cluster view in the next multicast pulse request. + * The hbsClient is expected to loop this data and any other like data from + * the other controller back in its response. */ +void hbs_cluster_append ( hbs_message_type & msg ); + +/* Produce formatted clog's that characterize current and changing cluster + * history for a given network. Each log is controller/network specific. */ +void hbs_cluster_log ( string & hostname, string prefix ); + +/* Service SM cluster info request */ +void hbs_sm_handler ( void ); + +/* send the cluster vault to SM */ +void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid ); + +/* print the contents of the vault */ +void hbs_cluster_dump ( mtce_hbs_cluster_type & vault ); + /** * @} hbs_base */ diff --git a/mtce/src/heartbeat/hbsClient.cpp b/mtce/src/heartbeat/hbsClient.cpp index 41b4b9f9..444dae29 100644 --- a/mtce/src/heartbeat/hbsClient.cpp +++ b/mtce/src/heartbeat/hbsClient.cpp @@ -20,7 +20,6 @@ * daemon_files_init * daemon_configure * daemon_signal_init - * hbs_message_init * hbs_socket_init * * daemon_service_run @@ -59,7 +58,7 @@ using namespace std; #include "daemon_option.h" /* Common options for daemons */ #include "nodeTimers.h" /* for ... maintenance timers */ #include "nodeMacro.h" /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */ -#include "nlEvent.h" /* for ... open_netlink_socket */ +#include "nlEvent.h" /* for ... 
open_netlink_socket */ #include "hbsBase.h" /* Heartbeat Base Header File */ extern "C" @@ -95,8 +94,9 @@ typedef struct std::list::iterator proc_ptr ; } stallMon_type ; - +static char pulse_resp_tx_hdr [HBS_MAX_MSG]; static char my_hostname [MAX_HOST_NAME_SIZE+1]; +static char my_hostname_length ; static string my_macaddr = "" ; static string my_address = "" ; static unsigned int my_nodetype= CGTS_NODE_NULL ; @@ -360,6 +360,12 @@ static int hbs_config_handler ( void * user, config_ptr->pmon_pulse_port = atoi(value); config_ptr->mask |= CONFIG_CLIENT_PULSE_PORT ; } +#ifdef WANT_CLUSTER_DEBUG + else if (MATCH("agent", "sm_client_port")) + { + config_ptr->sm_client_port = atoi(value); + } +#endif else { return (PASS); @@ -446,20 +452,6 @@ int daemon_configure ( void ) /* Initialization Utilities */ /****************************/ -/* Initialize the unicast pulse response message */ -/* One time thing ; tx same message all the time. */ -int hbs_message_init ( void ) -{ - /* Build the transmit pulse response message for each interface */ - for ( int i = 0 ; i < MAX_IFACES ; i++ ) - { - memset ( &hbs_sock.tx_mesg[i], 0, sizeof (hbs_message_type)); - memcpy ( &hbs_sock.tx_mesg[i].m[0], &rsp_msg_header[0], HBS_HEADER_SIZE ); - memcpy ( &hbs_sock.tx_mesg[i].m[HBS_HEADER_SIZE], my_hostname, strlen(my_hostname)); - } - return (PASS); -} - /* Initialize pulse messaging for the specified interface * This is called by a macro defined in hbsBase.h */ int _setup_pulse_messaging ( iface_enum i, int rmem ) @@ -621,6 +613,11 @@ int hbs_socket_init ( void ) return (FAIL_SOCKET_NOBLOCK); } +#ifdef WANT_CLUSTER_DEBUG + hbs_sock.sm_client_sock = new msgClassRx(LOOPBACK_IP,hbs_config.sm_client_port,IPPROTO_UDP); + if ( rc ) return (rc) ; + hbs_sock.sm_client_sock->sock_ok(true); +#endif return (PASS); } @@ -648,7 +645,7 @@ int get_pmon_pulses ( void ) if ( !strncmp ( &msg.hdr[0] , get_pmond_pulse_header(), MSG_HEADER_SIZE )) { pulses++ ; - mlog ("Pmon Pulse (%s) (%d)\n", msg.hdr, pulses 
); + mlog1 ("Pmon Pulse (%s) (%d)\n", msg.hdr, pulses ); } else { @@ -710,92 +707,87 @@ static unsigned int my_rri = 0 ; static int rx_error_count[MAX_IFACES] = {0,0} ; static int tx_error_count[MAX_IFACES] = {0,0} ; +#define ERROR_LOG_THRESHOLD (200) + int _service_pulse_request ( iface_enum iface , unsigned int flags ) { - unsigned int s = 0 ; /* Sequence number */ - int n = 0 ; /* message size */ - int rc = 0 ; - if (( iface != MGMNT_IFACE ) && ( iface != INFRA_IFACE )) return (FAIL_BAD_CASE); - memset ( (char*) &hbs_sock.rx_mesg[iface], 0, sizeof(hbs_message_type)); if ( ! hbs_sock.rx_sock[iface] ) { - elog ("cannot receive from null rx_mesg[%s] socket\n", get_iface_name_str(iface) ); + elog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD, + "cannot receive from null rx_mesg[%s] socket\n", + get_iface_name_str(iface) ); return (FAIL_TO_RECEIVE); } - else if ( hbs_sock.rx_sock[iface]->sock_ok() == false ) + else if ( ! hbs_sock.tx_sock[iface] ) { - elog ("cannot receive from failed rx_mesg[%s] socket\n", get_iface_name_str(iface) ); + elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD, + "cannot send to null mesg[%s] socket\n", + get_iface_name_str(iface) ); + return (FAIL_TO_TRANSMIT); + } + else if ( ! hbs_sock.rx_sock[iface]->sock_ok() ) + { + elog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD, + "cannot receive from failed rx_mesg[%s] socket\n", + get_iface_name_str(iface) ); return (FAIL_TO_RECEIVE); } - - n = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type)); - - if( n < HBS_HEADER_SIZE ) + else if ( ! 
hbs_sock.tx_sock[iface]->sock_ok() ) { - rx_error_count[iface]++ ; + elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD, + "cannot send to failed mesg[%s] socket\n", + get_iface_name_str(iface) ); + return (FAIL_TO_TRANSMIT); + } - /* throtle the log so that if they come back-to-back we avoid flooding */ - if ( n == -1 ) + // MEMSET_ZERO(hbs_sock.rx_mesg[iface]); + int rx_bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type)); + if ( rx_bytes < HBS_HEADER_SIZE ) + { + if ( rx_bytes == -1 ) { - if ( rx_error_count[iface] > 1 ) - { - wlog_throttled ( rx_error_count[iface], 500, "%s receive error (%d:%m)\n", get_iface_name_str(iface), errno ); - } + wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD, + "%s receive error (%d:%m)\n", + get_iface_name_str(iface), errno ); } else { - wlog_throttled ( rx_error_count[iface], 500, "%s message underrun (expected %ld but got %d)\n", - get_iface_name_str(iface), sizeof(hbs_message_type), n ); - } - if ( rx_error_count[iface] == 100 ) - { - wlog ( "%s is getting a lot of receive errors (%d:%m)\n", get_iface_name_str(iface), errno ); + wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD, + "%s message underrun (expected %ld but got %d)\n", + get_iface_name_str(iface), + sizeof(hbs_message_type), rx_bytes ); } return (FAIL_TO_RECEIVE); } - /* Clear the error count since we got a good receive */ - rx_error_count[iface] = 0 ; - -#ifdef WANT_NO_SELF_HEARTBEAT_REPLY - /* Don't reply to the heartbeat if the request came from myself */ - if ( ! 
strncmp ( my_address.data(), - hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), - MAX_CHARS_IN_IP_ADDR )) + daemon_config_type * cfg_ptr = daemon_get_cfg_ptr(); + if ( cfg_ptr->debug_msg ) { - ilog ("%s Refusing to send heartbeat response to self\n", hbs_sock.rx_sock[iface]->get_dst_addr()->toString()); - return (PASS); + mlog ("\n"); + mlog ("%s Pulse Req: %s:%5d: %d:%s RRI:%d\n", + get_iface_name_str(iface), + hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), + hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), + hbs_sock.rx_mesg[iface].s, + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c); } -#else - /* We use this to monitor pmond on active controller */ -#endif - - /* Save the sequence number */ - s = hbs_sock.rx_mesg[iface].s ; - - mlog ("\n"); - mlog ("%s Pulse Req: %s:%5d: %d: :%s RRI:%d\n", get_iface_name_str(iface), - hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), - hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), - hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].m, - hbs_sock.rx_mesg[iface].c); + /* verify the message header */ if ( strncmp ( (const char *)&hbs_sock.rx_mesg[iface].m, (const char *)&req_msg_header, HBS_HEADER_SIZE )) { - wlog_throttled ( rx_error_count[iface], 200, "%s Invalid header (%d:%s)\n", - get_iface_name_str(iface), - hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].m ); - - mlog ("Detected: %d <%s>\n", HBS_HEADER_SIZE,hbs_sock.rx_mesg[iface].m); - mlog ("Expected: %d <%s>\n", HBS_HEADER_SIZE,req_msg_header); + wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD, + "%s Invalid header (%d:%s)\n", + get_iface_name_str(iface), + hbs_sock.rx_mesg[iface].s, + hbs_sock.rx_mesg[iface].m ); return (FAIL_MSG_HEADER) ; } + /* Manage the Resource Reference Index (RRI) "lookup clue" */ if ( ! 
strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME )) { @@ -807,32 +799,31 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) } /* Add my RRI to the response message */ - hbs_sock.tx_mesg[iface].c = my_rri ; + hbs_sock.rx_mesg[iface].c = my_rri ; - /* Clear struct */ - hbs_sock.tx_mesg[iface].s = s ; - hbs_sock.tx_mesg[iface].f = flags ; + /* Manage OOB flags */ + hbs_sock.rx_mesg[iface].f = flags ; if ( pmonPulse_counter ) { - hbs_sock.tx_mesg[iface].f |= ( PMOND_FLAG ) ; + hbs_sock.rx_mesg[iface].f |= ( PMOND_FLAG ) ; } if ( infra_network_provisioned == true ) { - hbs_sock.tx_mesg[iface].f |= INFRA_FLAG ; + hbs_sock.rx_mesg[iface].f |= INFRA_FLAG ; } - n = (int)sizeof(hbs_message_type) ; - - if ( ! hbs_sock.tx_sock[iface] ) +#define WANT_CLUSTER_INFO_LOG +#ifdef WANT_CLUSTER_INFO_LOG + /* Log the received cluster info */ + if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION ) { - elog ("cannot send to null tx_mesg[%s] socket\n", get_iface_name_str(iface) ); - return (FAIL_TO_TRANSMIT); - } - else if ( hbs_sock.tx_sock[iface]->sock_ok() == false ) - { - elog ("cannot send to failed tx_mesg[%s] socket\n", get_iface_name_str(iface) ); - return (FAIL_TO_TRANSMIT); + char str[100] ; + // hbs_cluster_log (hbs_sock.rx_mesg[iface].cluster, hbs_sock.rx_mesg[iface].s ); + snprintf ( &str[0], 100, " seq %6d with %d bytes from %s ", hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface)); + string hostname = my_hostname ; + hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str ); } +#endif #ifdef WANT_PULSE_RESPONSE_FIT if (( iface == INFRA_IFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_INFRA_RSP ))) @@ -848,44 +839,69 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) } #endif - /* Send pulse response message with sequence number, flags and resource referecen index */ - rc = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.tx_mesg[iface], n); 
- if ( rc == -1 ) + int rc = PASS ; + + /* replace the request header with the response header */ + memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG ); + + /* Deal with the cluster info if it exists. + * ... Introduced in messaging version 1 */ + if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION ) { - elog ("Failed to sendto socket %d through %s:%d len:%d (%s) (%d:%s)\n", - hbs_sock.tx_sock[iface]->getFD(), - hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), - hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), - hbs_sock.tx_sock[iface]->get_dst_addr()->getSockLen(), - get_iface_name_str(iface), errno, strerror(errno)); + if ( hbs_sock.rx_mesg[iface].cluster.version < MTCE_HBS_CLUSTER_VERSION ) + { + ilog ("Bad cluster verison (%d)", hbs_sock.rx_mesg[iface].cluster.version); + } + // if ( hbs_sock.rx_mesg[iface].cluster.revision != MTCE_HBS_CLUSTER_REVISION ) + // { + // ilog ("Bad cluster revision (%d)", hbs_sock.rx_mesg[iface].cluster.revision); + // } + + /* Add peer controller cluster data to this controller's response */ + // hbs_cluster_loop(hbs_sock.rx_mesg[iface]); } - else if ( rc != n) + + /* send pulse response message */ + int tx_bytes = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.rx_mesg[iface], rx_bytes); + if ( tx_bytes == -1 ) { - /* Avoid log flooding - elog ("unicast send failed. 
(%d)\n", rc); */ - wlog_throttled ( tx_error_count[iface], 200, - "%s Pulse Rsp: %d:%d bytes < %d:%s > to <%s>\n", - get_iface_name_str(iface), n, rc, - hbs_sock.tx_mesg[iface].s, - &hbs_sock.tx_mesg[iface].m[0], + elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD, + "pulse tx failed %d:%s:%d len:%d (%s) (%d:%s)\n", + hbs_sock.tx_sock[iface]->getFD(), + hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), + hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), + hbs_sock.tx_sock[iface]->get_dst_addr()->getSockLen(), + get_iface_name_str(iface), errno, strerror(errno)); + } + else if ( tx_bytes != rx_bytes) + { + wlog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD, + "%s Pulse Rsp: %d:%d bytes < %d:%s >", + get_iface_name_str(iface), rx_bytes, tx_bytes, + hbs_sock.rx_mesg[iface].s, &hbs_sock.rx_mesg[iface].m[0]); - return (rc); + rc = FAIL_DATA_SIZE ; } else { - mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d)\n", + mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d:%d:%d)\n", get_iface_name_str(iface), hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), - hbs_sock.tx_mesg[iface].s, - hbs_sock.tx_mesg[iface].f, - hbs_sock.tx_mesg[iface].m, - hbs_sock.tx_mesg[iface].c, - pmonPulse_counter); - /* Clear the error count since we got a good transmit */ - tx_error_count[iface] = 0 ; + hbs_sock.rx_mesg[iface].s, + hbs_sock.rx_mesg[iface].f, + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c, + pmonPulse_counter, rx_bytes, tx_bytes); } - return PASS; + + /* Clear the error count since we got a good receive */ + if ( rx_error_count[iface] ) + rx_error_count[iface] = 0 ; + if ( tx_error_count[iface] ) + tx_error_count[iface] = 0 ; + + return rc ; } #ifdef WANT_FIT_TESTING @@ -968,6 +984,9 @@ int daemon_init ( string iface, string nodeType_str ) /* Initialize socket construct and pointer to it */ memset ( &hbs_sock, 0, sizeof(hbs_sock)); + /* init the utility module */ + hbs_utils_init (); + /* Defaults 
*/ hbs_config.stall_pmon_thld = -1 ; hbs_config.stall_mon_period = MTC_HRS_8 ; @@ -1025,12 +1044,6 @@ int daemon_init ( string iface, string nodeType_str ) rc = FAIL_DAEMON_CONFIG ; } - /* Init the heartbeat transmit pulse response message */ - else if ( hbs_message_init () != PASS ) - { - elog ("Failed to initialize pulse response message\n"); - rc = FAIL_MESSAGE_INIT ; - } /* Setup the heartbeat service messaging sockets */ else if ( hbs_socket_init () != PASS ) { @@ -1119,6 +1132,11 @@ void daemon_service_run ( void ) ilog ("Sending Heartbeat Ready Event\n"); hbs_send_event ( MTC_EVENT_MONITOR_READY ); + my_hostname_length = strlen(my_hostname) ; + memset ( &pulse_resp_tx_hdr[0], 0, HBS_MAX_MSG ); + memcpy ( &pulse_resp_tx_hdr[0], &rsp_msg_header[0], HBS_HEADER_SIZE ); + memcpy ( &pulse_resp_tx_hdr[HBS_HEADER_SIZE], my_hostname, my_hostname_length ); + /* Run heartbeat service forever or until stop condition */ for ( ; ; ) { @@ -1153,7 +1171,9 @@ void daemon_service_run ( void ) FD_SET(hbs_sock.pmon_pulse_sock->getFD(),&hbs_sock.readfds); FD_SET(hbs_sock.amon_socket, &hbs_sock.readfds); FD_SET(hbs_sock.netlink_sock, &hbs_sock.readfds); - +#ifdef WANT_CLUSTER_DEBUG + FD_SET(hbs_sock.sm_client_sock->getFD(), &hbs_sock.readfds); +#endif rc = select( socks.back()+1, &hbs_sock.readfds, NULL, NULL, &hbs_sock.waitd); @@ -1176,6 +1196,19 @@ void daemon_service_run ( void ) /* Only service sockets for the rc > 0 case */ else if ( rc ) { +#ifdef WANT_CLUSTER_DEBUG + if ( hbs_sock.sm_client_sock && FD_ISSET(hbs_sock.sm_client_sock->getFD(), &hbs_sock.readfds ) ) + { + mtce_hbs_cluster_type msg ; + /* Receive event messages */ + memset ( &msg , 0, sizeof(mtce_hbs_cluster_type)); + int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type)); + if ( bytes ) + { + hbs_cluster_dump (msg); + } + } +#endif if (hbs_sock.rx_sock[MGMNT_IFACE]&&FD_ISSET(hbs_sock.rx_sock[MGMNT_IFACE]->getFD(), &hbs_sock.readfds)) { /* Receive pulse request and send a response 
*/
diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp
new file mode 100644
index 00000000..3789541f
--- /dev/null
+++ b/mtce/src/heartbeat/hbsCluster.cpp
@@ -0,0 +1,748 @@
+/*
+ * Copyright (c) 2018 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * @file Maintenance Heartbeat Agent Cluster Manager Module
+ *
+ *************************************************************************
+ *
+ * This module provides the heartbeat cluster implementation member
+ * functions that the hbsAgent service calls to collect, store and
+ * send heartbeat cluster information to SM upon request.
+ *
+ * See mtceHbsCluster.h for formal API between SM and Mtce.
+ *
+ *************************************************************************/
+
+using namespace std;
+
+#include "nodeBase.h" /* common maintenance constructs and definitions */
+#include "daemon_common.h" /* common daemon constructs and definitions */
+#include "hbsBase.h" /* mtce heartbeat constructs and definitions */
+
+/* Error log throttle counter. */
+#define THROTTLE_COUNT (500)
+
+/* Private Heartbeat Cluster Control Structure. */
+typedef struct
+{
+    /* Contains the controller number (0 or 1) for this controller. */
+    unsigned short this_controller ;
+
+    /* Preserves which controllers are enabled. */
+    bool controller_0_enabled ;
+    bool controller_1_enabled ;
+#ifdef THREE_CONTROLLER_SYSTEM
+    bool controller_2_enabled ;
+#endif
+
+    /* Used to prevent log flooding in presence of back to back errors. */
+    unsigned int log_throttle ;
+
+    /* Used to threshold storage-0 not responding state */
+    unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];
+
+    /* Contains the number of monitored networks in the system.
+     * Management only = 1
+     * Management and Infrastructure = 2 */
+    unsigned short monitored_networks ;
+
+    /* This contains the current number of heartbeat enabled hosts.
+     *
+     * Used to improve performance.
+     *
+     * Performance: This value is included in each history entry so
+     * rather than do the size calculation of monitored_hostname_list
+     * each time, this variable is updated from monitored_hostname_list
+     * after each add/del operation. */
+    unsigned short monitored_hosts ;
+
+    /* List of host names being monitored. */
+    std::list<string> monitored_hostname_list ;
+
+    /* The working heartbeat cluster data vault. */
+    mtce_hbs_cluster_type cluster ;
+
+} hbs_cluster_ctrl_type ;
+
+/* Cluster control structure construct allocation. */
+static hbs_cluster_ctrl_type ctrl ;
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_init
+ *
+ * Description : Initialize the cluster structure to default values.
+ *
+ *               Clears the monitored host list and count, re-stamps the
+ *               vault header, zeroes the per-network storage-0 miss
+ *               counters and re-initializes every history element.
+ *               The controller number, monitored network count and the
+ *               controller enabled flags are not touched.
+ *
+ * Assumptions : Called by hbsAgent.cpp before entering the main loop.
+ *
+ * Parameters  : period - stored in the vault as period_msec.
+ *
+ ***************************************************************************/
+
+void hbs_cluster_init ( unsigned short period )
+{
+    ctrl.monitored_hosts = 0;
+    ctrl.monitored_hostname_list.clear();
+
+    /* Init the cluster - header. */
+    ctrl.cluster.version = MTCE_HBS_CLUSTER_VERSION ;
+    ctrl.cluster.revision = MTCE_HBS_CLUSTER_REVISION ;
+    ctrl.cluster.magic_number = MTCE_HBS_MAGIC_NUMBER ;
+
+    /* Init the cluster - global / dynamic data. */
+    ctrl.cluster.reqid = 0 ;
+    ctrl.cluster.period_msec = period ;
+    ctrl.cluster.storage0_enabled = false ;
+    ctrl.cluster.histories = 0 ;
+    ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
+
+    /* The storage-0 thresholding counter for each network. */
+    for ( int n = 0 ; n < MTCE_HBS_NETWORKS ; n++ )
+        ctrl.storage_0_not_responding_count[n] = 0 ;
+
+    for ( int h = 0 ; h < MTCE_HBS_MAX_HISTORY_ELEMENTS ; h++ )
+        hbs_cluster_history_init ( ctrl.cluster.history[h] );
+
+    /* sizeof yields size_t ; %zu is the matching conversion. */
+    ilog ("Cluster Info: v%d.%d sig:%x bytes:%d (%zu)",
+           ctrl.cluster.version,
+           ctrl.cluster.revision,
+           ctrl.cluster.magic_number,
+           ctrl.cluster.bytes,
+           sizeof(mtce_hbs_cluster_history_type));
+
+    ctrl.log_throttle = 0 ;
+}
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_nums
+ *
+ * Description : Set this controller number and the number of monitored
+ *               networks in this system.
+ *
+ *               These values do not change without a process restart.
+ *
+ * Assumptions : Called by hbsAgent.cpp before entering the main loop.
+ *
+ * Returns     : None
+ *
+ ***************************************************************************/
+
+void hbs_cluster_nums ( unsigned short this_controller,
+                        unsigned short monitored_networks )
+{
+    ctrl.this_controller = this_controller ;
+    ctrl.monitored_networks = monitored_networks ;
+}
+
+
+/****************************************************************************
+ *
+ * Name       : log_monitored_hosts_list
+ *
+ * Description : Log the list of monitored hosts.
+ *               Typically done on a list change.
+ *
+ * Returns     : None
+ *
+ ***************************************************************************/
+
+void log_monitored_hosts_list ( void )
+{
+    std::list<string>::iterator iter_ptr ;
+    string list = "" ;
+
+    /* Build a single space separated line of all monitored hostnames. */
+    for ( iter_ptr  = ctrl.monitored_hostname_list.begin() ;
+          iter_ptr != ctrl.monitored_hostname_list.end() ;
+          ++iter_ptr )
+    {
+        list.append (*(iter_ptr));
+        list.append (" ");
+    }
+
+    /* size() yields size_t ; %zu is the matching conversion. */
+    ilog ("cluster of %zu: %s",
+           ctrl.monitored_hostname_list.size(),
+           list.c_str());
+}
+
+
+/****************************************************************************
+ *
+ * Name       : cluster_storage0_state
+ *
+ * Description : Record the heartbeat monitoring state of storage-0.
+ *
+ * Parameters : true if storage-0 heartbeating is in the 'started' state.
+ *              false if storage-0 heartbeating is in the 'stopped' state.
+ *
+ * Returns    : None
+ *
+ ***************************************************************************/
+
+void cluster_storage0_state ( bool enabled )
+{
+    /* Only store and log on an actual state change. */
+    if ( ctrl.cluster.storage0_enabled != enabled )
+    {
+        ctrl.cluster.storage0_enabled = enabled ;
+        ilog ("storage-0 heartbeat state changed to %s",
+               enabled ? "enabled" : "disabled" );
+    }
+}
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_manage_controller_state
+ *
+ * Description : Track the monitored enabled state of the controllers.
+ *
+ * Parameters  : hostname - host whose state changed ; hostnames other
+ *                          than the controller names are ignored.
+ *               enabled  - new monitored state for that controller.
+ *
+ ***************************************************************************/
+
+void hbs_manage_controller_state ( string & hostname, bool enabled )
+{
+    /* track controller state */
+    if ( hostname == CONTROLLER_0 )
+    {
+        ctrl.controller_0_enabled = enabled ;
+    }
+    else if ( hostname == CONTROLLER_1 )
+    {
+        ctrl.controller_1_enabled = enabled ;
+    }
+#ifdef THREE_CONTROLLER_SYSTEM
+    else if ( hostname == CONTROLLER_2 )
+    {
+        ctrl.controller_2_enabled = enabled ;
+    }
+#endif
+}
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_add
+ *
+ * Description : Add the specified hostname to the enabled hosts list.
+ *
+ * Updates     : hostname is added to monitored_hostname_list
+ *               and monitored_hosts is refreshed from the list size.
+ *
+ *               If added host is storage-0 then update its enabled status.
+ *               If added host is a controller then update controller state.
+ *
+ * Parameters  : hostname string
+ *
+ ***************************************************************************/
+
+void hbs_cluster_add ( string & hostname )
+{
+    /* Remove before add so that the list never holds duplicates.
+     * Consider using 'unique' after instead of remove before update. */
+    ctrl.monitored_hostname_list.remove(hostname) ;
+    ctrl.monitored_hostname_list.push_back(hostname) ;
+    ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
+
+    /* Manage storage-0 state */
+    if ( hostname == STORAGE_0 )
+    {
+        cluster_storage0_state ( true );
+    }
+
+    /* Note: there is no 'zero monitored hosts' re-init here ; the list
+     *       always holds at least the host that was just pushed. */
+
+    /* Manage controller state ; true means enabled in this case. */
+    hbs_manage_controller_state ( hostname, true );
+
+    ilog ("%s added to cluster", hostname.c_str());
+
+    log_monitored_hosts_list ();
+}
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_del
+ *
+ * Description : Delete the specified hostname from the enabled hosts list.
+ *
+ * Updates     : hostname is removed from monitored_hostname_list
+ *               and monitored_hosts is refreshed from the list size.
+ *
+ *               If deleted host is storage-0 then update its enabled status.
+ *               If deleted host is a controller then update controller state.
+ *               If no hosts remain then the cluster is re-initialized.
+ *
+ * Parameters  : hostname string
+ *
+ ***************************************************************************/
+
+void hbs_cluster_del ( string & hostname )
+{
+    ctrl.monitored_hostname_list.remove(hostname) ;
+    ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
+
+    /* Manage storage-0 state. */
+    if ( hostname == STORAGE_0 )
+    {
+        cluster_storage0_state ( false );
+    }
+
+    /* If we get down to 0 monitored hosts then just start fresh */
+    if (( ctrl.monitored_hosts ) == 0 )
+    {
+        hbs_cluster_init ( ctrl.cluster.period_msec );
+    }
+
+    /* Manage controller state ; false means not enabled in this case. */
+    hbs_manage_controller_state ( hostname , false );
+
+    ilog ("%s deleted from cluster", hostname.c_str());
+
+    log_monitored_hosts_list ();
+}
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_update
+ *
+ * Description : Update this controller's cluster info for the specified
+ *               network with
+ *
+ *               1. The number of enabled hosts.
+ *               2. The number of responding hosts.
+ *               3. The oldest history index in the rotational history fifo.
+ *               4. Maintain a back to back non-responding count for storage-0.
+ *                  Once the count reaches the minimum threshold of
+ *                  STORAGE_0_NR_THRESHOLD then the specific network history
+ *                  is updated to indicate storage-0 is not responding. Once
+ *                  storage-0 starts responding again with a single response
+ *                  then that network history is updated to indicate storage-0
+ *                  is responding.
+ *
+ * Assumptions : Converts heartbeat interface number to cluster network number.
+ *               Does nothing when there are no monitored hosts or when the
+ *               interface does not map to a cluster network.
+ *
+ * Parameters  : iface                - heartbeat interface ( iface_enum )
+ *               not_responding_hosts - number of not responding hosts for
+ *                                      this interval
+ *               storage_0_responding - true if storage-0 responded in this
+ *                                      interval
+ *
+ * Updates     : This and last history as well as storage-0 not responding
+ *               count.
+ *
+ ***************************************************************************/
+
+#define STORAGE_0_NR_THRESHOLD (4)
+
+void hbs_cluster_update ( iface_enum iface,
+                          unsigned short not_responding_hosts,
+                          bool storage_0_responding )
+{
+    if ( ctrl.monitored_hosts == 0 )
+        return ;
+
+    /* convert heartbeat iface enum to cluster network enum. */
+    mtce_hbs_network_enum n ;
+    if ( iface == MGMNT_IFACE )
+        n = MTCE_HBS_NETWORK_MGMT ;
+    else if ( iface == INFRA_IFACE )
+        n = MTCE_HBS_NETWORK_INFRA ;
+#ifdef MONITORED_OAM_NETWORK
+    else if ( iface == OAM_IFACE )
+        n = MTCE_HBS_NETWORK_OAM ;
+#endif
+    else
+        return ;
+
+    if ( not_responding_hosts )
+    {
+        clog1 ("controller-%d %s enabled:%d not responding:%d",
+                ctrl.this_controller,
+                hbs_cluster_network_name(n).c_str(),
+                ctrl.monitored_hosts,
+                not_responding_hosts);
+    }
+    else
+    {
+        clog1 ("controller-%d %s has %d monitored hosts and all are responding",
+                ctrl.this_controller,
+                hbs_cluster_network_name(n).c_str(),
+                ctrl.monitored_hosts);
+    }
+
+    /* Never let the responding count wrap below zero ; the two counts are
+     * unsigned and the caller's miss count is not guaranteed here to be
+     * bounded by the current monitored host count. */
+    unsigned short responding_hosts = 0 ;
+    if ( not_responding_hosts <= ctrl.monitored_hosts )
+        responding_hosts = ctrl.monitored_hosts - not_responding_hosts ;
+
+    /* Look-up active history array for this network combination */
+    mtce_hbs_cluster_history_type * history_ptr = NULL ;
+    GET_CLUSTER_HISTORY_PTR(ctrl.cluster, ctrl.this_controller ,n);
+    if ( history_ptr == NULL )
+    {
+        if ( ctrl.cluster.histories >= MTCE_HBS_MAX_HISTORY_ELEMENTS )
+        {
+            /* Should never happen but if it does then log without flooding */
+            wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
+                             "Unable to store history beyond %d ",
+                             ctrl.cluster.histories );
+            return ;
+        }
+        else
+        {
+            /* Adding a new history slot. */
+            history_ptr = &ctrl.cluster.history[ctrl.cluster.histories] ;
+            ctrl.cluster.histories++ ;
+            ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
+            history_ptr->controller = ctrl.this_controller ;
+            history_ptr->network = n ;
+
+            /* Log new network history as its being started. */
+            ilog ("controller-%d %s network history add",
+                   ctrl.this_controller,
+                   hbs_cluster_network_name(n).c_str());
+        }
+    }
+
+    /* Manage storage-0 status. */
+    if ( ctrl.cluster.storage0_enabled )
+    {
+        /* Handle storage-0 status change from not responding to responding. */
+        if ( storage_0_responding == true )
+        {
+            if (history_ptr->storage0_responding == false)
+            {
+                history_ptr->storage0_responding = true ;
+                ilog ("controller-%d %s heartbeat ; storage-0 is ok",
+                       ctrl.this_controller,
+                       hbs_cluster_network_name(n).c_str());
+            }
+            if (ctrl.storage_0_not_responding_count[n])
+                ctrl.storage_0_not_responding_count[n] = 0 ;
+        }
+        /* Count the storage-0 not responding case for this network. */
+        else
+        {
+            ctrl.storage_0_not_responding_count[n]++ ;
+            if ( ctrl.storage_0_not_responding_count[n] == 2 )
+            {
+                ilog ("controller-%d %s heartbeat ; storage-0 has 2 misses",
+                       ctrl.this_controller,
+                       hbs_cluster_network_name(n).c_str() );
+            }
+        }
+
+        /* Handle storage-0 status change from responding to not responding. */
+        if (( history_ptr->storage0_responding == true ) &&
+            ( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
+        {
+            history_ptr->storage0_responding = false ;
+            ilog ("controller-%d %s heartbeat ; storage-0 is not responding",
+                   ctrl.this_controller,
+                   hbs_cluster_network_name(n).c_str() );
+        }
+    }
+    else
+    {
+        /* Typical path for storage-0 disabled or normal non-storage system case */
+        if ( history_ptr->storage0_responding == true )
+            history_ptr->storage0_responding = false ;
+
+        /* Handle clearing threshold count when storage-0 is not enabled. */
+        if ( ctrl.storage_0_not_responding_count[n] )
+            ctrl.storage_0_not_responding_count[n] = 0 ;
+    }
+
+    /*
+     * Manage the history entry index.
+     *
+     * Get the previous entry index ...
+     * ... which is the one before the oldest index.
+     * ... which is the index for the next entry.
+     */
+    unsigned short last_entry_index ;
+    if ( history_ptr->oldest_entry_index == 0 )
+    {
+        /* Go to the end of the array. */
+        last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ;
+    }
+    else
+    {
+        /* Otherwise, the previous index in the array */
+        last_entry_index = history_ptr->oldest_entry_index - 1 ;
+    }
+
+    /* Update the history with this data. */
+    history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
+    history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = responding_hosts ;
+
+    if (( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled !=
+          history_ptr->entry[                last_entry_index].hosts_enabled ) ||
+        ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding !=
+          history_ptr->entry[                last_entry_index].hosts_responding))
+    {
+        /* Only log on change events. */
+        if ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled ==
+             history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding )
+        {
+            ilog ("controller-%d %s cluster of %d is healthy",
+                   ctrl.this_controller,
+                   hbs_cluster_network_name(n).c_str(),
+                   history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled);
+        }
+        else
+        {
+            ilog ("controller-%d %s cluster of %d with %d responding",
+                   ctrl.this_controller,
+                   hbs_cluster_network_name(n).c_str(),
+                   history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled,
+                   history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding);
+        }
+    }
+
+    /* Increment the entries count till it reaches the max. */
+    if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
+        history_ptr->entries++ ;
+
+    /* Manage the next entry update index ; aka the oldest index. */
+    if ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
+        history_ptr->oldest_entry_index = 0 ;
+    else
+        history_ptr->oldest_entry_index++ ;
+
+    /* clear the log throttle if we are updating history ok. */
+    ctrl.log_throttle = 0 ;
+}
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_append
+ *
+ * Description : Add this controller's cluster info to this pulse
+ *               request message.
+ *
+ * Updates     : The msg.cluster header fields are copied from the local
+ *               vault and the history elements that belong to this
+ *               controller are copied, element by element, to the front
+ *               of msg.cluster.history ; msg.cluster.histories is set to
+ *               the number of elements copied.
+ *
+ ***************************************************************************/
+
+void hbs_cluster_append ( hbs_message_type & msg )
+{
+    unsigned short c = ctrl.this_controller ;
+
+    CHECK_CTRL_NTWK_PARMS(c, ctrl.monitored_networks);
+
+    msg.cluster.version = ctrl.cluster.version ;
+    msg.cluster.revision = ctrl.cluster.revision ;
+    msg.cluster.magic_number = ctrl.cluster.magic_number ;
+    msg.cluster.period_msec = ctrl.cluster.period_msec ;
+    msg.cluster.storage0_enabled = ctrl.cluster.storage0_enabled ;
+    msg.cluster.histories = 0 ;
+
+    /* Copy whole history elements only.
+     *
+     * History slots are allocated per controller/network pair in the order
+     * they are first updated, so the controller number is not a slot index,
+     * and the copy length must be a multiple of the element size rather
+     * than the size of the full vault (which includes the vault header). */
+    for ( int h = 0 ;
+          ( h < ctrl.cluster.histories ) &&
+          ( h < MTCE_HBS_MAX_HISTORY_ELEMENTS ) ;
+          h++ )
+    {
+        if ( ctrl.cluster.history[h].controller == c )
+        {
+            memcpy ( &msg.cluster.history[msg.cluster.histories],
+                     &ctrl.cluster.history[h],
+                     sizeof(mtce_hbs_cluster_history_type));
+            msg.cluster.histories++ ;
+        }
+    }
+
+    int bytes = (int)(sizeof(mtce_hbs_cluster_history_type) * msg.cluster.histories) ;
+
+    clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
+            c, ctrl.monitored_networks, msg.cluster.histories, bytes );
+}
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_unused_bytes
+ *
+ * Description : Used to set how much data to send in the heartbeat pulse
+ *               requests.
+ *
+ * Returns     : The number of bytes that are not used in the full
+ *               history array cluster structure, or 0 if the history
+ *               count is larger than the array.
+ *
+ ***************************************************************************/
+
+unsigned short hbs_cluster_unused_bytes ( void )
+{
+    if ( ctrl.cluster.histories <= MTCE_HBS_MAX_HISTORY_ELEMENTS )
+    {
+        /* number of history array elements that hold no data */
+        unsigned short unused_elements = MTCE_HBS_MAX_HISTORY_ELEMENTS - ctrl.cluster.histories ;
+        return((unsigned short)(sizeof(mtce_hbs_cluster_history_type)*unused_elements)) ;
+    }
+    return 0;
+}
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_send
+ *
+ * Description: Send the cluster vault to SM.
+ *
+ * Parameters : sm_client_sock - socket used to send to SM ; nothing is
+ *                               sent if it is NULL or not ok.
+ *              reqid          - request id stored in the vault before send.
+ *
+ * Returns    : Nothing
+ *
+ ***************************************************************************/
+
+/* NOTE: All code wrapped in this directive will be removed once
+ *       active/active heartbeating is delivered in next update */
+#define WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
+
+void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
+{
+
+#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
+
+    /* To assist SM with duplex integration ...
+     *
+     * This code emulates heartbeat redundancy by duplicating
+     * controller history up to the number of provisioned
+     * controllers until active-active heartbeat is delivered.
+     */
+    int peer_controller ;
+    bool copy_cluster = false ;
+    if ( ctrl.this_controller == 0 )
+    {
+        peer_controller = 1 ;
+        if ( ctrl.controller_1_enabled )
+        {
+            copy_cluster = true ;
+        }
+    }
+    else
+    {
+        peer_controller = 0 ;
+        if ( ctrl.controller_0_enabled )
+        {
+            copy_cluster = true ;
+        }
+    }
+
+    /* Number of emulated peer histories appended below ; the same
+     * number is removed again after the send. */
+    int added = 0 ;
+    if ( copy_cluster )
+    {
+        int networks = ctrl.cluster.histories ;
+        for ( int n = 0 ; n < networks ; n++ )
+        {
+            /* never write past the end of the history array */
+            if ( ctrl.cluster.histories >= MTCE_HBS_MAX_HISTORY_ELEMENTS )
+                break ;
+
+            /* copy this controller history to create peer controller */
+            ctrl.cluster.history[ctrl.cluster.histories] = ctrl.cluster.history[n] ;
+
+            /* update the controller */
+            ctrl.cluster.history[ctrl.cluster.histories].controller = peer_controller ;
+            ctrl.cluster.bytes += sizeof(mtce_hbs_cluster_history_type) ;
+            ctrl.cluster.histories++ ;
+            added++ ;
+        }
+    }
+
+#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
+
+    ctrl.cluster.reqid = (unsigned short)reqid ;
+    if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
+    {
+        /* Only send the used part of the vault. */
+        int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
+        int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
+        if ( bytes <= 0 )
+        {
+            elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
+                   bytes , errno, strerror(errno));
+        }
+        else
+        {
+            ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
+            hbs_cluster_dump ( ctrl.cluster );
+        }
+    }
+
+#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
+
+    /* Clear out the emulated peer controller data, newest slot first. */
+    for ( ; added > 0 ; added-- )
+    {
+        hbs_cluster_history_init(ctrl.cluster.history[ctrl.cluster.histories-1]);
+        ctrl.cluster.bytes -= sizeof(mtce_hbs_cluster_history_type);
+        ctrl.cluster.histories-- ;
+    }
+
+#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
+
+}
+
+/* Log the local cluster vault on behalf of 'hostname' ; see the
+ * three argument hbs_cluster_log. */
+void hbs_cluster_log ( string & hostname, string prefix )
+{
+    hbs_cluster_log ( hostname, ctrl.cluster, prefix );
+}
+
+/****************************************************************************
+ *
+ * Active Active Heartbeating and Debug Member Functions
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+ *
+ * Name        : hbs_cluster_cmp
+ *
+ * Description : Performs a sanity check over the cluster structure.
+ *
+ *               Compares the version, revision, magic number, period and
+ *               storage-0 enabled state in msg against the local vault and
+ *               logs the first difference found. A msg version that is
+ *               newer than the local version is not treated as a
+ *               difference.
+ *
+ * Assumptions : Debug tool, not called at runtime.
+ *
+ * Returns     : PASS or FAIL
+ *
+ ***************************************************************************/
+
+int hbs_cluster_cmp( hbs_message_type & msg )
+{
+    if ( msg.cluster.version < ctrl.cluster.version )
+    {
+        wlog ("Unexpected version (%d:%d)",
+               msg.cluster.version, ctrl.cluster.version );
+    }
+    else if ( msg.cluster.revision != ctrl.cluster.revision )
+    {
+        wlog ("Unexpected revision (%d:%d)",
+               msg.cluster.revision, ctrl.cluster.revision );
+    }
+    else if ( msg.cluster.magic_number != ctrl.cluster.magic_number )
+    {
+        wlog ("Unexpected magic number (%d:%d)",
+               msg.cluster.magic_number, ctrl.cluster.magic_number );
+    }
+    else if ( msg.cluster.period_msec != ctrl.cluster.period_msec )
+    {
+        wlog ("Cluster Heartbeat period delta (%d:%d)",
+               msg.cluster.period_msec, ctrl.cluster.period_msec );
+    }
+    else if ( msg.cluster.storage0_enabled != ctrl.cluster.storage0_enabled )
+    {
+        wlog ("Cluster storage0 enabled state delta (%d:%d)",
+               msg.cluster.storage0_enabled, ctrl.cluster.storage0_enabled );
+    }
+    else
+    {
+        return (PASS);
+    }
+    return (FAIL);
+}
+
+/****************************************************************************
+ *
+ * Name        : hbs_cluster_save
+ *
+ * Description : Copies the other controllers information from msg into
+ *               the cluster.
+ *
+ *               NOTE: Does not do that right now ; msg is unused and the
+ *                     local vault is logged when there are monitored hosts.
+ *
+ * Assumptions : Place holder until active/active heartbeating is implemented.
+ *
+ * Returns     : PASS ; always
+ *
+ ***************************************************************************/
+
+int hbs_cluster_save ( string & hostname,
+                       mtce_hbs_network_enum network,
+                       hbs_message_type & msg )
+{
+    // clog ("Add cluster info from peer controller");
+    if ( ctrl.monitored_hosts )
+    {
+        /* compare cluster info and log deltas */
+        // hbs_cluster_cmp( msg );
+        UNUSED(msg);
+        hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
+    }
+    return (PASS);
+}
diff --git a/mtce/src/heartbeat/hbsCluster.h b/mtce/src/heartbeat/hbsCluster.h
new file mode 100644
index 00000000..bb4ffe14
--- /dev/null
+++ b/mtce/src/heartbeat/hbsCluster.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * @file StarlingX Maintenance Heartbeat Cluster Manager Module
+ *
+ *************************************************************************
+ *
+ * This module provides API for the hbsAgent service to call to
+ * collect, store and send heartbeat cluster information to SM
+ * upon request. See mtceHbsCluster.h for formal API.
+ *
+ *************************************************************************/
+
+#ifndef __HBSCLUSTER_H__
+#define __HBSCLUSTER_H__
+
+using namespace std;
+
+#include "mtceHbsCluster.h" /* for ... the public API */
+
+/****************************************************************************
+ *
+ * Name       : BYTES_IN_CLUSTER_VAULT
+ *
+ * Description : Calculates the number of bytes in the cluster vault based on
+ *               the number of valid history array elements included.
+ *
+ * Parameters : e - number of history array elements in use.
+ *
+ ***************************************************************************/
+
+#define BYTES_IN_CLUSTER_VAULT(e) \
+    (sizeof(mtce_hbs_cluster_type)-(sizeof(mtce_hbs_cluster_history_type)*(MTCE_HBS_MAX_HISTORY_ELEMENTS-(e))))
+
+/****************************************************************************
+ *
+ * Name       : CHECK_CTRL_NTWK_PARMS
+ *
+ * Description : Logs and returns from the calling ( void ) function if the
+ *               controller number or the network count is out of range.
+ *
+ * Parameters : c - zero based controller number ; valid values are
+ *                  0 .. MTCE_HBS_MAX_CONTROLLERS-1
+ *              n - number of networks ; at most MTCE_HBS_NETWORKS
+ *
+ ***************************************************************************/
+
+#define CHECK_CTRL_NTWK_PARMS(c,n)                    \
+    do                                                \
+    {                                                 \
+        if (( (c) >= MTCE_HBS_MAX_CONTROLLERS ) ||    \
+            ( (n) >  MTCE_HBS_NETWORKS ))             \
+        {                                             \
+            slog ("Invalid parameter: %d:%d", c, n);  \
+            return ;                                  \
+        }                                             \
+    } while (0)
+
+/****************************************************************************
+ *
+ * Name       : GET_CLUSTER_HISTORY_PTR
+ *
+ * Description : Points the caller's 'history_ptr' variable at the first
+ *               history element of 'cluster' that matches the controller
+ *               and network. 'history_ptr' is left unchanged when there
+ *               is no match.
+ *
+ * Parameters : cluster - the cluster vault to search
+ *              c       - controller number to match
+ *              n       - network to match
+ *
+ ***************************************************************************/
+
+#define GET_CLUSTER_HISTORY_PTR(cluster, c,n) \
+    for ( int h = 0 ; h < (cluster).histories ; h++ ) \
+    { \
+        if (( (cluster).history[h].controller == (c) ) && \
+            ( (cluster).history[h].network == (n) )) \
+        { \
+            history_ptr = &(cluster).history[h] ; \
+            break ; \
+        } \
+    }
+
+
+/* Sets the caller's 'controller' string variable to the hostname for
+ * controller number 'c', or to "unknown" for any other value. */
+#define SET_CONTROLLER_HOSTNAME(c) \
+    do \
+    { \
+        if ( (c) == 0 ) \
+            controller = CONTROLLER_0 ; \
+        else if ( (c) == 1 ) \
+            controller = CONTROLLER_1 ; \
+        else if ( (c) == 2 ) \
+            controller = CONTROLLER_2 ; \
+        else \
+            controller = "unknown" ; \
+    } while (0)
+
+#endif // __HBSCLUSTER_H__
diff --git a/mtce/src/heartbeat/hbsUtil.cpp b/mtce/src/heartbeat/hbsUtil.cpp
new file mode 100644
index 00000000..54edb376
--- /dev/null
+++ b/mtce/src/heartbeat/hbsUtil.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2018 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * @file Maintenance Heartbeat Utilities Module
+ *
+ *************************************************************************
+ *
+ * This module provides heartbeat utilities that are common to both
+ * hbsAgent and hbsClient.
+ *
+ *************************************************************************/
+
+using namespace std;
+
+#include "daemon_common.h" /* common daemon constructs and definitions */
+#include "hbsBase.h" /* mtce heartbeat constructs and definitions */
+
+/* hbs_cluster_log utility support. log control array.
+ * Both arrays are indexed by history element number. */
+bool first_log[MTCE_HBS_MAX_HISTORY_ELEMENTS]; /* has first history log out */
+bool was_diff [MTCE_HBS_MAX_HISTORY_ELEMENTS]; /* was there a history diff */
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_utils_init
+ *
+ * Description : Module Init function
+ *
+ *               Clears the hbs_cluster_log control arrays
+ *               ( first_log and was_diff ).
+ *
+ ***************************************************************************/
+
+void hbs_utils_init ( void )
+{
+    MEMSET_ZERO ( first_log );
+    MEMSET_ZERO ( was_diff );
+}
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_history_init
+ *
+ * Description : Initialize a cluster history element.
+ *
+ *               The element is zeroed and its entries_max is then set
+ *               to MTCE_HBS_HISTORY_ENTRIES.
+ *
+ * Parameters : Reference to a mtce_hbs_cluster_history_type (history element)
+ *
+ * Returns    : Nothing
+ *
+ ***************************************************************************/
+
+void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history )
+{
+    MEMSET_ZERO(history);
+    history.entries_max = MTCE_HBS_HISTORY_ENTRIES ;
+}
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_history_clear
+ *
+ * Description : Clear all history in the cluster vault.
+ *
+ * Parameters : mtce_hbs_cluster_type instance : the vault.
+ *
+ * Returns    : Nothing
+ *
+ ***************************************************************************/
+
+void hbs_cluster_history_clear ( mtce_hbs_cluster_type & cluster )
+{
+    /* Only the elements that are in use are re-initialized. */
+    for ( int h = 0 ; h < cluster.histories ; h++ )
+        hbs_cluster_history_init ( cluster.history[h] ) ;
+}
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_network_name
+ *
+ * Description : converts what is a heartbeat cluster network id to
+ *               network name.
+ *
+ * Parameters : network id
+ *
+ * Returns    : network name as a string ; "unknown" for an invalid id.
+ *
+ ***************************************************************************/
+
+string hbs_cluster_network_name ( mtce_hbs_network_enum network )
+{
+    switch ( network )
+    {
+        case MTCE_HBS_NETWORK_MGMT:
+            return ("Mgmnt");
+        case MTCE_HBS_NETWORK_INFRA:
+            return ("Infra");
+
+#ifdef MONITORED_OAM_NETWORK
+        case MTCE_HBS_NETWORK_OAM:
+            return ("Oam");
+#endif
+
+        default:
+            slog ("invalid network enum (%d)", network );
+            return ("unknown");
+    }
+}
+
+
+/****************************************************************************
+ *
+ * Name       : hbs_cluster_log
+ *
+ * Description : logs changes to the heartbeat cluster
+ *
+ *               Only history elements whose entry fifo is full are
+ *               considered. Entries are shown oldest to newest as
+ *               'enabled:responding' with '. ' standing in for an entry
+ *               that equals the one before it.
+ *
+ * Parameters : hostname   - host this log is produced for
+ *              cluster    - the heartbeat cluster structure
+ *              log_prefix - text included in each log line
+ *
+ * Returns    : Nothing
+ *
+ ***************************************************************************/
+
+void hbs_cluster_log ( string & hostname,
+                       mtce_hbs_cluster_type & cluster,
+                       string log_prefix )
+{
+    // bool want_log = false ;
+
+    clog1 ("log %d histories", cluster.histories );
+    for ( int h = 0 ; h < cluster.histories ; h++ )
+    {
+        if ( cluster.history[h].entries == MTCE_HBS_HISTORY_ENTRIES )
+        {
+#define MAX_CLUSTER_LINE_LEN 100
+#define MAX_ENTRY_STR_LEN 10 /* "9999:9999 " */
+            mtce_hbs_cluster_entry_type e = { 0, 0 } ; /* previous entry   */
+            char str[MAX_CLUSTER_LINE_LEN] ;
+            string line = "";
+            int start = 0 ;
+            int stop = 0 ;
+            bool newline = false ;
+            bool logit = false ;
+            bool first = false ;   /* true once a full line was handled    */
+            string controller = "" ;
+
+            mtce_hbs_cluster_history_type * history_ptr = &cluster.history[h] ;
+
+            clog1 ("%s %s has %d entries (controller-%d view from %s)", hostname.c_str(),
+                    hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
+                    history_ptr->entries,
+                    history_ptr->controller,
+                    log_prefix.c_str());
+
+
+            /* Manage local this_index for log display.
+             * Display oldest to newest ; left to right
+             *
+             * */
+            int this_index = history_ptr->oldest_entry_index ;
+            for ( int count = 0 ; count < history_ptr->entries ; count++ )
+            {
+                if (( line.length() + MAX_ENTRY_STR_LEN ) >=
+                      MAX_CLUSTER_LINE_LEN )
+                {
+                    newline = true ;
+                }
+
+#ifdef WANT_MINIMAL_LOGS
+                /* TODO: enable in final update */
+                if (( first_log[h] == true ) && ( newline == false ) &&
+                    ( history_ptr->entry[this_index].hosts_enabled ==
+                      history_ptr->entry[this_index].hosts_responding ))
+                {
+                    line.append(". ");
+                    continue ;
+                }
+#endif
+
+                // want_log = true ;
+
+                if ( count == 0 )
+                {
+                    /* Bound by the real buffer size ; a bound of
+                     * MAX_ENTRY_STR_LEN cuts the trailing separator off
+                     * of a "9999:9999 " entry. */
+                    snprintf (&str[0], sizeof(str) , "%d:%d ", // -%d",
+                    history_ptr->entry[this_index].hosts_enabled,
+                    history_ptr->entry[this_index].hosts_responding ); // , this_index );
+                    line.append (str);
+                    str[0] = '\0' ;
+                }
+//#ifdef WANT_DOTS
+                else if (( history_ptr->entry[this_index].hosts_enabled ==
+                           e.hosts_enabled ) &&
+                         ( history_ptr->entry[this_index].hosts_responding ==
+                           e.hosts_responding ))
+                {
+                    line.append(". ");
+                }
+//#endif
+                else
+                {
+                    snprintf (&str[0], sizeof(str) , "%d:%d ", // -%d",
+                    history_ptr->entry[this_index].hosts_enabled,
+                    history_ptr->entry[this_index].hosts_responding ); // , this_index );
+                    line.append (str);
+                    str[0] = '\0' ;
+                    logit = true ;
+                    was_diff[h] = true ;
+                }
+                if (( logit == false ) && ( first_log[h] == false ))
+                {
+                    first_log[h] = true ;
+                    logit = true ;
+                }
+                stop++ ;
+                if ( newline == true )
+                {
+                    if ( logit )
+                    {
+                        SET_CONTROLLER_HOSTNAME(history_ptr->controller);
+                        if ( hostname == controller )
+                        {
+                            clog ("%s view %s %s %02d..%02d: %s,",
+                                   hostname.c_str(),
+                                   log_prefix.c_str(),
+                                   hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
+                                   start, stop, line.c_str());
+                        }
+                        else
+                        {
+                            clog ("%s view from %s %s %s %02d..%02d: %s,",
+                                   controller.c_str(),
+                                   hostname.c_str(),
+                                   log_prefix.c_str(),
+                                   hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
+                                   start, stop, line.c_str());
+                        }
+                    }
+                    start = stop + 1 ;
+                    line.clear();
+                    first = true ;
+                    newline = false ;
+                }
+                e = history_ptr->entry[this_index] ;
+
+                /* manage index tracking */
+                if ( this_index == (MTCE_HBS_HISTORY_ENTRIES-1))
+                    this_index = 0 ;
+                else
+                    this_index++ ;
+            }
+            if (( newline == false ) && ( line.length() ))
+            {
+                /* Log once more after a difference was seen on a previous
+                 * call, then clear the difference latch. */
+                if (( logit == false ) && ( was_diff[h] == true ))
+                {
+                    logit = true ;
+                    was_diff[h] = false ;
+                }
+
+                if ( logit )
+                {
+                    if ( first )
+                    {
+                        clog ("............ %s %s %02d..%02d: %s",
+                               log_prefix.c_str(),
+                               hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
+                               start, stop, line.c_str());
+                    }
+                    else
+                    {
+                        SET_CONTROLLER_HOSTNAME(history_ptr->controller);
+                        if ( hostname == controller )
+                        {
+                            clog ("%s view %s %s %02d..%02d: %s",
+                                   hostname.c_str(),
+                                   log_prefix.c_str(),
+                                   hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
+                                   start, stop, line.c_str());
+                        }
+                        else
+                        {
+                            clog ("%s view from %s %s %s %02d..%02d: %s",
+                                   controller.c_str(),
+                                   hostname.c_str(),
+                                   log_prefix.c_str(), /* Infra <- */
+                                   hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
+                                   start, stop, line.c_str());
+                        }
+                    }
+                }
+                else
+                {
+                    was_diff[h] = false ;
+                }
+            }
+        }
+    }
+}
+
+/****************************************************************************
+ *
+ * name       : hbs_cluster_dump
+ *
+ * Description: Formatted dump of the vault contents to the log file.
+ *
+ *              The vault may have been filled from a received message so
+ *              the history and entry counts it carries are bounded before
+ *              they are used as array indexes.
+ *
+ ***************************************************************************/
+void hbs_cluster_dump ( mtce_hbs_cluster_type & vault )
+{
+    syslog ( LOG_INFO, "Cluster Vault Dump: --------------------------------------------------------------------------------------------");
+    syslog ( LOG_INFO, "Cluster Vault: v%d.%d %d msec period ; SM Reqid is %d with storage-0 %s and %d histories in %d bytes",
+             vault.version,
+             vault.revision,
+             vault.period_msec,
+             vault.reqid,
+             vault.storage0_enabled ? "enabled" : "disabled",
+             vault.histories,
+             vault.bytes );
+    for ( int h = 0 ;
+          ( h < vault.histories ) && ( h < MTCE_HBS_MAX_HISTORY_ELEMENTS ) ;
+          h++ )
+    {
+        #define MAX_LINE_LEN (500)
+        char str[MAX_LINE_LEN] ;
+        int i = 0 ;
+
+        /* start with an empty string so that a history with no entries
+         * does not log uninitialized stack data. */
+        str[0] = '\0' ;
+
+        /* NOTE(review): the entry array is assumed to hold
+         * MTCE_HBS_HISTORY_ENTRIES elements, which is the entries_max
+         * value that hbs_cluster_history_init sets ; confirm against
+         * mtceHbsCluster.h. */
+        for ( int e = 0 ;
+              ( e < vault.history[h].entries_max ) &&
+              ( e < MTCE_HBS_HISTORY_ENTRIES ) ;
+              e++ )
+        {
+            /* space left in str from offset i, terminator included */
+            int remaining = MAX_LINE_LEN - i ;
+            if ( remaining <= 1 )
+                break ;
+
+            snprintf ( &str[i], remaining, "%c[%d:%d]" ,
+                       vault.history[h].oldest_entry_index==e ? '>' : ' ',
+                       vault.history[h].entry[e].hosts_enabled,
+                       vault.history[h].entry[e].hosts_responding);
+            i = strlen(str) ;
+        }
+        syslog ( LOG_INFO, "Cluster Vault: C%d %s S:%s:%s (%d:%d) %s",
+                 vault.history[h].controller,
+                 hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
+                 vault.storage0_enabled ? "y" : "n",
+                 vault.history[h].storage0_responding ? "y" : "n",
+                 vault.history[h].entries_max,
+                 vault.history[h].entries,
+                 str);
+    }
+    // dump_memory ( &vault, 16, vault.bytes );
+}
+
+
diff --git a/mtce/src/heartbeat/mtceHbsCluster.h b/mtce/src/heartbeat/mtceHbsCluster.h
new file mode 100644
index 00000000..bd9f31db
--- /dev/null
+++ b/mtce/src/heartbeat/mtceHbsCluster.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 Wind River Systems, Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * @file StarlingX Maintenance Heartbeat Cluster Manager Module
+ *
+ *************************************************************************
+ *
+ * This module provides API for the hbsAgent service to call to
+ * collect, store and send heartbeat cluster information to SM
+ * upon request. See hbsCluster.h for formal API.
+ * + *************************************************************************/ + +#ifndef __MTCEHBSCLUSTER_H__ +#define __MTCEHBSCLUSTER_H__ + +#include + +/************************************************************** + * Implementation Structure + *************************************************************/ + +#define MTCE_HBS_CLUSTER_VERSION (1) +#define MTCE_HBS_CLUSTER_REVISION (0) +#define MTCE_HBS_MAGIC_NUMBER (0x5aa5) + +typedef enum +{ + MTCE_HBS_NETWORK_MGMT = 0, + MTCE_HBS_NETWORK_INFRA = 1, +#ifdef MONITORED_OAM_NETWORK + MTCE_HBS_NETWORK_OAM, +#endif + MTCE_HBS_NETWORKS +} mtce_hbs_network_enum ; + +#ifdef THREE_CONTROLLER_SYSTEM + #define MTCE_HBS_MAX_CONTROLLERS (3) +#else + #define MTCE_HBS_MAX_CONTROLLERS (2) +#endif + +#ifdef MONITORED_OAM_NETWORK + #define MTCE_HBS_MAX_NETWORKS (3) +#else + #define MTCE_HBS_MAX_NETWORKS (2) +#endif + +// value of 20 at 100 msec period is 2 seconds of history +#define MTCE_HBS_HISTORY_ENTRIES (20) + +/* maximum number of history elements permitted in a cluster history summary */ +#define MTCE_HBS_MAX_HISTORY_ELEMENTS ((MTCE_HBS_MAX_CONTROLLERS)*(MTCE_HBS_NETWORKS)) + +#ifndef ALIGN_PACK +#define ALIGN_PACK(x) __attribute__((packed)) x +#endif + +/* A single element of Heartbeat Cluster History for one heartbeat period */ +typedef struct +{ + unsigned short hosts_enabled ; /* # of hosts being hb monitored */ + unsigned short hosts_responding ; /* # of hosts that responded to hb */ +} ALIGN_PACK(mtce_hbs_cluster_entry_type); + + +/* Heartbeat Cluster History for one monitored network of one Controller */ +typedef struct +{ + unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */ + unsigned short network :4 ; /* see mtce_hbs_network_enum */ + unsigned short reserved_bits :7 ; /* future - initialized to 0 */ + unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */ + unsigned short entries ; /* # of valid values in .entry */ + unsigned short entries_max ; /* max size of the entry array 
*/ + unsigned short oldest_entry_index ; /* the oldest entry in the array */ + + /* historical array of entries for a specific network */ + mtce_hbs_cluster_entry_type entry [MTCE_HBS_HISTORY_ENTRIES] ; + +} ALIGN_PACK(mtce_hbs_cluster_history_type) ; + +/* Heartbeat Cluster History for all monitored networks of all Controllers */ +typedef struct +{ + /* Header - Static Data - 4 bytes */ + unsigned char version ; /* public API MTCE_HBS_CLUSTER_VERSION */ + unsigned char revision ; /* public API MTCE_HBS_CLUSTER_REVISION */ + unsigned short magic_number ; /* public API MTCE_HBS_MAGIC_NUMBER */ + + /* Control - Dynamic Data - 8 bytes */ + unsigned short reqid ; /* added from SM cluster request */ + unsigned short period_msec ; /* heartbeat period in milliseconds */ + unsigned short bytes ; /* total struct size self check */ + unsigned char storage0_enabled; /* bool containing true or false */ + unsigned char histories ; /* How many history elements follow */ + + /* Array of Cluster History + * + * - histories above specifies how many + * elements of this array are populated. 
+ */ + mtce_hbs_cluster_history_type history [MTCE_HBS_MAX_HISTORY_ELEMENTS] ; + +} ALIGN_PACK(mtce_hbs_cluster_type) ; + +#endif // __MTCEHBSCLUSTER_H__ diff --git a/mtce/src/maintenance/Makefile b/mtce/src/maintenance/Makefile index f49ec335..83d038f5 100755 --- a/mtce/src/maintenance/Makefile +++ b/mtce/src/maintenance/Makefile @@ -23,6 +23,7 @@ SRCS += mtcKeyApi.cpp SRCS += mtcCmdHdlr.cpp SRCS += mtcNodeMnfa.cpp SRCS += mtcVimApi.cpp +SRCS += mtcStubs.cpp COMPUTE_OBJS = mtcNodeComp.o COMPUTE_OBJS += mtcCompMsg.o diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 4fe185aa..d57c3d6e 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1935,8 +1935,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) * the host has not reset yet we have disabled services * then now we need to reset the host to prevet VM duplication * by forcing a full enable */ - if (( node_ptr->uptime_save != 0 ) && - ( node_ptr->uptime >= node_ptr->uptime_save )) + if ((( node_ptr->uptime_save != 0 ) && + ( node_ptr->uptime >= node_ptr->uptime_save )) || + (( node_ptr->uptime_save == 0 ) && + ( node_ptr->uptime > MTC_MINS_15 ))) { ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime ); diff --git a/mtce/src/maintenance/mtcStubs.cpp b/mtce/src/maintenance/mtcStubs.cpp new file mode 100644 index 00000000..f1a94b62 --- /dev/null +++ b/mtce/src/maintenance/mtcStubs.cpp @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2013, 2016 Wind River Systems, Inc. 
+* +* SPDX-License-Identifier: Apache-2.0 +* + */ + + /** + * @file + * Maintenance Agent Stubs + */ + +using namespace std; + +#include "nodeClass.h" /* The main link class */ + +void hbs_cluster_log ( void ) { } diff --git a/mtce/src/scripts/hbs-query b/mtce/src/scripts/hbs-query new file mode 100755 index 00000000..0fba4742 --- /dev/null +++ b/mtce/src/scripts/hbs-query @@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Copyright (c) 2013-2016 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Debug utility that sends a JSON 'cluster_info' request for the 'heartbeat'
+# service, with origin 'sm', over the loopback interface to the UDP port
+# configured as 'sm_server_port' in /etc/mtc.ini.
+#
+# Usage: hbs-query [reqid]   ; reqid defaults to 123 when not specified
+#
+# Linux Standard Base (LSB) Error Codes
+RETVAL=0
+GENERIC_ERROR=1
+INVALID_ARGS=2
+UNSUPPORTED_FEATURE=3
+NOT_INSTALLED=5
+NOT_RUNNING=7
+
+PROTOCOL="UDP4-DATAGRAM"
+ADDRESS="127.0.0.1"
+
+socat_exec=$(which socat 2> /dev/null)
+if [ -z "${socat_exec}" ] ; then
+    logger "Error: $0 cannot find socat exec"
+    exit ${NOT_INSTALLED}
+fi
+
+# use the caller's request id when one is supplied
+reqid=${1:-123}
+
+port=$(awk '$1 == "sm_server_port" { print $3 ; exit }' /etc/mtc.ini 2> /dev/null)
+if [ -z "${port}" ] ; then
+    logger "Error: $0 cannot find sm_server_port in /etc/mtc.ini"
+    exit ${GENERIC_ERROR}
+fi
+echo "{\"origin\":\"sm\", \"service\":\"heartbeat\", \"request\":\"cluster_info\", \"reqid\": $reqid }" | "${socat_exec}" - ${PROTOCOL}:${ADDRESS}:${port}
+
+exit ${RETVAL}