diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index 60c43bbf..b9e124d1 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -39,7 +39,6 @@ typedef struct { int scheduling_priority ; /**< Scheduling priority of this daemon */ bool active ; /**< Maintenance activity state true|false */ - int hbs_pulse_period ; /**< time (msec) between heartbeat requests */ int token_refresh_rate ; /**< token refresh rate in seconds */ int hbs_minor_threshold ; /**< heartbeat miss minor threshold */ int hbs_degrade_threshold ; /**< heartbeat miss degrade threshold */ @@ -351,7 +350,7 @@ extern char *program_invocation_short_name; } #define blog(format, args...) { \ - if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ + if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt&1) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ - else { if(daemon_get_cfg_ptr()->debug_bmgmt) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ + else { if(daemon_get_cfg_ptr()->debug_bmgmt&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ } @@ -380,22 +379,22 @@ extern char *program_invocation_short_name; #define mlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&4 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg4 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define mlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&8 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg8 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define jlog(format, args...) 
{ if(daemon_get_cfg_ptr()->debug_json ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define jlog(format, args...) { if(daemon_get_cfg_ptr()->debug_json&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define jlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_json&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define jlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_json&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define jlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_json&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define hlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_http&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define hlog2(format, args...) 
{ if(daemon_get_cfg_ptr()->debug_http&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define hlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_http&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define alog(format, args...) { if(daemon_get_cfg_ptr()->debug_alive ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define alog(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define alog1(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define alog2(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define alog3(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define qlog(format, args...) { if(daemon_get_cfg_ptr()->debug_work) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define qlog(format, args...) 
{ if(daemon_get_cfg_ptr()->debug_work&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define qlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_work&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define qlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_work&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define qlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_work&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } @@ -403,8 +402,11 @@ extern char *program_invocation_short_name; #define flog(format, args...) { if(daemon_get_cfg_ptr()->debug_fsm) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: FSM : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define tlog(format, args...) { if(daemon_get_cfg_ptr()->debug_timer) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Timer: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define clog1(format, args...) 
{ if(daemon_get_cfg_ptr()->debug_state&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define clog2(format, args...) { if(daemon_get_cfg_ptr()->debug_state&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define clog3(format, args...) { if(daemon_get_cfg_ptr()->debug_state&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } + #define log_event(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Event: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define log_stress(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Stress: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index f7410580..f0db8696 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -233,6 +233,7 @@ const char * get_mtcNodeCommand_str ( int cmd ) case MTC_CMD_QRY_HOST: return("query host"); case MTC_CMD_START_HOST: return("start host service"); case MTC_CMD_STOP_HOST: return("stop host service"); + case MTC_CMD_ACTIVE_CTRL: return("publish active controller"); /* VM Instance Commands */ case MTC_CMD_ADD_INST: return("add instance"); diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 3504acc6..788954b6 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -359,6 +359,7 @@ void daemon_exit ( void ); * a power-off to online transition */ #define MTC_MTCALIVE_HITS_TO_GO_ONLINE (5) +#define CONTROLLER_X ((const char *)"controller-x") 
#define CONTROLLER_0 ((const char *)"controller-0") #define CONTROLLER_1 ((const char *)"controller-1") #define CONTROLLER_2 ((const char *)"controller-2") @@ -526,7 +527,8 @@ typedef struct #define MTC_CMD_MOD_HOST (0x11110012) /* Query Host */ #define MTC_CMD_QRY_HOST (0x11110013) /* Modify Host */ #define MTC_CMD_START_HOST (0x11110014) /* Start Monitoring Host */ -#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Moniroting Host */ +#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Monitoring Host */ +#define MTC_CMD_ACTIVE_CTRL (0x11110016) /* Active Controller */ #define MTC_CMD_ADD_INST (0x11110020) /* Add Inst */ #define MTC_CMD_DEL_INST (0x11110021) /* Delete Inst */ @@ -643,6 +645,9 @@ typedef struct #define PMOND_FLAG (0x00000001) /**< Process Monitor O.K. Flag */ #define INFRA_FLAG (0x00000002) /**< Infrastructure iface provisioned Flag */ +#define CTRLX_MASK (0x00000300) /**< From/To Controller-0/1/2/3 Number */ +#define CTRLX_BIT ((unsigned int)8) /**< used to shift right mask into bit 0 */ + #define STALL_MON_FLAG (0x00010000) /**< Flag indicating hang monitor running */ #define STALL_REC_FLAG (0x00020000) /**< Flag indicating hbsClient took action */ #define STALL_ERR1_FLAG (0x00100000) /**< Error 1 Flag */ @@ -1217,15 +1222,15 @@ string get_availStatus_str ( mtc_nodeAvailStatus_enum availStatus ); string get_operState_str ( mtc_nodeOperState_enum operState ); string get_adminState_str ( mtc_nodeAdminState_enum adminState ); -void log_adminAction ( string hostname, - mtc_nodeAdminAction_enum currAction, +void log_adminAction ( string hostname, + mtc_nodeAdminAction_enum currAction, mtc_nodeAdminAction_enum newAction ); -int send_hbs_command ( string hostname, int command ); +int send_hbs_command ( string hostname, int command, string controller=CONTROLLER ); int send_hwmon_command ( string hostname, int command ); int send_guest_command ( string hostname, int command ); -int daemon_log_message ( const char * hostname, +int daemon_log_message ( const char 
* hostname, const char * filename, const char * log_str ); diff --git a/mtce-common/src/common/nodeTimers.h b/mtce-common/src/common/nodeTimers.h index 7371c3bd..8f7178d8 100755 --- a/mtce-common/src/common/nodeTimers.h +++ b/mtce-common/src/common/nodeTimers.h @@ -48,6 +48,7 @@ #define MTC_MINS_20 (1200) #define MTC_MINS_30 (1800) #define MTC_MINS_40 (2400) +#define MTC_HRS_1 (3600) #define MTC_HRS_4 (14400) #define MTC_HRS_8 (28800) /* old token refresh rate */ diff --git a/mtce-common/src/daemon/daemon_config.cpp b/mtce-common/src/daemon/daemon_config.cpp index 80f4c801..5053f5f8 100644 --- a/mtce-common/src/daemon/daemon_config.cpp +++ b/mtce-common/src/daemon/daemon_config.cpp @@ -269,7 +269,7 @@ void daemon_dump_cfg ( void ) { daemon_config_type * ptr = daemon_get_cfg_ptr(); - ilog ("Configuration Settings\n------------------------------\n"); + ilog ("Configuration Settings ...\n"); if ( ptr->scheduling_priority ) { ilog ("scheduling_priority = %d\n", ptr->scheduling_priority ); } if ( ptr->infra_degrade_only ) { ilog ("infra_degrade_only = %s\n", ptr->infra_degrade_only ? "Yes" : "No" );} @@ -277,7 +277,6 @@ void daemon_dump_cfg ( void ) if ( ptr->active ) { ilog ("active = %s\n", ptr->active ? 
"Yes" : "No" );} /* hbsAgent */ - if ( ptr->hbs_pulse_period ) { ilog ("hbs_pulse_period = %d\n", ptr->hbs_pulse_period );} if ( ptr->token_refresh_rate ) { ilog ("token_refresh_rate = %d\n", ptr->token_refresh_rate );} if ( ptr->hbs_minor_threshold ) { ilog ("hbs_minor_threshold = %d\n", ptr->hbs_minor_threshold );} if ( ptr->hbs_degrade_threshold ) { ilog ("hbs_degrade_threshold = %d\n", ptr->hbs_degrade_threshold );} diff --git a/mtce-common/src/daemon/daemon_main.cpp b/mtce-common/src/daemon/daemon_main.cpp index 9c2bb190..de6f2aee 100755 --- a/mtce-common/src/daemon/daemon_main.cpp +++ b/mtce-common/src/daemon/daemon_main.cpp @@ -78,6 +78,7 @@ void print_help ( void ) printf ("\t-l --log - Log to file ; /var/log/.log\n"); printf ("\t-p --passive - Passive mode ; do not act on failures\n"); printf ("\t-v --verbose - Show command line arguments\n"); + printf ("\t-V --Virtual - Running in virtual environment\n"); printf ("\t-t --test - Run Test Head\n"); printf ("\t-g --gap - Gap in seconds\n"); printf ("\t-m --mode - Word string representing a run mode\n"); @@ -106,6 +107,9 @@ int daemon_get_run_option ( const char * option ) } return (1); } + else if ( !strcmp ( option, "Virtual" ) ) + return opts.Virtual ; + else if ( !strcmp ( option, "front" ) ) return opts.front ; @@ -118,6 +122,7 @@ void opts_init ( void) opts.log = false ; opts.test = false ; opts.verbose = false ; + opts.Virtual = false ; opts.active = false ; opts.front = false ; opts.front = false ; @@ -152,8 +157,8 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr ) int cmd_arg_count = 1 ; /* command args start at 1 */ /* A string listing of valid short options letters. */ - const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvta"; - + const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvVta"; + /* An array listing of valid long options. 
*/ const struct option long_options[] = { @@ -167,9 +172,10 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr ) { "username" , 1, NULL, 'u' }, { "help" , 0, NULL, 'h' }, { "active" , 0, NULL, 'a' }, - { "foreground", 0, NULL, 'f' }, - { "log" , 0, NULL, 'l' }, + { "foreground", 0, NULL, 'f' }, + { "log" , 0, NULL, 'l' }, { "verbose" , 0, NULL, 'v' }, + { "Virtual" , 0, NULL, 'V' }, { "test" , 0, NULL, 't' }, { NULL , 0, NULL, 0 } /* Required at end of array. */ }; @@ -254,19 +260,25 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr ) case 't': /* -t or --test */ { opts_ptr->test = true ; - cmd_arg_count++ ; + cmd_arg_count++ ; break; } - case 'v': /* -t or --verbose */ + case 'v': /* -v or --verbose */ { opts_ptr->verbose = true ; - cmd_arg_count++ ; + cmd_arg_count++ ; + break; + } + case 'V': /* -V or --Virtual */ + { + opts_ptr->Virtual = true ; + cmd_arg_count++ ; break; } case 'a': /* -a or --active */ { opts_ptr->active = true ; - cmd_arg_count++ ; + cmd_arg_count++ ; break; } case '?': diff --git a/mtce-common/src/daemon/daemon_option.h b/mtce-common/src/daemon/daemon_option.h index a4924b50..3aaac846 100755 --- a/mtce-common/src/daemon/daemon_option.h +++ b/mtce-common/src/daemon/daemon_option.h @@ -33,6 +33,7 @@ typedef struct int test ; /**< Enable test mode */ int info ; /**< Dump data module info */ int verbose ; /**< Dump command line options */ + int Virtual ; /**< Set to non-zero when in virtual env */ int active ; /**< Set daemon active */ int debug ; /**< Set tracing debug mode "debug,"test","info","trace" */ int front ; /**< run in the foreground ; do not daemonize */ @@ -43,7 +44,7 @@ typedef struct string username ; string command ; string password ; -} opts_type ; +} opts_type ; opts_type * daemon_get_opts_ptr ( void ); diff --git a/mtce-control/centos/build_srpm.data b/mtce-control/centos/build_srpm.data index a69a574d..48f2c3f2 100644 --- a/mtce-control/centos/build_srpm.data +++ 
b/mtce-control/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="$PKG_BASE/src" COPY_LIST="$SRC_DIR/*" -TIS_PATCH_VER=6 +TIS_PATCH_VER=7 diff --git a/mtce-control/centos/mtce-control.spec b/mtce-control/centos/mtce-control.spec index a69bca1f..d65782c7 100644 --- a/mtce-control/centos/mtce-control.spec +++ b/mtce-control/centos/mtce-control.spec @@ -34,6 +34,7 @@ make install buildroot=%{buildroot} _sysconfdir=%{_sysconfdir} _unitdir=%{_unitd if [ $1 -eq 1 ] ; then /bin/systemctl enable lighttpd.service /bin/systemctl enable qemu_clean.service + /bin/systemctl enable hbsAgent.service fi exit 0 @@ -41,6 +42,9 @@ exit 0 %defattr(-,root,root,-) %{_sysconfdir}/init.d/goenabledControl %license %{_datarootdir}/licenses/mtce-control-1.0/LICENSE +%{_sysconfdir}/pmon.d/hbsAgent.conf +%{_sysconfdir}/init.d/hbsAgent +%{_unitdir}/hbsAgent.service %clean rm -rf $RPM_BUILD_ROOT diff --git a/mtce-control/src/Makefile b/mtce-control/src/Makefile index 0741da1d..aaa3de7f 100755 --- a/mtce-control/src/Makefile +++ b/mtce-control/src/Makefile @@ -1,19 +1,32 @@ -SOURCE1 = goenabled -SOURCE2 = LICENSE +SOURCE1 = LICENSE +SOURCE2 = goenabled +SOURCE3 = hbsAgent +SOURCE4 = hbsAgent.conf +SOURCE5 = hbsAgent.service -local_etc_pmond = $(_sysconfdir)/pmond.d +local_etc_pmond = $(_sysconfdir)/pmon.d local_etc_goenabledd = $(_sysconfdir)/goenabled.d .PHONY: default install: - # Controller-Only Init Scripts - install -m 755 -p -D scripts/$(SOURCE1) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl - # Controller-Only Process Monitor Config files - install -m 755 -d $(buildroot)/$(local_etc_pmond) - # Controller-Only Go Enabled Test - install -m 755 -d $(buildroot)/$(local_etc_goenabledd) + # for license install -m 755 -d $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0 - install -p -D -m 600 $(SOURCE2) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE + install -m 600 -p -D $(SOURCE1) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE + + # Controller-Only Init 
Scripts + install -m 755 -d $(buildroot)/$(_sysconfdir)/init.d + install -m 755 -p -D scripts/$(SOURCE2) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl + install -m 755 -p -D scripts/$(SOURCE3) $(buildroot)/$(_sysconfdir)/init.d/hbsAgent + + # Controller-Only Process Monitor Config files + install -m 755 -d $(buildroot)/$(local_etc_pmond) + install -m 644 -p -D scripts/$(SOURCE4) $(buildroot)/$(local_etc_pmond)/hbsAgent.conf + + # Controller-Only Heartbeat Service file + install -m 644 -p -D scripts/$(SOURCE5) $(buildroot)/$(_unitdir)/hbsAgent.service + + # Controller-Only Go Enabled Test + install -m 755 -d $(buildroot)/$(local_etc_goenabledd) diff --git a/mtce-control/src/scripts/hbsAgent b/mtce-control/src/scripts/hbsAgent new file mode 100644 index 00000000..db93fde0 --- /dev/null +++ b/mtce-control/src/scripts/hbsAgent @@ -0,0 +1,117 @@ +#! /bin/sh +# +# Copyright (c) 2018 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# +# chkconfig: 2345 95 95 +# +### BEGIN INIT INFO +# Provides: hbsAgent +# Default-Start: 3 5 +# Default-Stop: 0 1 2 6 +# Short-Description: Heartbeat Agent Daemon +### END INIT INFO + +. /etc/init.d/functions + +DAEMON_NAME="hbsAgent" +DAEMON="/usr/local/bin/${DAEMON_NAME}" +PIDFILE="/var/run/${DAEMON_NAME}.pid" + +VIRT_TOOL='virt-what' +# controller-1:~$ sudo virt-what +# virtualbox ... in virtualbox +# kvm ... in qemu + +# Linux Standard Base (LSB) Error Codes +RETVAL=0 +GENERIC_ERROR=1 +INVALID_ARGS=2 +UNSUPPORTED_FEATURE=3 +NOT_INSTALLED=5 +NOT_RUNNING=7 + +PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin +export PATH + +if [ ! -e "${DAEMON}" ] ; then + logger "${DAEMON} is missing" + exit ${NOT_INSTALLED} +fi + +case "$1" in + start) + logger "Starting ${DAEMON_NAME}" + echo -n "Starting ${DAEMON_NAME}: " + if [ -n "`pidof ${DAEMON_NAME}`" ] ; then + echo -n "is already running " + RETVAL=0 + else + tool=$(which ${VIRT_TOOL}) + if [ $? 
-eq 0 ] ; then + virtual=`${VIRT_TOOL}` + else + virtual="" + fi + + if [ "${virtual}" = "virtualbox" -o "${virtual}" = "kvm" ] ; then + start-stop-daemon --start -b -x ${DAEMON} -- -l -a -V + else + start-stop-daemon --start -b -x ${DAEMON} -- -l -a + fi + RETVAL=$? + fi + if [ ${RETVAL} -eq 0 ] ; then + pid=`pidof ${DAEMON_NAME}` + echo "OK" + logger "${DAEMON} (${pid})" + else + echo "FAIL" + RETVAL=${GENERIC_ERROR} + fi + ;; + + stop) + logger "Stopping ${DAEMON_NAME}" + echo -n "Stopping ${DAEMON_NAME}: " + if [ -n "`pidof ${DAEMON_NAME}`" ] ; then + killproc ${DAEMON_NAME} + fi + if [ -n "`pidof ${DAEMON_NAME}`" ] ; then + echo "FAIL" + RETVAL=${NOT_RUNNING} + else + echo "OK" + fi + rm -f ${PIDFILE} + ;; + + restart) + $0 stop + $0 start + ;; + + status) + pid=`pidof ${DAEMON_NAME}` + RETVAL=$? + if [ ${RETVAL} -eq 0 ] ; then + echo "${DAEMON_NAME} is running" + else + echo "${DAEMON_NAME} is NOT running" + RETVAL=${NOT_RUNNING} + fi + ;; + + condrestart) + $0 restart + ;; + + *) + echo "usage: $0 { start | stop | status | restart | condrestart }" + ;; +esac + +exit ${RETVAL} diff --git a/mtce-control/src/scripts/hbsAgent.conf b/mtce-control/src/scripts/hbsAgent.conf new file mode 100644 index 00000000..169e5ce0 --- /dev/null +++ b/mtce-control/src/scripts/hbsAgent.conf @@ -0,0 +1,25 @@ +[process] +process = hbsAgent +service = hbsAgent +pidfile = /var/run/hbsAgent.pid +style = lsb ; ocf or lsb +severity = major ; minor, major, critical +restarts = 1 ; restart retries before error assertion +interval = 10 ; number of seconds to wait between restarts +debounce = 10 ; number of seconds that a process needs to remain + ; running before degrade is removed and retry count + ; is cleared. +startuptime = 5 ; Seconds to wait after process start before starting the debounce monitor +mode = passive ; Monitoring mode: passive (default) or active + ; passive: process death monitoring (default: always) + ; active : heartbeat monitoring, i.e. 
request / response messaging + ; ignore : do not monitor or stop monitoring +quorum = 0 ; process is in the host watchdog quorum + +; Active Monitoring Options + +port = 2201 +period = 5 ; monitor period in seconds +timeout = 4 ; Messaging timeout period in seconds, must be shorter than period +threshold = 5 ; Number of back to back heartbeat failures before action + diff --git a/mtce-control/src/scripts/hbsAgent.service b/mtce-control/src/scripts/hbsAgent.service new file mode 100644 index 00000000..de3cb8d8 --- /dev/null +++ b/mtce-control/src/scripts/hbsAgent.service @@ -0,0 +1,22 @@ +[Unit] +Description=Titanium Cloud Maintenance Heartbeat Agent +After=network.target syslog.service config.service +Before=pmon.service + +[Service] +Type=forking +ExecStart=/etc/rc.d/init.d/hbsAgent start +ExecStop=/etc/rc.d/init.d/hbsAgent stop +PIDFile=/var/run/hbsAgent.pid +KillMode=process +SendSIGKILL=no + +# Process recovery is handled by pmond if it is running. +# Delay 10 seconds to give pmond a chance to recover +# before systemd kicks in to do it as a backup plan. 
+Restart=always +RestartSec=10 + +[Install] +WantedBy=multi-user.target + diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data index a6783bd5..01e786b1 100644 --- a/mtce/centos/build_srpm.data +++ b/mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=140 +TIS_PATCH_VER=142 BUILD_IS_SLOW=5 diff --git a/mtce/centos/mtce.spec b/mtce/centos/mtce.spec index d69e34bc..e6ffa183 100644 --- a/mtce/centos/mtce.spec +++ b/mtce/centos/mtce.spec @@ -313,7 +313,6 @@ install -m 755 -d %{buildroot}/usr/lib/ocf install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d/platform install -m 755 -p -D %{_buildsubdir}/scripts/mtcAgent %{buildroot}/usr/lib/ocf/resource.d/platform/mtcAgent -install -m 755 -p -D %{_buildsubdir}/scripts/hbsAgent %{buildroot}/usr/lib/ocf/resource.d/platform/hbsAgent install -m 755 -p -D %{_buildsubdir}/hwmon/scripts/ocf/hwmon %{buildroot}/usr/lib/ocf/resource.d/platform/hwmon # config files @@ -482,7 +481,6 @@ install -m 755 -d %{buildroot}/var/run # SM OCF Start/Stop/Monitor Scripts %{ocf_resourced}/platform/mtcAgent -%{ocf_resourced}/platform/hbsAgent # Config files %config(noreplace)/etc/mtc.ini diff --git a/mtce/src/alarm/alarm.cpp b/mtce/src/alarm/alarm.cpp index 6b113f89..f1642446 100644 --- a/mtce/src/alarm/alarm.cpp +++ b/mtce/src/alarm/alarm.cpp @@ -47,6 +47,11 @@ int alarm_register_user ( msgClassSock * sock_ptr ) return (rc); } +void alarm_unregister_user ( void ) +{ + user_sock_ptr = NULL ; +} + /* Construct an alarm request json string in the following form {\"mtcalarm\":[{\"alarmid\":\"200.009\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Infrastructure\",\"prefix\":\"service=heartbeat\"}, {\"alarmid\":\"200.005\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Management\",\"prefix\":\"service=heartbeat\"}]}" @@ -73,6 +78,17 @@ int alarm_ ( string hostname, const char * id, 
EFmAlarmStateT state, EFmAlarmSev string msg_type ; string sev ; + if ( user_sock_ptr == NULL ) + { + slog ("alarm socket is NULL"); + return (FAIL_NULL_POINTER ); + } + else if ( ! user_sock_ptr->sock_ok() ) + { + elog ("alarm socket is not ok"); + return (FAIL_OPERATION); + } + if ( state == FM_ALARM_STATE_MSG ) msg_type = "msg" ; else if ( state == FM_ALARM_STATE_SET ) @@ -127,7 +143,8 @@ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSev } else { - ilog ("%s %s\n", hostname.c_str(), request); + ilog ("%s %s %s %s %s", hostname.c_str(), entity, msg_type.c_str(), sev.c_str(), id); + mlog ("%s %s\n", hostname.c_str(), request); return ( PASS ) ; } daemon_signal_hdlr (); diff --git a/mtce/src/alarm/alarm.h b/mtce/src/alarm/alarm.h index 9e29c971..1bc8f9e0 100644 --- a/mtce/src/alarm/alarm.h +++ b/mtce/src/alarm/alarm.h @@ -68,6 +68,7 @@ EFmAlarmSeverityT alarmUtil_getSev_enum ( string severity ); #ifndef __MODULE_PRIVATE__ int alarm_register_user ( msgClassSock * sock_ptr ); +void alarm_unregister_user ( void ); /* Public API */ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSeverityT severity, const char * entity, string prefix ); diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 0c51ab23..e33c70ce 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -36,6 +36,7 @@ using namespace std; #include "mtcAlarm.h" #include "alarm.h" #include "hbsAlarm.h" +#include "hbsBase.h" /** Initialize the supplied command buffer */ void mtcCmd_init ( mtcCmd & cmd ) @@ -263,7 +264,8 @@ nodeLinkClass::nodeLinkClass() /* Make no assumption on the service */ maintenance = false ; heartbeat = false ; - active = false ; + active = false ; /* run active */ + active_controller = false ; /* true if this controller is active */ /* Set some defaults for the hearbeat service */ hbs_ready = false ; @@ -1156,26 +1158,26 @@ void nodeLinkClass::print_node_info ( void ) if (( i == 
INFRA_IFACE ) && ( infra_network_provisioned == false )) continue ; - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); - syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" , + syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n"); + syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" , get_iface_name_str ((iface_enum)i), hosts, hbs_disabled ? "DISABLED" : "Enabled ", hbs_pulse_period ); - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); - + syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n"); + for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next ) { - syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %7x | %d msec\n", + syslog ( LOG_INFO, "| %-12s | %c | %5i | %5i | %5i | %5i | %10x | %8x | %d msec\n", ptr->hostname.c_str(), ptr->monitor[i] ? 
'Y' : 'n', - ptr->hbs_misses_count[i], - ptr->max_count[i], - ptr->hbs_degrade_count[i], - ptr->hbs_failure_count[i], + ptr->hbs_misses_count[i], + ptr->max_count[i], + ptr->hbs_degrade_count[i], + ptr->hbs_failure_count[i], ptr->hbs_count[i], ptr->b2b_pulses_count[i], hbs_pulse_period ); } } - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); + syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n"); } } @@ -7778,7 +7780,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle { /* This default RC allows the caller to filter out unexpected pulse responses */ int rc = ENXIO ; - + if ( head == NULL ) { return -ENODEV ; @@ -7962,6 +7964,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle } pulses[iface]-- ; } + else if ( node_ptr ) + { + dlog ("%s unexpected pulse response ; %s", + node_ptr->hostname.c_str(), + get_iface_name_str(iface)); + } + else + { + slog ("null pointer"); + } return rc ; } @@ -7972,6 +7984,13 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle * By index does not require a lookup whereas hostname does */ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags ) { + /* TODO: consider removing this check */ + if ( hostname == "localhost" ) + { + /* localhost is not a supported hostname and indicates + * an unconfigured host response ; return the ignore response */ + return(ENXIO); + } if ( index ) { int rc = remPulse_by_index ( hostname, index , iface, true , flags ); @@ -7984,16 +8003,6 @@ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index } else { - if ( hostname.compare("localhost") ) - { - get_hbs_monitor_state ( hostname , iface ) ; - } - else - { - /* localhost is not a supported hostname and indicates - * an unconfigured host response ; return 
the ignore response */ - return(ENXIO); - } } return ( remPulse_by_name ( hostname , iface, true, flags )); } @@ -8016,7 +8025,6 @@ void nodeLinkClass::clear_pulse_list ( iface_enum iface ) } } - /** Runs in the hbsAgent to set or clear heartbat alarms for all supported interfaces */ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT sev, int iface ) { @@ -8142,7 +8150,6 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) storage_0_responding = false ; } - /* Don't log single misses unless in debug mode */ if ( pulse_ptr->b2b_misses_count[iface] > 1 ) { if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) @@ -8207,7 +8214,10 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) { if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold ) { - send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface ); + if ( this->active_controller ) + { + send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface ); + } pulse_ptr->hbs_minor[iface] = true ; pulse_ptr->hbs_minor_count[iface]++ ; wlog ("%s %s -> MINOR\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface)); @@ -8215,10 +8225,17 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) } if ( pulse_ptr->b2b_misses_count[iface] == hbs_degrade_threshold ) { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface ); + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface ); - /* report this host as failed */ - if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS ) + /* report this host as failed */ + if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS ) + { + pulse_ptr->hbs_degrade[iface] = true ; + } + } + else { pulse_ptr->hbs_degrade[iface] = true ; } @@ -8231,11 +8248,17 @@ int 
nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) ( pulse_ptr->hbs_degrade[iface] == false )) { wlog ("%s -> DEGRADED - Auto-Correction\n", pulse_ptr->hostname.c_str()); + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface ); - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface ); - - /* report this host as failed */ - if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS ) + /* report this host as failed */ + if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS ) + { + pulse_ptr->hbs_degrade[iface] = true ; + } + } + else { pulse_ptr->hbs_degrade[iface] = true ; } @@ -8250,11 +8273,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) /* Only print the log at the threshold boundary */ if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + } wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface) ); + this->print_node_info (); + hbs_cluster_log ( this->my_hostname, "event", true ); } } @@ -8268,35 +8296,46 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) /* Only print the log at the threshold boundary */ if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); - + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + } wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface) ); + 
this->print_node_info (); + hbs_cluster_log ( this->my_hostname, "event", true ); } } else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) && - ( pulse_ptr->hbs_failure[iface] == false )) + ( pulse_ptr->hbs_failure[iface] == false )) { - elog ("%s %s -> FAILED\n", pulse_ptr->hostname.c_str(), - get_iface_name_str(iface) ); elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface) ); - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); - /* report this host as failed */ - if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS ) + /* report this host as failed */ + if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS ) + { + pulse_ptr->hbs_failure[iface] = true ; + } + } + else { pulse_ptr->hbs_failure[iface] = true ; + this->print_node_info (); + hbs_cluster_log ( this->my_hostname, "event", true ); } - pulse_ptr->hbs_failure_count[iface]++ ; } if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] ) - pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ; + pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ; } + if ( remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS )) { elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(), diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index cb6d59ff..0701a749 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1266,6 +1266,10 @@ public: bool maintenance ; bool heartbeat ; + /* Set to true if this controller is active. + * Currently only used by heartbeat service. 
*/ + bool active_controller ; + /* offline_handler tuning controls */ int offline_threshold ; /* number of back to back mtcAlive misses before offline */ int offline_period ; /* offline handler mtcAlive request period */ diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index d7aaf5bd..1e2b2fc1 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -54,8 +54,6 @@ using namespace std; * daemon_signal_init * hbs_hostname_read * hbs_message_init - * hbs_int_socket_init - * hbs_ext_socket_init * forever ( timer_handler ) * hbs_pulse_req * hbs_timer_start @@ -74,7 +72,7 @@ using namespace std; /* Historical String data for mem_logs */ static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ; static string arrival_histogram[MAX_IFACES] = { "" , "" } ; - +static string mtcAgent_ip = "" ; static std::list hostname_inventory ; /** This heartbeat service inventory is tracked by @@ -93,8 +91,6 @@ int module_init ( void ) return (PASS); } -static unsigned int controller_number = 0 ; - void daemon_sigchld_hdlr ( void ) { ; /* dlog("Received SIGCHLD ... no action\n"); */ @@ -130,17 +126,20 @@ hbs_ctrl_type * get_hbs_ctrl_ptr () { return &hbs_ctrl ; } void monitor_scheduling ( unsigned long long & this_time, unsigned long long & prev_time , int data, const char * label_ptr ) { this_time = gettime_monotonic_nsec () ; - if ( this_time > (prev_time + (NSEC_TO_MSEC*(hbs_config.latency_thld)))) /* 10 millisec */ + if ( label_ptr && strncmp ( label_ptr, NODEUTIL_LATENCY_MON_START, strlen(NODEUTIL_LATENCY_MON_START))) { - llog ("%4llu.%-4llu msec - %s at line %d\n", + if ( ! 
strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > 10 )) + { + ilog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld ); + } + else if ( this_time > (prev_time + (NSEC_TO_MSEC*(hbs_config.latency_thld)))) + { + llog ("%4llu.%-4llu msec %s at line %d\n", ((this_time-prev_time) > NSEC_TO_MSEC) ? ((this_time-prev_time)/NSEC_TO_MSEC) : 0, ((this_time-prev_time) > NSEC_TO_MSEC) ? ((this_time-prev_time)%NSEC_TO_MSEC) : 0, label_ptr, data); + } } -// else if ( ! strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > 10 )) -// { -// ilog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld ); -// } prev_time = this_time ; } @@ -167,6 +166,16 @@ void daemon_exit ( void ) if ( hbs_sock.mtc_to_hbs_sock ) delete (hbs_sock.mtc_to_hbs_sock); + /* Close the alarm socket */ + if ( hbs_sock.alarm_sock ) + delete (hbs_sock.alarm_sock); + + /* Close the SM sockets */ + if ( hbs_sock.sm_server_sock ) + delete (hbs_sock.sm_server_sock); + if ( hbs_sock.sm_client_sock ) + delete (hbs_sock.sm_client_sock); + exit (0); } @@ -179,8 +188,8 @@ void daemon_exit ( void ) #define HBS_SOCKET_MSEC (5) #define HBS_SOCKET_NSEC (HBS_SOCKET_MSEC*1000) -#define HBS_MIN_PERIOD (50) -#define HBS_MAX_PERIOD (999) +#define HBS_MIN_PERIOD (100) +#define HBS_MAX_PERIOD (1000) #define HBS_VIRT_PERIOD (500) #define HBS_BACKOFF_FACTOR (4) /* period*this during backoff */ @@ -212,35 +221,28 @@ static int hbs_config_handler ( void * user, { int curr_period = hbsInv.hbs_pulse_period ; - config_ptr->hbs_pulse_period = atoi(value); hbsInv.hbs_pulse_period = atoi(value); hbsInv.hbs_state_change = true ; hbsInv.hbs_disabled = false ; config_ptr->mask |= CONFIG_AGENT_HBS_PERIOD ; /* Adjust the heartbeat period in a virtual environment */ - if (( hbsInv.hbs_pulse_period >= HBS_MIN_PERIOD ) || - ( hbsInv.hbs_pulse_period <= HBS_MAX_PERIOD )) + if (( hbsInv.hbs_pulse_period < 
HBS_MIN_PERIOD ) || + ( hbsInv.hbs_pulse_period > HBS_MAX_PERIOD )) { - struct stat p ; - p.st_size = 0 ; - stat ( HOST_IS_VIRTUAL, &p ) ; - if ( p.st_size ) - { - if (( hbsInv.hbs_pulse_period != 0 ) && - ( hbsInv.hbs_pulse_period < HBS_VIRT_PERIOD )) - { - config_ptr->hbs_pulse_period = HBS_VIRT_PERIOD ; - hbsInv.hbs_pulse_period = HBS_VIRT_PERIOD ; - hbsInv.hbs_pulse_period_save = HBS_VIRT_PERIOD ; - } - } + hbsInv.hbs_pulse_period = HBS_MIN_PERIOD ; } + + if ( daemon_get_run_option("Virtual") ) + { + hbsInv.hbs_pulse_period = HBS_VIRT_PERIOD ; + } + hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ; if ( curr_period != hbsInv.hbs_pulse_period ) { /* initialize cluster info */ - hbs_cluster_init ( hbsInv.hbs_pulse_period ); + hbs_cluster_init ( hbsInv.hbs_pulse_period, hbs_sock.sm_client_sock ); } } @@ -405,10 +407,12 @@ int daemon_configure ( void ) ilog("Failure Thld: %i misses\n", hbsInv.hbs_failure_threshold ); ilog("Multicast : %s\n", hbs_config.multicast ); + ilog("Mgmnt Name : %s\n", hbs_config.mgmnt_iface ); /* TODO: Remove me */ + hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface ); - ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface ); - ilog("Mgmnt RxPort: %d\n", hbs_config.hbs_agent_mgmnt_port ); - ilog("Mgmnt TxPort: %d\n", hbs_config.hbs_client_mgmnt_port ); + ilog("Mgmnt Master: %s\n", hbs_config.mgmnt_iface ); + ilog("Mgmnt Port : %d (rx)", hbs_config.hbs_agent_mgmnt_port ); + ilog("Mgmnt Port : %d (tx)\n", hbs_config.hbs_client_mgmnt_port ); /* Fetch the infrastructure interface name. 
* calls daemon_get_iface_master inside so the @@ -423,13 +427,12 @@ int daemon_configure ( void ) else { hbsInv.infra_network_provisioned = true ; - ilog ("Infra iface : %s\n", hbs_config.infra_iface ); + ilog ("Infra Name : %s", hbs_config.infra_iface ); + ilog ("Infra Port : %d (rx)", hbs_config.hbs_agent_infra_port ); + ilog ("Infra Port : %d (tx)", hbs_config.hbs_client_infra_port ); } } - ilog("Infra RxPort: %d\n", hbs_config.hbs_agent_infra_port ); - ilog("Infra TxPort: %d\n", hbs_config.hbs_client_infra_port ); - ilog("Command Port: %d (rx)\n", hbs_config.mtc_to_hbs_cmd_port ); ilog("Event Port : %d (tx)\n", hbs_config.hbs_to_mtc_event_port ); ilog("Alarm Port : %d (tx)\n", hbs_config.mtcalarm_req_port ); @@ -477,6 +480,7 @@ int daemon_configure ( void ) } static struct mtc_timer hbsTimer ; +static struct mtc_timer hbsTimer_audit ; void hbsTimer_handler ( int sig, siginfo_t *si, void *uc) { @@ -498,6 +502,12 @@ void hbsTimer_handler ( int sig, siginfo_t *si, void *uc) mtcTimer_stop_int_safe ( hbsTimer ); hbsTimer.ring = true ; } + /* is base mtc timer */ + else if (( *tid_ptr == hbsTimer_audit.tid ) ) + { + mtcTimer_stop_int_safe ( hbsTimer_audit ); + hbsTimer_audit.ring = true ; + } else { // wlog ("Unexpected timer - %p", *tid_ptr ); @@ -567,6 +577,10 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max ) hbs_sock.tx_sock[i] = 0 ; return (FAIL_SOCKET_CREATE); } + else + { + hbs_sock.tx_sock[i]->sock_ok(true); + } } else { @@ -598,6 +612,7 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max ) hbs_sock.rx_sock[i]->setSocketMemory ( iface, "rx pulse socket memory", rmem_max ); else wlog ("failed to query rmem_max ; using rmem_default\n"); + hbs_sock.rx_sock[i]->sock_ok(true); } /* handle failure path */ @@ -619,107 +634,218 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max ) return (rc); } -/* Setup the Unix Domain Transmit Pulse Socket */ -int alarm_port_init ( void ) -{ - hbs_sock.alarm_port = daemon_get_cfg_ptr()->mtcalarm_req_port; - 
hbs_sock.alarm_sock = new msgClassTx(LOOPBACK_IP, hbs_sock.alarm_port, IPPROTO_UDP); - if ( hbs_sock.alarm_sock ) - { - if ( hbs_sock.alarm_sock->return_status == PASS ) - { - hbs_sock.alarm_sock->sock_ok(true); - alarm_register_user ( hbs_sock.alarm_sock ); - } - else - { - elog ("alarm_port_init failed socket setup (rc:%d)\n", - hbs_sock.alarm_sock->return_status ); - } - } - return ( hbs_sock.alarm_sock->return_status ) ; -} - -int hbs_sm_sockets_init ( void ) +/* ********************************************************************* + * + * Initialize all heartbeat messaging sockets + * + * 1. transmit socket to maintenance (ready event) + * 2. receive socket from maintenance (inventory) + * 3. alarm socket to alarmd + * 4. multicast transmit socket + * 5. unicast receive socket + * + * ********************************************************************/ +int hbs_socket_init ( void ) { int rc = PASS ; - /* Create an UDP RX Message Socket for SM Requests; LO interface only */ - hbs_sock.sm_server_sock = new msgClassRx(LOOPBACK_IP, hbs_config.sm_server_port, IPPROTO_UDP); - if ( ! hbs_sock.sm_server_sock ) - { - elog ("Failed to setup SM receive socket"); - rc = FAIL_SOCKET_CREATE ; - } - - /* Create an UDP TX Message Socket for SM Requests; LO interface only */ - hbs_sock.sm_client_sock = new msgClassTx(LOOPBACK_IP, hbs_config.sm_client_port,IPPROTO_UDP); - if ( ! hbs_sock.sm_client_sock ) - { - elog ("Failed to setup SM transmit socket"); - rc = FAIL_SOCKET_CREATE ; - } - - if ( rc == PASS ) - { - hbs_sock.sm_server_sock->sock_ok(true); - hbs_sock.sm_client_sock->sock_ok(true); - } - return (rc); -} - -/* Init the internal/local sockets ; the ones that will no change. - * This way we don't miss add and start commands from maintenance. 
*/ - -int hbs_int_socket_init ( void ) -{ - int rc = PASS ; - - ilog ("internal sockets init ...\n"); - /******************************************************************/ /* UDP Tx Message Socket for Heartbeat Events Towards Maintenance */ /******************************************************************/ - - int port = hbs_config.hbs_to_mtc_event_port ; - hbs_sock.hbs_event_tx_sock = new msgClassTx(LOOPBACK_IP, port, IPPROTO_UDP); - if (hbs_sock.hbs_event_tx_sock->return_status != PASS) + rc = FAIL_SOCKET_CREATE ; { - elog ("Failed to setup hbs event transmit port %d\n", port ); - return (hbs_sock.hbs_event_tx_sock->return_status) ; + /* load local variables */ + int port = hbs_config.hbs_to_mtc_event_port ; + mtcAgent_ip = getipbyname ( CONTROLLER ); + + /* Handle re-init case */ + if ( hbs_sock.hbs_event_tx_sock != NULL ) + { + delete (hbs_sock.hbs_event_tx_sock); + hbs_sock.hbs_event_tx_sock = NULL ; + } + + /* Create the socket */ + hbs_sock.hbs_event_tx_sock = + new msgClassTx ( mtcAgent_ip.data(), + port, + IPPROTO_UDP, + hbs_config.mgmnt_iface); + + /* Check the socket */ + if ( hbs_sock.hbs_event_tx_sock != NULL ) + { + if (hbs_sock.hbs_event_tx_sock->return_status == PASS) + { + /* success path */ + hbs_sock.hbs_event_tx_sock->sock_ok(true) ; + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup event transmit socket: %s:%s:%d\n", + hbs_config.mgmnt_iface, mtcAgent_ip.c_str(), port ); + return (rc); + } + } + + /****************************************************************/ + /* UDP Rx Message Socket for Maintenance Commands and Inventory */ + /****************************************************************/ + rc = FAIL_SOCKET_CREATE ; + { + /* load local variables */ + int port = hbs_config.mtc_to_hbs_cmd_port ; + + /* Handle re-init case */ + if ( hbs_sock.mtc_to_hbs_sock != NULL ) + { + delete (hbs_sock.mtc_to_hbs_sock); + hbs_sock.mtc_to_hbs_sock = NULL ; + } + + /* Create the socket */ + 
hbs_sock.mtc_to_hbs_sock = + new msgClassRx ( hbsInv.my_local_ip.data(), + port, + IPPROTO_UDP); + + /* Check the socket */ + if (hbs_sock.mtc_to_hbs_sock != NULL ) + { + if (hbs_sock.mtc_to_hbs_sock->return_status == PASS) + { + /* success path */ + hbs_sock.mtc_to_hbs_sock->sock_ok(true) ; + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup mtce command receive socket: %s:%d\n", + hbsInv.my_local_ip.c_str(), port ); + return (rc); + } + } + + /*****************************************************************/ + /* UDP Tx Message Socket to alarmd for alarm notifications */ + /*****************************************************************/ + rc = FAIL_SOCKET_CREATE ; + { + hbs_sock.alarm_port = daemon_get_cfg_ptr()->mtcalarm_req_port; + + /* Handle re-init case */ + if ( hbs_sock.alarm_sock != NULL ) + { + delete (hbs_sock.alarm_sock); + hbs_sock.alarm_sock = NULL ; + } + + /* Create the socket */ + hbs_sock.alarm_sock = + new msgClassTx(LOOPBACK_IP, hbs_sock.alarm_port, IPPROTO_UDP); + + /* Check the socket */ + if ( hbs_sock.alarm_sock ) + { + if ( hbs_sock.alarm_sock->return_status == PASS ) + { + hbs_sock.alarm_sock->sock_ok(true); + alarm_register_user ( hbs_sock.alarm_sock ); + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup alarm socket: LO:%d\n", + hbs_sock.alarm_port ); + alarm_unregister_user(); + return (rc ); + } } /***************************************************************/ - /* Non-Blocking UDP Rx Message Socket for Maintenance Commands */ + /* UDP RX Message Socket for SM Requests; LO interface only */ /***************************************************************/ - - port = hbs_config.mtc_to_hbs_cmd_port ; - hbs_sock.mtc_to_hbs_sock = new msgClassRx(LOOPBACK_IP, port, IPPROTO_UDP); - if (hbs_sock.mtc_to_hbs_sock->return_status != PASS) + rc = FAIL_SOCKET_CREATE ; { - elog ("Failed to setup mtce command receive port %d\n", port ); - return 
(hbs_sock.mtc_to_hbs_sock->return_status) ; + /* Handle re-init case */ + if ( hbs_sock.sm_server_sock != NULL ) + { + delete (hbs_sock.sm_server_sock); + hbs_sock.sm_server_sock = NULL ; + } + + /* Create the socket */ + hbs_sock.sm_server_sock = + new msgClassRx(LOOPBACK_IP, hbs_config.sm_server_port, IPPROTO_UDP); + + /* Check the socket */ + if ( hbs_sock.sm_server_sock ) + { + if ( hbs_sock.sm_server_sock->return_status == PASS ) + { + hbs_sock.sm_server_sock->sock_ok(true); + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup SM receive socket: LO:%d", + hbs_config.sm_server_port); + return (rc) ; + } } - if ( ( rc = alarm_port_init ()) != PASS ) + /***************************************************************/ + /* UDP TX Message Socket for SM Requests; LO interface only */ + /***************************************************************/ + rc = FAIL_SOCKET_CREATE ; { - elog ("Alarm port setup or registration failed (rc:%d)\n", rc ); + /* Handle re-init case */ + if ( hbs_sock.sm_client_sock != NULL ) + { + delete (hbs_sock.sm_client_sock); + hbs_sock.sm_client_sock = NULL ; + } + + /* Create the socket */ + hbs_sock.sm_client_sock = + new msgClassTx(LOOPBACK_IP, hbs_config.sm_client_port,IPPROTO_UDP); + + /* Check the socket */ + if ( hbs_sock.sm_client_sock ) + { + if ( hbs_sock.sm_client_sock->return_status == PASS ) + { + hbs_sock.sm_client_sock->sock_ok(true); + + /* initialize cluster info */ + hbs_cluster_init ( hbsInv.hbs_pulse_period, hbs_sock.sm_client_sock ); + + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup SM transmit socket: LO:%d", + hbs_config.sm_client_port); + return (rc) ; + } } - rc = hbs_sm_sockets_init () ; - - return (rc); -} - -/* Construct the messaging sockets * - * 1. multicast transmit socket * - * 2. 
unicast receive socket */ -int hbs_ext_socket_init ( void ) -{ - int rc = PASS ; - - ilog ("external sockets init ...\n"); - /* set rx socket buffer size ro rmem_max */ int rmem_max = daemon_get_rmem_max () ; @@ -740,6 +866,12 @@ int hbs_ext_socket_init ( void ) /* Setup the pulse messaging interfaces */ SETUP_PULSE_MESSAGING ( hbsInv.infra_network_provisioned, rmem_max ) ; + if (( hbs_sock.netlink_sock = open_netlink_socket ( RTMGRP_LINK )) <= 0 ) + { + elog ("Failed to create netlink listener socket"); + rc = FAIL_SOCKET_CREATE ; + } + return (rc) ; } @@ -751,9 +883,6 @@ int hbs_pulse_request ( iface_enum iface, string hostname_clue, unsigned int lookup_clue) { -#ifdef WANT_HBS_MEM_LOGS - char str[MAX_LEN] ; -#endif int bytes = 0 ; if ( hbs_sock.tx_sock[iface] ) { @@ -766,6 +895,12 @@ int hbs_pulse_request ( iface_enum iface, /* Add the sequence number */ hbs_sock.tx_mesg[iface].s = seq_num ; + /* Add which controller initiated this pulse */ + if (hbs_ctrl.controller ) + hbs_sock.tx_mesg[iface].f |= ( hbs_ctrl.controller << CTRLX_BIT ); + + /* Add this controller's lookup_clue + * ... 
aka RRI (Resource Reference Index) */ if (( lookup_clue ) && ( hostname_clue.length() <= MAX_CHARS_HOSTNAME )) { @@ -812,25 +947,24 @@ int hbs_pulse_request ( iface_enum iface, #ifdef WANT_FIT_TESTING hbs_pulse_request_out: #endif - - mlog("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%d:%x:%s\n", - get_iface_name_str(iface), bytes, - hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), - hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), - hbs_sock.tx_mesg[iface].v, - hbs_sock.tx_mesg[iface].s, - hbs_sock.tx_mesg[iface].c, - hbs_sock.tx_mesg[iface].f, - hbs_sock.tx_mesg[iface].m); - -#ifdef WANT_HBS_MEM_LOGS - snprintf ( &str[0], MAX_LEN, "%s Pulse Req: %17s:%5d: %u:%u:%s\n", - get_iface_name_str(iface), + mlog ( "%s Pulse Req: (%d) %s:%d: s:%u f:%x [%s] RRI:%d\n", + get_iface_name_str(iface), bytes, hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.tx_mesg[iface].s, - hbs_sock.tx_mesg[iface].c, - hbs_sock.tx_mesg[iface].m); + hbs_sock.tx_mesg[iface].f, + hbs_sock.tx_mesg[iface].m, + hbs_sock.tx_mesg[iface].c); +#ifdef WANT_HBS_MEM_LOGS + char str[MAX_LEN] ; + snprintf ( &str[0], MAX_LEN, "%s Pulse Req: (%d) %s:%d: s:%u f:%x [%s] RRI:%d\n", + get_iface_name_str(iface), bytes, + hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), + hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), + hbs_sock.tx_mesg[iface].s, + hbs_sock.tx_mesg[iface].f, + hbs_sock.tx_mesg[iface].m, + hbs_sock.tx_mesg[iface].c); mem_log (&str[0]); #endif @@ -873,14 +1007,27 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) } if ( (bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 ) { - mlog1 ("%s Pulse Rsp: (%5d): %17s:%5d: %d:%d:%x:%s\n", - get_iface_name_str(iface), bytes, - hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), - hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), - hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].c, - hbs_sock.rx_mesg[iface].f, - 
hbs_sock.rx_mesg[iface].m); + /* Look for messages that are not for this controller ..... */ + if ( hbs_ctrl.controller != + ((hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT)) + { + /* This path has been verified to not get hit during cluster + * feature testing. Leaving the check/continue in just in case. + * This dlog is left commented out for easy re-enable + * for debug but has no runtime impact */ + // dlog ("controller-%d pulse not for this controller ; for controller-%d", + // hbs_ctrl.controller, + // (hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT); + continue ; + } + mlog ("%s Pulse Rsp: (%d) %s:%d: s:%d f:%x [%-27s] RRI:%d\n", + get_iface_name_str(iface), bytes, + hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), + hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), + hbs_sock.rx_mesg[iface].s, + hbs_sock.rx_mesg[iface].f, + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c); /* Validate the header */ if ( strstr ( hbs_sock.rx_mesg[iface].m, rsp_msg_header) ) @@ -907,27 +1054,27 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) #endif // mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str()); - if ( !hostname.compare("localhost") ) + if ( hostname == "localhost" ) { - mlog3 ("%s Pulse Rsp (local): %17s:%5d: %d:%d:%x:%s\n", + mlog3 ("%s Pulse Rsp (local): %s:%d: s:%d f:%x [%-27s] RRI:%d\n", get_iface_name_str(iface), hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f, - hbs_sock.rx_mesg[iface].m); + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c); } - else if ( !hostname.compare(hbsInv.my_hostname)) + else if ( hostname == hbsInv.my_hostname) { - mlog3 ("%s Pulse Rsp: (self ): %17s:%5d: %d:%d:%x:%s\n", + mlog3 ("%s Pulse Rsp: (self ): %s:%d: s:%d f:%x [%-27s] RRI:%d\n", get_iface_name_str(iface), hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), 
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f, - hbs_sock.rx_mesg[iface].m); + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c); hbsInv.manage_pulse_flags ( hostname, hbs_sock.rx_mesg[iface].f ); } @@ -935,7 +1082,6 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) { if ( hbsInv.monitored_pulse ( hostname , iface ) == true ) { - char str[MAX_LEN] ; string extra = "Rsp" ; if ( seq_num != hbs_sock.rx_mesg[iface].s ) @@ -946,7 +1092,9 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) { rc = hbsInv.remove_pulse ( hostname, iface, hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f ) ; } - snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %s:%d: %u:%u:%x:%s\n", +#ifdef WANT_HBS_MEM_LOGS + char str[MAX_LEN] ; + snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%d): %s:%d: %u:%u:%x:%s\n", get_iface_name_str(iface), extra.c_str(), bytes, hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), @@ -954,8 +1102,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f, hbs_sock.rx_mesg[iface].m); - mlog ("%s", &str[0]); -#ifdef WANT_HBS_MEM_LOGS + // mlog ("%s", &str[0]); mem_log (str); #endif if ( extra.empty()) @@ -973,21 +1120,17 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_INFRA , hbs_sock.rx_mesg[iface]); } } - else - { -ilog ("skipping my hostname"); - } } else { - mlog3 ("%s Pulse Dis: (%5d): %17s:%5d: %d:%d:%x:%s\n", + mlog3 ("%s Pulse Dis: (%d) %s:%d: seq:%d flag:%x [%-27s] RRI:%d\n", get_iface_name_str(iface), bytes, hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f, - hbs_sock.rx_mesg[iface].m); + hbs_sock.rx_mesg[iface].m, + 
hbs_sock.rx_mesg[iface].c); } } @@ -1009,7 +1152,7 @@ ilog ("skipping my hostname"); else { wlog ( "Badly formed message\n" ); - mlog ( "Bad %s Msg: %14s:%5d: %d:%s\n", + mlog ( "Bad %s Msg: %s:%d: %d:%s\n", get_iface_name_str(iface), hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), @@ -1029,14 +1172,22 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface ) int rc = PASS ; int retries = 0 ; + if ((hbs_sock.hbs_event_tx_sock == NULL ) || + (hbs_sock.hbs_event_tx_sock->sock_ok() == false )) + { + elog ("send event socket not healthy"); + return (FAIL_OPERATION); + } + mtc_message_type event ; memset (&event, 0 , sizeof(mtc_message_type)); if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS ) { - daemon_dump_membuf_banner (); + // daemon_dump_membuf_banner (); hbsInv.print_node_info (); - hbs_cluster_log( hbsInv.my_hostname, "event"); - daemon_dump_membuf (); + hbs_cluster_log ( hbsInv.my_hostname, "event", true ); + + // daemon_dump_membuf (); snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header()); } else if ( event_cmd == MTC_EVENT_LOOPBACK ) @@ -1112,7 +1263,7 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface ) /* The main heartbeat service loop */ int daemon_init ( string iface, string nodetype ) { - int rc = 10 ; + int rc = PASS ; /* Not used by this service */ UNUSED(nodetype); @@ -1128,6 +1279,7 @@ int daemon_init ( string iface, string nodetype ) /* initialize the timer */ mtcTimer_init ( hbsTimer, "controller", "heartbeat" ); + mtcTimer_init ( hbsTimer_audit, "controller", "state audit" ); /* start with no inventory */ hostname_inventory.clear(); @@ -1154,12 +1306,14 @@ int daemon_init ( string iface, string nodetype ) rc = FAIL_SIGNAL_INIT ; } +#ifdef WANT_EARLY_CONFIG /* Configure the agent */ else if ( (rc = daemon_configure ( )) != PASS ) { elog ("Daemon service configuration failed (rc:%i)\n", rc ); rc = FAIL_DAEMON_CONFIG ; } 
+#endif /* Init the heartbeat request message */ else if ( hbs_message_init ( ) != PASS ) @@ -1168,11 +1322,9 @@ int daemon_init ( string iface, string nodetype ) rc = FAIL_MESSAGE_INIT; } - /* Setup the heartbeat service messaging sockets */ - else if ((rc = hbs_int_socket_init ( )) != PASS ) + if ( daemon_is_file_present ( NODE_LOCKED_FILE )) { - elog ("internal socket initialization failed (rc:%d)\n", rc ); - return ( FAIL_SOCKET_INIT ) ; + hbs_ctrl.locked = true ; } daemon_init_fit(); @@ -1298,21 +1450,29 @@ void daemon_service_run ( void ) #endif int rc = PASS ; int counter = 0 ; - int goenabled_wait_log_throttle = 0 ; + + /* staged initialization gates */ bool goenabled = false ; + bool sockets_init = false ; - /* A variable that throttles external socket init failure retries and + /* log throttles */ + + /* A variable that throttles socket init failure retries and * ultimately triggers an exit if that retry count gets too big */ - int ext_socket_init_fail_count = 0 ; - - /* get a starting point */ - unsigned long long prev_time = gettime_monotonic_nsec (); - unsigned long long this_time = prev_time ; + int socket_init_fail_count = 0 ; /* Used to throttle warning messages that report * an error transmitting the pulse request */ int pulse_request_fail_log_counter[MAX_IFACES] ; + /* throttle initialization wait logs */ + int wait_log_throttle = 0 ; + + + /* get a starting point */ + unsigned long long prev_time = gettime_monotonic_nsec (); + unsigned long long this_time = prev_time ; + bool heartbeat_request = true ; unsigned int seq_num = 0 ; @@ -1333,15 +1493,6 @@ void daemon_service_run ( void ) hbsInv.pulse_requests[iface] = 0 ; } - /* Make the main loop schedule in real-time */ - struct sched_param param ; - memset ( ¶m, 0, sizeof(struct sched_param)); - param.sched_priority = hbs_config.scheduling_priority ; - if ( sched_setscheduler(0, SCHED_RR, ¶m) ) - { - elog ("sched_setscheduler (0, SCHED_RR, %d ) returned error (%d:%s)\n", - param.sched_priority, 
errno, strerror(errno)); - } /* Not monitoring address changes RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR */ if (( hbs_sock.ioctl_sock = open_ioctl_socket ( )) <= 0 ) @@ -1350,126 +1501,112 @@ void daemon_service_run ( void ) daemon_exit (); } - if (( hbs_sock.netlink_sock = open_netlink_socket ( RTMGRP_LINK )) <= 0 ) - { - elog ("Failed to create netlink listener socket"); - daemon_exit (); - } - /* set this controller as provisioned */ hbs_manage_controller_state ( hbsInv.my_hostname , true ); - /* CGTS 4114: Small Footprint: Alarm 200.005 remains active after connectivity restored - * - * Clear self alarms */ - hbsAlarm_clear_all ( hbsInv.my_hostname, hbsInv.infra_network_provisioned ); - - /* add this host as inventory to hbsAgent - * Although this host is not monitored for heartbeat, - * there are OOB flags in the heartbneat message that - * are needed to be extracted and locally updated */ - { - /* Scoping this so that the inv variable is freed after the add. - * No need sarying it around on the stack all the time */ - node_inv_type inv ; - - /* init the inv variable */ - node_inv_init ( inv ); - inv.name = hbsInv.my_hostname ; - inv.nodetype = CONTROLLER_TYPE ; - hbsInv.add_heartbeat_host ( inv ); - } - ilog ("Sending ready event to maintenance\n"); - do - { - /* Wait for maintenance */ - rc = send_event ( hbsInv.my_hostname, MTC_EVENT_HEARTBEAT_READY, MGMNT_IFACE ) ; - if ( rc == RETRY ) - { - mtcWait_secs ( 3 ); - } - } while ( rc == RETRY ) ; - - if ( rc == FAIL ) - { - elog ("Unrecoverable heartbeat startup error (rc=%d)\n", rc ); - daemon_exit (); - } - - /* enable the base level signal handler latency monitor */ - daemon_latency_monitor (true); - - /* load this controller index number - used for cluster stuff */ - if ( hbsInv.my_hostname == CONTROLLER_0 ) - controller_number = 0 ; - else - controller_number = 1 ; - - /* tell the cluster which controller this is and - * how many networks are being monitored */ - hbs_cluster_nums 
(controller_number,hbsInv.infra_network_provisioned ?2:1); - /* Run heartbeat service forever or until stop condition */ - for ( hbsTimer.ring = false ; ; ) + for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; ) { daemon_signal_hdlr (); - /******************************************************************* - * - * This handles hbsAgent external socket initialization in the main - * loop only after the goenabled state is reached. - * - *******************************************************************/ - if ( goenabled == false ) + if ( hbsTimer_audit.ring == true ) { - if ( hbsInv.system_type == SYSTEM_TYPE__NORMAL ) + /* the state dump is only important after daemon init */ + if ( sockets_init == true ) { - if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) == true ) - { - ilog ("GOENABLE (large system)\n"); - goenabled = true ; - } - } - else - { - if ( daemon_is_file_present ( GOENABLED_SUBF_PASS ) == true ) - { - ilog ("GOENABLE (small system)\n"); - goenabled = true ; - } + hbsInv.print_node_info(); + + hbs_state_audit (); } + /* run the first audit in 30 seconds */ + mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 ); + } + + /* handle staged initialization */ + if ( sockets_init == false ) + { if ( goenabled == false ) { - ilog_throttled ( goenabled_wait_log_throttle, 2000, "GOENABLE wait ...\n"); - usleep (50000); /* 50 msec */ - } - - if ( goenabled == true ) - { - /* Setup the heartbeat service messaging sockets */ - if ( (rc = hbs_ext_socket_init ( )) != PASS ) + if ( hbsInv.system_type == SYSTEM_TYPE__NORMAL ) { - goenabled = false ; - if ( ext_socket_init_fail_count++ == 30 ) + if ( daemon_is_file_present ( GOENABLED_MAIN_PASS )) { - elog ("external socket initialization failed (rc:%d) max retries ; exiting ...\n", rc ); - daemon_exit (); - } - else - { - elog ("external socket initialization failed (rc:%d)\n", rc ); + ilog ("GOENABLE (large system)\n"); + goenabled = true ; + wait_log_throttle = 0 ; } } else { - 
ext_socket_init_fail_count = 0 ; - goenabled_wait_log_throttle = 0 ; + if ( daemon_is_file_present ( GOENABLED_SUBF_PASS )) + { + ilog ("GOENABLE (small system)\n"); + goenabled = true ; + wait_log_throttle = 0 ; + } + } + + if ( goenabled == false ) + { + ilog_throttled ( wait_log_throttle, MTC_MINS_5, "GOENABLE wait ...\n"); + sleep (1); + continue ; + } + } + else // ( sockets_init == false ) + { + string mgmnt_iface = daemon_mgmnt_iface (); + hbs_config.mgmnt_iface = (char*)mgmnt_iface.data(); + if ( mgmnt_iface.empty() || ( mgmnt_iface == "none" )) + { + ilog_throttled ( wait_log_throttle, 5, "MGMNT wait ..."); + sleep (5); + continue ; + } + + if ( (rc = daemon_configure ( )) != PASS ) + { + elog ("Daemon service configuration failed (rc:%i)\n", rc ); + daemon_exit(); + } + + /* Setup the heartbeat sockets */ + if ( (rc = hbs_socket_init ()) != PASS ) + { + if ( socket_init_fail_count++ == 10 ) + { + elog ("Failed socket initialization (rc:%d) max retries ; exiting ...\n", rc ); + daemon_exit (); + } + else + { + elog ("Failed socket initialization (rc:%d) ; will retry in 5 secs ...\n", rc ); + sleep (5); + } + } + else + { + ilog ("Sending ready event to maintenance\n"); + do + { + /* Wait for maintenance */ + rc = send_event ( hbsInv.my_hostname, MTC_EVENT_HEARTBEAT_READY, MGMNT_IFACE ) ; + if ( rc == RETRY ) + { + mtcWait_secs ( 3 ); + } + } while ( rc == RETRY ) ; + if ( rc == FAIL ) + { + elog ("Unrecoverable heartbeat startup error (rc=%d)\n", rc ); + daemon_exit (); + } if ( get_link_state ( hbs_sock.ioctl_sock, hbs_config.mgmnt_iface, &hbsInv.mgmnt_link_up_and_running ) ) { - hbsInv.mgmnt_link_up_and_running = false ; - wlog ("Failed to query %s operational state ; defaulting to down\n", hbs_config.mgmnt_iface ); + hbsInv.mgmnt_link_up_and_running = false ; + wlog ("Failed to query %s operational state ; defaulting to down\n", hbs_config.mgmnt_iface ); } else { @@ -1488,22 +1625,81 @@ void daemon_service_run ( void ) ilog ("Infra %s link is %s\n", 
hbs_config.infra_iface, hbsInv.infra_link_up_and_running ? "Up" : "Down" ); } } + + /* Make the main loop schedule in real-time */ + { + struct sched_param param ; + memset ( &param, 0, sizeof(struct sched_param)); + param.sched_priority = hbs_config.scheduling_priority ; + if ( sched_setscheduler(0, SCHED_RR, &param) ) + { + elog ("sched_setscheduler (0, SCHED_RR, %d ) returned error (%d:%s)\n", + param.sched_priority, errno, strerror(errno)); + } + } + + /* add this host as inventory to hbsAgent + * Although this host is not monitored for heartbeat, + * there are OOB flags in the heartbeat message that + * are needed to be extracted and locally updated */ + { + /* Scoping this so that the inv variable is freed after the add. + * No need saving it around on the stack all the time */ + node_inv_type inv ; + + /* init the inv variable */ + node_inv_init ( inv ); + inv.name = hbsInv.my_hostname ; + inv.nodetype = CONTROLLER_TYPE ; + hbsInv.add_heartbeat_host ( inv ); + } + + /* enable the base level signal handler latency monitor */ + daemon_latency_monitor (true); + + /* load this controller index number - used for cluster stuff */ + if ( hbsInv.my_hostname == CONTROLLER_0 ) + hbs_ctrl.controller = 0 ; + else + hbs_ctrl.controller = 1 ; + + /* tell the cluster which controller this is and + * how many networks are being monitored */ + hbs_cluster_nums (hbs_ctrl.controller,hbsInv.infra_network_provisioned ?2:1); + + socket_init_fail_count = 0 ; + wait_log_throttle = 0 ; + sockets_init = true ; + monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START ); + + /* no need for the heartbeat audit in a simplex system */ + if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + { + /* start the state audit */ + /* run the first audit in 30 seconds */ + mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_SECS_30 ); + } } } } + /* Bypass link management & heartbeat handling prior + * to sockets being initialized */ + if ( sockets_init == false ) + continue ;
+ /* audit for forced alarms clear due to ... * - * 1. heartbeat failure action being set to none - * 2. ... future + * 1. first initialization or + * 2. heartbeat failure action being set to none * */ if ( hbs_ctrl.clear_alarms == true ) { - if ( goenabled == true ) + if ( hbsInv.active_controller ) { std::list<string>::iterator hostname_ptr ; - ilog ("clearing all heartbeat alarms for all hosts due to 'none' action"); + ilog ("clearing all heartbeat alarms"); for ( hostname_ptr = hostname_inventory.begin(); hostname_ptr != hostname_inventory.end() ; hostname_ptr++ ) @@ -1511,11 +1707,36 @@ void daemon_service_run ( void ) hbsAlarm_clear_all ( hostname_ptr->data(), hbsInv.infra_network_provisioned ); hbsInv.manage_heartbeat_clear ( hostname_ptr->data(), MAX_IFACES ); } - hbs_ctrl.clear_alarms = false ; } + hbs_ctrl.clear_alarms = false ; } /***************** Service Sockets ********************/ + if ( hbs_ctrl.audit++ == AUDIT_RATE ) + { + hbs_ctrl.audit = 0 ; + if ( daemon_is_file_present ( NODE_LOCKED_FILE )) + { + hbs_ctrl.locked = true ; + if ( hbsInv.hbs_disabled == false ) + { + hbsInv.hbs_disabled = true ; + hbsInv.hbs_state_change = true ; + ilog ("heartbeat service going disabled (locked)"); + + /* force the throttle 'still disabled' log to wait for + * the throttled count before the first log */ + counter = 1 ; + } + } + else if ( hbsInv.hbs_disabled == true ) + { + hbs_ctrl.locked = false ; + hbsInv.hbs_disabled = false; + hbsInv.hbs_state_change = true ; + ilog ("heartbeat service going enabled"); + } + } /* Initialize the master fd_set and clear socket list */ FD_ZERO(&hbs_sock.readfds); @@ -1529,38 +1750,42 @@ void daemon_service_run ( void ) FD_SET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds); } - /* Add the sm request receiver to the select list */ - if (( hbs_sock.sm_server_sock ) && - ( hbs_sock.sm_server_sock->getFD())) + if ( sockets_init ) { - socks.push_front (hbs_sock.sm_server_sock->getFD()); - FD_SET(hbs_sock.sm_server_sock->getFD(),
&hbs_sock.readfds); - } + /* Add the netlink event listener to the select list */ + if ( hbs_sock.netlink_sock ) + { + socks.push_back (hbs_sock.netlink_sock); + FD_SET(hbs_sock.netlink_sock, &hbs_sock.readfds); + } - /* Add the netlink event listener to the select list */ - if ( hbs_sock.netlink_sock ) - { - socks.push_back (hbs_sock.netlink_sock); - FD_SET(hbs_sock.netlink_sock, &hbs_sock.readfds); - } + if ( ! hbsInv.hbs_disabled ) + { + /* Add the management interface to the select list */ + if (( hbs_sock.rx_sock[MGMNT_INTERFACE] ) && + ( hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD())) + { + socks.push_back (hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD()); + FD_SET(hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD(), &hbs_sock.readfds ); + } - /* Add the management interface to the select list */ - if (( goenabled == true ) && - ( hbs_sock.rx_sock[MGMNT_INTERFACE] ) && - ( hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD())) - { - socks.push_back (hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD()); - FD_SET(hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD(), &hbs_sock.readfds ); - } + /* Add the INFRA network pulse rx socket if its provisioned and have a valid socket */ + if (( hbsInv.infra_network_provisioned == true ) && + ( hbs_sock.rx_sock[INFRA_INTERFACE] ) && + ( hbs_sock.rx_sock[INFRA_INTERFACE]->getFD())) + { + socks.push_back (hbs_sock.rx_sock[INFRA_INTERFACE]->getFD()); + FD_SET(hbs_sock.rx_sock[INFRA_INTERFACE]->getFD(), &hbs_sock.readfds ); + } + } - /* Add the INFRA network pulse rx socket if its provisioned and have a valid socket */ - if (( goenabled == true ) && - ( hbsInv.infra_network_provisioned == true ) && - ( hbs_sock.rx_sock[INFRA_INTERFACE] ) && - ( hbs_sock.rx_sock[INFRA_INTERFACE]->getFD())) - { - socks.push_back (hbs_sock.rx_sock[INFRA_INTERFACE]->getFD()); - FD_SET(hbs_sock.rx_sock[INFRA_INTERFACE]->getFD(), &hbs_sock.readfds ); + /* Add the SM receiver to the socket select list */ + if (( hbs_sock.sm_server_sock ) && + ( hbs_sock.sm_server_sock->getFD())) + { + 
socks.push_back (hbs_sock.sm_server_sock->getFD()); + FD_SET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds ); + } } monitor_scheduling ( this_time, prev_time, seq_num, SCHED_MONITOR__MAIN_LOOP ); @@ -1585,27 +1810,6 @@ void daemon_service_run ( void ) } else { - if (( goenabled == true ) && - ( hbs_sock.rx_sock[MGMNT_INTERFACE] ) && - ( FD_ISSET(hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD(), &hbs_sock.readfds))) - { - hbs_sock.fired[MGMNT_INTERFACE] = true ; - } - - if (( goenabled == true ) && - ( hbsInv.infra_network_provisioned == true ) && - ( hbs_sock.rx_sock[INFRA_INTERFACE] ) && - ( hbs_sock.rx_sock[INFRA_INTERFACE]->getFD()) && - ( FD_ISSET(hbs_sock.rx_sock[INFRA_INTERFACE]->getFD(), &hbs_sock.readfds))) - { - hbs_sock.fired[INFRA_INTERFACE] = true ; - } - - if ((hbs_sock.sm_server_sock != NULL ) && - ( FD_ISSET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds))) - { - hbs_sm_handler(); - } if ((hbs_sock.mtc_to_hbs_sock != NULL ) && ( FD_ISSET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds))) { @@ -1623,7 +1827,45 @@ void daemon_service_run ( void ) if ( !strncmp ( get_hbs_cmd_req_header(), &msg.hdr[0], MSG_HEADER_SIZE )) { string hostname = &msg.hdr[MSG_HEADER_SIZE] ; - if ( msg.cmd == MTC_CMD_ADD_HOST ) + if ( msg.cmd == MTC_CMD_ACTIVE_CTRL ) + { + bool logit = false ; + if ( hostname == hbsInv.my_hostname ) + { + if ( hbsInv.active_controller == false ) + { + logit = true ; + hbs_ctrl.clear_alarms = true ; + } + hbsInv.active_controller = true ; + } + else + { + if ( hbsInv.active_controller == true ) + logit = true ; + hbsInv.active_controller = false ; + } + if ( logit == true ) + { + ilog ("%s is %sactive", + hbsInv.my_hostname.c_str(), + hbsInv.active_controller ? 
"" : "in" ); + + /* no need for the heartbeat audit in a simplex system */ + if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + { + /* Due to activity state change we will dump + * the heartbeat cluster state at now time + * and then again in 5 seconds only to get + * the regular audit dump restarted at + * regular interval after that. */ + hbs_state_audit (); + mtcTimer_reset ( hbsTimer_audit); + mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_SECS_5 ); + } + } + } + else if ( msg.cmd == MTC_CMD_ADD_HOST ) { node_inv_type inv ; node_inv_init(inv); @@ -1634,7 +1876,8 @@ void daemon_service_run ( void ) ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), msg.parm[0] ); /* clear any outstanding alarms on the ADD */ - if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) + if (( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) && + ( hbsInv.active_controller == true )) { hbsAlarm_clear_all ( hostname, hbsInv.infra_network_provisioned ); @@ -1648,27 +1891,34 @@ void daemon_service_run ( void ) ilog ("%s deleted from heartbeat service\n", hostname.c_str()); /* clear any outstanding alarms on the DEL */ - if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) + if (( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) && + ( hbsInv.active_controller == true )) { hbsAlarm_clear_all ( hostname, - hbsInv.infra_network_provisioned ); + hbsInv.infra_network_provisioned ); } } else if ( msg.cmd == MTC_CMD_STOP_HOST ) { hbsInv.mon_host ( hostname, false, true ); hbs_cluster_del ( hostname ); - - ilog ("%s stopping heartbeat service\n", - hostname.c_str()); } else if ( msg.cmd == MTC_CMD_START_HOST ) { - hbsInv.mon_host ( hostname, true, true ); - hbs_cluster_add ( hostname ); + if ( hostname == hbsInv.my_hostname ) + { + dlog ("%s stopping heartbeat of self\n", + hostname.c_str()); - ilog ("%s starting heartbeat service\n", - hostname.c_str()); + hbsInv.mon_host ( hostname, false, true ); + hbs_cluster_del ( hostname ); + + 
} + else + { + hbs_cluster_add ( hostname ); + hbsInv.mon_host ( hostname, true, true ); + } } else if ( msg.cmd == MTC_RESTART_HBS ) { @@ -1685,8 +1935,12 @@ void daemon_service_run ( void ) } else if ( msg.cmd == MTC_BACKOFF_HBS ) { + hbsInv.hbs_pulse_period = (hbsInv.hbs_pulse_period_save * HBS_BACKOFF_FACTOR) ; ilog ("%s starting heartbeat backoff (period:%d msecs)\n", hostname.c_str(), hbsInv.hbs_pulse_period ); + + /* Send SM cluster information at start of MNFA */ + hbs_cluster_send( hbs_sock.sm_client_sock, 0 ); hbsInv.print_node_info(); } else @@ -1704,10 +1958,35 @@ void daemon_service_run ( void ) elog ("Failed receive from agent domain socket (%i)\n", bytes ); } } + + if ( ! hbsInv.hbs_disabled ) + { + if (( hbs_sock.rx_sock[MGMNT_INTERFACE] ) && + ( FD_ISSET(hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD(), &hbs_sock.readfds))) + { + hbs_sock.fired[MGMNT_INTERFACE] = true ; + } + + if (( hbsInv.infra_network_provisioned == true ) && + ( hbs_sock.rx_sock[INFRA_INTERFACE] ) && + ( hbs_sock.rx_sock[INFRA_INTERFACE]->getFD()) && + ( FD_ISSET(hbs_sock.rx_sock[INFRA_INTERFACE]->getFD(), &hbs_sock.readfds))) + { + hbs_sock.fired[INFRA_INTERFACE] = true ; + } + } + + if ((hbs_sock.sm_server_sock != NULL ) && + ( FD_ISSET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds))) + { + hbs_sm_handler(); + } + if (FD_ISSET( hbs_sock.netlink_sock, &hbs_sock.readfds)) { dlog ("netlink socket fired\n"); - if ( hbsInv.service_netlink_events ( hbs_sock.netlink_sock, hbs_sock.ioctl_sock ) != PASS ) + rc = hbsInv.service_netlink_events ( hbs_sock.netlink_sock, hbs_sock.ioctl_sock ); + if ( rc ) { elog ("service_netlink_events failed (rc:%d)\n", rc ); } @@ -1733,37 +2012,51 @@ void daemon_service_run ( void ) /* print current node inventory to the stdio */ hbsInv.print_node_info(); - } } - /* Manage enabling and disabling the heartbeat service based on - * the state of the management link. 
- * link up = run heartbeat service - * link down = disable heatbeat service and monitor the link up to re-enable - */ - else if (( hbsInv.mgmnt_link_up_and_running == false ) && - ( hbsInv.hbs_disabled == false )) + if ( hbs_ctrl.locked == false ) { - hbsInv.hbs_disabled = true ; - hbsInv.hbs_state_change = true ; - ilog ("Heartbeat disabled by %s link down event\n", hbs_config.mgmnt_iface ); - counter = 1 ; + /* Manage enabling and disabling the heartbeat service based on + * the state of the management link. + * link up = run heartbeat service + * link down = disable heatbeat service and monitor the link up to re-enable + */ + if (( hbsInv.mgmnt_link_up_and_running == false ) && + ( hbsInv.hbs_disabled == false )) + { + hbsInv.hbs_disabled = true ; + hbsInv.hbs_state_change = true ; + ilog ("Heartbeat disabled by %s link down event\n", hbs_config.mgmnt_iface ); + counter = 1 ; + } + + /* Recover heartbeat when link comes back up */ + else if (( hbsInv.mgmnt_link_up_and_running == true ) && + ( hbsInv.hbs_disabled == true )) + { + hbsInv.hbs_disabled = false ; + hbsInv.hbs_state_change = true ; + ilog ("Heartbeat Enabled by %s link up event\n", hbs_config.mgmnt_iface ); + counter = 1 ; + } + + else if ( hbsInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE ) + { + wlog_throttled (counter, 100000, "Heartbeat disabled with action=none\n"); + usleep (50000) ; + continue ; + } } - /* Recover heartbeat when link comes back up */ - else if (( hbsInv.mgmnt_link_up_and_running == true ) && - ( hbsInv.hbs_disabled == true )) + /* go to sleep if disabled */ + else if ( hbsInv.hbs_disabled == true ) { - hbsInv.hbs_disabled = false ; - hbsInv.hbs_state_change = true ; - ilog ("Heartbeat Enabled by %s link up event\n", hbs_config.mgmnt_iface ); - counter = 1 ; - } + wlog_throttled (counter, 100000, + "Heartbeat service still disabled %s", + hbs_ctrl.locked ? 
"(locked)" : ""); - else if ( hbsInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE ) - { - wlog_throttled (counter, 100000, "Heartbeat disabled by 'none' action\n"); + hbsInv.hbs_state_change = false ; usleep (50000) ; continue ; } @@ -1784,14 +2077,6 @@ void daemon_service_run ( void ) hbsInv.print_node_info(); } - /* go to sleep if disabled */ - if ( hbsInv.hbs_disabled == true ) - { - wlog_throttled (counter, 1000, "Heartbeat service still disabled\n"); - usleep (50000) ; - continue ; - } - /* Be sure state change flag is cleared */ hbsInv.hbs_state_change = false ; counter = 0 ; @@ -2010,7 +2295,6 @@ void daemon_service_run ( void ) } hbsTimer.ring = false ; heartbeat_request = true ; - // hbs_cluster_log ( hbsInv.my_hostname, "->") ; seq_num++ ; } daemon_load_fit (); @@ -2024,6 +2308,7 @@ void daemon_dump_info ( void ) daemon_dump_membuf_banner (); hbsInv.print_node_info (); + hbs_state_audit(); hbsInv.memDumpAllState (); #ifdef WANT_HBS_MEM_LOGS @@ -2059,7 +2344,7 @@ int daemon_run_testhead ( void ) else PASSED ; } - free(hbsInv_testhead_ptr); + delete(hbsInv_testhead_ptr); printf (TESTHEAD_BAR); printf ("| Heartbeat Service Test Head\n"); diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index 264eba57..b679fadf 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -47,6 +47,9 @@ /** Maximum service fail count before action */ #define MAX_FAIL_COUNT (1) +/** Audit Rate/Count */ +#define AUDIT_RATE (9) + /** Heartbeat pulse request/response message header byte size */ #define HBS_HEADER_SIZE (15) @@ -60,13 +63,16 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"}; #define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME) -#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info +#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info /* Heartbeat control structure */ typedef struct { + unsigned int controller ; + unsigned int audit ; unsigned int nodetype ; bool 
clear_alarms ; + bool locked ; } hbs_ctrl_type ; hbs_ctrl_type * get_hbs_ctrl_ptr ( void ); @@ -218,22 +224,17 @@ void hbs_utils_init ( void ); /* network enum to name lookup */ string hbs_cluster_network_name ( mtce_hbs_network_enum network ); -/* Produce formatted clog's that characterize current and changing cluster - * history for a given network. Each log is controller/network specific. */ -void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix ); - /* Initialize the specified history array */ void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history ); /* Clear all history in the cluster vault */ void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster ); - /******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/ /* Set the cluster vault to default state. * Called upon daemon init or heartbeat period change. */ -void hbs_cluster_init ( unsigned short period ); +void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr ); /* Calculate number of bytes that is unused in the cluster data structure. * Primarily to know how many history elements are missing. */ @@ -286,7 +287,9 @@ void hbs_cluster_append ( hbs_message_type & msg ); /* Produce formatted clog's that characterize current and changing cluster * history for a given network. Each log is controller/network specific. 
*/ -void hbs_cluster_log ( string & hostname, string prefix ); +void hbs_cluster_log ( string & hostname, string prefix, bool force=false ); +void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix, bool force=false ); + /* Service SM cluster info request */ void hbs_sm_handler ( void ); @@ -294,8 +297,14 @@ void hbs_sm_handler ( void ); /* send the cluster vault to SM */ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid ); +/* copy cluster data from src to dst */ +void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst ); + /* print the contents of the vault */ -void hbs_cluster_dump ( mtce_hbs_cluster_type & vault ); +void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force ); + +/* Heartbeat service state audit */ +void hbs_state_audit ( void ); /** * @} hbs_base diff --git a/mtce/src/heartbeat/hbsClient.cpp b/mtce/src/heartbeat/hbsClient.cpp index 444dae29..c0dbad8d 100644 --- a/mtce/src/heartbeat/hbsClient.cpp +++ b/mtce/src/heartbeat/hbsClient.cpp @@ -66,6 +66,8 @@ extern "C" #include "amon.h" /* for ... active monitoring utilities */ } +#define MAX_LEN (300) + /* Where to send events */ string mtcAgent_ip = "" ; @@ -96,12 +98,17 @@ typedef struct static char pulse_resp_tx_hdr [HBS_MAX_MSG]; static char my_hostname [MAX_HOST_NAME_SIZE+1]; +static string hostname = "" ; static char my_hostname_length ; static string my_macaddr = "" ; static string my_address = "" ; static unsigned int my_nodetype= CGTS_NODE_NULL ; static stallMon_type stallMon ; +/* Cached Cluster view from controllers */ +mtce_hbs_cluster_type controller_cluster_cache[MTCE_HBS_MAX_CONTROLLERS]; + + void daemon_sigchld_hdlr ( void ) { ; /* dlog("Received SIGCHLD ... 
no action\n"); */ @@ -407,16 +414,17 @@ int daemon_configure ( void ) else { ilog("Realtime Pri: FIFO/%i \n", hbs_config.scheduling_priority ); - ilog("Multicast: %s\n", hbs_config.multicast ); + ilog("Multicast : %s\n", hbs_config.multicast ); hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface ); - ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface ); - ilog("Mgmnt RxPort: %d\n", hbs_config.hbs_client_mgmnt_port ); - ilog("Mgmnt TxPort: %d\n", hbs_config.hbs_agent_mgmnt_port ); + ilog("Mgmnt Name : %s\n", hbs_config.mgmnt_iface ); + ilog("Mgmnt Port : %d (rx)", hbs_config.hbs_client_mgmnt_port ); + ilog("Mgmnt Port : %d (tx)", hbs_config.hbs_agent_mgmnt_port ); get_iface_macaddr ( hbs_config.mgmnt_iface, my_macaddr ); get_iface_address ( hbs_config.mgmnt_iface, my_address, true ); get_hostname ( &my_hostname[0], MAX_HOST_NAME_SIZE ); + hostname = my_hostname ; /* Fetch the infrastructure interface name. * calls daemon_get_iface_master inside so the @@ -427,11 +435,14 @@ int daemon_configure ( void ) if (strcmp(hbs_config.infra_iface, hbs_config.mgmnt_iface)) { infra_network_provisioned = true ; - ilog ("Infra iface : %s\n", hbs_config.infra_iface ); + ilog ("Infra Name : %s\n", hbs_config.infra_iface ); } } - ilog("Infra RxPort: %d\n", hbs_config.hbs_client_infra_port ); - ilog("Infra TxPort: %d\n", hbs_config.hbs_agent_infra_port ); + if ( infra_network_provisioned == true ) + { + ilog("Infra Port : %d (rx)", hbs_config.hbs_client_infra_port ); + ilog("Infra Port : %d (tx)", hbs_config.hbs_agent_infra_port ); + } /* initialize the stall detection monitor */ stallMon_init (); @@ -663,7 +674,37 @@ int get_pmon_pulses ( void ) return (pulses); } -static unsigned int my_rri = 0 ; +/************************************************************* + * + * Name : have_other_controller_history + * + * Description: returns true if there is cached history for any + * controller number other than this one supplied. 
+ * + *************************************************************/ + +bool have_other_controller_history ( unsigned short controller ) +{ + if ( controller < MTCE_HBS_MAX_CONTROLLERS ) + { + /* look for history for any controller other than the one specified */ + for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ ) + { + /* skip specified controller */ + if ( c != controller ) + { + if ( controller_cluster_cache[c].histories ) + { + return true ; + } + } + } + } + return false ; +} + + +static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ; /************************************************************* * @@ -766,12 +807,13 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) daemon_config_type * cfg_ptr = daemon_get_cfg_ptr(); if ( cfg_ptr->debug_msg ) { - mlog ("\n"); - mlog ("%s Pulse Req: %s:%5d: %d:%s RRI:%d\n", + mlog (" "); + mlog ("%s Pulse Req: %s:%d s:%d f:%x [%s] RRI:%d\n", get_iface_name_str(iface), hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, + hbs_sock.rx_mesg[iface].f, hbs_sock.rx_mesg[iface].m, hbs_sock.rx_mesg[iface].c); } @@ -787,19 +829,9 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) return (FAIL_MSG_HEADER) ; } - - /* Manage the Resource Reference Index (RRI) "lookup clue" */ - if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME )) - { - if( my_rri!= hbs_sock.rx_mesg[iface].c ) - { - my_rri = hbs_sock.rx_mesg[iface].c ; - ilog ("%s Caching New RRI: %d\n", &my_hostname[0], my_rri ); - } - } - - /* Add my RRI to the response message */ - hbs_sock.rx_mesg[iface].c = my_rri ; + /* Update local copy for the controller this pulse came from */ + /* ... before the flags are cleared and setup for the reply. 
*/ + unsigned int controller = (hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT ; /* Manage OOB flags */ hbs_sock.rx_mesg[iface].f = flags ; @@ -807,23 +839,102 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) { hbs_sock.rx_mesg[iface].f |= ( PMOND_FLAG ) ; } + if ( infra_network_provisioned == true ) { hbs_sock.rx_mesg[iface].f |= INFRA_FLAG ; } -#define WANT_CLUSTER_INFO_LOG -#ifdef WANT_CLUSTER_INFO_LOG - /* Log the received cluster info */ - if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION ) + /************************************************************************* + ***** C L U S T E R D A T A M A N A G E M E N T ****** + * * + * TODO: Add support for 3 controllers. + * Only 2 suppoerted by some of this code. + ***** ******/ + + if ( controller >= MTCE_HBS_MAX_CONTROLLERS ) { - char str[100] ; - // hbs_cluster_log (hbs_sock.rx_mesg[iface].cluster, hbs_sock.rx_mesg[iface].s ); - snprintf ( &str[0], 100, " seq %6d with %d bytes from %s ", hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface)); - string hostname = my_hostname ; - hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str ); + wlog ("invalid controller number: %d ; dropping message", controller ); + return ( FAIL_INVALID_DATA ); } -#endif + + /* Manage the Resource Reference Index (RRI) "lookup clue" + * With the introduction of active-active heartbeating the hbsClient + * is responsible for servicing pulses from both controllers. + * This means that hbsClient needs to manage an rri for each controller. */ + if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME )) + { + if( rri[controller] != hbs_sock.rx_mesg[iface].c ) + { + rri[controller] = hbs_sock.rx_mesg[iface].c ; + ilog ("Caching New RRI: %d (from controller-%d)\n", rri[controller], controller ); + } + } + + /* Log the received cluster info + * ... 
if the message version shows that it is supported */ + if ( hbs_sock.rx_mesg[iface].v ) + { + char str[MAX_LEN] ; + snprintf ( &str[0], MAX_LEN, " seq %6d with %d bytes from %s ", (int)hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface)); + hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str ); + + /* add the controller back in */ + hbs_sock.rx_mesg[iface].f |= ( controller << CTRLX_BIT ); + + /* Add my RRI to the response message */ + hbs_sock.rx_mesg[iface].c = rri[controller] ; + + if ( hbs_sock.rx_mesg[iface].cluster.histories > MTCE_HBS_MAX_NETWORKS ) + { + slog ("controller-%d provided %d network histories ; max is %d per controller", + controller, + hbs_sock.rx_mesg[iface].cluster.histories, + MTCE_HBS_MAX_NETWORKS ); + } + else if ( hbs_sock.rx_mesg[iface].cluster.bytes != ( BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories))) + { + slog ("controller-%d provided %d bytes of history ; expected %d", + controller, + hbs_sock.rx_mesg[iface].cluster.bytes, + (unsigned short)(BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories))); + } + else if ( hbs_sock.rx_mesg[iface].cluster.histories ) + { + hbs_cluster_copy ( hbs_sock.rx_mesg[iface].cluster, + controller_cluster_cache[controller] ); + clog1 ("controller-%d cluster info from %s pulse request saved to cache", + controller, get_iface_name_str(iface)); + + hbs_sock.rx_mesg[iface].cluster.histories = 0 ; + + if ( have_other_controller_history ( controller ) == true ) + { + /* Now copy the other controller's cached cluster info into + * this controller's response */ + hbs_cluster_copy ( controller_cluster_cache[controller?0:1], + hbs_sock.rx_mesg[iface].cluster ); + + if ( daemon_get_cfg_ptr()->debug_state & 4 ) + { + string dump_banner = "" ; + dump_banner.append("controller-") ; + dump_banner.append(itos(controller?0:1)); + dump_banner.append(" cluster info from cache injected into controller-"); + dump_banner.append(itos(controller)); + dump_banner.append(":");
+ dump_banner.append(get_iface_name_str(iface)); + dump_banner.append(" pulse response"); + hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner, true ); + } + } + } + } + + /* Cluster Data management end */ + + /* replace the request header with the response header */ + memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG ); #ifdef WANT_PULSE_RESPONSE_FIT if (( iface == INFRA_IFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_INFRA_RSP ))) @@ -839,29 +950,11 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) } #endif - int rc = PASS ; - - /* replace the request header with the response header */ - memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG ); - - /* Deal with the cluster info if it exists. - * ... Introduced in messaging version 1 */ - if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION ) - { - if ( hbs_sock.rx_mesg[iface].cluster.version < MTCE_HBS_CLUSTER_VERSION ) - { - ilog ("Bad cluster verison (%d)", hbs_sock.rx_mesg[iface].cluster.version); - } - // if ( hbs_sock.rx_mesg[iface].cluster.revision != MTCE_HBS_CLUSTER_REVISION ) - // { - // ilog ("Bad cluster revision (%d)", hbs_sock.rx_mesg[iface].cluster.revision); - // } - - /* Add peer controller cluster data to this controller's response */ - // hbs_cluster_loop(hbs_sock.rx_mesg[iface]); - } + /* reuse the rx_bytes variable */ + rx_bytes = sizeof(hbs_message_type)-sizeof(mtce_hbs_cluster_type)+BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories); /* send pulse response message */ + int rc = PASS ; int tx_bytes = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.rx_mesg[iface], rx_bytes); if ( tx_bytes == -1 ) { @@ -884,15 +977,15 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) } else { - mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d:%d:%d)\n", - get_iface_name_str(iface), - hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), - 
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), - hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].f, - hbs_sock.rx_mesg[iface].m, - hbs_sock.rx_mesg[iface].c, - pmonPulse_counter, rx_bytes, tx_bytes); + mlog ("%s Pulse Rsp: %s:%d: s:%d f:%x [%s] RRI:%d (%x:%d:%d)\n", + get_iface_name_str(iface), + hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), + hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), + hbs_sock.rx_mesg[iface].s, + hbs_sock.rx_mesg[iface].f, + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c, + pmonPulse_counter, rx_bytes, tx_bytes); } /* Clear the error count since we got a good receive */ @@ -984,6 +1077,10 @@ int daemon_init ( string iface, string nodeType_str ) /* Initialize socket construct and pointer to it */ memset ( &hbs_sock, 0, sizeof(hbs_sock)); + /* Initialize the controller cluster view data bounce structure */ + for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ ) + memset ( &controller_cluster_cache[c], 0, sizeof(mtce_hbs_cluster_type)) ; + /* init the utility module */ hbs_utils_init (); @@ -1007,6 +1104,11 @@ int daemon_init ( string iface, string nodeType_str ) /* convert node type to integer */ my_nodetype = get_host_function_mask ( nodeType_str ) ; + if ( my_nodetype & CONTROLLER_TYPE ) + { + /* is controller but don't know what one yet. 
*/ + set_hn((char*)CONTROLLER_X); + } ilog ("Node Type : %s (%d)\n", nodeType_str.c_str(), my_nodetype ); /* Bind signal handlers */ @@ -1058,7 +1160,6 @@ int daemon_init ( string iface, string nodeType_str ) int stall_threshold_log = 0 ; int stall_times_threshold_log = 0 ; -#define MAX_LEN 300 void daemon_service_run ( void ) { #ifdef WANT_DAEMON_DEBUG @@ -1205,7 +1306,7 @@ void daemon_service_run ( void ) int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type)); if ( bytes ) { - hbs_cluster_dump (msg); + hbs_cluster_dump (msg, "Cluster info received", true ); } } #endif diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp index 3789541f..85b8f363 100644 --- a/mtce/src/heartbeat/hbsCluster.cpp +++ b/mtce/src/heartbeat/hbsCluster.cpp @@ -64,11 +64,19 @@ typedef struct /* The working heartbeat cluster data vault. */ mtce_hbs_cluster_type cluster ; + bool cluster_change ; + int cluster_change_threshold_count ; + int cluster_change_difference_count ; + + msgClassSock * sm_socket_ptr ; + } hbs_cluster_ctrl_type ; /* Cluster control structire construct allocation. 
*/ static hbs_cluster_ctrl_type ctrl ; +#define STORAGE_0_NR_THRESHOLD (4) +#define CLUSTER_CHANGE_THRESHOLD (50000) /**************************************************************************** * @@ -80,7 +88,7 @@ static hbs_cluster_ctrl_type ctrl ; * ***************************************************************************/ -void hbs_cluster_init ( unsigned short period ) +void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr ) { ctrl.monitored_hosts = 0; ctrl.monitored_hostname_list.clear(); @@ -104,13 +112,17 @@ void hbs_cluster_init ( unsigned short period ) for ( int h = 0 ; h < MTCE_HBS_MAX_HISTORY_ELEMENTS ; h++ ) hbs_cluster_history_init ( ctrl.cluster.history[h] ); - ilog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)", + clog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)", ctrl.cluster.version, ctrl.cluster.revision, ctrl.cluster.magic_number, ctrl.cluster.bytes, sizeof(mtce_hbs_cluster_history_type)); + if ( sm_socket_ptr ) + { + ctrl.sm_socket_ptr = sm_socket_ptr ; + } ctrl.log_throttle = 0 ; } @@ -140,7 +152,7 @@ void hbs_cluster_nums ( unsigned short this_controller, /**************************************************************************** * - * Name : log_monitored_hosts_list + * Name : cluster_list * * Description : Log the list of monitored hosts. * Typically done on a list change. 
@@ -149,7 +161,7 @@ void hbs_cluster_nums ( unsigned short this_controller, * ***************************************************************************/ -void log_monitored_hosts_list ( void ) +void cluster_list ( void ) { std::list::iterator iter_ptr ; string list = "" ; @@ -160,9 +172,7 @@ void log_monitored_hosts_list ( void ) list.append (*(iter_ptr)); list.append (" "); } - ilog ("cluster of %ld: %s", - ctrl.monitored_hostname_list.size(), - list.c_str()); + ilog ("cluster: %s", list.c_str()); } @@ -186,6 +196,7 @@ void cluster_storage0_state ( bool enabled ) ctrl.cluster.storage0_enabled = enabled ; ilog ("storage-0 heartbeat state changed to %s", enabled ? "enabled" : "disabled" ); + ctrl.cluster_change = true ; } } @@ -237,13 +248,30 @@ void hbs_manage_controller_state ( string & hostname, bool enabled ) void hbs_cluster_add ( string & hostname ) { - /* Consider using 'unique' after instead of remove before update. */ - ctrl.monitored_hostname_list.remove(hostname) ; - ctrl.monitored_hostname_list.push_back(hostname) ; - ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); + bool already_in_list = false ; + std::list::iterator hostname_ptr ; + for ( hostname_ptr = ctrl.monitored_hostname_list.begin(); + hostname_ptr != ctrl.monitored_hostname_list.end() ; + hostname_ptr++ ) + { + if ( hostname_ptr->compare(hostname) == 0 ) + { + already_in_list = true ; + break ; + } + } + + if ( already_in_list == false ) + { + ctrl.monitored_hostname_list.push_back(hostname) ; + ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); + ilog ("%s added to cluster", hostname.c_str()); + cluster_list (); + ctrl.cluster_change = true ; + } /* Manage storage-0 state */ - if ( hostname == STORAGE_0 ) + if ( hostname.compare(STORAGE_0) == 0 ) { cluster_storage0_state ( true ); } @@ -251,15 +279,18 @@ void hbs_cluster_add ( string & hostname ) /* If we get down to 0 monitored hosts then just start fresh */ if (( ctrl.monitored_hosts ) 
== 0 ) { - hbs_cluster_init ( ctrl.cluster.period_msec ); + hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); } /* Manage controller state ; true means enabled in this case. */ hbs_manage_controller_state ( hostname, true ); - ilog ("%s added to cluster", hostname.c_str()); + if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr )) + { + hbs_cluster_send( ctrl.sm_socket_ptr, 0 ); + ctrl.cluster_change = false ; + } - log_monitored_hosts_list (); } /**************************************************************************** @@ -281,27 +312,46 @@ void hbs_cluster_add ( string & hostname ) void hbs_cluster_del ( string & hostname ) { - ctrl.monitored_hostname_list.remove(hostname) ; - ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); - - /* Manage storage-0 state. */ - if ( hostname == STORAGE_0 ) + std::list::iterator hostname_ptr ; + for ( hostname_ptr = ctrl.monitored_hostname_list.begin(); + hostname_ptr != ctrl.monitored_hostname_list.end() ; + hostname_ptr++ ) { - cluster_storage0_state ( false ); + if ( hostname_ptr->compare(hostname) == 0 ) + { + ctrl.monitored_hostname_list.remove(hostname) ; + ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); + + /* Manage storage-0 state. */ + if ( hostname.compare(STORAGE_0) == 0 ) + { + cluster_storage0_state ( false ); + } + + /* If we get down to 0 monitored hosts then just start fresh */ + if (( ctrl.monitored_hosts ) == 0 ) + { + hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); + } + + /* Manage controller state ; false means not enabled in this case. 
*/ + hbs_manage_controller_state ( hostname , false ); + + ilog ("%s deleted from cluster", hostname.c_str()); + + cluster_list (); + + ctrl.cluster_change = true ; + + break ; + } } - /* If we get down to 0 monitored hosts then just start fresh */ - if (( ctrl.monitored_hosts ) == 0 ) + if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr )) { - hbs_cluster_init ( ctrl.cluster.period_msec ); + hbs_cluster_send( ctrl.sm_socket_ptr, 0 ); + ctrl.cluster_change = false ; } - - /* Manage controller state ; false means not enabled in this case. */ - hbs_manage_controller_state ( hostname , false ); - - ilog ("%s deleted from cluster", hostname.c_str()); - - log_monitored_hosts_list (); } /**************************************************************************** @@ -309,7 +359,7 @@ void hbs_cluster_del ( string & hostname ) * Name : hbs_cluster_update * * Description : Update this controller's cluster info for the specified - * network with + * network with ... * * 1. The number of enabled hosts. * 2. The number of responding hosts. 
@@ -333,7 +383,6 @@ void hbs_cluster_del ( string & hostname ) * ***************************************************************************/ -#define STORAGE_0_NR_THRESHOLD (4) void hbs_cluster_update ( iface_enum iface, unsigned short not_responding_hosts, @@ -357,7 +406,7 @@ void hbs_cluster_update ( iface_enum iface, if ( not_responding_hosts ) { - clog1 ("controller-%d %s enabled:%d not responding:%d", + clog ("controller-%d %s enabled:%d not responding:%d", ctrl.this_controller, hbs_cluster_network_name(n).c_str(), ctrl.monitored_hosts, @@ -365,7 +414,7 @@ void hbs_cluster_update ( iface_enum iface, } else { - clog1 ("controller-%d %s has %d monitored hosts and all are responding", + clog ("controller-%d %s has %d monitored hosts and all are responding", ctrl.this_controller, hbs_cluster_network_name(n).c_str(), ctrl.monitored_hosts); @@ -394,9 +443,11 @@ void hbs_cluster_update ( iface_enum iface, history_ptr->network = n ; /* Log new network history as its being started. */ - ilog ("controller-%d %s network history add", + ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views", ctrl.this_controller, - hbs_cluster_network_name(n).c_str()); + ctrl.this_controller, + hbs_cluster_network_name(n).c_str(), + ctrl.cluster.histories); } } @@ -457,7 +508,9 @@ void hbs_cluster_update ( iface_enum iface, * ... which is the index for the next entry. */ unsigned short last_entry_index ; - if ( history_ptr->oldest_entry_index == 0 ) + unsigned short oldest_entry_index = history_ptr->oldest_entry_index ; + + if ( oldest_entry_index == 0 ) { /* Go to the end of the array. */ last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ; @@ -465,43 +518,88 @@ void hbs_cluster_update ( iface_enum iface, else { /* Otherwise, the previous index in the array */ - last_entry_index = history_ptr->oldest_entry_index - 1 ; + last_entry_index = oldest_entry_index - 1 ; } - /* Update the history with this data. 
*/ - history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ; - history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ; + bool logit = false ; + string logit_reason = "" ; - if (( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled != - history_ptr->entry[ last_entry_index].hosts_enabled ) || - ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding != - history_ptr->entry[ last_entry_index].hosts_responding)) + /* Update the history with this data. */ + history_ptr->entry[oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ; + history_ptr->entry[oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ; + + if (( history_ptr->entry[oldest_entry_index].hosts_enabled != + history_ptr->entry[ last_entry_index].hosts_enabled ) || + ( history_ptr->entry[oldest_entry_index].hosts_responding != + history_ptr->entry[ last_entry_index].hosts_responding)) { /* Only log on change events. 
*/ - if ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled == - history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding ) + if ( history_ptr->entry[oldest_entry_index].hosts_enabled == + history_ptr->entry[oldest_entry_index].hosts_responding ) { ilog ("controller-%d %s cluster of %d is healthy", ctrl.this_controller, hbs_cluster_network_name(n).c_str(), - history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled); + history_ptr->entry[oldest_entry_index].hosts_enabled); + ctrl.cluster_change_threshold_count = 0 ; + ctrl.cluster_change_difference_count = 0 ; } else { - ilog ("controller-%d %s cluster of %d with %d responding", - ctrl.this_controller, - hbs_cluster_network_name(n).c_str(), - history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled, - history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding); + ctrl.cluster_change_threshold_count++ ; + ctrl.cluster_change_difference_count = + history_ptr->entry[oldest_entry_index].hosts_enabled - + history_ptr->entry[oldest_entry_index].hosts_responding ; } } + if ( daemon_get_cfg_ptr()->debug_state&4 ) + { + logit = true ; + logit_reason = "(debug)" ; + } +// else if (( ctrl.cluster_change_threshold_count == 1 ) && +// ( cluster_change == false )) +// { +// logit = true ; +// logit_reason = "" ; +// } + else if ( ctrl.cluster_change_threshold_count >= CLUSTER_CHANGE_THRESHOLD ) + { + logit = true ; + ctrl.cluster_change_threshold_count = 0 ; + logit_reason = "(threshold)" ; + } + else + { + int delta = + history_ptr->entry[oldest_entry_index].hosts_enabled - + history_ptr->entry[oldest_entry_index].hosts_responding ; + if ( delta != ctrl.cluster_change_difference_count ) + { + logit = true ; + ctrl.cluster_change_difference_count = delta ; + logit_reason = "(delta)" ; + } + } + + if ( logit ) + { + ilog ("controller-%d %s cluster of %d with %d responding (%d:%d) %s", + ctrl.this_controller, + hbs_cluster_network_name(n).c_str(), + 
history_ptr->entry[oldest_entry_index].hosts_enabled, + history_ptr->entry[oldest_entry_index].hosts_responding, + ctrl.cluster_change_difference_count, + not_responding_hosts, + logit_reason.c_str()); + } /* Increment the entries count till it reaches the max. */ if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES ) history_ptr->entries++ ; /* Manage the next entry update index ; aka the oldest index. */ - if ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1)) + if ( oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1)) history_ptr->oldest_entry_index = 0 ; else history_ptr->oldest_entry_index++ ; @@ -521,24 +619,31 @@ void hbs_cluster_update ( iface_enum iface, void hbs_cluster_append ( hbs_message_type & msg ) { - unsigned short c = ctrl.this_controller ; - - CHECK_CTRL_NTWK_PARMS(c, ctrl.monitored_networks); + CHECK_CTRL_NTWK_PARMS(ctrl.this_controller, ctrl.monitored_networks); msg.cluster.version = ctrl.cluster.version ; msg.cluster.revision = ctrl.cluster.revision ; msg.cluster.magic_number = ctrl.cluster.magic_number ; msg.cluster.period_msec = ctrl.cluster.period_msec ; msg.cluster.storage0_enabled = ctrl.cluster.storage0_enabled ; - msg.cluster.histories = ctrl.cluster.histories ; + msg.cluster.histories = 0 ; - int bytes = BYTES_IN_CLUSTER_VAULT(ctrl.monitored_networks); + /* Copy this controller's cluster history into the broadcast request. */ + for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) + { + if ( ctrl.cluster.history[h].controller == ctrl.this_controller ) + { + memcpy( &msg.cluster.history[msg.cluster.histories], + &ctrl.cluster.history[h], + sizeof(mtce_hbs_cluster_history_type)); - clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)", - c, ctrl.monitored_networks, ctrl.cluster.histories, bytes ); + msg.cluster.histories++ ; + } + } + msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories); - /* Copy the cluster into the message. 
*/ - memcpy( &msg.cluster.history[0], &ctrl.cluster.history[c], bytes); + clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)", + ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes ); } /**************************************************************************** @@ -574,57 +679,8 @@ unsigned short hbs_cluster_unused_bytes ( void ) * ***************************************************************************/ -/* NOTE: All code wrapped in this directive will be removed once - * active/active heartbeating is delivered in next update */ -#define WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS - void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid ) { - -#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS - - /* To assist SM with duplex integration ... - * - * This code emulates heartbeat redundancy by duplicating - * controller history up to the number of provisioned - * controllers until active-active heartbeat is delivered. - */ - int peer_controller ; - bool copy_cluster = false ; - if ( ctrl.this_controller == 0 ) - { - peer_controller = 1 ; - if ( ctrl.controller_1_enabled ) - { - copy_cluster = true ; - } - } - else - { - peer_controller = 0 ; - if ( ctrl.controller_0_enabled ) - { - copy_cluster = true ; - } - } - - int n, networks = ctrl.cluster.histories ; - if ( copy_cluster ) - { - for ( n = 0 ; n < networks ; n++ ) - { - /* copy this controller history to create peer controller */ - ctrl.cluster.history[ctrl.cluster.histories] = ctrl.cluster.history[n] ; - - /* update the controller */ - ctrl.cluster.history[ctrl.cluster.histories].controller = peer_controller ; - ctrl.cluster.bytes += sizeof(mtce_hbs_cluster_history_type) ; - ctrl.cluster.histories++ ; - } - } - -#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS - ctrl.cluster.reqid = (unsigned short)reqid ; if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true )) { @@ -637,34 +693,82 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, 
int reqid ) } else { - ilog ("heartbeat cluster vault sent to SM (%d bytes)", len ); - hbs_cluster_dump ( ctrl.cluster ); + string reason = "" ; + // ilog ("heartbeat cluster vault sent to SM (%d bytes)", len ); + if ( reqid ) + reason = "cluster query" ; + else + reason = "cluster event" ; + hbs_cluster_dump ( ctrl.cluster, reason, true ); } } - -#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS - - if ( copy_cluster ) + else { - /* Clear out the other controllers data. */ - for ( n = networks ; n > 0 ; n-- ) + wlog ("cannot send cluster info due to socket error"); + } +} + +/**************************************************************************** + * + * Name : hbs_history_save + * + * Descrition : Copy the history sample to the vault. + * + * Returns : Nothing. + * + ***************************************************************************/ + +void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample ) +{ + for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) + { + if (( ctrl.cluster.history[h].controller == sample.controller ) && + ( ctrl.cluster.history[h].network == sample.network )) { - /* copy c0 history to another controller */ - hbs_cluster_history_init(ctrl.cluster.history[ctrl.cluster.histories-1]); - ctrl.cluster.bytes -= sizeof(mtce_hbs_cluster_history_type); - ctrl.cluster.histories-- ; + memcpy( &ctrl.cluster.history[h], &sample, + sizeof(mtce_hbs_cluster_history_type)); + + clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)", + ctrl.this_controller, + sample.controller, + hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(), + hostname.c_str(), + ctrl.cluster.histories); + return ; } } -#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS + /* not found ? 
Add a new one */ + memcpy( &ctrl.cluster.history[ctrl.cluster.histories], &sample, + sizeof(mtce_hbs_cluster_history_type)); + ctrl.cluster.histories++ ; + ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories); + ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views", + ctrl.this_controller, + sample.controller, + hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(), + ctrl.cluster.histories); } +void hbs_state_audit ( void ) +{ + hbs_cluster_dump ( ctrl.cluster, "Audit", true ); +} + + void hbs_cluster_log ( string & hostname, string prefix ) { hbs_cluster_log ( hostname, ctrl.cluster, prefix ); } +void hbs_cluster_log ( string & hostname, + string log_prefix, + bool force ) +{ + hbs_cluster_log (hostname, ctrl.cluster, log_prefix, force ); +} + /**************************************************************************** * * Active Active Heartbeating and Debug Member Functions @@ -724,10 +828,6 @@ int hbs_cluster_cmp( hbs_message_type & msg ) * Descrition : Copies the other controllers information from msg into * the cluster. * - * NOTE: Does not do that right now. - * - * Assumptions : Place holder until active/active heartbeating is implemented. - * * Returns : PASS or FAIL * ***************************************************************************/ @@ -736,12 +836,29 @@ int hbs_cluster_save ( string & hostname, mtce_hbs_network_enum network, hbs_message_type & msg ) { - // clog ("Add cluster info from peer controller"); - if ( ctrl.monitored_hosts ) + /* cluster info is only supported in HBS_MESSAGE_VERSION 1 */ + if ( msg.v < HBS_MESSAGE_VERSION ) + return FAIL_NOT_SUPPORTED ; + + if ( ! 
ctrl.monitored_hosts ) + return RETRY ; + + if ( msg.cluster.histories == 0 ) + return PASS ; + + for ( int h = 0 ; h < msg.cluster.histories ; h++ ) { - /* compare cluster info and log deltas */ - // hbs_cluster_cmp( msg ); - UNUSED(msg); + if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS ) + { + elog ("Invalid network id (%d:%d:%d)", + h, + msg.cluster.history[h].controller, + msg.cluster.history[h].network ); + } + else if ( msg.cluster.history[h].controller != ctrl.this_controller ) + { + hbs_history_save ( hostname, msg.cluster.history[h] ); + } hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) ); } return (PASS); diff --git a/mtce/src/heartbeat/hbsStubs.cpp b/mtce/src/heartbeat/hbsStubs.cpp index 70b25df3..fbd68067 100644 --- a/mtce/src/heartbeat/hbsStubs.cpp +++ b/mtce/src/heartbeat/hbsStubs.cpp @@ -241,10 +241,11 @@ int mtcSmgrApi_active_services ( string hostname , bool * yes_no_ptr ) return(PASS); } -int send_hbs_command ( string hostname, int command ) +int send_hbs_command ( string hostname, int command, string controller ) { UNUSED(hostname); UNUSED(command); + UNUSED(controller); return(PASS); } diff --git a/mtce/src/heartbeat/hbsUtil.cpp b/mtce/src/heartbeat/hbsUtil.cpp index 54edb376..3980014a 100644 --- a/mtce/src/heartbeat/hbsUtil.cpp +++ b/mtce/src/heartbeat/hbsUtil.cpp @@ -111,6 +111,33 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network ) } } +/**************************************************************************** + * + * Name : hbs_cluster_copy + * + * Descrition : Copies cluster from src to dst. + * + * Returns : Nothing. 
+ * + ***************************************************************************/ + +void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst ) +{ + dst.version = src.version ; + dst.revision = src.revision ; + dst.magic_number = src.magic_number ; + dst.period_msec = src.period_msec ; + dst.histories = src.histories ; + dst.storage0_enabled = src.storage0_enabled ; + for ( int h = 0 ; h < dst.histories ; h++ ) + { + memcpy( &dst.history[h], + &src.history[h], + sizeof(mtce_hbs_cluster_history_type)); + } + dst.bytes = BYTES_IN_CLUSTER_VAULT(dst.histories); +} + /**************************************************************************** * @@ -126,11 +153,9 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network ) void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, - string log_prefix ) + string log_prefix, + bool force ) { - // bool want_log = false ; - - clog1 ("log %d histories", cluster.histories ); for ( int h = 0 ; h < cluster.histories ; h++ ) { if ( cluster.history[h].entries == MTCE_HBS_HISTORY_ENTRIES ) @@ -140,8 +165,6 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_entry_type e = { 0, 0 } ; char str[MAX_CLUSTER_LINE_LEN] ; string line = ""; - int start = 0 ; - int stop = 0 ; bool newline = false ; bool logit = false ; bool first = false ; @@ -149,18 +172,13 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_history_type * history_ptr = &cluster.history[h] ; - clog1 ("%s %s has %d entries (controller-%d view from %s)", hostname.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - history_ptr->entries, - history_ptr->controller, - log_prefix.c_str()); - - /* Manage local this_index for log display. 
* Display oldest to newest ; left to right * * */ int this_index = history_ptr->oldest_entry_index ; + int debug = daemon_get_cfg_ptr()->debug_state ; + for ( int count = 0 ; count < history_ptr->entries ; count++ ) { if (( line.length() + MAX_ENTRY_STR_LEN ) >= @@ -180,13 +198,11 @@ void hbs_cluster_log ( string & hostname, } #endif - // want_log = true ; - if ( count == 0 ) { snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d", history_ptr->entry[this_index].hosts_enabled, - history_ptr->entry[this_index].hosts_responding ); // , this_index ); + history_ptr->entry[this_index].hosts_responding ); line.append (str); str[0] = '\0' ; } @@ -203,7 +219,7 @@ void hbs_cluster_log ( string & hostname, { snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d", history_ptr->entry[this_index].hosts_enabled, - history_ptr->entry[this_index].hosts_responding ); // , this_index ); + history_ptr->entry[this_index].hosts_responding ); line.append (str); str[0] = '\0' ; logit = true ; @@ -214,31 +230,21 @@ void hbs_cluster_log ( string & hostname, first_log[h] = true ; logit = true ; } - stop++ ; if ( newline == true ) { if ( logit ) { SET_CONTROLLER_HOSTNAME(history_ptr->controller); - if ( hostname == controller ) + if (( force ) || ( debug&2 )) { - clog ("%s view %s %s %02d..%02d: %s,", - hostname.c_str(), - log_prefix.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); - } - else - { - clog ("%s view from %s %s %s %02d..%02d: %s,", - controller.c_str(), - hostname.c_str(), - log_prefix.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); + syslog ( LOG_INFO, "%s view from %s %s %s: %s", + controller.c_str(), + hostname.c_str(), + log_prefix.c_str(), + hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), + line.c_str()); } } - start = stop + 1 ; line.clear(); first = true ; newline = false ; @@ -253,7 +259,6 @@ 
void hbs_cluster_log ( string & hostname, } if (( newline == false ) && ( line.length() )) { - // ERIC if (( logit == false ) && ( was_diff[h] == true )) { logit = true ; @@ -264,30 +269,25 @@ void hbs_cluster_log ( string & hostname, { if ( first ) { - clog ("............ %s %s %02d..%02d: %s", - log_prefix.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); + if (( force ) || ( debug&2 )) + { + syslog ( LOG_INFO, "............ %s %s: %s", + log_prefix.c_str(), + hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), + line.c_str()); + } } else { SET_CONTROLLER_HOSTNAME(history_ptr->controller); - if ( hostname == controller ) + if (( force ) || ( debug&2 )) { - clog ("%s view %s %s %02d..%02d: %s", - hostname.c_str(), - log_prefix.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); - } - else - { - clog ("%s view from %s %s %s %02d..%02d: %s", - controller.c_str(), - hostname.c_str(), - log_prefix.c_str(), /* Infra <- */ - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); + syslog ( LOG_INFO, "%s view from %s %s %s: %s", + controller.c_str(), + hostname.c_str(), + log_prefix.c_str(), /* Infra <- */ + hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), + line.c_str()); } } } @@ -307,40 +307,62 @@ void hbs_cluster_log ( string & hostname, * Description: Formatted dump of the vault contents to the log file. 
* ***************************************************************************/ -void hbs_cluster_dump ( mtce_hbs_cluster_type & vault ) +void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force ) { - syslog ( LOG_INFO, "Cluster Vault Dump: --------------------------------------------------------------------------------------------"); - syslog ( LOG_INFO, "Cluster Vault: v%d.%d %d msec period ; SM Reqid is %d with storage-0 %s and %d histories in %d bytes", - vault.version, - vault.revision, - vault.period_msec, - vault.reqid, - vault.storage0_enabled ? "enabled" : "disabled", - vault.histories, - vault.bytes ); - for ( int h = 0 ; h < vault.histories ; h++ ) + if ( vault.version == 0 ) + return ; + + int debug = daemon_get_cfg_ptr()->debug_state ; + + if (( debug & 2 ) || ( force == true )) { - #define MAX_LINE_LEN (500) - char str[MAX_LINE_LEN] ; - int i = 0 ; - for ( int e = 0 ; e < vault.history[h].entries_max ; e++ ) - { - snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" , - vault.history[h].oldest_entry_index==e ? '>' : ' ', - vault.history[h].entry[e].hosts_enabled, - vault.history[h].entry[e].hosts_responding); - i = strlen(str) ; - } - syslog ( LOG_INFO, "Cluster Vault: C%d %s S:%s:%s (%d:%d) %s", - vault.history[h].controller, - hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(), - vault.storage0_enabled ? "y" : "n", - vault.history[h].storage0_responding ? "y" : "n", - vault.history[h].entries_max, - vault.history[h].entries, - str); + ilog ("%s", log_prefix.c_str()); + syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec heartbeat period %s;%d network heartbeat response histories (%d bytes)", + vault.version, + vault.revision, + vault.period_msec, + vault.storage0_enabled ? 
" with storage-0: enabled " : "", + vault.histories, + vault.bytes ); + } + + if (( debug & 4 ) || ( force == true )) + { + for ( int h = 0 ; h < vault.histories ; h++ ) + { + #define MAX_LINE_LEN (500) + char str[MAX_LINE_LEN] ; + int i = 0 ; + for ( int e = 0 ; e < vault.history[h].entries_max ; e++ ) + { + snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" , + vault.history[h].oldest_entry_index==e ? '>' : ' ', + vault.history[h].entry[e].hosts_enabled, + vault.history[h].entry[e].hosts_responding); + i = strlen(str) ; + } + if ( vault.storage0_enabled ) + { + syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s", + vault.history[h].controller, + hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(), + vault.history[h].storage0_responding ? "y" : "n", + str); + } + else + { + syslog ( LOG_INFO, "Cluster Vault : C%d %s %s", + vault.history[h].controller, + hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(), + str); + } + } + } + + if ( debug & 8 ) + { + dump_memory ( &vault, 16, vault.bytes ); } - // dump_memory ( &vault, 16, vault.bytes ); } diff --git a/mtce/src/maintenance/Makefile b/mtce/src/maintenance/Makefile index 83d038f5..64eea2a7 100755 --- a/mtce/src/maintenance/Makefile +++ b/mtce/src/maintenance/Makefile @@ -46,6 +46,7 @@ CONTROL_OBJS += mtcHttpSvr.o CONTROL_OBJS += mtcCmdHdlr.o CONTROL_OBJS += mtcNodeMnfa.o CONTROL_OBJS += mtcVimApi.o +CONTROL_OBJS += mtcStubs.o CONTROL_OBJS += ../common/nodeClass.o OBJS = $(SRCS:.cpp=.o) diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 0be8d705..06dfd228 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -48,6 +48,7 @@ using namespace std; #include "mtcAlarm.h" /* for ... mtcAlarm... */ #include "nodeUtil.h" /* for ... get_event_str ... 
*/ +int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ); /* Throttle logging of messages from unknown IP addresses */ std::list unknown_ip_list ; @@ -766,7 +767,7 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface ) return ( rc ); } -int send_hbs_command ( string hostname, int cmd ) +int send_hbs_command ( string hostname, int cmd, string controller ) { int bytes = 0 ; int bytes_to_send = 0 ; @@ -776,18 +777,6 @@ int send_hbs_command ( string hostname, int cmd ) mtc_message_type event ; mtc_socket_type * sock_ptr = get_sockPtr (); - /* We don't heartbeat self */ - if (( obj_ptr->is_active_controller (hostname) ) && - (( cmd == MTC_CMD_ADD_HOST ) || - ( cmd == MTC_CMD_DEL_HOST ) || - ( cmd == MTC_CMD_START_HOST ) || - ( cmd == MTC_CMD_STOP_HOST ))) - { - dlog ("%s refusing to '%s' self to heartbeat service\n", - hostname.c_str(), get_event_str(cmd).c_str()); - return (PASS); - } - memset (&event, 0 , sizeof(mtc_message_type)); snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_hbs_cmd_req_header() ); snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME , "%s", hostname.data()); @@ -795,48 +784,72 @@ int send_hbs_command ( string hostname, int cmd ) /* There is no buffer data in any of these messages */ bytes_to_send = ((sizeof(mtc_message_type))-(BUF_SIZE)) ; - switch ( cmd ) - { - case MTC_CMD_STOP_HOST: - ilog ("%s sending 'stop' to heartbeat service\n", hostname.c_str()); - break ; - case MTC_CMD_START_HOST: - obj_ptr->manage_heartbeat_clear ( hostname , MAX_IFACES ); - ilog ("%s sending 'start' to heartbeat service\n", hostname.c_str()); - break ; - case MTC_CMD_DEL_HOST: - ilog ("%s sending 'delete' to heartbeat service\n", hostname.c_str()); - break ; - case MTC_CMD_ADD_HOST: - obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES ); - ilog ("%s sending 'add' to heartbeat service\n", hostname.c_str()); - break ; - case MTC_RESTART_HBS: - ilog ("%s sending 'restart' to heartbeat service\n", hostname.c_str()); - 
break ; - case MTC_BACKOFF_HBS: - ilog ("%s requesting heartbeat period backoff\n", hostname.c_str()); - break ; - case MTC_RECOVER_HBS: - ilog ("%s requesting heartbeat period recovery\n", hostname.c_str()); - break ; - default: - { - slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd ); - return (FAIL_BAD_PARM); - } - } event.cmd = cmd ; event.num = 1 ; event.parm[0] = obj_ptr->get_nodetype(hostname); /* send to hbsAgent daemon port */ - bytes = sock_ptr->mtc_to_hbs_sock->write((char*) &event, bytes_to_send); - if ( bytes <= 0 ) + std::list controllers ; + controllers.clear(); + if ( controller == CONTROLLER ) { - wlog ("Cannot send to heartbeat service\n"); - rc = FAIL_TO_TRANSMIT ; + controllers.push_back(CONTROLLER_0); + controllers.push_back(CONTROLLER_1); + } + else + { + controllers.push_back(controller); + } + string ip = "" ; + std::list::iterator unit ; + for ( unit = controllers.begin () ; + unit != controllers.end () ; + unit++ ) + { + switch ( cmd ) + { + case MTC_CMD_ACTIVE_CTRL: + mlog3 ("%s sending 'activity state' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_CMD_STOP_HOST: + ilog ("%s sending 'stop' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_CMD_START_HOST: + obj_ptr->manage_heartbeat_clear ( hostname , MAX_IFACES ); + ilog ("%s sending 'start' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_CMD_DEL_HOST: + ilog ("%s sending 'delete' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_CMD_ADD_HOST: + obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES ); + ilog ("%s sending 'add' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_RESTART_HBS: + ilog ("%s sending 'restart' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_BACKOFF_HBS: + ilog ("%s requesting %s heartbeat period backoff\n", hostname.c_str(), 
unit->c_str()); + break ; + case MTC_RECOVER_HBS: + ilog ("%s requesting %s heartbeat period recovery\n", hostname.c_str(), unit->c_str()); + break ; + default: + { + slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd ); + rc = FAIL_BAD_PARM ; + continue ; + } + } + + ip = get_mtcInv_ptr()->get_hostaddr(*unit) ; + bytes = sock_ptr->mtc_to_hbs_sock->write((char*) &event, bytes_to_send, ip.data()); + if ( bytes <= 0 ) + { + wlog ("%s failed to send command (0x%x) to heartbeat service at %s\n", unit->c_str(), cmd, ip.c_str() ); + rc = FAIL_TO_TRANSMIT ; + } } return rc ; } @@ -954,6 +967,14 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) /* Assert the degrade condition with the 'false' (i.e. not clear)*/ obj_ptr->manage_heartbeat_degrade ( hostname, iface, false ); } + /* Otherwise the action must be alarm only or none ; both of which + * are already handled by the hbsAgent, so do nothing */ + else + { + ilog ("%s heartbeat degrade event dropped ; action is not fail or degrade (%s)\n", + hostname.c_str(), + get_iface_name_str(iface)); + } } else { @@ -1003,7 +1024,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) * are already handled by the hbsAgent, so do nothing */ else { - dlog ("%s heartbeat loss event dropped (%s)\n", + ilog ("%s heartbeat loss event dropped ; action is not fail or degrade (%s)\n", hostname.c_str(), get_iface_name_str(iface)); } @@ -1070,6 +1091,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) else if ( msg.cmd == MTC_EVENT_HEARTBEAT_READY ) { + string controller = CONTROLLER ; std::list::iterator temp ; /* no heartbeating in simplex mode */ @@ -1078,7 +1100,17 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) return (PASS); } - ilog ("Received 'Heartbeat Service Ready' Event\n"); + /* get the controller that sent this ready event */ + if (( msg.buf[0] != '\0' ) && ( strnlen( msg.buf, BUF_SIZE) <= 
MAX_CHARS_HOSTNAME )) + { + controller = msg.buf ; + ilog ("%s Heartbeat Service Ready Event (%s)\n", + msg.buf, sock_ptr->mtc_event_rx_sock->get_src_str()); + } + else + { + ilog ("Heartbeat Service Ready Event\n"); + } obj_ptr->hbs_ready = true ; /* Run Maintenance on Inventory */ @@ -1093,25 +1125,17 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) * the heartbeat service. This tell the heartbeat * service about all the hosts so that it will * send heartbeat oob flag events to mtce. */ - if ( send_hbs_command( hostname, MTC_CMD_ADD_HOST ) != PASS ) + if ( send_hbs_command( hostname, MTC_CMD_ADD_HOST, controller ) != PASS ) { elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str()); } - /* Send the start event to the heartbeat service for all enabled hosts except - * for the active controller which is not actively monitored */ - if ( obj_ptr->is_active_controller ( hostname ) == false ) + /* Send the start event to the heartbeat service for all enabled hosts */ + if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) && + ( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) && + ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) || + (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED ))) { - if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) && - ( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) && - ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) || - (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED ))) - { - send_hbs_command ( hostname, MTC_CMD_START_HOST ); - } - } - else - { - dlog ("%s Refusing to start heartbeat of self\n", hostname.c_str() ); + send_hbs_command ( hostname, MTC_CMD_START_HOST, controller ); } } } diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index c56f4b00..b82ee661 100644 --- 
a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -802,7 +802,11 @@ int mtc_socket_init ( void ) /***********************************************************/ int port = daemon_get_cfg_ptr()->hbs_to_mtc_event_port ; - mtc_sock.mtc_event_rx_sock = new msgClassRx(LOOPBACK_IP, port, IPPROTO_UDP); + + /* listen to this port on any interface so that the hbsAgent running + * locally or on peer controller can get events into mtcAgent */ + mtc_sock.mtc_event_rx_sock = + new msgClassRx(mtcInv.my_float_ip.data(), port, IPPROTO_UDP); rc = mtc_sock.mtc_event_rx_sock->return_status; if ( rc ) { @@ -820,7 +824,7 @@ int mtc_socket_init ( void ) /***********************************************************/ port = daemon_get_cfg_ptr()->mtc_to_hbs_cmd_port ; - sock_ptr->mtc_to_hbs_sock = new msgClassTx(LOOPBACK_IP, port, IPPROTO_UDP); + sock_ptr->mtc_to_hbs_sock = new msgClassTx(CONTROLLER, port, IPPROTO_UDP, mtc_config.mgmnt_iface); rc = sock_ptr->mtc_to_hbs_sock->return_status; if ( rc ) { @@ -1281,11 +1285,14 @@ void daemon_service_run ( void ) mtcInv.inotify_shadow_file_fd , mtcInv.inotify_shadow_file_wd ); - /* Add this controller to the heartbeat service so that we - * receive the out-of-band heartbeat 'flags' even though - * we don't self monitor the active controller specifically - * This add may be duplicate but covers the initial config case */ + /* inform the heartbeat service that this controller is active */ + send_hbs_command ( mtcInv.my_hostname, MTC_CMD_ACTIVE_CTRL ); + + /* Add this controller to the heartbeat service so that + * the peer hbsAgent also gets this controllers inventory + * and this hbsAgent receives the out-of-band heartbeat 'flags' */ send_hbs_command ( mtcInv.my_hostname, MTC_CMD_ADD_HOST ); + send_hbs_command ( mtcInv.my_hostname, MTC_CMD_START_HOST ); socks.clear(); socks.push_front (mtc_sock.mtc_event_rx_sock->getFD()); // service_events diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp 
b/mtce/src/maintenance/mtcNodeHdlrs.cpp index d57c3d6e..2536a7b7 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -6205,6 +6205,13 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) #endif + /* Audits for this controller host only */ + if ( node_ptr->hostname == this->my_hostname ) + { + /* Remind the heartbeat service that this is the active ctrl */ + send_hbs_command ( this->my_hostname, MTC_CMD_ACTIVE_CTRL ); + } + /* Manage active controller auto recovery bool. * If the inactive controller is inservice then disable * controller autorecovery. Otherwise enable it but in this case diff --git a/mtce/src/maintenance/mtcStubs.cpp b/mtce/src/maintenance/mtcStubs.cpp index f1a94b62..4fc3ff80 100644 --- a/mtce/src/maintenance/mtcStubs.cpp +++ b/mtce/src/maintenance/mtcStubs.cpp @@ -14,4 +14,10 @@ using namespace std; #include "nodeClass.h" /* The main link class */ -void hbs_cluster_log ( void ) { } +void hbs_cluster_log ( string & hostname, string prefix, bool force=false ) +{ + UNUSED(hostname); + UNUSED(prefix); + UNUSED(force); +} +