Merge "Implement Active-Active Heartbeat as HA Improvement"
This commit is contained in:
commit
abf0ff3986
|
@ -39,7 +39,6 @@ typedef struct
|
|||
{
|
||||
int scheduling_priority ; /**< Scheduling priority of this daemon */
|
||||
bool active ; /**< Maintenance activity state true|false */
|
||||
int hbs_pulse_period ; /**< time (msec) between heartbeat requests */
|
||||
int token_refresh_rate ; /**< token refresh rate in seconds */
|
||||
int hbs_minor_threshold ; /**< heartbeat miss minor threshold */
|
||||
int hbs_degrade_threshold ; /**< heartbeat miss degrade threshold */
|
||||
|
@ -351,7 +350,7 @@ extern char *program_invocation_short_name;
|
|||
}
|
||||
|
||||
#define blog(format, args...) { \
|
||||
if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
|
||||
if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt&1) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
|
||||
else { if(daemon_get_cfg_ptr()->debug_bmgmt) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
|
||||
}
|
||||
|
||||
|
@ -380,22 +379,22 @@ extern char *program_invocation_short_name;
|
|||
#define mlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&4 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg4 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define mlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&8 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg8 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define jlog(format, args...) { if(daemon_get_cfg_ptr()->debug_json ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define jlog(format, args...) { if(daemon_get_cfg_ptr()->debug_json&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define jlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_json&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define jlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_json&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define jlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_json&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define hlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_http&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define hlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_http&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define hlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_http&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define alog(format, args...) { if(daemon_get_cfg_ptr()->debug_alive ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define alog(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define alog1(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define alog2(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define alog3(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define qlog(format, args...) { if(daemon_get_cfg_ptr()->debug_work) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define qlog(format, args...) { if(daemon_get_cfg_ptr()->debug_work&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define qlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_work&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define qlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_work&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define qlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_work&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
@ -403,8 +402,11 @@ extern char *program_invocation_short_name;
|
|||
#define flog(format, args...) { if(daemon_get_cfg_ptr()->debug_fsm) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: FSM : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define tlog(format, args...) { if(daemon_get_cfg_ptr()->debug_timer) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Timer: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define clog1(format, args...) { if(daemon_get_cfg_ptr()->debug_state&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define clog2(format, args...) { if(daemon_get_cfg_ptr()->debug_state&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define clog3(format, args...) { if(daemon_get_cfg_ptr()->debug_state&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
|
||||
#define log_event(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Event: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define log_stress(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Stress: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
|
|
@ -233,6 +233,7 @@ const char * get_mtcNodeCommand_str ( int cmd )
|
|||
case MTC_CMD_QRY_HOST: return("query host");
|
||||
case MTC_CMD_START_HOST: return("start host service");
|
||||
case MTC_CMD_STOP_HOST: return("stop host service");
|
||||
case MTC_CMD_ACTIVE_CTRL: return("publish active controller");
|
||||
|
||||
/* VM Instance Commands */
|
||||
case MTC_CMD_ADD_INST: return("add instance");
|
||||
|
|
|
@ -359,6 +359,7 @@ void daemon_exit ( void );
|
|||
* a power-off to online transition */
|
||||
#define MTC_MTCALIVE_HITS_TO_GO_ONLINE (5)
|
||||
|
||||
#define CONTROLLER_X ((const char *)"controller-x")
|
||||
#define CONTROLLER_0 ((const char *)"controller-0")
|
||||
#define CONTROLLER_1 ((const char *)"controller-1")
|
||||
#define CONTROLLER_2 ((const char *)"controller-2")
|
||||
|
@ -526,7 +527,8 @@ typedef struct
|
|||
#define MTC_CMD_MOD_HOST (0x11110012) /* Modify Host */
|
||||
#define MTC_CMD_QRY_HOST (0x11110013) /* Query Host */
|
||||
#define MTC_CMD_START_HOST (0x11110014) /* Start Monitoring Host */
|
||||
#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Moniroting Host */
|
||||
#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Monitoring Host */
|
||||
#define MTC_CMD_ACTIVE_CTRL (0x11110016) /* Active Controller */
|
||||
|
||||
#define MTC_CMD_ADD_INST (0x11110020) /* Add Inst */
|
||||
#define MTC_CMD_DEL_INST (0x11110021) /* Delete Inst */
|
||||
|
@ -643,6 +645,9 @@ typedef struct
|
|||
#define PMOND_FLAG (0x00000001) /**< Process Monitor O.K. Flag */
|
||||
#define INFRA_FLAG (0x00000002) /**< Infrastructure iface provisioned Flag */
|
||||
|
||||
#define CTRLX_MASK (0x00000300) /**< From/To Controller-0/1/2/3 Number */
|
||||
#define CTRLX_BIT ((unsigned int)8) /**< used to shift right mask into bit 0 */
|
||||
|
||||
#define STALL_MON_FLAG (0x00010000) /**< Flag indicating hang monitor running */
|
||||
#define STALL_REC_FLAG (0x00020000) /**< Flag indicating hbsClient took action */
|
||||
#define STALL_ERR1_FLAG (0x00100000) /**< Error 1 Flag */
|
||||
|
@ -1217,15 +1222,15 @@ string get_availStatus_str ( mtc_nodeAvailStatus_enum availStatus );
|
|||
string get_operState_str ( mtc_nodeOperState_enum operState );
|
||||
string get_adminState_str ( mtc_nodeAdminState_enum adminState );
|
||||
|
||||
void log_adminAction ( string hostname,
|
||||
mtc_nodeAdminAction_enum currAction,
|
||||
void log_adminAction ( string hostname,
|
||||
mtc_nodeAdminAction_enum currAction,
|
||||
mtc_nodeAdminAction_enum newAction );
|
||||
|
||||
int send_hbs_command ( string hostname, int command );
|
||||
int send_hbs_command ( string hostname, int command, string controller=CONTROLLER );
|
||||
int send_hwmon_command ( string hostname, int command );
|
||||
int send_guest_command ( string hostname, int command );
|
||||
|
||||
int daemon_log_message ( const char * hostname,
|
||||
int daemon_log_message ( const char * hostname,
|
||||
const char * filename,
|
||||
const char * log_str );
|
||||
|
||||
|
|
|
@ -48,6 +48,7 @@
|
|||
#define MTC_MINS_20 (1200)
|
||||
#define MTC_MINS_30 (1800)
|
||||
#define MTC_MINS_40 (2400)
|
||||
#define MTC_HRS_1 (3600)
|
||||
#define MTC_HRS_4 (14400)
|
||||
#define MTC_HRS_8 (28800) /* old token refresh rate */
|
||||
|
||||
|
|
|
@ -269,7 +269,7 @@ void daemon_dump_cfg ( void )
|
|||
{
|
||||
daemon_config_type * ptr = daemon_get_cfg_ptr();
|
||||
|
||||
ilog ("Configuration Settings\n------------------------------\n");
|
||||
ilog ("Configuration Settings ...\n");
|
||||
if ( ptr->scheduling_priority ) { ilog ("scheduling_priority = %d\n", ptr->scheduling_priority ); }
|
||||
|
||||
if ( ptr->infra_degrade_only ) { ilog ("infra_degrade_only = %s\n", ptr->infra_degrade_only ? "Yes" : "No" );}
|
||||
|
@ -277,7 +277,6 @@ void daemon_dump_cfg ( void )
|
|||
if ( ptr->active ) { ilog ("active = %s\n", ptr->active ? "Yes" : "No" );}
|
||||
|
||||
/* hbsAgent */
|
||||
if ( ptr->hbs_pulse_period ) { ilog ("hbs_pulse_period = %d\n", ptr->hbs_pulse_period );}
|
||||
if ( ptr->token_refresh_rate ) { ilog ("token_refresh_rate = %d\n", ptr->token_refresh_rate );}
|
||||
if ( ptr->hbs_minor_threshold ) { ilog ("hbs_minor_threshold = %d\n", ptr->hbs_minor_threshold );}
|
||||
if ( ptr->hbs_degrade_threshold ) { ilog ("hbs_degrade_threshold = %d\n", ptr->hbs_degrade_threshold );}
|
||||
|
|
|
@ -78,6 +78,7 @@ void print_help ( void )
|
|||
printf ("\t-l --log - Log to file ; /var/log/<daemon>.log\n");
|
||||
printf ("\t-p --passive - Passive mode ; do not act on failures\n");
|
||||
printf ("\t-v --verbose - Show command line arguments\n");
|
||||
printf ("\t-V --Virtual - Running in virtual environment\n");
|
||||
printf ("\t-t --test - Run Test Head\n");
|
||||
printf ("\t-g --gap - Gap in seconds\n");
|
||||
printf ("\t-m --mode - Word string representing a run mode\n");
|
||||
|
@ -106,6 +107,9 @@ int daemon_get_run_option ( const char * option )
|
|||
}
|
||||
return (1);
|
||||
}
|
||||
else if ( !strcmp ( option, "Virtual" ) )
|
||||
return opts.Virtual ;
|
||||
|
||||
else if ( !strcmp ( option, "front" ) )
|
||||
return opts.front ;
|
||||
|
||||
|
@ -118,6 +122,7 @@ void opts_init ( void)
|
|||
opts.log = false ;
|
||||
opts.test = false ;
|
||||
opts.verbose = false ;
|
||||
opts.Virtual = false ;
|
||||
opts.active = false ;
|
||||
opts.front = false ;
|
||||
opts.front = false ;
|
||||
|
@ -152,8 +157,8 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr )
|
|||
int cmd_arg_count = 1 ; /* command args start at 1 */
|
||||
|
||||
/* A string listing of valid short options letters. */
|
||||
const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvta";
|
||||
|
||||
const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvVta";
|
||||
|
||||
/* An array listing of valid long options. */
|
||||
const struct option long_options[] =
|
||||
{
|
||||
|
@ -167,9 +172,10 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr )
|
|||
{ "username" , 1, NULL, 'u' },
|
||||
{ "help" , 0, NULL, 'h' },
|
||||
{ "active" , 0, NULL, 'a' },
|
||||
{ "foreground", 0, NULL, 'f' },
|
||||
{ "log" , 0, NULL, 'l' },
|
||||
{ "foreground", 0, NULL, 'f' },
|
||||
{ "log" , 0, NULL, 'l' },
|
||||
{ "verbose" , 0, NULL, 'v' },
|
||||
{ "Virtual" , 0, NULL, 'V' },
|
||||
{ "test" , 0, NULL, 't' },
|
||||
{ NULL , 0, NULL, 0 } /* Required at end of array. */
|
||||
};
|
||||
|
@ -254,19 +260,25 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr )
|
|||
case 't': /* -t or --test */
|
||||
{
|
||||
opts_ptr->test = true ;
|
||||
cmd_arg_count++ ;
|
||||
cmd_arg_count++ ;
|
||||
break;
|
||||
}
|
||||
case 'v': /* -t or --verbose */
|
||||
case 'v': /* -v or --verbose */
|
||||
{
|
||||
opts_ptr->verbose = true ;
|
||||
cmd_arg_count++ ;
|
||||
cmd_arg_count++ ;
|
||||
break;
|
||||
}
|
||||
case 'V': /* -V or --Virtual */
|
||||
{
|
||||
opts_ptr->Virtual = true ;
|
||||
cmd_arg_count++ ;
|
||||
break;
|
||||
}
|
||||
case 'a': /* -a or --active */
|
||||
{
|
||||
opts_ptr->active = true ;
|
||||
cmd_arg_count++ ;
|
||||
cmd_arg_count++ ;
|
||||
break;
|
||||
}
|
||||
case '?':
|
||||
|
|
|
@ -33,6 +33,7 @@ typedef struct
|
|||
int test ; /**< Enable test mode */
|
||||
int info ; /**< Dump data module info */
|
||||
int verbose ; /**< Dump command line options */
|
||||
int Virtual ; /**< Set to non-zero when in virtual env */
|
||||
int active ; /**< Set daemon active */
|
||||
int debug ; /**< Set tracing debug mode "debug","test","info","trace" */
|
||||
int front ; /**< run in the foreground ; do not daemonize */
|
||||
|
@ -43,7 +44,7 @@ typedef struct
|
|||
string username ;
|
||||
string command ;
|
||||
string password ;
|
||||
} opts_type ;
|
||||
} opts_type ;
|
||||
|
||||
opts_type * daemon_get_opts_ptr ( void );
|
||||
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
SRC_DIR="$PKG_BASE/src"
|
||||
COPY_LIST="$SRC_DIR/*"
|
||||
TIS_PATCH_VER=6
|
||||
TIS_PATCH_VER=7
|
||||
|
|
|
@ -34,6 +34,7 @@ make install buildroot=%{buildroot} _sysconfdir=%{_sysconfdir} _unitdir=%{_unitd
|
|||
if [ $1 -eq 1 ] ; then
|
||||
/bin/systemctl enable lighttpd.service
|
||||
/bin/systemctl enable qemu_clean.service
|
||||
/bin/systemctl enable hbsAgent.service
|
||||
fi
|
||||
exit 0
|
||||
|
||||
|
@ -41,6 +42,9 @@ exit 0
|
|||
%defattr(-,root,root,-)
|
||||
%{_sysconfdir}/init.d/goenabledControl
|
||||
%license %{_datarootdir}/licenses/mtce-control-1.0/LICENSE
|
||||
%{_sysconfdir}/pmon.d/hbsAgent.conf
|
||||
%{_sysconfdir}/init.d/hbsAgent
|
||||
%{_unitdir}/hbsAgent.service
|
||||
|
||||
%clean
|
||||
rm -rf $RPM_BUILD_ROOT
|
||||
|
|
|
@ -1,19 +1,32 @@
|
|||
SOURCE1 = goenabled
|
||||
SOURCE2 = LICENSE
|
||||
SOURCE1 = LICENSE
|
||||
SOURCE2 = goenabled
|
||||
SOURCE3 = hbsAgent
|
||||
SOURCE4 = hbsAgent.conf
|
||||
SOURCE5 = hbsAgent.service
|
||||
|
||||
local_etc_pmond = $(_sysconfdir)/pmond.d
|
||||
local_etc_pmond = $(_sysconfdir)/pmon.d
|
||||
local_etc_goenabledd = $(_sysconfdir)/goenabled.d
|
||||
|
||||
.PHONY: default
|
||||
|
||||
install:
|
||||
# Controller-Only Init Scripts
|
||||
install -m 755 -p -D scripts/$(SOURCE1) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl
|
||||
# Controller-Only Process Monitor Config files
|
||||
install -m 755 -d $(buildroot)/$(local_etc_pmond)
|
||||
# Controller-Only Go Enabled Test
|
||||
install -m 755 -d $(buildroot)/$(local_etc_goenabledd)
|
||||
|
||||
# for license
|
||||
install -m 755 -d $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0
|
||||
install -p -D -m 600 $(SOURCE2) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE
|
||||
install -m 600 -p -D $(SOURCE1) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE
|
||||
|
||||
# Controller-Only Init Scripts
|
||||
install -m 755 -d $(buildroot)/$(_sysconfdir)/init.d
|
||||
install -m 755 -p -D scripts/$(SOURCE2) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl
|
||||
install -m 755 -p -D scripts/$(SOURCE3) $(buildroot)/$(_sysconfdir)/init.d/hbsAgent
|
||||
|
||||
# Controller-Only Process Monitor Config files
|
||||
install -m 755 -d $(buildroot)/$(local_etc_pmond)
|
||||
install -m 644 -p -D scripts/$(SOURCE4) $(buildroot)/$(local_etc_pmond)/hbsAgent.conf
|
||||
|
||||
# Controller-Only Heartbeat Service file
|
||||
install -m 644 -p -D scripts/$(SOURCE5) $(buildroot)/$(_unitdir)/hbsAgent.service
|
||||
|
||||
# Controller-Only Go Enabled Test
|
||||
install -m 755 -d $(buildroot)/$(local_etc_goenabledd)
|
||||
|
||||
|
|
|
@ -0,0 +1,117 @@
|
|||
#! /bin/sh
|
||||
#
|
||||
# Copyright (c) 2018 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
#
|
||||
# chkconfig: 2345 95 95
|
||||
#
|
||||
### BEGIN INIT INFO
|
||||
# Provides: hbsAgent
|
||||
# Default-Start: 3 5
|
||||
# Default-Stop: 0 1 2 6
|
||||
# Short-Description: Heartbeat Agent Daemon
|
||||
### END INIT INFO
|
||||
|
||||
. /etc/init.d/functions
|
||||
|
||||
DAEMON_NAME="hbsAgent"
|
||||
DAEMON="/usr/local/bin/${DAEMON_NAME}"
|
||||
PIDFILE="/var/run/${DAEMON_NAME}.pid"
|
||||
|
||||
VIRT_TOOL='virt-what'
|
||||
# controller-1:~$ sudo virt-what
|
||||
# virtualbox ... in virtualbox
|
||||
# kvm ... in qemu
|
||||
|
||||
# Linux Standard Base (LSB) Error Codes
|
||||
RETVAL=0
|
||||
GENERIC_ERROR=1
|
||||
INVALID_ARGS=2
|
||||
UNSUPPORTED_FEATURE=3
|
||||
NOT_INSTALLED=5
|
||||
NOT_RUNNING=7
|
||||
|
||||
PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin
|
||||
export PATH
|
||||
|
||||
if [ ! -e "${DAEMON}" ] ; then
|
||||
logger "${DAEMON} is missing"
|
||||
exit ${NOT_INSTALLED}
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
start)
|
||||
logger "Starting ${DAEMON_NAME}"
|
||||
echo -n "Starting ${DAEMON_NAME}: "
|
||||
if [ -n "`pidof ${DAEMON_NAME}`" ] ; then
|
||||
echo -n "is already running "
|
||||
RETVAL=0
|
||||
else
|
||||
tool=$(which ${VIRT_TOOL})
|
||||
if [ $? -eq 0 ] ; then
|
||||
virtual=`${VIRT_TOOL}`
|
||||
else
|
||||
virtual=""
|
||||
fi
|
||||
|
||||
# Pass -V (running in virtual environment) when virt-what reports virtualbox or kvm.
# POSIX '=' and two separate tests are used because '==' and '-o' are not portable under #! /bin/sh.
if [ "${virtual}" = "virtualbox" ] || [ "${virtual}" = "kvm" ] ; then
|
||||
start-stop-daemon --start -b -x ${DAEMON} -- -l -a -V
|
||||
else
|
||||
start-stop-daemon --start -b -x ${DAEMON} -- -l -a
|
||||
fi
|
||||
RETVAL=$?
|
||||
fi
|
||||
if [ ${RETVAL} -eq 0 ] ; then
|
||||
pid=`pidof ${DAEMON_NAME}`
|
||||
echo "OK"
|
||||
logger "${DAEMON} (${pid})"
|
||||
else
|
||||
echo "FAIL"
|
||||
RETVAL=${GENERIC_ERROR}
|
||||
fi
|
||||
;;
|
||||
|
||||
stop)
|
||||
logger "Stopping ${DAEMON_NAME}"
|
||||
echo -n "Stopping ${DAEMON_NAME}: "
|
||||
if [ -n "`pidof ${DAEMON_NAME}`" ] ; then
|
||||
killproc ${DAEMON_NAME}
|
||||
fi
|
||||
if [ -n "`pidof ${DAEMON_NAME}`" ] ; then
|
||||
echo "FAIL"
|
||||
RETVAL=${NOT_RUNNING}
|
||||
else
|
||||
echo "OK"
|
||||
fi
|
||||
rm -f ${PIDFILE}
|
||||
;;
|
||||
|
||||
restart)
|
||||
$0 stop
|
||||
$0 start
|
||||
;;
|
||||
|
||||
status)
|
||||
pid=`pidof ${DAEMON_NAME}`
|
||||
RETVAL=$?
|
||||
if [ ${RETVAL} -eq 0 ] ; then
|
||||
echo "${DAEMON_NAME} is running"
|
||||
else
|
||||
echo "${DAEMON_NAME} is NOT running"
|
||||
RETVAL=${NOT_RUNNING}
|
||||
fi
|
||||
;;
|
||||
|
||||
condrestart)
|
||||
$0 restart
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "usage: $0 { start | stop | status | restart | condrestart }"
|
||||
# Unknown action: exit with the LSB invalid-argument code instead of success
RETVAL=${INVALID_ARGS}
|
||||
;;
|
||||
esac
|
||||
|
||||
exit ${RETVAL}
|
|
@ -0,0 +1,25 @@
|
|||
[process]
|
||||
process = hbsAgent
|
||||
service = hbsAgent
|
||||
pidfile = /var/run/hbsAgent.pid
|
||||
style = lsb ; ocf or lsb
|
||||
severity = major ; minor, major, critical
|
||||
restarts = 1 ; restart retries before error assertion
|
||||
interval = 10 ; number of seconds to wait between restarts
|
||||
debounce = 10 ; number of seconds that a process needs to remain
|
||||
; running before degrade is removed and retry count
|
||||
; is cleared.
|
||||
startuptime = 5 ; Seconds to wait after process start before starting the debounce monitor
|
||||
mode = passive ; Monitoring mode: passive (default) or active
|
||||
; passive: process death monitoring (default: always)
|
||||
; active : heartbeat monitoring, i.e. request / response messaging
|
||||
; ignore : do not monitor or stop monitoring
|
||||
quorum = 0 ; process is in the host watchdog quorum
|
||||
|
||||
; Active Monitoring Options
|
||||
|
||||
port = 2201
|
||||
period = 5 ; monitor period in seconds
|
||||
timeout = 4 ; Messaging timeout period in seconds, must be shorter than period
|
||||
threshold = 5 ; Number of back to back heartbeat failures before action
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
[Unit]
|
||||
Description=Titanium Cloud Maintenance Heartbeat Agent
|
||||
After=network.target syslog.service config.service
|
||||
Before=pmon.service
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
ExecStart=/etc/rc.d/init.d/hbsAgent start
|
||||
# Stopping the unit must run the init script's stop action, not start
ExecStop=/etc/rc.d/init.d/hbsAgent stop
|
||||
PIDFile=/var/run/hbsAgent.pid
|
||||
KillMode=process
|
||||
SendSIGKILL=no
|
||||
|
||||
# Process recovery is handled by pmond if it's running.
|
||||
# Delay 10 seconds to give pmond a chance to recover
|
||||
# before systemd kicks in to do it as a backup plan.
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
@ -1,3 +1,3 @@
|
|||
SRC_DIR="src"
|
||||
TIS_PATCH_VER=140
|
||||
TIS_PATCH_VER=142
|
||||
BUILD_IS_SLOW=5
|
||||
|
|
|
@ -313,7 +313,6 @@ install -m 755 -d %{buildroot}/usr/lib/ocf
|
|||
install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d
|
||||
install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d/platform
|
||||
install -m 755 -p -D %{_buildsubdir}/scripts/mtcAgent %{buildroot}/usr/lib/ocf/resource.d/platform/mtcAgent
|
||||
install -m 755 -p -D %{_buildsubdir}/scripts/hbsAgent %{buildroot}/usr/lib/ocf/resource.d/platform/hbsAgent
|
||||
install -m 755 -p -D %{_buildsubdir}/hwmon/scripts/ocf/hwmon %{buildroot}/usr/lib/ocf/resource.d/platform/hwmon
|
||||
|
||||
# config files
|
||||
|
@ -482,7 +481,6 @@ install -m 755 -d %{buildroot}/var/run
|
|||
|
||||
# SM OCF Start/Stop/Monitor Scripts
|
||||
%{ocf_resourced}/platform/mtcAgent
|
||||
%{ocf_resourced}/platform/hbsAgent
|
||||
|
||||
# Config files
|
||||
%config(noreplace)/etc/mtc.ini
|
||||
|
|
|
@ -47,6 +47,11 @@ int alarm_register_user ( msgClassSock * sock_ptr )
|
|||
return (rc);
|
||||
}
|
||||
|
||||
/* Clear the registered alarm socket pointer ; only the pointer is reset here, the socket object is not closed or freed */
void alarm_unregister_user ( void )
|
||||
{
|
||||
user_sock_ptr = NULL ;
|
||||
}
|
||||
|
||||
/* Construct an alarm request json string in the following form
|
||||
{\"mtcalarm\":[{\"alarmid\":\"200.009\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Infrastructure\",\"prefix\":\"service=heartbeat\"}, {\"alarmid\":\"200.005\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Management\",\"prefix\":\"service=heartbeat\"}]}"
|
||||
|
||||
|
@ -73,6 +78,17 @@ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSev
|
|||
string msg_type ;
|
||||
string sev ;
|
||||
|
||||
if ( user_sock_ptr == NULL )
|
||||
{
|
||||
slog ("alarm socket is NULL");
|
||||
return (FAIL_NULL_POINTER );
|
||||
}
|
||||
else if ( ! user_sock_ptr->sock_ok() )
|
||||
{
|
||||
elog ("alarm socket is not ok");
|
||||
return (FAIL_OPERATION);
|
||||
}
|
||||
|
||||
if ( state == FM_ALARM_STATE_MSG )
|
||||
msg_type = "msg" ;
|
||||
else if ( state == FM_ALARM_STATE_SET )
|
||||
|
@ -127,7 +143,8 @@ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSev
|
|||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s %s\n", hostname.c_str(), request);
|
||||
ilog ("%s %s %s %s %s", hostname.c_str(), entity, msg_type.c_str(), sev.c_str(), id);
|
||||
mlog ("%s %s\n", hostname.c_str(), request);
|
||||
return ( PASS ) ;
|
||||
}
|
||||
daemon_signal_hdlr ();
|
||||
|
|
|
@ -68,6 +68,7 @@ EFmAlarmSeverityT alarmUtil_getSev_enum ( string severity );
|
|||
#ifndef __MODULE_PRIVATE__
|
||||
|
||||
int alarm_register_user ( msgClassSock * sock_ptr );
|
||||
void alarm_unregister_user ( void );
|
||||
|
||||
/* Public API */
|
||||
int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSeverityT severity, const char * entity, string prefix );
|
||||
|
|
|
@ -36,6 +36,7 @@ using namespace std;
|
|||
#include "mtcAlarm.h"
|
||||
#include "alarm.h"
|
||||
#include "hbsAlarm.h"
|
||||
#include "hbsBase.h"
|
||||
|
||||
/** Initialize the supplied command buffer */
|
||||
void mtcCmd_init ( mtcCmd & cmd )
|
||||
|
@ -263,7 +264,8 @@ nodeLinkClass::nodeLinkClass()
|
|||
/* Make no assumption on the service */
|
||||
maintenance = false ;
|
||||
heartbeat = false ;
|
||||
active = false ;
|
||||
active = false ; /* run active */
|
||||
active_controller = false ; /* true if this controller is active */
|
||||
|
||||
/* Set some defaults for the heartbeat service */
|
||||
hbs_ready = false ;
|
||||
|
@ -1156,26 +1158,26 @@ void nodeLinkClass::print_node_info ( void )
|
|||
if (( i == INFRA_IFACE ) && ( infra_network_provisioned == false ))
|
||||
continue ;
|
||||
|
||||
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
|
||||
syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" ,
|
||||
syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n");
|
||||
syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" ,
|
||||
get_iface_name_str ((iface_enum)i), hosts, hbs_disabled ? "DISABLED" : "Enabled ", hbs_pulse_period );
|
||||
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
|
||||
|
||||
syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n");
|
||||
|
||||
for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
|
||||
{
|
||||
syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %7x | %d msec\n",
|
||||
syslog ( LOG_INFO, "| %-12s | %c | %5i | %5i | %5i | %5i | %10x | %8x | %d msec\n",
|
||||
ptr->hostname.c_str(),
|
||||
ptr->monitor[i] ? 'Y' : 'n',
|
||||
ptr->hbs_misses_count[i],
|
||||
ptr->max_count[i],
|
||||
ptr->hbs_degrade_count[i],
|
||||
ptr->hbs_failure_count[i],
|
||||
ptr->hbs_misses_count[i],
|
||||
ptr->max_count[i],
|
||||
ptr->hbs_degrade_count[i],
|
||||
ptr->hbs_failure_count[i],
|
||||
ptr->hbs_count[i],
|
||||
ptr->b2b_pulses_count[i],
|
||||
hbs_pulse_period );
|
||||
}
|
||||
}
|
||||
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
|
||||
syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -7778,7 +7780,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
|
|||
{
|
||||
/* This default RC allows the caller to filter out unexpected pulse responses */
|
||||
int rc = ENXIO ;
|
||||
|
||||
|
||||
if ( head == NULL )
|
||||
{
|
||||
return -ENODEV ;
|
||||
|
@ -7962,6 +7964,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
|
|||
}
|
||||
pulses[iface]-- ;
|
||||
}
|
||||
else if ( node_ptr )
|
||||
{
|
||||
dlog ("%s unexpected pulse response ; %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
}
|
||||
else
|
||||
{
|
||||
slog ("null pointer");
|
||||
}
|
||||
|
||||
return rc ;
|
||||
}
|
||||
|
@ -7972,6 +7984,13 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
|
|||
* By index does not require a lookup whereas hostname does */
|
||||
int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags )
|
||||
{
|
||||
/* TODO: consider removing this check */
|
||||
if ( hostname == "localhost" )
|
||||
{
|
||||
/* localhost is not a supported hostname and indicates
|
||||
* an unconfigured host response ; return the ignore response */
|
||||
return(ENXIO);
|
||||
}
|
||||
if ( index )
|
||||
{
|
||||
int rc = remPulse_by_index ( hostname, index , iface, true , flags );
|
||||
|
@ -7984,16 +8003,6 @@ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index
|
|||
}
|
||||
else
|
||||
{
|
||||
if ( hostname.compare("localhost") )
|
||||
{
|
||||
get_hbs_monitor_state ( hostname , iface ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* localhost is not a supported hostname and indicates
|
||||
* an unconfigured host response ; return the ignore response */
|
||||
return(ENXIO);
|
||||
}
|
||||
}
|
||||
return ( remPulse_by_name ( hostname , iface, true, flags ));
|
||||
}
|
||||
|
@ -8016,7 +8025,6 @@ void nodeLinkClass::clear_pulse_list ( iface_enum iface )
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/** Runs in the hbsAgent to set or clear heartbeat alarms for all supported interfaces */
|
||||
void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT sev, int iface )
|
||||
{
|
||||
|
@ -8142,7 +8150,6 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||
storage_0_responding = false ;
|
||||
}
|
||||
|
||||
/* Don't log single misses unless in debug mode */
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
|
||||
{
|
||||
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
|
||||
|
@ -8207,7 +8214,10 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||
{
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold )
|
||||
{
|
||||
send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface );
|
||||
if ( this->active_controller )
|
||||
{
|
||||
send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface );
|
||||
}
|
||||
pulse_ptr->hbs_minor[iface] = true ;
|
||||
pulse_ptr->hbs_minor_count[iface]++ ;
|
||||
wlog ("%s %s -> MINOR\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface));
|
||||
|
@ -8215,10 +8225,17 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||
}
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_degrade_threshold )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
|
||||
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
|
||||
{
|
||||
pulse_ptr->hbs_degrade[iface] = true ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_ptr->hbs_degrade[iface] = true ;
|
||||
}
|
||||
|
@ -8231,11 +8248,17 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||
( pulse_ptr->hbs_degrade[iface] == false ))
|
||||
{
|
||||
wlog ("%s -> DEGRADED - Auto-Correction\n", pulse_ptr->hostname.c_str());
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
|
||||
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
|
||||
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
|
||||
{
|
||||
pulse_ptr->hbs_degrade[iface] = true ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_ptr->hbs_degrade[iface] = true ;
|
||||
}
|
||||
|
@ -8250,11 +8273,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||
/* Only print the log at the threshold boundary */
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
|
||||
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
|
||||
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
this->print_node_info ();
|
||||
hbs_cluster_log ( this->my_hostname, "event", true );
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8268,35 +8296,46 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||
/* Only print the log at the threshold boundary */
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
|
||||
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
this->print_node_info ();
|
||||
hbs_cluster_log ( this->my_hostname, "event", true );
|
||||
}
|
||||
}
|
||||
|
||||
else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) &&
|
||||
( pulse_ptr->hbs_failure[iface] == false ))
|
||||
( pulse_ptr->hbs_failure[iface] == false ))
|
||||
{
|
||||
elog ("%s %s -> FAILED\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
|
||||
{
|
||||
pulse_ptr->hbs_failure[iface] = true ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_ptr->hbs_failure[iface] = true ;
|
||||
this->print_node_info ();
|
||||
hbs_cluster_log ( this->my_hostname, "event", true );
|
||||
}
|
||||
|
||||
pulse_ptr->hbs_failure_count[iface]++ ;
|
||||
}
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
|
||||
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
|
||||
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
|
||||
}
|
||||
|
||||
if ( remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS ))
|
||||
{
|
||||
elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(),
|
||||
|
|
|
@ -1266,6 +1266,10 @@ public:
|
|||
bool maintenance ;
|
||||
bool heartbeat ;
|
||||
|
||||
/* Set to true if this controller is active.
|
||||
* Currently only used by heartbeat service. */
|
||||
bool active_controller ;
|
||||
|
||||
/* offline_handler tuning controls */
|
||||
int offline_threshold ; /* number of back to back mtcAlive misses before offline */
|
||||
int offline_period ; /* offline handler mtcAlive request period */
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -47,6 +47,9 @@
|
|||
/** Maximum service fail count before action */
|
||||
#define MAX_FAIL_COUNT (1)
|
||||
|
||||
/** Audit Rate/Count */
|
||||
#define AUDIT_RATE (9)
|
||||
|
||||
/** Heartbeat pulse request/response message header byte size */
|
||||
#define HBS_HEADER_SIZE (15)
|
||||
|
||||
|
@ -60,13 +63,16 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"};
|
|||
|
||||
#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME)
|
||||
|
||||
#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info
|
||||
#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info
|
||||
|
||||
/* Heartbeat control structure */
|
||||
typedef struct
|
||||
{
|
||||
unsigned int controller ;
|
||||
unsigned int audit ;
|
||||
unsigned int nodetype ;
|
||||
bool clear_alarms ;
|
||||
bool locked ;
|
||||
} hbs_ctrl_type ;
|
||||
hbs_ctrl_type * get_hbs_ctrl_ptr ( void );
|
||||
|
||||
|
@ -218,22 +224,17 @@ void hbs_utils_init ( void );
|
|||
/* network enum to name lookup */
|
||||
string hbs_cluster_network_name ( mtce_hbs_network_enum network );
|
||||
|
||||
/* Produce formatted clog's that characterize current and changing cluster
|
||||
* history for a given network. Each log is controller/network specific. */
|
||||
void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix );
|
||||
|
||||
/* Initialize the specified history array */
|
||||
void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history );
|
||||
|
||||
/* Clear all history in the cluster vault */
|
||||
void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster );
|
||||
|
||||
|
||||
/******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/
|
||||
|
||||
/* Set the cluster vault to default state.
|
||||
* Called upon daemon init or heartbeat period change. */
|
||||
void hbs_cluster_init ( unsigned short period );
|
||||
void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr );
|
||||
|
||||
/* Calculate number of bytes that is unused in the cluster data structure.
|
||||
* Primarily to know how many history elements are missing. */
|
||||
|
@ -286,7 +287,9 @@ void hbs_cluster_append ( hbs_message_type & msg );
|
|||
|
||||
/* Produce formatted clog's that characterize current and changing cluster
|
||||
* history for a given network. Each log is controller/network specific. */
|
||||
void hbs_cluster_log ( string & hostname, string prefix );
|
||||
void hbs_cluster_log ( string & hostname, string prefix, bool force=false );
|
||||
void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix, bool force=false );
|
||||
|
||||
|
||||
/* Service SM cluster info request */
|
||||
void hbs_sm_handler ( void );
|
||||
|
@ -294,8 +297,14 @@ void hbs_sm_handler ( void );
|
|||
/* send the cluster vault to SM */
|
||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid );
|
||||
|
||||
/* copy cluster data from src to dst */
|
||||
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
|
||||
|
||||
/* print the contents of the vault */
|
||||
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
|
||||
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force );
|
||||
|
||||
/* Heartbeat service state audit */
|
||||
void hbs_state_audit ( void );
|
||||
|
||||
/**
|
||||
* @} hbs_base
|
||||
|
|
|
@ -66,6 +66,8 @@ extern "C"
|
|||
#include "amon.h" /* for ... active monitoring utilities */
|
||||
}
|
||||
|
||||
#define MAX_LEN (300)
|
||||
|
||||
/* Where to send events */
|
||||
string mtcAgent_ip = "" ;
|
||||
|
||||
|
@ -96,12 +98,17 @@ typedef struct
|
|||
|
||||
static char pulse_resp_tx_hdr [HBS_MAX_MSG];
|
||||
static char my_hostname [MAX_HOST_NAME_SIZE+1];
|
||||
static string hostname = "" ;
|
||||
static char my_hostname_length ;
|
||||
static string my_macaddr = "" ;
|
||||
static string my_address = "" ;
|
||||
static unsigned int my_nodetype= CGTS_NODE_NULL ;
|
||||
static stallMon_type stallMon ;
|
||||
|
||||
/* Cached Cluster view from controllers */
|
||||
mtce_hbs_cluster_type controller_cluster_cache[MTCE_HBS_MAX_CONTROLLERS];
|
||||
|
||||
|
||||
void daemon_sigchld_hdlr ( void )
|
||||
{
|
||||
; /* dlog("Received SIGCHLD ... no action\n"); */
|
||||
|
@ -407,16 +414,17 @@ int daemon_configure ( void )
|
|||
else
|
||||
{
|
||||
ilog("Realtime Pri: FIFO/%i \n", hbs_config.scheduling_priority );
|
||||
ilog("Multicast: %s\n", hbs_config.multicast );
|
||||
ilog("Multicast : %s\n", hbs_config.multicast );
|
||||
|
||||
hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface );
|
||||
ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface );
|
||||
ilog("Mgmnt RxPort: %d\n", hbs_config.hbs_client_mgmnt_port );
|
||||
ilog("Mgmnt TxPort: %d\n", hbs_config.hbs_agent_mgmnt_port );
|
||||
ilog("Mgmnt Name : %s\n", hbs_config.mgmnt_iface );
|
||||
ilog("Mgmnt Port : %d (rx)", hbs_config.hbs_client_mgmnt_port );
|
||||
ilog("Mgmnt Port : %d (tx)", hbs_config.hbs_agent_mgmnt_port );
|
||||
|
||||
get_iface_macaddr ( hbs_config.mgmnt_iface, my_macaddr );
|
||||
get_iface_address ( hbs_config.mgmnt_iface, my_address, true );
|
||||
get_hostname ( &my_hostname[0], MAX_HOST_NAME_SIZE );
|
||||
hostname = my_hostname ;
|
||||
|
||||
/* Fetch the infrastructure interface name.
|
||||
* calls daemon_get_iface_master inside so the
|
||||
|
@ -427,11 +435,14 @@ int daemon_configure ( void )
|
|||
if (strcmp(hbs_config.infra_iface, hbs_config.mgmnt_iface))
|
||||
{
|
||||
infra_network_provisioned = true ;
|
||||
ilog ("Infra iface : %s\n", hbs_config.infra_iface );
|
||||
ilog ("Infra Name : %s\n", hbs_config.infra_iface );
|
||||
}
|
||||
}
|
||||
ilog("Infra RxPort: %d\n", hbs_config.hbs_client_infra_port );
|
||||
ilog("Infra TxPort: %d\n", hbs_config.hbs_agent_infra_port );
|
||||
if ( infra_network_provisioned == true )
|
||||
{
|
||||
ilog("Infra Port : %d (rx)", hbs_config.hbs_client_infra_port );
|
||||
ilog("Infra Port : %d (tx)", hbs_config.hbs_agent_infra_port );
|
||||
}
|
||||
|
||||
/* initialize the stall detection monitor */
|
||||
stallMon_init ();
|
||||
|
@ -663,7 +674,37 @@ int get_pmon_pulses ( void )
|
|||
return (pulses);
|
||||
}
|
||||
|
||||
static unsigned int my_rri = 0 ;
|
||||
/*************************************************************
|
||||
*
|
||||
* Name : have_other_controller_history
|
||||
*
|
||||
* Description: returns true if there is cached history for any
|
||||
* controller number other than the one supplied.
|
||||
*
|
||||
*************************************************************/
|
||||
|
||||
bool have_other_controller_history ( unsigned short controller )
|
||||
{
|
||||
if ( controller < MTCE_HBS_MAX_CONTROLLERS )
|
||||
{
|
||||
/* look for history for any controller other than the one specified */
|
||||
for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ )
|
||||
{
|
||||
/* skip specified controller */
|
||||
if ( c != controller )
|
||||
{
|
||||
if ( controller_cluster_cache[c].histories )
|
||||
{
|
||||
return true ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false ;
|
||||
}
|
||||
|
||||
|
||||
static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ;
|
||||
|
||||
/*************************************************************
|
||||
*
|
||||
|
@ -766,12 +807,13 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
|||
daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
|
||||
if ( cfg_ptr->debug_msg )
|
||||
{
|
||||
mlog ("\n");
|
||||
mlog ("%s Pulse Req: %s:%5d: %d:%s RRI:%d\n",
|
||||
mlog (" ");
|
||||
mlog ("%s Pulse Req: %s:%d s:%d f:%x [%s] RRI:%d\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].f,
|
||||
hbs_sock.rx_mesg[iface].m,
|
||||
hbs_sock.rx_mesg[iface].c);
|
||||
}
|
||||
|
@ -787,19 +829,9 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
|||
return (FAIL_MSG_HEADER) ;
|
||||
}
|
||||
|
||||
|
||||
/* Manage the Resource Reference Index (RRI) "lookup clue" */
|
||||
if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME ))
|
||||
{
|
||||
if( my_rri!= hbs_sock.rx_mesg[iface].c )
|
||||
{
|
||||
my_rri = hbs_sock.rx_mesg[iface].c ;
|
||||
ilog ("%s Caching New RRI: %d\n", &my_hostname[0], my_rri );
|
||||
}
|
||||
}
|
||||
|
||||
/* Add my RRI to the response message */
|
||||
hbs_sock.rx_mesg[iface].c = my_rri ;
|
||||
/* Update local copy for the controller this pulse came from */
|
||||
/* ... before the flags are cleared and setup for the reply. */
|
||||
unsigned int controller = (hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT ;
|
||||
|
||||
/* Manage OOB flags */
|
||||
hbs_sock.rx_mesg[iface].f = flags ;
|
||||
|
@ -807,23 +839,102 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
|||
{
|
||||
hbs_sock.rx_mesg[iface].f |= ( PMOND_FLAG ) ;
|
||||
}
|
||||
|
||||
if ( infra_network_provisioned == true )
|
||||
{
|
||||
hbs_sock.rx_mesg[iface].f |= INFRA_FLAG ;
|
||||
}
|
||||
|
||||
#define WANT_CLUSTER_INFO_LOG
|
||||
#ifdef WANT_CLUSTER_INFO_LOG
|
||||
/* Log the received cluster info */
|
||||
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
|
||||
/*************************************************************************
|
||||
***** C L U S T E R D A T A M A N A G E M E N T ******
|
||||
* *
|
||||
* TODO: Add support for 3 controllers.
|
||||
* Only 2 supported by some of this code.
|
||||
***** ******/
|
||||
|
||||
if ( controller >= MTCE_HBS_MAX_CONTROLLERS )
|
||||
{
|
||||
char str[100] ;
|
||||
// hbs_cluster_log (hbs_sock.rx_mesg[iface].cluster, hbs_sock.rx_mesg[iface].s );
|
||||
snprintf ( &str[0], 100, " seq %6d with %d bytes from %s ", hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface));
|
||||
string hostname = my_hostname ;
|
||||
hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str );
|
||||
wlog ("invalid controller number: %d ; dropping message", controller );
|
||||
return ( FAIL_INVALID_DATA );
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Manage the Resource Reference Index (RRI) "lookup clue"
|
||||
* With the introduction of active-active heartbeating the hbsClient
|
||||
* is responsible for servicing pulses from both controllers.
|
||||
* This means that hbsClient needs to manage an rri for each controller. */
|
||||
if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME ))
|
||||
{
|
||||
if( rri[controller] != hbs_sock.rx_mesg[iface].c )
|
||||
{
|
||||
rri[controller] = hbs_sock.rx_mesg[iface].c ;
|
||||
ilog ("Caching New RRI: %d (from controller-%d)\n", rri[controller], controller );
|
||||
}
|
||||
}
|
||||
|
||||
/* Log the received cluster info
|
||||
* ... if the message version shows that it is supported */
|
||||
if ( hbs_sock.rx_mesg[iface].v )
|
||||
{
|
||||
char str[MAX_LEN] ;
|
||||
snprintf ( &str[0], MAX_LEN, " seq %6d with %d bytes from %s ", (int)hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface));
|
||||
hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str );
|
||||
|
||||
/* add the controller back in */
|
||||
hbs_sock.rx_mesg[iface].f |= ( controller << CTRLX_BIT );
|
||||
|
||||
/* Add my RRI to the response message */
|
||||
hbs_sock.rx_mesg[iface].c = rri[controller] ;
|
||||
|
||||
if ( hbs_sock.rx_mesg[iface].cluster.histories > MTCE_HBS_MAX_NETWORKS )
|
||||
{
|
||||
slog ("controller-%d provided %d network histories ; max is %d per controller",
|
||||
controller,
|
||||
hbs_sock.rx_mesg[iface].cluster.histories,
|
||||
MTCE_HBS_MAX_NETWORKS );
|
||||
}
|
||||
else if ( hbs_sock.rx_mesg[iface].cluster.bytes != ( BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories)))
|
||||
{
|
||||
slog ("controller-%d provided %d bytes of history ; expected %d",
|
||||
controller,
|
||||
hbs_sock.rx_mesg[iface].cluster.bytes,
|
||||
(unsigned short)(BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories)));
|
||||
}
|
||||
else if ( hbs_sock.rx_mesg[iface].cluster.histories )
|
||||
{
|
||||
hbs_cluster_copy ( hbs_sock.rx_mesg[iface].cluster,
|
||||
controller_cluster_cache[controller] );
|
||||
clog1 ("controller-%d cluster info from %s pulse request saved to cache",
|
||||
controller, get_iface_name_str(iface));
|
||||
|
||||
hbs_sock.rx_mesg[iface].cluster.histories = 0 ;
|
||||
|
||||
if ( have_other_controller_history ( controller ) == true )
|
||||
{
|
||||
/* Now copy the other controller's cached cluster info into
|
||||
* this controller's response */
|
||||
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
|
||||
hbs_sock.rx_mesg[iface].cluster );
|
||||
|
||||
if ( daemon_get_cfg_ptr()->debug_state & 4 )
|
||||
{
|
||||
string dump_banner = "" ;
|
||||
dump_banner.append("controller-") ;
|
||||
dump_banner.append(itos(controller?0:1));
|
||||
dump_banner.append(" cluster info from cache injected into controller-");
|
||||
dump_banner.append(itos(controller));
|
||||
dump_banner.append(":");
|
||||
dump_banner.append(get_iface_name_str(iface));
|
||||
dump_banner.append(" pulse response");
|
||||
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner, true );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Cluster Data management end */
|
||||
|
||||
/* replace the request header with the response header */
|
||||
memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG );
|
||||
|
||||
#ifdef WANT_PULSE_RESPONSE_FIT
|
||||
if (( iface == INFRA_IFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_INFRA_RSP )))
|
||||
|
@ -839,29 +950,11 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
|||
}
|
||||
#endif
|
||||
|
||||
int rc = PASS ;
|
||||
|
||||
/* replace the request header with the response header */
|
||||
memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG );
|
||||
|
||||
/* Deal with the cluster info if it exists.
|
||||
* ... Introduced in messaging version 1 */
|
||||
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
|
||||
{
|
||||
if ( hbs_sock.rx_mesg[iface].cluster.version < MTCE_HBS_CLUSTER_VERSION )
|
||||
{
|
||||
ilog ("Bad cluster verison (%d)", hbs_sock.rx_mesg[iface].cluster.version);
|
||||
}
|
||||
// if ( hbs_sock.rx_mesg[iface].cluster.revision != MTCE_HBS_CLUSTER_REVISION )
|
||||
// {
|
||||
// ilog ("Bad cluster revision (%d)", hbs_sock.rx_mesg[iface].cluster.revision);
|
||||
// }
|
||||
|
||||
/* Add peer controller cluster data to this controller's response */
|
||||
// hbs_cluster_loop(hbs_sock.rx_mesg[iface]);
|
||||
}
|
||||
/* reuse the rx_bytes variable */
|
||||
rx_bytes = sizeof(hbs_message_type)-sizeof(mtce_hbs_cluster_type)+BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories);
|
||||
|
||||
/* send pulse response message */
|
||||
int rc = PASS ;
|
||||
int tx_bytes = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.rx_mesg[iface], rx_bytes);
|
||||
if ( tx_bytes == -1 )
|
||||
{
|
||||
|
@ -884,15 +977,15 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
|||
}
|
||||
else
|
||||
{
|
||||
mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d:%d:%d)\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].f,
|
||||
hbs_sock.rx_mesg[iface].m,
|
||||
hbs_sock.rx_mesg[iface].c,
|
||||
pmonPulse_counter, rx_bytes, tx_bytes);
|
||||
mlog ("%s Pulse Rsp: %s:%d: s:%d f:%x [%s] RRI:%d (%x:%d:%d)\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].f,
|
||||
hbs_sock.rx_mesg[iface].m,
|
||||
hbs_sock.rx_mesg[iface].c,
|
||||
pmonPulse_counter, rx_bytes, tx_bytes);
|
||||
}
|
||||
|
||||
/* Clear the error count since we got a good receive */
|
||||
|
@ -984,6 +1077,10 @@ int daemon_init ( string iface, string nodeType_str )
|
|||
/* Initialize socket construct and pointer to it */
|
||||
memset ( &hbs_sock, 0, sizeof(hbs_sock));
|
||||
|
||||
/* Initialize the controller cluster view data bounce structure */
|
||||
for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ )
|
||||
memset ( &controller_cluster_cache[c], 0, sizeof(mtce_hbs_cluster_type)) ;
|
||||
|
||||
/* init the utility module */
|
||||
hbs_utils_init ();
|
||||
|
||||
|
@ -1007,6 +1104,11 @@ int daemon_init ( string iface, string nodeType_str )
|
|||
|
||||
/* convert node type to integer */
|
||||
my_nodetype = get_host_function_mask ( nodeType_str ) ;
|
||||
if ( my_nodetype & CONTROLLER_TYPE )
|
||||
{
|
||||
/* is controller but don't know what one yet. */
|
||||
set_hn((char*)CONTROLLER_X);
|
||||
}
|
||||
ilog ("Node Type : %s (%d)\n", nodeType_str.c_str(), my_nodetype );
|
||||
|
||||
/* Bind signal handlers */
|
||||
|
@ -1058,7 +1160,6 @@ int daemon_init ( string iface, string nodeType_str )
|
|||
|
||||
int stall_threshold_log = 0 ;
|
||||
int stall_times_threshold_log = 0 ;
|
||||
#define MAX_LEN 300
|
||||
void daemon_service_run ( void )
|
||||
{
|
||||
#ifdef WANT_DAEMON_DEBUG
|
||||
|
@ -1205,7 +1306,7 @@ void daemon_service_run ( void )
|
|||
int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type));
|
||||
if ( bytes )
|
||||
{
|
||||
hbs_cluster_dump (msg);
|
||||
hbs_cluster_dump (msg, "Cluster info received", true );
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -64,11 +64,19 @@ typedef struct
|
|||
/* The working heartbeat cluster data vault. */
|
||||
mtce_hbs_cluster_type cluster ;
|
||||
|
||||
bool cluster_change ;
|
||||
int cluster_change_threshold_count ;
|
||||
int cluster_change_difference_count ;
|
||||
|
||||
msgClassSock * sm_socket_ptr ;
|
||||
|
||||
} hbs_cluster_ctrl_type ;
|
||||
|
||||
/* Cluster control structure construct allocation. */
|
||||
static hbs_cluster_ctrl_type ctrl ;
|
||||
|
||||
#define STORAGE_0_NR_THRESHOLD (4)
|
||||
#define CLUSTER_CHANGE_THRESHOLD (50000)
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
|
@ -80,7 +88,7 @@ static hbs_cluster_ctrl_type ctrl ;
|
|||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_init ( unsigned short period )
|
||||
void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
|
||||
{
|
||||
ctrl.monitored_hosts = 0;
|
||||
ctrl.monitored_hostname_list.clear();
|
||||
|
@ -104,13 +112,17 @@ void hbs_cluster_init ( unsigned short period )
|
|||
for ( int h = 0 ; h < MTCE_HBS_MAX_HISTORY_ELEMENTS ; h++ )
|
||||
hbs_cluster_history_init ( ctrl.cluster.history[h] );
|
||||
|
||||
ilog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)",
|
||||
clog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)",
|
||||
ctrl.cluster.version,
|
||||
ctrl.cluster.revision,
|
||||
ctrl.cluster.magic_number,
|
||||
ctrl.cluster.bytes,
|
||||
sizeof(mtce_hbs_cluster_history_type));
|
||||
|
||||
if ( sm_socket_ptr )
|
||||
{
|
||||
ctrl.sm_socket_ptr = sm_socket_ptr ;
|
||||
}
|
||||
ctrl.log_throttle = 0 ;
|
||||
}
|
||||
|
||||
|
@ -140,7 +152,7 @@ void hbs_cluster_nums ( unsigned short this_controller,
|
|||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : log_monitored_hosts_list
|
||||
* Name : cluster_list
|
||||
*
|
||||
* Description : Log the list of monitored hosts.
|
||||
* Typically done on a list change.
|
||||
|
@ -149,7 +161,7 @@ void hbs_cluster_nums ( unsigned short this_controller,
|
|||
*
|
||||
***************************************************************************/
|
||||
|
||||
void log_monitored_hosts_list ( void )
|
||||
void cluster_list ( void )
|
||||
{
|
||||
std::list<string>::iterator iter_ptr ;
|
||||
string list = "" ;
|
||||
|
@ -160,9 +172,7 @@ void log_monitored_hosts_list ( void )
|
|||
list.append (*(iter_ptr));
|
||||
list.append (" ");
|
||||
}
|
||||
ilog ("cluster of %ld: %s",
|
||||
ctrl.monitored_hostname_list.size(),
|
||||
list.c_str());
|
||||
ilog ("cluster: %s", list.c_str());
|
||||
}
|
||||
|
||||
|
||||
|
@ -186,6 +196,7 @@ void cluster_storage0_state ( bool enabled )
|
|||
ctrl.cluster.storage0_enabled = enabled ;
|
||||
ilog ("storage-0 heartbeat state changed to %s",
|
||||
enabled ? "enabled" : "disabled" );
|
||||
ctrl.cluster_change = true ;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -237,13 +248,30 @@ void hbs_manage_controller_state ( string & hostname, bool enabled )
|
|||
|
||||
void hbs_cluster_add ( string & hostname )
|
||||
{
|
||||
/* Consider using 'unique' after instead of remove before update. */
|
||||
ctrl.monitored_hostname_list.remove(hostname) ;
|
||||
ctrl.monitored_hostname_list.push_back(hostname) ;
|
||||
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
|
||||
bool already_in_list = false ;
|
||||
std::list<string>::iterator hostname_ptr ;
|
||||
for ( hostname_ptr = ctrl.monitored_hostname_list.begin();
|
||||
hostname_ptr != ctrl.monitored_hostname_list.end() ;
|
||||
hostname_ptr++ )
|
||||
{
|
||||
if ( hostname_ptr->compare(hostname) == 0 )
|
||||
{
|
||||
already_in_list = true ;
|
||||
break ;
|
||||
}
|
||||
}
|
||||
|
||||
if ( already_in_list == false )
|
||||
{
|
||||
ctrl.monitored_hostname_list.push_back(hostname) ;
|
||||
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
|
||||
ilog ("%s added to cluster", hostname.c_str());
|
||||
cluster_list ();
|
||||
ctrl.cluster_change = true ;
|
||||
}
|
||||
|
||||
/* Manage storage-0 state */
|
||||
if ( hostname == STORAGE_0 )
|
||||
if ( hostname.compare(STORAGE_0) == 0 )
|
||||
{
|
||||
cluster_storage0_state ( true );
|
||||
}
|
||||
|
@ -251,15 +279,18 @@ void hbs_cluster_add ( string & hostname )
|
|||
/* If we get down to 0 monitored hosts then just start fresh */
|
||||
if (( ctrl.monitored_hosts ) == 0 )
|
||||
{
|
||||
hbs_cluster_init ( ctrl.cluster.period_msec );
|
||||
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
||||
}
|
||||
|
||||
/* Manage controller state ; true means enabled in this case. */
|
||||
hbs_manage_controller_state ( hostname, true );
|
||||
|
||||
ilog ("%s added to cluster", hostname.c_str());
|
||||
if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr ))
|
||||
{
|
||||
hbs_cluster_send( ctrl.sm_socket_ptr, 0 );
|
||||
ctrl.cluster_change = false ;
|
||||
}
|
||||
|
||||
log_monitored_hosts_list ();
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
|
@ -281,27 +312,46 @@ void hbs_cluster_add ( string & hostname )
|
|||
|
||||
void hbs_cluster_del ( string & hostname )
|
||||
{
|
||||
ctrl.monitored_hostname_list.remove(hostname) ;
|
||||
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
|
||||
|
||||
/* Manage storage-0 state. */
|
||||
if ( hostname == STORAGE_0 )
|
||||
std::list<string>::iterator hostname_ptr ;
|
||||
for ( hostname_ptr = ctrl.monitored_hostname_list.begin();
|
||||
hostname_ptr != ctrl.monitored_hostname_list.end() ;
|
||||
hostname_ptr++ )
|
||||
{
|
||||
cluster_storage0_state ( false );
|
||||
if ( hostname_ptr->compare(hostname) == 0 )
|
||||
{
|
||||
ctrl.monitored_hostname_list.remove(hostname) ;
|
||||
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
|
||||
|
||||
/* Manage storage-0 state. */
|
||||
if ( hostname.compare(STORAGE_0) == 0 )
|
||||
{
|
||||
cluster_storage0_state ( false );
|
||||
}
|
||||
|
||||
/* If we get down to 0 monitored hosts then just start fresh */
|
||||
if (( ctrl.monitored_hosts ) == 0 )
|
||||
{
|
||||
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
|
||||
}
|
||||
|
||||
/* Manage controller state ; false means not enabled in this case. */
|
||||
hbs_manage_controller_state ( hostname , false );
|
||||
|
||||
ilog ("%s deleted from cluster", hostname.c_str());
|
||||
|
||||
cluster_list ();
|
||||
|
||||
ctrl.cluster_change = true ;
|
||||
|
||||
break ;
|
||||
}
|
||||
}
|
||||
|
||||
/* If we get down to 0 monitored hosts then just start fresh */
|
||||
if (( ctrl.monitored_hosts ) == 0 )
|
||||
if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr ))
|
||||
{
|
||||
hbs_cluster_init ( ctrl.cluster.period_msec );
|
||||
hbs_cluster_send( ctrl.sm_socket_ptr, 0 );
|
||||
ctrl.cluster_change = false ;
|
||||
}
|
||||
|
||||
/* Manage controller state ; false means not enabled in this case. */
|
||||
hbs_manage_controller_state ( hostname , false );
|
||||
|
||||
ilog ("%s deleted from cluster", hostname.c_str());
|
||||
|
||||
log_monitored_hosts_list ();
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
|
@ -309,7 +359,7 @@ void hbs_cluster_del ( string & hostname )
|
|||
* Name : hbs_cluster_update
|
||||
*
|
||||
* Description : Update this controller's cluster info for the specified
|
||||
* network with
|
||||
* network with ...
|
||||
*
|
||||
* 1. The number of enabled hosts.
|
||||
* 2. The number of responding hosts.
|
||||
|
@ -333,7 +383,6 @@ void hbs_cluster_del ( string & hostname )
|
|||
*
|
||||
***************************************************************************/
|
||||
|
||||
#define STORAGE_0_NR_THRESHOLD (4)
|
||||
|
||||
void hbs_cluster_update ( iface_enum iface,
|
||||
unsigned short not_responding_hosts,
|
||||
|
@ -357,7 +406,7 @@ void hbs_cluster_update ( iface_enum iface,
|
|||
|
||||
if ( not_responding_hosts )
|
||||
{
|
||||
clog1 ("controller-%d %s enabled:%d not responding:%d",
|
||||
clog ("controller-%d %s enabled:%d not responding:%d",
|
||||
ctrl.this_controller,
|
||||
hbs_cluster_network_name(n).c_str(),
|
||||
ctrl.monitored_hosts,
|
||||
|
@ -365,7 +414,7 @@ void hbs_cluster_update ( iface_enum iface,
|
|||
}
|
||||
else
|
||||
{
|
||||
clog1 ("controller-%d %s has %d monitored hosts and all are responding",
|
||||
clog ("controller-%d %s has %d monitored hosts and all are responding",
|
||||
ctrl.this_controller,
|
||||
hbs_cluster_network_name(n).c_str(),
|
||||
ctrl.monitored_hosts);
|
||||
|
@ -394,9 +443,11 @@ void hbs_cluster_update ( iface_enum iface,
|
|||
history_ptr->network = n ;
|
||||
|
||||
/* Log new network history as its being started. */
|
||||
ilog ("controller-%d %s network history add",
|
||||
ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views",
|
||||
ctrl.this_controller,
|
||||
hbs_cluster_network_name(n).c_str());
|
||||
ctrl.this_controller,
|
||||
hbs_cluster_network_name(n).c_str(),
|
||||
ctrl.cluster.histories);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -457,7 +508,9 @@ void hbs_cluster_update ( iface_enum iface,
|
|||
* ... which is the index for the next entry.
|
||||
*/
|
||||
unsigned short last_entry_index ;
|
||||
if ( history_ptr->oldest_entry_index == 0 )
|
||||
unsigned short oldest_entry_index = history_ptr->oldest_entry_index ;
|
||||
|
||||
if ( oldest_entry_index == 0 )
|
||||
{
|
||||
/* Go to the end of the array. */
|
||||
last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ;
|
||||
|
@ -465,43 +518,88 @@ void hbs_cluster_update ( iface_enum iface,
|
|||
else
|
||||
{
|
||||
/* Otherwise, the previous index in the array */
|
||||
last_entry_index = history_ptr->oldest_entry_index - 1 ;
|
||||
last_entry_index = oldest_entry_index - 1 ;
|
||||
}
|
||||
|
||||
/* Update the history with this data. */
|
||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
|
||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
|
||||
bool logit = false ;
|
||||
string logit_reason = "" ;
|
||||
|
||||
if (( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled !=
|
||||
history_ptr->entry[ last_entry_index].hosts_enabled ) ||
|
||||
( history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding !=
|
||||
history_ptr->entry[ last_entry_index].hosts_responding))
|
||||
/* Update the history with this data. */
|
||||
history_ptr->entry[oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
|
||||
history_ptr->entry[oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
|
||||
|
||||
if (( history_ptr->entry[oldest_entry_index].hosts_enabled !=
|
||||
history_ptr->entry[ last_entry_index].hosts_enabled ) ||
|
||||
( history_ptr->entry[oldest_entry_index].hosts_responding !=
|
||||
history_ptr->entry[ last_entry_index].hosts_responding))
|
||||
{
|
||||
/* Only log on change events. */
|
||||
if ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled ==
|
||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding )
|
||||
if ( history_ptr->entry[oldest_entry_index].hosts_enabled ==
|
||||
history_ptr->entry[oldest_entry_index].hosts_responding )
|
||||
{
|
||||
ilog ("controller-%d %s cluster of %d is healthy",
|
||||
ctrl.this_controller,
|
||||
hbs_cluster_network_name(n).c_str(),
|
||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled);
|
||||
history_ptr->entry[oldest_entry_index].hosts_enabled);
|
||||
ctrl.cluster_change_threshold_count = 0 ;
|
||||
ctrl.cluster_change_difference_count = 0 ;
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("controller-%d %s cluster of %d with %d responding",
|
||||
ctrl.this_controller,
|
||||
hbs_cluster_network_name(n).c_str(),
|
||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled,
|
||||
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding);
|
||||
ctrl.cluster_change_threshold_count++ ;
|
||||
ctrl.cluster_change_difference_count =
|
||||
history_ptr->entry[oldest_entry_index].hosts_enabled -
|
||||
history_ptr->entry[oldest_entry_index].hosts_responding ;
|
||||
}
|
||||
}
|
||||
if ( daemon_get_cfg_ptr()->debug_state&4 )
|
||||
{
|
||||
logit = true ;
|
||||
logit_reason = "(debug)" ;
|
||||
}
|
||||
// else if (( ctrl.cluster_change_threshold_count == 1 ) &&
|
||||
// ( cluster_change == false ))
|
||||
// {
|
||||
// logit = true ;
|
||||
// logit_reason = "" ;
|
||||
// }
|
||||
else if ( ctrl.cluster_change_threshold_count >= CLUSTER_CHANGE_THRESHOLD )
|
||||
{
|
||||
logit = true ;
|
||||
ctrl.cluster_change_threshold_count = 0 ;
|
||||
logit_reason = "(threshold)" ;
|
||||
}
|
||||
else
|
||||
{
|
||||
int delta =
|
||||
history_ptr->entry[oldest_entry_index].hosts_enabled -
|
||||
history_ptr->entry[oldest_entry_index].hosts_responding ;
|
||||
if ( delta != ctrl.cluster_change_difference_count )
|
||||
{
|
||||
logit = true ;
|
||||
ctrl.cluster_change_difference_count = delta ;
|
||||
logit_reason = "(delta)" ;
|
||||
}
|
||||
}
|
||||
|
||||
if ( logit )
|
||||
{
|
||||
ilog ("controller-%d %s cluster of %d with %d responding (%d:%d) %s",
|
||||
ctrl.this_controller,
|
||||
hbs_cluster_network_name(n).c_str(),
|
||||
history_ptr->entry[oldest_entry_index].hosts_enabled,
|
||||
history_ptr->entry[oldest_entry_index].hosts_responding,
|
||||
ctrl.cluster_change_difference_count,
|
||||
not_responding_hosts,
|
||||
logit_reason.c_str());
|
||||
}
|
||||
|
||||
/* Increment the entries count till it reaches the max. */
|
||||
if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
|
||||
history_ptr->entries++ ;
|
||||
|
||||
/* Manage the next entry update index ; aka the oldest index. */
|
||||
if ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
|
||||
if ( oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
|
||||
history_ptr->oldest_entry_index = 0 ;
|
||||
else
|
||||
history_ptr->oldest_entry_index++ ;
|
||||
|
@ -521,24 +619,31 @@ void hbs_cluster_update ( iface_enum iface,
|
|||
|
||||
void hbs_cluster_append ( hbs_message_type & msg )
|
||||
{
|
||||
unsigned short c = ctrl.this_controller ;
|
||||
|
||||
CHECK_CTRL_NTWK_PARMS(c, ctrl.monitored_networks);
|
||||
CHECK_CTRL_NTWK_PARMS(ctrl.this_controller, ctrl.monitored_networks);
|
||||
|
||||
msg.cluster.version = ctrl.cluster.version ;
|
||||
msg.cluster.revision = ctrl.cluster.revision ;
|
||||
msg.cluster.magic_number = ctrl.cluster.magic_number ;
|
||||
msg.cluster.period_msec = ctrl.cluster.period_msec ;
|
||||
msg.cluster.storage0_enabled = ctrl.cluster.storage0_enabled ;
|
||||
msg.cluster.histories = ctrl.cluster.histories ;
|
||||
msg.cluster.histories = 0 ;
|
||||
|
||||
int bytes = BYTES_IN_CLUSTER_VAULT(ctrl.monitored_networks);
|
||||
/* Copy this controller's cluster history into the broadcast request. */
|
||||
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
||||
{
|
||||
if ( ctrl.cluster.history[h].controller == ctrl.this_controller )
|
||||
{
|
||||
memcpy( &msg.cluster.history[msg.cluster.histories],
|
||||
&ctrl.cluster.history[h],
|
||||
sizeof(mtce_hbs_cluster_history_type));
|
||||
|
||||
clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
|
||||
c, ctrl.monitored_networks, ctrl.cluster.histories, bytes );
|
||||
msg.cluster.histories++ ;
|
||||
}
|
||||
}
|
||||
msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories);
|
||||
|
||||
/* Copy the cluster into the message. */
|
||||
memcpy( &msg.cluster.history[0], &ctrl.cluster.history[c], bytes);
|
||||
clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
|
||||
ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
|
@ -574,57 +679,8 @@ unsigned short hbs_cluster_unused_bytes ( void )
|
|||
*
|
||||
***************************************************************************/
|
||||
|
||||
/* NOTE: All code wrapped in this directive will be removed once
|
||||
* active/active heartbeating is delivered in next update */
|
||||
#define WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
|
||||
|
||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
|
||||
{
|
||||
|
||||
#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
|
||||
|
||||
/* To assist SM with duplex integration ...
|
||||
*
|
||||
* This code emulates heartbeat redundancy by duplicating
|
||||
* controller history up to the number of provisioned
|
||||
* controllers until active-active heartbeat is delivered.
|
||||
*/
|
||||
int peer_controller ;
|
||||
bool copy_cluster = false ;
|
||||
if ( ctrl.this_controller == 0 )
|
||||
{
|
||||
peer_controller = 1 ;
|
||||
if ( ctrl.controller_1_enabled )
|
||||
{
|
||||
copy_cluster = true ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
peer_controller = 0 ;
|
||||
if ( ctrl.controller_0_enabled )
|
||||
{
|
||||
copy_cluster = true ;
|
||||
}
|
||||
}
|
||||
|
||||
int n, networks = ctrl.cluster.histories ;
|
||||
if ( copy_cluster )
|
||||
{
|
||||
for ( n = 0 ; n < networks ; n++ )
|
||||
{
|
||||
/* copy this controller history to create peer controller */
|
||||
ctrl.cluster.history[ctrl.cluster.histories] = ctrl.cluster.history[n] ;
|
||||
|
||||
/* update the controller */
|
||||
ctrl.cluster.history[ctrl.cluster.histories].controller = peer_controller ;
|
||||
ctrl.cluster.bytes += sizeof(mtce_hbs_cluster_history_type) ;
|
||||
ctrl.cluster.histories++ ;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
|
||||
|
||||
ctrl.cluster.reqid = (unsigned short)reqid ;
|
||||
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
|
||||
{
|
||||
|
@ -637,34 +693,82 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
|
|||
}
|
||||
else
|
||||
{
|
||||
ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
|
||||
hbs_cluster_dump ( ctrl.cluster );
|
||||
string reason = "" ;
|
||||
// ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
|
||||
if ( reqid )
|
||||
reason = "cluster query" ;
|
||||
else
|
||||
reason = "cluster event" ;
|
||||
hbs_cluster_dump ( ctrl.cluster, reason, true );
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
|
||||
|
||||
if ( copy_cluster )
|
||||
else
|
||||
{
|
||||
/* Clear out the other controllers data. */
|
||||
for ( n = networks ; n > 0 ; n-- )
|
||||
wlog ("cannot send cluster info due to socket error");
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_history_save
|
||||
*
|
||||
* Description : Copy the history sample to the vault.
|
||||
*
|
||||
* Returns : Nothing.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample )
|
||||
{
|
||||
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
|
||||
{
|
||||
if (( ctrl.cluster.history[h].controller == sample.controller ) &&
|
||||
( ctrl.cluster.history[h].network == sample.network ))
|
||||
{
|
||||
/* copy c0 history to another controller */
|
||||
hbs_cluster_history_init(ctrl.cluster.history[ctrl.cluster.histories-1]);
|
||||
ctrl.cluster.bytes -= sizeof(mtce_hbs_cluster_history_type);
|
||||
ctrl.cluster.histories-- ;
|
||||
memcpy( &ctrl.cluster.history[h], &sample,
|
||||
sizeof(mtce_hbs_cluster_history_type));
|
||||
|
||||
clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)",
|
||||
ctrl.this_controller,
|
||||
sample.controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
|
||||
hostname.c_str(),
|
||||
ctrl.cluster.histories);
|
||||
return ;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
|
||||
/* not found ? Add a new one */
|
||||
memcpy( &ctrl.cluster.history[ctrl.cluster.histories], &sample,
|
||||
sizeof(mtce_hbs_cluster_history_type));
|
||||
ctrl.cluster.histories++ ;
|
||||
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
|
||||
|
||||
ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views",
|
||||
ctrl.this_controller,
|
||||
sample.controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
|
||||
ctrl.cluster.histories);
|
||||
}
|
||||
|
||||
void hbs_state_audit ( void )
|
||||
{
|
||||
hbs_cluster_dump ( ctrl.cluster, "Audit", true );
|
||||
}
|
||||
|
||||
|
||||
/* Convenience wrapper: log this controller's own cluster vault
 * (ctrl.cluster) for 'hostname', tagging the output with 'prefix'.
 * All work is delegated to the overload that takes an explicit vault. */
void hbs_cluster_log ( string & hostname, string prefix )
{
    mtce_hbs_cluster_type & local_vault = ctrl.cluster ;
    hbs_cluster_log ( hostname, local_vault, prefix );
}
|
||||
|
||||
/* Convenience wrapper: same as the explicit-vault overload but always
 * operates on this controller's own vault (ctrl.cluster).
 *
 *   hostname   - host the log lines are attributed to
 *   log_prefix - text tag passed through to the log lines
 *   force      - passed through unchanged to the explicit-vault overload
 */
void hbs_cluster_log ( string & hostname,
                       string   log_prefix,
                       bool     force )
{
    mtce_hbs_cluster_type & local_vault = ctrl.cluster ;
    hbs_cluster_log ( hostname, local_vault, log_prefix, force );
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Active Active Heartbeating and Debug Member Functions
|
||||
|
@ -724,10 +828,6 @@ int hbs_cluster_cmp( hbs_message_type & msg )
|
|||
* Description : Copies the other controllers information from msg into
|
||||
* the cluster.
|
||||
*
|
||||
* NOTE: Does not do that right now.
|
||||
*
|
||||
* Assumptions : Place holder until active/active heartbeating is implemented.
|
||||
*
|
||||
* Returns : PASS or FAIL
|
||||
*
|
||||
***************************************************************************/
|
||||
|
@ -736,12 +836,29 @@ int hbs_cluster_save ( string & hostname,
|
|||
mtce_hbs_network_enum network,
|
||||
hbs_message_type & msg )
|
||||
{
|
||||
// clog ("Add cluster info from peer controller");
|
||||
if ( ctrl.monitored_hosts )
|
||||
/* cluster info is only supported in HBS_MESSAGE_VERSION 1 */
|
||||
if ( msg.v < HBS_MESSAGE_VERSION )
|
||||
return FAIL_NOT_SUPPORTED ;
|
||||
|
||||
if ( ! ctrl.monitored_hosts )
|
||||
return RETRY ;
|
||||
|
||||
if ( msg.cluster.histories == 0 )
|
||||
return PASS ;
|
||||
|
||||
for ( int h = 0 ; h < msg.cluster.histories ; h++ )
|
||||
{
|
||||
/* compare cluster info and log deltas */
|
||||
// hbs_cluster_cmp( msg );
|
||||
UNUSED(msg);
|
||||
if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS )
|
||||
{
|
||||
elog ("Invalid network id (%d:%d:%d)",
|
||||
h,
|
||||
msg.cluster.history[h].controller,
|
||||
msg.cluster.history[h].network );
|
||||
}
|
||||
else if ( msg.cluster.history[h].controller != ctrl.this_controller )
|
||||
{
|
||||
hbs_history_save ( hostname, msg.cluster.history[h] );
|
||||
}
|
||||
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
|
||||
}
|
||||
return (PASS);
|
||||
|
|
|
@ -241,10 +241,11 @@ int mtcSmgrApi_active_services ( string hostname , bool * yes_no_ptr )
|
|||
return(PASS);
|
||||
}
|
||||
|
||||
int send_hbs_command ( string hostname, int command )
|
||||
int send_hbs_command ( string hostname, int command, string controller )
|
||||
{
|
||||
UNUSED(hostname);
|
||||
UNUSED(command);
|
||||
UNUSED(controller);
|
||||
return(PASS);
|
||||
}
|
||||
|
||||
|
|
|
@ -111,6 +111,33 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network )
|
|||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_copy
|
||||
*
|
||||
* Description : Copies cluster from src to dst.
|
||||
*
|
||||
* Returns : Nothing.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
/* Copy the cluster header fields and the in-use history elements from
 * 'src' into 'dst', then recompute dst.bytes from the copied history
 * count. Fields not assigned below (e.g. reqid) are left untouched in
 * 'dst'. */
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst )
{
    /* Header fields. */
    dst.version          = src.version ;
    dst.revision         = src.revision ;
    dst.magic_number     = src.magic_number ;
    dst.period_msec      = src.period_msec ;
    dst.storage0_enabled = src.storage0_enabled ;
    dst.histories        = src.histories ;

    /* History elements ; only the ones the source reports as in use. */
    int index = 0 ;
    while ( index < dst.histories )
    {
        memcpy ( &dst.history[index],
                 &src.history[index],
                 sizeof(mtce_hbs_cluster_history_type));
        ++index ;
    }

    /* Size follows from the number of histories just copied. */
    dst.bytes = BYTES_IN_CLUSTER_VAULT(dst.histories);
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
|
@ -126,11 +153,9 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network )
|
|||
|
||||
void hbs_cluster_log ( string & hostname,
|
||||
mtce_hbs_cluster_type & cluster,
|
||||
string log_prefix )
|
||||
string log_prefix,
|
||||
bool force )
|
||||
{
|
||||
// bool want_log = false ;
|
||||
|
||||
clog1 ("log %d histories", cluster.histories );
|
||||
for ( int h = 0 ; h < cluster.histories ; h++ )
|
||||
{
|
||||
if ( cluster.history[h].entries == MTCE_HBS_HISTORY_ENTRIES )
|
||||
|
@ -140,8 +165,6 @@ void hbs_cluster_log ( string & hostname,
|
|||
mtce_hbs_cluster_entry_type e = { 0, 0 } ;
|
||||
char str[MAX_CLUSTER_LINE_LEN] ;
|
||||
string line = "";
|
||||
int start = 0 ;
|
||||
int stop = 0 ;
|
||||
bool newline = false ;
|
||||
bool logit = false ;
|
||||
bool first = false ;
|
||||
|
@ -149,18 +172,13 @@ void hbs_cluster_log ( string & hostname,
|
|||
|
||||
mtce_hbs_cluster_history_type * history_ptr = &cluster.history[h] ;
|
||||
|
||||
clog1 ("%s %s has %d entries (controller-%d view from %s)", hostname.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
history_ptr->entries,
|
||||
history_ptr->controller,
|
||||
log_prefix.c_str());
|
||||
|
||||
|
||||
/* Manage local this_index for log display.
|
||||
* Display oldest to newest ; left to right
|
||||
*
|
||||
* */
|
||||
int this_index = history_ptr->oldest_entry_index ;
|
||||
int debug = daemon_get_cfg_ptr()->debug_state ;
|
||||
|
||||
for ( int count = 0 ; count < history_ptr->entries ; count++ )
|
||||
{
|
||||
if (( line.length() + MAX_ENTRY_STR_LEN ) >=
|
||||
|
@ -180,13 +198,11 @@ void hbs_cluster_log ( string & hostname,
|
|||
}
|
||||
#endif
|
||||
|
||||
// want_log = true ;
|
||||
|
||||
if ( count == 0 )
|
||||
{
|
||||
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
|
||||
history_ptr->entry[this_index].hosts_enabled,
|
||||
history_ptr->entry[this_index].hosts_responding ); // , this_index );
|
||||
history_ptr->entry[this_index].hosts_responding );
|
||||
line.append (str);
|
||||
str[0] = '\0' ;
|
||||
}
|
||||
|
@ -203,7 +219,7 @@ void hbs_cluster_log ( string & hostname,
|
|||
{
|
||||
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
|
||||
history_ptr->entry[this_index].hosts_enabled,
|
||||
history_ptr->entry[this_index].hosts_responding ); // , this_index );
|
||||
history_ptr->entry[this_index].hosts_responding );
|
||||
line.append (str);
|
||||
str[0] = '\0' ;
|
||||
logit = true ;
|
||||
|
@ -214,31 +230,21 @@ void hbs_cluster_log ( string & hostname,
|
|||
first_log[h] = true ;
|
||||
logit = true ;
|
||||
}
|
||||
stop++ ;
|
||||
if ( newline == true )
|
||||
{
|
||||
if ( logit )
|
||||
{
|
||||
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
|
||||
if ( hostname == controller )
|
||||
if (( force ) || ( debug&2 ))
|
||||
{
|
||||
clog ("%s view %s %s %02d..%02d: %s,",
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
clog ("%s view from %s %s %s %02d..%02d: %s,",
|
||||
controller.c_str(),
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
syslog ( LOG_INFO, "%s view from %s %s %s: %s",
|
||||
controller.c_str(),
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
line.c_str());
|
||||
}
|
||||
}
|
||||
start = stop + 1 ;
|
||||
line.clear();
|
||||
first = true ;
|
||||
newline = false ;
|
||||
|
@ -253,7 +259,6 @@ void hbs_cluster_log ( string & hostname,
|
|||
}
|
||||
if (( newline == false ) && ( line.length() ))
|
||||
{
|
||||
// ERIC
|
||||
if (( logit == false ) && ( was_diff[h] == true ))
|
||||
{
|
||||
logit = true ;
|
||||
|
@ -264,30 +269,25 @@ void hbs_cluster_log ( string & hostname,
|
|||
{
|
||||
if ( first )
|
||||
{
|
||||
clog ("............ %s %s %02d..%02d: %s",
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
if (( force ) || ( debug&2 ))
|
||||
{
|
||||
syslog ( LOG_INFO, "............ %s %s: %s",
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
line.c_str());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
|
||||
if ( hostname == controller )
|
||||
if (( force ) || ( debug&2 ))
|
||||
{
|
||||
clog ("%s view %s %s %02d..%02d: %s",
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(),
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
clog ("%s view from %s %s %s %02d..%02d: %s",
|
||||
controller.c_str(),
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(), /* Infra <- */
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
start, stop, line.c_str());
|
||||
syslog ( LOG_INFO, "%s view from %s %s %s: %s",
|
||||
controller.c_str(),
|
||||
hostname.c_str(),
|
||||
log_prefix.c_str(), /* Infra <- */
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
|
||||
line.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -307,40 +307,62 @@ void hbs_cluster_log ( string & hostname,
|
|||
* Description: Formatted dump of the vault contents to the log file.
|
||||
*
|
||||
***************************************************************************/
|
||||
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault )
|
||||
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force )
|
||||
{
|
||||
syslog ( LOG_INFO, "Cluster Vault Dump: --------------------------------------------------------------------------------------------");
|
||||
syslog ( LOG_INFO, "Cluster Vault: v%d.%d %d msec period ; SM Reqid is %d with storage-0 %s and %d histories in %d bytes",
|
||||
vault.version,
|
||||
vault.revision,
|
||||
vault.period_msec,
|
||||
vault.reqid,
|
||||
vault.storage0_enabled ? "enabled" : "disabled",
|
||||
vault.histories,
|
||||
vault.bytes );
|
||||
for ( int h = 0 ; h < vault.histories ; h++ )
|
||||
if ( vault.version == 0 )
|
||||
return ;
|
||||
|
||||
int debug = daemon_get_cfg_ptr()->debug_state ;
|
||||
|
||||
if (( debug & 2 ) || ( force == true ))
|
||||
{
|
||||
#define MAX_LINE_LEN (500)
|
||||
char str[MAX_LINE_LEN] ;
|
||||
int i = 0 ;
|
||||
for ( int e = 0 ; e < vault.history[h].entries_max ; e++ )
|
||||
{
|
||||
snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" ,
|
||||
vault.history[h].oldest_entry_index==e ? '>' : ' ',
|
||||
vault.history[h].entry[e].hosts_enabled,
|
||||
vault.history[h].entry[e].hosts_responding);
|
||||
i = strlen(str) ;
|
||||
}
|
||||
syslog ( LOG_INFO, "Cluster Vault: C%d %s S:%s:%s (%d:%d) %s",
|
||||
vault.history[h].controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
|
||||
vault.storage0_enabled ? "y" : "n",
|
||||
vault.history[h].storage0_responding ? "y" : "n",
|
||||
vault.history[h].entries_max,
|
||||
vault.history[h].entries,
|
||||
str);
|
||||
ilog ("%s", log_prefix.c_str());
|
||||
syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec heartbeat period %s;%d network heartbeat response histories (%d bytes)",
|
||||
vault.version,
|
||||
vault.revision,
|
||||
vault.period_msec,
|
||||
vault.storage0_enabled ? " with storage-0: enabled " : "",
|
||||
vault.histories,
|
||||
vault.bytes );
|
||||
}
|
||||
|
||||
if (( debug & 4 ) || ( force == true ))
|
||||
{
|
||||
for ( int h = 0 ; h < vault.histories ; h++ )
|
||||
{
|
||||
#define MAX_LINE_LEN (500)
|
||||
char str[MAX_LINE_LEN] ;
|
||||
int i = 0 ;
|
||||
for ( int e = 0 ; e < vault.history[h].entries_max ; e++ )
|
||||
{
|
||||
snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" ,
|
||||
vault.history[h].oldest_entry_index==e ? '>' : ' ',
|
||||
vault.history[h].entry[e].hosts_enabled,
|
||||
vault.history[h].entry[e].hosts_responding);
|
||||
i = strlen(str) ;
|
||||
}
|
||||
if ( vault.storage0_enabled )
|
||||
{
|
||||
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
|
||||
vault.history[h].controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
|
||||
vault.history[h].storage0_responding ? "y" : "n",
|
||||
str);
|
||||
}
|
||||
else
|
||||
{
|
||||
syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
|
||||
vault.history[h].controller,
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
|
||||
str);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( debug & 8 )
|
||||
{
|
||||
dump_memory ( &vault, 16, vault.bytes );
|
||||
}
|
||||
// dump_memory ( &vault, 16, vault.bytes );
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -46,6 +46,7 @@ CONTROL_OBJS += mtcHttpSvr.o
|
|||
CONTROL_OBJS += mtcCmdHdlr.o
|
||||
CONTROL_OBJS += mtcNodeMnfa.o
|
||||
CONTROL_OBJS += mtcVimApi.o
|
||||
CONTROL_OBJS += mtcStubs.o
|
||||
CONTROL_OBJS += ../common/nodeClass.o
|
||||
|
||||
OBJS = $(SRCS:.cpp=.o)
|
||||
|
|
|
@ -48,6 +48,7 @@ using namespace std;
|
|||
#include "mtcAlarm.h" /* for ... mtcAlarm... */
|
||||
#include "nodeUtil.h" /* for ... get_event_str ... */
|
||||
|
||||
int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr );
|
||||
|
||||
/* Throttle logging of messages from unknown IP addresses */
|
||||
std::list<string> unknown_ip_list ;
|
||||
|
@ -766,7 +767,7 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
|
|||
return ( rc );
|
||||
}
|
||||
|
||||
int send_hbs_command ( string hostname, int cmd )
|
||||
int send_hbs_command ( string hostname, int cmd, string controller )
|
||||
{
|
||||
int bytes = 0 ;
|
||||
int bytes_to_send = 0 ;
|
||||
|
@ -776,18 +777,6 @@ int send_hbs_command ( string hostname, int cmd )
|
|||
mtc_message_type event ;
|
||||
mtc_socket_type * sock_ptr = get_sockPtr ();
|
||||
|
||||
/* We don't heartbeat self */
|
||||
if (( obj_ptr->is_active_controller (hostname) ) &&
|
||||
(( cmd == MTC_CMD_ADD_HOST ) ||
|
||||
( cmd == MTC_CMD_DEL_HOST ) ||
|
||||
( cmd == MTC_CMD_START_HOST ) ||
|
||||
( cmd == MTC_CMD_STOP_HOST )))
|
||||
{
|
||||
dlog ("%s refusing to '%s' self to heartbeat service\n",
|
||||
hostname.c_str(), get_event_str(cmd).c_str());
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
memset (&event, 0 , sizeof(mtc_message_type));
|
||||
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_hbs_cmd_req_header() );
|
||||
snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME , "%s", hostname.data());
|
||||
|
@ -795,48 +784,72 @@ int send_hbs_command ( string hostname, int cmd )
|
|||
/* There is no buffer data in any of these messages */
|
||||
bytes_to_send = ((sizeof(mtc_message_type))-(BUF_SIZE)) ;
|
||||
|
||||
switch ( cmd )
|
||||
{
|
||||
case MTC_CMD_STOP_HOST:
|
||||
ilog ("%s sending 'stop' to heartbeat service\n", hostname.c_str());
|
||||
break ;
|
||||
case MTC_CMD_START_HOST:
|
||||
obj_ptr->manage_heartbeat_clear ( hostname , MAX_IFACES );
|
||||
ilog ("%s sending 'start' to heartbeat service\n", hostname.c_str());
|
||||
break ;
|
||||
case MTC_CMD_DEL_HOST:
|
||||
ilog ("%s sending 'delete' to heartbeat service\n", hostname.c_str());
|
||||
break ;
|
||||
case MTC_CMD_ADD_HOST:
|
||||
obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES );
|
||||
ilog ("%s sending 'add' to heartbeat service\n", hostname.c_str());
|
||||
break ;
|
||||
case MTC_RESTART_HBS:
|
||||
ilog ("%s sending 'restart' to heartbeat service\n", hostname.c_str());
|
||||
break ;
|
||||
case MTC_BACKOFF_HBS:
|
||||
ilog ("%s requesting heartbeat period backoff\n", hostname.c_str());
|
||||
break ;
|
||||
case MTC_RECOVER_HBS:
|
||||
ilog ("%s requesting heartbeat period recovery\n", hostname.c_str());
|
||||
break ;
|
||||
default:
|
||||
{
|
||||
slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd );
|
||||
return (FAIL_BAD_PARM);
|
||||
}
|
||||
}
|
||||
|
||||
event.cmd = cmd ;
|
||||
event.num = 1 ;
|
||||
event.parm[0] = obj_ptr->get_nodetype(hostname);
|
||||
|
||||
/* send to hbsAgent daemon port */
|
||||
bytes = sock_ptr->mtc_to_hbs_sock->write((char*) &event, bytes_to_send);
|
||||
if ( bytes <= 0 )
|
||||
std::list<string> controllers ;
|
||||
controllers.clear();
|
||||
if ( controller == CONTROLLER )
|
||||
{
|
||||
wlog ("Cannot send to heartbeat service\n");
|
||||
rc = FAIL_TO_TRANSMIT ;
|
||||
controllers.push_back(CONTROLLER_0);
|
||||
controllers.push_back(CONTROLLER_1);
|
||||
}
|
||||
else
|
||||
{
|
||||
controllers.push_back(controller);
|
||||
}
|
||||
string ip = "" ;
|
||||
std::list<string>::iterator unit ;
|
||||
for ( unit = controllers.begin () ;
|
||||
unit != controllers.end () ;
|
||||
unit++ )
|
||||
{
|
||||
switch ( cmd )
|
||||
{
|
||||
case MTC_CMD_ACTIVE_CTRL:
|
||||
mlog3 ("%s sending 'activity state' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
|
||||
break ;
|
||||
case MTC_CMD_STOP_HOST:
|
||||
ilog ("%s sending 'stop' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
|
||||
break ;
|
||||
case MTC_CMD_START_HOST:
|
||||
obj_ptr->manage_heartbeat_clear ( hostname , MAX_IFACES );
|
||||
ilog ("%s sending 'start' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
|
||||
break ;
|
||||
case MTC_CMD_DEL_HOST:
|
||||
ilog ("%s sending 'delete' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
|
||||
break ;
|
||||
case MTC_CMD_ADD_HOST:
|
||||
obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES );
|
||||
ilog ("%s sending 'add' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
|
||||
break ;
|
||||
case MTC_RESTART_HBS:
|
||||
ilog ("%s sending 'restart' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
|
||||
break ;
|
||||
case MTC_BACKOFF_HBS:
|
||||
ilog ("%s requesting %s heartbeat period backoff\n", hostname.c_str(), unit->c_str());
|
||||
break ;
|
||||
case MTC_RECOVER_HBS:
|
||||
ilog ("%s requesting %s heartbeat period recovery\n", hostname.c_str(), unit->c_str());
|
||||
break ;
|
||||
default:
|
||||
{
|
||||
slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd );
|
||||
rc = FAIL_BAD_PARM ;
|
||||
continue ;
|
||||
}
|
||||
}
|
||||
|
||||
ip = get_mtcInv_ptr()->get_hostaddr(*unit) ;
|
||||
bytes = sock_ptr->mtc_to_hbs_sock->write((char*) &event, bytes_to_send, ip.data());
|
||||
if ( bytes <= 0 )
|
||||
{
|
||||
wlog ("%s failed to send command (0x%x) to heartbeat service at %s\n", unit->c_str(), cmd, ip.c_str() );
|
||||
rc = FAIL_TO_TRANSMIT ;
|
||||
}
|
||||
}
|
||||
return rc ;
|
||||
}
|
||||
|
@ -954,6 +967,14 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
|
|||
/* Assert the degrade condition with the 'false' (i.e. not clear)*/
|
||||
obj_ptr->manage_heartbeat_degrade ( hostname, iface, false );
|
||||
}
|
||||
/* Otherwise the action must be alarm only or none ; both of which
|
||||
* are already handled by the hbsAgent, so do nothing */
|
||||
else
|
||||
{
|
||||
ilog ("%s heartbeat degrade event dropped ; action is not fail or degrade (%s)\n",
|
||||
hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1003,7 +1024,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
|
|||
* are already handled by the hbsAgent, so do nothing */
|
||||
else
|
||||
{
|
||||
dlog ("%s heartbeat loss event dropped (%s)\n",
|
||||
ilog ("%s heartbeat loss event dropped ; action is not fail or degrade (%s)\n",
|
||||
hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
}
|
||||
|
@ -1070,6 +1091,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
|
|||
|
||||
else if ( msg.cmd == MTC_EVENT_HEARTBEAT_READY )
|
||||
{
|
||||
string controller = CONTROLLER ;
|
||||
std::list<string>::iterator temp ;
|
||||
|
||||
/* no heartbeating in simplex mode */
|
||||
|
@ -1078,7 +1100,17 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
|
|||
return (PASS);
|
||||
}
|
||||
|
||||
ilog ("Received 'Heartbeat Service Ready' Event\n");
|
||||
/* get the controller that sent this ready event */
|
||||
if (( msg.buf[0] != '\0' ) && ( strnlen( msg.buf, BUF_SIZE) <= MAX_CHARS_HOSTNAME ))
|
||||
{
|
||||
controller = msg.buf ;
|
||||
ilog ("%s Heartbeat Service Ready Event (%s)\n",
|
||||
msg.buf, sock_ptr->mtc_event_rx_sock->get_src_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("Heartbeat Service Ready Event\n");
|
||||
}
|
||||
obj_ptr->hbs_ready = true ;
|
||||
|
||||
/* Run Maintenance on Inventory */
|
||||
|
@ -1093,25 +1125,17 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
|
|||
* the heartbeat service. This tells the heartbeat
|
||||
* service about all the hosts so that it will
|
||||
* send heartbeat oob flag events to mtce. */
|
||||
if ( send_hbs_command( hostname, MTC_CMD_ADD_HOST ) != PASS )
|
||||
if ( send_hbs_command( hostname, MTC_CMD_ADD_HOST, controller ) != PASS )
|
||||
{
|
||||
elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str());
|
||||
}
|
||||
/* Send the start event to the heartbeat service for all enabled hosts except
|
||||
* for the active controller which is not actively monitored */
|
||||
if ( obj_ptr->is_active_controller ( hostname ) == false )
|
||||
/* Send the start event to the heartbeat service for all enabled hosts */
|
||||
if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) &&
|
||||
((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
||||
(obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )))
|
||||
{
|
||||
if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) &&
|
||||
((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
||||
(obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )))
|
||||
{
|
||||
send_hbs_command ( hostname, MTC_CMD_START_HOST );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
dlog ("%s Refusing to start heartbeat of self\n", hostname.c_str() );
|
||||
send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -802,7 +802,11 @@ int mtc_socket_init ( void )
|
|||
/***********************************************************/
|
||||
|
||||
int port = daemon_get_cfg_ptr()->hbs_to_mtc_event_port ;
|
||||
mtc_sock.mtc_event_rx_sock = new msgClassRx(LOOPBACK_IP, port, IPPROTO_UDP);
|
||||
|
||||
/* listen to this port on any interface so that the hbsAgent running
|
||||
* locally or on peer controller can get events into mtcAgent */
|
||||
mtc_sock.mtc_event_rx_sock =
|
||||
new msgClassRx(mtcInv.my_float_ip.data(), port, IPPROTO_UDP);
|
||||
rc = mtc_sock.mtc_event_rx_sock->return_status;
|
||||
if ( rc )
|
||||
{
|
||||
|
@ -820,7 +824,7 @@ int mtc_socket_init ( void )
|
|||
/***********************************************************/
|
||||
|
||||
port = daemon_get_cfg_ptr()->mtc_to_hbs_cmd_port ;
|
||||
sock_ptr->mtc_to_hbs_sock = new msgClassTx(LOOPBACK_IP, port, IPPROTO_UDP);
|
||||
sock_ptr->mtc_to_hbs_sock = new msgClassTx(CONTROLLER, port, IPPROTO_UDP, mtc_config.mgmnt_iface);
|
||||
rc = sock_ptr->mtc_to_hbs_sock->return_status;
|
||||
if ( rc )
|
||||
{
|
||||
|
@ -1281,11 +1285,14 @@ void daemon_service_run ( void )
|
|||
mtcInv.inotify_shadow_file_fd ,
|
||||
mtcInv.inotify_shadow_file_wd );
|
||||
|
||||
/* Add this controller to the heartbeat service so that we
|
||||
* receive the out-of-band heartbeat 'flags' even though
|
||||
* we don't self monitor the active controller specifically
|
||||
* This add may be duplicate but covers the initial config case */
|
||||
/* inform the heartbeat service that this controller is active */
|
||||
send_hbs_command ( mtcInv.my_hostname, MTC_CMD_ACTIVE_CTRL );
|
||||
|
||||
/* Add this controller to the heartbeat service so that
|
||||
* the peer hbsAgent also gets this controller's inventory
|
||||
* and this hbsAgent receives the out-of-band heartbeat 'flags' */
|
||||
send_hbs_command ( mtcInv.my_hostname, MTC_CMD_ADD_HOST );
|
||||
send_hbs_command ( mtcInv.my_hostname, MTC_CMD_START_HOST );
|
||||
|
||||
socks.clear();
|
||||
socks.push_front (mtc_sock.mtc_event_rx_sock->getFD()); // service_events
|
||||
|
|
|
@ -6205,6 +6205,13 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
|
||||
#endif
|
||||
|
||||
/* Audits for this controller host only */
|
||||
if ( node_ptr->hostname == this->my_hostname )
|
||||
{
|
||||
/* Remind the heartbeat service that this is the active ctrl */
|
||||
send_hbs_command ( this->my_hostname, MTC_CMD_ACTIVE_CTRL );
|
||||
}
|
||||
|
||||
/* Manage active controller auto recovery bool.
|
||||
* If the inactive controller is inservice then disable
|
||||
* controller autorecovery. Otherwise enable it but in this case
|
||||
|
|
|
@ -14,4 +14,10 @@ using namespace std;
|
|||
|
||||
#include "nodeClass.h" /* The main link class */
|
||||
|
||||
/* No-argument hbs_cluster_log stub: the body is empty, so a call does nothing. */
void hbs_cluster_log ( void ) { }
|
||||
/* Stub version of hbs_cluster_log: produces no log output.
 *
 * hostname, prefix and force are each handed to UNUSED() and are not
 * otherwise read ; the function has no other statements and returns nothing.
 *
 * NOTE(review): presumably a placeholder that satisfies a reference from
 * shared code in a build that has no real cluster logging - confirm. */
void hbs_cluster_log ( string & hostname, string prefix, bool force=false )
|
||||
{
|
||||
UNUSED(hostname);
|
||||
UNUSED(prefix);
|
||||
UNUSED(force);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue