Merge "Implement Active-Active Heartbeat as HA Improvement"

This commit is contained in:
Zuul 2018-11-21 16:42:56 +00:00 committed by Gerrit Code Review
commit abf0ff3986
30 changed files with 1678 additions and 837 deletions

View File

@ -39,7 +39,6 @@ typedef struct
{
int scheduling_priority ; /**< Scheduling priority of this daemon */
bool active ; /**< Maintenance activity state true|false */
int hbs_pulse_period ; /**< time (msec) between heartbeat requests */
int token_refresh_rate ; /**< token refresh rate in seconds */
int hbs_minor_threshold ; /**< heartbeat miss minor threshold */
int hbs_degrade_threshold ; /**< heartbeat miss degrade threshold */
@ -351,7 +350,7 @@ extern char *program_invocation_short_name;
}
#define blog(format, args...) { \
if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt&1) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
else { if(daemon_get_cfg_ptr()->debug_bmgmt) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
}
@ -380,22 +379,22 @@ extern char *program_invocation_short_name;
#define mlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&4 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg4 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define mlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&8 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg8 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define jlog(format, args...) { if(daemon_get_cfg_ptr()->debug_json ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define jlog(format, args...) { if(daemon_get_cfg_ptr()->debug_json&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define jlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_json&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define jlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_json&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define jlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_json&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define hlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_http&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define hlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_http&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define hlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_http&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define alog(format, args...) { if(daemon_get_cfg_ptr()->debug_alive ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define alog(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define alog1(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define alog2(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define alog3(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define qlog(format, args...) { if(daemon_get_cfg_ptr()->debug_work) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define qlog(format, args...) { if(daemon_get_cfg_ptr()->debug_work&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define qlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_work&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define qlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_work&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define qlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_work&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
@ -403,8 +402,11 @@ extern char *program_invocation_short_name;
#define flog(format, args...) { if(daemon_get_cfg_ptr()->debug_fsm) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: FSM : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define tlog(format, args...) { if(daemon_get_cfg_ptr()->debug_timer) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Timer: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define clog1(format, args...) { if(daemon_get_cfg_ptr()->debug_state&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define clog2(format, args...) { if(daemon_get_cfg_ptr()->debug_state&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define clog3(format, args...) { if(daemon_get_cfg_ptr()->debug_state&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define log_event(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Event: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define log_stress(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Stress: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }

View File

@ -233,6 +233,7 @@ const char * get_mtcNodeCommand_str ( int cmd )
case MTC_CMD_QRY_HOST: return("query host");
case MTC_CMD_START_HOST: return("start host service");
case MTC_CMD_STOP_HOST: return("stop host service");
case MTC_CMD_ACTIVE_CTRL: return("publish active controller");
/* VM Instance Commands */
case MTC_CMD_ADD_INST: return("add instance");

View File

@ -359,6 +359,7 @@ void daemon_exit ( void );
* a power-off to online transition */
#define MTC_MTCALIVE_HITS_TO_GO_ONLINE (5)
#define CONTROLLER_X ((const char *)"controller-x")
#define CONTROLLER_0 ((const char *)"controller-0")
#define CONTROLLER_1 ((const char *)"controller-1")
#define CONTROLLER_2 ((const char *)"controller-2")
@ -526,7 +527,8 @@ typedef struct
#define MTC_CMD_MOD_HOST (0x11110012) /* Modify Host */
#define MTC_CMD_QRY_HOST (0x11110013) /* Query Host */
#define MTC_CMD_START_HOST (0x11110014) /* Start Monitoring Host */
#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Moniroting Host */
#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Monitoring Host */
#define MTC_CMD_ACTIVE_CTRL (0x11110016) /* Active Controller */
#define MTC_CMD_ADD_INST (0x11110020) /* Add Inst */
#define MTC_CMD_DEL_INST (0x11110021) /* Delete Inst */
@ -643,6 +645,9 @@ typedef struct
#define PMOND_FLAG (0x00000001) /**< Process Monitor O.K. Flag */
#define INFRA_FLAG (0x00000002) /**< Infrastructure iface provisioned Flag */
#define CTRLX_MASK (0x00000300) /**< From/To Controller-0/1/2/3 Number */
#define CTRLX_BIT ((unsigned int)8) /**< used to shift right mask into bit 0 */
#define STALL_MON_FLAG (0x00010000) /**< Flag indicating hang monitor running */
#define STALL_REC_FLAG (0x00020000) /**< Flag indicating hbsClient took action */
#define STALL_ERR1_FLAG (0x00100000) /**< Error 1 Flag */
@ -1217,15 +1222,15 @@ string get_availStatus_str ( mtc_nodeAvailStatus_enum availStatus );
string get_operState_str ( mtc_nodeOperState_enum operState );
string get_adminState_str ( mtc_nodeAdminState_enum adminState );
void log_adminAction ( string hostname,
mtc_nodeAdminAction_enum currAction,
void log_adminAction ( string hostname,
mtc_nodeAdminAction_enum currAction,
mtc_nodeAdminAction_enum newAction );
int send_hbs_command ( string hostname, int command );
int send_hbs_command ( string hostname, int command, string controller=CONTROLLER );
int send_hwmon_command ( string hostname, int command );
int send_guest_command ( string hostname, int command );
int daemon_log_message ( const char * hostname,
int daemon_log_message ( const char * hostname,
const char * filename,
const char * log_str );

View File

@ -48,6 +48,7 @@
#define MTC_MINS_20 (1200)
#define MTC_MINS_30 (1800)
#define MTC_MINS_40 (2400)
#define MTC_HRS_1 (3600)
#define MTC_HRS_4 (14400)
#define MTC_HRS_8 (28800) /* old token refresh rate */

View File

@ -269,7 +269,7 @@ void daemon_dump_cfg ( void )
{
daemon_config_type * ptr = daemon_get_cfg_ptr();
ilog ("Configuration Settings\n------------------------------\n");
ilog ("Configuration Settings ...\n");
if ( ptr->scheduling_priority ) { ilog ("scheduling_priority = %d\n", ptr->scheduling_priority ); }
if ( ptr->infra_degrade_only ) { ilog ("infra_degrade_only = %s\n", ptr->infra_degrade_only ? "Yes" : "No" );}
@ -277,7 +277,6 @@ void daemon_dump_cfg ( void )
if ( ptr->active ) { ilog ("active = %s\n", ptr->active ? "Yes" : "No" );}
/* hbsAgent */
if ( ptr->hbs_pulse_period ) { ilog ("hbs_pulse_period = %d\n", ptr->hbs_pulse_period );}
if ( ptr->token_refresh_rate ) { ilog ("token_refresh_rate = %d\n", ptr->token_refresh_rate );}
if ( ptr->hbs_minor_threshold ) { ilog ("hbs_minor_threshold = %d\n", ptr->hbs_minor_threshold );}
if ( ptr->hbs_degrade_threshold ) { ilog ("hbs_degrade_threshold = %d\n", ptr->hbs_degrade_threshold );}

View File

@ -78,6 +78,7 @@ void print_help ( void )
printf ("\t-l --log - Log to file ; /var/log/<daemon>.log\n");
printf ("\t-p --passive - Passive mode ; do not act on failures\n");
printf ("\t-v --verbose - Show command line arguments\n");
printf ("\t-V --Virtual - Running in virtual environment\n");
printf ("\t-t --test - Run Test Head\n");
printf ("\t-g --gap - Gap in seconds\n");
printf ("\t-m --mode - Word string representing a run mode\n");
@ -106,6 +107,9 @@ int daemon_get_run_option ( const char * option )
}
return (1);
}
else if ( !strcmp ( option, "Virtual" ) )
return opts.Virtual ;
else if ( !strcmp ( option, "front" ) )
return opts.front ;
@ -118,6 +122,7 @@ void opts_init ( void)
opts.log = false ;
opts.test = false ;
opts.verbose = false ;
opts.Virtual = false ;
opts.active = false ;
opts.front = false ;
opts.front = false ;
@ -152,8 +157,8 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr )
int cmd_arg_count = 1 ; /* command args start at 1 */
/* A string listing of valid short options letters. */
const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvta";
const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvVta";
/* An array listing of valid long options. */
const struct option long_options[] =
{
@ -167,9 +172,10 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr )
{ "username" , 1, NULL, 'u' },
{ "help" , 0, NULL, 'h' },
{ "active" , 0, NULL, 'a' },
{ "foreground", 0, NULL, 'f' },
{ "log" , 0, NULL, 'l' },
{ "foreground", 0, NULL, 'f' },
{ "log" , 0, NULL, 'l' },
{ "verbose" , 0, NULL, 'v' },
{ "Virtual" , 0, NULL, 'V' },
{ "test" , 0, NULL, 't' },
{ NULL , 0, NULL, 0 } /* Required at end of array. */
};
@ -254,19 +260,25 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr )
case 't': /* -t or --test */
{
opts_ptr->test = true ;
cmd_arg_count++ ;
cmd_arg_count++ ;
break;
}
case 'v': /* -t or --verbose */
case 'v': /* -v or --verbose */
{
opts_ptr->verbose = true ;
cmd_arg_count++ ;
cmd_arg_count++ ;
break;
}
case 'V': /* -V or --Virtual */
{
opts_ptr->Virtual = true ;
cmd_arg_count++ ;
break;
}
case 'a': /* -a or --active */
{
opts_ptr->active = true ;
cmd_arg_count++ ;
cmd_arg_count++ ;
break;
}
case '?':

View File

@ -33,6 +33,7 @@ typedef struct
int test ; /**< Enable test mode */
int info ; /**< Dump data module info */
int verbose ; /**< Dump command line options */
int Virtual ; /**< Set to non-zero when in virtual env */
int active ; /**< Set daemon active */
int debug ; /**< Set tracing debug mode "debug","test","info","trace" */
int front ; /**< run in the foreground ; do not daemonize */
@ -43,7 +44,7 @@ typedef struct
string username ;
string command ;
string password ;
} opts_type ;
} opts_type ;
opts_type * daemon_get_opts_ptr ( void );

View File

@ -1,3 +1,3 @@
SRC_DIR="$PKG_BASE/src"
COPY_LIST="$SRC_DIR/*"
TIS_PATCH_VER=6
TIS_PATCH_VER=7

View File

@ -34,6 +34,7 @@ make install buildroot=%{buildroot} _sysconfdir=%{_sysconfdir} _unitdir=%{_unitd
if [ $1 -eq 1 ] ; then
/bin/systemctl enable lighttpd.service
/bin/systemctl enable qemu_clean.service
/bin/systemctl enable hbsAgent.service
fi
exit 0
@ -41,6 +42,9 @@ exit 0
%defattr(-,root,root,-)
%{_sysconfdir}/init.d/goenabledControl
%license %{_datarootdir}/licenses/mtce-control-1.0/LICENSE
%{_sysconfdir}/pmon.d/hbsAgent.conf
%{_sysconfdir}/init.d/hbsAgent
%{_unitdir}/hbsAgent.service
%clean
rm -rf $RPM_BUILD_ROOT

View File

@ -1,19 +1,32 @@
SOURCE1 = goenabled
SOURCE2 = LICENSE
SOURCE1 = LICENSE
SOURCE2 = goenabled
SOURCE3 = hbsAgent
SOURCE4 = hbsAgent.conf
SOURCE5 = hbsAgent.service
local_etc_pmond = $(_sysconfdir)/pmond.d
local_etc_pmond = $(_sysconfdir)/pmon.d
local_etc_goenabledd = $(_sysconfdir)/goenabled.d
.PHONY: default
install:
# Controller-Only Init Scripts
install -m 755 -p -D scripts/$(SOURCE1) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl
# Controller-Only Process Monitor Config files
install -m 755 -d $(buildroot)/$(local_etc_pmond)
# Controller-Only Go Enabled Test
install -m 755 -d $(buildroot)/$(local_etc_goenabledd)
# for license
install -m 755 -d $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0
install -p -D -m 600 $(SOURCE2) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE
install -m 600 -p -D $(SOURCE1) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE
# Controller-Only Init Scripts
install -m 755 -d $(buildroot)/$(_sysconfdir)/init.d
install -m 755 -p -D scripts/$(SOURCE2) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl
install -m 755 -p -D scripts/$(SOURCE3) $(buildroot)/$(_sysconfdir)/init.d/hbsAgent
# Controller-Only Process Monitor Config files
install -m 755 -d $(buildroot)/$(local_etc_pmond)
install -m 644 -p -D scripts/$(SOURCE4) $(buildroot)/$(local_etc_pmond)/hbsAgent.conf
# Controller-Only Heartbeat Service file
install -m 644 -p -D scripts/$(SOURCE5) $(buildroot)/$(_unitdir)/hbsAgent.service
# Controller-Only Go Enabled Test
install -m 755 -d $(buildroot)/$(local_etc_goenabledd)

View File

@ -0,0 +1,117 @@
#! /bin/sh
#
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# chkconfig: 2345 95 95
#
### BEGIN INIT INFO
# Provides: hbsAgent
# Default-Start: 3 5
# Default-Stop: 0 1 2 6
# Short-Description: Heartbeat Agent Daemon
### END INIT INFO

# LSB style init script for the Heartbeat Agent daemon.
#
# Usage: hbsAgent { start | stop | status | restart | condrestart }
#
# start       : launch the daemon in the background ; adds the -V option
#               when virt-what reports a virtualbox or kvm environment.
# stop        : kill the daemon and remove its pidfile.
# status      : report whether the daemon process is running.
# restart     : stop followed by start.
# condrestart : same as restart.
#
# The script exits with RETVAL ; see the LSB error codes below.

. /etc/init.d/functions

DAEMON_NAME="hbsAgent"
DAEMON="/usr/local/bin/${DAEMON_NAME}"
PIDFILE="/var/run/${DAEMON_NAME}.pid"

# Tool used to detect a virtual environment
VIRT_TOOL='virt-what'

# controller-1:~$ sudo virt-what
# virtualbox ... in virtualbox
# kvm ... in qemu

# Linux Standard Base (LSB) Error Codes
RETVAL=0
GENERIC_ERROR=1
INVALID_ARGS=2
UNSUPPORTED_FEATURE=3
NOT_INSTALLED=5
NOT_RUNNING=7

PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin
export PATH

# Nothing can be done without the daemon binary
if [ ! -e "${DAEMON}" ] ; then
    logger "${DAEMON} is missing"
    exit ${NOT_INSTALLED}
fi

case "$1" in
    start)
        logger "Starting ${DAEMON_NAME}"
        echo -n "Starting ${DAEMON_NAME}: "
        if [ -n "`pidof ${DAEMON_NAME}`" ] ; then
            echo -n "is already running "
            RETVAL=0
        else
            # Query the virtualization type only if the tool is installed
            if command -v "${VIRT_TOOL}" >/dev/null 2>&1 ; then
                virtual=`${VIRT_TOOL}`
            else
                virtual=""
            fi

            # Use the POSIX '=' string comparison ; '==' is a bash
            # extension and this script runs under /bin/sh.
            if [ "${virtual}" = "virtualbox" ] || [ "${virtual}" = "kvm" ] ; then
                virt_opt="-V"
            else
                virt_opt=""
            fi

            # virt_opt is intentionally unquoted so that an empty value
            # adds no argument to the daemon command line.
            start-stop-daemon --start -b -x ${DAEMON} -- -l -a ${virt_opt}
            RETVAL=$?
        fi
        if [ ${RETVAL} -eq 0 ] ; then
            pid=`pidof ${DAEMON_NAME}`
            echo "OK"
            logger "${DAEMON} (${pid})"
        else
            echo "FAIL"
            RETVAL=${GENERIC_ERROR}
        fi
        ;;

    stop)
        logger "Stopping ${DAEMON_NAME}"
        echo -n "Stopping ${DAEMON_NAME}: "
        if [ -n "`pidof ${DAEMON_NAME}`" ] ; then
            killproc ${DAEMON_NAME}
        fi
        # Verify the process is really gone before declaring success
        if [ -n "`pidof ${DAEMON_NAME}`" ] ; then
            echo "FAIL"
            RETVAL=${NOT_RUNNING}
        else
            echo "OK"
        fi
        rm -f ${PIDFILE}
        ;;

    restart)
        $0 stop
        $0 start
        ;;

    status)
        pid=`pidof ${DAEMON_NAME}`
        RETVAL=$?
        if [ ${RETVAL} -eq 0 ] ; then
            echo "${DAEMON_NAME} is running"
        else
            echo "${DAEMON_NAME} is NOT running"
            RETVAL=${NOT_RUNNING}
        fi
        ;;

    condrestart)
        $0 restart
        ;;

    *)
        # Unknown or missing action ; report it through the exit code
        echo "usage: $0 { start | stop | status | restart | condrestart }"
        RETVAL=${INVALID_ARGS}
        ;;
esac

exit ${RETVAL}

View File

@ -0,0 +1,25 @@
[process]
process = hbsAgent
service = hbsAgent
pidfile = /var/run/hbsAgent.pid
style = lsb ; ocf or lsb
severity = major ; minor, major, critical
restarts = 1 ; restart retries before error assertion
interval = 10 ; number of seconds to wait between restarts
debounce = 10 ; number of seconds that a process needs to remain
; running before degrade is removed and retry count
; is cleared.
startuptime = 5 ; Seconds to wait after process start before starting the debounce monitor
mode = passive ; Monitoring mode: passive (default) or active
; passive: process death monitoring (default: always)
; active : heartbeat monitoring, i.e. request / response messaging
; ignore : do not monitor or stop monitoring
quorum = 0 ; process is in the host watchdog quorum
; Active Monitoring Options
port = 2201
period = 5 ; monitor period in seconds
timeout = 4 ; Messaging timeout period in seconds, must be shorter than period
threshold = 5 ; Number of back to back heartbeat failures before action

View File

@ -0,0 +1,22 @@
# systemd unit for the Maintenance Heartbeat Agent.
# Start and stop are delegated to the LSB init script.
[Unit]
Description=Titanium Cloud Maintenance Heartbeat Agent
After=network.target syslog.service config.service
Before=pmon.service

[Service]
Type=forking
ExecStart=/etc/rc.d/init.d/hbsAgent start
# The stop action must be passed here ; passing 'start' would leave
# the daemon running when the service is stopped.
ExecStop=/etc/rc.d/init.d/hbsAgent stop
PIDFile=/var/run/hbsAgent.pid
KillMode=process
SendSIGKILL=no

# Process recovery is handled by pmond if it's running.
# Delay 10 seconds to give pmond a chance to recover
# before systemd kicks in to do it as a backup plan.
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target

View File

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=140
TIS_PATCH_VER=142
BUILD_IS_SLOW=5

View File

@ -313,7 +313,6 @@ install -m 755 -d %{buildroot}/usr/lib/ocf
install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d
install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d/platform
install -m 755 -p -D %{_buildsubdir}/scripts/mtcAgent %{buildroot}/usr/lib/ocf/resource.d/platform/mtcAgent
install -m 755 -p -D %{_buildsubdir}/scripts/hbsAgent %{buildroot}/usr/lib/ocf/resource.d/platform/hbsAgent
install -m 755 -p -D %{_buildsubdir}/hwmon/scripts/ocf/hwmon %{buildroot}/usr/lib/ocf/resource.d/platform/hwmon
# config files
@ -482,7 +481,6 @@ install -m 755 -d %{buildroot}/var/run
# SM OCF Start/Stop/Monitor Scripts
%{ocf_resourced}/platform/mtcAgent
%{ocf_resourced}/platform/hbsAgent
# Config files
%config(noreplace)/etc/mtc.ini

View File

@ -47,6 +47,11 @@ int alarm_register_user ( msgClassSock * sock_ptr )
return (rc);
}
/* Unregister the alarm user by clearing the registered socket pointer.
 * While the pointer is NULL, alarm_() logs "alarm socket is NULL" and
 * returns FAIL_NULL_POINTER instead of sending a request.
 * NOTE(review): presumably alarm_register_user() must be called again
 * before alarms can be sent ; confirm against that function. */
void alarm_unregister_user ( void )
{
user_sock_ptr = NULL ;
}
/* Construct an alarm request json string in the following form
{\"mtcalarm\":[{\"alarmid\":\"200.009\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Infrastructure\",\"prefix\":\"service=heartbeat\"}, {\"alarmid\":\"200.005\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Management\",\"prefix\":\"service=heartbeat\"}]}"
@ -73,6 +78,17 @@ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSev
string msg_type ;
string sev ;
if ( user_sock_ptr == NULL )
{
slog ("alarm socket is NULL");
return (FAIL_NULL_POINTER );
}
else if ( ! user_sock_ptr->sock_ok() )
{
elog ("alarm socket is not ok");
return (FAIL_OPERATION);
}
if ( state == FM_ALARM_STATE_MSG )
msg_type = "msg" ;
else if ( state == FM_ALARM_STATE_SET )
@ -127,7 +143,8 @@ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSev
}
else
{
ilog ("%s %s\n", hostname.c_str(), request);
ilog ("%s %s %s %s %s", hostname.c_str(), entity, msg_type.c_str(), sev.c_str(), id);
mlog ("%s %s\n", hostname.c_str(), request);
return ( PASS ) ;
}
daemon_signal_hdlr ();

View File

@ -68,6 +68,7 @@ EFmAlarmSeverityT alarmUtil_getSev_enum ( string severity );
#ifndef __MODULE_PRIVATE__
int alarm_register_user ( msgClassSock * sock_ptr );
void alarm_unregister_user ( void );
/* Public API */
int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSeverityT severity, const char * entity, string prefix );

View File

@ -36,6 +36,7 @@ using namespace std;
#include "mtcAlarm.h"
#include "alarm.h"
#include "hbsAlarm.h"
#include "hbsBase.h"
/** Initialize the supplied command buffer */
void mtcCmd_init ( mtcCmd & cmd )
@ -263,7 +264,8 @@ nodeLinkClass::nodeLinkClass()
/* Make no assumption on the service */
maintenance = false ;
heartbeat = false ;
active = false ;
active = false ; /* run active */
active_controller = false ; /* true if this controller is active */
/* Set some defaults for the heartbeat service */
hbs_ready = false ;
@ -1156,26 +1158,26 @@ void nodeLinkClass::print_node_info ( void )
if (( i == INFRA_IFACE ) && ( infra_network_provisioned == false ))
continue ;
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" ,
syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n");
syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" ,
get_iface_name_str ((iface_enum)i), hosts, hbs_disabled ? "DISABLED" : "Enabled ", hbs_pulse_period );
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n");
for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
{
syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %7x | %d msec\n",
syslog ( LOG_INFO, "| %-12s | %c | %5i | %5i | %5i | %5i | %10x | %8x | %d msec\n",
ptr->hostname.c_str(),
ptr->monitor[i] ? 'Y' : 'n',
ptr->hbs_misses_count[i],
ptr->max_count[i],
ptr->hbs_degrade_count[i],
ptr->hbs_failure_count[i],
ptr->hbs_misses_count[i],
ptr->max_count[i],
ptr->hbs_degrade_count[i],
ptr->hbs_failure_count[i],
ptr->hbs_count[i],
ptr->b2b_pulses_count[i],
hbs_pulse_period );
}
}
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n");
}
}
@ -7778,7 +7780,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
{
/* This default RC allows the caller to filter out unexpected pulse responses */
int rc = ENXIO ;
if ( head == NULL )
{
return -ENODEV ;
@ -7962,6 +7964,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
}
pulses[iface]-- ;
}
else if ( node_ptr )
{
dlog ("%s unexpected pulse response ; %s",
node_ptr->hostname.c_str(),
get_iface_name_str(iface));
}
else
{
slog ("null pointer");
}
return rc ;
}
@ -7972,6 +7984,13 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
* By index does not require a lookup whereas hostname does */
int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags )
{
/* TODO: consider removing this check */
if ( hostname == "localhost" )
{
/* localhost is not a supported hostname and indicates
* an unconfigured host response ; return the ignore response */
return(ENXIO);
}
if ( index )
{
int rc = remPulse_by_index ( hostname, index , iface, true , flags );
@ -7984,16 +8003,6 @@ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index
}
else
{
if ( hostname.compare("localhost") )
{
get_hbs_monitor_state ( hostname , iface ) ;
}
else
{
/* localhost is not a supported hostname and indicates
* an unconfigured host response ; return the ignore response */
return(ENXIO);
}
}
return ( remPulse_by_name ( hostname , iface, true, flags ));
}
@ -8016,7 +8025,6 @@ void nodeLinkClass::clear_pulse_list ( iface_enum iface )
}
}
/** Runs in the hbsAgent to set or clear heartbeat alarms for all supported interfaces */
void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT sev, int iface )
{
@ -8142,7 +8150,6 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
storage_0_responding = false ;
}
/* Don't log single misses unless in debug mode */
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
{
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
@ -8207,7 +8214,10 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
{
if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold )
{
send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface );
if ( this->active_controller )
{
send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface );
}
pulse_ptr->hbs_minor[iface] = true ;
pulse_ptr->hbs_minor_count[iface]++ ;
wlog ("%s %s -> MINOR\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface));
@ -8215,10 +8225,17 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
}
if ( pulse_ptr->b2b_misses_count[iface] == hbs_degrade_threshold )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
{
pulse_ptr->hbs_degrade[iface] = true ;
}
}
else
{
pulse_ptr->hbs_degrade[iface] = true ;
}
@ -8231,11 +8248,17 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
( pulse_ptr->hbs_degrade[iface] == false ))
{
wlog ("%s -> DEGRADED - Auto-Correction\n", pulse_ptr->hostname.c_str());
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
{
pulse_ptr->hbs_degrade[iface] = true ;
}
}
else
{
pulse_ptr->hbs_degrade[iface] = true ;
}
@ -8250,11 +8273,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
/* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
this->print_node_info ();
hbs_cluster_log ( this->my_hostname, "event", true );
}
}
@ -8268,35 +8296,46 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
/* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
this->print_node_info ();
hbs_cluster_log ( this->my_hostname, "event", true );
}
}
else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) &&
( pulse_ptr->hbs_failure[iface] == false ))
( pulse_ptr->hbs_failure[iface] == false ))
{
elog ("%s %s -> FAILED\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
{
pulse_ptr->hbs_failure[iface] = true ;
}
}
else
{
pulse_ptr->hbs_failure[iface] = true ;
this->print_node_info ();
hbs_cluster_log ( this->my_hostname, "event", true );
}
pulse_ptr->hbs_failure_count[iface]++ ;
}
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
}
if ( remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS ))
{
elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(),

View File

@ -1266,6 +1266,10 @@ public:
bool maintenance ;
bool heartbeat ;
/* Set to true if this controller is active.
* Currently only used by heartbeat service. */
bool active_controller ;
/* offline_handler tuning controls */
int offline_threshold ; /* number of back to back mtcAlive misses before offline */
int offline_period ; /* offline handler mtcAlive request period */

File diff suppressed because it is too large Load Diff

View File

@ -47,6 +47,9 @@
/** Maximum service fail count before action */
#define MAX_FAIL_COUNT (1)
/** Audit Rate/Count */
#define AUDIT_RATE (9)
/** Heartbeat pulse request/response message header byte size */
#define HBS_HEADER_SIZE (15)
@ -60,13 +63,16 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"};
#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME)
#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info
#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info
/* Heartbeat control structure.
 * get_hbs_ctrl_ptr() returns a pointer to a structure of this type.
 * NOTE(review): how each member is used is not visible in this part of
 * the file ; the per-member notes below are assumptions to be confirmed. */
typedef struct
{
unsigned int controller ; /* presumably the controller number - TODO confirm */
unsigned int audit ; /* presumably an audit counter (see AUDIT_RATE) - TODO confirm */
unsigned int nodetype ; /* presumably the node type mask - TODO confirm */
bool clear_alarms ; /* presumably a request to clear alarms - TODO confirm */
bool locked ; /* presumably the locked state - TODO confirm */
} hbs_ctrl_type ;
hbs_ctrl_type * get_hbs_ctrl_ptr ( void );
@ -218,22 +224,17 @@ void hbs_utils_init ( void );
/* network enum to name lookup */
string hbs_cluster_network_name ( mtce_hbs_network_enum network );
/* Produce formatted clog's that characterize current and changing cluster
* history for a given network. Each log is controller/network specific. */
void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix );
/* Initialize the specified history array */
void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history );
/* Clear all history in the cluster vault */
void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster );
/******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/
/* Set the cluster vault to default state.
* Called upon daemon init or heartbeat period change. */
void hbs_cluster_init ( unsigned short period );
void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr );
/* Calculate number of bytes that is unused in the cluster data structure.
* Primarily to know how many history elements are missing. */
@ -286,7 +287,9 @@ void hbs_cluster_append ( hbs_message_type & msg );
/* Produce formatted clog's that characterize current and changing cluster
* history for a given network. Each log is controller/network specific. */
void hbs_cluster_log ( string & hostname, string prefix );
void hbs_cluster_log ( string & hostname, string prefix, bool force=false );
void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix, bool force=false );
/* Service SM cluster info request */
void hbs_sm_handler ( void );
@ -294,8 +297,14 @@ void hbs_sm_handler ( void );
/* send the cluster vault to SM */
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid );
/* copy cluster data from src to dst */
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
/* print the contents of the vault */
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force );
/* Heartbeat service state audit */
void hbs_state_audit ( void );
/**
* @} hbs_base

View File

@ -66,6 +66,8 @@ extern "C"
#include "amon.h" /* for ... active monitoring utilities */
}
#define MAX_LEN (300)
/* Where to send events */
string mtcAgent_ip = "" ;
@ -96,12 +98,17 @@ typedef struct
static char pulse_resp_tx_hdr [HBS_MAX_MSG];
static char my_hostname [MAX_HOST_NAME_SIZE+1];
static string hostname = "" ;
static char my_hostname_length ;
static string my_macaddr = "" ;
static string my_address = "" ;
static unsigned int my_nodetype= CGTS_NODE_NULL ;
static stallMon_type stallMon ;
/* Cached Cluster view from controllers */
mtce_hbs_cluster_type controller_cluster_cache[MTCE_HBS_MAX_CONTROLLERS];
void daemon_sigchld_hdlr ( void )
{
; /* dlog("Received SIGCHLD ... no action\n"); */
@ -407,16 +414,17 @@ int daemon_configure ( void )
else
{
ilog("Realtime Pri: FIFO/%i \n", hbs_config.scheduling_priority );
ilog("Multicast: %s\n", hbs_config.multicast );
ilog("Multicast : %s\n", hbs_config.multicast );
hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface );
ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface );
ilog("Mgmnt RxPort: %d\n", hbs_config.hbs_client_mgmnt_port );
ilog("Mgmnt TxPort: %d\n", hbs_config.hbs_agent_mgmnt_port );
ilog("Mgmnt Name : %s\n", hbs_config.mgmnt_iface );
ilog("Mgmnt Port : %d (rx)", hbs_config.hbs_client_mgmnt_port );
ilog("Mgmnt Port : %d (tx)", hbs_config.hbs_agent_mgmnt_port );
get_iface_macaddr ( hbs_config.mgmnt_iface, my_macaddr );
get_iface_address ( hbs_config.mgmnt_iface, my_address, true );
get_hostname ( &my_hostname[0], MAX_HOST_NAME_SIZE );
hostname = my_hostname ;
/* Fetch the infrastructure interface name.
* calls daemon_get_iface_master inside so the
@ -427,11 +435,14 @@ int daemon_configure ( void )
if (strcmp(hbs_config.infra_iface, hbs_config.mgmnt_iface))
{
infra_network_provisioned = true ;
ilog ("Infra iface : %s\n", hbs_config.infra_iface );
ilog ("Infra Name : %s\n", hbs_config.infra_iface );
}
}
ilog("Infra RxPort: %d\n", hbs_config.hbs_client_infra_port );
ilog("Infra TxPort: %d\n", hbs_config.hbs_agent_infra_port );
if ( infra_network_provisioned == true )
{
ilog("Infra Port : %d (rx)", hbs_config.hbs_client_infra_port );
ilog("Infra Port : %d (tx)", hbs_config.hbs_agent_infra_port );
}
/* initialize the stall detection monitor */
stallMon_init ();
@ -663,7 +674,37 @@ int get_pmon_pulses ( void )
return (pulses);
}
static unsigned int my_rri = 0 ;
/*************************************************************
 *
 * Name       : have_other_controller_history
 *
 * Description: Scan the controller cluster cache and report
 *              whether any slot other than the one for the
 *              supplied controller number holds history.
 *
 * Returns    : true if another controller's cache entry has
 *              a non-zero 'histories' count ; false otherwise,
 *              including when the supplied controller number
 *              is out of range.
 *
 *************************************************************/
bool have_other_controller_history ( unsigned short controller )
{
    /* an out-of-range controller number never reports peer history */
    if ( controller >= MTCE_HBS_MAX_CONTROLLERS )
        return false ;

    int peer = 0 ;
    while ( peer < MTCE_HBS_MAX_CONTROLLERS )
    {
        /* ignore the caller's own slot ; any other populated slot counts */
        if (( peer != controller ) &&
            ( controller_cluster_cache[peer].histories ))
        {
            return true ;
        }
        ++peer ;
    }
    return false ;
}
static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ;
/*************************************************************
*
@ -766,12 +807,13 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
if ( cfg_ptr->debug_msg )
{
mlog ("\n");
mlog ("%s Pulse Req: %s:%5d: %d:%s RRI:%d\n",
mlog (" ");
mlog ("%s Pulse Req: %s:%d s:%d f:%x [%s] RRI:%d\n",
get_iface_name_str(iface),
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].f,
hbs_sock.rx_mesg[iface].m,
hbs_sock.rx_mesg[iface].c);
}
@ -787,19 +829,9 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
return (FAIL_MSG_HEADER) ;
}
/* Manage the Resource Reference Index (RRI) "lookup clue" */
if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME ))
{
if( my_rri!= hbs_sock.rx_mesg[iface].c )
{
my_rri = hbs_sock.rx_mesg[iface].c ;
ilog ("%s Caching New RRI: %d\n", &my_hostname[0], my_rri );
}
}
/* Add my RRI to the response message */
hbs_sock.rx_mesg[iface].c = my_rri ;
/* Update local copy for the controller this pulse came from */
/* ... before the flags are cleared and setup for the reply. */
unsigned int controller = (hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT ;
/* Manage OOB flags */
hbs_sock.rx_mesg[iface].f = flags ;
@ -807,23 +839,102 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
{
hbs_sock.rx_mesg[iface].f |= ( PMOND_FLAG ) ;
}
if ( infra_network_provisioned == true )
{
hbs_sock.rx_mesg[iface].f |= INFRA_FLAG ;
}
#define WANT_CLUSTER_INFO_LOG
#ifdef WANT_CLUSTER_INFO_LOG
/* Log the received cluster info */
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
/*************************************************************************
***** C L U S T E R D A T A M A N A G E M E N T ******
* *
* TODO: Add support for 3 controllers.
* Only 2 supported by some of this code.
***** ******/
if ( controller >= MTCE_HBS_MAX_CONTROLLERS )
{
char str[100] ;
// hbs_cluster_log (hbs_sock.rx_mesg[iface].cluster, hbs_sock.rx_mesg[iface].s );
snprintf ( &str[0], 100, " seq %6d with %d bytes from %s ", hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface));
string hostname = my_hostname ;
hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str );
wlog ("invalid controller number: %d ; dropping message", controller );
return ( FAIL_INVALID_DATA );
}
#endif
/* Manage the Resource Reference Index (RRI) "lookup clue"
* With the introduction of active-active heartbeating the hbsClient
* is responsible for servicing pulses from both controllers.
* This means that hbsClient needs to manage an rri for each controller. */
if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME ))
{
if( rri[controller] != hbs_sock.rx_mesg[iface].c )
{
rri[controller] = hbs_sock.rx_mesg[iface].c ;
ilog ("Caching New RRI: %d (from controller-%d)\n", rri[controller], controller );
}
}
/* Log the received cluster info
* ... if the message version shows that it is supported */
if ( hbs_sock.rx_mesg[iface].v )
{
char str[MAX_LEN] ;
snprintf ( &str[0], MAX_LEN, " seq %6d with %d bytes from %s ", (int)hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface));
hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str );
/* add the controller back in */
hbs_sock.rx_mesg[iface].f |= ( controller << CTRLX_BIT );
/* Add my RRI to the response message */
hbs_sock.rx_mesg[iface].c = rri[controller] ;
if ( hbs_sock.rx_mesg[iface].cluster.histories > MTCE_HBS_MAX_NETWORKS )
{
slog ("controller-%d provided %d network histories ; max is %d per controller",
controller,
hbs_sock.rx_mesg[iface].cluster.histories,
MTCE_HBS_MAX_NETWORKS );
}
else if ( hbs_sock.rx_mesg[iface].cluster.bytes != ( BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories)))
{
slog ("controller-%d provided %d bytes of history ; expected %d",
controller,
hbs_sock.rx_mesg[iface].cluster.bytes,
(unsigned short)(BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories)));
}
else if ( hbs_sock.rx_mesg[iface].cluster.histories )
{
hbs_cluster_copy ( hbs_sock.rx_mesg[iface].cluster,
controller_cluster_cache[controller] );
clog1 ("controller-%d cluster info from %s pulse request saved to cache",
controller, get_iface_name_str(iface));
hbs_sock.rx_mesg[iface].cluster.histories = 0 ;
if ( have_other_controller_history ( controller ) == true )
{
/* Now copy the other controller's cached cluster info into
* this controller's response */
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
hbs_sock.rx_mesg[iface].cluster );
if ( daemon_get_cfg_ptr()->debug_state & 4 )
{
string dump_banner = "" ;
dump_banner.append("controller-") ;
dump_banner.append(itos(controller?0:1));
dump_banner.append(" cluster info from cache injected into controller-");
dump_banner.append(itos(controller));
dump_banner.append(":");
dump_banner.append(get_iface_name_str(iface));
dump_banner.append(" pulse response");
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner, true );
}
}
}
}
/* Cluster Data management end */
/* replace the request header with the response header */
memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG );
#ifdef WANT_PULSE_RESPONSE_FIT
if (( iface == INFRA_IFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_INFRA_RSP )))
@ -839,29 +950,11 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
}
#endif
int rc = PASS ;
/* replace the request header with the response header */
memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG );
/* Deal with the cluster info if it exists.
* ... Introduced in messaging version 1 */
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
{
if ( hbs_sock.rx_mesg[iface].cluster.version < MTCE_HBS_CLUSTER_VERSION )
{
ilog ("Bad cluster verison (%d)", hbs_sock.rx_mesg[iface].cluster.version);
}
// if ( hbs_sock.rx_mesg[iface].cluster.revision != MTCE_HBS_CLUSTER_REVISION )
// {
// ilog ("Bad cluster revision (%d)", hbs_sock.rx_mesg[iface].cluster.revision);
// }
/* Add peer controller cluster data to this controller's response */
// hbs_cluster_loop(hbs_sock.rx_mesg[iface]);
}
/* reuse the rx_bytes variable */
rx_bytes = sizeof(hbs_message_type)-sizeof(mtce_hbs_cluster_type)+BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories);
/* send pulse response message */
int rc = PASS ;
int tx_bytes = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.rx_mesg[iface], rx_bytes);
if ( tx_bytes == -1 )
{
@ -884,15 +977,15 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
}
else
{
mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d:%d:%d)\n",
get_iface_name_str(iface),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].f,
hbs_sock.rx_mesg[iface].m,
hbs_sock.rx_mesg[iface].c,
pmonPulse_counter, rx_bytes, tx_bytes);
mlog ("%s Pulse Rsp: %s:%d: s:%d f:%x [%s] RRI:%d (%x:%d:%d)\n",
get_iface_name_str(iface),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].f,
hbs_sock.rx_mesg[iface].m,
hbs_sock.rx_mesg[iface].c,
pmonPulse_counter, rx_bytes, tx_bytes);
}
/* Clear the error count since we got a good receive */
@ -984,6 +1077,10 @@ int daemon_init ( string iface, string nodeType_str )
/* Initialize socket construct and pointer to it */
memset ( &hbs_sock, 0, sizeof(hbs_sock));
/* Initialize the controller cluster view data bounce structure */
for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ )
memset ( &controller_cluster_cache[c], 0, sizeof(mtce_hbs_cluster_type)) ;
/* init the utility module */
hbs_utils_init ();
@ -1007,6 +1104,11 @@ int daemon_init ( string iface, string nodeType_str )
/* convert node type to integer */
my_nodetype = get_host_function_mask ( nodeType_str ) ;
if ( my_nodetype & CONTROLLER_TYPE )
{
/* This is a controller but we don't yet know which one. */
set_hn((char*)CONTROLLER_X);
}
ilog ("Node Type : %s (%d)\n", nodeType_str.c_str(), my_nodetype );
/* Bind signal handlers */
@ -1058,7 +1160,6 @@ int daemon_init ( string iface, string nodeType_str )
int stall_threshold_log = 0 ;
int stall_times_threshold_log = 0 ;
#define MAX_LEN 300
void daemon_service_run ( void )
{
#ifdef WANT_DAEMON_DEBUG
@ -1205,7 +1306,7 @@ void daemon_service_run ( void )
int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type));
if ( bytes )
{
hbs_cluster_dump (msg);
hbs_cluster_dump (msg, "Cluster info received", true );
}
}
#endif

View File

@ -64,11 +64,19 @@ typedef struct
/* The working heartbeat cluster data vault. */
mtce_hbs_cluster_type cluster ;
bool cluster_change ;
int cluster_change_threshold_count ;
int cluster_change_difference_count ;
msgClassSock * sm_socket_ptr ;
} hbs_cluster_ctrl_type ;
/* Cluster control structure construct allocation. */
static hbs_cluster_ctrl_type ctrl ;
#define STORAGE_0_NR_THRESHOLD (4)
#define CLUSTER_CHANGE_THRESHOLD (50000)
/****************************************************************************
*
@ -80,7 +88,7 @@ static hbs_cluster_ctrl_type ctrl ;
*
***************************************************************************/
void hbs_cluster_init ( unsigned short period )
void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
{
ctrl.monitored_hosts = 0;
ctrl.monitored_hostname_list.clear();
@ -104,13 +112,17 @@ void hbs_cluster_init ( unsigned short period )
for ( int h = 0 ; h < MTCE_HBS_MAX_HISTORY_ELEMENTS ; h++ )
hbs_cluster_history_init ( ctrl.cluster.history[h] );
ilog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)",
clog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)",
ctrl.cluster.version,
ctrl.cluster.revision,
ctrl.cluster.magic_number,
ctrl.cluster.bytes,
sizeof(mtce_hbs_cluster_history_type));
if ( sm_socket_ptr )
{
ctrl.sm_socket_ptr = sm_socket_ptr ;
}
ctrl.log_throttle = 0 ;
}
@ -140,7 +152,7 @@ void hbs_cluster_nums ( unsigned short this_controller,
/****************************************************************************
*
* Name : log_monitored_hosts_list
* Name : cluster_list
*
* Description : Log the list of monitored hosts.
* Typically done on a list change.
@ -149,7 +161,7 @@ void hbs_cluster_nums ( unsigned short this_controller,
*
***************************************************************************/
void log_monitored_hosts_list ( void )
void cluster_list ( void )
{
std::list<string>::iterator iter_ptr ;
string list = "" ;
@ -160,9 +172,7 @@ void log_monitored_hosts_list ( void )
list.append (*(iter_ptr));
list.append (" ");
}
ilog ("cluster of %ld: %s",
ctrl.monitored_hostname_list.size(),
list.c_str());
ilog ("cluster: %s", list.c_str());
}
@ -186,6 +196,7 @@ void cluster_storage0_state ( bool enabled )
ctrl.cluster.storage0_enabled = enabled ;
ilog ("storage-0 heartbeat state changed to %s",
enabled ? "enabled" : "disabled" );
ctrl.cluster_change = true ;
}
}
@ -237,13 +248,30 @@ void hbs_manage_controller_state ( string & hostname, bool enabled )
void hbs_cluster_add ( string & hostname )
{
/* Consider using 'unique' after instead of remove before update. */
ctrl.monitored_hostname_list.remove(hostname) ;
ctrl.monitored_hostname_list.push_back(hostname) ;
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
bool already_in_list = false ;
std::list<string>::iterator hostname_ptr ;
for ( hostname_ptr = ctrl.monitored_hostname_list.begin();
hostname_ptr != ctrl.monitored_hostname_list.end() ;
hostname_ptr++ )
{
if ( hostname_ptr->compare(hostname) == 0 )
{
already_in_list = true ;
break ;
}
}
if ( already_in_list == false )
{
ctrl.monitored_hostname_list.push_back(hostname) ;
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
ilog ("%s added to cluster", hostname.c_str());
cluster_list ();
ctrl.cluster_change = true ;
}
/* Manage storage-0 state */
if ( hostname == STORAGE_0 )
if ( hostname.compare(STORAGE_0) == 0 )
{
cluster_storage0_state ( true );
}
@ -251,15 +279,18 @@ void hbs_cluster_add ( string & hostname )
/* If we get down to 0 monitored hosts then just start fresh */
if (( ctrl.monitored_hosts ) == 0 )
{
hbs_cluster_init ( ctrl.cluster.period_msec );
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
}
/* Manage controller state ; true means enabled in this case. */
hbs_manage_controller_state ( hostname, true );
ilog ("%s added to cluster", hostname.c_str());
if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr ))
{
hbs_cluster_send( ctrl.sm_socket_ptr, 0 );
ctrl.cluster_change = false ;
}
log_monitored_hosts_list ();
}
/****************************************************************************
@ -281,27 +312,46 @@ void hbs_cluster_add ( string & hostname )
void hbs_cluster_del ( string & hostname )
{
ctrl.monitored_hostname_list.remove(hostname) ;
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
/* Manage storage-0 state. */
if ( hostname == STORAGE_0 )
std::list<string>::iterator hostname_ptr ;
for ( hostname_ptr = ctrl.monitored_hostname_list.begin();
hostname_ptr != ctrl.monitored_hostname_list.end() ;
hostname_ptr++ )
{
cluster_storage0_state ( false );
if ( hostname_ptr->compare(hostname) == 0 )
{
ctrl.monitored_hostname_list.remove(hostname) ;
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
/* Manage storage-0 state. */
if ( hostname.compare(STORAGE_0) == 0 )
{
cluster_storage0_state ( false );
}
/* If we get down to 0 monitored hosts then just start fresh */
if (( ctrl.monitored_hosts ) == 0 )
{
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
}
/* Manage controller state ; false means not enabled in this case. */
hbs_manage_controller_state ( hostname , false );
ilog ("%s deleted from cluster", hostname.c_str());
cluster_list ();
ctrl.cluster_change = true ;
break ;
}
}
/* If we get down to 0 monitored hosts then just start fresh */
if (( ctrl.monitored_hosts ) == 0 )
if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr ))
{
hbs_cluster_init ( ctrl.cluster.period_msec );
hbs_cluster_send( ctrl.sm_socket_ptr, 0 );
ctrl.cluster_change = false ;
}
/* Manage controller state ; false means not enabled in this case. */
hbs_manage_controller_state ( hostname , false );
ilog ("%s deleted from cluster", hostname.c_str());
log_monitored_hosts_list ();
}
/****************************************************************************
@ -309,7 +359,7 @@ void hbs_cluster_del ( string & hostname )
* Name : hbs_cluster_update
*
* Description : Update this controller's cluster info for the specified
* network with
* network with ...
*
* 1. The number of enabled hosts.
* 2. The number of responding hosts.
@ -333,7 +383,6 @@ void hbs_cluster_del ( string & hostname )
*
***************************************************************************/
#define STORAGE_0_NR_THRESHOLD (4)
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
@ -357,7 +406,7 @@ void hbs_cluster_update ( iface_enum iface,
if ( not_responding_hosts )
{
clog1 ("controller-%d %s enabled:%d not responding:%d",
clog ("controller-%d %s enabled:%d not responding:%d",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
ctrl.monitored_hosts,
@ -365,7 +414,7 @@ void hbs_cluster_update ( iface_enum iface,
}
else
{
clog1 ("controller-%d %s has %d monitored hosts and all are responding",
clog ("controller-%d %s has %d monitored hosts and all are responding",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
ctrl.monitored_hosts);
@ -394,9 +443,11 @@ void hbs_cluster_update ( iface_enum iface,
history_ptr->network = n ;
/* Log new network history as its being started. */
ilog ("controller-%d %s network history add",
ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str());
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
ctrl.cluster.histories);
}
}
@ -457,7 +508,9 @@ void hbs_cluster_update ( iface_enum iface,
* ... which is the index for the next entry.
*/
unsigned short last_entry_index ;
if ( history_ptr->oldest_entry_index == 0 )
unsigned short oldest_entry_index = history_ptr->oldest_entry_index ;
if ( oldest_entry_index == 0 )
{
/* Go to the end of the array. */
last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ;
@ -465,43 +518,88 @@ void hbs_cluster_update ( iface_enum iface,
else
{
/* Otherwise, the previous index in the array */
last_entry_index = history_ptr->oldest_entry_index - 1 ;
last_entry_index = oldest_entry_index - 1 ;
}
/* Update the history with this data. */
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
bool logit = false ;
string logit_reason = "" ;
if (( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled !=
history_ptr->entry[ last_entry_index].hosts_enabled ) ||
( history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding !=
history_ptr->entry[ last_entry_index].hosts_responding))
/* Update the history with this data. */
history_ptr->entry[oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
history_ptr->entry[oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;
if (( history_ptr->entry[oldest_entry_index].hosts_enabled !=
history_ptr->entry[ last_entry_index].hosts_enabled ) ||
( history_ptr->entry[oldest_entry_index].hosts_responding !=
history_ptr->entry[ last_entry_index].hosts_responding))
{
/* Only log on change events. */
if ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled ==
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding )
if ( history_ptr->entry[oldest_entry_index].hosts_enabled ==
history_ptr->entry[oldest_entry_index].hosts_responding )
{
ilog ("controller-%d %s cluster of %d is healthy",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled);
history_ptr->entry[oldest_entry_index].hosts_enabled);
ctrl.cluster_change_threshold_count = 0 ;
ctrl.cluster_change_difference_count = 0 ;
}
else
{
ilog ("controller-%d %s cluster of %d with %d responding",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled,
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding);
ctrl.cluster_change_threshold_count++ ;
ctrl.cluster_change_difference_count =
history_ptr->entry[oldest_entry_index].hosts_enabled -
history_ptr->entry[oldest_entry_index].hosts_responding ;
}
}
if ( daemon_get_cfg_ptr()->debug_state&4 )
{
logit = true ;
logit_reason = "(debug)" ;
}
// else if (( ctrl.cluster_change_threshold_count == 1 ) &&
// ( cluster_change == false ))
// {
// logit = true ;
// logit_reason = "" ;
// }
else if ( ctrl.cluster_change_threshold_count >= CLUSTER_CHANGE_THRESHOLD )
{
logit = true ;
ctrl.cluster_change_threshold_count = 0 ;
logit_reason = "(threshold)" ;
}
else
{
int delta =
history_ptr->entry[oldest_entry_index].hosts_enabled -
history_ptr->entry[oldest_entry_index].hosts_responding ;
if ( delta != ctrl.cluster_change_difference_count )
{
logit = true ;
ctrl.cluster_change_difference_count = delta ;
logit_reason = "(delta)" ;
}
}
if ( logit )
{
ilog ("controller-%d %s cluster of %d with %d responding (%d:%d) %s",
ctrl.this_controller,
hbs_cluster_network_name(n).c_str(),
history_ptr->entry[oldest_entry_index].hosts_enabled,
history_ptr->entry[oldest_entry_index].hosts_responding,
ctrl.cluster_change_difference_count,
not_responding_hosts,
logit_reason.c_str());
}
/* Increment the entries count till it reaches the max. */
if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
history_ptr->entries++ ;
/* Manage the next entry update index ; aka the oldest index. */
if ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
if ( oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
history_ptr->oldest_entry_index = 0 ;
else
history_ptr->oldest_entry_index++ ;
@ -521,24 +619,31 @@ void hbs_cluster_update ( iface_enum iface,
void hbs_cluster_append ( hbs_message_type & msg )
{
unsigned short c = ctrl.this_controller ;
CHECK_CTRL_NTWK_PARMS(c, ctrl.monitored_networks);
CHECK_CTRL_NTWK_PARMS(ctrl.this_controller, ctrl.monitored_networks);
msg.cluster.version = ctrl.cluster.version ;
msg.cluster.revision = ctrl.cluster.revision ;
msg.cluster.magic_number = ctrl.cluster.magic_number ;
msg.cluster.period_msec = ctrl.cluster.period_msec ;
msg.cluster.storage0_enabled = ctrl.cluster.storage0_enabled ;
msg.cluster.histories = ctrl.cluster.histories ;
msg.cluster.histories = 0 ;
int bytes = BYTES_IN_CLUSTER_VAULT(ctrl.monitored_networks);
/* Copy this controller's cluster history into the broadcast request. */
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
if ( ctrl.cluster.history[h].controller == ctrl.this_controller )
{
memcpy( &msg.cluster.history[msg.cluster.histories],
&ctrl.cluster.history[h],
sizeof(mtce_hbs_cluster_history_type));
clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
c, ctrl.monitored_networks, ctrl.cluster.histories, bytes );
msg.cluster.histories++ ;
}
}
msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories);
/* Copy the cluster into the message. */
memcpy( &msg.cluster.history[0], &ctrl.cluster.history[c], bytes);
clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
}
/****************************************************************************
@ -574,57 +679,8 @@ unsigned short hbs_cluster_unused_bytes ( void )
*
***************************************************************************/
/* NOTE: All code wrapped in this directive will be removed once
* active/active heartbeating is delivered in next update */
#define WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
{
#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
/* To assist SM with duplex integration ...
*
* This code emulates heartbeat redundancy by duplicating
* controller history up to the number of provisioned
* controllers until active-active heartbeat is delivered.
*/
int peer_controller ;
bool copy_cluster = false ;
if ( ctrl.this_controller == 0 )
{
peer_controller = 1 ;
if ( ctrl.controller_1_enabled )
{
copy_cluster = true ;
}
}
else
{
peer_controller = 0 ;
if ( ctrl.controller_0_enabled )
{
copy_cluster = true ;
}
}
int n, networks = ctrl.cluster.histories ;
if ( copy_cluster )
{
for ( n = 0 ; n < networks ; n++ )
{
/* copy this controller history to create peer controller */
ctrl.cluster.history[ctrl.cluster.histories] = ctrl.cluster.history[n] ;
/* update the controller */
ctrl.cluster.history[ctrl.cluster.histories].controller = peer_controller ;
ctrl.cluster.bytes += sizeof(mtce_hbs_cluster_history_type) ;
ctrl.cluster.histories++ ;
}
}
#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
ctrl.cluster.reqid = (unsigned short)reqid ;
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
{
@ -637,34 +693,82 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
}
else
{
ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
hbs_cluster_dump ( ctrl.cluster );
string reason = "" ;
// ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
if ( reqid )
reason = "cluster query" ;
else
reason = "cluster event" ;
hbs_cluster_dump ( ctrl.cluster, reason, true );
}
}
#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
if ( copy_cluster )
else
{
/* Clear out the other controllers data. */
for ( n = networks ; n > 0 ; n-- )
wlog ("cannot send cluster info due to socket error");
}
}
/****************************************************************************
*
* Name : hbs_history_save
*
* Description : Copy the history sample to the vault.
*
* Returns : Nothing.
*
***************************************************************************/
void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample )
{
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
if (( ctrl.cluster.history[h].controller == sample.controller ) &&
( ctrl.cluster.history[h].network == sample.network ))
{
/* copy c0 history to another controller */
hbs_cluster_history_init(ctrl.cluster.history[ctrl.cluster.histories-1]);
ctrl.cluster.bytes -= sizeof(mtce_hbs_cluster_history_type);
ctrl.cluster.histories-- ;
memcpy( &ctrl.cluster.history[h], &sample,
sizeof(mtce_hbs_cluster_history_type));
clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)",
ctrl.this_controller,
sample.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
hostname.c_str(),
ctrl.cluster.histories);
return ;
}
}
#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
/* not found ? Add a new one */
memcpy( &ctrl.cluster.history[ctrl.cluster.histories], &sample,
sizeof(mtce_hbs_cluster_history_type));
ctrl.cluster.histories++ ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views",
ctrl.this_controller,
sample.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
ctrl.cluster.histories);
}
/* Heartbeat service state audit.
 * Dumps the working cluster vault with the "Audit" log prefix,
 * passing true for the dump's 'force' argument. */
void hbs_state_audit ( void )
{
    const bool force_dump = true ;
    hbs_cluster_dump ( ctrl.cluster, "Audit", force_dump );
}
/* Convenience overload: logs ctrl.cluster by forwarding 'hostname' and
 * 'prefix' to the hbs_cluster_log overload that takes an explicit cluster. */
void hbs_cluster_log ( string & hostname, string prefix )
{
hbs_cluster_log ( hostname, ctrl.cluster, prefix );
}
/* Convenience overload: logs ctrl.cluster by forwarding 'hostname',
 * 'log_prefix' and 'force' unchanged to the hbs_cluster_log overload that
 * takes an explicit cluster reference. */
void hbs_cluster_log ( string & hostname,
string log_prefix,
bool force )
{
hbs_cluster_log (hostname, ctrl.cluster, log_prefix, force );
}
/****************************************************************************
*
* Active Active Heartbeating and Debug Member Functions
@ -724,10 +828,6 @@ int hbs_cluster_cmp( hbs_message_type & msg )
* Description : Copies the other controller's information from msg into
* the cluster.
*
* NOTE: Does not do that right now.
*
* Assumptions : Place holder until active/active heartbeating is implemented.
*
* Returns : PASS or FAIL
*
***************************************************************************/
@ -736,12 +836,29 @@ int hbs_cluster_save ( string & hostname,
mtce_hbs_network_enum network,
hbs_message_type & msg )
{
// clog ("Add cluster info from peer controller");
if ( ctrl.monitored_hosts )
/* cluster info is only supported in HBS_MESSAGE_VERSION 1 */
if ( msg.v < HBS_MESSAGE_VERSION )
return FAIL_NOT_SUPPORTED ;
if ( ! ctrl.monitored_hosts )
return RETRY ;
if ( msg.cluster.histories == 0 )
return PASS ;
for ( int h = 0 ; h < msg.cluster.histories ; h++ )
{
/* compare cluster info and log deltas */
// hbs_cluster_cmp( msg );
UNUSED(msg);
if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS )
{
elog ("Invalid network id (%d:%d:%d)",
h,
msg.cluster.history[h].controller,
msg.cluster.history[h].network );
}
else if ( msg.cluster.history[h].controller != ctrl.this_controller )
{
hbs_history_save ( hostname, msg.cluster.history[h] );
}
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
}
return (PASS);

View File

@ -241,10 +241,11 @@ int mtcSmgrApi_active_services ( string hostname , bool * yes_no_ptr )
return(PASS);
}
int send_hbs_command ( string hostname, int command )
int send_hbs_command ( string hostname, int command, string controller )
{
UNUSED(hostname);
UNUSED(command);
UNUSED(controller);
return(PASS);
}

View File

@ -111,6 +111,33 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network )
}
}
/****************************************************************************
*
* Name : hbs_cluster_copy
*
* Description : Copies cluster from src to dst.
*
* Returns : Nothing.
*
***************************************************************************/
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst )
{
    /* Scalar header fields are transferred one by one. */
    dst.storage0_enabled = src.storage0_enabled ;
    dst.period_msec      = src.period_msec ;
    dst.magic_number     = src.magic_number ;
    dst.revision         = src.revision ;
    dst.version          = src.version ;

    /* The history count drives both the record copy and the size below. */
    dst.histories = src.histories ;

    /* Copy each history record, raw, from src into the same slot of dst. */
    int slot = 0 ;
    while ( slot < dst.histories )
    {
        memcpy( &dst.history[slot],
                &src.history[slot],
                sizeof(mtce_hbs_cluster_history_type));
        slot++ ;
    }

    /* Recompute the byte size from the number of histories now held. */
    dst.bytes = BYTES_IN_CLUSTER_VAULT(dst.histories);
}
/****************************************************************************
*
@ -126,11 +153,9 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network )
void hbs_cluster_log ( string & hostname,
mtce_hbs_cluster_type & cluster,
string log_prefix )
string log_prefix,
bool force )
{
// bool want_log = false ;
clog1 ("log %d histories", cluster.histories );
for ( int h = 0 ; h < cluster.histories ; h++ )
{
if ( cluster.history[h].entries == MTCE_HBS_HISTORY_ENTRIES )
@ -140,8 +165,6 @@ void hbs_cluster_log ( string & hostname,
mtce_hbs_cluster_entry_type e = { 0, 0 } ;
char str[MAX_CLUSTER_LINE_LEN] ;
string line = "";
int start = 0 ;
int stop = 0 ;
bool newline = false ;
bool logit = false ;
bool first = false ;
@ -149,18 +172,13 @@ void hbs_cluster_log ( string & hostname,
mtce_hbs_cluster_history_type * history_ptr = &cluster.history[h] ;
clog1 ("%s %s has %d entries (controller-%d view from %s)", hostname.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
history_ptr->entries,
history_ptr->controller,
log_prefix.c_str());
/* Manage local this_index for log display.
* Display oldest to newest ; left to right
*
* */
int this_index = history_ptr->oldest_entry_index ;
int debug = daemon_get_cfg_ptr()->debug_state ;
for ( int count = 0 ; count < history_ptr->entries ; count++ )
{
if (( line.length() + MAX_ENTRY_STR_LEN ) >=
@ -180,13 +198,11 @@ void hbs_cluster_log ( string & hostname,
}
#endif
// want_log = true ;
if ( count == 0 )
{
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
history_ptr->entry[this_index].hosts_enabled,
history_ptr->entry[this_index].hosts_responding ); // , this_index );
history_ptr->entry[this_index].hosts_responding );
line.append (str);
str[0] = '\0' ;
}
@ -203,7 +219,7 @@ void hbs_cluster_log ( string & hostname,
{
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
history_ptr->entry[this_index].hosts_enabled,
history_ptr->entry[this_index].hosts_responding ); // , this_index );
history_ptr->entry[this_index].hosts_responding );
line.append (str);
str[0] = '\0' ;
logit = true ;
@ -214,31 +230,21 @@ void hbs_cluster_log ( string & hostname,
first_log[h] = true ;
logit = true ;
}
stop++ ;
if ( newline == true )
{
if ( logit )
{
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
if ( hostname == controller )
if (( force ) || ( debug&2 ))
{
clog ("%s view %s %s %02d..%02d: %s,",
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
else
{
clog ("%s view from %s %s %s %02d..%02d: %s,",
controller.c_str(),
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
syslog ( LOG_INFO, "%s view from %s %s %s: %s",
controller.c_str(),
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
line.c_str());
}
}
start = stop + 1 ;
line.clear();
first = true ;
newline = false ;
@ -253,7 +259,6 @@ void hbs_cluster_log ( string & hostname,
}
if (( newline == false ) && ( line.length() ))
{
// ERIC
if (( logit == false ) && ( was_diff[h] == true ))
{
logit = true ;
@ -264,30 +269,25 @@ void hbs_cluster_log ( string & hostname,
{
if ( first )
{
clog ("............ %s %s %02d..%02d: %s",
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
if (( force ) || ( debug&2 ))
{
syslog ( LOG_INFO, "............ %s %s: %s",
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
line.c_str());
}
}
else
{
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
if ( hostname == controller )
if (( force ) || ( debug&2 ))
{
clog ("%s view %s %s %02d..%02d: %s",
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
else
{
clog ("%s view from %s %s %s %02d..%02d: %s",
controller.c_str(),
hostname.c_str(),
log_prefix.c_str(), /* Infra <- */
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
syslog ( LOG_INFO, "%s view from %s %s %s: %s",
controller.c_str(),
hostname.c_str(),
log_prefix.c_str(), /* Infra <- */
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
line.c_str());
}
}
}
@ -307,40 +307,62 @@ void hbs_cluster_log ( string & hostname,
* Description: Formatted dump of the vault contents to the log file.
*
***************************************************************************/
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault )
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force )
{
syslog ( LOG_INFO, "Cluster Vault Dump: --------------------------------------------------------------------------------------------");
syslog ( LOG_INFO, "Cluster Vault: v%d.%d %d msec period ; SM Reqid is %d with storage-0 %s and %d histories in %d bytes",
vault.version,
vault.revision,
vault.period_msec,
vault.reqid,
vault.storage0_enabled ? "enabled" : "disabled",
vault.histories,
vault.bytes );
for ( int h = 0 ; h < vault.histories ; h++ )
if ( vault.version == 0 )
return ;
int debug = daemon_get_cfg_ptr()->debug_state ;
if (( debug & 2 ) || ( force == true ))
{
#define MAX_LINE_LEN (500)
char str[MAX_LINE_LEN] ;
int i = 0 ;
for ( int e = 0 ; e < vault.history[h].entries_max ; e++ )
{
snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" ,
vault.history[h].oldest_entry_index==e ? '>' : ' ',
vault.history[h].entry[e].hosts_enabled,
vault.history[h].entry[e].hosts_responding);
i = strlen(str) ;
}
syslog ( LOG_INFO, "Cluster Vault: C%d %s S:%s:%s (%d:%d) %s",
vault.history[h].controller,
hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
vault.storage0_enabled ? "y" : "n",
vault.history[h].storage0_responding ? "y" : "n",
vault.history[h].entries_max,
vault.history[h].entries,
str);
ilog ("%s", log_prefix.c_str());
syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec heartbeat period %s;%d network heartbeat response histories (%d bytes)",
vault.version,
vault.revision,
vault.period_msec,
vault.storage0_enabled ? " with storage-0: enabled " : "",
vault.histories,
vault.bytes );
}
if (( debug & 4 ) || ( force == true ))
{
for ( int h = 0 ; h < vault.histories ; h++ )
{
#define MAX_LINE_LEN (500)
char str[MAX_LINE_LEN] ;
int i = 0 ;
for ( int e = 0 ; e < vault.history[h].entries_max ; e++ )
{
snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" ,
vault.history[h].oldest_entry_index==e ? '>' : ' ',
vault.history[h].entry[e].hosts_enabled,
vault.history[h].entry[e].hosts_responding);
i = strlen(str) ;
}
if ( vault.storage0_enabled )
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
vault.history[h].controller,
hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
vault.history[h].storage0_responding ? "y" : "n",
str);
}
else
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
vault.history[h].controller,
hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
str);
}
}
}
if ( debug & 8 )
{
dump_memory ( &vault, 16, vault.bytes );
}
// dump_memory ( &vault, 16, vault.bytes );
}

View File

@ -46,6 +46,7 @@ CONTROL_OBJS += mtcHttpSvr.o
CONTROL_OBJS += mtcCmdHdlr.o
CONTROL_OBJS += mtcNodeMnfa.o
CONTROL_OBJS += mtcVimApi.o
CONTROL_OBJS += mtcStubs.o
CONTROL_OBJS += ../common/nodeClass.o
OBJS = $(SRCS:.cpp=.o)

View File

@ -48,6 +48,7 @@ using namespace std;
#include "mtcAlarm.h" /* for ... mtcAlarm... */
#include "nodeUtil.h" /* for ... get_event_str ... */
int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr );
/* Throttle logging of messages from unknown IP addresses */
std::list<string> unknown_ip_list ;
@ -766,7 +767,7 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
return ( rc );
}
int send_hbs_command ( string hostname, int cmd )
int send_hbs_command ( string hostname, int cmd, string controller )
{
int bytes = 0 ;
int bytes_to_send = 0 ;
@ -776,18 +777,6 @@ int send_hbs_command ( string hostname, int cmd )
mtc_message_type event ;
mtc_socket_type * sock_ptr = get_sockPtr ();
/* We don't heartbeat self */
if (( obj_ptr->is_active_controller (hostname) ) &&
(( cmd == MTC_CMD_ADD_HOST ) ||
( cmd == MTC_CMD_DEL_HOST ) ||
( cmd == MTC_CMD_START_HOST ) ||
( cmd == MTC_CMD_STOP_HOST )))
{
dlog ("%s refusing to '%s' self to heartbeat service\n",
hostname.c_str(), get_event_str(cmd).c_str());
return (PASS);
}
memset (&event, 0 , sizeof(mtc_message_type));
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_hbs_cmd_req_header() );
snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME , "%s", hostname.data());
@ -795,48 +784,72 @@ int send_hbs_command ( string hostname, int cmd )
/* There is no buffer data in any of these messages */
bytes_to_send = ((sizeof(mtc_message_type))-(BUF_SIZE)) ;
switch ( cmd )
{
case MTC_CMD_STOP_HOST:
ilog ("%s sending 'stop' to heartbeat service\n", hostname.c_str());
break ;
case MTC_CMD_START_HOST:
obj_ptr->manage_heartbeat_clear ( hostname , MAX_IFACES );
ilog ("%s sending 'start' to heartbeat service\n", hostname.c_str());
break ;
case MTC_CMD_DEL_HOST:
ilog ("%s sending 'delete' to heartbeat service\n", hostname.c_str());
break ;
case MTC_CMD_ADD_HOST:
obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES );
ilog ("%s sending 'add' to heartbeat service\n", hostname.c_str());
break ;
case MTC_RESTART_HBS:
ilog ("%s sending 'restart' to heartbeat service\n", hostname.c_str());
break ;
case MTC_BACKOFF_HBS:
ilog ("%s requesting heartbeat period backoff\n", hostname.c_str());
break ;
case MTC_RECOVER_HBS:
ilog ("%s requesting heartbeat period recovery\n", hostname.c_str());
break ;
default:
{
slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd );
return (FAIL_BAD_PARM);
}
}
event.cmd = cmd ;
event.num = 1 ;
event.parm[0] = obj_ptr->get_nodetype(hostname);
/* send to hbsAgent daemon port */
bytes = sock_ptr->mtc_to_hbs_sock->write((char*) &event, bytes_to_send);
if ( bytes <= 0 )
std::list<string> controllers ;
controllers.clear();
if ( controller == CONTROLLER )
{
wlog ("Cannot send to heartbeat service\n");
rc = FAIL_TO_TRANSMIT ;
controllers.push_back(CONTROLLER_0);
controllers.push_back(CONTROLLER_1);
}
else
{
controllers.push_back(controller);
}
string ip = "" ;
std::list<string>::iterator unit ;
for ( unit = controllers.begin () ;
unit != controllers.end () ;
unit++ )
{
switch ( cmd )
{
case MTC_CMD_ACTIVE_CTRL:
mlog3 ("%s sending 'activity state' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
break ;
case MTC_CMD_STOP_HOST:
ilog ("%s sending 'stop' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
break ;
case MTC_CMD_START_HOST:
obj_ptr->manage_heartbeat_clear ( hostname , MAX_IFACES );
ilog ("%s sending 'start' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
break ;
case MTC_CMD_DEL_HOST:
ilog ("%s sending 'delete' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
break ;
case MTC_CMD_ADD_HOST:
obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES );
ilog ("%s sending 'add' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
break ;
case MTC_RESTART_HBS:
ilog ("%s sending 'restart' to %s heartbeat service\n", hostname.c_str(), unit->c_str());
break ;
case MTC_BACKOFF_HBS:
ilog ("%s requesting %s heartbeat period backoff\n", hostname.c_str(), unit->c_str());
break ;
case MTC_RECOVER_HBS:
ilog ("%s requesting %s heartbeat period recovery\n", hostname.c_str(), unit->c_str());
break ;
default:
{
slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd );
rc = FAIL_BAD_PARM ;
continue ;
}
}
ip = get_mtcInv_ptr()->get_hostaddr(*unit) ;
bytes = sock_ptr->mtc_to_hbs_sock->write((char*) &event, bytes_to_send, ip.data());
if ( bytes <= 0 )
{
wlog ("%s failed to send command (0x%x) to heartbeat service at %s\n", unit->c_str(), cmd, ip.c_str() );
rc = FAIL_TO_TRANSMIT ;
}
}
return rc ;
}
@ -954,6 +967,14 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
/* Assert the degrade condition with the 'false' (i.e. not clear)*/
obj_ptr->manage_heartbeat_degrade ( hostname, iface, false );
}
/* Otherwise the action must be alarm only or none ; both of which
* are already handled by the hbsAgent, so do nothing */
else
{
ilog ("%s heartbeat degrade event dropped ; action is not fail or degrade (%s)\n",
hostname.c_str(),
get_iface_name_str(iface));
}
}
else
{
@ -1003,7 +1024,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
* are already handled by the hbsAgent, so do nothing */
else
{
dlog ("%s heartbeat loss event dropped (%s)\n",
ilog ("%s heartbeat loss event dropped ; action is not fail or degrade (%s)\n",
hostname.c_str(),
get_iface_name_str(iface));
}
@ -1070,6 +1091,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
else if ( msg.cmd == MTC_EVENT_HEARTBEAT_READY )
{
string controller = CONTROLLER ;
std::list<string>::iterator temp ;
/* no heartbeating in simplex mode */
@ -1078,7 +1100,17 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
return (PASS);
}
ilog ("Received 'Heartbeat Service Ready' Event\n");
/* get the controller that sent this ready event */
if (( msg.buf[0] != '\0' ) && ( strnlen( msg.buf, BUF_SIZE) <= MAX_CHARS_HOSTNAME ))
{
controller = msg.buf ;
ilog ("%s Heartbeat Service Ready Event (%s)\n",
msg.buf, sock_ptr->mtc_event_rx_sock->get_src_str());
}
else
{
ilog ("Heartbeat Service Ready Event\n");
}
obj_ptr->hbs_ready = true ;
/* Run Maintenance on Inventory */
@ -1093,25 +1125,17 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
* the heartbeat service. This tell the heartbeat
* service about all the hosts so that it will
* send heartbeat oob flag events to mtce. */
if ( send_hbs_command( hostname, MTC_CMD_ADD_HOST ) != PASS )
if ( send_hbs_command( hostname, MTC_CMD_ADD_HOST, controller ) != PASS )
{
elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str());
}
/* Send the start event to the heartbeat service for all enabled hosts except
* for the active controller which is not actively monitored */
if ( obj_ptr->is_active_controller ( hostname ) == false )
/* Send the start event to the heartbeat service for all enabled hosts */
if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) &&
((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) ||
(obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )))
{
if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) &&
((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) ||
(obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )))
{
send_hbs_command ( hostname, MTC_CMD_START_HOST );
}
}
else
{
dlog ("%s Refusing to start heartbeat of self\n", hostname.c_str() );
send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
}
}
}

View File

@ -802,7 +802,11 @@ int mtc_socket_init ( void )
/***********************************************************/
int port = daemon_get_cfg_ptr()->hbs_to_mtc_event_port ;
mtc_sock.mtc_event_rx_sock = new msgClassRx(LOOPBACK_IP, port, IPPROTO_UDP);
/* listen to this port on any interface so that the hbsAgent running
* locally or on peer controller can get events into mtcAgent */
mtc_sock.mtc_event_rx_sock =
new msgClassRx(mtcInv.my_float_ip.data(), port, IPPROTO_UDP);
rc = mtc_sock.mtc_event_rx_sock->return_status;
if ( rc )
{
@ -820,7 +824,7 @@ int mtc_socket_init ( void )
/***********************************************************/
port = daemon_get_cfg_ptr()->mtc_to_hbs_cmd_port ;
sock_ptr->mtc_to_hbs_sock = new msgClassTx(LOOPBACK_IP, port, IPPROTO_UDP);
sock_ptr->mtc_to_hbs_sock = new msgClassTx(CONTROLLER, port, IPPROTO_UDP, mtc_config.mgmnt_iface);
rc = sock_ptr->mtc_to_hbs_sock->return_status;
if ( rc )
{
@ -1281,11 +1285,14 @@ void daemon_service_run ( void )
mtcInv.inotify_shadow_file_fd ,
mtcInv.inotify_shadow_file_wd );
/* Add this controller to the heartbeat service so that we
* receive the out-of-band heartbeat 'flags' even though
* we don't self monitor the active controller specifically
* This add may be duplicate but covers the initial config case */
/* inform the heartbeat service that this controller is active */
send_hbs_command ( mtcInv.my_hostname, MTC_CMD_ACTIVE_CTRL );
/* Add this controller to the heartbeat service so that
* the peer hbsAgent also gets this controller's inventory
* and this hbsAgent receives the out-of-band heartbeat 'flags' */
send_hbs_command ( mtcInv.my_hostname, MTC_CMD_ADD_HOST );
send_hbs_command ( mtcInv.my_hostname, MTC_CMD_START_HOST );
socks.clear();
socks.push_front (mtc_sock.mtc_event_rx_sock->getFD()); // service_events

View File

@ -6205,6 +6205,13 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
#endif
/* Audits for this controller host only */
if ( node_ptr->hostname == this->my_hostname )
{
/* Remind the heartbeat service that this is the active ctrl */
send_hbs_command ( this->my_hostname, MTC_CMD_ACTIVE_CTRL );
}
/* Manage active controller auto recovery bool.
* If the inactive controller is inservice then disable
* controller autorecovery. Otherwise enable it but in this case

View File

@ -14,4 +14,10 @@ using namespace std;
#include "nodeClass.h" /* The main link class */
void hbs_cluster_log ( void ) { }
/* No-op stub for hbs_cluster_log: each argument is handed to UNUSED() and
 * nothing else is done. 'force' defaults to false when omitted. */
void hbs_cluster_log ( string & hostname, string prefix, bool force=false )
{
UNUSED(hostname);
UNUSED(prefix);
UNUSED(force);
}