From 0b922227acb6877214bf05e31073815dc8bc3466 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Tue, 13 Nov 2018 14:55:17 -0500 Subject: [PATCH] Implement Active-Active Heartbeat as HA Improvement This update introduces mtce changes to support Active-Active Heartbeating. The purpose of Active-Active Heartbeating is to help avoid Split-Brain. Active-Active heartbeating has each controller maintain a 5 second heartbeat response history cache of each network for all monitored hosts as well as the on-going health of storage-0 if provisioned and enabled. This is referred to as the 'heartbeat cluster history'. Each controller then includes its cluster history in each heartbeat pulse request message. The hbsClient, now modified to handle heartbeat from both controllers, saves each controller's heartbeat cluster history in a local cache and criss-crosses the data in its pulse responses. So when the hbsClient receives a pulse request from controller-0 it saves its reported history and then replaces that history information in its response to controller-0 with what it saved from controller-1's last pulse request ; i.e. its view of the system. Controller-0, receiving a host's pulse response, saves its peer's heartbeat cluster history so that it has a summary of heartbeat cluster history for the last 5 seconds for each monitored network of every monitored host in the system from both controllers' perspectives. Same for controller-1 with controller-0's history. The hbsAgent is then further enhanced to support a query request for this information. So now SM, when it needs to make a decision to avoid Split-Brain or otherwise, can query either controller for its heartbeat cluster history and get the last 5 second summary view of heartbeat (network) responsiveness from both controllers' perspectives to help decide which controller to make active. 
This involved removing the hbsAgent process from SM control and monitoring and adding a new hbsAgent LSB init script for process launch, service file to run the init script and pmon config file for hbsAgent process monitoring. With hbsAgent now running on both controllers, changes to maintenance were required to send inventory to hbsAgent on both controllers, listen for hbsAgent event messages over the management interface and inform both hbsAgents which controller is active. The hbsAgent running on the inactive controller - does not send heartbeat events to maintenance - does not raise or clear alarms or produce customer logs Test Plan: Feature: PASS: Verify hbsAgent runs on both controllers PASS: Verify hbsAgent as pmon monitored process (not SM) PASS: Verify system install and cluster collection in all system types (10+) PASS: Verify active controller hbsAgent detects and handles heartbeat loss PASS: Verify inactive controller hbsAgent detects and logs heartbeat loss PASS: Verify heartbeat cluster history collection functions properly. PASS: Verify storage-0 state tracking in cluster info. PASS: Verify storage-0 not responding handling PASS: Verify heartbeat response is sent back to only the requesting controller. PASS: Verify heartbeat history is correct from each controller PASS: Verify MNFA from active controller after install to controller-0 PASS: Verify MNFA from active controller after swact to controller-1 PASS: Verify MNFA for 80%+ of the hosts in the storage system PASS: Verify SM cluster query operation and content from both controllers PASS: Verify restart of inactive hbsAgent doesn't clear existing heartbeat alarms Logging: PASS: Verify cluster info logs. PASS: Verify feature design logging. 
PASS: Verify hbsAgent and hbsClient design logs on all hosts add value PASS: Verify design logging from both controllers in heartbeat loss case PASS: Verify design logging from both controllers in MNFA case PASS: Verify clog logs cluster info vault status and updates for controllers PASS: Verify clog1 logs full cluster state change for all hosts PASS: Verify clog2 logs cluster info save/append logs for controllers PASS: Verify clog3 memory dumps a cluster history PASS: Verify USR2 forces heartbeat and cluster info log dump PASS: Verify hourly heartbeat and cluster info log dump PASS: Verify loss events force heartbeat and cluster info log dump Regression: PASS: Verify Large System DOR PASS: Verify pmond regression test that now includes hbsAgent PASS: Verify Lock/Unlock of inactive controller (x3) PASS: Verify Swact behavior (x10) PASS: Verify compute Lock/Unlock PASS: Verify storage-0 Lock/Unlock PASS: Verify compute Host Failure and Graceful Recovery PASS: Verify Graceful Recovery Retry to Max:3 then Full Enable PASS: Verify Delete Host PASS: Verify Patching hbsAgent and hbsClient PASS: Verify event driven cluster push Story: 2003576 Task: 24907 Change-Id: I5baf5bcca23601a99473d039356d58250ffb01b5 Signed-off-by: Eric MacDonald --- mtce-common/src/common/logMacros.h | 16 +- mtce-common/src/common/nodeBase.cpp | 1 + mtce-common/src/common/nodeBase.h | 15 +- mtce-common/src/common/nodeTimers.h | 1 + mtce-common/src/daemon/daemon_config.cpp | 3 +- mtce-common/src/daemon/daemon_main.cpp | 28 +- mtce-common/src/daemon/daemon_option.h | 3 +- mtce-control/centos/build_srpm.data | 2 +- mtce-control/centos/mtce-control.spec | 4 + mtce-control/src/Makefile | 33 +- mtce-control/src/scripts/hbsAgent | 117 +++ mtce-control/src/scripts/hbsAgent.conf | 25 + mtce-control/src/scripts/hbsAgent.service | 22 + mtce/centos/build_srpm.data | 2 +- mtce/centos/mtce.spec | 2 - mtce/src/alarm/alarm.cpp | 19 +- mtce/src/alarm/alarm.h | 1 + mtce/src/common/nodeClass.cpp | 125 ++- 
mtce/src/common/nodeClass.h | 4 + mtce/src/heartbeat/hbsAgent.cpp | 1063 +++++++++++++-------- mtce/src/heartbeat/hbsBase.h | 27 +- mtce/src/heartbeat/hbsClient.cpp | 231 +++-- mtce/src/heartbeat/hbsCluster.cpp | 387 +++++--- mtce/src/heartbeat/hbsStubs.cpp | 3 +- mtce/src/heartbeat/hbsUtil.cpp | 192 ++-- mtce/src/maintenance/Makefile | 1 + mtce/src/maintenance/mtcCtrlMsg.cpp | 154 +-- mtce/src/maintenance/mtcNodeCtrl.cpp | 19 +- mtce/src/maintenance/mtcNodeHdlrs.cpp | 7 + mtce/src/maintenance/mtcStubs.cpp | 8 +- 30 files changed, 1678 insertions(+), 837 deletions(-) create mode 100644 mtce-control/src/scripts/hbsAgent create mode 100644 mtce-control/src/scripts/hbsAgent.conf create mode 100644 mtce-control/src/scripts/hbsAgent.service diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index 60c43bbf..b9e124d1 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -39,7 +39,6 @@ typedef struct { int scheduling_priority ; /**< Scheduling priority of this daemon */ bool active ; /**< Maintenance activity state true|false */ - int hbs_pulse_period ; /**< time (msec) between heartbeat requests */ int token_refresh_rate ; /**< token refresh rate in seconds */ int hbs_minor_threshold ; /**< heartbeat miss minor threshold */ int hbs_degrade_threshold ; /**< heartbeat miss degrade threshold */ @@ -351,7 +350,7 @@ extern char *program_invocation_short_name; } #define blog(format, args...) 
{ \ - if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ + if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt&1) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ else { if(daemon_get_cfg_ptr()->debug_bmgmt) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \ } @@ -380,22 +379,22 @@ extern char *program_invocation_short_name; #define mlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&4 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg4 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define mlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&8 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg8 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define jlog(format, args...) { if(daemon_get_cfg_ptr()->debug_json ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define jlog(format, args...) { if(daemon_get_cfg_ptr()->debug_json&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define jlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_json&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define jlog2(format, args...) 
{ if(daemon_get_cfg_ptr()->debug_json&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define jlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_json&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define hlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_http&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define hlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_http&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define hlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_http&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define alog(format, args...) { if(daemon_get_cfg_ptr()->debug_alive ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define alog(format, args...) 
{ if(daemon_get_cfg_ptr()->debug_alive&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define alog1(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define alog2(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define alog3(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define qlog(format, args...) { if(daemon_get_cfg_ptr()->debug_work) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define qlog(format, args...) { if(daemon_get_cfg_ptr()->debug_work&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define qlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_work&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define qlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_work&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define qlog3(format, args...) 
{ if(daemon_get_cfg_ptr()->debug_work&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } @@ -403,8 +402,11 @@ extern char *program_invocation_short_name; #define flog(format, args...) { if(daemon_get_cfg_ptr()->debug_fsm) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: FSM : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define tlog(format, args...) { if(daemon_get_cfg_ptr()->debug_timer) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Timer: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } -#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define clog1(format, args...) { if(daemon_get_cfg_ptr()->debug_state&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define clog2(format, args...) { if(daemon_get_cfg_ptr()->debug_state&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } +#define clog3(format, args...) { if(daemon_get_cfg_ptr()->debug_state&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } + #define log_event(format, args...) 
{ syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Event: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } #define log_stress(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Stress: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index f7410580..f0db8696 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -233,6 +233,7 @@ const char * get_mtcNodeCommand_str ( int cmd ) case MTC_CMD_QRY_HOST: return("query host"); case MTC_CMD_START_HOST: return("start host service"); case MTC_CMD_STOP_HOST: return("stop host service"); + case MTC_CMD_ACTIVE_CTRL: return("publish active controller"); /* VM Instance Commands */ case MTC_CMD_ADD_INST: return("add instance"); diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 3504acc6..788954b6 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -359,6 +359,7 @@ void daemon_exit ( void ); * a power-off to online transition */ #define MTC_MTCALIVE_HITS_TO_GO_ONLINE (5) +#define CONTROLLER_X ((const char *)"controller-x") #define CONTROLLER_0 ((const char *)"controller-0") #define CONTROLLER_1 ((const char *)"controller-1") #define CONTROLLER_2 ((const char *)"controller-2") @@ -526,7 +527,8 @@ typedef struct #define MTC_CMD_MOD_HOST (0x11110012) /* Query Host */ #define MTC_CMD_QRY_HOST (0x11110013) /* Modify Host */ #define MTC_CMD_START_HOST (0x11110014) /* Start Monitoring Host */ -#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Moniroting Host */ +#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Monitoring Host */ +#define MTC_CMD_ACTIVE_CTRL (0x11110016) /* Active Controller */ #define MTC_CMD_ADD_INST (0x11110020) /* Add Inst */ #define MTC_CMD_DEL_INST (0x11110021) /* Delete Inst */ @@ -643,6 +645,9 @@ 
typedef struct #define PMOND_FLAG (0x00000001) /**< Process Monitor O.K. Flag */ #define INFRA_FLAG (0x00000002) /**< Infrastructure iface provisioned Flag */ +#define CTRLX_MASK (0x00000300) /**< From/To Controller-0/1/2/3 Number */ +#define CTRLX_BIT ((unsigned int)8) /**< used to shift right mask into bit 0 */ + #define STALL_MON_FLAG (0x00010000) /**< Flag indicating hang monitor running */ #define STALL_REC_FLAG (0x00020000) /**< Flag indicating hbsClient took action */ #define STALL_ERR1_FLAG (0x00100000) /**< Error 1 Flag */ @@ -1217,15 +1222,15 @@ string get_availStatus_str ( mtc_nodeAvailStatus_enum availStatus ); string get_operState_str ( mtc_nodeOperState_enum operState ); string get_adminState_str ( mtc_nodeAdminState_enum adminState ); -void log_adminAction ( string hostname, - mtc_nodeAdminAction_enum currAction, +void log_adminAction ( string hostname, + mtc_nodeAdminAction_enum currAction, mtc_nodeAdminAction_enum newAction ); -int send_hbs_command ( string hostname, int command ); +int send_hbs_command ( string hostname, int command, string controller=CONTROLLER ); int send_hwmon_command ( string hostname, int command ); int send_guest_command ( string hostname, int command ); -int daemon_log_message ( const char * hostname, +int daemon_log_message ( const char * hostname, const char * filename, const char * log_str ); diff --git a/mtce-common/src/common/nodeTimers.h b/mtce-common/src/common/nodeTimers.h index 7371c3bd..8f7178d8 100755 --- a/mtce-common/src/common/nodeTimers.h +++ b/mtce-common/src/common/nodeTimers.h @@ -48,6 +48,7 @@ #define MTC_MINS_20 (1200) #define MTC_MINS_30 (1800) #define MTC_MINS_40 (2400) +#define MTC_HRS_1 (3600) #define MTC_HRS_4 (14400) #define MTC_HRS_8 (28800) /* old token refresh rate */ diff --git a/mtce-common/src/daemon/daemon_config.cpp b/mtce-common/src/daemon/daemon_config.cpp index 80f4c801..5053f5f8 100644 --- a/mtce-common/src/daemon/daemon_config.cpp +++ b/mtce-common/src/daemon/daemon_config.cpp @@ 
-269,7 +269,7 @@ void daemon_dump_cfg ( void ) { daemon_config_type * ptr = daemon_get_cfg_ptr(); - ilog ("Configuration Settings\n------------------------------\n"); + ilog ("Configuration Settings ...\n"); if ( ptr->scheduling_priority ) { ilog ("scheduling_priority = %d\n", ptr->scheduling_priority ); } if ( ptr->infra_degrade_only ) { ilog ("infra_degrade_only = %s\n", ptr->infra_degrade_only ? "Yes" : "No" );} @@ -277,7 +277,6 @@ void daemon_dump_cfg ( void ) if ( ptr->active ) { ilog ("active = %s\n", ptr->active ? "Yes" : "No" );} /* hbsAgent */ - if ( ptr->hbs_pulse_period ) { ilog ("hbs_pulse_period = %d\n", ptr->hbs_pulse_period );} if ( ptr->token_refresh_rate ) { ilog ("token_refresh_rate = %d\n", ptr->token_refresh_rate );} if ( ptr->hbs_minor_threshold ) { ilog ("hbs_minor_threshold = %d\n", ptr->hbs_minor_threshold );} if ( ptr->hbs_degrade_threshold ) { ilog ("hbs_degrade_threshold = %d\n", ptr->hbs_degrade_threshold );} diff --git a/mtce-common/src/daemon/daemon_main.cpp b/mtce-common/src/daemon/daemon_main.cpp index 9c2bb190..de6f2aee 100755 --- a/mtce-common/src/daemon/daemon_main.cpp +++ b/mtce-common/src/daemon/daemon_main.cpp @@ -78,6 +78,7 @@ void print_help ( void ) printf ("\t-l --log - Log to file ; /var/log/.log\n"); printf ("\t-p --passive - Passive mode ; do not act on failures\n"); printf ("\t-v --verbose - Show command line arguments\n"); + printf ("\t-V --Virtual - Running in virtual environment\n"); printf ("\t-t --test - Run Test Head\n"); printf ("\t-g --gap - Gap in seconds\n"); printf ("\t-m --mode - Word string representing a run mode\n"); @@ -106,6 +107,9 @@ int daemon_get_run_option ( const char * option ) } return (1); } + else if ( !strcmp ( option, "Virtual" ) ) + return opts.Virtual ; + else if ( !strcmp ( option, "front" ) ) return opts.front ; @@ -118,6 +122,7 @@ void opts_init ( void) opts.log = false ; opts.test = false ; opts.verbose = false ; + opts.Virtual = false ; opts.active = false ; opts.front = false ; 
opts.front = false ; @@ -152,8 +157,8 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr ) int cmd_arg_count = 1 ; /* command args start at 1 */ /* A string listing of valid short options letters. */ - const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvta"; - + const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvVta"; + /* An array listing of valid long options. */ const struct option long_options[] = { @@ -167,9 +172,10 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr ) { "username" , 1, NULL, 'u' }, { "help" , 0, NULL, 'h' }, { "active" , 0, NULL, 'a' }, - { "foreground", 0, NULL, 'f' }, - { "log" , 0, NULL, 'l' }, + { "foreground", 0, NULL, 'f' }, + { "log" , 0, NULL, 'l' }, { "verbose" , 0, NULL, 'v' }, + { "Virtual" , 0, NULL, 'V' }, { "test" , 0, NULL, 't' }, { NULL , 0, NULL, 0 } /* Required at end of array. */ }; @@ -254,19 +260,25 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr ) case 't': /* -t or --test */ { opts_ptr->test = true ; - cmd_arg_count++ ; + cmd_arg_count++ ; break; } - case 'v': /* -t or --verbose */ + case 'v': /* -v or --verbose */ { opts_ptr->verbose = true ; - cmd_arg_count++ ; + cmd_arg_count++ ; + break; + } + case 'V': /* -V or --Virtual */ + { + opts_ptr->Virtual = true ; + cmd_arg_count++ ; break; } case 'a': /* -a or --active */ { opts_ptr->active = true ; - cmd_arg_count++ ; + cmd_arg_count++ ; break; } case '?': diff --git a/mtce-common/src/daemon/daemon_option.h b/mtce-common/src/daemon/daemon_option.h index a4924b50..3aaac846 100755 --- a/mtce-common/src/daemon/daemon_option.h +++ b/mtce-common/src/daemon/daemon_option.h @@ -33,6 +33,7 @@ typedef struct int test ; /**< Enable test mode */ int info ; /**< Dump data module info */ int verbose ; /**< Dump command line options */ + int Virtual ; /**< Set to non-zero when in virtual env */ int active ; /**< Set daemon active */ int debug ; /**< Set tracing debug mode "debug,"test","info","trace" */ int front ; /**< run in the 
foreground ; do not daemonize */ @@ -43,7 +44,7 @@ typedef struct string username ; string command ; string password ; -} opts_type ; +} opts_type ; opts_type * daemon_get_opts_ptr ( void ); diff --git a/mtce-control/centos/build_srpm.data b/mtce-control/centos/build_srpm.data index a69a574d..48f2c3f2 100644 --- a/mtce-control/centos/build_srpm.data +++ b/mtce-control/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="$PKG_BASE/src" COPY_LIST="$SRC_DIR/*" -TIS_PATCH_VER=6 +TIS_PATCH_VER=7 diff --git a/mtce-control/centos/mtce-control.spec b/mtce-control/centos/mtce-control.spec index a69bca1f..d65782c7 100644 --- a/mtce-control/centos/mtce-control.spec +++ b/mtce-control/centos/mtce-control.spec @@ -34,6 +34,7 @@ make install buildroot=%{buildroot} _sysconfdir=%{_sysconfdir} _unitdir=%{_unitd if [ $1 -eq 1 ] ; then /bin/systemctl enable lighttpd.service /bin/systemctl enable qemu_clean.service + /bin/systemctl enable hbsAgent.service fi exit 0 @@ -41,6 +42,9 @@ exit 0 %defattr(-,root,root,-) %{_sysconfdir}/init.d/goenabledControl %license %{_datarootdir}/licenses/mtce-control-1.0/LICENSE +%{_sysconfdir}/pmon.d/hbsAgent.conf +%{_sysconfdir}/init.d/hbsAgent +%{_unitdir}/hbsAgent.service %clean rm -rf $RPM_BUILD_ROOT diff --git a/mtce-control/src/Makefile b/mtce-control/src/Makefile index 0741da1d..aaa3de7f 100755 --- a/mtce-control/src/Makefile +++ b/mtce-control/src/Makefile @@ -1,19 +1,32 @@ -SOURCE1 = goenabled -SOURCE2 = LICENSE +SOURCE1 = LICENSE +SOURCE2 = goenabled +SOURCE3 = hbsAgent +SOURCE4 = hbsAgent.conf +SOURCE5 = hbsAgent.service -local_etc_pmond = $(_sysconfdir)/pmond.d +local_etc_pmond = $(_sysconfdir)/pmon.d local_etc_goenabledd = $(_sysconfdir)/goenabled.d .PHONY: default install: - # Controller-Only Init Scripts - install -m 755 -p -D scripts/$(SOURCE1) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl - # Controller-Only Process Monitor Config files - install -m 755 -d $(buildroot)/$(local_etc_pmond) - # Controller-Only Go Enabled Test - install -m 
755 -d $(buildroot)/$(local_etc_goenabledd) + # for license install -m 755 -d $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0 - install -p -D -m 600 $(SOURCE2) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE + install -m 600 -p -D $(SOURCE1) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE + + # Controller-Only Init Scripts + install -m 755 -d $(buildroot)/$(_sysconfdir)/init.d + install -m 755 -p -D scripts/$(SOURCE2) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl + install -m 755 -p -D scripts/$(SOURCE3) $(buildroot)/$(_sysconfdir)/init.d/hbsAgent + + # Controller-Only Process Monitor Config files + install -m 755 -d $(buildroot)/$(local_etc_pmond) + install -m 644 -p -D scripts/$(SOURCE4) $(buildroot)/$(local_etc_pmond)/hbsAgent.conf + + # Controller-Only Heartbeat Service file + install -m 644 -p -D scripts/$(SOURCE5) $(buildroot)/$(_unitdir)/hbsAgent.service + + # Controller-Only Go Enabled Test + install -m 755 -d $(buildroot)/$(local_etc_goenabledd) diff --git a/mtce-control/src/scripts/hbsAgent b/mtce-control/src/scripts/hbsAgent new file mode 100644 index 00000000..db93fde0 --- /dev/null +++ b/mtce-control/src/scripts/hbsAgent @@ -0,0 +1,117 @@ +#! /bin/sh +# +# Copyright (c) 2018 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# +# chkconfig: 2345 95 95 +# +### BEGIN INIT INFO +# Provides: hbsAgent +# Default-Start: 3 5 +# Default-Stop: 0 1 2 6 +# Short-Description: Heartbeat Agent Daemon +### END INIT INFO + +. /etc/init.d/functions + +DAEMON_NAME="hbsAgent" +DAEMON="/usr/local/bin/${DAEMON_NAME}" +PIDFILE="/var/run/${DAEMON_NAME}.pid" + +VIRT_TOOL='virt-what' +# controller-1:~$ sudo virt-what +# virtualbox ... in virtualbox +# kvm ... in qemu + +# Linux Standard Base (LSB) Error Codes +RETVAL=0 +GENERIC_ERROR=1 +INVALID_ARGS=2 +UNSUPPORTED_FEATURE=3 +NOT_INSTALLED=5 +NOT_RUNNING=7 + +PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin +export PATH + +if [ ! 
-e "${DAEMON}" ] ; then + logger "${DAEMON} is missing" + exit ${NOT_INSTALLED} +fi + +case "$1" in + start) + logger "Starting ${DAEMON_NAME}" + echo -n "Starting ${DAEMON_NAME}: " + if [ -n "`pidof ${DAEMON_NAME}`" ] ; then + echo -n "is already running " + RETVAL=0 + else + tool=$(which ${VIRT_TOOL}) + if [ $? -eq 0 ] ; then + virtual=`${VIRT_TOOL}` + else + virtual="" + fi + + if [ "${virtual}" == "virtualbox" -o "${virtual}" == "kvm" ] ; then + start-stop-daemon --start -b -x ${DAEMON} -- -l -a -V + else + start-stop-daemon --start -b -x ${DAEMON} -- -l -a + fi + RETVAL=$? + fi + if [ ${RETVAL} -eq 0 ] ; then + pid=`pidof ${DAEMON_NAME}` + echo "OK" + logger "${DAEMON} (${pid})" + else + echo "FAIL" + RETVAL=${GENERIC_ERROR} + fi + ;; + + stop) + logger "Stopping ${DAEMON_NAME}" + echo -n "Stopping ${DAEMON_NAME}: " + if [ -n "`pidof ${DAEMON_NAME}`" ] ; then + killproc ${DAEMON_NAME} + fi + if [ -n "`pidof ${DAEMON_NAME}`" ] ; then + echo "FAIL" + RETVAL=${NOT_RUNNING} + else + echo "OK" + fi + rm -f ${PIDFILE} + ;; + + restart) + $0 stop + $0 start + ;; + + status) + pid=`pidof ${DAEMON_NAME}` + RETVAL=$? 
+ if [ ${RETVAL} -eq 0 ] ; then + echo "${DAEMON_NAME} is running" + else + echo "${DAEMON_NAME} is NOT running" + RETVAL=${NOT_RUNNING} + fi + ;; + + condrestart) + $0 restart + ;; + + *) + echo "usage: $0 { start | stop | status | restart | condrestart }" ; RETVAL=${INVALID_ARGS} + ;; +esac + +exit ${RETVAL} diff --git a/mtce-control/src/scripts/hbsAgent.conf b/mtce-control/src/scripts/hbsAgent.conf new file mode 100644 index 00000000..169e5ce0 --- /dev/null +++ b/mtce-control/src/scripts/hbsAgent.conf @@ -0,0 +1,25 @@ +[process] +process = hbsAgent +service = hbsAgent +pidfile = /var/run/hbsAgent.pid +style = lsb ; ocf or lsb +severity = major ; minor, major, critical +restarts = 1 ; restart retries before error assertion +interval = 10 ; number of seconds to wait between restarts +debounce = 10 ; number of seconds that a process needs to remain + ; running before degrade is removed and retry count + ; is cleared. +startuptime = 5 ; Seconds to wait after process start before starting the debounce monitor +mode = passive ; Monitoring mode: passive (default) or active + ; passive: process death monitoring (default: always) + ; active : heartbeat monitoring, i.e. 
request / response messaging + ; ignore : do not monitor or stop monitoring +quorum = 0 ; process is in the host watchdog quorum + +; Active Monitoring Options + +port = 2201 +period = 5 ; monitor period in seconds +timeout = 4 ; Messaging timeout period in seconds, must be shorter than period +threshold = 5 ; Number of back to back heartbeat failures before action + diff --git a/mtce-control/src/scripts/hbsAgent.service b/mtce-control/src/scripts/hbsAgent.service new file mode 100644 index 00000000..de3cb8d8 --- /dev/null +++ b/mtce-control/src/scripts/hbsAgent.service @@ -0,0 +1,22 @@ +[Unit] +Description=Titanium Cloud Maintenance Heartbeat Agent +After=network.target syslog.service config.service +Before=pmon.service + +[Service] +Type=forking +ExecStart=/etc/rc.d/init.d/hbsAgent start +ExecStop=/etc/rc.d/init.d/hbsAgent stop +PIDFile=/var/run/hbsAgent.pid +KillMode=process +SendSIGKILL=no + +# Process recovery is handled by pmond if it's running. +# Delay 10 seconds to give pmond a chance to recover +# before systemd kicks in to do it as a backup plan. 
+Restart=always +RestartSec=10 + +[Install] +WantedBy=multi-user.target + diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data index a6783bd5..01e786b1 100644 --- a/mtce/centos/build_srpm.data +++ b/mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=140 +TIS_PATCH_VER=142 BUILD_IS_SLOW=5 diff --git a/mtce/centos/mtce.spec b/mtce/centos/mtce.spec index d69e34bc..e6ffa183 100644 --- a/mtce/centos/mtce.spec +++ b/mtce/centos/mtce.spec @@ -313,7 +313,6 @@ install -m 755 -d %{buildroot}/usr/lib/ocf install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d/platform install -m 755 -p -D %{_buildsubdir}/scripts/mtcAgent %{buildroot}/usr/lib/ocf/resource.d/platform/mtcAgent -install -m 755 -p -D %{_buildsubdir}/scripts/hbsAgent %{buildroot}/usr/lib/ocf/resource.d/platform/hbsAgent install -m 755 -p -D %{_buildsubdir}/hwmon/scripts/ocf/hwmon %{buildroot}/usr/lib/ocf/resource.d/platform/hwmon # config files @@ -482,7 +481,6 @@ install -m 755 -d %{buildroot}/var/run # SM OCF Start/Stop/Monitor Scripts %{ocf_resourced}/platform/mtcAgent -%{ocf_resourced}/platform/hbsAgent # Config files %config(noreplace)/etc/mtc.ini diff --git a/mtce/src/alarm/alarm.cpp b/mtce/src/alarm/alarm.cpp index 6b113f89..f1642446 100644 --- a/mtce/src/alarm/alarm.cpp +++ b/mtce/src/alarm/alarm.cpp @@ -47,6 +47,11 @@ int alarm_register_user ( msgClassSock * sock_ptr ) return (rc); } +void alarm_unregister_user ( void ) +{ + user_sock_ptr = NULL ; +} + /* Construct an alarm request json string in the following form {\"mtcalarm\":[{\"alarmid\":\"200.009\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Infrastructure\",\"prefix\":\"service=heartbeat\"}, {\"alarmid\":\"200.005\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Management\",\"prefix\":\"service=heartbeat\"}]}" @@ -73,6 +78,17 @@ int alarm_ ( string hostname, const char * id, 
EFmAlarmStateT state, EFmAlarmSev string msg_type ; string sev ; + if ( user_sock_ptr == NULL ) + { + slog ("alarm socket is NULL"); + return (FAIL_NULL_POINTER ); + } + else if ( ! user_sock_ptr->sock_ok() ) + { + elog ("alarm socket is not ok"); + return (FAIL_OPERATION); + } + if ( state == FM_ALARM_STATE_MSG ) msg_type = "msg" ; else if ( state == FM_ALARM_STATE_SET ) @@ -127,7 +143,8 @@ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSev } else { - ilog ("%s %s\n", hostname.c_str(), request); + ilog ("%s %s %s %s %s", hostname.c_str(), entity, msg_type.c_str(), sev.c_str(), id); + mlog ("%s %s\n", hostname.c_str(), request); return ( PASS ) ; } daemon_signal_hdlr (); diff --git a/mtce/src/alarm/alarm.h b/mtce/src/alarm/alarm.h index 9e29c971..1bc8f9e0 100644 --- a/mtce/src/alarm/alarm.h +++ b/mtce/src/alarm/alarm.h @@ -68,6 +68,7 @@ EFmAlarmSeverityT alarmUtil_getSev_enum ( string severity ); #ifndef __MODULE_PRIVATE__ int alarm_register_user ( msgClassSock * sock_ptr ); +void alarm_unregister_user ( void ); /* Public API */ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSeverityT severity, const char * entity, string prefix ); diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 0c51ab23..e33c70ce 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -36,6 +36,7 @@ using namespace std; #include "mtcAlarm.h" #include "alarm.h" #include "hbsAlarm.h" +#include "hbsBase.h" /** Initialize the supplied command buffer */ void mtcCmd_init ( mtcCmd & cmd ) @@ -263,7 +264,8 @@ nodeLinkClass::nodeLinkClass() /* Make no assumption on the service */ maintenance = false ; heartbeat = false ; - active = false ; + active = false ; /* run active */ + active_controller = false ; /* true if this controller is active */ /* Set some defaults for the hearbeat service */ hbs_ready = false ; @@ -1156,26 +1158,26 @@ void nodeLinkClass::print_node_info ( void ) if (( i == 
INFRA_IFACE ) && ( infra_network_provisioned == false )) continue ; - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); - syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" , + syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n"); + syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" , get_iface_name_str ((iface_enum)i), hosts, hbs_disabled ? "DISABLED" : "Enabled ", hbs_pulse_period ); - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); - + syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n"); + for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next ) { - syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %7x | %d msec\n", + syslog ( LOG_INFO, "| %-12s | %c | %5i | %5i | %5i | %5i | %10x | %8x | %d msec\n", ptr->hostname.c_str(), ptr->monitor[i] ? 
'Y' : 'n', - ptr->hbs_misses_count[i], - ptr->max_count[i], - ptr->hbs_degrade_count[i], - ptr->hbs_failure_count[i], + ptr->hbs_misses_count[i], + ptr->max_count[i], + ptr->hbs_degrade_count[i], + ptr->hbs_failure_count[i], ptr->hbs_count[i], ptr->b2b_pulses_count[i], hbs_pulse_period ); } } - syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n"); + syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n"); } } @@ -7778,7 +7780,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle { /* This default RC allows the caller to filter out unexpected pulse responses */ int rc = ENXIO ; - + if ( head == NULL ) { return -ENODEV ; @@ -7962,6 +7964,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle } pulses[iface]-- ; } + else if ( node_ptr ) + { + dlog ("%s unexpected pulse response ; %s", + node_ptr->hostname.c_str(), + get_iface_name_str(iface)); + } + else + { + slog ("null pointer"); + } return rc ; } @@ -7972,6 +7984,13 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle * By index does not require a lookup whereas hostname does */ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags ) { + /* TODO: consider removing this check */ + if ( hostname == "localhost" ) + { + /* localhost is not a supported hostname and indicates + * an unconfigured host response ; return the ignore response */ + return(ENXIO); + } if ( index ) { int rc = remPulse_by_index ( hostname, index , iface, true , flags ); @@ -7984,16 +8003,6 @@ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index } else { - if ( hostname.compare("localhost") ) - { - get_hbs_monitor_state ( hostname , iface ) ; - } - else - { - /* localhost is not a supported hostname and indicates - * an unconfigured host response ; return 
the ignore response */ - return(ENXIO); - } } return ( remPulse_by_name ( hostname , iface, true, flags )); } @@ -8016,7 +8025,6 @@ void nodeLinkClass::clear_pulse_list ( iface_enum iface ) } } - /** Runs in the hbsAgent to set or clear heartbat alarms for all supported interfaces */ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT sev, int iface ) { @@ -8142,7 +8150,6 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) storage_0_responding = false ; } - /* Don't log single misses unless in debug mode */ if ( pulse_ptr->b2b_misses_count[iface] > 1 ) { if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) @@ -8207,7 +8214,10 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) { if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold ) { - send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface ); + if ( this->active_controller ) + { + send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface ); + } pulse_ptr->hbs_minor[iface] = true ; pulse_ptr->hbs_minor_count[iface]++ ; wlog ("%s %s -> MINOR\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface)); @@ -8215,10 +8225,17 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) } if ( pulse_ptr->b2b_misses_count[iface] == hbs_degrade_threshold ) { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface ); + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface ); - /* report this host as failed */ - if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS ) + /* report this host as failed */ + if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS ) + { + pulse_ptr->hbs_degrade[iface] = true ; + } + } + else { pulse_ptr->hbs_degrade[iface] = true ; } @@ -8231,11 +8248,17 @@ int 
nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) ( pulse_ptr->hbs_degrade[iface] == false )) { wlog ("%s -> DEGRADED - Auto-Correction\n", pulse_ptr->hostname.c_str()); + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface ); - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface ); - - /* report this host as failed */ - if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS ) + /* report this host as failed */ + if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS ) + { + pulse_ptr->hbs_degrade[iface] = true ; + } + } + else { pulse_ptr->hbs_degrade[iface] = true ; } @@ -8250,11 +8273,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) /* Only print the log at the threshold boundary */ if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + } wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface) ); + this->print_node_info (); + hbs_cluster_log ( this->my_hostname, "event", true ); } } @@ -8268,35 +8296,46 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) /* Only print the log at the threshold boundary */ if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) { - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); - + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + } wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, "%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface) ); + 
this->print_node_info (); + hbs_cluster_log ( this->my_hostname, "event", true ); } } else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) && - ( pulse_ptr->hbs_failure[iface] == false )) + ( pulse_ptr->hbs_failure[iface] == false )) { - elog ("%s %s -> FAILED\n", pulse_ptr->hostname.c_str(), - get_iface_name_str(iface) ); elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface) ); - manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); + if ( this->active_controller ) + { + manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); - /* report this host as failed */ - if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS ) + /* report this host as failed */ + if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS ) + { + pulse_ptr->hbs_failure[iface] = true ; + } + } + else { pulse_ptr->hbs_failure[iface] = true ; + this->print_node_info (); + hbs_cluster_log ( this->my_hostname, "event", true ); } - pulse_ptr->hbs_failure_count[iface]++ ; } if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] ) - pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ; + pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ; } + if ( remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS )) { elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(), diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index cb6d59ff..0701a749 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1266,6 +1266,10 @@ public: bool maintenance ; bool heartbeat ; + /* Set to true if this controller is active. + * Currently only used by heartbeat service. 
*/ + bool active_controller ; + /* offline_handler tuning controls */ int offline_threshold ; /* number of back to back mtcAlive misses before offline */ int offline_period ; /* offline handler mtcAlive request period */ diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index d7aaf5bd..1e2b2fc1 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -54,8 +54,6 @@ using namespace std; * daemon_signal_init * hbs_hostname_read * hbs_message_init - * hbs_int_socket_init - * hbs_ext_socket_init * forever ( timer_handler ) * hbs_pulse_req * hbs_timer_start @@ -74,7 +72,7 @@ using namespace std; /* Historical String data for mem_logs */ static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ; static string arrival_histogram[MAX_IFACES] = { "" , "" } ; - +static string mtcAgent_ip = "" ; static std::list<string> hostname_inventory ; /** This heartbeat service inventory is tracked by @@ -93,8 +91,6 @@ int module_init ( void ) return (PASS); } -static unsigned int controller_number = 0 ; - void daemon_sigchld_hdlr ( void ) { ; /* dlog("Received SIGCHLD ... no action\n"); */ @@ -130,17 +126,20 @@ hbs_ctrl_type * get_hbs_ctrl_ptr () { return &hbs_ctrl ; } void monitor_scheduling ( unsigned long long & this_time, unsigned long long & prev_time , int data, const char * label_ptr ) { this_time = gettime_monotonic_nsec () ; - if ( this_time > (prev_time + (NSEC_TO_MSEC*(hbs_config.latency_thld)))) /* 10 millisec */ + if ( label_ptr && strncmp ( label_ptr, NODEUTIL_LATENCY_MON_START, strlen(NODEUTIL_LATENCY_MON_START))) { - llog ("%4llu.%-4llu msec - %s at line %d\n", + if ( ! 
strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > 10 )) + { + ilog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld ); + } + else if ( this_time > (prev_time + (NSEC_TO_MSEC*(hbs_config.latency_thld)))) + { + llog ("%4llu.%-4llu msec %s at line %d\n", ((this_time-prev_time) > NSEC_TO_MSEC) ? ((this_time-prev_time)/NSEC_TO_MSEC) : 0, ((this_time-prev_time) > NSEC_TO_MSEC) ? ((this_time-prev_time)%NSEC_TO_MSEC) : 0, label_ptr, data); + } } -// else if ( ! strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > 10 )) -// { -// ilog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld ); -// } prev_time = this_time ; } @@ -167,6 +166,16 @@ void daemon_exit ( void ) if ( hbs_sock.mtc_to_hbs_sock ) delete (hbs_sock.mtc_to_hbs_sock); + /* Close the alarm socket */ + if ( hbs_sock.alarm_sock ) + delete (hbs_sock.alarm_sock); + + /* Close the SM sockets */ + if ( hbs_sock.sm_server_sock ) + delete (hbs_sock.sm_server_sock); + if ( hbs_sock.sm_client_sock ) + delete (hbs_sock.sm_client_sock); + exit (0); } @@ -179,8 +188,8 @@ void daemon_exit ( void ) #define HBS_SOCKET_MSEC (5) #define HBS_SOCKET_NSEC (HBS_SOCKET_MSEC*1000) -#define HBS_MIN_PERIOD (50) -#define HBS_MAX_PERIOD (999) +#define HBS_MIN_PERIOD (100) +#define HBS_MAX_PERIOD (1000) #define HBS_VIRT_PERIOD (500) #define HBS_BACKOFF_FACTOR (4) /* period*this during backoff */ @@ -212,35 +221,28 @@ static int hbs_config_handler ( void * user, { int curr_period = hbsInv.hbs_pulse_period ; - config_ptr->hbs_pulse_period = atoi(value); hbsInv.hbs_pulse_period = atoi(value); hbsInv.hbs_state_change = true ; hbsInv.hbs_disabled = false ; config_ptr->mask |= CONFIG_AGENT_HBS_PERIOD ; /* Adjust the heartbeat period in a virtual environment */ - if (( hbsInv.hbs_pulse_period >= HBS_MIN_PERIOD ) || - ( hbsInv.hbs_pulse_period <= HBS_MAX_PERIOD )) + if (( hbsInv.hbs_pulse_period < 
HBS_MIN_PERIOD ) || + ( hbsInv.hbs_pulse_period > HBS_MAX_PERIOD )) { - struct stat p ; - p.st_size = 0 ; - stat ( HOST_IS_VIRTUAL, &p ) ; - if ( p.st_size ) - { - if (( hbsInv.hbs_pulse_period != 0 ) && - ( hbsInv.hbs_pulse_period < HBS_VIRT_PERIOD )) - { - config_ptr->hbs_pulse_period = HBS_VIRT_PERIOD ; - hbsInv.hbs_pulse_period = HBS_VIRT_PERIOD ; - hbsInv.hbs_pulse_period_save = HBS_VIRT_PERIOD ; - } - } + hbsInv.hbs_pulse_period = HBS_MIN_PERIOD ; } + + if ( daemon_get_run_option("Virtual") ) + { + hbsInv.hbs_pulse_period = HBS_VIRT_PERIOD ; + } + hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ; if ( curr_period != hbsInv.hbs_pulse_period ) { /* initialize cluster info */ - hbs_cluster_init ( hbsInv.hbs_pulse_period ); + hbs_cluster_init ( hbsInv.hbs_pulse_period, hbs_sock.sm_client_sock ); } } @@ -405,10 +407,12 @@ int daemon_configure ( void ) ilog("Failure Thld: %i misses\n", hbsInv.hbs_failure_threshold ); ilog("Multicast : %s\n", hbs_config.multicast ); + ilog("Mgmnt Name : %s\n", hbs_config.mgmnt_iface ); /* TODO: Remove me */ + hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface ); - ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface ); - ilog("Mgmnt RxPort: %d\n", hbs_config.hbs_agent_mgmnt_port ); - ilog("Mgmnt TxPort: %d\n", hbs_config.hbs_client_mgmnt_port ); + ilog("Mgmnt Master: %s\n", hbs_config.mgmnt_iface ); + ilog("Mgmnt Port : %d (rx)", hbs_config.hbs_agent_mgmnt_port ); + ilog("Mgmnt Port : %d (tx)\n", hbs_config.hbs_client_mgmnt_port ); /* Fetch the infrastructure interface name. 
* calls daemon_get_iface_master inside so the @@ -423,13 +427,12 @@ int daemon_configure ( void ) else { hbsInv.infra_network_provisioned = true ; - ilog ("Infra iface : %s\n", hbs_config.infra_iface ); + ilog ("Infra Name : %s", hbs_config.infra_iface ); + ilog ("Infra Port : %d (rx)", hbs_config.hbs_agent_infra_port ); + ilog ("Infra Port : %d (tx)", hbs_config.hbs_client_infra_port ); } } - ilog("Infra RxPort: %d\n", hbs_config.hbs_agent_infra_port ); - ilog("Infra TxPort: %d\n", hbs_config.hbs_client_infra_port ); - ilog("Command Port: %d (rx)\n", hbs_config.mtc_to_hbs_cmd_port ); ilog("Event Port : %d (tx)\n", hbs_config.hbs_to_mtc_event_port ); ilog("Alarm Port : %d (tx)\n", hbs_config.mtcalarm_req_port ); @@ -477,6 +480,7 @@ int daemon_configure ( void ) } static struct mtc_timer hbsTimer ; +static struct mtc_timer hbsTimer_audit ; void hbsTimer_handler ( int sig, siginfo_t *si, void *uc) { @@ -498,6 +502,12 @@ void hbsTimer_handler ( int sig, siginfo_t *si, void *uc) mtcTimer_stop_int_safe ( hbsTimer ); hbsTimer.ring = true ; } + /* is the state audit timer */ + else if (( *tid_ptr == hbsTimer_audit.tid ) ) + { + mtcTimer_stop_int_safe ( hbsTimer_audit ); + hbsTimer_audit.ring = true ; + } else { // wlog ("Unexpected timer - %p", *tid_ptr ); @@ -567,6 +577,10 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max ) hbs_sock.tx_sock[i] = 0 ; return (FAIL_SOCKET_CREATE); } + else + { + hbs_sock.tx_sock[i]->sock_ok(true); + } } else { @@ -598,6 +612,7 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max ) hbs_sock.rx_sock[i]->setSocketMemory ( iface, "rx pulse socket memory", rmem_max ); else wlog ("failed to query rmem_max ; using rmem_default\n"); + hbs_sock.rx_sock[i]->sock_ok(true); } /* handle failure path */ @@ -619,107 +634,218 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max ) return (rc); } -/* Setup the Unix Domain Transmit Pulse Socket */ -int alarm_port_init ( void ) -{ - hbs_sock.alarm_port = daemon_get_cfg_ptr()->mtcalarm_req_port; - 
hbs_sock.alarm_sock = new msgClassTx(LOOPBACK_IP, hbs_sock.alarm_port, IPPROTO_UDP); - if ( hbs_sock.alarm_sock ) - { - if ( hbs_sock.alarm_sock->return_status == PASS ) - { - hbs_sock.alarm_sock->sock_ok(true); - alarm_register_user ( hbs_sock.alarm_sock ); - } - else - { - elog ("alarm_port_init failed socket setup (rc:%d)\n", - hbs_sock.alarm_sock->return_status ); - } - } - return ( hbs_sock.alarm_sock->return_status ) ; -} - -int hbs_sm_sockets_init ( void ) +/* ********************************************************************* + * + * Initialize all heartbeat messaging sockets + * + * 1. transmit socket to maintenance (ready event) + * 2. receive socket from maintenance (inventory) + * 3. alarm socket to alarmd + * 4. multicast transmit socket + * 5. unicast receive socket + * + * ********************************************************************/ +int hbs_socket_init ( void ) { int rc = PASS ; - /* Create an UDP RX Message Socket for SM Requests; LO interface only */ - hbs_sock.sm_server_sock = new msgClassRx(LOOPBACK_IP, hbs_config.sm_server_port, IPPROTO_UDP); - if ( ! hbs_sock.sm_server_sock ) - { - elog ("Failed to setup SM receive socket"); - rc = FAIL_SOCKET_CREATE ; - } - - /* Create an UDP TX Message Socket for SM Requests; LO interface only */ - hbs_sock.sm_client_sock = new msgClassTx(LOOPBACK_IP, hbs_config.sm_client_port,IPPROTO_UDP); - if ( ! hbs_sock.sm_client_sock ) - { - elog ("Failed to setup SM transmit socket"); - rc = FAIL_SOCKET_CREATE ; - } - - if ( rc == PASS ) - { - hbs_sock.sm_server_sock->sock_ok(true); - hbs_sock.sm_client_sock->sock_ok(true); - } - return (rc); -} - -/* Init the internal/local sockets ; the ones that will no change. - * This way we don't miss add and start commands from maintenance. 
*/ - -int hbs_int_socket_init ( void ) -{ - int rc = PASS ; - - ilog ("internal sockets init ...\n"); - /******************************************************************/ /* UDP Tx Message Socket for Heartbeat Events Towards Maintenance */ /******************************************************************/ - - int port = hbs_config.hbs_to_mtc_event_port ; - hbs_sock.hbs_event_tx_sock = new msgClassTx(LOOPBACK_IP, port, IPPROTO_UDP); - if (hbs_sock.hbs_event_tx_sock->return_status != PASS) + rc = FAIL_SOCKET_CREATE ; { - elog ("Failed to setup hbs event transmit port %d\n", port ); - return (hbs_sock.hbs_event_tx_sock->return_status) ; + /* load local variables */ + int port = hbs_config.hbs_to_mtc_event_port ; + mtcAgent_ip = getipbyname ( CONTROLLER ); + + /* Handle re-init case */ + if ( hbs_sock.hbs_event_tx_sock != NULL ) + { + delete (hbs_sock.hbs_event_tx_sock); + hbs_sock.hbs_event_tx_sock = NULL ; + } + + /* Create the socket */ + hbs_sock.hbs_event_tx_sock = + new msgClassTx ( mtcAgent_ip.data(), + port, + IPPROTO_UDP, + hbs_config.mgmnt_iface); + + /* Check the socket */ + if ( hbs_sock.hbs_event_tx_sock != NULL ) + { + if (hbs_sock.hbs_event_tx_sock->return_status == PASS) + { + /* success path */ + hbs_sock.hbs_event_tx_sock->sock_ok(true) ; + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup event transmit socket: %s:%s:%d\n", + hbs_config.mgmnt_iface, mtcAgent_ip.c_str(), port ); + return (rc); + } + } + + /****************************************************************/ + /* UDP Rx Message Socket for Maintenance Commands and Inventory */ + /****************************************************************/ + rc = FAIL_SOCKET_CREATE ; + { + /* load local variables */ + int port = hbs_config.mtc_to_hbs_cmd_port ; + + /* Handle re-init case */ + if ( hbs_sock.mtc_to_hbs_sock != NULL ) + { + delete (hbs_sock.mtc_to_hbs_sock); + hbs_sock.mtc_to_hbs_sock = NULL ; + } + + /* Create the socket */ + 
hbs_sock.mtc_to_hbs_sock = + new msgClassRx ( hbsInv.my_local_ip.data(), + port, + IPPROTO_UDP); + + /* Check the socket */ + if (hbs_sock.mtc_to_hbs_sock != NULL ) + { + if (hbs_sock.mtc_to_hbs_sock->return_status == PASS) + { + /* success path */ + hbs_sock.mtc_to_hbs_sock->sock_ok(true) ; + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup mtce command receive socket: %s:%d\n", + hbsInv.my_local_ip.c_str(), port ); + return (rc); + } + } + + /*****************************************************************/ + /* UDP Tx Message Socket to alarmd for alarm notifications */ + /*****************************************************************/ + rc = FAIL_SOCKET_CREATE ; + { + hbs_sock.alarm_port = daemon_get_cfg_ptr()->mtcalarm_req_port; + + /* Handle re-init case */ + if ( hbs_sock.alarm_sock != NULL ) + { + delete (hbs_sock.alarm_sock); + hbs_sock.alarm_sock = NULL ; + } + + /* Create the socket */ + hbs_sock.alarm_sock = + new msgClassTx(LOOPBACK_IP, hbs_sock.alarm_port, IPPROTO_UDP); + + /* Check the socket */ + if ( hbs_sock.alarm_sock ) + { + if ( hbs_sock.alarm_sock->return_status == PASS ) + { + hbs_sock.alarm_sock->sock_ok(true); + alarm_register_user ( hbs_sock.alarm_sock ); + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup alarm socket: LO:%d\n", + hbs_sock.alarm_port ); + alarm_unregister_user(); + return (rc ); + } } /***************************************************************/ - /* Non-Blocking UDP Rx Message Socket for Maintenance Commands */ + /* UDP RX Message Socket for SM Requests; LO interface only */ /***************************************************************/ - - port = hbs_config.mtc_to_hbs_cmd_port ; - hbs_sock.mtc_to_hbs_sock = new msgClassRx(LOOPBACK_IP, port, IPPROTO_UDP); - if (hbs_sock.mtc_to_hbs_sock->return_status != PASS) + rc = FAIL_SOCKET_CREATE ; { - elog ("Failed to setup mtce command receive port %d\n", port ); - return 
(hbs_sock.mtc_to_hbs_sock->return_status) ; + /* Handle re-init case */ + if ( hbs_sock.sm_server_sock != NULL ) + { + delete (hbs_sock.sm_server_sock); + hbs_sock.sm_server_sock = NULL ; + } + + /* Create the socket */ + hbs_sock.sm_server_sock = + new msgClassRx(LOOPBACK_IP, hbs_config.sm_server_port, IPPROTO_UDP); + + /* Check the socket */ + if ( hbs_sock.sm_server_sock ) + { + if ( hbs_sock.sm_server_sock->return_status == PASS ) + { + hbs_sock.sm_server_sock->sock_ok(true); + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup SM receive socket: LO:%d", + hbs_config.sm_server_port); + return (rc) ; + } } - if ( ( rc = alarm_port_init ()) != PASS ) + /***************************************************************/ + /* UDP TX Message Socket for SM Requests; LO interface only */ + /***************************************************************/ + rc = FAIL_SOCKET_CREATE ; { - elog ("Alarm port setup or registration failed (rc:%d)\n", rc ); + /* Handle re-init case */ + if ( hbs_sock.sm_client_sock != NULL ) + { + delete (hbs_sock.sm_client_sock); + hbs_sock.sm_client_sock = NULL ; + } + + /* Create the socket */ + hbs_sock.sm_client_sock = + new msgClassTx(LOOPBACK_IP, hbs_config.sm_client_port,IPPROTO_UDP); + + /* Check the socket */ + if ( hbs_sock.sm_client_sock ) + { + if ( hbs_sock.sm_client_sock->return_status == PASS ) + { + hbs_sock.sm_client_sock->sock_ok(true); + + /* initialize cluster info */ + hbs_cluster_init ( hbsInv.hbs_pulse_period, hbs_sock.sm_client_sock ); + + rc = PASS ; + } + } + + /* Handle errors */ + if ( rc ) + { + elog ("Failed to setup SM transmit socket: LO:%d", + hbs_config.sm_client_port); + return (rc) ; + } } - rc = hbs_sm_sockets_init () ; - - return (rc); -} - -/* Construct the messaging sockets * - * 1. multicast transmit socket * - * 2. 
unicast receive socket */ -int hbs_ext_socket_init ( void ) -{ - int rc = PASS ; - - ilog ("external sockets init ...\n"); - /* set rx socket buffer size ro rmem_max */ int rmem_max = daemon_get_rmem_max () ; @@ -740,6 +866,12 @@ int hbs_ext_socket_init ( void ) /* Setup the pulse messaging interfaces */ SETUP_PULSE_MESSAGING ( hbsInv.infra_network_provisioned, rmem_max ) ; + if (( hbs_sock.netlink_sock = open_netlink_socket ( RTMGRP_LINK )) <= 0 ) + { + elog ("Failed to create netlink listener socket"); + rc = FAIL_SOCKET_CREATE ; + } + return (rc) ; } @@ -751,9 +883,6 @@ int hbs_pulse_request ( iface_enum iface, string hostname_clue, unsigned int lookup_clue) { -#ifdef WANT_HBS_MEM_LOGS - char str[MAX_LEN] ; -#endif int bytes = 0 ; if ( hbs_sock.tx_sock[iface] ) { @@ -766,6 +895,12 @@ int hbs_pulse_request ( iface_enum iface, /* Add the sequence number */ hbs_sock.tx_mesg[iface].s = seq_num ; + /* Add which controller initiated this pulse */ + if (hbs_ctrl.controller ) + hbs_sock.tx_mesg[iface].f |= ( hbs_ctrl.controller << CTRLX_BIT ); + + /* Add this controller's lookup_clue + * ... 
aka RRI (Resource Reference Index) */ if (( lookup_clue ) && ( hostname_clue.length() <= MAX_CHARS_HOSTNAME )) { @@ -812,25 +947,24 @@ int hbs_pulse_request ( iface_enum iface, #ifdef WANT_FIT_TESTING hbs_pulse_request_out: #endif - - mlog("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%d:%x:%s\n", - get_iface_name_str(iface), bytes, - hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), - hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), - hbs_sock.tx_mesg[iface].v, - hbs_sock.tx_mesg[iface].s, - hbs_sock.tx_mesg[iface].c, - hbs_sock.tx_mesg[iface].f, - hbs_sock.tx_mesg[iface].m); - -#ifdef WANT_HBS_MEM_LOGS - snprintf ( &str[0], MAX_LEN, "%s Pulse Req: %17s:%5d: %u:%u:%s\n", - get_iface_name_str(iface), + mlog ( "%s Pulse Req: (%d) %s:%d: s:%u f:%x [%s] RRI:%d\n", + get_iface_name_str(iface), bytes, hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.tx_mesg[iface].s, - hbs_sock.tx_mesg[iface].c, - hbs_sock.tx_mesg[iface].m); + hbs_sock.tx_mesg[iface].f, + hbs_sock.tx_mesg[iface].m, + hbs_sock.tx_mesg[iface].c); +#ifdef WANT_HBS_MEM_LOGS + char str[MAX_LEN] ; + snprintf ( &str[0], MAX_LEN, "%s Pulse Req: (%d) %s:%d: s:%u f:%x [%s] RRI:%d\n", + get_iface_name_str(iface), bytes, + hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), + hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), + hbs_sock.tx_mesg[iface].s, + hbs_sock.tx_mesg[iface].f, + hbs_sock.tx_mesg[iface].m, + hbs_sock.tx_mesg[iface].c); mem_log (&str[0]); #endif @@ -873,14 +1007,27 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) } if ( (bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 ) { - mlog1 ("%s Pulse Rsp: (%5d): %17s:%5d: %d:%d:%x:%s\n", - get_iface_name_str(iface), bytes, - hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), - hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), - hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].c, - hbs_sock.rx_mesg[iface].f, - 
hbs_sock.rx_mesg[iface].m); + /* Look for messages that are not for this controller ..... */ + if ( hbs_ctrl.controller != + ((hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT)) + { + /* This path has been verified to not get hit during cluster + * feature testing. Leaving the check/continue in just in case. + * This dlog is left commented out for easy re-enable + * for debug but has no runtime impact */ + // dlog ("controller-%d pulse not for this controller ; for controller-%d", + // hbs_ctrl.controller, + // (hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT); + continue ; + } + mlog ("%s Pulse Rsp: (%d) %s:%d: s:%d f:%x [%-27s] RRI:%d\n", + get_iface_name_str(iface), bytes, + hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), + hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), + hbs_sock.rx_mesg[iface].s, + hbs_sock.rx_mesg[iface].f, + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c); /* Validate the header */ if ( strstr ( hbs_sock.rx_mesg[iface].m, rsp_msg_header) ) @@ -907,27 +1054,27 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) #endif // mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str()); - if ( !hostname.compare("localhost") ) + if ( hostname == "localhost" ) { - mlog3 ("%s Pulse Rsp (local): %17s:%5d: %d:%d:%x:%s\n", + mlog3 ("%s Pulse Rsp (local): %s:%d: s:%d f:%x [%-27s] RRI:%d\n", get_iface_name_str(iface), hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f, - hbs_sock.rx_mesg[iface].m); + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c); } - else if ( !hostname.compare(hbsInv.my_hostname)) + else if ( hostname == hbsInv.my_hostname) { - mlog3 ("%s Pulse Rsp: (self ): %17s:%5d: %d:%d:%x:%s\n", + mlog3 ("%s Pulse Rsp: (self ): %s:%d: s:%d f:%x [%-27s] RRI:%d\n", get_iface_name_str(iface), hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), 
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f, - hbs_sock.rx_mesg[iface].m); + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c); hbsInv.manage_pulse_flags ( hostname, hbs_sock.rx_mesg[iface].f ); } @@ -935,7 +1082,6 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) { if ( hbsInv.monitored_pulse ( hostname , iface ) == true ) { - char str[MAX_LEN] ; string extra = "Rsp" ; if ( seq_num != hbs_sock.rx_mesg[iface].s ) @@ -946,7 +1092,9 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) { rc = hbsInv.remove_pulse ( hostname, iface, hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f ) ; } - snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %s:%d: %u:%u:%x:%s\n", +#ifdef WANT_HBS_MEM_LOGS + char str[MAX_LEN] ; + snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%d): %s:%d: %u:%u:%x:%s\n", get_iface_name_str(iface), extra.c_str(), bytes, hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), @@ -954,8 +1102,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f, hbs_sock.rx_mesg[iface].m); - mlog ("%s", &str[0]); -#ifdef WANT_HBS_MEM_LOGS + // mlog ("%s", &str[0]); mem_log (str); #endif if ( extra.empty()) @@ -973,21 +1120,17 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_INFRA , hbs_sock.rx_mesg[iface]); } } - else - { -ilog ("skipping my hostname"); - } } else { - mlog3 ("%s Pulse Dis: (%5d): %17s:%5d: %d:%d:%x:%s\n", + mlog3 ("%s Pulse Dis: (%d) %s:%d: seq:%d flag:%x [%-27s] RRI:%d\n", get_iface_name_str(iface), bytes, hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f, - hbs_sock.rx_mesg[iface].m); + hbs_sock.rx_mesg[iface].m, + 
hbs_sock.rx_mesg[iface].c); } } @@ -1009,7 +1152,7 @@ ilog ("skipping my hostname"); else { wlog ( "Badly formed message\n" ); - mlog ( "Bad %s Msg: %14s:%5d: %d:%s\n", + mlog ( "Bad %s Msg: %s:%d: %d:%s\n", get_iface_name_str(iface), hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), @@ -1029,14 +1172,22 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface ) int rc = PASS ; int retries = 0 ; + if ((hbs_sock.hbs_event_tx_sock == NULL ) || + (hbs_sock.hbs_event_tx_sock->sock_ok() == false )) + { + elog ("send event socket not healthy"); + return (FAIL_OPERATION); + } + mtc_message_type event ; memset (&event, 0 , sizeof(mtc_message_type)); if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS ) { - daemon_dump_membuf_banner (); + // daemon_dump_membuf_banner (); hbsInv.print_node_info (); - hbs_cluster_log( hbsInv.my_hostname, "event"); - daemon_dump_membuf (); + hbs_cluster_log ( hbsInv.my_hostname, "event", true ); + + // daemon_dump_membuf (); snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header()); } else if ( event_cmd == MTC_EVENT_LOOPBACK ) @@ -1112,7 +1263,7 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface ) /* The main heartbeat service loop */ int daemon_init ( string iface, string nodetype ) { - int rc = 10 ; + int rc = PASS ; /* Not used by this service */ UNUSED(nodetype); @@ -1128,6 +1279,7 @@ int daemon_init ( string iface, string nodetype ) /* initialize the timer */ mtcTimer_init ( hbsTimer, "controller", "heartbeat" ); + mtcTimer_init ( hbsTimer_audit, "controller", "state audit" ); /* start with no inventory */ hostname_inventory.clear(); @@ -1154,12 +1306,14 @@ int daemon_init ( string iface, string nodetype ) rc = FAIL_SIGNAL_INIT ; } +#ifdef WANT_EARLY_CONFIG /* Configure the agent */ else if ( (rc = daemon_configure ( )) != PASS ) { elog ("Daemon service configuration failed (rc:%i)\n", rc ); rc = FAIL_DAEMON_CONFIG ; } 
+#endif /* Init the heartbeat request message */ else if ( hbs_message_init ( ) != PASS ) @@ -1168,11 +1322,9 @@ int daemon_init ( string iface, string nodetype ) rc = FAIL_MESSAGE_INIT; } - /* Setup the heartbeat service messaging sockets */ - else if ((rc = hbs_int_socket_init ( )) != PASS ) + if ( daemon_is_file_present ( NODE_LOCKED_FILE )) { - elog ("internal socket initialization failed (rc:%d)\n", rc ); - return ( FAIL_SOCKET_INIT ) ; + hbs_ctrl.locked = true ; } daemon_init_fit(); @@ -1298,21 +1450,29 @@ void daemon_service_run ( void ) #endif int rc = PASS ; int counter = 0 ; - int goenabled_wait_log_throttle = 0 ; + + /* staged initialization gates */ bool goenabled = false ; + bool sockets_init = false ; - /* A variable that throttles external socket init failure retries and + /* log throttles */ + + /* A variable that throttles socket init failure retries and * ultimately triggers an exit if that retry count gets too big */ - int ext_socket_init_fail_count = 0 ; - - /* get a starting point */ - unsigned long long prev_time = gettime_monotonic_nsec (); - unsigned long long this_time = prev_time ; + int socket_init_fail_count = 0 ; /* Used to throttle warning messages that report * an error transmitting the pulse request */ int pulse_request_fail_log_counter[MAX_IFACES] ; + /* throttle initialization wait logs */ + int wait_log_throttle = 0 ; + + + /* get a starting point */ + unsigned long long prev_time = gettime_monotonic_nsec (); + unsigned long long this_time = prev_time ; + bool heartbeat_request = true ; unsigned int seq_num = 0 ; @@ -1333,15 +1493,6 @@ void daemon_service_run ( void ) hbsInv.pulse_requests[iface] = 0 ; } - /* Make the main loop schedule in real-time */ - struct sched_param param ; - memset ( &param, 0, sizeof(struct sched_param)); - param.sched_priority = hbs_config.scheduling_priority ; - if ( sched_setscheduler(0, SCHED_RR, &param) ) - { - elog ("sched_setscheduler (0, SCHED_RR, %d ) returned error (%d:%s)\n", - param.sched_priority, 
errno, strerror(errno)); - } /* Not monitoring address changes RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR */ if (( hbs_sock.ioctl_sock = open_ioctl_socket ( )) <= 0 ) @@ -1350,126 +1501,112 @@ void daemon_service_run ( void ) daemon_exit (); } - if (( hbs_sock.netlink_sock = open_netlink_socket ( RTMGRP_LINK )) <= 0 ) - { - elog ("Failed to create netlink listener socket"); - daemon_exit (); - } - /* set this controller as provisioned */ hbs_manage_controller_state ( hbsInv.my_hostname , true ); - /* CGTS 4114: Small Footprint: Alarm 200.005 remains active after connectivity restored - * - * Clear self alarms */ - hbsAlarm_clear_all ( hbsInv.my_hostname, hbsInv.infra_network_provisioned ); - - /* add this host as inventory to hbsAgent - * Although this host is not monitored for heartbeat, - * there are OOB flags in the heartbneat message that - * are needed to be extracted and locally updated */ - { - /* Scoping this so that the inv variable is freed after the add. - * No need sarying it around on the stack all the time */ - node_inv_type inv ; - - /* init the inv variable */ - node_inv_init ( inv ); - inv.name = hbsInv.my_hostname ; - inv.nodetype = CONTROLLER_TYPE ; - hbsInv.add_heartbeat_host ( inv ); - } - ilog ("Sending ready event to maintenance\n"); - do - { - /* Wait for maintenance */ - rc = send_event ( hbsInv.my_hostname, MTC_EVENT_HEARTBEAT_READY, MGMNT_IFACE ) ; - if ( rc == RETRY ) - { - mtcWait_secs ( 3 ); - } - } while ( rc == RETRY ) ; - - if ( rc == FAIL ) - { - elog ("Unrecoverable heartbeat startup error (rc=%d)\n", rc ); - daemon_exit (); - } - - /* enable the base level signal handler latency monitor */ - daemon_latency_monitor (true); - - /* load this controller index number - used for cluster stuff */ - if ( hbsInv.my_hostname == CONTROLLER_0 ) - controller_number = 0 ; - else - controller_number = 1 ; - - /* tell the cluster which controller this is and - * how many networks are being monitored */ - hbs_cluster_nums 
(controller_number,hbsInv.infra_network_provisioned ?2:1); - /* Run heartbeat service forever or until stop condition */ - for ( hbsTimer.ring = false ; ; ) + for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; ) { daemon_signal_hdlr (); - /******************************************************************* - * - * This handles hbsAgent external socket initialization in the main - * loop only after the goenabled state is reached. - * - *******************************************************************/ - if ( goenabled == false ) + if ( hbsTimer_audit.ring == true ) { - if ( hbsInv.system_type == SYSTEM_TYPE__NORMAL ) + /* the state dump is only important after daemon init */ + if ( sockets_init == true ) { - if ( daemon_is_file_present ( GOENABLED_MAIN_PASS ) == true ) - { - ilog ("GOENABLE (large system)\n"); - goenabled = true ; - } - } - else - { - if ( daemon_is_file_present ( GOENABLED_SUBF_PASS ) == true ) - { - ilog ("GOENABLE (small system)\n"); - goenabled = true ; - } + hbsInv.print_node_info(); + + hbs_state_audit (); } + /* re-arm the audit timer for the next periodic run (MTC_HRS_1) */ + mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 ); + } + + /* handle staged initialization */ + if ( sockets_init == false ) + { if ( goenabled == false ) { - ilog_throttled ( goenabled_wait_log_throttle, 2000, "GOENABLE wait ...\n"); - usleep (50000); /* 50 msec */ - } - - if ( goenabled == true ) - { - /* Setup the heartbeat service messaging sockets */ - if ( (rc = hbs_ext_socket_init ( )) != PASS ) + if ( hbsInv.system_type == SYSTEM_TYPE__NORMAL ) { - goenabled = false ; - if ( ext_socket_init_fail_count++ == 30 ) + if ( daemon_is_file_present ( GOENABLED_MAIN_PASS )) { - elog ("external socket initialization failed (rc:%d) max retries ; exiting ...\n", rc ); - daemon_exit (); - } - else - { - elog ("external socket initialization failed (rc:%d)\n", rc ); + ilog ("GOENABLE (large system)\n"); + goenabled = true ; + wait_log_throttle = 0 ; } } else { - 
ext_socket_init_fail_count = 0 ; - goenabled_wait_log_throttle = 0 ; + if ( daemon_is_file_present ( GOENABLED_SUBF_PASS )) + { + ilog ("GOENABLE (small system)\n"); + goenabled = true ; + wait_log_throttle = 0 ; + } + } + + if ( goenabled == false ) + { + ilog_throttled ( wait_log_throttle, MTC_MINS_5, "GOENABLE wait ...\n"); + sleep (1); + continue ; + } + } + else // ( sockets_init == false ) + { + string mgmnt_iface = daemon_mgmnt_iface (); + hbs_config.mgmnt_iface = (char*)mgmnt_iface.data(); + if ( mgmnt_iface.empty() || ( mgmnt_iface == "none" )) + { + ilog_throttled ( wait_log_throttle, 5, "MGMNT wait ..."); + sleep (5); + continue ; + } + + if ( (rc = daemon_configure ( )) != PASS ) + { + elog ("Daemon service configuration failed (rc:%i)\n", rc ); + daemon_exit(); + } + + /* Setup the heartbeat sockets */ + if ( (rc = hbs_socket_init ()) != PASS ) + { + if ( socket_init_fail_count++ == 10 ) + { + elog ("Failed socket initialization (rc:%d) max retries ; exiting ...\n", rc ); + daemon_exit (); + } + else + { + elog ("Failed socket initialization (rc:%d) ; will retry in 5 secs ...\n", rc ); + sleep (5); + } + } + else + { + ilog ("Sending ready event to maintenance\n"); + do + { + /* Wait for maintenance */ + rc = send_event ( hbsInv.my_hostname, MTC_EVENT_HEARTBEAT_READY, MGMNT_IFACE ) ; + if ( rc == RETRY ) + { + mtcWait_secs ( 3 ); + } + } while ( rc == RETRY ) ; + if ( rc == FAIL ) + { + elog ("Unrecoverable heartbeat startup error (rc=%d)\n", rc ); + daemon_exit (); + } if ( get_link_state ( hbs_sock.ioctl_sock, hbs_config.mgmnt_iface, &hbsInv.mgmnt_link_up_and_running ) ) { - hbsInv.mgmnt_link_up_and_running = false ; - wlog ("Failed to query %s operational state ; defaulting to down\n", hbs_config.mgmnt_iface ); + hbsInv.mgmnt_link_up_and_running = false ; + wlog ("Failed to query %s operational state ; defaulting to down\n", hbs_config.mgmnt_iface ); } else { @@ -1488,22 +1625,81 @@ void daemon_service_run ( void ) ilog ("Infra %s link is %s\n", 
hbs_config.infra_iface, hbsInv.infra_link_up_and_running ? "Up" : "Down" ); } } + + /* Make the main loop schedule in real-time */ + { + struct sched_param param ; + memset ( &param, 0, sizeof(struct sched_param)); + param.sched_priority = hbs_config.scheduling_priority ; + if ( sched_setscheduler(0, SCHED_RR, &param) ) + { + elog ("sched_setscheduler (0, SCHED_RR, %d ) returned error (%d:%s)\n", + param.sched_priority, errno, strerror(errno)); + } + } + + /* add this host as inventory to hbsAgent + * Although this host is not monitored for heartbeat, + * there are OOB flags in the heartbeat message that + * are needed to be extracted and locally updated */ + { + /* Scoping this so that the inv variable is freed after the add. + * No need saving it around on the stack all the time */ + node_inv_type inv ; + + /* init the inv variable */ + node_inv_init ( inv ); + inv.name = hbsInv.my_hostname ; + inv.nodetype = CONTROLLER_TYPE ; + hbsInv.add_heartbeat_host ( inv ); + } + + /* enable the base level signal handler latency monitor */ + daemon_latency_monitor (true); + + /* load this controller index number - used for cluster stuff */ + if ( hbsInv.my_hostname == CONTROLLER_0 ) + hbs_ctrl.controller = 0 ; + else + hbs_ctrl.controller = 1 ; + + /* tell the cluster which controller this is and + * how many networks are being monitored */ + hbs_cluster_nums (hbs_ctrl.controller,hbsInv.infra_network_provisioned ?2:1); + + socket_init_fail_count = 0 ; + wait_log_throttle = 0 ; + sockets_init = true ; + monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START ); + + /* no need for the heartbeat audit in a simplex system */ + if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + { + /* start the state audit */ + /* run the first audit in 30 seconds */ + mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_SECS_30 ); + } } } } + /* Bypass link management & heartbeat handling prior + * to sockets being initialized */ + if ( sockets_init == false ) + continue ; 
+ /* audit for forced alarms clear due to ... * - * 1. heartbeat failure action being set to none - * 2. ... future + * 1. first initialization or + * 2. heartbeat failure action being set to none * */ if ( hbs_ctrl.clear_alarms == true ) { - if ( goenabled == true ) + if ( hbsInv.active_controller ) { std::list<string>::iterator hostname_ptr ; - ilog ("clearing all heartbeat alarms for all hosts due to 'none' action"); + ilog ("clearing all heartbeat alarms"); for ( hostname_ptr = hostname_inventory.begin(); hostname_ptr != hostname_inventory.end() ; hostname_ptr++ ) @@ -1511,11 +1707,36 @@ void daemon_service_run ( void ) hbsAlarm_clear_all ( hostname_ptr->data(), hbsInv.infra_network_provisioned ); hbsInv.manage_heartbeat_clear ( hostname_ptr->data(), MAX_IFACES ); } - hbs_ctrl.clear_alarms = false ; } + hbs_ctrl.clear_alarms = false ; } /***************** Service Sockets ********************/ + if ( hbs_ctrl.audit++ == AUDIT_RATE ) + { + hbs_ctrl.audit = 0 ; + if ( daemon_is_file_present ( NODE_LOCKED_FILE )) + { + hbs_ctrl.locked = true ; + if ( hbsInv.hbs_disabled == false ) + { + hbsInv.hbs_disabled = true ; + hbsInv.hbs_state_change = true ; + ilog ("heartbeat service going disabled (locked)"); + + /* force the throttle 'still disabled' log to wait for + * the throttled count before the first log */ + counter = 1 ; + } + } + else if ( hbsInv.hbs_disabled == true ) + { + hbs_ctrl.locked = false ; + hbsInv.hbs_disabled = false; + hbsInv.hbs_state_change = true ; + ilog ("heartbeat service going enabled"); + } + } /* Initialize the master fd_set and clear socket list */ FD_ZERO(&hbs_sock.readfds); @@ -1529,38 +1750,42 @@ void daemon_service_run ( void ) FD_SET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds); } - /* Add the sm request receiver to the select list */ - if (( hbs_sock.sm_server_sock ) && - ( hbs_sock.sm_server_sock->getFD())) + if ( sockets_init ) { - socks.push_front (hbs_sock.sm_server_sock->getFD()); - FD_SET(hbs_sock.sm_server_sock->getFD(), 
&hbs_sock.readfds); - } + /* Add the netlink event listener to the select list */ + if ( hbs_sock.netlink_sock ) + { + socks.push_back (hbs_sock.netlink_sock); + FD_SET(hbs_sock.netlink_sock, &hbs_sock.readfds); + } - /* Add the netlink event listener to the select list */ - if ( hbs_sock.netlink_sock ) - { - socks.push_back (hbs_sock.netlink_sock); - FD_SET(hbs_sock.netlink_sock, &hbs_sock.readfds); - } + if ( ! hbsInv.hbs_disabled ) + { + /* Add the management interface to the select list */ + if (( hbs_sock.rx_sock[MGMNT_INTERFACE] ) && + ( hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD())) + { + socks.push_back (hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD()); + FD_SET(hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD(), &hbs_sock.readfds ); + } - /* Add the management interface to the select list */ - if (( goenabled == true ) && - ( hbs_sock.rx_sock[MGMNT_INTERFACE] ) && - ( hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD())) - { - socks.push_back (hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD()); - FD_SET(hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD(), &hbs_sock.readfds ); - } + /* Add the INFRA network pulse rx socket if its provisioned and have a valid socket */ + if (( hbsInv.infra_network_provisioned == true ) && + ( hbs_sock.rx_sock[INFRA_INTERFACE] ) && + ( hbs_sock.rx_sock[INFRA_INTERFACE]->getFD())) + { + socks.push_back (hbs_sock.rx_sock[INFRA_INTERFACE]->getFD()); + FD_SET(hbs_sock.rx_sock[INFRA_INTERFACE]->getFD(), &hbs_sock.readfds ); + } + } - /* Add the INFRA network pulse rx socket if its provisioned and have a valid socket */ - if (( goenabled == true ) && - ( hbsInv.infra_network_provisioned == true ) && - ( hbs_sock.rx_sock[INFRA_INTERFACE] ) && - ( hbs_sock.rx_sock[INFRA_INTERFACE]->getFD())) - { - socks.push_back (hbs_sock.rx_sock[INFRA_INTERFACE]->getFD()); - FD_SET(hbs_sock.rx_sock[INFRA_INTERFACE]->getFD(), &hbs_sock.readfds ); + /* Add the SM receiver to the socket select list */ + if (( hbs_sock.sm_server_sock ) && + ( hbs_sock.sm_server_sock->getFD())) + { + 
socks.push_back (hbs_sock.sm_server_sock->getFD()); + FD_SET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds ); + } } monitor_scheduling ( this_time, prev_time, seq_num, SCHED_MONITOR__MAIN_LOOP ); @@ -1585,27 +1810,6 @@ void daemon_service_run ( void ) } else { - if (( goenabled == true ) && - ( hbs_sock.rx_sock[MGMNT_INTERFACE] ) && - ( FD_ISSET(hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD(), &hbs_sock.readfds))) - { - hbs_sock.fired[MGMNT_INTERFACE] = true ; - } - - if (( goenabled == true ) && - ( hbsInv.infra_network_provisioned == true ) && - ( hbs_sock.rx_sock[INFRA_INTERFACE] ) && - ( hbs_sock.rx_sock[INFRA_INTERFACE]->getFD()) && - ( FD_ISSET(hbs_sock.rx_sock[INFRA_INTERFACE]->getFD(), &hbs_sock.readfds))) - { - hbs_sock.fired[INFRA_INTERFACE] = true ; - } - - if ((hbs_sock.sm_server_sock != NULL ) && - ( FD_ISSET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds))) - { - hbs_sm_handler(); - } if ((hbs_sock.mtc_to_hbs_sock != NULL ) && ( FD_ISSET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds))) { @@ -1623,7 +1827,45 @@ void daemon_service_run ( void ) if ( !strncmp ( get_hbs_cmd_req_header(), &msg.hdr[0], MSG_HEADER_SIZE )) { string hostname = &msg.hdr[MSG_HEADER_SIZE] ; - if ( msg.cmd == MTC_CMD_ADD_HOST ) + if ( msg.cmd == MTC_CMD_ACTIVE_CTRL ) + { + bool logit = false ; + if ( hostname == hbsInv.my_hostname ) + { + if ( hbsInv.active_controller == false ) + { + logit = true ; + hbs_ctrl.clear_alarms = true ; + } + hbsInv.active_controller = true ; + } + else + { + if ( hbsInv.active_controller == true ) + logit = true ; + hbsInv.active_controller = false ; + } + if ( logit == true ) + { + ilog ("%s is %sactive", + hbsInv.my_hostname.c_str(), + hbsInv.active_controller ? 
"" : "in" ); + + /* no need for the heartbeat audit in a simplex system */ + if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + { + /* Due to activity state change we will dump + * the heartbeat cluster state at now time + * and then again in 5 seconds only to get + * the regular audit dump restarted at + * regular interval after that. */ + hbs_state_audit (); + mtcTimer_reset ( hbsTimer_audit); + mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_SECS_5 ); + } + } + } + else if ( msg.cmd == MTC_CMD_ADD_HOST ) { node_inv_type inv ; node_inv_init(inv); @@ -1634,7 +1876,8 @@ void daemon_service_run ( void ) ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), msg.parm[0] ); /* clear any outstanding alarms on the ADD */ - if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) + if (( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) && + ( hbsInv.active_controller == true )) { hbsAlarm_clear_all ( hostname, hbsInv.infra_network_provisioned ); @@ -1648,27 +1891,34 @@ void daemon_service_run ( void ) ilog ("%s deleted from heartbeat service\n", hostname.c_str()); /* clear any outstanding alarms on the DEL */ - if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) + if (( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) && + ( hbsInv.active_controller == true )) { hbsAlarm_clear_all ( hostname, - hbsInv.infra_network_provisioned ); + hbsInv.infra_network_provisioned ); } } else if ( msg.cmd == MTC_CMD_STOP_HOST ) { hbsInv.mon_host ( hostname, false, true ); hbs_cluster_del ( hostname ); - - ilog ("%s stopping heartbeat service\n", - hostname.c_str()); } else if ( msg.cmd == MTC_CMD_START_HOST ) { - hbsInv.mon_host ( hostname, true, true ); - hbs_cluster_add ( hostname ); + if ( hostname == hbsInv.my_hostname ) + { + dlog ("%s stopping heartbeat of self\n", + hostname.c_str()); - ilog ("%s starting heartbeat service\n", - hostname.c_str()); + hbsInv.mon_host ( hostname, false, true ); + hbs_cluster_del ( hostname ); + + 
} + else + { + hbs_cluster_add ( hostname ); + hbsInv.mon_host ( hostname, true, true ); + } } else if ( msg.cmd == MTC_RESTART_HBS ) { @@ -1685,8 +1935,12 @@ void daemon_service_run ( void ) } else if ( msg.cmd == MTC_BACKOFF_HBS ) { + hbsInv.hbs_pulse_period = (hbsInv.hbs_pulse_period_save * HBS_BACKOFF_FACTOR) ; ilog ("%s starting heartbeat backoff (period:%d msecs)\n", hostname.c_str(), hbsInv.hbs_pulse_period ); + + /* Send SM cluster information at start of MNFA */ + hbs_cluster_send( hbs_sock.sm_client_sock, 0 ); hbsInv.print_node_info(); } else @@ -1704,10 +1958,35 @@ void daemon_service_run ( void ) elog ("Failed receive from agent domain socket (%i)\n", bytes ); } } + + if ( ! hbsInv.hbs_disabled ) + { + if (( hbs_sock.rx_sock[MGMNT_INTERFACE] ) && + ( FD_ISSET(hbs_sock.rx_sock[MGMNT_INTERFACE]->getFD(), &hbs_sock.readfds))) + { + hbs_sock.fired[MGMNT_INTERFACE] = true ; + } + + if (( hbsInv.infra_network_provisioned == true ) && + ( hbs_sock.rx_sock[INFRA_INTERFACE] ) && + ( hbs_sock.rx_sock[INFRA_INTERFACE]->getFD()) && + ( FD_ISSET(hbs_sock.rx_sock[INFRA_INTERFACE]->getFD(), &hbs_sock.readfds))) + { + hbs_sock.fired[INFRA_INTERFACE] = true ; + } + } + + if ((hbs_sock.sm_server_sock != NULL ) && + ( FD_ISSET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds))) + { + hbs_sm_handler(); + } + if (FD_ISSET( hbs_sock.netlink_sock, &hbs_sock.readfds)) { dlog ("netlink socket fired\n"); - if ( hbsInv.service_netlink_events ( hbs_sock.netlink_sock, hbs_sock.ioctl_sock ) != PASS ) + rc = hbsInv.service_netlink_events ( hbs_sock.netlink_sock, hbs_sock.ioctl_sock ); + if ( rc ) { elog ("service_netlink_events failed (rc:%d)\n", rc ); } @@ -1733,37 +2012,51 @@ void daemon_service_run ( void ) /* print current node inventory to the stdio */ hbsInv.print_node_info(); - } } - /* Manage enabling and disabling the heartbeat service based on - * the state of the management link. 
- * link up = run heartbeat service - * link down = disable heatbeat service and monitor the link up to re-enable - */ - else if (( hbsInv.mgmnt_link_up_and_running == false ) && - ( hbsInv.hbs_disabled == false )) + if ( hbs_ctrl.locked == false ) { - hbsInv.hbs_disabled = true ; - hbsInv.hbs_state_change = true ; - ilog ("Heartbeat disabled by %s link down event\n", hbs_config.mgmnt_iface ); - counter = 1 ; + /* Manage enabling and disabling the heartbeat service based on + * the state of the management link. + * link up = run heartbeat service + * link down = disable heatbeat service and monitor the link up to re-enable + */ + if (( hbsInv.mgmnt_link_up_and_running == false ) && + ( hbsInv.hbs_disabled == false )) + { + hbsInv.hbs_disabled = true ; + hbsInv.hbs_state_change = true ; + ilog ("Heartbeat disabled by %s link down event\n", hbs_config.mgmnt_iface ); + counter = 1 ; + } + + /* Recover heartbeat when link comes back up */ + else if (( hbsInv.mgmnt_link_up_and_running == true ) && + ( hbsInv.hbs_disabled == true )) + { + hbsInv.hbs_disabled = false ; + hbsInv.hbs_state_change = true ; + ilog ("Heartbeat Enabled by %s link up event\n", hbs_config.mgmnt_iface ); + counter = 1 ; + } + + else if ( hbsInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE ) + { + wlog_throttled (counter, 100000, "Heartbeat disabled with action=none\n"); + usleep (50000) ; + continue ; + } } - /* Recover heartbeat when link comes back up */ - else if (( hbsInv.mgmnt_link_up_and_running == true ) && - ( hbsInv.hbs_disabled == true )) + /* go to sleep if disabled */ + else if ( hbsInv.hbs_disabled == true ) { - hbsInv.hbs_disabled = false ; - hbsInv.hbs_state_change = true ; - ilog ("Heartbeat Enabled by %s link up event\n", hbs_config.mgmnt_iface ); - counter = 1 ; - } + wlog_throttled (counter, 100000, + "Heartbeat service still disabled %s", + hbs_ctrl.locked ? 
"(locked)" : ""); - else if ( hbsInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE ) - { - wlog_throttled (counter, 100000, "Heartbeat disabled by 'none' action\n"); + hbsInv.hbs_state_change = false ; usleep (50000) ; continue ; } @@ -1784,14 +2077,6 @@ void daemon_service_run ( void ) hbsInv.print_node_info(); } - /* go to sleep if disabled */ - if ( hbsInv.hbs_disabled == true ) - { - wlog_throttled (counter, 1000, "Heartbeat service still disabled\n"); - usleep (50000) ; - continue ; - } - /* Be sure state change flag is cleared */ hbsInv.hbs_state_change = false ; counter = 0 ; @@ -2010,7 +2295,6 @@ void daemon_service_run ( void ) } hbsTimer.ring = false ; heartbeat_request = true ; - // hbs_cluster_log ( hbsInv.my_hostname, "->") ; seq_num++ ; } daemon_load_fit (); @@ -2024,6 +2308,7 @@ void daemon_dump_info ( void ) daemon_dump_membuf_banner (); hbsInv.print_node_info (); + hbs_state_audit(); hbsInv.memDumpAllState (); #ifdef WANT_HBS_MEM_LOGS @@ -2059,7 +2344,7 @@ int daemon_run_testhead ( void ) else PASSED ; } - free(hbsInv_testhead_ptr); + delete(hbsInv_testhead_ptr); printf (TESTHEAD_BAR); printf ("| Heartbeat Service Test Head\n"); diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index 264eba57..b679fadf 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -47,6 +47,9 @@ /** Maximum service fail count before action */ #define MAX_FAIL_COUNT (1) +/** Audit Rate/Count */ +#define AUDIT_RATE (9) + /** Heartbeat pulse request/response message header byte size */ #define HBS_HEADER_SIZE (15) @@ -60,13 +63,16 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"}; #define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME) -#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info +#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info /* Heartbeat control structure */ typedef struct { + unsigned int controller ; + unsigned int audit ; unsigned int nodetype ; bool 
clear_alarms ; + bool locked ; } hbs_ctrl_type ; hbs_ctrl_type * get_hbs_ctrl_ptr ( void ); @@ -218,22 +224,17 @@ void hbs_utils_init ( void ); /* network enum to name lookup */ string hbs_cluster_network_name ( mtce_hbs_network_enum network ); -/* Produce formatted clog's that characterize current and changing cluster - * history for a given network. Each log is controller/network specific. */ -void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix ); - /* Initialize the specified history array */ void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history ); /* Clear all history in the cluster vault */ void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster ); - /******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/ /* Set the cluster vault to default state. * Called upon daemon init or heartbeat period change. */ -void hbs_cluster_init ( unsigned short period ); +void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr ); /* Calculate number of bytes that is unused in the cluster data structure. * Primarily to know how many history elements are missing. */ @@ -286,7 +287,9 @@ void hbs_cluster_append ( hbs_message_type & msg ); /* Produce formatted clog's that characterize current and changing cluster * history for a given network. Each log is controller/network specific. 
*/ -void hbs_cluster_log ( string & hostname, string prefix ); +void hbs_cluster_log ( string & hostname, string prefix, bool force=false ); +void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix, bool force=false ); + /* Service SM cluster info request */ void hbs_sm_handler ( void ); @@ -294,8 +297,14 @@ void hbs_sm_handler ( void ); /* send the cluster vault to SM */ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid ); +/* copy cluster data from src to dst */ +void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst ); + /* print the contents of the vault */ -void hbs_cluster_dump ( mtce_hbs_cluster_type & vault ); +void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force ); + +/* Heartbeat service state audit */ +void hbs_state_audit ( void ); /** * @} hbs_base diff --git a/mtce/src/heartbeat/hbsClient.cpp b/mtce/src/heartbeat/hbsClient.cpp index 444dae29..c0dbad8d 100644 --- a/mtce/src/heartbeat/hbsClient.cpp +++ b/mtce/src/heartbeat/hbsClient.cpp @@ -66,6 +66,8 @@ extern "C" #include "amon.h" /* for ... active monitoring utilities */ } +#define MAX_LEN (300) + /* Where to send events */ string mtcAgent_ip = "" ; @@ -96,12 +98,17 @@ typedef struct static char pulse_resp_tx_hdr [HBS_MAX_MSG]; static char my_hostname [MAX_HOST_NAME_SIZE+1]; +static string hostname = "" ; static char my_hostname_length ; static string my_macaddr = "" ; static string my_address = "" ; static unsigned int my_nodetype= CGTS_NODE_NULL ; static stallMon_type stallMon ; +/* Cached Cluster view from controllers */ +mtce_hbs_cluster_type controller_cluster_cache[MTCE_HBS_MAX_CONTROLLERS]; + + void daemon_sigchld_hdlr ( void ) { ; /* dlog("Received SIGCHLD ... 
no action\n"); */ @@ -407,16 +414,17 @@ int daemon_configure ( void ) else { ilog("Realtime Pri: FIFO/%i \n", hbs_config.scheduling_priority ); - ilog("Multicast: %s\n", hbs_config.multicast ); + ilog("Multicast : %s\n", hbs_config.multicast ); hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface ); - ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface ); - ilog("Mgmnt RxPort: %d\n", hbs_config.hbs_client_mgmnt_port ); - ilog("Mgmnt TxPort: %d\n", hbs_config.hbs_agent_mgmnt_port ); + ilog("Mgmnt Name : %s\n", hbs_config.mgmnt_iface ); + ilog("Mgmnt Port : %d (rx)", hbs_config.hbs_client_mgmnt_port ); + ilog("Mgmnt Port : %d (tx)", hbs_config.hbs_agent_mgmnt_port ); get_iface_macaddr ( hbs_config.mgmnt_iface, my_macaddr ); get_iface_address ( hbs_config.mgmnt_iface, my_address, true ); get_hostname ( &my_hostname[0], MAX_HOST_NAME_SIZE ); + hostname = my_hostname ; /* Fetch the infrastructure interface name. * calls daemon_get_iface_master inside so the @@ -427,11 +435,14 @@ int daemon_configure ( void ) if (strcmp(hbs_config.infra_iface, hbs_config.mgmnt_iface)) { infra_network_provisioned = true ; - ilog ("Infra iface : %s\n", hbs_config.infra_iface ); + ilog ("Infra Name : %s\n", hbs_config.infra_iface ); } } - ilog("Infra RxPort: %d\n", hbs_config.hbs_client_infra_port ); - ilog("Infra TxPort: %d\n", hbs_config.hbs_agent_infra_port ); + if ( infra_network_provisioned == true ) + { + ilog("Infra Port : %d (rx)", hbs_config.hbs_client_infra_port ); + ilog("Infra Port : %d (tx)", hbs_config.hbs_agent_infra_port ); + } /* initialize the stall detection monitor */ stallMon_init (); @@ -663,7 +674,37 @@ int get_pmon_pulses ( void ) return (pulses); } -static unsigned int my_rri = 0 ; +/************************************************************* + * + * Name : have_other_controller_history + * + * Description: returns true if there is cached history for any + * controller number other than this one supplied. 
+ * + *************************************************************/ + +bool have_other_controller_history ( unsigned short controller ) +{ + if ( controller < MTCE_HBS_MAX_CONTROLLERS ) + { + /* look for history for any controller other than the one specified */ + for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ ) + { + /* skip specified controller */ + if ( c != controller ) + { + if ( controller_cluster_cache[c].histories ) + { + return true ; + } + } + } + } + return false ; +} + + +static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ; /************************************************************* * @@ -766,12 +807,13 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) daemon_config_type * cfg_ptr = daemon_get_cfg_ptr(); if ( cfg_ptr->debug_msg ) { - mlog ("\n"); - mlog ("%s Pulse Req: %s:%5d: %d:%s RRI:%d\n", + mlog (" "); + mlog ("%s Pulse Req: %s:%d s:%d f:%x [%s] RRI:%d\n", get_iface_name_str(iface), hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, + hbs_sock.rx_mesg[iface].f, hbs_sock.rx_mesg[iface].m, hbs_sock.rx_mesg[iface].c); } @@ -787,19 +829,9 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) return (FAIL_MSG_HEADER) ; } - - /* Manage the Resource Reference Index (RRI) "lookup clue" */ - if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME )) - { - if( my_rri!= hbs_sock.rx_mesg[iface].c ) - { - my_rri = hbs_sock.rx_mesg[iface].c ; - ilog ("%s Caching New RRI: %d\n", &my_hostname[0], my_rri ); - } - } - - /* Add my RRI to the response message */ - hbs_sock.rx_mesg[iface].c = my_rri ; + /* Update local copy for the controller this pulse came from */ + /* ... before the flags are cleared and setup for the reply. 
*/ + unsigned int controller = (hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT ; /* Manage OOB flags */ hbs_sock.rx_mesg[iface].f = flags ; @@ -807,23 +839,102 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) { hbs_sock.rx_mesg[iface].f |= ( PMOND_FLAG ) ; } + if ( infra_network_provisioned == true ) { hbs_sock.rx_mesg[iface].f |= INFRA_FLAG ; } -#define WANT_CLUSTER_INFO_LOG -#ifdef WANT_CLUSTER_INFO_LOG - /* Log the received cluster info */ - if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION ) + /************************************************************************* + ***** C L U S T E R D A T A M A N A G E M E N T ****** + * * + * TODO: Add support for 3 controllers. + * Only 2 supported by some of this code. + ***** ******/ + + if ( controller >= MTCE_HBS_MAX_CONTROLLERS ) { - char str[100] ; - // hbs_cluster_log (hbs_sock.rx_mesg[iface].cluster, hbs_sock.rx_mesg[iface].s ); - snprintf ( &str[0], 100, " seq %6d with %d bytes from %s ", hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface)); - string hostname = my_hostname ; - hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str ); + wlog ("invalid controller number: %d ; dropping message", controller ); + return ( FAIL_INVALID_DATA ); } -#endif + + /* Manage the Resource Reference Index (RRI) "lookup clue" + * With the introduction of active-active heartbeating the hbsClient + * is responsible for servicing pulses from both controllers. + * This means that hbsClient needs to manage an rri for each controller. */ + if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME )) + { + if( rri[controller] != hbs_sock.rx_mesg[iface].c ) + { + rri[controller] = hbs_sock.rx_mesg[iface].c ; + ilog ("Caching New RRI: %d (from controller-%d)\n", rri[controller], controller ); + } + } + + /* Log the received cluster info + * ... 
if the message version shows that it is supported */ + if ( hbs_sock.rx_mesg[iface].v ) + { + char str[MAX_LEN] ; + snprintf ( &str[0], MAX_LEN, " seq %6d with %d bytes from %s ", (int)hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface)); + hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str ); + + /* add the controller back in */ + hbs_sock.rx_mesg[iface].f |= ( controller << CTRLX_BIT ); + + /* Add my RRI to the response message */ + hbs_sock.rx_mesg[iface].c = rri[controller] ; + + if ( hbs_sock.rx_mesg[iface].cluster.histories > MTCE_HBS_MAX_NETWORKS ) + { + slog ("controller-%d provided %d network histories ; max is %d per controller", + controller, + hbs_sock.rx_mesg[iface].cluster.histories, + MTCE_HBS_MAX_NETWORKS ); + } + else if ( hbs_sock.rx_mesg[iface].cluster.bytes != ( BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories))) + { + slog ("controller-%d provided %d bytes of history ; expected %d", + controller, + hbs_sock.rx_mesg[iface].cluster.bytes, + (unsigned short)(BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories))); + } + else if ( hbs_sock.rx_mesg[iface].cluster.histories ) + { + hbs_cluster_copy ( hbs_sock.rx_mesg[iface].cluster, + controller_cluster_cache[controller] ); + clog1 ("controller-%d cluster info from %s pulse request saved to cache", + controller, get_iface_name_str(iface)); + + hbs_sock.rx_mesg[iface].cluster.histories = 0 ; + + if ( have_other_controller_history ( controller ) == true ) + { + /* Now copy the other controller's cached cluster info into + * this controlers response */ + hbs_cluster_copy ( controller_cluster_cache[controller?0:1], + hbs_sock.rx_mesg[iface].cluster ); + + if ( daemon_get_cfg_ptr()->debug_state & 4 ) + { + string dump_banner = "" ; + dump_banner.append("controller-") ; + dump_banner.append(itos(controller?0:1)); + dump_banner.append(" cluster info from cache injected into controller-"); + dump_banner.append(itos(controller)); + dump_banner.append(":"); 
+ dump_banner.append(get_iface_name_str(iface)); + dump_banner.append(" pulse response"); + hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner, true ); + } + } + } + } + + /* Cluster Data management end */ + + /* replace the request header with the response header */ + memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG ); #ifdef WANT_PULSE_RESPONSE_FIT if (( iface == INFRA_IFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_INFRA_RSP ))) @@ -839,29 +950,11 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) } #endif - int rc = PASS ; - - /* replace the request header with the response header */ - memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG ); - - /* Deal with the cluster info if it exists. - * ... Introduced in messaging version 1 */ - if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION ) - { - if ( hbs_sock.rx_mesg[iface].cluster.version < MTCE_HBS_CLUSTER_VERSION ) - { - ilog ("Bad cluster verison (%d)", hbs_sock.rx_mesg[iface].cluster.version); - } - // if ( hbs_sock.rx_mesg[iface].cluster.revision != MTCE_HBS_CLUSTER_REVISION ) - // { - // ilog ("Bad cluster revision (%d)", hbs_sock.rx_mesg[iface].cluster.revision); - // } - - /* Add peer controller cluster data to this controller's response */ - // hbs_cluster_loop(hbs_sock.rx_mesg[iface]); - } + /* reuse the rx_bytes variable */ + rx_bytes = sizeof(hbs_message_type)-sizeof(mtce_hbs_cluster_type)+BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories); /* send pulse response message */ + int rc = PASS ; int tx_bytes = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.rx_mesg[iface], rx_bytes); if ( tx_bytes == -1 ) { @@ -884,15 +977,15 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) } else { - mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d:%d:%d)\n", - get_iface_name_str(iface), - hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), - 
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), - hbs_sock.rx_mesg[iface].s, - hbs_sock.rx_mesg[iface].f, - hbs_sock.rx_mesg[iface].m, - hbs_sock.rx_mesg[iface].c, - pmonPulse_counter, rx_bytes, tx_bytes); + mlog ("%s Pulse Rsp: %s:%d: s:%d f:%x [%s] RRI:%d (%x:%d:%d)\n", + get_iface_name_str(iface), + hbs_sock.tx_sock[iface]->get_dst_addr()->toString(), + hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(), + hbs_sock.rx_mesg[iface].s, + hbs_sock.rx_mesg[iface].f, + hbs_sock.rx_mesg[iface].m, + hbs_sock.rx_mesg[iface].c, + pmonPulse_counter, rx_bytes, tx_bytes); } /* Clear the error count since we got a good receive */ @@ -984,6 +1077,10 @@ int daemon_init ( string iface, string nodeType_str ) /* Initialize socket construct and pointer to it */ memset ( &hbs_sock, 0, sizeof(hbs_sock)); + /* Initialize the controller cluster view data bounce structure */ + for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ ) + memset ( &controller_cluster_cache[c], 0, sizeof(mtce_hbs_cluster_type)) ; + /* init the utility module */ hbs_utils_init (); @@ -1007,6 +1104,11 @@ int daemon_init ( string iface, string nodeType_str ) /* convert node type to integer */ my_nodetype = get_host_function_mask ( nodeType_str ) ; + if ( my_nodetype & CONTROLLER_TYPE ) + { + /* is controller but don't know what one yet. 
*/ + set_hn((char*)CONTROLLER_X); + } ilog ("Node Type : %s (%d)\n", nodeType_str.c_str(), my_nodetype ); /* Bind signal handlers */ @@ -1058,7 +1160,6 @@ int daemon_init ( string iface, string nodeType_str ) int stall_threshold_log = 0 ; int stall_times_threshold_log = 0 ; -#define MAX_LEN 300 void daemon_service_run ( void ) { #ifdef WANT_DAEMON_DEBUG @@ -1205,7 +1306,7 @@ void daemon_service_run ( void ) int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type)); if ( bytes ) { - hbs_cluster_dump (msg); + hbs_cluster_dump (msg, "Cluster info received", true ); } } #endif diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp index 3789541f..85b8f363 100644 --- a/mtce/src/heartbeat/hbsCluster.cpp +++ b/mtce/src/heartbeat/hbsCluster.cpp @@ -64,11 +64,19 @@ typedef struct /* The working heartbeat cluster data vault. */ mtce_hbs_cluster_type cluster ; + bool cluster_change ; + int cluster_change_threshold_count ; + int cluster_change_difference_count ; + + msgClassSock * sm_socket_ptr ; + } hbs_cluster_ctrl_type ; /* Cluster control structire construct allocation. 
*/ static hbs_cluster_ctrl_type ctrl ; +#define STORAGE_0_NR_THRESHOLD (4) +#define CLUSTER_CHANGE_THRESHOLD (50000) /**************************************************************************** * @@ -80,7 +88,7 @@ static hbs_cluster_ctrl_type ctrl ; * ***************************************************************************/ -void hbs_cluster_init ( unsigned short period ) +void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr ) { ctrl.monitored_hosts = 0; ctrl.monitored_hostname_list.clear(); @@ -104,13 +112,17 @@ void hbs_cluster_init ( unsigned short period ) for ( int h = 0 ; h < MTCE_HBS_MAX_HISTORY_ELEMENTS ; h++ ) hbs_cluster_history_init ( ctrl.cluster.history[h] ); - ilog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)", + clog ("Cluster Info: v%d.%d sig:%x bytes:%d (%ld)", ctrl.cluster.version, ctrl.cluster.revision, ctrl.cluster.magic_number, ctrl.cluster.bytes, sizeof(mtce_hbs_cluster_history_type)); + if ( sm_socket_ptr ) + { + ctrl.sm_socket_ptr = sm_socket_ptr ; + } ctrl.log_throttle = 0 ; } @@ -140,7 +152,7 @@ void hbs_cluster_nums ( unsigned short this_controller, /**************************************************************************** * - * Name : log_monitored_hosts_list + * Name : cluster_list * * Description : Log the list of monitored hosts. * Typically done on a list change. 
@@ -149,7 +161,7 @@ void hbs_cluster_nums ( unsigned short this_controller, * ***************************************************************************/ -void log_monitored_hosts_list ( void ) +void cluster_list ( void ) { std::list::iterator iter_ptr ; string list = "" ; @@ -160,9 +172,7 @@ void log_monitored_hosts_list ( void ) list.append (*(iter_ptr)); list.append (" "); } - ilog ("cluster of %ld: %s", - ctrl.monitored_hostname_list.size(), - list.c_str()); + ilog ("cluster: %s", list.c_str()); } @@ -186,6 +196,7 @@ void cluster_storage0_state ( bool enabled ) ctrl.cluster.storage0_enabled = enabled ; ilog ("storage-0 heartbeat state changed to %s", enabled ? "enabled" : "disabled" ); + ctrl.cluster_change = true ; } } @@ -237,13 +248,30 @@ void hbs_manage_controller_state ( string & hostname, bool enabled ) void hbs_cluster_add ( string & hostname ) { - /* Consider using 'unique' after instead of remove before update. */ - ctrl.monitored_hostname_list.remove(hostname) ; - ctrl.monitored_hostname_list.push_back(hostname) ; - ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); + bool already_in_list = false ; + std::list::iterator hostname_ptr ; + for ( hostname_ptr = ctrl.monitored_hostname_list.begin(); + hostname_ptr != ctrl.monitored_hostname_list.end() ; + hostname_ptr++ ) + { + if ( hostname_ptr->compare(hostname) == 0 ) + { + already_in_list = true ; + break ; + } + } + + if ( already_in_list == false ) + { + ctrl.monitored_hostname_list.push_back(hostname) ; + ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); + ilog ("%s added to cluster", hostname.c_str()); + cluster_list (); + ctrl.cluster_change = true ; + } /* Manage storage-0 state */ - if ( hostname == STORAGE_0 ) + if ( hostname.compare(STORAGE_0) == 0 ) { cluster_storage0_state ( true ); } @@ -251,15 +279,18 @@ void hbs_cluster_add ( string & hostname ) /* If we get down to 0 monitored hosts then just start fresh */ if (( ctrl.monitored_hosts ) 
== 0 ) { - hbs_cluster_init ( ctrl.cluster.period_msec ); + hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); } /* Manage controller state ; true means enabled in this case. */ hbs_manage_controller_state ( hostname, true ); - ilog ("%s added to cluster", hostname.c_str()); + if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr )) + { + hbs_cluster_send( ctrl.sm_socket_ptr, 0 ); + ctrl.cluster_change = false ; + } - log_monitored_hosts_list (); } /**************************************************************************** @@ -281,27 +312,46 @@ void hbs_cluster_add ( string & hostname ) void hbs_cluster_del ( string & hostname ) { - ctrl.monitored_hostname_list.remove(hostname) ; - ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); - - /* Manage storage-0 state. */ - if ( hostname == STORAGE_0 ) + std::list::iterator hostname_ptr ; + for ( hostname_ptr = ctrl.monitored_hostname_list.begin(); + hostname_ptr != ctrl.monitored_hostname_list.end() ; + hostname_ptr++ ) { - cluster_storage0_state ( false ); + if ( hostname_ptr->compare(hostname) == 0 ) + { + ctrl.monitored_hostname_list.remove(hostname) ; + ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size(); + + /* Manage storage-0 state. */ + if ( hostname.compare(STORAGE_0) == 0 ) + { + cluster_storage0_state ( false ); + } + + /* If we get down to 0 monitored hosts then just start fresh */ + if (( ctrl.monitored_hosts ) == 0 ) + { + hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); + } + + /* Manage controller state ; false means not enabled in this case. 
*/ + hbs_manage_controller_state ( hostname , false ); + + ilog ("%s deleted from cluster", hostname.c_str()); + + cluster_list (); + + ctrl.cluster_change = true ; + + break ; + } } - /* If we get down to 0 monitored hosts then just start fresh */ - if (( ctrl.monitored_hosts ) == 0 ) + if (( ctrl.cluster_change ) && ( ctrl.sm_socket_ptr )) { - hbs_cluster_init ( ctrl.cluster.period_msec ); + hbs_cluster_send( ctrl.sm_socket_ptr, 0 ); + ctrl.cluster_change = false ; } - - /* Manage controller state ; false means not enabled in this case. */ - hbs_manage_controller_state ( hostname , false ); - - ilog ("%s deleted from cluster", hostname.c_str()); - - log_monitored_hosts_list (); } /**************************************************************************** @@ -309,7 +359,7 @@ void hbs_cluster_del ( string & hostname ) * Name : hbs_cluster_update * * Description : Update this controller's cluster info for the specified - * network with + * network with ... * * 1. The number of enabled hosts. * 2. The number of responding hosts. 
@@ -333,7 +383,6 @@ void hbs_cluster_del ( string & hostname ) * ***************************************************************************/ -#define STORAGE_0_NR_THRESHOLD (4) void hbs_cluster_update ( iface_enum iface, unsigned short not_responding_hosts, @@ -357,7 +406,7 @@ void hbs_cluster_update ( iface_enum iface, if ( not_responding_hosts ) { - clog1 ("controller-%d %s enabled:%d not responding:%d", + clog ("controller-%d %s enabled:%d not responding:%d", ctrl.this_controller, hbs_cluster_network_name(n).c_str(), ctrl.monitored_hosts, @@ -365,7 +414,7 @@ void hbs_cluster_update ( iface_enum iface, } else { - clog1 ("controller-%d %s has %d monitored hosts and all are responding", + clog ("controller-%d %s has %d monitored hosts and all are responding", ctrl.this_controller, hbs_cluster_network_name(n).c_str(), ctrl.monitored_hosts); @@ -394,9 +443,11 @@ void hbs_cluster_update ( iface_enum iface, history_ptr->network = n ; /* Log new network history as its being started. */ - ilog ("controller-%d %s network history add", + ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views", ctrl.this_controller, - hbs_cluster_network_name(n).c_str()); + ctrl.this_controller, + hbs_cluster_network_name(n).c_str(), + ctrl.cluster.histories); } } @@ -457,7 +508,9 @@ void hbs_cluster_update ( iface_enum iface, * ... which is the index for the next entry. */ unsigned short last_entry_index ; - if ( history_ptr->oldest_entry_index == 0 ) + unsigned short oldest_entry_index = history_ptr->oldest_entry_index ; + + if ( oldest_entry_index == 0 ) { /* Go to the end of the array. */ last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ; @@ -465,43 +518,88 @@ void hbs_cluster_update ( iface_enum iface, else { /* Otherwise, the previous index in the array */ - last_entry_index = history_ptr->oldest_entry_index - 1 ; + last_entry_index = oldest_entry_index - 1 ; } - /* Update the history with this data. 
*/ - history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ; - history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ; + bool logit = false ; + string logit_reason = "" ; - if (( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled != - history_ptr->entry[ last_entry_index].hosts_enabled ) || - ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding != - history_ptr->entry[ last_entry_index].hosts_responding)) + /* Update the history with this data. */ + history_ptr->entry[oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ; + history_ptr->entry[oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ; + + if (( history_ptr->entry[oldest_entry_index].hosts_enabled != + history_ptr->entry[ last_entry_index].hosts_enabled ) || + ( history_ptr->entry[oldest_entry_index].hosts_responding != + history_ptr->entry[ last_entry_index].hosts_responding)) { /* Only log on change events. 
*/ - if ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled == - history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding ) + if ( history_ptr->entry[oldest_entry_index].hosts_enabled == + history_ptr->entry[oldest_entry_index].hosts_responding ) { ilog ("controller-%d %s cluster of %d is healthy", ctrl.this_controller, hbs_cluster_network_name(n).c_str(), - history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled); + history_ptr->entry[oldest_entry_index].hosts_enabled); + ctrl.cluster_change_threshold_count = 0 ; + ctrl.cluster_change_difference_count = 0 ; } else { - ilog ("controller-%d %s cluster of %d with %d responding", - ctrl.this_controller, - hbs_cluster_network_name(n).c_str(), - history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled, - history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding); + ctrl.cluster_change_threshold_count++ ; + ctrl.cluster_change_difference_count = + history_ptr->entry[oldest_entry_index].hosts_enabled - + history_ptr->entry[oldest_entry_index].hosts_responding ; } } + if ( daemon_get_cfg_ptr()->debug_state&4 ) + { + logit = true ; + logit_reason = "(debug)" ; + } +// else if (( ctrl.cluster_change_threshold_count == 1 ) && +// ( cluster_change == false )) +// { +// logit = true ; +// logit_reason = "" ; +// } + else if ( ctrl.cluster_change_threshold_count >= CLUSTER_CHANGE_THRESHOLD ) + { + logit = true ; + ctrl.cluster_change_threshold_count = 0 ; + logit_reason = "(threshold)" ; + } + else + { + int delta = + history_ptr->entry[oldest_entry_index].hosts_enabled - + history_ptr->entry[oldest_entry_index].hosts_responding ; + if ( delta != ctrl.cluster_change_difference_count ) + { + logit = true ; + ctrl.cluster_change_difference_count = delta ; + logit_reason = "(delta)" ; + } + } + + if ( logit ) + { + ilog ("controller-%d %s cluster of %d with %d responding (%d:%d) %s", + ctrl.this_controller, + hbs_cluster_network_name(n).c_str(), + 
history_ptr->entry[oldest_entry_index].hosts_enabled, + history_ptr->entry[oldest_entry_index].hosts_responding, + ctrl.cluster_change_difference_count, + not_responding_hosts, + logit_reason.c_str()); + } /* Increment the entries count till it reaches the max. */ if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES ) history_ptr->entries++ ; /* Manage the next entry update index ; aka the oldest index. */ - if ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1)) + if ( oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1)) history_ptr->oldest_entry_index = 0 ; else history_ptr->oldest_entry_index++ ; @@ -521,24 +619,31 @@ void hbs_cluster_update ( iface_enum iface, void hbs_cluster_append ( hbs_message_type & msg ) { - unsigned short c = ctrl.this_controller ; - - CHECK_CTRL_NTWK_PARMS(c, ctrl.monitored_networks); + CHECK_CTRL_NTWK_PARMS(ctrl.this_controller, ctrl.monitored_networks); msg.cluster.version = ctrl.cluster.version ; msg.cluster.revision = ctrl.cluster.revision ; msg.cluster.magic_number = ctrl.cluster.magic_number ; msg.cluster.period_msec = ctrl.cluster.period_msec ; msg.cluster.storage0_enabled = ctrl.cluster.storage0_enabled ; - msg.cluster.histories = ctrl.cluster.histories ; + msg.cluster.histories = 0 ; - int bytes = BYTES_IN_CLUSTER_VAULT(ctrl.monitored_networks); + /* Copy this controller's cluster history into the broadcast request. */ + for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) + { + if ( ctrl.cluster.history[h].controller == ctrl.this_controller ) + { + memcpy( &msg.cluster.history[msg.cluster.histories], + &ctrl.cluster.history[h], + sizeof(mtce_hbs_cluster_history_type)); - clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)", - c, ctrl.monitored_networks, ctrl.cluster.histories, bytes ); + msg.cluster.histories++ ; + } + } + msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories); - /* Copy the cluster into the message. 
*/ - memcpy( &msg.cluster.history[0], &ctrl.cluster.history[c], bytes); + clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)", + ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes ); } /**************************************************************************** @@ -574,57 +679,8 @@ unsigned short hbs_cluster_unused_bytes ( void ) * ***************************************************************************/ -/* NOTE: All code wrapped in this directive will be removed once - * active/active heartbeating is delivered in next update */ -#define WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS - void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid ) { - -#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS - - /* To assist SM with duplex integration ... - * - * This code emulates heartbeat redundancy by duplicating - * controller history up to the number of provisioned - * controllers until active-active heartbeat is delivered. - */ - int peer_controller ; - bool copy_cluster = false ; - if ( ctrl.this_controller == 0 ) - { - peer_controller = 1 ; - if ( ctrl.controller_1_enabled ) - { - copy_cluster = true ; - } - } - else - { - peer_controller = 0 ; - if ( ctrl.controller_0_enabled ) - { - copy_cluster = true ; - } - } - - int n, networks = ctrl.cluster.histories ; - if ( copy_cluster ) - { - for ( n = 0 ; n < networks ; n++ ) - { - /* copy this controller history to create peer controller */ - ctrl.cluster.history[ctrl.cluster.histories] = ctrl.cluster.history[n] ; - - /* update the controller */ - ctrl.cluster.history[ctrl.cluster.histories].controller = peer_controller ; - ctrl.cluster.bytes += sizeof(mtce_hbs_cluster_history_type) ; - ctrl.cluster.histories++ ; - } - } - -#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS - ctrl.cluster.reqid = (unsigned short)reqid ; if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true )) { @@ -637,34 +693,82 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, 
int reqid ) } else { - ilog ("heartbeat cluster vault sent to SM (%d bytes)", len ); - hbs_cluster_dump ( ctrl.cluster ); + string reason = "" ; + // ilog ("heartbeat cluster vault sent to SM (%d bytes)", len ); + if ( reqid ) + reason = "cluster query" ; + else + reason = "cluster event" ; + hbs_cluster_dump ( ctrl.cluster, reason, true ); } } - -#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS - - if ( copy_cluster ) + else { - /* Clear out the other controllers data. */ - for ( n = networks ; n > 0 ; n-- ) + wlog ("cannot send cluster info due to socket error"); + } +} + +/**************************************************************************** + * + * Name : hbs_history_save + * + * Descrition : Copy the history sample to the vault. + * + * Returns : Nothing. + * + ***************************************************************************/ + +void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample ) +{ + for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) + { + if (( ctrl.cluster.history[h].controller == sample.controller ) && + ( ctrl.cluster.history[h].network == sample.network )) { - /* copy c0 history to another controller */ - hbs_cluster_history_init(ctrl.cluster.history[ctrl.cluster.histories-1]); - ctrl.cluster.bytes -= sizeof(mtce_hbs_cluster_history_type); - ctrl.cluster.histories-- ; + memcpy( &ctrl.cluster.history[h], &sample, + sizeof(mtce_hbs_cluster_history_type)); + + clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)", + ctrl.this_controller, + sample.controller, + hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(), + hostname.c_str(), + ctrl.cluster.histories); + return ; } } -#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS + /* not found ? 
Add a new one */ + memcpy( &ctrl.cluster.history[ctrl.cluster.histories], &sample, + sizeof(mtce_hbs_cluster_history_type)); + ctrl.cluster.histories++ ; + ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories); + ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views", + ctrl.this_controller, + sample.controller, + hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(), + ctrl.cluster.histories); } +void hbs_state_audit ( void ) +{ + hbs_cluster_dump ( ctrl.cluster, "Audit", true ); +} + + void hbs_cluster_log ( string & hostname, string prefix ) { hbs_cluster_log ( hostname, ctrl.cluster, prefix ); } +void hbs_cluster_log ( string & hostname, + string log_prefix, + bool force ) +{ + hbs_cluster_log (hostname, ctrl.cluster, log_prefix, force ); +} + /**************************************************************************** * * Active Active Heartbeating and Debug Member Functions @@ -724,10 +828,6 @@ int hbs_cluster_cmp( hbs_message_type & msg ) * Descrition : Copies the other controllers information from msg into * the cluster. * - * NOTE: Does not do that right now. - * - * Assumptions : Place holder until active/active heartbeating is implemented. - * * Returns : PASS or FAIL * ***************************************************************************/ @@ -736,12 +836,29 @@ int hbs_cluster_save ( string & hostname, mtce_hbs_network_enum network, hbs_message_type & msg ) { - // clog ("Add cluster info from peer controller"); - if ( ctrl.monitored_hosts ) + /* cluster info is only supported in HBS_MESSAGE_VERSION 1 */ + if ( msg.v < HBS_MESSAGE_VERSION ) + return FAIL_NOT_SUPPORTED ; + + if ( ! 
ctrl.monitored_hosts ) + return RETRY ; + + if ( msg.cluster.histories == 0 ) + return PASS ; + + for ( int h = 0 ; h < msg.cluster.histories ; h++ ) { - /* compare cluster info and log deltas */ - // hbs_cluster_cmp( msg ); - UNUSED(msg); + if ( msg.cluster.history[h].network >= MTCE_HBS_MAX_NETWORKS ) + { + elog ("Invalid network id (%d:%d:%d)", + h, + msg.cluster.history[h].controller, + msg.cluster.history[h].network ); + } + else if ( msg.cluster.history[h].controller != ctrl.this_controller ) + { + hbs_history_save ( hostname, msg.cluster.history[h] ); + } hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) ); } return (PASS); diff --git a/mtce/src/heartbeat/hbsStubs.cpp b/mtce/src/heartbeat/hbsStubs.cpp index 70b25df3..fbd68067 100644 --- a/mtce/src/heartbeat/hbsStubs.cpp +++ b/mtce/src/heartbeat/hbsStubs.cpp @@ -241,10 +241,11 @@ int mtcSmgrApi_active_services ( string hostname , bool * yes_no_ptr ) return(PASS); } -int send_hbs_command ( string hostname, int command ) +int send_hbs_command ( string hostname, int command, string controller ) { UNUSED(hostname); UNUSED(command); + UNUSED(controller); return(PASS); } diff --git a/mtce/src/heartbeat/hbsUtil.cpp b/mtce/src/heartbeat/hbsUtil.cpp index 54edb376..3980014a 100644 --- a/mtce/src/heartbeat/hbsUtil.cpp +++ b/mtce/src/heartbeat/hbsUtil.cpp @@ -111,6 +111,33 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network ) } } +/**************************************************************************** + * + * Name : hbs_cluster_copy + * + * Descrition : Copies cluster from src to dst. + * + * Returns : Nothing. 
+ * + ***************************************************************************/ + +void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst ) +{ + dst.version = src.version ; + dst.revision = src.revision ; + dst.magic_number = src.magic_number ; + dst.period_msec = src.period_msec ; + dst.histories = src.histories ; + dst.storage0_enabled = src.storage0_enabled ; + for ( int h = 0 ; h < dst.histories ; h++ ) + { + memcpy( &dst.history[h], + &src.history[h], + sizeof(mtce_hbs_cluster_history_type)); + } + dst.bytes = BYTES_IN_CLUSTER_VAULT(dst.histories); +} + /**************************************************************************** * @@ -126,11 +153,9 @@ string hbs_cluster_network_name ( mtce_hbs_network_enum network ) void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, - string log_prefix ) + string log_prefix, + bool force ) { - // bool want_log = false ; - - clog1 ("log %d histories", cluster.histories ); for ( int h = 0 ; h < cluster.histories ; h++ ) { if ( cluster.history[h].entries == MTCE_HBS_HISTORY_ENTRIES ) @@ -140,8 +165,6 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_entry_type e = { 0, 0 } ; char str[MAX_CLUSTER_LINE_LEN] ; string line = ""; - int start = 0 ; - int stop = 0 ; bool newline = false ; bool logit = false ; bool first = false ; @@ -149,18 +172,13 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_history_type * history_ptr = &cluster.history[h] ; - clog1 ("%s %s has %d entries (controller-%d view from %s)", hostname.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - history_ptr->entries, - history_ptr->controller, - log_prefix.c_str()); - - /* Manage local this_index for log display. 
* Display oldest to newest ; left to right * * */ int this_index = history_ptr->oldest_entry_index ; + int debug = daemon_get_cfg_ptr()->debug_state ; + for ( int count = 0 ; count < history_ptr->entries ; count++ ) { if (( line.length() + MAX_ENTRY_STR_LEN ) >= @@ -180,13 +198,11 @@ void hbs_cluster_log ( string & hostname, } #endif - // want_log = true ; - if ( count == 0 ) { snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d", history_ptr->entry[this_index].hosts_enabled, - history_ptr->entry[this_index].hosts_responding ); // , this_index ); + history_ptr->entry[this_index].hosts_responding ); line.append (str); str[0] = '\0' ; } @@ -203,7 +219,7 @@ void hbs_cluster_log ( string & hostname, { snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d", history_ptr->entry[this_index].hosts_enabled, - history_ptr->entry[this_index].hosts_responding ); // , this_index ); + history_ptr->entry[this_index].hosts_responding ); line.append (str); str[0] = '\0' ; logit = true ; @@ -214,31 +230,21 @@ void hbs_cluster_log ( string & hostname, first_log[h] = true ; logit = true ; } - stop++ ; if ( newline == true ) { if ( logit ) { SET_CONTROLLER_HOSTNAME(history_ptr->controller); - if ( hostname == controller ) + if (( force ) || ( debug&2 )) { - clog ("%s view %s %s %02d..%02d: %s,", - hostname.c_str(), - log_prefix.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); - } - else - { - clog ("%s view from %s %s %s %02d..%02d: %s,", - controller.c_str(), - hostname.c_str(), - log_prefix.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); + syslog ( LOG_INFO, "%s view from %s %s %s: %s", + controller.c_str(), + hostname.c_str(), + log_prefix.c_str(), + hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), + line.c_str()); } } - start = stop + 1 ; line.clear(); first = true ; newline = false ; @@ -253,7 +259,6 @@ 
void hbs_cluster_log ( string & hostname, } if (( newline == false ) && ( line.length() )) { - // ERIC if (( logit == false ) && ( was_diff[h] == true )) { logit = true ; @@ -264,30 +269,25 @@ void hbs_cluster_log ( string & hostname, { if ( first ) { - clog ("............ %s %s %02d..%02d: %s", - log_prefix.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); + if (( force ) || ( debug&2 )) + { + syslog ( LOG_INFO, "............ %s %s: %s", + log_prefix.c_str(), + hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), + line.c_str()); + } } else { SET_CONTROLLER_HOSTNAME(history_ptr->controller); - if ( hostname == controller ) + if (( force ) || ( debug&2 )) { - clog ("%s view %s %s %02d..%02d: %s", - hostname.c_str(), - log_prefix.c_str(), - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); - } - else - { - clog ("%s view from %s %s %s %02d..%02d: %s", - controller.c_str(), - hostname.c_str(), - log_prefix.c_str(), /* Infra <- */ - hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), - start, stop, line.c_str()); + syslog ( LOG_INFO, "%s view from %s %s %s: %s", + controller.c_str(), + hostname.c_str(), + log_prefix.c_str(), /* Infra <- */ + hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(), + line.c_str()); } } } @@ -307,40 +307,62 @@ void hbs_cluster_log ( string & hostname, * Description: Formatted dump of the vault contents to the log file. 
* ***************************************************************************/ -void hbs_cluster_dump ( mtce_hbs_cluster_type & vault ) +void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force ) { - syslog ( LOG_INFO, "Cluster Vault Dump: --------------------------------------------------------------------------------------------"); - syslog ( LOG_INFO, "Cluster Vault: v%d.%d %d msec period ; SM Reqid is %d with storage-0 %s and %d histories in %d bytes", - vault.version, - vault.revision, - vault.period_msec, - vault.reqid, - vault.storage0_enabled ? "enabled" : "disabled", - vault.histories, - vault.bytes ); - for ( int h = 0 ; h < vault.histories ; h++ ) + if ( vault.version == 0 ) + return ; + + int debug = daemon_get_cfg_ptr()->debug_state ; + + if (( debug & 2 ) || ( force == true )) { - #define MAX_LINE_LEN (500) - char str[MAX_LINE_LEN] ; - int i = 0 ; - for ( int e = 0 ; e < vault.history[h].entries_max ; e++ ) - { - snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" , - vault.history[h].oldest_entry_index==e ? '>' : ' ', - vault.history[h].entry[e].hosts_enabled, - vault.history[h].entry[e].hosts_responding); - i = strlen(str) ; - } - syslog ( LOG_INFO, "Cluster Vault: C%d %s S:%s:%s (%d:%d) %s", - vault.history[h].controller, - hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(), - vault.storage0_enabled ? "y" : "n", - vault.history[h].storage0_responding ? "y" : "n", - vault.history[h].entries_max, - vault.history[h].entries, - str); + ilog ("%s", log_prefix.c_str()); + syslog ( LOG_INFO, "Cluster Vault : v%d.%d %d msec heartbeat period %s;%d network heartbeat response histories (%d bytes)", + vault.version, + vault.revision, + vault.period_msec, + vault.storage0_enabled ? 
" with storage-0: enabled " : "", + vault.histories, + vault.bytes ); + } + + if (( debug & 4 ) || ( force == true )) + { + for ( int h = 0 ; h < vault.histories ; h++ ) + { + #define MAX_LINE_LEN (500) + char str[MAX_LINE_LEN] ; + int i = 0 ; + for ( int e = 0 ; e < vault.history[h].entries_max ; e++ ) + { + snprintf ( &str[i], MAX_LINE_LEN, "%c[%d:%d]" , + vault.history[h].oldest_entry_index==e ? '>' : ' ', + vault.history[h].entry[e].hosts_enabled, + vault.history[h].entry[e].hosts_responding); + i = strlen(str) ; + } + if ( vault.storage0_enabled ) + { + syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s", + vault.history[h].controller, + hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(), + vault.history[h].storage0_responding ? "y" : "n", + str); + } + else + { + syslog ( LOG_INFO, "Cluster Vault : C%d %s %s", + vault.history[h].controller, + hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(), + str); + } + } + } + + if ( debug & 8 ) + { + dump_memory ( &vault, 16, vault.bytes ); } - // dump_memory ( &vault, 16, vault.bytes ); } diff --git a/mtce/src/maintenance/Makefile b/mtce/src/maintenance/Makefile index 83d038f5..64eea2a7 100755 --- a/mtce/src/maintenance/Makefile +++ b/mtce/src/maintenance/Makefile @@ -46,6 +46,7 @@ CONTROL_OBJS += mtcHttpSvr.o CONTROL_OBJS += mtcCmdHdlr.o CONTROL_OBJS += mtcNodeMnfa.o CONTROL_OBJS += mtcVimApi.o +CONTROL_OBJS += mtcStubs.o CONTROL_OBJS += ../common/nodeClass.o OBJS = $(SRCS:.cpp=.o) diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 0be8d705..06dfd228 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -48,6 +48,7 @@ using namespace std; #include "mtcAlarm.h" /* for ... mtcAlarm... */ #include "nodeUtil.h" /* for ... get_event_str ... 
*/ +int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ); /* Throttle logging of messages from unknown IP addresses */ std::list unknown_ip_list ; @@ -766,7 +767,7 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface ) return ( rc ); } -int send_hbs_command ( string hostname, int cmd ) +int send_hbs_command ( string hostname, int cmd, string controller ) { int bytes = 0 ; int bytes_to_send = 0 ; @@ -776,18 +777,6 @@ int send_hbs_command ( string hostname, int cmd ) mtc_message_type event ; mtc_socket_type * sock_ptr = get_sockPtr (); - /* We don't heartbeat self */ - if (( obj_ptr->is_active_controller (hostname) ) && - (( cmd == MTC_CMD_ADD_HOST ) || - ( cmd == MTC_CMD_DEL_HOST ) || - ( cmd == MTC_CMD_START_HOST ) || - ( cmd == MTC_CMD_STOP_HOST ))) - { - dlog ("%s refusing to '%s' self to heartbeat service\n", - hostname.c_str(), get_event_str(cmd).c_str()); - return (PASS); - } - memset (&event, 0 , sizeof(mtc_message_type)); snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_hbs_cmd_req_header() ); snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME , "%s", hostname.data()); @@ -795,48 +784,72 @@ int send_hbs_command ( string hostname, int cmd ) /* There is no buffer data in any of these messages */ bytes_to_send = ((sizeof(mtc_message_type))-(BUF_SIZE)) ; - switch ( cmd ) - { - case MTC_CMD_STOP_HOST: - ilog ("%s sending 'stop' to heartbeat service\n", hostname.c_str()); - break ; - case MTC_CMD_START_HOST: - obj_ptr->manage_heartbeat_clear ( hostname , MAX_IFACES ); - ilog ("%s sending 'start' to heartbeat service\n", hostname.c_str()); - break ; - case MTC_CMD_DEL_HOST: - ilog ("%s sending 'delete' to heartbeat service\n", hostname.c_str()); - break ; - case MTC_CMD_ADD_HOST: - obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES ); - ilog ("%s sending 'add' to heartbeat service\n", hostname.c_str()); - break ; - case MTC_RESTART_HBS: - ilog ("%s sending 'restart' to heartbeat service\n", hostname.c_str()); - 
break ; - case MTC_BACKOFF_HBS: - ilog ("%s requesting heartbeat period backoff\n", hostname.c_str()); - break ; - case MTC_RECOVER_HBS: - ilog ("%s requesting heartbeat period recovery\n", hostname.c_str()); - break ; - default: - { - slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd ); - return (FAIL_BAD_PARM); - } - } event.cmd = cmd ; event.num = 1 ; event.parm[0] = obj_ptr->get_nodetype(hostname); /* send to hbsAgent daemon port */ - bytes = sock_ptr->mtc_to_hbs_sock->write((char*) &event, bytes_to_send); - if ( bytes <= 0 ) + std::list controllers ; + controllers.clear(); + if ( controller == CONTROLLER ) { - wlog ("Cannot send to heartbeat service\n"); - rc = FAIL_TO_TRANSMIT ; + controllers.push_back(CONTROLLER_0); + controllers.push_back(CONTROLLER_1); + } + else + { + controllers.push_back(controller); + } + string ip = "" ; + std::list::iterator unit ; + for ( unit = controllers.begin () ; + unit != controllers.end () ; + unit++ ) + { + switch ( cmd ) + { + case MTC_CMD_ACTIVE_CTRL: + mlog3 ("%s sending 'activity state' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_CMD_STOP_HOST: + ilog ("%s sending 'stop' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_CMD_START_HOST: + obj_ptr->manage_heartbeat_clear ( hostname , MAX_IFACES ); + ilog ("%s sending 'start' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_CMD_DEL_HOST: + ilog ("%s sending 'delete' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_CMD_ADD_HOST: + obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES ); + ilog ("%s sending 'add' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_RESTART_HBS: + ilog ("%s sending 'restart' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); + break ; + case MTC_BACKOFF_HBS: + ilog ("%s requesting %s heartbeat period backoff\n", hostname.c_str(), 
unit->c_str()); + break ; + case MTC_RECOVER_HBS: + ilog ("%s requesting %s heartbeat period recovery\n", hostname.c_str(), unit->c_str()); + break ; + default: + { + slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd ); + rc = FAIL_BAD_PARM ; + continue ; + } + } + + ip = get_mtcInv_ptr()->get_hostaddr(*unit) ; + bytes = sock_ptr->mtc_to_hbs_sock->write((char*) &event, bytes_to_send, ip.data()); + if ( bytes <= 0 ) + { + wlog ("%s failed to send command (0x%x) to heartbeat service at %s\n", unit->c_str(), cmd, ip.c_str() ); + rc = FAIL_TO_TRANSMIT ; + } } return rc ; } @@ -954,6 +967,14 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) /* Assert the degrade condition with the 'false' (i.e. not clear)*/ obj_ptr->manage_heartbeat_degrade ( hostname, iface, false ); } + /* Otherwise the action must be alarm only or none ; both of which + * are already handled by the hbsAgent, so do nothing */ + else + { + ilog ("%s heartbeat degrade event dropped ; action is not fail or degrade (%s)\n", + hostname.c_str(), + get_iface_name_str(iface)); + } } else { @@ -1003,7 +1024,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) * are already handled by the hbsAgent, so do nothing */ else { - dlog ("%s heartbeat loss event dropped (%s)\n", + ilog ("%s heartbeat loss event dropped ; action is not fail or degrade (%s)\n", hostname.c_str(), get_iface_name_str(iface)); } @@ -1070,6 +1091,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) else if ( msg.cmd == MTC_EVENT_HEARTBEAT_READY ) { + string controller = CONTROLLER ; std::list::iterator temp ; /* no heartbeating in simplex mode */ @@ -1078,7 +1100,17 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) return (PASS); } - ilog ("Received 'Heartbeat Service Ready' Event\n"); + /* get the controller that sent this ready event */ + if (( msg.buf[0] != '\0' ) && ( strnlen( msg.buf, BUF_SIZE) <= 
MAX_CHARS_HOSTNAME )) + { + controller = msg.buf ; + ilog ("%s Heartbeat Service Ready Event (%s)\n", + msg.buf, sock_ptr->mtc_event_rx_sock->get_src_str()); + } + else + { + ilog ("Heartbeat Service Ready Event\n"); + } obj_ptr->hbs_ready = true ; /* Run Maintenance on Inventory */ @@ -1093,25 +1125,17 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) * the heartbeat service. This tell the heartbeat * service about all the hosts so that it will * send heartbeat oob flag events to mtce. */ - if ( send_hbs_command( hostname, MTC_CMD_ADD_HOST ) != PASS ) + if ( send_hbs_command( hostname, MTC_CMD_ADD_HOST, controller ) != PASS ) { elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str()); } - /* Send the start event to the heartbeat service for all enabled hosts except - * for the active controller which is not actively monitored */ - if ( obj_ptr->is_active_controller ( hostname ) == false ) + /* Send the start event to the heartbeat service for all enabled hosts */ + if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) && + ( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) && + ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) || + (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED ))) { - if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) && - ( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) && - ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) || - (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED ))) - { - send_hbs_command ( hostname, MTC_CMD_START_HOST ); - } - } - else - { - dlog ("%s Refusing to start heartbeat of self\n", hostname.c_str() ); + send_hbs_command ( hostname, MTC_CMD_START_HOST, controller ); } } } diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index c56f4b00..b82ee661 100644 --- 
a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -802,7 +802,11 @@ int mtc_socket_init ( void ) /***********************************************************/ int port = daemon_get_cfg_ptr()->hbs_to_mtc_event_port ; - mtc_sock.mtc_event_rx_sock = new msgClassRx(LOOPBACK_IP, port, IPPROTO_UDP); + + /* listen to this port on any interface so that the hbsAgent running + * locally or on peer controller can get events into mtcAgent */ + mtc_sock.mtc_event_rx_sock = + new msgClassRx(mtcInv.my_float_ip.data(), port, IPPROTO_UDP); rc = mtc_sock.mtc_event_rx_sock->return_status; if ( rc ) { @@ -820,7 +824,7 @@ int mtc_socket_init ( void ) /***********************************************************/ port = daemon_get_cfg_ptr()->mtc_to_hbs_cmd_port ; - sock_ptr->mtc_to_hbs_sock = new msgClassTx(LOOPBACK_IP, port, IPPROTO_UDP); + sock_ptr->mtc_to_hbs_sock = new msgClassTx(CONTROLLER, port, IPPROTO_UDP, mtc_config.mgmnt_iface); rc = sock_ptr->mtc_to_hbs_sock->return_status; if ( rc ) { @@ -1281,11 +1285,14 @@ void daemon_service_run ( void ) mtcInv.inotify_shadow_file_fd , mtcInv.inotify_shadow_file_wd ); - /* Add this controller to the heartbeat service so that we - * receive the out-of-band heartbeat 'flags' even though - * we don't self monitor the active controller specifically - * This add may be duplicate but covers the initial config case */ + /* inform the heartbeat service that this controller is active */ + send_hbs_command ( mtcInv.my_hostname, MTC_CMD_ACTIVE_CTRL ); + + /* Add this controller to the heartbeat service so that + * the peer hbsAgent also gets this controllers inventory + * and this hbsAgent receives the out-of-band heartbeat 'flags' */ send_hbs_command ( mtcInv.my_hostname, MTC_CMD_ADD_HOST ); + send_hbs_command ( mtcInv.my_hostname, MTC_CMD_START_HOST ); socks.clear(); socks.push_front (mtc_sock.mtc_event_rx_sock->getFD()); // service_events diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp 
b/mtce/src/maintenance/mtcNodeHdlrs.cpp index d57c3d6e..2536a7b7 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -6205,6 +6205,13 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) #endif + /* Audits for this controller host only */ + if ( node_ptr->hostname == this->my_hostname ) + { + /* Remind the heartbeat service that this is the active ctrl */ + send_hbs_command ( this->my_hostname, MTC_CMD_ACTIVE_CTRL ); + } + /* Manage active controller auto recovery bool. * If the inactive controller is inservice then disable * controller autorecovery. Otherwise enable it but in this case diff --git a/mtce/src/maintenance/mtcStubs.cpp b/mtce/src/maintenance/mtcStubs.cpp index f1a94b62..4fc3ff80 100644 --- a/mtce/src/maintenance/mtcStubs.cpp +++ b/mtce/src/maintenance/mtcStubs.cpp @@ -14,4 +14,10 @@ using namespace std; #include "nodeClass.h" /* The main link class */ -void hbs_cluster_log ( void ) { } +void hbs_cluster_log ( string & hostname, string prefix, bool force=false ) +{ + UNUSED(hostname); + UNUSED(prefix); + UNUSED(force); +} +