From 62532a7eac989c15b1544fd438bd30099a8661c7 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 12 Jul 2019 14:18:20 -0400 Subject: [PATCH] Fix maintenance cluster-host messaging Maintenance's success path messaging does not depend on cluster network messaging. However, there are a number of failure mode cases that do depend on cluster network messaging to properly diagnose and offer higher availability handling for some failure cases. For instance, when the management interface goes down, without cluster network messaging remote hosts can be isolated. Being able to command-reboot a host over cluster-host network offers higher availability. Maintenance is designed to use the cluster network, if provisioned, as a backup path for mtcAlive, node locked, reboot and several other commands and acknowledgements. Unfortunately, it was recently observed that maintenance is using the 'controller-nfs' label to resolve cluster network addressing which resolves to management network IPs. As a result all messages intended to be going over the cluster-host network are instead just redundant management network messages. During debug of this issue several additional cluster network messaging related issues were observed and fixed. This update implements the following fixes: 1. since there is no floating address for the cluster network the mtcClient was modified to send messages to both controllers where only the active controller will be listening and acting. 2. fixes the port number mtce listens on for cluster-host network messages. 3. fixes the port number mtce sends cluster-host network messages to. 4. mtcAlive messages are also sent on provisioned cluster network. 5. locked state notifications and acks sent on provisioned cluster network. 6. reboot request and acks sent on provisioned cluster network. 7. fixed command acknowledgement messaging. This update also: 1. envelops the mtcAlive gate control to allow debug tracing of all gate state changes. 2. 
moves graceful recovery handling heartbeat failure state clear to the end of the recovery handler, just before heartbeat start. 3. adds sm unhealthy support to fail and automatically recover the inactive controller from an SM UNHEALTHY state. ---------- Test Plan: ---------- Functional: PASS: Verify management network messaging PASS: Verify cluster-host network messaging PASS: Verify cluster-host messages with tcpdump PASS: Verify cluster-host network mtcAlive messaging PASS: Verify reboot request and ack reply over management network PASS: Verify reboot request and ack reply over cluster-host network PASS: Verify lock state notification and ack reply over management network PASS: Verify lock state notification and ack reply over cluster-host network PASS: Verify acknowledgement messaging PASS: Verify maintenance daemon logging PASS: Verify maintenance socket initialization System: PASS: Verify compute system install PASS: Verify AIO system install Feature: PASS: Verify sm node unhealthy handling (active:ignore, inactive:recover) Change-Id: I092596d3e22438dd8a613a073614c188f6f5721d Closes-Bug: #835268 Signed-off-by: Eric MacDonald --- mtce-common/src/common/nodeBase.cpp | 10 +- mtce-common/src/common/nodeBase.h | 2 + mtce-common/src/common/nodeUtil.cpp | 4 +- mtce-common/src/daemon/daemon_common.h | 3 - mtce/src/common/nodeClass.cpp | 196 ++++++++++++++----- mtce/src/common/nodeClass.h | 6 +- mtce/src/maintenance/mtcCmdHdlr.cpp | 21 +- mtce/src/maintenance/mtcCompMsg.cpp | 257 +++++++++++++------------ mtce/src/maintenance/mtcCtrlMsg.cpp | 147 +++++++------- mtce/src/maintenance/mtcNodeComp.cpp | 128 +++++++----- mtce/src/maintenance/mtcNodeComp.h | 5 +- mtce/src/maintenance/mtcNodeCtrl.cpp | 49 +++-- mtce/src/maintenance/mtcNodeHdlrs.cpp | 79 +++----- mtce/src/maintenance/mtcNodeMsg.h | 18 +- 14 files changed, 551 insertions(+), 374 deletions(-) diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index e8383abb..c333656b 100755 
--- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -265,6 +265,14 @@ void print_mtc_message ( string hostname, iface, msg.hdr); } + else if (( daemon_get_cfg_ptr()->debug_alive&1) && ( msg.cmd == MTC_MSG_MTCALIVE )) + { + alog ("%s %s (%s network) - %s\n", + hostname.c_str(), + direction ? "rx <-" : "tx ->" , + iface, + msg.hdr); + } else { mlog1 ("%s %s (%s network) - %s\n", @@ -276,7 +284,7 @@ void print_mtc_message ( string hostname, return ; } - string str = "-" ; + string str = "" ; if ( msg.buf[0] ) str = msg.buf ; if ( force ) diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 94c80ca0..ff6398cd 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -393,6 +393,8 @@ void daemon_exit ( void ); /* This label will resolve to an IP on the management network */ #define CONTROLLER_NFS ((const char *)"controller-nfs") +#define CONTROLLER_0_CLUSTER_HOST ((const char *)"controller-0-cluster-host") +#define CONTROLLER_1_CLUSTER_HOST ((const char *)"controller-1-cluster-host") /* Maintenance Daemon Services - actual names of the daemons */ /* ... 
controller only service / daemons */ diff --git a/mtce-common/src/common/nodeUtil.cpp b/mtce-common/src/common/nodeUtil.cpp index e69dbe61..e0f3003a 100755 --- a/mtce-common/src/common/nodeUtil.cpp +++ b/mtce-common/src/common/nodeUtil.cpp @@ -699,7 +699,7 @@ int get_hostname ( char * hostname_ptr, int max_len ) rc = gethostname(hostname_ptr, max_len ); if ( rc == PASS ) { - ilog ("Hostname : %s\n", hostname_ptr); + ilog ("%s", hostname_ptr); } else { @@ -751,7 +751,7 @@ int get_iface_address ( const char * iface_ptr, string & ip_addr , bool retry ) if ( rc == PASS ) { ip_addr = ip_cstr; - dlog ("IP Address : %s\n", ip_addr.c_str() ); + ilog ("%s %s\n", iface_ptr, ip_addr.c_str()); } else { diff --git a/mtce-common/src/daemon/daemon_common.h b/mtce-common/src/daemon/daemon_common.h index 8d7d1044..9ea16339 100755 --- a/mtce-common/src/daemon/daemon_common.h +++ b/mtce-common/src/daemon/daemon_common.h @@ -233,9 +233,6 @@ int daemon_run_testhead ( void ); #define CONFIG_AGENT_SECRET_PORT 0x20000000 /**< Barbican HTTP port */ #define CONFIG_AGENT_VIM_EVENT_PORT 0x40000000 /**< VIM Event Port Mask */ -#define CONFIG_AGENT_PORT CONFIG_AGENT_MTC_MGMNT_PORT -#define CONFIG_CLIENT_PORT CONFIG_CLIENT_MTC_MGMNT_PORT - typedef struct { struct timespec ts ; struct tm t; diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 96cd1d89..ec3c3096 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -545,7 +545,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->action = "none" ; ptr->clear_task = false ; - ptr->mtcAlive_gate = true ; + ctl_mtcAlive_gate( ptr , true ) ; ptr->mtcAlive_online = false ; ptr->mtcAlive_offline = true ; ptr->mtcAlive_misses = 0 ; @@ -1463,7 +1463,7 @@ int nodeLinkClass::avail_status_change ( string hostname, { node_ptr->mtcAlive_misses = 0 ; node_ptr->mtcAlive_hits = 0 ; - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate ( node_ptr, false ) ; } /* check for need to 
generate power on log */ @@ -1696,16 +1696,10 @@ int nodeLinkClass::alarm_insv_failure ( struct nodeLinkClass::node * node_ptr ) /* Clear the enable alarm and degrade flag */ int nodeLinkClass::alarm_enabled_clear ( struct nodeLinkClass::node * node_ptr, bool force ) { - if ( node_ptr->degrade_mask & DEGRADE_MASK_ENABLE ) - { - node_ptr->degrade_mask &= ~DEGRADE_MASK_ENABLE ; - } + unsigned int clear_mask = DEGRADE_MASK_ENABLE | + DEGRADE_MASK_INSV_TEST ; - /* The inservice test degrade flag needs to be cleared too. */ - if ( node_ptr->degrade_mask & DEGRADE_MASK_INSV_TEST ) - { - node_ptr->degrade_mask &= ~DEGRADE_MASK_INSV_TEST ; - } + node_ptr->degrade_mask &= ~clear_mask ; if (( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CLEAR ) || ( force == true )) @@ -2350,18 +2344,19 @@ int nodeLinkClass::mod_host ( node_inv_type & inv ) modify = true ; /* we have a delta */ } - if ( node_ptr->clstr_ip.compare ( inv.clstr_ip ) ) - { - if ( hostUtil_is_valid_ip_addr ( inv.clstr_ip )) - { - plog ("%s Modify 'clstr_ip' from %s -> %s\n", - node_ptr->hostname.c_str(), - node_ptr->clstr_ip.c_str(), inv.clstr_ip.c_str() ); - modify = true ; /* we have a delta */ - node_ptr->clstr_ip = inv.clstr_ip ; - } + if (( hostUtil_is_valid_ip_addr ( inv.clstr_ip )) && + ( node_ptr->clstr_ip != inv.clstr_ip )) + { + plog ("%s Modify 'clstr_ip' from %s -> %s\n", + node_ptr->hostname.c_str(), + node_ptr->clstr_ip.c_str(), + inv.clstr_ip.c_str() ); + + modify = true ; /* we have a delta */ + node_ptr->clstr_ip = inv.clstr_ip ; } + if ( (!inv.name.empty()) && (node_ptr->hostname.compare ( inv.name)) ) { mtcCmd cmd ; @@ -3455,17 +3450,14 @@ void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg ) } else { - node_ptr->cmdRsp = msg.cmd ; - if ( msg.num > 0 ) - node_ptr->cmdRsp_status = msg.parm[0] ; - else - node_ptr->cmdRsp_status = -1 ; - - dlog ("%s '%s' command response status [%u:%s]\n", - hostname.c_str(), - node_ptr->cmdName.c_str(), - msg.num ? 
node_ptr->cmdRsp_status : PASS, - node_ptr->cmdRsp_status_string.empty() ? "empty" : node_ptr->cmdRsp_status_string.c_str()); + if ( node_ptr->cmdRsp != msg.cmd ) + { + node_ptr->cmdRsp = msg.cmd ; + if ( msg.num > 0 ) + node_ptr->cmdRsp_status = msg.parm[0] ; + else + node_ptr->cmdRsp_status = -1 ; + } } } } @@ -3514,7 +3506,12 @@ int nodeLinkClass::set_activeClient ( string hostname, mtc_client_enum client ) * * Name : set_mtcAlive * - * Description: + * Description: Set the mgmnt or clust specific mtc alive received bool. + * + * Used in the offline handler to verify overall offline state. + * + * Interfaces : Public with hostname. + * Private by node pointer. * * If mtcAlive is ungated then * @@ -3528,6 +3525,14 @@ void nodeLinkClass::set_mtcAlive ( string & hostname, int interface ) nodeLinkClass::node* node_ptr ; node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) + { + this->set_mtcAlive ( node_ptr, interface ); + } +} + +void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int interface ) +{ + if ( node_ptr ) { if ( node_ptr->mtcAlive_gate == false ) { @@ -3537,48 +3542,110 @@ void nodeLinkClass::set_mtcAlive ( string & hostname, int interface ) if ( interface == CLSTR_INTERFACE ) { - node_ptr->mtcAlive_clstr = true ; + if ( node_ptr->mtcAlive_clstr == false ) + { + alog2 ("%s %s mtcAlive received", + node_ptr->hostname.c_str(), + get_iface_name_str(interface)); + node_ptr->mtcAlive_clstr = true ; + } } else { - node_ptr->mtcAlive_mgmnt = true ; + if ( node_ptr->mtcAlive_mgmnt == false ) + { + alog2 ("%s %s mtcAlive received", + node_ptr->hostname.c_str(), + get_iface_name_str(interface)); + node_ptr->mtcAlive_mgmnt = true ; + } } } } } +/***************************************************************************** + * + * Name : get_mtcAlive + * + * Description: Return the current mtcAlive gate state. + * + * Interfaces : Public with hostname. + * Private by node pointer. 
+ * + ****************************************************************************/ + bool nodeLinkClass::get_mtcAlive_gate ( string & hostname ) { nodeLinkClass::node* node_ptr ; node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) { + return ( get_mtcAlive_gate (node_ptr)) ; + } + /* If we can't find the node then assume alive messages are gated */ + return (true); +} + +bool nodeLinkClass::get_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr ) +{ + if ( node_ptr ) + { + alog3 ("%s mtcAlive gate: %s", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_gate ? "closed" : "open" ); return ( node_ptr->mtcAlive_gate ) ; } /* If we can't find the node then gate off the alive messages */ return (true); } -void nodeLinkClass::ctl_mtcAlive_gate ( string & hostname, bool gated ) +/***************************************************************************** + * + * Name : ctl_mtcAlive_gate + * + * Description: Control the mtcAlive gate state. + * Produce an alog on state changes. + * + * Interfaces : Public with hostname. + * Private by node pointer. 
+ * + ****************************************************************************/ + +void nodeLinkClass::ctl_mtcAlive_gate ( string & hostname, bool gate_state ) { nodeLinkClass::node* node_ptr ; node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) { - node_ptr->mtcAlive_gate = gated ; - if ( gated == true ) + ctl_mtcAlive_gate ( node_ptr, gate_state ); + } +} + +void nodeLinkClass::ctl_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr, + bool gate_state ) +{ + if ( node_ptr ) + { + if ( node_ptr->mtcAlive_gate != gate_state ) { - alog ("%s mtcAlive gated\n", node_ptr->hostname.c_str()); - } - else - { - alog ("%s mtcAlive ungated\n", node_ptr->hostname.c_str()); + node_ptr->mtcAlive_gate = gate_state ; + if ( node_ptr->mtcAlive_gate == true ) + { + alog ("%s mtcAlive gate closed", + node_ptr->hostname.c_str()); + } + else + { + alog ("%s mtcAlive gate open", + node_ptr->hostname.c_str()); + } } } } -/* Main-Function Go Enabled member Functions */ +/* Main-Function Go Enabled member Functions */ void nodeLinkClass::set_goEnabled ( string & hostname ) { nodeLinkClass::node* node_ptr ; @@ -3691,7 +3758,7 @@ void nodeLinkClass::set_uptime_refresh_ctr ( string & hostname, int value ) if ( node_ptr != NULL ) { node_ptr->uptime_refresh_counter = value ; - } + } } @@ -3706,7 +3773,7 @@ int nodeLinkClass::get_uptime_refresh_ctr ( string & hostname ) return (0); } -void nodeLinkClass::set_mtce_flags ( string hostname, int flags ) +void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface ) { nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) @@ -3718,6 +3785,35 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags ) else node_ptr->goEnabled = false ; + /* + * Fail the inactive controller if the sm unhealthy flag is set. + * Degrade for the active controller. 
+ */ + if (( flags & MTC_FLAG__SM_UNHEALTHY ) && + (( node_ptr->operState == MTC_OPER_STATE__ENABLED ) || + ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ))) + { + if (( hostname == CONTROLLER_0 ) || ( hostname == CONTROLLER_1 )) + { + elog ("%s reported unhealthy by SM (%s)", + hostname.c_str(), + get_iface_name_str(iface)); + + if ( hostname != this->my_hostname ) + { + force_full_enable ( node_ptr ); + } + + /* no else cause because mtcAgent does nothing if this file + * is present on the active controller. */ + } + else + { + slog ("%s reported unhealthy by SM ; compare error", + hostname.c_str()); + } + } + /* Track host patching state by Out-Of-Band flag */ if ( flags & MTC_FLAG__PATCHING ) { @@ -6235,7 +6331,7 @@ int nodeLinkClass::availStatusChange ( struct nodeLinkClass::node * node_ptr, { node_ptr->mtcAlive_misses = 0 ; node_ptr->mtcAlive_hits = 0 ; - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate ( node_ptr, false ) ; } /* check for need to generate power on log */ @@ -8175,7 +8271,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) // pulse_ptr->max_count[iface]++ ; /* - * Update storage_0_responding reference to false if storgate-0 + * Update storage_0_responding reference to false if storage-0 * is found in the pulse lots list. */ if ( pulse_ptr->hostname == STORAGE_0 ) @@ -8572,12 +8668,12 @@ void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: on:%c off:%c Cnt:%d State:%s Misses:%d\n", + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n", node_ptr->hostname.c_str(), node_ptr->mtcAlive_online ? 'Y' : 'N', node_ptr->mtcAlive_offline ? 'Y' : 'N', node_ptr->mtcAlive_count, - node_ptr->mtcAlive_gate ? "gated" : "rxing", + node_ptr->mtcAlive_gate ? 
"closed" : "open", node_ptr->mtcAlive_misses); mem_log (str); } diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index c095f89a..756faebb 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -818,6 +818,10 @@ private: void start_offline_handler ( struct nodeLinkClass::node * node_ptr ); void stop_offline_handler ( struct nodeLinkClass::node * node_ptr ); + bool get_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr ); + void ctl_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr, bool gate_state ); + void set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int interface ); + /***************************************************************************** * * Name : ipmi_command_send @@ -1701,7 +1705,7 @@ public: #define MTC_FLAG__I_AM_HEALTHY (0x00000004) #define MTC_FLAG__I_AM_LOCKED (0x00000008) */ - void set_mtce_flags ( string hostname, int flags ); + void set_mtce_flags ( string hostname, int flags, int iface ); /** Updates the node's health code * Codes are found in nodeBase.h diff --git a/mtce/src/maintenance/mtcCmdHdlr.cpp b/mtce/src/maintenance/mtcCmdHdlr.cpp index fcc81c6f..b222e28b 100644 --- a/mtce/src/maintenance/mtcCmdHdlr.cpp +++ b/mtce/src/maintenance/mtcCmdHdlr.cpp @@ -356,6 +356,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_CMD_STAGE__REBOOT: { + int rc = PASS ; bool send_reboot_ok = false ; node_ptr->reboot_cmd_ack_mgmnt = false ; @@ -364,11 +365,13 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) /* send reboot command */ node_ptr->cmdReq = MTC_CMD_REBOOT ; node_ptr->cmdRsp = MTC_CMD_NONE ; - plog ("%s Performing REBOOT (mgmnt network)\n", node_ptr->hostname.c_str()); - if ( send_mtc_cmd ( node_ptr->hostname, MTC_CMD_REBOOT, MGMNT_INTERFACE ) != PASS ) + if (( rc = send_mtc_cmd ( node_ptr->hostname, + MTC_CMD_REBOOT, + MGMNT_INTERFACE )) != PASS ) { - wlog ("%s REBOOT Request Failed (mgmnt network)\n", - 
node_ptr->hostname.c_str()); + wlog ("%s reboot request failed (%s) (rc:%d)\n", + node_ptr->hostname.c_str(), + get_iface_name_str(MGMNT_INTERFACE), rc); } else { @@ -377,11 +380,13 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) if ( clstr_network_provisioned == true ) { - plog ("%s Performing REBOOT (cluster-host network)\n", node_ptr->hostname.c_str()); - if ( send_mtc_cmd ( node_ptr->hostname, MTC_CMD_REBOOT, CLSTR_INTERFACE ) != PASS ) + if (( rc = send_mtc_cmd ( node_ptr->hostname, + MTC_CMD_REBOOT, + CLSTR_INTERFACE )) != PASS ) { - wlog ("%s REBOOT Request Failed (cluster-host network)\n", - node_ptr->hostname.c_str()); + wlog ("%s 'reboot' request failed (%s) (rc:%d)\n", + node_ptr->hostname.c_str(), + get_iface_name_str(CLSTR_INTERFACE), rc); } else { diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index 87c8cde2..b5b221e3 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -62,6 +62,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) mtc_message_type msg ; int rc = FAIL ; ctrl_type * ctrl_ptr = get_ctrl_ptr() ; + bool log_ack = true ; if ( interface == CLSTR_INTERFACE ) { @@ -124,6 +125,10 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) { self = true ; } + string interface_name = get_iface_name_str (interface) ; + string command_name = get_mtcNodeCommand_str(msg.cmd) ; + + print_mtc_message ( get_hostname(), MTC_CMD_RX, msg, interface_name.data(), false ); /* Message version greater than zero have the hosts management * mac address appended to the header string */ @@ -133,10 +138,11 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) if ( strncmp ( &msg.hdr[MSG_HEADER_SIZE-1], ctrl_ptr->macaddr.data(), MSG_HEADER_SIZE )) { wlog ("%s command not for this host (exp:%s det:%s) ; ignoring ...\n", - get_mtcNodeCommand_str(msg.cmd), + command_name.c_str(), ctrl_ptr->macaddr.c_str(), 
&msg.hdr[MSG_HEADER_SIZE-1]); - rc = FAIL_INVALID_DATA ; + print_mtc_message ( get_hostname(), MTC_CMD_RX, msg, interface_name.data(), true ); + return (FAIL_INVALID_DATA); } } @@ -150,7 +156,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) rc = PASS ; if ( msg.cmd == MTC_REQ_MTCALIVE ) { - mlog1 ("mtcAlive request received (%s network)\n", get_iface_name_str (interface)); + mlog1 ("mtcAlive request received (%s network)\n", interface_name.c_str()); return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface )); } else if ( msg.cmd == MTC_MSG_LOCKED ) @@ -158,10 +164,15 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) /* Only recreate the file if its not already present */ if ( daemon_is_file_present ( NODE_LOCKED_FILE ) == false ) { + log_ack = true ; + ilog ("%s locked (%s)", get_hostname().c_str(), interface_name.c_str() ); daemon_log ( NODE_LOCKED_FILE, "This node is currently in the administratively locked state" ); } - return (PASS); + else + { + log_ack = false ; + } } else if ( msg.cmd == MTC_MSG_SUBF_GOENABLED_FAILED ) { @@ -193,7 +204,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } else { - ilog ("GoEnabled request posted (%s)\n",get_iface_name_str (interface)); + ilog ("GoEnabled request posted (%s)\n", interface_name.c_str()); ctrl_ptr->posted_script_set.push_back ( GOENABLED_MAIN_SCRIPTS ); ctrl_ptr->posted_script_set.unique(); } @@ -220,7 +231,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } else { - ilog ("GoEnabled Subf request posted (%s)\n", get_iface_name_str (interface)); + ilog ("GoEnabled Subf request posted (%s)\n", interface_name.c_str()); /* Cleanup test result flag files */ if ( daemon_is_file_present ( GOENABLED_SUBF_PASS) ) @@ -241,11 +252,16 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } else if ( msg.cmd == MTC_CMD_REBOOT ) { - ilog ("Reboot command received (%s)\n", get_iface_name_str 
(interface)); + ilog ("%s command received (%s)", + command_name.c_str(), + interface_name.c_str()); } else if ( msg.cmd == MTC_CMD_LAZY_REBOOT ) { - ilog ("Lazy Reboot command received (%s) ; delay:%d seconds\n", get_iface_name_str (interface), msg.num ? msg.parm[0] : 0 ); + ilog ("%s command received (%s) ; delay:%d seconds\n", + command_name.c_str(), + interface_name.c_str(), + msg.num ? msg.parm[0] : 0 ); } else if ( is_host_services_cmd ( msg.cmd ) == true ) { @@ -258,7 +274,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) ( ctrl_ptr->hostservices.monitor == msg.cmd )) { wlog ("%s already in progress (%d:%d)\n", - get_mtcNodeCommand_str(msg.cmd), + command_name.c_str(), ctrl_ptr->hostservices.posted, ctrl_ptr->hostservices.monitor ); @@ -270,8 +286,8 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) ctrl_ptr->posted_script_set.unique (); ilog ("%s request posted (%s)\n", - get_mtcNodeCommand_str(msg.cmd), - get_iface_name_str (interface)); + command_name.c_str(), + interface_name.c_str()); ctrl_ptr->hostservices.posted = msg.cmd ; ctrl_ptr->hostservices.monitor = MTC_CMD_NONE ; @@ -283,16 +299,16 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) { rc = FAIL_FIT ; wlog ("%s Start Services - fit failure (%s)\n", - get_mtcNodeCommand_str(msg.cmd), - get_iface_name_str (interface) ); + command_name.c_str(), + interface_name.c_str() ); } /* Fault insertion - fail to send host services ACK */ if ( ( daemon_is_file_present ( MTC_CMD_FIT__NO_HS_ACK ))) { wlog ("%s Start Services - fit no ACK (%s)\n", - get_mtcNodeCommand_str(msg.cmd), - get_iface_name_str (interface) ); + command_name.c_str(), + interface_name.c_str() ); return (PASS); } @@ -312,15 +328,15 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } else if ( msg.cmd == MTC_CMD_WIPEDISK ) { - ilog ("Reload command received (%s)\n", get_iface_name_str (interface)); + ilog ("Reload command received (%s)\n", 
interface_name.c_str()); } else if ( msg.cmd == MTC_CMD_RESET ) { - ilog ("Reset command received (%s)\n", get_iface_name_str (interface)); + ilog ("Reset command received (%s)\n", interface_name.c_str()); } else if ( msg.cmd == MTC_CMD_LOOPBACK ) { - ilog ("Loopback command received (%s)\n", get_iface_name_str (interface)); + ilog ("Loopback command received (%s)\n", interface_name.c_str()); } else { @@ -334,12 +350,12 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) { if ( msg.cmd == MTC_MSG_MAIN_GOENABLED ) { - ilog ("main function goEnabled results acknowledged (%s)\n", get_iface_name_str (interface)); + ilog ("main function goEnabled results acknowledged (%s)\n", interface_name.c_str()); return (PASS); } else if ( msg.cmd == MTC_MSG_SUBF_GOENABLED ) { - ilog ("sub-function goEnabled results acknowledged (%s)\n", get_iface_name_str (interface)); + ilog ("sub-function goEnabled results acknowledged (%s)\n", interface_name.c_str()); return (PASS); } else @@ -351,7 +367,13 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) else if ( strstr ( &msg.hdr[0], get_worker_msg_header()) ) { - elog ("Unsupported Message\n"); + elog ("unsupported worker message\n"); + print_mtc_message ( &msg ); + return PASS ; + } + else + { + elog ("unsupported message\n"); print_mtc_message ( &msg ); return PASS ; } @@ -364,57 +386,75 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) * if ( rc == PASS ) **********************************************************/ { + rc = PASS ; + bytes = sizeof(mtc_message_type)-BUF_SIZE; - /* Fault insertion for no command ACK */ - if (( interface == MGMNT_INTERFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_MGMNT_ACK ))) - { - wlog ("%s reply ack message - fit bypass (%s)\n", - get_mtcNodeCommand_str(msg.cmd), - get_iface_name_str (interface) ); - } - else if (( interface == CLSTR_INTERFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_CLSTR_ACK ))) - { - wlog ("%s reply ack 
message - fit bypass (%s)\n", - get_mtcNodeCommand_str(msg.cmd), - get_iface_name_str (interface) ); - } - /* Otherwise, send the message back either over the mgmnt or clstr interface */ - else if ( interface == MGMNT_INTERFACE ) + /* send the message back either over the mgmnt or clstr interface */ + if ( interface == MGMNT_INTERFACE ) { if (( sock_ptr->mtc_client_tx_socket ) && ( sock_ptr->mtc_client_tx_socket->sock_ok() == true )) { - rc=sock_ptr->mtc_client_tx_socket->write((char*)&msg.hdr[0], bytes); + rc = sock_ptr->mtc_client_tx_socket->write((char*)&msg.hdr[0], bytes); + if ( rc <= 0 ) + { + elog ("%s reply send (mtc_client_tx_socket) failed (%s) (rc:%d)", + command_name.c_str(), + interface_name.c_str(), rc); + } + else if ( log_ack ) + { + ilog ("%s reply send (%s)", + command_name.c_str(), + interface_name.c_str()); + } } else { elog ("cannot send to null or failed socket (%s network)\n", - get_iface_name_str (interface) ); + interface_name.c_str() ); } } else if ( interface == CLSTR_INTERFACE ) { - if (( sock_ptr->mtc_client_clstr_tx_socket ) && - ( sock_ptr->mtc_client_clstr_tx_socket->sock_ok() == true )) + if (( sock_ptr->mtc_client_tx_socket_c0_clstr ) && + ( sock_ptr->mtc_client_tx_socket_c0_clstr->sock_ok() == true )) { - rc = sock_ptr->mtc_client_clstr_tx_socket->write((char*)&msg.hdr[0], bytes); + rc = sock_ptr->mtc_client_tx_socket_c0_clstr->write((char*)&msg.hdr[0], bytes); + if ( rc <= 0 ) + { + elog ("%s reply send (mtc_client_tx_socket_c0_clstr) failed (%s) (rc:%d)", + command_name.c_str(), + interface_name.c_str(), rc); + } + else if ( log_ack ) + { + ilog ("%s reply send (%s)", + command_name.c_str(), + interface_name.c_str()); + } } - else + if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) && + ( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true )) { - elog ("cannot send to null or failed socket (%s network)\n", - get_iface_name_str (interface) ); + rc = sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes); + 
if ( rc <= 0 ) + { + elog ("%s reply send (mtc_client_tx_socket_c1_clstr) failed (%s) (rc:%d)", + command_name.c_str(), + interface_name.c_str(), rc); + } + else if ( log_ack ) + { + ilog ("%s reply send (%s)", + command_name.c_str(), + interface_name.c_str()); + } } } - if (rc != bytes ) - { - elog ("failed to send reply message (%d)\n", rc); - } - else - { - print_mtc_message ( get_hostname(), MTC_CMD_TX, msg, get_iface_name_str(interface), false ); - } + print_mtc_message ( get_hostname(), MTC_CMD_TX, msg, interface_name.data(), (rc != bytes) ); /* get the shutdown delay config alue */ int delay = daemon_get_cfg_ptr()->failsafe_shutdown_delay ; @@ -427,10 +467,10 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) { if ( daemon_is_file_present ( MTC_CMD_FIT__NO_REBOOT ) ) { - ilog ("Reboot - fit bypass (%s)\n", get_iface_name_str (interface)); + ilog ("Reboot - fit bypass (%s)\n", interface_name.c_str()); return (PASS); } - ilog ("Reboot (%s)\n", get_iface_name_str (interface)); + ilog ("Reboot (%s)\n", interface_name.c_str()); daemon_log ( NODE_RESET_FILE, "reboot command" ); fork_sysreq_reboot ( delay ); rc = system("/usr/bin/systemctl reboot"); @@ -439,7 +479,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) { if ( daemon_is_file_present ( MTC_CMD_FIT__NO_REBOOT ) ) { - ilog ("Lazy Reboot - fit bypass (%s)\n", get_iface_name_str (interface)); + ilog ("Lazy Reboot - fit bypass (%s)\n", interface_name.c_str()); return (PASS); } daemon_log ( NODE_RESET_FILE, "lazy reboot command" ); @@ -447,7 +487,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) { do { - ilog ("Lazy Reboot (%s) ; rebooting in %d seconds\n", get_iface_name_str (interface), msg.num ? msg.parm[0] : 1 ); + ilog ("Lazy Reboot (%s) ; rebooting in %d seconds\n", interface_name.c_str(), msg.num ? 
msg.parm[0] : 1 ); sleep (1); if ( msg.parm[0] % 5 ) { @@ -458,7 +498,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } else { - ilog ("Lazy Reboot (%s) ; now\n", get_iface_name_str (interface) ); + ilog ("Lazy Reboot (%s) ; now\n", interface_name.c_str() ); } fork_sysreq_reboot ( delay ); rc = system("/usr/bin/systemctl reboot"); @@ -467,10 +507,10 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) { if ( daemon_is_file_present ( MTC_CMD_FIT__NO_RESET ) ) { - ilog ("Reset - fit bypass (%s)\n", get_iface_name_str (interface)); + ilog ("Reset - fit bypass (%s)\n", interface_name.c_str()); return (PASS); } - ilog ("Reset 'reboot -f' (%s)\n", get_iface_name_str (interface)); + ilog ("Reset 'reboot -f' (%s)\n", interface_name.c_str()); daemon_log ( NODE_RESET_FILE, "reset command" ); fork_sysreq_reboot ( delay/2 ); rc = system("/usr/bin/systemctl reboot --force"); @@ -481,7 +521,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) if ( daemon_is_file_present ( MTC_CMD_FIT__NO_WIPEDISK ) ) { - ilog ("Wipedisk - fit bypass (%s)\n", get_iface_name_str (interface)); + ilog ("Wipedisk - fit bypass (%s)\n", interface_name.c_str()); return (PASS); } /* We fork a reboot as a fail safe. 
@@ -499,7 +539,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } else if( 0 == parent ) /* we're the child */ { - ilog ("Disk wipe in progress (%s)\n", get_iface_name_str (interface)); + ilog ("Disk wipe in progress (%s)\n", interface_name.c_str()); daemon_log ( NODE_RESET_FILE, "wipedisk command" ); rc = system("/usr/local/bin/wipedisk --force"); ilog ("Disk wipe complete - Forcing Reboot ...\n"); @@ -509,7 +549,6 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } rc = PASS ; - fflush(stdout); } return (rc); } @@ -761,28 +800,56 @@ int send_mtc_msg ( mtc_socket_type * sock_ptr, int cmd , string identity ) int send_mtcAlive_msg_failed = 0 ; int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interface ) { - mtc_message_type msg ; - msgClassSock * mtcAlive_tx_sock_ptr = NULL ; - int rc = FAIL ; - if (( interface == CLSTR_INTERFACE ) && ( get_ctrl_ptr()->clstr_iface_provisioned != true )) { dlog2 ("cannot send to unprovisioned %s interface\n", get_iface_name_str(interface) ); - return (rc); + return (FAIL); } + mtc_message_type msg ; + int bytes = create_mtcAlive_msg ( msg, MTC_MSG_MTCALIVE, identity, interface ); if ( interface == MGMNT_INTERFACE ) { - /* management interface */ - mtcAlive_tx_sock_ptr = sock_ptr->mtc_client_tx_socket ; + /* Send to controller floating address */ + if (( sock_ptr->mtc_client_tx_socket ) && + ( sock_ptr->mtc_client_tx_socket->sock_ok() == true )) + { + print_mtc_message ( CONTROLLER, MTC_CMD_TX, msg, get_iface_name_str(MGMNT_INTERFACE), false ); + sock_ptr->mtc_client_tx_socket->write((char*)&msg.hdr[0], bytes) ; + } + else + { + elog("mtc_client_tx_socket not ok"); + } } else if ( interface == CLSTR_INTERFACE ) { - /* cluster-host interface */ - mtcAlive_tx_sock_ptr = sock_ptr->mtc_client_clstr_tx_socket ; + /* Send to controller-0 cluster address */ + if (( sock_ptr->mtc_client_tx_socket_c0_clstr ) && + ( sock_ptr->mtc_client_tx_socket_c0_clstr->sock_ok() == 
true )) + { + print_mtc_message ( CONTROLLER_0, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false ); + sock_ptr->mtc_client_tx_socket_c0_clstr->write((char*)&msg.hdr[0], bytes ) ; + } + else + { + elog("mtc_client_tx_socket_c0_clstr not ok"); + } + + /* Send to controller-1 cluster address */ + if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) && + ( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true )) + { + print_mtc_message ( CONTROLLER_1, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false ); + sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes ) ; + } + else + { + elog("mtc_client_tx_socket_c1_clstr not ok"); + } } else { @@ -791,53 +858,7 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa return (FAIL_BAD_PARM); } - if ( daemon_is_file_present ( MTC_CMD_FIT__NO_MTCALIVE )) - { - wlog ("mtcAlive - fit bypass\n"); - return (PASS); - } - else - { - int bytes = create_mtcAlive_msg ( msg, MTC_MSG_MTCALIVE, identity, interface ); - - if (( mtcAlive_tx_sock_ptr ) && - ( mtcAlive_tx_sock_ptr->sock_ok() == true )) - { - if ((rc = mtcAlive_tx_sock_ptr->write((char*)&msg.hdr[0], bytes)) != bytes ) - { - if ( rc == -1 ) - { - wlog_throttled (send_mtcAlive_msg_failed, 100 , - "failed to send <%s:%d> (%d:%m) (%s)\n", - mtcAlive_tx_sock_ptr->get_dst_str(), - mtcAlive_tx_sock_ptr->get_dst_addr()->getPort(), - errno, get_iface_name_str(interface) ); - } - else - { - wlog_throttled ( send_mtcAlive_msg_failed, 100 , - "sent only %d of %d bytes to <%s:%d> (%s)\n", - rc, bytes, - mtcAlive_tx_sock_ptr->get_dst_str(), - mtcAlive_tx_sock_ptr->get_dst_addr()->getPort(), - get_iface_name_str(interface) ); - } - rc = FAIL_SOCKET_SENDTO ; - } - else - { - send_mtcAlive_msg_failed = 0 ; - print_mtc_message ( get_hostname(), MTC_CMD_TX, msg, get_iface_name_str(interface), false ); - rc = PASS ; - } - } - else - { - elog ("cannot send to null or failed socket (%s network)\n", - get_iface_name_str(interface)); - 
} - } - return (rc) ; + return (PASS) ; } /* Accelerated Virtual Switch 'events' socket diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index b2ba4bea..f5019a4d 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -50,9 +50,6 @@ using namespace std; int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ); -/* Throttle logging of messages from unknown IP addresses */ -std::list unknown_ip_list ; - /* Send specified command to the guestAgent daemon */ int send_guest_command ( string hostname, int command ) { @@ -163,6 +160,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, zero_unused_msg_buf (msg, bytes); + /* get the sender's hostname */ string hostaddr = "" ; string hostname = "" ; if ( iface == CLSTR_INTERFACE ) @@ -175,20 +173,22 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, hostaddr = sock_ptr->mtc_agent_rx_socket->get_src_str(); hostname = obj_ptr->get_hostname ( hostaddr ) ; } + + /* lookup failed if hostname remains empty. */ if ( hostname.empty() ) { - std::list::iterator iter ; - iter = std::find (unknown_ip_list.begin(), unknown_ip_list.end(), hostaddr ); - if ( iter == unknown_ip_list.end() ) + /* try and learn the cluster ip from a mtcAlive message. */ + if (( msg.cmd == MTC_MSG_MTCALIVE ) && + (( rc = jsonUtil_get_key_val ( &msg.buf[0], "hostname", hostname )) == PASS )) { - mlog3 ( "Received message from unknown IP <%s>\n", hostaddr.c_str()); - unknown_ip_list.push_front(hostaddr); + ilog ("%s learned from mtcAlive", hostname.c_str()); + } + else + { + wlog ("unknown hostname message ... dropping" ); /* make dlog */ + print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(iface), true ); + return (FAIL_GET_HOSTNAME); } - return (FAIL_NOT_FOUND); - } - else if ( ! 
hostaddr.empty() ) - { - unknown_ip_list.remove (hostaddr); } print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(iface), false ); @@ -244,6 +244,26 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, else if ( strstr ( &msg.hdr[0], get_cmd_rsp_msg_header() ) ) { obj_ptr->set_cmd_resp ( hostname , msg ) ; + if ( msg.num > 0 ) + { + if (( msg.cmd != MTC_MSG_LOCKED ) && + ( msg.cmd != MTC_CMD_HOST_SVCS_RESULT )) + { + ilog ("%s '%s' ACK (rc:%d) (%s)", + hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + msg.parm[0], + get_iface_name_str(iface)); + } + else + { + mlog ("%s '%s' ACK (rc:%d) (%s)", + hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + msg.parm[0], + get_iface_name_str(iface)); + } + } } /* @@ -267,30 +287,35 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, wlog ("%s failed to load functions from mtcAlive message\n", hostname.c_str()); return (FAIL_NODETYPE); } + + if ( obj_ptr->clstr_network_provisioned == true ) + { + string cluster_host_ip = ""; + /* Get the clstr ip address if it is provisioned */ + rc = jsonUtil_get_key_val ( &msg.buf[0], "cluster_host_ip", cluster_host_ip ); + if ( rc == PASS ) + { + obj_ptr->set_clstr_hostaddr ( hostname, cluster_host_ip ); + } + else + { + wlog ("%s missing 'cluster_host_ip' value (rc:%d)\n", hostname.c_str(), rc); + } + } + obj_ptr->set_uptime ( hostname , msg.parm[MTC_PARM_UPTIME_IDX], false ); obj_ptr->set_health ( hostname , msg.parm[MTC_PARM_HEALTH_IDX] ); - obj_ptr->set_mtce_flags ( hostname , msg.parm[MTC_PARM_FLAGS_IDX] ); - + obj_ptr->set_mtce_flags ( hostname , msg.parm[MTC_PARM_FLAGS_IDX], iface ); obj_ptr->set_mtcAlive ( hostname, iface ); - mlog1("%s Uptime:%d Health:%d Flags:0x%x mtcAlive:%s\n", + mlog1("%s Uptime:%d Health:%d Flags:0x%x mtcAlive:%s (%s)\n", hostname.c_str(), msg.parm[MTC_PARM_UPTIME_IDX], msg.parm[MTC_PARM_HEALTH_IDX], msg.parm[MTC_PARM_FLAGS_IDX], - obj_ptr->get_mtcAlive_gate ( hostname ) ? "gated" : "open"); + obj_ptr->get_mtcAlive_gate ( hostname ) ? 
"gated" : "open", + get_iface_name_str(iface)); - string cluster_host_ip = ""; - /* Get the clstr ip address if it is provisioned */ - rc = jsonUtil_get_key_val ( &msg.buf[0], "cluster_host_ip", cluster_host_ip ); - if ( rc == PASS ) - { - obj_ptr->set_clstr_hostaddr ( hostname, cluster_host_ip ); - } - else - { - mlog ("%s null or missing 'cluster_host_ip' value (rc:%d)\n", hostname.c_str(), rc); - } } else if ( msg.cmd == MTC_MSG_MAIN_GOENABLED ) { @@ -546,19 +571,6 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, wlog ( "Received unsupported or badly formed message\n" ); } - /* Only do this if the debug level is appropriate */ - if ( daemon_get_cfg_ptr()->debug_msg ) - { - int count = 0 ; - std::list::iterator iter ; - for ( iter = unknown_ip_list.begin () ; - iter != unknown_ip_list.end () ; - iter++ ) - { - count++ ; - mlog3 ("Unknown IP [%d]:%s\n", count, iter->c_str()); - } - } return (rc); } @@ -667,55 +679,56 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface ) { int bytes = 0; - /* Temporarily get IP from node inventory till dns is available */ nodeLinkClass * obj_ptr = get_mtcInv_ptr (); /* add the mac address of the target card to the header - * Note: the minus 1 is to overwqrite the null */ + * Note: the minus 1 is to overwrite the null */ snprintf ( &mtc_cmd.hdr[MSG_HEADER_SIZE-1], MSG_HEADER_SIZE, "%s", obj_ptr->get_hostIfaceMac(hostname, MGMNT_IFACE).data()); - /* Lets add the controller's floating ip in the buffer so hat he host knowns where to reply */ - snprintf ( &mtc_cmd.buf[0], obj_ptr->my_float_ip.length()+1, "%s", obj_ptr->my_float_ip.data()); - - /* only send the minimum amount of data */ - bytes = (sizeof(mtc_message_type)-(BUF_SIZE-(obj_ptr->my_float_ip.length()+1))) ; + string data = "{\"address\":\""; + data.append(obj_ptr->my_float_ip) ; + data.append("\",\"interface\":\""); + data.append(get_iface_name_str(interface)); + data.append("\"}"); + snprintf ( &mtc_cmd.buf[0], data.length()+1, "%s", data.data()); + bytes = 
(sizeof(mtc_message_type)-(BUF_SIZE-(data.length()+1))); print_mtc_message ( hostname, MTC_CMD_TX, mtc_cmd, get_iface_name_str(interface), force ) ; if (interface == MGMNT_INTERFACE) { string hostaddr = obj_ptr->get_hostaddr(hostname); - -#ifdef WANT_FIT_TESTING - if ( daemon_want_fit ( FIT_CODE__INVALIDATE_MGMNT_IP, hostname ) ) - hostaddr = "none" ; -#endif - if ( hostUtil_is_valid_ip_addr ( hostaddr ) != true ) { - wlog("%s has no management IP assigned\n", hostname.c_str()); + wlog("%s has invalid management addr '%s'\n", + hostname.c_str(), + hostaddr.c_str()); return (FAIL_HOSTADDR_LOOKUP); } - /* rc = message size */ - rc = sock_ptr->mtc_agent_tx_socket->write((char *)&mtc_cmd, bytes, hostaddr.c_str(), sock_ptr->mtc_cmd_port); + + mlog ("%s sending %s request to %s (%s)", + hostname.c_str(), + get_mtcNodeCommand_str(cmd), + hostaddr.c_str(), + get_iface_name_str(interface)); + + rc = sock_ptr->mtc_agent_tx_socket->write((char *)&mtc_cmd, bytes, hostaddr.c_str(), sock_ptr->mtc_mgmnt_cmd_port); } else if ((interface == CLSTR_INTERFACE) && ( obj_ptr->clstr_network_provisioned == true ) && ( sock_ptr->mtc_agent_clstr_tx_socket != NULL )) { - /* SETUP TX -> COMPUTE SOCKET CLSTR INTERFACE */ string clstr_hostaddr = obj_ptr->get_clstr_hostaddr(hostname); - -#ifdef WANT_FIT_TESTING - if ( daemon_want_fit ( FIT_CODE__INVALIDATE_CLSTR_IP, hostname ) ) - clstr_hostaddr = "none" ; -#endif - if ( hostUtil_is_valid_ip_addr( clstr_hostaddr ) != true ) - { return (FAIL_NO_CLSTR_PROV); - } - rc = sock_ptr->mtc_agent_clstr_tx_socket->write((char *)&mtc_cmd, bytes, clstr_hostaddr.c_str(), sock_ptr->mtc_cmd_port); + + mlog ("%s sending %s request to %s (%s)", + hostname.c_str(), + get_mtcNodeCommand_str(cmd), + clstr_hostaddr.c_str(), + get_iface_name_str(interface)); + + rc = sock_ptr->mtc_agent_clstr_tx_socket->write((char *)&mtc_cmd, bytes, clstr_hostaddr.c_str(), sock_ptr->mtc_clstr_cmd_port); } if ( 0 > rc ) diff --git a/mtce/src/maintenance/mtcNodeComp.cpp 
b/mtce/src/maintenance/mtcNodeComp.cpp index 1f3e1514..1048d72b 100644 --- a/mtce/src/maintenance/mtcNodeComp.cpp +++ b/mtce/src/maintenance/mtcNodeComp.cpp @@ -171,12 +171,17 @@ void _close_mgmnt_tx_socket ( void ) } } -void _close_clstr_tx_socket ( void ) +void _close_clstr_tx_sockets ( void ) { - if (mtc_sock.mtc_client_clstr_tx_socket) + if (mtc_sock.mtc_client_tx_socket_c0_clstr) { - delete (mtc_sock.mtc_client_clstr_tx_socket); - mtc_sock.mtc_client_clstr_tx_socket = 0 ; + delete (mtc_sock.mtc_client_tx_socket_c0_clstr); + mtc_sock.mtc_client_tx_socket_c0_clstr = 0 ; + } + if (mtc_sock.mtc_client_tx_socket_c1_clstr) + { + delete (mtc_sock.mtc_client_tx_socket_c1_clstr); + mtc_sock.mtc_client_tx_socket_c1_clstr = 0 ; } } @@ -196,7 +201,7 @@ void daemon_exit ( void ) _close_mgmnt_rx_socket (); _close_clstr_rx_socket (); _close_mgmnt_tx_socket (); - _close_clstr_tx_socket (); + _close_clstr_tx_sockets(); _close_amon_sock (); exit (0) ; @@ -214,13 +219,18 @@ static int mtc_config_handler ( void * user, if (MATCH("agent", "mtc_agent_port")) { config_ptr->mtc_agent_port = atoi(value); - config_ptr->mask |= CONFIG_AGENT_PORT ; + config_ptr->mask |= CONFIG_AGENT_MTC_MGMNT_PORT ; } else if (MATCH("client", "mtc_rx_mgmnt_port")) { config_ptr->mtc_rx_mgmnt_port = atoi(value); config_ptr->mask |= CONFIG_CLIENT_MTC_MGMNT_PORT ; } + else if (MATCH("client", "mtc_rx_clstr_port")) + { + config_ptr->mtc_rx_clstr_port = atoi(value); + config_ptr->mask |= CONFIG_CLIENT_MTC_CLSTR_PORT ; + } else if (MATCH("timeouts", "failsafe_shutdown_delay")) { config_ptr->failsafe_shutdown_delay = atoi(value); @@ -289,10 +299,9 @@ void setup_mgmnt_rx_socket ( void ) ilog("Mgmnt iface : %s\n", ctrl.mgmnt_iface.c_str() ); get_iface_macaddr ( ctrl.mgmnt_iface.data(), ctrl.macaddr ); get_iface_address ( ctrl.mgmnt_iface.data(), ctrl.address , true ); - get_hostname ( &ctrl.hostname[0], MAX_HOST_NAME_SIZE ); _close_mgmnt_rx_socket (); - mtc_sock.mtc_client_rx_socket = new 
msgClassRx(ctrl.address.c_str(),mtc_sock.mtc_cmd_port, IPPROTO_UDP, ctrl.mgmnt_iface.data(), false ); + mtc_sock.mtc_client_rx_socket = new msgClassRx(ctrl.address.c_str(),mtc_sock.mtc_mgmnt_cmd_port, IPPROTO_UDP, ctrl.mgmnt_iface.data(), false ); /* update health of socket */ if ( mtc_sock.mtc_client_rx_socket ) @@ -328,12 +337,13 @@ void setup_clstr_rx_socket ( void ) * calls daemon_get_iface_master inside so the * aggrigated name is returned if it exists */ get_clstr_iface (&mtc_config.clstr_iface ); - if ( strlen(mtc_config.clstr_iface) ) + ctrl.clstr_iface = mtc_config.clstr_iface ; + if ( !ctrl.clstr_iface.empty()) { /* Only get the cluster-host network address if it is provisioned */ - if ( get_iface_address ( mtc_config.clstr_iface, ctrl.address_clstr, false ) == PASS ) + if ( get_iface_address ( ctrl.clstr_iface.data(), ctrl.address_clstr, false ) == PASS ) { - ilog ("Cluster-host iface : %s\n", mtc_config.clstr_iface ); + ilog ("Cluster-host iface : %s\n", ctrl.clstr_iface.c_str()); ilog ("Cluster-host addr : %s\n", ctrl.address_clstr.c_str()); } } @@ -342,7 +352,7 @@ void setup_clstr_rx_socket ( void ) _close_clstr_rx_socket (); /* Only set up the socket if an cluster-host interface is provisioned */ - mtc_sock.mtc_client_clstr_rx_socket = new msgClassRx(ctrl.address_clstr.c_str(),mtc_sock.mtc_cmd_port, IPPROTO_UDP, ctrl.clstr_iface.data(), false ); + mtc_sock.mtc_client_clstr_rx_socket = new msgClassRx(ctrl.address_clstr.c_str(),mtc_sock.mtc_clstr_cmd_port, IPPROTO_UDP, ctrl.clstr_iface.data(), false ); /* update health of socket */ if ( mtc_sock.mtc_client_clstr_rx_socket ) @@ -390,32 +400,60 @@ void setup_mgmnt_tx_socket ( void ) } } -void setup_clstr_tx_socket ( void ) +void setup_clstr_tx_sockets ( void ) { if ( ctrl.clstr_iface_provisioned == false ) { return ; } - dlog ("setup of cluster-host TX\n"); - _close_clstr_tx_socket (); - mtc_sock.mtc_client_clstr_tx_socket = new msgClassTx(CONTROLLER_NFS,mtc_sock.mtc_agent_port, IPPROTO_UDP, 
mtc_config.clstr_iface); + dlog ("setup of %s TX\n", CONTROLLER_0_CLUSTER_HOST); - if ( mtc_sock.mtc_client_clstr_tx_socket ) + _close_clstr_tx_sockets (); + + mtc_sock.mtc_client_tx_socket_c0_clstr = + new msgClassTx(CONTROLLER_0_CLUSTER_HOST, + mtc_sock.mtc_agent_port, + IPPROTO_UDP, + mtc_config.clstr_iface); + + if ( mtc_sock.mtc_client_tx_socket_c0_clstr ) { - /* look for fault insertion request */ - if ( daemon_is_file_present ( MTC_CMD_FIT__CLSTR_TXSOCK ) ) - mtc_sock.mtc_client_clstr_tx_socket->return_status = FAIL ; - - if ( mtc_sock.mtc_client_clstr_tx_socket->return_status == PASS ) + if ( mtc_sock.mtc_client_tx_socket_c0_clstr->return_status == PASS ) { - mtc_sock.mtc_client_clstr_tx_socket->sock_ok(true); + mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok(true); } else { - elog ("failed to init 'cluster-host tx' socket (rc:%d)\n", - mtc_sock.mtc_client_clstr_tx_socket->return_status ); - mtc_sock.mtc_client_clstr_tx_socket->sock_ok(false); + elog ("failed to init '%s' tx socket (rc:%d)\n", + CONTROLLER_0_CLUSTER_HOST, + mtc_sock.mtc_client_tx_socket_c0_clstr->return_status ); + mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok(false); + } + } + if ( ctrl.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ) + { + dlog ("setup of %s TX\n", CONTROLLER_1_CLUSTER_HOST); + + mtc_sock.mtc_client_tx_socket_c1_clstr = + new msgClassTx(CONTROLLER_1_CLUSTER_HOST, + mtc_sock.mtc_agent_port, + IPPROTO_UDP, + mtc_config.clstr_iface); + + if ( mtc_sock.mtc_client_tx_socket_c1_clstr ) + { + if ( mtc_sock.mtc_client_tx_socket_c1_clstr->return_status == PASS ) + { + mtc_sock.mtc_client_tx_socket_c1_clstr->sock_ok(true); + } + else + { + elog ("failed to init '%s' tx socket (rc:%d)\n", + CONTROLLER_0_CLUSTER_HOST, + mtc_sock.mtc_client_tx_socket_c1_clstr->return_status ); + mtc_sock.mtc_client_tx_socket_c1_clstr->sock_ok(false); + } } } } @@ -463,7 +501,7 @@ void setup_amon_socket ( void ) * 1. Unicast receive socket mgmnt (mtc_client_rx_socket) * 2. 
Unicast receive socket clstr (mtc_client_clstr_rx_socket) * 3. Unicast transmit socket mgmnt (mtc_client_tx_socket) - * 4. Unicast transmit socket clstr (mtc_client_clstr_tx_socket) + * 4. Unicast transmit socket clstr (mtc_client_tx_socket_c?_clstr) * * 5. socket for pmond acive monitoring * @@ -473,8 +511,10 @@ int mtc_socket_init ( void ) /* Setup the Management Interface Recieve Socket */ /* Read the port config strings into the socket struct */ mtc_sock.mtc_agent_port = mtc_config.mtc_agent_port; - mtc_sock.mtc_cmd_port = mtc_config.mtc_rx_mgmnt_port; + mtc_sock.mtc_mgmnt_cmd_port = mtc_config.mtc_rx_mgmnt_port; + mtc_sock.mtc_clstr_cmd_port = mtc_config.mtc_rx_clstr_port; + get_hostname ( &ctrl.hostname[0], MAX_HOST_NAME_SIZE ); ctrl.mtcAgent_ip = getipbyname ( CONTROLLER ); ilog ("Controller : %s\n", ctrl.mtcAgent_ip.c_str()); @@ -489,8 +529,8 @@ int mtc_socket_init ( void ) setup_mgmnt_tx_socket (); /* Manage Cluster-host network setup */ - string clstr_iface_name = daemon_clstr_iface(); string mgmnt_iface_name = daemon_mgmnt_iface(); + string clstr_iface_name = daemon_clstr_iface(); if ( !clstr_iface_name.empty() ) { if ( clstr_iface_name != mgmnt_iface_name ) @@ -504,7 +544,7 @@ int mtc_socket_init ( void ) /*************************************************************/ /* Setup the Clstr Interface Transmit Messaging to mtcAgent */ /*************************************************************/ - setup_clstr_tx_socket () ; + setup_clstr_tx_sockets () ; } } @@ -1225,8 +1265,8 @@ void daemon_service_run ( void ) if (( mtc_sock.mtc_client_rx_socket == NULL ) || ( mtc_sock.mtc_client_rx_socket->sock_ok() == false )) { - setup_mgmnt_rx_socket(); wlog ("calling setup_mgmnt_rx_socket (auto-recovery)\n"); + setup_mgmnt_rx_socket(); socket_reinit = true ; } @@ -1234,8 +1274,8 @@ void daemon_service_run ( void ) else if (( mtc_sock.mtc_client_tx_socket == NULL ) || ( mtc_sock.mtc_client_tx_socket->sock_ok() == false )) { - setup_mgmnt_tx_socket(); wlog ("calling 
setup_mgmnt_tx_socket\n"); + setup_mgmnt_tx_socket(); socket_reinit = true ; } @@ -1244,18 +1284,20 @@ void daemon_service_run ( void ) (( mtc_sock.mtc_client_clstr_rx_socket == NULL ) || ( mtc_sock.mtc_client_clstr_rx_socket->sock_ok() == false ))) { - setup_clstr_rx_socket(); wlog ("calling setup_clstr_rx_socket (auto-recovery)\n"); + setup_clstr_rx_socket(); socket_reinit = true ; } /* Clstr Tx */ else if (( ctrl.clstr_iface_provisioned == true ) && - (( mtc_sock.mtc_client_clstr_tx_socket == NULL ) || - ( mtc_sock.mtc_client_clstr_tx_socket->sock_ok() == false ))) + (( mtc_sock.mtc_client_tx_socket_c0_clstr == NULL ) || + ( mtc_sock.mtc_client_tx_socket_c1_clstr == NULL ) || + ( mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok() == false ) || + ( mtc_sock.mtc_client_tx_socket_c1_clstr->sock_ok() == false ))) { - setup_clstr_tx_socket(); - wlog ("calling setup_clstr_tx_socket (auto-recovery)\n"); + wlog ("calling setup_clstr_tx_sockets (auto-recovery)\n"); + setup_clstr_tx_sockets(); socket_reinit = true ; } @@ -1311,18 +1353,14 @@ void daemon_service_run ( void ) if ( daemon_is_file_present ( MTC_CMD_FIT__CLSTR_RXSOCK )) { if ( mtc_sock.mtc_client_clstr_rx_socket ) - { mtc_sock.mtc_client_clstr_rx_socket->sock_ok (false); - _close_clstr_rx_socket (); - } } if ( daemon_is_file_present ( MTC_CMD_FIT__CLSTR_TXSOCK )) { - if ( mtc_sock.mtc_client_clstr_tx_socket ) - { - mtc_sock.mtc_client_clstr_tx_socket->sock_ok (false); - _close_clstr_tx_socket (); - } + if ( mtc_sock.mtc_client_tx_socket_c0_clstr ) + mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok (false); + if ( mtc_sock.mtc_client_tx_socket_c1_clstr ) + mtc_sock.mtc_client_tx_socket_c1_clstr->sock_ok (false); } if ( daemon_is_file_present ( MTC_CMD_FIT__AMON_SOCK )) { diff --git a/mtce/src/maintenance/mtcNodeComp.h b/mtce/src/maintenance/mtcNodeComp.h index b0917bfc..612144f8 100644 --- a/mtce/src/maintenance/mtcNodeComp.h +++ b/mtce/src/maintenance/mtcNodeComp.h @@ -18,8 +18,9 @@ #include /** Compute Config 
mask */ -#define CONFIG_CLIENT_MASK (CONFIG_AGENT_PORT |\ - CONFIG_CLIENT_MTC_MGMNT_PORT) +#define CONFIG_CLIENT_MASK (CONFIG_AGENT_MTC_MGMNT_PORT |\ + CONFIG_CLIENT_MTC_MGMNT_PORT |\ + CONFIG_CLIENT_MTC_CLSTR_PORT) #define MAX_RUN_SCRIPTS (20) diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index d01de614..26fca0f0 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -155,18 +155,6 @@ void daemon_exit ( void ) if (mtc_sock.mtc_agent_tx_socket) delete (mtc_sock.mtc_agent_tx_socket); - if (mtc_sock.mtc_client_rx_socket) - delete(mtc_sock.mtc_client_rx_socket); - - if (mtc_sock.mtc_client_tx_socket) - delete (mtc_sock.mtc_client_tx_socket); - - if (mtc_sock.mtc_client_clstr_rx_socket) - delete (mtc_sock.mtc_client_clstr_rx_socket); - - if (mtc_sock.mtc_client_clstr_tx_socket) - delete (mtc_sock.mtc_client_clstr_tx_socket); - if (mtc_sock.mtc_event_rx_sock) delete (mtc_sock.mtc_event_rx_sock); @@ -191,7 +179,8 @@ void daemon_exit ( void ) /** Control Config Mask */ -#define CONFIG_AGENT_MASK (CONFIG_AGENT_PORT |\ +#define CONFIG_AGENT_MASK (CONFIG_AGENT_MTC_MGMNT_PORT |\ + CONFIG_CLIENT_MTC_CLSTR_PORT |\ CONFIG_MTC_TO_HBS_CMD_PORT |\ CONFIG_MTC_TO_HWMON_CMD_PORT |\ CONFIG_HBS_TO_MTC_EVENT_PORT |\ @@ -201,7 +190,7 @@ void daemon_exit ( void ) CONFIG_AGENT_LOC_TIMEOUT |\ CONFIG_AGENT_INV_EVENT_PORT |\ CONFIG_AGENT_API_RETRIES |\ - CONFIG_CLIENT_PORT) + CONFIG_CLIENT_MTC_MGMNT_PORT) static int mtc_nfvi_handler ( void * user, const char * section, @@ -250,7 +239,7 @@ static int mtc_config_handler ( void * user, else if (MATCH("agent", "mtc_agent_port")) { config_ptr->mtc_agent_port = atoi(value); - config_ptr->mask |= CONFIG_AGENT_PORT ; + config_ptr->mask |= CONFIG_AGENT_MTC_MGMNT_PORT ; } else if (MATCH("agent", "mtc_to_hbs_cmd_port")) { @@ -279,7 +268,12 @@ static int mtc_config_handler ( void * user, else if (MATCH("client", "mtc_rx_mgmnt_port")) { config_ptr->cmd_port = atoi(value); - 
config_ptr->mask |= CONFIG_CLIENT_PORT ; + config_ptr->mask |= CONFIG_CLIENT_MTC_MGMNT_PORT ; + } + else if (MATCH("client", "mtc_rx_clstr_port")) + { + config_ptr->mtc_rx_clstr_port = atoi(value); + config_ptr->mask |= CONFIG_CLIENT_MTC_CLSTR_PORT ; } else if (MATCH("agent", "token_refresh_rate")) { @@ -639,6 +633,7 @@ int daemon_configure ( void ) else { mtcInv.clstr_network_provisioned = true ; + ilog ("Cluster network is provisioned" ); } } @@ -697,11 +692,11 @@ int mtc_socket_init ( void ) /* Read the port config strings into the socket struct */ mtc_sock.mtc_agent_port = mtc_config.mtc_agent_port; - mtc_sock.mtc_cmd_port = mtc_config.cmd_port; + mtc_sock.mtc_mgmnt_cmd_port = mtc_config.cmd_port; /* create transmit socket */ msgClassAddr::getAddressFromInterface(mtc_config.mgmnt_iface, ip_address, INET6_ADDRSTRLEN); - sock_ptr->mtc_agent_tx_socket = new msgClassTx(ip_address, mtc_config.mtc_agent_port, IPPROTO_UDP, mtc_config.mgmnt_iface); + sock_ptr->mtc_agent_tx_socket = new msgClassTx(ip_address, mtc_sock.mtc_mgmnt_cmd_port, IPPROTO_UDP, mtc_config.mgmnt_iface); rc = sock_ptr->mtc_agent_tx_socket->return_status; if(rc != PASS) { @@ -714,9 +709,12 @@ int mtc_socket_init ( void ) /***********************************************************/ if ( strlen( mtc_config.clstr_iface ) ) { + sock_ptr->mtc_clstr_cmd_port = mtc_config.mtc_rx_clstr_port; + /* create clstr transmit socket only if the interface is provisioned */ msgClassAddr::getAddressFromInterface(mtc_config.clstr_iface, ip_address, INET6_ADDRSTRLEN); - sock_ptr->mtc_agent_clstr_tx_socket = new msgClassTx(ip_address, mtc_config.mtc_agent_port, IPPROTO_UDP, mtc_config.clstr_iface); + sock_ptr->mtc_agent_clstr_tx_socket = new msgClassTx(ip_address, mtc_sock.mtc_clstr_cmd_port, IPPROTO_UDP, mtc_config.clstr_iface); + rc = sock_ptr->mtc_agent_clstr_tx_socket->return_status; if(rc != PASS) { @@ -778,8 +776,17 @@ int mtc_socket_init ( void ) if ( mtcInv.clstr_network_provisioned == true ) { - 
sock_ptr->mtc_agent_clstr_rx_socket = - new msgClassRx(CONTROLLER_NFS, sock_ptr->mtc_agent_port, IPPROTO_UDP ); + if ( mtcInv.my_hostname == CONTROLLER_0 ) + { + sock_ptr->mtc_agent_clstr_rx_socket = + new msgClassRx(CONTROLLER_0_CLUSTER_HOST, sock_ptr->mtc_agent_port, IPPROTO_UDP ); + } + else + { + sock_ptr->mtc_agent_clstr_rx_socket = + new msgClassRx(CONTROLLER_1_CLUSTER_HOST, sock_ptr->mtc_agent_port, IPPROTO_UDP ); + } + if (( sock_ptr->mtc_agent_clstr_rx_socket == NULL ) || ( sock_ptr->mtc_agent_clstr_rx_socket->return_status )) { diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 154861aa..5e7e05bf 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -773,7 +773,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->mtce_flags = 0 ; /* Assert the mtc alive gate */ - node_ptr->mtcAlive_gate = true ; + this->ctl_mtcAlive_gate ( node_ptr, true ) ; node_ptr->mtcAlive_online = false ; node_ptr->mtcAlive_offline = true ; @@ -886,9 +886,9 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) * in the reboot recovery phase now. Look for the mtcAlive */ /* In self-enable we don't need to purge mtcAlive just need - * to wait for one more. Assum,e offline, not online and open + * to wait for one more. Assume offline, not online and open * the mtcAlive gate. 
*/ - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate ( node_ptr, false ) ; node_ptr->mtcAlive_online = false ; node_ptr->mtcAlive_offline = true ; /* set mtcAlive timeout */ @@ -1053,7 +1053,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->mtcAlive_purge >= 20 ) { /* open gate */ - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate ( node_ptr, false ) ; node_ptr->mtcAlive_purge = 0 ; /* timer is started ok so we can do the stage transition */ @@ -1173,12 +1173,12 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) break ; } - else if ( node_ptr->mtcAlive_gate == true ) + else if ( this->get_mtcAlive_gate (node_ptr) == true ) { slog ("%s mtcAlive gate unexpectedly set, correcting ...\n", node_ptr->hostname.c_str()); - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate ( node_ptr, false ) ; } /* wait some more */ @@ -1628,7 +1628,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* Purge this hosts work queues */ mtcCmd_workQ_purge ( node_ptr ); mtcCmd_doneQ_purge ( node_ptr ); - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate ( node_ptr, false ); node_ptr->http_retries_cur = 0 ; node_ptr->unknown_health_reported = false ; @@ -1648,13 +1648,6 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* Disable the heartbeat service for Graceful Recovery */ send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST ); - /* Clear the minor and failure flags if it is set for this host */ - for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) - { - hbs_minor_clear ( node_ptr, (iface_enum)iface ); - node_ptr->heartbeat_failed[iface] = false ; - } - /* Have we reached the maximum allowed fast recovery attempts. 
* * If we have then force the full enable by @@ -1664,10 +1657,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) */ if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES ) { - /* gate off further mtcAlive messaging timme the offline - * handler runs. This prevents stale messages from making it - * in and prolong the offline detection time */ - node_ptr->mtcAlive_gate = true ; + /* gate off further mtcAlive messaging timme the offline + * handler runs. This prevents stale messages from making it + * in and prolong the offline detection time */ + this->ctl_mtcAlive_gate ( node_ptr, true ) ; elog ("%s Graceful Recovery Failed (retries=%d)\n", node_ptr->hostname.c_str(), node_ptr->graceful_recovery_counter ); @@ -2114,12 +2107,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE ); } } - else if ( node_ptr->mtcAlive_gate == true ) + else if ( this->get_mtcAlive_gate ( node_ptr ) == true ) { slog ("%s mtcAlive gate unexpectedly set, auto-correcting ...\n", node_ptr->hostname.c_str()); - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate ( node_ptr, false ) ; } /* wait some more */ @@ -2454,6 +2447,12 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ); } + for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) + { + hbs_minor_clear ( node_ptr, (iface_enum)iface ); + node_ptr->heartbeat_failed[iface] = false ; + } + /* Enable the heartbeat service for Graceful Recovery */ send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); @@ -3097,7 +3096,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) } /* open the mtcAlive gate while we are disabled */ - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate ( node_ptr, false ) ; disableStageChange( node_ptr, MTC_DISABLE__START ); adminActionChange ( node_ptr , MTC_ADMIN_ACTION__NONE 
); @@ -3240,7 +3239,7 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) operState_enum_to_str(node_ptr->operState).c_str(), availStatus_enum_to_str(node_ptr->availStatus).c_str()); - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate ( node_ptr, false ) ; node_ptr->mtcAlive_mgmnt = false ; node_ptr->mtcAlive_clstr = false ; @@ -3261,7 +3260,7 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) case MTC_OFFLINE__WAIT: { /* be sure the mtcAlive gate is open */ - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate (node_ptr, false ) ; if ( mtcTimer_expired ( node_ptr->offline_timer ) == true ) { if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE ) @@ -3369,12 +3368,12 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hostname.c_str(), node_ptr->onlineStage ); - if ( node_ptr->mtcAlive_gate == true ) + if ( this->get_mtcAlive_gate ( node_ptr ) == true ) { alog ("%s mtcAlive gate unexpectedly set, correcting ...\n", node_ptr->hostname.c_str()); - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate (node_ptr, false ) ; } /* Start with a zero count. This counter is incremented every @@ -3475,7 +3474,8 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr ) /* ... keep the 'host locked' file on this host refreshed while in the locked state * ... 
send it on both interfaces just in case */ send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE ); - // send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, INFRA_INTERFACE ); + if ( clstr_network_provisioned ) + send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, CLSTR_INTERFACE ); } /* Start over */ @@ -6106,7 +6106,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST ); } - node_ptr->mtcAlive_gate = false ; + this->ctl_mtcAlive_gate(node_ptr, false) ; node_ptr->addStage = MTC_ADD__DONE ; break; } @@ -6522,6 +6522,11 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr ) /* Tell the host that it is locked */ send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE); + if ( clstr_network_provisioned ) + { + ilog ("%s Sending Lock Cluster", node_ptr->hostname.c_str() ); + send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, CLSTR_INTERFACE ); + } } break ; @@ -6668,26 +6673,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) send_hbs_command ( this->my_hostname, MTC_CMD_ACTIVE_CTRL ); } - /* Manage active controller auto recovery bool. - * If the inactive controller is inservice then disable - * controller autorecovery. 
Otherwise enable it but in this case - * don't change the disable bool as that is used to gate auto - * recovery once the threshoild is reached */ -// if ( is_controller ( node_ptr ) && NOT_THIS_HOST ) -// { -// if (( node_ptr->ar_disabled == false ) && -// ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) -// { -// autorecovery_clear ( CONTROLLER_0 ); -// autorecovery_clear ( CONTROLLER_1 ); -// } - //else if (( node_ptr->ar_disabled == true ) && - // ( node_ptr->operState != MTC_OPER_STATE__ENABLED )) - //{ - // node_ptr->ar_disabled = false ; - //} - // } - /* Monitor the health of the host - no pass file */ if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) diff --git a/mtce/src/maintenance/mtcNodeMsg.h b/mtce/src/maintenance/mtcNodeMsg.h index 0eb9e628..6816354c 100755 --- a/mtce/src/maintenance/mtcNodeMsg.h +++ b/mtce/src/maintenance/mtcNodeMsg.h @@ -74,15 +74,15 @@ typedef struct int mtc_agent_clstr_rx_socket_size ; /** UDP sockets used by the mtcClient to receive maintenance - * commands from and transmit replies to the mtcAgent */ - msgClassSock* mtc_client_rx_socket ; /**< rx from controller */ - msgClassSock* mtc_client_tx_socket ; /**< tx to controller mgmnt */ - msgClassSock* mtc_client_clstr_tx_socket ; /**< tx to controller clstr */ - msgClassSock* mtc_client_clstr_rx_socket ; /**< rx from controller clstr */ - int mtc_cmd_port ; /**< mtc command port number */ - struct sockaddr_in mtc_cmd_addr ; /**< socket attributes mgmnt */ - - + * commands from and transmit replies to the mtcAgent */ + msgClassSock* mtc_client_rx_socket ; /**< rx from controller */ + msgClassSock* mtc_client_tx_socket ; /**< tx to controller mgmnt */ + msgClassSock* mtc_client_tx_socket_c0_clstr ; /**< tx to controller-0 clstr i/f */ + msgClassSock* mtc_client_tx_socket_c1_clstr ; /**< tx to controller-1 clstr i/f */ + msgClassSock* mtc_client_clstr_rx_socket ; /**< rx from controller clstr */ + int 
mtc_mgmnt_cmd_port ; /**< mtc command port mgmnt i/f */ + int mtc_clstr_cmd_port ; /**< mtc command port clstr i/f */ + struct sockaddr_in mtc_cmd_addr ; /**< socket attributes mgmnt */ /***************************************************************/