Merge "Fix heartbeat messaging when interface is set to 'lo'"

This commit is contained in:
Zuul 2020-07-02 13:54:28 +00:00 committed by Gerrit Code Review
commit 43a7e676e9
5 changed files with 225 additions and 117 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2015 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -187,6 +187,7 @@ void log_link_events ( int netlink_sock,
iter_curr_ptr != links_gone_down.end() ;
iter_curr_ptr++ )
{
bool care = false ;
dlog3 ( "downed link: %s (running:%d:%d)\n",
iter_curr_ptr->c_str(),
mgmnt_link_up_and_running,
@ -194,6 +195,7 @@ void log_link_events ( int netlink_sock,
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
if ( mgmnt_link_up_and_running == true )
{
mgmnt_link_up_and_running = false ;
@ -202,6 +204,7 @@ void log_link_events ( int netlink_sock,
}
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
if ( clstr_link_up_and_running == true )
{
clstr_link_up_and_running = false ;
@ -209,18 +212,21 @@ void log_link_events ( int netlink_sock,
}
}
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
if ( care == true )
{
wlog ("%s is down (oper:%s) (%ld)\n",
iter_curr_ptr->c_str(),
running ? "up" : "down",
iter_curr_ptr->length() );
}
else
{
wlog ("%s is down (driver query failed) (len:%ld)\n",
iter_curr_ptr->c_str(),
iter_curr_ptr->length() );
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
{
wlog ("%s is down (oper:%s) (%ld)\n",
iter_curr_ptr->c_str(),
running ? "up" : "down",
iter_curr_ptr->length());
}
else
{
wlog ("%s is down (driver query failed) (len:%ld)\n",
iter_curr_ptr->c_str(),
iter_curr_ptr->length());
}
}
}
}
@ -233,6 +239,7 @@ void log_link_events ( int netlink_sock,
iter_curr_ptr != links_gone_up.end() ;
iter_curr_ptr++ )
{
bool care = false ;
dlog3 ( "recovered link: %s (running:%d:%d)\n",
iter_curr_ptr->c_str(),
mgmnt_link_up_and_running,
@ -240,27 +247,32 @@ void log_link_events ( int netlink_sock,
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
mgmnt_link_up_and_running = true ;
wlog ("Mgmnt link %s is up\n", mgmnt_iface_ptr );
}
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
clstr_link_up_and_running = true ;
wlog ("Cluster-host link %s is up\n", clstr_iface_ptr );
}
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
if ( care == true )
{
wlog ("%s is up (oper:%s) (len:%ld)\n",
iter_curr_ptr->c_str(),
running ? "up" : "down",
iter_curr_ptr->length() );
}
else
{
wlog ("%s is up (driver query failed) (len:%ld)\n",
iter_curr_ptr->c_str(),
iter_curr_ptr->length() );
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
{
wlog ("%s is up (oper:%s) (len:%ld)\n",
iter_curr_ptr->c_str(),
running ? "up" : "down",
iter_curr_ptr->length() );
}
else
{
wlog ("%s is up (driver query failed) (len:%ld)\n",
iter_curr_ptr->c_str(),
iter_curr_ptr->length() );
}
}
}
}

View File

@ -1,7 +1,7 @@
#ifndef __INCLUDE_NODEBASE_HH__
#define __INCLUDE_NODEBASE_HH__
/*
* Copyright (c) 2013-2016 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -161,10 +161,10 @@ typedef enum
/** 'lo' interface IP address - TODO: get it from the interface */
#define LOOPBACK_IP "127.0.0.1"
#define LOCALHOST "localhost"
#define LOOPBACK_IP "127.0.0.1"
#define LOOPBACK_IPV6 "::1"
#define LOCALHOST "localhost"
#define LOOPBACK_IF "lo"
#define CLUSTER_HOST_SUFFIX ((const char*)("-cluster-host"))

View File

@ -5,8 +5,8 @@ pidfile = /var/run/hbsAgent.pid
style = lsb ; ocf or lsb
severity = major ; minor, major, critical
restarts = 1 ; restart retries before error assertion
interval = 10 ; number of seconds to wait between restarts
debounce = 10 ; number of seconds that a process needs to remain
interval = 5 ; number of seconds to wait between restarts
debounce = 20 ; number of seconds that a process needs to remain
; running before degrade is removed and retry count
; is cleared.
startuptime = 5 ; Seconds to wait after process start before starting the debounce monitor

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -75,6 +75,10 @@ static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
static string mtcAgent_ip = "" ;
static std::list<string> hostname_inventory ;
/* Used to throttle warning messages that report
* an error transmitting the pulse request */
static int pulse_request_fail_log_counter[MAX_IFACES] ;
/** This heartbeat service inventory is tracked by
* the same nodeLinkClass that maintenance uses.
*
@ -449,7 +453,15 @@ int daemon_configure ( void )
ilog ("Clstr Addr : %s\n", hbsInv.my_clstr_ip.c_str());
}
if (!strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
/* The cluster host network is considered unprovisioned
* for heartbeat while ...
* ... its interface is 'lo' ... */
if (!strcmp(hbs_config.clstr_iface, LOOPBACK_IF))
{
hbsInv.clstr_network_provisioned = false ;
}
/* ... or it and the management interface are the same. */
else if (!strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
{
hbsInv.clstr_network_provisioned = false ;
}
@ -551,20 +563,7 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
int rc = PASS ;
char * iface = NULL ;
/* Load up the interface name */
if ( i == MGMNT_IFACE )
{
iface = hbs_config.mgmnt_iface ;
}
else if (( i == CLSTR_IFACE ) && ( hbs_config.clstr_iface != NULL ))
{
iface = hbs_config.clstr_iface ;
}
else
{
wlog ("No Cluster-host Interface\n");
return (RETRY);
}
pulse_request_fail_log_counter[i] = 0 ;
/* Start by closing existing sockets just in case this is a (re)initialization */
if ( hbs_sock.rx_sock[i] )
@ -579,13 +578,60 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
hbs_sock.tx_sock[i] = 0 ;
}
/* Select the interface for the requested network and create its pulse
 * transmit socket. No socket is created while that network's link is
 * flagged as down ; FAIL_BAD_STATE is returned instead. */
if ( i == MGMNT_IFACE )
{
    if ( hbsInv.mgmnt_link_up_and_running == false )
    {
        /* Name the management interface in this log ; the cluster-host
         * interface name belongs to the cluster-host branch below. */
        wlog("Cannot setup Mgmnt pulse messaging when '%s' interface is down", hbs_config.mgmnt_iface );
        return(FAIL_BAD_STATE);
    }
    else
    {
        iface = hbs_config.mgmnt_iface ;
        if (strcmp(iface, LOOPBACK_IF))
        {
            /* Any interface other than 'lo' uses the configured
             * multicast address. */
            hbs_sock.tx_sock[i] =
                new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP,iface);
        }
        else
        {
            /* Management interface is 'lo' ; pass this host's local ip
             * in place of the multicast address. */
            hbs_sock.tx_sock[i] =
                new msgClassTx(hbsInv.my_local_ip.data(), hbs_sock.tx_port[i],IPPROTO_UDP,iface);
        }
    }
}
else if (( i == CLSTR_IFACE ) &&
         ( hbsInv.clstr_network_provisioned == true ) &&
         ( hbs_config.clstr_iface != NULL ))
{
    if ( hbsInv.clstr_link_up_and_running == false )
    {
        wlog("Cannot setup Clstr pulse messaging when '%s' interface is down", hbs_config.clstr_iface);
        return(FAIL_BAD_STATE);
    }
    else
    {
        iface = hbs_config.clstr_iface ;
        hbs_sock.tx_sock[i] =
            new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP,iface);
    }
}
else
{
    /* Not treated as an error: the request is not for the management
     * network and the cluster-host network is either not provisioned
     * for heartbeat or has no interface configured. */
    ilog("no heartbeat on %s network", get_iface_name_str(i) );
    return (PASS);
}
/* Create transmit socket */
hbs_sock.tx_sock[i] = new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP,iface);
if ( hbs_sock.tx_sock[i] )
{
if ( hbs_sock.tx_sock[i]->return_status != PASS )
{
elog("Cannot open multicast transmit socket - rc:%d (%d:%m)\n", hbs_sock.tx_sock[i]->return_status, errno );
elog("Failed to create %s pulse transmit socket (%d:%d:%m)\n",
get_iface_name_str(i),
hbs_sock.tx_sock[i]->return_status,
errno );
delete (hbs_sock.tx_sock[i]);
hbs_sock.tx_sock[i] = 0 ;
return (FAIL_SOCKET_CREATE);
@ -597,10 +643,10 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
}
else
{
elog("Cannot open multicast transmit socket - null object (%d:%m)\n", errno );
elog("Failed to create %s pulse transmit socket (%d:%m)\n",
get_iface_name_str(i), errno );
return (FAIL_SOCKET_CREATE);
}
dlog("Opened multicast transmit socket\n" );
/* In order to avoid multicast packets being routed wrong, force sending from that socket */
hbs_sock.tx_sock[i]->interfaceBind();
@ -614,8 +660,10 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
hbs_sock.rx_sock[i] = new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true);
if (( hbs_sock.rx_sock[i] == NULL ) || (hbs_sock.rx_sock[i]->return_status != PASS ))
{
elog("Failed opening pulse receive socket (%d:%s)\n",
errno, strerror (errno));
elog("Failed to create %s pulse receive socket (%d:%d:%m)\n",
get_iface_name_str(i),
hbs_sock.rx_sock[i]->return_status,
errno );
rc = FAIL_SOCKET_CREATE ;
}
else
@ -948,7 +996,11 @@ int hbs_pulse_request ( iface_enum iface,
if ( (bytes = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], bytes)) < 0 )
{
elog("Failed to send Pulse request: %d:%s to %s.%d (rc:%i ; %d:%s)\n",
/* Throttle this error log. */
elog_throttled( pulse_request_fail_log_counter[iface], 100,
"Failed to send %s Pulse request: " \
"%d:%s to %s.%d (rc:%i ; %d:%s)\n",
get_iface_name_str(iface),
hbs_sock.tx_mesg[iface].s,
&hbs_sock.tx_mesg[iface].m[0],
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
@ -959,7 +1011,9 @@ int hbs_pulse_request ( iface_enum iface,
}
else
{
wlog("Unable to send pulse request - null tx object - auto re-init pending\n");
elog_throttled( pulse_request_fail_log_counter[iface], 100,
"Unable to send %s pulse request on null socket",
get_iface_name_str(iface));
return (FAIL_SOCKET_SENDTO);
}
@ -1448,10 +1502,6 @@ void daemon_service_run ( void )
* ultimately triggers an exit if that retry count gets too big */
int socket_init_fail_count = 0 ;
/* Used to throttle warning messages that report
* an error transmitting the pulse request */
int pulse_request_fail_log_counter[MAX_IFACES] ;
/* throttle initialization wait logs */
int wait_log_throttle = 0 ;
@ -1561,18 +1611,46 @@ void daemon_service_run ( void )
daemon_exit();
}
if ( get_link_state ( hbs_sock.ioctl_sock, hbs_config.mgmnt_iface, &hbsInv.mgmnt_link_up_and_running ) )
{
hbsInv.mgmnt_link_up_and_running = false ;
wlog ("Failed to query %s operational state ; defaulting to down\n", hbs_config.mgmnt_iface );
}
else
{
ilog ("Mgmnt %s link is %s\n", hbs_config.mgmnt_iface, hbsInv.mgmnt_link_up_and_running ? "Up" : "Down" );
}
if ( hbsInv.clstr_network_provisioned == true )
{
if ( get_link_state ( hbs_sock.ioctl_sock, hbs_config.clstr_iface, &hbsInv.clstr_link_up_and_running ) )
{
hbsInv.clstr_link_up_and_running = false ;
wlog ("Failed to query %s operational state ; defaulting to down\n", hbs_config.clstr_iface );
}
else
{
ilog ("Cluster-host %s link is %s\n", hbs_config.clstr_iface, hbsInv.clstr_link_up_and_running ? "Up" : "Down" );
}
}
/* Setup the heartbeat sockets */
if ( (rc = hbs_socket_init ()) != PASS )
{
if ( socket_init_fail_count++ == 10 )
#define HBS_SOCKET_INIT_RETRY_THRESHOLD (3)
#define HBS_SOCKET_INIT_RETRY_INTERVAL (2)
if ( socket_init_fail_count++ == HBS_SOCKET_INIT_RETRY_THRESHOLD )
{
elog ("Failed socket initialization (rc:%d) max retries ; exiting ...\n", rc );
elog ("Failed socket initialization (rc:%d) "
"max retries ; exiting ...", rc );
daemon_exit ();
}
else
{
elog ("Failed socket initialization (rc:%d) ; will retry in 5 secs ...\n", rc );
sleep (5);
elog ("Failed socket initialization (rc:%d) ; "
"will retry in %d secs ...\n",
rc, HBS_SOCKET_INIT_RETRY_INTERVAL);
sleep (HBS_SOCKET_INIT_RETRY_INTERVAL);
}
}
else
@ -1584,6 +1662,8 @@ void daemon_service_run ( void )
rc = send_event ( hbsInv.my_hostname, MTC_EVENT_HEARTBEAT_READY, MGMNT_IFACE ) ;
if ( rc == RETRY )
{
// TODO: Threshold this loop and exit or this
// could be a silent process failure loop.
mtcWait_secs ( 3 );
}
} while ( rc == RETRY ) ;
@ -1593,29 +1673,6 @@ void daemon_service_run ( void )
daemon_exit ();
}
if ( get_link_state ( hbs_sock.ioctl_sock, hbs_config.mgmnt_iface, &hbsInv.mgmnt_link_up_and_running ) )
{
hbsInv.mgmnt_link_up_and_running = false ;
wlog ("Failed to query %s operational state ; defaulting to down\n", hbs_config.mgmnt_iface );
}
else
{
ilog ("Mgmnt %s link is %s\n", hbs_config.mgmnt_iface, hbsInv.mgmnt_link_up_and_running ? "Up" : "Down" );
}
if ( hbsInv.clstr_network_provisioned == true )
{
if ( get_link_state ( hbs_sock.ioctl_sock, hbs_config.clstr_iface, &hbsInv.clstr_link_up_and_running ) )
{
hbsInv.clstr_link_up_and_running = false ;
wlog ("Failed to query %s operational state ; defaulting to down\n", hbs_config.clstr_iface );
}
else
{
ilog ("Cluster-host %s link is %s\n", hbs_config.clstr_iface, hbsInv.clstr_link_up_and_running ? "Up" : "Down" );
}
}
/* Make the main loop schedule in real-time */
{
struct sched_param param ;
@ -1720,7 +1777,8 @@ void daemon_service_run ( void )
counter = 1 ;
}
}
else if ( hbsInv.hbs_disabled == true )
else if (( hbsInv.hbs_disabled == true ) &&
( hbsInv.mgmnt_link_up_and_running == true ))
{
hbs_ctrl.locked = false ;
hbsInv.hbs_disabled = false;
@ -2191,12 +2249,11 @@ void daemon_service_run ( void )
if ( rc != 0 )
{
/* TODO: Fix this with an alarm */
wlog_throttled ( pulse_request_fail_log_counter[iface], 100,
"%s hbs_pulse_request failed - rc:%d\n", get_iface_name_str(iface), rc);
if ( pulse_request_fail_log_counter[iface] == INTERFACE_ERRORS_FOR_REINIT )
if ( pulse_request_fail_log_counter[iface] > INTERFACE_ERRORS_FOR_REINIT )
{
_setup_pulse_messaging ( (iface_enum)iface , daemon_get_rmem_max ()) ;
rc = _setup_pulse_messaging ( (iface_enum)iface , daemon_get_rmem_max ()) ;
if ( rc )
continue ;
}
}
else

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -437,8 +437,11 @@ int daemon_configure ( void )
{
if (strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
{
clstr_network_provisioned = true ;
ilog ("Cluster-host Name : %s\n", hbs_config.clstr_iface );
if (strcmp(hbs_config.clstr_iface, LOOPBACK_IF))
{
clstr_network_provisioned = true ;
ilog ("Cluster-host Name : %s\n", hbs_config.clstr_iface );
}
}
}
if ( clstr_network_provisioned == true )
@ -476,44 +479,80 @@ int _setup_pulse_messaging ( iface_enum i, int rmem )
/* client sockets are not modified */
UNUSED(rmem);
/* Load up the interface name */
if ( i == MGMNT_IFACE )
{
iface = hbs_config.mgmnt_iface ;
}
else if (( i == CLSTR_IFACE ) && ( hbs_config.clstr_iface != NULL ))
{
iface = hbs_config.clstr_iface ;
}
else
{
wlog ("No Cluster-host Interface\n");
return (RETRY);
}
_close_pulse_rx_sock (i);
_close_pulse_tx_sock (i);
/********************************************************************/
/* Setup multicast Pulse Request Receive Socket */
/********************************************************************/
hbs_sock.rx_sock[i] =
new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true,true);
if (hbs_sock.rx_sock[i]->return_status != PASS)
/* Load up the interface name */
if ( i == MGMNT_IFACE )
{
elog("Cannot create socket (%d) (%d:%m)\n", i, errno );
_close_pulse_rx_sock (i);
iface = hbs_config.mgmnt_iface ;
if (strcmp(iface, LOOPBACK_IF))
{
hbs_sock.rx_sock[i] =
new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true,true);
}
else
{
// Default to unicast heartbeat on management 'lo' interface
hbs_sock.rx_sock[i] =
new msgClassRx(my_address.data(),hbs_sock.rx_port[i],IPPROTO_UDP,iface,false, false);
}
}
else if (( i == CLSTR_IFACE ) &&
( clstr_network_provisioned == true ) &&
( hbs_config.clstr_iface != NULL ))
{
iface = hbs_config.clstr_iface ;
hbs_sock.rx_sock[i] =
new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true,true);
}
else
{
ilog("Cluster host interface not used.");
return (PASS);
}
if ( hbs_sock.rx_sock[i] )
{
if (hbs_sock.rx_sock[i]->return_status != PASS)
{
elog("Failed to create %s pulse receiver socket (%d:%d:%m)\n",
get_iface_name_str(i),
hbs_sock.rx_sock[i]->return_status,
errno );
_close_pulse_rx_sock (i);
return (FAIL_SOCKET_CREATE);
}
hbs_sock.rx_sock[i]->sock_ok(true);
}
else
{
elog("Failed to create %s pulse receiver socket (%d:%m)\n",
get_iface_name_str(i), errno );
return (FAIL_SOCKET_CREATE);
}
hbs_sock.rx_sock[i]->sock_ok(true);
/********************************************************************/
/* Setup unicast transmit (reply) socket */
/********************************************************************/
hbs_sock.tx_sock[i] =
new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP, iface);
if ( hbs_sock.tx_sock[i] == NULL )
{
elog("Failed to create %s pulse reply socket (%d:%m)\n",
get_iface_name_str(i), errno );
return (FAIL_SOCKET_CREATE);
}
if (hbs_sock.tx_sock[i]->return_status != PASS)
{
elog("Cannot create unicast transmit socket (%d) (%d:%m)\n", i, errno );
elog("Failed to create %s pulse reply socket (%d:%d:%m)\n",
get_iface_name_str(i),
hbs_sock.tx_sock[i]->return_status,
errno );
_close_pulse_tx_sock(i);
return (FAIL_SOCKET_CREATE);
}
@ -1234,7 +1273,7 @@ int daemon_init ( string iface, string nodeType_str )
}
/* Setup the heartbeat service messaging sockets */
else if ( hbs_socket_init () != PASS )
else if (( rc = hbs_socket_init ()) != PASS )
{
elog ("socket initialization failed (rc:%d)\n", rc );
rc = FAIL_SOCKET_INIT;