Add failover state of peer to heartbeat msg
- add failover state to heartbeat message ( 4 bits ) - add logic to survived_state to use peer's failover state to determine whether to exit survived state and enter normal state - throttle peer is normal events with a threshold of 10 used to ensure the peer is normal and stable - change fsm->send_event() log to debug from info log level - a few logging improvements; debug send_event logs - update copyright year 2023 Test plan: PASS - AIO-DX: iso install PASS - AIO-DX: crash the sm as indicated in bug and observe swact to standby PASS - AIO-DX: manual swact PASS - AIO-DX: power off active controller PASS - AIO-SX: install and basic sanity check PASS - AIO-SX: upgrade test to verify sm heartbeat messages changes still function when controllers are running different loads Closes-Bug: 2012519 Signed-off-by: Kyale, Eliud <Eliud.Kyale@windriver.com> Change-Id: I1f86dcb8c9d9dbaf436b9240867f61adc405e88c
This commit is contained in:
parent
002f5a79db
commit
9e2ff82411
|
@ -174,7 +174,8 @@ _sm_failover_event_mappings[SM_FAILOVER_EVENT_MAX] =
|
|||
{SM_FAILOVER_EVENT_IF_STATE_CHANGED, "interface-state-changed"},
|
||||
{SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, "fail-pending-timeout"},
|
||||
{SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, "failed-recovery-audit"},
|
||||
{SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"}
|
||||
{SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"},
|
||||
{SM_FAILOVER_EVENT_PEER_IS_NORMAL, "peer-is-normal"}
|
||||
};
|
||||
|
||||
static SmValueStrMappingT
|
||||
|
|
|
@ -281,11 +281,11 @@ typedef enum
|
|||
} SmServiceDomainEventDataT;
|
||||
|
||||
typedef enum{
|
||||
SM_FAILOVER_STATE_INITIAL,
|
||||
SM_FAILOVER_STATE_NORMAL,
|
||||
SM_FAILOVER_STATE_FAIL_PENDING,
|
||||
SM_FAILOVER_STATE_FAILED,
|
||||
SM_FAILOVER_STATE_SURVIVED,
|
||||
SM_FAILOVER_STATE_INITIAL = 0,
|
||||
SM_FAILOVER_STATE_NORMAL = 1,
|
||||
SM_FAILOVER_STATE_FAIL_PENDING = 2,
|
||||
SM_FAILOVER_STATE_FAILED = 3,
|
||||
SM_FAILOVER_STATE_SURVIVED = 4,
|
||||
SM_FAILOVER_STATE_MAX
|
||||
}SmFailoverStateT;
|
||||
|
||||
|
@ -295,6 +295,7 @@ typedef enum{
|
|||
SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT,
|
||||
SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT,
|
||||
SM_FAILOVER_EVENT_NODE_ENABLED,
|
||||
SM_FAILOVER_EVENT_PEER_IS_NORMAL,
|
||||
SM_FAILOVER_EVENT_MAX
|
||||
}SmFailoverEventT;
|
||||
|
||||
|
@ -706,7 +707,59 @@ typedef enum
|
|||
SM_ERROR_MAX
|
||||
} SmErrorT;
|
||||
|
||||
typedef uint32_t SmHeartbeatMsgIfStateT;
|
||||
/**
|
||||
* The SM_FAILOVER_STATE_MASK is used for the failover states
|
||||
* lower bits 8 to 10 .
|
||||
* The 'SmFailoverStateT' is stored directly in the 4 bits
|
||||
* and can be accessed by right shifting >> 8
|
||||
* stores upto 15 states or 0b1111 with 4 bits
|
||||
*
|
||||
* The SM_FAILOVER_IF_STATE_MASK is used for the state of the 4 interfaces
|
||||
* If any of these interfaces is down ( assuming its configured) then the bit
|
||||
* flag will be set
|
||||
* - cluster_host 0b0001
|
||||
* - management 0b0010
|
||||
* - oam 0b0100
|
||||
* - admin 0b1000
|
||||
*
|
||||
* SM_FAILOVER_HEARTBEAT_ALIVE & SM_FAILOVER_HELLO_MSG_ALIVE are currently only used locally
|
||||
* and are not shared in the heartbeat message between controllers
|
||||
*
|
||||
* SM_FAILOVER_NODE_INFO_MASK is combination mask that is used to specify the bits shared in
|
||||
* the heartbeat message shared between controllers
|
||||
*
|
||||
* Example: 0x100 indicate all "configured" interfaces are up and failover state is 'normal'
|
||||
* Example: 0x130 is an example of a local message
|
||||
* - heartbeat and hello message alive and all "configured" interfaces up
|
||||
* - failover state is normal
|
||||
*/
|
||||
#define SM_FAILOVER_STATE_MASK 0x00000F00 // 4 bits in use for failover states
|
||||
#define SM_FAILOVER_IF_STATE_MASK 0x0000000F // 4 bits in use for interface states
|
||||
#define SM_FAILOVER_NODE_INFO_MASK (SM_FAILOVER_STATE_MASK | SM_FAILOVER_IF_STATE_MASK)
|
||||
|
||||
/* Extract the failover state from the node info flags */
|
||||
#define SM_FAILOVER_EXTRACT_STATE(x) (SmFailoverStateT)((x & SM_FAILOVER_STATE_MASK) >> 8)
|
||||
|
||||
typedef enum
|
||||
{
|
||||
// --- interface status bits --
|
||||
SM_FAILOVER_CLUSTER_HOST_DOWN = (0x1 << 0), // 1
|
||||
SM_FAILOVER_MGMT_DOWN = (0x1 << 1), // 2
|
||||
SM_FAILOVER_OAM_DOWN = (0x1 << 2), // 4
|
||||
SM_FAILOVER_ADMIN_DOWN = (0x1 << 3), // 8
|
||||
// ---- local status bits --
|
||||
SM_FAILOVER_HEARTBEAT_ALIVE = (0x1 << 4), // 16
|
||||
SM_FAILOVER_HELLO_MSG_ALIVE = (0x1 << 5), // 32
|
||||
// ---- failover state bits --
|
||||
SM_FAILOVER_STATE_BIT0 = (0X1 << 8),
|
||||
SM_FAILOVER_STATE_BIT1 = (0X1 << 9),
|
||||
SM_FAILOVER_STATE_BIT2 = (0X1 << 10),
|
||||
SM_FAILOVER_STATE_BIT3 = (0X1 << 11),
|
||||
// ----------------------------
|
||||
SM_FAILOVER_PEER_DISABLED = (0x1 << 14) // 0x4000
|
||||
} SmFailoverNodeStateBitFlagT;
|
||||
|
||||
typedef uint32_t SmHeartbeatMsgNodeInfoT;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
|
|
|
@ -47,6 +47,8 @@
|
|||
#define SM_FAILOVER_RECOVERY_INTERVAL_IN_SEC 100
|
||||
#define SM_FAILOVER_INTERFACE_STATE_REPORT_INTERVAL_MS 20000
|
||||
|
||||
#define SM_FAILOVER_PEER_IS_NORMAL_THRESHOLD 10
|
||||
|
||||
const char* RESET_PEER_NOW = "/var/run/.sm_reset_peer";
|
||||
|
||||
typedef enum
|
||||
|
@ -124,8 +126,8 @@ class SmFailoverInterfaceInfo
|
|||
SmTimerIdT failover_audit_timer_id;
|
||||
static char _host_name[SM_NODE_NAME_MAX_CHAR];
|
||||
static char _peer_name[SM_NODE_NAME_MAX_CHAR];
|
||||
static SmHeartbeatMsgIfStateT _peer_if_state = 0;
|
||||
static SmHeartbeatMsgIfStateT _peer_if_state_at_last_action = 0;
|
||||
static SmHeartbeatMsgNodeInfoT _peer_node_info = 0;
|
||||
static SmHeartbeatMsgNodeInfoT _peer_node_info_at_last_action = 0;
|
||||
static bool _retry = true;
|
||||
static bool _recheck = true;
|
||||
|
||||
|
@ -145,7 +147,7 @@ static SmDbHandleT* _sm_db_handle = NULL;
|
|||
static SmNodeScheduleStateT _host_state;
|
||||
static SmNodeScheduleStateT _host_state_at_last_action; // host state when action was taken last time
|
||||
static int64_t _node_comm_state = -1;
|
||||
static int _prev_if_state_flag = -1;
|
||||
static int _prev_node_info_flag = -1;
|
||||
time_t _last_if_state_ms = 0;
|
||||
static SmNodeScheduleStateT _prev_host_state= SM_NODE_STATE_UNKNOWN;
|
||||
|
||||
|
@ -472,22 +474,38 @@ SmErrorT sm_failover_degrade_clear(SmFailoverDegradeSourceT source)
|
|||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - peer interface state update
|
||||
// Failover - peer node info update
|
||||
// ==================
|
||||
void sm_failover_if_state_update(const char node_name[], SmHeartbeatMsgIfStateT if_state)
|
||||
void sm_failover_node_info_update(const char node_name[], SmHeartbeatMsgNodeInfoT node_info)
|
||||
{
|
||||
static uint32_t num_of_peer_is_normal_events = 0;
|
||||
if( 0 == strcmp(node_name, _peer_name) )
|
||||
{
|
||||
mutex_holder holder(&sm_failover_mutex);
|
||||
|
||||
if( _peer_if_state != if_state )
|
||||
SmFailoverStateT node_failover_state = SM_FAILOVER_EXTRACT_STATE(node_info);
|
||||
if(node_failover_state == SM_FAILOVER_STATE_NORMAL)
|
||||
{
|
||||
DPRINTFI("%s I/F state changed %d => %d", node_name, _peer_if_state, if_state);
|
||||
_peer_if_state = if_state;
|
||||
num_of_peer_is_normal_events++;
|
||||
if(num_of_peer_is_normal_events > SM_FAILOVER_PEER_IS_NORMAL_THRESHOLD)
|
||||
{
|
||||
SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_PEER_IS_NORMAL, NULL);
|
||||
num_of_peer_is_normal_events = 0; // reset count
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
num_of_peer_is_normal_events = 0; // reset count
|
||||
}
|
||||
|
||||
if( _peer_node_info != node_info )
|
||||
{
|
||||
DPRINTFI("%s Node Info changed %#x => %#x", node_name, _peer_node_info, node_info);
|
||||
_peer_node_info = node_info;
|
||||
}
|
||||
} else
|
||||
{
|
||||
DPRINTFE("If state updated by unknown host %s", node_name);
|
||||
DPRINTFE("Node Info (%#x) updated by unknown host %s", node_info, node_name);
|
||||
}
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
@ -523,25 +541,25 @@ bool is_admin_interface_configured()
|
|||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get interface state
|
||||
// Failover - get host interface state flags
|
||||
// ==================
|
||||
int sm_failover_get_if_state()
|
||||
static uint32_t sm_failover_get_host_if_state_flags()
|
||||
{
|
||||
SmFailoverInterfaceStateT mgmt_state = _mgmt_interface_info->get_state();
|
||||
SmFailoverInterfaceStateT oam_state = _oam_interface_info->get_state();
|
||||
SmFailoverInterfaceStateT cluster_host_state;
|
||||
SmFailoverInterfaceStateT admin_state;
|
||||
int if_state_flag = 0;
|
||||
uint32_t if_state_flags = 0;
|
||||
if ( is_cluster_host_interface_configured() )
|
||||
{
|
||||
cluster_host_state = _cluster_host_interface_info->get_state();
|
||||
if( SM_FAILOVER_INTERFACE_OK == cluster_host_state )
|
||||
{
|
||||
if_state_flag |= SM_FAILOVER_HEARTBEAT_ALIVE;
|
||||
if_state_flags |= SM_FAILOVER_HEARTBEAT_ALIVE;
|
||||
}
|
||||
else if ( SM_FAILOVER_INTERFACE_DOWN == cluster_host_state )
|
||||
{
|
||||
if_state_flag |= SM_FAILOVER_CLUSTER_HOST_DOWN;
|
||||
if_state_flags |= SM_FAILOVER_CLUSTER_HOST_DOWN;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -550,55 +568,60 @@ int sm_failover_get_if_state()
|
|||
admin_state = _admin_interface_info->get_state();
|
||||
if( SM_FAILOVER_INTERFACE_OK == admin_state )
|
||||
{
|
||||
if_state_flag |= SM_FAILOVER_HEARTBEAT_ALIVE;
|
||||
if_state_flags |= SM_FAILOVER_HEARTBEAT_ALIVE;
|
||||
}
|
||||
else if ( SM_FAILOVER_INTERFACE_DOWN == admin_state )
|
||||
{
|
||||
if_state_flag |= SM_FAILOVER_ADMIN_DOWN;
|
||||
if_state_flags |= SM_FAILOVER_ADMIN_DOWN;
|
||||
}
|
||||
}
|
||||
|
||||
if( SM_FAILOVER_INTERFACE_OK == mgmt_state )
|
||||
{
|
||||
if_state_flag |= SM_FAILOVER_HEARTBEAT_ALIVE;
|
||||
if_state_flags |= SM_FAILOVER_HEARTBEAT_ALIVE;
|
||||
}
|
||||
else if ( SM_FAILOVER_INTERFACE_DOWN == mgmt_state )
|
||||
{
|
||||
if_state_flag |= SM_FAILOVER_MGMT_DOWN;
|
||||
if_state_flags |= SM_FAILOVER_MGMT_DOWN;
|
||||
}
|
||||
|
||||
if( SM_FAILOVER_INTERFACE_OK == oam_state )
|
||||
{
|
||||
if_state_flag |= SM_FAILOVER_HEARTBEAT_ALIVE;
|
||||
if_state_flags |= SM_FAILOVER_HEARTBEAT_ALIVE;
|
||||
}
|
||||
else if ( SM_FAILOVER_INTERFACE_DOWN == oam_state )
|
||||
{
|
||||
if_state_flag |= SM_FAILOVER_OAM_DOWN;
|
||||
if_state_flags |= SM_FAILOVER_OAM_DOWN;
|
||||
}
|
||||
|
||||
return if_state_flag;
|
||||
return if_state_flags;
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get interface state
|
||||
// Failover - get host node info - bit flags
|
||||
// ==================
|
||||
SmHeartbeatMsgIfStateT sm_failover_if_state_get()
|
||||
{
|
||||
mutex_holder holder(&sm_failover_mutex);
|
||||
int if_state_flag = sm_failover_get_if_state();
|
||||
return (if_state_flag & 0b0111); //the lower 3 bits i/f state flag
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get peer node interface state
|
||||
// ==================
|
||||
SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state()
|
||||
SmHeartbeatMsgNodeInfoT sm_failover_get_host_node_info_flags()
|
||||
{
|
||||
mutex_holder holder(&sm_failover_mutex);
|
||||
|
||||
return (_peer_if_state & 0b0111); //the lower 3 bits i/f state flag
|
||||
uint32_t failover_state_bits = ((uint32_t)SmFailoverFSM::get_fsm().get_state() << 8 );
|
||||
failover_state_bits &= SM_FAILOVER_STATE_MASK;
|
||||
|
||||
uint32_t if_state_flags = sm_failover_get_host_if_state_flags();
|
||||
if_state_flags &= SM_FAILOVER_IF_STATE_MASK;
|
||||
|
||||
return (failover_state_bits | if_state_flags);
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get peer node info - bit flags
|
||||
// ==================
|
||||
SmHeartbeatMsgNodeInfoT sm_failover_get_peer_node_info_flags()
|
||||
{
|
||||
mutex_holder holder(&sm_failover_mutex);
|
||||
return (_peer_node_info & SM_FAILOVER_NODE_INFO_MASK);
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
|
@ -1015,7 +1038,9 @@ void sm_failover_audit()
|
|||
{
|
||||
if ( _prev_host_state != _host_state )
|
||||
{
|
||||
DPRINTFD("Wait for scheduler to decided my role. host state = %d", _host_state);
|
||||
DPRINTFD("Wait for scheduler to decided my role. host state = %s(%d)",
|
||||
sm_node_schedule_state_str(_host_state),
|
||||
_host_state);
|
||||
_prev_host_state = _host_state;
|
||||
}
|
||||
return;
|
||||
|
@ -1023,12 +1048,14 @@ void sm_failover_audit()
|
|||
|
||||
if ( _prev_host_state != _host_state )
|
||||
{
|
||||
DPRINTFI("host state is %d", _host_state);
|
||||
DPRINTFI("host state is %s(%d)",
|
||||
sm_node_schedule_state_str(_host_state),
|
||||
_host_state);
|
||||
_prev_host_state = _host_state;
|
||||
}
|
||||
|
||||
int if_state_flag = sm_failover_get_if_state();
|
||||
if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE)
|
||||
int node_info_flags = (int)sm_failover_get_host_node_info_flags();
|
||||
if(node_info_flags & SM_FAILOVER_HEARTBEAT_ALIVE)
|
||||
{
|
||||
_heartbeat_count ++;
|
||||
}
|
||||
|
@ -1036,12 +1063,14 @@ void sm_failover_audit()
|
|||
{
|
||||
_heartbeat_count = 0;
|
||||
}
|
||||
if( _prev_if_state_flag != if_state_flag)
|
||||
if( _prev_node_info_flag != node_info_flags)
|
||||
{
|
||||
DPRINTFI("Interface state flag %d", if_state_flag);
|
||||
DPRINTFI("Interface state flags changed : %#x -> %#x",
|
||||
_prev_node_info_flag,
|
||||
node_info_flags);
|
||||
|
||||
_last_if_state_ms = now_ms;
|
||||
_prev_if_state_flag = if_state_flag;
|
||||
_prev_node_info_flag = node_info_flags;
|
||||
}
|
||||
|
||||
if(!peer_controller_enabled())
|
||||
|
@ -1082,7 +1111,7 @@ void sm_failover_audit()
|
|||
_if_state_changed = false;
|
||||
}
|
||||
|
||||
int64_t curr_node_state = if_state_flag;
|
||||
int64_t curr_node_state = node_info_flags;
|
||||
|
||||
if( _hello_msg_alive )
|
||||
{
|
||||
|
@ -1092,7 +1121,7 @@ void sm_failover_audit()
|
|||
if( !_retry && !_recheck &&
|
||||
( _node_comm_state == curr_node_state &&
|
||||
_host_state == _host_state_at_last_action &&
|
||||
_peer_if_state == _peer_if_state_at_last_action ))
|
||||
_peer_node_info == _peer_node_info_at_last_action ))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
@ -1102,7 +1131,7 @@ void sm_failover_audit()
|
|||
|
||||
_node_comm_state = curr_node_state;
|
||||
_host_state_at_last_action = _host_state;
|
||||
_peer_if_state_at_last_action = _peer_if_state;
|
||||
_peer_node_info_at_last_action = _peer_node_info;
|
||||
|
||||
_log_nodes_state();
|
||||
}
|
||||
|
@ -1195,10 +1224,14 @@ void _log_nodes_state()
|
|||
sm_node_avail_status_str(peer.avail_status)
|
||||
);
|
||||
}
|
||||
DPRINTFI("Host state %d, I/F state %d, peer I/F state %d",
|
||||
SmFailoverStateT peer_failover_state = SM_FAILOVER_EXTRACT_STATE(_peer_node_info);
|
||||
DPRINTFI("Host state %s(%d), I/F state %#x, peer I/F state %#x, peer failover state %s(%d)",
|
||||
sm_node_schedule_state_str(_host_state),
|
||||
_host_state,
|
||||
_node_comm_state,
|
||||
_peer_if_state
|
||||
_peer_node_info,
|
||||
sm_failover_state_str(peer_failover_state),
|
||||
peer_failover_state
|
||||
);
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
@ -1527,7 +1560,7 @@ void dump_interfaces_state(FILE* fp)
|
|||
|
||||
void dump_peer_if_state(FILE* fp)
|
||||
{
|
||||
fprintf(fp, " Peer Interface state: %d\n", _peer_if_state);
|
||||
fprintf(fp, " Peer Node info: %#x\n", _peer_node_info);
|
||||
}
|
||||
|
||||
void dump_failover_fsm_state(FILE* fp)
|
||||
|
@ -1536,6 +1569,13 @@ void dump_failover_fsm_state(FILE* fp)
|
|||
fprintf(fp, " Failover FSM state: %s\n", sm_failover_state_str(state));
|
||||
}
|
||||
|
||||
void dump_peer_failover_fsm_state(FILE* fp)
|
||||
{
|
||||
uint32_t peer_failover_bits = _peer_node_info & SM_FAILOVER_STATE_MASK;
|
||||
SmFailoverStateT state = (SmFailoverStateT)( peer_failover_bits >> 8 );
|
||||
fprintf(fp, " Peer Failover FSM state: %s\n", sm_failover_state_str(state));
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - dump state
|
||||
// ======================
|
||||
|
@ -1546,5 +1586,5 @@ void sm_failover_dump_state(FILE* fp)
|
|||
dump_failover_fsm_state(fp);
|
||||
dump_interfaces_state(fp);
|
||||
dump_peer_if_state(fp);
|
||||
dump_peer_failover_fsm_state(fp);
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#ifndef __SM_FAILOVER_H__
|
||||
#define __SM_FAILOVER_H__
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include "sm_types.h"
|
||||
#include "sm_service_domain_interface_table.h"
|
||||
#include "sm_db_nodes.h"
|
||||
|
@ -29,16 +30,6 @@ typedef enum
|
|||
SM_FAILOVER_DEGRADE_SOURCE_IF_DOWN = 2
|
||||
}SmFailoverDegradeSourceT;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
SM_FAILOVER_CLUSTER_HOST_DOWN = 1,
|
||||
SM_FAILOVER_MGMT_DOWN = 2,
|
||||
SM_FAILOVER_OAM_DOWN = 4,
|
||||
SM_FAILOVER_HEARTBEAT_ALIVE = 8,
|
||||
SM_FAILOVER_HELLO_MSG_ALIVE = 16,
|
||||
SM_FAILOVER_ADMIN_DOWN = 32,
|
||||
SM_FAILOVER_PEER_DISABLED = 0x4000,
|
||||
}SmFailoverCommFaultBitFlagT;
|
||||
|
||||
// ****************************************************************************
|
||||
// initialize mutex
|
||||
|
@ -105,14 +96,14 @@ extern void sm_failover_interface_up( const char* const interface_name );
|
|||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - interface state update for peer
|
||||
extern void sm_failover_if_state_update(const char node_name[],
|
||||
SmHeartbeatMsgIfStateT if_state);
|
||||
// Failover - node info update for peer
|
||||
extern void sm_failover_node_info_update(const char node_name[],
|
||||
SmHeartbeatMsgNodeInfoT node_info);
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get local interface state
|
||||
extern SmHeartbeatMsgIfStateT sm_failover_if_state_get();
|
||||
// Failover - get local interface state flags
|
||||
extern SmHeartbeatMsgNodeInfoT sm_failover_get_host_node_info_flags();
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
|
@ -147,9 +138,9 @@ SmFailoverInterfaceStateT sm_failover_get_interface_info(SmInterfaceTypeT interf
|
|||
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get peer node interface state
|
||||
// Failover - get peer node info flags
|
||||
// ==================
|
||||
SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state();
|
||||
SmHeartbeatMsgNodeInfoT sm_failover_get_peer_node_info_flags();
|
||||
// ****************************************************************************
|
||||
|
||||
|
||||
|
@ -177,10 +168,6 @@ SmErrorT sm_failover_disable_peer();
|
|||
bool sm_is_active_controller();
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get interface state
|
||||
// ==================
|
||||
int sm_failover_get_if_state();
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
|
|
|
@ -218,7 +218,7 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
|
|||
case SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT:
|
||||
sm_node_utils_is_aio_duplex(&duplex);
|
||||
if( duplex &&
|
||||
0 == (sm_failover_get_if_state() & SM_FAILOVER_HEARTBEAT_ALIVE))
|
||||
0 == (sm_failover_get_host_node_info_flags() & SM_FAILOVER_HEARTBEAT_ALIVE))
|
||||
{
|
||||
SmSystemModeT system_mode = sm_node_utils_get_system_mode();
|
||||
SmInterfaceTypeT interfaces_to_check[3] = {SM_INTERFACE_UNKNOWN};
|
||||
|
@ -291,6 +291,8 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
|
|||
return error;
|
||||
}
|
||||
break;
|
||||
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
|
||||
break;
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %s, at state %s",
|
||||
sm_failover_event_str(event),
|
||||
|
|
|
@ -282,6 +282,8 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
|
|||
}
|
||||
break;
|
||||
}
|
||||
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
|
||||
break;
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %s, at state %s",
|
||||
sm_failover_event_str(event),
|
||||
|
|
|
@ -109,7 +109,7 @@ SmErrorT SmFailoverFSM::send_event(SmFailoverEventT event, const ISmFSMEventData
|
|||
DPRINTFE("Runtime error. No handler for state %d", this->get_state());
|
||||
return SM_FAILED;
|
||||
}
|
||||
DPRINTFI("send_event %s\n", sm_failover_event_str(event));
|
||||
DPRINTFD("send_event %s\n", sm_failover_event_str(event));
|
||||
state_handler->event_handler(event, event_data);
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2018-2023 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -18,6 +18,7 @@ SmErrorT SmFailoverInitialState::event_handler(SmFailoverEventT event, const ISm
|
|||
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
|
||||
case SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT:
|
||||
case SM_FAILOVER_EVENT_NODE_ENABLED:
|
||||
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
|
||||
break;
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %s, at state %s",
|
||||
|
|
|
@ -43,7 +43,9 @@ SmErrorT SmFailoverNormalState::event_handler(SmFailoverEventT event, const ISmF
|
|||
}
|
||||
|
||||
break;
|
||||
|
||||
case SM_FAILOVER_EVENT_NODE_ENABLED:
|
||||
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
|
||||
break;
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %s, at state %s",
|
||||
sm_failover_event_str(event),
|
||||
|
|
|
@ -284,11 +284,11 @@ SmErrorT _get_system_status(SmSystemStatusT& sys_status, char host_name[], char
|
|||
SmSystemFailoverStatus::get_status().set_heartbeat_state(sys_status.heartbeat_state);
|
||||
|
||||
sys_status.host_status.node_name = host_name;
|
||||
sys_status.host_status.interface_state = sm_failover_if_state_get();
|
||||
sys_status.host_status.node_info_flags = sm_failover_get_host_node_info_flags();
|
||||
sys_status.host_status.current_schedule_state = host_state;
|
||||
sys_status.peer_status.node_name = peer_name;
|
||||
|
||||
sys_status.peer_status.interface_state = sm_failover_get_peer_if_state();
|
||||
sys_status.peer_status.node_name = peer_name;
|
||||
sys_status.peer_status.node_info_flags = sm_failover_get_peer_node_info_flags();
|
||||
sys_status.peer_status.current_schedule_state = sm_get_controller_state(peer_name);
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
@ -324,16 +324,18 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys
|
|||
selection.set_peer_schedule_state(system_status.peer_status.current_schedule_state);
|
||||
if(SM_HEARTBEAT_OK == system_status.heartbeat_state)
|
||||
{
|
||||
DPRINTFI("Heartbeat alive");
|
||||
DPRINTFI("Heartbeat alive; host flags %#x, peer flags %#x",
|
||||
system_status.host_status.node_info_flags,
|
||||
system_status.peer_status.node_info_flags);
|
||||
int host_healthy_score, peer_healthy_score;
|
||||
host_healthy_score = get_node_if_healthy_score(system_status.host_status.interface_state);
|
||||
peer_healthy_score = get_node_if_healthy_score(system_status.peer_status.interface_state);
|
||||
host_healthy_score = get_node_if_healthy_score(system_status.host_status.node_info_flags);
|
||||
peer_healthy_score = get_node_if_healthy_score(system_status.peer_status.node_info_flags);
|
||||
if( peer_healthy_score < host_healthy_score )
|
||||
{
|
||||
//host is more healthy
|
||||
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||
selection.set_peer_schedule_state(SM_NODE_STATE_STANDBY);
|
||||
if(system_status.peer_status.interface_state & SM_FAILOVER_MGMT_DOWN)
|
||||
if(system_status.peer_status.node_info_flags & SM_FAILOVER_MGMT_DOWN)
|
||||
{
|
||||
DPRINTFI("Disable peer, host go active");
|
||||
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
|
||||
|
@ -343,12 +345,18 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys
|
|||
//peer is more healthy
|
||||
selection.set_host_schedule_state(SM_NODE_STATE_STANDBY);
|
||||
selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||
if(system_status.host_status.interface_state & SM_FAILOVER_MGMT_DOWN)
|
||||
if(system_status.host_status.node_info_flags & SM_FAILOVER_MGMT_DOWN)
|
||||
{
|
||||
DPRINTFI("Disable host, peer go active");
|
||||
selection.set_host_schedule_state(SM_NODE_STATE_FAILED);
|
||||
}
|
||||
} else
|
||||
{
|
||||
// host and peer has equal healthy scores
|
||||
DPRINTFI("host and peer have the same healthy score");
|
||||
}
|
||||
DPRINTFI("host_healthy_score %d, peer_healthy_score %d",
|
||||
host_healthy_score, peer_healthy_score);
|
||||
}else
|
||||
{
|
||||
DPRINTFI("Loss of heartbeat ALL");
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
typedef struct
|
||||
{
|
||||
const char* node_name;
|
||||
unsigned int interface_state;
|
||||
unsigned int node_info_flags;
|
||||
unsigned int heartbeat_state;
|
||||
SmFailoverInterfaceStateT mgmt_state;
|
||||
SmFailoverInterfaceStateT cluster_host_state;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2018-2023 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -9,6 +9,7 @@
|
|||
#include "sm_debug.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_failover_ss.h"
|
||||
#include "sm_failover.h"
|
||||
|
||||
static void _audit_failover_state()
|
||||
{
|
||||
|
@ -44,6 +45,14 @@ SmErrorT SmFailoverSurvivedState::event_handler(SmFailoverEventT event, const IS
|
|||
{
|
||||
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
|
||||
break;
|
||||
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
|
||||
DPRINTFE("Peer is in %s(%d) failover state, Host will transition to %s(%d)",
|
||||
sm_failover_state_str(SM_FAILOVER_STATE_NORMAL),
|
||||
SM_FAILOVER_STATE_NORMAL,
|
||||
sm_failover_state_str(SM_FAILOVER_STATE_NORMAL),
|
||||
SM_FAILOVER_STATE_NORMAL);
|
||||
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL);
|
||||
break;
|
||||
case SM_FAILOVER_EVENT_NODE_ENABLED:
|
||||
_audit_failover_state();
|
||||
break;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014-2017 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014-2023 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -55,7 +55,7 @@ typedef struct
|
|||
typedef struct
|
||||
{
|
||||
uint16_t msg_size;
|
||||
uint32_t if_state;
|
||||
uint32_t node_info; /**< @see SmHeartbeatMsgNodeInfoT */
|
||||
} __attribute__ ((packed)) SmHeartbeatMsgAliveRev2T;
|
||||
|
||||
typedef struct
|
||||
|
@ -64,10 +64,10 @@ typedef struct
|
|||
|
||||
union
|
||||
{
|
||||
SmHeartbeatMsgAliveT alive;
|
||||
SmHeartbeatMsgAliveRev2T if_state_msg;
|
||||
SmHeartbeatMsgAliveT alive_msg;
|
||||
SmHeartbeatMsgAliveRev2T aliveRev2_msg;
|
||||
char raw_msg[SM_HEARTBEAT_MSG_MAX_SIZE-sizeof(SmHeartbeatMsgHeaderT)];
|
||||
} u;
|
||||
} msg;
|
||||
} __attribute__ ((packed)) SmHeartbeatMsgT;
|
||||
|
||||
static char _tx_control_buffer[SM_HEARTBEAT_MSG_BUFFER_MAX_SIZE] __attribute__((aligned));
|
||||
|
@ -206,10 +206,10 @@ SmErrorT sm_heartbeat_msg_send_alive( SmNetworkTypeT network_type, char node_nam
|
|||
snprintf( heartbeat_msg.header.node_name,
|
||||
sizeof(heartbeat_msg.header.node_name), "%s", node_name );
|
||||
|
||||
heartbeat_msg.u.if_state_msg.msg_size = htons((uint16_t)sizeof(SmHeartbeatMsgAliveRev2T));
|
||||
SmHeartbeatMsgIfStateT if_state;
|
||||
if_state = sm_failover_if_state_get();
|
||||
heartbeat_msg.u.if_state_msg.if_state = htonl(if_state);
|
||||
heartbeat_msg.msg.aliveRev2_msg.msg_size = htons((uint16_t)sizeof(SmHeartbeatMsgAliveRev2T));
|
||||
SmHeartbeatMsgNodeInfoT node_info;
|
||||
node_info = sm_failover_get_host_node_info_flags();
|
||||
heartbeat_msg.msg.aliveRev2_msg.node_info = htonl(node_info);
|
||||
|
||||
if( SM_AUTH_TYPE_HMAC_SHA512 == auth_type )
|
||||
{
|
||||
|
@ -341,14 +341,14 @@ static void sm_heartbeat_msg_dispatch_msg( SmHeartbeatMsgT* heartbeat_msg,
|
|||
}
|
||||
|
||||
uint16_t msg_size;
|
||||
SmHeartbeatMsgIfStateT if_state = 0;
|
||||
SmHeartbeatMsgNodeInfoT node_info = 0;
|
||||
bool perform_if_exg = false;
|
||||
if(2 <= ntohs(heartbeat_msg->header.revision))
|
||||
{
|
||||
msg_size = ntohs(heartbeat_msg->u.if_state_msg.msg_size);
|
||||
msg_size = ntohs(heartbeat_msg->msg.aliveRev2_msg.msg_size);
|
||||
if(sizeof(SmHeartbeatMsgAliveRev2T) == msg_size )
|
||||
{
|
||||
if_state = ntohl(heartbeat_msg->u.if_state_msg.if_state);
|
||||
node_info = ntohl(heartbeat_msg->msg.aliveRev2_msg.node_info);
|
||||
perform_if_exg = true;
|
||||
}else
|
||||
{
|
||||
|
@ -377,14 +377,15 @@ static void sm_heartbeat_msg_dispatch_msg( SmHeartbeatMsgT* heartbeat_msg,
|
|||
|
||||
if(perform_if_exg)
|
||||
{
|
||||
if(NULL != callbacks->if_state)
|
||||
if(NULL != callbacks->node_info)
|
||||
{
|
||||
callbacks->if_state(heartbeat_msg->header.node_name,
|
||||
if_state);
|
||||
callbacks->node_info(heartbeat_msg->header.node_name,
|
||||
node_info);
|
||||
}
|
||||
else
|
||||
{
|
||||
DPRINTFD("No handler for if state package");
|
||||
DPRINTFD("Missing callback for node info %s %#x", heartbeat_msg->header.node_name,
|
||||
node_info);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014-2017 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014-2023 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -23,14 +23,14 @@ typedef void (*SmHeartbeatMsgAliveCallbackT) (char node_name[],
|
|||
SmNetworkAddressT* network_address, int network_port, int version,
|
||||
int revision, char interface_name[]);
|
||||
|
||||
typedef void (*SmHeartbeatMsgIfStateCallbackT) (const char node_name[],
|
||||
SmHeartbeatMsgIfStateT if_state);
|
||||
typedef void (*SmHeartbeatMsgNodeInfoCallbackT) (const char node_name[],
|
||||
SmHeartbeatMsgNodeInfoT if_state);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
SmHeartbeatMsgAuthCallbackT auth;
|
||||
SmHeartbeatMsgAliveCallbackT alive;
|
||||
SmHeartbeatMsgIfStateCallbackT if_state;
|
||||
SmHeartbeatMsgNodeInfoCallbackT node_info;
|
||||
} SmHeartbeatMsgCallbacksT;
|
||||
|
||||
// ****************************************************************************
|
||||
|
|
|
@ -1116,12 +1116,12 @@ DONE:
|
|||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Heartbeat Thread - Receive if_state Message
|
||||
// Heartbeat Thread - Receive node_info Message
|
||||
// ========================================
|
||||
static void sm_heartbeat_thread_receive_if_state_message( const char node_name[],
|
||||
SmHeartbeatMsgIfStateT if_state)
|
||||
static void sm_heartbeat_thread_receive_node_info_message( const char node_name[],
|
||||
SmHeartbeatMsgNodeInfoT node_info)
|
||||
{
|
||||
sm_failover_if_state_update(node_name, if_state);
|
||||
sm_failover_node_info_update(node_name, node_info);
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
|
@ -1158,7 +1158,7 @@ static SmErrorT sm_heartbeat_thread_initialize_thread( void )
|
|||
|
||||
_callbacks.auth = sm_heartbeat_thread_auth_message;
|
||||
_callbacks.alive = sm_heartbeat_thread_receive_alive_message;
|
||||
_callbacks.if_state = sm_heartbeat_thread_receive_if_state_message;
|
||||
_callbacks.node_info = sm_heartbeat_thread_receive_node_info_message;
|
||||
|
||||
error = sm_heartbeat_msg_register_callbacks( &_callbacks );
|
||||
if( SM_OKAY != error )
|
||||
|
|
Loading…
Reference in New Issue