Add failover state of peer to heartbeat msg

- add failover state to heartbeat message ( 4 bits )
- add logic to survived_state to use peer's
  failover state to determine whether to exit survived state
  and enter normal state
- throttle peer is normal events with a threshold of 10
  used to ensure the peer is normal and stable
- change fsm->send_event() log to debug from info log level
- a few logging improvements; debug send_event logs
- update copyright year 2023

Test plan:
PASS - AIO-DX: iso install
PASS - AIO-DX: crash the sm as indicated in bug
               and observe swact to standby
PASS - AIO-DX: manual swact
PASS - AIO-DX: power off active controller
PASS - AIO-SX: install and basic sanity check
PASS - AIO-SX: upgrade test to verify sm heartbeat
               messages changes still function when
               controllers are running different loads

Closes-Bug: 2012519

Signed-off-by: Kyale, Eliud <Eliud.Kyale@windriver.com>
Change-Id: I1f86dcb8c9d9dbaf436b9240867f61adc405e88c
This commit is contained in:
Kyale, Eliud 2023-03-22 10:06:37 -04:00 committed by Kyale, eliud
parent 002f5a79db
commit 9e2ff82411
15 changed files with 329 additions and 223 deletions

View File

@ -174,7 +174,8 @@ _sm_failover_event_mappings[SM_FAILOVER_EVENT_MAX] =
{SM_FAILOVER_EVENT_IF_STATE_CHANGED, "interface-state-changed"},
{SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, "fail-pending-timeout"},
{SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, "failed-recovery-audit"},
{SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"}
{SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"},
{SM_FAILOVER_EVENT_PEER_IS_NORMAL, "peer-is-normal"}
};
static SmValueStrMappingT

View File

@ -281,11 +281,11 @@ typedef enum
} SmServiceDomainEventDataT;
typedef enum{
SM_FAILOVER_STATE_INITIAL,
SM_FAILOVER_STATE_NORMAL,
SM_FAILOVER_STATE_FAIL_PENDING,
SM_FAILOVER_STATE_FAILED,
SM_FAILOVER_STATE_SURVIVED,
SM_FAILOVER_STATE_INITIAL = 0,
SM_FAILOVER_STATE_NORMAL = 1,
SM_FAILOVER_STATE_FAIL_PENDING = 2,
SM_FAILOVER_STATE_FAILED = 3,
SM_FAILOVER_STATE_SURVIVED = 4,
SM_FAILOVER_STATE_MAX
}SmFailoverStateT;
@ -295,6 +295,7 @@ typedef enum{
SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT,
SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT,
SM_FAILOVER_EVENT_NODE_ENABLED,
SM_FAILOVER_EVENT_PEER_IS_NORMAL,
SM_FAILOVER_EVENT_MAX
}SmFailoverEventT;
@ -706,7 +707,59 @@ typedef enum
SM_ERROR_MAX
} SmErrorT;
typedef uint32_t SmHeartbeatMsgIfStateT;
/**
* The SM_FAILOVER_STATE_MASK is used for the failover states
* lower bits 8 to 10 .
* The 'SmFailoverStateT' is stored directly in the 4 bits
* and can be accessed by right shifting >> 8
* stores upto 15 states or 0b1111 with 4 bits
*
* The SM_FAILOVER_IF_STATE_MASK is used for the state of the 4 interfaces
* If any of these interfaces is down ( assuming its configured) then the bit
* flag will be set
* - cluster_host 0b0001
* - management 0b0010
* - oam 0b0100
* - admin 0b1000
*
* SM_FAILOVER_HEARTBEAT_ALIVE & SM_FAILOVER_HELLO_MSG_ALIVE are currently only used locally
* and are not shared in the heartbeat message between controllers
*
* SM_FAILOVER_NODE_INFO_MASK is combination mask that is used to specify the bits shared in
* the heartbeat message shared between controllers
*
* Example: 0x100 indicate all "configured" interfaces are up and failover state is 'normal'
* Example: 0x130 is an example of a local message
* - heartbeat and hello message alive and all "configured" interfaces up
* - failover state is normal
*/
#define SM_FAILOVER_STATE_MASK 0x00000F00 // 4 bits in use for failover states
#define SM_FAILOVER_IF_STATE_MASK 0x0000000F // 4 bits in use for interface states
#define SM_FAILOVER_NODE_INFO_MASK (SM_FAILOVER_STATE_MASK | SM_FAILOVER_IF_STATE_MASK)
/* Extract the failover state from the node info flags */
#define SM_FAILOVER_EXTRACT_STATE(x) (SmFailoverStateT)((x & SM_FAILOVER_STATE_MASK) >> 8)
typedef enum
{
// --- interface status bits --
SM_FAILOVER_CLUSTER_HOST_DOWN = (0x1 << 0), // 1
SM_FAILOVER_MGMT_DOWN = (0x1 << 1), // 2
SM_FAILOVER_OAM_DOWN = (0x1 << 2), // 4
SM_FAILOVER_ADMIN_DOWN = (0x1 << 3), // 8
// ---- local status bits --
SM_FAILOVER_HEARTBEAT_ALIVE = (0x1 << 4), // 16
SM_FAILOVER_HELLO_MSG_ALIVE = (0x1 << 5), // 32
// ---- failover state bits --
SM_FAILOVER_STATE_BIT0 = (0X1 << 8),
SM_FAILOVER_STATE_BIT1 = (0X1 << 9),
SM_FAILOVER_STATE_BIT2 = (0X1 << 10),
SM_FAILOVER_STATE_BIT3 = (0X1 << 11),
// ----------------------------
SM_FAILOVER_PEER_DISABLED = (0x1 << 14) // 0x4000
} SmFailoverNodeStateBitFlagT;
typedef uint32_t SmHeartbeatMsgNodeInfoT;
typedef enum
{

View File

@ -47,6 +47,8 @@
#define SM_FAILOVER_RECOVERY_INTERVAL_IN_SEC 100
#define SM_FAILOVER_INTERFACE_STATE_REPORT_INTERVAL_MS 20000
#define SM_FAILOVER_PEER_IS_NORMAL_THRESHOLD 10
const char* RESET_PEER_NOW = "/var/run/.sm_reset_peer";
typedef enum
@ -124,8 +126,8 @@ class SmFailoverInterfaceInfo
SmTimerIdT failover_audit_timer_id;
static char _host_name[SM_NODE_NAME_MAX_CHAR];
static char _peer_name[SM_NODE_NAME_MAX_CHAR];
static SmHeartbeatMsgIfStateT _peer_if_state = 0;
static SmHeartbeatMsgIfStateT _peer_if_state_at_last_action = 0;
static SmHeartbeatMsgNodeInfoT _peer_node_info = 0;
static SmHeartbeatMsgNodeInfoT _peer_node_info_at_last_action = 0;
static bool _retry = true;
static bool _recheck = true;
@ -145,7 +147,7 @@ static SmDbHandleT* _sm_db_handle = NULL;
static SmNodeScheduleStateT _host_state;
static SmNodeScheduleStateT _host_state_at_last_action; // host state when action was taken last time
static int64_t _node_comm_state = -1;
static int _prev_if_state_flag = -1;
static int _prev_node_info_flag = -1;
time_t _last_if_state_ms = 0;
static SmNodeScheduleStateT _prev_host_state= SM_NODE_STATE_UNKNOWN;
@ -472,22 +474,38 @@ SmErrorT sm_failover_degrade_clear(SmFailoverDegradeSourceT source)
// ****************************************************************************
// ****************************************************************************
// Failover - peer interface state update
// Failover - peer node info update
// ==================
void sm_failover_if_state_update(const char node_name[], SmHeartbeatMsgIfStateT if_state)
void sm_failover_node_info_update(const char node_name[], SmHeartbeatMsgNodeInfoT node_info)
{
static uint32_t num_of_peer_is_normal_events = 0;
if( 0 == strcmp(node_name, _peer_name) )
{
mutex_holder holder(&sm_failover_mutex);
if( _peer_if_state != if_state )
SmFailoverStateT node_failover_state = SM_FAILOVER_EXTRACT_STATE(node_info);
if(node_failover_state == SM_FAILOVER_STATE_NORMAL)
{
DPRINTFI("%s I/F state changed %d => %d", node_name, _peer_if_state, if_state);
_peer_if_state = if_state;
num_of_peer_is_normal_events++;
if(num_of_peer_is_normal_events > SM_FAILOVER_PEER_IS_NORMAL_THRESHOLD)
{
SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_PEER_IS_NORMAL, NULL);
num_of_peer_is_normal_events = 0; // reset count
}
}
}else
else
{
num_of_peer_is_normal_events = 0; // reset count
}
if( _peer_node_info != node_info )
{
DPRINTFI("%s Node Info changed %#x => %#x", node_name, _peer_node_info, node_info);
_peer_node_info = node_info;
}
} else
{
DPRINTFE("If state updated by unknown host %s", node_name);
DPRINTFE("Node Info (%#x) updated by unknown host %s", node_info, node_name);
}
}
// ****************************************************************************
@ -523,25 +541,25 @@ bool is_admin_interface_configured()
// ****************************************************************************
// ****************************************************************************
// Failover - get interface state
// Failover - get host interface state flags
// ==================
int sm_failover_get_if_state()
static uint32_t sm_failover_get_host_if_state_flags()
{
SmFailoverInterfaceStateT mgmt_state = _mgmt_interface_info->get_state();
SmFailoverInterfaceStateT oam_state = _oam_interface_info->get_state();
SmFailoverInterfaceStateT cluster_host_state;
SmFailoverInterfaceStateT admin_state;
int if_state_flag = 0;
uint32_t if_state_flags = 0;
if ( is_cluster_host_interface_configured() )
{
cluster_host_state = _cluster_host_interface_info->get_state();
if( SM_FAILOVER_INTERFACE_OK == cluster_host_state )
{
if_state_flag |= SM_FAILOVER_HEARTBEAT_ALIVE;
if_state_flags |= SM_FAILOVER_HEARTBEAT_ALIVE;
}
else if ( SM_FAILOVER_INTERFACE_DOWN == cluster_host_state )
{
if_state_flag |= SM_FAILOVER_CLUSTER_HOST_DOWN;
if_state_flags |= SM_FAILOVER_CLUSTER_HOST_DOWN;
}
}
@ -550,55 +568,60 @@ int sm_failover_get_if_state()
admin_state = _admin_interface_info->get_state();
if( SM_FAILOVER_INTERFACE_OK == admin_state )
{
if_state_flag |= SM_FAILOVER_HEARTBEAT_ALIVE;
if_state_flags |= SM_FAILOVER_HEARTBEAT_ALIVE;
}
else if ( SM_FAILOVER_INTERFACE_DOWN == admin_state )
{
if_state_flag |= SM_FAILOVER_ADMIN_DOWN;
if_state_flags |= SM_FAILOVER_ADMIN_DOWN;
}
}
if( SM_FAILOVER_INTERFACE_OK == mgmt_state )
{
if_state_flag |= SM_FAILOVER_HEARTBEAT_ALIVE;
if_state_flags |= SM_FAILOVER_HEARTBEAT_ALIVE;
}
else if ( SM_FAILOVER_INTERFACE_DOWN == mgmt_state )
{
if_state_flag |= SM_FAILOVER_MGMT_DOWN;
if_state_flags |= SM_FAILOVER_MGMT_DOWN;
}
if( SM_FAILOVER_INTERFACE_OK == oam_state )
{
if_state_flag |= SM_FAILOVER_HEARTBEAT_ALIVE;
if_state_flags |= SM_FAILOVER_HEARTBEAT_ALIVE;
}
else if ( SM_FAILOVER_INTERFACE_DOWN == oam_state )
{
if_state_flag |= SM_FAILOVER_OAM_DOWN;
if_state_flags |= SM_FAILOVER_OAM_DOWN;
}
return if_state_flag;
return if_state_flags;
}
// ****************************************************************************
// ****************************************************************************
// Failover - get interface state
// Failover - get host node info - bit flags
// ==================
SmHeartbeatMsgIfStateT sm_failover_if_state_get()
SmHeartbeatMsgNodeInfoT sm_failover_get_host_node_info_flags()
{
mutex_holder holder(&sm_failover_mutex);
int if_state_flag = sm_failover_get_if_state();
return (if_state_flag & 0b0111); //the lower 3 bits i/f state flag
uint32_t failover_state_bits = ((uint32_t)SmFailoverFSM::get_fsm().get_state() << 8 );
failover_state_bits &= SM_FAILOVER_STATE_MASK;
uint32_t if_state_flags = sm_failover_get_host_if_state_flags();
if_state_flags &= SM_FAILOVER_IF_STATE_MASK;
return (failover_state_bits | if_state_flags);
}
// ****************************************************************************
// ****************************************************************************
// Failover - get peer node interface state
// Failover - get peer node info - bit flags
// ==================
SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state()
SmHeartbeatMsgNodeInfoT sm_failover_get_peer_node_info_flags()
{
mutex_holder holder(&sm_failover_mutex);
return (_peer_if_state & 0b0111); //the lower 3 bits i/f state flag
return (_peer_node_info & SM_FAILOVER_NODE_INFO_MASK);
}
// ****************************************************************************
@ -1015,7 +1038,9 @@ void sm_failover_audit()
{
if ( _prev_host_state != _host_state )
{
DPRINTFD("Wait for scheduler to decided my role. host state = %d", _host_state);
DPRINTFD("Wait for scheduler to decided my role. host state = %s(%d)",
sm_node_schedule_state_str(_host_state),
_host_state);
_prev_host_state = _host_state;
}
return;
@ -1023,12 +1048,14 @@ void sm_failover_audit()
if ( _prev_host_state != _host_state )
{
DPRINTFI("host state is %d", _host_state);
DPRINTFI("host state is %s(%d)",
sm_node_schedule_state_str(_host_state),
_host_state);
_prev_host_state = _host_state;
}
int if_state_flag = sm_failover_get_if_state();
if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE)
int node_info_flags = (int)sm_failover_get_host_node_info_flags();
if(node_info_flags & SM_FAILOVER_HEARTBEAT_ALIVE)
{
_heartbeat_count ++;
}
@ -1036,12 +1063,14 @@ void sm_failover_audit()
{
_heartbeat_count = 0;
}
if( _prev_if_state_flag != if_state_flag)
if( _prev_node_info_flag != node_info_flags)
{
DPRINTFI("Interface state flag %d", if_state_flag);
DPRINTFI("Interface state flags changed : %#x -> %#x",
_prev_node_info_flag,
node_info_flags);
_last_if_state_ms = now_ms;
_prev_if_state_flag = if_state_flag;
_prev_node_info_flag = node_info_flags;
}
if(!peer_controller_enabled())
@ -1082,7 +1111,7 @@ void sm_failover_audit()
_if_state_changed = false;
}
int64_t curr_node_state = if_state_flag;
int64_t curr_node_state = node_info_flags;
if( _hello_msg_alive )
{
@ -1092,7 +1121,7 @@ void sm_failover_audit()
if( !_retry && !_recheck &&
( _node_comm_state == curr_node_state &&
_host_state == _host_state_at_last_action &&
_peer_if_state == _peer_if_state_at_last_action ))
_peer_node_info == _peer_node_info_at_last_action ))
{
return;
}
@ -1102,7 +1131,7 @@ void sm_failover_audit()
_node_comm_state = curr_node_state;
_host_state_at_last_action = _host_state;
_peer_if_state_at_last_action = _peer_if_state;
_peer_node_info_at_last_action = _peer_node_info;
_log_nodes_state();
}
@ -1195,10 +1224,14 @@ void _log_nodes_state()
sm_node_avail_status_str(peer.avail_status)
);
}
DPRINTFI("Host state %d, I/F state %d, peer I/F state %d",
SmFailoverStateT peer_failover_state = SM_FAILOVER_EXTRACT_STATE(_peer_node_info);
DPRINTFI("Host state %s(%d), I/F state %#x, peer I/F state %#x, peer failover state %s(%d)",
sm_node_schedule_state_str(_host_state),
_host_state,
_node_comm_state,
_peer_if_state
_peer_node_info,
sm_failover_state_str(peer_failover_state),
peer_failover_state
);
}
// ****************************************************************************
@ -1527,7 +1560,7 @@ void dump_interfaces_state(FILE* fp)
void dump_peer_if_state(FILE* fp)
{
fprintf(fp, " Peer Interface state: %d\n", _peer_if_state);
fprintf(fp, " Peer Node info: %#x\n", _peer_node_info);
}
void dump_failover_fsm_state(FILE* fp)
@ -1536,6 +1569,13 @@ void dump_failover_fsm_state(FILE* fp)
fprintf(fp, " Failover FSM state: %s\n", sm_failover_state_str(state));
}
void dump_peer_failover_fsm_state(FILE* fp)
{
uint32_t peer_failover_bits = _peer_node_info & SM_FAILOVER_STATE_MASK;
SmFailoverStateT state = (SmFailoverStateT)( peer_failover_bits >> 8 );
fprintf(fp, " Peer Failover FSM state: %s\n", sm_failover_state_str(state));
}
// ****************************************************************************
// Failover - dump state
// ======================
@ -1546,5 +1586,5 @@ void sm_failover_dump_state(FILE* fp)
dump_failover_fsm_state(fp);
dump_interfaces_state(fp);
dump_peer_if_state(fp);
dump_peer_failover_fsm_state(fp);
}
// ****************************************************************************

View File

@ -6,6 +6,7 @@
#ifndef __SM_FAILOVER_H__
#define __SM_FAILOVER_H__
#include <stdio.h>
#include <stdint.h>
#include "sm_types.h"
#include "sm_service_domain_interface_table.h"
#include "sm_db_nodes.h"
@ -29,16 +30,6 @@ typedef enum
SM_FAILOVER_DEGRADE_SOURCE_IF_DOWN = 2
}SmFailoverDegradeSourceT;
typedef enum
{
SM_FAILOVER_CLUSTER_HOST_DOWN = 1,
SM_FAILOVER_MGMT_DOWN = 2,
SM_FAILOVER_OAM_DOWN = 4,
SM_FAILOVER_HEARTBEAT_ALIVE = 8,
SM_FAILOVER_HELLO_MSG_ALIVE = 16,
SM_FAILOVER_ADMIN_DOWN = 32,
SM_FAILOVER_PEER_DISABLED = 0x4000,
}SmFailoverCommFaultBitFlagT;
// ****************************************************************************
// initialize mutex
@ -105,14 +96,14 @@ extern void sm_failover_interface_up( const char* const interface_name );
// ****************************************************************************
// ****************************************************************************
// Failover - interface state update for peer
extern void sm_failover_if_state_update(const char node_name[],
SmHeartbeatMsgIfStateT if_state);
// Failover - node info update for peer
extern void sm_failover_node_info_update(const char node_name[],
SmHeartbeatMsgNodeInfoT node_info);
// ****************************************************************************
// ****************************************************************************
// Failover - get local interface state
extern SmHeartbeatMsgIfStateT sm_failover_if_state_get();
// Failover - get local interface state flags
extern SmHeartbeatMsgNodeInfoT sm_failover_get_host_node_info_flags();
// ****************************************************************************
// ****************************************************************************
@ -147,9 +138,9 @@ SmFailoverInterfaceStateT sm_failover_get_interface_info(SmInterfaceTypeT interf
// ****************************************************************************
// Failover - get peer node interface state
// Failover - get peer node info flags
// ==================
SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state();
SmHeartbeatMsgNodeInfoT sm_failover_get_peer_node_info_flags();
// ****************************************************************************
@ -177,10 +168,6 @@ SmErrorT sm_failover_disable_peer();
bool sm_is_active_controller();
// ****************************************************************************
// ****************************************************************************
// Failover - get interface state
// ==================
int sm_failover_get_if_state();
// ****************************************************************************
// ****************************************************************************

View File

@ -218,7 +218,7 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
case SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT:
sm_node_utils_is_aio_duplex(&duplex);
if( duplex &&
0 == (sm_failover_get_if_state() & SM_FAILOVER_HEARTBEAT_ALIVE))
0 == (sm_failover_get_host_node_info_flags() & SM_FAILOVER_HEARTBEAT_ALIVE))
{
SmSystemModeT system_mode = sm_node_utils_get_system_mode();
SmInterfaceTypeT interfaces_to_check[3] = {SM_INTERFACE_UNKNOWN};
@ -291,6 +291,8 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
return error;
}
break;
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
break;
default:
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),

View File

@ -282,6 +282,8 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
}
break;
}
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
break;
default:
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),

View File

@ -109,7 +109,7 @@ SmErrorT SmFailoverFSM::send_event(SmFailoverEventT event, const ISmFSMEventData
DPRINTFE("Runtime error. No handler for state %d", this->get_state());
return SM_FAILED;
}
DPRINTFI("send_event %s\n", sm_failover_event_str(event));
DPRINTFD("send_event %s\n", sm_failover_event_str(event));
state_handler->event_handler(event, event_data);
return SM_OKAY;
}

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2018 Wind River Systems, Inc.
// Copyright (c) 2018-2023 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -18,6 +18,7 @@ SmErrorT SmFailoverInitialState::event_handler(SmFailoverEventT event, const ISm
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
case SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT:
case SM_FAILOVER_EVENT_NODE_ENABLED:
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
break;
default:
DPRINTFE("Runtime error, unexpected event %s, at state %s",

View File

@ -43,7 +43,9 @@ SmErrorT SmFailoverNormalState::event_handler(SmFailoverEventT event, const ISmF
}
break;
case SM_FAILOVER_EVENT_NODE_ENABLED:
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
break;
default:
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),

View File

@ -284,11 +284,11 @@ SmErrorT _get_system_status(SmSystemStatusT& sys_status, char host_name[], char
SmSystemFailoverStatus::get_status().set_heartbeat_state(sys_status.heartbeat_state);
sys_status.host_status.node_name = host_name;
sys_status.host_status.interface_state = sm_failover_if_state_get();
sys_status.host_status.node_info_flags = sm_failover_get_host_node_info_flags();
sys_status.host_status.current_schedule_state = host_state;
sys_status.peer_status.node_name = peer_name;
sys_status.peer_status.interface_state = sm_failover_get_peer_if_state();
sys_status.peer_status.node_name = peer_name;
sys_status.peer_status.node_info_flags = sm_failover_get_peer_node_info_flags();
sys_status.peer_status.current_schedule_state = sm_get_controller_state(peer_name);
return SM_OKAY;
}
@ -324,16 +324,18 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys
selection.set_peer_schedule_state(system_status.peer_status.current_schedule_state);
if(SM_HEARTBEAT_OK == system_status.heartbeat_state)
{
DPRINTFI("Heartbeat alive");
DPRINTFI("Heartbeat alive; host flags %#x, peer flags %#x",
system_status.host_status.node_info_flags,
system_status.peer_status.node_info_flags);
int host_healthy_score, peer_healthy_score;
host_healthy_score = get_node_if_healthy_score(system_status.host_status.interface_state);
peer_healthy_score = get_node_if_healthy_score(system_status.peer_status.interface_state);
host_healthy_score = get_node_if_healthy_score(system_status.host_status.node_info_flags);
peer_healthy_score = get_node_if_healthy_score(system_status.peer_status.node_info_flags);
if( peer_healthy_score < host_healthy_score )
{
//host is more healthy
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
selection.set_peer_schedule_state(SM_NODE_STATE_STANDBY);
if(system_status.peer_status.interface_state & SM_FAILOVER_MGMT_DOWN)
if(system_status.peer_status.node_info_flags & SM_FAILOVER_MGMT_DOWN)
{
DPRINTFI("Disable peer, host go active");
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
@ -343,12 +345,18 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys
//peer is more healthy
selection.set_host_schedule_state(SM_NODE_STATE_STANDBY);
selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE);
if(system_status.host_status.interface_state & SM_FAILOVER_MGMT_DOWN)
if(system_status.host_status.node_info_flags & SM_FAILOVER_MGMT_DOWN)
{
DPRINTFI("Disable host, peer go active");
selection.set_host_schedule_state(SM_NODE_STATE_FAILED);
}
} else
{
// host and peer has equal healthy scores
DPRINTFI("host and peer have the same healthy score");
}
DPRINTFI("host_healthy_score %d, peer_healthy_score %d",
host_healthy_score, peer_healthy_score);
}else
{
DPRINTFI("Loss of heartbeat ALL");

View File

@ -13,7 +13,7 @@
typedef struct
{
const char* node_name;
unsigned int interface_state;
unsigned int node_info_flags;
unsigned int heartbeat_state;
SmFailoverInterfaceStateT mgmt_state;
SmFailoverInterfaceStateT cluster_host_state;

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2018 Wind River Systems, Inc.
// Copyright (c) 2018-2023 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -9,6 +9,7 @@
#include "sm_debug.h"
#include "sm_failover_fsm.h"
#include "sm_failover_ss.h"
#include "sm_failover.h"
static void _audit_failover_state()
{
@ -44,6 +45,14 @@ SmErrorT SmFailoverSurvivedState::event_handler(SmFailoverEventT event, const IS
{
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
break;
case SM_FAILOVER_EVENT_PEER_IS_NORMAL:
DPRINTFE("Peer is in %s(%d) failover state, Host will transition to %s(%d)",
sm_failover_state_str(SM_FAILOVER_STATE_NORMAL),
SM_FAILOVER_STATE_NORMAL,
sm_failover_state_str(SM_FAILOVER_STATE_NORMAL),
SM_FAILOVER_STATE_NORMAL);
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL);
break;
case SM_FAILOVER_EVENT_NODE_ENABLED:
_audit_failover_state();
break;

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014-2017 Wind River Systems, Inc.
// Copyright (c) 2014-2023 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -55,7 +55,7 @@ typedef struct
typedef struct
{
uint16_t msg_size;
uint32_t if_state;
uint32_t node_info; /**< @see SmHeartbeatMsgNodeInfoT */
} __attribute__ ((packed)) SmHeartbeatMsgAliveRev2T;
typedef struct
@ -64,10 +64,10 @@ typedef struct
union
{
SmHeartbeatMsgAliveT alive;
SmHeartbeatMsgAliveRev2T if_state_msg;
SmHeartbeatMsgAliveT alive_msg;
SmHeartbeatMsgAliveRev2T aliveRev2_msg;
char raw_msg[SM_HEARTBEAT_MSG_MAX_SIZE-sizeof(SmHeartbeatMsgHeaderT)];
} u;
} msg;
} __attribute__ ((packed)) SmHeartbeatMsgT;
static char _tx_control_buffer[SM_HEARTBEAT_MSG_BUFFER_MAX_SIZE] __attribute__((aligned));
@ -206,10 +206,10 @@ SmErrorT sm_heartbeat_msg_send_alive( SmNetworkTypeT network_type, char node_nam
snprintf( heartbeat_msg.header.node_name,
sizeof(heartbeat_msg.header.node_name), "%s", node_name );
heartbeat_msg.u.if_state_msg.msg_size = htons((uint16_t)sizeof(SmHeartbeatMsgAliveRev2T));
SmHeartbeatMsgIfStateT if_state;
if_state = sm_failover_if_state_get();
heartbeat_msg.u.if_state_msg.if_state = htonl(if_state);
heartbeat_msg.msg.aliveRev2_msg.msg_size = htons((uint16_t)sizeof(SmHeartbeatMsgAliveRev2T));
SmHeartbeatMsgNodeInfoT node_info;
node_info = sm_failover_get_host_node_info_flags();
heartbeat_msg.msg.aliveRev2_msg.node_info = htonl(node_info);
if( SM_AUTH_TYPE_HMAC_SHA512 == auth_type )
{
@ -341,14 +341,14 @@ static void sm_heartbeat_msg_dispatch_msg( SmHeartbeatMsgT* heartbeat_msg,
}
uint16_t msg_size;
SmHeartbeatMsgIfStateT if_state = 0;
SmHeartbeatMsgNodeInfoT node_info = 0;
bool perform_if_exg = false;
if(2 <= ntohs(heartbeat_msg->header.revision))
{
msg_size = ntohs(heartbeat_msg->u.if_state_msg.msg_size);
msg_size = ntohs(heartbeat_msg->msg.aliveRev2_msg.msg_size);
if(sizeof(SmHeartbeatMsgAliveRev2T) == msg_size )
{
if_state = ntohl(heartbeat_msg->u.if_state_msg.if_state);
node_info = ntohl(heartbeat_msg->msg.aliveRev2_msg.node_info);
perform_if_exg = true;
}else
{
@ -377,14 +377,15 @@ static void sm_heartbeat_msg_dispatch_msg( SmHeartbeatMsgT* heartbeat_msg,
if(perform_if_exg)
{
if(NULL != callbacks->if_state)
if(NULL != callbacks->node_info)
{
callbacks->if_state(heartbeat_msg->header.node_name,
if_state);
callbacks->node_info(heartbeat_msg->header.node_name,
node_info);
}
else
{
DPRINTFD("No handler for if state package");
DPRINTFD("Missing callback for node info %s %#x", heartbeat_msg->header.node_name,
node_info);
}
}
}

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014-2017 Wind River Systems, Inc.
// Copyright (c) 2014-2023 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -23,14 +23,14 @@ typedef void (*SmHeartbeatMsgAliveCallbackT) (char node_name[],
SmNetworkAddressT* network_address, int network_port, int version,
int revision, char interface_name[]);
typedef void (*SmHeartbeatMsgIfStateCallbackT) (const char node_name[],
SmHeartbeatMsgIfStateT if_state);
typedef void (*SmHeartbeatMsgNodeInfoCallbackT) (const char node_name[],
SmHeartbeatMsgNodeInfoT if_state);
typedef struct
{
SmHeartbeatMsgAuthCallbackT auth;
SmHeartbeatMsgAliveCallbackT alive;
SmHeartbeatMsgIfStateCallbackT if_state;
SmHeartbeatMsgNodeInfoCallbackT node_info;
} SmHeartbeatMsgCallbacksT;
// ****************************************************************************

View File

@ -1116,12 +1116,12 @@ DONE:
// ****************************************************************************
// ****************************************************************************
// Heartbeat Thread - Receive if_state Message
// Heartbeat Thread - Receive node_info Message
// ========================================
static void sm_heartbeat_thread_receive_if_state_message( const char node_name[],
SmHeartbeatMsgIfStateT if_state)
static void sm_heartbeat_thread_receive_node_info_message( const char node_name[],
SmHeartbeatMsgNodeInfoT node_info)
{
sm_failover_if_state_update(node_name, if_state);
sm_failover_node_info_update(node_name, node_info);
}
// ****************************************************************************
@ -1158,7 +1158,7 @@ static SmErrorT sm_heartbeat_thread_initialize_thread( void )
_callbacks.auth = sm_heartbeat_thread_auth_message;
_callbacks.alive = sm_heartbeat_thread_receive_alive_message;
_callbacks.if_state = sm_heartbeat_thread_receive_if_state_message;
_callbacks.node_info = sm_heartbeat_thread_receive_node_info_message;
error = sm_heartbeat_msg_register_callbacks( &_callbacks );
if( SM_OKAY != error )