SM to monitor infra i/f and swact when needed
Individual services should not fail themselves and trigger a swact when the infra i/f goes down. SM will collect the overall system health state to schedule the services. Story: 2003577 Task: 24899 Change-Id: Ifa7453136f34768b99e2bcd741d1065e69ef452e Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
parent
cd92cda225
commit
68b5ce3835
|
@ -2,4 +2,4 @@ SRC_DIR=$PKG_BASE
|
|||
COPY_LIST="$PKG_BASE/LICENSE"
|
||||
TAR_NAME=sm
|
||||
VERSION=1.0.0
|
||||
TIS_PATCH_VER=23
|
||||
TIS_PATCH_VER=24
|
||||
|
|
|
@ -107,6 +107,7 @@ SRCS+=sm_swact_state.c
|
|||
SRCS+=sm_worker_thread.cpp
|
||||
SRCS+=sm_task_affining_thread.c
|
||||
SRCS+=sm_node_swact_monitor.cpp
|
||||
SRCS+=sm_failover_ss.c
|
||||
SRCS+=sm_service_domain_interface_not_in_use_state.c
|
||||
SRCS+=sm_configuration_table.c
|
||||
SRCS+=sm_failover_utils.c
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
#include "sm_heartbeat_msg.h"
|
||||
#include "sm_node_swact_monitor.h"
|
||||
#include "sm_util_types.h"
|
||||
#include "sm_failover_ss.h"
|
||||
#include "sm_failover_utils.h"
|
||||
|
||||
typedef enum
|
||||
|
@ -52,7 +53,13 @@ typedef enum
|
|||
SM_FAILOVER_ACTION_DEGRADE = 8,
|
||||
SM_FAILOVER_ACTION_ACTIVATE = 16,
|
||||
SM_FAILOVER_ACTION_FAIL_NODE = 32,
|
||||
SM_FAILOVER_ACTION_UNDEFINED = 64
|
||||
SM_FAILOVER_ACTION_UNDEFINED = 64,
|
||||
//as part of the gradual delivery of enhancement with more
|
||||
//complex algorithm to determine the failover survivor routine
|
||||
//the SM_FAILOVER_ACTION_ROUTINE will redirect the lookup to
|
||||
//new logic. Once all actions are migrated to new logic, the
|
||||
//lookup tables will be eliminated.
|
||||
SM_FAILOVER_ACTION_ROUTINE = 1 << 31,
|
||||
}SmFailoverActionT;
|
||||
|
||||
#define SM_FAILOVER_STATE_TRANSITION_TIME_IN_MS 2000
|
||||
|
@ -188,13 +195,13 @@ SmFailoverActionPairT action_map_std_infra[16] =
|
|||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //6
|
||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //7
|
||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8
|
||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //9
|
||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //10
|
||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //11
|
||||
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_DEGRADE}, //12
|
||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //13
|
||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //14
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15
|
||||
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //9
|
||||
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //10
|
||||
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //11
|
||||
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_DEGRADE}, //12
|
||||
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //13
|
||||
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //14
|
||||
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE} //15
|
||||
};
|
||||
|
||||
SmFailoverActionPairT action_map_std_no_infra[16] =
|
||||
|
@ -916,6 +923,87 @@ bool this_controller_unlocked()
|
|||
}
|
||||
// ****************************************************************************
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - pack schedule state into 2 bit (active/standby/failed)
|
||||
// SM starts managing failover after the node is scheduled, i.e, cannot be
|
||||
// in SM_NODE_STATE_UNKNOWN or SM_NODE_STATE_INIT state
|
||||
// =======================
|
||||
unsigned int sm_failover_pack_schedule_state(SmNodeScheduleStateT state)
|
||||
{
|
||||
static const unsigned int failed_state_bit_flag = 0;
|
||||
static const unsigned int active_state_bit_flag = 1;
|
||||
static const unsigned int standby_state_bit_flag = 1 << 1;
|
||||
unsigned int res;
|
||||
switch(state)
|
||||
{
|
||||
case SM_NODE_STATE_ACTIVE:
|
||||
res = active_state_bit_flag;
|
||||
break;
|
||||
case SM_NODE_STATE_STANDBY:
|
||||
res = standby_state_bit_flag;
|
||||
break;
|
||||
case SM_NODE_STATE_FAILED:
|
||||
res = failed_state_bit_flag;
|
||||
break;
|
||||
default:
|
||||
res = failed_state_bit_flag;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - convert target scheduling state to action
|
||||
// =======================
|
||||
int sm_failover_get_action(const SmSystemFailoverStatus& failover_status)
|
||||
{
|
||||
DPRINTFI("Host to %s, Peer to %s.",
|
||||
sm_node_schedule_state_str(failover_status.host_schedule_state),
|
||||
sm_node_schedule_state_str(failover_status.peer_schedule_state)
|
||||
);
|
||||
unsigned int host_flag, peer_flag;
|
||||
host_flag = sm_failover_pack_schedule_state(failover_status.host_schedule_state);
|
||||
peer_flag = sm_failover_pack_schedule_state(failover_status.peer_schedule_state);
|
||||
|
||||
unsigned int flag = (host_flag | (peer_flag << 2));
|
||||
DPRINTFI("Failover scheduling flag %d", flag);
|
||||
SmFailoverActionPairT* actions;
|
||||
int action;
|
||||
|
||||
SmFailoverActionPairT action_map[16] = {
|
||||
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_FAIL_NODE}, //00 00
|
||||
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_FAIL_NODE}, //01 00
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 00
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 00
|
||||
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_ACTIVATE}, //00 01
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 01
|
||||
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_NO_ACTION}, //10 01
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 01
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 10
|
||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //01 10
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 10
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 10
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 11
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 11
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 11
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 11
|
||||
};
|
||||
|
||||
bool is_active = is_active_controller();
|
||||
actions = &(action_map[flag & 0xf]);
|
||||
if(is_active)
|
||||
{
|
||||
action = actions->active_controller_action;
|
||||
}
|
||||
else
|
||||
{
|
||||
action = actions->standby_controller_action;
|
||||
}
|
||||
return action;
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - audit
|
||||
// =======================
|
||||
|
@ -1072,6 +1160,36 @@ void sm_failover_audit()
|
|||
_log_nodes_state(action);
|
||||
|
||||
DPRINTFI("Action to take %d", action);
|
||||
|
||||
if (action & SM_FAILOVER_ACTION_ROUTINE)
|
||||
{
|
||||
SmSystemStatusT sys_status;
|
||||
SmSystemFailoverStatus failover_status;
|
||||
sys_status.system_mode = _system_mode;
|
||||
if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE)
|
||||
{
|
||||
sys_status.heartbeat_state = SM_HEARTBEAT_OK;
|
||||
}else
|
||||
{
|
||||
sys_status.heartbeat_state = SM_HEARTBEAT_LOSS;
|
||||
}
|
||||
|
||||
sys_status.host_status.node_name = _host_name;
|
||||
sys_status.host_status.interface_state = if_state_flag & 0x7;
|
||||
sys_status.host_status.current_schedule_state = _host_state;
|
||||
sys_status.peer_status.node_name = _peer_name;
|
||||
sys_status.peer_status.interface_state = _peer_if_state & 0x7;
|
||||
sys_status.peer_status.current_schedule_state = sm_get_controller_state(_peer_name);
|
||||
SmErrorT error = sm_failover_ss_get_survivor(sys_status, failover_status);
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to determine failover state. ");
|
||||
return;
|
||||
}
|
||||
|
||||
action = sm_failover_get_action(failover_status);
|
||||
}
|
||||
|
||||
if (action & SM_FAILOVER_ACTION_ACTIVATE)
|
||||
{
|
||||
DPRINTFI("ACTIVE");
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "sm_failover_ss.h"
|
||||
#include "sm_debug.h"
|
||||
|
||||
|
||||
// Bit flags describing which interfaces of a node are down.
typedef enum
{
    SM_FAILOVER_INFRA_DOWN = 1,
    SM_FAILOVER_MGMT_DOWN = 2,
    SM_FAILOVER_OAM_DOWN = 4,
}SmFailoverCommFaultBitFlagT;

// ****************************************************************************
// sm_failover_ss get_node_if_healthy_score - get node interface healthy score
// interface_state is a combination of SmFailoverCommFaultBitFlagT bits.
// Returns 0 when no interface is down; each interface that is down lowers
// the score by its weight: oam 1, infra 2, mgmt 4. The weights are distinct
// powers of two, so each combination of faults gives a unique score.
// ===================
static int get_node_if_healthy_score(unsigned int interface_state)
{
    int healthy_score = 0;
    if(interface_state & SM_FAILOVER_OAM_DOWN)
    {
        healthy_score -= 1;
    }
    if(interface_state & SM_FAILOVER_INFRA_DOWN)
    {
        healthy_score -= 2;
    }
    // mgmt carries the heaviest penalty; this check used to test the infra
    // flag a second time, so a mgmt fault was never counted.
    if(interface_state & SM_FAILOVER_MGMT_DOWN)
    {
        healthy_score -= 4;
    }

    return healthy_score;
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// sm_failover_ss_get_survivor - select the failover survivor
|
||||
// This is the main entry/container for the failover logic to determine how
|
||||
// to schedule the controllers, i.e, active/standby or active/failure.
|
||||
// ===================
|
||||
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection)
|
||||
{
|
||||
selection.host_schedule_state = system_status.host_status.current_schedule_state;
|
||||
selection.peer_schedule_state = system_status.peer_status.current_schedule_state;
|
||||
if(SM_HEARTBEAT_OK == system_status.heartbeat_state)
|
||||
{
|
||||
int host_healthy_score, peer_healthy_score;
|
||||
host_healthy_score = get_node_if_healthy_score(system_status.host_status.interface_state);
|
||||
peer_healthy_score = get_node_if_healthy_score(system_status.peer_status.interface_state);
|
||||
if( peer_healthy_score < host_healthy_score )
|
||||
{
|
||||
//host is more healthy
|
||||
selection.host_schedule_state = SM_NODE_STATE_ACTIVE;
|
||||
selection.peer_schedule_state = SM_NODE_STATE_STANDBY;
|
||||
}else if(peer_healthy_score > host_healthy_score)
|
||||
{
|
||||
//peer is more healthy
|
||||
selection.host_schedule_state = SM_NODE_STATE_STANDBY;
|
||||
selection.peer_schedule_state = SM_NODE_STATE_ACTIVE;
|
||||
}
|
||||
}
|
||||
|
||||
if(system_status.host_status.current_schedule_state != selection.host_schedule_state ||
|
||||
system_status.peer_status.current_schedule_state != selection.peer_schedule_state )
|
||||
{
|
||||
DPRINTFI("Uncontrolled swact starts. Host from %s to %s, Peer from %s to %s.",
|
||||
sm_node_schedule_state_str(system_status.host_status.current_schedule_state),
|
||||
sm_node_schedule_state_str(selection.host_schedule_state),
|
||||
sm_node_schedule_state_str(system_status.peer_status.current_schedule_state),
|
||||
sm_node_schedule_state_str(selection.peer_schedule_state)
|
||||
);
|
||||
}
|
||||
return SM_OKAY;
|
||||
}
|
||||
// ****************************************************************************
|
|
@ -0,0 +1,57 @@
|
|||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#ifndef __SM_FAILOVER_SS_H__
|
||||
#define __SM_FAILOVER_SS_H__
|
||||
#include <stdio.h>
|
||||
#include "sm_types.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Status of a single controller node as input to the survivor selection.
typedef struct
|
||||
{
|
||||
// name of the node
const char* node_name;
|
||||
// interface fault bit flags; scored by get_node_if_healthy_score
unsigned int interface_state;
|
||||
// schedule state the node is in at the time of the selection
SmNodeScheduleStateT current_schedule_state;
|
||||
}SmNodeStatusT;
|
||||
|
||||
// Heartbeat status between host and peer, as input to the survivor selection.
typedef enum
|
||||
{
|
||||
//heartbeat ok
|
||||
SM_HEARTBEAT_OK,
|
||||
//single node situation
|
||||
SM_HEARTBEAT_NA,
|
||||
//other nodes report heartbeat with peer, no direct heartbeat
|
||||
SM_HEARTBEAT_INDIRECT,
|
||||
//no heartbeat
|
||||
SM_HEARTBEAT_LOSS
|
||||
}SmHeartbeatStatusT;
|
||||
|
||||
// Snapshot of the system passed to sm_failover_ss_get_survivor.
typedef struct
|
||||
{
|
||||
// status of the node this code runs on
SmNodeStatusT host_status;
|
||||
// status of the peer node
SmNodeStatusT peer_status;
|
||||
// heartbeat status between host and peer
SmHeartbeatStatusT heartbeat_state;
|
||||
// system mode
SmSystemModeT system_mode;
|
||||
}SmSystemStatusT;
|
||||
|
||||
|
||||
// Result of sm_failover_ss_get_survivor: the schedule state selected for
// each of the two nodes.
typedef struct
|
||||
{
|
||||
// schedule state selected for the host
SmNodeScheduleStateT host_schedule_state;
|
||||
// schedule state selected for the peer
SmNodeScheduleStateT peer_schedule_state;
|
||||
}SmSystemFailoverStatus;
|
||||
|
||||
// ****************************************************************************
|
||||
// sm_failover_ss_get_survivor - select the failover survivor
|
||||
// ===================
|
||||
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // __SM_FAILOVER_SS_H__
|
Loading…
Reference in New Issue