SM to monitor infra i/f and swact when needed
Individual services should not fail themselves and trigger a swact when the infra i/f goes down. SM will collect the overall system health state to schedule the services. Story: 2003577 Task: 24899 Change-Id: Ifa7453136f34768b99e2bcd741d1065e69ef452e Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
parent
cd92cda225
commit
68b5ce3835
|
@ -2,4 +2,4 @@ SRC_DIR=$PKG_BASE
|
||||||
COPY_LIST="$PKG_BASE/LICENSE"
|
COPY_LIST="$PKG_BASE/LICENSE"
|
||||||
TAR_NAME=sm
|
TAR_NAME=sm
|
||||||
VERSION=1.0.0
|
VERSION=1.0.0
|
||||||
TIS_PATCH_VER=23
|
TIS_PATCH_VER=24
|
||||||
|
|
|
@ -107,6 +107,7 @@ SRCS+=sm_swact_state.c
|
||||||
SRCS+=sm_worker_thread.cpp
|
SRCS+=sm_worker_thread.cpp
|
||||||
SRCS+=sm_task_affining_thread.c
|
SRCS+=sm_task_affining_thread.c
|
||||||
SRCS+=sm_node_swact_monitor.cpp
|
SRCS+=sm_node_swact_monitor.cpp
|
||||||
|
SRCS+=sm_failover_ss.c
|
||||||
SRCS+=sm_service_domain_interface_not_in_use_state.c
|
SRCS+=sm_service_domain_interface_not_in_use_state.c
|
||||||
SRCS+=sm_configuration_table.c
|
SRCS+=sm_configuration_table.c
|
||||||
SRCS+=sm_failover_utils.c
|
SRCS+=sm_failover_utils.c
|
||||||
|
|
|
@ -32,6 +32,7 @@
|
||||||
#include "sm_heartbeat_msg.h"
|
#include "sm_heartbeat_msg.h"
|
||||||
#include "sm_node_swact_monitor.h"
|
#include "sm_node_swact_monitor.h"
|
||||||
#include "sm_util_types.h"
|
#include "sm_util_types.h"
|
||||||
|
#include "sm_failover_ss.h"
|
||||||
#include "sm_failover_utils.h"
|
#include "sm_failover_utils.h"
|
||||||
|
|
||||||
typedef enum
|
typedef enum
|
||||||
|
@ -52,7 +53,13 @@ typedef enum
|
||||||
SM_FAILOVER_ACTION_DEGRADE = 8,
|
SM_FAILOVER_ACTION_DEGRADE = 8,
|
||||||
SM_FAILOVER_ACTION_ACTIVATE = 16,
|
SM_FAILOVER_ACTION_ACTIVATE = 16,
|
||||||
SM_FAILOVER_ACTION_FAIL_NODE = 32,
|
SM_FAILOVER_ACTION_FAIL_NODE = 32,
|
||||||
SM_FAILOVER_ACTION_UNDEFINED = 64
|
SM_FAILOVER_ACTION_UNDEFINED = 64,
|
||||||
|
//as part of the gradual delivery of enhancement with more
|
||||||
|
//complex algorithm to determine the failover survivor routine
|
||||||
|
//the SM_FAILOVER_ACTION_ROUTINE will redirect the lookup to
|
||||||
|
//new logic. Once all actions are migrated to the new logic, the
|
||||||
|
//lookup tables will be eliminated.
|
||||||
|
SM_FAILOVER_ACTION_ROUTINE = 1 << 31,
|
||||||
}SmFailoverActionT;
|
}SmFailoverActionT;
|
||||||
|
|
||||||
#define SM_FAILOVER_STATE_TRANSITION_TIME_IN_MS 2000
|
#define SM_FAILOVER_STATE_TRANSITION_TIME_IN_MS 2000
|
||||||
|
@ -188,13 +195,13 @@ SmFailoverActionPairT action_map_std_infra[16] =
|
||||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //6
|
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //6
|
||||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //7
|
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //7
|
||||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8
|
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8
|
||||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //9
|
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //9
|
||||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //10
|
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //10
|
||||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //11
|
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //11
|
||||||
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_DEGRADE}, //12
|
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_DEGRADE}, //12
|
||||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //13
|
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //13
|
||||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //14
|
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //14
|
||||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15
|
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE} //15
|
||||||
};
|
};
|
||||||
|
|
||||||
SmFailoverActionPairT action_map_std_no_infra[16] =
|
SmFailoverActionPairT action_map_std_no_infra[16] =
|
||||||
|
@ -916,6 +923,87 @@ bool this_controller_unlocked()
|
||||||
}
|
}
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
|
|
||||||
|
|
||||||
|
// ****************************************************************************
|
||||||
|
// Failover - pack schedule state into 2 bit (active/standby/failed)
|
||||||
|
// SM starts managing failover after the node is scheduled, i.e, cannot be
|
||||||
|
// in SM_NODE_STATE_UNKNOWN or SM_NODE_STATE_INIT state
|
||||||
|
// =======================
|
||||||
|
unsigned int sm_failover_pack_schedule_state(SmNodeScheduleStateT state)
|
||||||
|
{
|
||||||
|
static const unsigned int failed_state_bit_flag = 0;
|
||||||
|
static const unsigned int active_state_bit_flag = 1;
|
||||||
|
static const unsigned int standby_state_bit_flag = 1 << 1;
|
||||||
|
unsigned int res;
|
||||||
|
switch(state)
|
||||||
|
{
|
||||||
|
case SM_NODE_STATE_ACTIVE:
|
||||||
|
res = active_state_bit_flag;
|
||||||
|
break;
|
||||||
|
case SM_NODE_STATE_STANDBY:
|
||||||
|
res = standby_state_bit_flag;
|
||||||
|
break;
|
||||||
|
case SM_NODE_STATE_FAILED:
|
||||||
|
res = failed_state_bit_flag;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
res = failed_state_bit_flag;
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
// ****************************************************************************
|
||||||
|
|
||||||
|
// ****************************************************************************
|
||||||
|
// Failover - convert target scheduling state to action
|
||||||
|
// =======================
|
||||||
|
int sm_failover_get_action(const SmSystemFailoverStatus& failover_status)
|
||||||
|
{
|
||||||
|
DPRINTFI("Host to %s, Peer to %s.",
|
||||||
|
sm_node_schedule_state_str(failover_status.host_schedule_state),
|
||||||
|
sm_node_schedule_state_str(failover_status.peer_schedule_state)
|
||||||
|
);
|
||||||
|
unsigned int host_flag, peer_flag;
|
||||||
|
host_flag = sm_failover_pack_schedule_state(failover_status.host_schedule_state);
|
||||||
|
peer_flag = sm_failover_pack_schedule_state(failover_status.peer_schedule_state);
|
||||||
|
|
||||||
|
unsigned int flag = (host_flag | (peer_flag << 2));
|
||||||
|
DPRINTFI("Failover scheduling flag %d", flag);
|
||||||
|
SmFailoverActionPairT* actions;
|
||||||
|
int action;
|
||||||
|
|
||||||
|
SmFailoverActionPairT action_map[16] = {
|
||||||
|
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_FAIL_NODE}, //00 00
|
||||||
|
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_FAIL_NODE}, //01 00
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 00
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 00
|
||||||
|
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_ACTIVATE}, //00 01
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 01
|
||||||
|
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_NO_ACTION}, //10 01
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 01
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 10
|
||||||
|
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //01 10
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 10
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 10
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 11
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 11
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 11
|
||||||
|
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 11
|
||||||
|
};
|
||||||
|
|
||||||
|
bool is_active = is_active_controller();
|
||||||
|
actions = &(action_map[flag & 0xf]);
|
||||||
|
if(is_active)
|
||||||
|
{
|
||||||
|
action = actions->active_controller_action;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
action = actions->standby_controller_action;
|
||||||
|
}
|
||||||
|
return action;
|
||||||
|
}
|
||||||
|
// ****************************************************************************
|
||||||
|
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
// Failover - audit
|
// Failover - audit
|
||||||
// =======================
|
// =======================
|
||||||
|
@ -1072,6 +1160,36 @@ void sm_failover_audit()
|
||||||
_log_nodes_state(action);
|
_log_nodes_state(action);
|
||||||
|
|
||||||
DPRINTFI("Action to take %d", action);
|
DPRINTFI("Action to take %d", action);
|
||||||
|
|
||||||
|
if (action & SM_FAILOVER_ACTION_ROUTINE)
|
||||||
|
{
|
||||||
|
SmSystemStatusT sys_status;
|
||||||
|
SmSystemFailoverStatus failover_status;
|
||||||
|
sys_status.system_mode = _system_mode;
|
||||||
|
if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE)
|
||||||
|
{
|
||||||
|
sys_status.heartbeat_state = SM_HEARTBEAT_OK;
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
sys_status.heartbeat_state = SM_HEARTBEAT_LOSS;
|
||||||
|
}
|
||||||
|
|
||||||
|
sys_status.host_status.node_name = _host_name;
|
||||||
|
sys_status.host_status.interface_state = if_state_flag & 0x7;
|
||||||
|
sys_status.host_status.current_schedule_state = _host_state;
|
||||||
|
sys_status.peer_status.node_name = _peer_name;
|
||||||
|
sys_status.peer_status.interface_state = _peer_if_state & 0x7;
|
||||||
|
sys_status.peer_status.current_schedule_state = sm_get_controller_state(_peer_name);
|
||||||
|
SmErrorT error = sm_failover_ss_get_survivor(sys_status, failover_status);
|
||||||
|
if(SM_OKAY != error)
|
||||||
|
{
|
||||||
|
DPRINTFE("Failed to determine failover state. ");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
action = sm_failover_get_action(failover_status);
|
||||||
|
}
|
||||||
|
|
||||||
if (action & SM_FAILOVER_ACTION_ACTIVATE)
|
if (action & SM_FAILOVER_ACTION_ACTIVATE)
|
||||||
{
|
{
|
||||||
DPRINTFI("ACTIVE");
|
DPRINTFI("ACTIVE");
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
//
|
||||||
|
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||||
|
//
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "sm_failover_ss.h"
|
||||||
|
#include "sm_debug.h"
|
||||||
|
|
||||||
|
|
||||||
|
// Bit flags marking which communication interface of a node is down.
typedef enum
{
    SM_FAILOVER_INFRA_DOWN = 1,
    SM_FAILOVER_MGMT_DOWN = 2,
    SM_FAILOVER_OAM_DOWN = 4,
}SmFailoverCommFaultBitFlagT;

// ****************************************************************************
// sm_failover_ss get_node_if_healthy_score - get node interface healthy score
//
// interface_state is a bitwise OR of SmFailoverCommFaultBitFlagT values.
// Each interface that is down lowers the score by its own weight:
//   oam -1, infra -2, mgmt -4
// Returns 0 when no interface is down and -7 when all three are down; a
// higher (less negative) score means a healthier node.
// ===================
static int get_node_if_healthy_score(unsigned int interface_state)
{
    int healthy_score = 0;

    if(interface_state & SM_FAILOVER_OAM_DOWN)
    {
        healthy_score -= 1;
    }

    if(interface_state & SM_FAILOVER_INFRA_DOWN)
    {
        healthy_score -= 2;
    }

    // The mgmt fault must be tested here. Testing the infra bit a second
    // time would ignore mgmt faults and count an infra fault as -6.
    if(interface_state & SM_FAILOVER_MGMT_DOWN)
    {
        healthy_score -= 4;
    }

    return healthy_score;
}
// ****************************************************************************
|
||||||
|
|
||||||
|
// ****************************************************************************
|
||||||
|
// sm_failover_ss_get_survivor - select the failover survivor
|
||||||
|
// This is the main entry/container for the failover logic to determine how
|
||||||
|
// to schedule the controllers, i.e, active/standby or active/failure.
|
||||||
|
// ===================
|
||||||
|
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection)
|
||||||
|
{
|
||||||
|
selection.host_schedule_state = system_status.host_status.current_schedule_state;
|
||||||
|
selection.peer_schedule_state = system_status.peer_status.current_schedule_state;
|
||||||
|
if(SM_HEARTBEAT_OK == system_status.heartbeat_state)
|
||||||
|
{
|
||||||
|
int host_healthy_score, peer_healthy_score;
|
||||||
|
host_healthy_score = get_node_if_healthy_score(system_status.host_status.interface_state);
|
||||||
|
peer_healthy_score = get_node_if_healthy_score(system_status.peer_status.interface_state);
|
||||||
|
if( peer_healthy_score < host_healthy_score )
|
||||||
|
{
|
||||||
|
//host is more healthy
|
||||||
|
selection.host_schedule_state = SM_NODE_STATE_ACTIVE;
|
||||||
|
selection.peer_schedule_state = SM_NODE_STATE_STANDBY;
|
||||||
|
}else if(peer_healthy_score > host_healthy_score)
|
||||||
|
{
|
||||||
|
//peer is more healthy
|
||||||
|
selection.host_schedule_state = SM_NODE_STATE_STANDBY;
|
||||||
|
selection.peer_schedule_state = SM_NODE_STATE_ACTIVE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(system_status.host_status.current_schedule_state != selection.host_schedule_state ||
|
||||||
|
system_status.peer_status.current_schedule_state != selection.peer_schedule_state )
|
||||||
|
{
|
||||||
|
DPRINTFI("Uncontrolled swact starts. Host from %s to %s, Peer from %s to %s.",
|
||||||
|
sm_node_schedule_state_str(system_status.host_status.current_schedule_state),
|
||||||
|
sm_node_schedule_state_str(selection.host_schedule_state),
|
||||||
|
sm_node_schedule_state_str(system_status.peer_status.current_schedule_state),
|
||||||
|
sm_node_schedule_state_str(selection.peer_schedule_state)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return SM_OKAY;
|
||||||
|
}
|
||||||
|
// ****************************************************************************
|
|
@ -0,0 +1,57 @@
|
||||||
|
//
|
||||||
|
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||||
|
//
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef __SM_FAILOVER_SS_H__
|
||||||
|
#define __SM_FAILOVER_SS_H__
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "sm_types.h"
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Status of one controller node, used as input to
// sm_failover_ss_get_survivor.
typedef struct
|
||||||
|
{
|
||||||
|
// name of the node
const char* node_name;
|
||||||
|
// interface fault bits; sm_failover_audit stores the low 3 bits of the
// interface state flag here and the healthy score is computed from them
unsigned int interface_state;
|
||||||
|
// schedule state of the node before the survivor selection runs
SmNodeScheduleStateT current_schedule_state;
|
||||||
|
}SmNodeStatusT;
|
||||||
|
|
||||||
|
// Heartbeat status handed to the survivor selection.
// sm_failover_ss_get_survivor only compares node healthy scores when the
// status is SM_HEARTBEAT_OK.
typedef enum
|
||||||
|
{
|
||||||
|
//heartbeat ok
|
||||||
|
SM_HEARTBEAT_OK,
|
||||||
|
//single node situation
|
||||||
|
SM_HEARTBEAT_NA,
|
||||||
|
//other nodes report heartbeat with peer, no direct heartbeat
|
||||||
|
SM_HEARTBEAT_INDIRECT,
|
||||||
|
//no heartbeat
|
||||||
|
SM_HEARTBEAT_LOSS
|
||||||
|
}SmHeartbeatStatusT;
|
||||||
|
|
||||||
|
// System-wide status passed to sm_failover_ss_get_survivor: both nodes plus
// the heartbeat status and the system mode.
typedef struct
|
||||||
|
{
|
||||||
|
// status of the node this code runs on
SmNodeStatusT host_status;
|
||||||
|
// status of the peer node
SmNodeStatusT peer_status;
|
||||||
|
// heartbeat status; gates the healthy score comparison
SmHeartbeatStatusT heartbeat_state;
|
||||||
|
// system mode filled in by the caller
// NOTE(review): not read by the survivor selection shown here - confirm
SmSystemModeT system_mode;
|
||||||
|
}SmSystemStatusT;
|
||||||
|
|
||||||
|
|
||||||
|
// Result of sm_failover_ss_get_survivor: the schedule state each node
// should move to.
typedef struct
|
||||||
|
{
|
||||||
|
// target schedule state of the host node
SmNodeScheduleStateT host_schedule_state;
|
||||||
|
// target schedule state of the peer node
SmNodeScheduleStateT peer_schedule_state;
|
||||||
|
}SmSystemFailoverStatus;
|
||||||
|
|
||||||
|
// ****************************************************************************
|
||||||
|
// sm_failover_ss_get_survivor - select the failover survivor
|
||||||
|
// ===================
|
||||||
|
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif // __SM_FAILOVER_SS_H__
|
Loading…
Reference in New Issue