SM to monitor infra i/f and swact when needed

Individual services should not fail themselves and trigger a swact when the infra i/f goes down.
SM will collect the overall system health state to schedule the services.

Story: 2003577
Task: 24899

Change-Id: Ifa7453136f34768b99e2bcd741d1065e69ef452e
Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
Bin Qian 2018-08-31 09:36:29 -04:00 committed by Dean Troyer
parent cd92cda225
commit 68b5ce3835
5 changed files with 265 additions and 9 deletions

View File

@ -2,4 +2,4 @@ SRC_DIR=$PKG_BASE
COPY_LIST="$PKG_BASE/LICENSE"
TAR_NAME=sm
VERSION=1.0.0
TIS_PATCH_VER=23
TIS_PATCH_VER=24

View File

@ -107,6 +107,7 @@ SRCS+=sm_swact_state.c
SRCS+=sm_worker_thread.cpp
SRCS+=sm_task_affining_thread.c
SRCS+=sm_node_swact_monitor.cpp
SRCS+=sm_failover_ss.c
SRCS+=sm_service_domain_interface_not_in_use_state.c
SRCS+=sm_configuration_table.c
SRCS+=sm_failover_utils.c

View File

@ -32,6 +32,7 @@
#include "sm_heartbeat_msg.h"
#include "sm_node_swact_monitor.h"
#include "sm_util_types.h"
#include "sm_failover_ss.h"
#include "sm_failover_utils.h"
typedef enum
@ -52,7 +53,13 @@ typedef enum
SM_FAILOVER_ACTION_DEGRADE = 8,
SM_FAILOVER_ACTION_ACTIVATE = 16,
SM_FAILOVER_ACTION_FAIL_NODE = 32,
SM_FAILOVER_ACTION_UNDEFINED = 64
SM_FAILOVER_ACTION_UNDEFINED = 64,
//As part of the gradual delivery of an enhancement that uses a more
//complex algorithm to determine the failover survivor,
//SM_FAILOVER_ACTION_ROUTINE redirects the lookup to the
//new logic. Once all actions are migrated to the new logic, the
//lookup tables will be eliminated.
SM_FAILOVER_ACTION_ROUTINE = 1 << 31,
}SmFailoverActionT;
#define SM_FAILOVER_STATE_TRANSITION_TIME_IN_MS 2000
@ -188,13 +195,13 @@ SmFailoverActionPairT action_map_std_infra[16] =
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //6
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //7
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //9
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //10
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //11
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_DEGRADE}, //12
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //13
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //14
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //9
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //10
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //11
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_DEGRADE}, //12
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //13
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //14
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE} //15
};
SmFailoverActionPairT action_map_std_no_infra[16] =
@ -916,6 +923,87 @@ bool this_controller_unlocked()
}
// ****************************************************************************
// ****************************************************************************
// Failover - pack schedule state into 2 bit (active/standby/failed)
// SM starts managing failover after the node is scheduled, i.e, cannot be
// in SM_NODE_STATE_UNKNOWN or SM_NODE_STATE_INIT state
// =======================
unsigned int sm_failover_pack_schedule_state(SmNodeScheduleStateT state)
{
    // Two-bit encoding of a node schedule state:
    //   bit 0 set -> active, bit 1 set -> standby, no bit set -> failed.
    // Every state other than active or standby is packed as failed.
    const unsigned int packed_failed  = 0;
    const unsigned int packed_active  = 1;
    const unsigned int packed_standby = 1 << 1;

    if(SM_NODE_STATE_ACTIVE == state)
    {
        return packed_active;
    }

    if(SM_NODE_STATE_STANDBY == state)
    {
        return packed_standby;
    }

    // SM_NODE_STATE_FAILED and anything else
    return packed_failed;
}
// ****************************************************************************
// ****************************************************************************
// Failover - convert target scheduling state to action
// =======================
int sm_failover_get_action(const SmSystemFailoverStatus& failover_status)
{
DPRINTFI("Host to %s, Peer to %s.",
sm_node_schedule_state_str(failover_status.host_schedule_state),
sm_node_schedule_state_str(failover_status.peer_schedule_state)
);
unsigned int host_flag, peer_flag;
host_flag = sm_failover_pack_schedule_state(failover_status.host_schedule_state);
peer_flag = sm_failover_pack_schedule_state(failover_status.peer_schedule_state);
unsigned int flag = (host_flag | (peer_flag << 2));
DPRINTFI("Failover scheduling flag %d", flag);
SmFailoverActionPairT* actions;
int action;
SmFailoverActionPairT action_map[16] = {
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_FAIL_NODE}, //00 00
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_FAIL_NODE}, //01 00
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 00
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 00
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_ACTIVATE}, //00 01
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 01
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_NO_ACTION}, //10 01
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 01
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 10
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //01 10
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 10
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 10
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 11
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 11
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 11
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 11
};
bool is_active = is_active_controller();
actions = &(action_map[flag & 0xf]);
if(is_active)
{
action = actions->active_controller_action;
}
else
{
action = actions->standby_controller_action;
}
return action;
}
// ****************************************************************************
// ****************************************************************************
// Failover - audit
// =======================
@ -1072,6 +1160,36 @@ void sm_failover_audit()
_log_nodes_state(action);
DPRINTFI("Action to take %d", action);
if (action & SM_FAILOVER_ACTION_ROUTINE)
{
SmSystemStatusT sys_status;
SmSystemFailoverStatus failover_status;
sys_status.system_mode = _system_mode;
if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE)
{
sys_status.heartbeat_state = SM_HEARTBEAT_OK;
}else
{
sys_status.heartbeat_state = SM_HEARTBEAT_LOSS;
}
sys_status.host_status.node_name = _host_name;
sys_status.host_status.interface_state = if_state_flag & 0x7;
sys_status.host_status.current_schedule_state = _host_state;
sys_status.peer_status.node_name = _peer_name;
sys_status.peer_status.interface_state = _peer_if_state & 0x7;
sys_status.peer_status.current_schedule_state = sm_get_controller_state(_peer_name);
SmErrorT error = sm_failover_ss_get_survivor(sys_status, failover_status);
if(SM_OKAY != error)
{
DPRINTFE("Failed to determine failover state. ");
return;
}
action = sm_failover_get_action(failover_status);
}
if (action & SM_FAILOVER_ACTION_ACTIVATE)
{
DPRINTFI("ACTIVE");

View File

@ -0,0 +1,80 @@
//
// Copyright (c) 2018 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#include "sm_failover_ss.h"
#include "sm_debug.h"
// Bit flags describing which communication interfaces of a node are down.
typedef enum
{
    SM_FAILOVER_INFRA_DOWN = 1,
    SM_FAILOVER_MGMT_DOWN = 2,
    SM_FAILOVER_OAM_DOWN = 4,
}SmFailoverCommFaultBitFlagT;
// ****************************************************************************
// sm_failover_ss get_node_if_healthy_score - get node interface healthy score
//
// interface_state: bitwise OR of SmFailoverCommFaultBitFlagT values, one bit
//                  per interface that is down.
// returns: 0 when no interface is down; otherwise a negative score that is
//          the sum of the penalties below (OAM -1, infra -2, mgmt -4).
//          A higher (closer to 0) score means a healthier node.
// ===================
static int get_node_if_healthy_score(unsigned int interface_state)
{
    int healthy_score = 0;
    if(interface_state & SM_FAILOVER_OAM_DOWN)
    {
        healthy_score -= 1;
    }
    if(interface_state & SM_FAILOVER_INFRA_DOWN)
    {
        healthy_score -= 2;
    }
    // The management interface carries the largest penalty. This check
    // previously tested SM_FAILOVER_INFRA_DOWN a second time, so a
    // management fault was never scored and an infra fault counted as -6.
    if(interface_state & SM_FAILOVER_MGMT_DOWN)
    {
        healthy_score -= 4;
    }
    return healthy_score;
}
// ****************************************************************************
// ****************************************************************************
// sm_failover_ss_get_survivor - select the failover survivor
// This is the main entry/container for the failover logic to determine how
// to schedule the controllers, i.e, active/standby or active/failure.
// ===================
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection)
{
    const SmNodeStatusT& host = system_status.host_status;
    const SmNodeStatusT& peer = system_status.peer_status;

    // Start from the current scheduling; it is kept unless the interface
    // health comparison below picks a different arrangement.
    selection.host_schedule_state = host.current_schedule_state;
    selection.peer_schedule_state = peer.current_schedule_state;

    // Interface health is only compared while the heartbeat is OK.
    if(SM_HEARTBEAT_OK == system_status.heartbeat_state)
    {
        const int host_score = get_node_if_healthy_score(host.interface_state);
        const int peer_score = get_node_if_healthy_score(peer.interface_state);

        if(host_score > peer_score)
        {
            //host is more healthy
            selection.host_schedule_state = SM_NODE_STATE_ACTIVE;
            selection.peer_schedule_state = SM_NODE_STATE_STANDBY;
        }
        else if(host_score < peer_score)
        {
            //peer is more healthy
            selection.host_schedule_state = SM_NODE_STATE_STANDBY;
            selection.peer_schedule_state = SM_NODE_STATE_ACTIVE;
        }
        // equal scores: leave the current scheduling untouched
    }

    const bool host_changed =
        (host.current_schedule_state != selection.host_schedule_state);
    const bool peer_changed =
        (peer.current_schedule_state != selection.peer_schedule_state);

    // Log only when the selection differs from the current scheduling.
    if(host_changed || peer_changed)
    {
        DPRINTFI("Uncontrolled swact starts. Host from %s to %s, Peer from %s to %s.",
                 sm_node_schedule_state_str(host.current_schedule_state),
                 sm_node_schedule_state_str(selection.host_schedule_state),
                 sm_node_schedule_state_str(peer.current_schedule_state),
                 sm_node_schedule_state_str(selection.peer_schedule_state)
        );
    }

    return SM_OKAY;
}
// ****************************************************************************

View File

@ -0,0 +1,57 @@
//
// Copyright (c) 2018 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#ifndef __SM_FAILOVER_SS_H__
#define __SM_FAILOVER_SS_H__
#include <stdio.h>
#include "sm_types.h"
#ifdef __cplusplus
extern "C" {
#endif
// Status of a single controller node, used as input to the failover
// survivor selection.
typedef struct
{
// name of the node
const char* node_name;
// bit flags of the node's interfaces that are down; the selection logic
// tests this value against its infra/mgmt/oam "down" flags
unsigned int interface_state;
// schedule state the node is currently in
SmNodeScheduleStateT current_schedule_state;
}SmNodeStatusT;
// Heartbeat status between this node and its peer.
// sm_failover_ss_get_survivor only compares interface health when the
// status is SM_HEARTBEAT_OK.
typedef enum
{
//heartbeat ok
SM_HEARTBEAT_OK,
//single node situation
SM_HEARTBEAT_NA,
//other nodes report heartbeat with peer, no direct heartbeat
SM_HEARTBEAT_INDIRECT,
//no heartbeat
SM_HEARTBEAT_LOSS
}SmHeartbeatStatusT;
// Overall system status passed to sm_failover_ss_get_survivor.
typedef struct
{
// status of the node this code runs on
SmNodeStatusT host_status;
// status of the peer node
SmNodeStatusT peer_status;
// heartbeat status between host and peer
SmHeartbeatStatusT heartbeat_state;
// system mode (type declared elsewhere; not read by the survivor
// selection shown with this header)
SmSystemModeT system_mode;
}SmSystemStatusT;
// Result of the survivor selection: the schedule state each node should
// be moved to.
typedef struct
{
// target schedule state of the host node
SmNodeScheduleStateT host_schedule_state;
// target schedule state of the peer node
SmNodeScheduleStateT peer_schedule_state;
}SmSystemFailoverStatus;
// ****************************************************************************
// sm_failover_ss_get_survivor - select the failover survivor
//
// system_status: current status of host and peer plus the heartbeat status.
// selection:     output; the schedule state each node should be moved to.
//                It starts as the current schedule states and is changed
//                only when the heartbeat is SM_HEARTBEAT_OK and the two
//                nodes' interface health scores differ, in which case the
//                healthier node becomes active and the other standby.
// returns:       SM_OKAY (the accompanying implementation has no failure
//                path).
// ===================
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection);
#ifdef __cplusplus
}
#endif
#endif // __SM_FAILOVER_SS_H__