From 68b5ce3835599197418fdf4d92fd5f9db758ad05 Mon Sep 17 00:00:00 2001 From: Bin Qian Date: Fri, 31 Aug 2018 09:36:29 -0400 Subject: [PATCH] SM to monitor infra i/f and swact when needed Individual services should not fail themselves and trigger swact when infra i/f goes down. SM will collect the overall system health state to schedule the services. Story: 2003577 Task: 24899 Change-Id: Ifa7453136f34768b99e2bcd741d1065e69ef452e Signed-off-by: Bin Qian --- service-mgmt/sm-1.0.0/centos/build_srpm.data | 2 +- service-mgmt/sm-1.0.0/src/Makefile | 1 + service-mgmt/sm-1.0.0/src/sm_failover.c | 134 +++++++++++++++++-- service-mgmt/sm-1.0.0/src/sm_failover_ss.c | 80 +++++++++++ service-mgmt/sm-1.0.0/src/sm_failover_ss.h | 57 ++++++++ 5 files changed, 265 insertions(+), 9 deletions(-) create mode 100644 service-mgmt/sm-1.0.0/src/sm_failover_ss.h diff --git a/service-mgmt/sm-1.0.0/centos/build_srpm.data b/service-mgmt/sm-1.0.0/centos/build_srpm.data index cfe7ee97..56655102 100644 --- a/service-mgmt/sm-1.0.0/centos/build_srpm.data +++ b/service-mgmt/sm-1.0.0/centos/build_srpm.data @@ -2,4 +2,4 @@ SRC_DIR=$PKG_BASE COPY_LIST="$PKG_BASE/LICENSE" TAR_NAME=sm VERSION=1.0.0 -TIS_PATCH_VER=23 +TIS_PATCH_VER=24 diff --git a/service-mgmt/sm-1.0.0/src/Makefile b/service-mgmt/sm-1.0.0/src/Makefile index fa36a522..d7ee40bb 100644 --- a/service-mgmt/sm-1.0.0/src/Makefile +++ b/service-mgmt/sm-1.0.0/src/Makefile @@ -107,6 +107,7 @@ SRCS+=sm_swact_state.c SRCS+=sm_worker_thread.cpp SRCS+=sm_task_affining_thread.c SRCS+=sm_node_swact_monitor.cpp +SRCS+=sm_failover_ss.c SRCS+=sm_service_domain_interface_not_in_use_state.c SRCS+=sm_configuration_table.c SRCS+=sm_failover_utils.c diff --git a/service-mgmt/sm-1.0.0/src/sm_failover.c b/service-mgmt/sm-1.0.0/src/sm_failover.c index 79a08951..e5b07165 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover.c +++ b/service-mgmt/sm-1.0.0/src/sm_failover.c @@ -32,6 +32,7 @@ #include "sm_heartbeat_msg.h" #include "sm_node_swact_monitor.h" #include
"sm_util_types.h" +#include "sm_failover_ss.h" #include "sm_failover_utils.h" typedef enum @@ -52,7 +53,13 @@ typedef enum SM_FAILOVER_ACTION_DEGRADE = 8, SM_FAILOVER_ACTION_ACTIVATE = 16, SM_FAILOVER_ACTION_FAIL_NODE = 32, - SM_FAILOVER_ACTION_UNDEFINED = 64 + SM_FAILOVER_ACTION_UNDEFINED = 64, + //as part of the gradual delivery of an enhancement with a more + //complex algorithm to determine the failover survivor, + //SM_FAILOVER_ACTION_ROUTINE redirects the lookup to the + //new logic. Once all actions are migrated to the new logic, the + //lookup tables will be eliminated. + SM_FAILOVER_ACTION_ROUTINE = 1 << 31, }SmFailoverActionT; #define SM_FAILOVER_STATE_TRANSITION_TIME_IN_MS 2000 @@ -188,13 +195,13 @@ SmFailoverActionPairT action_map_std_infra[16] = {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //6 {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //7 {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //9 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //10 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //11 - {SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_DEGRADE}, //12 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //13 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //14 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15 + {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //9 + {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //10 + {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //11 + {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_DEGRADE}, //12 + {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //13 + {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //14 + {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE} //15 }; SmFailoverActionPairT action_map_std_no_infra[16] = @@ -916,6 +923,87 @@ bool
this_controller_unlocked() } // **************************************************************************** + +// **************************************************************************** +// Failover - pack schedule state into 2 bit (active/standby/failed) +// SM starts managing failover after the node is scheduled, i.e, cannot be +// in SM_NODE_STATE_UNKNOWN or SM_NODE_STATE_INIT state +// ======================= +unsigned int sm_failover_pack_schedule_state(SmNodeScheduleStateT state) +{ + static const unsigned int failed_state_bit_flag = 0; + static const unsigned int active_state_bit_flag = 1; + static const unsigned int standby_state_bit_flag = 1 << 1; + unsigned int res; + switch(state) + { + case SM_NODE_STATE_ACTIVE: + res = active_state_bit_flag; + break; + case SM_NODE_STATE_STANDBY: + res = standby_state_bit_flag; + break; + case SM_NODE_STATE_FAILED: + res = failed_state_bit_flag; + break; + default: + res = failed_state_bit_flag; + } + return res; +} +// **************************************************************************** + +// **************************************************************************** +// Failover - convert target scheduling state to action +// ======================= +int sm_failover_get_action(const SmSystemFailoverStatus& failover_status) +{ + DPRINTFI("Host to %s, Peer to %s.", + sm_node_schedule_state_str(failover_status.host_schedule_state), + sm_node_schedule_state_str(failover_status.peer_schedule_state) + ); + unsigned int host_flag, peer_flag; + host_flag = sm_failover_pack_schedule_state(failover_status.host_schedule_state); + peer_flag = sm_failover_pack_schedule_state(failover_status.peer_schedule_state); + + unsigned int flag = (host_flag | (peer_flag << 2)); + DPRINTFI("Failover scheduling flag %d", flag); + SmFailoverActionPairT* actions; + int action; + + SmFailoverActionPairT action_map[16] = { + {SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_FAIL_NODE}, //00 00 + {SM_FAILOVER_ACTION_DISABLE_STANDBY, 
SM_FAILOVER_ACTION_FAIL_NODE}, //01 00 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 00 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 00 + {SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_ACTIVATE}, //00 01 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 01 + {SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_NO_ACTION}, //10 01 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 01 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 10 + {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //01 10 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 10 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 10 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 11 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 11 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 11 + {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 11 + }; + + bool is_active = is_active_controller(); + actions = &(action_map[flag & 0xf]); + if(is_active) + { + action = actions->active_controller_action; + } + else + { + action = actions->standby_controller_action; + } + return action; +} +// **************************************************************************** + // **************************************************************************** // Failover - audit // ======================= @@ -1072,6 +1160,36 @@ void sm_failover_audit() _log_nodes_state(action); DPRINTFI("Action to take %d", action); + + if (action & SM_FAILOVER_ACTION_ROUTINE) + { + SmSystemStatusT sys_status; + SmSystemFailoverStatus failover_status; + sys_status.system_mode = _system_mode; + if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE) + { + sys_status.heartbeat_state = SM_HEARTBEAT_OK; + }else + { + sys_status.heartbeat_state = SM_HEARTBEAT_LOSS; + } + + sys_status.host_status.node_name = _host_name; + 
sys_status.host_status.interface_state = if_state_flag & 0x7; + sys_status.host_status.current_schedule_state = _host_state; + sys_status.peer_status.node_name = _peer_name; + sys_status.peer_status.interface_state = _peer_if_state & 0x7; + sys_status.peer_status.current_schedule_state = sm_get_controller_state(_peer_name); + SmErrorT error = sm_failover_ss_get_survivor(sys_status, failover_status); + if(SM_OKAY != error) + { + DPRINTFE("Failed to determine failover state. "); + return; + } + + action = sm_failover_get_action(failover_status); + } + if (action & SM_FAILOVER_ACTION_ACTIVATE) { DPRINTFI("ACTIVE"); diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_ss.c b/service-mgmt/sm-1.0.0/src/sm_failover_ss.c index e69de29b..e1323cba 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_ss.c +++ b/service-mgmt/sm-1.0.0/src/sm_failover_ss.c @@ -0,0 +1,80 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include "sm_failover_ss.h" +#include "sm_debug.h" + + +typedef enum +{ + SM_FAILOVER_INFRA_DOWN = 1, + SM_FAILOVER_MGMT_DOWN = 2, + SM_FAILOVER_OAM_DOWN = 4, +}SmFailoverCommFaultBitFlagT; + +// **************************************************************************** +// sm_failover_ss get_node_if_healthy_score - get node interface healthy score: 0 when no fault bit is set, minus 1/2/4 for OAM/INFRA/MGMT down (higher is healthier) +// =================== +static int get_node_if_healthy_score(unsigned int interface_state) +{ + int healthy_score = 0; + if(interface_state & SM_FAILOVER_OAM_DOWN) + { + healthy_score -= 1; + } + if(interface_state & SM_FAILOVER_INFRA_DOWN) + { + healthy_score -= 2; + } + if(interface_state & SM_FAILOVER_MGMT_DOWN) + { + healthy_score -= 4; + } + + return healthy_score; +} +// **************************************************************************** + +// **************************************************************************** +// sm_failover_ss_get_survivor - select the failover survivor +// This is the main entry/container for the failover logic to
determine how +// to schedule the controllers, i.e, active/standby or active/failure. +// =================== +SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection) +{ + selection.host_schedule_state = system_status.host_status.current_schedule_state; + selection.peer_schedule_state = system_status.peer_status.current_schedule_state; + if(SM_HEARTBEAT_OK == system_status.heartbeat_state) + { + int host_healthy_score, peer_healthy_score; + host_healthy_score = get_node_if_healthy_score(system_status.host_status.interface_state); + peer_healthy_score = get_node_if_healthy_score(system_status.peer_status.interface_state); + if( peer_healthy_score < host_healthy_score ) + { + //host is more healthy + selection.host_schedule_state = SM_NODE_STATE_ACTIVE; + selection.peer_schedule_state = SM_NODE_STATE_STANDBY; + }else if(peer_healthy_score > host_healthy_score) + { + //peer is more healthy + selection.host_schedule_state = SM_NODE_STATE_STANDBY; + selection.peer_schedule_state = SM_NODE_STATE_ACTIVE; + } + } + + if(system_status.host_status.current_schedule_state != selection.host_schedule_state || + system_status.peer_status.current_schedule_state != selection.peer_schedule_state ) + { + DPRINTFI("Uncontrolled swact starts. 
Host from %s to %s, Peer from %s to %s.", + sm_node_schedule_state_str(system_status.host_status.current_schedule_state), + sm_node_schedule_state_str(selection.host_schedule_state), + sm_node_schedule_state_str(system_status.peer_status.current_schedule_state), + sm_node_schedule_state_str(selection.peer_schedule_state) + ); + } + return SM_OKAY; +} +// **************************************************************************** \ No newline at end of file diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_ss.h b/service-mgmt/sm-1.0.0/src/sm_failover_ss.h new file mode 100644 index 00000000..ebc9c3ff --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_ss.h @@ -0,0 +1,57 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef __SM_FAILOVER_SS_H__ +#define __SM_FAILOVER_SS_H__ +#include +#include "sm_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct +{ + const char* node_name; + unsigned int interface_state; + SmNodeScheduleStateT current_schedule_state; +}SmNodeStatusT; + +typedef enum +{ + //heartbeat ok + SM_HEARTBEAT_OK, + //single node situation + SM_HEARTBEAT_NA, + //other nodes report heartbeat with peer, no direct heartbeat + SM_HEARTBEAT_INDIRECT, + //no heartbeat + SM_HEARTBEAT_LOSS +}SmHeartbeatStatusT; + +typedef struct +{ + SmNodeStatusT host_status; + SmNodeStatusT peer_status; + SmHeartbeatStatusT heartbeat_state; + SmSystemModeT system_mode; +}SmSystemStatusT; + + +typedef struct +{ + SmNodeScheduleStateT host_schedule_state; + SmNodeScheduleStateT peer_schedule_state; +}SmSystemFailoverStatus; + +// **************************************************************************** +// sm_failover_ss_get_survivor - select the failover survivor +// =================== +SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection); + +#ifdef __cplusplus +} +#endif +#endif // __SM_FAILOVER_SS_H__