Merge "split-brain avoidance improvement"

This commit is contained in:
Zuul 2018-11-09 14:48:57 +00:00 committed by Gerrit Code Review
commit b1aad31b96
28 changed files with 1031 additions and 830 deletions

View File

@ -102,12 +102,12 @@ SRCS+=sm_service_heartbeat_thread.c
SRCS+=sm_main_event_handler.c
SRCS+=fm_api_wrapper.c
SRCS+=sm_failover.c
SRCS+=sm_failover_thread.c
SRCS+=sm_swact_state.c
SRCS+=sm_worker_thread.cpp
SRCS+=sm_task_affining_thread.c
SRCS+=sm_node_swact_monitor.cpp
SRCS+=sm_failover_fsm.cpp
SRCS+=sm_failover_initial_state.cpp
SRCS+=sm_failover_normal_state.cpp
SRCS+=sm_failover_fail_pending_state.cpp
SRCS+=sm_failover_failed_state.cpp

View File

@ -37,24 +37,6 @@
#include "sm_failover_fsm.h"
#include "sm_api.h"
typedef enum
{
SM_FAILOVER_ACTION_NO_ACTION = 0,
SM_FAILOVER_ACTION_DISABLE_STANDBY = 1,
SM_FAILOVER_ACTION_DISABLE = 2,
SM_FAILOVER_ACTION_SWACT = 4,
SM_FAILOVER_ACTION_DEGRADE = 8,
SM_FAILOVER_ACTION_ACTIVATE = 16,
SM_FAILOVER_ACTION_FAIL_NODE = 32,
SM_FAILOVER_ACTION_UNDEFINED = 64,
//as part of the gradual delivery of enhancement with more
//complex algorithm to determine the failover survivor routine
//the SM_FAILOVER_ACTION_ROUTINE will redirect the lookup to
//new logic. Until all actions are migrated to new logic, the
//lookup tables will be eliminated.
SM_FAILOVER_ACTION_ROUTINE = 1 << 31,
}SmFailoverActionT;
#define SM_FAILOVER_STATE_TRANSITION_TIME_IN_MS 2000
#define SM_FAILOVER_MULTI_FAILURE_WAIT_TIMER_IN_MS 2000
#define SM_FAILOVER_RECOVERY_INTERVAL_IN_SEC 100
@ -132,16 +114,6 @@ class SmFailoverInterfaceInfo
}
};
typedef enum
{
SM_FAILOVER_INFRA_DOWN = 1,
SM_FAILOVER_MGMT_DOWN = 2,
SM_FAILOVER_OAM_DOWN = 4,
SM_FAILOVER_HEARTBEAT_ALIVE = 8,
SM_FAILOVER_HELLO_MSG_ALIVE = 16,
SM_FAILOVER_PEER_DISABLED = 0x4000,
}SmFailoverCommFaultBitFlagT;
SmTimerIdT failover_audit_timer_id;
static char _host_name[SM_NODE_NAME_MAX_CHAR];
static char _peer_name[SM_NODE_NAME_MAX_CHAR];
@ -159,13 +131,12 @@ static SmFailoverInterfaceInfo* _oam_interface_info = NULL;
static SmFailoverInterfaceInfo* _mgmt_interface_info = NULL;
static SmFailoverInterfaceInfo* _infra_interface_info = NULL;
static SmFailoverInterfaceInfo _peer_if_list[SM_INTERFACE_MAX];
static pthread_mutex_t _mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t _mutex;
static SmDbHandleT* _sm_db_handle = NULL;
static SmNodeScheduleStateT _host_state;
static SmNodeScheduleStateT _host_state_at_last_action; // host state when action was taken last time
static int64_t _node_comm_state = -1;
static bool _to_disable_peer = false; //set flag for failover thread to send http request to mtce to disable peer
static int _prev_if_state_flag = -1;
time_t _last_if_state_ms = 0;
static SmNodeScheduleStateT _prev_host_state= SM_NODE_STATE_UNKNOWN;
@ -175,194 +146,12 @@ static SmSystemModeT _system_mode;
static time_t _last_report_ts = 0;
static int _heartbeat_count = 0;
// The class is to accommodate batching i/f state changes
// during the failover_audit interval.
class FailoverEventHolder
{
public:
FailoverEventHolder();
void push_event(SmFailoverEventT, const ISmFSMEventData* event_data);
void send_event();
private:
bool _event_to_send;
SmFailoverEventT _failover_event;
const ISmFSMEventData* _event_data;
};
FailoverEventHolder::FailoverEventHolder()
{
_event_to_send = false;
}
void FailoverEventHolder::push_event(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
SmFailoverEventT list[] = {
SM_INTERFACE_DOWN,
SM_HEARTBEAT_LOST,
SM_INTERFACE_UP,
SM_HEARTBEAT_RECOVER
};
unsigned int i;
if(!_event_to_send)
{
_event_to_send = true;
for(i = 0; i < sizeof(list) / sizeof(SmFailoverEventT); i ++)
{
if(event == list[i])
{
_failover_event = event;
_event_data = event_data;
_event_to_send = true;
DPRINTFI("Push event %d", event);
return;
}
}
DPRINTFE("Runtime error: unsupported event %d", event);
return;
}
for(i = 0; list[i] != _failover_event && i < sizeof(list) / sizeof(SmFailoverEventT); i ++)
{
if(list[i] == event)
{
_failover_event = event;
_event_data = event_data;
return;
}
}
DPRINTFE("Runtime error: unsupported event %d", event);
}
void FailoverEventHolder::send_event()
{
if(_event_to_send)
{
DPRINTFI("Send event %d", _failover_event);
SmFailoverFSM::get_fsm().send_event(_failover_event, _event_data);
_event_to_send = false;
}
}
static FailoverEventHolder _event_holder;
// std 2+2/2+2+x failover actions
SmFailoverActionPairT action_map_std_infra[16] =
{
//Active action standby action if_state
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_ACTIVATE}, //0
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //1
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //2
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //3
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_ACTIVATE}, //4
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //5
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //6
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //7
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //9
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //10
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //11
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_DEGRADE}, //12
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //13
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //14
{SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE} //15
};
SmFailoverActionPairT action_map_std_no_infra[16] =
{
//Active action standby action if_state
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_ACTIVATE}, //0
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //1
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //2
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //3
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_ACTIVATE}, //4
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //5
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //6
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //7
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //9
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //10
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_NO_ACTION}, //12
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //13
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //14
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15
};
// cpe duplex failover actions
SmFailoverActionPairT *action_map_cpe_infra = action_map_std_infra;
SmFailoverActionPairT *action_map_cpe_no_infra = action_map_std_no_infra;
// cpe duplex-direct failover actions
SmFailoverActionPairT action_map_cpe_dc_infra[16] =
{
//Active action standby action if_state
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //0
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //1
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //2
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //3
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //4
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //5
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //6
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_FAIL_NODE}, //7
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //9
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //10
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //11
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_DEGRADE}, //12
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //13
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //14
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15
};
SmFailoverActionPairT action_map_cpe_dc_no_infra[16] =
{
//Active action standby action if_state
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //0
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //1
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //2
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //3
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //4
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //5
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_ACTIVATE}, //6
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //7
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //9
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //10
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_DEGRADE}, //12
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //13
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //14
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15
};
SmFailoverActionPairT action_map_simplex[16] =
{
//Active action standby action
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION},
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}
};
SmFailoverActionPairT *action_maps_no_infra[SM_SYSTEM_MODE_MAX] = {0};
SmFailoverActionPairT *action_maps_infra[SM_SYSTEM_MODE_MAX] = {0};
static bool _if_state_changed = false;
SmErrorT sm_exec_json_command(const char* cmd, char result_buf[], int result_len);
SmErrorT sm_failover_get_node_oper_state(char* node_name, SmNodeOperationalStateT *state);
void _log_nodes_state(int action);
void _log_nodes_state();
// ****************************************************************************
// Failover - interface check
@ -501,7 +290,7 @@ void sm_failover_lost_heartbeat( SmFailoverInterfaceT* interface )
DPRINTFI("Interface %s lose heartbeat.", interface->interface_name);
}
_event_holder.push_event(SM_HEARTBEAT_LOST, NULL);
_if_state_changed = true;
}
// ****************************************************************************
@ -534,7 +323,7 @@ void sm_failover_heartbeat_restore( SmFailoverInterfaceT* interface )
DPRINTFI("Interface %s heartbeat is OK.", interface->interface_name);
}
_event_holder.push_event(SM_HEARTBEAT_RECOVER, NULL);
_if_state_changed = true;
}
// ****************************************************************************
@ -571,7 +360,7 @@ void sm_failover_interface_down( const char* const interface_name )
DPRINTFI("%d domain interfaces are impacted as i/f %s is down.",
impacted, interface_name);
_event_holder.push_event(SM_INTERFACE_DOWN, NULL);
_if_state_changed = true;
}
}
// ****************************************************************************
@ -609,7 +398,7 @@ void sm_failover_interface_up( const char* const interface_name )
DPRINTFI("%d domain interfaces are impacted as i/f %s is up.",
impacted, interface_name);
_event_holder.push_event(SM_INTERFACE_UP, NULL);
_if_state_changed = true;
}
}
// ****************************************************************************
@ -685,7 +474,7 @@ bool is_infra_configured()
// ****************************************************************************
// Failover - get interface state
// ==================
int _failover_get_if_state()
int sm_failover_get_if_state()
{
SmFailoverInterfaceStateT mgmt_state = _mgmt_interface_info->get_state();
SmFailoverInterfaceStateT oam_state = _oam_interface_info->get_state();
@ -732,7 +521,7 @@ int _failover_get_if_state()
SmHeartbeatMsgIfStateT sm_failover_if_state_get()
{
mutex_holder holder(&_mutex);
int if_state_flag = _failover_get_if_state();
int if_state_flag = sm_failover_get_if_state();
return (if_state_flag & 0b0111); //the lower 3 bits i/f state flag
}
// ****************************************************************************
@ -760,7 +549,7 @@ SmNodeScheduleStateT get_controller_state()
// ****************************************************************************
// Failover - is active controller
// ==================
bool is_active_controller()
bool sm_is_active_controller()
{
return SM_NODE_STATE_ACTIVE == _host_state;
}
@ -846,8 +635,10 @@ SmFailoverActionResultT sm_failover_disable_node(char* node_name)
DPRINTFI("To disable %s", node_name);
SmErrorT error;
char reason_text[SM_LOG_REASON_TEXT_MAX_CHAR];
snprintf(reason_text, sizeof(reason_text), "%s has failed", node_name);
error = sm_node_fsm_event_handler(
node_name, SM_NODE_EVENT_DISABLED, NULL, "Host is isolated" );
node_name, SM_NODE_EVENT_DISABLED, NULL, reason_text );
if( SM_OKAY != error )
{
@ -855,7 +646,6 @@ SmFailoverActionResultT sm_failover_disable_node(char* node_name)
node_name, sm_error_str( error ) );
return SM_FAILOVER_ACTION_RESULT_FAILED;
}
_to_disable_peer = true;
return SM_FAILOVER_ACTION_RESULT_OK;
}
// ****************************************************************************
@ -876,13 +666,13 @@ static void active_self_callback(void* user_data[], SmServiceDomainT* domain)
// ****************************************************************************
// Failover - active self
// =======================
SmFailoverActionResultT sm_failover_activate_self()
SmErrorT sm_failover_activate_self()
{
DPRINTFI("Uncontrolled swact start (active local only)");
SmNodeSwactMonitor::SwactStart(SM_NODE_STATE_ACTIVE);
sm_service_domain_table_foreach( NULL, active_self_callback);
return SM_FAILOVER_ACTION_RESULT_OK;
return SM_OKAY;
}
// ****************************************************************************
@ -1022,8 +812,7 @@ SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status)
if(SM_NODE_STATE_STANDBY == _host_state &&
SM_NODE_STATE_FAILED == peer_target_state)
{
result = sm_failover_activate_self();
if(SM_FAILOVER_ACTION_RESULT_FAILED == result)
if(SM_OKAY != sm_failover_activate_self())
{
DPRINTFE("Failed to activate %s.", _host_name);
return SM_FAILED;
@ -1097,7 +886,6 @@ void sm_failover_audit()
_prev_host_state = _host_state;
}
bool is_active = is_active_controller();
bool in_transition = false;
bool infra_configured = is_infra_configured();
in_transition = in_transition &&
@ -1116,20 +904,7 @@ void sm_failover_audit()
return;
}
SmFailoverActionPairT* actions;
int action;
SmFailoverActionPairT *action_map = NULL;
SmFailoverActionPairT **action_map_mode_list = action_maps_no_infra;
if(infra_configured)
{
action_map_mode_list = action_maps_infra;
}
action_map = action_map_mode_list[(int) _system_mode];
int if_state_flag = _failover_get_if_state();
int if_state_flag = sm_failover_get_if_state();
if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE)
{
_heartbeat_count ++;
@ -1176,7 +951,21 @@ void sm_failover_audit()
return;
}
_event_holder.send_event();
if(_if_state_changed)
{
SmIFStateChangedEventData event_data;
event_data.set_interface_state(SM_INTERFACE_OAM, _oam_interface_info->get_state());
event_data.set_interface_state(SM_INTERFACE_MGMT, _mgmt_interface_info->get_state());
if( NULL != _infra_interface_info)
{
event_data.set_interface_state(SM_INTERFACE_INFRA, _infra_interface_info->get_state());
}else
{
event_data.set_interface_state(SM_INTERFACE_INFRA, SM_FAILOVER_INTERFACE_UNKNOWN);
}
SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_IF_STATE_CHANGED, &event_data);
_if_state_changed = false;
}
int64_t curr_node_state = if_state_flag;
@ -1200,74 +989,7 @@ void sm_failover_audit()
_host_state_at_last_action = _host_state;
_peer_if_state_at_last_action = _peer_if_state;
actions = &(action_map[if_state_flag & 0xf]);
if(is_active)
{
action = actions->active_controller_action;
if(if_state_flag & SM_FAILOVER_OAM_DOWN)
{
if(_peer_if_state & SM_FAILOVER_OAM_DOWN)
{
DPRINTFI("No swact, oam down on both controllers");
action &= (~SM_FAILOVER_ACTION_SWACT);
}
else
{
DPRINTFI("Swact to %s, it's oam is UP", _peer_name);
action &= SM_FAILOVER_ACTION_SWACT;
}
}
}
else
{
action = actions->standby_controller_action;
}
_log_nodes_state(action);
DPRINTFI("Action to take %d", action);
if (action & SM_FAILOVER_ACTION_ROUTINE)
{
action = SM_FAILOVER_ACTION_NO_ACTION;
}
if (action & SM_FAILOVER_ACTION_ACTIVATE)
{
DPRINTFI("ACTIVE");
_retry |= ( SM_FAILOVER_ACTION_RESULT_OK != sm_failover_activate_self());
}
if(action & SM_FAILOVER_ACTION_DEGRADE) {
DPRINTFI("DEGRADE");
_retry |= ( SM_OKAY != sm_failover_degrade(SM_FAILOVER_DEGRADE_SOURCE_IF_DOWN) );
}
else
{
_retry |= ( SM_OKAY != sm_failover_degrade_clear(SM_FAILOVER_DEGRADE_SOURCE_IF_DOWN) );
}
if(action & SM_FAILOVER_ACTION_SWACT)
{
DPRINTFI("SWACT");
_retry |= ( SM_FAILOVER_ACTION_RESULT_OK != sm_failover_swact() );
}
if(action & SM_FAILOVER_ACTION_DISABLE)
{
DPRINTFI("DISABLE");
_retry |= (SM_FAILOVER_ACTION_RESULT_OK != sm_failover_disable_node(_host_name));
}
if(action & SM_FAILOVER_ACTION_FAIL_NODE)
{
DPRINTFI("FAIL SELF");
_retry |= (SM_FAILOVER_ACTION_RESULT_OK != sm_failover_fail_self());
}
if(action & SM_FAILOVER_ACTION_DISABLE_STANDBY) {
DPRINTFI("DISABLE STANDBY");
_retry |= (SM_FAILOVER_ACTION_RESULT_OK != sm_failover_disable_node(_peer_name));
}
_log_nodes_state();
}
// ****************************************************************************
@ -1340,7 +1062,7 @@ SmErrorT sm_exec_mtce_command(const char cmd[], char result_buf[], int result_le
// ****************************************************************************
// Failover - log current state
// =======================
void _log_nodes_state(int action)
void _log_nodes_state()
{
SmDbNodeT host, peer;
SmErrorT error = sm_failover_get_node(_host_name, host);
@ -1358,51 +1080,32 @@ void _log_nodes_state(int action)
sm_node_avail_status_str(peer.avail_status)
);
}
DPRINTFI("Host state %d, I/F state %d, peer I/F state %d, action %d",
DPRINTFI("Host state %d, I/F state %d, peer I/F state %d",
_node_comm_state,
_peer_if_state,
_host_state,
action
_host_state
);
}
// ****************************************************************************
// ****************************************************************************
// Failover - failover action
// Failover - disable peer node
// =======================
SmErrorT sm_failover_action()
SmErrorT sm_failover_disable_peer()
{
//This runs in failover thread for long action
// so main thread and hello msg are not blocked
bool *flag = NULL;
char result_buf[1024];
char cmd_buf[1024];
bool to_disable_peer;
snprintf(cmd_buf, sizeof(cmd_buf), "curl --header \"Content-Type:application/json\" "
"--header \"Accept: application/json\" --header \"User-Agent: sm/1.0\" "
"--data '{\"action\": \"event\", \"hostname\":\"%s\", \"operational\": \"disabled\", \"availability\": \"failed\"}' "
"\"http://localhost:2112/v1/hosts/%s/\"", _peer_name, _peer_name);
SmErrorT error = sm_exec_mtce_command(cmd_buf, result_buf, sizeof(result_buf));
if (SM_OKAY != error )
{
mutex_holder holder(&_mutex);
to_disable_peer = _to_disable_peer;
}
if ( to_disable_peer )
{
flag = &_to_disable_peer;
snprintf(cmd_buf, sizeof(cmd_buf), "curl --header \"Content-Type:application/json\" "
"--header \"Accept: application/json\" --header \"User-Agent: sm/1.0\" "
"--data '{\"action\": \"event\", \"hostname\":\"%s\", \"operational\": \"disabled\", \"availability\": \"failed\"}' "
"\"http://localhost:2112/v1/hosts/%s/\"", _peer_name, _peer_name);
SmErrorT error = sm_exec_mtce_command(cmd_buf, result_buf, sizeof(result_buf));
if (SM_OKAY != error )
{
return SM_FAILED;
}
}
if( flag )
{
mutex_holder holder(&_mutex);
* flag = false;
DPRINTFE("Failed to disable peer %s", _peer_name);
return SM_FAILED;
}
return SM_OKAY;
}
@ -1436,6 +1139,26 @@ SmFailoverInterfaceStateT sm_failover_get_interface_info(SmInterfaceTypeT interf
}
// ****************************************************************************
static void sm_failover_interface_change_callback(
SmHwInterfaceChangeDataT* if_change )
{
switch ( if_change->interface_state )
{
case SM_INTERFACE_STATE_DISABLED:
DPRINTFI("Interface %s is down", if_change->interface_name);
sm_failover_interface_down(if_change->interface_name);
break;
case SM_INTERFACE_STATE_ENABLED:
DPRINTFI("Interface %s is up", if_change->interface_name);
sm_failover_interface_up(if_change->interface_name);
break;
default:
DPRINTFI("Interface %s state changed to %d",
if_change->interface_name, if_change->interface_state);
break;
}
}
// ****************************************************************************
// Failover - Initialize
// ======================
@ -1445,6 +1168,28 @@ SmErrorT sm_failover_initialize( void )
bool enabled;
SmErrorT error;
pthread_mutexattr_t attr;
pthread_mutexattr_init(&attr);
pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
int res = pthread_mutex_init(&_mutex, &attr);
if( 0 != res )
{
DPRINTFE("Failed to initialize mutex, error %d", res);
return SM_FAILED;
}
SmHwCallbacksT callbacks;
memset( &callbacks, 0, sizeof(callbacks) );
callbacks.interface_change = sm_failover_interface_change_callback;
error = sm_hw_initialize( &callbacks );
if( SM_OKAY != error )
{
DPRINTFE( "Failed to initialize hardware module, error=%s.",
sm_error_str( error ) );
return( error );
}
error = SmFailoverFSM::initialize();
if( SM_OKAY != error )
{
@ -1455,16 +1200,6 @@ SmErrorT sm_failover_initialize( void )
_system_mode = sm_node_utils_get_system_mode();
DPRINTFI("System mode %s", sm_system_mode_str(_system_mode));
action_maps_no_infra[SM_SYSTEM_MODE_STANDARD] = action_map_std_no_infra;
action_maps_no_infra[SM_SYSTEM_MODE_CPE_DUPLEX] = action_map_cpe_no_infra;
action_maps_no_infra[SM_SYSTEM_MODE_CPE_DUPLEX_DC] = action_map_cpe_dc_no_infra;
action_maps_no_infra[SM_SYSTEM_MODE_CPE_SIMPLEX] = action_map_simplex;
action_maps_infra[SM_SYSTEM_MODE_STANDARD] = action_map_std_infra;
action_maps_infra[SM_SYSTEM_MODE_CPE_DUPLEX] = action_map_cpe_infra;
action_maps_infra[SM_SYSTEM_MODE_CPE_DUPLEX_DC] = action_map_cpe_dc_infra;
action_maps_infra[SM_SYSTEM_MODE_CPE_SIMPLEX] = action_map_simplex;
error = sm_db_connect( SM_DATABASE_NAME, &_sm_db_handle );
if( SM_OKAY != error )
{
@ -1552,7 +1287,7 @@ SmErrorT sm_failover_initialize( void )
}
}
error = sm_timer_register( "failover audit", 1000,
error = sm_timer_register( "failover audit", 50,
sm_failover_audit_timeout,
0, &failover_audit_timer_id );
@ -1620,7 +1355,7 @@ static const char* get_map_str( KeyStringMapT mappings[],
void dump_host_state(FILE* fp)
{
const char* state = sm_node_schedule_state_str(_host_state);
fprintf(fp, " host state: %s\n", state);
fprintf(fp, " host state: %s\n", state);
}
static KeyStringMapT if_state_map[] = {
@ -1656,12 +1391,13 @@ void dump_interfaces_state(FILE* fp)
void dump_peer_if_state(FILE* fp)
{
fprintf(fp, " Peer Interface state: %d\n", _peer_if_state);
fprintf(fp, " Peer Interface state: %d\n", _peer_if_state);
}
void dump_failover_fsm_state(FILE* fp)
{
fprintf(fp, " Failover FSM state: %d\n", SmFailoverFSM::get_fsm().get_state());
SmFailoverStateT state = SmFailoverFSM::get_fsm().get_state();
fprintf(fp, " Failover FSM state: %s\n", sm_failover_state_str(state));
}
// ****************************************************************************

View File

@ -29,6 +29,16 @@ typedef enum
SM_FAILOVER_DEGRADE_SOURCE_IF_DOWN = 2
}SmFailoverDegradeSourceT;
typedef enum
{
SM_FAILOVER_INFRA_DOWN = 1,
SM_FAILOVER_MGMT_DOWN = 2,
SM_FAILOVER_OAM_DOWN = 4,
SM_FAILOVER_HEARTBEAT_ALIVE = 8,
SM_FAILOVER_HELLO_MSG_ALIVE = 16,
SM_FAILOVER_PEER_DISABLED = 0x4000,
}SmFailoverCommFaultBitFlagT;
// ****************************************************************************
// Failover - check
// ==================
@ -136,6 +146,29 @@ SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state();
SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status);
// ****************************************************************************
// ****************************************************************************
// Failover - active self
// =======================
SmErrorT sm_failover_activate_self();
// ****************************************************************************
// ****************************************************************************
// Failover - disable peer node
// =======================
SmErrorT sm_failover_disable_peer();
// ****************************************************************************
// ****************************************************************************
// Failover - is active controller
// ==================
bool sm_is_active_controller();
// ****************************************************************************
// ****************************************************************************
// Failover - get interface state
// ==================
int sm_failover_get_if_state();
// ****************************************************************************
// ****************************************************************************
// Failover - dump state

View File

@ -5,6 +5,7 @@
//
#include "sm_failover_fail_pending_state.h"
#include <stdlib.h>
#include <unistd.h>
#include "sm_types.h"
#include "sm_limits.h"
#include "sm_debug.h"
@ -14,9 +15,164 @@
#include "sm_failover_utils.h"
#include "sm_node_utils.h"
#include "sm_node_api.h"
#include "sm_worker_thread.h"
static const int FAIL_PENDING_TIMEOUT = 2000; //2000ms
static SmTimerIdT action_timer_id = SM_TIMER_ID_INVALID;
static const int RESET_TIMEOUT = 10 * 1000; // 10 seconds for a reset command to reboot a node
static const int GO_ACTIVE_TIMEOUT = 30 * 1000; // 30 seconds for standby node go active
static const int WAIT_RESET_TIMEOUT = RESET_TIMEOUT + 5 * 1000; // extra 5 seconds for sending reset command
static void disable_peer(SmSimpleAction&)
{
SmErrorT error;
int counter = 0;
bool done = false;
DPRINTFI("To send mtce api to disable peer");
while(!done && counter < 5)
{
error = sm_failover_disable_peer();
counter ++;
if(SM_OKAY != error)
{
int wait_interval_us = 1000000; // 1 second
DPRINTFE("Failed to disable peer, reattempt in %d ms",
wait_interval_us / 1000);
usleep(wait_interval_us); // wait 1 second before retry
}else
{
done = true;
}
}
}
static SmSimpleAction _disable_peer_action("disable-peer", disable_peer);
static bool wait_for_reset_timeout(SmTimerIdT timer_id, int64_t user_data)
{
//timer never rearm, clear the timer id
action_timer_id = SM_TIMER_ID_INVALID;
sm_failover_utils_reset_stayfailed_flag();
DPRINTFI("stayfailed flag file is removed");
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_SURVIVED);
return false;
}
static bool reset_standby_timeout(SmTimerIdT timer_id, int64_t user_data)
{
//timer never rearm, clear the timer id
action_timer_id = SM_TIMER_ID_INVALID;
sm_failover_utils_set_stayfailed_flag();
DPRINTFI("stayfailed flag file is set");
const char* timer_name = "confirm standby reset";
SmErrorT error = sm_timer_register( timer_name, WAIT_RESET_TIMEOUT + WAIT_RESET_TIMEOUT,
wait_for_reset_timeout,
0, &action_timer_id);
if(SM_OKAY != error)
{
DPRINTFE("Failed to register confirm standby reset timer");
}
DPRINTFI("wait for %d secs...", (WAIT_RESET_TIMEOUT + WAIT_RESET_TIMEOUT) / 1000);
return false;
}
SmErrorT blind_guess_start_active()
{
SmErrorT error;
SmWorkerThread::get_worker().add_priority_action(&_disable_peer_action);
const char* timer_name = "reset standby";
error = sm_timer_register( timer_name, RESET_TIMEOUT,
reset_standby_timeout,
0, &action_timer_id);
if(SM_OKAY != error)
{
DPRINTFE("Failed to register reset standby timer");
return SM_FAILED;
}
DPRINTFI("wait for %d secs...", RESET_TIMEOUT / 1000);
return SM_OKAY;
}
static void standby_go_active_failed()
{
sm_failover_utils_set_stayfailed_flag();
DPRINTFI("stayfailed flag file is set");
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_FAILED);
}
bool standby_wait_timeout(SmTimerIdT timer_id, int64_t user_data)
{
//timer never rearm, clear the timer id
action_timer_id = SM_TIMER_ID_INVALID;
sm_failover_utils_reset_stayfailed_flag();
DPRINTFI("stayfailed flag file is removed");
char host_name[SM_NODE_NAME_MAX_CHAR];
SmErrorT error = sm_node_utils_get_hostname(host_name);
if( SM_OKAY != error )
{
DPRINTFE( "Failed to get hostname, error=%s.",
sm_error_str( error ) );
standby_go_active_failed();
return false;
}
SmNodeScheduleStateT host_state = sm_get_controller_state(host_name);
if(SM_NODE_STATE_ACTIVE != host_state)
{
standby_go_active_failed();
return false;
}
SmWorkerThread::get_worker().add_priority_action(&_disable_peer_action);
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_SURVIVED);
return false;
}
static void blind_guess_start_standby()
{
SmErrorT error;
sm_failover_utils_set_stayfailed_flag();
DPRINTFI("set stayfailed flag file");
sm_failover_activate_self();
const char* timer_name = "active controller action timeout";
error = sm_timer_register( timer_name, WAIT_RESET_TIMEOUT + GO_ACTIVE_TIMEOUT,
standby_wait_timeout,
0, &action_timer_id);
if(SM_OKAY != error)
{
DPRINTFE("Failed to register blindguess action timer");
}
DPRINTFI("wait for %d secs", (WAIT_RESET_TIMEOUT + GO_ACTIVE_TIMEOUT) / 1000);
}
void blind_guess_scenario_start()
{
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
SmNodeScheduleStateT host_state = failover_status.get_host_pre_failure_schedule_state();
SmNodeScheduleStateT peer_state = failover_status.get_peer_pre_failure_schedule_state();
DPRINTFI("Pre failure host state %s, peer state %s",
sm_node_schedule_state_str(host_state), sm_node_schedule_state_str(peer_state));
if(SM_NODE_STATE_ACTIVE == host_state)
{
if(SM_NODE_STATE_STANDBY == peer_state)
{
blind_guess_start_active();
}
}else if(SM_NODE_STATE_STANDBY == host_state)
{
blind_guess_start_standby();
}
}
SmFailoverFailPendingState::SmFailoverFailPendingState(SmFailoverFSM& fsm) : SmFSMState(fsm)
{
this->_pending_timer_id = SM_TIMER_ID_INVALID;
@ -30,19 +186,53 @@ SmFailoverFailPendingState::~SmFailoverFailPendingState()
SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
//SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type();
bool duplex = false;
switch (event)
{
case SM_HEARTBEAT_LOST:
case SM_INTERFACE_DOWN:
//getting worse, extend wait time
case SM_INTERFACE_UP:
case SM_HEARTBEAT_RECOVER:
// Things are improving, extend wait time
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
// Situation are changing, extend wait time
this->_register_timer();
break;
case SM_FAIL_PENDING_TIMEOUT:
case SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT:
sm_node_utils_is_aio_duplex(&duplex);
if( duplex &&
0 == (sm_failover_get_if_state() & SM_FAILOVER_HEARTBEAT_ALIVE))
{
SmSystemFailoverStatus failover_status;
SmSystemModeT system_mode = sm_node_utils_get_system_mode();
SmInterfaceTypeT interfaces_to_check[3] = {SM_INTERFACE_UNKNOWN};
bool healthy = true;
if(sm_failover_utils_is_stayfailed())
{
healthy = false;
}
if(SM_SYSTEM_MODE_CPE_DUPLEX_DC == system_mode)
{
interfaces_to_check[0] = SM_INTERFACE_OAM;
}else
{
interfaces_to_check[0] = SM_INTERFACE_MGMT;
interfaces_to_check[1] = SM_INTERFACE_INFRA;
}
for(int i = 0; interfaces_to_check[i] != SM_INTERFACE_UNKNOWN; i ++)
{
SmFailoverInterfaceStateT if_state = sm_failover_get_interface_info(interfaces_to_check[i]);
if(SM_FAILOVER_INTERFACE_DOWN == if_state)
{
healthy = false;
break;
}
}
if(healthy)
{
blind_guess_scenario_start();
}
}
else
{
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
@ -79,7 +269,9 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
}
break;
default:
DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),
sm_failover_state_str(this->fsm.get_state()));
}
return SM_OKAY;
}
@ -87,13 +279,13 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
bool SmFailoverFailPendingState::_fail_pending_timeout(
SmTimerIdT timer_id, int64_t user_data)
{
SmFailoverFSM::get_fsm().send_event(SM_FAIL_PENDING_TIMEOUT, NULL);
SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, NULL);
return false;
}
SmErrorT SmFailoverFailPendingState::enter_state()
{
DPRINTFI("Entering Pending state");
SmFSMState::enter_state();
SmErrorT error = this->_register_timer();
if(SM_OKAY != error)
{
@ -146,6 +338,15 @@ SmErrorT SmFailoverFailPendingState::exit_state()
{
DPRINTFE("Failed to deregister fail pending timer. Error %s", sm_error_str(error));
}
DPRINTFI("Exiting Pending state");
if(SM_TIMER_ID_INVALID != action_timer_id)
{
error = sm_timer_deregister(action_timer_id);
action_timer_id = SM_TIMER_ID_INVALID;
if( SM_OKAY != error)
{
DPRINTFE("Failed to deregister action timer. Error %s", sm_error_str(error));
}
}
SmFSMState::exit_state();
return error;
}

View File

@ -10,57 +10,47 @@
#include "sm_failover_fsm.h"
#include "sm_failover_ss.h"
SmErrorT SmFailoverFailedState::enter_state()
static void _audit_failover_state()
{
DPRINTFI("Entering Failed state");
return SM_OKAY;
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
if(SM_OKAY != error)
{
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
return;
}
if(SM_NODE_STATE_FAILED == host_state)
{
// don't need to set to failed state, already here
}
else if(SM_NODE_STATE_STANDBY == host_state && SM_NODE_STATE_ACTIVE == peer_state)
{
// standby is the only possible state to be scheduled to from failed state
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL);
}else
{
DPRINTFE("Runtime error: unexpected scheduling state: %s",
sm_node_schedule_state_str(host_state));
}
}
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
switch (event)
{
case SM_HEARTBEAT_LOST:
case SM_INTERFACE_DOWN:
case SM_INTERFACE_UP:
// ignore, recover is possible only when heartbeat is recovered
break;
case SM_HEARTBEAT_RECOVER:
{
SmSystemFailoverStatus failover_status;
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
if(SM_OKAY != error)
{
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
return error;
}
if(SM_NODE_STATE_FAILED == host_state)
{
// don't need to set to failed state, already here
}
else if(SM_NODE_STATE_STANDBY == host_state && SM_NODE_STATE_ACTIVE == peer_state)
{
// standby is the only possible state to be scheduled to from failed state
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
}else
{
DPRINTFE("Runtime error: unexpected scheduling state: %s",
sm_node_schedule_state_str(host_state));
}
}
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
DPRINTFI("sm_heartbeat_recover event received.");
_audit_failover_state();
break;
default:
DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),
sm_failover_state_str(this->fsm.get_state()));
}
return SM_OKAY;
}
SmErrorT SmFailoverFailedState::exit_state()
{
DPRINTFI("Exiting Failed state");
return SM_OKAY;
}

View File

@ -13,8 +13,6 @@ class SmFailoverFailedState : public SmFSMState
{
public:
SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
SmErrorT enter_state();
SmErrorT exit_state();
protected:
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);

View File

@ -6,20 +6,66 @@
#include "sm_failover_fsm.h"
#include <stdlib.h>
#include "sm_debug.h"
#include "sm_failover_initial_state.h"
#include "sm_failover_normal_state.h"
#include "sm_failover_fail_pending_state.h"
#include "sm_failover_failed_state.h"
#include "sm_failover_survived_state.h"
SmFSMEventDataTypeT SmIFStateChangedEventData::get_event_data_type() const
{
return SmIFStateChangedEventDataType;
}
SmIFStateChangedEventData::SmIFStateChangedEventData()
{
}
void SmIFStateChangedEventData::set_interface_state(
SmInterfaceTypeT interface_type, SmFailoverInterfaceStateT interface_state)
{
switch (interface_type)
{
case SM_INTERFACE_OAM:
_oam_state = interface_state;
break;
case SM_INTERFACE_MGMT:
_mgmt_state = interface_state;
break;
case SM_INTERFACE_INFRA:
_infra_state = interface_state;
break;
default:
DPRINTFE("Runtime error: invalid interface type %d", interface_type);
}
}
SmFailoverInterfaceStateT SmIFStateChangedEventData::get_interface_state(SmInterfaceTypeT interface_type) const
{
switch (interface_type)
{
case SM_INTERFACE_OAM:
return _oam_state;
case SM_INTERFACE_MGMT:
return _mgmt_state;
case SM_INTERFACE_INFRA:
return _infra_state;
default:
DPRINTFE("Runtime error: invalid interface type %d", interface_type);
return SM_FAILOVER_INTERFACE_UNKNOWN;
}
}
SmErrorT SmFSMState::enter_state()
{
// default implementation, nothing to do
DPRINTFI("Entering %s state", sm_failover_state_str(fsm.get_state()));
return SM_OKAY;
};
SmErrorT SmFSMState::exit_state()
{
// default implementation, nothing to do
DPRINTFI("Exiting %s state", sm_failover_state_str(fsm.get_state()));
return SM_OKAY;
};
@ -58,7 +104,7 @@ SmErrorT SmFailoverFSM::send_event(SmFailoverEventT event, const ISmFSMEventData
DPRINTFE("Runtime error. No handler for state %d", this->get_state());
return SM_FAILED;
}
DPRINTFI("send_event %d\n", event);
DPRINTFI("send_event %s\n", sm_failover_event_str(event));
state_handler->event_handler(event, event_data);
return SM_OKAY;
}
@ -96,6 +142,9 @@ SmErrorT SmFailoverFSM::set_state(SmFailoverStateT state)
DPRINTFE("State %d was not registered", state);
return SM_FAILED;
}
this->_current_state = state;
error = new_state_handler->enter_state();
if(SM_OKAY != error)
{
@ -103,7 +152,6 @@ SmErrorT SmFailoverFSM::set_state(SmFailoverStateT state)
return SM_FAILED;
}
this->_current_state = state;
return SM_OKAY;
}
@ -149,6 +197,7 @@ SmErrorT SmFailoverFSM::init_state()
SmErrorT SmFailoverFSM::initialize()
{
SmFailoverFSM& fsm = SmFailoverFSM::get_fsm();
fsm.register_fsm_state(SM_FAILOVER_STATE_INITIAL, new SmFailoverInitialState(fsm));
fsm.register_fsm_state(SM_FAILOVER_STATE_NORMAL, new SmFailoverNormalState(fsm));
fsm.register_fsm_state(SM_FAILOVER_STATE_FAIL_PENDING, new SmFailoverFailPendingState(fsm));
fsm.register_fsm_state(SM_FAILOVER_STATE_FAILED, new SmFailoverFailedState(fsm));

View File

@ -7,24 +7,6 @@
#define __SM_FAILOVER_FSM_H__
#include "sm_types.h"
typedef enum{
SM_FAILOVER_STATE_NORMAL,
SM_FAILOVER_STATE_FAIL_PENDING,
SM_FAILOVER_STATE_FAILED,
SM_FAILOVER_STATE_SURVIVED,
SM_FAILOVER_STATE_MAX
}SmFailoverStateT;
#define SM_FAILOVER_STATE_INITIAL SM_FAILOVER_STATE_NORMAL
typedef enum{
SM_HEARTBEAT_LOST,
SM_INTERFACE_DOWN,
SM_INTERFACE_UP,
SM_HEARTBEAT_RECOVER,
SM_FAIL_PENDING_TIMEOUT,
}SmFailoverEventT;
typedef int SmFSMEventDataTypeT;
class ISmFSMEventData
@ -59,7 +41,7 @@ class SmFailoverFSM
SmErrorT register_fsm_state(SmFailoverStateT state, SmFSMState* state_handler);
SmErrorT set_state(SmFailoverStateT state);
inline int get_state() const {return this->_current_state;}
inline SmFailoverStateT get_state() const {return this->_current_state;}
static SmErrorT initialize();
static SmErrorT finalize();
@ -70,12 +52,27 @@ class SmFailoverFSM
private:
static const int MaxState = SM_FAILOVER_STATE_MAX;
SmFSMState* _state_handlers[MaxState];
int _current_state;
SmFailoverStateT _current_state;
static SmFailoverFSM _the_fsm;
SmErrorT init_state();
void deregister_states();
};
class SmIFStateChangedEventData: public ISmFSMEventData
{
public:
static const SmFSMEventDataTypeT SmIFStateChangedEventDataType = 2;
SmFSMEventDataTypeT get_event_data_type() const;
SmIFStateChangedEventData();
void set_interface_state(
SmInterfaceTypeT interface_type, SmFailoverInterfaceStateT interface_state);
SmFailoverInterfaceStateT get_interface_state(SmInterfaceTypeT interface_type) const;
private:
SmFailoverInterfaceStateT _mgmt_state;
SmFailoverInterfaceStateT _infra_state;
SmFailoverInterfaceStateT _oam_state;
};
#endif //__SM_FAILOVER_FSM_H__

View File

@ -0,0 +1,28 @@
//
// Copyright (c) 2018 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#include "sm_failover_initial_state.h"
#include "sm_types.h"
#include "sm_debug.h"
#include "sm_failover_fsm.h"
SmErrorT SmFailoverInitialState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
switch (event)
{
case SM_FAILOVER_EVENT_HEARTBEAT_ENABLED:
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
break;
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
case SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT:
case SM_FAILOVER_EVENT_NODE_ENABLED:
break;
default:
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),
sm_failover_state_str(this->fsm.get_state()));
}
return SM_OKAY;
}

View File

@ -0,0 +1,20 @@
//
// Copyright (c) 2018 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#ifndef __SM_FAILOVER_INITIAL_STATE_H__
#define __SM_FAILOVER_INITIAL_STATE_H__
#include "sm_types.h"
#include "sm_failover_fsm.h"
class SmFailoverInitialState : public SmFSMState
{
public:
SmFailoverInitialState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
protected:
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);
};
#endif //__SM_FAILOVER_INITIAL_STATE_H__

View File

@ -5,38 +5,80 @@
//
#include "sm_failover_normal_state.h"
#include <stdlib.h>
#include "sm_limits.h"
#include "sm_types.h"
#include "sm_debug.h"
#include "sm_node_api.h"
#include "sm_node_utils.h"
#include "sm_failover_utils.h"
#include "sm_failover_fsm.h"
SmErrorT SmFailoverNormalState::enter_state()
{
DPRINTFI("Entering normal state");
return SM_OKAY;
}
#include "sm_failover_ss.h"
SmErrorT SmFailoverNormalState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
switch (event)
{
case SM_HEARTBEAT_LOST:
case SM_INTERFACE_DOWN:
this->fsm.set_state(SM_FAILOVER_STATE_FAIL_PENDING);
break;
case SM_INTERFACE_UP:
DPRINTFI("Interface is up");
break;
case SM_HEARTBEAT_RECOVER:
DPRINTFI("Heartbeat is recovered");
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
if(NULL != event_data &&
SmIFStateChangedEventData::SmIFStateChangedEventDataType == event_data->get_event_data_type())
{
const SmIFStateChangedEventData* data = (const SmIFStateChangedEventData*) event_data;
SmFailoverInterfaceStateT oam_state, mgmt_state, infra_state;
oam_state = data->get_interface_state(SM_INTERFACE_OAM);
mgmt_state = data->get_interface_state(SM_INTERFACE_MGMT);
infra_state = data->get_interface_state(SM_INTERFACE_INFRA);
if(oam_state != SM_FAILOVER_INTERFACE_OK ||
mgmt_state != SM_FAILOVER_INTERFACE_OK ||
(infra_state != SM_FAILOVER_INTERFACE_OK && infra_state != SM_FAILOVER_INTERFACE_UNKNOWN))
{
this->fsm.set_state(SM_FAILOVER_STATE_FAIL_PENDING);
}
}else
{
DPRINTFE("Runtime error: receive invalid event data type");
}
break;
default:
DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),
sm_failover_state_str(this->fsm.get_state()));
}
return SM_OKAY;
}
SmErrorT SmFailoverNormalState::exit_state()
{
DPRINTFI("Exiting normal state");
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
char host_name[SM_NODE_NAME_MAX_CHAR];
SmErrorT error = sm_node_utils_get_hostname(host_name);
if( SM_OKAY != error )
{
DPRINTFE( "Failed to get hostname, error=%s.",
sm_error_str( error ) );
failover_status.set_host_pre_failure_schedule_state(SM_NODE_STATE_UNKNOWN);
}else
{
SmNodeScheduleStateT host_state = sm_get_controller_state(host_name);
DPRINTFI("Blind guess pre failure host state %s", sm_node_schedule_state_str(host_state));
failover_status.set_host_pre_failure_schedule_state(host_state);
}
char peer_name[SM_NODE_NAME_MAX_CHAR];
error = sm_node_api_get_peername(peer_name);
if( SM_OKAY != error )
{
DPRINTFE( "Failed to get peername, error=%s.",
sm_error_str( error ) );
failover_status.set_peer_pre_failure_schedule_state(SM_NODE_STATE_UNKNOWN);
}else
{
SmNodeScheduleStateT peer_state = sm_get_controller_state(peer_name);
failover_status.set_peer_pre_failure_schedule_state(peer_state);
}
SmFSMState::exit_state();
return SM_OKAY;
}

View File

@ -12,7 +12,6 @@ class SmFailoverNormalState : public SmFSMState
{
public:
SmFailoverNormalState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
SmErrorT enter_state();
SmErrorT exit_state();
protected:
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);

View File

@ -15,6 +15,40 @@
#include "sm_node_api.h"
#include "sm_failover.h"
//
SmErrorT _get_survivor_dc(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection);
// select standby as failed
SmErrorT _fail_standby(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection)
{
if(SM_NODE_STATE_STANDBY == system_status.host_status.current_schedule_state)
{
selection.set_host_schedule_state(SM_NODE_STATE_FAILED);
selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE);
}else if(SM_NODE_STATE_STANDBY == system_status.peer_status.current_schedule_state)
{
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
}else
{
DPRINTFE("Runtime error. Unexpected scheduling state: host %s, peer %s (no standby)",
sm_node_schedule_state_str(system_status.host_status.current_schedule_state),
sm_node_schedule_state_str(system_status.peer_status.current_schedule_state));
return SM_FAILED;
}
return SM_OKAY;
}
const char SmSystemFailoverStatus::filename[] = "/var/lib/sm/failover.status";
const char SmSystemFailoverStatus::file_format[] =
"This is a very important system file.\n"
"Any modification is strictly forbidden and will cause serious consequence.\n"
"host_schedule_state=%1d\n"
"peer_schedule_state=%1d\n"
"last_update=%19s\n";
SmSystemFailoverStatus SmSystemFailoverStatus::_failover_status;
SmSystemFailoverStatus::SmSystemFailoverStatus()
{
SmErrorT error;
@ -42,11 +76,17 @@ SmSystemFailoverStatus::~SmSystemFailoverStatus()
{
}
SmSystemFailoverStatus& SmSystemFailoverStatus::get_status()
{
return _failover_status;
}
bool SmSystemFailoverStatus::_is_valid_schedule_state(SmNodeScheduleStateT state)
{
return (SM_NODE_STATE_ACTIVE == state ||
SM_NODE_STATE_STANDBY == state ||
SM_NODE_STATE_FAILED == state);
SM_NODE_STATE_FAILED == state ||
SM_NODE_STATE_INIT == state);
}
void SmSystemFailoverStatus::set_host_schedule_state(SmNodeScheduleStateT state)
@ -63,6 +103,20 @@ void SmSystemFailoverStatus::set_host_schedule_state(SmNodeScheduleStateT state)
}
}
void SmSystemFailoverStatus::set_host_pre_failure_schedule_state(SmNodeScheduleStateT state)
{
if(_is_valid_schedule_state(state))
{
if(_host_pre_failure_schedule_state != state)
{
_host_pre_failure_schedule_state = state;
}
}else
{
DPRINTFE("Runtime error, schedule state unknown %d", state);
}
}
void SmSystemFailoverStatus::set_peer_schedule_state(SmNodeScheduleStateT state)
{
if(_is_valid_schedule_state(state))
@ -77,12 +131,65 @@ void SmSystemFailoverStatus::set_peer_schedule_state(SmNodeScheduleStateT state)
}
}
typedef enum
void SmSystemFailoverStatus::set_peer_pre_failure_schedule_state(SmNodeScheduleStateT state)
{
SM_FAILOVER_INFRA_DOWN = 1,
SM_FAILOVER_MGMT_DOWN = 2,
SM_FAILOVER_OAM_DOWN = 4,
}SmFailoverCommFaultBitFlagT;
if(_is_valid_schedule_state(state))
{
if(_peer_pre_failure_schedule_state != state)
{
_peer_pre_failure_schedule_state = state;
}
}else
{
DPRINTFE("Runtime error, schedule state unknown %d", state);
}
}
void SmSystemFailoverStatus::serialize()
{
FILE* f;
time_t last_update;
struct tm* local_time;
time(&last_update);
local_time = localtime(&last_update);
char timestamp[20];
sprintf(timestamp, "%04d-%02d-%02dT%02d:%02d:%02d", local_time->tm_year + 1900,
local_time->tm_mon + 1, local_time->tm_mday, local_time->tm_hour, local_time->tm_min, local_time->tm_sec);
f = fopen(filename, "w");
fprintf(f, file_format, _host_schedule_state, _peer_schedule_state, timestamp);
for(int i = 0; i < 72; i ++)
{
fputs(".", f);
}
fclose(f);
}
void SmSystemFailoverStatus::deserialize()
{
// FILE* f;
// char timestamp[20];
//
// _host_schedule_state = _peer_schedule_state = SM_NODE_STATE_UNKNOWN;
// int host_state, peer_state;
// f = fopen(filename, "r");
// if(NULL != f)
// {
// DPRINTFI("Loading schedule state from %s", filename);
// int cnt = fscanf(f, file_format, &host_state, &peer_state, timestamp);
// fclose(f);
// if(cnt != 3)
// {
// DPRINTFE("Runtime error, %s has been modified.", filename);
// }else
// {
// set_host_schedule_state((SmNodeScheduleStateT)host_state);
// set_peer_schedule_state((SmNodeScheduleStateT)peer_state);
// }
// }
}
// ****************************************************************************
// sm_failover_ss get_node_if_healthy_score - get node interface healthy score
@ -98,7 +205,7 @@ static int get_node_if_healthy_score(unsigned int interface_state)
{
healthy_score -= 2;
}
if(interface_state & SM_FAILOVER_INFRA_DOWN)
if(interface_state & SM_FAILOVER_MGMT_DOWN)
{
healthy_score -= 4;
}
@ -152,7 +259,6 @@ SmErrorT _get_system_status(SmSystemStatusT& sys_status, char host_name[], char
sys_status.peer_status.current_schedule_state = sm_get_controller_state(peer_name);
return SM_OKAY;
}
// ****************************************************************************
// sm_failover_ss_get_survivor - select the failover survivor
// This is the main entry/container for the failover logic to determine how
@ -175,10 +281,17 @@ SmErrorT sm_failover_ss_get_survivor(SmSystemFailoverStatus& selection)
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection)
{
DPRINTFI("get survivor %s %s, %s %s",
system_status.host_status.node_name,
sm_node_schedule_state_str(system_status.host_status.current_schedule_state),
system_status.peer_status.node_name,
sm_node_schedule_state_str(system_status.peer_status.current_schedule_state));
selection.set_host_schedule_state(system_status.host_status.current_schedule_state);
selection.set_peer_schedule_state(system_status.peer_status.current_schedule_state);
if(SM_HEARTBEAT_OK == system_status.heartbeat_state)
{
DPRINTFI("Heartbeat alive");
int host_healthy_score, peer_healthy_score;
host_healthy_score = get_node_if_healthy_score(system_status.host_status.interface_state);
peer_healthy_score = get_node_if_healthy_score(system_status.peer_status.interface_state);
@ -187,16 +300,52 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys
//host is more healthy
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
selection.set_peer_schedule_state(SM_NODE_STATE_STANDBY);
if(system_status.peer_status.interface_state & SM_FAILOVER_MGMT_DOWN)
{
DPRINTFI("Disable peer, host go active");
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
}
}else if(peer_healthy_score > host_healthy_score)
{
//peer is more healthy
selection.set_host_schedule_state(SM_NODE_STATE_STANDBY);
selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE);
if(system_status.host_status.interface_state & SM_FAILOVER_MGMT_DOWN)
{
DPRINTFI("Disable host, peer go active");
selection.set_host_schedule_state(SM_NODE_STATE_FAILED);
}
}
}else
{
DPRINTFI("Loss of heartbeat ALL");
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
}
if(system_status.host_status.current_schedule_state != selection.get_host_schedule_state() ||
system_status.peer_status.current_schedule_state != selection.get_peer_schedule_state() )
if(SM_SYSTEM_MODE_CPE_DUPLEX == system_status.system_mode)
{
}
if(SM_SYSTEM_MODE_CPE_DUPLEX_DC == system_status.system_mode)
{
return _get_survivor_dc(system_status, selection);
}
SmNodeScheduleStateT host_schedule_state, peer_schedule_state;
host_schedule_state = selection.get_host_schedule_state();
peer_schedule_state = selection.get_peer_schedule_state();
DPRINTFI("Host from %s to %s, Peer from %s to %s.",
sm_node_schedule_state_str(system_status.host_status.current_schedule_state),
sm_node_schedule_state_str(host_schedule_state),
sm_node_schedule_state_str(system_status.peer_status.current_schedule_state),
sm_node_schedule_state_str(peer_schedule_state)
);
if((system_status.host_status.current_schedule_state == SM_NODE_STATE_ACTIVE &&
host_schedule_state != SM_NODE_STATE_ACTIVE) ||
(system_status.peer_status.current_schedule_state == SM_NODE_STATE_ACTIVE &&
peer_schedule_state != SM_NODE_STATE_ACTIVE))
{
DPRINTFI("Uncontrolled swact starts. Host from %s to %s, Peer from %s to %s.",
sm_node_schedule_state_str(system_status.host_status.current_schedule_state),
@ -205,6 +354,43 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys
sm_node_schedule_state_str(selection.get_peer_schedule_state())
);
}
selection.serialize();
return SM_OKAY;
}
// ****************************************************************************
// ****************************************************************************
// Direct connected
SmErrorT _get_survivor_dc(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection)
{
if(SM_SYSTEM_MODE_CPE_DUPLEX_DC != system_status.system_mode)
{
DPRINTFE("Runtime error, not the right system mode %d", system_status.system_mode);
return SM_FAILED;
}
if(SM_HEARTBEAT_LOSS == system_status.heartbeat_state)
{
if(system_status.host_status.mgmt_state == SM_FAILOVER_INTERFACE_DOWN &&
(system_status.host_status.infra_state == SM_FAILOVER_INTERFACE_DOWN ||
system_status.host_status.infra_state == SM_FAILOVER_INTERFACE_UNKNOWN))
{
if(SM_FAILOVER_INTERFACE_DOWN == system_status.host_status.oam_state)
{
selection.set_host_schedule_state(SM_NODE_STATE_FAILED);
selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE);
}else
{
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
}
}else
{
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
}
}
return SM_OKAY;
}

View File

@ -8,9 +8,6 @@
#define __SM_FAILOVER_SS_H__
#include <stdio.h>
#include "sm_types.h"
//#ifdef __cplusplus
//extern "C" {
//#endif
typedef struct
{
@ -47,23 +44,37 @@ typedef struct
class SmSystemFailoverStatus
{
public:
SmSystemFailoverStatus();
virtual ~SmSystemFailoverStatus();
inline SmNodeScheduleStateT get_host_schedule_state() const {
return _host_schedule_state;
}
inline SmNodeScheduleStateT get_host_pre_failure_schedule_state() const {
return _host_pre_failure_schedule_state;
}
void set_host_schedule_state(SmNodeScheduleStateT state);
void set_host_pre_failure_schedule_state(SmNodeScheduleStateT state);
inline SmNodeScheduleStateT get_peer_schedule_state() const {
return _peer_schedule_state;
}
inline SmNodeScheduleStateT get_peer_pre_failure_schedule_state() const {
return _peer_pre_failure_schedule_state;
}
void set_peer_schedule_state(SmNodeScheduleStateT state);
void set_peer_pre_failure_schedule_state(SmNodeScheduleStateT state);
void serialize();
void deserialize();
static SmSystemFailoverStatus& get_status();
private:
SmSystemFailoverStatus();
SmNodeScheduleStateT _host_pre_failure_schedule_state;
SmNodeScheduleStateT _peer_pre_failure_schedule_state;
SmNodeScheduleStateT _host_schedule_state;
SmNodeScheduleStateT _peer_schedule_state;
static const char filename[];
static const char file_format[];
bool _is_valid_schedule_state(SmNodeScheduleStateT state);
static SmSystemFailoverStatus _failover_status;
};
// ****************************************************************************
@ -73,7 +84,4 @@ SmErrorT sm_failover_ss_get_survivor(SmSystemFailoverStatus& selection);
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection);
//#ifdef __cplusplus
//}
//#endif
#endif // __SM_FAILOVER_SS_H__

View File

@ -10,53 +10,48 @@
#include "sm_failover_fsm.h"
#include "sm_failover_ss.h"
SmErrorT SmFailoverSurvivedState::enter_state()
static void _audit_failover_state()
{
DPRINTFI("Entering Survived state");
return SM_OKAY;
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
if(SM_OKAY != error)
{
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
return;
}
if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_FAILED == peer_state)
{
// nothing changed yet
}
else if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state)
{
// Active is the only possible state to be scheduled to from survived state
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL);
}else
{
DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s",
sm_node_schedule_state_str(host_state),
sm_node_schedule_state_str(peer_state));
}
}
SmErrorT SmFailoverSurvivedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
switch (event)
{
case SM_HEARTBEAT_LOST:
case SM_INTERFACE_DOWN:
case SM_INTERFACE_UP:
// ignore, recover is possible only when heartbeat is recovered
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
break;
case SM_HEARTBEAT_RECOVER:
{
SmSystemFailoverStatus failover_status;
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
if(SM_OKAY != error)
{
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
return error;
}
if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state)
{
// Active is the only possible state to be scheduled to from survived state
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
}else
{
DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s",
sm_node_schedule_state_str(host_state),
sm_node_schedule_state_str(peer_state));
}
}
case SM_FAILOVER_EVENT_NODE_ENABLED:
_audit_failover_state();
break;
default:
DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),
sm_failover_state_str(this->fsm.get_state()));
}
return SM_OKAY;
}
SmErrorT SmFailoverSurvivedState::exit_state()
{
DPRINTFI("Exiting Survived state");
return SM_OKAY;
}

View File

@ -12,8 +12,6 @@ class SmFailoverSurvivedState : public SmFSMState
{
public:
SmFailoverSurvivedState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
SmErrorT enter_state();
SmErrorT exit_state();
protected:
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);

View File

@ -1,244 +0,0 @@
//
// Copyright (c) 2017 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#include "sm_failover_thread.h"
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <string.h>
#include "sm_failover.h"
#include "sm_types.h"
#include "sm_time.h"
#include "sm_debug.h"
#include "sm_trap.h"
#include "sm_selobj.h"
#include "sm_thread_health.h"
#include "sm_hw.h"
#include "sm_timer.h"
#define SM_FAILOVER_THREAD_TICK_INTERVAL_IN_MS 1000
static const char sm_failover_thread_name[] = "sm_failover";
static const int sm_failover_tick_interval_in_ms = 1000;
static sig_atomic_t _stay_on = 1;
static bool _thread_created = false;
static pthread_t _failover_thread;
static void sm_failover_interface_change_callback(
SmHwInterfaceChangeDataT* if_change )
{
switch ( if_change->interface_state )
{
case SM_INTERFACE_STATE_DISABLED:
DPRINTFI("Interface %s is down", if_change->interface_name);
sm_failover_interface_down(if_change->interface_name);
break;
case SM_INTERFACE_STATE_ENABLED:
DPRINTFI("Interface %s is up", if_change->interface_name);
sm_failover_interface_up(if_change->interface_name);
break;
default:
DPRINTFI("Interface %s state changed to %d",
if_change->interface_name, if_change->interface_state);
break;
}
}
// ****************************************************************************
// Failover Thread - Initialize Thread
// ============================================
SmErrorT sm_failover_thread_initialize_thread( void )
{
SmErrorT error;
SmHwCallbacksT callbacks;
error = sm_selobj_initialize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to initialize selection object module, error=%s.",
sm_error_str( error ) );
return( SM_FAILED );
}
error = sm_timer_initialize( SM_FAILOVER_THREAD_TICK_INTERVAL_IN_MS );
if( SM_OKAY != error )
{
DPRINTFE( "Failed to initialize timer module, error=%s.",
sm_error_str( error ) );
return( SM_FAILED );
}
memset( &callbacks, 0, sizeof(callbacks) );
callbacks.interface_change = sm_failover_interface_change_callback;
error = sm_hw_initialize( &callbacks );
if( SM_OKAY != error )
{
DPRINTFE( "Failed to initialize hardware module, error=%s.",
sm_error_str( error ) );
return( error );
}
return( SM_OKAY );
}
// ****************************************************************************
// ****************************************************************************
// Failover Thread - Finalize Thread
// ==========================================
SmErrorT sm_failover_thread_finalize_thread( void )
{
SmErrorT error;
error = sm_timer_finalize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to finialize timer module, error=%s.",
sm_error_str( error ) );
}
error = sm_selobj_finalize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to finialize selection object module, error=%s.",
sm_error_str( error ) );
}
return( SM_OKAY );
}
// ****************************************************************************
// ****************************************************************************
// Failover Thread - Main
// ===============================
static void* sm_failover_thread_main( void* arguments )
{
SmErrorT error;
pthread_setname_np( pthread_self(), sm_failover_thread_name );
sm_debug_set_thread_info();
sm_trap_set_thread_info();
DPRINTFI( "Starting" );
error = sm_failover_thread_initialize_thread();
if( SM_OKAY != error )
{
DPRINTFE( "Failover thread initialize failed, error=%s.",
sm_error_str( error ) );
pthread_exit( NULL );
}
DPRINTFI( "Started." );
while( _stay_on )
{
error = sm_selobj_dispatch( sm_failover_tick_interval_in_ms );
if( SM_OKAY != error )
{
DPRINTFE( "Selection object dispatch failed, error=%s.",
sm_error_str(error) );
break;
}
sm_failover_action();
}
DPRINTFI( "Shutting down." );
error = sm_failover_thread_finalize_thread();
if( SM_OKAY != error )
{
DPRINTFE( "Failover finalize thread failed, error=%s.",
sm_error_str( error ) );
}
DPRINTFI( "Shutdown complete." );
return( NULL );
}
// ****************************************************************************
// ****************************************************************************
// Failover Thread - Start
// ================================
SmErrorT sm_failover_thread_start( void )
{
int result;
_stay_on = 1;
_thread_created = false;
result = pthread_create( &_failover_thread, NULL,
sm_failover_thread_main, NULL );
if( 0 != result )
{
DPRINTFE( "Failed to start failover thread, error=%s.",
strerror(result) );
return( SM_FAILED );
}
_thread_created = true;
return( SM_OKAY );
}
// ****************************************************************************
// ****************************************************************************
// Failover Thread - Stop
// ===============================
SmErrorT sm_failover_thread_stop( void )
{
_stay_on = 0;
if( _thread_created )
{
long ms_expired;
SmTimeT time_prev;
int result;
sm_time_get( &time_prev );
while( true )
{
result = pthread_tryjoin_np( _failover_thread, NULL );
if(( 0 != result )&&( EBUSY != result ))
{
if(( ESRCH != result )&&( EINVAL != result ))
{
DPRINTFE( "Failed to wait for failover thread "
"exit, sending kill signal, error=%s.",
strerror(result) );
pthread_kill( _failover_thread, SIGKILL );
}
break;
}
ms_expired = sm_time_get_elapsed_ms( &time_prev );
if( 12000 <= ms_expired )
{
DPRINTFE( "Failed to stop failover thread, sending "
"kill signal." );
pthread_kill( _failover_thread, SIGKILL );
break;
}
usleep( 250000 ); // 250 milliseconds.
}
}
_thread_created = false;
return( SM_OKAY );
}
// ****************************************************************************

View File

@ -1,33 +0,0 @@
//
// Copyright (c) 2017 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#ifndef __SM_FAILOVER_THREAD_H__
#define __SM_FAILOVER_THREAD_H__
#include "sm_types.h"
#ifdef __cplusplus
extern "C" {
#endif
// ****************************************************************************
// Failover Thread - Start
// ================================
extern SmErrorT sm_failover_thread_start( void );
// ****************************************************************************
// ****************************************************************************
// Failover Thread - Stop
// ===============================
extern SmErrorT sm_failover_thread_stop( void );
// ****************************************************************************
#ifdef __cplusplus
}
#endif
#endif

View File

@ -1,12 +1,16 @@
#include "sm_failover_utils.h"
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include "sm_debug.h"
#include "sm_service_domain_member_table.h"
#include "sm_service_domain_assignment_table.h"
#include "sm_service_domain_table.h"
#define SM_NODE_STAY_FAILED_FILE "/var/run/.sm_stay_fail"
// ****************************************************************************
// Failover utilities - loop all service domain members
// ==================
@ -90,3 +94,48 @@ SmNodeScheduleStateT sm_get_controller_state(
}
// ****************************************************************************
// ****************************************************************************
// Failover Utilities - Set StayFailed
// ==============================
SmErrorT sm_failover_utils_set_stayfailed_flag()
{
int fd = open( SM_NODE_STAY_FAILED_FILE,
O_RDWR | O_CREAT, S_IRUSR | S_IRGRP | S_IROTH);
if( 0 > fd )
{
DPRINTFE( "Failed to create file (%s), error=%s.",
SM_NODE_STAY_FAILED_FILE, strerror(errno) );
return( SM_FAILED );
}
close(fd);
return( SM_OKAY );
}
// ****************************************************************************
// ****************************************************************************
// Failover Utilities - Set StayFailed
// ==============================
SmErrorT sm_failover_utils_reset_stayfailed_flag()
{
unlink( SM_NODE_STAY_FAILED_FILE );
return( SM_OKAY );
}
// ****************************************************************************
// ****************************************************************************
// Failover Utilities - check StayFailed
// ==============================
bool sm_failover_utils_is_stayfailed()
{
if( 0 == access( SM_NODE_STAY_FAILED_FILE, F_OK ) )
{
return true;
}
return false;
}
// ****************************************************************************

View File

@ -15,7 +15,25 @@ extern "C" {
// ****************************************************************************
// Failover Utilities - get node controller state
// ==============================
SmNodeScheduleStateT sm_get_controller_state(const char node_name[]);
extern SmNodeScheduleStateT sm_get_controller_state(const char node_name[]);
// ****************************************************************************
// ****************************************************************************
// failover Utilities - Set StayFailed
// ==============================
extern SmErrorT sm_failover_utils_set_stayfailed_flag();
// ****************************************************************************
// ****************************************************************************
// Node Utilities - Set StayFailed
// ==============================
extern SmErrorT sm_failover_utils_reset_stayfailed_flag();
// ****************************************************************************
// ****************************************************************************
// Node Utilities - is StayFailed set
// ==============================
extern bool sm_failover_utils_is_stayfailed();
// ****************************************************************************
#ifdef __cplusplus

View File

@ -26,6 +26,7 @@
#include "sm_service_api.h"
#include "sm_failover.h"
#include "sm_node_swact_monitor.h"
#include "sm_failover_fsm.h"
#define SM_NODE_AUDIT_TIMER_IN_MS 1000
#define SM_INTERFACE_AUDIT_TIMER_IN_MS 1000
@ -177,6 +178,23 @@ static void sm_main_event_handler_api_node_set_callback( char node_name[],
return;
}
char peer_name[SM_NODE_NAME_MAX_CHAR] = {0};
error = sm_node_api_get_peername(peer_name);
if( SM_OKAY != error )
{
DPRINTFE( "Failed to get peername, error=%s.",
sm_error_str( error ) );
}
if(SM_NODE_SET_ACTION_EVENT == action &&
SM_NODE_ADMIN_STATE_UNLOCKED == admin_state &&
SM_NODE_OPERATIONAL_STATE_ENABLED == oper_state &&
(SM_NODE_AVAIL_STATUS_AVAILABLE == avail_status ||
SM_NODE_AVAIL_STATUS_DEGRADED == avail_status ) &&
0 == strcmp(node_name, peer_name))
{
SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_NODE_ENABLED, NULL);
}
if( SM_NODE_SET_ACTION_SWACT == action )
{
SmNodeSwactMonitor::SwactStart(SM_NODE_STATE_STANDBY);

View File

@ -51,7 +51,6 @@
#include "sm_service_action_table.h"
#include "sm_heartbeat_thread.h"
#include "sm_failover.h"
#include "sm_failover_thread.h"
#include "sm_task_affining_thread.h"
#include "sm_worker_thread.h"
@ -328,6 +327,14 @@ static SmErrorT sm_process_initialize( void )
return( SM_FAILED );
}
error = sm_failover_initialize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to initialize failover handler module, error=%s.",
sm_error_str( error ) );
return( SM_FAILED );
}
error = sm_service_heartbeat_thread_start();
if( SM_OKAY != error )
{
@ -344,22 +351,6 @@ static SmErrorT sm_process_initialize( void )
return( SM_FAILED );
}
error = sm_failover_thread_start();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to start the failover thread, error=%s.",
sm_error_str( error ) );
return( SM_FAILED );
}
error = sm_failover_initialize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to initialize failover handler module, error=%s.",
sm_error_str( error ) );
return( SM_FAILED );
}
// Start a task affining thread for AIO duplex system
if(_is_aio_duplex)
{
@ -394,20 +385,6 @@ static SmErrorT sm_process_finalize( void )
}
}
error = sm_failover_thread_stop();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to stop failover thread, error=%s.",
sm_error_str( error ) );
}
error = sm_failover_finalize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to finalize failover handler module, error=%s.",
sm_error_str( error ) );
}
error = sm_main_event_handler_finalize();
if( SM_OKAY != error )
{
@ -422,6 +399,13 @@ static SmErrorT sm_process_finalize( void )
sm_error_str(error) );
}
error = sm_failover_finalize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to finalize failover handler module, error=%s.",
sm_error_str( error ) );
}
error = sm_service_heartbeat_api_finalize( true );
if( SM_OKAY != error )
{

View File

@ -17,6 +17,7 @@
#include "sm_msg.h"
#include "sm_service_domain_utils.h"
#include "sm_service_domain_fsm.h"
#include "sm_failover_fsm.h"
// ****************************************************************************
// Service Domain Initial State - Hello Timer
@ -122,6 +123,7 @@ SmErrorT sm_service_domain_initial_state_exit( SmServiceDomainT* domain )
sm_heartbeat_enable();
SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_HEARTBEAT_ENABLED, NULL);
return( SM_OKAY );
}
// ****************************************************************************

View File

@ -9,8 +9,33 @@
#include <errno.h>
#include <unistd.h>
#include <time.h>
#include <string.h>
#include "sm_util_types.h"
#include "sm_debug.h"
#include "sm_trap.h"
SmSimpleAction::SmSimpleAction(const char* action_name, SmSimpleActionCallback callback)
{
_callback = callback;
strncpy(_action_name, action_name, sizeof(_action_name));
}
SmSimpleAction::~SmSimpleAction()
{
}
void SmSimpleAction::action()
{
if(NULL != _callback)
{
_callback(*this);
}
}
const char* SmSimpleAction::get_name() const
{
return _action_name;
}
SmWorkerThread SmWorkerThread::_the_worker;
@ -68,6 +93,10 @@ SmWorkerThread::~SmWorkerThread()
// ****************************************************************************
void* SmWorkerThread::thread_helper(SmWorkerThread* workerThread)
{
pthread_setname_np( pthread_self(), "worker" );
sm_debug_set_thread_info();
sm_trap_set_thread_info();
DPRINTFI("worker thread started");
workerThread->thread_run();
return 0;
}
@ -126,20 +155,33 @@ SmErrorT SmWorkerThread::stop()
// ****************************************************************************
// SmWorkerThread::add_action add a regular priority action
// ****************************************************************************
void SmWorkerThread::add_action(SmAction& action)
void SmWorkerThread::add_action(SmAction* action)
{
mutex_holder(&this->_mutex);
this->_regular_queue.push(&action);
this->_regular_queue.push(action);
int res = sem_post(&_sem);
if(0 != res)
{
DPRINTFE("failed to signal semaphore. error %d", errno);
}
DPRINTFI("Action %s is added", action->get_name());
}
// ****************************************************************************
// ****************************************************************************
// SmWorkerThread::add_priority_action add a high priority action
// ****************************************************************************
void SmWorkerThread::add_priority_action(SmAction& action)
void SmWorkerThread::add_priority_action(SmAction* action)
{
mutex_holder(&this->_mutex);
this->_priority_queue.push(&action);
this->_priority_queue.push(action);
int res = sem_post(&_sem);
if(0 != res)
{
DPRINTFE("failed to signal semaphore. error %d", errno);
}
DPRINTFI("Action %s is added to priority queue", action->get_name());
}
// ****************************************************************************
@ -177,7 +219,9 @@ void SmWorkerThread::thread_run()
}
if( NULL != action )
{
DPRINTFI("Executing action %s", action->get_name());
action->action();
DPRINTFI("Executing action %s completed", action->get_name());
}
}else if(ETIMEDOUT != errno)
{

View File

@ -18,10 +18,23 @@ class SmAction
{
public:
virtual void action() = 0;
virtual const char* get_name() const = 0;
};
class SmSimpleAction : public SmAction
{
public:
typedef void(*SmSimpleActionCallback)(SmSimpleAction&);
SmSimpleAction(const char* action_name, SmSimpleActionCallback callback);
virtual ~SmSimpleAction();
void action();
const char* get_name() const;
private:
SmSimpleActionCallback _callback;
char _action_name[32];
};
typedef std::queue<SmAction*> SmActionQueueT;
class SmWorkerThread;
// ****************************************************************************
// SmWorkerThread work thread class
@ -36,13 +49,13 @@ class SmWorkerThread
A normal priority action will be scheduled after all
high priority actions.
*/
void add_action(SmAction& action);
void add_action(SmAction* action);
/*
Add an action to high priority FCFS queue.
A high priority action is nonpreemptive. It will
be scheduled after the current action.
*/
void add_priority_action(SmAction& action);
void add_priority_action(SmAction* action);
// retrieve singleton object

View File

@ -166,6 +166,25 @@ _sm_service_domain_interface_event_mappings[SM_SERVICE_DOMAIN_INTERFACE_EVENT_MA
{ SM_SERVICE_DOMAIN_INTERFACE_EVENT_NOT_IN_USE, "not-in-use" }
};
static SmValueStrMappingT
_sm_failover_event_mappings[SM_FAILOVER_EVENT_MAX] =
{
{SM_FAILOVER_EVENT_HEARTBEAT_ENABLED, "heartbeat-enabled"},
{SM_FAILOVER_EVENT_IF_STATE_CHANGED, "interface-state-changed"},
{SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, "fail-pending-timeout"},
{SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"}
};
static SmValueStrMappingT
_sm_failover_state_mappings[SM_FAILOVER_STATE_MAX] =
{
{SM_FAILOVER_STATE_INITIAL, "initial"},
{SM_FAILOVER_STATE_NORMAL, "normal"},
{SM_FAILOVER_STATE_FAIL_PENDING, "fail-pending"},
{SM_FAILOVER_STATE_FAILED, "failed"},
{SM_FAILOVER_STATE_SURVIVED, "survived"}
};
static SmValueStrMappingT
_sm_service_domain_neighbor_state_mappings[SM_SERVICE_DOMAIN_NEIGHBOR_STATE_MAX] =
{
@ -953,6 +972,28 @@ const char* sm_service_domain_interface_event_str(
}
// ****************************************************************************
// ****************************************************************************
// Types - Failover Event String
// =============================================
const char* sm_failover_event_str( SmFailoverEventT event )
{
return( sm_mapping_get_str( _sm_failover_event_mappings,
SM_FAILOVER_EVENT_MAX,
event ) );
}
// ****************************************************************************
// ****************************************************************************
// Types - Failover State String
// =============================================
const char* sm_failover_state_str( SmFailoverStateT state )
{
return( sm_mapping_get_str( _sm_failover_state_mappings,
SM_FAILOVER_STATE_MAX,
state ) );
}
// ****************************************************************************
// ****************************************************************************
// Types - Service Domain Neighbor State Value
// ===========================================

View File

@ -279,6 +279,23 @@ typedef enum
SM_SERVICE_DOMAIN_EVENT_DATA_MAX
} SmServiceDomainEventDataT;
typedef enum{
SM_FAILOVER_STATE_INITIAL,
SM_FAILOVER_STATE_NORMAL,
SM_FAILOVER_STATE_FAIL_PENDING,
SM_FAILOVER_STATE_FAILED,
SM_FAILOVER_STATE_SURVIVED,
SM_FAILOVER_STATE_MAX
}SmFailoverStateT;
typedef enum{
SM_FAILOVER_EVENT_HEARTBEAT_ENABLED,
SM_FAILOVER_EVENT_IF_STATE_CHANGED,
SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT,
SM_FAILOVER_EVENT_NODE_ENABLED,
SM_FAILOVER_EVENT_MAX
}SmFailoverEventT;
typedef enum
{
SM_SERVICE_DOMAIN_INTERFACE_EVENT_NIL,
@ -931,6 +948,18 @@ extern const char* sm_service_domain_interface_event_str(
SmServiceDomainInterfaceEventT event );
// ****************************************************************************
// ****************************************************************************
// Types - Failover Event String
// =============================================
extern const char* sm_failover_event_str( SmFailoverEventT event );
// ****************************************************************************
// ****************************************************************************
// Types - Failover State String
// =============================================
extern const char* sm_failover_state_str( SmFailoverStateT state );
// ****************************************************************************
// ****************************************************************************
// Types - Service Domain Neighbor State Value
// ===========================================

View File

@ -3,7 +3,7 @@ BEGIN TRANSACTION;
CREATE TABLE NODES ( ID INTEGER PRIMARY KEY AUTOINCREMENT, NAME CHAR(32), ADMINISTRATIVE_STATE CHAR(32), OPERATIONAL_STATE CHAR(32), AVAILABILITY_STATUS CHAR(32), READY_STATE CHAR(32), STATE_UUID CHAR(42));
CREATE TABLE NODE_HISTORY ( ID INTEGER PRIMARY KEY AUTOINCREMENT, NAME CHAR(32), ADMINISTRATIVE_STATE CHAR(32), OPERATIONAL_STATE CHAR(32), AVAILABILITY_STATUS CHAR(32), READY_STATE CHAR(32), STATE_UUID CHAR(42));
CREATE TABLE SERVICE_DOMAIN_INTERFACES ( ID INTEGER PRIMARY KEY AUTOINCREMENT, PROVISIONED CHAR(32), SERVICE_DOMAIN CHAR(32), SERVICE_DOMAIN_INTERFACE CHAR(32), PATH_TYPE CHAR(32), AUTH_TYPE CHAR(32), AUTH_KEY CHAR(128), INTERFACE_NAME CHAR(32), INTERFACE_STATE CHAR(32), NETWORK_TYPE CHAR(32), NETWORK_MULTICAST CHAR(256), NETWORK_ADDRESS CHAR(256), NETWORK_PORT INT, NETWORK_HEARTBEAT_PORT INT, NETWORK_PEER_ADDRESS CHAR(256), NETWORK_PEER_PORT INT, NETWORK_PEER_HEARTBEAT_PORT INT , INTERFACE_CONNECT_TYPE CHAR(32) DEFAULT 'tor');
INSERT INTO "SERVICE_DOMAIN_INTERFACES" VALUES(1,'yes','controller','management-interface','primary','none','','','','','','','','','','','','tor');
INSERT INTO "SERVICE_DOMAIN_INTERFACES" VALUES(1,'yes','controller','management-interface','secondary','none','','','','','','','','','','','','tor');
INSERT INTO "SERVICE_DOMAIN_INTERFACES" VALUES(2,'yes','controller','oam-interface','secondary','hmac-sha512','titanium-server','','','','','','','','','','','tor');
INSERT INTO "SERVICE_DOMAIN_INTERFACES" VALUES(3,'yes','controller','infrastructure-interface','secondary','none','','','','','','','','','','','','tor');
CREATE TABLE SERVICE_DOMAINS ( ID INTEGER PRIMARY KEY AUTOINCREMENT, PROVISIONED CHAR(32), NAME CHAR(32), ORCHESTRATION CHAR(32), DESIGNATION CHAR(32), PREEMPT CHAR(32), PRIORITY INT, HELLO_INTERVAL INT, DEAD_INTERVAL INT, WAIT_INTERVAL INT, EXCHANGE_INTERVAL INT, STATE CHAR(32), SPLIT_BRAIN_RECOVERY CHAR(32), LEADER CHAR(32), GENERATION INT);