From 133da10b0848762cc677fc15128fc9dd1bcdacf8 Mon Sep 17 00:00:00 2001 From: Bin Qian Date: Fri, 26 Oct 2018 12:11:11 -0400 Subject: [PATCH] split-brain avoidance improvement This change enables one way communication via BMC (if configured) through mtce. when 2 controllers lost all communications to each other. The algorithm is: when communications all lost, both active and standby controllers, verify its interfaces (mgmt, infra, and oam) if active controller is healthy, it will request a bmc reset thorugh mtce, against standby controller. if standby controller is healthy, it will active itself and wait a total 45 seconds before requesting a bmc reset through mtce, against the active controller. Changes also include: 1. adding new initial failover state. initial state is a state before the node is enabled 2. remove failover thread. using worker thread action to perform time consuming operations 3. remove entire failover action table Story: 2003577 Task: 24901 Change-Id: I7d294d40e84469df6b6a6f6dd490cf3c4557b711 Signed-off-by: Bin Qian --- service-mgmt/sm-1.0.0/src/Makefile | 2 +- service-mgmt/sm-1.0.0/src/sm_failover.c | 454 ++++-------------- service-mgmt/sm-1.0.0/src/sm_failover.h | 33 ++ .../src/sm_failover_fail_pending_state.cpp | 225 ++++++++- .../sm-1.0.0/src/sm_failover_failed_state.cpp | 72 ++- .../sm-1.0.0/src/sm_failover_failed_state.h | 2 - service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp | 57 ++- service-mgmt/sm-1.0.0/src/sm_failover_fsm.h | 37 +- .../src/sm_failover_initial_state.cpp | 28 ++ .../sm-1.0.0/src/sm_failover_initial_state.h | 20 + .../sm-1.0.0/src/sm_failover_normal_state.cpp | 76 ++- .../sm-1.0.0/src/sm_failover_normal_state.h | 1 - service-mgmt/sm-1.0.0/src/sm_failover_ss.c | 206 +++++++- service-mgmt/sm-1.0.0/src/sm_failover_ss.h | 22 +- .../src/sm_failover_survived_state.cpp | 67 ++- .../sm-1.0.0/src/sm_failover_survived_state.h | 2 - .../sm-1.0.0/src/sm_failover_thread.c | 244 ---------- .../sm-1.0.0/src/sm_failover_thread.h | 33 -- service-mgmt/sm-1.0.0/src/sm_failover_utils.c | 51 +- service-mgmt/sm-1.0.0/src/sm_failover_utils.h | 20 +- .../sm-1.0.0/src/sm_main_event_handler.c | 18 + service-mgmt/sm-1.0.0/src/sm_process.c | 46 +- .../src/sm_service_domain_initial_state.c | 2 + .../sm-1.0.0/src/sm_worker_thread.cpp | 52 +- service-mgmt/sm-1.0.0/src/sm_worker_thread.h | 19 +- service-mgmt/sm-common-1.0.0/src/sm_types.c | 41 ++ service-mgmt/sm-common-1.0.0/src/sm_types.h | 29 ++ .../sm-db-1.0.0/database/create_sm_db.sql | 2 +- 28 files changed, 1031 insertions(+), 830 deletions(-) create mode 100644 service-mgmt/sm-1.0.0/src/sm_failover_initial_state.cpp create mode 100644 service-mgmt/sm-1.0.0/src/sm_failover_initial_state.h delete mode 100644 service-mgmt/sm-1.0.0/src/sm_failover_thread.c delete mode 100644 service-mgmt/sm-1.0.0/src/sm_failover_thread.h diff --git a/service-mgmt/sm-1.0.0/src/Makefile b/service-mgmt/sm-1.0.0/src/Makefile index 0c9c9635..3e86e402 100644 --- a/service-mgmt/sm-1.0.0/src/Makefile +++ b/service-mgmt/sm-1.0.0/src/Makefile @@ -102,12 +102,12 @@ SRCS+=sm_service_heartbeat_thread.c SRCS+=sm_main_event_handler.c SRCS+=fm_api_wrapper.c SRCS+=sm_failover.c -SRCS+=sm_failover_thread.c SRCS+=sm_swact_state.c SRCS+=sm_worker_thread.cpp SRCS+=sm_task_affining_thread.c SRCS+=sm_node_swact_monitor.cpp SRCS+=sm_failover_fsm.cpp +SRCS+=sm_failover_initial_state.cpp SRCS+=sm_failover_normal_state.cpp SRCS+=sm_failover_fail_pending_state.cpp SRCS+=sm_failover_failed_state.cpp diff --git a/service-mgmt/sm-1.0.0/src/sm_failover.c b/service-mgmt/sm-1.0.0/src/sm_failover.c index dc17cdd9..26124fc7 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover.c +++ b/service-mgmt/sm-1.0.0/src/sm_failover.c @@ -37,24 +37,6 @@ #include "sm_failover_fsm.h" #include "sm_api.h" -typedef enum -{ - SM_FAILOVER_ACTION_NO_ACTION = 0, - SM_FAILOVER_ACTION_DISABLE_STANDBY = 1, - SM_FAILOVER_ACTION_DISABLE = 2, - SM_FAILOVER_ACTION_SWACT = 4, - SM_FAILOVER_ACTION_DEGRADE = 8, - SM_FAILOVER_ACTION_ACTIVATE = 16, - SM_FAILOVER_ACTION_FAIL_NODE = 32, - SM_FAILOVER_ACTION_UNDEFINED = 64, - //as part of the gradual delivery of enhancement with more - //complex algorithm to determine the failover survivor routine - //the SM_FAILOVER_ACTION_ROUTINE will redirect the lookup to - //new logic. Until all actions are migrated to new logic, the - //lookup tables will be eliminated. - SM_FAILOVER_ACTION_ROUTINE = 1 << 31, -}SmFailoverActionT; - #define SM_FAILOVER_STATE_TRANSITION_TIME_IN_MS 2000 #define SM_FAILOVER_MULTI_FAILURE_WAIT_TIMER_IN_MS 2000 #define SM_FAILOVER_RECOVERY_INTERVAL_IN_SEC 100 @@ -132,16 +114,6 @@ class SmFailoverInterfaceInfo } }; -typedef enum -{ - SM_FAILOVER_INFRA_DOWN = 1, - SM_FAILOVER_MGMT_DOWN = 2, - SM_FAILOVER_OAM_DOWN = 4, - SM_FAILOVER_HEARTBEAT_ALIVE = 8, - SM_FAILOVER_HELLO_MSG_ALIVE = 16, - SM_FAILOVER_PEER_DISABLED = 0x4000, -}SmFailoverCommFaultBitFlagT; - SmTimerIdT failover_audit_timer_id; static char _host_name[SM_NODE_NAME_MAX_CHAR]; static char _peer_name[SM_NODE_NAME_MAX_CHAR]; @@ -159,13 +131,12 @@ static SmFailoverInterfaceInfo* _oam_interface_info = NULL; static SmFailoverInterfaceInfo* _mgmt_interface_info = NULL; static SmFailoverInterfaceInfo* _infra_interface_info = NULL; static SmFailoverInterfaceInfo _peer_if_list[SM_INTERFACE_MAX]; -static pthread_mutex_t _mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t _mutex; static SmDbHandleT* _sm_db_handle = NULL; static SmNodeScheduleStateT _host_state; static SmNodeScheduleStateT _host_state_at_last_action; // host state when action was taken last time static int64_t _node_comm_state = -1; -static bool _to_disable_peer = false; //set flag for failover thread to send http request to mtce to disable peer static int _prev_if_state_flag = -1; time_t _last_if_state_ms = 0; static SmNodeScheduleStateT _prev_host_state= SM_NODE_STATE_UNKNOWN; @@ -175,194 +146,12 @@ static SmSystemModeT _system_mode; static time_t _last_report_ts = 0; static int _heartbeat_count = 0; -// The class is to accommodate batching i/f state changes -// during the failover_audit interval. -class FailoverEventHolder -{ - public: - FailoverEventHolder(); - void push_event(SmFailoverEventT, const ISmFSMEventData* event_data); - void send_event(); - private: - bool _event_to_send; - SmFailoverEventT _failover_event; - const ISmFSMEventData* _event_data; -}; -FailoverEventHolder::FailoverEventHolder() -{ - _event_to_send = false; -} - -void FailoverEventHolder::push_event(SmFailoverEventT event, const ISmFSMEventData* event_data) -{ - SmFailoverEventT list[] = { - SM_INTERFACE_DOWN, - SM_HEARTBEAT_LOST, - SM_INTERFACE_UP, - SM_HEARTBEAT_RECOVER - }; - - unsigned int i; - if(!_event_to_send) - { - _event_to_send = true; - for(i = 0; i < sizeof(list) / sizeof(SmFailoverEventT); i ++) - { - if(event == list[i]) - { - _failover_event = event; - _event_data = event_data; - _event_to_send = true; - DPRINTFI("Push event %d", event); - return; - } - } - DPRINTFE("Runtime error: unsupported event %d", event); - return; - } - - for(i = 0; list[i] != _failover_event && i < sizeof(list) / sizeof(SmFailoverEventT); i ++) - { - if(list[i] == event) - { - _failover_event = event; - _event_data = event_data; - return; - } - } - DPRINTFE("Runtime error: unsupported event %d", event); -} - -void FailoverEventHolder::send_event() -{ - if(_event_to_send) - { - DPRINTFI("Send event %d", _failover_event); - SmFailoverFSM::get_fsm().send_event(_failover_event, _event_data); - _event_to_send = false; - } -} - - -static FailoverEventHolder _event_holder; - -// std 2+2/2+2+x failover actions -SmFailoverActionPairT action_map_std_infra[16] = -{ - //Active action standby action if_state - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_ACTIVATE}, //0 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //1 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //2 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //3 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_ACTIVATE}, //4 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //5 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //6 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //7 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8 - {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //9 - {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //10 - {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //11 - {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_DEGRADE}, //12 - {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //13 - {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE}, //14 - {SM_FAILOVER_ACTION_ROUTINE, SM_FAILOVER_ACTION_ROUTINE} //15 -}; - -SmFailoverActionPairT action_map_std_no_infra[16] = -{ - //Active action standby action if_state - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_ACTIVATE}, //0 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //1 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //2 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //3 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_ACTIVATE}, //4 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //5 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //6 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //7 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //9 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //10 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 - {SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_NO_ACTION}, //12 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //13 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //14 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15 -}; - -// cpe duplex failover actions -SmFailoverActionPairT *action_map_cpe_infra = action_map_std_infra; -SmFailoverActionPairT *action_map_cpe_no_infra = action_map_std_no_infra; - -// cpe duplex-direct failover actions -SmFailoverActionPairT action_map_cpe_dc_infra[16] = -{ - //Active action standby action if_state - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //0 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //1 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //2 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //3 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //4 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //5 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //6 - {SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_FAIL_NODE}, //7 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //9 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //10 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //11 - {SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_DEGRADE}, //12 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //13 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //14 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15 -}; - -SmFailoverActionPairT action_map_cpe_dc_no_infra[16] = -{ - //Active action standby action if_state - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //0 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //1 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //2 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //3 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_ACTIVATE}, //4 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //5 - {SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_ACTIVATE}, //6 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //7 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //8 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //9 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_NO_ACTION}, //10 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 - {SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_DEGRADE}, //12 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //13 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //14 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED} //15 -}; - -SmFailoverActionPairT action_map_simplex[16] = -{ - //Active action standby action - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION} -}; -SmFailoverActionPairT *action_maps_no_infra[SM_SYSTEM_MODE_MAX] = {0}; -SmFailoverActionPairT *action_maps_infra[SM_SYSTEM_MODE_MAX] = {0}; +static bool _if_state_changed = false; SmErrorT sm_exec_json_command(const char* cmd, char result_buf[], int result_len); SmErrorT sm_failover_get_node_oper_state(char* node_name, SmNodeOperationalStateT *state); -void _log_nodes_state(int action); +void _log_nodes_state(); // **************************************************************************** // Failover - interface check @@ -501,7 +290,7 @@ void sm_failover_lost_heartbeat( SmFailoverInterfaceT* interface ) DPRINTFI("Interface %s lose heartbeat.", interface->interface_name); } - _event_holder.push_event(SM_HEARTBEAT_LOST, NULL); + _if_state_changed = true; } // **************************************************************************** @@ -534,7 +323,7 @@ void sm_failover_heartbeat_restore( SmFailoverInterfaceT* interface ) DPRINTFI("Interface %s heartbeat is OK.", interface->interface_name); } - _event_holder.push_event(SM_HEARTBEAT_RECOVER, NULL); + _if_state_changed = true; } // **************************************************************************** @@ -571,7 +360,7 @@ void sm_failover_interface_down( const char* const interface_name ) DPRINTFI("%d domain interfaces are impacted as i/f %s is down.", impacted, interface_name); - _event_holder.push_event(SM_INTERFACE_DOWN, NULL); + _if_state_changed = true; } } // **************************************************************************** @@ -609,7 +398,7 @@ void sm_failover_interface_up( const char* const interface_name ) DPRINTFI("%d domain interfaces are impacted as i/f %s is up.", impacted, interface_name); - _event_holder.push_event(SM_INTERFACE_UP, NULL); + _if_state_changed = true; } } // **************************************************************************** @@ -685,7 +474,7 @@ bool is_infra_configured() // **************************************************************************** // Failover - get interface state // ================== -int _failover_get_if_state() +int sm_failover_get_if_state() { SmFailoverInterfaceStateT mgmt_state = _mgmt_interface_info->get_state(); SmFailoverInterfaceStateT oam_state = _oam_interface_info->get_state(); @@ -732,7 +521,7 @@ int _failover_get_if_state() SmHeartbeatMsgIfStateT sm_failover_if_state_get() { mutex_holder holder(&_mutex); - int if_state_flag = _failover_get_if_state(); + int if_state_flag = sm_failover_get_if_state(); return (if_state_flag & 0b0111); //the lower 3 bits i/f state flag } // **************************************************************************** @@ -760,7 +549,7 @@ SmNodeScheduleStateT get_controller_state() // **************************************************************************** // Failover - is active controller // ================== -bool is_active_controller() +bool sm_is_active_controller() { return SM_NODE_STATE_ACTIVE == _host_state; } @@ -846,8 +635,10 @@ SmFailoverActionResultT sm_failover_disable_node(char* node_name) DPRINTFI("To disable %s", node_name); SmErrorT error; + char reason_text[SM_LOG_REASON_TEXT_MAX_CHAR]; + snprintf(reason_text, sizeof(reason_text), "%s has failed", node_name); error = sm_node_fsm_event_handler( - node_name, SM_NODE_EVENT_DISABLED, NULL, "Host is isolated" ); + node_name, SM_NODE_EVENT_DISABLED, NULL, reason_text ); if( SM_OKAY != error ) { @@ -855,7 +646,6 @@ SmFailoverActionResultT sm_failover_disable_node(char* node_name) node_name, sm_error_str( error ) ); return SM_FAILOVER_ACTION_RESULT_FAILED; } - _to_disable_peer = true; return SM_FAILOVER_ACTION_RESULT_OK; } // **************************************************************************** @@ -876,13 +666,13 @@ static void active_self_callback(void* user_data[], SmServiceDomainT* domain) // **************************************************************************** // Failover - active self // ======================= -SmFailoverActionResultT sm_failover_activate_self() +SmErrorT sm_failover_activate_self() { DPRINTFI("Uncontrolled swact start (active local only)"); SmNodeSwactMonitor::SwactStart(SM_NODE_STATE_ACTIVE); sm_service_domain_table_foreach( NULL, active_self_callback); - return SM_FAILOVER_ACTION_RESULT_OK; + return SM_OKAY; } // **************************************************************************** @@ -1022,8 +812,7 @@ SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status) if(SM_NODE_STATE_STANDBY == _host_state && SM_NODE_STATE_FAILED == peer_target_state) { - result = sm_failover_activate_self(); - if(SM_FAILOVER_ACTION_RESULT_FAILED == result) + if(SM_OKAY != sm_failover_activate_self()) { DPRINTFE("Failed to activate %s.", _host_name); return SM_FAILED; @@ -1097,7 +886,6 @@ void sm_failover_audit() _prev_host_state = _host_state; } - bool is_active = is_active_controller(); bool in_transition = false; bool infra_configured = is_infra_configured(); in_transition = in_transition && @@ -1116,20 +904,7 @@ void sm_failover_audit() return; } - SmFailoverActionPairT* actions; - int action; - - SmFailoverActionPairT *action_map = NULL; - - SmFailoverActionPairT **action_map_mode_list = action_maps_no_infra; - if(infra_configured) - { - action_map_mode_list = action_maps_infra; - } - - action_map = action_map_mode_list[(int) _system_mode]; - - int if_state_flag = _failover_get_if_state(); + int if_state_flag = sm_failover_get_if_state(); if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE) { _heartbeat_count ++; @@ -1176,7 +951,21 @@ void sm_failover_audit() return; } - _event_holder.send_event(); + if(_if_state_changed) + { + SmIFStateChangedEventData event_data; + event_data.set_interface_state(SM_INTERFACE_OAM, _oam_interface_info->get_state()); + event_data.set_interface_state(SM_INTERFACE_MGMT, _mgmt_interface_info->get_state()); + if( NULL != _infra_interface_info) + { + event_data.set_interface_state(SM_INTERFACE_INFRA, _infra_interface_info->get_state()); + }else + { + event_data.set_interface_state(SM_INTERFACE_INFRA, SM_FAILOVER_INTERFACE_UNKNOWN); + } + SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_IF_STATE_CHANGED, &event_data); + _if_state_changed = false; + } int64_t curr_node_state = if_state_flag; @@ -1200,74 +989,7 @@ void sm_failover_audit() _host_state_at_last_action = _host_state; _peer_if_state_at_last_action = _peer_if_state; - actions = &(action_map[if_state_flag & 0xf]); - if(is_active) - { - action = actions->active_controller_action; - if(if_state_flag & SM_FAILOVER_OAM_DOWN) - { - if(_peer_if_state & SM_FAILOVER_OAM_DOWN) - { - DPRINTFI("No swact, oam down on both controllers"); - action &= (~SM_FAILOVER_ACTION_SWACT); - } - else - { - DPRINTFI("Swact to %s, it's oam is UP", _peer_name); - action &= SM_FAILOVER_ACTION_SWACT; - } - } - } - else - { - action = actions->standby_controller_action; - } - - _log_nodes_state(action); - - DPRINTFI("Action to take %d", action); - if (action & SM_FAILOVER_ACTION_ROUTINE) - { - action = SM_FAILOVER_ACTION_NO_ACTION; - } - - if (action & SM_FAILOVER_ACTION_ACTIVATE) - { - DPRINTFI("ACTIVE"); - _retry |= ( SM_FAILOVER_ACTION_RESULT_OK != sm_failover_activate_self()); - } - - if(action & SM_FAILOVER_ACTION_DEGRADE) { - DPRINTFI("DEGRADE"); - _retry |= ( SM_OKAY != sm_failover_degrade(SM_FAILOVER_DEGRADE_SOURCE_IF_DOWN) ); - } - else - { - _retry |= ( SM_OKAY != sm_failover_degrade_clear(SM_FAILOVER_DEGRADE_SOURCE_IF_DOWN) ); - } - - if(action & SM_FAILOVER_ACTION_SWACT) - { - DPRINTFI("SWACT"); - _retry |= ( SM_FAILOVER_ACTION_RESULT_OK != sm_failover_swact() ); - } - - if(action & SM_FAILOVER_ACTION_DISABLE) - { - DPRINTFI("DISABLE"); - _retry |= (SM_FAILOVER_ACTION_RESULT_OK != sm_failover_disable_node(_host_name)); - } - - if(action & SM_FAILOVER_ACTION_FAIL_NODE) - { - DPRINTFI("FAIL SELF"); - _retry |= (SM_FAILOVER_ACTION_RESULT_OK != sm_failover_fail_self()); - } - - if(action & SM_FAILOVER_ACTION_DISABLE_STANDBY) { - DPRINTFI("DISABLE STANDBY"); - _retry |= (SM_FAILOVER_ACTION_RESULT_OK != sm_failover_disable_node(_peer_name)); - } + _log_nodes_state(); } // **************************************************************************** @@ -1340,7 +1062,7 @@ SmErrorT sm_exec_mtce_command(const char cmd[], char result_buf[], int result_le // **************************************************************************** // Failover - log current state // ======================= -void _log_nodes_state(int action) +void _log_nodes_state() { SmDbNodeT host, peer; SmErrorT error = sm_failover_get_node(_host_name, host); @@ -1358,51 +1080,32 @@ void _log_nodes_state(int action) sm_node_avail_status_str(peer.avail_status) ); } - DPRINTFI("Host state %d, I/F state %d, peer I/F state %d, action %d", + DPRINTFI("Host state %d, I/F state %d, peer I/F state %d", _node_comm_state, _peer_if_state, - _host_state, - action + _host_state ); } // **************************************************************************** + // **************************************************************************** -// Failover - failover action +// Failover - disable peer node // ======================= -SmErrorT sm_failover_action() +SmErrorT sm_failover_disable_peer() { - //This runs in failover thread for long action - // so main thread and hello msg are not blocked - bool *flag = NULL; char result_buf[1024]; char cmd_buf[1024]; - bool to_disable_peer; + snprintf(cmd_buf, sizeof(cmd_buf), "curl --header \"Content-Type:application/json\" " + "--header \"Accept: application/json\" --header \"User-Agent: sm/1.0\" " + "--data '{\"action\": \"event\", \"hostname\":\"%s\", \"operational\": \"disabled\", \"availability\": \"failed\"}' " + "\"http://localhost:2112/v1/hosts/%s/\"", _peer_name, _peer_name); + + SmErrorT error = sm_exec_mtce_command(cmd_buf, result_buf, sizeof(result_buf)); + if (SM_OKAY != error ) { - mutex_holder holder(&_mutex); - - to_disable_peer = _to_disable_peer; - } - - if ( to_disable_peer ) - { - flag = &_to_disable_peer; - snprintf(cmd_buf, sizeof(cmd_buf), "curl --header \"Content-Type:application/json\" " - "--header \"Accept: application/json\" --header \"User-Agent: sm/1.0\" " - "--data '{\"action\": \"event\", \"hostname\":\"%s\", \"operational\": \"disabled\", \"availability\": \"failed\"}' " - "\"http://localhost:2112/v1/hosts/%s/\"", _peer_name, _peer_name); - - SmErrorT error = sm_exec_mtce_command(cmd_buf, result_buf, sizeof(result_buf)); - if (SM_OKAY != error ) - { - return SM_FAILED; - } - } - - if( flag ) - { - mutex_holder holder(&_mutex); - * flag = false; + DPRINTFE("Failed to disable peer %s", _peer_name); + return SM_FAILED; } return SM_OKAY; } @@ -1436,6 +1139,26 @@ SmFailoverInterfaceStateT sm_failover_get_interface_info(SmInterfaceTypeT interf } // **************************************************************************** +static void sm_failover_interface_change_callback( + SmHwInterfaceChangeDataT* if_change ) +{ + switch ( if_change->interface_state ) + { + case SM_INTERFACE_STATE_DISABLED: + DPRINTFI("Interface %s is down", if_change->interface_name); + sm_failover_interface_down(if_change->interface_name); + break; + case SM_INTERFACE_STATE_ENABLED: + DPRINTFI("Interface %s is up", if_change->interface_name); + sm_failover_interface_up(if_change->interface_name); + break; + default: + DPRINTFI("Interface %s state changed to %d", + if_change->interface_name, if_change->interface_state); + break; + } +} + // **************************************************************************** // Failover - Initialize // ====================== @@ -1445,6 +1168,28 @@ SmErrorT sm_failover_initialize( void ) bool enabled; SmErrorT error; + pthread_mutexattr_t attr; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + int res = pthread_mutex_init(&_mutex, &attr); + if( 0 != res ) + { + DPRINTFE("Failed to initialize mutex, error %d", res); + return SM_FAILED; + } + + SmHwCallbacksT callbacks; + memset( &callbacks, 0, sizeof(callbacks) ); + callbacks.interface_change = sm_failover_interface_change_callback; + error = sm_hw_initialize( &callbacks ); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to initialize hardware module, error=%s.", + sm_error_str( error ) ); + return( error ); + } + error = SmFailoverFSM::initialize(); if( SM_OKAY != error ) { @@ -1455,16 +1200,6 @@ SmErrorT sm_failover_initialize( void ) _system_mode = sm_node_utils_get_system_mode(); DPRINTFI("System mode %s", sm_system_mode_str(_system_mode)); - action_maps_no_infra[SM_SYSTEM_MODE_STANDARD] = action_map_std_no_infra; - action_maps_no_infra[SM_SYSTEM_MODE_CPE_DUPLEX] = action_map_cpe_no_infra; - action_maps_no_infra[SM_SYSTEM_MODE_CPE_DUPLEX_DC] = action_map_cpe_dc_no_infra; - action_maps_no_infra[SM_SYSTEM_MODE_CPE_SIMPLEX] = action_map_simplex; - - action_maps_infra[SM_SYSTEM_MODE_STANDARD] = action_map_std_infra; - action_maps_infra[SM_SYSTEM_MODE_CPE_DUPLEX] = action_map_cpe_infra; - action_maps_infra[SM_SYSTEM_MODE_CPE_DUPLEX_DC] = action_map_cpe_dc_infra; - action_maps_infra[SM_SYSTEM_MODE_CPE_SIMPLEX] = action_map_simplex; - error = sm_db_connect( SM_DATABASE_NAME, &_sm_db_handle ); if( SM_OKAY != error ) { @@ -1552,7 +1287,7 @@ SmErrorT sm_failover_initialize( void ) } } - error = sm_timer_register( "failover audit", 1000, + error = sm_timer_register( "failover audit", 50, sm_failover_audit_timeout, 0, &failover_audit_timer_id ); @@ -1620,7 +1355,7 @@ static const char* get_map_str( KeyStringMapT mappings[], void dump_host_state(FILE* fp) { const char* state = sm_node_schedule_state_str(_host_state); - fprintf(fp, " host state: %s\n", state); + fprintf(fp, " host state: %s\n", state); } static KeyStringMapT if_state_map[] = { @@ -1656,12 +1391,13 @@ void dump_interfaces_state(FILE* fp) void dump_peer_if_state(FILE* fp) { - fprintf(fp, " Peer Interface state: %d\n", _peer_if_state); + fprintf(fp, " Peer Interface state: %d\n", _peer_if_state); } void dump_failover_fsm_state(FILE* fp) { - fprintf(fp, " Failover FSM state: %d\n", SmFailoverFSM::get_fsm().get_state()); + SmFailoverStateT state = SmFailoverFSM::get_fsm().get_state(); + fprintf(fp, " Failover FSM state: %s\n", sm_failover_state_str(state)); } // **************************************************************************** diff --git a/service-mgmt/sm-1.0.0/src/sm_failover.h b/service-mgmt/sm-1.0.0/src/sm_failover.h index 5266bdbe..698ea77e 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover.h +++ b/service-mgmt/sm-1.0.0/src/sm_failover.h @@ -29,6 +29,16 @@ typedef enum SM_FAILOVER_DEGRADE_SOURCE_IF_DOWN = 2 }SmFailoverDegradeSourceT; +typedef enum +{ + SM_FAILOVER_INFRA_DOWN = 1, + SM_FAILOVER_MGMT_DOWN = 2, + SM_FAILOVER_OAM_DOWN = 4, + SM_FAILOVER_HEARTBEAT_ALIVE = 8, + SM_FAILOVER_HELLO_MSG_ALIVE = 16, + SM_FAILOVER_PEER_DISABLED = 0x4000, +}SmFailoverCommFaultBitFlagT; + // **************************************************************************** // Failover - check // ================== @@ -136,6 +146,29 @@ SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state(); SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status); // **************************************************************************** +// **************************************************************************** +// Failover - active self +// ======================= +SmErrorT sm_failover_activate_self(); +// **************************************************************************** + +// **************************************************************************** +// Failover - disable peer node +// ======================= +SmErrorT sm_failover_disable_peer(); +// **************************************************************************** + +// **************************************************************************** +// Failover - is active controller +// ================== +bool sm_is_active_controller(); +// **************************************************************************** + +// **************************************************************************** +// Failover - get interface state +// ================== +int sm_failover_get_if_state(); +// **************************************************************************** // **************************************************************************** // Failover - dump state diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp index aadd088e..9e7f7711 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp @@ -5,6 +5,7 @@ // #include "sm_failover_fail_pending_state.h" #include +#include #include "sm_types.h" #include "sm_limits.h" #include "sm_debug.h" @@ -14,9 +15,164 @@ #include "sm_failover_utils.h" #include "sm_node_utils.h" #include "sm_node_api.h" +#include "sm_worker_thread.h" static const int FAIL_PENDING_TIMEOUT = 2000; //2000ms +static SmTimerIdT action_timer_id = SM_TIMER_ID_INVALID; +static const int RESET_TIMEOUT = 10 * 1000; // 10 seconds for a reset command to reboot a node +static const int GO_ACTIVE_TIMEOUT = 30 * 1000; // 30 seconds for standby node go active +static const int WAIT_RESET_TIMEOUT = RESET_TIMEOUT + 5 * 1000; // extra 5 seconds for sending reset command + +static void disable_peer(SmSimpleAction&) +{ + SmErrorT error; + int counter = 0; + bool done = false; + DPRINTFI("To send mtce api to disable peer"); + + while(!done && counter < 5) + { + error = sm_failover_disable_peer(); + counter ++; + if(SM_OKAY != error) + { + int wait_interval_us = 1000000; // 1 second + DPRINTFE("Failed to disable peer, reattempt in %d ms", + wait_interval_us / 1000); + usleep(wait_interval_us); // wait 1 second before retry + }else + { + done = true; + } + } +} +static SmSimpleAction _disable_peer_action("disable-peer", disable_peer); + +static bool wait_for_reset_timeout(SmTimerIdT timer_id, int64_t user_data) +{ + //timer never rearm, clear the timer id + action_timer_id = SM_TIMER_ID_INVALID; + sm_failover_utils_reset_stayfailed_flag(); + DPRINTFI("stayfailed flag file is removed"); + SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_SURVIVED); + return false; +} + +static bool reset_standby_timeout(SmTimerIdT timer_id, int64_t user_data) +{ + //timer never rearm, clear the timer id + action_timer_id = SM_TIMER_ID_INVALID; + sm_failover_utils_set_stayfailed_flag(); + DPRINTFI("stayfailed flag file is set"); + const char* timer_name = "confirm standby reset"; + SmErrorT error = sm_timer_register( timer_name, WAIT_RESET_TIMEOUT + WAIT_RESET_TIMEOUT, + wait_for_reset_timeout, + 0, &action_timer_id); + + if(SM_OKAY != error) + { + DPRINTFE("Failed to register confirm standby reset timer"); + } + DPRINTFI("wait for %d secs...", (WAIT_RESET_TIMEOUT + WAIT_RESET_TIMEOUT) / 1000); + return false; +} + +SmErrorT blind_guess_start_active() +{ + SmErrorT error; + SmWorkerThread::get_worker().add_priority_action(&_disable_peer_action); + + const char* timer_name = "reset standby"; + error = sm_timer_register( timer_name, RESET_TIMEOUT, + reset_standby_timeout, + 0, &action_timer_id); + + if(SM_OKAY != error) + { + DPRINTFE("Failed to register reset standby timer"); + return SM_FAILED; + } + + DPRINTFI("wait for %d secs...", RESET_TIMEOUT / 1000); + return SM_OKAY; +} + +static void standby_go_active_failed() +{ + sm_failover_utils_set_stayfailed_flag(); + DPRINTFI("stayfailed flag file is set"); + SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_FAILED); +} + +bool standby_wait_timeout(SmTimerIdT timer_id, int64_t user_data) +{ + //timer never rearm, clear the timer id + action_timer_id = SM_TIMER_ID_INVALID; + sm_failover_utils_reset_stayfailed_flag(); + DPRINTFI("stayfailed flag file is removed"); + char host_name[SM_NODE_NAME_MAX_CHAR]; + SmErrorT error = sm_node_utils_get_hostname(host_name); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to get hostname, error=%s.", + sm_error_str( error ) ); + + standby_go_active_failed(); + return false; + } + + SmNodeScheduleStateT host_state = sm_get_controller_state(host_name); + if(SM_NODE_STATE_ACTIVE != host_state) + { + standby_go_active_failed(); + return false; + } + + SmWorkerThread::get_worker().add_priority_action(&_disable_peer_action); + + SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_SURVIVED); + return false; +} + +static void blind_guess_start_standby() +{ + SmErrorT error; + sm_failover_utils_set_stayfailed_flag(); + DPRINTFI("set stayfailed flag file"); + + sm_failover_activate_self(); + const char* timer_name = "active controller action timeout"; + error = sm_timer_register( timer_name, WAIT_RESET_TIMEOUT + GO_ACTIVE_TIMEOUT, + standby_wait_timeout, + 0, &action_timer_id); + + if(SM_OKAY != error) + { + DPRINTFE("Failed to register blindguess action timer"); + } + DPRINTFI("wait for %d secs", (WAIT_RESET_TIMEOUT + GO_ACTIVE_TIMEOUT) / 1000); +} + +void blind_guess_scenario_start() +{ + SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status(); + SmNodeScheduleStateT host_state = failover_status.get_host_pre_failure_schedule_state(); + SmNodeScheduleStateT peer_state = failover_status.get_peer_pre_failure_schedule_state(); + DPRINTFI("Pre failure host state %s, peer state %s", + sm_node_schedule_state_str(host_state), sm_node_schedule_state_str(peer_state)); + if(SM_NODE_STATE_ACTIVE == host_state) + { + if(SM_NODE_STATE_STANDBY == peer_state) + { + blind_guess_start_active(); + } + }else if(SM_NODE_STATE_STANDBY == host_state) + { + blind_guess_start_standby(); + } +} + SmFailoverFailPendingState::SmFailoverFailPendingState(SmFailoverFSM& fsm) : SmFSMState(fsm) { this->_pending_timer_id = SM_TIMER_ID_INVALID; @@ -30,19 +186,53 @@ SmFailoverFailPendingState::~SmFailoverFailPendingState() SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) { //SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type(); + bool duplex = false; switch (event) { - case SM_HEARTBEAT_LOST: - case SM_INTERFACE_DOWN: - //getting worse, extend wait time - case SM_INTERFACE_UP: - case SM_HEARTBEAT_RECOVER: - // Things are improving, extend wait time + case SM_FAILOVER_EVENT_IF_STATE_CHANGED: + // Situation are changing, extend wait time this->_register_timer(); break; - case SM_FAIL_PENDING_TIMEOUT: + + case SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT: + sm_node_utils_is_aio_duplex(&duplex); + if( duplex && + 0 == (sm_failover_get_if_state() & SM_FAILOVER_HEARTBEAT_ALIVE)) { - SmSystemFailoverStatus failover_status; + SmSystemModeT system_mode = sm_node_utils_get_system_mode(); + SmInterfaceTypeT interfaces_to_check[3] = {SM_INTERFACE_UNKNOWN}; + bool healthy = true; + if(sm_failover_utils_is_stayfailed()) + { + healthy = false; + } + + if(SM_SYSTEM_MODE_CPE_DUPLEX_DC == system_mode) + { + interfaces_to_check[0] = SM_INTERFACE_OAM; + }else + { + interfaces_to_check[0] = SM_INTERFACE_MGMT; + interfaces_to_check[1] = SM_INTERFACE_INFRA; + } + for(int i = 0; interfaces_to_check[i] != SM_INTERFACE_UNKNOWN; i ++) + { + SmFailoverInterfaceStateT if_state = sm_failover_get_interface_info(interfaces_to_check[i]); + if(SM_FAILOVER_INTERFACE_DOWN == if_state) + { + healthy = false; + break; + } + } + + if(healthy) + { + blind_guess_scenario_start(); + } + } + else + { + SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status(); SmErrorT error = sm_failover_ss_get_survivor(failover_status); SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state(); SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state(); @@ -79,7 +269,9 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const } break; default: - DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state()); + DPRINTFE("Runtime error, unexpected event %s, at state %s", + sm_failover_event_str(event), + sm_failover_state_str(this->fsm.get_state())); } return SM_OKAY; } @@ -87,13 +279,13 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const bool SmFailoverFailPendingState::_fail_pending_timeout( SmTimerIdT timer_id, int64_t user_data) { - SmFailoverFSM::get_fsm().send_event(SM_FAIL_PENDING_TIMEOUT, NULL); + SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, NULL); return false; } SmErrorT SmFailoverFailPendingState::enter_state() { - DPRINTFI("Entering Pending state"); + SmFSMState::enter_state(); SmErrorT error = this->_register_timer(); if(SM_OKAY != error) { @@ -146,6 +338,15 @@ SmErrorT SmFailoverFailPendingState::exit_state() { DPRINTFE("Failed to deregister fail pending timer. Error %s", sm_error_str(error)); } - DPRINTFI("Exiting Pending state"); + if(SM_TIMER_ID_INVALID != action_timer_id) + { + error = sm_timer_deregister(action_timer_id); + action_timer_id = SM_TIMER_ID_INVALID; + if( SM_OKAY != error) + { + DPRINTFE("Failed to deregister action timer. Error %s", sm_error_str(error)); + } + } + SmFSMState::exit_state(); return error; } diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp index a6d25d97..577a733a 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp @@ -10,57 +10,47 @@ #include "sm_failover_fsm.h" #include "sm_failover_ss.h" -SmErrorT SmFailoverFailedState::enter_state() +static void _audit_failover_state() { - DPRINTFI("Entering Failed state"); - return SM_OKAY; + SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status(); + SmErrorT error = sm_failover_ss_get_survivor(failover_status); + SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state(); + SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state(); + if(SM_OKAY != error) + { + DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error)); + return; + } + + if(SM_NODE_STATE_FAILED == host_state) + { + // don't need to set to failed state, already here + } + else if(SM_NODE_STATE_STANDBY == host_state && SM_NODE_STATE_ACTIVE == peer_state) + { + // standby is the only possible state to be scheduled to from failed state + SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL); + }else + { + DPRINTFE("Runtime error: unexpected scheduling state: %s", + sm_node_schedule_state_str(host_state)); + } } SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) { switch (event) { - case SM_HEARTBEAT_LOST: - case SM_INTERFACE_DOWN: - case SM_INTERFACE_UP: - // ignore, recover is possible only when heartbeat is recovered - break; - case SM_HEARTBEAT_RECOVER: - { - SmSystemFailoverStatus failover_status; - SmErrorT error = sm_failover_ss_get_survivor(failover_status); - SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state(); - SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state(); - if(SM_OKAY != error) - { - DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error)); - return error; - } - - if(SM_NODE_STATE_FAILED == host_state) - { - // don't need to set to failed state, already here - } - else if(SM_NODE_STATE_STANDBY == host_state && SM_NODE_STATE_ACTIVE == peer_state) - { - // standby is the only possible state to be scheduled to from failed state - this->fsm.set_state(SM_FAILOVER_STATE_NORMAL); - }else - { - DPRINTFE("Runtime error: unexpected scheduling state: %s", - sm_node_schedule_state_str(host_state)); - } - } + case SM_FAILOVER_EVENT_IF_STATE_CHANGED: + DPRINTFI("sm_heartbeat_recover event received."); + _audit_failover_state(); break; + default: - DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state()); + DPRINTFE("Runtime error, unexpected event %s, at state %s", + sm_failover_event_str(event), + sm_failover_state_str(this->fsm.get_state())); } return SM_OKAY; } - -SmErrorT SmFailoverFailedState::exit_state() -{ - DPRINTFI("Exiting Failed state"); - return SM_OKAY; -} diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.h b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.h index e0991f47..ae501167 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.h +++ b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.h @@ -13,8 +13,6 @@ class SmFailoverFailedState : public SmFSMState { public: SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm){} - SmErrorT enter_state(); - SmErrorT exit_state(); protected: SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data); diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp index 10e098c5..d0f3cbaa 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp @@ -6,20 +6,66 @@ #include "sm_failover_fsm.h" #include #include "sm_debug.h" +#include "sm_failover_initial_state.h" #include "sm_failover_normal_state.h" #include "sm_failover_fail_pending_state.h" #include "sm_failover_failed_state.h" #include "sm_failover_survived_state.h" +SmFSMEventDataTypeT SmIFStateChangedEventData::get_event_data_type() const +{ + return SmIFStateChangedEventDataType; +} + + +SmIFStateChangedEventData::SmIFStateChangedEventData() +{ +} + +void SmIFStateChangedEventData::set_interface_state( + SmInterfaceTypeT interface_type, SmFailoverInterfaceStateT interface_state) +{ + switch (interface_type) + { + case SM_INTERFACE_OAM: + _oam_state = interface_state; + break; + case SM_INTERFACE_MGMT: + _mgmt_state = interface_state; + break; + case SM_INTERFACE_INFRA: + _infra_state = interface_state; + break; + default: + DPRINTFE("Runtime error: invalid interface type %d", interface_type); + } +} +SmFailoverInterfaceStateT SmIFStateChangedEventData::get_interface_state(SmInterfaceTypeT interface_type) const +{ + switch (interface_type) + { + case SM_INTERFACE_OAM: + return _oam_state; + case SM_INTERFACE_MGMT: + return _mgmt_state; + case SM_INTERFACE_INFRA: + return _infra_state; + default: + DPRINTFE("Runtime error: invalid interface type %d", interface_type); + return SM_FAILOVER_INTERFACE_UNKNOWN; + } + +} + SmErrorT SmFSMState::enter_state() { - // default implementation, nothing to do + DPRINTFI("Entering %s state", sm_failover_state_str(fsm.get_state())); return SM_OKAY; }; SmErrorT SmFSMState::exit_state() { - // default implementation, nothing to do + DPRINTFI("Exiting %s state", sm_failover_state_str(fsm.get_state())); return SM_OKAY; }; @@ -58,7 +104,7 @@ SmErrorT SmFailoverFSM::send_event(SmFailoverEventT event, const ISmFSMEventData DPRINTFE("Runtime error. No handler for state %d", this->get_state()); return SM_FAILED; } - DPRINTFI("send_event %d\n", event); + DPRINTFI("send_event %s\n", sm_failover_event_str(event)); state_handler->event_handler(event, event_data); return SM_OKAY; } @@ -96,6 +142,9 @@ SmErrorT SmFailoverFSM::set_state(SmFailoverStateT state) DPRINTFE("State %d was not registered", state); return SM_FAILED; } + + this->_current_state = state; + error = new_state_handler->enter_state(); if(SM_OKAY != error) { @@ -103,7 +152,6 @@ SmErrorT SmFailoverFSM::set_state(SmFailoverStateT state) return SM_FAILED; } - this->_current_state = state; return SM_OKAY; } @@ -149,6 +197,7 @@ SmErrorT SmFailoverFSM::init_state() SmErrorT SmFailoverFSM::initialize() { SmFailoverFSM& fsm = SmFailoverFSM::get_fsm(); + fsm.register_fsm_state(SM_FAILOVER_STATE_INITIAL, new SmFailoverInitialState(fsm)); fsm.register_fsm_state(SM_FAILOVER_STATE_NORMAL, new SmFailoverNormalState(fsm)); fsm.register_fsm_state(SM_FAILOVER_STATE_FAIL_PENDING, new SmFailoverFailPendingState(fsm)); fsm.register_fsm_state(SM_FAILOVER_STATE_FAILED, new SmFailoverFailedState(fsm)); diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fsm.h b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.h index bd79585b..9a30dd2f 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_fsm.h +++ b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.h @@ -7,24 +7,6 @@ #define __SM_FAILOVER_FSM_H__ #include "sm_types.h" -typedef enum{ - SM_FAILOVER_STATE_NORMAL, - SM_FAILOVER_STATE_FAIL_PENDING, - SM_FAILOVER_STATE_FAILED, - SM_FAILOVER_STATE_SURVIVED, - SM_FAILOVER_STATE_MAX -}SmFailoverStateT; - -#define SM_FAILOVER_STATE_INITIAL SM_FAILOVER_STATE_NORMAL - -typedef enum{ - SM_HEARTBEAT_LOST, - SM_INTERFACE_DOWN, - SM_INTERFACE_UP, - SM_HEARTBEAT_RECOVER, - SM_FAIL_PENDING_TIMEOUT, -}SmFailoverEventT; - typedef int SmFSMEventDataTypeT; class ISmFSMEventData @@ -59,7 +41,7 @@ class SmFailoverFSM SmErrorT register_fsm_state(SmFailoverStateT state, SmFSMState* state_handler); SmErrorT set_state(SmFailoverStateT state); - inline int get_state() const {return this->_current_state;} + inline SmFailoverStateT get_state() const {return this->_current_state;} static SmErrorT initialize(); static SmErrorT finalize(); @@ -70,12 +52,27 @@ class SmFailoverFSM private: static const int MaxState = SM_FAILOVER_STATE_MAX; SmFSMState* _state_handlers[MaxState]; - int _current_state; + SmFailoverStateT _current_state; static SmFailoverFSM _the_fsm; SmErrorT init_state(); void deregister_states(); }; +class SmIFStateChangedEventData: public ISmFSMEventData +{ + public: + static const SmFSMEventDataTypeT SmIFStateChangedEventDataType = 2; + SmFSMEventDataTypeT get_event_data_type() const; + SmIFStateChangedEventData(); + void set_interface_state( + SmInterfaceTypeT interface_type, SmFailoverInterfaceStateT interface_state); + SmFailoverInterfaceStateT get_interface_state(SmInterfaceTypeT interface_type) const; + + private: + SmFailoverInterfaceStateT _mgmt_state; + SmFailoverInterfaceStateT _infra_state; + SmFailoverInterfaceStateT _oam_state; +}; #endif //__SM_FAILOVER_FSM_H__ diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_initial_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_initial_state.cpp new file mode 100644 index 00000000..9c205e36 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_initial_state.cpp @@ -0,0 +1,28 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#include "sm_failover_initial_state.h" +#include "sm_types.h" +#include "sm_debug.h" +#include "sm_failover_fsm.h" + +SmErrorT SmFailoverInitialState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) +{ + switch (event) + { + case SM_FAILOVER_EVENT_HEARTBEAT_ENABLED: + this->fsm.set_state(SM_FAILOVER_STATE_NORMAL); + break; + case SM_FAILOVER_EVENT_IF_STATE_CHANGED: + case SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT: + case SM_FAILOVER_EVENT_NODE_ENABLED: + break; + default: + DPRINTFE("Runtime error, unexpected event %s, at state %s", + sm_failover_event_str(event), + sm_failover_state_str(this->fsm.get_state())); + } + return SM_OKAY; +} diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_initial_state.h b/service-mgmt/sm-1.0.0/src/sm_failover_initial_state.h new file mode 100644 index 00000000..16f5dc7d --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_initial_state.h @@ -0,0 +1,20 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef __SM_FAILOVER_INITIAL_STATE_H__ +#define __SM_FAILOVER_INITIAL_STATE_H__ +#include "sm_types.h" +#include "sm_failover_fsm.h" + +class SmFailoverInitialState : public SmFSMState +{ + public: + SmFailoverInitialState(SmFailoverFSM& fsm) : SmFSMState(fsm){} + protected: + SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data); +}; + + +#endif //__SM_FAILOVER_INITIAL_STATE_H__ \ No newline at end of file diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp index ac8c1bc9..cdef009d 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp @@ -5,38 +5,80 @@ // #include "sm_failover_normal_state.h" #include +#include "sm_limits.h" #include "sm_types.h" #include "sm_debug.h" +#include "sm_node_api.h" +#include "sm_node_utils.h" +#include "sm_failover_utils.h" #include "sm_failover_fsm.h" - -SmErrorT SmFailoverNormalState::enter_state() -{ - DPRINTFI("Entering normal state"); - return SM_OKAY; -} +#include "sm_failover_ss.h" SmErrorT SmFailoverNormalState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) { switch (event) { - case SM_HEARTBEAT_LOST: - case SM_INTERFACE_DOWN: - this->fsm.set_state(SM_FAILOVER_STATE_FAIL_PENDING); - break; - case SM_INTERFACE_UP: - DPRINTFI("Interface is up"); - break; - case SM_HEARTBEAT_RECOVER: - DPRINTFI("Heartbeat is recovered"); + case SM_FAILOVER_EVENT_IF_STATE_CHANGED: + if(NULL != event_data && + SmIFStateChangedEventData::SmIFStateChangedEventDataType == event_data->get_event_data_type()) + { + const SmIFStateChangedEventData* data = (const SmIFStateChangedEventData*) event_data; + + SmFailoverInterfaceStateT oam_state, mgmt_state, infra_state; + oam_state = data->get_interface_state(SM_INTERFACE_OAM); + mgmt_state = data->get_interface_state(SM_INTERFACE_MGMT); + infra_state = data->get_interface_state(SM_INTERFACE_INFRA); + if(oam_state != SM_FAILOVER_INTERFACE_OK || + mgmt_state != SM_FAILOVER_INTERFACE_OK || + (infra_state != SM_FAILOVER_INTERFACE_OK && infra_state != SM_FAILOVER_INTERFACE_UNKNOWN)) + { + this->fsm.set_state(SM_FAILOVER_STATE_FAIL_PENDING); + } + }else + { + DPRINTFE("Runtime error: receive invalid event data type"); + } + break; + default: - DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state()); + DPRINTFE("Runtime error, unexpected event %s, at state %s", + sm_failover_event_str(event), + sm_failover_state_str(this->fsm.get_state())); } return SM_OKAY; } SmErrorT SmFailoverNormalState::exit_state() { - DPRINTFI("Exiting normal state"); + SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status(); + char host_name[SM_NODE_NAME_MAX_CHAR]; + SmErrorT error = sm_node_utils_get_hostname(host_name); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to get hostname, error=%s.", + sm_error_str( error ) ); + failover_status.set_host_pre_failure_schedule_state(SM_NODE_STATE_UNKNOWN); + }else + { + SmNodeScheduleStateT host_state = sm_get_controller_state(host_name); + DPRINTFI("Blind guess pre failure host state %s", sm_node_schedule_state_str(host_state)); + failover_status.set_host_pre_failure_schedule_state(host_state); + } + + char peer_name[SM_NODE_NAME_MAX_CHAR]; + error = sm_node_api_get_peername(peer_name); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to get peername, error=%s.", + sm_error_str( error ) ); + failover_status.set_peer_pre_failure_schedule_state(SM_NODE_STATE_UNKNOWN); + }else + { + SmNodeScheduleStateT peer_state = sm_get_controller_state(peer_name); + failover_status.set_peer_pre_failure_schedule_state(peer_state); + } + + SmFSMState::exit_state(); return SM_OKAY; } diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.h b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.h index dc6aac51..96d1fd9b 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.h +++ b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.h @@ -12,7 +12,6 @@ class SmFailoverNormalState : public SmFSMState { public: SmFailoverNormalState(SmFailoverFSM& fsm) : SmFSMState(fsm){} - SmErrorT enter_state(); SmErrorT exit_state(); protected: SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data); diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_ss.c b/service-mgmt/sm-1.0.0/src/sm_failover_ss.c index a4235d3b..8ed1b41f 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_ss.c +++ b/service-mgmt/sm-1.0.0/src/sm_failover_ss.c @@ -15,6 +15,40 @@ #include "sm_node_api.h" #include "sm_failover.h" +// +SmErrorT _get_survivor_dc(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection); + +// select standby as failed +SmErrorT _fail_standby(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection) +{ + if(SM_NODE_STATE_STANDBY == system_status.host_status.current_schedule_state) + { + selection.set_host_schedule_state(SM_NODE_STATE_FAILED); + selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE); + }else if(SM_NODE_STATE_STANDBY == system_status.peer_status.current_schedule_state) + { + selection.set_peer_schedule_state(SM_NODE_STATE_FAILED); + selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE); + }else + { + DPRINTFE("Runtime error. Unexpected scheduling state: host %s, peer %s (no standby)", + sm_node_schedule_state_str(system_status.host_status.current_schedule_state), + sm_node_schedule_state_str(system_status.peer_status.current_schedule_state)); + return SM_FAILED; + } + return SM_OKAY; +} + +const char SmSystemFailoverStatus::filename[] = "/var/lib/sm/failover.status"; +const char SmSystemFailoverStatus::file_format[] = + "This is a very important system file.\n" + "Any modification is strictly forbidden and will cause serious consequence.\n" + "host_schedule_state=%1d\n" + "peer_schedule_state=%1d\n" + "last_update=%19s\n"; + +SmSystemFailoverStatus SmSystemFailoverStatus::_failover_status; + SmSystemFailoverStatus::SmSystemFailoverStatus() { SmErrorT error; @@ -42,11 +76,17 @@ SmSystemFailoverStatus::~SmSystemFailoverStatus() { } +SmSystemFailoverStatus& SmSystemFailoverStatus::get_status() +{ + return _failover_status; +} + bool SmSystemFailoverStatus::_is_valid_schedule_state(SmNodeScheduleStateT state) { return (SM_NODE_STATE_ACTIVE == state || SM_NODE_STATE_STANDBY == state || - SM_NODE_STATE_FAILED == state); + SM_NODE_STATE_FAILED == state || + SM_NODE_STATE_INIT == state); } void SmSystemFailoverStatus::set_host_schedule_state(SmNodeScheduleStateT state) @@ -63,6 +103,20 @@ void SmSystemFailoverStatus::set_host_schedule_state(SmNodeScheduleStateT state) } } +void SmSystemFailoverStatus::set_host_pre_failure_schedule_state(SmNodeScheduleStateT state) +{ + if(_is_valid_schedule_state(state)) + { + if(_host_pre_failure_schedule_state != state) + { + _host_pre_failure_schedule_state = state; + } + }else + { + DPRINTFE("Runtime error, schedule state unknown %d", state); + } +} + void SmSystemFailoverStatus::set_peer_schedule_state(SmNodeScheduleStateT state) { if(_is_valid_schedule_state(state)) @@ -77,12 +131,65 @@ void SmSystemFailoverStatus::set_peer_schedule_state(SmNodeScheduleStateT state) } } -typedef enum +void SmSystemFailoverStatus::set_peer_pre_failure_schedule_state(SmNodeScheduleStateT state) { - SM_FAILOVER_INFRA_DOWN = 1, - SM_FAILOVER_MGMT_DOWN = 2, - SM_FAILOVER_OAM_DOWN = 4, -}SmFailoverCommFaultBitFlagT; + if(_is_valid_schedule_state(state)) + { + if(_peer_pre_failure_schedule_state != state) + { + _peer_pre_failure_schedule_state = state; + } + }else + { + DPRINTFE("Runtime error, schedule state unknown %d", state); + } +} + +void SmSystemFailoverStatus::serialize() +{ + FILE* f; + time_t last_update; + struct tm* local_time; + time(&last_update); + local_time = localtime(&last_update); + char timestamp[20]; + + + sprintf(timestamp, "%04d-%02d-%02dT%02d:%02d:%02d", local_time->tm_year + 1900, + local_time->tm_mon + 1, local_time->tm_mday, local_time->tm_hour, local_time->tm_min, local_time->tm_sec); + + f = fopen(filename, "w"); + fprintf(f, file_format, _host_schedule_state, _peer_schedule_state, timestamp); + for(int i = 0; i < 72; i ++) + { + fputs(".", f); + } + fclose(f); +} + +void SmSystemFailoverStatus::deserialize() +{ +// FILE* f; +// char timestamp[20]; +// +// _host_schedule_state = _peer_schedule_state = SM_NODE_STATE_UNKNOWN; +// int host_state, peer_state; +// f = fopen(filename, "r"); +// if(NULL != f) +// { +// DPRINTFI("Loading schedule state from %s", filename); +// int cnt = fscanf(f, file_format, &host_state, &peer_state, timestamp); +// fclose(f); +// if(cnt != 3) +// { +// DPRINTFE("Runtime error, %s has been modified.", filename); +// }else +// { +// set_host_schedule_state((SmNodeScheduleStateT)host_state); +// set_peer_schedule_state((SmNodeScheduleStateT)peer_state); +// } +// } +} // **************************************************************************** // sm_failover_ss get_node_if_healthy_score - get node interface healthy score @@ -98,7 +205,7 @@ static int get_node_if_healthy_score(unsigned int interface_state) { healthy_score -= 2; } - if(interface_state & SM_FAILOVER_INFRA_DOWN) + if(interface_state & SM_FAILOVER_MGMT_DOWN) { healthy_score -= 4; } @@ -152,7 +259,6 @@ SmErrorT _get_system_status(SmSystemStatusT& sys_status, char host_name[], char sys_status.peer_status.current_schedule_state = sm_get_controller_state(peer_name); return SM_OKAY; } - // **************************************************************************** // sm_failover_ss_get_survivor - select the failover survivor // This is the main entry/container for the failover logic to determine how @@ -175,10 +281,17 @@ SmErrorT sm_failover_ss_get_survivor(SmSystemFailoverStatus& selection) SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection) { + DPRINTFI("get survivor %s %s, %s %s", + system_status.host_status.node_name, + sm_node_schedule_state_str(system_status.host_status.current_schedule_state), + system_status.peer_status.node_name, + sm_node_schedule_state_str(system_status.peer_status.current_schedule_state)); + selection.set_host_schedule_state(system_status.host_status.current_schedule_state); selection.set_peer_schedule_state(system_status.peer_status.current_schedule_state); if(SM_HEARTBEAT_OK == system_status.heartbeat_state) { + DPRINTFI("Heartbeat alive"); int host_healthy_score, peer_healthy_score; host_healthy_score = get_node_if_healthy_score(system_status.host_status.interface_state); peer_healthy_score = get_node_if_healthy_score(system_status.peer_status.interface_state); @@ -187,16 +300,52 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys //host is more healthy selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE); selection.set_peer_schedule_state(SM_NODE_STATE_STANDBY); + if(system_status.peer_status.interface_state & SM_FAILOVER_MGMT_DOWN) + { + DPRINTFI("Disable peer, host go active"); + selection.set_peer_schedule_state(SM_NODE_STATE_FAILED); + } }else if(peer_healthy_score > host_healthy_score) { //peer is more healthy selection.set_host_schedule_state(SM_NODE_STATE_STANDBY); selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE); + if(system_status.host_status.interface_state & SM_FAILOVER_MGMT_DOWN) + { + DPRINTFI("Disable host, peer go active"); + selection.set_host_schedule_state(SM_NODE_STATE_FAILED); + } } + }else + { + DPRINTFI("Loss of heartbeat ALL"); + selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE); + selection.set_peer_schedule_state(SM_NODE_STATE_FAILED); } - if(system_status.host_status.current_schedule_state != selection.get_host_schedule_state() || - system_status.peer_status.current_schedule_state != selection.get_peer_schedule_state() ) + if(SM_SYSTEM_MODE_CPE_DUPLEX == system_status.system_mode) + { + } + + if(SM_SYSTEM_MODE_CPE_DUPLEX_DC == system_status.system_mode) + { + return _get_survivor_dc(system_status, selection); + } + + SmNodeScheduleStateT host_schedule_state, peer_schedule_state; + host_schedule_state = selection.get_host_schedule_state(); + peer_schedule_state = selection.get_peer_schedule_state(); + DPRINTFI("Host from %s to %s, Peer from %s to %s.", + sm_node_schedule_state_str(system_status.host_status.current_schedule_state), + sm_node_schedule_state_str(host_schedule_state), + sm_node_schedule_state_str(system_status.peer_status.current_schedule_state), + sm_node_schedule_state_str(peer_schedule_state) + ); + + if((system_status.host_status.current_schedule_state == SM_NODE_STATE_ACTIVE && + host_schedule_state != SM_NODE_STATE_ACTIVE) || + (system_status.peer_status.current_schedule_state == SM_NODE_STATE_ACTIVE && + peer_schedule_state != SM_NODE_STATE_ACTIVE)) { DPRINTFI("Uncontrolled swact starts. Host from %s to %s, Peer from %s to %s.", sm_node_schedule_state_str(system_status.host_status.current_schedule_state), @@ -205,6 +354,43 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys sm_node_schedule_state_str(selection.get_peer_schedule_state()) ); } + + selection.serialize(); return SM_OKAY; } // **************************************************************************** + + +// **************************************************************************** +// Direct connected +SmErrorT _get_survivor_dc(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection) +{ + if(SM_SYSTEM_MODE_CPE_DUPLEX_DC != system_status.system_mode) + { + DPRINTFE("Runtime error, not the right system mode %d", system_status.system_mode); + return SM_FAILED; + } + + if(SM_HEARTBEAT_LOSS == system_status.heartbeat_state) + { + if(system_status.host_status.mgmt_state == SM_FAILOVER_INTERFACE_DOWN && + (system_status.host_status.infra_state == SM_FAILOVER_INTERFACE_DOWN || + system_status.host_status.infra_state == SM_FAILOVER_INTERFACE_UNKNOWN)) + { + if(SM_FAILOVER_INTERFACE_DOWN == system_status.host_status.oam_state) + { + selection.set_host_schedule_state(SM_NODE_STATE_FAILED); + selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE); + }else + { + selection.set_peer_schedule_state(SM_NODE_STATE_FAILED); + selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE); + } + }else + { + selection.set_peer_schedule_state(SM_NODE_STATE_FAILED); + selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE); + } + } + return SM_OKAY; +} diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_ss.h b/service-mgmt/sm-1.0.0/src/sm_failover_ss.h index 99e48a1b..5a4609cb 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_ss.h +++ b/service-mgmt/sm-1.0.0/src/sm_failover_ss.h @@ -8,9 +8,6 @@ #define __SM_FAILOVER_SS_H__ #include #include "sm_types.h" -//#ifdef __cplusplus -//extern "C" { -//#endif typedef struct { @@ -47,23 +44,37 @@ typedef struct class SmSystemFailoverStatus { public: - SmSystemFailoverStatus(); virtual ~SmSystemFailoverStatus(); inline SmNodeScheduleStateT get_host_schedule_state() const { return _host_schedule_state; } + inline SmNodeScheduleStateT get_host_pre_failure_schedule_state() const { + return _host_pre_failure_schedule_state; + } void set_host_schedule_state(SmNodeScheduleStateT state); + void set_host_pre_failure_schedule_state(SmNodeScheduleStateT state); inline SmNodeScheduleStateT get_peer_schedule_state() const { return _peer_schedule_state; } + inline SmNodeScheduleStateT get_peer_pre_failure_schedule_state() const { + return _peer_pre_failure_schedule_state; + } void set_peer_schedule_state(SmNodeScheduleStateT state); + void set_peer_pre_failure_schedule_state(SmNodeScheduleStateT state); + void serialize(); + void deserialize(); + static SmSystemFailoverStatus& get_status(); private: + SmSystemFailoverStatus(); + SmNodeScheduleStateT _host_pre_failure_schedule_state; + SmNodeScheduleStateT _peer_pre_failure_schedule_state; SmNodeScheduleStateT _host_schedule_state; SmNodeScheduleStateT _peer_schedule_state; static const char filename[]; static const char file_format[]; bool _is_valid_schedule_state(SmNodeScheduleStateT state); + static SmSystemFailoverStatus _failover_status; }; // **************************************************************************** @@ -73,7 +84,4 @@ SmErrorT sm_failover_ss_get_survivor(SmSystemFailoverStatus& selection); SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection); -//#ifdef __cplusplus -//} -//#endif #endif // __SM_FAILOVER_SS_H__ diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp index dec43433..d0e3b88a 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp @@ -10,53 +10,48 @@ #include "sm_failover_fsm.h" #include "sm_failover_ss.h" -SmErrorT SmFailoverSurvivedState::enter_state() +static void _audit_failover_state() { - DPRINTFI("Entering Survived state"); - return SM_OKAY; + SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status(); + SmErrorT error = sm_failover_ss_get_survivor(failover_status); + SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state(); + SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state(); + if(SM_OKAY != error) + { + DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error)); + return; + } + + if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_FAILED == peer_state) + { + // nothing changed yet + } + else if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state) + { + // Active is the only possible state to be scheduled to from survived state + SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL); + }else + { + DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s", + sm_node_schedule_state_str(host_state), + sm_node_schedule_state_str(peer_state)); + } } SmErrorT SmFailoverSurvivedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) { switch (event) { - case SM_HEARTBEAT_LOST: - case SM_INTERFACE_DOWN: - case SM_INTERFACE_UP: - // ignore, recover is possible only when heartbeat is recovered + case SM_FAILOVER_EVENT_IF_STATE_CHANGED: break; - case SM_HEARTBEAT_RECOVER: - { - SmSystemFailoverStatus failover_status; - SmErrorT error = sm_failover_ss_get_survivor(failover_status); - SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state(); - SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state(); - if(SM_OKAY != error) - { - DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error)); - return error; - } - - if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state) - { - // Active is the only possible state to be scheduled to from survived state - this->fsm.set_state(SM_FAILOVER_STATE_NORMAL); - }else - { - DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s", - sm_node_schedule_state_str(host_state), - sm_node_schedule_state_str(peer_state)); - } - } + case SM_FAILOVER_EVENT_NODE_ENABLED: + _audit_failover_state(); break; default: - DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state()); + DPRINTFE("Runtime error, unexpected event %s, at state %s", + sm_failover_event_str(event), + sm_failover_state_str(this->fsm.get_state())); } return SM_OKAY; } -SmErrorT SmFailoverSurvivedState::exit_state() -{ - DPRINTFI("Exiting Survived state"); - return SM_OKAY; -} diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.h b/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.h index 7f58b505..4f35d226 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.h +++ b/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.h @@ -12,8 +12,6 @@ class SmFailoverSurvivedState : public SmFSMState { public: SmFailoverSurvivedState(SmFailoverFSM& fsm) : SmFSMState(fsm){} - SmErrorT enter_state(); - SmErrorT exit_state(); protected: SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data); diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_thread.c b/service-mgmt/sm-1.0.0/src/sm_failover_thread.c deleted file mode 100644 index 69715fd6..00000000 --- a/service-mgmt/sm-1.0.0/src/sm_failover_thread.c +++ /dev/null @@ -1,244 +0,0 @@ -// -// Copyright (c) 2017 Wind River Systems, Inc. -// -// SPDX-License-Identifier: Apache-2.0 -// -#include "sm_failover_thread.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sm_failover.h" -#include "sm_types.h" -#include "sm_time.h" -#include "sm_debug.h" -#include "sm_trap.h" -#include "sm_selobj.h" -#include "sm_thread_health.h" -#include "sm_hw.h" -#include "sm_timer.h" - -#define SM_FAILOVER_THREAD_TICK_INTERVAL_IN_MS 1000 - -static const char sm_failover_thread_name[] = "sm_failover"; -static const int sm_failover_tick_interval_in_ms = 1000; - -static sig_atomic_t _stay_on = 1; -static bool _thread_created = false; -static pthread_t _failover_thread; - -static void sm_failover_interface_change_callback( - SmHwInterfaceChangeDataT* if_change ) -{ - switch ( if_change->interface_state ) - { - case SM_INTERFACE_STATE_DISABLED: - DPRINTFI("Interface %s is down", if_change->interface_name); - sm_failover_interface_down(if_change->interface_name); - break; - case SM_INTERFACE_STATE_ENABLED: - DPRINTFI("Interface %s is up", if_change->interface_name); - sm_failover_interface_up(if_change->interface_name); - break; - default: - DPRINTFI("Interface %s state changed to %d", - if_change->interface_name, if_change->interface_state); - break; - } -} - -// **************************************************************************** -// Failover Thread - Initialize Thread -// ============================================ -SmErrorT sm_failover_thread_initialize_thread( void ) -{ - SmErrorT error; - SmHwCallbacksT callbacks; - - error = sm_selobj_initialize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to initialize selection object module, error=%s.", - sm_error_str( error ) ); - return( SM_FAILED ); - } - - error = sm_timer_initialize( SM_FAILOVER_THREAD_TICK_INTERVAL_IN_MS ); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to initialize timer module, error=%s.", - sm_error_str( error ) ); - return( SM_FAILED ); - } - - memset( &callbacks, 0, sizeof(callbacks) ); - callbacks.interface_change = sm_failover_interface_change_callback; - error = sm_hw_initialize( &callbacks ); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to initialize hardware module, error=%s.", - sm_error_str( error ) ); - return( error ); - } - - return( SM_OKAY ); -} -// **************************************************************************** - - -// **************************************************************************** -// Failover Thread - Finalize Thread -// ========================================== -SmErrorT sm_failover_thread_finalize_thread( void ) -{ - SmErrorT error; - - error = sm_timer_finalize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to finialize timer module, error=%s.", - sm_error_str( error ) ); - } - - error = sm_selobj_finalize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to finialize selection object module, error=%s.", - sm_error_str( error ) ); - } - - return( SM_OKAY ); -} -// **************************************************************************** - - -// **************************************************************************** -// Failover Thread - Main -// =============================== -static void* sm_failover_thread_main( void* arguments ) -{ - SmErrorT error; - - pthread_setname_np( pthread_self(), sm_failover_thread_name ); - sm_debug_set_thread_info(); - sm_trap_set_thread_info(); - - DPRINTFI( "Starting" ); - - error = sm_failover_thread_initialize_thread(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failover thread initialize failed, error=%s.", - sm_error_str( error ) ); - pthread_exit( NULL ); - } - - DPRINTFI( "Started." ); - - while( _stay_on ) - { - error = sm_selobj_dispatch( sm_failover_tick_interval_in_ms ); - if( SM_OKAY != error ) - { - DPRINTFE( "Selection object dispatch failed, error=%s.", - sm_error_str(error) ); - break; - } - - sm_failover_action(); - } - - DPRINTFI( "Shutting down." ); - - error = sm_failover_thread_finalize_thread(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failover finalize thread failed, error=%s.", - sm_error_str( error ) ); - } - - DPRINTFI( "Shutdown complete." ); - - return( NULL ); -} -// **************************************************************************** - -// **************************************************************************** -// Failover Thread - Start -// ================================ -SmErrorT sm_failover_thread_start( void ) -{ - int result; - - _stay_on = 1; - _thread_created = false; - - result = pthread_create( &_failover_thread, NULL, - sm_failover_thread_main, NULL ); - if( 0 != result ) - { - DPRINTFE( "Failed to start failover thread, error=%s.", - strerror(result) ); - return( SM_FAILED ); - } - - _thread_created = true; - - return( SM_OKAY ); -} -// **************************************************************************** - -// **************************************************************************** -// Failover Thread - Stop -// =============================== -SmErrorT sm_failover_thread_stop( void ) -{ - _stay_on = 0; - - if( _thread_created ) - { - long ms_expired; - SmTimeT time_prev; - int result; - - sm_time_get( &time_prev ); - - while( true ) - { - result = pthread_tryjoin_np( _failover_thread, NULL ); - if(( 0 != result )&&( EBUSY != result )) - { - if(( ESRCH != result )&&( EINVAL != result )) - { - DPRINTFE( "Failed to wait for failover thread " - "exit, sending kill signal, error=%s.", - strerror(result) ); - pthread_kill( _failover_thread, SIGKILL ); - } - break; - } - - ms_expired = sm_time_get_elapsed_ms( &time_prev ); - if( 12000 <= ms_expired ) - { - DPRINTFE( "Failed to stop failover thread, sending " - "kill signal." ); - pthread_kill( _failover_thread, SIGKILL ); - break; - } - - usleep( 250000 ); // 250 milliseconds. - } - } - - _thread_created = false; - - return( SM_OKAY ); -} -// **************************************************************************** \ No newline at end of file diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_thread.h b/service-mgmt/sm-1.0.0/src/sm_failover_thread.h deleted file mode 100644 index 6bf5b457..00000000 --- a/service-mgmt/sm-1.0.0/src/sm_failover_thread.h +++ /dev/null @@ -1,33 +0,0 @@ -// -// Copyright (c) 2017 Wind River Systems, Inc. -// -// SPDX-License-Identifier: Apache-2.0 -// -#ifndef __SM_FAILOVER_THREAD_H__ -#define __SM_FAILOVER_THREAD_H__ - -#include "sm_types.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -// **************************************************************************** -// Failover Thread - Start -// ================================ -extern SmErrorT sm_failover_thread_start( void ); -// **************************************************************************** - -// **************************************************************************** -// Failover Thread - Stop -// =============================== -extern SmErrorT sm_failover_thread_stop( void ); -// **************************************************************************** - - -#ifdef __cplusplus -} -#endif - -#endif \ No newline at end of file diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_utils.c b/service-mgmt/sm-1.0.0/src/sm_failover_utils.c index e36311f3..46848bc8 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_utils.c +++ b/service-mgmt/sm-1.0.0/src/sm_failover_utils.c @@ -1,12 +1,16 @@ #include "sm_failover_utils.h" #include #include - +#include +#include +#include #include "sm_debug.h" #include "sm_service_domain_member_table.h" #include "sm_service_domain_assignment_table.h" #include "sm_service_domain_table.h" +#define SM_NODE_STAY_FAILED_FILE "/var/run/.sm_stay_fail" + // **************************************************************************** // Failover utilities - loop all service domain members // ================== @@ -90,3 +94,48 @@ SmNodeScheduleStateT sm_get_controller_state( } // **************************************************************************** + + +// **************************************************************************** +// Failover Utilities - Set StayFailed +// ============================== +SmErrorT sm_failover_utils_set_stayfailed_flag() +{ + int fd = open( SM_NODE_STAY_FAILED_FILE, + O_RDWR | O_CREAT, S_IRUSR | S_IRGRP | S_IROTH); + if( 0 > fd ) + { + DPRINTFE( "Failed to create file (%s), error=%s.", + SM_NODE_STAY_FAILED_FILE, strerror(errno) ); + return( SM_FAILED ); + } + + close(fd); + return( SM_OKAY ); + +} +// **************************************************************************** + +// **************************************************************************** +// Failover Utilities - Set StayFailed +// ============================== +SmErrorT sm_failover_utils_reset_stayfailed_flag() +{ + unlink( SM_NODE_STAY_FAILED_FILE ); + return( SM_OKAY ); +} +// **************************************************************************** + + +// **************************************************************************** +// Failover Utilities - check StayFailed +// ============================== +bool sm_failover_utils_is_stayfailed() +{ + if( 0 == access( SM_NODE_STAY_FAILED_FILE, F_OK ) ) + { + return true; + } + return false; +} +// **************************************************************************** diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_utils.h b/service-mgmt/sm-1.0.0/src/sm_failover_utils.h index bb96a56b..f556615f 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_utils.h +++ b/service-mgmt/sm-1.0.0/src/sm_failover_utils.h @@ -15,7 +15,25 @@ extern "C" { // **************************************************************************** // Failover Utilities - get node controller state // ============================== -SmNodeScheduleStateT sm_get_controller_state(const char node_name[]); +extern SmNodeScheduleStateT sm_get_controller_state(const char node_name[]); +// **************************************************************************** + +// **************************************************************************** +// failover Utilities - Set StayFailed +// ============================== +extern SmErrorT sm_failover_utils_set_stayfailed_flag(); +// **************************************************************************** + +// **************************************************************************** +// Node Utilities - Set StayFailed +// ============================== +extern SmErrorT sm_failover_utils_reset_stayfailed_flag(); +// **************************************************************************** + +// **************************************************************************** +// Node Utilities - is StayFailed set +// ============================== +extern bool sm_failover_utils_is_stayfailed(); // **************************************************************************** #ifdef __cplusplus diff --git a/service-mgmt/sm-1.0.0/src/sm_main_event_handler.c b/service-mgmt/sm-1.0.0/src/sm_main_event_handler.c index 501945e3..af93c6b4 100644 --- a/service-mgmt/sm-1.0.0/src/sm_main_event_handler.c +++ b/service-mgmt/sm-1.0.0/src/sm_main_event_handler.c @@ -26,6 +26,7 @@ #include "sm_service_api.h" #include "sm_failover.h" #include "sm_node_swact_monitor.h" +#include "sm_failover_fsm.h" #define SM_NODE_AUDIT_TIMER_IN_MS 1000 #define SM_INTERFACE_AUDIT_TIMER_IN_MS 1000 @@ -177,6 +178,23 @@ static void sm_main_event_handler_api_node_set_callback( char node_name[], return; } + char peer_name[SM_NODE_NAME_MAX_CHAR] = {0}; + error = sm_node_api_get_peername(peer_name); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to get peername, error=%s.", + sm_error_str( error ) ); + } + if(SM_NODE_SET_ACTION_EVENT == action && + SM_NODE_ADMIN_STATE_UNLOCKED == admin_state && + SM_NODE_OPERATIONAL_STATE_ENABLED == oper_state && + (SM_NODE_AVAIL_STATUS_AVAILABLE == avail_status || + SM_NODE_AVAIL_STATUS_DEGRADED == avail_status ) && + 0 == strcmp(node_name, peer_name)) + { + SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_NODE_ENABLED, NULL); + } + if( SM_NODE_SET_ACTION_SWACT == action ) { SmNodeSwactMonitor::SwactStart(SM_NODE_STATE_STANDBY); diff --git a/service-mgmt/sm-1.0.0/src/sm_process.c b/service-mgmt/sm-1.0.0/src/sm_process.c index 36cc40c0..26e6b8b0 100644 --- a/service-mgmt/sm-1.0.0/src/sm_process.c +++ b/service-mgmt/sm-1.0.0/src/sm_process.c @@ -51,7 +51,6 @@ #include "sm_service_action_table.h" #include "sm_heartbeat_thread.h" #include "sm_failover.h" -#include "sm_failover_thread.h" #include "sm_task_affining_thread.h" #include "sm_worker_thread.h" @@ -328,6 +327,14 @@ static SmErrorT sm_process_initialize( void ) return( SM_FAILED ); } + error = sm_failover_initialize(); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to initialize failover handler module, error=%s.", + sm_error_str( error ) ); + return( SM_FAILED ); + } + error = sm_service_heartbeat_thread_start(); if( SM_OKAY != error ) { @@ -344,22 +351,6 @@ static SmErrorT sm_process_initialize( void ) return( SM_FAILED ); } - error = sm_failover_thread_start(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to start the failover thread, error=%s.", - sm_error_str( error ) ); - return( SM_FAILED ); - } - - error = sm_failover_initialize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to initialize failover handler module, error=%s.", - sm_error_str( error ) ); - return( SM_FAILED ); - } - // Start a task affining thread for AIO duplex system if(_is_aio_duplex) { @@ -394,20 +385,6 @@ static SmErrorT sm_process_finalize( void ) } } - error = sm_failover_thread_stop(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to stop failover thread, error=%s.", - sm_error_str( error ) ); - } - - error = sm_failover_finalize(); - if( SM_OKAY != error ) - { - DPRINTFE( "Failed to finalize failover handler module, error=%s.", - sm_error_str( error ) ); - } - error = sm_main_event_handler_finalize(); if( SM_OKAY != error ) { @@ -422,6 +399,13 @@ static SmErrorT sm_process_finalize( void ) sm_error_str(error) ); } + error = sm_failover_finalize(); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to finalize failover handler module, error=%s.", + sm_error_str( error ) ); + } + error = sm_service_heartbeat_api_finalize( true ); if( SM_OKAY != error ) { diff --git a/service-mgmt/sm-1.0.0/src/sm_service_domain_initial_state.c b/service-mgmt/sm-1.0.0/src/sm_service_domain_initial_state.c index 3e746d62..051d5075 100644 --- a/service-mgmt/sm-1.0.0/src/sm_service_domain_initial_state.c +++ b/service-mgmt/sm-1.0.0/src/sm_service_domain_initial_state.c @@ -17,6 +17,7 @@ #include "sm_msg.h" #include "sm_service_domain_utils.h" #include "sm_service_domain_fsm.h" +#include "sm_failover_fsm.h" // **************************************************************************** // Service Domain Initial State - Hello Timer @@ -122,6 +123,7 @@ SmErrorT sm_service_domain_initial_state_exit( SmServiceDomainT* domain ) sm_heartbeat_enable(); + SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_HEARTBEAT_ENABLED, NULL); return( SM_OKAY ); } // **************************************************************************** diff --git a/service-mgmt/sm-1.0.0/src/sm_worker_thread.cpp b/service-mgmt/sm-1.0.0/src/sm_worker_thread.cpp index 93821548..cf855fc2 100644 --- a/service-mgmt/sm-1.0.0/src/sm_worker_thread.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_worker_thread.cpp @@ -9,8 +9,33 @@ #include #include #include +#include #include "sm_util_types.h" #include "sm_debug.h" +#include "sm_trap.h" + +SmSimpleAction::SmSimpleAction(const char* action_name, SmSimpleActionCallback callback) +{ + _callback = callback; + strncpy(_action_name, action_name, sizeof(_action_name)); +} + +SmSimpleAction::~SmSimpleAction() +{ +} + +void SmSimpleAction::action() +{ + if(NULL != _callback) + { + _callback(*this); + } +} + +const char* SmSimpleAction::get_name() const +{ + return _action_name; +} SmWorkerThread SmWorkerThread::_the_worker; @@ -68,6 +93,10 @@ SmWorkerThread::~SmWorkerThread() // **************************************************************************** void* SmWorkerThread::thread_helper(SmWorkerThread* workerThread) { + pthread_setname_np( pthread_self(), "worker" ); + sm_debug_set_thread_info(); + sm_trap_set_thread_info(); + DPRINTFI("worker thread started"); workerThread->thread_run(); return 0; } @@ -126,20 +155,33 @@ SmErrorT SmWorkerThread::stop() // **************************************************************************** // SmWorkerThread::add_action add a regular priority action // **************************************************************************** -void SmWorkerThread::add_action(SmAction& action) +void SmWorkerThread::add_action(SmAction* action) { mutex_holder(&this->_mutex); - this->_regular_queue.push(&action); + this->_regular_queue.push(action); + int res = sem_post(&_sem); + if(0 != res) + { + DPRINTFE("failed to signal semaphore. error %d", errno); + } + + DPRINTFI("Action %s is added", action->get_name()); } // **************************************************************************** // **************************************************************************** // SmWorkerThread::add_priority_action add a high priority action // **************************************************************************** -void SmWorkerThread::add_priority_action(SmAction& action) +void SmWorkerThread::add_priority_action(SmAction* action) { mutex_holder(&this->_mutex); - this->_priority_queue.push(&action); + this->_priority_queue.push(action); + int res = sem_post(&_sem); + if(0 != res) + { + DPRINTFE("failed to signal semaphore. error %d", errno); + } + DPRINTFI("Action %s is added to priority queue", action->get_name()); } // **************************************************************************** @@ -177,7 +219,9 @@ void SmWorkerThread::thread_run() } if( NULL != action ) { + DPRINTFI("Executing action %s", action->get_name()); action->action(); + DPRINTFI("Executing action %s completed", action->get_name()); } }else if(ETIMEDOUT != errno) { diff --git a/service-mgmt/sm-1.0.0/src/sm_worker_thread.h b/service-mgmt/sm-1.0.0/src/sm_worker_thread.h index 06028a69..4dd16e72 100644 --- a/service-mgmt/sm-1.0.0/src/sm_worker_thread.h +++ b/service-mgmt/sm-1.0.0/src/sm_worker_thread.h @@ -18,10 +18,23 @@ class SmAction { public: virtual void action() = 0; + virtual const char* get_name() const = 0; +}; + +class SmSimpleAction : public SmAction +{ + public: + typedef void(*SmSimpleActionCallback)(SmSimpleAction&); + SmSimpleAction(const char* action_name, SmSimpleActionCallback callback); + virtual ~SmSimpleAction(); + void action(); + const char* get_name() const; + private: + SmSimpleActionCallback _callback; + char _action_name[32]; }; typedef std::queue SmActionQueueT; -class SmWorkerThread; // **************************************************************************** // SmWorkerThread work thread class @@ -36,13 +49,13 @@ class SmWorkerThread A normal priority action will be scheduled after all high priority actions. */ - void add_action(SmAction& action); + void add_action(SmAction* action); /* Add an action to high priority FCFS queue. A high priority action is nonpreemptive. It will be scheduled after the current action. */ - void add_priority_action(SmAction& action); + void add_priority_action(SmAction* action); // retrieve singleton object diff --git a/service-mgmt/sm-common-1.0.0/src/sm_types.c b/service-mgmt/sm-common-1.0.0/src/sm_types.c index 0012972a..b3bfdc6a 100644 --- a/service-mgmt/sm-common-1.0.0/src/sm_types.c +++ b/service-mgmt/sm-common-1.0.0/src/sm_types.c @@ -166,6 +166,25 @@ _sm_service_domain_interface_event_mappings[SM_SERVICE_DOMAIN_INTERFACE_EVENT_MA { SM_SERVICE_DOMAIN_INTERFACE_EVENT_NOT_IN_USE, "not-in-use" } }; +static SmValueStrMappingT +_sm_failover_event_mappings[SM_FAILOVER_EVENT_MAX] = +{ + {SM_FAILOVER_EVENT_HEARTBEAT_ENABLED, "heartbeat-enabled"}, + {SM_FAILOVER_EVENT_IF_STATE_CHANGED, "interface-state-changed"}, + {SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, "fail-pending-timeout"}, + {SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"} +}; + +static SmValueStrMappingT +_sm_failover_state_mappings[SM_FAILOVER_STATE_MAX] = +{ + {SM_FAILOVER_STATE_INITIAL, "initial"}, + {SM_FAILOVER_STATE_NORMAL, "normal"}, + {SM_FAILOVER_STATE_FAIL_PENDING, "fail-pending"}, + {SM_FAILOVER_STATE_FAILED, "failed"}, + {SM_FAILOVER_STATE_SURVIVED, "survived"} +}; + static SmValueStrMappingT _sm_service_domain_neighbor_state_mappings[SM_SERVICE_DOMAIN_NEIGHBOR_STATE_MAX] = { @@ -953,6 +972,28 @@ const char* sm_service_domain_interface_event_str( } // **************************************************************************** +// **************************************************************************** +// Types - Failover Event String +// ============================================= +const char* sm_failover_event_str( SmFailoverEventT event ) +{ + return( sm_mapping_get_str( _sm_failover_event_mappings, + SM_FAILOVER_EVENT_MAX, + event ) ); +} +// **************************************************************************** + +// **************************************************************************** +// Types - Failover State String +// ============================================= +const char* sm_failover_state_str( SmFailoverStateT state ) +{ + return( sm_mapping_get_str( _sm_failover_state_mappings, + SM_FAILOVER_STATE_MAX, + state ) ); +} +// **************************************************************************** + // **************************************************************************** // Types - Service Domain Neighbor State Value // =========================================== diff --git a/service-mgmt/sm-common-1.0.0/src/sm_types.h b/service-mgmt/sm-common-1.0.0/src/sm_types.h index 26aa2b92..074a4fca 100644 --- a/service-mgmt/sm-common-1.0.0/src/sm_types.h +++ b/service-mgmt/sm-common-1.0.0/src/sm_types.h @@ -279,6 +279,23 @@ typedef enum SM_SERVICE_DOMAIN_EVENT_DATA_MAX } SmServiceDomainEventDataT; +typedef enum{ + SM_FAILOVER_STATE_INITIAL, + SM_FAILOVER_STATE_NORMAL, + SM_FAILOVER_STATE_FAIL_PENDING, + SM_FAILOVER_STATE_FAILED, + SM_FAILOVER_STATE_SURVIVED, + SM_FAILOVER_STATE_MAX +}SmFailoverStateT; + +typedef enum{ + SM_FAILOVER_EVENT_HEARTBEAT_ENABLED, + SM_FAILOVER_EVENT_IF_STATE_CHANGED, + SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, + SM_FAILOVER_EVENT_NODE_ENABLED, + SM_FAILOVER_EVENT_MAX +}SmFailoverEventT; + typedef enum { SM_SERVICE_DOMAIN_INTERFACE_EVENT_NIL, @@ -931,6 +948,18 @@ extern const char* sm_service_domain_interface_event_str( SmServiceDomainInterfaceEventT event ); // **************************************************************************** +// **************************************************************************** +// Types - Failover Event String +// ============================================= +extern const char* sm_failover_event_str( SmFailoverEventT event ); +// **************************************************************************** + +// **************************************************************************** +// Types - Failover State String +// ============================================= +extern const char* sm_failover_state_str( SmFailoverStateT state ); +// **************************************************************************** + // **************************************************************************** // Types - Service Domain Neighbor State Value // =========================================== diff --git a/service-mgmt/sm-db-1.0.0/database/create_sm_db.sql b/service-mgmt/sm-db-1.0.0/database/create_sm_db.sql index 8f846ef1..fb10b2fc 100644 --- a/service-mgmt/sm-db-1.0.0/database/create_sm_db.sql +++ b/service-mgmt/sm-db-1.0.0/database/create_sm_db.sql @@ -3,7 +3,7 @@ BEGIN TRANSACTION; CREATE TABLE NODES ( ID INTEGER PRIMARY KEY AUTOINCREMENT, NAME CHAR(32), ADMINISTRATIVE_STATE CHAR(32), OPERATIONAL_STATE CHAR(32), AVAILABILITY_STATUS CHAR(32), READY_STATE CHAR(32), STATE_UUID CHAR(42)); CREATE TABLE NODE_HISTORY ( ID INTEGER PRIMARY KEY AUTOINCREMENT, NAME CHAR(32), ADMINISTRATIVE_STATE CHAR(32), OPERATIONAL_STATE CHAR(32), AVAILABILITY_STATUS CHAR(32), READY_STATE CHAR(32), STATE_UUID CHAR(42)); CREATE TABLE SERVICE_DOMAIN_INTERFACES ( ID INTEGER PRIMARY KEY AUTOINCREMENT, PROVISIONED CHAR(32), SERVICE_DOMAIN CHAR(32), SERVICE_DOMAIN_INTERFACE CHAR(32), PATH_TYPE CHAR(32), AUTH_TYPE CHAR(32), AUTH_KEY CHAR(128), INTERFACE_NAME CHAR(32), INTERFACE_STATE CHAR(32), NETWORK_TYPE CHAR(32), NETWORK_MULTICAST CHAR(256), NETWORK_ADDRESS CHAR(256), NETWORK_PORT INT, NETWORK_HEARTBEAT_PORT INT, NETWORK_PEER_ADDRESS CHAR(256), NETWORK_PEER_PORT INT, NETWORK_PEER_HEARTBEAT_PORT INT , INTERFACE_CONNECT_TYPE CHAR(32) DEFAULT 'tor'); -INSERT INTO "SERVICE_DOMAIN_INTERFACES" VALUES(1,'yes','controller','management-interface','primary','none','','','','','','','','','','','','tor'); +INSERT INTO "SERVICE_DOMAIN_INTERFACES" VALUES(1,'yes','controller','management-interface','secondary','none','','','','','','','','','','','','tor'); INSERT INTO "SERVICE_DOMAIN_INTERFACES" VALUES(2,'yes','controller','oam-interface','secondary','hmac-sha512','titanium-server','','','','','','','','','','','tor'); INSERT INTO "SERVICE_DOMAIN_INTERFACES" VALUES(3,'yes','controller','infrastructure-interface','secondary','none','','','','','','','','','','','','tor'); CREATE TABLE SERVICE_DOMAINS ( ID INTEGER PRIMARY KEY AUTOINCREMENT, PROVISIONED CHAR(32), NAME CHAR(32), ORCHESTRATION CHAR(32), DESIGNATION CHAR(32), PREEMPT CHAR(32), PRIORITY INT, HELLO_INTERVAL INT, DEAD_INTERVAL INT, WAIT_INTERVAL INT, EXCHANGE_INTERVAL INT, STATE CHAR(32), SPLIT_BRAIN_RECOVERY CHAR(32), LEADER CHAR(32), GENERATION INT);