diff --git a/service-mgmt/sm-1.0.0/centos/build_srpm.data b/service-mgmt/sm-1.0.0/centos/build_srpm.data index 56655102..72b5905d 100644 --- a/service-mgmt/sm-1.0.0/centos/build_srpm.data +++ b/service-mgmt/sm-1.0.0/centos/build_srpm.data @@ -2,4 +2,4 @@ SRC_DIR=$PKG_BASE COPY_LIST="$PKG_BASE/LICENSE" TAR_NAME=sm VERSION=1.0.0 -TIS_PATCH_VER=24 +TIS_PATCH_VER=26 diff --git a/service-mgmt/sm-1.0.0/src/Makefile b/service-mgmt/sm-1.0.0/src/Makefile index d7ee40bb..0c9c9635 100644 --- a/service-mgmt/sm-1.0.0/src/Makefile +++ b/service-mgmt/sm-1.0.0/src/Makefile @@ -107,6 +107,11 @@ SRCS+=sm_swact_state.c SRCS+=sm_worker_thread.cpp SRCS+=sm_task_affining_thread.c SRCS+=sm_node_swact_monitor.cpp +SRCS+=sm_failover_fsm.cpp +SRCS+=sm_failover_normal_state.cpp +SRCS+=sm_failover_fail_pending_state.cpp +SRCS+=sm_failover_failed_state.cpp +SRCS+=sm_failover_survived_state.cpp SRCS+=sm_failover_ss.c SRCS+=sm_service_domain_interface_not_in_use_state.c SRCS+=sm_configuration_table.c diff --git a/service-mgmt/sm-1.0.0/src/sm_failover.c b/service-mgmt/sm-1.0.0/src/sm_failover.c index e5b07165..5e8e8f02 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover.c +++ b/service-mgmt/sm-1.0.0/src/sm_failover.c @@ -34,15 +34,8 @@ #include "sm_util_types.h" #include "sm_failover_ss.h" #include "sm_failover_utils.h" - -typedef enum -{ - SM_FAILOVER_INTERFACE_UNKNOWN, - SM_FAILOVER_INTERFACE_OK, - SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT, - SM_FAILOVER_INTERFACE_DOWN, - SM_FAILOVER_INTERFACE_RECOVERING -}SmFailoverInterfaceStateT; +#include "sm_failover_fsm.h" +#include "sm_api.h" typedef enum { @@ -57,7 +50,7 @@ typedef enum //as part of the gradual delivery of enhancement with more //complex algorithm to determine the failover survivor routine //the SM_FAILOVER_ACTION_ROUTINE will redirect the lookup to - //new logic. Until all actions are migrate to new logic, the + //new logic. Until all actions are migrated to new logic, the //lookup tables will be eliminated. SM_FAILOVER_ACTION_ROUTINE = 1 << 31, }SmFailoverActionT; @@ -181,6 +174,79 @@ static bool _hello_msg_alive = true; static SmSystemModeT _system_mode; static time_t _last_report_ts = 0; static int _heartbeat_count = 0; +static bool _peer_ready = false; + +// The class is to accommodate batching i/f state changes +// during the failover_audit interval. +class FailoverEventHolder +{ + public: + FailoverEventHolder(); + void push_event(SmFailoverEventT, const ISmFSMEventData* event_data); + void send_event(); + private: + bool _event_to_send; + SmFailoverEventT _failover_event; + const ISmFSMEventData* _event_data; +}; + +FailoverEventHolder::FailoverEventHolder() +{ + _event_to_send = false; +} + +void FailoverEventHolder::push_event(SmFailoverEventT event, const ISmFSMEventData* event_data) +{ + SmFailoverEventT list[] = { + SM_INTERFACE_DOWN, + SM_HEARTBEAT_LOST, + SM_INTERFACE_UP, + SM_HEARTBEAT_RECOVER + }; + + unsigned int i; + if(!_event_to_send) + { + _event_to_send = true; + for(i = 0; i < sizeof(list) / sizeof(SmFailoverEventT); i ++) + { + if(event == list[i]) + { + _failover_event = event; + _event_data = event_data; + _event_to_send = true; + DPRINTFI("Push event %d", event); + return; + } + } + DPRINTFE("Runtime error: unsupported event %d", event); + return; + } + + for(i = 0; list[i] != _failover_event && i < sizeof(list) / sizeof(SmFailoverEventT); i ++) + { + if(list[i] == event) + { + _failover_event = event; + _event_data = event_data; + return; + } + } + DPRINTFE("Runtime error: unsupported event %d", event); +} + +void FailoverEventHolder::send_event() +{ + if(_event_to_send) + { + DPRINTFI("Send event %d", _failover_event); + SmFailoverFSM::get_fsm().send_event(_failover_event, _event_data); + _event_to_send = false; + } +} + + +static FailoverEventHolder _event_holder; // std 2+2/2+2+x failover actions SmFailoverActionPairT action_map_std_infra[16] = @@ -435,6 +501,8 @@ void sm_failover_lost_heartbeat( SmFailoverInterfaceT* interface ) { DPRINTFI("Interface %s lose heartbeat.", interface->interface_name); } + + _event_holder.push_event(SM_HEARTBEAT_LOST, NULL); } // **************************************************************************** @@ -466,6 +534,8 @@ void sm_failover_heartbeat_restore( SmFailoverInterfaceT* interface ) { DPRINTFI("Interface %s heartbeat is OK.", interface->interface_name); } + + _event_holder.push_event(SM_HEARTBEAT_RECOVER, NULL); } // **************************************************************************** @@ -501,6 +571,8 @@ void sm_failover_interface_down( const char* const interface_name ) { DPRINTFI("%d domain interfaces are impacted as i/f %s is down.", impacted, interface_name); + + _event_holder.push_event(SM_INTERFACE_DOWN, NULL); } } // **************************************************************************** @@ -537,6 +609,8 @@ void sm_failover_interface_up( const char* const interface_name ) { DPRINTFI("%d domain interfaces are impacted as i/f %s is up.", impacted, interface_name); + + _event_holder.push_event(SM_INTERFACE_UP, NULL); } } // **************************************************************************** @@ -656,13 +730,22 @@ int _failover_get_if_state() // **************************************************************************** // Failover - get interface state // ================== -SmErrorT sm_failover_if_state_get(SmHeartbeatMsgIfStateT *if_state) +SmHeartbeatMsgIfStateT sm_failover_if_state_get() +{ + mutex_holder holder(&_mutex); + int if_state_flag = _failover_get_if_state(); + return (if_state_flag & 0b0111); //the lower 3 bits i/f state flag +} +// **************************************************************************** + +// **************************************************************************** +// Failover - get peer node interface state +// ================== +SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state() { mutex_holder holder(&_mutex); - int if_state_flag = _failover_get_if_state(); - *if_state = (if_state_flag & 0b0111); //the lower 3 bits i/f state flag - return SM_OKAY; + return (_peer_if_state & 0b0111); //the lower 3 bits i/f state flag } // **************************************************************************** @@ -923,84 +1006,66 @@ bool this_controller_unlocked() } // **************************************************************************** - // **************************************************************************** -// Failover - pack schedule state into 2 bit (active/standby/failed) -// SM starts managing failover after the node is scheduled, i.e, cannot be -// in SM_NODE_STATE_UNKNOWN or SM_NODE_STATE_INIT state -// ======================= -unsigned int sm_failover_pack_schedule_state(SmNodeScheduleStateT state) +// Failover - set system to scheduled status +// ================== +SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status) { - static const unsigned int failed_state_bit_flag = 0; - static const unsigned int active_state_bit_flag = 1; - static const unsigned int standby_state_bit_flag = 1 << 1; - unsigned int res; - switch(state) + //valid combinations of target system scheduling are: + // active/standby + // active/failed + SmFailoverActionResultT result; + SmNodeScheduleStateT host_target_state, peer_target_state; + host_target_state = failover_status.get_host_schedule_state(); + peer_target_state = failover_status.get_peer_schedule_state(); + if(SM_NODE_STATE_ACTIVE == host_target_state) { - case SM_NODE_STATE_ACTIVE: - res = active_state_bit_flag; - break; - case SM_NODE_STATE_STANDBY: - res = standby_state_bit_flag; - break; - case SM_NODE_STATE_FAILED: - res = failed_state_bit_flag; - break; - default: - res = failed_state_bit_flag; + if(SM_NODE_STATE_STANDBY == _host_state && + SM_NODE_STATE_FAILED == peer_target_state) + { + result = sm_failover_activate_self(); + if(SM_FAILOVER_ACTION_RESULT_FAILED == result) + { + DPRINTFE("Failed to activate %s.", _host_name); + return SM_FAILED; + } + result = sm_failover_disable_node(_peer_name); + if(SM_FAILOVER_ACTION_RESULT_FAILED == result) + { + DPRINTFE("Failed to disable node %s.", _peer_name); + return SM_FAILED; + } + } + }else if(SM_NODE_STATE_STANDBY == host_target_state) + { + if(SM_NODE_STATE_ACTIVE == _host_state) + { + result = sm_failover_swact(); + if(SM_FAILOVER_ACTION_RESULT_FAILED == result) + { + DPRINTFE("Failed to swact."); + return SM_FAILED; + } + } } - return res; -} -// **************************************************************************** - -// **************************************************************************** -// Failover - convert target scheduling state to action -// ======================= -int sm_failover_get_action(const SmSystemFailoverStatus& failover_status) -{ - DPRINTFI("Host to %s, Peer to %s.", - sm_node_schedule_state_str(failover_status.host_schedule_state), - sm_node_schedule_state_str(failover_status.peer_schedule_state) - ); - unsigned int host_flag, peer_flag; - host_flag = sm_failover_pack_schedule_state(failover_status.host_schedule_state); - peer_flag = sm_failover_pack_schedule_state(failover_status.peer_schedule_state); - - unsigned int flag = (host_flag | (peer_flag << 2)); - DPRINTFI("Failover scheduling flag %d", flag); - SmFailoverActionPairT* actions; - int action; - - SmFailoverActionPairT action_map[16] = { - {SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_FAIL_NODE}, //00 00 - {SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_FAIL_NODE}, //01 00 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 00 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 00 - {SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_ACTIVATE}, //00 01 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 01 - {SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_NO_ACTION}, //10 01 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 01 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 10 - {SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //01 10 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 10 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 10 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 11 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 11 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 11 - {SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 11 - }; - - bool is_active = is_active_controller(); - actions = &(action_map[flag & 0xf]); - if(is_active) + else if(SM_NODE_STATE_FAILED == host_target_state) { - action = actions->active_controller_action; + result = sm_failover_disable_node(_host_name); + if(SM_FAILOVER_ACTION_RESULT_FAILED == result) + { + DPRINTFE("Failed disable host %s.", _host_name); + return SM_FAILED; + } } else { - action = actions->standby_controller_action; + DPRINTFE("Runtime error. Unexpected scheduling state %s for host %s", + sm_node_schedule_state_str(host_target_state), + _host_name); + return SM_FAILED; } - return action; + + return SM_OKAY; } // **************************************************************************** @@ -1112,6 +1177,8 @@ void sm_failover_audit() return; } + _event_holder.send_event(); + int64_t curr_node_state = if_state_flag; if( _hello_msg_alive ) @@ -1160,34 +1227,9 @@ void sm_failover_audit() _log_nodes_state(action); DPRINTFI("Action to take %d", action); - if (action & SM_FAILOVER_ACTION_ROUTINE) { - SmSystemStatusT sys_status; - SmSystemFailoverStatus failover_status; - sys_status.system_mode = _system_mode; - if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE) - { - sys_status.heartbeat_state = SM_HEARTBEAT_OK; - }else - { - sys_status.heartbeat_state = SM_HEARTBEAT_LOSS; - } - - sys_status.host_status.node_name = _host_name; - sys_status.host_status.interface_state = if_state_flag & 0x7; - sys_status.host_status.current_schedule_state = _host_state; - sys_status.peer_status.node_name = _peer_name; - sys_status.peer_status.interface_state = _peer_if_state & 0x7; - sys_status.peer_status.current_schedule_state = sm_get_controller_state(_peer_name); - SmErrorT error = sm_failover_ss_get_survivor(sys_status, failover_status); - if(SM_OKAY != error) - { - DPRINTFE("Failed to determine failover state. "); - return; - } - - action = sm_failover_get_action(failover_status); + action = SM_FAILOVER_ACTION_NO_ACTION; } if (action & SM_FAILOVER_ACTION_ACTIVATE) @@ -1367,6 +1409,67 @@ SmErrorT sm_failover_action() } // **************************************************************************** +// **************************************************************************** +// Failover - get node interface info +// ================== +SmFailoverInterfaceStateT sm_failover_get_interface_info(SmInterfaceTypeT interface_type) +{ + SmFailoverInterfaceInfo* res = NULL; + switch(interface_type) + { + case SM_INTERFACE_MGMT: + res = _mgmt_interface_info; + break; + case SM_INTERFACE_INFRA: + res = _infra_interface_info; + break; + case SM_INTERFACE_OAM: + res = _oam_interface_info; + break; + case SM_INTERFACE_UNKNOWN: + break; + } + if(NULL != res) + { + return res->get_state(); + } + return SM_FAILOVER_INTERFACE_UNKNOWN; +} +// **************************************************************************** + +static void sm_failover_node_set_callback( char node_name[], + SmNodeSetActionT action, SmNodeAdminStateT admin_state, + SmNodeOperationalStateT oper_state, SmNodeAvailStatusT avail_status, + int seqno ) +{ + if(0 == strcmp(node_name, _host_name)) + { + // care only the peer + return; + } + + bool peer_ready = false; + if(SM_NODE_SET_ACTION_UNLOCK == action) + { + peer_ready = SM_NODE_ADMIN_STATE_UNLOCKED == admin_state && + SM_NODE_OPERATIONAL_STATE_ENABLED == oper_state && + (SM_NODE_AVAIL_STATUS_AVAILABLE == avail_status || + SM_NODE_AVAIL_STATUS_DEGRADED == avail_status); + }else if (SM_NODE_SET_ACTION_LOCK == action) + { + peer_ready = false; + }else + { + return; + } + if(peer_ready && !_peer_ready) + { + DPRINTFI("Peer %s is now online.", node_name); + } + _peer_ready = peer_ready; +} + +static SmApiCallbacksT _api_callbacks = {0}; // **************************************************************************** // Failover - Initialize // ====================== @@ -1375,6 +1478,14 @@ SmErrorT sm_failover_initialize( void ) void* user_data[1] = {&_total_interfaces}; bool enabled; SmErrorT error; + + error = SmFailoverFSM::initialize(); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to initialize failover FSM, error %s.", sm_error_str( error ) ); + return error; + } + _system_mode = sm_node_utils_get_system_mode(); DPRINTFI("System mode %s", sm_system_mode_str(_system_mode)); @@ -1404,6 +1515,17 @@ SmErrorT sm_failover_initialize( void ) return error; } + _api_callbacks.node_set = sm_failover_node_set_callback; + + error = sm_api_register_callbacks( &_api_callbacks ); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to register api callbacks, error=%s.", + sm_error_str( error ) ); + return( error ); + } + + if( 0 == strcmp( SM_NODE_CONTROLLER_0_NAME, _host_name ) ) { snprintf( _peer_name, sizeof(_peer_name), "%s", SM_NODE_CONTROLLER_1_NAME ); @@ -1491,6 +1613,13 @@ SmErrorT sm_failover_finalize( void ) _total_interfaces = 0; SmErrorT error; + error = sm_api_deregister_callbacks( &_api_callbacks ); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to deregister api callbacks, error=%s.", + sm_error_str( error ) ); + } + sm_timer_deregister( failover_audit_timer_id ); if( NULL != _sm_db_handle ) { @@ -1504,6 +1633,13 @@ SmErrorT sm_failover_finalize( void ) _sm_db_handle = NULL; } + error = SmFailoverFSM::finalize(); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to finalize failover FSM, error %s.", sm_error_str( error ) ); + return error; + } + return SM_OKAY; } // **************************************************************************** @@ -1575,6 +1711,11 @@ void dump_peer_if_state(FILE* fp) fprintf(fp, " Peer Interface state: %d\n", _peer_if_state); } +void dump_failover_fsm_state(FILE* fp) +{ + fprintf(fp, " Failover FSM state: %d\n", SmFailoverFSM::get_fsm().get_state()); +} + // **************************************************************************** // Failover - dump state // ====================== @@ -1582,6 +1723,7 @@ void sm_failover_dump_state(FILE* fp) { fprintf( fp, "Failover \n\n" ); dump_host_state(fp); + dump_failover_fsm_state(fp); dump_interfaces_state(fp); dump_peer_if_state(fp); } diff --git a/service-mgmt/sm-1.0.0/src/sm_failover.h b/service-mgmt/sm-1.0.0/src/sm_failover.h index 3c4cf984..5266bdbe 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover.h +++ b/service-mgmt/sm-1.0.0/src/sm_failover.h @@ -9,6 +9,7 @@ #include "sm_types.h" #include "sm_service_domain_interface_table.h" #include "sm_db_nodes.h" +#include "sm_failover_ss.h" #ifdef __cplusplus extern "C" { #endif @@ -88,7 +89,7 @@ extern void sm_failover_if_state_update(const char node_name[], // **************************************************************************** // Failover - get local interface state -extern SmErrorT sm_failover_if_state_get(SmHeartbeatMsgIfStateT*); +extern SmHeartbeatMsgIfStateT sm_failover_if_state_get(); // **************************************************************************** // **************************************************************************** @@ -115,6 +116,27 @@ SmErrorT sm_failover_degrade_clear(SmFailoverDegradeSourceT source); SmErrorT sm_failover_get_node(char* node_name, SmDbNodeT& node); // **************************************************************************** +// **************************************************************************** +// Failover - get node interface info +// ================== +SmFailoverInterfaceStateT sm_failover_get_interface_info(SmInterfaceTypeT interface_type); +// **************************************************************************** + + +// **************************************************************************** +// Failover - get peer node interface state +// ================== +SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state(); +// **************************************************************************** + + +// **************************************************************************** +// Failover - set system to scheduled status +// ================== +SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status); +// **************************************************************************** + + // **************************************************************************** // Failover - dump state // ================== diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp new file mode 100644 index 00000000..524c9878 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp @@ -0,0 +1,141 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#include "sm_failover_fail_pending_state.h" +#include +#include "sm_types.h" +#include "sm_limits.h" +#include "sm_debug.h" +#include "sm_failover.h" +#include "sm_failover_fsm.h" +#include "sm_failover_ss.h" +#include "sm_failover_utils.h" +#include "sm_node_utils.h" +#include "sm_node_api.h" + +static const int FAIL_PENDING_TIMEOUT = 2000; //2000ms + +SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) +{ + //SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type(); + switch (event) + { + case SM_HEARTBEAT_LOST: + case SM_INTERFACE_DOWN: + //getting worse, extend wait time + case SM_INTERFACE_UP: + case SM_HEARTBEAT_RECOVER: + // Things are improving, extend wait time + this->_register_timer(); + break; + case SM_FAIL_PENDING_TIMEOUT: + { + SmSystemFailoverStatus failover_status; + SmErrorT error = sm_failover_ss_get_survivor(failover_status); + SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state(); + SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state(); + if(SM_OKAY != error) + { + DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error)); + return error; + } + + error = sm_failover_set_system(failover_status); + if(SM_NODE_STATE_FAILED == host_state) + { + this->fsm.set_state(SM_FAILOVER_STATE_FAILED); + } + else if(SM_NODE_STATE_ACTIVE == host_state) + { + if(SM_NODE_STATE_FAILED == peer_state) + { + this->fsm.set_state(SM_FAILOVER_STATE_SURVIVED); + }else + { + this->fsm.set_state(SM_FAILOVER_STATE_NORMAL); + } + } + else if(SM_NODE_STATE_STANDBY == host_state) + { + this->fsm.set_state(SM_FAILOVER_STATE_NORMAL); + }else + { + DPRINTFE("Runtime error: unexpected scheduling state: %s", + sm_node_schedule_state_str(host_state)); + } + return error; + } + break; + default: + DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state()); + } + return SM_OKAY; +} + +bool SmFailoverFailPendingState::_fail_pending_timeout( + SmTimerIdT timer_id, int64_t user_data) +{ + SmFailoverFSM::get_fsm().send_event(SM_FAIL_PENDING_TIMEOUT, NULL); + return false; +} + +SmErrorT SmFailoverFailPendingState::enter_state() +{ + DPRINTFI("Entering Pending state"); + SmErrorT error = this->_register_timer(); + if(SM_OKAY != error) + { + DPRINTFE("Failed to register fail pending timer. Error %s", sm_error_str(error)); + } + return error; +} + +SmErrorT SmFailoverFailPendingState::_register_timer() +{ + SmErrorT error; + const char* timer_name = "FAIL PENDING TIMER"; + if(SM_TIMER_ID_INVALID != this->_pending_timer_id) + { + this->_deregister_timer(); + } + + error = sm_timer_register( timer_name, FAIL_PENDING_TIMEOUT, + SmFailoverFailPendingState::_fail_pending_timeout, + 0, &this->_pending_timer_id); + + return error; +} + +SmErrorT SmFailoverFailPendingState::_deregister_timer() +{ + SmErrorT error; + if(SM_TIMER_ID_INVALID == this->_pending_timer_id) + { + return SM_OKAY; + } + + error = sm_timer_deregister(this->_pending_timer_id); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to cancel fail pending timer, error=%s.", + sm_error_str( error ) ); + }else + { + this->_pending_timer_id = SM_TIMER_ID_INVALID; + } + + return error; +} + +SmErrorT SmFailoverFailPendingState::exit_state() +{ + SmErrorT error = this->_deregister_timer(); + if(SM_OKAY != error) + { + DPRINTFE("Failed to deregister fail pending timer. Error %s", sm_error_str(error)); + } + DPRINTFI("Exiting Pending state"); + return error; +} diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.h b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.h new file mode 100644 index 00000000..69befd13 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.h @@ -0,0 +1,31 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef __SM_FAILOVER_FAIL_PENDING_STATE_H__ +#define __SM_FAILOVER_FAIL_PENDING_STATE_H__ +#include "sm_types.h" +#include "sm_failover_fsm.h" +#include "sm_timer.h" + +class SmFailoverFailPendingState : public SmFSMState +{ + public: + SmFailoverFailPendingState(SmFailoverFSM& fsm) : SmFSMState(fsm){} + SmErrorT enter_state(); + SmErrorT exit_state(); + + protected: + SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data); + + private: + SmTimerIdT _pending_timer_id; + + static bool _fail_pending_timeout(SmTimerIdT timer_id, int64_t user_data); + SmErrorT _register_timer(); + SmErrorT _deregister_timer(); +}; + + +#endif //__SM_FAILOVER_FAIL_PENDING_STATE_H__ \ No newline at end of file diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp new file mode 100644 index 00000000..a6d25d97 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp @@ -0,0 +1,66 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#include "sm_failover_failed_state.h" +#include +#include "sm_types.h" +#include "sm_debug.h" +#include "sm_failover_fsm.h" +#include "sm_failover_ss.h" + +SmErrorT SmFailoverFailedState::enter_state() +{ + DPRINTFI("Entering Failed state"); + return SM_OKAY; +} + +SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) +{ + switch (event) + { + case SM_HEARTBEAT_LOST: + case SM_INTERFACE_DOWN: + case SM_INTERFACE_UP: + // ignore, recover is possible only when heartbeat is recovered + break; + case SM_HEARTBEAT_RECOVER: + { + SmSystemFailoverStatus failover_status; + SmErrorT error = sm_failover_ss_get_survivor(failover_status); + SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state(); + SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state(); + if(SM_OKAY != error) + { + DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error)); + return error; + } + + if(SM_NODE_STATE_FAILED == host_state) + { + // don't need to set to failed state, already here + } + else if(SM_NODE_STATE_STANDBY == host_state && SM_NODE_STATE_ACTIVE == peer_state) + { + // standby is the only possible state to be scheduled to from failed state + this->fsm.set_state(SM_FAILOVER_STATE_NORMAL); + }else + { + DPRINTFE("Runtime error: unexpected scheduling state: %s", + sm_node_schedule_state_str(host_state)); + } + } + + break; + default: + DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state()); + } + return SM_OKAY; +} + +SmErrorT SmFailoverFailedState::exit_state() +{ + DPRINTFI("Exiting Failed state"); + return SM_OKAY; +} diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.h b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.h new file mode 100644 index 00000000..e0991f47 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_failed_state.h @@ -0,0 +1,25 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef __SM_FAILOVER_FAILED_STATE_H__ +#define __SM_FAILOVER_FAILED_STATE_H__ +#include "sm_types.h" +#include "sm_failover_fsm.h" +#include "sm_timer.h" + +class SmFailoverFailedState : public SmFSMState +{ + public: + SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm){} + SmErrorT enter_state(); + SmErrorT exit_state(); + + protected: + SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data); + +}; + + +#endif //__SM_FAILOVER_FAILED_STATE_H__ \ No newline at end of file diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp new file mode 100644 index 00000000..10e098c5 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp @@ -0,0 +1,164 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#include "sm_failover_fsm.h" +#include +#include "sm_debug.h" +#include "sm_failover_normal_state.h" +#include "sm_failover_fail_pending_state.h" +#include "sm_failover_failed_state.h" +#include "sm_failover_survived_state.h" + +SmErrorT SmFSMState::enter_state() +{ + // default implementation, nothing to do + return SM_OKAY; +}; + +SmErrorT SmFSMState::exit_state() +{ + // default implementation, nothing to do + return SM_OKAY; +}; + +SmFailoverFSM SmFailoverFSM::_the_fsm; + +void SmFailoverFSM::deregister_states() +{ + for(int i = 0; i < MaxState; i ++) + { + SmFSMState* state = _state_handlers[i]; + if(NULL != state) + { + _state_handlers[i] = NULL; + delete state; + } + } +} + +SmFailoverFSM::SmFailoverFSM() +{ + for(int i = 0; i < MaxState; i ++) + { + _state_handlers[i] = NULL; + } +} + +SmFailoverFSM::~SmFailoverFSM() +{ +} + +SmErrorT SmFailoverFSM::send_event(SmFailoverEventT event, const ISmFSMEventData* event_data) +{ + SmFSMState* state_handler = this->_state_handlers[this->get_state()]; + if(NULL == state_handler) + { + DPRINTFE("Runtime error. No handler for state %d", this->get_state()); + return SM_FAILED; + } + DPRINTFI("send_event %d\n", event); + state_handler->event_handler(event, event_data); + return SM_OKAY; +} + +SmErrorT SmFailoverFSM::set_state(SmFailoverStateT state) +{ + if(state >= MaxState) + { + DPRINTFE("Runtime error. Don't understand state %d", state); + return SM_FAILED; + } + if(state == this->get_state()) + { + DPRINTFI("Already in %d state", state); + return SM_OKAY; + } + + SmFSMState* state_handler = this->_state_handlers[this->get_state()]; + if(NULL == state_handler) + { + DPRINTFE("State %d was not registered", this->get_state()); + return SM_FAILED; + } + + SmErrorT error = state_handler->exit_state(); + if(SM_OKAY != error) + { + DPRINTFE("Exit state failed (%d), %d", this->get_state(), error); + return SM_FAILED; + } + + SmFSMState* new_state_handler = this->_state_handlers[state]; + if(NULL == new_state_handler) + { + DPRINTFE("State %d was not registered", state); + return SM_FAILED; + } + error = new_state_handler->enter_state(); + if(SM_OKAY != error) + { + DPRINTFE("Enter state failed (%d), %d", this->get_state(), error); + return SM_FAILED; + } + + this->_current_state = state; + return SM_OKAY; +} + +SmErrorT SmFailoverFSM::register_fsm_state(SmFailoverStateT state, SmFSMState* state_handler) +{ + if(state >= MaxState) + { + DPRINTFE("Runtime error. Invalid state %d", state); + return SM_FAILED; + } + + if(NULL != this->_state_handlers[state]) + { + DPRINTFE("Runtime error. Duplicated state handle for state %d", state); + return SM_FAILED; + } + + this->_state_handlers[state] = state_handler; + return SM_OKAY; +} + +SmErrorT SmFailoverFSM::init_state() +{ + SmErrorT error; + SmFailoverStateT state = SM_FAILOVER_STATE_INITIAL; + SmFSMState* new_state_handler = _state_handlers[state]; + if(NULL == new_state_handler) + { + DPRINTFE("State %d was not registered", state); + return SM_FAILED; + } + error = new_state_handler->enter_state(); + if(SM_OKAY != error) + { + DPRINTFE("Enter state failed (%d), %d", this->get_state(), error); + return SM_FAILED; + } + + _current_state = state; + return SM_OKAY; +} + +SmErrorT SmFailoverFSM::initialize() +{ + SmFailoverFSM& fsm = SmFailoverFSM::get_fsm(); + fsm.register_fsm_state(SM_FAILOVER_STATE_NORMAL, new SmFailoverNormalState(fsm)); + fsm.register_fsm_state(SM_FAILOVER_STATE_FAIL_PENDING, new SmFailoverFailPendingState(fsm)); + fsm.register_fsm_state(SM_FAILOVER_STATE_FAILED, new SmFailoverFailedState(fsm)); + fsm.register_fsm_state(SM_FAILOVER_STATE_SURVIVED, new SmFailoverSurvivedState(fsm)); + + return fsm.init_state(); +} + +SmErrorT SmFailoverFSM::finalize() +{ + SmFailoverFSM::get_fsm().deregister_states(); + return SM_OKAY; +} \ No newline at end of file diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_fsm.h b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.h new file mode 100644 index 00000000..bd79585b --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_fsm.h @@ -0,0 +1,81 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef __SM_FAILOVER_FSM_H__ +#define __SM_FAILOVER_FSM_H__ +#include "sm_types.h" + +typedef enum{ + SM_FAILOVER_STATE_NORMAL, + SM_FAILOVER_STATE_FAIL_PENDING, + SM_FAILOVER_STATE_FAILED, + SM_FAILOVER_STATE_SURVIVED, + SM_FAILOVER_STATE_MAX +}SmFailoverStateT; + +#define SM_FAILOVER_STATE_INITIAL SM_FAILOVER_STATE_NORMAL + +typedef enum{ + SM_HEARTBEAT_LOST, + SM_INTERFACE_DOWN, + SM_INTERFACE_UP, + SM_HEARTBEAT_RECOVER, + SM_FAIL_PENDING_TIMEOUT, +}SmFailoverEventT; + +typedef int SmFSMEventDataTypeT; + +class ISmFSMEventData +{ + public: + virtual SmFSMEventDataTypeT get_event_data_type() const = 0; +}; + +class SmFailoverFSM; + +class SmFSMState +{ + public: + SmFSMState(SmFailoverFSM& fsm) : fsm(fsm) {}; + virtual ~SmFSMState(){}; + virtual SmErrorT enter_state(); + virtual SmErrorT exit_state(); + protected: + SmFailoverFSM& fsm; + virtual SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)=0; + friend SmFailoverFSM; +}; + +class SmFailoverFSM +{ + public: + SmFailoverFSM(); + virtual ~SmFailoverFSM(); + + SmErrorT send_event(SmFailoverEventT event, const ISmFSMEventData* event_data); + + SmErrorT register_fsm_state(SmFailoverStateT state, SmFSMState* state_handler); + + SmErrorT set_state(SmFailoverStateT state); + inline int get_state() const {return this->_current_state;} + + static SmErrorT initialize(); + static SmErrorT finalize(); + inline static SmFailoverFSM& get_fsm() { + return _the_fsm; + } + + private: + static const int MaxState = SM_FAILOVER_STATE_MAX; + SmFSMState* _state_handlers[MaxState]; + int _current_state; + + static SmFailoverFSM _the_fsm; + SmErrorT init_state(); + void deregister_states(); +}; + + +#endif //__SM_FAILOVER_FSM_H__ diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp new file mode 100644 index 00000000..ac8c1bc9 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp @@ -0,0 +1,42 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#include "sm_failover_normal_state.h" +#include +#include "sm_types.h" +#include "sm_debug.h" +#include "sm_failover_fsm.h" + +SmErrorT SmFailoverNormalState::enter_state() +{ + DPRINTFI("Entering normal state"); + return SM_OKAY; +} + +SmErrorT SmFailoverNormalState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) +{ + switch (event) + { + case SM_HEARTBEAT_LOST: + case SM_INTERFACE_DOWN: + this->fsm.set_state(SM_FAILOVER_STATE_FAIL_PENDING); + break; + case SM_INTERFACE_UP: + DPRINTFI("Interface is up"); + break; + case SM_HEARTBEAT_RECOVER: + DPRINTFI("Heartbeat is recovered"); + break; + default: + DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state()); + } + return SM_OKAY; +} + +SmErrorT SmFailoverNormalState::exit_state() +{ + DPRINTFI("Exiting normal state"); + return SM_OKAY; +} diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.h b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.h new file mode 100644 index 00000000..dc6aac51 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_normal_state.h @@ -0,0 +1,22 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef __SM_FAILOVER_NORMAL_STATE_H__ +#define __SM_FAILOVER_NORMAL_STATE_H__ +#include "sm_types.h" +#include "sm_failover_fsm.h" + +class SmFailoverNormalState : public SmFSMState +{ + public: + SmFailoverNormalState(SmFailoverFSM& fsm) : SmFSMState(fsm){} + SmErrorT enter_state(); + SmErrorT exit_state(); + protected: + SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data); +}; + + +#endif //__SM_FAILOVER_NORMAL_STATE_H__ \ No newline at end of file diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_ss.c b/service-mgmt/sm-1.0.0/src/sm_failover_ss.c index e1323cba..a4235d3b 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_ss.c +++ b/service-mgmt/sm-1.0.0/src/sm_failover_ss.c @@ -5,8 +5,77 @@ // #include "sm_failover_ss.h" +#include #include "sm_debug.h" +#include "sm_limits.h" +#include "sm_node_utils.h" +#include "sm_node_api.h" +#include "sm_failover_utils.h" +#include "sm_node_utils.h" +#include "sm_node_api.h" +#include "sm_failover.h" +SmSystemFailoverStatus::SmSystemFailoverStatus() +{ + SmErrorT error; + char host_name[SM_NODE_NAME_MAX_CHAR]; + char peer_name[SM_NODE_NAME_MAX_CHAR]; + error = sm_node_utils_get_hostname(host_name); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to get hostname, error=%s.", + sm_error_str( error ) ); + return; + } + error = sm_node_api_get_peername(peer_name); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to get peername, error=%s.", + sm_error_str( error ) ); + return; + } + _host_schedule_state = sm_get_controller_state(host_name); + _peer_schedule_state = sm_get_controller_state(peer_name); +} + +SmSystemFailoverStatus::~SmSystemFailoverStatus() +{ +} + +bool SmSystemFailoverStatus::_is_valid_schedule_state(SmNodeScheduleStateT state) +{ + return (SM_NODE_STATE_ACTIVE == state || + SM_NODE_STATE_STANDBY == state || + SM_NODE_STATE_FAILED == state); +} + +void SmSystemFailoverStatus::set_host_schedule_state(SmNodeScheduleStateT state) +{ + if(_is_valid_schedule_state(state)) + { + if(_host_schedule_state != state) + { + _host_schedule_state=state; + } + }else + { + DPRINTFE("Runtime error, schedule state unknown %d", state); + } +} + +void SmSystemFailoverStatus::set_peer_schedule_state(SmNodeScheduleStateT state) +{ + if(_is_valid_schedule_state(state)) + { + if(_peer_schedule_state != state) + { + _peer_schedule_state = state; + } + }else + { + DPRINTFE("Runtime error, schedule state unknown %d", state); + } +} typedef enum { @@ -38,15 +107,76 @@ static int get_node_if_healthy_score(unsigned int interface_state) } // **************************************************************************** +SmErrorT _get_system_status(SmSystemStatusT& sys_status, char host_name[], char peer_name[]) +{ + SmErrorT error; + SmNodeScheduleStateT host_state; + + error = sm_node_utils_get_hostname(host_name); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to get hostname, error=%s.", + sm_error_str( error ) ); + return error; + } + error = sm_node_api_get_peername(peer_name); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to get peername, error=%s.", + sm_error_str( error ) ); + return error; + } + host_state = sm_get_controller_state(host_name); + + sys_status.system_mode = sm_node_utils_get_system_mode(); + sys_status.host_status.mgmt_state = sm_failover_get_interface_info(SM_INTERFACE_MGMT); + sys_status.host_status.infra_state = sm_failover_get_interface_info(SM_INTERFACE_INFRA); + sys_status.host_status.oam_state = sm_failover_get_interface_info(SM_INTERFACE_OAM); + + if(SM_FAILOVER_INTERFACE_OK == sys_status.host_status.mgmt_state || + SM_FAILOVER_INTERFACE_OK == sys_status.host_status.oam_state || + SM_FAILOVER_INTERFACE_OK == sys_status.host_status.infra_state) + { + sys_status.heartbeat_state = SM_HEARTBEAT_OK; + }else + { + sys_status.heartbeat_state = SM_HEARTBEAT_LOSS; + } + + sys_status.host_status.node_name = host_name; + sys_status.host_status.interface_state = sm_failover_if_state_get(); + sys_status.host_status.current_schedule_state = host_state; + sys_status.peer_status.node_name = peer_name; + + sys_status.peer_status.interface_state = sm_failover_get_peer_if_state(); + sys_status.peer_status.current_schedule_state = sm_get_controller_state(peer_name); + return SM_OKAY; +} + // **************************************************************************** // sm_failover_ss_get_survivor - select the failover survivor // This is the main entry/container for the failover logic to determine how // to schedule the controllers, i.e, active/standby or active/failure. // =================== +SmErrorT sm_failover_ss_get_survivor(SmSystemFailoverStatus& selection) +{ + SmSystemStatusT sys_status; + char host_name[SM_NODE_NAME_MAX_CHAR]; + char peer_name[SM_NODE_NAME_MAX_CHAR]; + + SmErrorT error = _get_system_status(sys_status, host_name, peer_name); + if(SM_OKAY != error) + { + DPRINTFE("Failed to retrieve system status. Error %s", sm_error_str(error)); + return error; + } + return sm_failover_ss_get_survivor(sys_status, selection); +} + SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection) { - selection.host_schedule_state = system_status.host_status.current_schedule_state; - selection.peer_schedule_state = system_status.peer_status.current_schedule_state; + selection.set_host_schedule_state(system_status.host_status.current_schedule_state); + selection.set_peer_schedule_state(system_status.peer_status.current_schedule_state); if(SM_HEARTBEAT_OK == system_status.heartbeat_state) { int host_healthy_score, peer_healthy_score; @@ -55,26 +185,26 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys if( peer_healthy_score < host_healthy_score ) { //host is more healthy - selection.host_schedule_state = SM_NODE_STATE_ACTIVE; - selection.peer_schedule_state = SM_NODE_STATE_STANDBY; + selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE); + selection.set_peer_schedule_state(SM_NODE_STATE_STANDBY); }else if(peer_healthy_score > host_healthy_score) { //peer is more healthy - selection.host_schedule_state = SM_NODE_STATE_STANDBY; - selection.peer_schedule_state = SM_NODE_STATE_ACTIVE; + selection.set_host_schedule_state(SM_NODE_STATE_STANDBY); + selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE); } } - if(system_status.host_status.current_schedule_state != selection.host_schedule_state || - system_status.peer_status.current_schedule_state != selection.peer_schedule_state ) + if(system_status.host_status.current_schedule_state != selection.get_host_schedule_state() || + system_status.peer_status.current_schedule_state != selection.get_peer_schedule_state() ) { DPRINTFI("Uncontrolled swact starts. Host from %s to %s, Peer from %s to %s.", sm_node_schedule_state_str(system_status.host_status.current_schedule_state), - sm_node_schedule_state_str(selection.host_schedule_state), + sm_node_schedule_state_str(selection.get_host_schedule_state()), sm_node_schedule_state_str(system_status.peer_status.current_schedule_state), - sm_node_schedule_state_str(selection.peer_schedule_state) + sm_node_schedule_state_str(selection.get_peer_schedule_state()) ); } return SM_OKAY; } -// **************************************************************************** \ No newline at end of file +// **************************************************************************** diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_ss.h b/service-mgmt/sm-1.0.0/src/sm_failover_ss.h index ebc9c3ff..99e48a1b 100644 --- a/service-mgmt/sm-1.0.0/src/sm_failover_ss.h +++ b/service-mgmt/sm-1.0.0/src/sm_failover_ss.h @@ -8,14 +8,18 @@ #define __SM_FAILOVER_SS_H__ #include #include "sm_types.h" -#ifdef __cplusplus -extern "C" { -#endif +//#ifdef __cplusplus +//extern "C" { +//#endif typedef struct { const char* node_name; unsigned int interface_state; + unsigned int heartbeat_state; + SmFailoverInterfaceStateT mgmt_state; + SmFailoverInterfaceStateT infra_state; + SmFailoverInterfaceStateT oam_state; SmNodeScheduleStateT current_schedule_state; }SmNodeStatusT; @@ -40,18 +44,36 @@ typedef struct }SmSystemStatusT; -typedef struct +class SmSystemFailoverStatus { - SmNodeScheduleStateT host_schedule_state; - SmNodeScheduleStateT peer_schedule_state; -}SmSystemFailoverStatus; + public: + SmSystemFailoverStatus(); + virtual ~SmSystemFailoverStatus(); + inline SmNodeScheduleStateT get_host_schedule_state() const { + return _host_schedule_state; + } + void set_host_schedule_state(SmNodeScheduleStateT state); + inline SmNodeScheduleStateT get_peer_schedule_state() const { + return _peer_schedule_state; + } + void set_peer_schedule_state(SmNodeScheduleStateT state); + private: + SmNodeScheduleStateT _host_schedule_state; + SmNodeScheduleStateT _peer_schedule_state; + static const char filename[]; + static const char file_format[]; + + bool _is_valid_schedule_state(SmNodeScheduleStateT state); +}; // **************************************************************************** // sm_failover_ss_get_survivor - select the failover survivor // =================== +SmErrorT sm_failover_ss_get_survivor(SmSystemFailoverStatus& selection); + SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection); -#ifdef __cplusplus -} -#endif +//#ifdef __cplusplus +//} +//#endif #endif // __SM_FAILOVER_SS_H__ diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp b/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp new file mode 100644 index 00000000..dec43433 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp @@ -0,0 +1,62 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#include "sm_failover_survived_state.h" +#include +#include "sm_types.h" +#include "sm_debug.h" +#include "sm_failover_fsm.h" +#include "sm_failover_ss.h" + +SmErrorT SmFailoverSurvivedState::enter_state() +{ + DPRINTFI("Entering Survived state"); + return SM_OKAY; +} + +SmErrorT SmFailoverSurvivedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) +{ + switch (event) + { + case SM_HEARTBEAT_LOST: + case SM_INTERFACE_DOWN: + case SM_INTERFACE_UP: + // ignore, recover is possible only when heartbeat is recovered + break; + case SM_HEARTBEAT_RECOVER: + { + SmSystemFailoverStatus failover_status; + SmErrorT error = sm_failover_ss_get_survivor(failover_status); + SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state(); + SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state(); + if(SM_OKAY != error) + { + DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error)); + return error; + } + + if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state) + { + // Active is the only possible state to be scheduled to from survived state + this->fsm.set_state(SM_FAILOVER_STATE_NORMAL); + }else + { + DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s", + sm_node_schedule_state_str(host_state), + sm_node_schedule_state_str(peer_state)); + } + } + break; + default: + DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state()); + } + return SM_OKAY; +} + +SmErrorT SmFailoverSurvivedState::exit_state() +{ + DPRINTFI("Exiting Survived state"); + return SM_OKAY; +} diff --git a/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.h b/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.h new file mode 100644 index 00000000..7f58b505 --- /dev/null +++ b/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.h @@ -0,0 +1,23 @@ +// +// Copyright (c) 2018 Wind River Systems, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef __SM_FAILOVER_SURVIVED_STATE_H__ +#define __SM_FAILOVER_SURVIVED_STATE_H__ +#include "sm_types.h" +#include "sm_failover_fsm.h" + +class SmFailoverSurvivedState : public SmFSMState +{ + public: + SmFailoverSurvivedState(SmFailoverFSM& fsm) : SmFSMState(fsm){} + SmErrorT enter_state(); + SmErrorT exit_state(); + + protected: + SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data); +}; + + +#endif //__SM_FAILOVER_SURVIVED_STATE_H__ \ No newline at end of file diff --git a/service-mgmt/sm-1.0.0/src/sm_heartbeat_msg.c b/service-mgmt/sm-1.0.0/src/sm_heartbeat_msg.c index 76129dad..47afc1fc 100644 --- a/service-mgmt/sm-1.0.0/src/sm_heartbeat_msg.c +++ b/service-mgmt/sm-1.0.0/src/sm_heartbeat_msg.c @@ -208,11 +208,7 @@ SmErrorT sm_heartbeat_msg_send_alive( SmNetworkTypeT network_type, char node_nam heartbeat_msg.u.if_state_msg.msg_size = htons((uint16_t)sizeof(SmHeartbeatMsgAliveRev2T)); SmHeartbeatMsgIfStateT if_state; - SmErrorT error = sm_failover_if_state_get(&if_state); - if(SM_OKAY != error) - { - return SM_FAILED; - } + if_state = sm_failover_if_state_get(); heartbeat_msg.u.if_state_msg.if_state = htonl(if_state); if( SM_AUTH_TYPE_HMAC_SHA512 == auth_type ) diff --git a/service-mgmt/sm-1.0.0/src/sm_node_api.cpp b/service-mgmt/sm-1.0.0/src/sm_node_api.cpp index 8ea9203f..179ec770 100644 --- a/service-mgmt/sm-1.0.0/src/sm_node_api.cpp +++ b/service-mgmt/sm-1.0.0/src/sm_node_api.cpp @@ -58,6 +58,36 @@ SmErrorT sm_node_api_get_hostname( char node_name[] ) } // **************************************************************************** +// **************************************************************************** +// Node API - Get Peer Name +// ======================== +SmErrorT sm_node_api_get_peername(char peer_name[SM_NODE_NAME_MAX_CHAR]) +{ + char node_name[SM_NODE_NAME_MAX_CHAR]; + SmErrorT error = sm_node_api_get_hostname(node_name); + if(SM_OKAY != error) + { + return error; + } + + char format[] = "name <> '%s'"; + char query[SM_NODE_NAME_MAX_CHAR + sizeof(format)]; + unsigned int cnt = snprintf(query, sizeof(query), format, node_name); + if(cnt < sizeof(query) - 1) + { + SmDbNodeT node; + error = sm_db_nodes_query(_sm_db_handle, query, &node); + if(SM_OKAY != error) + { + return error; + } + strncpy(peer_name, node.name, SM_NODE_NAME_MAX_CHAR); + return SM_OKAY; + } + return SM_FAILED; +} +// **************************************************************************** + // **************************************************************************** // Node API - Configuration Complete // ================================= diff --git a/service-mgmt/sm-1.0.0/src/sm_node_api.h b/service-mgmt/sm-1.0.0/src/sm_node_api.h index da3b2e43..773b434d 100644 --- a/service-mgmt/sm-1.0.0/src/sm_node_api.h +++ b/service-mgmt/sm-1.0.0/src/sm_node_api.h @@ -10,6 +10,7 @@ #include "sm_types.h" #include "sm_uuid.h" +#include "sm_limits.h" #ifdef __cplusplus extern "C" { @@ -21,6 +22,12 @@ extern "C" { extern SmErrorT sm_node_api_get_hostname( char node_name[] ); // **************************************************************************** +// **************************************************************************** +// Node API - Get Peer Name +// ======================== +extern SmErrorT sm_node_api_get_peername(char peer_name[SM_NODE_NAME_MAX_CHAR]); +// **************************************************************************** + // **************************************************************************** // Node API - Configuration Complete // ================================= diff --git a/service-mgmt/sm-common-1.0.0/src/sm_types.h b/service-mgmt/sm-common-1.0.0/src/sm_types.h index 0fa563cd..26aa2b92 100644 --- a/service-mgmt/sm-common-1.0.0/src/sm_types.h +++ b/service-mgmt/sm-common-1.0.0/src/sm_types.h @@ -706,6 +706,16 @@ typedef enum SM_NODE_STATE_MAX }SmNodeScheduleStateT; + +typedef enum +{ + SM_FAILOVER_INTERFACE_UNKNOWN, + SM_FAILOVER_INTERFACE_OK, + SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT, + SM_FAILOVER_INTERFACE_DOWN, + SM_FAILOVER_INTERFACE_RECOVERING +}SmFailoverInterfaceStateT; + // **************************************************************************** // Types - System Mode String // =======================================