From 84ac862fdce5c4e9a134961aa6bce8cdd7f1f66f Mon Sep 17 00:00:00 2001 From: Bin Qian Date: Tue, 4 May 2021 11:33:43 -0400 Subject: [PATCH] Fix AIO-DX failover issues This fix addresses unexpected failover behaviors on AIO-DX: 1. The active controller reboots itself when the standby controller reboots or loses power. 2. The standby controller becomes degraded after the active controller reboots or loses power. Closes-bug: 1927133 Change-Id: If3c9f6251f689a89cd206c672092ba296f00bd6b Signed-off-by: Bin Qian (cherry picked from commit 0b99b594f83b7c626cc0c4f7dc970ce373a7b748) --- .../sm/src/sm_failover_fail_pending_state.cpp | 9 +-- .../sm/src/sm_failover_failed_state.cpp | 67 ++++++++++++++++--- service-mgmt/sm/src/sm_node_api.cpp | 38 +++++++++++ service-mgmt/sm/src/sm_node_api.h | 6 ++ 4 files changed, 106 insertions(+), 14 deletions(-) diff --git a/service-mgmt/sm/src/sm_failover_fail_pending_state.cpp b/service-mgmt/sm/src/sm_failover_fail_pending_state.cpp index 94431ba6..45cf89f0 100644 --- a/service-mgmt/sm/src/sm_failover_fail_pending_state.cpp +++ b/service-mgmt/sm/src/sm_failover_fail_pending_state.cpp @@ -206,8 +206,8 @@ SmFailoverFailPendingState::~SmFailoverFailPendingState() SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) { - //SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type(); bool duplex = false; + bool blind_guess = false; switch (event) { case SM_FAILOVER_EVENT_IF_STATE_CHANGED: @@ -249,13 +249,10 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const if(healthy) { blind_guess_scenario_start(); - } - else - { - this->fsm.set_state(SM_FAILOVER_STATE_FAILED); + blind_guess = true; } } - else + if( !blind_guess ) { SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status(); SmErrorT error = sm_failover_ss_get_survivor(failover_status); diff --git a/service-mgmt/sm/src/sm_failover_failed_state.cpp b/service-mgmt/sm/src/sm_failover_failed_state.cpp index 
8b5a9c01..d96b922e 100644 --- a/service-mgmt/sm/src/sm_failover_failed_state.cpp +++ b/service-mgmt/sm/src/sm_failover_failed_state.cpp @@ -20,6 +20,7 @@ #include "sm_types.h" #include "sm_debug.h" #include "sm_node_utils.h" +#include "sm_node_api.h" #include "sm_failover.h" #include "sm_failover_fsm.h" #include "sm_failover_ss.h" @@ -183,9 +184,54 @@ static bool sm_failover_failed_recovery_criteria_met( void ) return (criteria_met); } +SmErrorT proceed_recovery() +{ + SmErrorT error; + char peer_name[SM_NODE_NAME_MAX_CHAR]; + char host_name[SM_NODE_NAME_MAX_CHAR]; + // delete peer node + error = sm_node_api_get_peername(peer_name); + if(SM_OKAY != error) + { + DPRINTFI("Cannot retrieve peer's hostname, error %s", sm_error_str(error)); + return error; + } + error = sm_node_api_delete_node(peer_name); + if(SM_OKAY != error) + { + DPRINTFI("Failed to delete peer %s, error %s", peer_name, sm_error_str(error)); + return error; + }else + { + DPRINTFI("Peer %s is deleted.", peer_name); + } + + // enable host + error = sm_node_api_get_hostname(host_name); + if(SM_OKAY != error) + { + DPRINTFI("Cannot retrieve hostname, error %s", sm_error_str(error)); + return error; + } + error = sm_node_api_recover_node(host_name); + if(SM_OKAY != error) + { + DPRINTFI("Failed to recover %s, error %s", host_name, sm_error_str(error)); + return error; + }else + { + DPRINTFI("Host %s is recovered.", host_name); + } + + sm_node_utils_reset_unhealthy_flag(); + DPRINTFI("Unhealthy flag is removed"); + return SM_OKAY; +} + // The 'Failover Failed' state recovery audit handler SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) { + SmErrorT error; event_data=event_data; switch (event) { @@ -197,16 +243,21 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF DPRINTFI("************************************"); DPRINTFI("** Failover Failed state recovery **"); DPRINTFI("************************************"); - 
sm_node_utils_reset_unhealthy_flag(); - sm_failover_failed_process_restart(PROCESS_HBSAGENT); - sm_failover_failed_process_restart(PROCESS_SM); - for ( int i = 0 ; i < 10 ; i++ ) + error = proceed_recovery(); + if(SM_OKAY != error) { - // waiting for shutdown - sleep(1); + DPRINTFE("Cannot recover from failed state"); + }else + { + sm_failover_failed_process_restart(PROCESS_SM); + for ( int i = 0 ; i < 10 ; i++ ) + { + // waiting for shutdown + sleep(1); + } + DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry"); + sm_node_utils_set_unhealthy(); } - DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry"); - sm_node_utils_set_unhealthy(); } else if ( ++_log_throttle > 1 ) { diff --git a/service-mgmt/sm/src/sm_node_api.cpp b/service-mgmt/sm/src/sm_node_api.cpp index 179ec770..c15d50b6 100644 --- a/service-mgmt/sm/src/sm_node_api.cpp +++ b/service-mgmt/sm/src/sm_node_api.cpp @@ -742,6 +742,44 @@ SmErrorT sm_node_api_fail_node( char node_name[] ) } // **************************************************************************** +// **************************************************************************** +// Node API - Recover Node +// ====================== +SmErrorT sm_node_api_recover_node( char node_name[] ) +{ + SmDbNodeT node; + SmErrorT error; + error = sm_db_nodes_read( _sm_db_handle, node_name, &node ); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to read node (%s) information, error=%s.", + node_name, sm_error_str( error ) ); + return( error ); + } + + if( node.oper_state != SM_NODE_OPERATIONAL_STATE_DISABLED || + node.avail_status != SM_NODE_AVAIL_STATUS_FAILED ) + { + DPRINTFD("Not in failure mode %s", node_name); + } + + DPRINTFE("Node %s is to recover from failure mode.", node_name); + + error = sm_node_api_update_node( + node_name, + node.admin_state, + SM_NODE_OPERATIONAL_STATE_ENABLED, + SM_NODE_AVAIL_STATUS_AVAILABLE); + + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to set node (%s) 
failed, error=%s.", + node_name, sm_error_str( error ) ); + } + return( error ); +} +// **************************************************************************** + // **************************************************************************** // Node API - Delete Node // ====================== diff --git a/service-mgmt/sm/src/sm_node_api.h b/service-mgmt/sm/src/sm_node_api.h index 773b434d..e8874f52 100644 --- a/service-mgmt/sm/src/sm_node_api.h +++ b/service-mgmt/sm/src/sm_node_api.h @@ -54,6 +54,12 @@ extern SmErrorT sm_node_api_update_node( char node_name[], SmErrorT sm_node_api_fail_node( char node_name[] ); // **************************************************************************** +// **************************************************************************** +// Node API - Recover Node +// ====================== +SmErrorT sm_node_api_recover_node( char node_name[] ); +// **************************************************************************** + // **************************************************************************** // Node API - Delete Node // ======================