Merge "Add unhealthy state recovery audit to service management (sm)"

This commit is contained in:
Zuul 2020-06-21 13:02:49 +00:00 committed by Gerrit Code Review
commit c94618a228
8 changed files with 310 additions and 15 deletions

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014-2018 Wind River Systems, Inc.
// Copyright (c) 2014-2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -581,3 +581,17 @@ SmErrorT sm_node_utils_is_aio_duplex( bool* is_aio_duplex )
return SM_OKAY;
}
// ****************************************************************************
// Node Utilities - Clear the unhealthy flag
// ==============================
// - does nothing when SM_NODE_UNHEALTHY_FILE is not present
// - otherwise unlinks it and logs an error if it can still be seen afterwards
extern void sm_node_utils_reset_unhealthy_flag( void )
{
    // Nothing to clear when the flag file is absent.
    if( 0 != access( SM_NODE_UNHEALTHY_FILE, F_OK ) )
    {
        return;
    }

    unlink( SM_NODE_UNHEALTHY_FILE );

    // The unlink result is not inspected ; success is judged by probing
    // for the file a second time.
    if( 0 != access( SM_NODE_UNHEALTHY_FILE, F_OK ) )
    {
        return;
    }

    DPRINTFE("file did not get removed ; %s", SM_NODE_UNHEALTHY_FILE);
}

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014 Wind River Systems, Inc.
// Copyright (c) 2014,2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -104,6 +104,12 @@ extern SmErrorT sm_node_utils_is_aio_duplex( bool* is_aio_duplex );
extern bool sm_node_utils_set_failover( bool to_disable );
// ****************************************************************************
// ****************************************************************************
// Node Utilities - Clear the unhealthy flag
// ==============================
extern void sm_node_utils_reset_unhealthy_flag( void );
// ****************************************************************************
#ifdef __cplusplus
}
#endif

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014-2018 Wind River Systems, Inc.
// Copyright (c) 2014-2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -173,6 +173,7 @@ _sm_failover_event_mappings[SM_FAILOVER_EVENT_MAX] =
{SM_FAILOVER_EVENT_HEARTBEAT_ENABLED, "heartbeat-enabled"},
{SM_FAILOVER_EVENT_IF_STATE_CHANGED, "interface-state-changed"},
{SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, "fail-pending-timeout"},
{SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, "failed-recovery-audit"},
{SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"}
};
@ -186,6 +187,15 @@ _sm_failover_state_mappings[SM_FAILOVER_STATE_MAX] =
{SM_FAILOVER_STATE_SURVIVED, "survived"}
};
// Value-to-name table for SmFailoverInterfaceStateT, consulted by
// sm_failover_interface_state_str(). It is sized by
// SM_FAILOVER_INTERFACE_STATE_MAX and lists one printable name per state.
static SmValueStrMappingT
_sm_failover_interface_state_mappings[SM_FAILOVER_INTERFACE_STATE_MAX] =
{
{SM_FAILOVER_INTERFACE_UNKNOWN, "unknown"},
{SM_FAILOVER_INTERFACE_OK, "ok"},
{SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT, "missing-heartbeat"},
{SM_FAILOVER_INTERFACE_DOWN, "down"}
};
static SmValueStrMappingT
_sm_service_domain_neighbor_state_mappings[SM_SERVICE_DOMAIN_NEIGHBOR_STATE_MAX] =
{
@ -993,6 +1003,17 @@ const char* sm_failover_state_str( SmFailoverStateT state )
SM_FAILOVER_STATE_MAX,
state ) );
}
// ****************************************************************************
// Types - Failover Interface State String
// =============================================
const char* sm_failover_interface_state_str( SmFailoverInterfaceStateT state )
{
    // Resolve the printable name through the shared value-to-string helper,
    // using the failover interface state table.
    const char* state_name = sm_mapping_get_str(
                                 _sm_failover_interface_state_mappings,
                                 SM_FAILOVER_INTERFACE_STATE_MAX,
                                 state );

    return state_name;
}
// ****************************************************************************
// ****************************************************************************

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014-2018 Wind River Systems, Inc.
// Copyright (c) 2014-2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -293,6 +293,7 @@ typedef enum{
SM_FAILOVER_EVENT_HEARTBEAT_ENABLED,
SM_FAILOVER_EVENT_IF_STATE_CHANGED,
SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT,
SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT,
SM_FAILOVER_EVENT_NODE_ENABLED,
SM_FAILOVER_EVENT_MAX
}SmFailoverEventT;
@ -730,7 +731,8 @@ typedef enum
SM_FAILOVER_INTERFACE_UNKNOWN,
SM_FAILOVER_INTERFACE_OK,
SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT,
SM_FAILOVER_INTERFACE_DOWN
SM_FAILOVER_INTERFACE_DOWN,
SM_FAILOVER_INTERFACE_STATE_MAX
}SmFailoverInterfaceStateT;
// ****************************************************************************
@ -960,6 +962,10 @@ extern const char* sm_failover_event_str( SmFailoverEventT event );
extern const char* sm_failover_state_str( SmFailoverStateT state );
// ****************************************************************************
const char* sm_failover_interface_state_str( SmFailoverInterfaceStateT state );
// ****************************************************************************
// ****************************************************************************
// Types - Service Domain Neighbor State Value
// ===========================================

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2017 Wind River Systems, Inc.
// Copyright (c) 2017-2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//

View File

@ -231,6 +231,10 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
{
blind_guess_scenario_start();
}
else
{
this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
}
}
else
{

View File

@ -3,24 +3,203 @@
//
// SPDX-License-Identifier: Apache-2.0
//
#include "sm_failover_failed_state.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include "sm_failover_failed_state.h"
#include "sm_types.h"
#include "sm_debug.h"
#include "sm_node_utils.h"
#include "sm_failover.h"
#include "sm_failover_fsm.h"
#include "sm_failover_ss.h"
#include "sm_failover_utils.h"
extern bool is_cluster_host_interface_configured( void );
// Failover Failed Recovery Audit period = 5 seconds
// (value handed to sm_timer_register when the audit timer is started)
static const int FAILED_STATE_AUDIT_PERIOD = 5000;
// Recovery log throttle threshold - 1 log every minute
// (12 audit passes at the 5 second audit period above)
static const int SM_FAILOVER_FAILED_LOG_THROTTLE_THLD = 12;
// processes to restart over a failover failed recovery
// MAX_RESTART_PROCESS_NAME_LEN sizes the buffer the process name is copied
// into before the exec, so it includes the terminating NUL ; longer names
// are truncated by snprintf.
#define MAX_RESTART_PROCESS_NAME_LEN 10
#define PROCESS_HBSAGENT ((const char *)("hbsAgent"))
#define PROCESS_SM ((const char *)("sm"))
// Failover Failed state class constructor
// - marks the recovery audit timer as not registered
// - zeroes the recovery monitor log throttle counter
SmFailoverFailedState::SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm)
{
    this->_failed_state_audit_timer_id = SM_TIMER_ID_INVALID;

    // The event handler pre-increments this counter on every audit pass
    // that does not meet the recovery criteria, so it must start from a
    // defined value rather than whatever the allocation happened to hold.
    this->_log_throttle = 0;
}
// The 'Failover Failed' state destructor
// - stops the recovery audit if needed
SmFailoverFailedState::~SmFailoverFailedState()
{
    // The result is deliberately dropped ; _deregister_timer() logs its
    // own failure and there is no caller to report to from a destructor.
    (void) _deregister_timer();
}
// Failover Failed state entry class member function
// - starts the Failover Failed state recovery audit timer
// - returns the result of the audit timer registration
SmErrorT SmFailoverFailedState::enter_state()
{
    // Run the base class entry handling first.
    SmFSMState::enter_state();

    // Banner is logged at error level so entry into this state stands out.
    DPRINTFE("********************************************************");
    DPRINTFE("Entering Failover Failed state ; recovery audit started ");
    DPRINTFE("********************************************************");

    const SmErrorT rc = _register_timer();
    if( SM_OKAY == rc )
    {
        return rc;
    }

    DPRINTFE("Failed to register failed state timer. Error %s", sm_error_str(rc));
    return rc;
}
// Failover Failed state audit timer handler
// - posts SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT to the failover FSM
// - always returns true (presumably keeps the timer armed - TODO confirm
//   against the timer service contract)
bool SmFailoverFailedState::_failed_state_audit(
    SmTimerIdT timer_id, int64_t user_data)
{
    // Neither callback argument is needed by the audit.
    (void) timer_id;
    (void) user_data;

    SmFailoverFSM::get_fsm().send_event(
        SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, NULL );

    return true;
}
// Issue a self restart through pmon-restart service
static bool sm_failover_failed_process_restart( const char * process )
{
DPRINTFI( "Issuing controlled process restart ; pmon-restart %s", process);
pid_t pid = fork();
if( 0 > pid )
{
DPRINTFE( "Failed to fork 'pmond-restart %s' request, error=%s.",
process, strerror( errno ) );
return( true );
}
else if( 0 == pid )
{
// set the arguement array for execv
char pmon_restart_cmd[] = "/usr/local/sbin/pmon-restart";
char pmon_restart_process[MAX_RESTART_PROCESS_NAME_LEN] ;
snprintf(&pmon_restart_process[0], MAX_RESTART_PROCESS_NAME_LEN, "%s", process);
char* pmon_restart_argv[3] ;
pmon_restart_argv[0] = pmon_restart_cmd;
pmon_restart_argv[1] = pmon_restart_process;
pmon_restart_argv[2] = NULL;
// Add the path to socat for pmon-restart
char path[] = "PATH=/usr/bin:$PATH";
char* pmon_restart_env[2] ;
pmon_restart_env[0] = path;
pmon_restart_env[1] = NULL;
setpgid( 0, 0 );
struct rlimit file_limits;
if( 0 == getrlimit( RLIMIT_NOFILE, &file_limits ) )
{
unsigned int fd_i;
for( fd_i=0; fd_i < file_limits.rlim_cur; ++fd_i )
{
close( fd_i );
}
open( "/dev/null", O_RDONLY ); // stdin
open( "/dev/null", O_WRONLY ); // stdout
open( "/dev/null", O_WRONLY ); // stderr
}
execve( pmon_restart_argv[0], pmon_restart_argv, pmon_restart_env );
// Shouldn't get this far, else there was an error.
exit(-1);
}
return( false );
}
// Failover Failed recovery criteria checker
static bool sm_failover_failed_recovery_criteria_met( void )
{
bool criteria_met = false ;
SmFailoverInterfaceStateT oam_state, mgmt_state, cluster_host_state;
oam_state = sm_failover_get_interface_info(SM_INTERFACE_OAM);
mgmt_state = sm_failover_get_interface_info(SM_INTERFACE_MGMT);
if ( is_cluster_host_interface_configured() )
{
cluster_host_state = sm_failover_get_interface_info(SM_INTERFACE_CLUSTER_HOST);
if ((( oam_state == SM_FAILOVER_INTERFACE_OK ) || ( oam_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
(( mgmt_state == SM_FAILOVER_INTERFACE_OK ) || ( mgmt_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
(( cluster_host_state == SM_FAILOVER_INTERFACE_OK ) || ( cluster_host_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )))
{
criteria_met = true ;
}
}
else if ((( oam_state == SM_FAILOVER_INTERFACE_OK ) || ( oam_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
(( mgmt_state == SM_FAILOVER_INTERFACE_OK ) || ( mgmt_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )))
{
criteria_met = true ;
}
DPRINTFI("Oam:%s ; Mgmt:%s ; Cluster:%s ; recovery criteria met: %s",
sm_failover_interface_state_str(oam_state),
sm_failover_interface_state_str(mgmt_state),
sm_failover_interface_state_str(cluster_host_state),
criteria_met ? "Yes" : "No");
return (criteria_met);
}
// The 'Failover Failed' state recovery audit handler
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
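// Besides a reboot triggered by mtce, this state can be left through the
// recovery audit handled below: when the recovery criteria are met the
// unhealthy flag is cleared and hbsAgent and sm are restarted via pmon-restart.
// Until then the fsm remains in the failed state.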
event_data=event_data;
switch (event)
{
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
// event will be fired, but couldn't bring fsm state back to normal
case SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT:
{
if ( sm_failover_failed_recovery_criteria_met() )
{
DPRINTFI("************************************");
DPRINTFI("** Failover Failed state recovery **");
DPRINTFI("************************************");
sm_node_utils_reset_unhealthy_flag();
sm_failover_failed_process_restart(PROCESS_HBSAGENT);
sm_failover_failed_process_restart(PROCESS_SM);
for ( int i = 0 ; i < 10 ; i++ )
{
// waiting for shutdown
sleep(1);
}
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
sm_node_utils_set_unhealthy();
}
else if ( ++_log_throttle > 1 )
{
if ( _log_throttle > SM_FAILOVER_FAILED_LOG_THROTTLE_THLD )
_log_throttle = 0 ;
}
else
{
DPRINTFI("Failover Failed state recovery monitor");
}
break;
}
default:
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),
@ -28,3 +207,58 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
}
return SM_OKAY;
}
// Start the 'Failover Failed' state recovery audit
// - drops any audit timer that is still registered before starting a new one
// - returns the result of sm_timer_register
SmErrorT SmFailoverFailedState::_register_timer()
{
    if( SM_TIMER_ID_INVALID != _failed_state_audit_timer_id )
    {
        // Result not checked here ; _deregister_timer() logs its own failure.
        _deregister_timer();
    }

    return sm_timer_register( "FAILED STATE AUDIT TIMER",
                              FAILED_STATE_AUDIT_PERIOD,
                              SmFailoverFailedState::_failed_state_audit,
                              0,
                              &_failed_state_audit_timer_id );
}
// Stop the 'Failover Failed' state recovery audit
// - returns SM_OKAY when no timer is registered
// - on a failed deregistration the timer id is kept and the error returned
SmErrorT SmFailoverFailedState::_deregister_timer()
{
    // No audit timer running ; nothing to stop.
    if( SM_TIMER_ID_INVALID == _failed_state_audit_timer_id )
    {
        return SM_OKAY;
    }

    const SmErrorT rc = sm_timer_deregister( _failed_state_audit_timer_id );
    if( SM_OKAY == rc )
    {
        // Forget the id only once the timer service has released it.
        _failed_state_audit_timer_id = SM_TIMER_ID_INVALID;
    }
    else
    {
        DPRINTFE( "Failed to cancel failed timer, error=%s.",
                  sm_error_str( rc ) );
    }

    return rc;
}
// Failover Failed state exit class member function
// - stops the recovery audit timer, runs the base class exit handling
// - returns the result of the last deregistration attempt
SmErrorT SmFailoverFailedState::exit_state()
{
    SmErrorT rc = _deregister_timer();
    if( SM_OKAY != rc )
    {
        DPRINTFE("Failed to deregister fail failed timer. Error %s", sm_error_str(rc));
    }

    // The id is still set only when the attempt above failed. Try once
    // more and forget the id whatever the outcome.
    if( SM_TIMER_ID_INVALID != this->_failed_state_audit_timer_id )
    {
        const SmTimerIdT stale_timer_id = this->_failed_state_audit_timer_id;

        rc = sm_timer_deregister( stale_timer_id );
        this->_failed_state_audit_timer_id = SM_TIMER_ID_INVALID;

        if( SM_OKAY != rc )
        {
            DPRINTFE("Failed to deregister action timer. Error %s", sm_error_str(rc));
        }
    }

    SmFSMState::exit_state();
    return rc;
}

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2018 Wind River Systems, Inc.
// Copyright (c) 2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -12,12 +12,22 @@
// 'Failover Failed' state of the failover FSM.
// While in this state a periodic audit timer posts a recovery audit event
// to the FSM ; the event handler decides whether recovery can be attempted.
class SmFailoverFailedState : public SmFSMState
{
public:
SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
SmFailoverFailedState(SmFailoverFSM& fsm);
// Stops the recovery audit timer if it is still registered.
virtual ~SmFailoverFailedState();
// Starts the recovery audit timer ; returns the registration result.
SmErrorT enter_state();
// Stops the recovery audit timer before leaving the state.
SmErrorT exit_state();
protected:
// Handles the interface-state-changed and recovery-audit events.
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);
private:
// Id of the recovery audit timer ; SM_TIMER_ID_INVALID when not registered.
SmTimerIdT _failed_state_audit_timer_id;
// Timer callback ; posts SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT to the FSM.
static bool _failed_state_audit(SmTimerIdT timer_id, int64_t user_data);
// Start / stop helpers for the recovery audit timer.
SmErrorT _register_timer();
SmErrorT _deregister_timer();
// Audit pass counter used to throttle the recovery monitor log.
int _log_throttle ;
};
#endif //__SM_FAILOVER_FAILED_STATE_H__
#endif //__SM_FAILOVER_FAILED_STATE_H__