Merge "Add unhealthy state recovery audit to service management (sm)"
This commit is contained in:
commit
c94618a228
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014-2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014-2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -581,3 +581,17 @@ SmErrorT sm_node_utils_is_aio_duplex( bool* is_aio_duplex )
|
|||
return SM_OKAY;
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Node Utilities - Clear the unhealthy flag
|
||||
// ==============================
|
||||
extern void sm_node_utils_reset_unhealthy_flag( void )
|
||||
{
|
||||
if( 0 == access( SM_NODE_UNHEALTHY_FILE, F_OK ) )
|
||||
{
|
||||
unlink( SM_NODE_UNHEALTHY_FILE );
|
||||
if( 0 == access( SM_NODE_UNHEALTHY_FILE, F_OK ) )
|
||||
{
|
||||
DPRINTFE("file did not get removed ; %s", SM_NODE_UNHEALTHY_FILE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014,2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -104,6 +104,12 @@ extern SmErrorT sm_node_utils_is_aio_duplex( bool* is_aio_duplex );
|
|||
extern bool sm_node_utils_set_failover( bool to_disable );
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Node Utilities - Clear the unhealthy flag
|
||||
// ==============================
|
||||
extern void sm_node_utils_reset_unhealthy_flag( void );
|
||||
// ****************************************************************************
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014-2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014-2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -173,6 +173,7 @@ _sm_failover_event_mappings[SM_FAILOVER_EVENT_MAX] =
|
|||
{SM_FAILOVER_EVENT_HEARTBEAT_ENABLED, "heartbeat-enabled"},
|
||||
{SM_FAILOVER_EVENT_IF_STATE_CHANGED, "interface-state-changed"},
|
||||
{SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, "fail-pending-timeout"},
|
||||
{SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, "failed-recovery-audit"},
|
||||
{SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"}
|
||||
};
|
||||
|
||||
|
@ -186,6 +187,15 @@ _sm_failover_state_mappings[SM_FAILOVER_STATE_MAX] =
|
|||
{SM_FAILOVER_STATE_SURVIVED, "survived"}
|
||||
};
|
||||
|
||||
// Value-to-string table for SmFailoverInterfaceStateT; this is the table
// that sm_failover_interface_state_str() hands to sm_mapping_get_str().
static SmValueStrMappingT
_sm_failover_interface_state_mappings[SM_FAILOVER_INTERFACE_STATE_MAX] =
{
    { SM_FAILOVER_INTERFACE_UNKNOWN,           "unknown"           },
    { SM_FAILOVER_INTERFACE_OK,                "ok"                },
    { SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT, "missing-heartbeat" },
    { SM_FAILOVER_INTERFACE_DOWN,              "down"              }
};
|
||||
|
||||
static SmValueStrMappingT
|
||||
_sm_service_domain_neighbor_state_mappings[SM_SERVICE_DOMAIN_NEIGHBOR_STATE_MAX] =
|
||||
{
|
||||
|
@ -993,6 +1003,17 @@ const char* sm_failover_state_str( SmFailoverStateT state )
|
|||
SM_FAILOVER_STATE_MAX,
|
||||
state ) );
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Types - Failover Interface State String
|
||||
// =============================================
|
||||
const char* sm_failover_interface_state_str( SmFailoverInterfaceStateT state )
|
||||
{
|
||||
return( sm_mapping_get_str( _sm_failover_interface_state_mappings,
|
||||
SM_FAILOVER_INTERFACE_STATE_MAX,
|
||||
state ) );
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014-2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014-2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -293,6 +293,7 @@ typedef enum{
|
|||
SM_FAILOVER_EVENT_HEARTBEAT_ENABLED,
|
||||
SM_FAILOVER_EVENT_IF_STATE_CHANGED,
|
||||
SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT,
|
||||
SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT,
|
||||
SM_FAILOVER_EVENT_NODE_ENABLED,
|
||||
SM_FAILOVER_EVENT_MAX
|
||||
}SmFailoverEventT;
|
||||
|
@ -730,7 +731,8 @@ typedef enum
|
|||
SM_FAILOVER_INTERFACE_UNKNOWN,
|
||||
SM_FAILOVER_INTERFACE_OK,
|
||||
SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT,
|
||||
SM_FAILOVER_INTERFACE_DOWN
|
||||
SM_FAILOVER_INTERFACE_DOWN,
|
||||
SM_FAILOVER_INTERFACE_STATE_MAX
|
||||
}SmFailoverInterfaceStateT;
|
||||
|
||||
// ****************************************************************************
|
||||
|
@ -960,6 +962,10 @@ extern const char* sm_failover_event_str( SmFailoverEventT event );
|
|||
extern const char* sm_failover_state_str( SmFailoverStateT state );
|
||||
// ****************************************************************************
|
||||
|
||||
const char* sm_failover_interface_state_str( SmFailoverInterfaceStateT state );
|
||||
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Types - Service Domain Neighbor State Value
|
||||
// ===========================================
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2017 Wind River Systems, Inc.
|
||||
// Copyright (c) 2017-2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
|
|
@ -231,6 +231,10 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
|
|||
{
|
||||
blind_guess_scenario_start();
|
||||
}
|
||||
else
|
||||
{
|
||||
this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -3,24 +3,203 @@
|
|||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#include "sm_failover_failed_state.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#include <sys/stat.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#include "sm_failover_failed_state.h"
|
||||
#include "sm_types.h"
|
||||
#include "sm_debug.h"
|
||||
#include "sm_node_utils.h"
|
||||
#include "sm_failover.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_failover_ss.h"
|
||||
#include "sm_failover_utils.h"
|
||||
|
||||
extern bool is_cluster_host_interface_configured( void );
|
||||
|
||||
// Failover Failed Recovery Audit period = 5 seconds
|
||||
static const int FAILED_STATE_AUDIT_PERIOD = 5000;
|
||||
|
||||
// Recovery log throttle threshold - 1 log every minute
|
||||
static const int SM_FAILOVER_FAILED_LOG_THROTTLE_THLD = 12;
|
||||
|
||||
// processes to restart over a failover failed recovery
|
||||
#define MAX_RESTART_PROCESS_NAME_LEN 10
|
||||
#define PROCESS_HBSAGENT ((const char *)("hbsAgent"))
|
||||
#define PROCESS_SM ((const char *)("sm"))
|
||||
|
||||
// Failover Failed state class constructor
|
||||
SmFailoverFailedState::SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm)
|
||||
{
|
||||
this->_failed_state_audit_timer_id = SM_TIMER_ID_INVALID;
|
||||
}
|
||||
|
||||
// The 'Failover Failed' state destructor
|
||||
// - stops the recovery audit if needed
|
||||
SmFailoverFailedState::~SmFailoverFailedState()
|
||||
{
|
||||
this->_deregister_timer();
|
||||
}
|
||||
|
||||
// Failover Failed state entry class member function
|
||||
// - starts the Failover Failed state recovery audit timer
|
||||
SmErrorT SmFailoverFailedState::enter_state()
|
||||
{
|
||||
SmFSMState::enter_state();
|
||||
|
||||
DPRINTFE("********************************************************");
|
||||
DPRINTFE("Entering Failover Failed state ; recovery audit started ");
|
||||
DPRINTFE("********************************************************");
|
||||
|
||||
SmErrorT error = this->_register_timer();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to register failed state timer. Error %s", sm_error_str(error));
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
// Failover Failed state audit timer handler
|
||||
bool SmFailoverFailedState::_failed_state_audit(
|
||||
SmTimerIdT timer_id, int64_t user_data)
|
||||
{
|
||||
SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, NULL);
|
||||
return true ;
|
||||
}
|
||||
|
||||
// Issue a self restart through pmon-restart service
|
||||
static bool sm_failover_failed_process_restart( const char * process )
|
||||
{
|
||||
DPRINTFI( "Issuing controlled process restart ; pmon-restart %s", process);
|
||||
pid_t pid = fork();
|
||||
if( 0 > pid )
|
||||
{
|
||||
DPRINTFE( "Failed to fork 'pmond-restart %s' request, error=%s.",
|
||||
process, strerror( errno ) );
|
||||
return( true );
|
||||
}
|
||||
else if( 0 == pid )
|
||||
{
|
||||
// set the arguement array for execv
|
||||
char pmon_restart_cmd[] = "/usr/local/sbin/pmon-restart";
|
||||
|
||||
char pmon_restart_process[MAX_RESTART_PROCESS_NAME_LEN] ;
|
||||
snprintf(&pmon_restart_process[0], MAX_RESTART_PROCESS_NAME_LEN, "%s", process);
|
||||
|
||||
char* pmon_restart_argv[3] ;
|
||||
pmon_restart_argv[0] = pmon_restart_cmd;
|
||||
pmon_restart_argv[1] = pmon_restart_process;
|
||||
pmon_restart_argv[2] = NULL;
|
||||
|
||||
// Add the path to socat for pmon-restart
|
||||
char path[] = "PATH=/usr/bin:$PATH";
|
||||
char* pmon_restart_env[2] ;
|
||||
pmon_restart_env[0] = path;
|
||||
pmon_restart_env[1] = NULL;
|
||||
|
||||
setpgid( 0, 0 );
|
||||
|
||||
struct rlimit file_limits;
|
||||
if( 0 == getrlimit( RLIMIT_NOFILE, &file_limits ) )
|
||||
{
|
||||
unsigned int fd_i;
|
||||
for( fd_i=0; fd_i < file_limits.rlim_cur; ++fd_i )
|
||||
{
|
||||
close( fd_i );
|
||||
}
|
||||
open( "/dev/null", O_RDONLY ); // stdin
|
||||
open( "/dev/null", O_WRONLY ); // stdout
|
||||
open( "/dev/null", O_WRONLY ); // stderr
|
||||
}
|
||||
|
||||
execve( pmon_restart_argv[0], pmon_restart_argv, pmon_restart_env );
|
||||
|
||||
// Shouldn't get this far, else there was an error.
|
||||
exit(-1);
|
||||
}
|
||||
return( false );
|
||||
}
|
||||
|
||||
// Failover Failed recovery criteria checker
|
||||
static bool sm_failover_failed_recovery_criteria_met( void )
|
||||
{
|
||||
bool criteria_met = false ;
|
||||
|
||||
SmFailoverInterfaceStateT oam_state, mgmt_state, cluster_host_state;
|
||||
oam_state = sm_failover_get_interface_info(SM_INTERFACE_OAM);
|
||||
mgmt_state = sm_failover_get_interface_info(SM_INTERFACE_MGMT);
|
||||
|
||||
if ( is_cluster_host_interface_configured() )
|
||||
{
|
||||
cluster_host_state = sm_failover_get_interface_info(SM_INTERFACE_CLUSTER_HOST);
|
||||
if ((( oam_state == SM_FAILOVER_INTERFACE_OK ) || ( oam_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
|
||||
(( mgmt_state == SM_FAILOVER_INTERFACE_OK ) || ( mgmt_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
|
||||
(( cluster_host_state == SM_FAILOVER_INTERFACE_OK ) || ( cluster_host_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )))
|
||||
{
|
||||
criteria_met = true ;
|
||||
}
|
||||
}
|
||||
else if ((( oam_state == SM_FAILOVER_INTERFACE_OK ) || ( oam_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
|
||||
(( mgmt_state == SM_FAILOVER_INTERFACE_OK ) || ( mgmt_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )))
|
||||
{
|
||||
criteria_met = true ;
|
||||
}
|
||||
|
||||
DPRINTFI("Oam:%s ; Mgmt:%s ; Cluster:%s ; recovery criteria met: %s",
|
||||
sm_failover_interface_state_str(oam_state),
|
||||
sm_failover_interface_state_str(mgmt_state),
|
||||
sm_failover_interface_state_str(cluster_host_state),
|
||||
criteria_met ? "Yes" : "No");
|
||||
|
||||
return (criteria_met);
|
||||
}
|
||||
|
||||
// The 'Failover Failed' state recovery audit handler
|
||||
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||
{
|
||||
// Currently the only supported scenario to recover from failure is
|
||||
// reboot triggered by mtce.
|
||||
// So once entering failed state, wait for reboot to reenter the normal state.
|
||||
event_data=event_data;
|
||||
switch (event)
|
||||
{
|
||||
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
|
||||
// event will be fired, but couldn't bring fsm state back to normal
|
||||
case SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT:
|
||||
{
|
||||
if ( sm_failover_failed_recovery_criteria_met() )
|
||||
{
|
||||
DPRINTFI("************************************");
|
||||
DPRINTFI("** Failover Failed state recovery **");
|
||||
DPRINTFI("************************************");
|
||||
sm_node_utils_reset_unhealthy_flag();
|
||||
sm_failover_failed_process_restart(PROCESS_HBSAGENT);
|
||||
sm_failover_failed_process_restart(PROCESS_SM);
|
||||
for ( int i = 0 ; i < 10 ; i++ )
|
||||
{
|
||||
// waiting for shutdown
|
||||
sleep(1);
|
||||
}
|
||||
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
|
||||
sm_node_utils_set_unhealthy();
|
||||
}
|
||||
else if ( ++_log_throttle > 1 )
|
||||
{
|
||||
if ( _log_throttle > SM_FAILOVER_FAILED_LOG_THROTTLE_THLD )
|
||||
_log_throttle = 0 ;
|
||||
}
|
||||
else
|
||||
{
|
||||
DPRINTFI("Failover Failed state recovery monitor");
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %s, at state %s",
|
||||
sm_failover_event_str(event),
|
||||
|
@ -28,3 +207,58 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
|
|||
}
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
// Start the 'Failover Failed' state recovery audit
|
||||
SmErrorT SmFailoverFailedState::_register_timer()
|
||||
{
|
||||
SmErrorT error;
|
||||
const char* timer_name = "FAILED STATE AUDIT TIMER";
|
||||
if(SM_TIMER_ID_INVALID != this->_failed_state_audit_timer_id)
|
||||
this->_deregister_timer();
|
||||
|
||||
error = sm_timer_register(timer_name, FAILED_STATE_AUDIT_PERIOD,
|
||||
SmFailoverFailedState::_failed_state_audit,
|
||||
0, &this->_failed_state_audit_timer_id);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
// Stop the 'Failover Failed' state recovery audit
|
||||
SmErrorT SmFailoverFailedState::_deregister_timer()
|
||||
{
|
||||
SmErrorT error = SM_OKAY;
|
||||
if(SM_TIMER_ID_INVALID != this->_failed_state_audit_timer_id)
|
||||
{
|
||||
error = sm_timer_deregister(this->_failed_state_audit_timer_id);
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to cancel failed timer, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
}else
|
||||
{
|
||||
this->_failed_state_audit_timer_id = SM_TIMER_ID_INVALID;
|
||||
}
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
// Failover Failed state exit class member function
// - stops the recovery audit timer through _deregister_timer()
// - if the timer id is still valid afterwards (which _deregister_timer()
//   only leaves behind when its deregistration failed), deregisters once
//   more and invalidates the id regardless of the outcome
// - runs the base class exit handling
// Returns the result of the last deregistration attempt made.
SmErrorT SmFailoverFailedState::exit_state()
{
    SmErrorT result = this->_deregister_timer();
    if(SM_OKAY != result)
    {
        DPRINTFE("Failed to deregister fail failed timer. Error %s", sm_error_str(result));
    }

    const bool timer_still_registered =
        (SM_TIMER_ID_INVALID != _failed_state_audit_timer_id);
    if(timer_still_registered)
    {
        result = sm_timer_deregister(_failed_state_audit_timer_id);
        _failed_state_audit_timer_id = SM_TIMER_ID_INVALID;

        if( SM_OKAY != result)
        {
            DPRINTFE("Failed to deregister action timer. Error %s", sm_error_str(result));
        }
    }

    SmFSMState::exit_state();
    return result;
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -12,12 +12,22 @@
|
|||
class SmFailoverFailedState : public SmFSMState
|
||||
{
|
||||
public:
|
||||
SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
|
||||
SmFailoverFailedState(SmFailoverFSM& fsm);
|
||||
virtual ~SmFailoverFailedState();
|
||||
SmErrorT enter_state();
|
||||
SmErrorT exit_state();
|
||||
|
||||
protected:
|
||||
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);
|
||||
|
||||
private:
|
||||
SmTimerIdT _failed_state_audit_timer_id;
|
||||
static bool _failed_state_audit(SmTimerIdT timer_id, int64_t user_data);
|
||||
SmErrorT _register_timer();
|
||||
SmErrorT _deregister_timer();
|
||||
|
||||
int _log_throttle ;
|
||||
};
|
||||
|
||||
|
||||
#endif //__SM_FAILOVER_FAILED_STATE_H__
|
||||
#endif //__SM_FAILOVER_FAILED_STATE_H__
|
||||
|
|
Loading…
Reference in New Issue