Add reattempt and collect more data for SM init failure
Multiple reports from AIO-SX systems show that SM failed its initialization due to a SQL failure. The issue has not been reproduced in a DEV environment. This change adds logging, a reattempt, and collection of SM troubleshooting data when SM fails in such a situation. For potential recovery before pmon starts actively monitoring SM, set systemd Restart=on-failure. Also set RestartSec=10 seconds in order to give pmon enough time to catch the failure and restart SM. Partial-bug: 1915894 Change-Id: I5899e401742510158cd9c59a664b1dc329bb1075 Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
parent
c5f753c3bb
commit
f39ca95924
|
@ -9,6 +9,8 @@
|
|||
#include "sm_debug.h"
|
||||
#include "sm_db.h"
|
||||
#include "sm_db_iterator.h"
|
||||
#include "sm_failover_utils.h"
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Database For-Each
|
||||
|
@ -21,6 +23,8 @@ SmErrorT sm_db_foreach( const char* db_name, const char* db_table,
|
|||
SmDbIteratorT it;
|
||||
SmErrorT error, error2;
|
||||
|
||||
DPRINTFI("Entering db foreach");
|
||||
|
||||
error = sm_db_iterator_initialize( db_name, db_table, db_query, &it );
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
|
@ -74,6 +78,7 @@ ERROR:
|
|||
return( error );
|
||||
}
|
||||
|
||||
DPRINTFI("Exiting db foreach");
|
||||
return( error );
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
|
|
@ -11,6 +11,8 @@ ExecStart=/etc/init.d/sm start
|
|||
ExecStop=/etc/init.d/sm stop
|
||||
PIDFile=/var/run/sm.pid
|
||||
KillMode=process
|
||||
RestartSec=10
|
||||
Restart=on-failure
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
|
@ -9,6 +9,8 @@
|
|||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
|
||||
#include "sm_types.h"
|
||||
#include "sm_debug.h"
|
||||
|
@ -28,6 +30,7 @@
|
|||
#include "sm_node_swact_monitor.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_configure.h"
|
||||
#include "sm_troubleshoot.h"
|
||||
|
||||
#define SM_NODE_AUDIT_TIMER_IN_MS 1000
|
||||
#define SM_INTERFACE_AUDIT_TIMER_IN_MS 1000
|
||||
|
@ -318,6 +321,7 @@ static void sm_main_event_handler_service_group_state_callback(
|
|||
SmErrorT sm_main_event_handler_initialize( void )
|
||||
{
|
||||
SmErrorT error;
|
||||
int i;
|
||||
|
||||
memset( &_api_callbacks, 0, sizeof(_api_callbacks) );
|
||||
memset( &_notify_api_callbacks, 0, sizeof(_notify_api_callbacks) );
|
||||
|
@ -367,12 +371,32 @@ SmErrorT sm_main_event_handler_initialize( void )
|
|||
return( error );
|
||||
}
|
||||
|
||||
error = sm_main_event_handler_release_service_groups();
|
||||
if( SM_OKAY != error )
|
||||
#define MAX_REATTEMPT 20
|
||||
for(i = 0; i < MAX_REATTEMPT; i ++)
|
||||
{
|
||||
DPRINTFE( "Failed to release service groups, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
return( error );
|
||||
error = sm_main_event_handler_release_service_groups();
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to release service groups, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
if( i == 0)
|
||||
{
|
||||
// collect SM troubleshooting data when it fails
|
||||
DPRINTFE("Initialization failed, dumping troubleshooting data");
|
||||
sm_troubleshoot_dump_data("Release service groups failed");
|
||||
}
|
||||
usleep(1000000);
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (error != SM_OKAY)
|
||||
{
|
||||
DPRINTFE("Failed to release service groups, after %d attempts", i);
|
||||
return error;
|
||||
}
|
||||
|
||||
error = sm_api_initialize();
|
||||
|
|
Loading…
Reference in New Issue