Merge "Reorder process restart operations to prevent pmond futex deadlock"
This commit is contained in:
commit
8290718f81
|
@ -445,6 +445,7 @@ typedef struct
|
||||||
/** holds the alarm severity state of CLEAR, MINOR, MAJOR, CRITICAL */
|
/** holds the alarm severity state of CLEAR, MINOR, MAJOR, CRITICAL */
|
||||||
EFmAlarmSeverityT alarm_severity ;
|
EFmAlarmSeverityT alarm_severity ;
|
||||||
bool restart ;
|
bool restart ;
|
||||||
|
bool registered ; /**< true if pid is registered with kernel */
|
||||||
bool failed ;
|
bool failed ;
|
||||||
bool ignore ; /**< ignore this process ; debug purposes */
|
bool ignore ; /**< ignore this process ; debug purposes */
|
||||||
bool stopped ; /**< process was stopped by command */
|
bool stopped ; /**< process was stopped by command */
|
||||||
|
|
|
@ -194,10 +194,7 @@ int pmon_active_handler ( process_config_type * ptr )
|
||||||
}
|
}
|
||||||
case ACTIVE_STAGE__GAP_SETUP:
|
case ACTIVE_STAGE__GAP_SETUP:
|
||||||
{
|
{
|
||||||
if ( ptr->pt_ptr->tid )
|
mtcTimer_reset ( ptr->pt_ptr );
|
||||||
{
|
|
||||||
mtcTimer_stop ( ptr->pt_ptr );
|
|
||||||
}
|
|
||||||
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->period );
|
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->period );
|
||||||
activeStageChange ( ptr, ACTIVE_STAGE__GAP_WAIT );
|
activeStageChange ( ptr, ACTIVE_STAGE__GAP_WAIT );
|
||||||
break ;
|
break ;
|
||||||
|
@ -216,8 +213,7 @@ int pmon_active_handler ( process_config_type * ptr )
|
||||||
ptr->active_failed = true ;
|
ptr->active_failed = true ;
|
||||||
ptr->afailed_count++ ;
|
ptr->afailed_count++ ;
|
||||||
ptr->b2b_miss_count = 0 ;
|
ptr->b2b_miss_count = 0 ;
|
||||||
if ( ptr->pt_ptr->tid )
|
mtcTimer_reset ( ptr->pt_ptr );
|
||||||
mtcTimer_stop ( ptr->pt_ptr );
|
|
||||||
|
|
||||||
manage_process_failure ( ptr );
|
manage_process_failure ( ptr );
|
||||||
|
|
||||||
|
@ -569,16 +565,17 @@ int pmon_passive_handler ( process_config_type * ptr )
|
||||||
respawn_process ( ptr ) ;
|
respawn_process ( ptr ) ;
|
||||||
|
|
||||||
/* Start the monitor debounce timer. */
|
/* Start the monitor debounce timer. */
|
||||||
if ( ptr->pt_ptr->tid ) mtcTimer_stop ( ptr->pt_ptr );
|
mtcTimer_reset ( ptr->pt_ptr );
|
||||||
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime );
|
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime );
|
||||||
passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ;
|
passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ;
|
||||||
|
|
||||||
/* Don't wait for the debounce timer to take this process out of 'commanded restart' mode.
|
/* Don't wait for the debounce timer to take this process out of 'commanded restart' mode.
|
||||||
* Do it now, otherwise tight patch loop stress testing might fail */
|
* Do it now, otherwise tight patch loop stress testing might fail */
|
||||||
if ( ptr->restart == true )
|
if ( ptr->restart == true )
|
||||||
{
|
{
|
||||||
ilog ("%s exit manual restart request mode\n", ptr->process )
|
ilog ("%s Restarted\n", ptr->process )
|
||||||
ptr->restart = false ;
|
ptr->restart = false ;
|
||||||
|
ptr->registered = false ;
|
||||||
}
|
}
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
|
@ -972,12 +969,7 @@ int pmon_status_handler ( process_config_type * ptr )
|
||||||
// a ring when the command executes successfully or returns a failure
|
// a ring when the command executes successfully or returns a failure
|
||||||
if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS ) )
|
if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS ) )
|
||||||
{
|
{
|
||||||
// Stop timer if we had one
|
mtcTimer_reset( ptr->pt_ptr);
|
||||||
if ( ptr->pt_ptr->tid )
|
|
||||||
{
|
|
||||||
dlog ("%s stop the status command timer %p\n", ptr->process, ptr->pt_ptr->tid );
|
|
||||||
mtcTimer_stop( ptr->pt_ptr);
|
|
||||||
}
|
|
||||||
ptr->pt_ptr->ring = false;
|
ptr->pt_ptr->ring = false;
|
||||||
|
|
||||||
if (( !ptr->sigchld_rxed ) || ( !ptr->child_pid ) || (ptr->status != PASS))
|
if (( !ptr->sigchld_rxed ) || ( !ptr->child_pid ) || (ptr->status != PASS))
|
||||||
|
@ -1056,12 +1048,7 @@ int pmon_status_handler ( process_config_type * ptr )
|
||||||
// a ring when the command executes successfully or returns a failure
|
// a ring when the command executes successfully or returns a failure
|
||||||
if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS) )
|
if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS) )
|
||||||
{
|
{
|
||||||
// stop timer if we had one
|
mtcTimer_reset( ptr->pt_ptr);
|
||||||
if ( ptr->pt_ptr->tid )
|
|
||||||
{
|
|
||||||
dlog ("%s stop the start command timer %p\n", ptr->process, ptr->pt_ptr->tid );
|
|
||||||
mtcTimer_stop( ptr->pt_ptr);
|
|
||||||
}
|
|
||||||
ptr->pt_ptr->ring = false;
|
ptr->pt_ptr->ring = false;
|
||||||
|
|
||||||
// If the status had failed then ptr->status_failed will be set to true. Status failure
|
// If the status had failed then ptr->status_failed will be set to true. Status failure
|
||||||
|
|
|
@ -379,10 +379,7 @@ void load_processes ( void )
|
||||||
*/
|
*/
|
||||||
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
||||||
{
|
{
|
||||||
if ( process_config[i].pt_ptr->tid )
|
mtcTimer_reset ( process_config[i].pt_ptr );
|
||||||
{
|
|
||||||
mtcTimer_stop ( process_config[i].pt_ptr );
|
|
||||||
}
|
|
||||||
close_process_socket ( &process_config[i] );
|
close_process_socket ( &process_config[i] );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -925,13 +922,13 @@ bool kill_running_process ( int pid )
|
||||||
if ( result == 0 )
|
if ( result == 0 )
|
||||||
{
|
{
|
||||||
char * proc_name_ptr = &unknown_process[0] ;
|
char * proc_name_ptr = &unknown_process[0] ;
|
||||||
|
|
||||||
result = kill ( pid, SIGKILL );
|
|
||||||
process_config_type * ptr = find_parent_process ( pid ) ;
|
process_config_type * ptr = find_parent_process ( pid ) ;
|
||||||
if ( ptr )
|
if ( ptr )
|
||||||
{
|
{
|
||||||
|
daemon_remove_file ( ptr->pidfile );
|
||||||
proc_name_ptr = (char*)ptr->process ;
|
proc_name_ptr = (char*)ptr->process ;
|
||||||
}
|
}
|
||||||
|
result = kill ( pid, SIGKILL );
|
||||||
if ( ptr && ( result == 0 ) )
|
if ( ptr && ( result == 0 ) )
|
||||||
{
|
{
|
||||||
if ( daemon_is_file_present ( ptr->pidfile ) )
|
if ( daemon_is_file_present ( ptr->pidfile ) )
|
||||||
|
@ -947,14 +944,10 @@ bool kill_running_process ( int pid )
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
wlog ("%s kill failed or process not running (%d)\n", proc_name_ptr, pid );
|
ilog ("%s kill failed (%d)\n", proc_name_ptr, pid );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
wlog ("%s cannot kill pid %d\n", unknown_process, pid);
|
|
||||||
}
|
|
||||||
return (rc);
|
return (rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1095,13 +1088,21 @@ int unregister_process ( process_config_type * ptr )
|
||||||
info.events = PMON_EVENT_FLAGS ;
|
info.events = PMON_EVENT_FLAGS ;
|
||||||
if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info ))
|
if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info ))
|
||||||
{
|
{
|
||||||
wlog ("%s failed to unregister process %d\n", ptr->process, ptr->pid );
|
if ( errno != ESRCH )
|
||||||
|
{
|
||||||
|
wlog ("%s unregister pid:%d (%d:%s)\n",
|
||||||
|
ptr->process,
|
||||||
|
ptr->pid,
|
||||||
|
errno,
|
||||||
|
strerror(errno) );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ilog ("%s unregistered (%d)\n", ptr->process, ptr->pid );
|
ilog ("%s unregistered (%d)\n", ptr->process, ptr->pid );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ptr->registered = false ;
|
||||||
return (PASS);
|
return (PASS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1120,7 +1121,7 @@ int register_process ( process_config_type * ptr )
|
||||||
info.events = PMON_EVENT_FLAGS;
|
info.events = PMON_EVENT_FLAGS;
|
||||||
if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info ) )
|
if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info ) )
|
||||||
{
|
{
|
||||||
elog ("%s failed to register pid:%d (%d) (%s)\n", ptr->process, pid, errno, strerror(errno));
|
elog ("%s failed to register pid:%d (%d:%s)\n", ptr->process, pid, errno, strerror(errno));
|
||||||
if ( errno == EINVAL )
|
if ( errno == EINVAL )
|
||||||
{
|
{
|
||||||
_pmon_ctrl_ptr->event_mode = false ;
|
_pmon_ctrl_ptr->event_mode = false ;
|
||||||
|
@ -1135,6 +1136,7 @@ int register_process ( process_config_type * ptr )
|
||||||
{
|
{
|
||||||
ilog ("%s Registered (%d)\n", ptr->process , pid );
|
ilog ("%s Registered (%d)\n", ptr->process , pid );
|
||||||
ptr->failed = false ;
|
ptr->failed = false ;
|
||||||
|
ptr->registered = true ;
|
||||||
passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ;
|
passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ;
|
||||||
if ( ptr->active_monitoring == false )
|
if ( ptr->active_monitoring == false )
|
||||||
{
|
{
|
||||||
|
@ -1148,6 +1150,10 @@ int register_process ( process_config_type * ptr )
|
||||||
{
|
{
|
||||||
wlog ("%s Registered (%d) in polling mode\n",
|
wlog ("%s Registered (%d) in polling mode\n",
|
||||||
ptr->process , pid);
|
ptr->process , pid);
|
||||||
|
|
||||||
|
/* prevent infinite reg retry in polling mode */
|
||||||
|
ptr->registered = true ;
|
||||||
|
|
||||||
if ( process_running ( ptr ) == false )
|
if ( process_running ( ptr ) == false )
|
||||||
{
|
{
|
||||||
ptr->failed = true ;
|
ptr->failed = true ;
|
||||||
|
@ -1192,25 +1198,14 @@ int respawn_process ( process_config_type * ptr )
|
||||||
int rc = PASS ;
|
int rc = PASS ;
|
||||||
bool restart = false ;
|
bool restart = false ;
|
||||||
|
|
||||||
|
unregister_process ( ptr );
|
||||||
if ( process_running ( ptr ) == true )
|
if ( process_running ( ptr ) == true )
|
||||||
{
|
{
|
||||||
ilog ("%s restart of running process\n", ptr->process );
|
ilog ("%s restart of running process\n", ptr->process );
|
||||||
restart = true ;
|
restart = true ;
|
||||||
|
kill_running_process ( ptr->pid );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Handle the case where the process is running but the known pid suggests it's not.
|
|
||||||
* Do this by querying by processname and if it returns a valid PID then kill it before
|
|
||||||
* we start managing its death */
|
|
||||||
pid = get_pid_by_name_pipe ( ptr->process ) ;
|
|
||||||
if ( pid )
|
|
||||||
{
|
|
||||||
/* Note: We could just go with this new PID ; update the struct and such
|
|
||||||
* but that could be a bit risky ; instead we kill and restart. */
|
|
||||||
kill_running_process ( pid );
|
|
||||||
}
|
|
||||||
|
|
||||||
unregister_process ( ptr );
|
|
||||||
|
|
||||||
ptr->restarts_cnt++ ;
|
ptr->restarts_cnt++ ;
|
||||||
|
|
||||||
/* default restart result and monitoring controls */
|
/* default restart result and monitoring controls */
|
||||||
|
@ -1306,7 +1301,7 @@ int respawn_process ( process_config_type * ptr )
|
||||||
|
|
||||||
gettime ( ptr->time_start );
|
gettime ( ptr->time_start );
|
||||||
|
|
||||||
ilog ("%s Spawn (%d) fork\n", ptr->process, ptr->child_pid );
|
ilog ("%s Spawn (%d)\n", ptr->process, ptr->child_pid );
|
||||||
|
|
||||||
return (PASS);
|
return (PASS);
|
||||||
}
|
}
|
||||||
|
@ -1906,10 +1901,7 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
|
||||||
ilog ("Setting config reload flag\n");
|
ilog ("Setting config reload flag\n");
|
||||||
|
|
||||||
/* Hijack the audit timer for the next period for config reload */
|
/* Hijack the audit timer for the next period for config reload */
|
||||||
if ( pmonTimer_degrade.tid )
|
mtcTimer_reset (pmonTimer_degrade);
|
||||||
{
|
|
||||||
mtcTimer_stop (pmonTimer_degrade);
|
|
||||||
}
|
|
||||||
if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) == true )
|
if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) == true )
|
||||||
{
|
{
|
||||||
_pmon_ctrl_ptr->patching_in_progress = true ;
|
_pmon_ctrl_ptr->patching_in_progress = true ;
|
||||||
|
@ -2037,6 +2029,25 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
|
||||||
manage_process_failure ( &process_config[i]) ;
|
manage_process_failure ( &process_config[i]) ;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Audit to ensure that running processes are
|
||||||
|
* registered with the kernel */
|
||||||
|
if (( process_config[i].registered == false ) &&
|
||||||
|
( _pmon_ctrl_ptr->event_mode ) &&
|
||||||
|
( process_config[i].restart == false ) &&
|
||||||
|
( process_config[i].failed == false ) &&
|
||||||
|
( process_config[i].ignore == false ))
|
||||||
|
{
|
||||||
|
int pid = get_process_pid ( &process_config[i] );
|
||||||
|
if ( pid )
|
||||||
|
{
|
||||||
|
if ( kill (pid, 0 ) == 0 )
|
||||||
|
{
|
||||||
|
process_config[i].pid = pid ;
|
||||||
|
register_process ( &process_config[i] );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Debugging */
|
/* Debugging */
|
||||||
|
|
|
@ -717,7 +717,7 @@ void pmon_service_inbox ( void )
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
process_config_type * ptr = get_process_config_ptr ( process );
|
process_config_type * ptr = get_process_config_ptr ( process );
|
||||||
ilog ("%s process 'restart' request\n", process.c_str());
|
dlog ("%s process 'restart' request\n", process.c_str());
|
||||||
if ( ptr != NULL )
|
if ( ptr != NULL )
|
||||||
{
|
{
|
||||||
if ( strcmp ( ptr->mode, "status" ) == 0 )
|
if ( strcmp ( ptr->mode, "status" ) == 0 )
|
||||||
|
|
Loading…
Reference in New Issue