Merge "Reorder process restart operations to prevent pmond futex deadlock"

This commit is contained in:
Zuul 2018-08-23 13:24:40 +00:00 committed by Gerrit Code Review
commit 8290718f81
4 changed files with 52 additions and 53 deletions

View File

@ -445,6 +445,7 @@ typedef struct
/** holds the alarm severity state of CLEAR, MINOR, MAJOR, CRITICAL */
EFmAlarmSeverityT alarm_severity ;
bool restart ;
bool registered ; /**< true if pid is registered with kernel */
bool failed ;
bool ignore ; /**< ignore this process ; debug purposes */
bool stopped ; /**< process was stopped by command */

View File

@ -194,10 +194,7 @@ int pmon_active_handler ( process_config_type * ptr )
}
case ACTIVE_STAGE__GAP_SETUP:
{
if ( ptr->pt_ptr->tid )
{
mtcTimer_stop ( ptr->pt_ptr );
}
mtcTimer_reset ( ptr->pt_ptr );
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->period );
activeStageChange ( ptr, ACTIVE_STAGE__GAP_WAIT );
break ;
@ -216,8 +213,7 @@ int pmon_active_handler ( process_config_type * ptr )
ptr->active_failed = true ;
ptr->afailed_count++ ;
ptr->b2b_miss_count = 0 ;
if ( ptr->pt_ptr->tid )
mtcTimer_stop ( ptr->pt_ptr );
mtcTimer_reset ( ptr->pt_ptr );
manage_process_failure ( ptr );
@ -569,16 +565,17 @@ int pmon_passive_handler ( process_config_type * ptr )
respawn_process ( ptr ) ;
/* Start the monitor debounce timer. */
if ( ptr->pt_ptr->tid ) mtcTimer_stop ( ptr->pt_ptr );
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime );
mtcTimer_reset ( ptr->pt_ptr );
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime );
passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ;
/* Don't wait for the debounce timer to take this process out of 'commanded restart' mode.
* Do it now, otherwise tight patch loop stress testing might fail */
if ( ptr->restart == true )
{
ilog ("%s exit manual restart request mode\n", ptr->process )
ilog ("%s Restarted\n", ptr->process )
ptr->restart = false ;
ptr->registered = false ;
}
break ;
}
@ -972,12 +969,7 @@ int pmon_status_handler ( process_config_type * ptr )
// a ring when the command execute successfully or returns a failure
if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS ) )
{
// Stop timer if we had one
if ( ptr->pt_ptr->tid )
{
dlog ("%s stop the status command timer %p\n", ptr->process, ptr->pt_ptr->tid );
mtcTimer_stop( ptr->pt_ptr);
}
mtcTimer_reset( ptr->pt_ptr);
ptr->pt_ptr->ring = false;
if (( !ptr->sigchld_rxed ) || ( !ptr->child_pid ) || (ptr->status != PASS))
@ -1056,12 +1048,7 @@ int pmon_status_handler ( process_config_type * ptr )
// a ring when the command execute successfully or returns a failure
if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS) )
{
// stop timer if we had one
if ( ptr->pt_ptr->tid )
{
dlog ("%s stop the start command timer %p\n", ptr->process, ptr->pt_ptr->tid );
mtcTimer_stop( ptr->pt_ptr);
}
mtcTimer_reset( ptr->pt_ptr);
ptr->pt_ptr->ring = false;
// If the status had failed then ptr->status_failed will be set to true. Status failure

View File

@ -379,10 +379,7 @@ void load_processes ( void )
*/
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
{
if ( process_config[i].pt_ptr->tid )
{
mtcTimer_stop ( process_config[i].pt_ptr );
}
mtcTimer_reset ( process_config[i].pt_ptr );
close_process_socket ( &process_config[i] );
}
@ -925,13 +922,13 @@ bool kill_running_process ( int pid )
if ( result == 0 )
{
char * proc_name_ptr = &unknown_process[0] ;
result = kill ( pid, SIGKILL );
process_config_type * ptr = find_parent_process ( pid ) ;
if ( ptr )
{
daemon_remove_file ( ptr->pidfile );
proc_name_ptr = (char*)ptr->process ;
}
result = kill ( pid, SIGKILL );
if ( ptr && ( result == 0 ) )
{
if ( daemon_is_file_present ( ptr->pidfile ) )
@ -947,14 +944,10 @@ bool kill_running_process ( int pid )
}
else
{
wlog ("%s kill failed or process not running (%d)\n", proc_name_ptr, pid );
ilog ("%s kill failed (%d)\n", proc_name_ptr, pid );
}
}
}
else
{
wlog ("%s cannot kill pid %d\n", unknown_process, pid);
}
return (rc);
}
@ -1095,13 +1088,21 @@ int unregister_process ( process_config_type * ptr )
info.events = PMON_EVENT_FLAGS ;
if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info ))
{
wlog ("%s failed to unregister process %d\n", ptr->process, ptr->pid );
if ( errno != ESRCH )
{
wlog ("%s unregister pid:%d (%d:%s)\n",
ptr->process,
ptr->pid,
errno,
strerror(errno) );
}
}
else
{
ilog ("%s unregistered (%d)\n", ptr->process, ptr->pid );
}
}
ptr->registered = false ;
return (PASS);
}
@ -1120,7 +1121,7 @@ int register_process ( process_config_type * ptr )
info.events = PMON_EVENT_FLAGS;
if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info ) )
{
elog ("%s failed to register pid:%d (%d) (%s)\n", ptr->process, pid, errno, strerror(errno));
elog ("%s failed to register pid:%d (%d:%s)\n", ptr->process, pid, errno, strerror(errno));
if ( errno == EINVAL )
{
_pmon_ctrl_ptr->event_mode = false ;
@ -1135,6 +1136,7 @@ int register_process ( process_config_type * ptr )
{
ilog ("%s Registered (%d)\n", ptr->process , pid );
ptr->failed = false ;
ptr->registered = true ;
passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ;
if ( ptr->active_monitoring == false )
{
@ -1148,6 +1150,10 @@ int register_process ( process_config_type * ptr )
{
wlog ("%s Registered (%d) in polling mode\n",
ptr->process , pid);
/* prevent infinite reg retry in polling mode */
ptr->registered = true ;
if ( process_running ( ptr ) == false )
{
ptr->failed = true ;
@ -1192,25 +1198,14 @@ int respawn_process ( process_config_type * ptr )
int rc = PASS ;
bool restart = false ;
unregister_process ( ptr );
if ( process_running ( ptr ) == true )
{
ilog ("%s restart of running process\n", ptr->process );
restart = true ;
kill_running_process ( ptr->pid );
}
/* Handle the case where the process is running but the known pid suggests it's not.
* Do this by querying by process name and if it returns a valid PID then kill it before
* we start managing its death */
pid = get_pid_by_name_pipe ( ptr->process ) ;
if ( pid )
{
/* Note: We could just go with this new PID ; update the struct and such
* but that could be a bit risky ; instead we kill and restart. */
kill_running_process ( pid );
}
unregister_process ( ptr );
ptr->restarts_cnt++ ;
/* default restart result and monitoring controls */
@ -1306,7 +1301,7 @@ int respawn_process ( process_config_type * ptr )
gettime ( ptr->time_start );
ilog ("%s Spawn (%d) fork\n", ptr->process, ptr->child_pid );
ilog ("%s Spawn (%d)\n", ptr->process, ptr->child_pid );
return (PASS);
}
@ -1906,10 +1901,7 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
ilog ("Setting config reload flag\n");
/* Hijack the audit timer for the next period for config reload */
if ( pmonTimer_degrade.tid )
{
mtcTimer_stop (pmonTimer_degrade);
}
mtcTimer_reset (pmonTimer_degrade);
if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) == true )
{
_pmon_ctrl_ptr->patching_in_progress = true ;
@ -2037,6 +2029,25 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
manage_process_failure ( &process_config[i]) ;
}
}
/* Audit to ensure that running processes are
* registered with the kernel */
if (( process_config[i].registered == false ) &&
( _pmon_ctrl_ptr->event_mode ) &&
( process_config[i].restart == false ) &&
( process_config[i].failed == false ) &&
( process_config[i].ignore == false ))
{
int pid = get_process_pid ( &process_config[i] );
if ( pid )
{
if ( kill (pid, 0 ) == 0 )
{
process_config[i].pid = pid ;
register_process ( &process_config[i] );
}
}
}
}
/* Debugging */

View File

@ -717,7 +717,7 @@ void pmon_service_inbox ( void )
else
{
process_config_type * ptr = get_process_config_ptr ( process );
ilog ("%s process 'restart' request\n", process.c_str());
dlog ("%s process 'restart' request\n", process.c_str());
if ( ptr != NULL )
{
if ( strcmp ( ptr->mode, "status" ) == 0 )