diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/pmon.h b/mtce-common/cgts-mtce-common-1.0/pmon/pmon.h index c7a03a14..ec247532 100755 --- a/mtce-common/cgts-mtce-common-1.0/pmon/pmon.h +++ b/mtce-common/cgts-mtce-common-1.0/pmon/pmon.h @@ -445,6 +445,7 @@ typedef struct /** holds the alarm severity state of CLEAR, MINOR, MAJOR, CRITICAL */ EFmAlarmSeverityT alarm_severity ; bool restart ; + bool registered ; /**< true if pid is registered with kernel */ bool failed ; bool ignore ; /**< ignore this process ; debug purposes */ bool stopped ; /**< process was stopped by command */ diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/pmonFsm.cpp b/mtce-common/cgts-mtce-common-1.0/pmon/pmonFsm.cpp index 2ef7a2ff..69a32839 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/pmonFsm.cpp +++ b/mtce-common/cgts-mtce-common-1.0/pmon/pmonFsm.cpp @@ -194,10 +194,7 @@ int pmon_active_handler ( process_config_type * ptr ) } case ACTIVE_STAGE__GAP_SETUP: { - if ( ptr->pt_ptr->tid ) - { - mtcTimer_stop ( ptr->pt_ptr ); - } + mtcTimer_reset ( ptr->pt_ptr ); mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->period ); activeStageChange ( ptr, ACTIVE_STAGE__GAP_WAIT ); break ; @@ -216,8 +213,7 @@ int pmon_active_handler ( process_config_type * ptr ) ptr->active_failed = true ; ptr->afailed_count++ ; ptr->b2b_miss_count = 0 ; - if ( ptr->pt_ptr->tid ) - mtcTimer_stop ( ptr->pt_ptr ); + mtcTimer_reset ( ptr->pt_ptr ); manage_process_failure ( ptr ); @@ -569,16 +565,17 @@ int pmon_passive_handler ( process_config_type * ptr ) respawn_process ( ptr ) ; /* Start the monitor debounce timer. */ - if ( ptr->pt_ptr->tid ) mtcTimer_stop ( ptr->pt_ptr ); - mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime ); + mtcTimer_reset ( ptr->pt_ptr ); + mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime ); passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ; /* Don't wait for the debounce timer to take this process out of 'commanded restart' mode. * Do it now, otherwise tight patch loop stress testing might fail */ if ( ptr->restart == true ) { - ilog ("%s exit manual restart request mode\n", ptr->process ) + ilog ("%s Restarted\n", ptr->process ) ptr->restart = false ; + ptr->registered = false ; } break ; } @@ -972,12 +969,7 @@ int pmon_status_handler ( process_config_type * ptr ) // a ring when the command execute successfully or returns a failure if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS ) ) { - // Stop timer if we had one - if ( ptr->pt_ptr->tid ) - { - dlog ("%s stop the status command timer %p\n", ptr->process, ptr->pt_ptr->tid ); - mtcTimer_stop( ptr->pt_ptr); - } + mtcTimer_reset( ptr->pt_ptr); ptr->pt_ptr->ring = false; if (( !ptr->sigchld_rxed ) || ( !ptr->child_pid ) || (ptr->status != PASS)) @@ -1056,12 +1048,7 @@ int pmon_status_handler ( process_config_type * ptr ) // a ring when the command execute successfully or returns a failure if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS) ) { - // stop timer if we had one - if ( ptr->pt_ptr->tid ) - { - dlog ("%s stop the start command timer %p\n", ptr->process, ptr->pt_ptr->tid ); - mtcTimer_stop( ptr->pt_ptr); - } + mtcTimer_reset( ptr->pt_ptr); ptr->pt_ptr->ring = false; // If the status had failed then ptr->status_failed will be set to true. Status failure diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/pmonHdlr.cpp b/mtce-common/cgts-mtce-common-1.0/pmon/pmonHdlr.cpp index daf31540..19305c10 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/pmonHdlr.cpp +++ b/mtce-common/cgts-mtce-common-1.0/pmon/pmonHdlr.cpp @@ -379,10 +379,7 @@ void load_processes ( void ) */ for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { - if ( process_config[i].pt_ptr->tid ) - { - mtcTimer_stop ( process_config[i].pt_ptr ); - } + mtcTimer_reset ( process_config[i].pt_ptr ); close_process_socket ( &process_config[i] ); } @@ -925,13 +922,13 @@ bool kill_running_process ( int pid ) if ( result == 0 ) { char * proc_name_ptr = &unknown_process[0] ; - - result = kill ( pid, SIGKILL ); process_config_type * ptr = find_parent_process ( pid ) ; if ( ptr ) { + daemon_remove_file ( ptr->pidfile ); proc_name_ptr = (char*)ptr->process ; } + result = kill ( pid, SIGKILL ); if ( ptr && ( result == 0 ) ) { if ( daemon_is_file_present ( ptr->pidfile ) ) @@ -947,14 +944,10 @@ bool kill_running_process ( int pid ) } else { - wlog ("%s kill failed or process not running (%d)\n", proc_name_ptr, pid ); + ilog ("%s kill failed (%d)\n", proc_name_ptr, pid ); } } } - else - { - wlog ("%s cannot kill pid %d\n", unknown_process, pid); - } return (rc); } @@ -1095,13 +1088,21 @@ int unregister_process ( process_config_type * ptr ) info.events = PMON_EVENT_FLAGS ; if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info )) { - wlog ("%s failed to unregister process %d\n", ptr->process, ptr->pid ); + if ( errno != ESRCH ) + { + wlog ("%s unregister pid:%d (%d:%s)\n", + ptr->process, + ptr->pid, + errno, + strerror(errno) ); + } } else { ilog ("%s unregistered (%d)\n", ptr->process, ptr->pid ); } } + ptr->registered = false ; return (PASS); } @@ -1120,7 +1121,7 @@ int register_process ( process_config_type * ptr ) info.events = PMON_EVENT_FLAGS; if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info ) ) { - elog ("%s failed to register pid:%d (%d) (%s)\n", ptr->process, pid, errno, strerror(errno)); + elog ("%s failed to register pid:%d (%d:%s)\n", ptr->process, pid, errno, strerror(errno)); if ( errno == EINVAL ) { _pmon_ctrl_ptr->event_mode = false ; @@ -1135,6 +1136,7 @@ int register_process ( process_config_type * ptr ) { ilog ("%s Registered (%d)\n", ptr->process , pid ); ptr->failed = false ; + ptr->registered = true ; passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ; if ( ptr->active_monitoring == false ) { @@ -1148,6 +1150,10 @@ int register_process ( process_config_type * ptr ) { wlog ("%s Registered (%d) in polling mode\n", ptr->process , pid); + + /* prevent infinite reg retry in polling mode */ + ptr->registered = true ; + if ( process_running ( ptr ) == false ) { ptr->failed = true ; @@ -1192,25 +1198,14 @@ int respawn_process ( process_config_type * ptr ) int rc = PASS ; bool restart = false ; + unregister_process ( ptr ); if ( process_running ( ptr ) == true ) { ilog ("%s restart of running process\n", ptr->process ); restart = true ; + kill_running_process ( ptr->pid ); } - /* Handle the case where the process is running but the known pid suggests its not. - * Do this by quering by processname and if it returns a valid PID then kill it before - * we start managing its death */ - pid = get_pid_by_name_pipe ( ptr->process ) ; - if ( pid ) - { - /* Note: We could just go with this new PID ; update the struct and such - * but that could be a bit risky ; instead we kill and restart. */ - kill_running_process ( pid ); - } - - unregister_process ( ptr ); - ptr->restarts_cnt++ ; /* default restart result and ponitoring controls */ @@ -1306,7 +1301,7 @@ int respawn_process ( process_config_type * ptr ) gettime ( ptr->time_start ); - ilog ("%s Spawn (%d) fork\n", ptr->process, ptr->child_pid ); + ilog ("%s Spawn (%d)\n", ptr->process, ptr->child_pid ); return (PASS); } @@ -1906,10 +1901,7 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) ilog ("Setting config reload flag\n"); /* Hijack the audit timer for the next period for config reload */ - if ( pmonTimer_degrade.tid ) - { - mtcTimer_stop (pmonTimer_degrade); - } + mtcTimer_reset (pmonTimer_degrade); if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) == true ) { _pmon_ctrl_ptr->patching_in_progress = true ; @@ -2037,6 +2029,25 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) manage_process_failure ( &process_config[i]) ; } } + + /* Audit to ensure that running processes are + * registered with the kernel */ + if (( process_config[i].registered == false ) && + ( _pmon_ctrl_ptr->event_mode ) && + ( process_config[i].restart == false ) && + ( process_config[i].failed == false ) && + ( process_config[i].ignore == false )) + { + int pid = get_process_pid ( &process_config[i] ); + if ( pid ) + { + if ( kill (pid, 0 ) == 0 ) + { + process_config[i].pid = pid ; + register_process ( &process_config[i] ); + } + } + } } /* Debugging */ diff --git a/mtce-common/cgts-mtce-common-1.0/pmon/pmonMsg.cpp b/mtce-common/cgts-mtce-common-1.0/pmon/pmonMsg.cpp index 8e0daf6a..2b231e6f 100644 --- a/mtce-common/cgts-mtce-common-1.0/pmon/pmonMsg.cpp +++ b/mtce-common/cgts-mtce-common-1.0/pmon/pmonMsg.cpp @@ -717,7 +717,7 @@ void pmon_service_inbox ( void ) else { process_config_type * ptr = get_process_config_ptr ( process ); - ilog ("%s process 'restart' request\n", process.c_str()); + dlog ("%s process 'restart' request\n", process.c_str()); if ( ptr != NULL ) { if ( strcmp ( ptr->mode, "status" ) == 0 )