From 6d0cc6a2a83513c2ed571973c01d207dedbe2d15 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Wed, 21 Nov 2018 09:59:00 -0500 Subject: [PATCH] Prevent early active monitoring of compute processes in AIO The commit shown below introduced a main loop audit that mistakenly registers subfunction processes that are still in the 'polling' state, waiting for /var/run/.compute_config_complete, during unlock enable. Doing so inadvertently changes the process's monitor FSM stage from 'Poll' to 'Manage' before configuration is complete. Since config is not complete, the hbsClient has not initialized its socket interface and is unable to service active monitoring requests. This leads to quorum failure and a watchdog reboot. commit 537935bb0caa257df624a0b470a971c82d215152 Author: Eric MacDonald Date: Mon Jul 9 08:36:22 2018 -0400 Reorder process restart operations to prevent pmond futex deadlock The Fix: Don't run the audit for processes that are still in the 'polling' state, waiting for the config complete file. Test Plan: Provision AIO, verify there is no quorum failure, and inspect logs for correct behavior. 
Change-Id: I179c78309517a34285783ee99bbb3d699915cb83 Closes-Bug: 1804318 Signed-off-by: Eric MacDonald --- mtce/centos/build_srpm.data | 2 +- mtce/src/pmon/pmonHdlr.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data index a6783bd5..01e786b1 100644 --- a/mtce/centos/build_srpm.data +++ b/mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=140 +TIS_PATCH_VER=142 BUILD_IS_SLOW=5 diff --git a/mtce/src/pmon/pmonHdlr.cpp b/mtce/src/pmon/pmonHdlr.cpp index 19305c10..996dbccb 100644 --- a/mtce/src/pmon/pmonHdlr.cpp +++ b/mtce/src/pmon/pmonHdlr.cpp @@ -2032,7 +2032,9 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) /* Audit to ensure that running processes are * registered with the kernel */ - if (( process_config[i].registered == false ) && + if (( process_config[i].stage != PMON_STAGE__POLLING ) && + ( process_config[i].stage != PMON_STAGE__START_WAIT ) && + ( process_config[i].registered == false ) && ( _pmon_ctrl_ptr->event_mode ) && ( process_config[i].restart == false ) && ( process_config[i].failed == false ) &&