From 7be3b9085aa699e11c94e2991a87fb6fae21ce05 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Wed, 2 May 2018 10:31:29 -0500 Subject: [PATCH] Add 90s delay before locking storage node for upgrade Adds support to the mtcAgent for detecting the absence of the 'host services execution enhancement feature' in the mtcClient and implements the pre-upgrade behavior in that case. When mtcAgent tries to lock a storage node running a pre-upgrade version, it will implement a 90s lock wait before proceeding to declare that storage host as locked-disabled. Story: 2002886 Task: 22847 Change-Id: I99fb5576e027621019adb5eff553d52773f608db Signed-off-by: Jack Ding --- .../cgts-mtce-common-1.0/common/nodeCmds.h | 1 + .../cgts-mtce-common-1.0/common/nodeTimers.h | 1 + .../maintenance/mtcCmdHdlr.cpp | 34 +++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeCmds.h b/mtce-common/cgts-mtce-common-1.0/common/nodeCmds.h index 47ff9fdb..7944e61e 100644 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeCmds.h +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeCmds.h @@ -56,6 +56,7 @@ typedef enum MTC_CMD_STAGE__HOST_SERVICES_SEND_CMD, MTC_CMD_STAGE__HOST_SERVICES_RECV_ACK, MTC_CMD_STAGE__HOST_SERVICES_WAIT_FOR_RESULT, + MTC_CMD_STAGE__STORAGE_LOCK_DELAY, /* Common command done stage */ MTC_CMD_STAGE__DONE, diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeTimers.h b/mtce-common/cgts-mtce-common-1.0/common/nodeTimers.h index 154ffc7d..7371c3bd 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeTimers.h +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeTimers.h @@ -88,6 +88,7 @@ #define MTC_MIN_ONLINE_PERIOD_SECS (7) #define MTC_RETRY_WAIT (5) #define MTC_AGENT_TIMEOUT_EXTENSION (5) +#define MTC_LOCK_CEPH_DELAY (90) /** Host must stay enabled for this long for the * failed_recovery_counter to get cleared */ diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCmdHdlr.cpp 
b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCmdHdlr.cpp index 98c9d9de..1277b131 100644 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCmdHdlr.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCmdHdlr.cpp @@ -231,6 +231,40 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) dlog ("%s %s request ack (legacy mode)\n", node_ptr->hostname.c_str(), node_ptr->host_services_req.name.c_str()); + + // Upgrades that lock storage nodes can + // lead to storage corruption if ceph isn't given + // enough time to shut down. + // + // The following special case for storage node + // lock forces a 90 sec holdoff for pre-upgrade storage + // hosts ; i.e. legacy mode. + // + if ( is_storage(node_ptr) ) + { + ilog ("%s waiting for ceph OSD shutdown\n", node_ptr->hostname.c_str()); + mtcTimer_reset ( node_ptr->mtcCmd_timer ); + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_LOCK_CEPH_DELAY ); + node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__STORAGE_LOCK_DELAY ; + } + else + { + node_ptr->mtcCmd_work_fifo_ptr->status = + node_ptr->host_services_req.status = PASS ; + + node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ; + } + } + break ; + } + case MTC_CMD_STAGE__STORAGE_LOCK_DELAY: + { + /* wait for the timer to expire before moving on */ + if ( mtcTimer_expired ( node_ptr->mtcCmd_timer ) ) + { + ilog ("%s ceph OSD shutdown wait complete\n", + node_ptr->hostname.c_str()); + node_ptr->mtcCmd_work_fifo_ptr->status = node_ptr->host_services_req.status = PASS ;