From 8f6d2eb85a0580182b4fa3b2c110d5e3a32467e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gabriel=20de=20Ara=C3=BAjo=20Cabral?=
Date: Mon, 12 Jun 2023 10:49:53 -0300
Subject: [PATCH] Restart the ceph-mgr daemon every 7 days to control RSS memory growth
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ceph-mgr has a behavior where its RSS memory grows continuously.
Over a few months, depending on the system, this may add up to more
than 1 GB of growth. In tests performed on storage and duplex systems,
the average growth is around 10 MiB per day on the active controller.

Since Ceph is open source, a thorough search was performed on the
Internet and in the Ceph repo for information about this growth in
memory consumption of ceph-mgr, both in Ceph 14.2.22 (present on the
system) and in later versions. However, nothing that could help to fix
the problem was found.

As there were no reports about this bug, I reported it on the Ceph
tracker: https://tracker.ceph.com/issues/61702

The approach taken here to work around the problem is to automatically
restart ceph-mgr every 7 days, so that memory use goes back to its
initial state when the daemon is restarted, avoiding the possibility of
memory exhaustion. Also, it was verified that there weren't any impacts
on the running processes after the restart.

Test-Plan:
PASS: Changed the fix in an AIO-DX to restart ceph-mgr every day.
PASS: After one day, ceph-mgr restarted and its RSS memory use went
      back to the initial state.

Closes-Bug: 2023553
Change-Id: I1c62efaf0ca1d37ba93a24fc99b8db7156973102
Signed-off-by: Gabriel de Araújo Cabral
---
 ceph/ceph/files/mgr-restful-plugin.py | 28 ++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/ceph/ceph/files/mgr-restful-plugin.py b/ceph/ceph/files/mgr-restful-plugin.py
index 714e16aa6..eff0970ee 100644
--- a/ceph/ceph/files/mgr-restful-plugin.py
+++ b/ceph/ceph/files/mgr-restful-plugin.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 #
-# Copyright (c) 2019 Wind River Systems, Inc.
+# Copyright (c) 2019-2023 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -32,6 +32,7 @@ import subprocess
 import sys
 import tempfile
 import time
+from datetime import datetime
 
 import daemon
 import psutil
@@ -141,6 +142,10 @@ class Config(object):
         # restful plugin recovers
         self.ping_fail_count_report_error = 5
 
+        # Number of days for ceph-mgr to be restarted to avoid possible
+        # memory overflow due to memory growth (-1 to disable)
+        self.ceph_mgr_lifecycle_days = 7
+
     @staticmethod
     def load():
         return Config()
@@ -277,6 +282,9 @@ class ServiceMonitor(object):
         # ceph-mgr process
         self.ceph_mgr = None
 
+        # date the ceph-mgr process was started
+        self.ceph_mgr_start_date = None
+
         # consecutive ceph-mgr/restful-plugin start failures. Service monitor
         # reports failure after CONFIG.ceph_mgr_max_failure_count
         self.ceph_mgr_failure_count = 0
@@ -570,6 +578,12 @@ class ServiceMonitor(object):
         # REST API should be available now
         # start making periodic requests (ping)
         while True:
+            if CONFIG.ceph_mgr_lifecycle_days != -1 \
+                    and self.ceph_mgr_uptime() >= CONFIG.ceph_mgr_lifecycle_days:
+                self.ceph_mgr_start_date = None
+                LOG.info("Restarting ceph-mgr to control RSS memory growth")
+                self.ceph_mgr_restart()
+
             try:
                 self.restful_plugin_ping()
                 self.ping_failure_count = 0
@@ -710,6 +724,7 @@ class ServiceMonitor(object):
                     stdout=null,
                     stderr=null,
                     shell=False)
+                self.ceph_mgr_start_date = datetime.now()
         except (OSError, ValueError) as err:
             raise CephMgrStartFailed(reason=str(err))
         time.sleep(CONFIG.ceph_mgr_grace_period_sec)
@@ -720,6 +735,17 @@ class ServiceMonitor(object):
         LOG.info('Stop ceph-mgr')
         psutil_terminate_kill(self.ceph_mgr, CONFIG.ceph_mgr_kill_delay_sec)
 
+    def ceph_mgr_restart(self):
+        """Stop the ceph-mgr process, then start it again."""
+        self.ceph_mgr_stop()
+        self.ceph_mgr_start()
+
+    def ceph_mgr_uptime(self):
+        """Return whole days since ceph-mgr was started (0 if no date)."""
+        if not self.ceph_mgr_start_date:
+            return 0
+        return (datetime.now() - self.ceph_mgr_start_date).days
+
     def restful_plugin_has_server_port(self):
         try:
             with open(os.devnull, 'wb') as null: