Refactor upgrade retries on failure path
If image upgrade or downgrade fails for tiller or kubernetes networking, the sysinv conductor will retry every hour until success. Tested a full install in an AIO-SX lab. Tested an upgrade of the calico (kube networking) and tiller image. Tested an upgrade failure of the calico image (by using an invalid version number), verified there are retries every hour until success. Tested an upgrade failure of the tiller image (by using an invalid version number), verified there are retries. Ensure that after the conductor is restarted or is in an upgrade kube components failure path, it is not locked by the greenthread. Tested this by adding/removing a label to controller-0 (system host-label-assign, system host-label-remove). Tested upgrade after AIO-SX lab controller lock/unlock. Tested upgrade after Standard lab controller lock/unlock. Tested controller swact while in upgrade failure: verified that sysinv-conductor properly takes activity on new controller, upgrade continued to fail as expected. Tested a successful upgrade. Story: 2006590 Task: 36942 Change-Id: I9212f167ff4f8975c4f2df504d6850e03d4e9a6b Signed-off-by: Kristine Bujold <kristine.bujold@windriver.com>
This commit is contained in:
parent
76200b79b2
commit
3194f2b559
|
@ -1,2 +1,2 @@
|
|||
SRC_DIR="sysinv"
|
||||
TIS_PATCH_VER=336
|
||||
TIS_PATCH_VER=337
|
||||
|
|
|
@ -129,6 +129,9 @@ conductor_opts = [
|
|||
cfg.IntOpt('managed_app_auto_recovery_interval',
|
||||
default=300,
|
||||
help='Interval to run managed app auto recovery'),
|
||||
cfg.IntOpt('kube_upgrade_downgrade_retry_interval',
|
||||
default=3600,
|
||||
help='Interval in seconds between retries to upgrade/downgrade kubernetes components'),
|
||||
]
|
||||
|
||||
CONF = cfg.CONF
|
||||
|
@ -195,6 +198,10 @@ class ConductorManager(service.PeriodicService):
|
|||
# initializing conductor manager service
|
||||
super(ConductorManager, self).start()
|
||||
|
||||
# Upgrade/Downgrade kubernetes components.
|
||||
# greenthread must be called after super.start for it to work properly.
|
||||
greenthread.spawn(self._upgrade_downgrade_kube_components())
|
||||
|
||||
def _start(self):
|
||||
self.dbapi = dbapi.get_instance()
|
||||
self.fm_api = fm_api.FaultAPIs()
|
||||
|
@ -229,9 +236,6 @@ class ConductorManager(service.PeriodicService):
|
|||
|
||||
self._handle_restore_in_progress()
|
||||
|
||||
# Upgrade/Downgrade kubernetes components
|
||||
greenthread.spawn(self._upgrade_downgrade_kube_components())
|
||||
|
||||
LOG.info("sysinv-conductor start committed system=%s" %
|
||||
system.as_dict())
|
||||
|
||||
|
@ -5219,6 +5223,8 @@ class ConductorManager(service.PeriodicService):
|
|||
self._upgrade_downgrade_tiller()
|
||||
self._upgrade_downgrade_kube_networking()
|
||||
|
||||
@retry(retry_on_result=lambda x: x is False,
|
||||
wait_fixed=(CONF.conductor.kube_upgrade_downgrade_retry_interval * 1000))
|
||||
def _upgrade_downgrade_tiller(self):
|
||||
"""Check if tiller needs to be upgraded or downgraded"""
|
||||
LOG.info("_upgrade_downgrade_tiller")
|
||||
|
@ -5259,9 +5265,10 @@ class ConductorManager(service.PeriodicService):
|
|||
|
||||
if running_image is None:
|
||||
LOG.warning("Failed to get tiller image")
|
||||
return
|
||||
return False
|
||||
|
||||
LOG.info("Running tiller image: %s" % running_image)
|
||||
LOG.info("Requested tiller version: %s" % image_versions.TILLER_IMAGE_VERSION)
|
||||
|
||||
# Grab the version from the image name. Version is preceded
|
||||
# by a ":" e.g.
|
||||
|
@ -5269,7 +5276,7 @@ class ConductorManager(service.PeriodicService):
|
|||
running_image_name, running_version = running_image.rsplit(":", 1)
|
||||
if not running_version:
|
||||
LOG.warning("Failed to get version from tiller image")
|
||||
return
|
||||
return False
|
||||
|
||||
# Verify the tiller version running
|
||||
if running_version != image_versions.TILLER_IMAGE_VERSION:
|
||||
|
@ -5281,39 +5288,36 @@ class ConductorManager(service.PeriodicService):
|
|||
local_registry_auth = cutils.get_local_docker_registry_auth()
|
||||
self._docker._retrieve_specified_registries()
|
||||
|
||||
# download the image, retry if it fails
|
||||
while True:
|
||||
try:
|
||||
ret = self._docker.download_an_image("helm",
|
||||
local_registry_auth,
|
||||
download_image)
|
||||
if not ret:
|
||||
raise Exception
|
||||
except Exception as e:
|
||||
LOG.warning(
|
||||
"Failed to download image '%s'. %s" %
|
||||
(download_image, e))
|
||||
greenthread.sleep(FIVE_MIN_IN_SECS)
|
||||
continue
|
||||
break
|
||||
# download the image
|
||||
try:
|
||||
img_tag, ret = self._docker.download_an_image("helm",
|
||||
local_registry_auth,
|
||||
download_image)
|
||||
if not ret:
|
||||
raise Exception
|
||||
except Exception as e:
|
||||
LOG.warning("Failed to download image '%s'. %s" % (download_image, e))
|
||||
return False
|
||||
|
||||
# reset the cached registries
|
||||
self._docker._reset_registries_info()
|
||||
|
||||
# Update the new image, retry if it fails
|
||||
while True:
|
||||
try:
|
||||
helm_utils.helm_upgrade_tiller(download_image)
|
||||
# Update the new image
|
||||
try:
|
||||
helm_utils.helm_upgrade_tiller(download_image)
|
||||
|
||||
except Exception as e:
|
||||
LOG.warning("Failed to update the new image: %s" % e)
|
||||
greenthread.sleep(FIVE_MIN_IN_SECS)
|
||||
continue
|
||||
break
|
||||
except Exception as e:
|
||||
LOG.warning("Failed to update the new image: %s" % e)
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
LOG.error("{}. Failed to upgrade/downgrade tiller.".format(e))
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
@retry(retry_on_result=lambda x: x is False,
|
||||
wait_fixed=(CONF.conductor.kube_upgrade_downgrade_retry_interval * 1000))
|
||||
def _upgrade_downgrade_kube_networking(self):
|
||||
try:
|
||||
LOG.info(
|
||||
|
@ -5333,6 +5337,9 @@ class ConductorManager(service.PeriodicService):
|
|||
except Exception as e:
|
||||
LOG.error("Failed to upgrade/downgrade kubernetes "
|
||||
"networking images: {}".format(e))
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def check_nodes_stable(self):
|
||||
hosts = self.dbapi.ihost_get_list()
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# -*- encoding: utf-8 -*-
|
||||
#
|
||||
#
|
||||
# Copyright (c) 2017-2018 Wind River Systems, Inc.
|
||||
# Copyright (c) 2017-2019 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
@ -546,6 +546,9 @@ class StorageTierDependentTCs(base.FunctionalTest):
|
|||
set_monitors_status_patcher = mock.patch.object(ceph_utils.CephApiOperator, 'get_monitors_status')
|
||||
set_is_initial_config_patcher = mock.patch.object(cutils, 'is_initial_config_complete')
|
||||
|
||||
upgrade_downgrade_kube_components_patcher = mock.patch.object(
|
||||
manager.ConductorManager, '_upgrade_downgrade_kube_components')
|
||||
|
||||
def setUp(self):
|
||||
super(StorageTierDependentTCs, self).setUp()
|
||||
self.mock_set_crushmap = self.set_crushmap_patcher.start()
|
||||
|
@ -563,11 +566,14 @@ class StorageTierDependentTCs(base.FunctionalTest):
|
|||
self.host_index = -1
|
||||
self.mon_index = -1
|
||||
|
||||
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
|
||||
|
||||
def tearDown(self):
|
||||
super(StorageTierDependentTCs, self).tearDown()
|
||||
self.set_crushmap_patcher.stop()
|
||||
self.set_monitors_status_patcher = self.set_monitors_status_patcher.stop()
|
||||
self.set_is_initial_config_patcher.stop()
|
||||
self.upgrade_downgrade_kube_components_patcher.stop()
|
||||
|
||||
def assertDeleted(self, fullPath):
|
||||
self.get_json(fullPath, expect_errors=True) # Make sure this line raises an error
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# vim: tabstop=4 shiftwidth=4 softtabstop=4
|
||||
# coding=utf-8
|
||||
|
||||
# Copyright (c) 2017-2018 Wind River Systems, Inc.
|
||||
# Copyright (c) 2017-2019 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
@ -40,6 +40,9 @@ class UpdateCephCluster(base.DbTestCase):
|
|||
# - test_add_valid_mix_tiers
|
||||
# - test_add_4_mix_bbbb
|
||||
|
||||
upgrade_downgrade_kube_components_patcher = mock.patch.object(
|
||||
manager.ConductorManager, '_upgrade_downgrade_kube_components')
|
||||
|
||||
def setUp(self):
|
||||
super(UpdateCephCluster, self).setUp()
|
||||
self.service = manager.ConductorManager('test-host', 'test-topic')
|
||||
|
@ -50,6 +53,12 @@ class UpdateCephCluster(base.DbTestCase):
|
|||
self.load = utils.create_test_load()
|
||||
self.host_index = -1
|
||||
|
||||
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
|
||||
|
||||
def tearDown(self):
|
||||
super(UpdateCephCluster, self).tearDown()
|
||||
self.upgrade_downgrade_kube_components_patcher.stop()
|
||||
|
||||
def _create_storage_ihost(self, hostname):
|
||||
self.host_index += 1
|
||||
ihost_dict = utils.get_test_ihost(
|
||||
|
|
|
@ -17,11 +17,13 @@
|
|||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
# Copyright (c) 2013-2016 Wind River Systems, Inc.
|
||||
# Copyright (c) 2013-2019 Wind River Systems, Inc.
|
||||
#
|
||||
|
||||
"""Test class for Sysinv ManagerService."""
|
||||
|
||||
import mock
|
||||
|
||||
from sysinv.common import exception
|
||||
from sysinv.conductor import manager
|
||||
from sysinv.db import api as dbapi
|
||||
|
@ -32,6 +34,9 @@ from sysinv.tests.db import utils
|
|||
|
||||
class ManagerTestCase(base.DbTestCase):
|
||||
|
||||
upgrade_downgrade_kube_components_patcher = mock.patch.object(
|
||||
manager.ConductorManager, '_upgrade_downgrade_kube_components')
|
||||
|
||||
def setUp(self):
|
||||
super(ManagerTestCase, self).setUp()
|
||||
self.service = manager.ConductorManager('test-host', 'test-topic')
|
||||
|
@ -41,6 +46,12 @@ class ManagerTestCase(base.DbTestCase):
|
|||
self.system = utils.create_test_isystem()
|
||||
self.load = utils.create_test_load()
|
||||
|
||||
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
|
||||
|
||||
def tearDown(self):
|
||||
super(ManagerTestCase, self).tearDown()
|
||||
self.upgrade_downgrade_kube_components_patcher.stop()
|
||||
|
||||
def _create_test_ihost(self, **kwargs):
|
||||
# ensure the system ID for proper association
|
||||
kwargs['forisystemid'] = self.system['id']
|
||||
|
|
Loading…
Reference in New Issue