Refactor upgrade re-tries on failure path

If an image upgrade or downgrade fails for the tiller or kubernetes
networking components, the sysinv conductor will retry every hour
until it succeeds.
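
The retry is driven by the retrying library's @retry decorator on the
upgrade/downgrade helpers: whenever a helper returns False it is
invoked again after kube_upgrade_downgrade_retry_interval seconds
(3600 by default), instead of the helper looping and sleeping itself.
A minimal sketch of the pattern, with a hypothetical
attempt_image_upgrade() standing in for the real download and apply
steps:

    from retrying import retry

    RETRY_INTERVAL_SECS = 3600  # kube_upgrade_downgrade_retry_interval default

    def attempt_image_upgrade():
        # Hypothetical stand-in for downloading and applying the new image.
        print("attempting image upgrade")

    @retry(retry_on_result=lambda result: result is False,
           wait_fixed=RETRY_INTERVAL_SECS * 1000)  # wait_fixed is in milliseconds
    def upgrade_component():
        """Return True on success; returning False schedules another attempt."""
        try:
            attempt_image_upgrade()
        except Exception as e:
            print("upgrade failed, will retry: %s" % e)
            return False
        return True

This is why the manager.py hunks below replace the internal
while/sleep loops with early "return False" statements on the failure
paths.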

Tested a full install in an AIO-SX lab.

Tested an upgrade of the calico (kube networking) and tiller images.

Tested an upgrade failure of the calico image (by using an invalid
version number); verified that retries occur every hour until success.

Tested an upgrade failure of the tiller image (by using an invalid
version number); verified that retries occur.

Ensured that the conductor is not blocked by the greenthread after a
restart or while it is in the kube components upgrade failure path.
Tested this by adding and removing a label on controller-0 (system
host-label-assign, system host-label-remove).
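
That behaviour comes from spawning the kube upgrade/downgrade check in
an eventlet greenthread after the periodic service has started, so the
conductor keeps servicing RPC requests while the retry loop is
waiting. A rough, self-contained sketch of that shape (not the exact
conductor code; start_service() is a hypothetical stand-in for the
service start-up):

    from eventlet import greenthread

    def upgrade_downgrade_kube_components():
        # Placeholder for the retrying upgrade/downgrade logic, which may
        # block for long stretches between attempts.
        greenthread.sleep(1)

    def start_service():
        # ... normal service start-up happens first ...
        # Spawning afterwards keeps the main flow free to handle requests
        # (e.g. system host-label-assign) while retries are pending.
        greenthread.spawn(upgrade_downgrade_kube_components)

    start_service()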

Tested upgrade after AIO-SX lab controller lock/unlock.

Tested upgrade after Standard lab controller lock/unlock.

Tested a controller swact while in the upgrade failure state: verified
that sysinv-conductor properly takes over activity on the new
controller and that the upgrade continued to fail as expected. Then
tested a successful upgrade.

Story: 2006590
Task: 36942
Change-Id: I9212f167ff4f8975c4f2df504d6850e03d4e9a6b
Signed-off-by: Kristine Bujold <kristine.bujold@windriver.com>
Kristine Bujold 2019-10-21 16:31:48 -04:00
parent 76200b79b2
commit 3194f2b559
5 changed files with 66 additions and 33 deletions

View File

@@ -1,2 +1,2 @@
SRC_DIR="sysinv"
TIS_PATCH_VER=336
TIS_PATCH_VER=337

View File

@@ -129,6 +129,9 @@ conductor_opts = [
cfg.IntOpt('managed_app_auto_recovery_interval',
default=300,
help='Interval to run managed app auto recovery'),
cfg.IntOpt('kube_upgrade_downgrade_retry_interval',
default=3600,
help='Interval in seconds between retries to upgrade/downgrade kubernetes components'),
]
CONF = cfg.CONF
@@ -195,6 +198,10 @@ class ConductorManager(service.PeriodicService):
# initializing conductor manager service
super(ConductorManager, self).start()
# Upgrade/Downgrade kubernetes components.
# greenthread must be called after super.start for it to work properly.
greenthread.spawn(self._upgrade_downgrade_kube_components())
def _start(self):
self.dbapi = dbapi.get_instance()
self.fm_api = fm_api.FaultAPIs()
@@ -229,9 +236,6 @@ class ConductorManager(service.PeriodicService):
self._handle_restore_in_progress()
# Upgrade/Downgrade kubernetes components
greenthread.spawn(self._upgrade_downgrade_kube_components())
LOG.info("sysinv-conductor start committed system=%s" %
system.as_dict())
@@ -5219,6 +5223,8 @@ class ConductorManager(service.PeriodicService):
self._upgrade_downgrade_tiller()
self._upgrade_downgrade_kube_networking()
@retry(retry_on_result=lambda x: x is False,
wait_fixed=(CONF.conductor.kube_upgrade_downgrade_retry_interval * 1000))
def _upgrade_downgrade_tiller(self):
"""Check if tiller needs to be upgraded or downgraded"""
LOG.info("_upgrade_downgrade_tiller")
@@ -5259,9 +5265,10 @@ class ConductorManager(service.PeriodicService):
if running_image is None:
LOG.warning("Failed to get tiller image")
return
return False
LOG.info("Running tiller image: %s" % running_image)
LOG.info("Requested tiller version: %s" % image_versions.TILLER_IMAGE_VERSION)
# Grab the version from the image name. Version is preceded
# by a ":" e.g.
@@ -5269,7 +5276,7 @@ class ConductorManager(service.PeriodicService):
running_image_name, running_version = running_image.rsplit(":", 1)
if not running_version:
LOG.warning("Failed to get version from tiller image")
return
return False
# Verify the tiller version running
if running_version != image_versions.TILLER_IMAGE_VERSION:
@@ -5281,39 +5288,36 @@ class ConductorManager(service.PeriodicService):
local_registry_auth = cutils.get_local_docker_registry_auth()
self._docker._retrieve_specified_registries()
# download the image, retry if it fails
while True:
try:
ret = self._docker.download_an_image("helm",
local_registry_auth,
download_image)
if not ret:
raise Exception
except Exception as e:
LOG.warning(
"Failed to download image '%s'. %s" %
(download_image, e))
greenthread.sleep(FIVE_MIN_IN_SECS)
continue
break
# download the image
try:
img_tag, ret = self._docker.download_an_image("helm",
local_registry_auth,
download_image)
if not ret:
raise Exception
except Exception as e:
LOG.warning("Failed to download image '%s'. %s" % (download_image, e))
return False
# reset the cached registries
self._docker._reset_registries_info()
# Update the new image, retry if it fails
while True:
try:
helm_utils.helm_upgrade_tiller(download_image)
# Update the new image
try:
helm_utils.helm_upgrade_tiller(download_image)
except Exception as e:
LOG.warning("Failed to update the new image: %s" % e)
greenthread.sleep(FIVE_MIN_IN_SECS)
continue
break
except Exception as e:
LOG.warning("Failed to update the new image: %s" % e)
return False
except Exception as e:
LOG.error("{}. Failed to upgrade/downgrade tiller.".format(e))
return False
return True
@retry(retry_on_result=lambda x: x is False,
wait_fixed=(CONF.conductor.kube_upgrade_downgrade_retry_interval * 1000))
def _upgrade_downgrade_kube_networking(self):
try:
LOG.info(
@@ -5333,6 +5337,9 @@ class ConductorManager(service.PeriodicService):
except Exception as e:
LOG.error("Failed to upgrade/downgrade kubernetes "
"networking images: {}".format(e))
return False
return True
def check_nodes_stable(self):
hosts = self.dbapi.ihost_get_list()

View File

@@ -2,7 +2,7 @@
# -*- encoding: utf-8 -*-
#
#
# Copyright (c) 2017-2018 Wind River Systems, Inc.
# Copyright (c) 2017-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -546,6 +546,9 @@ class StorageTierDependentTCs(base.FunctionalTest):
set_monitors_status_patcher = mock.patch.object(ceph_utils.CephApiOperator, 'get_monitors_status')
set_is_initial_config_patcher = mock.patch.object(cutils, 'is_initial_config_complete')
upgrade_downgrade_kube_components_patcher = mock.patch.object(
manager.ConductorManager, '_upgrade_downgrade_kube_components')
def setUp(self):
super(StorageTierDependentTCs, self).setUp()
self.mock_set_crushmap = self.set_crushmap_patcher.start()
@@ -563,11 +566,14 @@ class StorageTierDependentTCs(base.FunctionalTest):
self.host_index = -1
self.mon_index = -1
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
def tearDown(self):
super(StorageTierDependentTCs, self).tearDown()
self.set_crushmap_patcher.stop()
self.set_monitors_status_patcher = self.set_monitors_status_patcher.stop()
self.set_is_initial_config_patcher.stop()
self.upgrade_downgrade_kube_components_patcher.stop()
def assertDeleted(self, fullPath):
self.get_json(fullPath, expect_errors=True) # Make sure this line raises an error

View File

@@ -1,7 +1,7 @@
# vim: tabstop=4 shiftwidth=4 softtabstop=4
# coding=utf-8
# Copyright (c) 2017-2018 Wind River Systems, Inc.
# Copyright (c) 2017-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -40,6 +40,9 @@ class UpdateCephCluster(base.DbTestCase):
# - test_add_valid_mix_tiers
# - test_add_4_mix_bbbb
upgrade_downgrade_kube_components_patcher = mock.patch.object(
manager.ConductorManager, '_upgrade_downgrade_kube_components')
def setUp(self):
super(UpdateCephCluster, self).setUp()
self.service = manager.ConductorManager('test-host', 'test-topic')
@@ -50,6 +53,12 @@ class UpdateCephCluster(base.DbTestCase):
self.load = utils.create_test_load()
self.host_index = -1
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
def tearDown(self):
super(UpdateCephCluster, self).tearDown()
self.upgrade_downgrade_kube_components_patcher.stop()
def _create_storage_ihost(self, hostname):
self.host_index += 1
ihost_dict = utils.get_test_ihost(

View File

@@ -17,11 +17,13 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2013-2016 Wind River Systems, Inc.
# Copyright (c) 2013-2019 Wind River Systems, Inc.
#
"""Test class for Sysinv ManagerService."""
import mock
from sysinv.common import exception
from sysinv.conductor import manager
from sysinv.db import api as dbapi
@@ -32,6 +34,9 @@ from sysinv.tests.db import utils
class ManagerTestCase(base.DbTestCase):
upgrade_downgrade_kube_components_patcher = mock.patch.object(
manager.ConductorManager, '_upgrade_downgrade_kube_components')
def setUp(self):
super(ManagerTestCase, self).setUp()
self.service = manager.ConductorManager('test-host', 'test-topic')
@@ -41,6 +46,12 @@ class ManagerTestCase(base.DbTestCase):
self.system = utils.create_test_isystem()
self.load = utils.create_test_load()
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
def tearDown(self):
super(ManagerTestCase, self).tearDown()
self.upgrade_downgrade_kube_components_patcher.stop()
def _create_test_ihost(self, **kwargs):
# ensure the system ID for proper association
kwargs['forisystemid'] = self.system['id']
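
Each of the affected test classes stops the new start-up path from
running during unrelated unit tests by patching
_upgrade_downgrade_kube_components out for the lifetime of each test.
The same pattern reduced to a self-contained sketch (FakeManager is a
stand-in for ConductorManager):

    import unittest

    import mock


    class FakeManager(object):
        def _upgrade_downgrade_kube_components(self):
            raise RuntimeError("should not run during unit tests")


    class FakeManagerTestCase(unittest.TestCase):
        upgrade_downgrade_kube_components_patcher = mock.patch.object(
            FakeManager, '_upgrade_downgrade_kube_components')

        def setUp(self):
            super(FakeManagerTestCase, self).setUp()
            self.mock_upgrade_downgrade_kube_components = \
                self.upgrade_downgrade_kube_components_patcher.start()

        def tearDown(self):
            super(FakeManagerTestCase, self).tearDown()
            self.upgrade_downgrade_kube_components_patcher.stop()

        def test_method_is_patched(self):
            # The patched method is a MagicMock, so calling it is a no-op.
            FakeManager()._upgrade_downgrade_kube_components()
            self.mock_upgrade_downgrade_kube_components.assert_called_once_with()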