Adding retries during upgrade complete state

During the upgrade-complete state, many services are restarting
and API calls may randomly fail.  This change adds
retry wrapping around all client creation and API calls
during that upgrade state.

Currently up to 5 attempts, with a 5-second delay between attempts

Story: 2009665
Task: 44070
Change-Id: Ifacfd364c2e961fd396db695658abe2d027757c3
Signed-off-by: albailey <Al.Bailey@windriver.com>
This commit is contained in:
albailey 2021-11-18 09:56:24 -06:00
parent 855232c872
commit 7c88723d06
4 changed files with 60 additions and 14 deletions

View File

@ -95,6 +95,7 @@ Summary: DC Orchestrator
# TODO(John): should we add Requires lines?
Requires: openstack-ras
Requires: python-psutil
Requires: python-retrying
%description dcorch
Distributed Cloud Orchestrator

View File

@ -1,8 +1,9 @@
#
# Copyright (c) 2020 Wind River Systems, Inc.
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import retrying
import time
from dcmanager.common import consts
@ -15,6 +16,12 @@ from dcmanager.orchestrator.states.base import BaseState
DEFAULT_MAX_QUERIES = 60
DEFAULT_SLEEP_DURATION = 10
# Additional retry loops wrap actions that should never fail.
# Their sleep duration and number of attempts are smaller, since such
# failures should only occur while a service is being restarted.
RETRY_MAX_ATTEMPTS = 5
RETRY_SLEEP_MILLIS = 5000
class CompletingUpgradeState(BaseState):
"""Upgrade state actions for completing an upgrade"""
@ -26,10 +33,43 @@ class CompletingUpgradeState(BaseState):
self.sleep_duration = DEFAULT_SLEEP_DURATION
self.max_queries = DEFAULT_MAX_QUERIES
def finalize_upgrade(self, strategy_step):
sysinv_client = self.get_sysinv_client(strategy_step.subcloud.name)
@retrying.retry(stop_max_attempt_number=RETRY_MAX_ATTEMPTS,
wait_fixed=RETRY_SLEEP_MILLIS)
def _get_software_version(self, strategy_step):
"""Internal utility method to query software version from a subcloud
software_version = sysinv_client.get_system().software_version
This method is 'retry' wrapped to attempt multiple times with a
small wait period between attempts if any exception is raised
"""
region = self.get_region_name(strategy_step)
return self.get_sysinv_client(region).get_system().software_version
@retrying.retry(stop_max_attempt_number=RETRY_MAX_ATTEMPTS,
                wait_fixed=RETRY_SLEEP_MILLIS)
def _get_upgrades(self, strategy_step):
    """Query a subcloud for its upgrades, retrying on failure.

    The call is wrapped by 'retrying.retry': if any exception is raised
    it is attempted again, up to RETRY_MAX_ATTEMPTS attempts in total,
    waiting RETRY_SLEEP_MILLIS milliseconds between attempts. The sysinv
    client is looked up inside the wrapped body, so every attempt obtains
    the client again rather than reusing one from a failed attempt.

    :param strategy_step: the strategy step identifying the subcloud
    :returns: whatever the sysinv client's get_upgrades() returns
    """
    # Resolve the region first, then fetch a client for it on each attempt.
    region_name = self.get_region_name(strategy_step)
    sysinv_client = self.get_sysinv_client(region_name)
    return sysinv_client.get_upgrades()
@retrying.retry(stop_max_attempt_number=RETRY_MAX_ATTEMPTS,
                wait_fixed=RETRY_SLEEP_MILLIS)
def _upgrade_complete(self, strategy_step):
    """Invoke 'upgrade-complete' on a subcloud, retrying on failure.

    The call is wrapped by 'retrying.retry': if any exception is raised
    it is attempted again, up to RETRY_MAX_ATTEMPTS attempts in total,
    waiting RETRY_SLEEP_MILLIS milliseconds between attempts. The sysinv
    client is looked up inside the wrapped body, so every attempt obtains
    the client again rather than reusing one from a failed attempt.

    :param strategy_step: the strategy step identifying the subcloud
    :returns: the result of the sysinv client's upgrade_complete() call
              (expected to be None)
    """
    # Resolve the region first, then fetch a client for it on each attempt.
    region_name = self.get_region_name(strategy_step)
    sysinv_client = self.get_sysinv_client(region_name)
    return sysinv_client.upgrade_complete()
def finalize_upgrade(self, strategy_step):
software_version = self._get_software_version(strategy_step)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
@ -37,27 +77,33 @@ class CompletingUpgradeState(BaseState):
software_version=software_version)
return self.next_state
# todo(abailey): determine if service restarts can be made predictable
# todo(abailey): other states should have similar retry decorators and
# this may also be reasonable to add within the client API calls.
def perform_state_action(self, strategy_step):
"""Complete an upgrade on a subcloud
We should never cache the client. Re-query it.
Returns the next state in the state machine on success.
Any exceptions raised by this method set the strategy to FAILED.
This state runs during a time when manifests are applying and services
are restarting, and therefore any API call in this method can randomly
fail. To accommodate this, every call is wrapped with retries.
"""
# get the sysinv client for the subcloud
sysinv_client = self.get_sysinv_client(strategy_step.subcloud.name)
# upgrade-complete causes the upgrade to be deleted.
# if no upgrade exists, there is no need to call it.
# The API should always return a list
upgrades = sysinv_client.get_upgrades()
upgrades = self._get_upgrades(strategy_step)
if len(upgrades) == 0:
self.info_log(strategy_step,
"No upgrades exist. Nothing needs completing")
return self.finalize_upgrade(strategy_step)
# invoke the API 'upgrade-complete'
# This is a partially blocking call that raises exception on failure.
sysinv_client.upgrade_complete()
# We will re-attempt even if that failure is encountered
self._upgrade_complete(strategy_step)
# 'completion' deletes the upgrade. Need to loop until it is deleted
counter = 0
@ -66,11 +112,9 @@ class CompletingUpgradeState(BaseState):
if self.stopped():
raise StrategyStoppedException()
upgrades = self.get_sysinv_client(
strategy_step.subcloud.name).get_upgrades()
upgrades = self._get_upgrades(strategy_step)
if len(upgrades) == 0:
self.info_log(strategy_step,
"Upgrade completed.")
self.info_log(strategy_step, "Upgrade completed.")
break
counter += 1
if counter >= self.max_queries:

View File

@ -48,6 +48,7 @@ python-novaclient>=7.1.0 # Apache-2.0
python-keystoneclient>=3.8.0 # Apache-2.0
pycrypto>=2.6 # Public Domain
requests_toolbelt
retrying
keyring
kubernetes # Apache-2.0
psutil

View File

@ -21,4 +21,4 @@ pylint==1.9.2;python_version<"3.0" # GPLv2
pylint==2.3.1;python_version>="3.0" # GPLv2
PyYAML>=3.1.0
yamllint<1.26.1;python_version>="3.0" # GPLv2
python-dev-tools;python_version>="3.0"
#python-dev-tools;python_version>="3.0"