Adding kube-upgrade-abort support

Trigger a kube-upgrade abort when kube upgrade steps
encounter a failure.

When constructing a new kube upgrade strategy, trigger a
cleanup if an aborted kube-upgrade is detected.
This cleanup occurs during the 'build' phase.

This commit also adds INFO level logs so that each
kube-upgrade step logs when it is invoked. Previously some
steps logged at info level and some at debug level.

Test Plan:
  PASS: Trigger the downloading images phase to fail and
   observe that the kube-upgrade becomes 'aborted'.
  PASS: Create a kube-upgrade strategy where an aborted
   kube-upgrade exists, and observe that it is cleaned up
   and a fresh kube upgrade strategy is created.

Story: 2010565
Task: 48219
Signed-off-by: Al Bailey <al.bailey@windriver.com>
Change-Id: I6d0ef0bdaaee73c76d6be40b9d5d0143332f83a0
This commit is contained in:
Al Bailey 2023-06-12 18:24:28 +00:00
parent 9e519abfb9
commit 3bd5eed446
9 changed files with 243 additions and 8 deletions

View File

@ -1799,6 +1799,52 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
callback.send(response)
callback.close()
def kube_upgrade_abort(self, future, callback):
    """
    Ask sysinv to abort the kube upgrade and report the outcome.

    This is a generator: every ``yield`` hands control back until the work
    most recently queued on ``future`` has a result.  Whatever happens, the
    response dictionary is sent to ``callback`` and the callback is closed.

    Response keys written here:
      'completed'   - True only once the KubeUpgrade result was built
      'reason'      - populated when a REST API error is reported
      'result-data' - KubeUpgrade object built from the sysinv reply
      'error-code'  - TOKEN_EXPIRED when the REST call was UNAUTHORIZED
    """
    response = {'completed': False, 'reason': ''}
    action_type = 'kube-upgrade-abort'
    sysinv_method = sysinv.kube_upgrade_abort

    try:
        future.set_timeouts(config.CONF.get('nfvi-timeouts', None))

        # Obtain a fresh platform token first if we have none or it expired
        token_unusable = (self._platform_token is None or
                          self._platform_token.is_expired())
        if token_unusable:
            future.work(openstack.get_token, self._platform_directory)
            future.result = (yield)

            token_fetched = (future.result.is_complete() and
                             future.result.data is not None)
            if not token_fetched:
                self.set_response_error(response, "Openstack get-token")
                return

            self._platform_token = future.result.data

        # Queue the sysinv abort request and wait for its result
        future.work(sysinv_method, self._platform_token)
        future.result = (yield)

        if not future.result.is_complete():
            self.set_response_error(response, action_type)
            return

        upgrade_data = future.result.data
        response['result-data'] = nfvi.objects.v1.KubeUpgrade(
            upgrade_data['state'],
            upgrade_data['from_version'],
            upgrade_data['to_version'])
        response['completed'] = True

    except exceptions.OpenStackRestAPIException as e:
        if httplib.UNAUTHORIZED != e.http_status_code:
            DLOG.exception("Caught API exception while trying %s. error=%s"
                           % (action_type, e))
            response['reason'] = e.http_response_reason
        else:
            # The token was rejected: flag it so that a new one is fetched
            response['error-code'] = nfvi.NFVI_ERROR_CODE.TOKEN_EXPIRED
            if self._platform_token is not None:
                self._platform_token.set_expired()

    except Exception as e:
        DLOG.exception("Caught exception while trying %s. error=%s"
                       % (action_type, e))

    finally:
        callback.send(response)
        callback.close()
def kube_upgrade_cleanup(self, future, callback):
"""
kube upgrade cleanup

View File

@ -396,6 +396,13 @@ def _patch_kube_upgrade_state(token, new_value, hostname=None):
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
def kube_upgrade_abort(token):
    """
    Request a kube upgrade abort from System Inventory by patching the
    kube upgrade state to "upgrade-aborting".

    Returns whatever _patch_kube_upgrade_state returns.
    """
    aborting_state = "upgrade-aborting"
    return _patch_kube_upgrade_state(token, aborting_state)
def kube_upgrade_cleanup(token):
"""
Ask System Inventory to delete the kube upgrade

View File

@ -92,6 +92,7 @@ class TestBuildStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'query-alarms'},
{'name': 'query-kube-versions'},
{'name': 'query-kube-upgrade'},
{'name': 'kube-upgrade-cleanup-aborted'},
{'name': 'query-kube-host-upgrade'},
]
expected_results = {

View File

@ -123,6 +123,7 @@ from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_rootca_update_hos
from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_rootca_update_pods # noqa: F401
from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_rootca_update_start # noqa: F401
from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_rootca_update_upload_cert # noqa: F401
from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_upgrade_abort # noqa: F401
from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_upgrade_cleanup # noqa: F401
from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_upgrade_complete # noqa: F401
from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_upgrade_download_images # noqa: F401

View File

@ -223,6 +223,14 @@ def nfvi_kube_rootca_update_upload_cert(cert_file, callback):
return cmd_id
def nfvi_kube_upgrade_abort(callback):
    """
    Kube Upgrade - Abort

    Invokes the 'kube_upgrade_abort' command on the infrastructure plugin,
    passing along the callback, and returns the value given back by
    invoke_plugin.
    """
    return _infrastructure_plugin.invoke_plugin('kube_upgrade_abort',
                                                callback=callback)
def nfvi_kube_upgrade_cleanup(callback):
"""
Kube Upgrade Cleanup

View File

@ -34,23 +34,27 @@ class KubeUpgradeState(Constants):
"""
Kube Upgrade State Constants
These values are copied from sysinv/common/kubernetes.py
Maintaining the same order as defined in kubernetes.py
"""
KUBE_UPGRADE_STARTED = Constant('upgrade-started')
KUBE_UPGRADE_DOWNLOADING_IMAGES = Constant('downloading-images')
KUBE_UPGRADE_DOWNLOADING_IMAGES_FAILED = Constant('downloading-images-failed')
KUBE_UPGRADE_DOWNLOADED_IMAGES = Constant('downloaded-images')
KUBE_UPGRADING_FIRST_MASTER = Constant('upgrading-first-master')
KUBE_UPGRADING_FIRST_MASTER_FAILED = Constant('upgrading-first-master-failed')
KUBE_UPGRADED_FIRST_MASTER = Constant('upgraded-first-master')
KUBE_UPGRADING_NETWORKING = Constant('upgrading-networking')
KUBE_UPGRADING_NETWORKING_FAILED = Constant('upgrading-networking-failed')
KUBE_UPGRADED_NETWORKING = Constant('upgraded-networking')
KUBE_UPGRADING_FIRST_MASTER = Constant('upgrading-first-master')
KUBE_UPGRADING_FIRST_MASTER_FAILED = Constant('upgrading-first-master-failed')
KUBE_UPGRADED_FIRST_MASTER = Constant('upgraded-first-master')
KUBE_UPGRADING_SECOND_MASTER = Constant('upgrading-second-master')
KUBE_UPGRADING_SECOND_MASTER_FAILED = Constant('upgrading-second-master-failed')
KUBE_UPGRADED_SECOND_MASTER = Constant('upgraded-second-master')
KUBE_UPGRADING_KUBELETS = Constant('upgrading-kubelets')
KUBE_UPGRADE_COMPLETE = Constant('upgrade-complete')
KUBE_UPGRADE_ABORTING = Constant('upgrade-aborting')
KUBE_UPGRADE_ABORTING_FAILED = Constant('upgrade-aborting-failed')
KUBE_UPGRADE_ABORTED = Constant('upgrade-aborted')
KUBE_HOST_CORDON = Constant('cordon-started')
KUBE_HOST_CORDON_COMPLETE = Constant('cordon-complete')
KUBE_HOST_CORDON_FAILED = Constant('cordon-failed')

View File

@ -29,6 +29,7 @@ from nfv_vim.strategy._strategy_steps import KubeRootcaUpdatePodsTrustBothcasSte
from nfv_vim.strategy._strategy_steps import KubeRootcaUpdatePodsTrustNewcaStep # noqa: F401
from nfv_vim.strategy._strategy_steps import KubeRootcaUpdateStartStep # noqa: F401
from nfv_vim.strategy._strategy_steps import KubeRootcaUpdateUploadCertStep # noqa: F401
from nfv_vim.strategy._strategy_steps import KubeUpgradeCleanupAbortedStep # noqa: F401
from nfv_vim.strategy._strategy_steps import KubeUpgradeCleanupStep # noqa: F401
from nfv_vim.strategy._strategy_steps import KubeUpgradeCompleteStep # noqa: F401
from nfv_vim.strategy._strategy_steps import KubeUpgradeDownloadImagesStep # noqa: F401

View File

@ -3039,11 +3039,16 @@ class KubeUpgradeStrategy(SwUpdateStrategy,
# Initial stage is a query of existing kube upgrade
stage = strategy.StrategyStage(
strategy.STRATEGY_STAGE_NAME.KUBE_UPGRADE_QUERY)
# these query steps are paired with mixins that process their results
stage.add_step(strategy.QueryAlarmsStep(
ignore_alarms=self._ignore_alarms))
# these query steps are paired with mixins that process their results
stage.add_step(strategy.QueryKubeVersionsStep())
stage.add_step(strategy.QueryKubeUpgradeStep())
# cleanup kube upgrade if 'upgrade-aborted'
stage.add_step(strategy.KubeUpgradeCleanupAbortedStep())
# query hosts last, after any aborted upgrade is cleaned up
stage.add_step(strategy.QueryKubeHostUpgradeStep())
self.build_phase.add_stage(stage)

View File

@ -76,8 +76,10 @@ class StrategyStepNames(Constants):
QUERY_KUBE_VERSIONS = Constant('query-kube-versions')
KUBE_HOST_CORDON = Constant('kube-host-cordon')
KUBE_HOST_UNCORDON = Constant('kube-host-uncordon')
KUBE_UPGRADE_ABORT = Constant('kube-upgrade-abort')
KUBE_UPGRADE_START = Constant('kube-upgrade-start')
KUBE_UPGRADE_CLEANUP = Constant('kube-upgrade-cleanup')
KUBE_UPGRADE_CLEANUP_ABORTED = Constant('kube-upgrade-cleanup-aborted')
KUBE_UPGRADE_COMPLETE = Constant('kube-upgrade-complete')
KUBE_UPGRADE_DOWNLOAD_IMAGES = Constant('kube-upgrade-download-images')
KUBE_UPGRADE_NETWORKING = Constant('kube-upgrade-networking')
@ -3774,6 +3776,7 @@ class QueryKubeVersionsStep(AbstractStrategyStep):
"""
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
nfvi.nfvi_get_kube_version_list(self._query_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
@ -3810,6 +3813,7 @@ class QueryKubeHostUpgradeStep(AbstractStrategyStep):
Query Kube Host Upgrade List
"""
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
nfvi.nfvi_get_kube_host_upgrade_list(
self._get_kube_host_upgrade_list_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
@ -3922,6 +3926,56 @@ class AbstractKubeUpgradeStep(AbstractStrategyStep):
return data
class KubeUpgradeAbortStep(AbstractKubeUpgradeStep):
    """
    Kube Upgrade - Abort - Strategy Step

    Success state is 'upgrade-aborted' and failure state is
    'upgrade-aborting-failed'.
    """

    def __init__(self):
        from nfv_vim import nfvi

        kube_states = nfvi.objects.v1.KUBE_UPGRADE_STATE
        super(KubeUpgradeAbortStep, self).__init__(
            STRATEGY_STEP_NAME.KUBE_UPGRADE_ABORT,
            kube_states.KUBE_UPGRADE_ABORTED,
            kube_states.KUBE_UPGRADE_ABORTING_FAILED)

    @coroutine
    def _response_callback(self):
        """
        Kube Upgrade - Abort - Callback

        Stores the returned kube upgrade on the strategy when the request
        completed, then completes the step based on the kube upgrade state
        currently held by the strategy.
        """
        response = (yield)
        DLOG.debug("%s callback response=%s." % (self._name, response))

        if response['completed'] and self.strategy is not None:
            self.strategy.nfvi_kube_upgrade = response['result-data']

        # Calling abort on an aborted update returns a failure, so the
        # outcome is derived from the strategy's view of the kube upgrade
        # rather than from response['completed'].
        if self.strategy is None:
            # nothing left to abort if there is no more strategy
            outcome = strategy.STRATEGY_STEP_RESULT.SUCCESS
            reason = "no strategy"
        elif self.strategy.nfvi_kube_upgrade is None:
            # nothing left to abort if there is no more kube upgrade
            outcome = strategy.STRATEGY_STEP_RESULT.SUCCESS
            reason = "no kube upgrade"
        elif self.strategy.nfvi_kube_upgrade.state == self._success_state:
            outcome = strategy.STRATEGY_STEP_RESULT.SUCCESS
            reason = ""
        else:
            # any other state means the abort failed
            outcome = strategy.STRATEGY_STEP_RESULT.FAILED
            reason = ("Unexpected state: %s"
                      % self.strategy.nfvi_kube_upgrade.state)

        self.stage.step_complete(outcome, reason)

    def apply(self):
        """
        Kube Upgrade - Abort

        Issues the abort request and returns WAIT; the step is completed
        later by the response callback.
        """
        from nfv_vim import nfvi

        DLOG.info("Step (%s) apply." % self._name)
        nfvi.nfvi_kube_upgrade_abort(self._response_callback())
        return strategy.STRATEGY_STEP_RESULT.WAIT, ""
class KubeUpgradeStartStep(AbstractKubeUpgradeStep):
"""Kube Upgrade Start - Strategy Step"""
@ -3937,6 +3991,12 @@ class KubeUpgradeStartStep(AbstractKubeUpgradeStep):
self._to_version = to_version
self._force = force
def abort(self):
    """
    Return the list of abort steps for this step: a single
    KubeUpgradeAbortStep.
    """
    abort_steps = [KubeUpgradeAbortStep()]
    return abort_steps
def from_dict(self, data):
"""
Returns the step object initialized using the given dictionary
@ -3977,6 +4037,7 @@ class KubeUpgradeStartStep(AbstractKubeUpgradeStep):
"""Kube Upgrade Start"""
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
alarm_ignore_list = ["900.401", ] # ignore the auto apply alarm
nfvi.nfvi_kube_upgrade_start(self._to_version,
@ -3986,6 +4047,54 @@ class KubeUpgradeStartStep(AbstractKubeUpgradeStep):
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
class KubeUpgradeCleanupAbortedStep(AbstractKubeUpgradeStep):
    """
    Kube Upgrade Cleanup Aborted - Strategy Step

    Cleans up the kube upgrade only when the strategy's kube upgrade is in
    the 'upgrade-aborted' state; otherwise the step succeeds immediately.
    """

    # todo(abailey): this class could be replaced by KubeUpgradeCleanupStep
    # if it was enhanced to take an optional 'filter'
    def __init__(self):
        # this cleanup activity has neither a success nor a failure state
        no_success_state = None
        no_failure_state = None
        super(KubeUpgradeCleanupAbortedStep, self).__init__(
            STRATEGY_STEP_NAME.KUBE_UPGRADE_CLEANUP_ABORTED,
            no_success_state,
            no_failure_state)

    @coroutine
    def _response_callback(self):
        """
        Kube Upgrade Cleanup Aborted - Callback

        Completes the step directly from the response, since the cleanup
        returns a result when it completes (handle_event is not used).
        """
        response = (yield)
        DLOG.debug("%s callback response=%s." % (self._name, response))

        if not response['completed']:
            self.stage.step_complete(strategy.STRATEGY_STEP_RESULT.FAILED,
                                     response['reason'])
        else:
            if self.strategy is not None:
                # cleanup deletes the kube upgrade, clear it from the strategy
                self.strategy.nfvi_kube_upgrade = None
            self.stage.step_complete(strategy.STRATEGY_STEP_RESULT.SUCCESS,
                                     "")

    def apply(self):
        """
        Kube Upgrade Cleanup Aborted

        Returns WAIT when a cleanup request was issued, and SUCCESS when
        the state filter did not match and nothing needed to be done.
        """
        DLOG.info("Step (%s) apply." % self._name)
        from nfv_vim import nfvi

        # The cleanup is only requested when the state matches this filter
        filter_state = nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_UPGRADE_ABORTED
        matches_filter = (
            self.strategy is not None and
            self.strategy.nfvi_kube_upgrade is not None and
            self.strategy.nfvi_kube_upgrade.state == filter_state)

        if not matches_filter:
            # claim success since the filter did not match
            return strategy.STRATEGY_STEP_RESULT.SUCCESS, ""

        DLOG.info("%s cleaning up %s"
                  % (self._name, self.strategy.nfvi_kube_upgrade.state))
        nfvi.nfvi_kube_upgrade_cleanup(self._response_callback())
        return strategy.STRATEGY_STEP_RESULT.WAIT, ""
class KubeUpgradeCleanupStep(AbstractKubeUpgradeStep):
"""Kube Upgrade Cleanup - Strategy Step"""
@ -4018,6 +4127,7 @@ class KubeUpgradeCleanupStep(AbstractKubeUpgradeStep):
"""Kube Upgrade Cleanup"""
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
nfvi.nfvi_kube_upgrade_cleanup(self._response_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
@ -4033,6 +4143,12 @@ class KubeUpgradeCompleteStep(AbstractKubeUpgradeStep):
nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_UPGRADE_COMPLETE,
None) # there is no failure state for upgrade-complete
def abort(self):
    """
    Return the list of abort steps for this step: a single
    KubeUpgradeAbortStep.
    """
    abort_steps = [KubeUpgradeAbortStep()]
    return abort_steps
@coroutine
def _response_callback(self):
"""Kube Upgrade Complete - Callback"""
@ -4055,6 +4171,7 @@ class KubeUpgradeCompleteStep(AbstractKubeUpgradeStep):
"""Kube Upgrade Complete """
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
nfvi.nfvi_kube_upgrade_complete(self._response_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
@ -4071,6 +4188,12 @@ class KubeUpgradeDownloadImagesStep(AbstractKubeUpgradeStep):
nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_UPGRADE_DOWNLOADING_IMAGES_FAILED,
timeout_in_secs=1800)
def abort(self):
    """
    Return the list of abort steps for this step: a single
    KubeUpgradeAbortStep.
    """
    abort_steps = [KubeUpgradeAbortStep()]
    return abort_steps
@coroutine
def _response_callback(self):
"""Kube Upgrade Download Images - Callback"""
@ -4089,6 +4212,7 @@ class KubeUpgradeDownloadImagesStep(AbstractKubeUpgradeStep):
"""Kube Upgrade Download Images """
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
nfvi.nfvi_kube_upgrade_download_images(self._response_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
@ -4105,6 +4229,12 @@ class KubeUpgradeNetworkingStep(AbstractKubeUpgradeStep):
nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_UPGRADING_NETWORKING_FAILED,
timeout_in_secs=900)
def abort(self):
    """
    Return the list of abort steps for this step: a single
    KubeUpgradeAbortStep.
    """
    abort_steps = [KubeUpgradeAbortStep()]
    return abort_steps
@coroutine
def _response_callback(self):
"""Kube Upgrade Networking - Callback"""
@ -4123,6 +4253,7 @@ class KubeUpgradeNetworkingStep(AbstractKubeUpgradeStep):
"""Kube Upgrade Networking"""
from nfv_vim import nfvi
DLOG.info("Step (%s) apply." % self._name)
nfvi.nfvi_kube_upgrade_networking(self._response_callback())
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
@ -4262,6 +4393,13 @@ class KubeHostCordonStep(AbstractKubeHostUpgradeStep):
target_failure_state,
timeout_in_secs)
def abort(self):
    """
    Return the list of abort steps for this step: a single
    KubeUpgradeAbortStep.
    """
    # todo(abailey): Unknown if this should include an uncordon if it fails
    abort_steps = [KubeUpgradeAbortStep()]
    return abort_steps
def handle_event(self, event, event_data=None):
"""
Handle Host events - does not query kube host upgrade list but
@ -4309,6 +4447,13 @@ class KubeHostUncordonStep(AbstractKubeHostUpgradeStep):
target_failure_state,
timeout_in_secs)
def abort(self):
    """
    Return the list of abort steps for this step: a single
    KubeUpgradeAbortStep.
    """
    # todo(abailey): Unknown if this should include a cordon if it fails
    abort_steps = [KubeUpgradeAbortStep()]
    return abort_steps
def handle_event(self, event, event_data=None):
"""
Handle Host events - does not query kube host upgrade list but
@ -4359,6 +4504,13 @@ class KubeHostUpgradeControlPlaneStep(AbstractKubeHostUpgradeStep):
target_failure_state,
timeout_in_secs)
def abort(self):
    """
    Return the list of abort steps for this step: a single
    KubeUpgradeAbortStep.
    """
    # todo(abailey): Unknown if this should include an uncordon if it fails
    abort_steps = [KubeUpgradeAbortStep()]
    return abort_steps
def handle_event(self, event, event_data=None):
"""
Handle Host events - does not query kube host upgrade list but
@ -4383,8 +4535,8 @@ class KubeHostUpgradeControlPlaneStep(AbstractKubeHostUpgradeStep):
from nfv_vim import directors
DLOG.debug("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names))
DLOG.info("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names))
host_director = directors.get_host_director()
operation = \
host_director.kube_upgrade_hosts_control_plane(self._host_names,
@ -4409,6 +4561,13 @@ class KubeHostUpgradeKubeletStep(AbstractKubeHostListUpgradeStep):
None, # there is no kube upgrade failure state for kubelets
timeout_in_secs=900) # kubelet takes longer than control plane
def abort(self):
    """
    Return the list of abort steps for this step: a single
    KubeUpgradeAbortStep.
    """
    # todo(abailey): Unknown if this should include an uncordon if it fails
    abort_steps = [KubeUpgradeAbortStep()]
    return abort_steps
@coroutine
def _get_kube_host_upgrade_list_callback(self):
"""Get Kube Host Upgrade List Callback"""
@ -4484,8 +4643,8 @@ class KubeHostUpgradeKubeletStep(AbstractKubeHostListUpgradeStep):
from nfv_vim import directors
DLOG.debug("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names))
DLOG.info("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names))
host_director = directors.get_host_director()
operation = \
host_director.kube_upgrade_hosts_kubelet(self._host_names,
@ -4540,7 +4699,10 @@ def strategy_step_rebuild_from_dict(data):
KubeHostUpgradeControlPlaneStep,
STRATEGY_STEP_NAME.KUBE_HOST_UPGRADE_KUBELET:
KubeHostUpgradeKubeletStep,
STRATEGY_STEP_NAME.KUBE_UPGRADE_ABORT: KubeUpgradeAbortStep,
STRATEGY_STEP_NAME.KUBE_UPGRADE_CLEANUP: KubeUpgradeCleanupStep,
STRATEGY_STEP_NAME.KUBE_UPGRADE_CLEANUP_ABORTED:
KubeUpgradeCleanupAbortedStep,
STRATEGY_STEP_NAME.KUBE_UPGRADE_COMPLETE: KubeUpgradeCompleteStep,
STRATEGY_STEP_NAME.KUBE_UPGRADE_DOWNLOAD_IMAGES:
KubeUpgradeDownloadImagesStep,