Add retry at nfv orchestration level

This commit retries the kube host upgrade list query when it fails
(up to 3 retries by default). This covers transient failures such as
the kubelet killing pods due to resource contention during a
Kubernetes upgrade.

Test Plan:
    PASS: Simulate a kube-apiserver pod failure by putting a wrong
    resource in the REST API request, and check that retries occur.

    PASS: Verify Kubernetes orchestrated update works with the
    changes on AIO-SX

    PASS: Verify changes are working on AIO-DX, with strategy
    created on controller-0 and applied on controller-1

Closes-Bug: #2053236
Change-Id: I816b09bb0cd767380e5093d4732d161e4cc8cb24
Signed-off-by: sshathee <shunmugam.shatheesh@windriver.com>
This commit is contained in:
sshathee 2024-02-28 08:55:45 -05:00
parent d730d8edc2
commit 9c73d3b254
4 changed files with 79 additions and 7 deletions

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2023 Wind River Systems, Inc.
# Copyright (c) 2015-2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -27,6 +27,7 @@ class HostDirector(object):
"""
Host Director
"""
def __init__(self):
self._host_operation = None
@ -1280,6 +1281,25 @@ class HostDirector(object):
return host_operation
@coroutine
def _nfvi_get_kube_host_upgrade_list_callback(self):
    """
    Callback coroutine for the NFVI kube host upgrade list query.

    Receives the query response through ``yield``, logs it at debug
    level and hands it, unchanged, to the software management director.
    """
    from nfv_vim import directors

    # The response is sent into this coroutine by whoever drives it.
    nfvi_response = (yield)
    debug_msg = ("Get kube host upgrade list callback response=%s."
                 % nfvi_response)
    DLOG.debug(debug_msg)
    directors.get_sw_mgmt_director().kube_host_upgrade_list(nfvi_response)
def _nfvi_get_kube_host_upgrade_list(self):
    """
    Request the kube host upgrade list from the NFVI layer.

    A new instance of the callback coroutine is created and passed along
    to receive the response.
    """
    upgrade_list_callback = self._nfvi_get_kube_host_upgrade_list_callback()
    nfvi.nfvi_get_kube_host_upgrade_list(upgrade_list_callback)
def get_host_director():
"""

View File

@ -450,6 +450,18 @@ class SwMgmtDirector(object):
self._sw_update.handle_event(
strategy.STRATEGY_EVENT.MIGRATE_INSTANCES_FAILED, reason)
def kube_host_upgrade_list(self, event_data):
    """
    Translate a kube host upgrade list response into a strategy event.

    ``event_data['completed']`` selects between the COMPLETED and FAILED
    query events; the chosen event is forwarded together with the
    response to the current software update, when there is one.
    """
    query_completed = event_data['completed']
    strategy_events = strategy.STRATEGY_EVENT
    event = (strategy_events.QUERY_KUBE_HOST_UPGRADE_COMPLETED
             if query_completed
             else strategy_events.QUERY_KUBE_HOST_UPGRADE_FAILED)

    # Nothing to notify when no software update is in progress.
    if self._sw_update is None:
        return

    self._sw_update.handle_event(event, event_data)
def get_sw_mgmt_director():
"""

View File

@ -37,6 +37,9 @@ class EventNames(object):
Constant('kube-host-upgrade-kubelet-failed')
KUBE_ROOTCA_UPDATE_HOST_FAILED = Constant('kube-rootca-update-host-failed')
KUBE_UPGRADE_CHANGED = Constant('kube-upgrade-changed')
# Outcome events (failed / completed) of the kube host upgrade list query
QUERY_KUBE_HOST_UPGRADE_FAILED = Constant('query-kube-host-upgrade-failed')
QUERY_KUBE_HOST_UPGRADE_COMPLETED = \
Constant('query-kube-host-upgrade-completed')
# Constants

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2023 Wind River Systems, Inc.
# Copyright (c) 2015-2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -3930,9 +3930,12 @@ class QueryKubeHostUpgradeStep(AbstractStrategyStep):
"""
Query Kube Host Upgrade list
"""
def __init__(self):
MAX_RETRIES = 3
def __init__(self, retry_count=MAX_RETRIES):
super(QueryKubeHostUpgradeStep, self).__init__(
STRATEGY_STEP_NAME.QUERY_KUBE_HOST_UPGRADE, timeout_in_secs=60)
STRATEGY_STEP_NAME.QUERY_KUBE_HOST_UPGRADE, timeout_in_secs=200)
self._retry_count = retry_count
@coroutine
def _get_kube_host_upgrade_list_callback(self):
@ -3957,12 +3960,46 @@ class QueryKubeHostUpgradeStep(AbstractStrategyStep):
"""
Query Kube Host Upgrade List
"""
from nfv_vim import nfvi
from nfv_vim import directors
DLOG.info("Step (%s) apply." % self._name)
nfvi.nfvi_get_kube_host_upgrade_list(
self._get_kube_host_upgrade_list_callback())
host_director = directors.get_host_director()
host_director._nfvi_get_kube_host_upgrade_list()
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
def handle_event(self, event, event_data=None):
    """
    Handle the events raised for the kube host upgrade list query.

    On QUERY_KUBE_HOST_UPGRADE_FAILED the query is re-issued through the
    host director while retries remain and event data is available;
    otherwise the step is completed as FAILED with the reported reason
    (an empty reason when no event data was supplied).

    On QUERY_KUBE_HOST_UPGRADE_COMPLETED the result data is stored on the
    strategy (when both are available) and the step is completed as
    SUCCESS.

    :param event: the strategy event being delivered
    :param event_data: the query response dictionary, or None
    :returns: True for the failed event, False otherwise
    """
    from nfv_vim import directors

    if event == STRATEGY_EVENT.QUERY_KUBE_HOST_UPGRADE_FAILED:
        if event_data is not None and self._retry_count > 0:
            # if kube host upgrade list fails and we have retries,
            # re-trigger the function
            DLOG.info("Step (%s) retry due to failure for (%s)."
                      % (self._name, str(event_data["reason"])))
            self._retry_count = self._retry_count - 1
            host_director = directors.get_host_director()
            host_director._nfvi_get_kube_host_upgrade_list()
        else:
            # if kube host upgrade list fails and we are out of retries,
            # fail. This branch is also reached when event_data is None,
            # so the reason lookup must not assume a dictionary.
            reason = "" if event_data is None else event_data['reason']
            result = strategy.STRATEGY_STEP_RESULT.FAILED
            self.stage.step_complete(result, reason)
        return True

    elif event == STRATEGY_EVENT.QUERY_KUBE_HOST_UPGRADE_COMPLETED:
        if event_data is not None and self.strategy is not None:
            self.strategy.nfvi_kube_host_upgrade_list = \
                event_data['result-data']

        result = strategy.STRATEGY_STEP_RESULT.SUCCESS
        self.stage.step_complete(result, "")
        # NOTE(review): this branch has no explicit return and falls
        # through to "return False" below - confirm that is intended.

    return False
class AbstractKubeUpgradeStep(AbstractStrategyStep):