From 9c73d3b254ddebd65e6c5d6f7949813c8d08192f Mon Sep 17 00:00:00 2001
From: sshathee
Date: Wed, 28 Feb 2024 08:55:45 -0500
Subject: [PATCH] Add retry at nfv orchestration level

This commit introduces retry on failure for cases such as
kubelet killing pods due to resource contention during
kubernetes upgrade.

Test Plan:
PASS: Simulate kubeapiserver pod failure by adding wrong resource
      in rest api request and check retries.
PASS: Verify kubernetes orchestrated update works with changes
      on aio-sx
PASS: Verify changes are working on AIO-DX, with strategy created
      on controller-0 and applied on controller-1

Closes-Bug: #2053236

Change-Id: I816b09bb0cd767380e5093d4732d161e4cc8cb24
Signed-off-by: sshathee
---
 .../nfv_vim/directors/_host_director.py       | 22 ++++++++-
 .../nfv_vim/directors/_sw_mgmt_director.py    | 12 +++++
 .../nfv_vim/strategy/_strategy_defs.py        |  3 ++
 .../nfv_vim/strategy/_strategy_steps.py       | 51 ++++++++++++++++---
 4 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/nfv/nfv-vim/nfv_vim/directors/_host_director.py b/nfv/nfv-vim/nfv_vim/directors/_host_director.py
index ec0b11f6..f976784a 100755
--- a/nfv/nfv-vim/nfv_vim/directors/_host_director.py
+++ b/nfv/nfv-vim/nfv_vim/directors/_host_director.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2023 Wind River Systems, Inc.
+# Copyright (c) 2015-2024 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -27,6 +27,7 @@ class HostDirector(object):
     """
     Host Director
     """
+
     def __init__(self):
         self._host_operation = None
 
@@ -1280,6 +1281,25 @@ class HostDirector(object):
 
         return host_operation
 
+    @coroutine
+    def _nfvi_get_kube_host_upgrade_list_callback(self):
+        """
+        Get Kube Host Upgrade List Callback
+        """
+        from nfv_vim import directors
+
+        response = (yield)
+        DLOG.debug("Get kube host upgrade list callback response=%s." % response)
+        sw_mgmt_director = directors.get_sw_mgmt_director()
+        sw_mgmt_director.kube_host_upgrade_list(response)
+
+    def _nfvi_get_kube_host_upgrade_list(self):
+        """
+        NFVI Kube host upgrade list
+        """
+        nfvi.nfvi_get_kube_host_upgrade_list(
+            self._nfvi_get_kube_host_upgrade_list_callback())
+
 
 def get_host_director():
     """
diff --git a/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py b/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py
index f91af669..a6354033 100755
--- a/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py
+++ b/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py
@@ -450,6 +450,18 @@ class SwMgmtDirector(object):
             self._sw_update.handle_event(
                 strategy.STRATEGY_EVENT.MIGRATE_INSTANCES_FAILED, reason)
 
+    def kube_host_upgrade_list(self, event_data):
+        """
+        Kubernetes host upgrade list handle_event called
+        """
+        if event_data['completed']:
+            event = strategy.STRATEGY_EVENT.QUERY_KUBE_HOST_UPGRADE_COMPLETED
+        else:
+            event = strategy.STRATEGY_EVENT.QUERY_KUBE_HOST_UPGRADE_FAILED
+        if self._sw_update is not None:
+            self._sw_update.handle_event(
+                event, event_data)
+
 
 def get_sw_mgmt_director():
     """
diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py
index 221f30dd..6ccd5a1c 100755
--- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py
+++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py
@@ -37,6 +37,9 @@ class EventNames(object):
         Constant('kube-host-upgrade-kubelet-failed')
     KUBE_ROOTCA_UPDATE_HOST_FAILED = Constant('kube-rootca-update-host-failed')
     KUBE_UPGRADE_CHANGED = Constant('kube-upgrade-changed')
+    QUERY_KUBE_HOST_UPGRADE_FAILED = Constant('query-kube-host-upgrade-failed')
+    QUERY_KUBE_HOST_UPGRADE_COMPLETED = \
+        Constant('query-kube-host-upgrade-completed')
 
 
 # Constants
diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py
index a32d7e86..bbad2e9f 100755
--- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py
+++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2023 Wind River Systems, Inc.
+# Copyright (c) 2015-2024 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -3930,9 +3930,12 @@ class QueryKubeHostUpgradeStep(AbstractStrategyStep):
     """
     Query Kube Host Upgrade list
     """
-    def __init__(self):
+    MAX_RETRIES = 3
+
+    def __init__(self, retry_count=MAX_RETRIES):
         super(QueryKubeHostUpgradeStep, self).__init__(
-            STRATEGY_STEP_NAME.QUERY_KUBE_HOST_UPGRADE, timeout_in_secs=60)
+            STRATEGY_STEP_NAME.QUERY_KUBE_HOST_UPGRADE, timeout_in_secs=200)
+        self._retry_count = retry_count
 
     @coroutine
     def _get_kube_host_upgrade_list_callback(self):
@@ -3957,12 +3960,48 @@ class QueryKubeHostUpgradeStep(AbstractStrategyStep):
         """
         Query Kube Host Upgrade List
         """
-        from nfv_vim import nfvi
+        from nfv_vim import directors
+
         DLOG.info("Step (%s) apply." % self._name)
-        nfvi.nfvi_get_kube_host_upgrade_list(
-            self._get_kube_host_upgrade_list_callback())
+
+        host_director = directors.get_host_director()
+        host_director._nfvi_get_kube_host_upgrade_list()
         return strategy.STRATEGY_STEP_RESULT.WAIT, ""
 
+    def handle_event(self, event, event_data=None):
+        """
+        Handle Query Kube Host upgrade event
+        """
+        from nfv_vim import directors
+
+        if event == STRATEGY_EVENT.QUERY_KUBE_HOST_UPGRADE_FAILED:
+            if event_data is not None and self._retry_count > 0:
+                # if kube host upgrade list fails and we have retries,
+                # re-trigger the function
+                DLOG.info("Step (%s) retry due to failure for (%s)." % (self._name,
+                          str(event_data["reason"])))
+
+                self._retry_count = self._retry_count - 1
+                host_director = directors.get_host_director()
+                host_director._nfvi_get_kube_host_upgrade_list()
+            else:
+                # if kube host upgrade list fails and we are out of retries, fail
+                # (event_data may be None here, so guard the reason lookup)
+                result = strategy.STRATEGY_STEP_RESULT.FAILED
+                reason = event_data['reason'] if event_data is not None else ""
+                self.stage.step_complete(result, reason)
+                return True
+
+        elif event == STRATEGY_EVENT.QUERY_KUBE_HOST_UPGRADE_COMPLETED:
+            if event_data is not None and self.strategy is not None:
+                self.strategy.nfvi_kube_host_upgrade_list = \
+                    event_data['result-data']
+
+            result = strategy.STRATEGY_STEP_RESULT.SUCCESS
+            self.stage.step_complete(result, "")
+
+        return False
+
 
 
 class AbstractKubeUpgradeStep(AbstractStrategyStep):