From f1f7fe329226f9a3a98548486fea7b090be697b0 Mon Sep 17 00:00:00 2001 From: Al Bailey Date: Thu, 1 Jun 2023 17:12:24 +0000 Subject: [PATCH] Audit kube upgrade changes more frequently The kube upgrade orchestrator used built-in host-audit events (emitted at a thirty-second interval) to determine when a kube-upgrade or kube-host-upgrade query should be invoked. This meant that kube-upgrade steps would typically take at least two of those intervals to detect a relatively quick transition. Now the kube-upgrade audit will be responsible for the kube upgrade and kube host upgrade queries, and will run at its own interval (5 seconds) to allow the steps to more rapidly detect completion. AIO-SX kube upgrade is two to three minutes faster. Test Plan: PASS: AIO-SX kube upgrade Story: 2010565 Task: 48173 Signed-off-by: Al Bailey Change-Id: Ib4878322d0846b8df935f643352b028ff11fc184 --- nfv/nfv-vim/nfv_vim/objects/_kube_upgrade.py | 93 +++++++++++++++---- .../nfv_vim/strategy/_strategy_defs.py | 2 + .../nfv_vim/strategy/_strategy_steps.py | 30 +++--- 3 files changed, 95 insertions(+), 30 deletions(-) diff --git a/nfv/nfv-vim/nfv_vim/objects/_kube_upgrade.py b/nfv/nfv-vim/nfv_vim/objects/_kube_upgrade.py index 61d2d3af..953222bd 100644 --- a/nfv/nfv-vim/nfv_vim/objects/_kube_upgrade.py +++ b/nfv/nfv-vim/nfv_vim/objects/_kube_upgrade.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2021 Wind River Systems, Inc. +# Copyright (c) 2020-2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -18,6 +18,7 @@ from nfv_vim.objects._sw_update import SW_UPDATE_TYPE from nfv_vim.objects._sw_update import SwUpdate DLOG = debug.debug_get_logger('nfv_vim.objects.kube_upgrade') +DEFAULT_KUBE_AUDIT_RATE = 5 class KubeUpgrade(SwUpdate): @@ -29,8 +30,8 @@ class KubeUpgrade(SwUpdate): sw_update_type=SW_UPDATE_TYPE.KUBE_UPGRADE, sw_update_uuid=sw_update_uuid, strategy_data=strategy_data) - # TODO(abailey): we do not appear to populate _kube_upgrade_hosts - # consider removing + # these next two values are used by the audit + self._kube_upgrade = None self._kube_upgrade_hosts = list() def strategy_build(self, @@ -140,18 +141,10 @@ class KubeUpgrade(SwUpdate): elif (self.strategy.is_apply_failed() or self.strategy.is_apply_timed_out()): - for kube_upgrade_host in self._kube_upgrade_hosts: - if not self._alarms: - self._alarms = alarm.raise_sw_update_alarm( - self.alarm_type(SW_UPDATE_ALARM_TYPES.APPLY_FAILED)) - event_log.sw_update_issue_log( - self.event_id(SW_UPDATE_EVENT_IDS.APPLY_FAILED)) - break - - else: - if self._alarms: - alarm.clear_sw_update_alarm(self._alarms) - return False + # we do not raise additional alarms + if self._alarms: + alarm.clear_sw_update_alarm(self._alarms) + return False elif self.strategy.is_aborting(): if not self._alarms: @@ -167,6 +160,48 @@ class KubeUpgrade(SwUpdate): return True + @coroutine + def nfvi_kube_upgrade_callback(self, timer_id): + """ + Audit Kube Upgrade Callback + """ + from nfv_vim import strategy + response = (yield) + + if response['completed']: + DLOG.debug("Audit-Kube-Upgrade callback, response=%s." 
% response) + last_state = self._kube_upgrade.state if self._kube_upgrade else None + self._kube_upgrade = response['result-data'] + current_state = self._kube_upgrade.state if self._kube_upgrade else None + if last_state != current_state: + self.handle_event(strategy.STRATEGY_EVENT.KUBE_UPGRADE_CHANGED, + self._kube_upgrade) + else: + DLOG.error("Audit-Kube-Upgrade callback, not completed, " + "response=%s." % response) + + self._nfvi_audit_inprogress = False + + @coroutine + def nfvi_kube_host_upgrade_list_callback(self, timer_id): + """ + Audit Kube Host Upgrade Callback + """ + from nfv_vim import strategy + response = (yield) + + if response['completed']: + DLOG.debug("Audit-Kube-Host-Upgrade callback, response=%s." % response) + self._kube_upgrade_hosts = response['result-data'] + # todo(abailey): this needs to detect the change + self.handle_event(strategy.STRATEGY_EVENT.KUBE_HOST_UPGRADE_CHANGED, + self._kube_upgrade) + else: + DLOG.error("Audit-Kube-Host-Upgrade callback, not completed, " + "response=%s." % response) + + self._nfvi_audit_inprogress = False + @coroutine def nfvi_audit(self): """ @@ -184,15 +219,35 @@ class KubeUpgrade(SwUpdate): self._nfvi_audit_inprogress = True while self._nfvi_audit_inprogress: timer_id = (yield) + # nfvi_alarms_callback sets timer to 2 seconds + # leave timer at 2 seconds for the next two audit calls - # nfvi_alarms_callback sets timer to 2 seconds. reset back to 30 - timers.timers_reschedule_timer(timer_id, 30) + DLOG.debug("Audit kube upgrade, timer_id=%s." % timer_id) + nfvi.nfvi_get_kube_upgrade( + self.nfvi_kube_upgrade_callback(timer_id)) + self._nfvi_audit_inprogress = True + while self._nfvi_audit_inprogress: + timer_id = (yield) + current_state = self._kube_upgrade.state if self._kube_upgrade else None + # only audit the kube hosts when upgrading kubelets + if current_state in ["upgrading-kubelets", + "upgraded-kubelets"]: + DLOG.debug("Audit kube upgrade hosts, timer_id=%s." 
% timer_id) + nfvi.nfvi_get_kube_host_upgrade_list( + self.nfvi_kube_host_upgrade_list_callback(timer_id)) + + self._nfvi_audit_inprogress = True + while self._nfvi_audit_inprogress: + timer_id = (yield) + + # set timer to DEFAULT_KUBE_AUDIT_RATE + timers.timers_reschedule_timer(timer_id, DEFAULT_KUBE_AUDIT_RATE) if not self.nfvi_update(): DLOG.info("Audit no longer needed.") break - DLOG.verbose("Audit kube upgrade still running, timer_id=%s." % - timer_id) + DLOG.debug("Audit kube upgrade still running, timer_id=%s." % + timer_id) self._nfvi_timer_id = None diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py index e146fd39..221f30dd 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py @@ -30,11 +30,13 @@ class EventNames(object): MIGRATE_INSTANCES_FAILED = Constant('migrate-instances-failed') KUBE_HOST_CORDON_FAILED = Constant('kube-host-cordon-failed') KUBE_HOST_UNCORDON_FAILED = Constant('kube-host-uncordon-failed') + KUBE_HOST_UPGRADE_CHANGED = Constant('kube-host-upgrade-changed') KUBE_HOST_UPGRADE_CONTROL_PLANE_FAILED = \ Constant('kube-host-upgrade-control-plane-failed') KUBE_HOST_UPGRADE_KUBELET_FAILED = \ Constant('kube-host-upgrade-kubelet-failed') KUBE_ROOTCA_UPDATE_HOST_FAILED = Constant('kube-rootca-update-host-failed') + KUBE_UPGRADE_CHANGED = Constant('kube-upgrade-changed') # Constants diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py index d481d252..e1b12de1 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py @@ -3882,14 +3882,18 @@ class AbstractKubeUpgradeStep(AbstractStrategyStep): from nfv_vim import nfvi DLOG.debug("Step (%s) handle event (%s)." 
% (self._name, event)) - if event == STRATEGY_EVENT.HOST_AUDIT: + if event == STRATEGY_EVENT.KUBE_UPGRADE_CHANGED: + # todo(abailey): use event data rather than re-querying + self._query_inprogress = True + nfvi.nfvi_get_kube_upgrade(self._get_kube_upgrade_callback()) + return True + elif event == STRATEGY_EVENT.HOST_AUDIT: if 0 == self._wait_time: self._wait_time = timers.get_monotonic_timestamp_in_ms() - now_ms = timers.get_monotonic_timestamp_in_ms() secs_expired = (now_ms - self._wait_time) // 1000 - # Wait at least 60 seconds before checking upgrade for first time - if 60 <= secs_expired and not self._query_inprogress: + # Wait 30 seconds before checking kube upgrade for first time + if 30 <= secs_expired and not self._query_inprogress: self._query_inprogress = True nfvi.nfvi_get_kube_upgrade(self._get_kube_upgrade_callback()) return True @@ -4379,8 +4383,8 @@ class KubeHostUpgradeControlPlaneStep(AbstractKubeHostUpgradeStep): from nfv_vim import directors - DLOG.info("Step (%s) apply to hostnames (%s)." - % (self._name, self._host_names)) + DLOG.debug("Step (%s) apply to hostnames (%s)." 
+ % (self._name, self._host_names)) host_director = directors.get_host_director() operation = \ host_director.kube_upgrade_hosts_control_plane(self._host_names, @@ -4456,14 +4460,19 @@ class KubeHostUpgradeKubeletStep(AbstractKubeHostListUpgradeStep): result, "kube host upgrade kubelet (%s) failed" % host.name) return True + elif event == STRATEGY_EVENT.KUBE_HOST_UPGRADE_CHANGED: + self._query_inprogress = True + nfvi.nfvi_get_kube_host_upgrade_list( + self._get_kube_host_upgrade_list_callback()) + return True elif event == STRATEGY_EVENT.HOST_AUDIT: if 0 == self._wait_time: self._wait_time = timers.get_monotonic_timestamp_in_ms() now_ms = timers.get_monotonic_timestamp_in_ms() secs_expired = (now_ms - self._wait_time) // 1000 - # Wait at least 60 seconds before checking upgrade for first time - if 60 <= secs_expired and not self._query_inprogress: + # Wait at least 30 seconds before checking kube hosts for first time + if 30 <= secs_expired and not self._query_inprogress: self._query_inprogress = True nfvi.nfvi_get_kube_host_upgrade_list( self._get_kube_host_upgrade_list_callback()) @@ -4475,13 +4484,12 @@ class KubeHostUpgradeKubeletStep(AbstractKubeHostListUpgradeStep): from nfv_vim import directors - DLOG.info("Step (%s) apply to hostnames (%s)." - % (self._name, self._host_names)) + DLOG.debug("Step (%s) apply to hostnames (%s)." + % (self._name, self._host_names)) host_director = directors.get_host_director() operation = \ host_director.kube_upgrade_hosts_kubelet(self._host_names, self._force) - if operation.is_inprogress(): return strategy.STRATEGY_STEP_RESULT.WAIT, "" elif operation.is_failed():