Audit kube upgrade changes more frequently

The kube upgrade orchestrator used the built-in host-audit events
(emitted at a thirty-second interval) to determine when a
kube-upgrade or kube-host-upgrade query should be invoked.

This meant that kube-upgrade steps would typically take at
least two of those intervals to detect a relatively quick
transition.

Now the kube-upgrade audit will be responsible for the
kube upgrade and kube host upgrade queries, and will run
at its own interval (5 seconds) to allow the steps to more
rapidly detect completion.

AIO-SX kube upgrade is two to three minutes faster.

Test Plan:
  PASS: AIO-SX kube upgrade

Story: 2010565
Task: 48173
Signed-off-by: Al Bailey <al.bailey@windriver.com>
Change-Id: Ib4878322d0846b8df935f643352b028ff11fc184
This commit is contained in:
Al Bailey 2023-06-01 17:12:24 +00:00
parent c908b9625d
commit f1f7fe3292
3 changed files with 95 additions and 30 deletions

View File

@ -1,5 +1,5 @@
# #
# Copyright (c) 2020-2021 Wind River Systems, Inc. # Copyright (c) 2020-2023 Wind River Systems, Inc.
# #
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
@ -18,6 +18,7 @@ from nfv_vim.objects._sw_update import SW_UPDATE_TYPE
from nfv_vim.objects._sw_update import SwUpdate from nfv_vim.objects._sw_update import SwUpdate
DLOG = debug.debug_get_logger('nfv_vim.objects.kube_upgrade') DLOG = debug.debug_get_logger('nfv_vim.objects.kube_upgrade')
DEFAULT_KUBE_AUDIT_RATE = 5
class KubeUpgrade(SwUpdate): class KubeUpgrade(SwUpdate):
@ -29,8 +30,8 @@ class KubeUpgrade(SwUpdate):
sw_update_type=SW_UPDATE_TYPE.KUBE_UPGRADE, sw_update_type=SW_UPDATE_TYPE.KUBE_UPGRADE,
sw_update_uuid=sw_update_uuid, sw_update_uuid=sw_update_uuid,
strategy_data=strategy_data) strategy_data=strategy_data)
# TODO(abailey): we do not appear to populate _kube_upgrade_hosts # these next two values are used by the audit
# consider removing self._kube_upgrade = None
self._kube_upgrade_hosts = list() self._kube_upgrade_hosts = list()
def strategy_build(self, def strategy_build(self,
@ -140,18 +141,10 @@ class KubeUpgrade(SwUpdate):
elif (self.strategy.is_apply_failed() or elif (self.strategy.is_apply_failed() or
self.strategy.is_apply_timed_out()): self.strategy.is_apply_timed_out()):
for kube_upgrade_host in self._kube_upgrade_hosts: # we do not raise additional alarms
if not self._alarms: if self._alarms:
self._alarms = alarm.raise_sw_update_alarm( alarm.clear_sw_update_alarm(self._alarms)
self.alarm_type(SW_UPDATE_ALARM_TYPES.APPLY_FAILED)) return False
event_log.sw_update_issue_log(
self.event_id(SW_UPDATE_EVENT_IDS.APPLY_FAILED))
break
else:
if self._alarms:
alarm.clear_sw_update_alarm(self._alarms)
return False
elif self.strategy.is_aborting(): elif self.strategy.is_aborting():
if not self._alarms: if not self._alarms:
@ -167,6 +160,48 @@ class KubeUpgrade(SwUpdate):
return True return True
@coroutine
def nfvi_kube_upgrade_callback(self, timer_id):
    """
    Audit Kube Upgrade Callback

    Receives the response of the kube upgrade query issued by the audit.
    On a completed response the result is cached in self._kube_upgrade
    and, when its state differs from the previously cached state, a
    KUBE_UPGRADE_CHANGED strategy event is raised with the new object.
    In every case the audit in-progress flag is cleared afterwards.

    timer_id is accepted from the audit but is not used in this body.
    """
    from nfv_vim import strategy

    response = (yield)

    if not response['completed']:
        DLOG.error("Audit-Kube-Upgrade callback, not completed, "
                   "response=%s." % response)
    else:
        DLOG.debug("Audit-Kube-Upgrade callback, response=%s." % response)

        # remember the state seen by the previous audit (None if no
        # kube upgrade was cached) before overwriting the cached object
        if self._kube_upgrade:
            previous_state = self._kube_upgrade.state
        else:
            previous_state = None

        self._kube_upgrade = response['result-data']

        if self._kube_upgrade:
            new_state = self._kube_upgrade.state
        else:
            new_state = None

        # only notify the strategy on an actual state transition
        if previous_state != new_state:
            self.handle_event(strategy.STRATEGY_EVENT.KUBE_UPGRADE_CHANGED,
                              self._kube_upgrade)

    # let the audit loop stop waiting on this query
    self._nfvi_audit_inprogress = False
@coroutine
def nfvi_kube_host_upgrade_list_callback(self, timer_id):
    """
    Audit Kube Host Upgrade Callback

    Receives the response of the kube host upgrade list query issued by
    the audit. On a completed response the result is cached in
    self._kube_upgrade_hosts and a KUBE_HOST_UPGRADE_CHANGED strategy
    event is raised. In every case the audit in-progress flag is cleared
    afterwards.

    timer_id is accepted from the audit but is not used in this body.
    """
    from nfv_vim import strategy

    response = (yield)

    if response['completed']:
        DLOG.debug("Audit-Kube-Host-Upgrade callback, response=%s." % response)
        self._kube_upgrade_hosts = response['result-data']
        # todo(abailey): this needs to detect the change
        # NOTE(review): the event data passed here is the cached kube
        # upgrade, not the host list that was just stored - confirm this
        # is what the event handlers expect.
        self.handle_event(strategy.STRATEGY_EVENT.KUBE_HOST_UPGRADE_CHANGED,
                          self._kube_upgrade)
    else:
        # name the host-upgrade audit so failures of this query are not
        # mistaken for failures of the kube upgrade query
        DLOG.error("Audit-Kube-Host-Upgrade callback, not completed, "
                   "response=%s." % response)

    # let the audit loop stop waiting on this query
    self._nfvi_audit_inprogress = False
@coroutine @coroutine
def nfvi_audit(self): def nfvi_audit(self):
""" """
@ -184,15 +219,35 @@ class KubeUpgrade(SwUpdate):
self._nfvi_audit_inprogress = True self._nfvi_audit_inprogress = True
while self._nfvi_audit_inprogress: while self._nfvi_audit_inprogress:
timer_id = (yield) timer_id = (yield)
# nfvi_alarms_callback sets timer to 2 seconds
# leave timer at 2 seconds for the next two audit calls
# nfvi_alarms_callback sets timer to 2 seconds. reset back to 30 DLOG.debug("Audit kube upgrade, timer_id=%s." % timer_id)
timers.timers_reschedule_timer(timer_id, 30) nfvi.nfvi_get_kube_upgrade(
self.nfvi_kube_upgrade_callback(timer_id))
self._nfvi_audit_inprogress = True
while self._nfvi_audit_inprogress:
timer_id = (yield)
current_state = self._kube_upgrade.state if self._kube_upgrade else None
# only audit the kube hosts when upgrading kubelets
if current_state in ["upgrading-kubelets",
"upgraded-kubelets"]:
DLOG.debug("Audit kube upgrade hosts, timer_id=%s." % timer_id)
nfvi.nfvi_get_kube_host_upgrade_list(
self.nfvi_kube_host_upgrade_list_callback(timer_id))
self._nfvi_audit_inprogress = True
while self._nfvi_audit_inprogress:
timer_id = (yield)
# set timer to DEFAULT_KUBE_AUDIT_RATE
timers.timers_reschedule_timer(timer_id, DEFAULT_KUBE_AUDIT_RATE)
if not self.nfvi_update(): if not self.nfvi_update():
DLOG.info("Audit no longer needed.") DLOG.info("Audit no longer needed.")
break break
DLOG.verbose("Audit kube upgrade still running, timer_id=%s." % DLOG.debug("Audit kube upgrade still running, timer_id=%s." %
timer_id) timer_id)
self._nfvi_timer_id = None self._nfvi_timer_id = None

View File

@ -30,11 +30,13 @@ class EventNames(object):
MIGRATE_INSTANCES_FAILED = Constant('migrate-instances-failed') MIGRATE_INSTANCES_FAILED = Constant('migrate-instances-failed')
KUBE_HOST_CORDON_FAILED = Constant('kube-host-cordon-failed') KUBE_HOST_CORDON_FAILED = Constant('kube-host-cordon-failed')
KUBE_HOST_UNCORDON_FAILED = Constant('kube-host-uncordon-failed') KUBE_HOST_UNCORDON_FAILED = Constant('kube-host-uncordon-failed')
KUBE_HOST_UPGRADE_CHANGED = Constant('kube-host-upgrade-changed')
KUBE_HOST_UPGRADE_CONTROL_PLANE_FAILED = \ KUBE_HOST_UPGRADE_CONTROL_PLANE_FAILED = \
Constant('kube-host-upgrade-control-plane-failed') Constant('kube-host-upgrade-control-plane-failed')
KUBE_HOST_UPGRADE_KUBELET_FAILED = \ KUBE_HOST_UPGRADE_KUBELET_FAILED = \
Constant('kube-host-upgrade-kubelet-failed') Constant('kube-host-upgrade-kubelet-failed')
KUBE_ROOTCA_UPDATE_HOST_FAILED = Constant('kube-rootca-update-host-failed') KUBE_ROOTCA_UPDATE_HOST_FAILED = Constant('kube-rootca-update-host-failed')
KUBE_UPGRADE_CHANGED = Constant('kube-upgrade-changed')
# Constants # Constants

View File

@ -3882,14 +3882,18 @@ class AbstractKubeUpgradeStep(AbstractStrategyStep):
from nfv_vim import nfvi from nfv_vim import nfvi
DLOG.debug("Step (%s) handle event (%s)." % (self._name, event)) DLOG.debug("Step (%s) handle event (%s)." % (self._name, event))
if event == STRATEGY_EVENT.HOST_AUDIT: if event == STRATEGY_EVENT.KUBE_UPGRADE_CHANGED:
# todo(abailey): use event data rather than re-querying
self._query_inprogress = True
nfvi.nfvi_get_kube_upgrade(self._get_kube_upgrade_callback())
return True
elif event == STRATEGY_EVENT.HOST_AUDIT:
if 0 == self._wait_time: if 0 == self._wait_time:
self._wait_time = timers.get_monotonic_timestamp_in_ms() self._wait_time = timers.get_monotonic_timestamp_in_ms()
now_ms = timers.get_monotonic_timestamp_in_ms() now_ms = timers.get_monotonic_timestamp_in_ms()
secs_expired = (now_ms - self._wait_time) // 1000 secs_expired = (now_ms - self._wait_time) // 1000
# Wait at least 60 seconds before checking upgrade for first time # Wait 30 seconds before checking kube upgrade for first time
if 60 <= secs_expired and not self._query_inprogress: if 30 <= secs_expired and not self._query_inprogress:
self._query_inprogress = True self._query_inprogress = True
nfvi.nfvi_get_kube_upgrade(self._get_kube_upgrade_callback()) nfvi.nfvi_get_kube_upgrade(self._get_kube_upgrade_callback())
return True return True
@ -4379,8 +4383,8 @@ class KubeHostUpgradeControlPlaneStep(AbstractKubeHostUpgradeStep):
from nfv_vim import directors from nfv_vim import directors
DLOG.info("Step (%s) apply to hostnames (%s)." DLOG.debug("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names)) % (self._name, self._host_names))
host_director = directors.get_host_director() host_director = directors.get_host_director()
operation = \ operation = \
host_director.kube_upgrade_hosts_control_plane(self._host_names, host_director.kube_upgrade_hosts_control_plane(self._host_names,
@ -4456,14 +4460,19 @@ class KubeHostUpgradeKubeletStep(AbstractKubeHostListUpgradeStep):
result, result,
"kube host upgrade kubelet (%s) failed" % host.name) "kube host upgrade kubelet (%s) failed" % host.name)
return True return True
elif event == STRATEGY_EVENT.KUBE_HOST_UPGRADE_CHANGED:
self._query_inprogress = True
nfvi.nfvi_get_kube_host_upgrade_list(
self._get_kube_host_upgrade_list_callback())
return True
elif event == STRATEGY_EVENT.HOST_AUDIT: elif event == STRATEGY_EVENT.HOST_AUDIT:
if 0 == self._wait_time: if 0 == self._wait_time:
self._wait_time = timers.get_monotonic_timestamp_in_ms() self._wait_time = timers.get_monotonic_timestamp_in_ms()
now_ms = timers.get_monotonic_timestamp_in_ms() now_ms = timers.get_monotonic_timestamp_in_ms()
secs_expired = (now_ms - self._wait_time) // 1000 secs_expired = (now_ms - self._wait_time) // 1000
# Wait at least 60 seconds before checking upgrade for first time # Wait at least 30 seconds before checking kube hosts for first time
if 60 <= secs_expired and not self._query_inprogress: if 30 <= secs_expired and not self._query_inprogress:
self._query_inprogress = True self._query_inprogress = True
nfvi.nfvi_get_kube_host_upgrade_list( nfvi.nfvi_get_kube_host_upgrade_list(
self._get_kube_host_upgrade_list_callback()) self._get_kube_host_upgrade_list_callback())
@ -4475,13 +4484,12 @@ class KubeHostUpgradeKubeletStep(AbstractKubeHostListUpgradeStep):
from nfv_vim import directors from nfv_vim import directors
DLOG.info("Step (%s) apply to hostnames (%s)." DLOG.debug("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names)) % (self._name, self._host_names))
host_director = directors.get_host_director() host_director = directors.get_host_director()
operation = \ operation = \
host_director.kube_upgrade_hosts_kubelet(self._host_names, host_director.kube_upgrade_hosts_kubelet(self._host_names,
self._force) self._force)
if operation.is_inprogress(): if operation.is_inprogress():
return strategy.STRATEGY_STEP_RESULT.WAIT, "" return strategy.STRATEGY_STEP_RESULT.WAIT, ""
elif operation.is_failed(): elif operation.is_failed():