Audit kube upgrade changes more frequently

The kube upgrade orchestrator used the built-in host-audit events
(emitted at a thirty-second interval) to determine when a
kube-upgrade or kube-host-upgrade query should be invoked.

This meant that kube-upgrade steps would typically take at
least two of those intervals to detect a relatively quick
transition.

Now the kube-upgrade audit will be responsible for the
kube upgrade and kube host upgrade queries, and will run
at its own interval (5 seconds) to allow the steps to more
rapidly detect completion.

AIO-SX kube upgrade is two to three minutes faster.

Test Plan:
  PASS: AIO-SX kube upgrade

Story: 2010565
Task: 48173
Signed-off-by: Al Bailey <al.bailey@windriver.com>
Change-Id: Ib4878322d0846b8df935f643352b028ff11fc184
This commit is contained in:
Al Bailey 2023-06-01 17:12:24 +00:00
parent c908b9625d
commit f1f7fe3292
3 changed files with 95 additions and 30 deletions

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
# Copyright (c) 2020-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -18,6 +18,7 @@ from nfv_vim.objects._sw_update import SW_UPDATE_TYPE
from nfv_vim.objects._sw_update import SwUpdate
DLOG = debug.debug_get_logger('nfv_vim.objects.kube_upgrade')
DEFAULT_KUBE_AUDIT_RATE = 5
class KubeUpgrade(SwUpdate):
@ -29,8 +30,8 @@ class KubeUpgrade(SwUpdate):
sw_update_type=SW_UPDATE_TYPE.KUBE_UPGRADE,
sw_update_uuid=sw_update_uuid,
strategy_data=strategy_data)
# TODO(abailey): we do not appear to populate _kube_upgrade_hosts
# consider removing
# these next two values are used by the audit
self._kube_upgrade = None
self._kube_upgrade_hosts = list()
def strategy_build(self,
@ -140,18 +141,10 @@ class KubeUpgrade(SwUpdate):
elif (self.strategy.is_apply_failed() or
self.strategy.is_apply_timed_out()):
for kube_upgrade_host in self._kube_upgrade_hosts:
if not self._alarms:
self._alarms = alarm.raise_sw_update_alarm(
self.alarm_type(SW_UPDATE_ALARM_TYPES.APPLY_FAILED))
event_log.sw_update_issue_log(
self.event_id(SW_UPDATE_EVENT_IDS.APPLY_FAILED))
break
else:
if self._alarms:
alarm.clear_sw_update_alarm(self._alarms)
return False
# we do not raise additional alarms
if self._alarms:
alarm.clear_sw_update_alarm(self._alarms)
return False
elif self.strategy.is_aborting():
if not self._alarms:
@ -167,6 +160,48 @@ class KubeUpgrade(SwUpdate):
return True
@coroutine
def nfvi_kube_upgrade_callback(self, timer_id):
    """
    Audit Kube Upgrade Callback

    Receives the response of the kube upgrade query. When the query
    completed, the result is cached on this object and a
    KUBE_UPGRADE_CHANGED strategy event is raised if the upgrade state
    differs from the previously cached one. The audit in-progress flag
    is cleared in every case so the audit loop can continue.
    """
    from nfv_vim import strategy

    response = (yield)

    if not response['completed']:
        DLOG.error("Audit-Kube-Upgrade callback, not completed, "
                   "response=%s." % response)
    else:
        DLOG.debug("Audit-Kube-Upgrade callback, response=%s." % response)

        # remember the state seen by the previous audit (if any)
        previous = self._kube_upgrade
        state_before = previous.state if previous else None

        self._kube_upgrade = response['result-data']

        latest = self._kube_upgrade
        state_after = latest.state if latest else None

        # only notify the strategy when the state actually moved
        if state_before != state_after:
            self.handle_event(strategy.STRATEGY_EVENT.KUBE_UPGRADE_CHANGED,
                              self._kube_upgrade)

    self._nfvi_audit_inprogress = False
@coroutine
def nfvi_kube_host_upgrade_list_callback(self, timer_id):
    """
    Audit Kube Host Upgrade Callback

    Receives the response of the kube host upgrade list query. When the
    query completed, the result is cached on this object and a
    KUBE_HOST_UPGRADE_CHANGED strategy event is raised; otherwise the
    failure is logged. The audit in-progress flag is cleared in every
    case so the audit loop can continue.

    :param timer_id: timer id supplied by the audit when issuing the
        query (not used by this callback)
    """
    from nfv_vim import strategy

    response = (yield)

    if response['completed']:
        DLOG.debug("Audit-Kube-Host-Upgrade callback, response=%s." % response)
        self._kube_upgrade_hosts = response['result-data']
        # TODO(abailey): this needs to detect the change
        # NOTE(review): the event data passed here is the cached kube
        # upgrade, not the host upgrade list just received - confirm
        # this is intended.
        self.handle_event(strategy.STRATEGY_EVENT.KUBE_HOST_UPGRADE_CHANGED,
                          self._kube_upgrade)
    else:
        # name this callback explicitly so the failure is not confused
        # with a failure of the kube upgrade query
        DLOG.error("Audit-Kube-Host-Upgrade callback, not completed, "
                   "response=%s." % response)

    self._nfvi_audit_inprogress = False
@coroutine
def nfvi_audit(self):
"""
@ -184,15 +219,35 @@ class KubeUpgrade(SwUpdate):
self._nfvi_audit_inprogress = True
while self._nfvi_audit_inprogress:
timer_id = (yield)
# nfvi_alarms_callback sets timer to 2 seconds
# leave timer at 2 seconds for the next two audit calls
# nfvi_alarms_callback sets timer to 2 seconds. reset back to 30
timers.timers_reschedule_timer(timer_id, 30)
DLOG.debug("Audit kube upgrade, timer_id=%s." % timer_id)
nfvi.nfvi_get_kube_upgrade(
self.nfvi_kube_upgrade_callback(timer_id))
self._nfvi_audit_inprogress = True
while self._nfvi_audit_inprogress:
timer_id = (yield)
current_state = self._kube_upgrade.state if self._kube_upgrade else None
# only audit the kube hosts when upgrading kubelets
if current_state in ["upgrading-kubelets",
"upgraded-kubelets"]:
DLOG.debug("Audit kube upgrade hosts, timer_id=%s." % timer_id)
nfvi.nfvi_get_kube_host_upgrade_list(
self.nfvi_kube_host_upgrade_list_callback(timer_id))
self._nfvi_audit_inprogress = True
while self._nfvi_audit_inprogress:
timer_id = (yield)
# set timer to DEFAULT_KUBE_AUDIT_RATE
timers.timers_reschedule_timer(timer_id, DEFAULT_KUBE_AUDIT_RATE)
if not self.nfvi_update():
DLOG.info("Audit no longer needed.")
break
DLOG.verbose("Audit kube upgrade still running, timer_id=%s." %
timer_id)
DLOG.debug("Audit kube upgrade still running, timer_id=%s." %
timer_id)
self._nfvi_timer_id = None

View File

@ -30,11 +30,13 @@ class EventNames(object):
MIGRATE_INSTANCES_FAILED = Constant('migrate-instances-failed')
KUBE_HOST_CORDON_FAILED = Constant('kube-host-cordon-failed')
KUBE_HOST_UNCORDON_FAILED = Constant('kube-host-uncordon-failed')
KUBE_HOST_UPGRADE_CHANGED = Constant('kube-host-upgrade-changed')
KUBE_HOST_UPGRADE_CONTROL_PLANE_FAILED = \
Constant('kube-host-upgrade-control-plane-failed')
KUBE_HOST_UPGRADE_KUBELET_FAILED = \
Constant('kube-host-upgrade-kubelet-failed')
KUBE_ROOTCA_UPDATE_HOST_FAILED = Constant('kube-rootca-update-host-failed')
KUBE_UPGRADE_CHANGED = Constant('kube-upgrade-changed')
# Constants

View File

@ -3882,14 +3882,18 @@ class AbstractKubeUpgradeStep(AbstractStrategyStep):
from nfv_vim import nfvi
DLOG.debug("Step (%s) handle event (%s)." % (self._name, event))
if event == STRATEGY_EVENT.HOST_AUDIT:
if event == STRATEGY_EVENT.KUBE_UPGRADE_CHANGED:
# todo(abailey): use event data rather than re-querying
self._query_inprogress = True
nfvi.nfvi_get_kube_upgrade(self._get_kube_upgrade_callback())
return True
elif event == STRATEGY_EVENT.HOST_AUDIT:
if 0 == self._wait_time:
self._wait_time = timers.get_monotonic_timestamp_in_ms()
now_ms = timers.get_monotonic_timestamp_in_ms()
secs_expired = (now_ms - self._wait_time) // 1000
# Wait at least 60 seconds before checking upgrade for first time
if 60 <= secs_expired and not self._query_inprogress:
# Wait 30 seconds before checking kube upgrade for first time
if 30 <= secs_expired and not self._query_inprogress:
self._query_inprogress = True
nfvi.nfvi_get_kube_upgrade(self._get_kube_upgrade_callback())
return True
@ -4379,8 +4383,8 @@ class KubeHostUpgradeControlPlaneStep(AbstractKubeHostUpgradeStep):
from nfv_vim import directors
DLOG.info("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names))
DLOG.debug("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names))
host_director = directors.get_host_director()
operation = \
host_director.kube_upgrade_hosts_control_plane(self._host_names,
@ -4456,14 +4460,19 @@ class KubeHostUpgradeKubeletStep(AbstractKubeHostListUpgradeStep):
result,
"kube host upgrade kubelet (%s) failed" % host.name)
return True
elif event == STRATEGY_EVENT.KUBE_HOST_UPGRADE_CHANGED:
self._query_inprogress = True
nfvi.nfvi_get_kube_host_upgrade_list(
self._get_kube_host_upgrade_list_callback())
return True
elif event == STRATEGY_EVENT.HOST_AUDIT:
if 0 == self._wait_time:
self._wait_time = timers.get_monotonic_timestamp_in_ms()
now_ms = timers.get_monotonic_timestamp_in_ms()
secs_expired = (now_ms - self._wait_time) // 1000
# Wait at least 60 seconds before checking upgrade for first time
if 60 <= secs_expired and not self._query_inprogress:
# Wait at least 30 seconds before checking kube hosts for first time
if 30 <= secs_expired and not self._query_inprogress:
self._query_inprogress = True
nfvi.nfvi_get_kube_host_upgrade_list(
self._get_kube_host_upgrade_list_callback())
@ -4475,13 +4484,12 @@ class KubeHostUpgradeKubeletStep(AbstractKubeHostListUpgradeStep):
from nfv_vim import directors
DLOG.info("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names))
DLOG.debug("Step (%s) apply to hostnames (%s)."
% (self._name, self._host_names))
host_director = directors.get_host_director()
operation = \
host_director.kube_upgrade_hosts_kubelet(self._host_names,
self._force)
if operation.is_inprogress():
return strategy.STRATEGY_STEP_RESULT.WAIT, ""
elif operation.is_failed():