From a4280ebf598e8303f4674ab2f4431bb678bc30f4 Mon Sep 17 00:00:00 2001 From: Al Bailey Date: Mon, 3 Apr 2023 12:57:53 +0000 Subject: [PATCH] Add host cordon steps to kube upgrade orch When updating the control plane and kubelets the host needs to be cordoned to prevent it from doing kubernetes work during that time period: system kube-host-cordon < update control plane > < update kubelet> system kube-host-uncordon Currently only supported for simplex. Depends-On: https://review.opendev.org/c/starlingx/config/+/880333 Test Plan: PASS: AIO-SX single kube upgrade (1.24 -> 1.25) PASS: Resume AIO-SX single kube upgrade after cordon started. PEND: AIO-SX multi-kube upgrade PEND: AIO-DX kube upgrade Story: 2010565 Task: 47772 Signed-off-by: Al Bailey Change-Id: I54262d4ff31a2da005fffb6d30bb6872ee52f6d4 --- .../nfvi_plugins/nfvi_infrastructure_api.py | 116 ++++++++++- .../nfvi_plugins/openstack/sysinv.py | 35 +++- .../tests/test_kube_upgrade_strategy.py | 56 ++++++ .../nfv_vim/directors/_host_director.py | 190 ++++++++++++++++-- .../nfv_vim/directors/_sw_mgmt_director.py | 18 +- nfv/nfv-vim/nfv_vim/nfvi/__init__.py | 4 +- .../nfvi/_nfvi_infrastructure_module.py | 28 ++- .../nfv_vim/nfvi/objects/v1/_kube_upgrade.py | 8 +- nfv/nfv-vim/nfv_vim/strategy/__init__.py | 4 +- nfv/nfv-vim/nfv_vim/strategy/_strategy.py | 98 ++++++++- .../nfv_vim/strategy/_strategy_defs.py | 4 +- .../nfv_vim/strategy/_strategy_stages.py | 4 +- .../nfv_vim/strategy/_strategy_steps.py | 114 ++++++++++- 13 files changed, 629 insertions(+), 50 deletions(-) diff --git a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py index eaa4f6cb..657782cd 100755 --- a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py +++ b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -3183,6 +3183,120 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI): callback.send(response) callback.close() + def kube_host_cordon(self, future, host_uuid, host_name, force, callback): + """ + Cordon a host + """ + + # ignoring the force argument for now + response = dict() + response['completed'] = False + response['host_uuid'] = host_uuid + response['host_name'] = host_name + response['reason'] = '' + + action_type = 'kube-host-cordon' + sysinv_method = sysinv.kube_host_cordon + try: + future.set_timeouts(config.CONF.get('nfvi-timeouts', None)) + + if self._platform_token is None or \ + self._platform_token.is_expired(): + future.work(openstack.get_token, self._platform_directory) + future.result = (yield) + + if not future.result.is_complete() or \ + future.result.data is None: + DLOG.error("OpenStack get-token did not complete, " + "host_uuid=%s." % host_uuid) + return + + self._platform_token = future.result.data + + # cordon wants a hostname and not a host_uuid + future.work(sysinv_method, self._platform_token, host_name, force) + future.result = (yield) + + if not future.result.is_complete(): + return + + response['completed'] = True + + except exceptions.OpenStackRestAPIException as e: + if httplib.UNAUTHORIZED == e.http_status_code: + response['error-code'] = nfvi.NFVI_ERROR_CODE.TOKEN_EXPIRED + if self._platform_token is not None: + self._platform_token.set_expired() + + else: + DLOG.exception("Caught exception while trying to %s " + "a host %s, error=%s." % (action_type, host_name, e)) + response['reason'] = e.http_response_reason + + except Exception as e: + DLOG.exception("Caught exception while trying to %s a " + "host %s, error=%s." 
% (action_type, host_name, e)) + + finally: + callback.send(response) + callback.close() + + def kube_host_uncordon(self, future, host_uuid, host_name, force, callback): + """ + Uncordon a host + """ + response = dict() + response['completed'] = False + response['host_uuid'] = host_uuid + response['host_name'] = host_name + response['reason'] = '' + + action_type = 'kube-host-uncordon' + sysinv_method = sysinv.kube_host_uncordon + try: + future.set_timeouts(config.CONF.get('nfvi-timeouts', None)) + + if self._platform_token is None or \ + self._platform_token.is_expired(): + future.work(openstack.get_token, self._platform_directory) + future.result = (yield) + + if not future.result.is_complete() or \ + future.result.data is None: + DLOG.error("OpenStack get-token did not complete, " + "host_uuid=%s." % host_uuid) + return + + self._platform_token = future.result.data + + # uncordon wants a hostname and not a host_uuid + future.work(sysinv_method, self._platform_token, host_name, force) + future.result = (yield) + + if not future.result.is_complete(): + return + + response['completed'] = True + + except exceptions.OpenStackRestAPIException as e: + if httplib.UNAUTHORIZED == e.http_status_code: + response['error-code'] = nfvi.NFVI_ERROR_CODE.TOKEN_EXPIRED + if self._platform_token is not None: + self._platform_token.set_expired() + + else: + DLOG.exception("Caught exception while trying to %s " + "a host %s, error=%s." % (action_type, host_name, e)) + response['reason'] = e.http_response_reason + + except Exception as e: + DLOG.exception("Caught exception while trying to %s a " + "host %s, error=%s." 
% (action_type, host_name, e)) + + finally: + callback.send(response) + callback.close() + def lock_host(self, future, host_uuid, host_name, callback): """ Lock a host diff --git a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/openstack/sysinv.py b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/openstack/sysinv.py index 13717826..bb6a2599 100755 --- a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/openstack/sysinv.py +++ b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/openstack/sysinv.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -383,7 +383,12 @@ def kube_upgrade_start(token, to_version, force=False, alarm_ignore_list=None): return response -def _patch_kube_upgrade_state(token, new_value): +def api_data_patch(path, value, op="replace"): + # the 'path' is prefixed with a leading '/' + return {'path': '/' + path, 'value': value, 'op': op} + + +def _patch_kube_upgrade_state(token, new_value, hostname=None): url = token.get_service_url(PLATFORM_SERVICE.SYSINV) if url is None: raise ValueError("OpenStack SysInv URL is invalid") @@ -395,11 +400,10 @@ def _patch_kube_upgrade_state(token, new_value): api_cmd_headers['User-Agent'] = "vim/1.0" api_cmd_payload = list() - host_data = dict() - host_data['path'] = "/state" - host_data['value'] = new_value - host_data['op'] = "replace" - api_cmd_payload.append(host_data) + api_cmd_payload.append(api_data_patch('state', new_value)) + # some kube upgrade patch commands take a hostname + if hostname is not None: + api_cmd_payload.append(api_data_patch('hostname', hostname)) return rest_api_request(token, "PATCH", @@ -449,6 +453,23 @@ def kube_upgrade_networking(token): return _patch_kube_upgrade_state(token, "upgrading-networking") +def kube_host_cordon(token, hostname, force): + """ + system kube-host-cordon + force is a 'string' but is currently unused + """ + # cordon needs a 'hostname' + return _patch_kube_upgrade_state(token, 
"cordon-started", hostname=hostname) + + +def kube_host_uncordon(token, hostname, force): + """ + system kube-host-uncordon + force is a 'string' but is currently unused + """ + return _patch_kube_upgrade_state(token, "uncordon-started", hostname=hostname) + + def _kube_host_upgrade(token, host_uuid, target_operation, force): """ Invoke a POST for a host kube-upgrade operation diff --git a/nfv/nfv-tests/nfv_unit_tests/tests/test_kube_upgrade_strategy.py b/nfv/nfv-tests/nfv_unit_tests/tests/test_kube_upgrade_strategy.py index f9273ec4..3c9c560e 100755 --- a/nfv/nfv-tests/nfv_unit_tests/tests/test_kube_upgrade_strategy.py +++ b/nfv/nfv-tests/nfv_unit_tests/tests/test_kube_upgrade_strategy.py @@ -283,6 +283,28 @@ class ApplyStageMixin(object): ], } + def _kube_host_cordon_stage(self, ver="N/A"): + return { + 'name': 'kube-host-cordon', + 'total_steps': 1, + 'steps': [ + {'name': 'kube-host-cordon', + 'success_state': 'cordon-complete', + 'fail_state': 'cordon-failed'}, + ], + } + + def _kube_host_uncordon_stage(self, ver="N/A"): + return { + 'name': 'kube-host-uncordon', + 'total_steps': 1, + 'steps': [ + {'name': 'kube-host-uncordon', + 'success_state': 'uncordon-complete', + 'fail_state': 'uncordon-failed'}, + ], + } + def _kube_upgrade_first_control_plane_stage(self, ver): return { 'name': 'kube-upgrade-first-control-plane %s' % ver, @@ -462,15 +484,21 @@ class ApplyStageMixin(object): add_start=True, add_download=True, add_networking=True, + add_cordon=True, add_first_control_plane=True, add_second_control_plane=True, add_kubelets=True, + add_uncordon=True, add_complete=True, add_cleanup=True): """The order of the host_list determines the kubelets""" # We never add a second control plane on a simplex if self.is_simplex(): add_second_control_plane = False + # we do not support cordon and uncordon in duplex + if self.is_duplex(): + add_cordon = False + add_uncordon = False stages = [] if add_start: stages.append(self._kube_upgrade_start_stage()) @@ -478,6 +506,8 @@ 
class ApplyStageMixin(object): stages.append(self._kube_upgrade_download_images_stage()) if add_networking: stages.append(self._kube_upgrade_networking_stage()) + if add_cordon: + stages.append(self._kube_host_cordon_stage()) for ver in self.kube_versions: if add_first_control_plane: stages.append(self._kube_upgrade_first_control_plane_stage(ver)) @@ -489,6 +519,8 @@ class ApplyStageMixin(object): std_controller_list, aio_controller_list, worker_list)) + if add_uncordon: + stages.append(self._kube_host_uncordon_stage()) if add_complete: stages.append(self._kube_upgrade_complete_stage()) if add_cleanup: @@ -590,6 +622,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self._kube_upgrade_download_images_stage(), self._kube_upgrade_networking_stage(), ] + if self.is_simplex(): + stages.append(self._kube_host_cordon_stage()) for ver in self.kube_versions: stages.append(self._kube_upgrade_first_control_plane_stage(ver)) stages.extend(self._kube_upgrade_kubelet_stages( @@ -597,6 +631,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self.std_controller_list, self.aio_controller_list, self.worker_list)) + if self.is_simplex(): + stages.append(self._kube_host_uncordon_stage()) stages.extend([ self._kube_upgrade_complete_stage(), self._kube_upgrade_cleanup_stage(), @@ -616,6 +652,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, stages = [ self._kube_upgrade_networking_stage(), ] + if self.is_simplex(): + stages.append(self._kube_host_cordon_stage()) for ver in self.kube_versions: stages.append(self._kube_upgrade_first_control_plane_stage( ver)) @@ -624,6 +662,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self.std_controller_list, self.aio_controller_list, self.worker_list)) + if self.is_simplex(): + stages.append(self._kube_host_uncordon_stage()) stages.extend([ self._kube_upgrade_complete_stage(), self._kube_upgrade_cleanup_stage(), @@ -649,6 
+689,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self.std_controller_list, self.aio_controller_list, self.worker_list)) + if self.is_simplex(): + stages.append(self._kube_host_uncordon_stage()) stages.extend([ self._kube_upgrade_complete_stage(), self._kube_upgrade_cleanup_stage(), @@ -672,6 +714,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self.std_controller_list, self.aio_controller_list, self.worker_list)) + if self.is_simplex(): + stages.append(self._kube_host_uncordon_stage()) stages.extend([ self._kube_upgrade_complete_stage(), self._kube_upgrade_cleanup_stage(), @@ -691,6 +735,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, stages = [ self._kube_upgrade_networking_stage(), ] + if self.is_simplex(): + stages.append(self._kube_host_cordon_stage()) for ver in self.kube_versions: stages.append(self._kube_upgrade_first_control_plane_stage( ver)) @@ -699,6 +745,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self.std_controller_list, self.aio_controller_list, self.worker_list)) + if self.is_simplex(): + stages.append(self._kube_host_uncordon_stage()) stages.extend([ self._kube_upgrade_complete_stage(), self._kube_upgrade_cleanup_stage(), @@ -716,6 +764,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self.default_from_version, self.default_to_version) stages = [] + if self.is_simplex(): + stages.append(self._kube_host_cordon_stage()) for ver in self.kube_versions: stages.append(self._kube_upgrade_first_control_plane_stage( ver)) @@ -724,6 +774,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self.std_controller_list, self.aio_controller_list, self.worker_list)) + if self.is_simplex(): + stages.append(self._kube_host_uncordon_stage()) stages.extend([ self._kube_upgrade_complete_stage(), self._kube_upgrade_cleanup_stage(), @@ -749,6 +801,8 @@ class 
TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self.std_controller_list, self.aio_controller_list, self.worker_list)) + if self.is_simplex(): + stages.append(self._kube_host_uncordon_stage()) stages.extend([ self._kube_upgrade_complete_stage(), self._kube_upgrade_cleanup_stage(), @@ -774,6 +828,8 @@ class TestSimplexApplyStrategy(sw_update_testcase.SwUpdateStrategyTestCase, self.std_controller_list, self.aio_controller_list, self.worker_list)) + if self.is_simplex(): + stages.append(self._kube_host_uncordon_stage()) stages.extend([ self._kube_upgrade_complete_stage(), self._kube_upgrade_cleanup_stage(), diff --git a/nfv/nfv-vim/nfv_vim/directors/_host_director.py b/nfv/nfv-vim/nfv_vim/directors/_host_director.py index fda7799e..ec0b11f6 100755 --- a/nfv/nfv-vim/nfv_vim/directors/_host_director.py +++ b/nfv/nfv-vim/nfv_vim/directors/_host_director.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -50,7 +50,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.LOCK_HOSTS != self._host_operation.operation_type: @@ -89,7 +89,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.DISABLE_HOST_SERVICES != \ @@ -145,7 +145,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." 
% host.name) return if OPERATION_TYPE.ENABLE_HOST_SERVICES != \ @@ -206,7 +206,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.UNLOCK_HOSTS != self._host_operation.operation_type: @@ -244,7 +244,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.REBOOT_HOSTS != self._host_operation.operation_type: @@ -282,7 +282,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.UPGRADE_HOSTS != self._host_operation.operation_type: @@ -320,7 +320,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.SWACT_HOSTS != self._host_operation.operation_type: @@ -358,7 +358,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.FW_UPDATE_HOSTS != self._host_operation.operation_type: @@ -395,7 +395,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." 
% host.name) return if OPERATION_TYPE.FW_UPDATE_ABORT_HOSTS != self._host_operation.operation_type: @@ -515,11 +515,11 @@ class HostDirector(object): @staticmethod def host_audit(host): """ - Notifies the host director that a host audit is inprogress + Notifies the host director that a host audit is in progress """ from nfv_vim import directors - DLOG.verbose("Notify other directors that a host %s audit is inprogress." + DLOG.verbose("Notify other directors that a host %s audit is in progress." % host.name) instance_director = directors.get_instance_director() instance_director.host_audit(host) @@ -530,11 +530,11 @@ class HostDirector(object): @staticmethod def host_abort(host): """ - Notifies the host director that a host abort is inprogress + Notifies the host director that a host abort is in progress """ from nfv_vim import directors - DLOG.info("Notify other directors that a host %s abort is inprogress." + DLOG.info("Notify other directors that a host %s abort is in progress." % host.name) instance_director = directors.get_instance_director() instance_director.host_operation_cancel(host.name) @@ -824,7 +824,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.KUBE_UPGRADE_HOSTS != self._host_operation.operation_type: @@ -883,6 +883,162 @@ class HostDirector(object): return host_operation + # cordon + @coroutine + def _nfvi_kube_host_cordon_callback(self): + """ + NFVI Kube Host Cordon Callback + """ + from nfv_vim import directors + + response = (yield) + DLOG.verbose("NFVI Kube Host Cordon response=%s." % response) + if not response['completed']: + DLOG.info("Kube Host Upgrade Cordon failed. Host:%s, reason=%s." 
+ % (response['host_name'], response['reason'])) + + host_table = tables.tables_get_host_table() + host = host_table.get(response['host_name'], None) + if host is None: + DLOG.verbose("Host %s does not exist." % response['host_name']) + return + + if self._host_operation is None: + DLOG.verbose("No host %s operation in progress." % host.name) + return + + if OPERATION_TYPE.KUBE_UPGRADE_HOSTS != self._host_operation.operation_type: + DLOG.verbose("Unexpected host %s operation %s, ignoring." + % (host.name, self._host_operation.operation_type)) + return + + sw_mgmt_director = directors.get_sw_mgmt_director() + sw_mgmt_director.kube_host_cordon_failed(host) + + def _nfvi_kube_host_cordon(self, + host_uuid, + host_name, + force): + """ + NFVI Kube Host Cordon + """ + nfvi.nfvi_kube_host_cordon( + host_uuid, + host_name, + force, + self._nfvi_kube_host_cordon_callback()) + + def kube_host_cordon(self, host_names, force): + """ + Kube Host Cordon for multiple hosts + """ + DLOG.info("Kube Host Cordon for hosts: %s" % host_names) + + host_operation = \ + Operation(OPERATION_TYPE.KUBE_UPGRADE_HOSTS) + + if self._host_operation is not None: + DLOG.debug("Canceling previous host operation %s, before " + "continuing with host operation %s." + % (self._host_operation.operation_type, + host_operation.operation_type)) + self._host_operation = None + + host_table = tables.tables_get_host_table() + for host_name in host_names: + host = host_table.get(host_name, None) + if host is None: + reason = "Unknown host %s given." 
% host_name + DLOG.info(reason) + host_operation.set_failed(reason) + return host_operation + + host_operation.add_host(host.name, OPERATION_STATE.INPROGRESS) + self._nfvi_kube_host_cordon(host.uuid, + host.name, + force) + if host_operation.is_inprogress(): + self._host_operation = host_operation + return host_operation + + # uncordon + @coroutine + def _nfvi_kube_host_uncordon_callback(self): + """ + NFVI Kube Host Uncordon Callback + """ + from nfv_vim import directors + + response = (yield) + DLOG.verbose("NFVI Kube Host Uncordon response=%s." % response) + if not response['completed']: + DLOG.info("Kube Host Upgrade Uncordon failed. Host:%s, reason=%s." + % (response['host_name'], response['reason'])) + + host_table = tables.tables_get_host_table() + host = host_table.get(response['host_name'], None) + if host is None: + DLOG.verbose("Host %s does not exist." % response['host_name']) + return + + if self._host_operation is None: + DLOG.verbose("No host %s operation in progress." % host.name) + return + + if OPERATION_TYPE.KUBE_UPGRADE_HOSTS != self._host_operation.operation_type: + DLOG.verbose("Unexpected host %s operation %s, ignoring." + % (host.name, self._host_operation.operation_type)) + return + + sw_mgmt_director = directors.get_sw_mgmt_director() + sw_mgmt_director.kube_host_uncordon_failed(host) + + def _nfvi_kube_host_uncordon(self, + host_uuid, + host_name, + force): + """ + NFVI Kube Host Uncordon + """ + nfvi.nfvi_kube_host_uncordon( + host_uuid, + host_name, + force, + self._nfvi_kube_host_uncordon_callback()) + + def kube_host_uncordon(self, host_names, force): + """ + Kube Host Uncordon for multiple hosts + """ + DLOG.info("Kube Host Uncordon for hosts: %s" % host_names) + + host_operation = \ + Operation(OPERATION_TYPE.KUBE_UPGRADE_HOSTS) + + if self._host_operation is not None: + DLOG.debug("Canceling previous host operation %s, before " + "continuing with host operation %s." 
+ % (self._host_operation.operation_type, + host_operation.operation_type)) + self._host_operation = None + + host_table = tables.tables_get_host_table() + for host_name in host_names: + host = host_table.get(host_name, None) + if host is None: + reason = "Unknown host %s given." % host_name + DLOG.info(reason) + host_operation.set_failed(reason) + return host_operation + + host_operation.add_host(host.name, OPERATION_STATE.INPROGRESS) + self._nfvi_kube_host_uncordon(host.uuid, + host.name, + force) + if host_operation.is_inprogress(): + self._host_operation = host_operation + return host_operation + @coroutine def _nfvi_kube_host_upgrade_kubelet_callback(self): """ @@ -904,7 +1060,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.KUBE_UPGRADE_HOSTS != self._host_operation.operation_type: @@ -978,7 +1134,7 @@ class HostDirector(object): return if self._host_operation is None: - DLOG.verbose("No host %s operation inprogress." % host.name) + DLOG.verbose("No host %s operation in progress." % host.name) return if OPERATION_TYPE.KUBE_ROOTCA_UPDATE_HOSTS \ diff --git a/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py b/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py index 7d62b530..cba96732 100755 --- a/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py +++ b/nfv/nfv-vim/nfv_vim/directors/_sw_mgmt_director.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -281,6 +281,22 @@ class SwMgmtDirector(object): self._sw_update.handle_event( strategy.STRATEGY_EVENT.ENABLE_HOST_SERVICES_FAILED, host) + def kube_host_cordon_failed(self, host): + """ + Called when a kube host cordon fails + """ + if self._sw_update is not None: + self._sw_update.handle_event( + strategy.STRATEGY_EVENT.KUBE_HOST_CORDON_FAILED, host) + + def kube_host_uncordon_failed(self, host): + """ + Called when a kube host uncordon fails + """ + if self._sw_update is not None: + self._sw_update.handle_event( + strategy.STRATEGY_EVENT.KUBE_HOST_UNCORDON_FAILED, host) + def host_unlock_failed(self, host): """ Called when a unlock of a host failed diff --git a/nfv/nfv-vim/nfv_vim/nfvi/__init__.py b/nfv/nfv-vim/nfv_vim/nfvi/__init__.py index fca28abf..d64faec5 100755 --- a/nfv/nfv-vim/nfv_vim/nfvi/__init__.py +++ b/nfv/nfv-vim/nfv_vim/nfvi/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -112,6 +112,8 @@ from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_get_terminating_pods from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_get_upgrade # noqa: F401 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_host_device_image_update # noqa: F401 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_host_device_image_update_abort # noqa: F401 +from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_host_cordon # noqa: F401 +from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_host_uncordon # noqa: F401 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_host_upgrade_control_plane # noqa: F401 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_host_upgrade_kubelet # noqa: F401 from nfv_vim.nfvi._nfvi_infrastructure_module import nfvi_kube_rootca_update_abort # noqa: F401 diff --git a/nfv/nfv-vim/nfv_vim/nfvi/_nfvi_infrastructure_module.py b/nfv/nfv-vim/nfv_vim/nfvi/_nfvi_infrastructure_module.py index 718856a8..76f34561 100755 --- a/nfv/nfv-vim/nfv_vim/nfvi/_nfvi_infrastructure_module.py +++ b/nfv/nfv-vim/nfv_vim/nfvi/_nfvi_infrastructure_module.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -100,6 +100,32 @@ def nfvi_host_device_image_update_abort(host_uuid, host_name, callback): return cmd_id +def nfvi_kube_host_cordon(host_uuid, host_name, force, callback): + """ + Kube Host Upgrade Cordon + """ + cmd_id = _infrastructure_plugin.invoke_plugin( + 'kube_host_cordon', + host_uuid, + host_name, + force, + callback=callback) + return cmd_id + + +def nfvi_kube_host_uncordon(host_uuid, host_name, force, callback): + """ + Kube Host Upgrade Uncordon + """ + cmd_id = _infrastructure_plugin.invoke_plugin( + 'kube_host_uncordon', + host_uuid, + host_name, + force, + callback=callback) + return cmd_id + + def nfvi_kube_host_upgrade_control_plane(host_uuid, host_name, force, callback): """ Kube Host Upgrade Control Plane diff --git a/nfv/nfv-vim/nfv_vim/nfvi/objects/v1/_kube_upgrade.py b/nfv/nfv-vim/nfv_vim/nfvi/objects/v1/_kube_upgrade.py index d8f10b9d..d3d782ed 100755 --- a/nfv/nfv-vim/nfv_vim/nfvi/objects/v1/_kube_upgrade.py +++ b/nfv/nfv-vim/nfv_vim/nfvi/objects/v1/_kube_upgrade.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2016-2021 Wind River Systems, Inc. +# Copyright (c) 2016-2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -51,6 +51,12 @@ class KubeUpgradeState(Constants): KUBE_UPGRADED_SECOND_MASTER = Constant('upgraded-second-master') KUBE_UPGRADING_KUBELETS = Constant('upgrading-kubelets') KUBE_UPGRADE_COMPLETE = Constant('upgrade-complete') + KUBE_HOST_CORDON = Constant('cordon-started') + KUBE_HOST_CORDON_COMPLETE = Constant('cordon-complete') + KUBE_HOST_CORDON_FAILED = Constant('cordon-failed') + KUBE_HOST_UNCORDON = Constant('uncordon-started') + KUBE_HOST_UNCORDON_COMPLETE = Constant('uncordon-complete') + KUBE_HOST_UNCORDON_FAILED = Constant('uncordon-failed') # Kube Upgrade Constant Instantiation diff --git a/nfv/nfv-vim/nfv_vim/strategy/__init__.py b/nfv/nfv-vim/nfv_vim/strategy/__init__.py index 3b721982..13992997 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/__init__.py +++ b/nfv/nfv-vim/nfv_vim/strategy/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -16,6 +16,8 @@ from nfv_vim.strategy._strategy_steps import ApplySwPatchesStep # noqa: F401 from nfv_vim.strategy._strategy_steps import DisableHostServicesStep # noqa: F401 from nfv_vim.strategy._strategy_steps import FwUpdateAbortHostsStep # noqa: F401 from nfv_vim.strategy._strategy_steps import FwUpdateHostsStep # noqa: F401 +from nfv_vim.strategy._strategy_steps import KubeHostCordonStep # noqa: F401 +from nfv_vim.strategy._strategy_steps import KubeHostUncordonStep # noqa: F401 from nfv_vim.strategy._strategy_steps import KubeHostUpgradeControlPlaneStep # noqa: F401 from nfv_vim.strategy._strategy_steps import KubeHostUpgradeKubeletStep # noqa: F401 from nfv_vim.strategy._strategy_steps import KubeRootcaUpdateCompleteStep # noqa: F401 diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py index 34f64980..d4af102f 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py +++ 
b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py @@ -3097,13 +3097,16 @@ class KubeUpgradeStrategy(SwUpdateStrategy, # 'upgrade_from' value is a list of versions however the # list should only ever be a single entry so we get the first # value and allow an exception to be raised if the list is empty + # todo(abailey): if the list contains more than one entry the + # algorithm may not work, since it may not converge at the active version. ver = kube['upgrade_from'][0] + # go around the loop again... # We should NEVER get into an infinite loop, but if the kube-version entries # in sysinv are malformed, we do not want to spin forever loop_count += 1 - if loop_count > 100: + if loop_count > 10: raise Exception("Invalid kubernetes dependency chain detected") return kube_sequence @@ -3169,21 +3172,55 @@ class KubeUpgradeStrategy(SwUpdateStrategy, stage.add_step(strategy.KubeUpgradeNetworkingStep()) self.apply_phase.add_stage(stage) - # need to update control plane and kubelet per-version - self._add_kube_update_stages() + # Next stage after networking is cordon + self._add_kube_host_cordon_stage() - def _add_kube_update_stages(self): - # for a particular version, the order is: - # - first control plane - # - second control plane - # - kubelets + def _add_kube_host_cordon_stage(self): + """Add host cordon stage for a host""" + # simplex only from nfv_vim import nfvi from nfv_vim import strategy + + first_host = self.get_first_host() + second_host = self.get_second_host() + is_simplex = second_host is None + if is_simplex: + # todo(abailey): add rollback support to trigger uncordon + stage = strategy.StrategyStage( + strategy.STRATEGY_STAGE_NAME.KUBE_HOST_CORDON) + stage.add_step(strategy.KubeHostCordonStep( + first_host, + self._to_version, + False, # force + nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_HOST_CORDON_COMPLETE, + nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_HOST_CORDON_FAILED) + ) + self.apply_phase.add_stage(stage) + self._add_kube_update_stages() + + def 
_add_kube_update_stages(self): + """Stages for control plane, kubelet and cordon""" + # Algorithm + # ------------------------- + # Simplex: + # - loop over kube versions + # - control plane + # - kubelet + # ------------------------- + # Duplex: + # - loop over kube versions + # - first control plane + # - second control plane + # - kubelets + # ------------------------- + from nfv_vim import nfvi + from nfv_vim import strategy + first_host = self.get_first_host() second_host = self.get_second_host() ver_list = self._get_kube_version_steps(self._to_version, - self._nfvi_kube_versions_list) + self._nfvi_kube_versions_list) prev_state = None if self.nfvi_kube_upgrade is not None: @@ -3225,6 +3262,29 @@ class KubeUpgradeStrategy(SwUpdateStrategy, if self._state == strategy.STRATEGY_STATE.BUILD_FAILED: return + self._add_kube_host_uncordon_stage() + + def _add_kube_host_uncordon_stage(self): + """Add host uncordon stage for a host""" + # simplex only + + from nfv_vim import nfvi + from nfv_vim import strategy + + first_host = self.get_first_host() + second_host = self.get_second_host() + is_simplex = second_host is None + if is_simplex: + stage = strategy.StrategyStage( + strategy.STRATEGY_STAGE_NAME.KUBE_HOST_UNCORDON) + stage.add_step(strategy.KubeHostUncordonStep( + first_host, + self._to_version, + False, # force + nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_HOST_UNCORDON_COMPLETE, + nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_HOST_UNCORDON_FAILED) + ) + self.apply_phase.add_stage(stage) # after this loop is kube upgrade complete stage self._add_kube_upgrade_complete_stage() @@ -3458,8 +3518,16 @@ class KubeUpgradeStrategy(SwUpdateStrategy, nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_UPGRADING_NETWORKING_FAILED: self._add_kube_upgrade_networking_stage, - # After networking -> upgrade first control plane + # After networking -> cordon nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_UPGRADED_NETWORKING: + self._add_kube_host_cordon_stage, + + # If the state is cordon-failed, resume at 
cordon stage + nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_HOST_CORDON_FAILED: + self._add_kube_host_cordon_stage, + + # If the state is cordon-complete, resume at update stages + nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_HOST_CORDON_COMPLETE: self._add_kube_update_stages, # if upgrading first control plane failed, resume there @@ -3478,10 +3546,18 @@ class KubeUpgradeStrategy(SwUpdateStrategy, nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_UPGRADED_SECOND_MASTER: self._add_kube_update_stages, - # kubelets transition to 'complete' when they are done + # kubelets transition to 'uncordon' after they are done nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_UPGRADING_KUBELETS: self._add_kube_update_stages, + # If the state is uncordon-failed, resume at uncordon stage + nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_HOST_UNCORDON_FAILED: + self._add_kube_host_uncordon_stage, + + # If the state is uncordon-complete, resume at complete stage + nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_HOST_UNCORDON_COMPLETE: + self._add_kube_upgrade_complete_stage, + # upgrade is completed, delete the upgrade nfvi.objects.v1.KUBE_UPGRADE_STATE.KUBE_UPGRADE_COMPLETE: self._add_kube_upgrade_cleanup_stage, diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py index 1643ffc7..e146fd39 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_defs.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -28,6 +28,8 @@ class EventNames(object): DISABLE_HOST_SERVICES_FAILED = Constant('disable-host-services-failed') ENABLE_HOST_SERVICES_FAILED = Constant('enable-host-services-failed') MIGRATE_INSTANCES_FAILED = Constant('migrate-instances-failed') + KUBE_HOST_CORDON_FAILED = Constant('kube-host-cordon-failed') + KUBE_HOST_UNCORDON_FAILED = Constant('kube-host-uncordon-failed') KUBE_HOST_UPGRADE_CONTROL_PLANE_FAILED = \ Constant('kube-host-upgrade-control-plane-failed') KUBE_HOST_UPGRADE_KUBELET_FAILED = \ diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_stages.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_stages.py index ccc9311b..e4dbdb23 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_stages.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_stages.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -53,6 +53,8 @@ class StrategyStageNames(Constants): KUBE_ROOTCA_UPDATE_QUERY = Constant('kube-rootca-update-query') KUBE_ROOTCA_UPDATE_START = Constant('kube-rootca-update-start') # kube upgrade stages + KUBE_HOST_CORDON = Constant('kube-host-cordon') + KUBE_HOST_UNCORDON = Constant('kube-host-uncordon') KUBE_UPGRADE_QUERY = Constant('kube-upgrade-query') KUBE_UPGRADE_START = Constant('kube-upgrade-start') KUBE_UPGRADE_DOWNLOAD_IMAGES = Constant('kube-upgrade-download-images') diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py index bbaea035..2454e93e 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py @@ -74,6 +74,8 @@ class StrategyStepNames(Constants): QUERY_KUBE_HOST_UPGRADE = Constant('query-kube-host-upgrade') QUERY_KUBE_UPGRADE = Constant('query-kube-upgrade') QUERY_KUBE_VERSIONS = Constant('query-kube-versions') + KUBE_HOST_CORDON = Constant('kube-host-cordon') + 
KUBE_HOST_UNCORDON = Constant('kube-host-uncordon') KUBE_UPGRADE_START = Constant('kube-upgrade-start') KUBE_UPGRADE_CLEANUP = Constant('kube-upgrade-cleanup') KUBE_UPGRADE_COMPLETE = Constant('kube-upgrade-complete') @@ -88,6 +90,14 @@ class StrategyStepNames(Constants): STRATEGY_STEP_NAME = StrategyStepNames() +def validate_operation(operation): + if operation.is_inprogress(): + return strategy.STRATEGY_STEP_RESULT.WAIT, "" + elif operation.is_failed(): + return strategy.STRATEGY_STEP_RESULT.FAILED, operation.reason + return strategy.STRATEGY_STEP_RESULT.SUCCESS, "" + + class AbstractStrategyStep(strategy.StrategyStep): """An abstract base class for strategy steps""" @@ -4225,6 +4235,100 @@ class AbstractKubeHostListUpgradeStep(AbstractKubeUpgradeStep): return data +class KubeHostCordonStep(AbstractKubeHostUpgradeStep): + """Kube Host Cordon - Strategy Step""" + + def __init__(self, host, to_version, force, target_state, target_failure_state, + timeout_in_secs=600): + super(KubeHostCordonStep, self).__init__( + host, + to_version, + force, + STRATEGY_STEP_NAME.KUBE_HOST_CORDON, + target_state, + target_failure_state, + timeout_in_secs) + + def handle_event(self, event, event_data=None): + """ + Handle Host events - does not query kube host upgrade list but + instead queries kube host upgrade directly. + """ + DLOG.debug("Step (%s) handle event (%s)." % (self._name, event)) + + if event == STRATEGY_EVENT.KUBE_HOST_CORDON_FAILED: + host = event_data + if host is not None and host.name in self._host_names: + result = strategy.STRATEGY_STEP_RESULT.FAILED + self.stage.step_complete( + result, + "kube host cordon (%s) failed" % host.name) + return True + # return handle_event of parent class + return super(KubeHostCordonStep, self).handle_event( + event, event_data=event_data) + + def apply(self): + """Kube Host Cordon""" + + from nfv_vim import directors + + DLOG.info("Step (%s) apply to hostnames (%s)." 
+ % (self._name, self._host_names)) + host_director = directors.get_host_director() + operation = \ + host_director.kube_host_cordon(self._host_names, + self._force) + return validate_operation(operation) + + +class KubeHostUncordonStep(AbstractKubeHostUpgradeStep): + """Kube Host Uncordon - Strategy Step""" + + def __init__(self, host, to_version, force, target_state, target_failure_state, + timeout_in_secs=600): + super(KubeHostUncordonStep, self).__init__( + host, + to_version, + force, + STRATEGY_STEP_NAME.KUBE_HOST_UNCORDON, + target_state, + target_failure_state, + timeout_in_secs) + + def handle_event(self, event, event_data=None): + """ + Handle Host events - does not query kube host upgrade list but + instead queries kube host upgrade directly. + """ + DLOG.debug("Step (%s) handle event (%s)." % (self._name, event)) + + if event == STRATEGY_EVENT.KUBE_HOST_UNCORDON_FAILED: + host = event_data + if host is not None and host.name in self._host_names: + result = strategy.STRATEGY_STEP_RESULT.FAILED + self.stage.step_complete( + result, + "kube host uncordon (%s) failed" % host.name) + return True + # return handle_event of parent class + return super(KubeHostUncordonStep, self).handle_event( + event, event_data=event_data) + + def apply(self): + """Kube Host Uncordon""" + + from nfv_vim import directors + + DLOG.info("Step (%s) apply to hostnames (%s)." 
+ % (self._name, self._host_names)) + host_director = directors.get_host_director() + operation = \ + host_director.kube_host_uncordon(self._host_names, + self._force) + return validate_operation(operation) + + class KubeHostUpgradeControlPlaneStep(AbstractKubeHostUpgradeStep): """Kube Host Upgrade Control Plane - Strategy Step @@ -4272,13 +4376,7 @@ class KubeHostUpgradeControlPlaneStep(AbstractKubeHostUpgradeStep): operation = \ host_director.kube_upgrade_hosts_control_plane(self._host_names, self._force) - - if operation.is_inprogress(): - return strategy.STRATEGY_STEP_RESULT.WAIT, "" - elif operation.is_failed(): - return strategy.STRATEGY_STEP_RESULT.FAILED, operation.reason - - return strategy.STRATEGY_STEP_RESULT.SUCCESS, "" + return validate_operation(operation) class KubeHostUpgradeKubeletStep(AbstractKubeHostListUpgradeStep): @@ -4419,6 +4517,8 @@ def strategy_step_rebuild_from_dict(data): # # kube upgrade steps # + STRATEGY_STEP_NAME.KUBE_HOST_CORDON: KubeHostCordonStep, + STRATEGY_STEP_NAME.KUBE_HOST_UNCORDON: KubeHostUncordonStep, STRATEGY_STEP_NAME.KUBE_HOST_UPGRADE_CONTROL_PLANE: KubeHostUpgradeControlPlaneStep, STRATEGY_STEP_NAME.KUBE_HOST_UPGRADE_KUBELET: