From 2785f64e54c425f103600e9a91559555cf748a03 Mon Sep 17 00:00:00 2001 From: Robert Church Date: Wed, 10 May 2023 10:15:08 -0500 Subject: [PATCH] AppFrmwk: Cleanup unique helm releases over update When updating an application some helm releases are unique to a specific application version. This requires that when an application successfully or unsuccessfully updates, specific helm releases must be removed by the framework as they will not be managed by the new (or old) version of the application that is being applied during update (or recovery). Changes include: - When helm releases are cleaned up via delete_helm_release() also remove the FluxCD HelmRelease custom resource so that the helm controller will not re-deploy the helm release. - Refactor calls to delete_helm_v3_release() to delete_helm_release() as helm v2 is no longer supported, so differentiation is irrelevant. - Refactor retrieve_helm_releases() by removing the wrapper function and renaming retrieve_helm_v3_releases() to retrieve_helm_releases(). - Refactor HelmTillerFailure exception to HelmFailure. Tiller is no longer present in the system as helm v3 is tillerless and the Armada pod containing the Tiller container is no longer supported. - Fix an issue where, when an application does not specify any images in any chart values.yaml, an exception is thrown when applying the application due to an empty dict being written to the application images file. Test Plan: PASS - Build, install, deploy AIO-SX PASS - Build custom platform-integ-apps without the ceph audit chart. Perform application update and confirm that the unique helm release from the previous application version is properly cleaned up. 
Closes-Bug: #2019138 Signed-off-by: Robert Church Change-Id: I3a14f8f6b990351f8415a3fe3ce0b9637672dbcb --- .../sysinv/sysinv/sysinv/common/exception.py | 2 +- .../sysinv/sysinv/conductor/kube_app.py | 52 ++++++++++++----- sysinv/sysinv/sysinv/sysinv/helm/utils.py | 56 +++++++------------ 3 files changed, 61 insertions(+), 49 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/common/exception.py b/sysinv/sysinv/sysinv/sysinv/common/exception.py index e731715ef4..64371546bc 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/exception.py +++ b/sysinv/sysinv/sysinv/sysinv/common/exception.py @@ -1537,7 +1537,7 @@ class KubeVersionUnavailable(NotFound): message = "Getting kubeadm and kubelet versions failed" -class HelmTillerFailure(SysinvException): +class HelmFailure(SysinvException): message = _("Helm operation failure: %(reason)s") diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py index 466e5ed6c2..b3984ecbb4 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py @@ -717,9 +717,10 @@ class AppOperator(object): if chart_images: images_by_charts.update({chart.name: chart_images}) - with open(app.sync_imgfile, 'w') as f: - yaml.safe_dump(images_by_charts, f, explicit_start=True, - default_flow_style=False) + if images_by_charts: + with open(app.sync_imgfile, 'w') as f: + yaml.safe_dump(images_by_charts, f, explicit_start=True, + default_flow_style=False) def _retrieve_images_list(self, app_images_file): with io.open(app_images_file, 'r', encoding='utf-8') as f: @@ -1737,11 +1738,21 @@ class AppOperator(object): old_app_charts = [c.release for c in old_app.charts] deployed_releases = helm_utils.retrieve_helm_releases() for new_chart in new_app.charts: + # Cleanup the releases in the new application version + # but are not in the old application version if (new_chart.release not in old_app_charts and new_chart.release in deployed_releases): - # 
Cleanup the releases in the new application version - # but are not in the old application version - helm_utils.delete_helm_v3_release(new_chart.release) + # Send delete request in FluxCD so it doesn't + # recreate the helm release + self._kube.delete_custom_resource( + constants.FLUXCD_CRD_HELM_REL_GROUP, + constants.FLUXCD_CRD_HELM_REL_VERSION, + new_chart.namespace, + constants.FLUXCD_CRD_HELM_REL_PLURAL, + new_chart.metadata_name) + # Use helm to immediately remove the release + helm_utils.delete_helm_release(new_chart.release, + new_chart.namespace) else: rc = False @@ -2810,13 +2821,24 @@ class AppOperator(object): to_app_charts = [c.release for c in to_app.charts] deployed_releases = helm_utils.retrieve_helm_releases() for from_chart in from_app.charts: + # Cleanup the releases in the old application version + # but are not in the new application version if (from_chart.release not in to_app_charts and from_chart.release in deployed_releases): - # Cleanup the releases in the old application version - # but are not in the new application version - helm_utils.delete_helm_v3_release(from_chart.release) + # Send delete request in FluxCD so it doesn't + # recreate the helm release + self._kube.delete_custom_resource( + constants.FLUXCD_CRD_HELM_REL_GROUP, + constants.FLUXCD_CRD_HELM_REL_VERSION, + from_chart.namespace, + constants.FLUXCD_CRD_HELM_REL_PLURAL, + from_chart.metadata_name) + # Use helm to immediately remove the release + helm_utils.delete_helm_release(from_chart.release, + from_chart.namespace) LOG.info("Helm release %s for Application %s (%s) deleted" - % (from_chart.release, from_app.name, from_app.version)) + % (from_chart.release, from_app.name, + from_app.version)) self._cleanup(from_app, app_dir=False) self._utils._patch_report_app_dependencies( @@ -2925,8 +2947,10 @@ class AppOperator(object): helm_release_status, _ = self._fluxcd.get_helm_release_status(helm_release_dict) if helm_release_status == self._fluxcd.HELM_RELEASE_STATUS_UNKNOWN: - 
LOG.info("Removing helm release which has an operation in progress: {} - {}".format(namespace, release)) - # Send delete request in FluxCD so it doesn't recreate the helm release + LOG.info("Removing helm release which has an operation in " + "progress: {} - {}".format(namespace, release)) + # Send delete request in FluxCD so it doesn't recreate the helm + # release self._kube.delete_custom_resource( constants.FLUXCD_CRD_HELM_REL_GROUP, constants.FLUXCD_CRD_HELM_REL_VERSION, @@ -2934,7 +2958,9 @@ class AppOperator(object): constants.FLUXCD_CRD_HELM_REL_PLURAL, release) # Remove resource in Helm - helm_utils.delete_helm_v3_release(helm_release_dict['spec']['releaseName'], namespace=namespace) + helm_utils.delete_helm_release( + helm_release_dict['spec']['releaseName'], + namespace=namespace) if self._make_app_request(app, constants.APP_REMOVE_OP): # After fluxcd delete, the data for the releases are purged from diff --git a/sysinv/sysinv/sysinv/sysinv/helm/utils.py b/sysinv/sysinv/sysinv/sysinv/helm/utils.py index 55ef9c1341..b72ba28489 100644 --- a/sysinv/sysinv/sysinv/sysinv/helm/utils.py +++ b/sysinv/sysinv/sysinv/sysinv/helm/utils.py @@ -69,14 +69,14 @@ def refresh_helm_repo_information(): rpcapi.refresh_helm_repo_information(context.get_admin_context()) -def _retry_on_HelmTillerFailure(ex): +def _retry_on_HelmFailure(ex): LOG.info('Caught exception retrieving helm releases. Retrying... 
Exception: {}'.format(ex)) - return isinstance(ex, exception.HelmTillerFailure) + return isinstance(ex, exception.HelmFailure) @retry(stop_max_attempt_number=6, wait_fixed=20 * 1000, - retry_on_exception=_retry_on_HelmTillerFailure) -def retrieve_helm_v3_releases(): + retry_on_exception=_retry_on_HelmFailure) +def retrieve_helm_releases(): helm_list = subprocess.Popen( ['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF, 'list', '--all-namespaces', '--output', 'yaml'], @@ -89,15 +89,15 @@ def retrieve_helm_v3_releases(): out, err = helm_list.communicate() if helm_list.returncode != 0: if err: - raise exception.HelmTillerFailure(reason=err) + raise exception.HelmFailure(reason=err) # killing the subprocesses with +kill() when timer expires returns EBADF # because the pipe is closed, but no error string on stderr. if helm_list.returncode == -9: - raise exception.HelmTillerFailure( + raise exception.HelmFailure( reason="helm list operation timed out after " "20 seconds. Terminated by threading timer.") - raise exception.HelmTillerFailure( + raise exception.HelmFailure( reason="helm list operation failed without error " "message, errno=%s" % helm_list.returncode) @@ -114,28 +114,14 @@ def retrieve_helm_v3_releases(): return deployed_releases except Exception as e: - raise exception.HelmTillerFailure( - reason="Failed to retrieve helmv3 releases: %s" % e) + raise exception.HelmFailure( + reason="Failed to retrieve helm releases: %s" % e) finally: timer.cancel() -def retrieve_helm_releases(): - """Retrieve the deployed helm releases - - Get the name, namespace and version for the deployed releases - by querying helm tiller - :return: a dict of deployed helm releases - """ - deployed_releases = {} - - deployed_releases.update(retrieve_helm_v3_releases()) - - return deployed_releases - - -def delete_helm_v3_release(release, namespace="default", flags=None): - """Delete helm v3 release +def delete_helm_release(release, namespace="default", flags=None): + """Delete 
helm release via callout to helm command :param release: Helm release name :param namespace: Helm release namespace @@ -161,19 +147,19 @@ def delete_helm_v3_release(release, namespace="default", flags=None): out, err = process.communicate() if err: if "not found" in err: - LOG.debug("Release %s not found or deleted already" % release) + LOG.error("Release %s/%s not found or deleted already" % (namespace, release)) return out, err - raise exception.HelmTillerFailure( + raise exception.HelmFailure( reason="Failed to delete release: %s" % err) elif not out: - err_msg = "Failed to execute helm v3 command. " \ + err_msg = "Failed to execute helm command. " \ "Helm response timeout." - raise exception.HelmTillerFailure(reason=err_msg) + raise exception.HelmFailure(reason=err_msg) return out, err except Exception as e: - LOG.error("Failed to execute helm v3 command: %s" % e) - raise exception.HelmTillerFailure( - reason="Failed to execute helm v3 command: %s" % e) + LOG.error("Failed to execute helm command: %s" % e) + raise exception.HelmFailure( + reason="Failed to execute helm command: %s" % e) finally: timer.cancel() @@ -214,12 +200,12 @@ def install_helm_chart_with_dry_run(args=None): if helm_install.returncode == 0: return out elif err: - raise exception.HelmTillerFailure(reason=err) + raise exception.HelmFailure(reason=err) else: err_msg = "Helm install --dry-run operation timeout." - raise exception.HelmTillerFailure(reason=err_msg) + raise exception.HelmFailure(reason=err_msg) except Exception as e: - raise exception.HelmTillerFailure( + raise exception.HelmFailure( reason="Failed to render helm chart: %s" % e) finally: if timer: