AppFrmwk: Clean up unique helm releases during update

When updating an application, some helm releases are unique to a
specific application version. This requires that when an application
updates, successfully or unsuccessfully, those helm releases be removed
by the framework, as they will not be managed by the new (or old)
version of the application that is applied during the update (or
recovery).

Changes include:
 - When helm releases are cleaned up via delete_helm_release(), also
   delete the FluxCD HelmRelease custom resource so that the helm
   controller will not re-deploy the helm release (see the first
   sketch below).
 - Refactor calls to delete_helm_v3_release() to delete_helm_release(),
   as helm v2 is no longer supported and the differentiation is
   irrelevant.
 - Refactor retrieve_helm_releases() by removing the wrapper function
   and renaming retrieve_helm_v3_releases().
 - Rename the HelmTillerFailure exception to HelmFailure. Tiller is no
   longer present in the system: helm v3 is tillerless and the Armada
   pod containing the Tiller container is no longer supported.
 - Fix an issue where applying an application that does not specify any
   images in any chart values.yaml throws an exception, because a null
   dict is written to the application images file (see the second
   sketch below).
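
The two-step cleanup referenced in the first bullet can be illustrated
standalone. This is a minimal sketch, not the in-tree helpers: it
assumes the kubernetes Python client and the FluxCD v2 helm-controller
coordinates (group helm.toolkit.fluxcd.io, version v2beta1, plural
helmreleases); the function name and kubeconfig handling are
illustrative.

# Hypothetical standalone sketch (not the in-tree helpers): delete the
# FluxCD HelmRelease custom resource so the helm controller stops
# reconciling the release, then uninstall the release itself with helm.
import subprocess

from kubernetes import client, config

# FluxCD v2 helm-controller coordinates; adjust to the deployed version.
FLUXCD_GROUP = "helm.toolkit.fluxcd.io"
FLUXCD_VERSION = "v2beta1"
FLUXCD_PLURAL = "helmreleases"


def cleanup_unique_release(name, release, namespace):
    """Delete the HelmRelease CR, then the helm release itself."""
    config.load_kube_config()
    # 1. Remove the CR so the helm controller will not re-deploy it
    client.CustomObjectsApi().delete_namespaced_custom_object(
        FLUXCD_GROUP, FLUXCD_VERSION, namespace, FLUXCD_PLURAL, name)
    # 2. Immediately remove the release that FluxCD was managing
    subprocess.run(
        ["helm", "uninstall", release, "--namespace", namespace],
        check=True)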
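
For the last bullet, a hedged illustration of the guard this change
adds; the file name is hypothetical and the snippet only demonstrates
the write-side pattern, not the framework code.

# Illustrative sketch: only write the application images file when at
# least one chart contributed images, so a null/empty YAML document
# never lands in the file and later reads never yield None.
import yaml

images_by_charts = {}  # app defines no images in any chart values.yaml

if images_by_charts:
    with open("app_images.yaml", "w") as f:  # hypothetical path
        yaml.safe_dump(images_by_charts, f, explicit_start=True,
                       default_flow_style=False)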

Test Plan:
PASS - Build, install, deploy AIO-SX
PASS - Build custom platform-integ-apps without the ceph audit chart.
       Perform application update and confirm that the unique helm
       release from the previous application version is properly
       cleaned up.

Closes-Bug: #2019138
Signed-off-by: Robert Church <robert.church@windriver.com>
Change-Id: I3a14f8f6b990351f8415a3fe3ce0b9637672dbcb
Author: Robert Church
Date:   2023-05-10 10:15:08 -05:00
Commit: 2785f64e54 (parent: f197b79f9f)
3 changed files with 61 additions and 49 deletions


@@ -1537,7 +1537,7 @@ class KubeVersionUnavailable(NotFound):
     message = "Getting kubeadm and kubelet versions failed"
 
 
-class HelmTillerFailure(SysinvException):
+class HelmFailure(SysinvException):
     message = _("Helm operation failure: %(reason)s")


@@ -717,9 +717,10 @@ class AppOperator(object):
             if chart_images:
                 images_by_charts.update({chart.name: chart_images})
 
-        with open(app.sync_imgfile, 'w') as f:
-            yaml.safe_dump(images_by_charts, f, explicit_start=True,
-                           default_flow_style=False)
+        if images_by_charts:
+            with open(app.sync_imgfile, 'w') as f:
+                yaml.safe_dump(images_by_charts, f, explicit_start=True,
+                               default_flow_style=False)
 
     def _retrieve_images_list(self, app_images_file):
         with io.open(app_images_file, 'r', encoding='utf-8') as f:
@@ -1737,11 +1738,21 @@ class AppOperator(object):
             old_app_charts = [c.release for c in old_app.charts]
             deployed_releases = helm_utils.retrieve_helm_releases()
             for new_chart in new_app.charts:
+                # Cleanup the releases in the new application version
+                # but are not in the old application version
                 if (new_chart.release not in old_app_charts and
                         new_chart.release in deployed_releases):
-                    # Cleanup the releases in the new application version
-                    # but are not in the old application version
-                    helm_utils.delete_helm_v3_release(new_chart.release)
+                    # Send delete request in FluxCD so it doesn't
+                    # recreate the helm release
+                    self._kube.delete_custom_resource(
+                        constants.FLUXCD_CRD_HELM_REL_GROUP,
+                        constants.FLUXCD_CRD_HELM_REL_VERSION,
+                        new_chart.namespace,
+                        constants.FLUXCD_CRD_HELM_REL_PLURAL,
+                        new_chart.metadata_name)
+                    # Use helm to immediately remove the release
+                    helm_utils.delete_helm_release(new_chart.release,
+                                                   new_chart.namespace)
 
         else:
             rc = False
@@ -2810,13 +2821,24 @@ class AppOperator(object):
             to_app_charts = [c.release for c in to_app.charts]
             deployed_releases = helm_utils.retrieve_helm_releases()
             for from_chart in from_app.charts:
+                # Cleanup the releases in the old application version
+                # but are not in the new application version
                 if (from_chart.release not in to_app_charts and
                         from_chart.release in deployed_releases):
-                    # Cleanup the releases in the old application version
-                    # but are not in the new application version
-                    helm_utils.delete_helm_v3_release(from_chart.release)
+                    # Send delete request in FluxCD so it doesn't
+                    # recreate the helm release
+                    self._kube.delete_custom_resource(
+                        constants.FLUXCD_CRD_HELM_REL_GROUP,
+                        constants.FLUXCD_CRD_HELM_REL_VERSION,
+                        from_chart.namespace,
+                        constants.FLUXCD_CRD_HELM_REL_PLURAL,
+                        from_chart.metadata_name)
+                    # Use helm to immediately remove the release
+                    helm_utils.delete_helm_release(from_chart.release,
+                                                   from_chart.namespace)
                     LOG.info("Helm release %s for Application %s (%s) deleted"
-                             % (from_chart.release, from_app.name, from_app.version))
+                             % (from_chart.release, from_app.name,
+                                from_app.version))
 
             self._cleanup(from_app, app_dir=False)
             self._utils._patch_report_app_dependencies(
@@ -2925,8 +2947,10 @@ class AppOperator(object):
                 helm_release_status, _ = self._fluxcd.get_helm_release_status(helm_release_dict)
                 if helm_release_status == self._fluxcd.HELM_RELEASE_STATUS_UNKNOWN:
-                    LOG.info("Removing helm release which has an operation in progress: {} - {}".format(namespace, release))
-                    # Send delete request in FluxCD so it doesn't recreate the helm release
+                    LOG.info("Removing helm release which has an operation in "
+                             "progress: {} - {}".format(namespace, release))
+                    # Send delete request in FluxCD so it doesn't recreate the helm
+                    # release
                     self._kube.delete_custom_resource(
                         constants.FLUXCD_CRD_HELM_REL_GROUP,
                         constants.FLUXCD_CRD_HELM_REL_VERSION,
@@ -2934,7 +2958,9 @@ class AppOperator(object):
                         constants.FLUXCD_CRD_HELM_REL_PLURAL,
                         release)
                     # Remove resource in Helm
-                    helm_utils.delete_helm_v3_release(helm_release_dict['spec']['releaseName'], namespace=namespace)
+                    helm_utils.delete_helm_release(
+                        helm_release_dict['spec']['releaseName'],
+                        namespace=namespace)
 
         if self._make_app_request(app, constants.APP_REMOVE_OP):
             # After fluxcd delete, the data for the releases are purged from


@@ -69,14 +69,14 @@ def refresh_helm_repo_information():
     rpcapi.refresh_helm_repo_information(context.get_admin_context())
 
 
-def _retry_on_HelmTillerFailure(ex):
+def _retry_on_HelmFailure(ex):
     LOG.info('Caught exception retrieving helm releases. Retrying... Exception: {}'.format(ex))
-    return isinstance(ex, exception.HelmTillerFailure)
+    return isinstance(ex, exception.HelmFailure)
 
 
 @retry(stop_max_attempt_number=6, wait_fixed=20 * 1000,
-       retry_on_exception=_retry_on_HelmTillerFailure)
-def retrieve_helm_v3_releases():
+       retry_on_exception=_retry_on_HelmFailure)
+def retrieve_helm_releases():
     helm_list = subprocess.Popen(
         ['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
          'list', '--all-namespaces', '--output', 'yaml'],
@@ -89,15 +89,15 @@ def retrieve_helm_v3_releases():
         out, err = helm_list.communicate()
         if helm_list.returncode != 0:
             if err:
-                raise exception.HelmTillerFailure(reason=err)
+                raise exception.HelmFailure(reason=err)
 
             # killing the subprocesses with +kill() when timer expires returns EBADF
             # because the pipe is closed, but no error string on stderr.
             if helm_list.returncode == -9:
-                raise exception.HelmTillerFailure(
+                raise exception.HelmFailure(
                     reason="helm list operation timed out after "
                            "20 seconds. Terminated by threading timer.")
-            raise exception.HelmTillerFailure(
+            raise exception.HelmFailure(
                 reason="helm list operation failed without error "
                        "message, errno=%s" % helm_list.returncode)
@@ -114,28 +114,14 @@ def retrieve_helm_v3_releases():
         return deployed_releases
     except Exception as e:
-        raise exception.HelmTillerFailure(
-            reason="Failed to retrieve helmv3 releases: %s" % e)
+        raise exception.HelmFailure(
+            reason="Failed to retrieve helm releases: %s" % e)
     finally:
         timer.cancel()
 
 
-def retrieve_helm_releases():
-    """Retrieve the deployed helm releases
-
-    Get the name, namespace and version for the deployed releases
-    by querying helm tiller
-
-    :return: a dict of deployed helm releases
-    """
-    deployed_releases = {}
-    deployed_releases.update(retrieve_helm_v3_releases())
-    return deployed_releases
-
-
-def delete_helm_v3_release(release, namespace="default", flags=None):
-    """Delete helm v3 release
+def delete_helm_release(release, namespace="default", flags=None):
+    """Delete helm release via callout to helm command
 
     :param release: Helm release name
     :param namespace: Helm release namespace
@@ -161,19 +147,19 @@ def delete_helm_v3_release(release, namespace="default", flags=None):
         out, err = process.communicate()
         if err:
             if "not found" in err:
-                LOG.debug("Release %s not found or deleted already" % release)
+                LOG.error("Release %s/%s not found or deleted already" % (namespace, release))
                 return out, err
-            raise exception.HelmTillerFailure(
+            raise exception.HelmFailure(
                 reason="Failed to delete release: %s" % err)
         elif not out:
-            err_msg = "Failed to execute helm v3 command. " \
+            err_msg = "Failed to execute helm command. " \
                       "Helm response timeout."
-            raise exception.HelmTillerFailure(reason=err_msg)
+            raise exception.HelmFailure(reason=err_msg)
         return out, err
     except Exception as e:
-        LOG.error("Failed to execute helm v3 command: %s" % e)
-        raise exception.HelmTillerFailure(
-            reason="Failed to execute helm v3 command: %s" % e)
+        LOG.error("Failed to execute helm command: %s" % e)
+        raise exception.HelmFailure(
+            reason="Failed to execute helm command: %s" % e)
     finally:
         timer.cancel()
@@ -214,12 +200,12 @@ def install_helm_chart_with_dry_run(args=None):
         if helm_install.returncode == 0:
             return out
         elif err:
-            raise exception.HelmTillerFailure(reason=err)
+            raise exception.HelmFailure(reason=err)
         else:
             err_msg = "Helm install --dry-run operation timeout."
-            raise exception.HelmTillerFailure(reason=err_msg)
+            raise exception.HelmFailure(reason=err_msg)
     except Exception as e:
-        raise exception.HelmTillerFailure(
-            reason="Failed to render helm chart: %s" % e)
+        raise exception.HelmFailure(
+            reason="Failed to render helm chart: %s" % e)
     finally:
         if timer: