From f5efbfe46d6356d03267eb2e72ecd4dc3892489a Mon Sep 17 00:00:00 2001 From: Leonardo Fagundes Luz Serrano Date: Wed, 30 Nov 2022 14:54:01 -0300 Subject: [PATCH] appfwk: fix app remove stuck after apply-failed At the moment, flux doesn't delete helm releases if they have a running operation (eg. HR in 'pending-install' status). That causes the app remove operation to get stuck and timeout due to some resources not terminating, most commonly after a failed application apply operation. Until this flux behaviour gets changed, we need to uninstall these 'stuck' releases in helm directly. Test Plan: PASS Cause a HR to get stuck by tainting the node before applying the app, then successfully remove the app Closes-Bug: 1998384 Signed-off-by: Leonardo Fagundes Luz Serrano Change-Id: I7466d61f79129b8f70f8a97f8968549f5823d811 --- .../sysinv/sysinv/conductor/kube_app.py | 36 +++++++++++- sysinv/sysinv/sysinv/sysinv/helm/utils.py | 57 ++++++++++++++++++- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py index 1a6d009a07..cd4e80283f 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py @@ -3316,7 +3316,34 @@ class AppOperator(object): self._update_app_status( app, new_progress=constants.APP_PROGRESS_DELETE_MANIFEST) - if self._make_app_request(app, constants.APP_DELETE_OP): + # Delete helm releases which have a helm operation running. + # eg.: pending-install, pending-upgrade, etc. + flux_helm_releases = [(c.metadata_name, c.namespace) for c in self._get_list_of_charts(app)] + for release, namespace in flux_helm_releases: + helm_release_dict = self._kube.get_custom_resource( + constants.FLUXCD_CRD_HELM_REL_GROUP, + constants.FLUXCD_CRD_HELM_REL_VERSION, + namespace, + constants.FLUXCD_CRD_HELM_REL_PLURAL, + release) + if not helm_release_dict: + LOG.warning("FluxCD Helm release info for {} is not available".format(release)) + continue + + helm_release_status, _ = self._fluxcd.get_helm_release_status(helm_release_dict) + if helm_release_status == self._fluxcd.HELM_RELEASE_STATUS_UNKNOWN: + LOG.info("Removing helm release which has an operation in progress: {} - {}".format(namespace, release)) + # Send delete request in FluxCD so it doesn't recreate the helm release + self._kube.delete_custom_resource( + constants.FLUXCD_CRD_HELM_REL_GROUP, + constants.FLUXCD_CRD_HELM_REL_VERSION, + namespace, + constants.FLUXCD_CRD_HELM_REL_PLURAL, + release) + # Remove resource in Helm + helm_utils.delete_helm_v3_release(helm_release_dict['spec']['releaseName'], namespace=namespace) + + if self._make_app_request(app, constants.APP_REMOVE_OP): # After armada delete, the data for the releases are purged from # tiller/etcd, the releases info for the active app stored in sysinv # db should be set back to 0 and the inactive apps require to be @@ -4710,7 +4737,7 @@ class FluxCDHelper(object): else: LOG.error("Applying %s failed. Skipping helm release " "cleanup...") - elif operation == constants.APP_DELETE_OP: + elif operation in [constants.APP_DELETE_OP, constants.APP_REMOVE_OP]: rc = self._delete(manifest_dir) elif operation == constants.APP_ROLLBACK_OP: pass @@ -4739,6 +4766,7 @@ class FluxCDHelper(object): def _delete(self, manifest_dir): cmd = ['kubectl', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF, 'delete', '-k', manifest_dir, '--ignore-not-found=true'] + _, stderr = cutils.trycmd(*cmd) if stderr: @@ -4836,6 +4864,10 @@ class FluxCDHelper(object): else: return '' + # TODO (lfagunde): + # Some methods in this class receive helm_chart_dict as a parameter. + # Can move the call to _kube.get_custom_resource() into these functions + # or create a helper function inside the class for it. def get_helm_release_status(self, helm_release_dict): """helm_release_dict is of the form returned by _kube.get_custom_resource(). Returns: 'status' of the release (Unlnown,True,False) and 'message' diff --git a/sysinv/sysinv/sysinv/sysinv/helm/utils.py b/sysinv/sysinv/sysinv/sysinv/helm/utils.py index 441054f411..c714253a6e 100644 --- a/sysinv/sysinv/sysinv/sysinv/helm/utils.py +++ b/sysinv/sysinv/sysinv/sysinv/helm/utils.py @@ -38,6 +38,15 @@ LOG = logging.getLogger(__name__) # When python3 migration is finished, the explicit timer should # be removed. +# TODO(lfagunde): +# Some of the logic in here is outdated and assumes the default helm used is v2, +# such as the delete_helm_release() function. +# Also, this module would benefit from refatoring to add more +# functionality and make the current functions more flexible. +# Could create a generic "execute_helm_cmd" style function and derive the +# specific ones (list, delete, etc) from there. If that's done, remember +# to update function calls done to this module from elsewhere in the code. + def kill_process_and_descendants(proc): # function to kill a process and its children processes for child in psutil.Process(proc.pid).children(recursive=True): @@ -169,9 +178,9 @@ def retrieve_helm_releases(): def delete_helm_release(release): - """Delete helm release + """Delete helm v2 release - This method deletes a helm release without --purge which removes + This method deletes a helm v2 release without --purge which removes all associated resources from kubernetes but not from the store(ETCD) In the scenario of updating application, the method is needed to clean @@ -215,6 +224,50 @@ def delete_helm_release(release): timer.cancel() +def delete_helm_v3_release(release, namespace="default", flags=None): + """Delete helm v3 release + + :param release: Helm release name + :param namespace: Helm release namespace + :param flags: List with any other flags required to add to the command + """ + + env = os.environ.copy() + env['PATH'] = '/usr/local/sbin:' + env['PATH'] + env['KUBECONFIG'] = kubernetes.KUBERNETES_ADMIN_CONF + + helm_cmd = ['helm', 'uninstall', '-n', namespace, release] + if flags: + helm_cmd += flags + + process = subprocess.Popen( + helm_cmd, + env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + universal_newlines=True) + timer = threading.Timer(20, kill_process_and_descendants, [process]) + + try: + timer.start() + out, err = process.communicate() + if err: + if "not found" in err: + LOG.debug("Release %s not found or deleted already" % release) + return out, err + raise exception.HelmTillerFailure( + reason="Failed to delete release: %s" % err) + elif not out: + err_msg = "Failed to execute helm v3 command. " \ + "Helm response timeout." + raise exception.HelmTillerFailure(reason=err_msg) + return out, err + except Exception as e: + LOG.error("Failed to execute helm v3 command: %s" % e) + raise exception.HelmTillerFailure( + reason="Failed to execute helm v3 command: %s" % e) + finally: + timer.cancel() + + def _retry_on_HelmTillerFailure(ex): LOG.info('Caught HelmTillerFailure exception. Resetting tiller and retrying... ' 'Exception: {}'.format(ex))