appfwk: fix app remove stuck after apply-failed

At the moment, flux doesn't delete helm releases if they
have a running operation (eg. HR in 'pending-install' status).
That causes the app remove operation to get stuck and timeout due
to some resources not terminating, most commonly after a failed
application apply operation.

Until this flux behaviour gets changed, we need to uninstall
these 'stuck' releases in helm directly.

Test Plan:
PASS Cause a HR to get stuck by tainting the node before applying
     the app, then successfully remove the app

Closes-Bug: 1998384

Signed-off-by: Leonardo Fagundes Luz Serrano <Leonardo.FagundesLuzSerrano@windriver.com>
Change-Id: I7466d61f79129b8f70f8a97f8968549f5823d811
This commit is contained in:
Leonardo Fagundes Luz Serrano 2022-11-30 14:54:01 -03:00
parent 39dbec3c18
commit f5efbfe46d
2 changed files with 89 additions and 4 deletions

View File

@ -3316,7 +3316,34 @@ class AppOperator(object):
self._update_app_status(
app, new_progress=constants.APP_PROGRESS_DELETE_MANIFEST)
if self._make_app_request(app, constants.APP_DELETE_OP):
# Delete helm releases which have a helm operation running.
# eg.: pending-install, pending-upgrade, etc.
flux_helm_releases = [(c.metadata_name, c.namespace) for c in self._get_list_of_charts(app)]
for release, namespace in flux_helm_releases:
helm_release_dict = self._kube.get_custom_resource(
constants.FLUXCD_CRD_HELM_REL_GROUP,
constants.FLUXCD_CRD_HELM_REL_VERSION,
namespace,
constants.FLUXCD_CRD_HELM_REL_PLURAL,
release)
if not helm_release_dict:
LOG.warning("FluxCD Helm release info for {} is not available".format(release))
continue
helm_release_status, _ = self._fluxcd.get_helm_release_status(helm_release_dict)
if helm_release_status == self._fluxcd.HELM_RELEASE_STATUS_UNKNOWN:
LOG.info("Removing helm release which has an operation in progress: {} - {}".format(namespace, release))
# Send delete request in FluxCD so it doesn't recreate the helm release
self._kube.delete_custom_resource(
constants.FLUXCD_CRD_HELM_REL_GROUP,
constants.FLUXCD_CRD_HELM_REL_VERSION,
namespace,
constants.FLUXCD_CRD_HELM_REL_PLURAL,
release)
# Remove resource in Helm
helm_utils.delete_helm_v3_release(helm_release_dict['spec']['releaseName'], namespace=namespace)
if self._make_app_request(app, constants.APP_REMOVE_OP):
# After armada delete, the data for the releases are purged from
# tiller/etcd, the releases info for the active app stored in sysinv
# db should be set back to 0 and the inactive apps require to be
@ -4710,7 +4737,7 @@ class FluxCDHelper(object):
else:
LOG.error("Applying %s failed. Skipping helm release "
"cleanup...")
elif operation == constants.APP_DELETE_OP:
elif operation in [constants.APP_DELETE_OP, constants.APP_REMOVE_OP]:
rc = self._delete(manifest_dir)
elif operation == constants.APP_ROLLBACK_OP:
pass
@ -4739,6 +4766,7 @@ class FluxCDHelper(object):
def _delete(self, manifest_dir):
cmd = ['kubectl', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
'delete', '-k', manifest_dir, '--ignore-not-found=true']
_, stderr = cutils.trycmd(*cmd)
if stderr:
@ -4836,6 +4864,10 @@ class FluxCDHelper(object):
else:
return ''
# TODO (lfagunde):
# Some methods in this class receive helm_chart_dict as a parameter.
# Can move the call to _kube.get_custom_resource() into these functions
# or create a helper function inside the class for it.
def get_helm_release_status(self, helm_release_dict):
"""helm_release_dict is of the form returned by _kube.get_custom_resource().
Returns: 'status' of the release (Unlnown,True,False) and 'message'

View File

@ -38,6 +38,15 @@ LOG = logging.getLogger(__name__)
# When python3 migration is finished, the explicit timer should
# be removed.
# TODO(lfagunde):
# Some of the logic in here is outdated and assumes the default helm used is v2,
# such as the delete_helm_release() function.
# Also, this module would benefit from refatoring to add more
# functionality and make the current functions more flexible.
# Could create a generic "execute_helm_cmd" style function and derive the
# specific ones (list, delete, etc) from there. If that's done, remember
# to update function calls done to this module from elsewhere in the code.
def kill_process_and_descendants(proc):
# function to kill a process and its children processes
for child in psutil.Process(proc.pid).children(recursive=True):
@ -169,9 +178,9 @@ def retrieve_helm_releases():
def delete_helm_release(release):
"""Delete helm release
"""Delete helm v2 release
This method deletes a helm release without --purge which removes
This method deletes a helm v2 release without --purge which removes
all associated resources from kubernetes but not from the store(ETCD)
In the scenario of updating application, the method is needed to clean
@ -215,6 +224,50 @@ def delete_helm_release(release):
timer.cancel()
def delete_helm_v3_release(release, namespace="default", flags=None):
"""Delete helm v3 release
:param release: Helm release name
:param namespace: Helm release namespace
:param flags: List with any other flags required to add to the command
"""
env = os.environ.copy()
env['PATH'] = '/usr/local/sbin:' + env['PATH']
env['KUBECONFIG'] = kubernetes.KUBERNETES_ADMIN_CONF
helm_cmd = ['helm', 'uninstall', '-n', namespace, release]
if flags:
helm_cmd += flags
process = subprocess.Popen(
helm_cmd,
env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True)
timer = threading.Timer(20, kill_process_and_descendants, [process])
try:
timer.start()
out, err = process.communicate()
if err:
if "not found" in err:
LOG.debug("Release %s not found or deleted already" % release)
return out, err
raise exception.HelmTillerFailure(
reason="Failed to delete release: %s" % err)
elif not out:
err_msg = "Failed to execute helm v3 command. " \
"Helm response timeout."
raise exception.HelmTillerFailure(reason=err_msg)
return out, err
except Exception as e:
LOG.error("Failed to execute helm v3 command: %s" % e)
raise exception.HelmTillerFailure(
reason="Failed to execute helm v3 command: %s" % e)
finally:
timer.cancel()
def _retry_on_HelmTillerFailure(ex):
LOG.info('Caught HelmTillerFailure exception. Resetting tiller and retrying... '
'Exception: {}'.format(ex))