Merge "appfwk: fix app remove stuck after apply-failed"

This commit is contained in:
Zuul 2022-12-06 16:13:31 +00:00 committed by Gerrit Code Review
commit b1715439ec
2 changed files with 89 additions and 4 deletions

View File

@ -3400,7 +3400,34 @@ class AppOperator(object):
self._update_app_status( self._update_app_status(
app, new_progress=constants.APP_PROGRESS_DELETE_MANIFEST) app, new_progress=constants.APP_PROGRESS_DELETE_MANIFEST)
if self._make_app_request(app, constants.APP_DELETE_OP): # Delete helm releases which have a helm operation running.
# eg.: pending-install, pending-upgrade, etc.
flux_helm_releases = [(c.metadata_name, c.namespace) for c in self._get_list_of_charts(app)]
for release, namespace in flux_helm_releases:
helm_release_dict = self._kube.get_custom_resource(
constants.FLUXCD_CRD_HELM_REL_GROUP,
constants.FLUXCD_CRD_HELM_REL_VERSION,
namespace,
constants.FLUXCD_CRD_HELM_REL_PLURAL,
release)
if not helm_release_dict:
LOG.warning("FluxCD Helm release info for {} is not available".format(release))
continue
helm_release_status, _ = self._fluxcd.get_helm_release_status(helm_release_dict)
if helm_release_status == self._fluxcd.HELM_RELEASE_STATUS_UNKNOWN:
LOG.info("Removing helm release which has an operation in progress: {} - {}".format(namespace, release))
# Send delete request in FluxCD so it doesn't recreate the helm release
self._kube.delete_custom_resource(
constants.FLUXCD_CRD_HELM_REL_GROUP,
constants.FLUXCD_CRD_HELM_REL_VERSION,
namespace,
constants.FLUXCD_CRD_HELM_REL_PLURAL,
release)
# Remove resource in Helm
helm_utils.delete_helm_v3_release(helm_release_dict['spec']['releaseName'], namespace=namespace)
if self._make_app_request(app, constants.APP_REMOVE_OP):
# After armada delete, the data for the releases are purged from # After armada delete, the data for the releases are purged from
# tiller/etcd, the releases info for the active app stored in sysinv # tiller/etcd, the releases info for the active app stored in sysinv
# db should be set back to 0 and the inactive apps require to be # db should be set back to 0 and the inactive apps require to be
@ -4794,7 +4821,7 @@ class FluxCDHelper(object):
else: else:
LOG.error("Applying %s failed. Skipping helm release " LOG.error("Applying %s failed. Skipping helm release "
"cleanup...") "cleanup...")
elif operation == constants.APP_DELETE_OP: elif operation in [constants.APP_DELETE_OP, constants.APP_REMOVE_OP]:
rc = self._delete(manifest_dir) rc = self._delete(manifest_dir)
elif operation == constants.APP_ROLLBACK_OP: elif operation == constants.APP_ROLLBACK_OP:
pass pass
@ -4823,6 +4850,7 @@ class FluxCDHelper(object):
def _delete(self, manifest_dir): def _delete(self, manifest_dir):
cmd = ['kubectl', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF, cmd = ['kubectl', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
'delete', '-k', manifest_dir, '--ignore-not-found=true'] 'delete', '-k', manifest_dir, '--ignore-not-found=true']
_, stderr = cutils.trycmd(*cmd) _, stderr = cutils.trycmd(*cmd)
if stderr: if stderr:
@ -4920,6 +4948,10 @@ class FluxCDHelper(object):
else: else:
return '' return ''
# TODO (lfagunde):
# Some methods in this class receive helm_chart_dict as a parameter.
# Can move the call to _kube.get_custom_resource() into these functions
# or create a helper function inside the class for it.
def get_helm_release_status(self, helm_release_dict): def get_helm_release_status(self, helm_release_dict):
"""helm_release_dict is of the form returned by _kube.get_custom_resource(). """helm_release_dict is of the form returned by _kube.get_custom_resource().
Returns: 'status' of the release (Unlnown,True,False) and 'message' Returns: 'status' of the release (Unlnown,True,False) and 'message'

View File

@ -38,6 +38,15 @@ LOG = logging.getLogger(__name__)
# When python3 migration is finished, the explicit timer should # When python3 migration is finished, the explicit timer should
# be removed. # be removed.
# TODO(lfagunde):
# Some of the logic in here is outdated and assumes the default helm used is v2,
# such as the delete_helm_release() function.
# Also, this module would benefit from refatoring to add more
# functionality and make the current functions more flexible.
# Could create a generic "execute_helm_cmd" style function and derive the
# specific ones (list, delete, etc) from there. If that's done, remember
# to update function calls done to this module from elsewhere in the code.
def kill_process_and_descendants(proc): def kill_process_and_descendants(proc):
# function to kill a process and its children processes # function to kill a process and its children processes
for child in psutil.Process(proc.pid).children(recursive=True): for child in psutil.Process(proc.pid).children(recursive=True):
@ -169,9 +178,9 @@ def retrieve_helm_releases():
def delete_helm_release(release): def delete_helm_release(release):
"""Delete helm release """Delete helm v2 release
This method deletes a helm release without --purge which removes This method deletes a helm v2 release without --purge which removes
all associated resources from kubernetes but not from the store(ETCD) all associated resources from kubernetes but not from the store(ETCD)
In the scenario of updating application, the method is needed to clean In the scenario of updating application, the method is needed to clean
@ -215,6 +224,50 @@ def delete_helm_release(release):
timer.cancel() timer.cancel()
def delete_helm_v3_release(release, namespace="default", flags=None):
"""Delete helm v3 release
:param release: Helm release name
:param namespace: Helm release namespace
:param flags: List with any other flags required to add to the command
"""
env = os.environ.copy()
env['PATH'] = '/usr/local/sbin:' + env['PATH']
env['KUBECONFIG'] = kubernetes.KUBERNETES_ADMIN_CONF
helm_cmd = ['helm', 'uninstall', '-n', namespace, release]
if flags:
helm_cmd += flags
process = subprocess.Popen(
helm_cmd,
env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True)
timer = threading.Timer(20, kill_process_and_descendants, [process])
try:
timer.start()
out, err = process.communicate()
if err:
if "not found" in err:
LOG.debug("Release %s not found or deleted already" % release)
return out, err
raise exception.HelmTillerFailure(
reason="Failed to delete release: %s" % err)
elif not out:
err_msg = "Failed to execute helm v3 command. " \
"Helm response timeout."
raise exception.HelmTillerFailure(reason=err_msg)
return out, err
except Exception as e:
LOG.error("Failed to execute helm v3 command: %s" % e)
raise exception.HelmTillerFailure(
reason="Failed to execute helm v3 command: %s" % e)
finally:
timer.cancel()
def _retry_on_HelmTillerFailure(ex): def _retry_on_HelmTillerFailure(ex):
LOG.info('Caught HelmTillerFailure exception. Resetting tiller and retrying... ' LOG.info('Caught HelmTillerFailure exception. Resetting tiller and retrying... '
'Exception: {}'.format(ex)) 'Exception: {}'.format(ex))