From 85e3b47912d1aab8f771648fd3a76b7d1402bd2e Mon Sep 17 00:00:00 2001 From: Dan Voiculeasa Date: Mon, 5 Dec 2022 21:55:51 +0200 Subject: [PATCH] AppFwk: Remove recovery logic based on spec.suspend Because of FluxCD upversion described in [1], we don't need the recovery logic flipping spec.suspend. Flux is supposed to properly reconciliate the resources. Remove recovery logic concerned with flipping spec.suspend was removed. Remove optimizations for triggering reconciliation by flipping spec.suspend. Disclaimer for tests: 1) This was applied on to of [1]. 2) cert-manager, nginx-ingress-controller, platform-integ-apps had the reconciliation interval decreased to 1m to allow Flux to manage the resources by itself in a reasonable time interval. There will be future commits per app updating reconciliation interval. Tests on AIO-SX: PASS: bootstrap PASS: unlocked enabled available PASS: apps applied PASS: inspect flux pod logs for errors PASS: re-test known trigger for 1996747 and 1995748 PASS: re-test known trigger 1997368 [1]: https://review.opendev.org/c/starlingx/ansible-playbooks/+/866820/ Depends-On: https://review.opendev.org/c/starlingx/ansible-playbooks/+/866820/ Related-Bug: 1995748 Related-Bug: 1996747 Related-Bug: 1997368 Partial-Bug: 1999032 Signed-off-by: Dan Voiculeasa Change-Id: I932d85d8b366479b2c1d2c88a0acf7fad219b131 --- .../sysinv/sysinv/sysinv/common/constants.py | 8 - .../sysinv/sysinv/conductor/kube_app.py | 176 ------------------ 2 files changed, 184 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index f2c5c0e013..b88562b96e 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -1711,14 +1711,6 @@ FLUXCD_CRD_HELM_CHART_GROUP = 'source.toolkit.fluxcd.io' FLUXCD_CRD_HELM_CHART_VERSION = 'v1beta1' FLUXCD_CRD_HELM_CHART_PLURAL = 'helmcharts' # Actually beginning of errors, should be used with -# string.startswith(FLUXCD_HELM_CHART_STATUS_ERRORS[number]) -# We want to recover from these errors -FLUXCD_RECOVERY_HELM_CHART_STATUS_ERRORS = [ - 'no artifact found', - 'failed to retrieve source:', - 'chart pull error:' -] -# Actually beginning of errors, should be used with # string.startswith(FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS[number]) # We want to recover from these errors FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS = [ diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py index 77b909e98e..26b2967f1d 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py @@ -1711,139 +1711,6 @@ class AppOperator(object): @retry(retry_on_exception=lambda x: isinstance(x, exception.ApplicationApplyFailure), stop_max_attempt_number=5, wait_fixed=30 * 1000) def _make_fluxcd_operation_with_monitor(self, app, request): - def _patch_flux_suspend_false(resource): - """ Change resource.spec.suspend = False - """ - resource['spec']['suspend'] = False - - def _patch_flux_suspend_true(resource): - """ Change resource.spec.suspend = True - """ - resource['spec']['suspend'] = True - - def _recover_from_failed_helm_chart_on_app_apply(metadata_name, - namespace, - helm_repo_name): - """ Recovery logic for FluxCD on apply - - HelmChart reconciliation needs to be triggered. - The trigger is to flip spec.suspended from True to False. - - :param metadata_name: metadata name from helmrelease.yaml - :param namespace: namespace from kustomization.yaml - :param helm_repo_name: metadata name from helmrepository.yaml - - :return: tuple(attempt, error). - attempt is True if recovery is triggered - error is True if an error was encountered - """ - helm_chart_name = "{}-{}".format(namespace, metadata_name) - helm_release_name = metadata_name - attempt = False - - # Check for condition - try: - helm_chart_resource = self._kube.get_custom_resource( - constants.FLUXCD_CRD_HELM_CHART_GROUP, - constants.FLUXCD_CRD_HELM_CHART_VERSION, - namespace, - constants.FLUXCD_CRD_HELM_CHART_PLURAL, - helm_chart_name) - except Exception as err: - LOG.warning("Failed to get HelmChart resource {}: {}" - "".format(helm_chart_name, err)) - return attempt, True - - helm_chart_resource_status = \ - self._fluxcd.extract_helm_chart_status(helm_chart_resource) - - for error_string in constants.FLUXCD_RECOVERY_HELM_CHART_STATUS_ERRORS: - if helm_chart_resource_status.startswith(error_string): - LOG.info("For helm chart {} found a matching error string " - "we can attempt to recover from: {}" - "".format(helm_chart_name, helm_chart_resource_status)) - attempt = True - break - - if not attempt: - return attempt, False - - # Force HelmRepository reconciliation now, saves up to reconciliation - # timeout for the specific resource. Same trigger as with HelmChart. - try: - # Flip to spec.suspended to True from HelmRepository - self._kube.get_transform_patch_custom_resource( - constants.FLUXCD_CRD_HELM_REPO_GROUP, - constants.FLUXCD_CRD_HELM_REPO_VERSION, - namespace, - constants.FLUXCD_CRD_HELM_REPO_PLURAL, - helm_repo_name, - _patch_flux_suspend_true - ) - - # Flip to spec.suspended to False from HelmRepository - self._kube.get_transform_patch_custom_resource( - constants.FLUXCD_CRD_HELM_REPO_GROUP, - constants.FLUXCD_CRD_HELM_REPO_VERSION, - namespace, - constants.FLUXCD_CRD_HELM_REPO_PLURAL, - helm_repo_name, - _patch_flux_suspend_false - ) - except Exception: - return attempt, True - - # Force HelmChart reconciliation now - try: - # Flip to spec.suspended to True from HelmChart - self._kube.get_transform_patch_custom_resource( - constants.FLUXCD_CRD_HELM_CHART_GROUP, - constants.FLUXCD_CRD_HELM_CHART_VERSION, - namespace, - constants.FLUXCD_CRD_HELM_CHART_PLURAL, - helm_chart_name, - _patch_flux_suspend_true - ) - - # Flip to spec.suspended to False from HelmChart - self._kube.get_transform_patch_custom_resource( - constants.FLUXCD_CRD_HELM_CHART_GROUP, - constants.FLUXCD_CRD_HELM_CHART_VERSION, - namespace, - constants.FLUXCD_CRD_HELM_CHART_PLURAL, - helm_chart_name, - _patch_flux_suspend_false - ) - except Exception: - return attempt, True - - # Force HelmRelease reconciliation now, saves up to reconciliation - # timeout for the specific resource. Same trigger as with HelmChart. - try: - # Flip to spec.suspended to True from HelmRelease - self._kube.get_transform_patch_custom_resource( - constants.FLUXCD_CRD_HELM_REL_GROUP, - constants.FLUXCD_CRD_HELM_REL_VERSION, - namespace, - constants.FLUXCD_CRD_HELM_REL_PLURAL, - helm_release_name, - _patch_flux_suspend_true - ) - - # Flip to spec.suspended to False from HelmRelease - self._kube.get_transform_patch_custom_resource( - constants.FLUXCD_CRD_HELM_REL_GROUP, - constants.FLUXCD_CRD_HELM_REL_VERSION, - namespace, - constants.FLUXCD_CRD_HELM_REL_PLURAL, - helm_release_name, - _patch_flux_suspend_false - ) - except Exception: - return attempt, True - - return attempt, False - def _recover_from_helm_operation_in_progress_on_app_apply(metadata_name, namespace, flux_error_message): """ Recovery logic for FluxCD on apply @@ -1915,29 +1782,6 @@ class AppOperator(object): secret.metadata.namespace, err)) return attempt, True - # Force HelmRelease reconciliation now, saves up to reconciliation - # timeout for the specific resource. Flip suspend True, then False. - try: - self._kube.get_transform_patch_custom_resource( - constants.FLUXCD_CRD_HELM_REL_GROUP, - constants.FLUXCD_CRD_HELM_REL_VERSION, - namespace, - constants.FLUXCD_CRD_HELM_REL_PLURAL, - helm_release_name, - _patch_flux_suspend_true - ) - - self._kube.get_transform_patch_custom_resource( - constants.FLUXCD_CRD_HELM_REL_GROUP, - constants.FLUXCD_CRD_HELM_REL_VERSION, - namespace, - constants.FLUXCD_CRD_HELM_REL_PLURAL, - helm_release_name, - _patch_flux_suspend_false - ) - except Exception: - return attempt, True - return attempt, False def _check_progress(): @@ -1983,12 +1827,6 @@ class AppOperator(object): self._update_app_status(app, new_progress=progress_str) for release_name, chart_obj in list(charts.items()): - # Attempt to recover HelmCharts in some Failed states - _recover_from_failed_helm_chart_on_app_apply( - metadata_name=release_name, - namespace=chart_obj['namespace'], - helm_repo_name=chart_obj['helm_repo_name']) - # Request the helm release info helm_rel = self._kube.get_custom_resource( constants.FLUXCD_CRD_HELM_REL_GROUP, @@ -4934,20 +4772,6 @@ class FluxCDHelper(object): return True - def extract_helm_chart_status(self, helm_chart_dict): - """helm_chart_dict is of the form returned by _kube.get_custom_resource(). - Returns: message of first status.condition - """ - if not helm_chart_dict: - return '' - elif 'status' in helm_chart_dict and \ - 'conditions' in helm_chart_dict['status'] and \ - len(helm_chart_dict['status']['conditions']) > 0 and \ - 'message' in helm_chart_dict['status']['conditions'][0]: - return helm_chart_dict['status']['conditions'][0]['message'] - else: - return '' - # TODO (lfagunde): # Some methods in this class receive helm_chart_dict as a parameter. # Can move the call to _kube.get_custom_resource() into these functions