AppFwk: Remove recovery logic based on spec.suspend
Because of FluxCD upversion described in [1], we don't need the recovery logic flipping spec.suspend. Flux is supposed to properly reconciliate the resources. Remove recovery logic concerned with flipping spec.suspend was removed. Remove optimizations for triggering reconciliation by flipping spec.suspend. Disclaimer for tests: 1) This was applied on to of [1]. 2) cert-manager, nginx-ingress-controller, platform-integ-apps had the reconciliation interval decreased to 1m to allow Flux to manage the resources by itself in a reasonable time interval. There will be future commits per app updating reconciliation interval. Tests on AIO-SX: PASS: bootstrap PASS: unlocked enabled available PASS: apps applied PASS: inspect flux pod logs for errors PASS: re-test known trigger for 1996747 and 1995748 PASS: re-test known trigger 1997368 [1]: https://review.opendev.org/c/starlingx/ansible-playbooks/+/866820/ Depends-On: https://review.opendev.org/c/starlingx/ansible-playbooks/+/866820/ Related-Bug: 1995748 Related-Bug: 1996747 Related-Bug: 1997368 Partial-Bug: 1999032 Signed-off-by: Dan Voiculeasa <dan.voiculeasa@windriver.com> Change-Id: I932d85d8b366479b2c1d2c88a0acf7fad219b131
This commit is contained in:
parent
5d2196330a
commit
85e3b47912
|
@ -1711,14 +1711,6 @@ FLUXCD_CRD_HELM_CHART_GROUP = 'source.toolkit.fluxcd.io'
|
|||
FLUXCD_CRD_HELM_CHART_VERSION = 'v1beta1'
|
||||
FLUXCD_CRD_HELM_CHART_PLURAL = 'helmcharts'
|
||||
# Actually beginning of errors, should be used with
|
||||
# string.startswith(FLUXCD_HELM_CHART_STATUS_ERRORS[number])
|
||||
# We want to recover from these errors
|
||||
FLUXCD_RECOVERY_HELM_CHART_STATUS_ERRORS = [
|
||||
'no artifact found',
|
||||
'failed to retrieve source:',
|
||||
'chart pull error:'
|
||||
]
|
||||
# Actually beginning of errors, should be used with
|
||||
# string.startswith(FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS[number])
|
||||
# We want to recover from these errors
|
||||
FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS = [
|
||||
|
|
|
@ -1711,139 +1711,6 @@ class AppOperator(object):
|
|||
@retry(retry_on_exception=lambda x: isinstance(x, exception.ApplicationApplyFailure),
|
||||
stop_max_attempt_number=5, wait_fixed=30 * 1000)
|
||||
def _make_fluxcd_operation_with_monitor(self, app, request):
|
||||
def _patch_flux_suspend_false(resource):
|
||||
""" Change resource.spec.suspend = False
|
||||
"""
|
||||
resource['spec']['suspend'] = False
|
||||
|
||||
def _patch_flux_suspend_true(resource):
|
||||
""" Change resource.spec.suspend = True
|
||||
"""
|
||||
resource['spec']['suspend'] = True
|
||||
|
||||
def _recover_from_failed_helm_chart_on_app_apply(metadata_name,
|
||||
namespace,
|
||||
helm_repo_name):
|
||||
""" Recovery logic for FluxCD on apply
|
||||
|
||||
HelmChart reconciliation needs to be triggered.
|
||||
The trigger is to flip spec.suspended from True to False.
|
||||
|
||||
:param metadata_name: metadata name from helmrelease.yaml
|
||||
:param namespace: namespace from kustomization.yaml
|
||||
:param helm_repo_name: metadata name from helmrepository.yaml
|
||||
|
||||
:return: tuple(attempt, error).
|
||||
attempt is True if recovery is triggered
|
||||
error is True if an error was encountered
|
||||
"""
|
||||
helm_chart_name = "{}-{}".format(namespace, metadata_name)
|
||||
helm_release_name = metadata_name
|
||||
attempt = False
|
||||
|
||||
# Check for condition
|
||||
try:
|
||||
helm_chart_resource = self._kube.get_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_CHART_GROUP,
|
||||
constants.FLUXCD_CRD_HELM_CHART_VERSION,
|
||||
namespace,
|
||||
constants.FLUXCD_CRD_HELM_CHART_PLURAL,
|
||||
helm_chart_name)
|
||||
except Exception as err:
|
||||
LOG.warning("Failed to get HelmChart resource {}: {}"
|
||||
"".format(helm_chart_name, err))
|
||||
return attempt, True
|
||||
|
||||
helm_chart_resource_status = \
|
||||
self._fluxcd.extract_helm_chart_status(helm_chart_resource)
|
||||
|
||||
for error_string in constants.FLUXCD_RECOVERY_HELM_CHART_STATUS_ERRORS:
|
||||
if helm_chart_resource_status.startswith(error_string):
|
||||
LOG.info("For helm chart {} found a matching error string "
|
||||
"we can attempt to recover from: {}"
|
||||
"".format(helm_chart_name, helm_chart_resource_status))
|
||||
attempt = True
|
||||
break
|
||||
|
||||
if not attempt:
|
||||
return attempt, False
|
||||
|
||||
# Force HelmRepository reconciliation now, saves up to reconciliation
|
||||
# timeout for the specific resource. Same trigger as with HelmChart.
|
||||
try:
|
||||
# Flip to spec.suspended to True from HelmRepository
|
||||
self._kube.get_transform_patch_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_REPO_GROUP,
|
||||
constants.FLUXCD_CRD_HELM_REPO_VERSION,
|
||||
namespace,
|
||||
constants.FLUXCD_CRD_HELM_REPO_PLURAL,
|
||||
helm_repo_name,
|
||||
_patch_flux_suspend_true
|
||||
)
|
||||
|
||||
# Flip to spec.suspended to False from HelmRepository
|
||||
self._kube.get_transform_patch_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_REPO_GROUP,
|
||||
constants.FLUXCD_CRD_HELM_REPO_VERSION,
|
||||
namespace,
|
||||
constants.FLUXCD_CRD_HELM_REPO_PLURAL,
|
||||
helm_repo_name,
|
||||
_patch_flux_suspend_false
|
||||
)
|
||||
except Exception:
|
||||
return attempt, True
|
||||
|
||||
# Force HelmChart reconciliation now
|
||||
try:
|
||||
# Flip to spec.suspended to True from HelmChart
|
||||
self._kube.get_transform_patch_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_CHART_GROUP,
|
||||
constants.FLUXCD_CRD_HELM_CHART_VERSION,
|
||||
namespace,
|
||||
constants.FLUXCD_CRD_HELM_CHART_PLURAL,
|
||||
helm_chart_name,
|
||||
_patch_flux_suspend_true
|
||||
)
|
||||
|
||||
# Flip to spec.suspended to False from HelmChart
|
||||
self._kube.get_transform_patch_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_CHART_GROUP,
|
||||
constants.FLUXCD_CRD_HELM_CHART_VERSION,
|
||||
namespace,
|
||||
constants.FLUXCD_CRD_HELM_CHART_PLURAL,
|
||||
helm_chart_name,
|
||||
_patch_flux_suspend_false
|
||||
)
|
||||
except Exception:
|
||||
return attempt, True
|
||||
|
||||
# Force HelmRelease reconciliation now, saves up to reconciliation
|
||||
# timeout for the specific resource. Same trigger as with HelmChart.
|
||||
try:
|
||||
# Flip to spec.suspended to True from HelmRelease
|
||||
self._kube.get_transform_patch_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_REL_GROUP,
|
||||
constants.FLUXCD_CRD_HELM_REL_VERSION,
|
||||
namespace,
|
||||
constants.FLUXCD_CRD_HELM_REL_PLURAL,
|
||||
helm_release_name,
|
||||
_patch_flux_suspend_true
|
||||
)
|
||||
|
||||
# Flip to spec.suspended to False from HelmRelease
|
||||
self._kube.get_transform_patch_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_REL_GROUP,
|
||||
constants.FLUXCD_CRD_HELM_REL_VERSION,
|
||||
namespace,
|
||||
constants.FLUXCD_CRD_HELM_REL_PLURAL,
|
||||
helm_release_name,
|
||||
_patch_flux_suspend_false
|
||||
)
|
||||
except Exception:
|
||||
return attempt, True
|
||||
|
||||
return attempt, False
|
||||
|
||||
def _recover_from_helm_operation_in_progress_on_app_apply(metadata_name, namespace,
|
||||
flux_error_message):
|
||||
""" Recovery logic for FluxCD on apply
|
||||
|
@ -1915,29 +1782,6 @@ class AppOperator(object):
|
|||
secret.metadata.namespace, err))
|
||||
return attempt, True
|
||||
|
||||
# Force HelmRelease reconciliation now, saves up to reconciliation
|
||||
# timeout for the specific resource. Flip suspend True, then False.
|
||||
try:
|
||||
self._kube.get_transform_patch_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_REL_GROUP,
|
||||
constants.FLUXCD_CRD_HELM_REL_VERSION,
|
||||
namespace,
|
||||
constants.FLUXCD_CRD_HELM_REL_PLURAL,
|
||||
helm_release_name,
|
||||
_patch_flux_suspend_true
|
||||
)
|
||||
|
||||
self._kube.get_transform_patch_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_REL_GROUP,
|
||||
constants.FLUXCD_CRD_HELM_REL_VERSION,
|
||||
namespace,
|
||||
constants.FLUXCD_CRD_HELM_REL_PLURAL,
|
||||
helm_release_name,
|
||||
_patch_flux_suspend_false
|
||||
)
|
||||
except Exception:
|
||||
return attempt, True
|
||||
|
||||
return attempt, False
|
||||
|
||||
def _check_progress():
|
||||
|
@ -1983,12 +1827,6 @@ class AppOperator(object):
|
|||
self._update_app_status(app, new_progress=progress_str)
|
||||
|
||||
for release_name, chart_obj in list(charts.items()):
|
||||
# Attempt to recover HelmCharts in some Failed states
|
||||
_recover_from_failed_helm_chart_on_app_apply(
|
||||
metadata_name=release_name,
|
||||
namespace=chart_obj['namespace'],
|
||||
helm_repo_name=chart_obj['helm_repo_name'])
|
||||
|
||||
# Request the helm release info
|
||||
helm_rel = self._kube.get_custom_resource(
|
||||
constants.FLUXCD_CRD_HELM_REL_GROUP,
|
||||
|
@ -4934,20 +4772,6 @@ class FluxCDHelper(object):
|
|||
|
||||
return True
|
||||
|
||||
def extract_helm_chart_status(self, helm_chart_dict):
|
||||
"""helm_chart_dict is of the form returned by _kube.get_custom_resource().
|
||||
Returns: message of first status.condition
|
||||
"""
|
||||
if not helm_chart_dict:
|
||||
return ''
|
||||
elif 'status' in helm_chart_dict and \
|
||||
'conditions' in helm_chart_dict['status'] and \
|
||||
len(helm_chart_dict['status']['conditions']) > 0 and \
|
||||
'message' in helm_chart_dict['status']['conditions'][0]:
|
||||
return helm_chart_dict['status']['conditions'][0]['message']
|
||||
else:
|
||||
return ''
|
||||
|
||||
# TODO (lfagunde):
|
||||
# Some methods in this class receive helm_chart_dict as a parameter.
|
||||
# Can move the call to _kube.get_custom_resource() into these functions
|
||||
|
|
Loading…
Reference in New Issue