From ce4b7c1eb328c8c6bc443da4fd5b241f5384b207 Mon Sep 17 00:00:00 2001 From: David Bastos Date: Mon, 12 Feb 2024 15:51:59 -0300 Subject: [PATCH] Fix misleading app status after failed override update Application status was misleading after a failed override update with illegal values. Application should be in failed (apply-failed) state, and alarm should be raised accordingly. Instead, we're led to believe that the update was completed successfully. The solution consists of adding a default delay to the system of 60 seconds before changing the helmrelease status. This way we ensure that reconciliation has already been called. This also ensures that any application can override this default value via metadata. Just create a variable with the same name with the amount of time that is needed. Test Plan: PASS: Build-pkgs && build-image PASS: Upload, apply, delete and update nginx-ingress-controller PASS: Upload, apply, delete and update platform-integ-apps PASS: Upload, apply, delete and update metrics-server PASS: Update user overrides (system user-override-update) with illegal values. When reapplying the app it should fail. PASS: Update user overrides (system user-override-update) with correct values. When reapplying the app it should complete successfully. PASS: If the app has the fluxcd_hr_reconcile_check_delay key in its metadata, the system's default delay value must be overwritten. Closes-Bug: 2053276 Change-Id: I5e75745009be235e2646a79764cb4ff619a93d59 Signed-off-by: David Bastos --- sysinv/sysinv/sysinv/etc/sysinv/sysinv.conf | 3 ++ .../sysinv/sysinv/conductor/kube_app.py | 53 +++++++++++++++++-- .../sysinv/sysinv/sysinv/conductor/manager.py | 7 +++ 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/sysinv/sysinv/sysinv/etc/sysinv/sysinv.conf b/sysinv/sysinv/sysinv/etc/sysinv/sysinv.conf index 97e5b4aa76..5e8899abcf 100644 --- a/sysinv/sysinv/sysinv/etc/sysinv/sysinv.conf +++ b/sysinv/sysinv/sysinv/etc/sysinv/sysinv.conf @@ -20,3 +20,6 @@ rabbit_port = 5672 [lldp] drivers=lldpd + +[app_framework] +fluxcd_hr_reconcile_check_delay=60 diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py index b447f201d6..65ae34d751 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py @@ -41,6 +41,7 @@ from eventlet import Timeout from fm_api import constants as fm_constants from fm_api import fm_api from oslo_log import log as logging +from oslo_config import cfg from oslo_serialization import base64 from sysinv._i18n import _ from sysinv.api.controllers.v1 import kube_app @@ -57,6 +58,8 @@ from sysinv.helm import utils as helm_utils from sysinv.helm.lifecycle_constants import LifecycleConstants from sysinv.helm.lifecycle_hook import LifecycleHookInfo +CONF = cfg.CONF + # Log and config LOG = logging.getLogger(__name__) @@ -1632,6 +1635,38 @@ class AppOperator(object): raise exception.ApplicationApplyFailure(name=app.name) + def _apply_delay_to_check_progress(kubectl_apply_time): + """ Apply delay before checking the charts installation progress. + The purpose of this is to ensure that reconciliation has been + initiated before checking the helmrelease status of each chart. + + The variable fluxcd_hr_reconcile_check_delay has been set in + sysinv.conf to the default amount of time. This value will be + override if this variable is present in the application's + metadata. + + :param kubectl_apply_time: date when "kubectl apply -k" was called + """ + fluxcd_hr_reconcile_check_delay = CONF.app_framework.fluxcd_hr_reconcile_check_delay + + # Override default time delay if app metadata have + # fluxcd_hr_reconcile_check_delay variable + if "fluxcd_hr_reconcile_check_delay" in app.app_metadata: + fluxcd_hr_reconcile_check_delay = app.app_metadata[ + "fluxcd_hr_reconcile_check_delay"] + + # If fluxcd_hr_reconcile_check_delay is 0, no delay will need to be added + # and the _check_progress function must be called normally. + if fluxcd_hr_reconcile_check_delay > 0: + now = time.time() + elapsed_time = now - kubectl_apply_time + delay_time = fluxcd_hr_reconcile_check_delay - elapsed_time + + if fluxcd_hr_reconcile_check_delay > elapsed_time: + LOG.info("Waiting {} seconds to make sure " + "the reconciliation was called".format(int(delay_time))) + time.sleep(delay_time) + def _check_progress(): tadjust = 0 last_successful_chart = None @@ -1755,10 +1790,17 @@ class AppOperator(object): with Timeout(constants.APP_INSTALLATION_TIMEOUT, exception.KubeAppProgressMonitorTimeout()): - rc = self._fluxcd.make_fluxcd_operation(request, app.sync_fluxcd_manifest) + rc, kubectl_apply_time = self._fluxcd.make_fluxcd_operation( + request, + app.sync_fluxcd_manifest) # check progress only for apply for now if rc and request == constants.APP_APPLY_OP: + # Because reconciliation is not called immediately after running the + # "kubectl apply -k" command, a delay is applied before _check_progress + # function so that it does not return old values. + _apply_delay_to_check_progress(kubectl_apply_time) + rc = _check_progress() except (exception.ApplicationApplyFailure): raise @@ -2002,8 +2044,9 @@ class AppOperator(object): manifest_sync_dir_path, constants.APP_METADATA_FILE) shutil.copy(inst_metadata_file, sync_metadata_file) - if not validate_function(constants.APP_VALIDATE_OP, - validate_manifest): + validation_result, _ = validate_function(constants.APP_VALIDATE_OP, + validate_manifest) + if not validation_result: raise exception.KubeAppUploadFailure( name=app.name, version=app.version, @@ -4005,7 +4048,9 @@ class FluxCDHelper(object): LOG.error("FluxCD operation %s failed for manifest %s : %s" % (operation, manifest_dir, e)) rc = False - return rc + + kubectl_apply_time = time.time() + return rc, kubectl_apply_time def run_kubectl_kustomize(self, operation_type, manifest_dir): if operation_type == constants.KUBECTL_KUSTOMIZE_VALIDATE: diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index 000ec0541d..ab0e92e1ec 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -183,9 +183,16 @@ audit_intervals_opts = [ cfg.IntOpt('prune_runtime_config', default=43200), ] +app_framework_opts = [ + cfg.IntOpt('fluxcd_hr_reconcile_check_delay', + default=60, + help='Delay time to check progress of helmrelease'), +] + CONF = cfg.CONF CONF.register_opts(conductor_opts, 'conductor') CONF.register_opts(audit_intervals_opts, 'conductor_periodic_task_intervals') +CONF.register_opts(app_framework_opts, 'app_framework') # doesn't work otherwise for ceph-manager RPC calls; reply is lost #