Fix misleading app status after failed override update

Application status was misleading after a failed override update with illegal values. Application should be in failed (apply-failed) state, and alarm should be raised accordingly. Instead, we're led to believe that the update was completed successfully. The solution consists of adding a default delay to the system of 60 seconds before changing the helmrelease status. This way we ensure that reconciliation has already been called. This also ensures that any application can override this default value via metadata. Just create a variable with the same name with the amount of time that is needed. Test Plan: PASS: Build-pkgs && build-image PASS: Upload, apply, delete and update nginx-ingress-controller PASS: Upload, apply, delete and update platform-integ-apps PASS: Upload, apply, delete and update metrics-server PASS: Update user overrides (system user-override-update) with illegal values. When reapplying the app it should fail. PASS: Update user overrides (system user-override-update) with correct values. When reapplying the app it should complete successfully. PASS: If the app has the fluxcd_hr_reconcile_check_delay key in its metadata, the system's default delay value must be overwritten. Closes-Bug: 2053276 Change-Id: I5e75745009be235e2646a79764cb4ff619a93d59 Signed-off-by: David Bastos <david.barbosabastos@windriver.com>
2024-02-12 15:51:59 -03:00 · 2024-02-12 15:51:59 -03:00 · ce4b7c1eb3
parent a95d95b770
commit ce4b7c1eb3
3 changed files with 59 additions and 4 deletions
--- a/sysinv/sysinv/sysinv/etc/sysinv/sysinv.conf
+++ b/sysinv/sysinv/sysinv/etc/sysinv/sysinv.conf
@ -20,3 +20,6 @@ rabbit_port = 5672

 [lldp]
 drivers=lldpd
+
+[app_framework]
+fluxcd_hr_reconcile_check_delay=60
--- a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py
@ -41,6 +41,7 @@ from eventlet import Timeout
 from fm_api import constants as fm_constants
 from fm_api import fm_api
 from oslo_log import log as logging
+from oslo_config import cfg
 from oslo_serialization import base64
 from sysinv._i18n import _
 from sysinv.api.controllers.v1 import kube_app
@ -57,6 +58,8 @@ from sysinv.helm import utils as helm_utils
 from sysinv.helm.lifecycle_constants import LifecycleConstants
 from sysinv.helm.lifecycle_hook import LifecycleHookInfo

+CONF = cfg.CONF
+

 # Log and config
 LOG = logging.getLogger(__name__)
@ -1632,6 +1635,38 @@ class AppOperator(object):

                    raise exception.ApplicationApplyFailure(name=app.name)

+        def _apply_delay_to_check_progress(kubectl_apply_time):
+            """ Apply delay before checking the charts installation progress.
+                The purpose of this is to ensure that reconciliation has been
+                initiated before checking the helmrelease status of each chart.
+
+                The variable fluxcd_hr_reconcile_check_delay has been set in
+                sysinv.conf to the default amount of time. This value will be
+                override if this variable is present in the application's
+                metadata.
+
+                :param kubectl_apply_time: date when "kubectl apply -k" was called
+            """
+            fluxcd_hr_reconcile_check_delay = CONF.app_framework.fluxcd_hr_reconcile_check_delay
+
+            # Override default time delay if app metadata have
+            # fluxcd_hr_reconcile_check_delay variable
+            if "fluxcd_hr_reconcile_check_delay" in app.app_metadata:
+                fluxcd_hr_reconcile_check_delay = app.app_metadata[
+                    "fluxcd_hr_reconcile_check_delay"]
+
+            # If fluxcd_hr_reconcile_check_delay is 0, no delay will need to be added
+            # and the _check_progress function must be called normally.
+            if fluxcd_hr_reconcile_check_delay > 0:
+                now = time.time()
+                elapsed_time = now - kubectl_apply_time
+                delay_time = fluxcd_hr_reconcile_check_delay - elapsed_time
+
+                if fluxcd_hr_reconcile_check_delay > elapsed_time:
+                    LOG.info("Waiting {} seconds to make sure "
+                             "the reconciliation was called".format(int(delay_time)))
+                    time.sleep(delay_time)
+
        def _check_progress():
            tadjust = 0
            last_successful_chart = None
@ -1755,10 +1790,17 @@ class AppOperator(object):
            with Timeout(constants.APP_INSTALLATION_TIMEOUT,
                         exception.KubeAppProgressMonitorTimeout()):

-                rc = self._fluxcd.make_fluxcd_operation(request, app.sync_fluxcd_manifest)
+                rc, kubectl_apply_time = self._fluxcd.make_fluxcd_operation(
+                    request,
+                    app.sync_fluxcd_manifest)

                # check progress only for apply for now
                if rc and request == constants.APP_APPLY_OP:
+                    # Because reconciliation is not called immediately after running the
+                    # "kubectl apply -k" command, a delay is applied before _check_progress
+                    # function so that it does not return old values.
+                    _apply_delay_to_check_progress(kubectl_apply_time)
+
                    rc = _check_progress()
        except (exception.ApplicationApplyFailure):
            raise
@ -2002,8 +2044,9 @@ class AppOperator(object):
                    manifest_sync_dir_path, constants.APP_METADATA_FILE)
                shutil.copy(inst_metadata_file, sync_metadata_file)

-            if not validate_function(constants.APP_VALIDATE_OP,
-                                     validate_manifest):
+            validation_result, _ = validate_function(constants.APP_VALIDATE_OP,
+                                                     validate_manifest)
+            if not validation_result:
                raise exception.KubeAppUploadFailure(
                    name=app.name,
                    version=app.version,
@ -4005,7 +4048,9 @@ class FluxCDHelper(object):
            LOG.error("FluxCD operation %s failed for manifest %s : %s" %
                      (operation, manifest_dir, e))
            rc = False
-        return rc
+
+        kubectl_apply_time = time.time()
+        return rc, kubectl_apply_time

    def run_kubectl_kustomize(self, operation_type, manifest_dir):
        if operation_type == constants.KUBECTL_KUSTOMIZE_VALIDATE:
--- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
@ -183,9 +183,16 @@ audit_intervals_opts = [
       cfg.IntOpt('prune_runtime_config', default=43200),
                  ]

+app_framework_opts = [
+    cfg.IntOpt('fluxcd_hr_reconcile_check_delay',
+        default=60,
+        help='Delay time to check progress of helmrelease'),
+]
+
 CONF = cfg.CONF
 CONF.register_opts(conductor_opts, 'conductor')
 CONF.register_opts(audit_intervals_opts, 'conductor_periodic_task_intervals')
+CONF.register_opts(app_framework_opts, 'app_framework')

 # doesn't work otherwise for ceph-manager RPC calls; reply is lost
 #