Fix misleading app status after failed override update

Application status was misleading after a failed override update with
illegal values. Application should be in failed (apply-failed) state,
and alarm should be raised accordingly. Instead, we're led to believe
that the update was completed successfully.

The solution consists of adding a default delay to the system of 60
seconds before changing the helmrelease status. This way we ensure
that reconciliation has already been called.

This also ensures that any application can override this default
value via metadata. Just create a variable with the same name with
the amount of time that is needed.

Test Plan:
PASS: Build-pkgs && build-image
PASS: Upload, apply, delete and update nginx-ingress-controller
PASS: Upload, apply, delete and update platform-integ-apps
PASS: Upload, apply, delete and update metrics-server
PASS: Update user overrides (system user-override-update) with illegal
      values. When reapplying the app it should fail.
PASS: Update user overrides (system user-override-update) with correct
      values. When reapplying the app it should complete successfully.
PASS: If the app has the fluxcd_hr_reconcile_check_delay key in its
      metadata, the system's default delay value must be overwritten.

Closes-Bug: 2053276

Change-Id: I5e75745009be235e2646a79764cb4ff619a93d59
Signed-off-by: David Bastos <david.barbosabastos@windriver.com>
This commit is contained in:
David Bastos 2024-02-12 15:51:59 -03:00 committed by David Barbosa Bastos
parent a95d95b770
commit ce4b7c1eb3
3 changed files with 59 additions and 4 deletions

View File

@ -20,3 +20,6 @@ rabbit_port = 5672
[lldp]
drivers=lldpd
[app_framework]
fluxcd_hr_reconcile_check_delay=60

View File

@ -41,6 +41,7 @@ from eventlet import Timeout
from fm_api import constants as fm_constants
from fm_api import fm_api
from oslo_log import log as logging
from oslo_config import cfg
from oslo_serialization import base64
from sysinv._i18n import _
from sysinv.api.controllers.v1 import kube_app
@ -57,6 +58,8 @@ from sysinv.helm import utils as helm_utils
from sysinv.helm.lifecycle_constants import LifecycleConstants
from sysinv.helm.lifecycle_hook import LifecycleHookInfo
CONF = cfg.CONF
# Log and config
LOG = logging.getLogger(__name__)
@ -1632,6 +1635,38 @@ class AppOperator(object):
raise exception.ApplicationApplyFailure(name=app.name)
def _apply_delay_to_check_progress(kubectl_apply_time):
""" Apply delay before checking the charts installation progress.
The purpose of this is to ensure that reconciliation has been
initiated before checking the helmrelease status of each chart.
The variable fluxcd_hr_reconcile_check_delay has been set in
sysinv.conf to the default amount of time. This value will be
override if this variable is present in the application's
metadata.
:param kubectl_apply_time: date when "kubectl apply -k" was called
"""
fluxcd_hr_reconcile_check_delay = CONF.app_framework.fluxcd_hr_reconcile_check_delay
# Override default time delay if app metadata have
# fluxcd_hr_reconcile_check_delay variable
if "fluxcd_hr_reconcile_check_delay" in app.app_metadata:
fluxcd_hr_reconcile_check_delay = app.app_metadata[
"fluxcd_hr_reconcile_check_delay"]
# If fluxcd_hr_reconcile_check_delay is 0, no delay will need to be added
# and the _check_progress function must be called normally.
if fluxcd_hr_reconcile_check_delay > 0:
now = time.time()
elapsed_time = now - kubectl_apply_time
delay_time = fluxcd_hr_reconcile_check_delay - elapsed_time
if fluxcd_hr_reconcile_check_delay > elapsed_time:
LOG.info("Waiting {} seconds to make sure "
"the reconciliation was called".format(int(delay_time)))
time.sleep(delay_time)
def _check_progress():
tadjust = 0
last_successful_chart = None
@ -1755,10 +1790,17 @@ class AppOperator(object):
with Timeout(constants.APP_INSTALLATION_TIMEOUT,
exception.KubeAppProgressMonitorTimeout()):
rc = self._fluxcd.make_fluxcd_operation(request, app.sync_fluxcd_manifest)
rc, kubectl_apply_time = self._fluxcd.make_fluxcd_operation(
request,
app.sync_fluxcd_manifest)
# check progress only for apply for now
if rc and request == constants.APP_APPLY_OP:
# Because reconciliation is not called immediately after running the
# "kubectl apply -k" command, a delay is applied before _check_progress
# function so that it does not return old values.
_apply_delay_to_check_progress(kubectl_apply_time)
rc = _check_progress()
except (exception.ApplicationApplyFailure):
raise
@ -2002,8 +2044,9 @@ class AppOperator(object):
manifest_sync_dir_path, constants.APP_METADATA_FILE)
shutil.copy(inst_metadata_file, sync_metadata_file)
if not validate_function(constants.APP_VALIDATE_OP,
validate_manifest):
validation_result, _ = validate_function(constants.APP_VALIDATE_OP,
validate_manifest)
if not validation_result:
raise exception.KubeAppUploadFailure(
name=app.name,
version=app.version,
@ -4005,7 +4048,9 @@ class FluxCDHelper(object):
LOG.error("FluxCD operation %s failed for manifest %s : %s" %
(operation, manifest_dir, e))
rc = False
return rc
kubectl_apply_time = time.time()
return rc, kubectl_apply_time
def run_kubectl_kustomize(self, operation_type, manifest_dir):
if operation_type == constants.KUBECTL_KUSTOMIZE_VALIDATE:

View File

@ -183,9 +183,16 @@ audit_intervals_opts = [
cfg.IntOpt('prune_runtime_config', default=43200),
]
app_framework_opts = [
cfg.IntOpt('fluxcd_hr_reconcile_check_delay',
default=60,
help='Delay time to check progress of helmrelease'),
]
CONF = cfg.CONF
CONF.register_opts(conductor_opts, 'conductor')
CONF.register_opts(audit_intervals_opts, 'conductor_periodic_task_intervals')
CONF.register_opts(app_framework_opts, 'app_framework')
# doesn't work otherwise for ceph-manager RPC calls; reply is lost
#