Add more info to alarms and progress messages

Improve the progress messages shown when an app operation fails. It is
now possible to tell whether one specific image failed to download or
whether all image downloads failed. The progress column also shows
which specific Helm chart failed, if any.

Test Plan:
PASS: Shows a specific error message when all docker images fail to
download
PASS: Shows the individual image that failed to download when trying
to apply the app
PASS: Shows the name of the chart that caused the app to abort

Story: 2010736
Task: 47964

Signed-off-by: David Bastos <david.barbosabastos@windriver.com>
Change-Id: If953120852ad7812971adebf23a675ca2134cca1
This commit is contained in:
David Barbosa Bastos 2023-05-02 15:13:17 +00:00
parent ceb5852fce
commit 6a95a0eb42
2 changed files with 37 additions and 13 deletions

View File

@ -1956,7 +1956,7 @@ APP_PROGRESS_APPLY_MANIFEST = 'applying application manifest'
APP_PROGRESS_COMPLETED = 'completed'
APP_PROGRESS_DELETE_MANIFEST = 'deleting application manifest'
APP_PROGRESS_DOWNLOAD_IMAGES = 'retrieving docker images'
APP_PROGRESS_IMAGES_DOWNLOAD_FAILED = 'failed to download one or more image(s).'
APP_PROGRESS_IMAGES_DOWNLOAD_FAILED = 'Failed to download images. Check sysinv and daemon logs for details.'
APP_PROGRESS_EXTRACT_TARFILE = 'extracting application tar file'
APP_PROGRESS_GENERATE_OVERRIDES = 'generating application overrides'
APP_PROGRESS_TARFILE_DOWNLOAD = 'downloading tarfile'

View File

@ -298,6 +298,10 @@ class AppOperator(object):
progress=constants.APP_PROGRESS_ABORTED,
user_initiated=False, reset_status=False,
forced_operation=False):
# Adds the app object error message if it exists
progress = "{}: {}".format(app.error_message, progress)
app.clear_error_message()
if user_initiated:
progress = constants.APP_PROGRESS_ABORTED_BY_USER
@ -761,7 +765,7 @@ class AppOperator(object):
pool = greenpool.GreenPool(size=threads)
for tag, success in pool.imap(
functools.partial(self._docker.download_an_image,
app.name,
app,
registries_info),
images_to_download):
if success:
@ -794,6 +798,12 @@ class AppOperator(object):
"after %d seconds", app.name, wait_before_retry)
time.sleep(wait_before_retry)
else:
# Clears the error cache caused by failure to download one or more images
# in 'def download_an_image'. At this point it wasn't just one image that
# failed, but all of them. The 'raise' below already reports the error
# correctly.
app.clear_error_message()
raise exception.KubeAppApplyFailure(
name=app.name,
version=app.version,
@ -1590,25 +1600,29 @@ class AppOperator(object):
if release_status == "False":
# If the helm release failed the app must also be in a
# failed state
err_msg = "{}".format(msg) if msg else ""
helm_err_msg = "{}".format(msg) if msg else ""
# Handle corner cases in which retries are exhausted due to another operation in progress.
# If retries are exhausted we fail.
if _check_upgrade_retries_exhausted(helm_rel, err_msg):
if _check_upgrade_retries_exhausted(helm_rel, helm_err_msg):
return False
attempt, _ = _recover_from_helm_operation_in_progress_on_app_apply(
metadata_name=release_name,
namespace=chart_obj['namespace'],
flux_error_message=err_msg)
flux_error_message=helm_err_msg)
if not attempt:
# Handle corner cases in which application removal
# and apply are required to recover from failure
_recover_via_removal(release_name, err_msg)
_recover_via_removal(release_name, helm_err_msg)
LOG.exception("Application {}: release {}: Failed during {} :{}"
"".format(app.name, release_name, request, err_msg))
"".format(app.name, release_name, request, helm_err_msg))
# Store the error in the app object for use in def _abort_operation
app.update_error_message("Failed to apply helm "
"release \"{}\".".format(release_name))
return False
elif release_status == "True":
# Special validation check needed for AIO-SX only, can
@ -1778,9 +1792,10 @@ class AppOperator(object):
old_app, constants.APP_APPLY_FAILURE,
constants.APP_PROGRESS_UPDATE_ABORTED.format(old_app.version, new_app.version) +
constants.APP_PROGRESS_RECOVER_ABORTED.format(old_app.version) +
old_app.error_message +
'Please check logs for details.')
LOG.error("Application %s recover to version %s aborted!"
% (old_app.name, old_app.version))
% (old_app.name, old_app.version))
def _perform_app_rollback(self, from_app, to_app):
"""Perform application rollback request
@ -2555,8 +2570,7 @@ class AppOperator(object):
self._abort_operation(app, constants.APP_APPLY_OP,
user_initiated=True)
else:
self._abort_operation(app, constants.APP_APPLY_OP,
constants.APP_PROGRESS_ABORTED)
self._abort_operation(app, constants.APP_APPLY_OP, e)
if not caller:
# If apply is not called from update method, deregister the app's
@ -3134,6 +3148,7 @@ class AppOperator(object):
self.patch_dependencies = []
self.charts = []
self.releases = []
self.error_message = ""
@property
def system_app(self):
@ -3192,6 +3207,12 @@ class AppOperator(object):
self._kube_app.save()
return was_active
# Store new_error_message on this app object, replacing any message
# that was previously recorded in self.error_message.
def update_error_message(self, new_error_message):
self.error_message = new_error_message
# Reset the stored error message on this app object to the empty string.
def clear_error_message(self):
self.error_message = ""
def regenerate_manifest_filename(self, new_mname, new_mfile):
self._kube_app.manifest_name = new_mname
self._kube_app.manifest_file = new_mfile
@ -3353,14 +3374,14 @@ class DockerHelper(object):
# must be unauthenticated in this case.)
return pub_img_tag, None
def download_an_image(self, app_name, registries_info, img_tag):
def download_an_image(self, app, registries_info, img_tag):
rc = True
start = time.time()
if img_tag.startswith(constants.DOCKER_REGISTRY_HOST):
try:
if AppOperator.is_app_aborted(app_name):
if AppOperator.is_app_aborted(app.name):
LOG.info("User aborted. Skipping download of image %s " % img_tag)
return img_tag, False
@ -3386,6 +3407,9 @@ class DockerHelper(object):
client.pull(target_img_tag, auth_config=registry_auth)
except Exception as e:
# Store the error in the app object for use in def _abort_operation
app.update_error_message("Failed to download image: " + target_img_tag)
rc = False
LOG.error("Image %s download failed from public/private"
"registry: %s" % (img_tag, e))