Merge "Add more info to alarms and progress messages"

This commit is contained in:
Zuul 2023-05-16 15:21:59 +00:00 committed by Gerrit Code Review
commit 0f5d272208
2 changed files with 37 additions and 13 deletions

View File

@ -1956,7 +1956,7 @@ APP_PROGRESS_APPLY_MANIFEST = 'applying application manifest'
APP_PROGRESS_COMPLETED = 'completed'
APP_PROGRESS_DELETE_MANIFEST = 'deleting application manifest'
APP_PROGRESS_DOWNLOAD_IMAGES = 'retrieving docker images'
APP_PROGRESS_IMAGES_DOWNLOAD_FAILED = 'failed to download one or more image(s).'
APP_PROGRESS_IMAGES_DOWNLOAD_FAILED = 'Failed to download images. Check sysinv and daemon logs for details.'
APP_PROGRESS_EXTRACT_TARFILE = 'extracting application tar file'
APP_PROGRESS_GENERATE_OVERRIDES = 'generating application overrides'
APP_PROGRESS_TARFILE_DOWNLOAD = 'downloading tarfile'

View File

@ -298,6 +298,10 @@ class AppOperator(object):
progress=constants.APP_PROGRESS_ABORTED,
user_initiated=False, reset_status=False,
forced_operation=False):
# Adds the app object error message if it exists
progress = "{}: {}".format(app.error_message, progress)
app.clear_error_message()
if user_initiated:
progress = constants.APP_PROGRESS_ABORTED_BY_USER
@ -762,7 +766,7 @@ class AppOperator(object):
pool = greenpool.GreenPool(size=threads)
for tag, success in pool.imap(
functools.partial(self._docker.download_an_image,
app.name,
app,
registries_info),
images_to_download):
if success:
@ -795,6 +799,12 @@ class AppOperator(object):
"after %d seconds", app.name, wait_before_retry)
time.sleep(wait_before_retry)
else:
# Clears the error cache caused by failure to download one or more images
# in 'def download_an_image'. At this point it wasn't just one image that
# failed, but all of them. The 'raise' below already reports the error
# correctly.
app.clear_error_message()
raise exception.KubeAppApplyFailure(
name=app.name,
version=app.version,
@ -1591,25 +1601,29 @@ class AppOperator(object):
if release_status == "False":
# If the helm release failed the app must also be in a
# failed state
err_msg = "{}".format(msg) if msg else ""
helm_err_msg = "{}".format(msg) if msg else ""
# Handle corner cases in which retries are exhausted due to another operation in progress.
# If retries are exhausted we fail.
if _check_upgrade_retries_exhausted(helm_rel, err_msg):
if _check_upgrade_retries_exhausted(helm_rel, helm_err_msg):
return False
attempt, _ = _recover_from_helm_operation_in_progress_on_app_apply(
metadata_name=release_name,
namespace=chart_obj['namespace'],
flux_error_message=err_msg)
flux_error_message=helm_err_msg)
if not attempt:
# Handle corner cases in which application removal
# and apply are required to recover from failure
_recover_via_removal(release_name, err_msg)
_recover_via_removal(release_name, helm_err_msg)
LOG.exception("Application {}: release {}: Failed during {} :{}"
"".format(app.name, release_name, request, err_msg))
"".format(app.name, release_name, request, helm_err_msg))
# Store the error in the app object for use in def _abort_operation
app.update_error_message("Failed to apply helm "
"release \"{}\".".format(release_name))
return False
elif release_status == "True":
# Special validation check needed for AIO-SX only, can
@ -1789,9 +1803,10 @@ class AppOperator(object):
old_app, constants.APP_APPLY_FAILURE,
constants.APP_PROGRESS_UPDATE_ABORTED.format(old_app.version, new_app.version) +
constants.APP_PROGRESS_RECOVER_ABORTED.format(old_app.version) +
old_app.error_message +
'Please check logs for details.')
LOG.error("Application %s recover to version %s aborted!"
% (old_app.name, old_app.version))
% (old_app.name, old_app.version))
def _perform_app_rollback(self, from_app, to_app):
"""Perform application rollback request
@ -2566,8 +2581,7 @@ class AppOperator(object):
self._abort_operation(app, constants.APP_APPLY_OP,
user_initiated=True)
else:
self._abort_operation(app, constants.APP_APPLY_OP,
constants.APP_PROGRESS_ABORTED)
self._abort_operation(app, constants.APP_APPLY_OP, e)
if not caller:
# If apply is not called from update method, deregister the app's
@ -3160,6 +3174,7 @@ class AppOperator(object):
self.patch_dependencies = []
self.charts = []
self.releases = []
self.error_message = ""
@property
def system_app(self):
@ -3218,6 +3233,12 @@ class AppOperator(object):
self._kube_app.save()
return was_active
def update_error_message(self, new_error_message):
self.error_message = new_error_message
def clear_error_message(self):
self.error_message = ""
def regenerate_manifest_filename(self, new_mname, new_mfile):
self._kube_app.manifest_name = new_mname
self._kube_app.manifest_file = new_mfile
@ -3379,14 +3400,14 @@ class DockerHelper(object):
# must be unauthenticated in this case.)
return pub_img_tag, None
def download_an_image(self, app_name, registries_info, img_tag):
def download_an_image(self, app, registries_info, img_tag):
rc = True
start = time.time()
if img_tag.startswith(constants.DOCKER_REGISTRY_HOST):
try:
if AppOperator.is_app_aborted(app_name):
if AppOperator.is_app_aborted(app.name):
LOG.info("User aborted. Skipping download of image %s " % img_tag)
return img_tag, False
@ -3412,6 +3433,9 @@ class DockerHelper(object):
client.pull(target_img_tag, auth_config=registry_auth)
except Exception as e:
# Store the error in the app object for use in def _abort_operation
app.update_error_message("Failed to download image: " + target_img_tag)
rc = False
LOG.error("Image %s download failed from public/private"
"registry: %s" % (img_tag, e))