Add more info to alarms and progress messages
Improve the progress messages reported when an app crashes. It is now possible to tell whether a specific image failed to download or whether all images failed to download. It is also possible to see in the progress column which specific Helm chart failed. Test Plan: PASS: Shows specific error message when failing to download all docker images PASS: Individually show the image that failed when trying to apply the app PASS: Shows the name of the chart that caused the app to abort Story: 2010736 Task: 47964 Signed-off-by: David Bastos <david.barbosabastos@windriver.com> Change-Id: If953120852ad7812971adebf23a675ca2134cca1
This commit is contained in:
parent
ceb5852fce
commit
6a95a0eb42
|
@ -1956,7 +1956,7 @@ APP_PROGRESS_APPLY_MANIFEST = 'applying application manifest'
|
||||||
APP_PROGRESS_COMPLETED = 'completed'
|
APP_PROGRESS_COMPLETED = 'completed'
|
||||||
APP_PROGRESS_DELETE_MANIFEST = 'deleting application manifest'
|
APP_PROGRESS_DELETE_MANIFEST = 'deleting application manifest'
|
||||||
APP_PROGRESS_DOWNLOAD_IMAGES = 'retrieving docker images'
|
APP_PROGRESS_DOWNLOAD_IMAGES = 'retrieving docker images'
|
||||||
APP_PROGRESS_IMAGES_DOWNLOAD_FAILED = 'failed to download one or more image(s).'
|
APP_PROGRESS_IMAGES_DOWNLOAD_FAILED = 'Failed to download images. Check sysinv and daemon logs for details.'
|
||||||
APP_PROGRESS_EXTRACT_TARFILE = 'extracting application tar file'
|
APP_PROGRESS_EXTRACT_TARFILE = 'extracting application tar file'
|
||||||
APP_PROGRESS_GENERATE_OVERRIDES = 'generating application overrides'
|
APP_PROGRESS_GENERATE_OVERRIDES = 'generating application overrides'
|
||||||
APP_PROGRESS_TARFILE_DOWNLOAD = 'downloading tarfile'
|
APP_PROGRESS_TARFILE_DOWNLOAD = 'downloading tarfile'
|
||||||
|
|
|
@ -298,6 +298,10 @@ class AppOperator(object):
|
||||||
progress=constants.APP_PROGRESS_ABORTED,
|
progress=constants.APP_PROGRESS_ABORTED,
|
||||||
user_initiated=False, reset_status=False,
|
user_initiated=False, reset_status=False,
|
||||||
forced_operation=False):
|
forced_operation=False):
|
||||||
|
# Adds the app object error message if it exists
|
||||||
|
progress = "{}: {}".format(app.error_message, progress)
|
||||||
|
app.clear_error_message()
|
||||||
|
|
||||||
if user_initiated:
|
if user_initiated:
|
||||||
progress = constants.APP_PROGRESS_ABORTED_BY_USER
|
progress = constants.APP_PROGRESS_ABORTED_BY_USER
|
||||||
|
|
||||||
|
@ -761,7 +765,7 @@ class AppOperator(object):
|
||||||
pool = greenpool.GreenPool(size=threads)
|
pool = greenpool.GreenPool(size=threads)
|
||||||
for tag, success in pool.imap(
|
for tag, success in pool.imap(
|
||||||
functools.partial(self._docker.download_an_image,
|
functools.partial(self._docker.download_an_image,
|
||||||
app.name,
|
app,
|
||||||
registries_info),
|
registries_info),
|
||||||
images_to_download):
|
images_to_download):
|
||||||
if success:
|
if success:
|
||||||
|
@ -794,6 +798,12 @@ class AppOperator(object):
|
||||||
"after %d seconds", app.name, wait_before_retry)
|
"after %d seconds", app.name, wait_before_retry)
|
||||||
time.sleep(wait_before_retry)
|
time.sleep(wait_before_retry)
|
||||||
else:
|
else:
|
||||||
|
# Clears the error cache caused by failure to download one or more images
|
||||||
|
# in 'def download_an_image'. At this point it wasn't just one image that
|
||||||
|
# failed, but all of them. The 'raise' below already reports the error
|
||||||
|
# correctly.
|
||||||
|
app.clear_error_message()
|
||||||
|
|
||||||
raise exception.KubeAppApplyFailure(
|
raise exception.KubeAppApplyFailure(
|
||||||
name=app.name,
|
name=app.name,
|
||||||
version=app.version,
|
version=app.version,
|
||||||
|
@ -1590,25 +1600,29 @@ class AppOperator(object):
|
||||||
if release_status == "False":
|
if release_status == "False":
|
||||||
# If the helm release failed the app must also be in a
|
# If the helm release failed the app must also be in a
|
||||||
# failed state
|
# failed state
|
||||||
err_msg = "{}".format(msg) if msg else ""
|
helm_err_msg = "{}".format(msg) if msg else ""
|
||||||
|
|
||||||
# Handle corner cases in which retries are exhausted due to another operation in progress.
|
# Handle corner cases in which retries are exhausted due to another operation in progress.
|
||||||
# If retries are exhausted we fail.
|
# If retries are exhausted we fail.
|
||||||
if _check_upgrade_retries_exhausted(helm_rel, err_msg):
|
if _check_upgrade_retries_exhausted(helm_rel, helm_err_msg):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
attempt, _ = _recover_from_helm_operation_in_progress_on_app_apply(
|
attempt, _ = _recover_from_helm_operation_in_progress_on_app_apply(
|
||||||
metadata_name=release_name,
|
metadata_name=release_name,
|
||||||
namespace=chart_obj['namespace'],
|
namespace=chart_obj['namespace'],
|
||||||
flux_error_message=err_msg)
|
flux_error_message=helm_err_msg)
|
||||||
|
|
||||||
if not attempt:
|
if not attempt:
|
||||||
# Handle corner cases in which application removal
|
# Handle corner cases in which application removal
|
||||||
# and apply are required to recover from failure
|
# and apply are required to recover from failure
|
||||||
_recover_via_removal(release_name, err_msg)
|
_recover_via_removal(release_name, helm_err_msg)
|
||||||
|
|
||||||
LOG.exception("Application {}: release {}: Failed during {} :{}"
|
LOG.exception("Application {}: release {}: Failed during {} :{}"
|
||||||
"".format(app.name, release_name, request, err_msg))
|
"".format(app.name, release_name, request, helm_err_msg))
|
||||||
|
|
||||||
|
# Store the error in the app object for use in def _abort_operation
|
||||||
|
app.update_error_message("Failed to apply helm "
|
||||||
|
"release \"{}\".".format(release_name))
|
||||||
|
|
||||||
return False
|
return False
|
||||||
elif release_status == "True":
|
elif release_status == "True":
|
||||||
# Special validation check needed for AIO-SX only, can
|
# Special validation check needed for AIO-SX only, can
|
||||||
|
@ -1778,9 +1792,10 @@ class AppOperator(object):
|
||||||
old_app, constants.APP_APPLY_FAILURE,
|
old_app, constants.APP_APPLY_FAILURE,
|
||||||
constants.APP_PROGRESS_UPDATE_ABORTED.format(old_app.version, new_app.version) +
|
constants.APP_PROGRESS_UPDATE_ABORTED.format(old_app.version, new_app.version) +
|
||||||
constants.APP_PROGRESS_RECOVER_ABORTED.format(old_app.version) +
|
constants.APP_PROGRESS_RECOVER_ABORTED.format(old_app.version) +
|
||||||
|
old_app.error_message +
|
||||||
'Please check logs for details.')
|
'Please check logs for details.')
|
||||||
LOG.error("Application %s recover to version %s aborted!"
|
LOG.error("Application %s recover to version %s aborted!"
|
||||||
% (old_app.name, old_app.version))
|
% (old_app.name, old_app.version))
|
||||||
|
|
||||||
def _perform_app_rollback(self, from_app, to_app):
|
def _perform_app_rollback(self, from_app, to_app):
|
||||||
"""Perform application rollback request
|
"""Perform application rollback request
|
||||||
|
@ -2555,8 +2570,7 @@ class AppOperator(object):
|
||||||
self._abort_operation(app, constants.APP_APPLY_OP,
|
self._abort_operation(app, constants.APP_APPLY_OP,
|
||||||
user_initiated=True)
|
user_initiated=True)
|
||||||
else:
|
else:
|
||||||
self._abort_operation(app, constants.APP_APPLY_OP,
|
self._abort_operation(app, constants.APP_APPLY_OP, e)
|
||||||
constants.APP_PROGRESS_ABORTED)
|
|
||||||
|
|
||||||
if not caller:
|
if not caller:
|
||||||
# If apply is not called from update method, deregister the app's
|
# If apply is not called from update method, deregister the app's
|
||||||
|
@ -3134,6 +3148,7 @@ class AppOperator(object):
|
||||||
self.patch_dependencies = []
|
self.patch_dependencies = []
|
||||||
self.charts = []
|
self.charts = []
|
||||||
self.releases = []
|
self.releases = []
|
||||||
|
self.error_message = ""
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def system_app(self):
|
def system_app(self):
|
||||||
|
@ -3192,6 +3207,12 @@ class AppOperator(object):
|
||||||
self._kube_app.save()
|
self._kube_app.save()
|
||||||
return was_active
|
return was_active
|
||||||
|
|
||||||
|
def update_error_message(self, new_error_message):
|
||||||
|
self.error_message = new_error_message
|
||||||
|
|
||||||
|
def clear_error_message(self):
|
||||||
|
self.error_message = ""
|
||||||
|
|
||||||
def regenerate_manifest_filename(self, new_mname, new_mfile):
|
def regenerate_manifest_filename(self, new_mname, new_mfile):
|
||||||
self._kube_app.manifest_name = new_mname
|
self._kube_app.manifest_name = new_mname
|
||||||
self._kube_app.manifest_file = new_mfile
|
self._kube_app.manifest_file = new_mfile
|
||||||
|
@ -3353,14 +3374,14 @@ class DockerHelper(object):
|
||||||
# must be unauthenticated in this case.)
|
# must be unauthenticated in this case.)
|
||||||
return pub_img_tag, None
|
return pub_img_tag, None
|
||||||
|
|
||||||
def download_an_image(self, app_name, registries_info, img_tag):
|
def download_an_image(self, app, registries_info, img_tag):
|
||||||
|
|
||||||
rc = True
|
rc = True
|
||||||
|
|
||||||
start = time.time()
|
start = time.time()
|
||||||
if img_tag.startswith(constants.DOCKER_REGISTRY_HOST):
|
if img_tag.startswith(constants.DOCKER_REGISTRY_HOST):
|
||||||
try:
|
try:
|
||||||
if AppOperator.is_app_aborted(app_name):
|
if AppOperator.is_app_aborted(app.name):
|
||||||
LOG.info("User aborted. Skipping download of image %s " % img_tag)
|
LOG.info("User aborted. Skipping download of image %s " % img_tag)
|
||||||
return img_tag, False
|
return img_tag, False
|
||||||
|
|
||||||
|
@ -3386,6 +3407,9 @@ class DockerHelper(object):
|
||||||
client.pull(target_img_tag, auth_config=registry_auth)
|
client.pull(target_img_tag, auth_config=registry_auth)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# Store the error in the app object for use in def _abort_operation
|
||||||
|
app.update_error_message("Failed to download image: " + target_img_tag)
|
||||||
|
|
||||||
rc = False
|
rc = False
|
||||||
LOG.error("Image %s download failed from public/private"
|
LOG.error("Image %s download failed from public/private"
|
||||||
"registry: %s" % (img_tag, e))
|
"registry: %s" % (img_tag, e))
|
||||||
|
|
Loading…
Reference in New Issue