From 8a797c9864dffcc4fa3530588841122d48d71b03 Mon Sep 17 00:00:00 2001 From: Tee Ngo Date: Mon, 29 Jul 2019 14:15:53 -0400 Subject: [PATCH] Raise/clear alarms based on Kubernetes app status This is the final commit for LP1833323. A major alarm will be raised when an application upload, apply/update or remove fails. A warning alarm will be raised when an application apply/update is in progress to discourage host maintenance (e.g. unlock) as this could be disruptive to the operation. The alarm is cleared once the operation is successfully completed. During an application update, if a failure occurs, automatic recovery will kick in to revert back to the original app version. If this is successful, the alarm that was previously raised at the point of update failure will be cleared. In the event sysinv conductor is unexpectedly restarted (e.g. controller swact, failed audit) while an application operation is in progress, the application status will be reset and an application alarm raised/cleared accordingly. This commit also includes the fix for a minor bug introduced in commit 3f7bbbd94c71c7b8c615704d7edd8ecd62c2f1c4 to add application-abort command which has no user impact but would result in a misleading log. Tests: - Verify a warning alarm is generated at the beginning of an application apply/update and is cleared at the end. - Induce an upload failure, verify a warning alarm is generated. - Abort during an application-apply, verify a major alarm is generated. Retry the apply, verify the alarm is cleared at the end of the apply. - Abort during an application-update, verify a major alarm is generated at the point of failure. Verify that the alarm is cleared once automatic recovery has completed. - Abort during an application-remove, verify a major alarm is generated. Retry the remove, verify the alarm is cleared at the end of the apply. Closes-Bug: 1833323 Depends-On: I992ab7a788cfab8d52d2e6a498519c591148f588 Change-Id: I9bd2cabe318b75b88768ea493992ffd37fb777b0 Signed-off-by: Tee Ngo --- .../sysinv/sysinv/conductor/kube_app.py | 199 ++++++++++++++---- 1 file changed, 156 insertions(+), 43 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py index 0a56653347..feaef90311 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py @@ -27,6 +27,8 @@ from eventlet import greenpool from eventlet import greenthread from eventlet import queue from eventlet import Timeout +from fm_api import constants as fm_constants +from fm_api import fm_api from oslo_config import cfg from oslo_log import log as logging from sysinv.api.controllers.v1 import kube_app @@ -140,6 +142,7 @@ class AppOperator(object): def __init__(self, dbapi): self._dbapi = dbapi + self._fm_api = fm_api.FaultAPIs() self._docker = DockerHelper(self._dbapi) self._helm = helm.HelmOperator(self._dbapi) self._kube = kubernetes.KubeOperator(self._dbapi) @@ -167,46 +170,75 @@ class AppOperator(object): def _clear_stuck_applications(self): apps = self._dbapi.kube_app_get_all() for app in apps: - if (app.status == constants.APP_APPLY_IN_PROGRESS or - app.status == constants.APP_UPDATE_IN_PROGRESS or - app.status == constants.APP_RECOVER_IN_PROGRESS): - - if app.status == constants.APP_APPLY_IN_PROGRESS: - op = 'application-apply' - else: - op = 'application-update' - - if app.name in constants.HELM_APPS_PLATFORM_MANAGED: - # For platform core apps, set the new status - # to 'uploaded'. The audit task will kick in with - # all its pre-requisite checks before reapplying. - new_status = constants.APP_UPLOAD_SUCCESS - else: - new_status = constants.APP_APPLY_FAILURE - elif app.status == constants.APP_REMOVE_IN_PROGRESS: - op = 'application-remove' - new_status = constants.APP_REMOVE_FAILURE - elif app.status == constants.APP_UPLOAD_IN_PROGRESS: - op = 'application-upload' - new_status = constants.APP_UPLOAD_FAILURE + if app.status in [constants.APP_UPLOAD_IN_PROGRESS, + constants.APP_APPLY_IN_PROGRESS, + constants.APP_UPDATE_IN_PROGRESS, + constants.APP_RECOVER_IN_PROGRESS, + constants.APP_REMOVE_IN_PROGRESS]: + self._abort_operation(app, app.status, reset_status=True) else: continue - LOG.info("Resetting status of app %s from '%s' to '%s' " % - (app.name, app.status, new_status)) - error_msg = "Unexpected process termination while " + op +\ - " was in progress. The application status " +\ - "has changed from \'" + app.status +\ - "\' to \'" + new_status + "\'." - values = {'progress': error_msg, 'status': new_status} - self._dbapi.kube_app_update(app.id, values) - # Delete the Armada locks that might have been acquired previously # for a fresh start. This guarantees that a re-apply, re-update or # a re-remove attempt following a status reset will not fail due # to a lock related issue. self._clear_armada_locks() + def _raise_app_alarm(self, app_name, app_action, alarm_id, severity, + reason_text, alarm_type, repair_action, + service_affecting): + + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_APPLICATION, + app_name) + app_alarms = self._fm_api.get_faults(entity_instance_id) + if app_alarms: + if ((app_action == constants.APP_APPLY_FAILURE and + app_alarms[0].alarm_id == + fm_constants.FM_ALARM_ID_APPLICATION_APPLY_FAILED) or + (app_action == constants.APP_UPLOAD_FAILURE and + app_alarms[0].alarm_id == + fm_constants.FM_ALARM_ID_APPLICATION_UPLOAD_FAILED) or + (app_action == constants.APP_REMOVE_FAILURE and + app_alarms[0].alarm_id == + fm_constants.FM_ALARM_ID_APPLICATION_REMOVE_FAILED) or + (app_action == constants.APP_APPLY_IN_PROGRESS and + app_alarms[0].alarm_id == + fm_constants.FM_ALARM_ID_APPLICATION_APPLYING) or + (app_action == constants.APP_UPDATE_IN_PROGRESS and + app_alarms[0].alarm_id == + fm_constants.FM_ALARM_ID_APPLICATION_UPDATING)): + # The same alarm was raised before, will re-raise to set the + # latest timestamp. + pass + else: + # Clear existing alarm for this app if it differs than the one to + # be raised. + self._fm_api.clear_fault(app_alarms[0].alarm_id, + app_alarms[0].entity_instance_id) + fault = fm_api.Fault( + alarm_id=alarm_id, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_APPLICATION, + entity_instance_id=entity_instance_id, + severity=severity, + reason_text=reason_text, + alarm_type=alarm_type, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN, + proposed_repair_action=repair_action, + service_affecting=service_affecting) + + self._fm_api.set_fault(fault) + + def _clear_app_alarm(self, app_name): + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_APPLICATION, + app_name) + app_alarms = self._fm_api.get_faults(entity_instance_id) + if app_alarms: + # There can only exist one alarm per app + self._fm_api.clear_fault(app_alarms[0].alarm_id, + app_alarms[0].entity_instance_id) + def _register_app_abort(self, app_name): with self._lock: AppOperator.abort_requested[app_name] = False @@ -267,24 +299,81 @@ class AppOperator(object): def _abort_operation(self, app, operation, progress=constants.APP_PROGRESS_ABORTED, - user_initiated=False): + user_initiated=False, reset_status=False): if user_initiated: progress = constants.APP_PROGRESS_ABORTED_BY_USER if app.status == constants.APP_UPLOAD_IN_PROGRESS: - self._update_app_status(app, constants.APP_UPLOAD_FAILURE, - progress) + new_status = constants.APP_UPLOAD_FAILURE + op = 'application-upload' + self._raise_app_alarm( + app.name, constants.APP_UPLOAD_FAILURE, + fm_constants.FM_ALARM_ID_APPLICATION_UPLOAD_FAILED, + fm_constants.FM_ALARM_SEVERITY_WARNING, + _("Application Upload Failure"), + fm_constants.FM_ALARM_TYPE_3, + _("Check system inventory log for cause."), + False) elif (app.status == constants.APP_APPLY_IN_PROGRESS or - app.status == constants.APP_UPDATE_IN_PROGRESS): - self._update_app_status(app, constants.APP_APPLY_FAILURE, - progress) + app.status == constants.APP_UPDATE_IN_PROGRESS or + app.status == constants.APP_RECOVER_IN_PROGRESS): + new_status = constants.APP_APPLY_FAILURE + if reset_status: + if app.status == constants.APP_APPLY_IN_PROGRESS: + op = 'application-apply' + else: + op = 'application-update' + + if app.name in constants.HELM_APPS_PLATFORM_MANAGED: + # For platform core apps, set the new status + # to 'uploaded'. The audit task will kick in with + # all its pre-requisite checks before reapplying. + new_status = constants.APP_UPLOAD_SUCCESS + self._clear_app_alarm(app.name) + + if (not reset_status or + app.name not in constants.HELM_APPS_PLATFORM_MANAGED): + self._raise_app_alarm( + app.name, constants.APP_APPLY_FAILURE, + fm_constants.FM_ALARM_ID_APPLICATION_APPLY_FAILED, + fm_constants.FM_ALARM_SEVERITY_MAJOR, + _("Application Apply Failure"), + fm_constants.FM_ALARM_TYPE_3, + _("Retry applying the application. If the issue persists, " + "please check system inventory log for cause."), + True) elif app.status == constants.APP_REMOVE_IN_PROGRESS: - self._update_app_status(app, constants.APP_REMOVE_FAILURE, - progress) - if not user_initiated: - LOG.error("Application %s aborted!." % operation) + new_status = constants.APP_REMOVE_FAILURE + op = 'application-remove' + self._raise_app_alarm( + app.name, constants.APP_REMOVE_FAILURE, + fm_constants.FM_ALARM_ID_APPLICATION_REMOVE_FAILED, + fm_constants.FM_ALARM_SEVERITY_MAJOR, + _("Application Remove Failure"), + fm_constants.FM_ALARM_TYPE_3, + _("Retry removing the application. If the issue persists, " + "please check system inventory log for cause."), + True) else: - LOG.info("Application %s aborted by user!." % operation) + # Should not get here, perhaps a new status was introduced? + LOG.error("No abort handling code for app status = '%s'!" % app.status) + return + + if not reset_status: + self._update_app_status(app, new_status, progress) + if not user_initiated: + LOG.error("Application %s aborted!." % operation) + else: + LOG.info("Application %s aborted by user!." % operation) + else: + LOG.info("Resetting status of app %s from '%s' to '%s' " % + (app.name, app.status, new_status)) + error_msg = "Unexpected process termination while " + op +\ + " was in progress. The application status " +\ + "has changed from \'" + app.status +\ + "\' to \'" + new_status + "\'." + values = {'progress': error_msg, 'status': new_status} + self._dbapi.kube_app_update(app.id, values) def _download_tarfile(self, app): from six.moves.urllib.request import urlopen @@ -647,7 +736,7 @@ class AppOperator(object): elapsed = time.time() - start failed_count = len(failed_downloads) if failed_count > 0: - if not AppOperator.is_app_aborted: + if not AppOperator.is_app_aborted(app.name): reason = "failed to download one or more image(s)." else: reason = "operation aborted by user." @@ -1413,6 +1502,8 @@ class AppOperator(object): constants.APP_PROGRESS_UPDATE_ABORTED.format(old_app.version, new_app.version) + constants.APP_PROGRESS_RECOVER_COMPLETED.format(old_app.version) + 'Please check logs for details.') + # Recovery from an app update failure succeeded, clear app alarm + self._clear_app_alarm(old_app.name) LOG.info("Application %s recover to version %s completed." % (old_app.name, old_app.version)) else: @@ -1619,6 +1710,13 @@ class AppOperator(object): # already been registered. if not caller: self._register_app_abort(app.name) + self._raise_app_alarm(app.name, constants.APP_APPLY_IN_PROGRESS, + fm_constants.FM_ALARM_ID_APPLICATION_APPLYING, + fm_constants.FM_ALARM_SEVERITY_WARNING, + _("Application Apply In Progress"), + fm_constants.FM_ALARM_TYPE_0, + _("No action required."), + True) LOG.info("Application %s (%s) apply started." % (app.name, app.version)) overrides_str = '' @@ -1723,6 +1821,8 @@ class AppOperator(object): constants.APP_APPLY_SUCCESS, constants.APP_PROGRESS_COMPLETED) app.update_active(True) + if not caller: + self._clear_app_alarm(app.name) LOG.info("Application %s (%s) apply completed." % (app.name, app.version)) return True except Exception as e: @@ -1776,6 +1876,13 @@ class AppOperator(object): to_rpc_app.get('name') in self._helm.get_helm_applications()) self._register_app_abort(to_app.name) + self._raise_app_alarm(to_app.name, constants.APP_UPDATE_IN_PROGRESS, + fm_constants.FM_ALARM_ID_APPLICATION_UPDATING, + fm_constants.FM_ALARM_SEVERITY_WARNING, + _("Application Update In Progress"), + fm_constants.FM_ALARM_TYPE_0, + _("No action required."), + True) LOG.info("Start updating Application %s from version %s to version %s ..." % (to_app.name, from_app.version, to_app.version)) @@ -1846,6 +1953,7 @@ class AppOperator(object): finally: self._deregister_app_abort(to_app.name) + self._clear_app_alarm(to_app.name) return True def perform_app_remove(self, rpc_app): @@ -1897,6 +2005,8 @@ class AppOperator(object): self._update_app_status(app, constants.APP_UPLOAD_SUCCESS, constants.APP_PROGRESS_COMPLETED) + # In case there is an existing alarm for previous remove failure + self._clear_app_alarm(app.name) LOG.info("Application (%s) remove completed." % app.name) else: if AppOperator.is_app_aborted(app.name): @@ -1987,6 +2097,9 @@ class AppOperator(object): self._dbapi.kube_app_destroy(app.name) self._cleanup(app) self._utils._patch_report_app_dependencies(app.name + '-' + app.version) + # One last check of app alarm, should be no-op unless the + # user deletes the application following an upload failure. + self._clear_app_alarm(app.name) LOG.info("Application (%s) has been purged from the system." % app.name) msg = None