From bb68f0eeb555a65f2329e57f0f96856d5d376025 Mon Sep 17 00:00:00 2001
From: Joshua Reed
Date: Mon, 11 Sep 2023 12:17:40 -0700
Subject: [PATCH] Enhance 65-k8s-app-upgrade.sh script robustness.

Before this change, during an upgrade, this script would run and
upgrade/apply any applications present in the app folder to the
appropriate version. However, the script would only try once with each
app. If, for whatever reason, the app failed to upload, apply, etc.,
the script would exit with a non-zero return code and the upgrade
activate step would fail.

This change makes the script retry the command when an upload, apply or
removal fails. Likewise, if the script is interrupted, is then run for a
second (third, etc.) time and finds an application in a failed state, it
retries the appropriate command.

Test Plan:
PASS - AIO-SX Full Install.
PASS - Start Script With App already in apply-failed state.
  1. Copy 65-k8s-app-upgrade.sh into the home folder and modify
     permissions.
  2. Re-apply the platform-integ-app with
     system application-apply platform-integ-app.
  3. Before the app can apply, restart the sysinv-conductor:
     sudo sm-restart service sysinv-conductor.
     This will force the app into an apply-failed state.
  4. Run: 65-k8s-app-upgrade.sh 21.12 22.12 activate.
  5. Observe in the logs that the script detects that the app is in a
     failed state, retries to apply it, and detects that the app
     finished applying and is in an applied state.
     Use: tail -f /var/log/platform.log.
PASS - App ends up in apply-failed state while script is already
       running. The script will wait on an app in applying state; if it
       detects that the result is apply-failed then it should retry the
       application-apply command. Verify that this behavior happens
       correctly.
Closes-Bug: 2035402
Change-Id: I5dd40127f44a0074be0303469f0c2021da128c36
Signed-off-by: Joshua Reed
---
 .../upgrade-scripts/65-k8s-app-upgrade.sh     | 88 ++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/controllerconfig/controllerconfig/upgrade-scripts/65-k8s-app-upgrade.sh b/controllerconfig/controllerconfig/upgrade-scripts/65-k8s-app-upgrade.sh
index 2360b42881..9612b0b351 100644
--- a/controllerconfig/controllerconfig/upgrade-scripts/65-k8s-app-upgrade.sh
+++ b/controllerconfig/controllerconfig/upgrade-scripts/65-k8s-app-upgrade.sh
@@ -22,6 +22,13 @@ FROM_RELEASE=$1
 TO_RELEASE=$2
 ACTION=$3

+if (( $# != 3 )); then
+    >&2 echo "Error: Missing Arguments!"
+    >&2 echo "Usage: 65-k8s-app-upgrade.sh FROM_RELEASE TO_RELEASE ACTION"
+    >&2 echo "Exiting for manual intervention..."
+    exit 1
+fi
+
 PLATFORM_APPLICATION_PATH='/usr/local/share/applications/helm'
 UPGRADE_IN_PROGRESS_APPS_FILE='/etc/platform/.upgrade_in_progress_apps'

@@ -33,6 +40,8 @@ UPLOAD_RESULT_SLEEP=10
 UPLOAD_RESULT_ATTEMPTS=24  # ~4 min to upload app
 UPDATE_RESULT_SLEEP=30
 UPDATE_RESULT_ATTEMPTS=30  # ~15 min to update app
+COMMAND_RETRY_SLEEP=30
+COMMAND_RETRY_ATTEMPTS=10  # ~5 min to wait on a retried command.

 # This will log to /var/log/platform.log
 function log {
@@ -62,6 +71,66 @@ function verify_apps_are_not_recovering {
     return 0
 }

+function retry_command {
+    # Re-issue a "system" application command, then poll the application
+    # status until the command is seen to have succeeded or failed.
+    #
+    # Arguments:
+    #   $1 - system sub-command to run, e.g. application-apply
+    #   $2 - name of the application the sub-command acts on
+    # Globals read: NAME, COMMAND_RETRY_SLEEP, COMMAND_RETRY_ATTEMPTS
+    # Returns 0 once the status is uploaded, applied or removed. Exits the
+    # script with 1 on bad usage, on a *-failed status, or when the status
+    # has not settled after COMMAND_RETRY_ATTEMPTS checks.
+
+    # Check the argument count before reading the positional parameters.
+    if (( $# != 2 )); then
+        >&2 echo "Error: Missing Arguments!"
+        >&2 echo "Usage: retry_command COMMAND APPLICATION_NAME"
+        >&2 echo "Exiting for manual intervention..."
+        exit 1
+    fi
+
+    # Function-local, so a retry cannot overwrite same-named globals.
+    local command=$1
+    local app_name=$2
+    local app_status
+    local tries
+
+    log "$NAME: Retrying command: ${command}"
+
+    # The exit code is not checked here; the status polled below decides
+    # whether the retry worked.
+    system "${command}" "${app_name}"
+
+    # Do an initial sleep before first status check attempt
+    sleep "${COMMAND_RETRY_SLEEP}"
+
+    for (( tries = 1; tries <= COMMAND_RETRY_ATTEMPTS; tries++ )); do
+        app_status=$(system application-show "${app_name}" --column status --format value)
+
+        if [[ "${app_status}" =~ ^(uploaded|applied|removed)$ ]]; then
+            # The command succeeded, stop polling.
+            log "$NAME: ${app_name} status is: ${app_status}. Done!"
+            break
+        elif [[ "${app_status}" =~ ^(upload-failed|apply-failed|remove-failed)$ ]]; then
+            # The command was retried, but resulted in another failure. Nothing
+            # more to be done, so exit.
+            log "$NAME: ${app_name} status is: ${app_status}. The retry has failed. Exiting for manual intervention..."
+            exit 1
+        elif (( tries == COMMAND_RETRY_ATTEMPTS )); then
+            # Last check and the status is still in transition: give up.
+            log "$NAME: Exceeded maximum application ${command} time of $(date -u -d @"$((COMMAND_RETRY_ATTEMPTS*COMMAND_RETRY_SLEEP))" +"%Mm%Ss"). Execute upgrade-activate again when all applications are uploaded or applied."
+            exit 1
+        fi
+        log "$NAME: ${app_name} status is: ${app_status}. Will check again in ${COMMAND_RETRY_SLEEP} seconds."
+        sleep "${COMMAND_RETRY_SLEEP}"
+    done
+
+    log "$NAME: Retrying command: ${command} - Succeeded!"
+    return 0
+}
+
 log "$NAME: Starting Kubernetes application updates from release $FROM_RELEASE to $TO_RELEASE with action $ACTION"

 if [ "$ACTION" == "activate" ]; then
@@ -169,8 +238,25 @@ if [ "$ACTION" == "activate" ]; then
                 system application-update $fqpn_app
                 ;;

+            upload-failed)
+                # NOTE(review): application-upload is normally given a tarball path
+                # such as $fqpn_app - confirm it also accepts the application name.
+                log "$NAME: ${EXISTING_APP_NAME}, version ${EXISTING_APP_VERSION}, upload failed: ${EXISTING_APP_STATUS}. Retrying command..."
+                retry_command "application-upload" "${EXISTING_APP_NAME}"
+                ;;
+
+            apply-failed)
+                log "$NAME: ${EXISTING_APP_NAME}, version ${EXISTING_APP_VERSION}, apply failed: ${EXISTING_APP_STATUS}. Retrying command..."
+                retry_command "application-apply" "${EXISTING_APP_NAME}"
+                ;;
+
+            remove-failed)
+                log "$NAME: ${EXISTING_APP_NAME}, version ${EXISTING_APP_VERSION}, remove failed: ${EXISTING_APP_STATUS}. Retrying command..."
+                retry_command "application-remove" "${EXISTING_APP_NAME}"
+                ;;
+
             # States that are unexpected
-            uploading | upload-failed | applying | apply-failed | removing | remove-failed | restore-requested | updating | recovering )
+            uploading | applying | removing | restore-requested | updating | recovering)
                 log "$NAME: ${EXISTING_APP_NAME}, version ${EXISTING_APP_VERSION}, is in an unexpected state: ${EXISTING_APP_STATUS}. Exiting for manual intervention..."
                 exit 1
                 ;;