From e0679eb6c0c8deb24f91bb8e4062805c74765135 Mon Sep 17 00:00:00 2001
From: David Barbosa Bastos
Date: Mon, 7 Aug 2023 09:44:33 -0300
Subject: [PATCH] Remove Armada if there are no Armada apps

Add a script that removes Armada if there are no Armada apps
running. It runs during the upgrade-activate step, checking
whether there are remaining Armada apps or helmv2 releases.

Test Plan:
PASS Build pkgs and build image
PASS Upgrade SX stx-8 -> master
PASS Armada removed when there are no Armada apps

Story: 2010560
Task: 48563

Co-Authored-By: Igor Pires Soares
Change-Id: Id336f02757573995028cb183458777b1d1b5991e
Signed-off-by: David Barbosa Bastos
---
 .../76-remove-armada-if-unused.py         | 381 ++++++++++++++++++
 sysinv/sysinv/sysinv/sysinv/helm/utils.py |   6 +-
 2 files changed, 386 insertions(+), 1 deletion(-)
 create mode 100644 controllerconfig/controllerconfig/upgrade-scripts/76-remove-armada-if-unused.py

diff --git a/controllerconfig/controllerconfig/upgrade-scripts/76-remove-armada-if-unused.py b/controllerconfig/controllerconfig/upgrade-scripts/76-remove-armada-if-unused.py
new file mode 100644
index 0000000000..e9e1c2e0e0
--- /dev/null
+++ b/controllerconfig/controllerconfig/upgrade-scripts/76-remove-armada-if-unused.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2022 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# The purpose of this script is to check for armada apps uploaded/applied,
+# and, if none are found, remove armada, including:
+# - armada helm v3 release
+# - armada kubernetes namespace
+# - armada docker image
+
+import os
+import psutil
+import ruamel.yaml as yaml
+import sys
+import threading
+from cgtsclient import client as cgts_client
+from cgtsclient import exc as client_exception
+from eventlet.green import subprocess
+from time import sleep
+from controllerconfig.common import log
+from sysinv.common import exception
+from sysinv.common import utils as common_utils
+from sysinv.helm import utils as helm_utils
+from sysinv.common.kubernetes import KUBERNETES_ADMIN_CONF
+from tsconfig.tsconfig import PLATFORM_PATH
+
+LOG = log.get_logger(__name__)
+log.configure()
+
+ARMADA_MANIFEST_DIRECTORY = os.path.join(PLATFORM_PATH, 'armada')
+ARMADA_NS = 'armada'
+ARMADA_RELEASE_NAME = 'armada'
+HELMV2_PATH = '/usr/local/sbin/helmv2-cli'
+ERROR_CODE_TIMEOUT_HELMV2_CLI = -9
+TIMEOUT = 180  # timeout in seconds for armada pods to terminate
+TIME_STEP = 15  # wait X seconds between checks
+
+
+class CgtsClient(object):
+    SYSINV_API_VERSION = 1
+
+    def __init__(self):
+        self.conf = {}
+        self._sysinv_client = None
+        source_command = "source /etc/platform/openrc && env"
+        with open(os.devnull, "w") as fnull:
+            proc = subprocess.Popen(
+                ["bash", "-c", source_command],
+                stdout=subprocess.PIPE,
+                stderr=fnull,
+                universal_newlines=True,
+            )
+        for line in proc.stdout:
+            key, _, value = line.partition("=")
+            if key == "OS_USERNAME":
+                self.conf["admin_user"] = value.strip()
+            elif key == "OS_PASSWORD":
+                self.conf["admin_pwd"] = value.strip()
+            elif key == "OS_PROJECT_NAME":
+                self.conf["admin_tenant"] = value.strip()
+            elif key == "OS_AUTH_URL":
+                self.conf["auth_url"] = value.strip()
+            elif key == "OS_REGION_NAME":
+                self.conf["region_name"] = value.strip()
+            elif key == "OS_USER_DOMAIN_NAME":
+                self.conf["user_domain"] = value.strip()
+            elif key == "OS_PROJECT_DOMAIN_NAME":
+                self.conf["project_domain"] = value.strip()
+        proc.communicate()
+
+    @property
+    def sysinv(self):
+        if not self._sysinv_client:
+            self._sysinv_client = cgts_client.get_client(
+                self.SYSINV_API_VERSION,
+                os_username=self.conf["admin_user"],
+                os_password=self.conf["admin_pwd"],
+                os_auth_url=self.conf["auth_url"],
+                os_project_name=self.conf["admin_tenant"],
+                os_project_domain_name=self.conf["project_domain"],
+                os_user_domain_name=self.conf["user_domain"],
+                os_region_name=self.conf["region_name"],
+                os_service_type="platform",
+                os_endpoint_type="admin",
+            )
+        return self._sysinv_client
+
+
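+# Typical usage of CgtsClient within this script (see remove_docker_images
+# below): instantiate it once and access the lazily-built sysinv client, e.g.
+#   client = CgtsClient()
+#   image_list = client.sysinv.registry_image.list()
+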
+def run_cmd(cmd, interrupt_on_error=False):
+    """A wrapper for common_utils.trycmd()"""
+    out, err = common_utils.trycmd(*cmd.split())
+    if err:
+        if interrupt_on_error:
+            raise Exception(err)
+        else:
+            LOG.debug(err)
+    return out, err
+
+
+def wait_cmd_output(cmd, expected_output, timeout=TIMEOUT, step=TIME_STEP,
+                    interrupt_on_error=True):
+    """Executes cmd until output matches 'expected_output' or a timeout."""
+    LOG.debug('Wait for output of "%s" to match "%s"'
+              % (cmd, expected_output))
+    time_elapsed = 0
+    while time_elapsed < timeout:
+        output, _ = run_cmd(cmd, interrupt_on_error=interrupt_on_error)
+        if output == expected_output:
+            return time_elapsed
+        sleep(step)
+        time_elapsed += step
+    msg = 'Timeout waiting for output of cmd "%s" to match "%s"' \
+        % (cmd, expected_output)
+    raise Exception(msg)
+
+
+def kill_process_and_descendants(proc):
+    """function to kill a process and its child processes"""
+    for child in psutil.Process(proc.pid).children(recursive=True):
+        child.kill()
+    proc.kill()
+
+
+def retrieve_helm_v2_releases():
+    env = os.environ.copy()
+    env['PATH'] = '/usr/local/sbin:' + env['PATH']
+    env['KUBECONFIG'] = KUBERNETES_ADMIN_CONF
+    helm_list = subprocess.Popen(
+        ['helmv2-cli', '--',
+         'helm',
+         'list', '--output', 'yaml', '--tiller-connection-timeout', '5'],
+        env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+        universal_newlines=True)
+    timer = threading.Timer(20, kill_process_and_descendants, [helm_list])
+    try:
+        timer.start()
+        out, err = helm_list.communicate()
+        if helm_list.returncode != 0:
+            if err:
+                raise exception.HelmFailure(reason=err)
+            # killing the subprocesses with kill() when the timer expires
+            # returns EBADF because the pipe is closed, but no error
+            # string on stderr.
+            if helm_list.returncode == ERROR_CODE_TIMEOUT_HELMV2_CLI:
+                raise exception.HelmFailure(
+                    reason="helmv2-cli -- helm list operation timed out "
+                           "after 20 seconds. "
+                           "Terminated by threading timer.")
+            raise exception.HelmFailure(
+                reason="helmv2-cli -- helm list operation failed without "
+                       "error message, errno=%s" % helm_list.returncode)
+        deployed_releases = {}
+        if out:
+            output = yaml.safe_load(out)
+            releases = output.get('Releases', {})
+            for r in releases:
+                r_name = r.get('Name')
+                r_version = r.get('Revision')
+                r_namespace = r.get('Namespace')
+                deployed_releases.setdefault(r_name, {}).update(
+                    {r_namespace: r_version})
+        return deployed_releases
+    except Exception as e:
+        raise exception.HelmFailure(
+            reason="Failed to retrieve helmv2 releases: %s" % e)
+    finally:
+        timer.cancel()
+
+
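+# For reference, retrieve_helm_v2_releases() above returns a dict keyed by
+# release name, mapping each namespace to the deployed revision, roughly:
+#   {'release-name': {'namespace': revision}}
+# (the key names shown are placeholders, not real releases).
+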
+def has_helmv2_cli():
+    if os.path.isfile(HELMV2_PATH):
+        return True
+    return False
+
+
+def has_armada_pod():
+    """
+    Determine whether there are running armada pods, including a list of
+    their status conditions.
+    This jsonpath gives the following output format per pod:
+    armada-api-bc77f956d-jwl4n::Initialized=True:
+    Ready=True:ContainersReady=True:PodScheduled=True
+    """
+    JSONPATH = '{range .items[*]}{"\n"}{@.metadata.name}:'\
+               '{@.metadata.deletionTimestamp}{range @.status.conditions[*]}'\
+               '{":"}{@.type}={@.status}{end}{end}'
+
+    cmd = 'kubectl get pods -n armada --selector=application=armada,'\
+          'component=api --field-selector status.phase=Running '\
+          '--output=jsonpath="{}"'.format(JSONPATH)
+
+    output, _ = run_cmd(cmd)
+
+    if output:
+        return True
+    return False
+
+
+def is_armada_required():
+    """
+    Check for armada manifests or helm v2 releases.
+    Return True if any are found,
+    False otherwise (including if helm v2 was already removed).
+    """
+
+    # Check if there are armada apps uploaded/applied, exit if there are any
+    if os.path.exists(ARMADA_MANIFEST_DIRECTORY):
+        for sw_version in os.listdir(ARMADA_MANIFEST_DIRECTORY):
+            directory = os.path.join(ARMADA_MANIFEST_DIRECTORY, sw_version)
+            if os.listdir(directory):
+                LOG.debug("Armada apps found: " + str(os.listdir(directory)))
+                return True
+
+    # Check for releases in helm v2, exit if there are any
+    if has_helmv2_cli() and has_armada_pod():
+        try:
+            helm_v2_releases = retrieve_helm_v2_releases()
+            if helm_v2_releases:
+                LOG.debug("helm v2 releases found: %s" %
+                          (list(helm_v2_releases)))
+                return True
+        except Exception as e:
+            # Don't touch armada if the helm v2 query fails for an unknown
+            # reason. If armada was already removed, the exception message
+            # will be "helm list operation failed without error message"
+            # and it's okay to continue.
+            if 'operation failed' not in str(e):
+                raise Exception("Error listing helm v2 releases: %s" % e)
+
+    return False
+
+
+def remove_armada_resources():
+    """
+    Remove the Armada helm release and namespace.
+    Note: removing the helm release also terminates its pods and secrets.
+    """
+
+    # Remove armada helm v3 release
+    try:
+        if ARMADA_RELEASE_NAME in helm_utils.retrieve_helm_releases():
+            helm_utils.delete_helm_release(
+                ARMADA_RELEASE_NAME, namespace=ARMADA_NS)
+        else:
+            LOG.warning("Helm v3 release %s not found." % ARMADA_RELEASE_NAME)
+    except Exception as e:
+        # Couldn't remove the helm release, so don't touch anything else.
+        LOG.error("Could not remove Armada helm release: %s. Try running "
+                  "'helm uninstall -n armada armada' manually." % e)
+        return False
+
+    # Wait for the kubernetes armada namespace to have no resources,
+    # fail after a timeout
+    LOG.debug("Waiting for resources to terminate...")
+    cmd = "kubectl get all -n %s -o name --kubeconfig %s" \
+        % (ARMADA_NS, KUBERNETES_ADMIN_CONF)
+    time_elapsed = wait_cmd_output(cmd=cmd, expected_output="")
+    LOG.debug("Took about {} seconds".format(time_elapsed))
+
+    # Remove armada namespace
+    cmd = "kubectl delete namespace %s --kubeconfig %s --ignore-not-found" \
+        % (ARMADA_NS, KUBERNETES_ADMIN_CONF)
+    run_cmd(cmd)
+
+    return True
+
+
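+# While the armada namespace is draining, the "kubectl get all -n armada
+# -o name" check above keeps returning entries (for example a line such as
+# "pod/armada-api-bc77f956d-jwl4n", matching the pod naming shown in
+# has_armada_pod); removal only proceeds once the output is empty.
+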
+def remove_armada_manifest_directory():
+    cmd = "sudo rm -rf %s" % (ARMADA_MANIFEST_DIRECTORY)
+    _, stderr = run_cmd(cmd)
+    if stderr:
+        LOG.warning("Could not remove %s" % (ARMADA_MANIFEST_DIRECTORY))
+        return False
+    return True
+
+
+def remove_docker_images():
+    """
+    Loads the keystone admin environment variables and uses 'system' commands
+    to remove the armada docker image in the local registry.
+    Return True if successful, False otherwise.
+    """
+
+    def delete_images(image_name):
+        if image_name:
+            image_tags = client.sysinv.registry_image.tags(image_name)
+            if not image_tags:
+                LOG.warning("Image %s already deleted, but still appears on "
+                            "registry image list" % (image_name))
+                return True
+
+            # Delete all armada related tags
+            for image_tag in image_tags:
+                image_name_and_tag = "%s:%s" % (image_name, image_tag.tag)
+                try:
+                    client.sysinv.registry_image.delete(image_name_and_tag)
+                except client_exception.HTTPNotFound:
+                    LOG.error("Image not found: %s" % image_name_and_tag)
+                    return False
+                except Exception as e:
+                    LOG.exception("Error while removing image: %s: %s"
+                                  % (image_name_and_tag, e))
+                    return False
+
+            return True
+        else:
+            LOG.debug("Could not find image %s in docker registry."
+                      % (image_name))
+            return False
+
+    client = CgtsClient()
+    armada_image = None
+    tiller_image = None
+
+    # Get image names
+    image_list = client.sysinv.registry_image.list()
+    if not image_list:
+        LOG.warning("Failed to remove armada docker image.")
+        return False
+    for image in image_list:
+        if "airshipit/armada" in image.name:
+            armada_image = image.name
+        elif "helm/tiller" in image.name:
+            tiller_image = image.name
+
+    if not armada_image and not tiller_image:
+        LOG.debug("Could not find armada and tiller images in "
+                  "docker registry.")
+        return True
+
+    # Delete images
+    if delete_images(armada_image) and delete_images(tiller_image):
+        client.sysinv.registry_image.garbage_collect()
+    else:
+        return False
+
+    return True
+
+
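+# The upgrade framework is expected to invoke this activation script with
+# three positional arguments, matching the sys.argv handling in main(), e.g.:
+#   76-remove-armada-if-unused.py <from_release> <to_release> <action>
+# Armada removal is only attempted when <action> is 'activate' and
+# <from_release> is 22.12 or later.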
+def main():
+
+    if len(sys.argv) != 4:
+        error_msg = "Invalid arguments: %s" % (sys.argv)
+        print(error_msg)
+        LOG.error(error_msg)
+        return 1
+
+    script_name, from_release, to_release, action = sys.argv
+    LOG.info("%s invoked with from_release = %s to_release = %s action = %s"
+             % (script_name, from_release, to_release, action))
+
+    if action == 'activate' and from_release >= '22.12':
+        try:
+            if is_armada_required():
+                LOG.info("Armada is in use. Migrate existing Armada "
+                         "applications to FluxCD and run the activation "
+                         "step again.")
+                return 1
+
+            LOG.info("Armada is not in use. It will be removed.")
+
+            if not remove_armada_resources():
+                return 1
+
+            if not remove_armada_manifest_directory():
+                return 1
+
+            if not remove_docker_images():
+                return 1
+
+            LOG.info("Armada removed.")
+        except Exception as e:
+            print(e)
+            LOG.error("An error occurred while trying to remove armada:")
+            LOG.exception(e)
+            return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/sysinv/sysinv/sysinv/sysinv/helm/utils.py b/sysinv/sysinv/sysinv/sysinv/helm/utils.py
index 3b1fee5fff..cae430ab8d 100644
--- a/sysinv/sysinv/sysinv/sysinv/helm/utils.py
+++ b/sysinv/sysinv/sysinv/sysinv/helm/utils.py
@@ -147,7 +147,10 @@ def delete_helm_release(release, namespace="default", flags=None):
     try:
         timer.start()
         out, err = process.communicate()
-        if err:
+
+        if process.returncode == 0 and err:
+            LOG.warning("Command: %s; %s" % (' '.join(helm_cmd), err))
+        elif err:
             if "not found" in err:
                 LOG.error("Release %s/%s not found or deleted already" % (namespace, release))
                 return out, err
@@ -157,6 +160,7 @@ def delete_helm_release(release, namespace="default", flags=None):
             err_msg = "Failed to execute helm command. " \
                       "Helm response timeout."
             raise exception.HelmFailure(reason=err_msg)
+        return out, err
     except Exception as e:
         LOG.error("Failed to execute helm command: %s" % e)