From 62a66370cac70a62f827fedfc8b76284fde006d3 Mon Sep 17 00:00:00 2001 From: Don Penney Date: Thu, 19 Nov 2020 10:41:27 -0500 Subject: [PATCH] Add protection against duplicate RPMs If a cold reboot occurs in the middle of patch installation, the system can be left in a state where the patch-agent is unable to perform its operations properly. The RPM database can be left with duplicate RPMs due to the incomplete transaction, which can in turn lead to DNF update installation issues. This update adds detection of duplicate RPMs to the patch-agent to avoid attempting installation until the system is recovered. Additionally, protection is added to the sw-patch init to treat multiple reboot patch installations as an error, to avoid boot loops. Closes-Bug: 1904928 Change-Id: Ia06a6f669c45398d7956f2ac2caa76c447bc1b16 Signed-off-by: Don Penney --- cgcs-patch/bin/sw-patch-init.sh | 28 ++- .../cgcs-patch/cgcs_patch/patch_agent.py | 163 ++++++++++-------- .../cgcs-patch/cgcs_patch/patch_controller.py | 8 + 3 files changed, 124 insertions(+), 75 deletions(-) diff --git a/cgcs-patch/bin/sw-patch-init.sh b/cgcs-patch/bin/sw-patch-init.sh index b163cc0a..0168f74f 100644 --- a/cgcs-patch/bin/sw-patch-init.sh +++ b/cgcs-patch/bin/sw-patch-init.sh @@ -1,12 +1,12 @@ #!/bin/bash # -# Copyright (c) 2014-2019 Wind River Systems, Inc. +# Copyright (c) 2014-2020 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # # StarlingX Patching # chkconfig: 345 20 23 -# description: CGCS Patching init script +# description: StarlingX Patching init script ### BEGIN INIT INFO # Provides: sw-patch @@ -25,6 +25,7 @@ NAME=$(basename $0) logfile=/var/log/patching.log patch_failed_file=/var/run/patch_install_failed +patched_during_init=/etc/patching/.patched_during_init function LOG_TO_FILE { echo "`date "+%FT%T.%3N"`: $NAME: $*" >> $logfile @@ -32,11 +33,24 @@ function LOG_TO_FILE { function check_for_rr_patch { if [ -f /var/run/node_is_patched_rr ]; then - echo - echo "Node has been patched and requires an immediate reboot." - echo - LOG_TO_FILE "Node has been patched, with reboot-required flag set. Rebooting" - /sbin/reboot + if [ ! -f ${patched_during_init} ]; then + echo + echo "Node has been patched and requires an immediate reboot." + echo + LOG_TO_FILE "Node has been patched, with reboot-required flag set. Rebooting" + touch ${patched_during_init} + /sbin/reboot + else + echo + echo "Node has been patched during init a second consecutive time. Skipping reboot due to possible error" + echo + LOG_TO_FILE "Node has been patched during init a second consecutive time. Skipping reboot due to possible error" + touch ${patch_failed_file} + rm -f ${patched_during_init} + exit 1 + fi + else + rm -f ${patched_during_init} fi } diff --git a/cgcs-patch/cgcs-patch/cgcs_patch/patch_agent.py b/cgcs-patch/cgcs-patch/cgcs_patch/patch_agent.py index 7c64a28a..e895e00a 100644 --- a/cgcs-patch/cgcs-patch/cgcs_patch/patch_agent.py +++ b/cgcs-patch/cgcs-patch/cgcs_patch/patch_agent.py @@ -204,6 +204,7 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage): self.message['installed'] = pa.installed self.message['to_remove'] = pa.to_remove self.message['missing_pkgs'] = pa.missing_pkgs + self.message['duplicated_pkgs'] = pa.duplicated_pkgs self.message['nodetype'] = cfg.nodetype self.message['sw_version'] = SW_VERSION self.message['subfunctions'] = subfunctions @@ -340,6 +341,7 @@ class PatchAgent(PatchService): self.to_remove_dnf = [] self.missing_pkgs = [] self.missing_pkgs_dnf = [] + self.duplicated_pkgs = {} self.patch_op_counter = 0 self.node_is_patched = os.path.exists(node_is_patched_file) self.node_is_patched_timestamp = 0 @@ -384,15 +386,22 @@ class PatchAgent(PatchService): self.listener.bind(('', self.port)) self.listener.listen(2) # Allow two connections, for two controllers + @staticmethod + def pkgobj_to_version_str(pkg): + # Transform pkgobj version to format used by patch-controller + if pkg.epoch != 0: + output = "%s:%s-%s@%s" % (pkg.epoch, pkg.version, pkg.release, pkg.arch) + else: + output = "%s-%s@%s" % (pkg.version, pkg.release, pkg.arch) + + return output + @staticmethod def pkgobjs_to_list(pkgobjs): # Transform pkgobj list to format used by patch-controller output = {} for pkg in pkgobjs: - if pkg.epoch != 0: - output[pkg.name] = "%s:%s-%s@%s" % (pkg.epoch, pkg.version, pkg.release, pkg.arch) - else: - output[pkg.name] = "%s-%s@%s" % (pkg.version, pkg.release, pkg.arch) + output[pkg.name] = PatchAgent.pkgobj_to_version_str(pkg) return output @@ -481,6 +490,18 @@ class PatchAgent(PatchService): pkgs_installed = dnf.sack._rpmdb_sack(self.dnfb).query().installed() # pylint: disable=protected-access avail = self.dnfb.sack.query().available().latest() + # Check for packages with multiple installed versions + self.duplicated_pkgs = {} + for pkg in pkgs_installed: + pkglist = pkgs_installed.filter(name=pkg.name, arch=pkg.arch) + if len(pkglist) > 1: + if pkg.name not in self.duplicated_pkgs: + self.duplicated_pkgs[pkg.name] = {} + if pkg.arch not in self.duplicated_pkgs[pkg.name]: + self.duplicated_pkgs[pkg.name][pkg.arch] = map(PatchAgent.pkgobj_to_version_str, pkglist) + LOG.warn("Duplicate packages installed: %s %s", + pkg.name, ", ".join(self.duplicated_pkgs[pkg.name][pkg.arch])) + # There are three possible actions: # 1. If installed pkg is not in a repo, remove it. # 2. If installed pkg version does not match newest repo version, update it. @@ -538,6 +559,8 @@ class PatchAgent(PatchService): LOG.info("To install: %s", self.to_install) LOG.info("To remove: %s", self.to_remove) LOG.info("Missing: %s", self.missing_pkgs) + if len(self.duplicated_pkgs) > 0: + LOG.info("Duplicated: %s", self.duplicated_pkgs) return True @@ -625,78 +648,82 @@ class PatchAgent(PatchService): changed = False rc = True - if len(self.to_install_dnf) > 0 or len(self.to_downgrade_dnf) > 0: - LOG.info("Adding pkgs to installation set: %s", self.to_install) - for pkg in self.to_install_dnf: - self.dnfb.package_install(pkg) - - for pkg in self.to_downgrade_dnf: - self.dnfb.package_downgrade(pkg) - - changed = True - - if len(self.missing_pkgs_dnf) > 0: - LOG.info("Adding missing pkgs to installation set: %s", self.missing_pkgs) - for pkg in self.missing_pkgs_dnf: - self.dnfb.package_install(pkg) - changed = True - - if len(self.to_remove_dnf) > 0: - LOG.info("Adding pkgs to be removed: %s", self.to_remove) - for pkg in self.to_remove_dnf: - self.dnfb.package_remove(pkg) - changed = True - - if changed: - # Run the transaction set - transaction_rc = False - try: - transaction_rc = self.resolve_dnf_transaction() - except dnf.exceptions.DepsolveError: - LOG.exception("Failures resolving dependencies in transaction") - except dnf.exceptions.DownloadError: - LOG.exception("Failures downloading in transaction") - except dnf.exceptions.Error: - LOG.exception("Failure resolving transaction") - - if not transaction_rc: - LOG.error("Failures occurred during transaction") - rc = False - if verbose_to_stdout: - print("WARNING: Software update failed.") - + if len(self.duplicated_pkgs) > 0: + LOG.error("Duplicate installed packages found. Manual recovery is required.") + rc = False else: - if verbose_to_stdout: - print("Nothing to install.") - LOG.info("Nothing to install") + if len(self.to_install_dnf) > 0 or len(self.to_downgrade_dnf) > 0: + LOG.info("Adding pkgs to installation set: %s", self.to_install) + for pkg in self.to_install_dnf: + self.dnfb.package_install(pkg) - if changed and rc: - # Update the node_is_patched flag - setflag(node_is_patched_file) + for pkg in self.to_downgrade_dnf: + self.dnfb.package_downgrade(pkg) - self.node_is_patched = True - if verbose_to_stdout: - print("This node has been patched.") + changed = True - if os.path.exists(node_is_patched_rr_file): - LOG.info("Reboot is required. Skipping patch-scripts") - elif disallow_insvc_patch: - LOG.info("Disallowing patch-scripts. Treating as reboot-required") - setflag(node_is_patched_rr_file) - else: - LOG.info("Running in-service patch-scripts") + if len(self.missing_pkgs_dnf) > 0: + LOG.info("Adding missing pkgs to installation set: %s", self.missing_pkgs) + for pkg in self.missing_pkgs_dnf: + self.dnfb.package_install(pkg) + changed = True + if len(self.to_remove_dnf) > 0: + LOG.info("Adding pkgs to be removed: %s", self.to_remove) + for pkg in self.to_remove_dnf: + self.dnfb.package_remove(pkg) + changed = True + + if changed: + # Run the transaction set + transaction_rc = False try: - subprocess.check_output(run_insvc_patch_scripts_cmd, stderr=subprocess.STDOUT) + transaction_rc = self.resolve_dnf_transaction() + except dnf.exceptions.DepsolveError: + LOG.exception("Failures resolving dependencies in transaction") + except dnf.exceptions.DownloadError: + LOG.exception("Failures downloading in transaction") + except dnf.exceptions.Error: + LOG.exception("Failure resolving transaction") - # Clear the node_is_patched flag, since we've handled it in-service - clearflag(node_is_patched_file) - self.node_is_patched = False - except subprocess.CalledProcessError as e: - LOG.exception("In-Service patch scripts failed") - LOG.error("Command output: %s", e.output) - # Fail the patching operation + if not transaction_rc: + LOG.error("Failures occurred during transaction") rc = False + if verbose_to_stdout: + print("WARNING: Software update failed.") + + else: + if verbose_to_stdout: + print("Nothing to install.") + LOG.info("Nothing to install") + + if changed and rc: + # Update the node_is_patched flag + setflag(node_is_patched_file) + + self.node_is_patched = True + if verbose_to_stdout: + print("This node has been patched.") + + if os.path.exists(node_is_patched_rr_file): + LOG.info("Reboot is required. Skipping patch-scripts") + elif disallow_insvc_patch: + LOG.info("Disallowing patch-scripts. Treating as reboot-required") + setflag(node_is_patched_rr_file) + else: + LOG.info("Running in-service patch-scripts") + + try: + subprocess.check_output(run_insvc_patch_scripts_cmd, stderr=subprocess.STDOUT) + + # Clear the node_is_patched flag, since we've handled it in-service + clearflag(node_is_patched_file) + self.node_is_patched = False + except subprocess.CalledProcessError as e: + LOG.exception("In-Service patch scripts failed") + LOG.error("Command output: %s", e.output) + # Fail the patching operation + rc = False # Clear the in-service patch dirs if os.path.exists(insvc_patch_scripts): diff --git a/cgcs-patch/cgcs-patch/cgcs_patch/patch_controller.py b/cgcs-patch/cgcs-patch/cgcs_patch/patch_controller.py index ad233596..c84bcdfa 100644 --- a/cgcs-patch/cgcs-patch/cgcs_patch/patch_controller.py +++ b/cgcs-patch/cgcs-patch/cgcs_patch/patch_controller.py @@ -115,6 +115,7 @@ class AgentNeighbour(object): self.installed = {} self.to_remove = [] self.missing_pkgs = [] + self.duplicated_pkgs = {} self.nodetype = None self.sw_version = "unknown" self.subfunctions = [] @@ -156,6 +157,7 @@ class AgentNeighbour(object): installed, to_remove, missing_pkgs, + duplicated_pkgs, nodetype, sw_version, subfunctions, @@ -163,6 +165,7 @@ class AgentNeighbour(object): self.installed = installed self.to_remove = to_remove self.missing_pkgs = missing_pkgs + self.duplicated_pkgs = duplicated_pkgs self.nodetype = nodetype self.stale = False self.pending_query = False @@ -186,6 +189,7 @@ class AgentNeighbour(object): "installed": self.installed, "to_remove": self.to_remove, "missing_pkgs": self.missing_pkgs, + "duplicated_pkgs": self.duplicated_pkgs, "nodetype": self.nodetype, "subfunctions": self.subfunctions, "sw_version": self.sw_version, @@ -421,6 +425,7 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage): self.to_install = {} self.to_remove = [] self.missing_pkgs = [] + self.duplicated_pkgs = {} self.subfunctions = [] self.nodetype = "unknown" self.agent_sw_version = "unknown" @@ -434,6 +439,8 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage): self.to_remove = data['to_remove'] if 'missing_pkgs' in data: self.missing_pkgs = data['missing_pkgs'] + if 'duplicated_pkgs' in data: + self.duplicated_pkgs = data['duplicated_pkgs'] if 'nodetype' in data: self.nodetype = data['nodetype'] if 'sw_version' in data: @@ -455,6 +462,7 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage): pc.hosts[ip].handle_query_detailed_resp(self.installed, self.to_remove, self.missing_pkgs, + self.duplicated_pkgs, self.nodetype, self.agent_sw_version, self.subfunctions,