Add protection against duplicate RPMs

If a cold reboot occurs in the middle of patch installation, the
system can be left in a state where the patch-agent is unable to
perform its operations properly. The RPM database can be left with
duplicate RPMs due to the incomplete transaction, which can in turn
lead to DNF update installation issues.

This update adds detection of duplicate RPMs to the patch-agent to
avoid attempting installation until the system is recovered.

Additionally, protection is added to the sw-patch init to treat
multiple reboot patch installations as an error, to avoid boot loops.

Closes-Bug: 1904928
Change-Id: Ia06a6f669c45398d7956f2ac2caa76c447bc1b16
Signed-off-by: Don Penney <don.penney@windriver.com>
This commit is contained in:
Don Penney 2020-11-19 10:41:27 -05:00
parent 1c8d87d404
commit 62a66370ca
3 changed files with 124 additions and 75 deletions

View File

@ -1,12 +1,12 @@
#!/bin/bash
#
# Copyright (c) 2014-2019 Wind River Systems, Inc.
# Copyright (c) 2014-2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# StarlingX Patching
# chkconfig: 345 20 23
# description: CGCS Patching init script
# description: StarlingX Patching init script
### BEGIN INIT INFO
# Provides: sw-patch
@ -25,6 +25,7 @@ NAME=$(basename $0)
logfile=/var/log/patching.log
patch_failed_file=/var/run/patch_install_failed
patched_during_init=/etc/patching/.patched_during_init
function LOG_TO_FILE {
echo "`date "+%FT%T.%3N"`: $NAME: $*" >> $logfile
@ -32,11 +33,24 @@ function LOG_TO_FILE {
function check_for_rr_patch {
if [ -f /var/run/node_is_patched_rr ]; then
echo
echo "Node has been patched and requires an immediate reboot."
echo
LOG_TO_FILE "Node has been patched, with reboot-required flag set. Rebooting"
/sbin/reboot
if [ ! -f ${patched_during_init} ]; then
echo
echo "Node has been patched and requires an immediate reboot."
echo
LOG_TO_FILE "Node has been patched, with reboot-required flag set. Rebooting"
touch ${patched_during_init}
/sbin/reboot
else
echo
echo "Node has been patched during init a second consecutive time. Skipping reboot due to possible error"
echo
LOG_TO_FILE "Node has been patched during init a second consecutive time. Skipping reboot due to possible error"
touch ${patch_failed_file}
rm -f ${patched_during_init}
exit 1
fi
else
rm -f ${patched_during_init}
fi
}

View File

@ -204,6 +204,7 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage):
self.message['installed'] = pa.installed
self.message['to_remove'] = pa.to_remove
self.message['missing_pkgs'] = pa.missing_pkgs
self.message['duplicated_pkgs'] = pa.duplicated_pkgs
self.message['nodetype'] = cfg.nodetype
self.message['sw_version'] = SW_VERSION
self.message['subfunctions'] = subfunctions
@ -340,6 +341,7 @@ class PatchAgent(PatchService):
self.to_remove_dnf = []
self.missing_pkgs = []
self.missing_pkgs_dnf = []
self.duplicated_pkgs = {}
self.patch_op_counter = 0
self.node_is_patched = os.path.exists(node_is_patched_file)
self.node_is_patched_timestamp = 0
@ -384,15 +386,22 @@ class PatchAgent(PatchService):
self.listener.bind(('', self.port))
self.listener.listen(2) # Allow two connections, for two controllers
@staticmethod
def pkgobj_to_version_str(pkg):
# Transform pkgobj version to format used by patch-controller
if pkg.epoch != 0:
output = "%s:%s-%s@%s" % (pkg.epoch, pkg.version, pkg.release, pkg.arch)
else:
output = "%s-%s@%s" % (pkg.version, pkg.release, pkg.arch)
return output
@staticmethod
def pkgobjs_to_list(pkgobjs):
# Transform pkgobj list to format used by patch-controller
output = {}
for pkg in pkgobjs:
if pkg.epoch != 0:
output[pkg.name] = "%s:%s-%s@%s" % (pkg.epoch, pkg.version, pkg.release, pkg.arch)
else:
output[pkg.name] = "%s-%s@%s" % (pkg.version, pkg.release, pkg.arch)
output[pkg.name] = PatchAgent.pkgobj_to_version_str(pkg)
return output
@ -481,6 +490,18 @@ class PatchAgent(PatchService):
pkgs_installed = dnf.sack._rpmdb_sack(self.dnfb).query().installed() # pylint: disable=protected-access
avail = self.dnfb.sack.query().available().latest()
# Check for packages with multiple installed versions
self.duplicated_pkgs = {}
for pkg in pkgs_installed:
pkglist = pkgs_installed.filter(name=pkg.name, arch=pkg.arch)
if len(pkglist) > 1:
if pkg.name not in self.duplicated_pkgs:
self.duplicated_pkgs[pkg.name] = {}
if pkg.arch not in self.duplicated_pkgs[pkg.name]:
self.duplicated_pkgs[pkg.name][pkg.arch] = map(PatchAgent.pkgobj_to_version_str, pkglist)
LOG.warn("Duplicate packages installed: %s %s",
pkg.name, ", ".join(self.duplicated_pkgs[pkg.name][pkg.arch]))
# There are three possible actions:
# 1. If installed pkg is not in a repo, remove it.
# 2. If installed pkg version does not match newest repo version, update it.
@ -538,6 +559,8 @@ class PatchAgent(PatchService):
LOG.info("To install: %s", self.to_install)
LOG.info("To remove: %s", self.to_remove)
LOG.info("Missing: %s", self.missing_pkgs)
if len(self.duplicated_pkgs) > 0:
LOG.info("Duplicated: %s", self.duplicated_pkgs)
return True
@ -625,78 +648,82 @@ class PatchAgent(PatchService):
changed = False
rc = True
if len(self.to_install_dnf) > 0 or len(self.to_downgrade_dnf) > 0:
LOG.info("Adding pkgs to installation set: %s", self.to_install)
for pkg in self.to_install_dnf:
self.dnfb.package_install(pkg)
for pkg in self.to_downgrade_dnf:
self.dnfb.package_downgrade(pkg)
changed = True
if len(self.missing_pkgs_dnf) > 0:
LOG.info("Adding missing pkgs to installation set: %s", self.missing_pkgs)
for pkg in self.missing_pkgs_dnf:
self.dnfb.package_install(pkg)
changed = True
if len(self.to_remove_dnf) > 0:
LOG.info("Adding pkgs to be removed: %s", self.to_remove)
for pkg in self.to_remove_dnf:
self.dnfb.package_remove(pkg)
changed = True
if changed:
# Run the transaction set
transaction_rc = False
try:
transaction_rc = self.resolve_dnf_transaction()
except dnf.exceptions.DepsolveError:
LOG.exception("Failures resolving dependencies in transaction")
except dnf.exceptions.DownloadError:
LOG.exception("Failures downloading in transaction")
except dnf.exceptions.Error:
LOG.exception("Failure resolving transaction")
if not transaction_rc:
LOG.error("Failures occurred during transaction")
rc = False
if verbose_to_stdout:
print("WARNING: Software update failed.")
if len(self.duplicated_pkgs) > 0:
LOG.error("Duplicate installed packages found. Manual recovery is required.")
rc = False
else:
if verbose_to_stdout:
print("Nothing to install.")
LOG.info("Nothing to install")
if len(self.to_install_dnf) > 0 or len(self.to_downgrade_dnf) > 0:
LOG.info("Adding pkgs to installation set: %s", self.to_install)
for pkg in self.to_install_dnf:
self.dnfb.package_install(pkg)
if changed and rc:
# Update the node_is_patched flag
setflag(node_is_patched_file)
for pkg in self.to_downgrade_dnf:
self.dnfb.package_downgrade(pkg)
self.node_is_patched = True
if verbose_to_stdout:
print("This node has been patched.")
changed = True
if os.path.exists(node_is_patched_rr_file):
LOG.info("Reboot is required. Skipping patch-scripts")
elif disallow_insvc_patch:
LOG.info("Disallowing patch-scripts. Treating as reboot-required")
setflag(node_is_patched_rr_file)
else:
LOG.info("Running in-service patch-scripts")
if len(self.missing_pkgs_dnf) > 0:
LOG.info("Adding missing pkgs to installation set: %s", self.missing_pkgs)
for pkg in self.missing_pkgs_dnf:
self.dnfb.package_install(pkg)
changed = True
if len(self.to_remove_dnf) > 0:
LOG.info("Adding pkgs to be removed: %s", self.to_remove)
for pkg in self.to_remove_dnf:
self.dnfb.package_remove(pkg)
changed = True
if changed:
# Run the transaction set
transaction_rc = False
try:
subprocess.check_output(run_insvc_patch_scripts_cmd, stderr=subprocess.STDOUT)
transaction_rc = self.resolve_dnf_transaction()
except dnf.exceptions.DepsolveError:
LOG.exception("Failures resolving dependencies in transaction")
except dnf.exceptions.DownloadError:
LOG.exception("Failures downloading in transaction")
except dnf.exceptions.Error:
LOG.exception("Failure resolving transaction")
# Clear the node_is_patched flag, since we've handled it in-service
clearflag(node_is_patched_file)
self.node_is_patched = False
except subprocess.CalledProcessError as e:
LOG.exception("In-Service patch scripts failed")
LOG.error("Command output: %s", e.output)
# Fail the patching operation
if not transaction_rc:
LOG.error("Failures occurred during transaction")
rc = False
if verbose_to_stdout:
print("WARNING: Software update failed.")
else:
if verbose_to_stdout:
print("Nothing to install.")
LOG.info("Nothing to install")
if changed and rc:
# Update the node_is_patched flag
setflag(node_is_patched_file)
self.node_is_patched = True
if verbose_to_stdout:
print("This node has been patched.")
if os.path.exists(node_is_patched_rr_file):
LOG.info("Reboot is required. Skipping patch-scripts")
elif disallow_insvc_patch:
LOG.info("Disallowing patch-scripts. Treating as reboot-required")
setflag(node_is_patched_rr_file)
else:
LOG.info("Running in-service patch-scripts")
try:
subprocess.check_output(run_insvc_patch_scripts_cmd, stderr=subprocess.STDOUT)
# Clear the node_is_patched flag, since we've handled it in-service
clearflag(node_is_patched_file)
self.node_is_patched = False
except subprocess.CalledProcessError as e:
LOG.exception("In-Service patch scripts failed")
LOG.error("Command output: %s", e.output)
# Fail the patching operation
rc = False
# Clear the in-service patch dirs
if os.path.exists(insvc_patch_scripts):

View File

@ -115,6 +115,7 @@ class AgentNeighbour(object):
self.installed = {}
self.to_remove = []
self.missing_pkgs = []
self.duplicated_pkgs = {}
self.nodetype = None
self.sw_version = "unknown"
self.subfunctions = []
@ -156,6 +157,7 @@ class AgentNeighbour(object):
installed,
to_remove,
missing_pkgs,
duplicated_pkgs,
nodetype,
sw_version,
subfunctions,
@ -163,6 +165,7 @@ class AgentNeighbour(object):
self.installed = installed
self.to_remove = to_remove
self.missing_pkgs = missing_pkgs
self.duplicated_pkgs = duplicated_pkgs
self.nodetype = nodetype
self.stale = False
self.pending_query = False
@ -186,6 +189,7 @@ class AgentNeighbour(object):
"installed": self.installed,
"to_remove": self.to_remove,
"missing_pkgs": self.missing_pkgs,
"duplicated_pkgs": self.duplicated_pkgs,
"nodetype": self.nodetype,
"subfunctions": self.subfunctions,
"sw_version": self.sw_version,
@ -421,6 +425,7 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage):
self.to_install = {}
self.to_remove = []
self.missing_pkgs = []
self.duplicated_pkgs = {}
self.subfunctions = []
self.nodetype = "unknown"
self.agent_sw_version = "unknown"
@ -434,6 +439,8 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage):
self.to_remove = data['to_remove']
if 'missing_pkgs' in data:
self.missing_pkgs = data['missing_pkgs']
if 'duplicated_pkgs' in data:
self.duplicated_pkgs = data['duplicated_pkgs']
if 'nodetype' in data:
self.nodetype = data['nodetype']
if 'sw_version' in data:
@ -455,6 +462,7 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage):
pc.hosts[ip].handle_query_detailed_resp(self.installed,
self.to_remove,
self.missing_pkgs,
self.duplicated_pkgs,
self.nodetype,
self.agent_sw_version,
self.subfunctions,