Add protection against duplicate RPMs

If a cold reboot occurs in the middle of patch installation, the
system can be left in a state where the patch-agent is unable to
perform its operations properly. The RPM database can be left with
duplicate RPMs due to the incomplete transaction, which can in turn
lead to DNF update installation issues.

This update adds detection of duplicate RPMs to the patch-agent to
avoid attempting installation until the system is recovered.

Additionally, protection is added to the sw-patch init script to treat a
second consecutive reboot-required patch installation during init as an
error, to avoid boot loops.

Closes-Bug: 1904928
Change-Id: Ia06a6f669c45398d7956f2ac2caa76c447bc1b16
Signed-off-by: Don Penney <don.penney@windriver.com>
This commit is contained in:
Don Penney 2020-11-19 10:41:27 -05:00
parent 1c8d87d404
commit 62a66370ca
3 changed files with 124 additions and 75 deletions

View File

@ -1,12 +1,12 @@
#!/bin/bash
#
# Copyright (c) 2014-2019 Wind River Systems, Inc.
# Copyright (c) 2014-2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# StarlingX Patching
# chkconfig: 345 20 23
# description: CGCS Patching init script
# description: StarlingX Patching init script
### BEGIN INIT INFO
# Provides: sw-patch
@ -25,6 +25,7 @@ NAME=$(basename $0)
logfile=/var/log/patching.log
patch_failed_file=/var/run/patch_install_failed
patched_during_init=/etc/patching/.patched_during_init
function LOG_TO_FILE {
echo "`date "+%FT%T.%3N"`: $NAME: $*" >> $logfile
@ -32,11 +33,24 @@ function LOG_TO_FILE {
function check_for_rr_patch {
    # Handle the reboot-required flag left behind by patch installation.
    #
    # - Flag absent: clear the ${patched_during_init} marker and return.
    # - Flag present, marker absent: create the marker and reboot.
    # - Flag present, marker already present: the node was patched during
    #   init a second consecutive time; record the failure in
    #   ${patch_failed_file}, clear the marker and exit 1 instead of
    #   rebooting again.

    # Nothing requires a reboot: drop any leftover marker.
    if [ ! -f /var/run/node_is_patched_rr ]; then
        rm -f ${patched_during_init}
        return
    fi

    # Marker already exists, so a reboot was already triggered from here.
    if [ -f ${patched_during_init} ]; then
        local repeat_msg="Node has been patched during init a second consecutive time. Skipping reboot due to possible error"
        echo
        echo "${repeat_msg}"
        echo
        LOG_TO_FILE "${repeat_msg}"
        touch ${patch_failed_file}
        rm -f ${patched_during_init}
        exit 1
    fi

    # First reboot-required patch seen during init: leave the marker, reboot.
    echo
    echo "Node has been patched and requires an immediate reboot."
    echo
    LOG_TO_FILE "Node has been patched, with reboot-required flag set. Rebooting"
    touch ${patched_during_init}
    /sbin/reboot
}

View File

@ -204,6 +204,7 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage):
self.message['installed'] = pa.installed
self.message['to_remove'] = pa.to_remove
self.message['missing_pkgs'] = pa.missing_pkgs
self.message['duplicated_pkgs'] = pa.duplicated_pkgs
self.message['nodetype'] = cfg.nodetype
self.message['sw_version'] = SW_VERSION
self.message['subfunctions'] = subfunctions
@ -340,6 +341,7 @@ class PatchAgent(PatchService):
self.to_remove_dnf = []
self.missing_pkgs = []
self.missing_pkgs_dnf = []
self.duplicated_pkgs = {}
self.patch_op_counter = 0
self.node_is_patched = os.path.exists(node_is_patched_file)
self.node_is_patched_timestamp = 0
@ -384,15 +386,22 @@ class PatchAgent(PatchService):
self.listener.bind(('', self.port))
self.listener.listen(2) # Allow two connections, for two controllers
@staticmethod
def pkgobj_to_version_str(pkg):
    """Return the version string for *pkg* in patch-controller format.

    The result is ``version-release@arch``, prefixed with ``epoch:`` only
    when ``pkg.epoch`` is not 0.  *pkg* must expose ``epoch``, ``version``,
    ``release`` and ``arch`` attributes.
    """
    # Build the optional "epoch:" prefix first so one format covers both cases.
    epoch_prefix = "%s:" % (pkg.epoch,) if pkg.epoch != 0 else ""
    return "%s%s-%s@%s" % (epoch_prefix, pkg.version, pkg.release, pkg.arch)
@staticmethod
def pkgobjs_to_list(pkgobjs):
    """Transform package objects to the format used by patch-controller.

    Returns a dict mapping each package's ``name`` to the version string
    produced by ``PatchAgent.pkgobj_to_version_str``.  If several objects
    share a name, the last one iterated wins.
    """
    # Formatting is delegated entirely to the shared helper; the previous
    # inline epoch/no-epoch formatting was overwritten by the helper's
    # result on every iteration and has been dropped.
    return {pkg.name: PatchAgent.pkgobj_to_version_str(pkg) for pkg in pkgobjs}
@ -481,6 +490,18 @@ class PatchAgent(PatchService):
pkgs_installed = dnf.sack._rpmdb_sack(self.dnfb).query().installed() # pylint: disable=protected-access
avail = self.dnfb.sack.query().available().latest()
# Check for packages with multiple installed versions.
# Result shape: {name: {arch: [version strings]}}.
self.duplicated_pkgs = {}
for pkg in pkgs_installed:
    pkglist = pkgs_installed.filter(name=pkg.name, arch=pkg.arch)
    if len(pkglist) > 1:
        dups_by_arch = self.duplicated_pkgs.setdefault(pkg.name, {})
        if pkg.arch not in dups_by_arch:
            # Store a real list: on Python 3 a bare map() is a one-shot
            # iterator that the join below would exhaust, leaving nothing
            # usable in the dict (which is also reported to the
            # patch-controller in the detailed query response).
            dups_by_arch[pkg.arch] = [PatchAgent.pkgobj_to_version_str(dup)
                                      for dup in pkglist]
            LOG.warn("Duplicate packages installed: %s %s",
                     pkg.name, ", ".join(dups_by_arch[pkg.arch]))
# There are three possible actions:
# 1. If installed pkg is not in a repo, remove it.
# 2. If installed pkg version does not match newest repo version, update it.
@ -538,6 +559,8 @@ class PatchAgent(PatchService):
LOG.info("To install: %s", self.to_install)
LOG.info("To remove: %s", self.to_remove)
LOG.info("Missing: %s", self.missing_pkgs)
if len(self.duplicated_pkgs) > 0:
LOG.info("Duplicated: %s", self.duplicated_pkgs)
return True
@ -625,6 +648,10 @@ class PatchAgent(PatchService):
changed = False
rc = True
if len(self.duplicated_pkgs) > 0:
LOG.error("Duplicate installed packages found. Manual recovery is required.")
rc = False
else:
if len(self.to_install_dnf) > 0 or len(self.to_downgrade_dnf) > 0:
LOG.info("Adding pkgs to installation set: %s", self.to_install)
for pkg in self.to_install_dnf:

View File

@ -115,6 +115,7 @@ class AgentNeighbour(object):
self.installed = {}
self.to_remove = []
self.missing_pkgs = []
self.duplicated_pkgs = {}
self.nodetype = None
self.sw_version = "unknown"
self.subfunctions = []
@ -156,6 +157,7 @@ class AgentNeighbour(object):
installed,
to_remove,
missing_pkgs,
duplicated_pkgs,
nodetype,
sw_version,
subfunctions,
@ -163,6 +165,7 @@ class AgentNeighbour(object):
self.installed = installed
self.to_remove = to_remove
self.missing_pkgs = missing_pkgs
self.duplicated_pkgs = duplicated_pkgs
self.nodetype = nodetype
self.stale = False
self.pending_query = False
@ -186,6 +189,7 @@ class AgentNeighbour(object):
"installed": self.installed,
"to_remove": self.to_remove,
"missing_pkgs": self.missing_pkgs,
"duplicated_pkgs": self.duplicated_pkgs,
"nodetype": self.nodetype,
"subfunctions": self.subfunctions,
"sw_version": self.sw_version,
@ -421,6 +425,7 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage):
self.to_install = {}
self.to_remove = []
self.missing_pkgs = []
self.duplicated_pkgs = {}
self.subfunctions = []
self.nodetype = "unknown"
self.agent_sw_version = "unknown"
@ -434,6 +439,8 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage):
self.to_remove = data['to_remove']
if 'missing_pkgs' in data:
self.missing_pkgs = data['missing_pkgs']
if 'duplicated_pkgs' in data:
self.duplicated_pkgs = data['duplicated_pkgs']
if 'nodetype' in data:
self.nodetype = data['nodetype']
if 'sw_version' in data:
@ -455,6 +462,7 @@ class PatchMessageQueryDetailedResp(messages.PatchMessage):
pc.hosts[ip].handle_query_detailed_resp(self.installed,
self.to_remove,
self.missing_pkgs,
self.duplicated_pkgs,
self.nodetype,
self.agent_sw_version,
self.subfunctions,