From ac49ff342c7120fcd9d5196eecc6bbaadaa65596 Mon Sep 17 00:00:00 2001 From: Davlet Panech Date: Mon, 8 Nov 2021 10:40:20 -0500 Subject: [PATCH] use curl + avoid partial downloads Mirror scripts sometimes leave corrupted/partial files behind. Problems ======== 1) wget is called with the -O flag, and the server returns an HTTP error for the requested URL (404 etc). Wget leaves a zero-length file behind. This doesn't seem to happen without the -O flag. 2) wget starts the download which stalls & times out half-way; wget gives up and requests the same file with a byte offset of the form "Range: bytes=1234-", and the web server doesn't support open-ended ranges. In this case wget prints out a warning, leaves a partial file behind and returns success. 3) Sites like GitHub generate repo tarballs on the fly, eg: https://github.com/kubernetes/kubernetes/archive/refs/tags/v1.19.3.tar.gz Since tags can move, downloading such a file twice may result in a different file. Therefore HTTP "resume download" may corrupt files in this case. 4) Git "keyword expansion" feature may result in differences in source files being downloaded. For example, this file: https://github.com/kubernetes/kubernetes/blob/v1.19.3/staging/src/k8s.io/component-base/version/base.go contains lines similar to: gitVersion = "v0.0.0-master+$Format:%h$" where %h is replaced with a short SHA when the tar file is exported/downloaded. How short the SHA is depends on git history and sometimes results in shortened SHAs of different lengths. So downloading that file may result in different files. Therefore HTTP "Range" header may corrupt files in this case as well. 5) Curl is invoked with the "--retry" option and starts the download; connection stalls; curl gives up, connects again, skips the 1st N bytes and appends to the partial file. If the file changes while we are doing this, it will end up corrupting the file. This is very unlikely to happen and I haven't been able to reproduce this case. Problems with HTTP Range header =============================== Curl/wget "resume/continue download" feature has no way of verifying whether the partial file on disk, and the one being re-requested, are in fact the same file. If the file changes on the server between downloads, "resume download" will corrupt it. Some web servers don't support this at all, which triggers case (2) with wget. Some web servers support the Range header, but require that the end byte position is present. This is not compatible with wget & curl. For example curl & wget add headers similar to: "Range: bytes=1234-" means give me the file starting at offset 1234 and till EOF. This also triggers case (2). This patch ========== * Always download the file to a temporary name, then rename into place * Use curl instead wget (better error handling). The only exception is "recursive downloads", which curl doesn't support. Bug: https://bugs.launchpad.net/starlingx/+bug/1950017 Change-Id: Iaa89009ce23efe5b73ecb8163556ce6db932028b Signed-off-by: Davlet Panech --- centos-mirror-tools/dl_lower_layer_rpms.sh | 23 +++--- .../dl_other_from_centos_repo.sh | 3 +- centos-mirror-tools/dl_rpms.sh | 2 + centos-mirror-tools/dl_tarball.sh | 17 ++-- centos-mirror-tools/download_mirror.sh | 2 +- .../make_stx_mirror_yum_conf.sh | 4 +- centos-mirror-tools/starlingx_add_pkgs.sh | 5 +- .../stx_mirror_scripts/dl_utils.sh | 14 ++-- centos-mirror-tools/utils.sh | 80 ++++++++++++++++--- centos-mirror-tools/utils_tests.sh | 12 +-- 10 files changed, 117 insertions(+), 45 deletions(-) diff --git a/centos-mirror-tools/dl_lower_layer_rpms.sh b/centos-mirror-tools/dl_lower_layer_rpms.sh index 71a92b68..af353011 100755 --- a/centos-mirror-tools/dl_lower_layer_rpms.sh +++ b/centos-mirror-tools/dl_lower_layer_rpms.sh @@ -202,19 +202,24 @@ number_of_cpus () { /usr/bin/nproc } +# FIXME: curl would work better here, but it doesn't support recursive downloads. +# +# Wget corrupts files in some cases: +# - if the download stalls half-way and --tries is set to > 1, and the web +# server doesn't support the Range header with the upper limit omitted, +# (eg Range: bytes=18671712-) wget returns success (0) and leaves a partial +# file behind +# - if download fails half-way, or wget is interrupted, wget returns +# non-zero, but may leave a partial file behind. This is to be expected, +# but we can't easily tell which files were downloaded fully in this case. +# +# See https://bugs.launchpad.net/starlingx/+bug/1950017 get_remote_dir () { local url="${1}" local dest_dir="${2}" mkdir -p "${dest_dir}" || return 1 \rm "${dest_dir}/"index.html* - wget -c -N --recursive --no-parent --no-host-directories --no-directories --directory-prefix="${dest_dir}" "${url}/" -} - -get_remote_file () { - local url="${1}" - local dest_dir="${2}" - mkdir -p "${dest_dir}" || return 1 - wget -c -N --no-parent --no-host-directories --no-directories --directory-prefix="${dest_dir}" "${url}" + wget -c -N --timeout 15 --recursive --no-parent --no-host-directories --no-directories --directory-prefix="${dest_dir}" "${url}/" } get_remote_file_overwrite () { @@ -226,7 +231,7 @@ get_remote_file_overwrite () { if [ -f "${dest_file}" ]; then \rm "${dest_file}" fi - wget -c -N --no-parent --no-host-directories --no-directories --directory-prefix="${dest_dir}" "${url}" + download_file --timestamps "$url" "$dest_file" } clean_repodata () { diff --git a/centos-mirror-tools/dl_other_from_centos_repo.sh b/centos-mirror-tools/dl_other_from_centos_repo.sh index d16e7dc4..46074cc1 100755 --- a/centos-mirror-tools/dl_other_from_centos_repo.sh +++ b/centos-mirror-tools/dl_other_from_centos_repo.sh @@ -11,6 +11,7 @@ DL_OTHER_FROM_CENTOS_REPO_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}" )" )" source $DL_OTHER_FROM_CENTOS_REPO_DIR/url_utils.sh +source $DL_OTHER_FROM_CENTOS_REPO_DIR/utils.sh usage () { echo "$0 [-D ] [-s|-S|-u|-U] [-h] []" @@ -144,7 +145,7 @@ for ff in $all; do echo "remote path: $url_prefix/$_name" echo "local path: $save_path/$_name" - if wget $url_prefix/$_name; then + if download_file $url_prefix/$_name; then file_name=`basename $_name` sub_path=`dirname $_name` if [ -e "./$file_name" ]; then diff --git a/centos-mirror-tools/dl_rpms.sh b/centos-mirror-tools/dl_rpms.sh index a88446aa..f6672951 100755 --- a/centos-mirror-tools/dl_rpms.sh +++ b/centos-mirror-tools/dl_rpms.sh @@ -345,6 +345,8 @@ download_worker () { break else echo "Warning: $rpm_name not found" + SFILE="$(get_rpm_level_name $rpm_name $lvl)" + \rm -f "$SFILE" fi done diff --git a/centos-mirror-tools/dl_tarball.sh b/centos-mirror-tools/dl_tarball.sh index d19c0d1e..99843478 100755 --- a/centos-mirror-tools/dl_tarball.sh +++ b/centos-mirror-tools/dl_tarball.sh @@ -144,7 +144,7 @@ is_tarball() { return $FOUND } -# Download function using wget command +# Download function using curl or similar command download_package() { local tarball_name="$1" @@ -169,11 +169,11 @@ download_package() { ;; esac - wget --spider "$url" + url_exists "$url" if [ $? != 0 ]; then echo "Warning: '$url' is broken" else - wget -q -t 5 --wait=15 -O "$tarball_name" "$url" + download_file --quiet "$url" "$tarball_name" if [ $? -eq 0 ]; then if is_tarball "$tarball_name"; then echo "Ok: $download_path" @@ -293,7 +293,7 @@ for line in $(cat $tarball_file); do rm -rf $directory_name popd > /dev/null # pushd $directory_name elif [[ "$tarball_name" = 'chartmuseum-v0.12.0-amd64' ]]; then - wget -q -t 5 --wait=15 -O "$tarball_name" "$tarball_url" + download_file --quiet "$tarball_url" "$tarball_name" if [ $? -ne 0 ]; then error_count=$((error_count + 1)) popd > /dev/null # pushd $output_tarball @@ -301,7 +301,7 @@ for line in $(cat $tarball_file); do fi elif [[ "$tarball_name" = 'OPAE_1.3.7-5_el7.zip' ]]; then srpm_path="${directory_name}/source_code/" - wget -q -t 5 --wait=15 -O "$tarball_name" "$tarball_url" + download_file --quiet "$tarball_url" "$tarball_name" if [ $? -ne 0 ]; then error_count=$((error_count + 1)) popd > /dev/null # pushd $output_tarball @@ -450,7 +450,7 @@ for line in $(cat $tarball_file); do src_rpm_name="$(echo "$tarball_url" | rev | cut -d/ -f1 | rev)" - wget -q -t 5 --wait=15 -O "$src_rpm_name" "$tarball_url" + download_file --quiet "$tarball_url" "$src_rpm_name" if [ $? -eq 0 ]; then rpm2cpio "$src_rpm_name" | cpio --quiet -i "$tarball_name" mv "$tarball_name" .. @@ -486,9 +486,8 @@ for line in $(cat $tarball_file); do ;; esac - download_cmd="wget -q -t 5 --wait=15 $url -O $download_path" - - if $download_cmd ; then + download_file --quiet "$url" "$download_path" + if [[ $? -eq 0 ]] ; then if ! is_tarball "$download_path"; then echo "Warning: file from $url is not a tarball." \rm "$download_path" diff --git a/centos-mirror-tools/download_mirror.sh b/centos-mirror-tools/download_mirror.sh index 407011b9..60f8291d 100755 --- a/centos-mirror-tools/download_mirror.sh +++ b/centos-mirror-tools/download_mirror.sh @@ -539,7 +539,7 @@ else fi -#download RPMs/SRPMs from 3rd_party websites (not CentOS repos) by "wget" +#download RPMs/SRPMs from 3rd_party websites (not CentOS repos) using curl echo "step #2: start downloading RPMs/SRPMs from 3rd-party websites..." list=${rpms_from_3rd_parties} level=L1 diff --git a/centos-mirror-tools/make_stx_mirror_yum_conf.sh b/centos-mirror-tools/make_stx_mirror_yum_conf.sh index 5ea18964..63762427 100755 --- a/centos-mirror-tools/make_stx_mirror_yum_conf.sh +++ b/centos-mirror-tools/make_stx_mirror_yum_conf.sh @@ -12,7 +12,7 @@ MAKE_STX_MIRROR_YUM_CONF_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}" )" )" -source "$MAKE_STX_MIRROR_YUM_CONF_DIR/url_utils.sh" +source "$MAKE_STX_MIRROR_YUM_CONF_DIR/utils.sh" || exit 1 DISTRO="centos" SUDO=sudo @@ -252,7 +252,7 @@ for REPO in $(find "$CENGN_REPOS_DIR" -type f -name '*repo'); do CENGN_URL="$(url_to_stx_mirror_url "$URL" "$DISTRO")" # Test CENGN url - wget -q --spider $CENGN_URL + url_exists --quiet "$CENGN_URL" if [ $? -eq 0 ]; then # OK, make substitution sed "s#^baseurl=$URL\$#baseurl=$CENGN_URL#" -i "$REPO" diff --git a/centos-mirror-tools/starlingx_add_pkgs.sh b/centos-mirror-tools/starlingx_add_pkgs.sh index fb792316..852cf2fe 100755 --- a/centos-mirror-tools/starlingx_add_pkgs.sh +++ b/centos-mirror-tools/starlingx_add_pkgs.sh @@ -86,7 +86,8 @@ fi STARLINGX_ADD_PKGS_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}" )" )" -source $STARLINGX_ADD_PKGS_DIR/../toCOPY/lst_utils.sh +source $STARLINGX_ADD_PKGS_DIR/../toCOPY/lst_utils.sh || exit 1 +source $STARLINGX_ADD_PKGS_DIR/utils.sh || exit 1 STXTOOLS=${MY_REPO_ROOT_DIR}/stx-tools @@ -332,7 +333,7 @@ function download_pkg { echo "Downloading $url" rpm_path=$LOCALREPO_PATH/$arch/$(basename $relativepath) - wget -q -O $rpm_path $url + download_file --quiet "$url" "$rpm_path" if [ $? -ne 0 ]; then echo "Failed to download $url" >&2 diff --git a/centos-mirror-tools/stx_mirror_scripts/dl_utils.sh b/centos-mirror-tools/stx_mirror_scripts/dl_utils.sh index afbbe75b..7e494d6f 100644 --- a/centos-mirror-tools/stx_mirror_scripts/dl_utils.sh +++ b/centos-mirror-tools/stx_mirror_scripts/dl_utils.sh @@ -8,12 +8,12 @@ DL_UTILS_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}" )" )" -if [ -f "$DL_UTILS_DIR/url_utils.sh" ]; then - source "$DL_UTILS_DIR/url_utils.sh" -elif [ -f "$DL_UTILS_DIR/../url_utils.sh" ]; then - source "$DL_UTILS_DIR/../url_utils.sh" +if [ -f "$DL_UTILS_DIR/utils.sh" ]; then + source "$DL_UTILS_DIR/utils.sh" +elif [ -f "$DL_UTILS_DIR/../utils.sh" ]; then + source "$DL_UTILS_DIR/../utils.sh" else - echo "Error: Can't find 'url_utils.sh'" + echo "Error: Can't find 'utils.sh'" exit 1 fi @@ -257,13 +257,15 @@ dl_file_from_url () { fi fi - CMD="wget '$URL' --tries=5 --wait=15 --output-document='$DOWNLOAD_PATH'" + CMD="$(get_download_file_command $URL $DOWNLOAD_PATH.dl_part)" echo "$CMD" eval $CMD if [ $? -ne 0 ]; then + \rm -f "$DOWNLOAD_PATH.dl_part" echo "Error: $CMD" return 1 fi + \mv -fT "$DOWNLOAD_PATH.dl_part" "$DOWNLOAD_PATH" ;; *) echo "Error: Unknown protocol '$PROTOCOL' for url '$URL'" diff --git a/centos-mirror-tools/utils.sh b/centos-mirror-tools/utils.sh index a9a7faf6..0af80de3 100644 --- a/centos-mirror-tools/utils.sh +++ b/centos-mirror-tools/utils.sh @@ -6,7 +6,9 @@ UTILS_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}" )" )" -source $UTILS_DIR/url_utils.sh +: ${_CURL_OPTS="--fail --location --connect-timeout 15 --speed-time 15 --speed-limit 1 --retry 5"} + +source $UTILS_DIR/url_utils.sh || exit 1 get_yum_command() { local _file=$1 @@ -26,17 +28,77 @@ get_yum_command() { echo "${SUDO} yumdownloader -q -C ${YUMCONFOPT} ${RELEASEVER} $yumdownloader_extra_opts $rpm_name" } -get_wget_command() { +# Usage: get_download_file_command [--quiet] [--timestamps] URL [OUTPUT_FILE] +get_download_file_command() { + local _opts="$_CURL_OPTS" + while true ; do + case "$1" in + --quiet) _opts+=" --silent --show-error" ;; + --timestamps) _opts+=" --remote-time" ;; + -*) + echo >&2 "Unknown option $1" + return 1 + ;; + *) + break + esac + shift + done + local _name="$1" local _ret="" - if [[ "$_name" == http?(s)://* ]]; then - _ret="wget -q $_name" + if [[ $# -gt 1 ]]; then + _opts+=" -o $2" else - _ret="wget -q $(koji_url $_name)" + _opts+=" -O" + fi + if [[ "$_name" == http?(s)://* ]]; then + _ret="curl $_opts $_name" + else + _ret="curl $_opts $(koji_url $_name)" fi echo "$_ret" } +# Usage: download_file [--quiet] [--timestamps] URL [OUTPUT_FILE] +download_file() { + local _opts="$_CURL_OPTS" + while true ; do + case "$1" in + --quiet) _opts+=" --silent --show-error" ;; + --timestamps) _opts+=" --remote-time" ;; + -*) + echo >&2 "Unknown option $1" + return 1 + ;; + *) + break + esac + shift + done + if [[ "$#" -gt 1 ]] ; then + local _dest_file="$2" + else + local _dest_file="$(basename "$1")" + fi + if curl $_opts -o "${_dest_file}.dl_part" "$1" ; then + \mv -fT "${_dest_file}.dl_part" "${_dest_file}" + return 0 + fi + \rm -f "${_dest_file}.dl_part" + return 1 +} + +# Usage: url_exists [--quiet] URL +url_exists() { + local _opts + if [[ "$1" == "--quiet" ]] ; then + _opts+=" --quiet" + shift + fi + wget $_opts --spider "$1" +} + get_rpm_level_name() { local _rpm_name=$1 local _level=$2 @@ -166,20 +228,20 @@ get_download_cmd() { local ff="$1" local _level="$2" - # Decide if the list will be downloaded using yumdownloader or wget + # Decide if the list will be downloaded using yumdownloader or curl if [[ $ff != *"#"* ]]; then rpm_name=$ff if [ $_level == "K1" ]; then - download_cmd="$(get_wget_command $rpm_name)" + download_cmd="$(get_download_file_command --quiet $rpm_name)" else # yumdownloader with the appropriate flag for src, noarch or x86_64 # download_cmd="${SUDOCMD} $(get_yum_command $rpm_name $_level)" download_cmd="$(get_yum_command $rpm_name $_level)" fi else - # Build wget command + # Build the download command rpm_url=$(get_url "$ff" "$_level") - download_cmd="$(get_wget_command $rpm_url)" + download_cmd="$(get_download_file_command --quiet $rpm_url)" fi echo "$download_cmd" diff --git a/centos-mirror-tools/utils_tests.sh b/centos-mirror-tools/utils_tests.sh index 1ffab382..f3c33b48 100644 --- a/centos-mirror-tools/utils_tests.sh +++ b/centos-mirror-tools/utils_tests.sh @@ -18,7 +18,7 @@ source utils.sh check_result() { local _res="$1" local _expect="$2" - if [ "$_res" != "$_expect" ]; then + if [[ "$_res" != $_expect ]]; then echo "Fail" echo "expected $_expect" echo "returned $_res" @@ -27,14 +27,14 @@ check_result() { echo "Success" } -# get_wget_command +# get_download_file_command -res=$(get_wget_command "https://libvirt.org/sources/python/libvirt-python-3.5.0-1.fc24.src.rpm") -expect="wget -q https://libvirt.org/sources/python/libvirt-python-3.5.0-1.fc24.src.rpm" +res=$(get_download_file_command "https://libvirt.org/sources/python/libvirt-python-3.5.0-1.fc24.src.rpm") +expect="curl* https://libvirt.org/sources/python/libvirt-python-3.5.0-1.fc24.src.rpm" check_result "$res" "$expect" -res=$(get_wget_command "python2-httpbin-0.5.0-6.el7.noarch.rpm") -expect="wget -q https://kojipkgs.fedoraproject.org/packages/python2-httpbin/0.5.0/6.el7/noarch/python2-httpbin-0.5.0-6.el7.noarch.rpm" +res=$(get_download_file_command --quiet "python2-httpbin-0.5.0-6.el7.noarch.rpm") +expect="curl*--silent* https://kojipkgs.fedoraproject.org/packages/python2-httpbin/0.5.0/6.el7/noarch/python2-httpbin-0.5.0-6.el7.noarch.rpm" check_result "$res" "$expect" # get_url