From 588529121333a68901c21a89fe32c303e41c0ab3 Mon Sep 17 00:00:00 2001 From: Johannes Schauer Marin Rodrigues Date: Thu, 2 Mar 2023 17:24:14 +0100 Subject: [PATCH] Use a caching apt proxy instead of copying /var/cache/apt/archives/*.deb - only download Release files once and not by apt as well as with curl and thus avoid a mirror push happening between both downloads - no heuristic needed to place the files in their correct mirror locations - no manual checksum checking - only throttle download speed when actually downloading and not when retrieving files from the cache - no translation of filenames between how the epoch colon is stored in files in /var/cache/apt/archives versus how it is stored in files on the mirrors - no special handling of stable update and security mirrors - implemented in Python instead of shell and thus an order of magnitude faster --- caching_proxy.py | 108 +++++++++++++ coverage.py | 2 +- coverage.sh | 6 +- make_mirror.sh | 222 +++++++-------------------- tests/as-debootstrap-unshare-wrapper | 1 + tests/check-against-debootstrap-dist | 1 + tests/dist-using-codename | 6 +- 7 files changed, 177 insertions(+), 169 deletions(-) create mode 100755 caching_proxy.py diff --git a/caching_proxy.py b/caching_proxy.py new file mode 100755 index 0000000..40053f1 --- /dev/null +++ b/caching_proxy.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +import sys +import os +import time +import http.client +import http.server +from io import StringIO +import pathlib +import urllib.parse + +oldcachedir = None +newcachedir = None +readonly = False + + +class ProxyRequestHandler(http.server.BaseHTTPRequestHandler): + def do_GET(self): + assert int(self.headers.get("Content-Length", 0)) == 0 + assert self.headers["Host"] + pathprefix = "http://" + self.headers["Host"] + "/" + assert self.path.startswith(pathprefix) + sanitizedpath = urllib.parse.unquote(self.path.removeprefix(pathprefix)) + oldpath = oldcachedir / sanitizedpath + newpath = newcachedir / 
sanitizedpath + + if not readonly: + newpath.parent.mkdir(parents=True, exist_ok=True) + + # just send back to client + if newpath.exists(): + print(f"proxy cached: {self.path}", file=sys.stderr) + self.wfile.write(b"HTTP/1.1 200 OK\r\n") + self.send_header("Content-Length", newpath.stat().st_size) + self.end_headers() + with newpath.open(mode="rb") as new: + while True: + buf = new.read(64 * 1024) # same as shutil uses + if not buf: + break + self.wfile.write(buf) + self.wfile.flush() + return + + if readonly: + newpath = pathlib.Path("/dev/null") + + # copy from oldpath to newpath and send back to client + if oldpath.exists(): + print(f"proxy cached: {self.path}", file=sys.stderr) + self.wfile.write(b"HTTP/1.1 200 OK\r\n") + self.send_header("Content-Length", oldpath.stat().st_size) + self.end_headers() + with oldpath.open(mode="rb") as old, newpath.open(mode="wb") as new: + while True: + buf = old.read(64 * 1024) # same as shutil uses + if not buf: + break + self.wfile.write(buf) + new.write(buf) + self.wfile.flush() + return + + # download fresh copy + try: + print(f"\rproxy download: {self.path}", file=sys.stderr) + conn = http.client.HTTPConnection(self.headers["Host"], timeout=5) + conn.request("GET", self.path, None, dict(self.headers)) + res = conn.getresponse() + assert (res.status, res.reason) == (200, "OK"), (res.status, res.reason) + self.wfile.write(b"HTTP/1.1 200 OK\r\n") + for k, v in res.getheaders(): + # do not allow a persistent connection + if k == "connection": + continue + self.send_header(k, v) + self.end_headers() + with newpath.open(mode="wb") as f: + while True: + buf = res.read(64 * 1024) # same as shutil uses + if not buf: + break + self.wfile.write(buf) + f.write(buf) + time.sleep(64 / 1024) # 1024 kB/s + self.wfile.flush() + except Exception as e: + self.send_error(502) + + +def main(): + global oldcachedir, newcachedir, readonly + if sys.argv[1] == "--readonly": + readonly = True + oldcachedir = pathlib.Path(sys.argv[2]) + newcachedir 
= pathlib.Path(sys.argv[3]) + else: + oldcachedir = pathlib.Path(sys.argv[1]) + newcachedir = pathlib.Path(sys.argv[2]) + print(f"starting caching proxy for {newcachedir}", file=sys.stderr) + httpd = http.server.ThreadingHTTPServer( + server_address=("", 8080), RequestHandlerClass=ProxyRequestHandler + ) + httpd.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/coverage.py b/coverage.py index 9d0d2ac..fe1ca87 100755 --- a/coverage.py +++ b/coverage.py @@ -39,7 +39,7 @@ all_formats = ["auto", "directory", "tar", "squashfs", "ext2", "null"] mirror = os.getenv("mirror", "http://127.0.0.1/debian") hostarch = subprocess.check_output(["dpkg", "--print-architecture"]).decode().strip() -release_path = f"./shared/cache/debian/dists/{default_dist}/Release" +release_path = f"./shared/cache/debian/dists/{default_dist}/InRelease" if not os.path.exists(release_path): print("path doesn't exist:", release_path, file=sys.stderr) print("run ./make_mirror.sh first", file=sys.stderr) diff --git a/coverage.sh b/coverage.sh index 30abd4d..095b804 100755 --- a/coverage.sh +++ b/coverage.sh @@ -22,8 +22,10 @@ if [ -e ./mmdebstrap ]; then perlcritic --severity 4 --verbose 8 ./mmdebstrap fi -[ -e ./tarfilter ] && black --check ./tarfilter -[ -e ./coverage.py ] && black --check ./coverage.py +for f in tarfilter coverage.py caching_proxy.py; do + [ -e "./$f" ] || continue + black --check "./$f" +done shellcheck --exclude=SC2016 coverage.sh make_mirror.sh run_null.sh run_qemu.sh gpgvnoexpkeysig hooks/*/*.sh diff --git a/make_mirror.sh b/make_mirror.sh index c1417c2..cc13cf7 100755 --- a/make_mirror.sh +++ b/make_mirror.sh @@ -92,76 +92,11 @@ deletecache() { } cleanup_newcachedir() { + kill "$PROXYPID" || : echo "running cleanup_newcachedir" deletecache "$newcachedir" } -get_oldaptnames() { - if [ ! 
-e "$1/$2" ]; then - return - fi - xz -dc "$1/$2" \ - | grep-dctrl --no-field-names --show-field=Package,Version,Architecture,Filename '' \ - | paste -sd " \n" \ - | while read -r name ver arch fname; do - if [ ! -e "$1/$fname" ]; then - continue - fi - # apt stores deb files with the colon encoded as %3a while - # mirrors do not contain the epoch at all #645895 - case "$ver" in *:*) ver="${ver%%:*}%3a${ver#*:}";; esac - aptname="$rootdir/var/cache/apt/archives/${name}_${ver}_${arch}.deb" - # we have to cp and not mv because other - # distributions might still need this file - # we have to cp and not symlink because apt - # doesn't recognize symlinks - cp --link "$1/$fname" "$aptname" - echo "$aptname" - done -} - -get_newaptnames() { - if [ ! -e "$1/$2" ]; then - return - fi - # skip empty files by trying to uncompress the first byte of the payload - if [ "$(xz -dc "$1/$2" | head -c1 | wc -c)" -eq 0 ]; then - return - fi - xz -dc "$1/$2" \ - | grep-dctrl --no-field-names --show-field=Package,Version,Architecture,Filename,SHA256 '' \ - | paste -sd " \n" \ - | while read -r name ver arch fname hash; do - # sanity check for the hash because sometimes the - # archive switches the hash algorithm - if [ "${#hash}" -ne 64 ]; then - echo "expected hash length of 64 but got ${#hash} for: $hash" >&2 - exit 1 - fi - dir="${fname%/*}" - # apt stores deb files with the colon encoded as %3a while - # mirrors do not contain the epoch at all #645895 - case "$ver" in *:*) ver="${ver%%:*}%3a${ver#*:}";; esac - aptname="$rootdir/var/cache/apt/archives/${name}_${ver}_${arch}.deb" - if [ -e "$aptname" ]; then - # make sure that we found the right file by checking its hash - echo "$hash $aptname" | sha256sum --check >&2 - mkdir -p "$1/$dir" - # since we move hardlinks around, the same hardlink might've been - # moved already into the same place by another distribution. - # mv(1) refuses to copy A to B if both are hardlinks of each other. 
- if [ -e "$aptname" ] && [ -e "$1/$fname" ] && [ "$(stat -c "%d %i" "$aptname")" = "$(stat -c "%d %i" "$1/$fname")" ]; then - # both files are already the same so we just need to - # delete the source - rm "$aptname" - else - mv "$aptname" "$1/$fname" - fi - echo "$aptname" - fi - done -} - cleanupapt() { echo "running cleanupapt" >&2 if [ ! -e "$rootdir" ]; then @@ -175,12 +110,11 @@ cleanupapt() { "$rootdir/var/lib/dpkg/status" \ "$rootdir/var/lib/dpkg/lock-frontend" \ "$rootdir/var/lib/dpkg/lock" \ + "$rootdir/var/lib/apt/lists/lock" \ "$rootdir/etc/apt/apt.conf" \ "$rootdir/etc/apt/sources.list.d/"* \ "$rootdir/etc/apt/preferences.d/"* \ "$rootdir/etc/apt/sources.list" \ - "$rootdir/oldaptnames" \ - "$rootdir/newaptnames" \ "$rootdir/var/cache/apt/archives/lock"; do if [ ! -e "$f" ]; then echo "does not exist: $f" >&2 @@ -229,9 +163,7 @@ Apt::Get::Download-Only true; Acquire::Languages "none"; Dir::Etc::Trusted "/etc/apt/trusted.gpg"; Dir::Etc::TrustedParts "/etc/apt/trusted.gpg.d"; -Acquire::http::Dl-Limit "1000"; -Acquire::https::Dl-Limit "1000"; -Acquire::Retries "5"; +Acquire::http::Proxy "http://127.0.0.1:8080/"; END : > "$rootdir/var/lib/dpkg/status" @@ -256,26 +188,6 @@ END APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get update - # before downloading packages and before replacing the old Packages - # file, copy all old *.deb packages from the mirror to - # /var/cache/apt/archives so that apt will not re-download *.deb - # packages that we already have - { - get_oldaptnames "$oldmirrordir" "dists/$dist/main/binary-$nativearch/Packages.xz" - case "$dist" in oldstable|stable) - get_oldaptnames "$oldmirrordir" "dists/$dist-updates/main/binary-$nativearch/Packages.xz" - ;; - esac - case "$dist" in - oldstable) - get_oldaptnames "$oldcachedir/debian-security" "dists/$dist/updates/main/binary-$nativearch/Packages.xz" - ;; - stable) - get_oldaptnames "$oldcachedir/debian-security" "dists/$dist-security/main/binary-$nativearch/Packages.xz" - ;; - esac - } | sort 
-u > "$rootdir/oldaptnames" - pkgs=$(APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get indextargets \ --format '$(FILENAME)' 'Created-By: Packages' "Architecture: $nativearch" \ | xargs --delimiter='\n' /usr/lib/apt/apt-helper cat-file \ @@ -296,73 +208,8 @@ END # shellcheck disable=SC2086 APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get --yes install $pkgs - # to be able to also test gpg verification, we need to create a mirror - mkdir -p "$newmirrordir/dists/$dist/main/binary-$nativearch/" - curl --location "$mirror/dists/$dist/Release" > "$newmirrordir/dists/$dist/Release" - curl --location "$mirror/dists/$dist/Release.gpg" > "$newmirrordir/dists/$dist/Release.gpg" - curl --location "$mirror/dists/$dist/main/binary-$nativearch/Packages.xz" > "$newmirrordir/dists/$dist/main/binary-$nativearch/Packages.xz" - codename=$(awk '/^Codename: / { print $2; }' < "$newmirrordir/dists/$dist/Release") - [ -L "$newmirrordir/dists/$codename" ] || ln -s "$dist" "$newmirrordir/dists/$codename" - case "$dist" in oldstable|stable) - mkdir -p "$newmirrordir/dists/$dist-updates/main/binary-$nativearch/" - curl --location "$mirror/dists/$dist-updates/Release" > "$newmirrordir/dists/$dist-updates/Release" - curl --location "$mirror/dists/$dist-updates/Release.gpg" > "$newmirrordir/dists/$dist-updates/Release.gpg" - curl --location "$mirror/dists/$dist-updates/main/binary-$nativearch/Packages.xz" > "$newmirrordir/dists/$dist-updates/main/binary-$nativearch/Packages.xz" - [ -L "$newmirrordir/dists/$codename-updates" ] || ln -s "$dist-updates" "$newmirrordir/dists/$codename-updates" - ;; - esac - case "$dist" in - oldstable) - mkdir -p "$newcachedir/debian-security/dists/$dist/updates/main/binary-$nativearch/" - curl --location "$security_mirror/dists/$dist/updates/Release" > "$newcachedir/debian-security/dists/$dist/updates/Release" - curl --location "$security_mirror/dists/$dist/updates/Release.gpg" > "$newcachedir/debian-security/dists/$dist/updates/Release.gpg" - curl --location 
"$security_mirror/dists/$dist/updates/main/binary-$nativearch/Packages.xz" > "$newcachedir/debian-security/dists/$dist/updates/main/binary-$nativearch/Packages.xz" - ;; - stable) - mkdir -p "$newcachedir/debian-security/dists/$dist-security/main/binary-$nativearch/" - curl --location "$security_mirror/dists/$dist-security/Release" > "$newcachedir/debian-security/dists/$dist-security/Release" - curl --location "$security_mirror/dists/$dist-security/Release.gpg" > "$newcachedir/debian-security/dists/$dist-security/Release.gpg" - curl --location "$security_mirror/dists/$dist-security/main/binary-$nativearch/Packages.xz" > "$newcachedir/debian-security/dists/$dist-security/main/binary-$nativearch/Packages.xz" - [ -L "$newcachedir/debian-security/dists/$codename-security" ] || ln -s "$dist-security" "$newcachedir/debian-security/dists/$codename-security" - ;; - esac - - # the deb files downloaded by apt must be moved to their right locations in the - # pool directory - # - # Instead of parsing the Packages file, we could also attempt to move the deb - # files ourselves to the appropriate pool directories. But that approach - # requires re-creating the heuristic by which the directory is chosen, requires - # stripping the epoch from the filename and will break once mirrors change. - # This way, it doesn't matter where the mirror ends up storing the package. 
- { - get_newaptnames "$newmirrordir" "dists/$dist/main/binary-$nativearch/Packages.xz"; - case "$dist" in oldstable|stable) - get_newaptnames "$newmirrordir" "dists/$dist-updates/main/binary-$nativearch/Packages.xz" - ;; - esac - case "$dist" in - oldstable) - get_newaptnames "$newcachedir/debian-security" "dists/$dist/updates/main/binary-$nativearch/Packages.xz" - ;; - stable) - get_newaptnames "$newcachedir/debian-security" "dists/$dist-security/main/binary-$nativearch/Packages.xz" - ;; - esac - } | sort -u > "$rootdir/newaptnames" - rm "$rootdir/var/cache/apt/archives/lock" rmdir "$rootdir/var/cache/apt/archives/partial" - # remove all packages that were in the old Packages file but not in the - # new one anymore - comm -23 "$rootdir/oldaptnames" "$rootdir/newaptnames" | xargs --delimiter="\n" --no-run-if-empty rm - # now the apt cache should be empty - if [ -n "$(ls -1qA "$rootdir/var/cache/apt/archives/")" ]; then - echo "$rootdir/var/cache/apt/archives not empty:" - ls -la "$rootdir/var/cache/apt/archives/" - exit 1 - fi - APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get --option Dir::Etc::SourceList=/dev/null update APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get clean @@ -425,8 +272,8 @@ components=main : "${CMD:=./mmdebstrap}" : "${USE_HOST_APT_CONFIG:=no}" -if [ -e "$oldmirrordir/dists/$DEFAULT_DIST/Release" ]; then - http_code=$(curl --output /dev/null --silent --location --head --time-cond "$oldmirrordir/dists/$DEFAULT_DIST/Release" --write-out '%{http_code}' "$mirror/dists/$DEFAULT_DIST/Release") +if [ -e "$oldmirrordir/dists/$DEFAULT_DIST/InRelease" ]; then + http_code=$(curl --output /dev/null --silent --location --head --time-cond "$oldmirrordir/dists/$DEFAULT_DIST/InRelease" --write-out '%{http_code}' "$mirror/dists/$DEFAULT_DIST/InRelease") case "$http_code" in 200) ;; # need update 304) echo up-to-date; exit 0;; @@ -434,6 +281,19 @@ if [ -e "$oldmirrordir/dists/$DEFAULT_DIST/Release" ]; then esac fi +./caching_proxy.py "$oldcachedir" "$newcachedir" 
& +PROXYPID=$! + +for i in $(seq 10); do + curl --proxy "http://127.0.0.1:8080/" --silent -o /dev/null "http://deb.debian.org/debian/dists/$DEFAULT_DIST/InRelease" && break + sleep 1 +done +if [ ! -s "$newmirrordir/dists/$DEFAULT_DIST/InRelease" ]; then + echo "failed to start proxy" >&2 + kill $PROXYPID + exit 1 +fi + trap "cleanup_newcachedir" EXIT INT TERM mkdir -p "$newcachedir" @@ -447,8 +307,12 @@ elif [ "$HOSTARCH" = arm64 ]; then arches="$arches amd64 armhf" fi -for nativearch in $arches; do - for dist in oldstable stable testing unstable; do +# we need the split_inline_sig() function +# shellcheck disable=SC1091 +. /usr/share/debootstrap/functions + +for dist in oldstable stable testing unstable; do + for nativearch in $arches; do # non-host architectures are only downloaded for $DEFAULT_DIST if [ "$nativearch" != "$HOSTARCH" ] && [ "$DEFAULT_DIST" != "$dist" ]; then continue @@ -483,8 +347,20 @@ END ;; esac done + codename=$(awk '/^Codename: / { print $2; }' < "$newmirrordir/dists/$dist/InRelease") + ln -s "$dist" "$newmirrordir/dists/$codename" + + # split the InRelease file into Release and Release.gpg not because apt + # or debootstrap need it that way but because grep-dctrl does + split_inline_sig \ + "$newmirrordir/dists/$dist/InRelease" \ + "$newmirrordir/dists/$dist/Release" \ + "$newmirrordir/dists/$dist/Release.gpg" + touch --reference="$newmirrordir/dists/$dist/InRelease" "$newmirrordir/dists/$dist/Release" "$newmirrordir/dists/$dist/Release.gpg" done +kill $PROXYPID + # Create some symlinks so that we can trick apt into accepting multiple apt # lines that point to the same repository but look different. 
This is to # avoid the warning: @@ -530,6 +406,25 @@ if [ "$HAVE_QEMU" = "yes" ]; then ;; esac + # we use the caching proxy again when building the qemu image + # - we can re-use the packages that were already downloaded earlier + # - we make sure that the qemu image uses the same Release file even + # if a mirror push happened between now and earlier + # - we avoid polluting the mirror with the additional packages by + # using --readonly + ./caching_proxy.py --readonly "$oldcachedir" "$newcachedir" & + PROXYPID=$! + + for i in $(seq 10); do + curl --proxy "http://127.0.0.1:8080/" --silent -o /dev/null "http://deb.debian.org/debian/dists/$DEFAULT_DIST/InRelease" && break + sleep 1 + done + if [ ! -s "$newmirrordir/dists/$DEFAULT_DIST/InRelease" ]; then + echo "failed to start proxy" >&2 + kill $PROXYPID + exit 1 + fi + # We must not use any --dpkgopt here because any dpkg options still # leak into the chroot with chrootless mode. # We do not use our own package cache here because @@ -571,11 +466,12 @@ if [ "$HAVE_QEMU" = "yes" ]; then arches=$HOSTARCH fi $CMD --variant=apt --architectures="$arches" --include="$pkgs" \ - --aptopt='Acquire::http::Dl-Limit "1000"' \ - --aptopt='Acquire::https::Dl-Limit "1000"' \ - --aptopt='Acquire::Retries "5"' \ + --setup-hook='echo "Acquire::http::Proxy \"http://127.0.0.1:8080/\";" > "$1/etc/apt/apt.conf.d/00proxy"' \ + --customize-hook='rm "$1/etc/apt/apt.conf.d/00proxy"' \ "$DEFAULT_DIST" - "$mirror" > "$tmpdir/debian-chroot.tar" + kill $PROXYPID + cat << END > "$tmpdir/mmdebstrap.service" [Unit] Description=mmdebstrap worker script diff --git a/tests/as-debootstrap-unshare-wrapper b/tests/as-debootstrap-unshare-wrapper index 6c88564..0473983 100644 --- a/tests/as-debootstrap-unshare-wrapper +++ b/tests/as-debootstrap-unshare-wrapper @@ -78,6 +78,7 @@ find /tmp/debian-debootstrap/run/ -mindepth 1 -maxdepth 1 ! 
-name lock -print0 | # debootstrap doesn't clean apt rm /tmp/debian-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_unstable_main_binary-{{ HOSTARCH }}_Packages \ + /tmp/debian-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_unstable_InRelease \ /tmp/debian-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_unstable_Release \ /tmp/debian-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_unstable_Release.gpg diff --git a/tests/check-against-debootstrap-dist b/tests/check-against-debootstrap-dist index e562bd6..d51dc27 100644 --- a/tests/check-against-debootstrap-dist +++ b/tests/check-against-debootstrap-dist @@ -96,6 +96,7 @@ fi find /tmp/debian-{{ DIST }}-debootstrap/run/ -mindepth 1 -maxdepth 1 ! -name lock -print0 | xargs --no-run-if-empty -0 rm -r # debootstrap doesn't clean apt rm /tmp/debian-{{ DIST }}-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_{{ DIST }}_main_binary-{{ HOSTARCH }}_Packages \ + /tmp/debian-{{ DIST }}-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_{{ DIST }}_InRelease \ /tmp/debian-{{ DIST }}-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_{{ DIST }}_Release \ /tmp/debian-{{ DIST }}-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_{{ DIST }}_Release.gpg diff --git a/tests/dist-using-codename b/tests/dist-using-codename index 2cf7d89..96d8929 100644 --- a/tests/dist-using-codename +++ b/tests/dist-using-codename @@ -4,9 +4,9 @@ set -eu export LC_ALL=C.UTF-8 -trap "rm -f Release; rm -rf /tmp/debian-chroot.tar /tmp/expected" EXIT INT TERM -/usr/lib/apt/apt-helper download-file "{{ MIRROR }}/dists/{{ DIST }}/Release" Release -codename=$(awk '/^Codename: / { print $2; }' Release) +trap "rm -f InRelease; rm -rf /tmp/debian-chroot.tar /tmp/expected" EXIT INT TERM +/usr/lib/apt/apt-helper download-file "{{ MIRROR }}/dists/{{ DIST }}/InRelease" InRelease +codename=$(awk '/^Codename: / { print $2; }' InRelease) {{ CMD }} --mode={{ MODE }} --variant=apt "$codename" /tmp/debian-chroot.tar {{ MIRROR }} echo "deb {{ 
MIRROR }} $codename main" > /tmp/expected tar --to-stdout --extract --file /tmp/debian-chroot.tar ./etc/apt/sources.list \