Use a caching apt proxy instead of copying /var/cache/apt/archives/*.deb

 - Release files are only downloaded once instead of once by apt and
   again with curl, which avoids a mirror push happening between the
   two downloads
 - no heuristic needed to place the files in their correct mirror
   location
 - no manual checksum verification
 - download speed is only throttled when actually downloading, not
   when serving files from the cache
 - no translation of filenames between the epoch colon encoding used
   in /var/cache/apt/archives and the encoding used on the mirrors
 - no special handling of the stable-updates and security mirrors
 - implemented in Python instead of shell and thus an order of
   magnitude faster
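
For illustration, a minimal sketch of how such a proxy is used (the ./oldcache and ./newcache directory names and the deb.debian.org URL are placeholders; the real invocation lives in make_mirror.sh below): the proxy is started in the background, the caller waits until it answers, and apt is then pointed at it via Acquire::http::Proxy so every downloaded file also lands in the new cache directory.

    # start the proxy: serve from ./oldcache, store new downloads in ./newcache
    ./caching_proxy.py ./oldcache ./newcache &
    PROXYPID=$!
    # wait until the proxy answers on port 8080
    for i in $(seq 10); do
        curl --proxy http://127.0.0.1:8080/ --silent --output /dev/null \
            http://deb.debian.org/debian/dists/unstable/InRelease && break
        sleep 1
    done
    # route all apt http traffic through the proxy
    apt-get -o Acquire::http::Proxy="http://127.0.0.1:8080/" update
    kill "$PROXYPID"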
Johannes Schauer Marin Rodrigues 2023-03-02 17:24:14 +01:00
parent 70092c49e8
commit 5885291213
Signed by untrusted user: josch
GPG key ID: F2CBA5C78FBD83E1
7 changed files with 177 additions and 169 deletions

caching_proxy.py (new executable file, 108 lines)

@@ -0,0 +1,108 @@
#!/usr/bin/env python3

import sys
import os
import time
import http.client
import http.server
from io import StringIO
import pathlib
import urllib.parse

oldcachedir = None
newcachedir = None
readonly = False


class ProxyRequestHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        assert int(self.headers.get("Content-Length", 0)) == 0
        assert self.headers["Host"]
        pathprefix = "http://" + self.headers["Host"] + "/"
        assert self.path.startswith(pathprefix)
        sanitizedpath = urllib.parse.unquote(self.path.removeprefix(pathprefix))
        oldpath = oldcachedir / sanitizedpath
        newpath = newcachedir / sanitizedpath
        if not readonly:
            newpath.parent.mkdir(parents=True, exist_ok=True)
        # just send back to client
        if newpath.exists():
            print(f"proxy cached: {self.path}", file=sys.stderr)
            self.wfile.write(b"HTTP/1.1 200 OK\r\n")
            self.send_header("Content-Length", newpath.stat().st_size)
            self.end_headers()
            with newpath.open(mode="rb") as new:
                while True:
                    buf = new.read(64 * 1024)  # same as shutil uses
                    if not buf:
                        break
                    self.wfile.write(buf)
            self.wfile.flush()
            return
        if readonly:
            newpath = pathlib.Path("/dev/null")
        # copy from oldpath to newpath and send back to client
        if oldpath.exists():
            print(f"proxy cached: {self.path}", file=sys.stderr)
            self.wfile.write(b"HTTP/1.1 200 OK\r\n")
            self.send_header("Content-Length", oldpath.stat().st_size)
            self.end_headers()
            with oldpath.open(mode="rb") as old, newpath.open(mode="wb") as new:
                while True:
                    buf = old.read(64 * 1024)  # same as shutil uses
                    if not buf:
                        break
                    self.wfile.write(buf)
                    new.write(buf)
            self.wfile.flush()
            return
        # download fresh copy
        try:
            print(f"\rproxy download: {self.path}", file=sys.stderr)
            conn = http.client.HTTPConnection(self.headers["Host"], timeout=5)
            conn.request("GET", self.path, None, dict(self.headers))
            res = conn.getresponse()
            assert (res.status, res.reason) == (200, "OK"), (res.status, res.reason)
            self.wfile.write(b"HTTP/1.1 200 OK\r\n")
            for k, v in res.getheaders():
                # do not allow a persistent connection
                if k == "connection":
                    continue
                self.send_header(k, v)
            self.end_headers()
            with newpath.open(mode="wb") as f:
                while True:
                    buf = res.read(64 * 1024)  # same as shutil uses
                    if not buf:
                        break
                    self.wfile.write(buf)
                    f.write(buf)
                    time.sleep(64 / 1024)  # 1024 kB/s
            self.wfile.flush()
        except Exception as e:
            self.send_error(502)


def main():
    global oldcachedir, newcachedir, readonly
    if sys.argv[1] == "--readonly":
        readonly = True
        oldcachedir = pathlib.Path(sys.argv[2])
        newcachedir = pathlib.Path(sys.argv[3])
    else:
        oldcachedir = pathlib.Path(sys.argv[1])
        newcachedir = pathlib.Path(sys.argv[2])
    print(f"starting caching proxy for {newcachedir}", file=sys.stderr)
    httpd = http.server.ThreadingHTTPServer(
        server_address=("", 8080), RequestHandlerClass=ProxyRequestHandler
    )
    httpd.serve_forever()


if __name__ == "__main__":
    main()

View file
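A quick manual check of the proxy above, as a sketch (directory names and the unstable dist are placeholders; the cache paths follow from how do_GET strips the http://HOST/ prefix): the first request through the proxy prints "proxy download" and stores the file under ./newcache, and a repeated request for the same URL prints "proxy cached" and is answered without contacting the mirror. With --readonly, cached files are still served but nothing new is written.

    ./caching_proxy.py ./oldcache ./newcache &
    PROXYPID=$!
    sleep 1
    # first request: "proxy download", file lands in the new cache
    curl --proxy http://127.0.0.1:8080/ --silent --output /dev/null \
        http://deb.debian.org/debian/dists/unstable/InRelease
    ls -l ./newcache/debian/dists/unstable/InRelease
    # second request for the same URL: "proxy cached", no mirror contact
    curl --proxy http://127.0.0.1:8080/ --silent --output /dev/null \
        http://deb.debian.org/debian/dists/unstable/InRelease
    kill "$PROXYPID"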

@@ -39,7 +39,7 @@ all_formats = ["auto", "directory", "tar", "squashfs", "ext2", "null"]
mirror = os.getenv("mirror", "http://127.0.0.1/debian")
hostarch = subprocess.check_output(["dpkg", "--print-architecture"]).decode().strip()
release_path = f"./shared/cache/debian/dists/{default_dist}/Release"
release_path = f"./shared/cache/debian/dists/{default_dist}/InRelease"
if not os.path.exists(release_path):
print("path doesn't exist:", release_path, file=sys.stderr)
print("run ./make_mirror.sh first", file=sys.stderr)


@@ -22,8 +22,10 @@ if [ -e ./mmdebstrap ]; then
perlcritic --severity 4 --verbose 8 ./mmdebstrap
fi
[ -e ./tarfilter ] && black --check ./tarfilter
[ -e ./coverage.py ] && black --check ./coverage.py
for f in tarfilter coverage.py caching_proxy.py; do
[ -e "./$f" ] || continue
black --check "./$f"
done
shellcheck --exclude=SC2016 coverage.sh make_mirror.sh run_null.sh run_qemu.sh gpgvnoexpkeysig hooks/*/*.sh


@@ -92,76 +92,11 @@ deletecache() {
}
cleanup_newcachedir() {
kill "$PROXYPID" || :
echo "running cleanup_newcachedir"
deletecache "$newcachedir"
}
get_oldaptnames() {
if [ ! -e "$1/$2" ]; then
return
fi
xz -dc "$1/$2" \
| grep-dctrl --no-field-names --show-field=Package,Version,Architecture,Filename '' \
| paste -sd " \n" \
| while read -r name ver arch fname; do
if [ ! -e "$1/$fname" ]; then
continue
fi
# apt stores deb files with the colon encoded as %3a while
# mirrors do not contain the epoch at all #645895
case "$ver" in *:*) ver="${ver%%:*}%3a${ver#*:}";; esac
aptname="$rootdir/var/cache/apt/archives/${name}_${ver}_${arch}.deb"
# we have to cp and not mv because other
# distributions might still need this file
# we have to cp and not symlink because apt
# doesn't recognize symlinks
cp --link "$1/$fname" "$aptname"
echo "$aptname"
done
}
get_newaptnames() {
if [ ! -e "$1/$2" ]; then
return
fi
# skip empty files by trying to uncompress the first byte of the payload
if [ "$(xz -dc "$1/$2" | head -c1 | wc -c)" -eq 0 ]; then
return
fi
xz -dc "$1/$2" \
| grep-dctrl --no-field-names --show-field=Package,Version,Architecture,Filename,SHA256 '' \
| paste -sd " \n" \
| while read -r name ver arch fname hash; do
# sanity check for the hash because sometimes the
# archive switches the hash algorithm
if [ "${#hash}" -ne 64 ]; then
echo "expected hash length of 64 but got ${#hash} for: $hash" >&2
exit 1
fi
dir="${fname%/*}"
# apt stores deb files with the colon encoded as %3a while
# mirrors do not contain the epoch at all #645895
case "$ver" in *:*) ver="${ver%%:*}%3a${ver#*:}";; esac
aptname="$rootdir/var/cache/apt/archives/${name}_${ver}_${arch}.deb"
if [ -e "$aptname" ]; then
# make sure that we found the right file by checking its hash
echo "$hash $aptname" | sha256sum --check >&2
mkdir -p "$1/$dir"
# since we move hardlinks around, the same hardlink might've been
# moved already into the same place by another distribution.
# mv(1) refuses to copy A to B if both are hardlinks of each other.
if [ -e "$aptname" ] && [ -e "$1/$fname" ] && [ "$(stat -c "%d %i" "$aptname")" = "$(stat -c "%d %i" "$1/$fname")" ]; then
# both files are already the same so we just need to
# delete the source
rm "$aptname"
else
mv "$aptname" "$1/$fname"
fi
echo "$aptname"
fi
done
}
cleanupapt() {
echo "running cleanupapt" >&2
if [ ! -e "$rootdir" ]; then
@@ -175,12 +110,11 @@ cleanupapt() {
"$rootdir/var/lib/dpkg/status" \
"$rootdir/var/lib/dpkg/lock-frontend" \
"$rootdir/var/lib/dpkg/lock" \
"$rootdir/var/lib/apt/lists/lock" \
"$rootdir/etc/apt/apt.conf" \
"$rootdir/etc/apt/sources.list.d/"* \
"$rootdir/etc/apt/preferences.d/"* \
"$rootdir/etc/apt/sources.list" \
"$rootdir/oldaptnames" \
"$rootdir/newaptnames" \
"$rootdir/var/cache/apt/archives/lock"; do
if [ ! -e "$f" ]; then
echo "does not exist: $f" >&2
@@ -229,9 +163,7 @@ Apt::Get::Download-Only true;
Acquire::Languages "none";
Dir::Etc::Trusted "/etc/apt/trusted.gpg";
Dir::Etc::TrustedParts "/etc/apt/trusted.gpg.d";
Acquire::http::Dl-Limit "1000";
Acquire::https::Dl-Limit "1000";
Acquire::Retries "5";
Acquire::http::Proxy "http://127.0.0.1:8080/";
END
: > "$rootdir/var/lib/dpkg/status"
@@ -256,26 +188,6 @@ END
APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get update
# before downloading packages and before replacing the old Packages
# file, copy all old *.deb packages from the mirror to
# /var/cache/apt/archives so that apt will not re-download *.deb
# packages that we already have
{
get_oldaptnames "$oldmirrordir" "dists/$dist/main/binary-$nativearch/Packages.xz"
case "$dist" in oldstable|stable)
get_oldaptnames "$oldmirrordir" "dists/$dist-updates/main/binary-$nativearch/Packages.xz"
;;
esac
case "$dist" in
oldstable)
get_oldaptnames "$oldcachedir/debian-security" "dists/$dist/updates/main/binary-$nativearch/Packages.xz"
;;
stable)
get_oldaptnames "$oldcachedir/debian-security" "dists/$dist-security/main/binary-$nativearch/Packages.xz"
;;
esac
} | sort -u > "$rootdir/oldaptnames"
pkgs=$(APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get indextargets \
--format '$(FILENAME)' 'Created-By: Packages' "Architecture: $nativearch" \
| xargs --delimiter='\n' /usr/lib/apt/apt-helper cat-file \
@@ -296,73 +208,8 @@ END
# shellcheck disable=SC2086
APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get --yes install $pkgs
# to be able to also test gpg verification, we need to create a mirror
mkdir -p "$newmirrordir/dists/$dist/main/binary-$nativearch/"
curl --location "$mirror/dists/$dist/Release" > "$newmirrordir/dists/$dist/Release"
curl --location "$mirror/dists/$dist/Release.gpg" > "$newmirrordir/dists/$dist/Release.gpg"
curl --location "$mirror/dists/$dist/main/binary-$nativearch/Packages.xz" > "$newmirrordir/dists/$dist/main/binary-$nativearch/Packages.xz"
codename=$(awk '/^Codename: / { print $2; }' < "$newmirrordir/dists/$dist/Release")
[ -L "$newmirrordir/dists/$codename" ] || ln -s "$dist" "$newmirrordir/dists/$codename"
case "$dist" in oldstable|stable)
mkdir -p "$newmirrordir/dists/$dist-updates/main/binary-$nativearch/"
curl --location "$mirror/dists/$dist-updates/Release" > "$newmirrordir/dists/$dist-updates/Release"
curl --location "$mirror/dists/$dist-updates/Release.gpg" > "$newmirrordir/dists/$dist-updates/Release.gpg"
curl --location "$mirror/dists/$dist-updates/main/binary-$nativearch/Packages.xz" > "$newmirrordir/dists/$dist-updates/main/binary-$nativearch/Packages.xz"
[ -L "$newmirrordir/dists/$codename-updates" ] || ln -s "$dist-updates" "$newmirrordir/dists/$codename-updates"
;;
esac
case "$dist" in
oldstable)
mkdir -p "$newcachedir/debian-security/dists/$dist/updates/main/binary-$nativearch/"
curl --location "$security_mirror/dists/$dist/updates/Release" > "$newcachedir/debian-security/dists/$dist/updates/Release"
curl --location "$security_mirror/dists/$dist/updates/Release.gpg" > "$newcachedir/debian-security/dists/$dist/updates/Release.gpg"
curl --location "$security_mirror/dists/$dist/updates/main/binary-$nativearch/Packages.xz" > "$newcachedir/debian-security/dists/$dist/updates/main/binary-$nativearch/Packages.xz"
;;
stable)
mkdir -p "$newcachedir/debian-security/dists/$dist-security/main/binary-$nativearch/"
curl --location "$security_mirror/dists/$dist-security/Release" > "$newcachedir/debian-security/dists/$dist-security/Release"
curl --location "$security_mirror/dists/$dist-security/Release.gpg" > "$newcachedir/debian-security/dists/$dist-security/Release.gpg"
curl --location "$security_mirror/dists/$dist-security/main/binary-$nativearch/Packages.xz" > "$newcachedir/debian-security/dists/$dist-security/main/binary-$nativearch/Packages.xz"
[ -L "$newcachedir/debian-security/dists/$codename-security" ] || ln -s "$dist-security" "$newcachedir/debian-security/dists/$codename-security"
;;
esac
# the deb files downloaded by apt must be moved to their right locations in the
# pool directory
#
# Instead of parsing the Packages file, we could also attempt to move the deb
# files ourselves to the appropriate pool directories. But that approach
# requires re-creating the heuristic by which the directory is chosen, requires
# stripping the epoch from the filename and will break once mirrors change.
# This way, it doesn't matter where the mirror ends up storing the package.
{
get_newaptnames "$newmirrordir" "dists/$dist/main/binary-$nativearch/Packages.xz";
case "$dist" in oldstable|stable)
get_newaptnames "$newmirrordir" "dists/$dist-updates/main/binary-$nativearch/Packages.xz"
;;
esac
case "$dist" in
oldstable)
get_newaptnames "$newcachedir/debian-security" "dists/$dist/updates/main/binary-$nativearch/Packages.xz"
;;
stable)
get_newaptnames "$newcachedir/debian-security" "dists/$dist-security/main/binary-$nativearch/Packages.xz"
;;
esac
} | sort -u > "$rootdir/newaptnames"
rm "$rootdir/var/cache/apt/archives/lock"
rmdir "$rootdir/var/cache/apt/archives/partial"
# remove all packages that were in the old Packages file but not in the
# new one anymore
comm -23 "$rootdir/oldaptnames" "$rootdir/newaptnames" | xargs --delimiter="\n" --no-run-if-empty rm
# now the apt cache should be empty
if [ -n "$(ls -1qA "$rootdir/var/cache/apt/archives/")" ]; then
echo "$rootdir/var/cache/apt/archives not empty:"
ls -la "$rootdir/var/cache/apt/archives/"
exit 1
fi
APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get --option Dir::Etc::SourceList=/dev/null update
APT_CONFIG="$rootdir/etc/apt/apt.conf" apt-get clean
@@ -425,8 +272,8 @@ components=main
: "${CMD:=./mmdebstrap}"
: "${USE_HOST_APT_CONFIG:=no}"
if [ -e "$oldmirrordir/dists/$DEFAULT_DIST/Release" ]; then
http_code=$(curl --output /dev/null --silent --location --head --time-cond "$oldmirrordir/dists/$DEFAULT_DIST/Release" --write-out '%{http_code}' "$mirror/dists/$DEFAULT_DIST/Release")
if [ -e "$oldmirrordir/dists/$DEFAULT_DIST/InRelease" ]; then
http_code=$(curl --output /dev/null --silent --location --head --time-cond "$oldmirrordir/dists/$DEFAULT_DIST/InRelease" --write-out '%{http_code}' "$mirror/dists/$DEFAULT_DIST/InRelease")
case "$http_code" in
200) ;; # need update
304) echo up-to-date; exit 0;;
@@ -434,6 +281,19 @@ if [ -e "$oldmirrordir/dists/$DEFAULT_DIST/Release" ]; then
esac
fi
./caching_proxy.py "$oldcachedir" "$newcachedir" &
PROXYPID=$!
for i in $(seq 10); do
curl --proxy "http://127.0.0.1:8080/" --silent -o /dev/null "http://deb.debian.org/debian/dists/$DEFAULT_DIST/InRelease" && break
sleep 1
done
if [ ! -s "$newmirrordir/dists/$DEFAULT_DIST/InRelease" ]; then
echo "failed to start proxy" >&2
kill $PROXYPID
exit 1
fi
trap "cleanup_newcachedir" EXIT INT TERM
mkdir -p "$newcachedir"
@@ -447,8 +307,12 @@ elif [ "$HOSTARCH" = arm64 ]; then
arches="$arches amd64 armhf"
fi
for nativearch in $arches; do
for dist in oldstable stable testing unstable; do
# we need the split_inline_sig() function
# shellcheck disable=SC1091
. /usr/share/debootstrap/functions
for dist in oldstable stable testing unstable; do
for nativearch in $arches; do
# non-host architectures are only downloaded for $DEFAULT_DIST
if [ "$nativearch" != "$HOSTARCH" ] && [ "$DEFAULT_DIST" != "$dist" ]; then
continue
@@ -483,8 +347,20 @@ END
;;
esac
done
codename=$(awk '/^Codename: / { print $2; }' < "$newmirrordir/dists/$dist/InRelease")
ln -s "$dist" "$newmirrordir/dists/$codename"
# split the InRelease file into Release and Release.gpg not because apt
# or debootstrap need it that way but because grep-dctrl does
split_inline_sig \
"$newmirrordir/dists/$dist/InRelease" \
"$newmirrordir/dists/$dist/Release" \
"$newmirrordir/dists/$dist/Release.gpg"
touch --reference="$newmirrordir/dists/$dist/InRelease" "$newmirrordir/dists/$dist/Release" "$newmirrordir/dists/$dist/Release.gpg"
done
kill $PROXYPID
# Create some symlinks so that we can trick apt into accepting multiple apt
# lines that point to the same repository but look different. This is to
# avoid the warning:
@@ -530,6 +406,25 @@ if [ "$HAVE_QEMU" = "yes" ]; then
;;
esac
# we use the caching proxy again when building the qemu image
# - we can re-use the packages that were already downloaded earlier
# - we make sure that the qemu image uses the same Release file even
# if a mirror push happened between now and earlier
# - we avoid polluting the mirror with the additional packages by
# using --readonly
./caching_proxy.py --readonly "$oldcachedir" "$newcachedir" &
PROXYPID=$!
for i in $(seq 10); do
curl --proxy "http://127.0.0.1:8080/" --silent -o /dev/null "http://deb.debian.org/debian/dists/$DEFAULT_DIST/InRelease" && break
sleep 1
done
if [ ! -s "$newmirrordir/dists/$DEFAULT_DIST/InRelease" ]; then
echo "failed to start proxy" >&2
kill $PROXYPID
exit 1
fi
# We must not use any --dpkgopt here because any dpkg options still
# leak into the chroot with chrootless mode.
# We do not use our own package cache here because
@@ -571,11 +466,12 @@ if [ "$HAVE_QEMU" = "yes" ]; then
arches=$HOSTARCH
fi
$CMD --variant=apt --architectures="$arches" --include="$pkgs" \
--aptopt='Acquire::http::Dl-Limit "1000"' \
--aptopt='Acquire::https::Dl-Limit "1000"' \
--aptopt='Acquire::Retries "5"' \
--setup-hook='echo "Acquire::http::Proxy \"http://127.0.0.1:8080/\";" > "$1/etc/apt/apt.conf.d/00proxy"' \
--customize-hook='rm "$1/etc/apt/apt.conf.d/00proxy"' \
"$DEFAULT_DIST" - "$mirror" > "$tmpdir/debian-chroot.tar"
kill $PROXYPID
cat << END > "$tmpdir/mmdebstrap.service"
[Unit]
Description=mmdebstrap worker script


@@ -78,6 +78,7 @@ find /tmp/debian-debootstrap/run/ -mindepth 1 -maxdepth 1 ! -name lock -print0 |
# debootstrap doesn't clean apt
rm /tmp/debian-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_unstable_main_binary-{{ HOSTARCH }}_Packages \
/tmp/debian-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_unstable_InRelease \
/tmp/debian-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_unstable_Release \
/tmp/debian-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_unstable_Release.gpg


@@ -96,6 +96,7 @@ fi
find /tmp/debian-{{ DIST }}-debootstrap/run/ -mindepth 1 -maxdepth 1 ! -name lock -print0 | xargs --no-run-if-empty -0 rm -r
# debootstrap doesn't clean apt
rm /tmp/debian-{{ DIST }}-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_{{ DIST }}_main_binary-{{ HOSTARCH }}_Packages \
/tmp/debian-{{ DIST }}-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_{{ DIST }}_InRelease \
/tmp/debian-{{ DIST }}-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_{{ DIST }}_Release \
/tmp/debian-{{ DIST }}-debootstrap/var/lib/apt/lists/127.0.0.1_debian_dists_{{ DIST }}_Release.gpg


@@ -4,9 +4,9 @@
set -eu
export LC_ALL=C.UTF-8
trap "rm -f Release; rm -rf /tmp/debian-chroot.tar /tmp/expected" EXIT INT TERM
/usr/lib/apt/apt-helper download-file "{{ MIRROR }}/dists/{{ DIST }}/Release" Release
codename=$(awk '/^Codename: / { print $2; }' Release)
trap "rm -f InRelease; rm -rf /tmp/debian-chroot.tar /tmp/expected" EXIT INT TERM
/usr/lib/apt/apt-helper download-file "{{ MIRROR }}/dists/{{ DIST }}/InRelease" InRelease
codename=$(awk '/^Codename: / { print $2; }' InRelease)
{{ CMD }} --mode={{ MODE }} --variant=apt "$codename" /tmp/debian-chroot.tar {{ MIRROR }}
echo "deb {{ MIRROR }} $codename main" > /tmp/expected
tar --to-stdout --extract --file /tmp/debian-chroot.tar ./etc/apt/sources.list \