forked from josch/mmdebstrap
043ab3bbf0
According to Debian bug #978742, mmtarfilter performs slowly when given many path exclusions. Execution can be sped up if each regular expression is compiled only once instead of on every iteration of the hot loop. Signed-off-by: Benjamin Drung <benjamin.drung@cloud.ionos.com>
103 lines
3.5 KiB
Python
Executable file
103 lines
3.5 KiB
Python
Executable file
#!/usr/bin/env python3
#
# This script is in the public domain
#
# This script accepts a tarball on standard input and filters it according to
# the same rules used by dpkg --path-exclude and --path-include, using command
# line options of the same name. The result is then printed on standard output.
#
# A tool like this should be written in C but libarchive has issues:
# https://github.com/libarchive/libarchive/issues/587
# https://github.com/libarchive/libarchive/pull/1288/ (needs 3.4.1)
# Should these issues get fixed, then a good template is tarfilter.c in the
# examples directory of libarchive.
#
# We are not using Perl either, because Archive::Tar slurps the whole tarball
# into memory.
#
# We could also use Go but meh...
# https://stackoverflow.com/a/59542307/784669
|
|
|
|
import tarfile
|
|
import sys
|
|
import argparse
|
|
import fnmatch
|
|
import re
|
|
|
|
|
|
# Matches the leading part of a shell pattern that contains none of the
# characters dpkg treats as the start of a wildcard: "*", "?", "[" and
# backslash.
PREFIX_PROG = re.compile(r"[^*?[\\]*")


def literal_prefix(pattern):
    """Return the wildcard-free start of the shell *pattern*.

    This is everything before the first "*", "?", "[" or backslash, with
    any trailing slashes removed. A pattern that starts with a wildcard
    yields the empty string.
    """
    # The expression can match the empty string, so match() never fails.
    return PREFIX_PROG.match(pattern).group(0).rstrip("/")


class FilterAction(argparse.Action):
    """Collect --path-exclude/--path-include options in command line order.

    Each occurrence appends a ``(dest, regex, prefix)`` tuple to the list
    stored as ``namespace.filter``:

    * ``dest`` is the option's destination name ("path_exclude" or
      "path_include" for the options registered in main()),
    * ``regex`` is the shell pattern translated with fnmatch.translate()
      and compiled once, so it is not recompiled for every tar member,
    * ``prefix`` is literal_prefix() of the shell pattern as it was given
      on the command line.

    The ``filter`` attribute only exists once at least one of the options
    was used.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        items = getattr(namespace, "filter", [])
        regex = re.compile(fnmatch.translate(values))
        # The prefix has to be taken from the shell pattern itself: the
        # translated regular expression (regex.pattern) is wrapped in
        # regex syntax and no longer starts with the literal path.
        items.append((self.dest, regex, literal_prefix(values)))
        setattr(namespace, "filter", items)


def filter_should_skip(member, filters):
    """Return True if the tar *member* has to be dropped from the output.

    Same logic as in dpkg/src/filters.c/filter_should_skip().

    *member* is a tarfile.TarInfo and *filters* is the list of
    ``(dest, regex, prefix)`` tuples built by FilterAction.
    """
    # Patterns are compared against the member name without its first
    # character; names are expected to look like "./usr/share/doc".
    path = member.name[1:]

    skip = False
    # Last match wins.
    for kind, regex, _ in filters:
        if regex.match(path) is not None:
            skip = kind != "path_include"

    # Keep excluded directories (and symlinks) if an include pattern could
    # bring back something below them. This may keep more directories than
    # strictly necessary.
    if skip and (member.isdir() or member.issym()):
        for kind, _, prefix in filters:
            if kind == "path_include" and path.startswith(prefix):
                return False

    return skip


def main():
    """Filter the tarball on standard input and write it to standard output.

    Without any --path-exclude/--path-include option the input is copied
    through unchanged and the process exits. Otherwise every member for
    which filter_should_skip() is true is left out of the output tarball.
    """
    parser = argparse.ArgumentParser(
        description="""\
Filters a tarball on standard input by the same rules as the dpkg --path-exclude
and --path-include options and writes resulting tarball to standard output. See
dpkg(1) for information on how these two options work in detail.
"""
    )
    parser.add_argument(
        "--path-exclude",
        metavar="pattern",
        action=FilterAction,
        help="Exclude path matching the given shell pattern.",
    )
    parser.add_argument(
        "--path-include",
        metavar="pattern",
        action=FilterAction,
        help="Re-include a pattern after a previous exclusion.",
    )
    args = parser.parse_args()
    if not hasattr(args, "filter"):
        # Nothing to filter: pass the stream through without parsing it.
        from shutil import copyfileobj

        copyfileobj(sys.stdin.buffer, sys.stdout.buffer)
        # sys.exit() instead of the site module's exit() helper, which is
        # meant for interactive use and is absent under "python -S".
        sys.exit()

    # starting with Python 3.8, the default format became PAX_FORMAT, so this
    # is only for compatibility with older versions of Python 3
    with tarfile.open(fileobj=sys.stdin.buffer, mode="r|*") as in_tar, tarfile.open(
        fileobj=sys.stdout.buffer, mode="w|", format=tarfile.PAX_FORMAT
    ) as out_tar:
        for member in in_tar:
            if filter_should_skip(member, args.filter):
                continue
            if member.isfile():
                # Regular files carry data that has to be copied as well.
                with in_tar.extractfile(member) as file:
                    out_tar.addfile(member, file)
            else:
                out_tar.addfile(member)
|
|
|
|
|
|
# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    # main() returns None, so this exits with status 0 on success.
    raise SystemExit(main())
|