forked from josch/mmdebstrap
5a7dbc10c7
mmtarfilter uses fnmatch to handle path exclusions and inclusions. Python's fnmatch handles shell patterns by translating them to regular expressions, with a 256-entry LRU cache. With more than 256 path exclusions or inclusions, this LRU cache no longer works, and every invocation of fnmatch on every file in every package will re-translate and re-compile a regular expression, resulting in much worse performance. Translate all the shell patterns to regular expressions once. For an mmdebstrap invocation with around 500 path filters, this speeds up mmdebstrap by more than a minute.
102 lines
3.5 KiB
Python
Executable file
102 lines
3.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
#
|
|
# This script is in the public domain
|
|
#
|
|
# This script accepts a tarball on standard input and filters it according to
|
|
# the same rules used by dpkg --path-exclude and --path-include, using command
|
|
# line options of the same name. The result is then printed on standard output.
|
|
#
|
|
# A tool like this should be written in C but libarchive has issues:
|
|
# https://github.com/libarchive/libarchive/issues/587
|
|
# https://github.com/libarchive/libarchive/pull/1288/ (needs 3.4.1)
|
|
# Should these issues get fixed, then a good template is tarfilter.c in the
|
|
# examples directory of libarchive.
|
|
#
|
|
# We are not using Perl either, because Archive::Tar slurps the whole tarball
|
|
# into memory.
|
|
#
|
|
# We could also use Go but meh...
|
|
# https://stackoverflow.com/a/59542307/784669
|
|
|
|
import tarfile
|
|
import sys
|
|
import argparse
|
|
import fnmatch
|
|
import re
|
|
|
|
|
|
class FilterAction(argparse.Action):
|
|
def __call__(self, parser, namespace, values, option_string=None):
|
|
items = getattr(namespace, "filter", [])
|
|
regex = re.compile(fnmatch.translate(values))
|
|
items.append((self.dest, regex))
|
|
setattr(namespace, "filter", items)
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="""\
|
|
Filters a tarball on standard input by the same rules as the dpkg --path-exclude
|
|
and --path-include options and writes resulting tarball to standard output. See
|
|
dpkg(1) for information on how these two options work in detail.
|
|
"""
|
|
)
|
|
parser.add_argument(
|
|
"--path-exclude",
|
|
metavar="pattern",
|
|
action=FilterAction,
|
|
help="Exclude path matching the given shell pattern.",
|
|
)
|
|
parser.add_argument(
|
|
"--path-include",
|
|
metavar="pattern",
|
|
action=FilterAction,
|
|
help="Re-include a pattern after a previous exclusion.",
|
|
)
|
|
args = parser.parse_args()
|
|
if not hasattr(args, "filter"):
|
|
from shutil import copyfileobj
|
|
|
|
copyfileobj(sys.stdin.buffer, sys.stdout.buffer)
|
|
exit()
|
|
|
|
# same logic as in dpkg/src/filters.c/filter_should_skip()
|
|
def filter_should_skip(member):
|
|
skip = False
|
|
if not args.filter:
|
|
return False
|
|
for (t, r) in args.filter:
|
|
if r.match(member.name[1:]) is not None:
|
|
if t == "path_include":
|
|
skip = False
|
|
else:
|
|
skip = True
|
|
if skip and (member.isdir() or member.issym()):
|
|
for (t, r) in args.filter:
|
|
if t != "path_include":
|
|
continue
|
|
prefix = re.sub(r"^([^*?[\\]*).*", r"\1", r.pattern)
|
|
prefix = prefix.rstrip("/")
|
|
if member.name[1:].startswith(prefix):
|
|
if member.name == "./usr/share/doc/doc-debian":
|
|
print("foo", prefix, "bar", file=sys.stderr)
|
|
return False
|
|
return skip
|
|
|
|
# starting with Python 3.8, the default format became PAX_FORMAT, so this
|
|
# is only for compatibility with older versions of Python 3
|
|
with tarfile.open(fileobj=sys.stdin.buffer, mode="r|*") as in_tar, tarfile.open(
|
|
fileobj=sys.stdout.buffer, mode="w|", format=tarfile.PAX_FORMAT
|
|
) as out_tar:
|
|
for member in in_tar:
|
|
if filter_should_skip(member):
|
|
continue
|
|
if member.isfile():
|
|
with in_tar.extractfile(member) as file:
|
|
out_tar.addfile(member, file)
|
|
else:
|
|
out_tar.addfile(member)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|