Optimize mmtarfilter to handle many path exclusions

mmtarfilter uses fnmatch to handle path exclusions and inclusions.
Python's fnmatch handles shell patterns by translating them to regular
expressions, with a 256-entry LRU cache. With more than 256 path
exclusions or inclusions, the patterns no longer all fit in this cache,
so every invocation of fnmatch on every file in every package
re-translates and re-compiles a regular expression, resulting in much
worse performance.

Instead, translate and compile each shell pattern to a regular
expression once, when the option is parsed. For an mmdebstrap invocation
with around 500 path filters, this speeds up mmdebstrap by more than a
minute.
This commit is contained in:
Josh Triplett 2020-12-31 12:49:16 -08:00 committed by Johannes 'josch' Schauer
parent 9484107392
commit 5a7dbc10c7
Signed by: josch
GPG key ID: F2CBA5C78FBD83E1

View file

@@ -21,14 +21,15 @@
import tarfile
import sys
import argparse
from fnmatch import fnmatch
import fnmatch
import re
class FilterAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
items = getattr(namespace, "filter", [])
items.append((self.dest, values))
regex = re.compile(fnmatch.translate(values))
items.append((self.dest, regex))
setattr(namespace, "filter", items)
@@ -64,17 +65,17 @@ dpkg(1) for information on how these two options work in detail.
skip = False
if not args.filter:
return False
for (t, f) in args.filter:
if fnmatch(member.name[1:], f):
for (t, r) in args.filter:
if r.match(member.name[1:]) is not None:
if t == "path_include":
skip = False
else:
skip = True
if skip and (member.isdir() or member.issym()):
for (t, f) in args.filter:
for (t, r) in args.filter:
if t != "path_include":
continue
prefix = re.sub(r"^([^*?[\\]*).*", r"\1", f)
prefix = re.sub(r"^([^*?[\\]*).*", r"\1", r.pattern)
prefix = prefix.rstrip("/")
if member.name[1:].startswith(prefix):
if member.name == "./usr/share/doc/doc-debian":