Optimize mmtarfilter to handle many path exclusions

mmtarfilter uses fnmatch to handle path exclusions and inclusions. Python's fnmatch handles shell patterns by translating them to regular expressions, which it keeps in a 256-entry LRU cache. With more than 256 path exclusions or inclusions, the patterns no longer all fit in this cache, so every invocation of fnmatch on every file in every package re-translates and re-compiles a regular expression, resulting in much worse performance. Instead, translate all the shell patterns to regular expressions once, when the command-line arguments are parsed. For an mmdebstrap invocation with around 500 path filters, this speeds up mmdebstrap by more than a minute.
2020-12-31 12:49:16 -08:00 · 2020-12-31 12:49:16 -08:00 · 5a7dbc10c7
commit 5a7dbc10c7
parent 9484107392
1 changed files with 7 additions and 6 deletions
--- a/13
+++ b/13
@@ -21,14 +21,15 @@
 import tarfile
 import sys
 import argparse
-from fnmatch import fnmatch
+import fnmatch
 import re
 class FilterAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        items = getattr(namespace, "filter", [])
-        items.append((self.dest, values))
+        regex = re.compile(fnmatch.translate(values))
+        items.append((self.dest, regex))
        setattr(namespace, "filter", items)
@@ -64,17 +65,17 @@ dpkg(1) for information on how these two options work in detail.
        skip = False
        if not args.filter:
            return False
-        for (t, f) in args.filter:
+        for (t, r) in args.filter:
-            if fnmatch(member.name[1:], f):
+            if r.match(member.name[1:]) is not None:
                if t == "path_include":
                    skip = False
                else:
                    skip = True
        if skip and (member.isdir() or member.issym()):
-            for (t, f) in args.filter:
+            for (t, r) in args.filter:
                if t != "path_include":
                    continue
-                prefix = re.sub(r"^([^*?[\\]*).*", r"\1", f)
+                prefix = re.sub(r"^([^*?[\\]*).*", r"\1", r.pattern)
                prefix = prefix.rstrip("/")
                if member.name[1:].startswith(prefix):
                    if member.name == "./usr/share/doc/doc-debian":