Optimize mmtarfilter to handle many path exclusions

mmtarfilter uses fnmatch to handle path exclusions and inclusions.
Python's fnmatch handles shell patterns by translating them to regular
expressions, with a 256-entry LRU cache. With more than 256 path
exclusions or inclusions, this LRU cache becomes ineffective: the
patterns are checked in order for every file, so each cached entry is
evicted before it can be reused, and every invocation of fnmatch on
every file in every package will re-translate and re-compile a regular
expression, resulting in much worse performance.

Translate all the shell patterns to regular expressions once. For an
mmdebstrap invocation with around 500 path filters, this speeds up
mmdebstrap by more than a minute.
This commit is contained in:
Josh Triplett 2020-12-31 12:49:16 -08:00 committed by Johannes 'josch' Schauer
parent 9484107392
commit 5a7dbc10c7
Signed by: josch
GPG key ID: F2CBA5C78FBD83E1

View file

@@ -21,14 +21,15 @@
import tarfile import tarfile
import sys import sys
import argparse import argparse
from fnmatch import fnmatch import fnmatch
import re import re
class FilterAction(argparse.Action): class FilterAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None): def __call__(self, parser, namespace, values, option_string=None):
items = getattr(namespace, "filter", []) items = getattr(namespace, "filter", [])
items.append((self.dest, values)) regex = re.compile(fnmatch.translate(values))
items.append((self.dest, regex))
setattr(namespace, "filter", items) setattr(namespace, "filter", items)
@@ -64,17 +65,17 @@ dpkg(1) for information on how these two options work in detail.
skip = False skip = False
if not args.filter: if not args.filter:
return False return False
for (t, f) in args.filter: for (t, r) in args.filter:
if fnmatch(member.name[1:], f): if r.match(member.name[1:]) is not None:
if t == "path_include": if t == "path_include":
skip = False skip = False
else: else:
skip = True skip = True
if skip and (member.isdir() or member.issym()): if skip and (member.isdir() or member.issym()):
for (t, f) in args.filter: for (t, r) in args.filter:
if t != "path_include": if t != "path_include":
continue continue
prefix = re.sub(r"^([^*?[\\]*).*", r"\1", f) prefix = re.sub(r"^([^*?[\\]*).*", r"\1", r.pattern)
prefix = prefix.rstrip("/") prefix = prefix.rstrip("/")
if member.name[1:].startswith(prefix): if member.name[1:].startswith(prefix):
if member.name == "./usr/share/doc/doc-debian": if member.name == "./usr/share/doc/doc-debian":