Optimize mmtarfilter to handle many path exclusions
mmtarfilter uses fnmatch to handle path exclusions and inclusions. Python's fnmatch handles shell patterns by translating them to regular expressions, with a 256-entry LRU cache. With more than 256 path exclusions or inclusions, the patterns no longer all fit in this LRU cache, so every invocation of fnmatch on every file in every package re-translates and re-compiles a regular expression, resulting in much worse performance. Instead, translate and compile each shell pattern to a regular expression once, when the option is parsed. For an mmdebstrap invocation with around 500 path filters, this speeds up mmdebstrap by more than a minute.
This commit is contained in:
parent
9484107392
commit
5a7dbc10c7
1 changed file with 7 additions and 6 deletions
13
tarfilter
13
tarfilter
|
@ -21,14 +21,15 @@
|
||||||
import tarfile
|
import tarfile
|
||||||
import sys
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
from fnmatch import fnmatch
|
import fnmatch
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
class FilterAction(argparse.Action):
|
class FilterAction(argparse.Action):
|
||||||
def __call__(self, parser, namespace, values, option_string=None):
|
def __call__(self, parser, namespace, values, option_string=None):
|
||||||
items = getattr(namespace, "filter", [])
|
items = getattr(namespace, "filter", [])
|
||||||
items.append((self.dest, values))
|
regex = re.compile(fnmatch.translate(values))
|
||||||
|
items.append((self.dest, regex))
|
||||||
setattr(namespace, "filter", items)
|
setattr(namespace, "filter", items)
|
||||||
|
|
||||||
|
|
||||||
|
@ -64,17 +65,17 @@ dpkg(1) for information on how these two options work in detail.
|
||||||
skip = False
|
skip = False
|
||||||
if not args.filter:
|
if not args.filter:
|
||||||
return False
|
return False
|
||||||
for (t, f) in args.filter:
|
for (t, r) in args.filter:
|
||||||
if fnmatch(member.name[1:], f):
|
if r.match(member.name[1:]) is not None:
|
||||||
if t == "path_include":
|
if t == "path_include":
|
||||||
skip = False
|
skip = False
|
||||||
else:
|
else:
|
||||||
skip = True
|
skip = True
|
||||||
if skip and (member.isdir() or member.issym()):
|
if skip and (member.isdir() or member.issym()):
|
||||||
for (t, f) in args.filter:
|
for (t, r) in args.filter:
|
||||||
if t != "path_include":
|
if t != "path_include":
|
||||||
continue
|
continue
|
||||||
prefix = re.sub(r"^([^*?[\\]*).*", r"\1", f)
|
prefix = re.sub(r"^([^*?[\\]*).*", r"\1", r.pattern)
|
||||||
prefix = prefix.rstrip("/")
|
prefix = prefix.rstrip("/")
|
||||||
if member.name[1:].startswith(prefix):
|
if member.name[1:].startswith(prefix):
|
||||||
if member.name == "./usr/share/doc/doc-debian":
|
if member.name == "./usr/share/doc/doc-debian":
|
||||||
|
|
Loading…
Reference in a new issue