#!/usr/bin/env python3
#
# This script is in the public domain
#
# Author: Johannes Schauer Marin Rodrigues <josch@mister-muffin.de>
#
# This script accepts a tarball on standard input and filters it according to
# the same rules used by dpkg --path-exclude and --path-include, using command
# line options of the same name. The result is then printed on standard output.
#
# A tool like this should be written in C but libarchive has issues:
# https://github.com/libarchive/libarchive/issues/587
# https://github.com/libarchive/libarchive/pull/1288/ (needs 3.4.1)
# Should these issues get fixed, then a good template is tarfilter.c in the
# examples directory of libarchive.
#
# We are not using Perl either, because Archive::Tar slurps the whole tarball
# into memory.
#
# We could also use Go but meh...
# https://stackoverflow.com/a/59542307/784669
|
|
|
|
|
|
|
|
import tarfile
|
|
|
|
import sys
|
|
|
|
import argparse
|
2020-12-31 20:49:16 +00:00
|
|
|
import fnmatch
|
2020-08-25 11:02:33 +00:00
|
|
|
import re
|
|
|
|
|
|
|
|
|
2021-05-07 07:39:40 +00:00
|
|
|
class _GlobMatcher:
    """A compiled shell glob that still remembers the glob it was built from.

    ``match()`` behaves like ``re.Pattern.match()`` on the regular expression
    produced by ``fnmatch.translate()``.  ``pattern`` is the glob exactly as it
    was given on the command line, *not* the translated regular expression.
    """

    __slots__ = ("pattern", "_regex")

    def __init__(self, glob):
        self.pattern = glob
        self._regex = re.compile(fnmatch.translate(glob))

    def match(self, string):
        """Return a match object if the glob matches *string*, else None."""
        return self._regex.match(string)


class PathFilterAction(argparse.Action):
    """Collect --path-exclude/--path-include options in command line order.

    Every occurrence appends a ``(dest, matcher)`` tuple to the list stored in
    ``namespace.pathfilter``, where ``dest`` is ``"path_exclude"`` or
    ``"path_include"``.  The attribute only exists once at least one of the
    options was seen.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        items = getattr(namespace, "pathfilter", [])
        # main() derives the literal prefix in front of the first wildcard
        # from the ``pattern`` attribute of the matcher.  That only works on
        # the glob itself: the regex from fnmatch.translate() is wrapped (as
        # "(?s:...)" on current Python versions) and has its literal
        # characters escaped, so it must not be exposed as ``pattern``.
        items.append((self.dest, _GlobMatcher(values)))
        setattr(namespace, "pathfilter", items)
|
|
|
|
|
|
|
|
|
|
|
|
class PaxFilterAction(argparse.Action):
    """Collect --pax-exclude/--pax-include options in command line order.

    Each use of the option adds a ``(dest, compiled_regex)`` pair to the list
    kept in ``namespace.paxfilter``; the regex is the fnmatch translation of
    the given globbing pattern.  The attribute is only created on first use.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # compile first so that nothing is stored if the pattern is rejected
        rule = (self.dest, re.compile(fnmatch.translate(values)))
        if hasattr(namespace, "paxfilter"):
            namespace.paxfilter.append(rule)
        else:
            namespace.paxfilter = [rule]
|
2020-08-25 11:02:33 +00:00
|
|
|
|
|
|
|
|
2022-08-31 03:52:28 +00:00
|
|
|
class TransformAction(argparse.Action):
    """Parse a sed-style ``s<d>regex<d>replacement<d>[i]`` expression.

    Every occurrence of the option appends a ``(compiled_regex, replacement)``
    tuple to the list stored in ``namespace.trans``; the attribute only exists
    once the option was used.  ``<d>`` is whatever character follows the
    leading ``s``.  The only supported trailing flag is ``i`` (ignore case).
    The regex is compiled with the Python ``re`` module.

    Raises:
        ValueError: if the expression does not start with ``s``, is too
            short, has an empty regex or has the wrong number of terms.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        items = getattr(namespace, "trans", [])
        # This function mimics what src/transform.c from tar does
        if not values.startswith("s"):
            raise ValueError("regex must start with an 's'")
        if len(values) <= 4:
            # minimum regex: s/x//
            raise ValueError("invalid regex (too short)")
        d = values[1]
        if values.startswith(f"s{d}{d}"):
            raise ValueError("empty regex")
        values = values.removeprefix(f"s{d}")
        flags = 0
        if values.endswith(f"{d}i"):
            # trailing flags
            flags = re.IGNORECASE
            values = values.removesuffix(f"{d}i")
        # The delimiter ends up inside a character class, so it has to be
        # escaped or delimiters such as "]" or "\" change what the class means.
        delim = re.escape(d)
        # This regex only finds non-empty tokens.
        # Finding empty tokens would require a variable length look-behind
        # or \K in order to find escaped delimiters which is not supported by
        # the python re module.
        tokens = re.findall(rf"(?:\\[\\{delim}]|[^{delim}])+", values)
        if not tokens:
            raise ValueError("invalid regex: not enough terms")
        if len(tokens) > 2:
            raise ValueError("invalid regex: too many terms: %s" % tokens)
        repl = tokens[1] if len(tokens) == 2 else ""
        if d != "\\":
            # An escaped delimiter in the replacement stands for the delimiter
            # itself.  Left alone, re.sub() would copy the unknown escape
            # (backslash included) into the result.  Escape pairs are consumed
            # left to right, so "\\" followed by the delimiter is not touched,
            # and all other escapes (\1, \\, \n, ...) are kept as they are.
            repl = re.sub(
                r"\\(.)",
                lambda m: d if m[1] == d else m[0],
                repl,
                flags=re.DOTALL,
            )
        items.append((re.compile(tokens[0], flags), repl))
        setattr(namespace, "trans", items)
|
|
|
|
|
|
|
|
|
2020-08-25 11:02:33 +00:00
|
|
|
def main():
    """Filter the tarball on standard input and write it to standard output.

    Command line options are read from ``sys.argv``.  If no option that changes
    the archive is given, standard input is copied to standard output
    byte for byte.  Otherwise the input is read as a stream (any compression
    supported by ``tarfile``'s ``r|*`` mode) and re-written as an uncompressed
    pax format tar stream, applying to each member, in this order: the path
    filter, --strip-components, the pax header filter, --idshift and
    --transform.

    Exits with status 1 if --idshift would make a uid or gid negative.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""\
Filters a tarball on standard input by the same rules as the dpkg --path-exclude
and --path-include options and writes resulting tarball to standard output. See
dpkg(1) for information on how these two options work in detail. To reuse the
exact same semantics as used by dpkg, paths must be given as /path and not as
./path even though they might be stored as such in the tarball.

Secondly, filter out unwanted pax extended headers. This is useful in cases
where a tool only accepts certain xattr prefixes. For example tar2sqfs only
supports SCHILY.xattr.user.*, SCHILY.xattr.trusted.* and
SCHILY.xattr.security.* but not SCHILY.xattr.system.posix_acl_default.*.

Both types of options use Unix shell-style wildcards:

* matches everything
? matches any single character
[seq] matches any character in seq
[!seq] matches any character not in seq

Thirdly, transform the path of tar members using a sed expression just as with
GNU tar --transform.

Fourthly, strip leading directory components off of tar members. Just as with
GNU tar --strip-components, tar members that have less or equal components in
their path are not passed through.

Lastly, shift user id and group id of each entry by the value given by the
--idshift argument. The resulting uid or gid must not be negative.
""",
    )
    parser.add_argument(
        "--path-exclude",
        metavar="pattern",
        action=PathFilterAction,
        help="Exclude path matching the given shell pattern. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--path-include",
        metavar="pattern",
        action=PathFilterAction,
        help="Re-include a pattern after a previous exclusion. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--pax-exclude",
        metavar="pattern",
        action=PaxFilterAction,
        help="Exclude pax header matching the given globbing pattern. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--pax-include",
        metavar="pattern",
        action=PaxFilterAction,
        help="Re-include a pax header after a previous exclusion. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--transform",
        "--xform",
        metavar="EXPRESSION",
        action=TransformAction,
        help="Use sed replace EXPRESSION to transform file names. "
        "This option can be specified multiple times.",
    )
    parser.add_argument(
        "--strip-components",
        metavar="NUMBER",
        type=int,
        help="Strip NUMBER leading components from file names",
    )
    parser.add_argument(
        "--idshift",
        metavar="NUM",
        type=int,
        help="Integer value by which to shift the uid and gid of each entry",
    )
    args = parser.parse_args()
    # The custom actions only create pathfilter, paxfilter and trans when the
    # respective option was used, so their mere existence is meaningful.
    # strip_components and idshift on the other hand are always present
    # (argparse stores None when the option is absent) and thus have to be
    # tested by value. A value of 0 is a no-op for both of them.
    if (
        not hasattr(args, "pathfilter")
        and not hasattr(args, "paxfilter")
        and not hasattr(args, "trans")
        and not args.strip_components
        and not args.idshift
    ):
        from shutil import copyfileobj

        # nothing to do: pass the input through untouched
        copyfileobj(sys.stdin.buffer, sys.stdout.buffer)
        sys.exit()

    path_rules = getattr(args, "pathfilter", [])
    pax_rules = getattr(args, "paxfilter", [])

    # same logic as in dpkg/src/filters.c/filter_should_skip()
    prefix_prog = re.compile(r"^([^*?[\\]*).*")
    # For every include rule: the part of its pattern in front of the first
    # wildcard character, without trailing slashes. This does not depend on
    # the tar member, so it is computed only once.
    include_prefixes = tuple(
        prefix_prog.sub(r"\1", r.pattern).rstrip("/")
        for t, r in path_rules
        if t == "path_include"
    )

    def path_filter_should_skip(member):
        """Return True if the path filter rules drop this tar member."""
        # Patterns are given as /path; dropping the first character turns a
        # member name stored as ./path into that form (assumes names are
        # stored with a leading "." -- other names lose their first character
        # as well).
        name = member.name[1:]
        skip = False
        # the last matching rule wins
        for t, r in path_rules:
            if r.match(name) is not None:
                skip = t != "path_include"
        if skip and (member.isdir() or member.issym()):
            # Directories and symlinks are kept if they start with the literal
            # prefix of any include rule. startswith() on an empty tuple is
            # False, so without include rules nothing is kept here.
            if name.startswith(include_prefixes):
                return False
        return skip

    def pax_filter_should_skip(header):
        """Return True if the pax filter rules drop this pax header key."""
        skip = False
        # the last matching rule wins
        for t, r in pax_rules:
            if r.match(header) is None:
                continue
            skip = t != "pax_include"
        return skip

    # starting with Python 3.8, the default format became PAX_FORMAT, so this
    # is only for compatibility with older versions of Python 3
    with tarfile.open(fileobj=sys.stdin.buffer, mode="r|*") as in_tar, tarfile.open(
        fileobj=sys.stdout.buffer, mode="w|", format=tarfile.PAX_FORMAT
    ) as out_tar:
        for member in in_tar:
            if path_filter_should_skip(member):
                continue
            if args.strip_components:
                comps = member.name.split("/")
                # just as with GNU tar, archive members with less or equal
                # number of components are not passed through at all
                if len(comps) <= args.strip_components:
                    continue
                member.name = "/".join(comps[args.strip_components :])
            member.pax_headers = {
                k: v
                for k, v in member.pax_headers.items()
                if not pax_filter_should_skip(k)
            }
            if args.idshift:
                if args.idshift < 0 and -args.idshift > member.uid:
                    print("uid cannot be negative", file=sys.stderr)
                    sys.exit(1)
                if args.idshift < 0 and -args.idshift > member.gid:
                    print("gid cannot be negative", file=sys.stderr)
                    sys.exit(1)
                member.uid += args.idshift
                member.gid += args.idshift
            if hasattr(args, "trans"):
                # expressions are applied in command line order; sub() without
                # a count replaces every match in the name
                for r, s in args.trans:
                    member.name = r.sub(s, member.name)
            if member.isfile():
                # only regular files carry data that has to be copied over
                with in_tar.extractfile(member) as file:
                    out_tar.addfile(member, file)
            else:
                out_tar.addfile(member)
|
|
|
|
|
|
|
|
|
|
|
|
# Only run the filter when this file is executed as a script.
if __name__ == "__main__":
    main()
|