From 93f65a49c9b1835235e90750af8e6f30b52c1592 Mon Sep 17 00:00:00 2001 From: Johannes 'josch' Schauer Date: Mon, 1 Jun 2020 02:10:07 +0200 Subject: [PATCH] add --pdfa argument to attach an icc profile for PDF/A-1b compliant output --- src/img2pdf.py | 112 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 11 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 352ad36..82db1d0 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -33,6 +33,7 @@ from io import BytesIO import logging import struct import platform +import hashlib have_pdfrw = True try: @@ -502,7 +503,7 @@ class MyPdfWriter: obj.identifier = newid self.objects.append(obj) - def tostream(self, info, stream, version="1.3"): + def tostream(self, info, stream, version="1.3", ident=None): xreftable = list() # justification of the random binary garbage in the header from @@ -559,10 +560,11 @@ class MyPdfWriter: for x in xreftable: stream.write(x) stream.write(b"trailer\n") - stream.write( - parse({b"/Size": len(xreftable), b"/Info": info, b"/Root": self.catalog}) - + b"\n" - ) + trailer = {b"/Size": len(xreftable), b"/Info": info, b"/Root": self.catalog} + if ident is not None: + md5 = hashlib.md5(ident).hexdigest().encode("ascii") + trailer[b"/ID"] = b"[<%s><%s>]" % (md5, md5) + stream.write(parse(trailer) + b"\n") stream.write(b"startxref\n") stream.write(("%d\n" % xrefoffset).encode()) stream.write(b"%%EOF\n") @@ -619,6 +621,7 @@ class pdfdoc(object): fit_window=False, center_window=False, fullscreen=False, + pdfa=None, ): if engine is None: if have_pikepdf: @@ -655,7 +658,7 @@ class pdfdoc(object): continue if engine != Engine.pikepdf: v = PdfString.encode(v) - self.writer.docinfo[getattr(PdfName,k)] = v + self.writer.docinfo[getattr(PdfName, k)] = v now = datetime.now() for k in ["CreationDate", "ModDate"]: @@ -665,6 +668,8 @@ class pdfdoc(object): if v is None: v = now v = ("D:" + datetime_to_pdfdate(v)).encode("ascii") + if engine == Engine.internal: + v = 
b"(" + v + b")" self.writer.docinfo[getattr(PdfName, k)] = v if keywords is not None: if engine == Engine.pikepdf: @@ -674,6 +679,38 @@ class pdfdoc(object): ",".join(keywords) ) + def datetime_to_xmpdate(dt): + return dt.strftime("%Y-%m-%dT%H:%M:%SZ") + + self.xmp = b""" + + + + + %s + %s + + + + + + +""" % ( + b" pdf:Producer='%s'" % producer.encode("ascii") + if producer is not None + else b"", + b"" + if creationdate is None and nodate + else b"%s" + % datetime_to_xmpdate(now if creationdate is None else creationdate).encode( + "ascii" + ), + b"" + if moddate is None and nodate + else b"%s" + % datetime_to_xmpdate(now if moddate is None else moddate).encode("ascii"), + ) + if engine != Engine.pikepdf: # this is done because pdfrw adds info, catalog and pages as the first # three objects in this order @@ -691,6 +728,7 @@ class pdfdoc(object): self.fullscreen = fullscreen self.engine = engine self.output_version = version + self.pdfa = pdfa def add_imagepage( self, @@ -722,7 +760,6 @@ class pdfdoc(object): elif self.engine == Engine.pdfrw: from pdfrw import PdfDict, PdfName, PdfObject, PdfString from pdfrw.py23_diffs import convert_load - elif self.engine == Engine.internal: PdfDict = MyPdfDict PdfName = MyPdfName @@ -906,12 +943,13 @@ class pdfdoc(object): PdfName = pikepdf.Name elif self.engine == Engine.pdfrw: from pdfrw import PdfDict, PdfName, PdfArray, PdfObject - + from pdfrw.py23_diffs import convert_load elif self.engine == Engine.internal: PdfDict = MyPdfDict PdfName = MyPdfName PdfObject = MyPdfObject PdfArray = MyPdfArray + convert_load = my_convert_load else: raise ValueError("unknown engine: %s" % self.engine) NullObject = None if self.engine == Engine.pikepdf else PdfObject("null") @@ -1040,17 +1078,57 @@ class pdfdoc(object): else: raise ValueError("unknown page layout: %s" % self.page_layout) + if self.pdfa is not None: + if self.engine == Engine.pikepdf: + metadata = self.writer.make_stream(self.xmp) + else: + metadata = 
PdfDict(stream=convert_load(self.xmp)) + metadata[PdfName.Subtype] = PdfName.XML + metadata[PdfName.Type] = PdfName.Metadata + with open(self.pdfa, "rb") as f: + icc = f.read() + intents = PdfDict() + if self.engine == Engine.pikepdf: + iccstream = self.writer.make_stream(icc) + iccstream.stream_dict.N = 3 + else: + iccstream = PdfDict(stream=convert_load(zlib.compress(icc))) + iccstream[PdfName.N] = 3 + iccstream[PdfName.Filter] = PdfName.FlateDecode + intents[PdfName.S] = PdfName.GTS_PDFA1 + intents[PdfName.Type] = PdfName.OutputIntent + intents[PdfName.OutputConditionIdentifier] = ( + b"sRGB" if self.engine == Engine.pikepdf else b"(sRGB)" + ) + intents[PdfName.DestOutputProfile] = iccstream + catalog[PdfName.OutputIntents] = PdfArray([intents]) + catalog[PdfName.Metadata] = metadata + + if self.engine == Engine.internal: + self.writer.addobj(metadata) + self.writer.addobj(iccstream) + # now write out the PDF if self.engine == Engine.pikepdf: - self.writer.save(outputstream, min_version=self.output_version, linearize=True) + self.writer.save( + outputstream, min_version=self.output_version, linearize=True + ) elif self.engine == Engine.pdfrw: self.writer.trailer.Info = self.writer.docinfo # setting the version attribute of the pdfrw PdfWriter object will # influence the behaviour of the write() function self.writer.version = self.output_version + if self.pdfa: + md5 = hashlib.md5(b"").hexdigest().encode("ascii") + self.writer.trailer[PdfName.ID] = PdfArray([md5, md5]) self.writer.write(outputstream) elif self.engine == Engine.internal: - self.writer.tostream(self.writer.docinfo, outputstream, self.output_version) + self.writer.tostream( + self.writer.docinfo, + outputstream, + self.output_version, + None if self.pdfa is None else b"", + ) else: raise ValueError("unknown engine: %s" % self.engine) @@ -1847,6 +1925,7 @@ def convert(*images, **kwargs): bleedborder=None, trimborder=None, artborder=None, + pdfa=None, ) for kwname, default in _default_kwargs.items(): 
if kwname not in kwargs: @@ -1871,6 +1950,7 @@ def convert(*images, **kwargs): kwargs["viewer_fit_window"], kwargs["viewer_center_window"], kwargs["viewer_fullscreen"], + kwargs["pdfa"], ) # backwards compatibility with older img2pdf versions where the first @@ -3164,7 +3244,7 @@ RGB.""", "The internal engine does not have additional requirements and writes " "out a human readable PDF. The pikepdf engine requires the pikepdf " "Python module and qpdf library, is most featureful, can " - "linearize PDFs (\"fast web view\") and can compress more parts of it." + 'linearize PDFs ("fast web view") and can compress more parts of it.' "The pdfrw engine requires the pdfrw Python " "module but does not support unicode metadata (See " "https://github.com/pmaupin/pdfrw/issues/39) or palette data (See " @@ -3191,6 +3271,15 @@ RGB.""", % Image.MAX_IMAGE_PIXELS, ) + outargs.add_argument( + "--pdfa", + nargs="?", + const="/usr/share/color/icc/ghostscript/srgb.icc", + default=None, + help="Output a PDF/A-1b compliant document. By default, this will " + "embed /usr/share/color/icc/ghostscript/srgb.icc as the color profile.", + ) + sizeargs = parser.add_argument_group( title="Image and page size and layout arguments", description="""\ @@ -3530,6 +3619,7 @@ and left/right, respectively. It is not possible to specify asymmetric borders. bleedborder=args.bleed_border, trimborder=args.trim_border, artborder=args.art_border, + pdfa=args.pdfa, ) except Exception as e: logging.error("error: " + str(e))