forked from josch/img2pdf
add --pdfa argument to attach an icc profile for PDF/A-1b compliant output
This commit is contained in:
parent
bca3f802ac
commit
93f65a49c9
1 changed files with 101 additions and 11 deletions
112
src/img2pdf.py
112
src/img2pdf.py
|
@ -33,6 +33,7 @@ from io import BytesIO
|
||||||
import logging
|
import logging
|
||||||
import struct
|
import struct
|
||||||
import platform
|
import platform
|
||||||
|
import hashlib
|
||||||
|
|
||||||
have_pdfrw = True
|
have_pdfrw = True
|
||||||
try:
|
try:
|
||||||
|
@ -502,7 +503,7 @@ class MyPdfWriter:
|
||||||
obj.identifier = newid
|
obj.identifier = newid
|
||||||
self.objects.append(obj)
|
self.objects.append(obj)
|
||||||
|
|
||||||
def tostream(self, info, stream, version="1.3"):
|
def tostream(self, info, stream, version="1.3", ident=None):
|
||||||
xreftable = list()
|
xreftable = list()
|
||||||
|
|
||||||
# justification of the random binary garbage in the header from
|
# justification of the random binary garbage in the header from
|
||||||
|
@ -559,10 +560,11 @@ class MyPdfWriter:
|
||||||
for x in xreftable:
|
for x in xreftable:
|
||||||
stream.write(x)
|
stream.write(x)
|
||||||
stream.write(b"trailer\n")
|
stream.write(b"trailer\n")
|
||||||
stream.write(
|
trailer = {b"/Size": len(xreftable), b"/Info": info, b"/Root": self.catalog}
|
||||||
parse({b"/Size": len(xreftable), b"/Info": info, b"/Root": self.catalog})
|
if ident is not None:
|
||||||
+ b"\n"
|
md5 = hashlib.md5(ident).hexdigest().encode("ascii")
|
||||||
)
|
trailer[b"/ID"] = b"[<%s><%s>]" % (md5, md5)
|
||||||
|
stream.write(parse(trailer) + b"\n")
|
||||||
stream.write(b"startxref\n")
|
stream.write(b"startxref\n")
|
||||||
stream.write(("%d\n" % xrefoffset).encode())
|
stream.write(("%d\n" % xrefoffset).encode())
|
||||||
stream.write(b"%%EOF\n")
|
stream.write(b"%%EOF\n")
|
||||||
|
@ -619,6 +621,7 @@ class pdfdoc(object):
|
||||||
fit_window=False,
|
fit_window=False,
|
||||||
center_window=False,
|
center_window=False,
|
||||||
fullscreen=False,
|
fullscreen=False,
|
||||||
|
pdfa=None,
|
||||||
):
|
):
|
||||||
if engine is None:
|
if engine is None:
|
||||||
if have_pikepdf:
|
if have_pikepdf:
|
||||||
|
@ -655,7 +658,7 @@ class pdfdoc(object):
|
||||||
continue
|
continue
|
||||||
if engine != Engine.pikepdf:
|
if engine != Engine.pikepdf:
|
||||||
v = PdfString.encode(v)
|
v = PdfString.encode(v)
|
||||||
self.writer.docinfo[getattr(PdfName,k)] = v
|
self.writer.docinfo[getattr(PdfName, k)] = v
|
||||||
|
|
||||||
now = datetime.now()
|
now = datetime.now()
|
||||||
for k in ["CreationDate", "ModDate"]:
|
for k in ["CreationDate", "ModDate"]:
|
||||||
|
@ -665,6 +668,8 @@ class pdfdoc(object):
|
||||||
if v is None:
|
if v is None:
|
||||||
v = now
|
v = now
|
||||||
v = ("D:" + datetime_to_pdfdate(v)).encode("ascii")
|
v = ("D:" + datetime_to_pdfdate(v)).encode("ascii")
|
||||||
|
if engine == Engine.internal:
|
||||||
|
v = b"(" + v + b")"
|
||||||
self.writer.docinfo[getattr(PdfName, k)] = v
|
self.writer.docinfo[getattr(PdfName, k)] = v
|
||||||
if keywords is not None:
|
if keywords is not None:
|
||||||
if engine == Engine.pikepdf:
|
if engine == Engine.pikepdf:
|
||||||
|
@ -674,6 +679,38 @@ class pdfdoc(object):
|
||||||
",".join(keywords)
|
",".join(keywords)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def datetime_to_xmpdate(dt):
    """Format *dt* as an XMP (ISO 8601) timestamp expressed in UTC.

    The returned string always ends in ``Z``, which designates UTC, so the
    value is converted to UTC before formatting instead of merely being
    labelled as such.

    Timezone-aware datetimes are converted from their own offset.  Naive
    datetimes (such as the result of ``datetime.now()``) are interpreted as
    local time by ``astimezone()`` and converted accordingly.

    Returns a ``str`` of the form ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    # local import: the surrounding module is not known to import timezone
    from datetime import timezone

    return dt.astimezone(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
|
||||||
|
self.xmp = b"""<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
|
||||||
|
<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>
|
||||||
|
<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' xmlns:iX='http://ns.adobe.com/iX/1.0/'>
|
||||||
|
<rdf:Description rdf:about='' xmlns:pdf='http://ns.adobe.com/pdf/1.3/'%s/>
|
||||||
|
<rdf:Description rdf:about='' xmlns:xmp='http://ns.adobe.com/xap/1.0/'>
|
||||||
|
%s
|
||||||
|
%s
|
||||||
|
</rdf:Description>
|
||||||
|
<rdf:Description rdf:about='' xmlns:pdfaid='http://www.aiim.org/pdfa/ns/id/' pdfaid:part='1' pdfaid:conformance='B'/>
|
||||||
|
</rdf:RDF>
|
||||||
|
</x:xmpmeta>
|
||||||
|
|
||||||
|
<?xpacket end='w'?>
|
||||||
|
""" % (
|
||||||
|
b" pdf:Producer='%s'" % producer.encode("ascii")
|
||||||
|
if producer is not None
|
||||||
|
else b"",
|
||||||
|
b""
|
||||||
|
if creationdate is None and nodate
|
||||||
|
else b"<xmp:ModifyDate>%s</xmp:ModifyDate>"
|
||||||
|
% datetime_to_xmpdate(now if creationdate is None else creationdate).encode(
|
||||||
|
"ascii"
|
||||||
|
),
|
||||||
|
b""
|
||||||
|
if moddate is None and nodate
|
||||||
|
else b"<xmp:CreateDate>%s</xmp:CreateDate>"
|
||||||
|
% datetime_to_xmpdate(now if moddate is None else moddate).encode("ascii"),
|
||||||
|
)
|
||||||
|
|
||||||
if engine != Engine.pikepdf:
|
if engine != Engine.pikepdf:
|
||||||
# this is done because pdfrw adds info, catalog and pages as the first
|
# this is done because pdfrw adds info, catalog and pages as the first
|
||||||
# three objects in this order
|
# three objects in this order
|
||||||
|
@ -691,6 +728,7 @@ class pdfdoc(object):
|
||||||
self.fullscreen = fullscreen
|
self.fullscreen = fullscreen
|
||||||
self.engine = engine
|
self.engine = engine
|
||||||
self.output_version = version
|
self.output_version = version
|
||||||
|
self.pdfa = pdfa
|
||||||
|
|
||||||
def add_imagepage(
|
def add_imagepage(
|
||||||
self,
|
self,
|
||||||
|
@ -722,7 +760,6 @@ class pdfdoc(object):
|
||||||
elif self.engine == Engine.pdfrw:
|
elif self.engine == Engine.pdfrw:
|
||||||
from pdfrw import PdfDict, PdfName, PdfObject, PdfString
|
from pdfrw import PdfDict, PdfName, PdfObject, PdfString
|
||||||
from pdfrw.py23_diffs import convert_load
|
from pdfrw.py23_diffs import convert_load
|
||||||
|
|
||||||
elif self.engine == Engine.internal:
|
elif self.engine == Engine.internal:
|
||||||
PdfDict = MyPdfDict
|
PdfDict = MyPdfDict
|
||||||
PdfName = MyPdfName
|
PdfName = MyPdfName
|
||||||
|
@ -906,12 +943,13 @@ class pdfdoc(object):
|
||||||
PdfName = pikepdf.Name
|
PdfName = pikepdf.Name
|
||||||
elif self.engine == Engine.pdfrw:
|
elif self.engine == Engine.pdfrw:
|
||||||
from pdfrw import PdfDict, PdfName, PdfArray, PdfObject
|
from pdfrw import PdfDict, PdfName, PdfArray, PdfObject
|
||||||
|
from pdfrw.py23_diffs import convert_load
|
||||||
elif self.engine == Engine.internal:
|
elif self.engine == Engine.internal:
|
||||||
PdfDict = MyPdfDict
|
PdfDict = MyPdfDict
|
||||||
PdfName = MyPdfName
|
PdfName = MyPdfName
|
||||||
PdfObject = MyPdfObject
|
PdfObject = MyPdfObject
|
||||||
PdfArray = MyPdfArray
|
PdfArray = MyPdfArray
|
||||||
|
convert_load = my_convert_load
|
||||||
else:
|
else:
|
||||||
raise ValueError("unknown engine: %s" % self.engine)
|
raise ValueError("unknown engine: %s" % self.engine)
|
||||||
NullObject = None if self.engine == Engine.pikepdf else PdfObject("null")
|
NullObject = None if self.engine == Engine.pikepdf else PdfObject("null")
|
||||||
|
@ -1040,17 +1078,57 @@ class pdfdoc(object):
|
||||||
else:
|
else:
|
||||||
raise ValueError("unknown page layout: %s" % self.page_layout)
|
raise ValueError("unknown page layout: %s" % self.page_layout)
|
||||||
|
|
||||||
|
if self.pdfa is not None:
|
||||||
|
if self.engine == Engine.pikepdf:
|
||||||
|
metadata = self.writer.make_stream(self.xmp)
|
||||||
|
else:
|
||||||
|
metadata = PdfDict(stream=convert_load(self.xmp))
|
||||||
|
metadata[PdfName.Subtype] = PdfName.XML
|
||||||
|
metadata[PdfName.Type] = PdfName.Metadata
|
||||||
|
with open(self.pdfa, "rb") as f:
|
||||||
|
icc = f.read()
|
||||||
|
intents = PdfDict()
|
||||||
|
if self.engine == Engine.pikepdf:
|
||||||
|
iccstream = self.writer.make_stream(icc)
|
||||||
|
iccstream.stream_dict.N = 3
|
||||||
|
else:
|
||||||
|
iccstream = PdfDict(stream=convert_load(zlib.compress(icc)))
|
||||||
|
iccstream[PdfName.N] = 3
|
||||||
|
iccstream[PdfName.Filter] = PdfName.FlateDecode
|
||||||
|
intents[PdfName.S] = PdfName.GTS_PDFA1
|
||||||
|
intents[PdfName.Type] = PdfName.OutputIntent
|
||||||
|
intents[PdfName.OutputConditionIdentifier] = (
|
||||||
|
b"sRGB" if self.engine == Engine.pikepdf else b"(sRGB)"
|
||||||
|
)
|
||||||
|
intents[PdfName.DestOutputProfile] = iccstream
|
||||||
|
catalog[PdfName.OutputIntents] = PdfArray([intents])
|
||||||
|
catalog[PdfName.Metadata] = metadata
|
||||||
|
|
||||||
|
if self.engine == Engine.internal:
|
||||||
|
self.writer.addobj(metadata)
|
||||||
|
self.writer.addobj(iccstream)
|
||||||
|
|
||||||
# now write out the PDF
|
# now write out the PDF
|
||||||
if self.engine == Engine.pikepdf:
|
if self.engine == Engine.pikepdf:
|
||||||
self.writer.save(outputstream, min_version=self.output_version, linearize=True)
|
self.writer.save(
|
||||||
|
outputstream, min_version=self.output_version, linearize=True
|
||||||
|
)
|
||||||
elif self.engine == Engine.pdfrw:
|
elif self.engine == Engine.pdfrw:
|
||||||
self.writer.trailer.Info = self.writer.docinfo
|
self.writer.trailer.Info = self.writer.docinfo
|
||||||
# setting the version attribute of the pdfrw PdfWriter object will
|
# setting the version attribute of the pdfrw PdfWriter object will
|
||||||
# influence the behaviour of the write() function
|
# influence the behaviour of the write() function
|
||||||
self.writer.version = self.output_version
|
self.writer.version = self.output_version
|
||||||
|
if self.pdfa:
|
||||||
|
md5 = hashlib.md5(b"").hexdigest().encode("ascii")
|
||||||
|
self.writer.trailer[PdfName.ID] = PdfArray([md5, md5])
|
||||||
self.writer.write(outputstream)
|
self.writer.write(outputstream)
|
||||||
elif self.engine == Engine.internal:
|
elif self.engine == Engine.internal:
|
||||||
self.writer.tostream(self.writer.docinfo, outputstream, self.output_version)
|
self.writer.tostream(
|
||||||
|
self.writer.docinfo,
|
||||||
|
outputstream,
|
||||||
|
self.output_version,
|
||||||
|
None if self.pdfa is None else b"",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError("unknown engine: %s" % self.engine)
|
raise ValueError("unknown engine: %s" % self.engine)
|
||||||
|
|
||||||
|
@ -1847,6 +1925,7 @@ def convert(*images, **kwargs):
|
||||||
bleedborder=None,
|
bleedborder=None,
|
||||||
trimborder=None,
|
trimborder=None,
|
||||||
artborder=None,
|
artborder=None,
|
||||||
|
pdfa=None,
|
||||||
)
|
)
|
||||||
for kwname, default in _default_kwargs.items():
|
for kwname, default in _default_kwargs.items():
|
||||||
if kwname not in kwargs:
|
if kwname not in kwargs:
|
||||||
|
@ -1871,6 +1950,7 @@ def convert(*images, **kwargs):
|
||||||
kwargs["viewer_fit_window"],
|
kwargs["viewer_fit_window"],
|
||||||
kwargs["viewer_center_window"],
|
kwargs["viewer_center_window"],
|
||||||
kwargs["viewer_fullscreen"],
|
kwargs["viewer_fullscreen"],
|
||||||
|
kwargs["pdfa"],
|
||||||
)
|
)
|
||||||
|
|
||||||
# backwards compatibility with older img2pdf versions where the first
|
# backwards compatibility with older img2pdf versions where the first
|
||||||
|
@ -3164,7 +3244,7 @@ RGB.""",
|
||||||
"The internal engine does not have additional requirements and writes "
|
"The internal engine does not have additional requirements and writes "
|
||||||
"out a human readable PDF. The pikepdf engine requires the pikepdf "
|
"out a human readable PDF. The pikepdf engine requires the pikepdf "
|
||||||
"Python module and qpdf library, is most featureful, can "
|
"Python module and qpdf library, is most featureful, can "
|
||||||
"linearize PDFs (\"fast web view\") and can compress more parts of it."
|
'linearize PDFs ("fast web view") and can compress more parts of it.'
|
||||||
"The pdfrw engine requires the pdfrw Python "
|
"The pdfrw engine requires the pdfrw Python "
|
||||||
"module but does not support unicode metadata (See "
|
"module but does not support unicode metadata (See "
|
||||||
"https://github.com/pmaupin/pdfrw/issues/39) or palette data (See "
|
"https://github.com/pmaupin/pdfrw/issues/39) or palette data (See "
|
||||||
|
@ -3191,6 +3271,15 @@ RGB.""",
|
||||||
% Image.MAX_IMAGE_PIXELS,
|
% Image.MAX_IMAGE_PIXELS,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
outargs.add_argument(
|
||||||
|
"--pdfa",
|
||||||
|
nargs="?",
|
||||||
|
const="/usr/share/color/icc/ghostscript/srgb.icc",
|
||||||
|
default=None,
|
||||||
|
help="Output a PDF/A-1b complient document. By default, this will "
|
||||||
|
"embed /usr/share/color/icc/ghostscript/srgb.icc as the color profile.",
|
||||||
|
)
|
||||||
|
|
||||||
sizeargs = parser.add_argument_group(
|
sizeargs = parser.add_argument_group(
|
||||||
title="Image and page size and layout arguments",
|
title="Image and page size and layout arguments",
|
||||||
description="""\
|
description="""\
|
||||||
|
@ -3530,6 +3619,7 @@ and left/right, respectively. It is not possible to specify asymmetric borders.
|
||||||
bleedborder=args.bleed_border,
|
bleedborder=args.bleed_border,
|
||||||
trimborder=args.trim_border,
|
trimborder=args.trim_border,
|
||||||
artborder=args.art_border,
|
artborder=args.art_border,
|
||||||
|
pdfa=args.pdfa,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error("error: " + str(e))
|
logging.error("error: " + str(e))
|
||||||
|
|
Loading…
Reference in a new issue