convert(): add option to get engine document (e.g. pikepdf.Pdf) #203

Open
mara0004 wants to merge 7 commits from mara0004/img2pdf:return_engine_doc into main
2 changed files with 35 additions and 14 deletions

View file

@ -1075,7 +1075,7 @@ class pdfdoc(object):
self.tostream(stream) self.tostream(stream)
return stream.getvalue() return stream.getvalue()
def tostream(self, outputstream): def finalize(self):
Review

You split tostream() into finalize() and tostream() but then why does the new tostream() not call finalize()?

You split `tostream()` into `finalize()` and `tostream()` but then why does the new `tostream()` not call `finalize()`?
Review

Because I thought the embedder of convert_to_docobject() should not have to invoke finalize().
Instead, finalize() technically belongs in convert_to_docobject() itself, after all image pages have been added. So tostream() cannot also call finalize(), as that would result in a double call.

Because I thought the embedder of `convert_to_docobject()` should not have to invoke `finalize()`. Instead, `finalize()` technically belongs in `convert_to_docobject()` itself, after all image pages have been added. So `tostream()` cannot also call `finalize()`, as that would result in a double call.
if self.engine == Engine.pikepdf: if self.engine == Engine.pikepdf:
PdfArray = pikepdf.Array PdfArray = pikepdf.Array
PdfDict = pikepdf.Dictionary PdfDict = pikepdf.Dictionary
@ -1267,7 +1267,9 @@ class pdfdoc(object):
self.writer.addobj(metadata) self.writer.addobj(metadata)
self.writer.addobj(iccstream) self.writer.addobj(iccstream)
# now write out the PDF def tostream(self, outputstream):
# write out the PDF
# this assumes that finalize() has been invoked beforehand by the caller
if self.engine == Engine.pikepdf: if self.engine == Engine.pikepdf:
kwargs = {} kwargs = {}
if pikepdf.__version__ >= "6.2.0": if pikepdf.__version__ >= "6.2.0":
@ -1276,6 +1278,7 @@ class pdfdoc(object):
outputstream, min_version=self.output_version, linearize=True, **kwargs outputstream, min_version=self.output_version, linearize=True, **kwargs
) )
elif self.engine == Engine.pdfrw: elif self.engine == Engine.pdfrw:
from pdfrw import PdfName, PdfArray
self.writer.trailer.Info = self.writer.docinfo self.writer.trailer.Info = self.writer.docinfo
# setting the version attribute of the pdfrw PdfWriter object will # setting the version attribute of the pdfrw PdfWriter object will
# influence the behaviour of the write() function # influence the behaviour of the write() function
@ -2605,14 +2608,11 @@ def find_scale(pagewidth, pageheight):
return 10 ** ceil(log10(oversized)) return 10 ** ceil(log10(oversized))
# given one or more input image, depending on outputstream, either return a # Convert the image(s) to a `pdfdoc` object.
# string containing the whole PDF if outputstream is None or write the PDF # The `.writer` attribute holds the underlying engine document handle, and
# data to the given file-like object and return None # `.output_version` the minimum version the caller should use when saving.
# # The main convert() wraps this implementation function.
# Input images can be given as file like objects (they must implement read()), def convert_to_docobject(*images, **kwargs):
# as a binary string representing the image content or as filenames to the
# images.
def convert(*images, **kwargs):
_default_kwargs = dict( _default_kwargs = dict(
engine=None, engine=None,
title=None, title=None,
@ -2633,7 +2633,6 @@ def convert(*images, **kwargs):
viewer_fit_window=False, viewer_fit_window=False,
viewer_center_window=False, viewer_center_window=False,
viewer_fullscreen=False, viewer_fullscreen=False,
outputstream=None,
first_frame_only=False, first_frame_only=False,
allow_oversized=True, allow_oversized=True,
cropborder=None, cropborder=None,
@ -2796,10 +2795,22 @@ def convert(*images, **kwargs):
iccp, iccp,
) )
if kwargs["outputstream"]: pdf.finalize()
pdf.tostream(kwargs["outputstream"]) return pdf
return
# given one or more input image, depending on outputstream, either return a
# string containing the whole PDF if outputstream is None or write the PDF
# data to the given file-like object and return None
#
# Input images can be given as file like objects (they must implement read()),
# as a binary string representing the image content or as filenames to the
# images.
def convert(*images, outputstream=None, **kwargs):
Review

Please do not change the signature of the convert() function. This is necessary to preserve API stability. You have to extract "outputstream" from kwargs.

Please do not change the signature of the convert() function. This is necessary to preserve API stability. You have to extract "outputstream" from kwargs.
Review

It's not clear to me why this is supposed to break the API -- can you explain?
AFAICS, kwargs is internal, and no callee expects kwargs["outputstream"], right?

It's not clear to me why this is supposed to break the API -- can you explain? AFAICS, `kwargs` is internal, and no callee expects `kwargs["outputstream"]`, right?
Review

Relatedly, it looks like the _default_kwargs strategy silently ignores nonexistent parameters, which is problematic. (And of course it breaks IDE completion.)
I see you're using kwargs to unify access of {crop,bleed,trim,art}border, which is fine, but why can't any other params be in the signature directly?

Relatedly, it looks like the `_default_kwargs` strategy silently ignores nonexistent parameters, which is problematic. (And of course it breaks IDE completion.) ~~I see you're using kwargs to unify access of `{crop,bleed,trim,art}border`, which is fine, but why can't any other params be in the signature directly?~~
Review

It's not clear to me why this is supposed to break the API -- can you explain?

Okay, self-answering the question: an SO search showed that specifying optional params after a *capture raises a SyntaxError on Python 2. Never having written code for Python 2, I did not know that. I'll change to extract from kwargs as you said, then.

The issue with invalid kwargs presumably being ignored without error still stands, though.

> It's not clear to me why this is supposed to break the API -- can you explain?

Okay, self-answering the question: an SO search showed that specifying optional params after a `*capture` raises a `SyntaxError` on Python 2. Never having written code for Python 2, I did not know that. I'll change to extract from kwargs as you said, then.

The issue with invalid kwargs presumably being ignored without error still stands, though.
pdf = convert_to_docobject(*images, **kwargs)
if outputstream:
pdf.tostream(outputstream)
return
return pdf.tostring() return pdf.tostring()

View file

@ -7146,6 +7146,16 @@ def test_general(general_input, engine):
pass pass
def test_return_engine_doc(tmp_path_factory):
inputf = os.path.join(os.path.dirname(__file__), "tests", "input", "normal.jpg")
outputf = tmp_path_factory.mktemp("return_engine_doc") / "normal.jpg.pdf"
pdf_wrapper = img2pdf.convert_to_docobject(inputf, engine=img2pdf.Engine.pikepdf)
pdf = pdf_wrapper.writer
assert isinstance(pdf, pikepdf.Pdf)
pdf.save(outputf, min_version=pdf_wrapper.output_version, linearize=True)
assert os.path.isfile(outputf)
def main(): def main():
normal16 = alpha_value()[:, :, 0:3] normal16 = alpha_value()[:, :, 0:3]
pathlib.Path("test.icc").write_bytes(icc_profile()) pathlib.Path("test.icc").write_bytes(icc_profile())