From cb2644c34fe96e41ae79d263f9b2d40b576ba36d Mon Sep 17 00:00:00 2001
From: Johannes Schauer Marin Rodrigues <josch@mister-muffin.de>
Date: Fri, 9 Jun 2023 06:55:49 +0200
Subject: [PATCH] do not include thumbnails in the output by default unless
 --include-thumbnails is used

This is relevant for the MPO format which otherwise would result in PDF
files containing the same image in different sizes multiple times. With
this change, the default is to only have a single page containing the
full MPO. This means that extracting that MPO also gets the thumbnails
back.

With the --include-thumbnails option, each frame gets stored on its own
page as it is done for multi-frame GIF, for example.

Closes: #135
---
 src/img2pdf.py | 89 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 88 insertions(+), 1 deletion(-)

diff --git a/src/img2pdf.py b/src/img2pdf.py
index 47ebd1e..fa2aa73 100755
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -1750,7 +1750,9 @@ def parse_miff(data):
 # fmt: on
 
 
-def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
+def read_images(
+    rawdata, colorspace, first_frame_only=False, rot=None, include_thumbnails=False
+):
     im = BytesIO(rawdata)
     im.seek(0)
     imgdata = None
@@ -1836,6 +1838,77 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
     if imgformat == ImageFormat.MPO:
         result = []
         img_page_count = 0
+        assert len(imgdata._MpoImageFile__mpoffsets) == len(imgdata.mpinfo[0xB002])
+        num_frames = len(imgdata.mpinfo[0xB002])
+        # An MPO file can be a main image together with one or more thumbnails
+        # if that is the case, then we only include all frames if the
+        # --include-thumbnails option is given. If it is not, such an MPO file
+        # will be embedded as is, so including its thumbnails but showing up
+        # as a single image page in the resulting PDF.
+        num_main_frames = 0
+        num_thumbnail_frames = 0
+        for i, mpent in enumerate(imgdata.mpinfo[0xB002]):
+            # check only the first frame for being the main image
+            if (
+                i == 0
+                and mpent["Attribute"]["DependentParentImageFlag"]
+                and not mpent["Attribute"]["DependentChildImageFlag"]
+                and mpent["Attribute"]["RepresentativeImageFlag"]
+                and mpent["Attribute"]["MPType"] == "Baseline MP Primary Image"
+            ):
+                num_main_frames += 1
+            elif (
+                not mpent["Attribute"]["DependentParentImageFlag"]
+                and mpent["Attribute"]["DependentChildImageFlag"]
+                and not mpent["Attribute"]["RepresentativeImageFlag"]
+                and mpent["Attribute"]["MPType"]
+                in [
+                    "Large Thumbnail (VGA Equivalent)",
+                    "Large Thumbnail (Full HD Equivalent)",
+                ]
+            ):
+                num_thumbnail_frames += 1
+        logger.debug(f"number of frames: {num_frames}")
+        logger.debug(f"number of main frames: {num_main_frames}")
+        logger.debug(f"number of thumbnail frames: {num_thumbnail_frames}")
+        # this MPO file is a main image plus zero or more thumbnails
+        # embed as-is unless the --include-thumbnails option was given
+        if num_frames == 1 or (
+            not include_thumbnails
+            and num_main_frames == 1
+            and num_thumbnail_frames + 1 == num_frames
+        ):
+            color, ndpi, imgwidthpx, imgheightpx, rotation, iccp = get_imgmetadata(
+                imgdata, imgformat, default_dpi, colorspace, rawdata, rot
+            )
+            if color == Colorspace["1"]:
+                raise JpegColorspaceError("jpeg can't be monochrome")
+            if color == Colorspace["P"]:
+                raise JpegColorspaceError("jpeg can't have a color palette")
+            if color == Colorspace["RGBA"]:
+                raise JpegColorspaceError("jpeg can't have an alpha channel")
+            logger.debug("read_images() embeds an MPO verbatim")
+            cleanup()
+            return [
+                (
+                    color,
+                    ndpi,
+                    ImageFormat.JPEG,
+                    rawdata,
+                    None,
+                    imgwidthpx,
+                    imgheightpx,
+                    [],
+                    False,
+                    8,
+                    rotation,
+                    iccp,
+                )
+            ]
+        # If the control flow reaches here, the MPO has more than a single
+        # frame but was not detected to be a main image followed by multiple
+        # thumbnails. We thus treat this MPO as we do other multi-frame images
+        # and include all its frames as individual pages.
         for offset, mpent in zip(
             imgdata._MpoImageFile__mpoffsets, imgdata.mpinfo[0xB002]
         ):
@@ -2509,6 +2582,7 @@ def convert(*images, **kwargs):
         artborder=None,
         pdfa=None,
         rotation=None,
+        include_thumbnails=False,
     )
     for kwname, default in _default_kwargs.items():
         if kwname not in kwargs:
@@ -2601,6 +2675,7 @@ def convert(*images, **kwargs):
             kwargs["colorspace"],
             kwargs["first_frame_only"],
             kwargs["rotation"],
+            kwargs["include_thumbnails"],
         ):
             pagewidth, pageheight, imgwidthpdf, imgheightpdf = kwargs["layout_fun"](
                 imgwidthpx, imgheightpx, ndpi
@@ -3936,6 +4011,17 @@ RGB.""",
         "input image be converted into a page in the resulting PDF.",
     )
 
+    outargs.add_argument(
+        "--include-thumbnails",
+        action="store_true",
+        help="Some multi-frame formats like MPO carry a main image and "
+        "one or more scaled-down copies of the main image (thumbnails). "
+        "In such a case, img2pdf will only include the main image and "
+        "not create additional pages for each of the thumbnails. If this "
+        "option is set, img2pdf will instead create one page per frame and "
+        "thus store each thumbnail on its own page.",
+    )
+
     outargs.add_argument(
         "--pillow-limit-break",
         action="store_true",
@@ -4333,6 +4419,7 @@ and left/right, respectively. It is not possible to specify asymmetric borders.
             artborder=args.art_border,
             pdfa=args.pdfa,
             rotation=args.rotation,
+            include_thumbnails=args.include_thumbnails,
         )
     except Exception as e:
         logger.error("error: " + str(e))