Fix format of PDF and XMP timezone offsets

The PDF and XMP specs use different syntaxes for timezone offsets, so hardcode the UTC offset in each format instead of using `%z`. This also avoids embedding the more granular timezone offsets that the datetime module supports but that neither the PDF nor the XMP format allows.
Convert manually specified creation/modification times to UTC as well
2023-06-10 17:36:47 -07:00 · 2023-06-10 17:26:13 -07:00 · 2023-05-29 14:17:56 -07:00
2 changed files with 22 additions and 43 deletions
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@ -22,7 +22,7 @@ import sys
 import os
 import zlib
 import argparse
-from PIL import Image, TiffImagePlugin, GifImagePlugin, ImageCms
+from PIL import Image, TiffImagePlugin, GifImagePlugin

 if hasattr(GifImagePlugin, "LoadingStrategy"):
    # Pillow 9.0.0 started emitting all frames but the first as RGB instead of
@ -36,7 +36,8 @@ if hasattr(GifImagePlugin, "LoadingStrategy"):

 # TiffImagePlugin.DEBUG = True
 from PIL.ExifTags import TAGS
-from datetime import datetime, timezone
+from datetime import datetime
+from datetime import timezone
 from jp2 import parsejp2
 from enum import Enum
 from io import BytesIO
@ -46,7 +47,6 @@ import platform
 import hashlib
 from itertools import chain
 import re
-import io

 logger = logging.getLogger(__name__)

@ -722,7 +722,8 @@ class pdfdoc(object):
            self.writer.docinfo = PdfDict(indirect=True)

        def datetime_to_pdfdate(dt):
-            return dt.astimezone(tz=timezone.utc).strftime("%Y%m%d%H%M%SZ")
+            dt_utc = dt.astimezone(tz=timezone.utc)
+            return dt_utc.strftime("%Y%m%d%H%M%S+00'00'")

        for k in ["Title", "Author", "Creator", "Producer", "Subject"]:
            v = locals()[k.lower()]
@ -752,7 +753,8 @@ class pdfdoc(object):
                )

        def datetime_to_xmpdate(dt):
-            return dt.astimezone(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+            dt_utc = dt.astimezone(tz=timezone.utc)
+            return dt_utc.strftime("%Y-%m-%dT%H:%M:%S+00:00")

        self.xmp = b"""<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
 <x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>
@ -1432,21 +1434,6 @@ def get_imgmetadata(
    iccp = None
    if "icc_profile" in imgdata.info:
        iccp = imgdata.info.get("icc_profile")
-    # GIMP saves bilevel tiff images with an RGB ICC profile which is useless
-    # and produces an error in Adobe Acrobat, so we ignore it with a warning.
-    # imagemagick also used to (wrongly) include an RGB ICC profile for bilevel
-    # images: https://github.com/ImageMagick/ImageMagick/issues/2070
-    if iccp is not None and color == Colorspace["1"] and imgformat == ImageFormat.TIFF:
-        with io.BytesIO(iccp) as f:
-            prf = ImageCms.ImageCmsProfile(f)
-        if (
-            prf.profile.model == "sRGB"
-            and prf.profile.manufacturer == "GIMP"
-            and prf.profile.profile_description == "GIMP built-in sRGB"
-        ):
-            logger.warning("Ignoring RGB ICC profile in bilevel TIFF produced by GIMP.")
-            logger.warning("https://gitlab.gnome.org/GNOME/gimp/-/issues/3438")
-            iccp = None

    logger.debug("width x height = %dpx x %dpx", imgwidthpx, imgheightpx)

@ -2101,16 +2088,7 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
                )
            )
        else:
-            if color in [Colorspace.P, Colorspace.PA] and iccp is not None:
-                # PDF does not support palette images with icc profile
-                if color == Colorspace.P:
-                    newcolor = Colorspace.RGB
-                    newimg = newimg.convert(mode="RGB")
-                elif color == Colorspace.PA:
-                    newcolor = Colorspace.RGBA
-                    newimg = newimg.convert(mode="RGBA")
-                smaskidat = None
-            elif (
+            if (
                color == Colorspace.RGBA
                or color == Colorspace.LA
                or color == Colorspace.PA
@ -2124,11 +2102,6 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
                    newcolor = color
                    l, a = newimg.split()
                    newimg = l
-                elif color == Colorspace.PA or (
-                    color == Colorspace.P and "transparency" in newimg.info
-                ):
-                    newcolor = color
-                    a = newimg.convert(mode="RGBA").split()[-1]
                else:
                    newcolor = Colorspace.RGBA
                    r, g, b, a = newimg.convert(mode="RGBA").split()
@ -2139,6 +2112,15 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
                    "Image contains an alpha channel. Computing a separate "
                    "soft mask (/SMask) image to store transparency in PDF."
                )
+            elif color in [Colorspace.P, Colorspace.PA] and iccp is not None:
+                # PDF does not support palette images with icc profile
+                if color == Colorspace.P:
+                    newcolor = Colorspace.RGB
+                    newimg = newimg.convert(mode="RGB")
+                elif color == Colorspace.PA:
+                    newcolor = Colorspace.RGBA
+                    newimg = newimg.convert(mode="RGBA")
+                smaskidat = None
            else:
                newcolor = color
                smaskidat = None
@ -3732,9 +3714,7 @@ Paper sizes:
  the value in the second column has the same effect as giving the short hand
  in the first column. Appending ^T (a caret/circumflex followed by the letter
  T) turns the paper size from portrait into landscape. The postfix thus
-  symbolizes the transpose. Note that on Windows cmd.exe the caret symbol is
-  the escape character, so you need to put quotes around the option value.
-  The values are case insensitive.
+  symbolizes the transpose. The values are case insensitive.

 %s

@ -3801,7 +3781,7 @@ Examples:
  while preserving its aspect ratio and a print border of 2 cm on the top and
  bottom and 2.5 cm on the left and right hand side.

-    $ img2pdf --output out.pdf --pagesize "A4^T" --border 2cm:2.5cm *.jpg
+    $ img2pdf --output out.pdf --pagesize A4^T --border 2cm:2.5cm *.jpg

  On each A4 page, fit images into a 10 cm times 15 cm rectangle but keep the
  original image size if the image is smaller than that.
@ -4271,7 +4251,7 @@ and left/right, respectively. It is not possible to specify asymmetric borders.
        print(
            "Reading image from standard input...\n"
            "Re-run with -h or --help for usage information.",
-            file=sys.stderr,
+            file=sys.stderr
        )
        try:
            images = [sys.stdin.buffer.read()]
--- a/src/img2pdf_test.py
+++ b/src/img2pdf_test.py
@ -4276,10 +4276,9 @@ def gif_transparent_pdf(tmp_path_factory, gif_transparent_img, request):
            == b"q\n45.0000 0 0 45.0000 0.0000 0.0000 cm\n/Im0 Do\nQ"
        )
        assert p.pages[0].Resources.XObject.Im0.BitsPerComponent == 8
-        assert p.pages[0].Resources.XObject.Im0.ColorSpace[0] == "/Indexed"
-        assert p.pages[0].Resources.XObject.Im0.ColorSpace[1] == "/DeviceRGB"
+        assert p.pages[0].Resources.XObject.Im0.ColorSpace == "/DeviceRGB"
        assert p.pages[0].Resources.XObject.Im0.DecodeParms.BitsPerComponent == 8
-        assert p.pages[0].Resources.XObject.Im0.DecodeParms.Colors == 1
+        assert p.pages[0].Resources.XObject.Im0.DecodeParms.Colors == 3
        assert p.pages[0].Resources.XObject.Im0.DecodeParms.Predictor == 15
        assert p.pages[0].Resources.XObject.Im0.Filter == "/FlateDecode"
        assert p.pages[0].Resources.XObject.Im0.Height == 60