Address discrepancies between PDF and XMP timestamps

The PDF format and the XMP metadata specification define different syntaxes for dates. Account for the difference by building each timestamp from strftime() output and post-processing the UTC offset into the form that each format requires.
Treat default creation/mod dates as UTC (fixes #155)
2023-05-29 19:55:05 -07:00 · 2023-05-29 14:17:56 -07:00
1 changed file with 21 additions and 3 deletions
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -37,6 +37,7 @@ if hasattr(GifImagePlugin, "LoadingStrategy"):
 # TiffImagePlugin.DEBUG = True
 from PIL.ExifTags import TAGS
 from datetime import datetime
+from datetime import timezone
 from jp2 import parsejp2
 from enum import Enum
 from io import BytesIO
@@ -721,7 +722,16 @@ class pdfdoc(object):
            self.writer.docinfo = PdfDict(indirect=True)

        def datetime_to_pdfdate(dt):
-            return dt.strftime("%Y%m%d%H%M%SZ")
+            time_no_tz = dt.strftime("%Y%m%d%H%M%S")
+            tz_pdf = ""
+            # Format for `%z` specifier is [+-]HHMM(SS(\.ffffff)?)?, but the
+            # PDF format only accepts the [+-]HHMM part, and it must be
+            # formatted as [+-]HH'MM'. Note that PDF 1.7 removed the need for
+            # the trailing apostrophe (after MM), but earlier specs require it.
+            tz = dt.strftime("%z")
+            if tz:
+                tz_pdf = "%s%s'%s'" % (tz[0], tz[1:3], tz[3:5])
+            return time_no_tz + tz_pdf

        for k in ["Title", "Author", "Creator", "Producer", "Subject"]:
            v = locals()[k.lower()]
@@ -731,7 +741,7 @@ class pdfdoc(object):
                v = PdfString.encode(v)
            self.writer.docinfo[getattr(PdfName, k)] = v

-        now = datetime.now()
+        now = datetime.now(tz=timezone.utc)
        for k in ["CreationDate", "ModDate"]:
            v = locals()[k.lower()]
            if v is None and nodate:
@@ -751,7 +761,15 @@ class pdfdoc(object):
                )

        def datetime_to_xmpdate(dt):
-            return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
+            time_no_tz = dt.strftime("%Y-%m-%dT%H:%M:%S")
+            tz_xmp = ""
+            # Format for `%z` specifier is [+-]HHMM(SS(\.ffffff)?)?, but the
+            # XMP metadata only accepts the [+-]HHMM part, and it must be
+            # formatted as [+-]HH:MM.
+            tz = dt.strftime("%z")
+            if tz:
+                tz_xmp = "%s%s:%s" % (tz[0], tz[1:3], tz[3:5])
+            return time_no_tz + tz_xmp

        self.xmp = b"""<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
 <x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>