Address discrepancies between PDF and XMP timestamps

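The PDF format and the XMP metadata specification each define their own date syntax, and neither matches the raw `%z` timezone output of strftime(): PDF expects the offset as [+-]HH'MM', while XMP expects [+-]HH:MM. Account for these discrepancies by post-processing the strftime() output when constructing the final timestamps.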
2023-05-29 17:23:24 -07:00 · 2023-05-29 17:23:24 -07:00 · 5a414ce4e4
commit 5a414ce4e4
parent 1dd05cc36b
1 changed file with 16 additions and 2 deletions
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -721,8 +721,16 @@ class pdfdoc(object):
        if engine != Engine.pikepdf:
            self.writer.docinfo = PdfDict(indirect=True)

+        timezone_regex = r'^([+-])([0-9]{2})([0-9]{2})([0-9.]+)?$'
+
        def datetime_to_pdfdate(dt):
-            return dt.strftime("%Y%m%d%H%M%S%z")
+            time_no_tz = dt.strftime("%Y%m%d%H%M%S")
+            # Format for `%z` specifier is [+-]HHMM(SS(\.ffffff)?)?, but the
+            # PDF format only accepts the [+-]HHMM part, and it must be
+            # formatted as [+-]HH'MM'.
+            tz = dt.strftime("%z")
+            tz_pdf = re.sub(timezone_regex, r"\1\2'\3'", tz)
+            return time_no_tz + tz_pdf

        for k in ["Title", "Author", "Creator", "Producer", "Subject"]:
            v = locals()[k.lower()]
@@ -752,7 +760,13 @@ class pdfdoc(object):
                )

        def datetime_to_xmpdate(dt):
-            return dt.strftime("%Y-%m-%dT%H:%M:%S%z")
+            time_no_tz = dt.strftime("%Y-%m-%dT%H:%M:%S")
+            # Format for `%z` specifier is [+-]HHMM(SS(\.ffffff)?)?, but the
+            # XMP metadata only accepts the [+-]HHMM part, and it must be
+            # formatted as [+-]HH:MM.
+            tz = dt.strftime("%z")
+            tz_xmp = re.sub(timezone_regex, r'\1\2:\3', tz)
+            return time_no_tz + tz_xmp

        self.xmp = b"""<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
 <x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>