From 5a414ce4e42ad74a568ddc98ce5cf5a6c71af6aa Mon Sep 17 00:00:00 2001
From: Patrick McCarty
Date: Mon, 29 May 2023 17:23:24 -0700
Subject: [PATCH] Address discrepancies between PDF and XMP timestamps

The PDF format and XMP metadata specs define different syntax for dates,
so account for these discrepancies by more carefully constructing the
final timestamps by post-processing strftime() output.
---
 src/img2pdf.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/img2pdf.py b/src/img2pdf.py
index 4a1cd37..9723339 100755
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -721,8 +721,16 @@ class pdfdoc(object):
         if engine != Engine.pikepdf:
             self.writer.docinfo = PdfDict(indirect=True)
 
+        timezone_regex = r'^([+-])([0-9]{2})([0-9]{2})([0-9.]+)?$'
+
         def datetime_to_pdfdate(dt):
-            return dt.strftime("%Y%m%d%H%M%S%z")
+            time_no_tz = dt.strftime("%Y%m%d%H%M%S")
+            # Format for `%z` specifier is [+-]HHMM(SS(\.ffffff)?)?, but the
+            # PDF format only accepts the [+-]HHMM part, and it must be
+            # formatted as [+-]HH'MM'.
+            tz = dt.strftime("%z")
+            tz_pdf = re.sub(timezone_regex, r"\1\2'\3'", tz)
+            return time_no_tz + tz_pdf
 
         for k in ["Title", "Author", "Creator", "Producer", "Subject"]:
             v = locals()[k.lower()]
@@ -752,7 +760,13 @@ class pdfdoc(object):
                 )
 
         def datetime_to_xmpdate(dt):
-            return dt.strftime("%Y-%m-%dT%H:%M:%S%z")
+            time_no_tz = dt.strftime("%Y-%m-%dT%H:%M:%S")
+            # Format for `%z` specifier is [+-]HHMM(SS(\.ffffff)?)?, but the
+            # XMP metadata only accepts the [+-]HHMM part, and it must be
+            # formatted as [+-]HH:MM.
+            tz = dt.strftime("%z")
+            tz_xmp = re.sub(timezone_regex, r'\1\2:\3', tz)
+            return time_no_tz + tz_xmp
 
         self.xmp = b"""