Address discrepancies between PDF and XMP timestamps

The PDF format and the XMP metadata spec define different date syntaxes,
so account for the discrepancy by post-processing the strftime() output
to construct each final timestamp more carefully.
This commit is contained in:
Patrick McCarty 2023-05-29 17:23:24 -07:00
parent 1dd05cc36b
commit 5a414ce4e4

View file

@ -721,8 +721,16 @@ class pdfdoc(object):
if engine != Engine.pikepdf: if engine != Engine.pikepdf:
self.writer.docinfo = PdfDict(indirect=True) self.writer.docinfo = PdfDict(indirect=True)
timezone_regex = r'^([+-])([0-9]{2})([0-9]{2})([0-9.]+)?$'
def datetime_to_pdfdate(dt): def datetime_to_pdfdate(dt):
return dt.strftime("%Y%m%d%H%M%S%z") time_no_tz = dt.strftime("%Y%m%d%H%M%S")
# Format for `%z` specifier is [+-]HHMM(SS(\.ffffff)?)?, but the
# PDF format only accepts the [+-]HHMM part, and it must be
# formatted as [+-]HH'MM'.
tz = dt.strftime("%z")
tz_pdf = re.sub(timezone_regex, r"\1\2'\3'", tz)
return time_no_tz + tz_pdf
for k in ["Title", "Author", "Creator", "Producer", "Subject"]: for k in ["Title", "Author", "Creator", "Producer", "Subject"]:
v = locals()[k.lower()] v = locals()[k.lower()]
@ -752,7 +760,13 @@ class pdfdoc(object):
) )
def datetime_to_xmpdate(dt): def datetime_to_xmpdate(dt):
return dt.strftime("%Y-%m-%dT%H:%M:%S%z") time_no_tz = dt.strftime("%Y-%m-%dT%H:%M:%S")
# Format for `%z` specifier is [+-]HHMM(SS(\.ffffff)?)?, but the
# XMP metadata only accepts the [+-]HHMM part, and it must be
# formatted as [+-]HH:MM.
tz = dt.strftime("%z")
tz_xmp = re.sub(timezone_regex, r'\1\2:\3', tz)
return time_no_tz + tz_xmp
self.xmp = b"""<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?> self.xmp = b"""<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'> <x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>