From 6c44d8cea60db017c6859ddd198192ea35de7f0f Mon Sep 17 00:00:00 2001 From: Johannes 'josch' Schauer Date: Mon, 20 Aug 2018 10:21:18 +0200 Subject: [PATCH] src/img2pdf.py: add more rationale behind palette encoding --- src/img2pdf.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/img2pdf.py b/src/img2pdf.py index 797bd27..52a6ffb 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -317,6 +317,10 @@ if PY3: string = string.encode('ascii') except UnicodeEncodeError: string = b"\xfe\xff"+string.encode("utf-16-be") + # We should probably encode more here because at least + # ghostscript interprets a carriage return byte (0x0D) as a + # new line byte (0x0A) + # PDF supports: \n, \r, \t, \b and \f string = string.replace(b'\\', b'\\\\') string = string.replace(b'(', b'\\(') string = string.replace(b')', b'\\)') @@ -780,6 +784,15 @@ def parse_png(rawdata): if rawdata[i-4:i] == b"IDAT": pngidat += rawdata[i:i+n] elif rawdata[i-4:i] == b"PLTE": + # This could be as simple as saying "palette = rawdata[i:i+n]" but + # pdfrw only escapes parentheses and backslashes in the raw + # byte stream. But raw carriage return bytes are interpreted as + # line feed bytes by ghostscript. So instead we use the hex string + # format. pdfrw cannot write it but at least ghostscript is happy + # with it. We could also write out the palette in binary format + # (and escape more bytes) but since we cannot use pdfrw anyway, + # we choose the more human-readable variant. + # See https://github.com/pmaupin/pdfrw/issues/147 for j in range(i, i+n, 3): # with int.from_bytes() we would not have to prepend extra # zeroes