From 317a0ee7f217a4820435f947eaf8242232c6c68e Mon Sep 17 00:00:00 2001
From: josch <j.schauer@email.de>
Date: Fri, 13 Mar 2015 11:43:38 +0100
Subject: [PATCH] do not encode as utf8 as pdf is ascii, add safer handling
 across py2/py3

---
 src/img2pdf.py | 137 +++++++++++++++++++++++++++----------------------
 1 file changed, 75 insertions(+), 62 deletions(-)

diff --git a/src/img2pdf.py b/src/img2pdf.py
index b9ffd8c..91348af 100755
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -46,20 +46,22 @@ def warning_out(message):
 def parse(cont, indent=1):
     if type(cont) is dict:
         return b"<<\n"+b"\n".join(
-            [4 * indent * b" " + k.encode("utf8") + b" " + parse(v, indent+1)
+            [4 * indent * b" " + k + b" " + parse(v, indent+1)
              for k, v in sorted(cont.items())])+b"\n"+4*(indent-1)*b" "+b">>"
     elif type(cont) is int:
-        return str(cont).encode("utf8")
+        return str(cont).encode()
     elif type(cont) is float:
-        return ("%0.4f"%cont).encode("utf8")
+        return ("%0.4f"%cont).encode()
     elif isinstance(cont, obj):
-        return ("%d 0 R"%cont.identifier).encode("utf8")
-    elif type(cont) is str:
-        return cont.encode("utf8")
-    elif type(cont) is bytes:
+        return ("%d 0 R"%cont.identifier).encode()
+    elif type(cont) is str or type(cont) is bytes:
+        if type(cont) is str and type(cont) is not bytes:
+            raise Exception("parse must be passed a bytes object in py3")
         return cont
     elif type(cont) is list:
         return b"[ "+b" ".join([parse(c, indent) for c in cont])+b" ]"
+    else:
+        raise Exception("cannot handle type %s"%type(cont))
 
 class obj(object):
     def __init__(self, content, stream=None):
@@ -69,11 +71,11 @@ class obj(object):
     def tostring(self):
         if self.stream:
             return (
-                ("%d 0 obj " % self.identifier).encode("utf8") +
+                ("%d 0 obj " % self.identifier).encode() +
                 parse(self.content) +
                 b"\nstream\n" + self.stream + b"\nendstream\nendobj\n")
         else:
-            return ("%d 0 obj "%self.identifier).encode("utf8")+parse(self.content)+b" endobj\n"
+            return ("%d 0 obj "%self.identifier).encode()+parse(self.content)+b" endobj\n"
 
 class pdfdoc(object):
 
@@ -86,39 +88,39 @@ class pdfdoc(object):
 
         info = {}
         if title:
-            info["/Title"] = "("+title+")"
+            info[b"/Title"] = b"("+title+b")"
         if author:
-            info["/Author"] = "("+author+")"
+            info[b"/Author"] = b"("+author+b")"
         if creator:
-            info["/Creator"] = "("+creator+")"
+            info[b"/Creator"] = b"("+creator+b")"
         if producer:
-            info["/Producer"] = "("+producer+")"
+            info[b"/Producer"] = b"("+producer+b")"
         if creationdate:
-            info["/CreationDate"] = "(D:"+creationdate.strftime("%Y%m%d%H%M%S")+")"
+            info[b"/CreationDate"] = b"(D:"+creationdate.strftime("%Y%m%d%H%M%S").encode()+b")"
         elif not nodate:
-            info["/CreationDate"] = "(D:"+now.strftime("%Y%m%d%H%M%S")+")"
+            info[b"/CreationDate"] = b"(D:"+now.strftime("%Y%m%d%H%M%S").encode()+b")"
         if moddate:
-            info["/ModDate"] = "(D:"+moddate.strftime("%Y%m%d%H%M%S")+")"
+            info[b"/ModDate"] = b"(D:"+moddate.strftime("%Y%m%d%H%M%S").encode()+b")"
         elif not nodate:
-            info["/ModDate"] = "(D:"+now.strftime("%Y%m%d%H%M%S")+")"
+            info[b"/ModDate"] = b"(D:"+now.strftime("%Y%m%d%H%M%S").encode()+b")"
         if subject:
-            info["/Subject"] = "("+subject+")"
+            info[b"/Subject"] = b"("+subject+b")"
         if keywords:
-            info["/Keywords"] = "("+",".join(keywords)+")"
+            info[b"/Keywords"] = b"("+b",".join(keywords)+b")"
 
         self.info = obj(info)
 
         # create an incomplete pages object so that a /Parent entry can be
         # added to each page
         self.pages = obj({
-            "/Type": "/Pages",
-            "/Kids": [],
-            "/Count": 0
+            b"/Type": b"/Pages",
+            b"/Kids": [],
+            b"/Count": 0
         })
 
         self.catalog = obj({
-            "/Pages": self.pages,
-            "/Type": "/Catalog"
+            b"/Pages": self.pages,
+            b"/Type": b"/Catalog"
         })
         self.addobj(self.catalog)
         self.addobj(self.pages)
@@ -130,11 +132,11 @@ class pdfdoc(object):
 
     def addimage(self, color, width, height, imgformat, imgdata, pdf_x, pdf_y):
         if color == 'L':
-            colorspace = "/DeviceGray"
+            colorspace = b"/DeviceGray"
         elif color == 'RGB':
-            colorspace = "/DeviceRGB"
+            colorspace = b"/DeviceRGB"
         elif color == 'CMYK' or color == 'CMYK;I':
-            colorspace = "/DeviceCMYK"
+            colorspace = b"/DeviceCMYK"
         else:
             error_out("unsupported color space: %s"%color)
             exit(1)
@@ -144,47 +146,47 @@ class pdfdoc(object):
 
         # either embed the whole jpeg or deflate the bitmap representation
         if imgformat is "JPEG":
-            ofilter = [ "/DCTDecode" ]
+            ofilter = [ b"/DCTDecode" ]
         elif imgformat is "JPEG2000":
-            ofilter = [ "/JPXDecode" ]
+            ofilter = [ b"/JPXDecode" ]
             self.version = 5 # jpeg2000 needs pdf 1.5
         else:
-            ofilter = [ "/FlateDecode" ]
+            ofilter = [ b"/FlateDecode" ]
         image = obj({
-            "/Type": "/XObject",
-            "/Subtype": "/Image",
-            "/Filter": ofilter,
-            "/Width": width,
-            "/Height": height,
-            "/ColorSpace": colorspace,
+            b"/Type": b"/XObject",
+            b"/Subtype": b"/Image",
+            b"/Filter": ofilter,
+            b"/Width": width,
+            b"/Height": height,
+            b"/ColorSpace": colorspace,
             # hardcoded as PIL doesnt provide bits for non-jpeg formats
-            "/BitsPerComponent": 8,
-            "/Length": len(imgdata)
+            b"/BitsPerComponent": 8,
+            b"/Length": len(imgdata)
         }, imgdata)
 
         if color == 'CMYK;I':
             # Inverts all four channels
-            image.content['/Decode'] = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]
+            image.content[b'/Decode'] = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]
 
-        text = ("q\n%0.4f 0 0 %0.4f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)).encode('utf8')
+        text = ("q\n%0.4f 0 0 %0.4f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)).encode()
 
         content = obj({
-            "/Length": len(text)
+            b"/Length": len(text)
         }, text)
 
         page = obj({
-            "/Type": "/Page",
-            "/Parent": self.pages,
-            "/Resources": {
-                "/XObject": {
-                    "/Im0": image
+            b"/Type": b"/Page",
+            b"/Parent": self.pages,
+            b"/Resources": {
+                b"/XObject": {
+                    b"/Im0": image
                 }
             },
-            "/MediaBox": [0, 0, pdf_x, pdf_y],
-            "/Contents": content
+            b"/MediaBox": [0, 0, pdf_x, pdf_y],
+            b"/Contents": content
         })
-        self.pages.content["/Kids"].append(page)
-        self.pages.content["/Count"] += 1
+        self.pages.content[b"/Kids"].append(page)
+        self.pages.content[b"/Count"] += 1
         self.addobj(page)
         self.addobj(content)
         self.addobj(image)
@@ -195,22 +197,22 @@ class pdfdoc(object):
 
         xreftable = list()
 
-        result = ("%%PDF-1.%d\n"%self.version).encode("utf8")
+        result = ("%%PDF-1.%d\n"%self.version).encode()
 
         xreftable.append(b"0000000000 65535 f \n")
         for o in self.objects:
-            xreftable.append(("%010d 00000 n \n"%len(result)).encode("utf8"))
+            xreftable.append(("%010d 00000 n \n"%len(result)).encode())
             result += o.tostring()
 
         xrefoffset = len(result)
         result += b"xref\n"
-        result += ("0 %d\n"%len(xreftable)).encode("utf8")
+        result += ("0 %d\n"%len(xreftable)).encode()
         for x in xreftable:
             result += x
         result += b"trailer\n"
-        result += parse({"/Size": len(xreftable), "/Info": self.info, "/Root": self.catalog})+b"\n"
+        result += parse({b"/Size": len(xreftable), b"/Info": self.info, b"/Root": self.catalog})+b"\n"
         result += b"startxref\n"
-        result += ("%d\n"%xrefoffset).encode("utf8")
+        result += ("%d\n"%xrefoffset).encode()
         result += b"%%EOF\n"
         return result
 
@@ -508,6 +510,17 @@ def valid_size(string):
 
     return (x, y, pagesize_options)
 
+# In python3, the received argument is a unicode str() object, which needs
+# to be encoded into a bytes() object.
+# In python2, the received argument is a binary str() object, which needs
+# no encoding.
+# We tell python2 from python3 by checking whether the argument is both of
+# type str and of type bytes (only possible in python2, where bytes is str).
+def pdf_embedded_string(string):
+    if type(string) is str and type(string) is not bytes:
+        string = string.encode("utf8")
+    return string
+
 parser = argparse.ArgumentParser(
     description='Lossless conversion/embedding of images (in)to pdf')
 parser.add_argument(
@@ -540,16 +553,16 @@ sizeopts.add_argument(
 )
 
 parser.add_argument(
-    '-t', '--title', metavar='title', type=str,
+    '-t', '--title', metavar='title', type=pdf_embedded_string,
     help='title for metadata')
 parser.add_argument(
-    '-a', '--author', metavar='author', type=str,
+    '-a', '--author', metavar='author', type=pdf_embedded_string,
     help='author for metadata')
 parser.add_argument(
-    '-c', '--creator', metavar='creator', type=str,
+    '-c', '--creator', metavar='creator', type=pdf_embedded_string,
     help='creator for metadata')
 parser.add_argument(
-    '-p', '--producer', metavar='producer', type=str,
+    '-p', '--producer', metavar='producer', type=pdf_embedded_string,
     help='producer for metadata')
 parser.add_argument(
     '-r', '--creationdate', metavar='creationdate', type=valid_date,
@@ -558,13 +571,13 @@ parser.add_argument(
     '-m', '--moddate', metavar='moddate', type=valid_date,
     help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format')
 parser.add_argument(
-    '-S', '--subject', metavar='subject', type=str,
+    '-S', '--subject', metavar='subject', type=pdf_embedded_string,
     help='subject for metadata')
 parser.add_argument(
-    '-k', '--keywords', metavar='kw', type=str, nargs='+',
+    '-k', '--keywords', metavar='kw', type=pdf_embedded_string, nargs='+',
     help='keywords for metadata')
 parser.add_argument(
-    '-C', '--colorspace', metavar='colorspace', type=str,
+    '-C', '--colorspace', metavar='colorspace', type=pdf_embedded_string,
     help='force PIL colorspace (one of: RGB, L, 1, CMYK, CMYK;I)')
 parser.add_argument(
     '-D', '--nodate', help='do not add timestamps', action="store_true")