initial commit

12 years ago · 055b075397
commit 055b075397
2 changed files with 243 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,45 @@
+== img2pdf ==
+
+Lossless conversion of images to PDF without unnecessarily re-encoding JPEG
+files. Thus, no loss of quality and no unnecessarily large output file.
+
+PDF is able to embed JPEG images as they are without re-encoding them (and
+hence losing quality), but I was missing a tool to do this automatically, so
+I wrote this piece of Python code.
+
+If you know how to embed JPEG images into a PDF container without
+recompression, using existing tools, please contact me so that I can put this
+code into the garbage bin :D
+
+The program will take image filenames from command-line arguments and output a
+PDF file with them embedded into it. If the input image is a JPEG file, it will
+be included as-is without any processing. If it is in any other format, the
+image will be included as zip-encoded RGB. As a result, this tool will be able
+to losslessly wrap any image into a PDF container while performing better (in
+terms of quality/filesize ratio) than existing tools in case the input image is
+a JPEG.
+
+For the record, the imagemagick command to losslessly convert any image to
+PDF using zip-encoding is:
+
+	convert input.jpg -compress Zip output.pdf
+
+The downside is that using imagemagick like this will make the resulting PDF
+files a few times bigger than the input JPEG, and it also cannot output a
+multipage PDF.
+
+img2pdf is able to output a PDF with multiple pages if more than one input
+image is given, losslessly embed JPEGs into a PDF container without adding more
+overhead than the PDF structure itself and will save all other graphics formats
+using lossless zip-compression.
+
+If you find a JPEG that, when embedded, cannot be read by the Adobe Acrobat
+Reader, please contact me.
+
+For lossless conversion of formats other than JPEG, zip/flate encoding is used.
+This choice is based on a number of tests I did on images.  I converted them
+into PDF using imagemagick and all compressions it has to offer and then
+compared the output size of the lossless variants. In all my tests, zip/flate
+encoding performed best. You can verify my findings using the test_comp.sh
+script with any input image given as a commandline argument. If you find an
+input file that is outperformed by another lossless compression, contact me.
--- a/img2pdf.py
+++ b/img2pdf.py
@ -0,0 +1,198 @@
+#!/usr/bin/env python
+
+import Image
+import sys
+import zlib
+import argparse
+from datetime import datetime
+
+def parse(cont, indent=1):
+    """Serialize a Python value into PDF object syntax and return it as a string.
+
+    dict  -> "<< ... >>" dictionary with one "key value" pair per line; the
+             pairs are indented by 4*indent spaces and nested dictionaries
+             are serialized with indent+1
+    list  -> "[ ... ]" array, elements separated by single spaces
+    bool  -> "true" / "false"
+    int, float -> str() of the number
+    str   -> returned verbatim; callers pass ready-made PDF tokens such as
+             "/Name" or "(text)"
+    obj   -> indirect reference "N 0 R" using the object's identifier
+
+    Raises TypeError for any other type, so that an unsupported value can
+    never end up in the output as the text "None".
+    """
+    if isinstance(cont, dict):
+        pad = 4*indent*" "
+        entries = [pad+"%s %s"%(k, parse(v, indent+1)) for k, v in cont.items()]
+        return "<<\n"+"\n".join(entries)+"\n"+4*(indent-1)*" "+">>"
+    # bool is a subclass of int, so it has to be tested before the numbers
+    if isinstance(cont, bool):
+        return "true" if cont else "false"
+    if isinstance(cont, (int, float)):
+        return str(cont)
+    if isinstance(cont, str):
+        return cont
+    if isinstance(cont, list):
+        return "[ "+" ".join([parse(c, indent) for c in cont])+" ]"
+    if isinstance(cont, obj):
+        return "%d 0 R"%cont.get_identifier()
+    raise TypeError("cannot serialize %r as a pdf object"%type(cont))
+
+class obj(object):
+    """One indirect PDF object: a value plus an optional stream.
+
+    The object number is not known when the object is created.  It is
+    assigned by tostring() and is available afterwards through
+    get_identifier(), which parse() uses to emit "N 0 R" references.
+    """
+    def __init__(self, content, stream=None):
+        # content: the value that tostring() hands to parse()
+        # stream: raw stream data, or None for an object without a stream
+        self.content = content
+        self.stream = stream
+
+    def tostring(self, identifier):
+        """Remember identifier as the object number and return the serialized object."""
+        self.identifier = identifier
+        head = "%d 0 obj "%identifier+parse(self.content)
+        # compare against None: an empty stream must still be written as a
+        # stream, otherwise the object would carry a /Length without a body
+        if self.stream is not None:
+            return head+"\nstream\n"+self.stream+"\nendstream\nendobj\n"
+        return head+" endobj\n"
+
+    def get_identifier(self):
+        """Return the object number; raises Exception if none was assigned yet."""
+        if not hasattr(self, 'identifier'):
+            raise Exception("no id set yet, call tostring() on obj first")
+        return self.identifier
+
+def _pdf_string(text):
+    """Return text as a PDF literal string, i.e. wrapped in parentheses.
+
+    Backslashes and parentheses are backslash-escaped so that the text
+    cannot terminate the literal string early.
+    """
+    escaped = text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
+    return "("+escaped+")"
+
+def main(images, dpi, title=None, author=None, creator=None, producer=None,
+    creationdate=None, moddate=None, subject=None, keywords=None):
+    """Build a PDF with one page per input image and return it as a string.
+
+    images: open file objects; each one is handed to Image.open(), read
+        (JPEG input is rewound with seek(0) first) and then closed
+    dpi: resolution used to size every page; if falsy, the "dpi" entry of
+        the image info is used, falling back to 96
+    title, author, creator, producer, subject: optional strings for the
+        document information dictionary
+    creationdate, moddate: optional datetime objects (default: now)
+    keywords: optional list of strings, stored joined by commas
+
+    JPEG input is embedded unchanged (/DCTDecode); every other format is
+    stored as zlib-compressed pixel data (/FlateDecode).  Only the image
+    modes 'L' and 'RGB' are supported; any other mode writes a message to
+    stderr and exits with status 1.
+    """
+
+    now = datetime.now()
+    datefmt = "%Y%m%d%H%M%S"
+
+    info = dict()
+    if title:
+        info["/Title"] = _pdf_string(title)
+    if author:
+        info["/Author"] = _pdf_string(author)
+    if creator:
+        info["/Creator"] = _pdf_string(creator)
+    if producer:
+        info["/Producer"] = _pdf_string(producer)
+    # both dates are always written, defaulting to the current time
+    info["/CreationDate"] = "(D:"+(creationdate or now).strftime(datefmt)+")"
+    info["/ModDate"] = "(D:"+(moddate or now).strftime(datefmt)+")"
+    if subject:
+        info["/Subject"] = _pdf_string(subject)
+    if keywords:
+        info["/Keywords"] = _pdf_string(",".join(keywords))
+
+    info = obj(info)
+
+    colorspaces = {'L': "/DeviceGray", 'RGB': "/DeviceRGB"}
+
+    pagestuples = list()
+
+    for im in images:
+        imgdata = Image.open(im)
+        width, height = imgdata.size
+        if dpi:
+            dpi_x, dpi_y = dpi, dpi
+        else:
+            dpi_x, dpi_y = imgdata.info.get("dpi", (96, 96))
+            # guard against a zero resolution in the image metadata, which
+            # would otherwise divide by zero below
+            if not dpi_x or not dpi_y:
+                dpi_x, dpi_y = 96, 96
+        pdf_x, pdf_y = 72.0*width/dpi_x, 72.0*height/dpi_y # pdf units = 1/72 inch
+        imgformat = imgdata.format
+        mode = imgdata.mode
+        if mode not in colorspaces:
+            # report on stderr: stdout may be the destination of the pdf
+            sys.stderr.write("unsupported color space: %s\n"%mode)
+            sys.exit(1)
+        color = colorspaces[mode]
+
+        # either embed the whole jpeg or deflate the bitmap representation
+        # (compare by value: identity of equal strings is not guaranteed)
+        if imgformat == "JPEG":
+            ofilter = [ "/DCTDecode" ]
+            im.seek(0)
+            imgdata = im.read()
+        else:
+            ofilter = [ "/FlateDecode" ]
+            imgdata = zlib.compress(imgdata.tostring())
+        im.close()
+
+        image = obj({
+            "/Type": "/XObject",
+            "/Subtype": "/Image",
+            "/Filter": ofilter,
+            "/Width": width,
+            "/Height": height,
+            "/ColorSpace": color,
+            "/BitsPerComponent": 8, # hardcoded as PIL doesnt provide bits for non-jpeg formats
+            "/Length": len(imgdata)
+        }, imgdata)
+
+        # scale the unit square to the page size and paint the image into it
+        text = "q\n%f 0 0 %f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)
+
+        content = obj({
+            "/Length": len(text)
+        }, text)
+
+        page = obj({
+            "/Type": "/Page",
+            "/Resources": {
+                "/XObject": {
+                    "/Im0": image
+                }
+            },
+            "/MediaBox": [0, 0, pdf_x, pdf_y],
+            "/Contents": content
+        })
+
+        pagestuples.append((image, content, page))
+
+    pages = obj({
+        "/Type": "/Pages",
+        "/Kids": [ pagetuple[2] for pagetuple in pagestuples ],
+        "/Count": len(pagestuples)
+    })
+
+    # every page has to point back at the page tree node it belongs to
+    for pagetuple in pagestuples:
+        pagetuple[2].content["/Parent"] = pages
+
+    catalog = obj({
+        "/Pages": pages,
+        "/Type": "/Catalog"
+    })
+
+    # object numbers: 1 catalog, 2 pages, then page/content/image for each
+    # input image in order, and the info dictionary last
+    numbered = [(1, catalog), (2, pages)]
+    for i, (image, content, page) in enumerate(pagestuples):
+        numbered.append((3*i+3, page))
+        numbered.append((3*i+4, content))
+        numbered.append((3*i+5, image))
+    numbered.append((3*len(pagestuples)+3, info))
+
+    # the objects reference each other in both directions (/Kids, /Parent),
+    # so all numbers have to be known before the first one is serialized
+    for identifier, o in numbered:
+        o.identifier = identifier
+
+    chunks = ["%PDF-1.3\n"]
+    offset = len(chunks[0])
+
+    # the cross reference table holds the byte offset of every object,
+    # preceded by the mandatory free entry for object 0
+    xreftable = ["0000000000 65535 f \n"]
+    for identifier, o in numbered:
+        serialized = o.tostring(identifier)
+        xreftable.append("%010d 00000 n \n"%offset)
+        chunks.append(serialized)
+        offset += len(serialized)
+
+    xrefoffset = offset
+    chunks.append("xref\n")
+    chunks.append("0 %d\n"%len(xreftable))
+    chunks.extend(xreftable)
+    chunks.append("trailer\n")
+    chunks.append(parse({"/Size": len(xreftable), "/Info": info, "/Root": catalog})+"\n")
+    chunks.append("startxref\n")
+    chunks.append("%d\n"%xrefoffset)
+    chunks.append("%%EOF\n")
+
+    return "".join(chunks)
+
+if __name__ == "__main__":
+    # command line entry point: parse the options, convert the input images
+    # and write the resulting pdf to the chosen output
+    parser = argparse.ArgumentParser(description='lossless conversion/embedding of images (in)to pdf')
+    # images and pdf are binary data, so the files are opened in binary mode
+    parser.add_argument('images', metavar='infile', type=argparse.FileType('rb'),
+                        nargs='*', default=[sys.stdin], help='input file(s) (default: stdin)')
+    parser.add_argument('-o', '--output', metavar='out', type=argparse.FileType('wb'),
+                        default=sys.stdout, help='output file (default: stdout)')
+    def positive_float(string):
+        # argparse type: a float strictly greater than zero
+        value = float(string)
+        # "not value > 0" also rejects nan, which "value <= 0" lets through
+        if not value > 0:
+            msg = "%r is not positive"%string
+            raise argparse.ArgumentTypeError(msg)
+        return value
+    parser.add_argument('-d', '--dpi', metavar='dpi', type=positive_float,
+                        help='dpi for pdf output (default: dpi of the image, else 96.0)')
+    parser.add_argument('-t', '--title', metavar='title', type=str, help='title for metadata')
+    parser.add_argument('-a', '--author', metavar='author', type=str, help='author for metadata')
+    parser.add_argument('-c', '--creator', metavar='creator', type=str, help='creator for metadata')
+    parser.add_argument('-p', '--producer', metavar='producer', type=str, help='producer for metadata')
+    def valid_date(string):
+        # argparse type: a datetime given as YYYY-MM-DDTHH:MM:SS
+        return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S")
+    parser.add_argument('-r', '--creationdate', metavar='creationdate',
+        type=valid_date, help='creation date for metadata in YYYY-MM-DDTHH:MM:SS format')
+    parser.add_argument('-m', '--moddate', metavar='moddate',
+        type=valid_date, help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format')
+    parser.add_argument('-s', '--subject', metavar='subject', type=str, help='subject for metadata')
+    parser.add_argument('-k', '--keywords', metavar='kw', type=str, nargs='+', help='keywords for metadata')
+    args = parser.parse_args()
+    args.output.write(main(args.images, args.dpi, args.title, args.author,
+        args.creator, args.producer, args.creationdate, args.moddate,
+        args.subject, args.keywords))