From 055b07539708e1fb0f201a5713d8b6a518b4a4f6 Mon Sep 17 00:00:00 2001 From: josch Date: Thu, 29 Mar 2012 11:08:32 +0200 Subject: [PATCH] initial commit --- README.md | 45 ++++++++++++ img2pdf.py | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 243 insertions(+) create mode 100644 README.md create mode 100644 img2pdf.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..d280e41 --- /dev/null +++ b/README.md @@ -0,0 +1,45 @@ +== img2pdf == + +Lossless conversion of images to PDF without unnecessarily re-encoding JPEG +files. Thus, no loss of quality and no unnecessarily large output file. + +PDF is able to embed JPEG images as they are without re-encoding them (and +hence losing quality) but I was missing a tool to do this automatically, thus +I wrote this piece of python code. + +If you know how to embed JPEG images into a PDF container without +recompression, using existing tools, please contact me so that I can put this +code into the garbage bin :D + +The program will take image filenames from commandline arguments and output a +PDF file with them embedded into it. If the input image is a JPEG file, it will +be included as-is without any processing. If it is in any other format, the +image will be included as zip-encoded RGB. As a result, this tool will be able +to losslessly wrap any image into a PDF container while performing better (in +terms of quality/filesize ratio) than existing tools in case the input image is +a JPEG. + +For the record, the imagemagick command to losslessly convert any image to +PDF using zip-encoding, is: + + convert input.jpg -compress Zip output.pdf + +The downside is that using imagemagick like this will make the resulting PDF +files a few times bigger than the input JPEG and also cannot output a +multipage PDF. 
#!/usr/bin/env python
"""Lossless conversion/embedding of images (in)to PDF.

img2pdf is able to output a PDF with multiple pages if more than one input
image is given, losslessly embed JPEGs into a PDF container without adding more
overhead than the PDF structure itself and will save all other graphics formats
using lossless zip-compression.

If you find a JPEG that, when embedded can not be read by the Adobe Acrobat
Reader, please contact me.

For lossless conversion of other formats than JPEG, zip/flate encoding is used.
This choice is based on a number of tests I did on images. I converted them
into PDF using imagemagick and all compressions it has to offer and then
compared the output size of the lossless variants. In all my tests, zip/flate
encoding performed best. You can verify my findings using the test_comp.sh
script with any input image given as a commandline argument. If you find an
input file that is outperformed by another lossless compression, contact me.
"""

import argparse
import sys
import zlib
from datetime import datetime

# Resolution assumed when neither the caller nor the image supplies one.
_DEFAULT_DPI = 96

# PIL image modes that can be embedded, mapped to their PDF colour space.
_COLORSPACES = {
    "L": "/DeviceGray",
    "RGB": "/DeviceRGB",
}


def parse(cont, indent=1):
    """Serialise *cont* into PDF object syntax.

    Supported values:
      * dict           -> ``<< key value ... >>`` dictionary, one entry per
                          line, nested dictionaries indented by 4 spaces
      * bool           -> ``true`` / ``false``
      * int, float     -> ``str(value)``
      * obj            -> indirect reference ``N 0 R``
      * str            -> emitted verbatim (names, pre-formatted strings)
      * list, tuple    -> ``[ item item ... ]`` array

    *indent* is the nesting depth used for dictionary indentation.

    Raises TypeError for any other type, rather than silently writing the
    text "None" into the document.
    """
    if isinstance(cont, obj):
        return "%d 0 R" % cont.get_identifier()
    if isinstance(cont, dict):
        pad = 4 * indent * " "
        entries = ["%s%s %s" % (pad, key, parse(value, indent + 1))
                   for key, value in cont.items()]
        return "<<\n" + "\n".join(entries) + "\n" + 4 * (indent - 1) * " " + ">>"
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(cont, bool):
        return "true" if cont else "false"
    if isinstance(cont, (int, float)):
        return str(cont)
    if isinstance(cont, str):
        return cont
    if isinstance(cont, (list, tuple)):
        return "[ " + " ".join([parse(item, indent) for item in cont]) + " ]"
    raise TypeError("cannot serialise %r to PDF syntax" % (cont,))


class obj(object):
    """An indirect PDF object: a content value plus an optional stream."""

    def __init__(self, content, stream=None):
        # content: any value parse() accepts (normally a dict).
        # stream: raw stream data as a string, or None for a plain object.
        self.content = content
        self.stream = stream

    def tostring(self, identifier):
        """Record *identifier* as the object number and return the
        serialised ``N 0 obj ... endobj`` text."""
        self.identifier = identifier
        head = "%d 0 obj " % identifier + parse(self.content)
        # Compare against None so that an empty stream is still written.
        if self.stream is not None:
            return head + "\nstream\n" + self.stream + "\nendstream\nendobj\n"
        return head + " endobj\n"

    def get_identifier(self):
        """Return the object number; raises Exception if none was set yet."""
        if not hasattr(self, 'identifier'):
            raise Exception("no id set yet, call tostring() on obj first")
        return self.identifier


def positive_float(string):
    """argparse type: parse *string* as a float that must be > 0."""
    value = float(string)
    if value <= 0:
        msg = "%r is not positive" % string
        raise argparse.ArgumentTypeError(msg)
    return value


def valid_date(string):
    """argparse type: parse a ``YYYY-MM-DDTHH:MM:SS`` timestamp."""
    return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S")


def _import_pil_image():
    """Return PIL's Image module (Pillow layout first, classic PIL second)."""
    try:
        from PIL import Image
    except ImportError:
        import Image
    return Image


def _pdf_string(text):
    """Wrap *text* as a PDF literal string, escaping backslash and parens."""
    escaped = text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
    return "(" + escaped + ")"


def _pdf_date(moment):
    """Format a datetime as a PDF date string ``(D:YYYYMMDDHHMMSS)``."""
    return "(D:" + moment.strftime("%Y%m%d%H%M%S") + ")"


def _build_page(pil_image, im, dpi):
    """Turn the open image file *im* into an (image, content, page) obj triple.

    The file is always closed before returning or raising. Exits the process
    with status 1 if the image mode is neither 'L' nor 'RGB'.
    """
    try:
        picture = pil_image.open(im)
        width, height = picture.size

        if dpi:
            dpi_x, dpi_y = dpi, dpi
        else:
            dpi_x, dpi_y = picture.info.get("dpi", (_DEFAULT_DPI, _DEFAULT_DPI))
            # A stored density of 0 would otherwise divide by zero below.
            if not (dpi_x > 0 and dpi_y > 0):
                dpi_x, dpi_y = _DEFAULT_DPI, _DEFAULT_DPI
        # pdf units = 1/72 inch
        pdf_x, pdf_y = 72.0 * width / dpi_x, 72.0 * height / dpi_y

        colorspace = _COLORSPACES.get(picture.mode)
        if colorspace is None:
            # stdout may be the PDF destination, so report on stderr.
            sys.stderr.write("unsupported color space: %s\n" % picture.mode)
            sys.exit(1)

        # either embed the whole jpeg or deflate the bitmap representation
        if picture.format == "JPEG":
            ofilter = ["/DCTDecode"]
            im.seek(0)
            data = im.read()
        else:
            ofilter = ["/FlateDecode"]
            # tobytes() is the current name; old PIL only has tostring().
            dump = getattr(picture, "tobytes", None) or picture.tostring
            data = zlib.compress(dump())
    finally:
        im.close()

    image = obj({
        "/Type": "/XObject",
        "/Subtype": "/Image",
        "/Filter": ofilter,
        "/Width": width,
        "/Height": height,
        "/ColorSpace": colorspace,
        # hardcoded as PIL doesnt provide bits for non-jpeg formats
        "/BitsPerComponent": 8,
        "/Length": len(data),
    }, data)

    # Scale the unit square to the page size and paint the image into it.
    text = "q\n%f 0 0 %f 0 0 cm\n/Im0 Do\nQ" % (pdf_x, pdf_y)
    content = obj({"/Length": len(text)}, text)

    page = obj({
        "/Type": "/Page",
        "/Resources": {
            "/XObject": {
                "/Im0": image,
            },
        },
        "/MediaBox": [0, 0, pdf_x, pdf_y],
        "/Contents": content,
    })
    return image, content, page


def main(images, dpi, title=None, author=None, creator=None, producer=None,
         creationdate=None, moddate=None, subject=None, keywords=None):
    """Build a PDF with one page per input image and return it as a string.

    images       -- iterable of open, readable (and for JPEG seekable) file
                    objects; each one is closed after it has been read
    dpi          -- resolution applied to every image; if falsy, the image's
                    own "dpi" info is used, falling back to 96
    title, author, creator, producer, subject -- optional metadata strings
    creationdate, moddate -- optional datetime objects (default: now)
    keywords     -- optional list of strings, joined with ","

    The document is assembled by string concatenation, so the image data
    read from the files must be of type str (as it is on Python 2).

    Exits with status 1 if an image uses a mode other than 'L' or 'RGB'.
    """
    pil_image = _import_pil_image()
    now = datetime.now()

    info = {}
    if title:
        info["/Title"] = _pdf_string(title)
    if author:
        info["/Author"] = _pdf_string(author)
    if creator:
        info["/Creator"] = _pdf_string(creator)
    if producer:
        info["/Producer"] = _pdf_string(producer)
    info["/CreationDate"] = _pdf_date(creationdate or now)
    info["/ModDate"] = _pdf_date(moddate or now)
    if subject:
        info["/Subject"] = _pdf_string(subject)
    if keywords:
        info["/Keywords"] = _pdf_string(",".join(keywords))
    info = obj(info)

    pagestuples = [_build_page(pil_image, im, dpi) for im in images]

    pages = obj({
        "/Type": "/Pages",
        "/Kids": [page for _, _, page in pagestuples],
        "/Count": len(pagestuples),
    })
    # Every page object must point back at its parent page tree node.
    for _, _, page in pagestuples:
        page.content["/Parent"] = pages

    catalog = obj({
        "/Pages": pages,
        "/Type": "/Catalog",
    })

    # File order: catalog, page tree, then (page, content, image) per page,
    # and the info dictionary last.
    ordered = [catalog, pages]
    for image, content, page in pagestuples:
        ordered.extend((page, content, image))
    ordered.append(info)

    # Number everything before serialising so that references to objects
    # written later in the file (e.g. /Parent, /Kids) can be resolved.
    for number, item in enumerate(ordered, 1):
        item.identifier = number

    header = "%PDF-1.3\n"
    chunks = [header]
    offset = len(header)
    xreftable = ["0000000000 65535 f \n"]
    for number, item in enumerate(ordered, 1):
        xreftable.append("%010d 00000 n \n" % offset)
        serialised = item.tostring(number)
        chunks.append(serialised)
        offset += len(serialised)

    xrefoffset = offset
    chunks.append("xref\n")
    chunks.append("0 %d\n" % len(xreftable))
    chunks.extend(xreftable)
    chunks.append("trailer\n")
    chunks.append(parse({"/Size": len(xreftable), "/Info": info, "/Root": catalog}) + "\n")
    chunks.append("startxref\n")
    chunks.append("%d\n" % xrefoffset)
    # Plain literal, not a format string: the marker is two percent signs.
    chunks.append("%%EOF\n")

    return "".join(chunks)


def _build_parser():
    """Create the command line parser for the script entry point."""
    parser = argparse.ArgumentParser(description='lossless conversion/embedding of images (in)to pdf')
    # Image and PDF data are binary, so open both ends in binary mode.
    parser.add_argument('images', metavar='infile', type=argparse.FileType('rb'),
                        nargs='*', default=[sys.stdin], help='input file(s) (default: stdin)')
    parser.add_argument('-o', '--output', metavar='out', type=argparse.FileType('wb'),
                        default=sys.stdout, help='output file (default: stdout)')
    parser.add_argument('-d', '--dpi', metavar='dpi', type=positive_float, help='dpi for pdf output (default: 96.0)')
    parser.add_argument('-t', '--title', metavar='title', type=str, help='title for metadata')
    parser.add_argument('-a', '--author', metavar='author', type=str, help='author for metadata')
    parser.add_argument('-c', '--creator', metavar='creator', type=str, help='creator for metadata')
    parser.add_argument('-p', '--producer', metavar='producer', type=str, help='producer for metadata')
    parser.add_argument('-r', '--creationdate', metavar='creationdate',
                        type=valid_date, help='creation date for metadata in YYYY-MM-DDTHH:MM:SS format')
    parser.add_argument('-m', '--moddate', metavar='moddate',
                        type=valid_date, help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format')
    parser.add_argument('-s', '--subject', metavar='subject', type=str, help='subject for metadata')
    parser.add_argument('-k', '--keywords', metavar='kw', type=str, nargs='+', help='keywords for metadata')
    return parser


if __name__ == "__main__":
    args = _build_parser().parse_args()
    args.output.write(main(args.images, args.dpi, args.title, args.author,
                           args.creator, args.producer, args.creationdate, args.moddate,
                           args.subject, args.keywords))