diff --git a/README.md b/README.md index 8693942..18907ce 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,17 @@ img2pdf ======= -Lossless conversion of images to PDF without unnecessarily re-encoding JPEG -files. Thus, no loss of quality and no unnecessary large output file. +Lossless conversion of images to PDF without unnecessarily re-encoding JPEG and +JPEG2000 files. Thus, no loss of quality and no unnecessarily large output file. background ---------- -PDF is able to embed JPEG images as they are without re-encoding them (and -hence loosing quality) but I was missing a tool to do this automatically, thus -I wrote this piece of python code. +PDF is able to embed JPEG and JPEG2000 images as they are without re-encoding +them (and hence losing quality) but I was missing a tool to do this +automatically, thus I wrote this piece of python code. -If you know how to embed JPEG images into a PDF container without +If you know how to embed JPEG and JPEG2000 images into a PDF container without recompression, using existing tools, please contact me so that I can put this code into the garbage bin :D @@ -19,12 +19,12 @@ functionality ------------- The program will take image filenames from commandline arguments and output a -PDF file with them embedded into it. If the input image is a JPEG file, it will -be included as-is without any processing. If it is in any other format, the -image will be included as zip-encoded RGB. As a result, this tool will be able -to lossless wrap any image into a PDF container while performing better (in -terms of quality/filesize ratio) than existing tools in case the input image is -a JPEG. +PDF file with them embedded into it. If the input image is a JPEG or JPEG2000 +file, it will be included as-is without any processing. If it is in any other +format, the image will be included as zip-encoded RGB. 
As a result, this tool +will be able to losslessly wrap any image into a PDF container while performing +better (in terms of quality/filesize ratio) than existing tools in case the +input image is a JPEG or JPEG2000 file. For the record, the imagemagick command to lossless convert any image to PDF using zip-encoding, is: @@ -32,24 +32,25 @@ PDF using zip-encoding, is: convert input.jpg -compress Zip output.pdf The downside is, that using imagemagick like this will make the resulting PDF -files a few times bigger than the input JPEG and can also not output a -multipage PDF. +files a few times bigger than the input JPEG or JPEG2000 file and can also not +output a multipage PDF. img2pdf is able to output a PDF with multiple pages if more than one input -image is given, losslessly embed JPEGs into a PDF container without adding more -overhead than the PDF structure itself and will save all other graphics formats -using lossless zip-compression. +image is given, losslessly embed JPEG and JPEG2000 files into a PDF container +without adding more overhead than the PDF structure itself and will save all +other graphics formats using lossless zip-compression. bugs ---- -If you find a JPEG that, when embedded can not be read by the Adobe Acrobat -Reader, please contact me. - -For lossless conversion of other formats than JPEG, zip/flate encoding is used. -This choice is based on a number of tests I did on images. I converted them -into PDF using imagemagick and all compressions it has to offer and then -compared the output size of the lossless variants. In all my tests, zip/flate -encoding performed best. You can verify my findings using the test_comp.sh -script with any input image given as a commandline argument. If you find an -input file that is outperformed by another lossless compression, contact me. +If you find a JPEG or JPEG2000 file that, when embedded, can not be read by the +Adobe Acrobat Reader, please contact me. 
+ +For lossless conversion of other formats than JPEG or JPEG2000 files, zip/flate +encoding is used. This choice is based on a number of tests I did on images. +I converted them into PDF using imagemagick and all compressions it has to +offer and then compared the output size of the lossless variants. In all my +tests, zip/flate encoding performed best. You can verify my findings using the +test_comp.sh script with any input image given as a commandline argument. If +you find an input file that is outperformed by another lossless compression, +contact me. diff --git a/img2pdf.py b/img2pdf.py index 85c1f99..6aa2f4c 100644 --- a/img2pdf.py +++ b/img2pdf.py @@ -4,6 +4,7 @@ import Image import sys import zlib import argparse +import struct from datetime import datetime def parse(cont, indent=1): @@ -38,6 +39,8 @@ class obj(): def main(images, dpi, title=None, author=None, creator=None, producer=None, creationdate=None, moddate=None, subject=None, keywords=None): + version = 3 # default pdf version 1.3 + now = datetime.now() info = dict() @@ -67,15 +70,32 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None, pagestuples = list() for im in images: - imgdata = Image.open(im) - width, height = imgdata.size - if dpi: - dpi_x, dpi_y = dpi, dpi + try: + imgdata = Image.open(im) + except IOError: + # test if it is a jpeg2000 image + im.seek(0) + if im.read(12) != "\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A": + print "cannot read input image" + exit(1) + # image is jpeg2000 + imgformat = "JP2" + im.seek(48) + height, width = struct.unpack(">II", im.read(8)) + color = "RGB" # TODO: read real colorspace + if dpi: + dpi_x, dpi_y = dpi, dpi + else: + dpi_x, dpi_y = (96, 96) # TODO: read real dpi else: - dpi_x, dpi_y = imgdata.info.get("dpi", (96, 96)) - pdf_x, pdf_y = 72.0*width/dpi_x, 72.0*height/dpi_y # pdf units = 1/72 inch - imgformat = imgdata.format - color = imgdata.mode + width, height = imgdata.size + if dpi: + dpi_x, dpi_y = dpi, dpi + 
else: + dpi_x, dpi_y = imgdata.info.get("dpi", (96, 96)) + imgformat = imgdata.format + color = imgdata.mode + if color == 'L': color = "/DeviceGray" elif color == 'RGB': @@ -84,11 +104,18 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None, print "unsupported color space:", color exit(1) + pdf_x, pdf_y = 72.0*width/dpi_x, 72.0*height/dpi_y # pdf units = 1/72 inch + # either embed the whole jpeg or deflate the bitmap representation if imgformat is "JPEG": ofilter = [ "/DCTDecode" ] im.seek(0) imgdata = im.read() + elif imgformat is "JP2": + ofilter = [ "/JPXDecode" ] + im.seek(0) + imgdata = im.read() + version = 5 # jpeg2000 needs pdf 1.5 else: ofilter = [ "/FlateDecode" ] imgdata = zlib.compress(imgdata.tostring()) @@ -147,7 +174,7 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None, xreftable = list() - result = "%PDF-1.3\n" + result = "%%PDF-1.%d\n"%version xreftable.append("0000000000 65535 f \n") for o in objects: