JPEG2000 support

2012-03-29 11:53:57 +02:00 · 2012-03-29 11:53:57 +02:00 · 0bd841c530
commit 0bd841c530
parent 571266a513
2 changed files with 63 additions and 35 deletions
--- a/README.md
+++ b/README.md
@ -1,17 +1,17 @@
 img2pdf
 =======

-Lossless conversion of images to PDF without unnecessarily re-encoding JPEG
-files. Thus, no loss of quality and no unnecessary large output file.
+Lossless conversion of images to PDF without unnecessarily re-encoding JPEG and
+JPEG2000 files. Thus, no loss of quality and no unnecessary large output file.

 background
 ----------

-PDF is able to embed JPEG images as they are without re-encoding them (and
-hence loosing quality) but I was missing a tool to do this automatically, thus
-I wrote this piece of python code.
+PDF is able to embed JPEG and JPEG2000 images as they are without re-encoding
+them (and hence losing quality) but I was missing a tool to do this
+automatically, thus I wrote this piece of python code.

-If you know how to embed JPEG images into a PDF container without
+If you know how to embed JPEG and JPEG2000 images into a PDF container without
 recompression, using existing tools, please contact me so that I can put this
 code into the garbage bin :D

@ -19,12 +19,12 @@ functionality
 -------------

 The program will take image filenames from commandline arguments and output a
-PDF file with them embedded into it. If the input image is a JPEG file, it will
-be included as-is without any processing. If it is in any other format, the
-image will be included as zip-encoded RGB. As a result, this tool will be able
-to lossless wrap any image into a PDF container while performing better (in
-terms of quality/filesize ratio) than existing tools in case the input image is
-a JPEG.
+PDF file with them embedded into it. If the input image is a JPEG or JPEG2000
+file, it will be included as-is without any processing. If it is in any other
+format, the image will be included as zip-encoded RGB. As a result, this tool
+will be able to losslessly wrap any image into a PDF container while performing
+better (in terms of quality/filesize ratio) than existing tools in case the
+input image is a JPEG or JPEG2000 file.

 For the record, the imagemagick command to lossless convert any image to
 PDF using zip-encoding, is:
@ -32,24 +32,25 @@ PDF using zip-encoding, is:
 	convert input.jpg -compress Zip output.pdf

 The downside is, that using imagemagick like this will make the resulting PDF
-files a few times bigger than the input JPEG and can also not output a
-multipage PDF.
+files a few times bigger than the input JPEG or JPEG2000 file and also cannot
+output a multipage PDF.

 img2pdf is able to output a PDF with multiple pages if more than one input
-image is given, losslessly embed JPEGs into a PDF container without adding more
-overhead than the PDF structure itself and will save all other graphics formats
-using lossless zip-compression.
+image is given, losslessly embed JPEG and JPEG2000 files into a PDF container
+without adding more overhead than the PDF structure itself and will save all
+other graphics formats using lossless zip-compression.

 bugs
 ----

-If you find a JPEG that, when embedded can not be read by the Adobe Acrobat
-Reader, please contact me.
+If you find a JPEG or JPEG2000 file that, when embedded, cannot be read by the
+Adobe Acrobat Reader, please contact me.

-For lossless conversion of other formats than JPEG, zip/flate encoding is used.
-This choice is based on a number of tests I did on images.  I converted them
-into PDF using imagemagick and all compressions it has to offer and then
-compared the output size of the lossless variants. In all my tests, zip/flate
-encoding performed best. You can verify my findings using the test_comp.sh
-script with any input image given as a commandline argument. If you find an
-input file that is outperformed by another lossless compression, contact me.
+For lossless conversion of formats other than JPEG or JPEG2000, zip/flate
+encoding is used.  This choice is based on a number of tests I did on images.
+I converted them into PDF using imagemagick and all compressions it has to
+offer and then compared the output size of the lossless variants. In all my
+tests, zip/flate encoding performed best. You can verify my findings using the
+test_comp.sh script with any input image given as a commandline argument. If
+you find an input file that is outperformed by another lossless compression,
+contact me.
--- a/img2pdf.py
+++ b/img2pdf.py
@ -4,6 +4,7 @@ import Image
 import sys
 import zlib
 import argparse
+import struct
 from datetime import datetime

 def parse(cont, indent=1):
@ -38,6 +39,8 @@ class obj():
 def main(images, dpi, title=None, author=None, creator=None, producer=None,
    creationdate=None, moddate=None, subject=None, keywords=None):

+    version = 3 # default pdf version 1.3
+
    now = datetime.now()

    info = dict()
@ -67,15 +70,32 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None,
    pagestuples = list()

    for im in images:
-        imgdata = Image.open(im)
-        width, height = imgdata.size
-        if dpi:
-            dpi_x, dpi_y = dpi, dpi
+        try:
+            imgdata = Image.open(im)
+        except IOError:
+            # test if it is a jpeg2000 image
+            im.seek(0)
+            if im.read(12) != "\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
+                print "cannot read input image"
+                exit(1)
+            # image is jpeg2000
+            imgformat = "JP2"
+            im.seek(48)
+            height, width = struct.unpack(">II", im.read(8))
+            color = "RGB" # TODO: read real colorspace
+            if dpi:
+                dpi_x, dpi_y = dpi, dpi
+            else:
+                dpi_x, dpi_y = (96, 96) # TODO: read real dpi
        else:
-            dpi_x, dpi_y = imgdata.info.get("dpi", (96, 96))
-        pdf_x, pdf_y = 72.0*width/dpi_x, 72.0*height/dpi_y # pdf units = 1/72 inch
-        imgformat = imgdata.format
-        color = imgdata.mode
+            width, height = imgdata.size
+            if dpi:
+                dpi_x, dpi_y = dpi, dpi
+            else:
+                dpi_x, dpi_y = imgdata.info.get("dpi", (96, 96))
+            imgformat = imgdata.format
+            color = imgdata.mode
+
        if color == 'L':
            color = "/DeviceGray"
        elif color == 'RGB':
@ -84,11 +104,18 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None,
            print "unsupported color space:", color
            exit(1)

+        pdf_x, pdf_y = 72.0*width/dpi_x, 72.0*height/dpi_y # pdf units = 1/72 inch
+
        # either embed the whole jpeg or deflate the bitmap representation
        if imgformat is "JPEG":
            ofilter = [ "/DCTDecode" ]
            im.seek(0)
            imgdata = im.read()
+        elif imgformat is "JP2":
+            ofilter = [ "/JPXDecode" ]
+            im.seek(0)
+            imgdata = im.read()
+            version = 5 # jpeg2000 needs pdf 1.5
        else:
            ofilter = [ "/FlateDecode" ]
            imgdata = zlib.compress(imgdata.tostring())
@ -147,7 +174,7 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None,

    xreftable = list()

-    result  = "%PDF-1.3\n"
+    result  = "%%PDF-1.%d\n"%version

    xreftable.append("0000000000 65535 f \n")
    for o in objects: