From 1d9a25dfd2e5ba6e8c86d9eef1734d0e2d2d10b7 Mon Sep 17 00:00:00 2001 From: Johannes 'josch' Schauer Date: Thu, 15 Mar 2018 11:31:36 +0100 Subject: [PATCH] Proof of concept of using PDF DecodeParms for storing pixel data with PNG compression --- README.md | 34 ++++++------- src/img2pdf.py | 127 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 114 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 33bd9c5..7b991ba 100644 --- a/README.md +++ b/README.md @@ -11,28 +11,24 @@ terms of its file size. Background ---------- -Quality loss can be avoided when converting JPEG and JPEG2000 images to PDF by -embedding them into the PDF without re-encoding them. This is what img2pdf -does. It thus treats the PDF format merely as a container format for storing -one or more JPEGs without re-encoding the JPEG images themselves. +Quality loss can be avoided when converting PNG, JPEG and JPEG2000 images to +PDF by embedding them into the PDF without re-encoding them. This is what +img2pdf does. It thus treats the PDF format merely as a container format for +storing one or more JPEGs or PNGs without re-encoding the images themselves. -If you know an existing tool which allows one to embed JPEG and JPEG2000 images -into a PDF container without recompression, please contact me so that I can put -this code into the garbage bin. +If you know an existing tool which allows one to embed PNG, JPEG and JPEG2000 +images into a PDF container without recompression, please contact me so that I +can put this code into the garbage bin. Functionality ------------- This program will take a list of raster images and produce a PDF file with the -images embedded in it. JPEG and JPEG2000 images will be included without +images embedded in it. PNG, JPEG and JPEG2000 images will be included without recompression and the resulting PDF will only be slightly larger than the input images due to the overhead of the PDF container. 
Raster images in other -formats (like png, gif or tif) will be included using the lossless zip/flate -encoding which usually leads to a significant increase in the PDF size if the -input was for example a png image. This is unfortunately unavoidable because -there is no other way to store arbitrary RGB bitmaps in PDF in a lossless way -other than zip/flate encoding. And zip/flate compresses bitmaps worse than png -is able to compress them. +formats (like gif or tif) will be included using the lossless zip/flate +encoding with the PNG Paeth predictor. As a result, this tool is able to losslessly wrap raster images into a PDF container with a quality to filesize ratio that is typically better (in case of @@ -58,13 +54,17 @@ imagemagick, one has to use zip compression: However, this approach will result in PDF files that are a few times larger than the input JPEG or JPEG2000 file. -img2pdf is able to losslessly embed JPEG and JPEG2000 files into a PDF +Furthermore, when converting PNG images, popular tools like imagemagick use +flate encoding without a predictor. This means that image file size ends up +being several orders of magnitude larger than necessary. + +img2pdf is able to losslessly embed PNG, JPEG and JPEG2000 files into a PDF container without additional overhead (aside from the PDF structure itself), save other graphics formats using lossless zip compression, and produce multi-page PDF files when more than one input image is given. -Also, since JPEG and JPEG2000 images are not reencoded, conversion with img2pdf -is several times faster than with other tools. +Also, since PNG, JPEG and JPEG2000 images are not reencoded, conversion with +img2pdf is several times faster than with other tools. 
Usage ----- diff --git a/src/img2pdf.py b/src/img2pdf.py index ac23eda..e6537f2 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -28,6 +28,7 @@ from jp2 import parsejp2 from enum import Enum from io import BytesIO import logging +import struct PY3 = sys.version_info[0] >= 3 @@ -61,7 +62,7 @@ PageOrientation = Enum('PageOrientation', 'portrait landscape') Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other') -ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 other') +ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG other') PageMode = Enum('PageMode', 'none outlines thumbs') @@ -274,24 +275,30 @@ class MyPdfWriter(): if PY3: class MyPdfString(): @classmethod - def encode(cls, string): - try: - string = string.encode('ascii') - except UnicodeEncodeError: - string = b"\xfe\xff"+string.encode("utf-16-be") - string = string.replace(b'\\', b'\\\\') - string = string.replace(b'(', b'\\(') - string = string.replace(b')', b'\\)') - return b'(' + string + b')' + def encode(cls, string, hextype=False): + if hextype: + return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >' + else: + try: + string = string.encode('ascii') + except UnicodeEncodeError: + string = b"\xfe\xff"+string.encode("utf-16-be") + string = string.replace(b'\\', b'\\\\') + string = string.replace(b'(', b'\\(') + string = string.replace(b')', b'\\)') + return b'(' + string + b')' else: class MyPdfString(object): @classmethod - def encode(cls, string): - # This mimics exactely to what pdfrw does. - string = string.replace(b'\\', b'\\\\') - string = string.replace(b'(', b'\\(') - string = string.replace(b')', b'\\)') - return b'(' + string + b')' + def encode(cls, string, hextype=False): + if hextype: + return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >' + else: + # This mimics exactely to what pdfrw does. 
+ string = string.replace(b'\\', b'\\\\') + string = string.replace(b'(', b'\\(') + string = string.replace(b')', b'\\)') + return b'(' + string + b')' class pdfdoc(object): @@ -367,14 +374,15 @@ class pdfdoc(object): def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata, imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth, - pageheight, userunit=None): + pageheight, userunit=None, palette=None): if self.with_pdfrw: - from pdfrw import PdfDict, PdfName, PdfObject + from pdfrw import PdfDict, PdfName, PdfObject, PdfString from pdfrw.py23_diffs import convert_load else: PdfDict = MyPdfDict PdfName = MyPdfName PdfObject = MyPdfObject + PdfString = MyPdfString convert_load = my_convert_load if color == Colorspace['1'] or color == Colorspace.L: @@ -383,21 +391,24 @@ class pdfdoc(object): colorspace = PdfName.DeviceRGB elif color == Colorspace.CMYK or color == Colorspace['CMYK;I']: colorspace = PdfName.DeviceCMYK + elif color == Colorspace.P: + if self.with_pdfrw: + raise Exception("pdfrw does not support hex strings for palette image input, re-run with --without-pdfrw") + colorspace = [ PdfName.Indexed, PdfName.DeviceRGB, len(palette)-1, PdfString.encode(palette, hextype=True)] else: raise UnsupportedColorspaceError("unsupported color space: %s" % color.name) # either embed the whole jpeg or deflate the bitmap representation - logging.debug(imgformat) if imgformat is ImageFormat.JPEG: ofilter = [PdfName.DCTDecode] elif imgformat is ImageFormat.JPEG2000: ofilter = [PdfName.JPXDecode] self.writer.version = "1.5" # jpeg2000 needs pdf 1.5 elif imgformat is ImageFormat.CCITTGroup4: - ofilter = [PdfName.CCITTFaxDecode] + ofilter = PdfName.CCITTFaxDecode else: - ofilter = [PdfName.FlateDecode] + ofilter = PdfName.FlateDecode image = PdfDict(stream=convert_load(imgdata)) @@ -411,7 +422,15 @@ class pdfdoc(object): if imgformat is ImageFormat.CCITTGroup4: image[PdfName.BitsPerComponent] = 1 else: - image[PdfName.BitsPerComponent] = 8 + if color == 
Colorspace.P: + if len(palette) <= 2**1: + image[PdfName.BitsPerComponent] = 1 + elif len(palette) <= 2**4: + image[PdfName.BitsPerComponent] = 4 + else: + image[PdfName.BitsPerComponent] = 8 + else: + image[PdfName.BitsPerComponent] = 8 if color == Colorspace['CMYK;I']: # Inverts all four channels @@ -424,6 +443,24 @@ class pdfdoc(object): decodeparms[PdfName.Columns] = imgwidthpx decodeparms[PdfName.Rows] = imgheightpx image[PdfName.DecodeParms] = [decodeparms] + elif imgformat is ImageFormat.PNG: + decodeparms = PdfDict() + decodeparms[PdfName.Predictor] = 15 + if color in [ Colorspace.P, Colorspace['1'], Colorspace.L ]: + decodeparms[PdfName.Colors] = 1 + else: + decodeparms[PdfName.Colors] = 3 + decodeparms[PdfName.Columns] = imgwidthpx + if color == Colorspace.P: + if len(palette) <= 2**1: + decodeparms[PdfName.BitsPerComponent] = 1 + elif len(palette) <= 2**4: + decodeparms[PdfName.BitsPerComponent] = 4 + else: + decodeparms[PdfName.BitsPerComponent] = 8 + else: + decodeparms[PdfName.BitsPerComponent] = 8 + image[PdfName.DecodeParms] = decodeparms text = ("q\n%0.4f 0 0 %0.4f %0.4f %0.4f cm\n/Im0 Do\nQ" % (imgwidthpdf, imgheightpdf, imgxpdf, imgypdf)).encode("ascii") @@ -674,6 +711,25 @@ def transcode_monochrome(imgdata): return ccittdata +def parse_png(rawdata): + pngidat = b"" + palette = [] + i = 16 + while i < len(rawdata): + # once we can require Python >= 3.2 we can use int.from_bytes() instead + n, = struct.unpack('>I', rawdata[i-8:i-4]) + if i + n > len(rawdata): + raise Exception("invalid png: %d %d %d"%(i, n, len(rawdata))) + if rawdata[i-4:i] == b"IDAT": + pngidat += rawdata[i:i+n] + elif rawdata[i-4:i] == b"PLTE": + for j in range(i, i+n, 3): + # with int.from_bytes() we would not have to prepend extra zeroes + color, = struct.unpack('>I', b'\x00'+rawdata[j:j+3]) + palette.append(color) + i += n + i += 12 + return pngidat, palette def read_images(rawdata, colorspace, first_frame_only=False): im = BytesIO(rawdata) @@ -710,7 +766,12 @@ def 
read_images(rawdata, colorspace, first_frame_only=False): if color == Colorspace['RGBA']: raise JpegColorspaceError("jpeg can't have an alpha channel") im.close() - return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx)] + return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [])] + elif imgformat == ImageFormat.PNG: + color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata( + imgdata, imgformat, default_dpi, colorspace, rawdata) + pngidat, palette = parse_png(rawdata) + return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx, palette)] else: result = [] img_page_count = 0 @@ -744,18 +805,24 @@ def read_images(rawdata, colorspace, first_frame_only=False): newimg = imgdata.convert('L') color = Colorspace.L elif color in [Colorspace.RGB, Colorspace.L, Colorspace.CMYK, - Colorspace["CMYK;I"]]: + Colorspace["CMYK;I"], Colorspace.P]: logging.debug("Colorspace is OK: %s", color) newimg = imgdata - elif color in [Colorspace.RGBA, Colorspace.P, Colorspace.other]: + elif color in [Colorspace.RGBA, Colorspace.other]: logging.debug("Converting colorspace %s to RGB", color) newimg = imgdata.convert('RGB') color = Colorspace.RGB else: raise ValueError("unknown colorspace: %s" % color.name) - imggz = zlib.compress(newimg.tobytes()) - result.append((color, ndpi, imgformat, imggz, imgwidthpx, - imgheightpx)) + # cheapo version to retrieve a PNG encoding of the payload is to + # just save it with PIL. 
In the future this could be replaced by + # dedicated function applying the Paeth PNG filter to the raw pixel + pngbuffer = BytesIO() + newimg.save(pngbuffer, format="png") + pngidat, palette = parse_png(pngbuffer.getvalue()) + imgformat = ImageFormat.PNG + result.append((color, ndpi, imgformat, pngidat, imgwidthpx, + imgheightpx, palette)) img_page_count += 1 # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the # close() method @@ -1070,7 +1137,7 @@ def convert(*images, **kwargs): # name so we now try treating it as raw image content rawdata = img - for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx \ + for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, palette \ in read_images( rawdata, kwargs['colorspace'], kwargs['first_frame_only']): pagewidth, pageheight, imgwidthpdf, imgheightpdf = \ @@ -1095,7 +1162,7 @@ def convert(*images, **kwargs): imgypdf = (pageheight - imgheightpdf)/2.0 pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat, imgdata, imgwidthpdf, imgheightpdf, imgxpdf, - imgypdf, pagewidth, pageheight, userunit) + imgypdf, pagewidth, pageheight, userunit, palette) if kwargs['outputstream']: pdf.tostream(kwargs['outputstream'])