Proof of concept of using PDF DecodeParms for storing pixel data with PNG compression

This commit is contained in:
Johannes 'josch' Schauer 2018-03-15 11:31:36 +01:00
parent 9836b976d3
commit 1d9a25dfd2
Signed by untrusted user: josch
GPG key ID: F2CBA5C78FBD83E1
2 changed files with 114 additions and 47 deletions

View file

@ -11,28 +11,24 @@ terms of its file size.
Background
----------
Quality loss can be avoided when converting JPEG and JPEG2000 images to PDF by
embedding them into the PDF without re-encoding them. This is what img2pdf
does. It thus treats the PDF format merely as a container format for storing
one or more JPEGs without re-encoding the JPEG images themselves.
Quality loss can be avoided when converting PNG, JPEG and JPEG2000 images to
PDF by embedding them into the PDF without re-encoding them. This is what
img2pdf does. It thus treats the PDF format merely as a container format for
storing one or more JPEGs or PNGs without re-encoding the images themselves.
If you know an existing tool which allows one to embed JPEG and JPEG2000 images
into a PDF container without recompression, please contact me so that I can put
this code into the garbage bin.
If you know an existing tool which allows one to embed PNG, JPEG and JPEG2000
images into a PDF container without recompression, please contact me so that I
can put this code into the garbage bin.
Functionality
-------------
This program will take a list of raster images and produce a PDF file with the
images embedded in it. JPEG and JPEG2000 images will be included without
images embedded in it. PNG, JPEG and JPEG2000 images will be included without
recompression and the resulting PDF will only be slightly larger than the input
images due to the overhead of the PDF container. Raster images in other
formats (like png, gif or tif) will be included using the lossless zip/flate
encoding which usually leads to a significant increase in the PDF size if the
input was for example a png image. This is unfortunately unavoidable because
there is no other way to store arbitrary RGB bitmaps in PDF in a lossless way
other than zip/flate encoding. And zip/flate compresses bitmaps worse than png
is able to compress them.
formats (like gif or tif) will be included using lossless zip/flate encoding
with the PNG Paeth predictor.
As a result, this tool is able to losslessly wrap raster images into a PDF
container with a quality to filesize ratio that is typically better (in case of
@ -58,13 +54,17 @@ imagemagick, one has to use zip compression:
However, this approach will result in PDF files that are a few times larger
than the input JPEG or JPEG2000 file.
img2pdf is able to losslessly embed JPEG and JPEG2000 files into a PDF
Furthermore, when converting PNG images, popular tools like imagemagick use
flate encoding without a predictor. This means that the image file size ends
up being several orders of magnitude larger than necessary.
img2pdf is able to losslessly embed PNG, JPEG and JPEG2000 files into a PDF
container without additional overhead (aside from the PDF structure itself),
save other graphics formats using lossless zip compression, and produce
multi-page PDF files when more than one input image is given.
Also, since JPEG and JPEG2000 images are not reencoded, conversion with img2pdf
is several times faster than with other tools.
Also, since PNG, JPEG and JPEG2000 images are not reencoded, conversion with
img2pdf is several times faster than with other tools.
Usage
-----

View file

@ -28,6 +28,7 @@ from jp2 import parsejp2
from enum import Enum
from io import BytesIO
import logging
import struct
PY3 = sys.version_info[0] >= 3
@ -61,7 +62,7 @@ PageOrientation = Enum('PageOrientation', 'portrait landscape')
Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other')
ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 other')
ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG other')
PageMode = Enum('PageMode', 'none outlines thumbs')
@ -274,24 +275,30 @@ class MyPdfWriter():
if PY3:
class MyPdfString():
@classmethod
def encode(cls, string):
try:
string = string.encode('ascii')
except UnicodeEncodeError:
string = b"\xfe\xff"+string.encode("utf-16-be")
string = string.replace(b'\\', b'\\\\')
string = string.replace(b'(', b'\\(')
string = string.replace(b')', b'\\)')
return b'(' + string + b')'
def encode(cls, string, hextype=False):
if hextype:
return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >'
else:
try:
string = string.encode('ascii')
except UnicodeEncodeError:
string = b"\xfe\xff"+string.encode("utf-16-be")
string = string.replace(b'\\', b'\\\\')
string = string.replace(b'(', b'\\(')
string = string.replace(b')', b'\\)')
return b'(' + string + b')'
else:
class MyPdfString(object):
@classmethod
def encode(cls, string):
# This mimics exactely to what pdfrw does.
string = string.replace(b'\\', b'\\\\')
string = string.replace(b'(', b'\\(')
string = string.replace(b')', b'\\)')
return b'(' + string + b')'
def encode(cls, string, hextype=False):
if hextype:
return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >'
else:
# This mimics exactely to what pdfrw does.
string = string.replace(b'\\', b'\\\\')
string = string.replace(b'(', b'\\(')
string = string.replace(b')', b'\\)')
return b'(' + string + b')'
class pdfdoc(object):
@ -367,14 +374,15 @@ class pdfdoc(object):
def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata,
imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth,
pageheight, userunit=None):
pageheight, userunit=None, palette=None):
if self.with_pdfrw:
from pdfrw import PdfDict, PdfName, PdfObject
from pdfrw import PdfDict, PdfName, PdfObject, PdfString
from pdfrw.py23_diffs import convert_load
else:
PdfDict = MyPdfDict
PdfName = MyPdfName
PdfObject = MyPdfObject
PdfString = MyPdfString
convert_load = my_convert_load
if color == Colorspace['1'] or color == Colorspace.L:
@ -383,21 +391,24 @@ class pdfdoc(object):
colorspace = PdfName.DeviceRGB
elif color == Colorspace.CMYK or color == Colorspace['CMYK;I']:
colorspace = PdfName.DeviceCMYK
elif color == Colorspace.P:
if self.with_pdfrw:
raise Exception("pdfrw does not support hex strings for palette image input, re-run with --without-pdfrw")
colorspace = [ PdfName.Indexed, PdfName.DeviceRGB, len(palette)-1, PdfString.encode(palette, hextype=True)]
else:
raise UnsupportedColorspaceError("unsupported color space: %s"
% color.name)
# either embed the whole jpeg or deflate the bitmap representation
logging.debug(imgformat)
if imgformat is ImageFormat.JPEG:
ofilter = [PdfName.DCTDecode]
elif imgformat is ImageFormat.JPEG2000:
ofilter = [PdfName.JPXDecode]
self.writer.version = "1.5" # jpeg2000 needs pdf 1.5
elif imgformat is ImageFormat.CCITTGroup4:
ofilter = [PdfName.CCITTFaxDecode]
ofilter = PdfName.CCITTFaxDecode
else:
ofilter = [PdfName.FlateDecode]
ofilter = PdfName.FlateDecode
image = PdfDict(stream=convert_load(imgdata))
@ -411,7 +422,15 @@ class pdfdoc(object):
if imgformat is ImageFormat.CCITTGroup4:
image[PdfName.BitsPerComponent] = 1
else:
image[PdfName.BitsPerComponent] = 8
if color == Colorspace.P:
if len(palette) <= 2**1:
image[PdfName.BitsPerComponent] = 1
elif len(palette) <= 2**4:
image[PdfName.BitsPerComponent] = 4
else:
image[PdfName.BitsPerComponent] = 8
else:
image[PdfName.BitsPerComponent] = 8
if color == Colorspace['CMYK;I']:
# Inverts all four channels
@ -424,6 +443,24 @@ class pdfdoc(object):
decodeparms[PdfName.Columns] = imgwidthpx
decodeparms[PdfName.Rows] = imgheightpx
image[PdfName.DecodeParms] = [decodeparms]
elif imgformat is ImageFormat.PNG:
decodeparms = PdfDict()
decodeparms[PdfName.Predictor] = 15
if color in [ Colorspace.P, Colorspace['1'], Colorspace.L ]:
decodeparms[PdfName.Colors] = 1
else:
decodeparms[PdfName.Colors] = 3
decodeparms[PdfName.Columns] = imgwidthpx
if color == Colorspace.P:
if len(palette) <= 2**1:
decodeparms[PdfName.BitsPerComponent] = 1
elif len(palette) <= 2**4:
decodeparms[PdfName.BitsPerComponent] = 4
else:
decodeparms[PdfName.BitsPerComponent] = 8
else:
decodeparms[PdfName.BitsPerComponent] = 8
image[PdfName.DecodeParms] = decodeparms
text = ("q\n%0.4f 0 0 %0.4f %0.4f %0.4f cm\n/Im0 Do\nQ" %
(imgwidthpdf, imgheightpdf, imgxpdf, imgypdf)).encode("ascii")
@ -674,6 +711,25 @@ def transcode_monochrome(imgdata):
return ccittdata
def parse_png(rawdata):
    """Extract the compressed pixel stream and the palette from a PNG file.

    The PNG chunk list is walked once. The payloads of all IDAT chunks are
    concatenated in file order, and every three-byte entry of a PLTE chunk is
    turned into a single integer 0xRRGGBB.

    :param rawdata: the complete PNG file as a byte string
    :returns: a tuple ``(pngidat, palette)`` where ``pngidat`` is the
        concatenated IDAT payload (bytes) and ``palette`` is a list of
        integers (empty if the file has no PLTE chunk)
    :raises Exception: if a chunk claims to be longer than the remaining
        data or if a PLTE chunk length is not a multiple of three
    """
    idat_chunks = []
    palette = []
    # Skip the 8 byte signature plus the 4 byte length and 4 byte type of the
    # first chunk, so that ``i`` always points at the start of chunk data.
    i = 16
    while i < len(rawdata):
        # once we can require Python >= 3.2 we can use int.from_bytes() instead
        n, = struct.unpack('>I', rawdata[i-8:i-4])
        if i + n > len(rawdata):
            raise Exception("invalid png: %d %d %d" % (i, n, len(rawdata)))
        chunktype = rawdata[i-4:i]
        if chunktype == b"IDAT":
            # collect the pieces and join once at the end instead of
            # repeatedly re-allocating a growing byte string
            idat_chunks.append(rawdata[i:i+n])
        elif chunktype == b"PLTE":
            if n % 3 != 0:
                # otherwise the last entry would be assembled from bytes of
                # the chunk CRC
                raise Exception(
                    "invalid png: PLTE chunk length %d is not a multiple of 3"
                    % n)
            for j in range(i, i+n, 3):
                # with int.from_bytes() we would not have to prepend extra
                # zeroes
                color, = struct.unpack('>I', b'\x00'+rawdata[j:j+3])
                palette.append(color)
        elif chunktype == b"IEND":
            # IEND terminates the PNG datastream; anything after it is not
            # part of the image and must not be parsed as chunks
            break
        # advance over the chunk data, its CRC (4 bytes) and the length and
        # type fields (4 bytes each) of the following chunk
        i += n + 12
    return b"".join(idat_chunks), palette
def read_images(rawdata, colorspace, first_frame_only=False):
im = BytesIO(rawdata)
@ -710,7 +766,12 @@ def read_images(rawdata, colorspace, first_frame_only=False):
if color == Colorspace['RGBA']:
raise JpegColorspaceError("jpeg can't have an alpha channel")
im.close()
return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx)]
return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [])]
elif imgformat == ImageFormat.PNG:
color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
imgdata, imgformat, default_dpi, colorspace, rawdata)
pngidat, palette = parse_png(rawdata)
return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx, palette)]
else:
result = []
img_page_count = 0
@ -744,18 +805,24 @@ def read_images(rawdata, colorspace, first_frame_only=False):
newimg = imgdata.convert('L')
color = Colorspace.L
elif color in [Colorspace.RGB, Colorspace.L, Colorspace.CMYK,
Colorspace["CMYK;I"]]:
Colorspace["CMYK;I"], Colorspace.P]:
logging.debug("Colorspace is OK: %s", color)
newimg = imgdata
elif color in [Colorspace.RGBA, Colorspace.P, Colorspace.other]:
elif color in [Colorspace.RGBA, Colorspace.other]:
logging.debug("Converting colorspace %s to RGB", color)
newimg = imgdata.convert('RGB')
color = Colorspace.RGB
else:
raise ValueError("unknown colorspace: %s" % color.name)
imggz = zlib.compress(newimg.tobytes())
result.append((color, ndpi, imgformat, imggz, imgwidthpx,
imgheightpx))
# cheapo version to retrieve a PNG encoding of the payload is to
# just save it with PIL. In the future this could be replaced by
# dedicated function applying the Paeth PNG filter to the raw pixel
pngbuffer = BytesIO()
newimg.save(pngbuffer, format="png")
pngidat, palette = parse_png(pngbuffer.getvalue())
imgformat = ImageFormat.PNG
result.append((color, ndpi, imgformat, pngidat, imgwidthpx,
imgheightpx, palette))
img_page_count += 1
# the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
# close() method
@ -1070,7 +1137,7 @@ def convert(*images, **kwargs):
# name so we now try treating it as raw image content
rawdata = img
for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx \
for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, palette \
in read_images(
rawdata, kwargs['colorspace'], kwargs['first_frame_only']):
pagewidth, pageheight, imgwidthpdf, imgheightpdf = \
@ -1095,7 +1162,7 @@ def convert(*images, **kwargs):
imgypdf = (pageheight - imgheightpdf)/2.0
pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat,
imgdata, imgwidthpdf, imgheightpdf, imgxpdf,
imgypdf, pagewidth, pageheight, userunit)
imgypdf, pagewidth, pageheight, userunit, palette)
if kwargs['outputstream']:
pdf.tostream(kwargs['outputstream'])