Proof of concept of using PDF DecodeParms for storing pixel data with PNG compression
This commit is contained in:
parent
9836b976d3
commit
1d9a25dfd2
2 changed files with 114 additions and 47 deletions
34
README.md
34
README.md
|
@ -11,28 +11,24 @@ terms of its file size.
|
||||||
Background
|
Background
|
||||||
----------
|
----------
|
||||||
|
|
||||||
Quality loss can be avoided when converting JPEG and JPEG2000 images to PDF by
|
Quality loss can be avoided when converting PNG, JPEG and JPEG2000 images to
|
||||||
embedding them into the PDF without re-encoding them. This is what img2pdf
|
PDF by embedding them into the PDF without re-encoding them. This is what
|
||||||
does. It thus treats the PDF format merely as a container format for storing
|
img2pdf does. It thus treats the PDF format merely as a container format for
|
||||||
one or more JPEGs without re-encoding the JPEG images themselves.
|
storing one or more JPEGs or PNGs without re-encoding the images themselves.
|
||||||
|
|
||||||
If you know an existing tool which allows one to embed JPEG and JPEG2000 images
|
If you know an existing tool which allows one to embed PNG, JPEG and JPEG2000
|
||||||
into a PDF container without recompression, please contact me so that I can put
|
images into a PDF container without recompression, please contact me so that I
|
||||||
this code into the garbage bin.
|
can put this code into the garbage bin.
|
||||||
|
|
||||||
Functionality
|
Functionality
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
This program will take a list of raster images and produce a PDF file with the
|
This program will take a list of raster images and produce a PDF file with the
|
||||||
images embedded in it. JPEG and JPEG2000 images will be included without
|
images embedded in it. PNG, JPEG and JPEG2000 images will be included without
|
||||||
recompression and the resulting PDF will only be slightly larger than the input
|
recompression and the resulting PDF will only be slightly larger than the input
|
||||||
images due to the overhead of the PDF container. Raster images in other
|
images due to the overhead of the PDF container. Raster images in other
|
||||||
formats (like png, gif or tif) will be included using the lossless zip/flate
|
formats (like gif or tif) will be included using the lossless zip/flate
|
||||||
encoding which usually leads to a significant increase in the PDF size if the
|
encoding with the PNG Paeth predictor.
|
||||||
input was for example a png image. This is unfortunately unavoidable because
|
|
||||||
there is no other way to store arbitrary RGB bitmaps in PDF in a lossless way
|
|
||||||
other than zip/flate encoding. And zip/flate compresses bitmaps worse than png
|
|
||||||
is able to compress them.
|
|
||||||
|
|
||||||
As a result, this tool is able to losslessly wrap raster images into a PDF
|
As a result, this tool is able to losslessly wrap raster images into a PDF
|
||||||
container with a quality to filesize ratio that is typically better (in case of
|
container with a quality to filesize ratio that is typically better (in case of
|
||||||
|
@ -58,13 +54,17 @@ imagemagick, one has to use zip compression:
|
||||||
However, this approach will result in PDF files that are a few times larger
|
However, this approach will result in PDF files that are a few times larger
|
||||||
than the input JPEG or JPEG2000 file.
|
than the input JPEG or JPEG2000 file.
|
||||||
|
|
||||||
img2pdf is able to losslessly embed JPEG and JPEG2000 files into a PDF
|
Furthermore, when converting PNG images, popular tools like imagemagick use
|
||||||
|
flate encoding without a predictor. This means that the image file size ends up
|
||||||
|
being several orders of magnitude larger than necessary.
|
||||||
|
|
||||||
|
img2pdf is able to losslessly embed PNG, JPEG and JPEG2000 files into a PDF
|
||||||
container without additional overhead (aside from the PDF structure itself),
|
container without additional overhead (aside from the PDF structure itself),
|
||||||
save other graphics formats using lossless zip compression, and produce
|
save other graphics formats using lossless zip compression, and produce
|
||||||
multi-page PDF files when more than one input image is given.
|
multi-page PDF files when more than one input image is given.
|
||||||
|
|
||||||
Also, since JPEG and JPEG2000 images are not reencoded, conversion with img2pdf
|
Also, since PNG, JPEG and JPEG2000 images are not reencoded, conversion with
|
||||||
is several times faster than with other tools.
|
img2pdf is several times faster than with other tools.
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
|
|
127
src/img2pdf.py
127
src/img2pdf.py
|
@ -28,6 +28,7 @@ from jp2 import parsejp2
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import logging
|
import logging
|
||||||
|
import struct
|
||||||
|
|
||||||
PY3 = sys.version_info[0] >= 3
|
PY3 = sys.version_info[0] >= 3
|
||||||
|
|
||||||
|
@ -61,7 +62,7 @@ PageOrientation = Enum('PageOrientation', 'portrait landscape')
|
||||||
|
|
||||||
Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other')
|
Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other')
|
||||||
|
|
||||||
ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 other')
|
ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG other')
|
||||||
|
|
||||||
PageMode = Enum('PageMode', 'none outlines thumbs')
|
PageMode = Enum('PageMode', 'none outlines thumbs')
|
||||||
|
|
||||||
|
@ -274,24 +275,30 @@ class MyPdfWriter():
|
||||||
if PY3:
|
if PY3:
|
||||||
class MyPdfString():
|
class MyPdfString():
|
||||||
@classmethod
|
@classmethod
|
||||||
def encode(cls, string):
|
def encode(cls, string, hextype=False):
|
||||||
try:
|
if hextype:
|
||||||
string = string.encode('ascii')
|
return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >'
|
||||||
except UnicodeEncodeError:
|
else:
|
||||||
string = b"\xfe\xff"+string.encode("utf-16-be")
|
try:
|
||||||
string = string.replace(b'\\', b'\\\\')
|
string = string.encode('ascii')
|
||||||
string = string.replace(b'(', b'\\(')
|
except UnicodeEncodeError:
|
||||||
string = string.replace(b')', b'\\)')
|
string = b"\xfe\xff"+string.encode("utf-16-be")
|
||||||
return b'(' + string + b')'
|
string = string.replace(b'\\', b'\\\\')
|
||||||
|
string = string.replace(b'(', b'\\(')
|
||||||
|
string = string.replace(b')', b'\\)')
|
||||||
|
return b'(' + string + b')'
|
||||||
else:
|
else:
|
||||||
class MyPdfString(object):
|
class MyPdfString(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def encode(cls, string):
|
def encode(cls, string, hextype=False):
|
||||||
# This mimics exactly what pdfrw does.
|
if hextype:
|
||||||
string = string.replace(b'\\', b'\\\\')
|
return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >'
|
||||||
string = string.replace(b'(', b'\\(')
|
else:
|
||||||
string = string.replace(b')', b'\\)')
|
# This mimics exactly what pdfrw does.
|
||||||
return b'(' + string + b')'
|
string = string.replace(b'\\', b'\\\\')
|
||||||
|
string = string.replace(b'(', b'\\(')
|
||||||
|
string = string.replace(b')', b'\\)')
|
||||||
|
return b'(' + string + b')'
|
||||||
|
|
||||||
|
|
||||||
class pdfdoc(object):
|
class pdfdoc(object):
|
||||||
|
@ -367,14 +374,15 @@ class pdfdoc(object):
|
||||||
|
|
||||||
def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata,
|
def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata,
|
||||||
imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth,
|
imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth,
|
||||||
pageheight, userunit=None):
|
pageheight, userunit=None, palette=None):
|
||||||
if self.with_pdfrw:
|
if self.with_pdfrw:
|
||||||
from pdfrw import PdfDict, PdfName, PdfObject
|
from pdfrw import PdfDict, PdfName, PdfObject, PdfString
|
||||||
from pdfrw.py23_diffs import convert_load
|
from pdfrw.py23_diffs import convert_load
|
||||||
else:
|
else:
|
||||||
PdfDict = MyPdfDict
|
PdfDict = MyPdfDict
|
||||||
PdfName = MyPdfName
|
PdfName = MyPdfName
|
||||||
PdfObject = MyPdfObject
|
PdfObject = MyPdfObject
|
||||||
|
PdfString = MyPdfString
|
||||||
convert_load = my_convert_load
|
convert_load = my_convert_load
|
||||||
|
|
||||||
if color == Colorspace['1'] or color == Colorspace.L:
|
if color == Colorspace['1'] or color == Colorspace.L:
|
||||||
|
@ -383,21 +391,24 @@ class pdfdoc(object):
|
||||||
colorspace = PdfName.DeviceRGB
|
colorspace = PdfName.DeviceRGB
|
||||||
elif color == Colorspace.CMYK or color == Colorspace['CMYK;I']:
|
elif color == Colorspace.CMYK or color == Colorspace['CMYK;I']:
|
||||||
colorspace = PdfName.DeviceCMYK
|
colorspace = PdfName.DeviceCMYK
|
||||||
|
elif color == Colorspace.P:
|
||||||
|
if self.with_pdfrw:
|
||||||
|
raise Exception("pdfrw does not support hex strings for palette image input, re-run with --without-pdfrw")
|
||||||
|
colorspace = [ PdfName.Indexed, PdfName.DeviceRGB, len(palette)-1, PdfString.encode(palette, hextype=True)]
|
||||||
else:
|
else:
|
||||||
raise UnsupportedColorspaceError("unsupported color space: %s"
|
raise UnsupportedColorspaceError("unsupported color space: %s"
|
||||||
% color.name)
|
% color.name)
|
||||||
|
|
||||||
# either embed the whole jpeg or deflate the bitmap representation
|
# either embed the whole jpeg or deflate the bitmap representation
|
||||||
logging.debug(imgformat)
|
|
||||||
if imgformat is ImageFormat.JPEG:
|
if imgformat is ImageFormat.JPEG:
|
||||||
ofilter = [PdfName.DCTDecode]
|
ofilter = [PdfName.DCTDecode]
|
||||||
elif imgformat is ImageFormat.JPEG2000:
|
elif imgformat is ImageFormat.JPEG2000:
|
||||||
ofilter = [PdfName.JPXDecode]
|
ofilter = [PdfName.JPXDecode]
|
||||||
self.writer.version = "1.5" # jpeg2000 needs pdf 1.5
|
self.writer.version = "1.5" # jpeg2000 needs pdf 1.5
|
||||||
elif imgformat is ImageFormat.CCITTGroup4:
|
elif imgformat is ImageFormat.CCITTGroup4:
|
||||||
ofilter = [PdfName.CCITTFaxDecode]
|
ofilter = PdfName.CCITTFaxDecode
|
||||||
else:
|
else:
|
||||||
ofilter = [PdfName.FlateDecode]
|
ofilter = PdfName.FlateDecode
|
||||||
|
|
||||||
image = PdfDict(stream=convert_load(imgdata))
|
image = PdfDict(stream=convert_load(imgdata))
|
||||||
|
|
||||||
|
@ -411,7 +422,15 @@ class pdfdoc(object):
|
||||||
if imgformat is ImageFormat.CCITTGroup4:
|
if imgformat is ImageFormat.CCITTGroup4:
|
||||||
image[PdfName.BitsPerComponent] = 1
|
image[PdfName.BitsPerComponent] = 1
|
||||||
else:
|
else:
|
||||||
image[PdfName.BitsPerComponent] = 8
|
if color == Colorspace.P:
|
||||||
|
if len(palette) <= 2**1:
|
||||||
|
image[PdfName.BitsPerComponent] = 1
|
||||||
|
elif len(palette) <= 2**4:
|
||||||
|
image[PdfName.BitsPerComponent] = 4
|
||||||
|
else:
|
||||||
|
image[PdfName.BitsPerComponent] = 8
|
||||||
|
else:
|
||||||
|
image[PdfName.BitsPerComponent] = 8
|
||||||
|
|
||||||
if color == Colorspace['CMYK;I']:
|
if color == Colorspace['CMYK;I']:
|
||||||
# Inverts all four channels
|
# Inverts all four channels
|
||||||
|
@ -424,6 +443,24 @@ class pdfdoc(object):
|
||||||
decodeparms[PdfName.Columns] = imgwidthpx
|
decodeparms[PdfName.Columns] = imgwidthpx
|
||||||
decodeparms[PdfName.Rows] = imgheightpx
|
decodeparms[PdfName.Rows] = imgheightpx
|
||||||
image[PdfName.DecodeParms] = [decodeparms]
|
image[PdfName.DecodeParms] = [decodeparms]
|
||||||
|
elif imgformat is ImageFormat.PNG:
|
||||||
|
decodeparms = PdfDict()
|
||||||
|
decodeparms[PdfName.Predictor] = 15
|
||||||
|
if color in [ Colorspace.P, Colorspace['1'], Colorspace.L ]:
|
||||||
|
decodeparms[PdfName.Colors] = 1
|
||||||
|
else:
|
||||||
|
decodeparms[PdfName.Colors] = 3
|
||||||
|
decodeparms[PdfName.Columns] = imgwidthpx
|
||||||
|
if color == Colorspace.P:
|
||||||
|
if len(palette) <= 2**1:
|
||||||
|
decodeparms[PdfName.BitsPerComponent] = 1
|
||||||
|
elif len(palette) <= 2**4:
|
||||||
|
decodeparms[PdfName.BitsPerComponent] = 4
|
||||||
|
else:
|
||||||
|
decodeparms[PdfName.BitsPerComponent] = 8
|
||||||
|
else:
|
||||||
|
decodeparms[PdfName.BitsPerComponent] = 8
|
||||||
|
image[PdfName.DecodeParms] = decodeparms
|
||||||
|
|
||||||
text = ("q\n%0.4f 0 0 %0.4f %0.4f %0.4f cm\n/Im0 Do\nQ" %
|
text = ("q\n%0.4f 0 0 %0.4f %0.4f %0.4f cm\n/Im0 Do\nQ" %
|
||||||
(imgwidthpdf, imgheightpdf, imgxpdf, imgypdf)).encode("ascii")
|
(imgwidthpdf, imgheightpdf, imgxpdf, imgypdf)).encode("ascii")
|
||||||
|
@ -674,6 +711,25 @@ def transcode_monochrome(imgdata):
|
||||||
|
|
||||||
return ccittdata
|
return ccittdata
|
||||||
|
|
||||||
|
def parse_png(rawdata):
    """Extract the compressed pixel data and the palette from a PNG file.

    The PNG is walked chunk by chunk.  The payloads of all IDAT chunks are
    concatenated (together they form a single zlib stream) and every RGB
    entry of a PLTE chunk is turned into one integer of the form 0xRRGGBB.
    Chunk CRCs are not verified.

    Args:
        rawdata: the complete content of the PNG file as a byte string.

    Returns:
        A tuple ``(pngidat, palette)`` where ``pngidat`` is the concatenated
        IDAT payload as a byte string and ``palette`` is a list of integers
        (empty if the image has no PLTE chunk).

    Raises:
        Exception: if a chunk claims to be longer than the remaining data or
            if the PLTE chunk length is not a multiple of three.
    """
    # collect the IDAT payloads in a list and join them once at the end
    # instead of repeatedly re-allocating a growing byte string
    idat_parts = []
    palette = []
    # i always points at the first payload byte of the current chunk: the
    # first chunk starts after the 8 byte signature and is preceded by its
    # 4 byte length and 4 byte type
    i = 16
    while i < len(rawdata):
        # once we can require Python >= 3.2 we can use int.from_bytes() instead
        n, = struct.unpack('>I', rawdata[i-8:i-4])
        if i + n > len(rawdata):
            raise Exception("invalid png: %d %d %d" % (i, n, len(rawdata)))
        chunktype = rawdata[i-4:i]
        if chunktype == b"IDAT":
            idat_parts.append(rawdata[i:i+n])
        elif chunktype == b"PLTE":
            # each palette entry is exactly three bytes (R, G, B)
            if n % 3 != 0:
                raise Exception(
                    "invalid png: PLTE chunk length %d is not a multiple of 3"
                    % n)
            for j in range(i, i+n, 3):
                # with int.from_bytes() we would not have to prepend extra
                # zeroes
                color, = struct.unpack('>I', b'\x00'+rawdata[j:j+3])
                palette.append(color)
        elif chunktype == b"IEND":
            # IEND terminates the PNG datastream; anything after it is not
            # part of the image and must not be interpreted as a chunk
            break
        # skip the payload, the 4 byte CRC and the length and type fields
        # of the following chunk
        i += n + 12
    return b"".join(idat_parts), palette
|
||||||
|
|
||||||
def read_images(rawdata, colorspace, first_frame_only=False):
|
def read_images(rawdata, colorspace, first_frame_only=False):
|
||||||
im = BytesIO(rawdata)
|
im = BytesIO(rawdata)
|
||||||
|
@ -710,7 +766,12 @@ def read_images(rawdata, colorspace, first_frame_only=False):
|
||||||
if color == Colorspace['RGBA']:
|
if color == Colorspace['RGBA']:
|
||||||
raise JpegColorspaceError("jpeg can't have an alpha channel")
|
raise JpegColorspaceError("jpeg can't have an alpha channel")
|
||||||
im.close()
|
im.close()
|
||||||
return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx)]
|
return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [])]
|
||||||
|
elif imgformat == ImageFormat.PNG:
|
||||||
|
color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
|
||||||
|
imgdata, imgformat, default_dpi, colorspace, rawdata)
|
||||||
|
pngidat, palette = parse_png(rawdata)
|
||||||
|
return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx, palette)]
|
||||||
else:
|
else:
|
||||||
result = []
|
result = []
|
||||||
img_page_count = 0
|
img_page_count = 0
|
||||||
|
@ -744,18 +805,24 @@ def read_images(rawdata, colorspace, first_frame_only=False):
|
||||||
newimg = imgdata.convert('L')
|
newimg = imgdata.convert('L')
|
||||||
color = Colorspace.L
|
color = Colorspace.L
|
||||||
elif color in [Colorspace.RGB, Colorspace.L, Colorspace.CMYK,
|
elif color in [Colorspace.RGB, Colorspace.L, Colorspace.CMYK,
|
||||||
Colorspace["CMYK;I"]]:
|
Colorspace["CMYK;I"], Colorspace.P]:
|
||||||
logging.debug("Colorspace is OK: %s", color)
|
logging.debug("Colorspace is OK: %s", color)
|
||||||
newimg = imgdata
|
newimg = imgdata
|
||||||
elif color in [Colorspace.RGBA, Colorspace.P, Colorspace.other]:
|
elif color in [Colorspace.RGBA, Colorspace.other]:
|
||||||
logging.debug("Converting colorspace %s to RGB", color)
|
logging.debug("Converting colorspace %s to RGB", color)
|
||||||
newimg = imgdata.convert('RGB')
|
newimg = imgdata.convert('RGB')
|
||||||
color = Colorspace.RGB
|
color = Colorspace.RGB
|
||||||
else:
|
else:
|
||||||
raise ValueError("unknown colorspace: %s" % color.name)
|
raise ValueError("unknown colorspace: %s" % color.name)
|
||||||
imggz = zlib.compress(newimg.tobytes())
|
# cheapo version to retrieve a PNG encoding of the payload is to
|
||||||
result.append((color, ndpi, imgformat, imggz, imgwidthpx,
|
# just save it with PIL. In the future this could be replaced by
|
||||||
imgheightpx))
|
# a dedicated function applying the Paeth PNG filter to the raw pixel data.
|
||||||
|
pngbuffer = BytesIO()
|
||||||
|
newimg.save(pngbuffer, format="png")
|
||||||
|
pngidat, palette = parse_png(pngbuffer.getvalue())
|
||||||
|
imgformat = ImageFormat.PNG
|
||||||
|
result.append((color, ndpi, imgformat, pngidat, imgwidthpx,
|
||||||
|
imgheightpx, palette))
|
||||||
img_page_count += 1
|
img_page_count += 1
|
||||||
# the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
|
# the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
|
||||||
# close() method
|
# close() method
|
||||||
|
@ -1070,7 +1137,7 @@ def convert(*images, **kwargs):
|
||||||
# name so we now try treating it as raw image content
|
# name so we now try treating it as raw image content
|
||||||
rawdata = img
|
rawdata = img
|
||||||
|
|
||||||
for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx \
|
for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, palette \
|
||||||
in read_images(
|
in read_images(
|
||||||
rawdata, kwargs['colorspace'], kwargs['first_frame_only']):
|
rawdata, kwargs['colorspace'], kwargs['first_frame_only']):
|
||||||
pagewidth, pageheight, imgwidthpdf, imgheightpdf = \
|
pagewidth, pageheight, imgwidthpdf, imgheightpdf = \
|
||||||
|
@ -1095,7 +1162,7 @@ def convert(*images, **kwargs):
|
||||||
imgypdf = (pageheight - imgheightpdf)/2.0
|
imgypdf = (pageheight - imgheightpdf)/2.0
|
||||||
pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat,
|
pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat,
|
||||||
imgdata, imgwidthpdf, imgheightpdf, imgxpdf,
|
imgdata, imgwidthpdf, imgheightpdf, imgxpdf,
|
||||||
imgypdf, pagewidth, pageheight, userunit)
|
imgypdf, pagewidth, pageheight, userunit, palette)
|
||||||
|
|
||||||
if kwargs['outputstream']:
|
if kwargs['outputstream']:
|
||||||
pdf.tostream(kwargs['outputstream'])
|
pdf.tostream(kwargs['outputstream'])
|
||||||
|
|
Loading…
Reference in a new issue