Also do not re-encode CCITT Group 4 encoded TIFF images

Add additional property of input images: inverted

closes: #47
This commit is contained in:
Johannes 'josch' Schauer 2018-08-01 22:12:40 +02:00
parent 36c5034db5
commit f5d8d86dff
Signed by untrusted user: josch
GPG key ID: F2CBA5C78FBD83E1
3 changed files with 75 additions and 39 deletions

View file

@ -29,16 +29,18 @@ input file format and image color space.
| JPEG | any | direct | | JPEG | any | direct |
| JPEG2000 | any | direct | | JPEG2000 | any | direct |
| PNG (non-interlaced) | any | direct | | PNG (non-interlaced) | any | direct |
| TIFF (CCITT Group 4) | monochrome | direct |
| any | any except CMYK and monochrome | PNG Paeth | | any | any except CMYK and monochrome | PNG Paeth |
| any | monochrome | CCITT Group 4 | | any | monochrome | CCITT Group 4 |
| any | CMYK | flate | | any | CMYK | flate |
For JPEG, JPEG2000 and non-interlaced PNG input, img2pdf directly embeds the For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4
image data into the PDF without re-encoding it. It thus treats the PDF format encoded data, img2pdf directly embeds the image data into the PDF without
merely as a container format for the image data. In these cases, img2pdf only re-encoding it. It thus treats the PDF format merely as a container format for
increases the filesize by the size of the PDF container (typically around 500 the image data. In these cases, img2pdf only increases the filesize by the size
to 700 bytes). Since data is only copied and not re-encoded, img2pdf is also of the PDF container (typically around 500 to 700 bytes). Since data is only
typically faster than other solutions for these input formats. copied and not re-encoded, img2pdf is also typically faster than other
solutions for these input formats.
For all other input types, img2pdf first has to transform the pixel data to For all other input types, img2pdf first has to transform the pixel data to
make it compatible with PDF. In most cases, the PNG Paeth filter is applied to make it compatible with PDF. In most cases, the PNG Paeth filter is applied to

View file

@ -22,7 +22,7 @@ import sys
import os import os
import zlib import zlib
import argparse import argparse
from PIL import Image from PIL import Image, TiffImagePlugin
from datetime import datetime from datetime import datetime
from jp2 import parsejp2 from jp2 import parsejp2
from enum import Enum from enum import Enum
@ -62,7 +62,7 @@ PageOrientation = Enum('PageOrientation', 'portrait landscape')
Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other') Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other')
ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG other') ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG TIFF other')
PageMode = Enum('PageMode', 'none outlines thumbs') PageMode = Enum('PageMode', 'none outlines thumbs')
@ -374,7 +374,7 @@ class pdfdoc(object):
def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata, def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata,
imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth, imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth,
pageheight, userunit=None, palette=None): pageheight, userunit=None, palette=None, inverted=False):
if self.with_pdfrw: if self.with_pdfrw:
from pdfrw import PdfDict, PdfName, PdfObject, PdfString from pdfrw import PdfDict, PdfName, PdfObject, PdfString
from pdfrw.py23_diffs import convert_load from pdfrw.py23_diffs import convert_load
@ -440,8 +440,13 @@ class pdfdoc(object):
if imgformat is ImageFormat.CCITTGroup4: if imgformat is ImageFormat.CCITTGroup4:
decodeparms = PdfDict() decodeparms = PdfDict()
# The default for the K parameter is 0 which indicates Group 3 1-D
# encoding. We set it to -1 because we want Group 4 encoding.
decodeparms[PdfName.K] = -1 decodeparms[PdfName.K] = -1
decodeparms[PdfName.BlackIs1] = PdfObject('true') if inverted:
decodeparms[PdfName.BlackIs1] = PdfObject('false')
else:
decodeparms[PdfName.BlackIs1] = PdfObject('true')
decodeparms[PdfName.Columns] = imgwidthpx decodeparms[PdfName.Columns] = imgwidthpx
decodeparms[PdfName.Rows] = imgheightpx decodeparms[PdfName.Rows] = imgheightpx
image[PdfName.DecodeParms] = [decodeparms] image[PdfName.DecodeParms] = [decodeparms]
@ -685,11 +690,32 @@ def get_imgmetadata(imgdata, imgformat, default_dpi, colorspace, rawdata=None):
return (color, ndpi, imgwidthpx, imgheightpx) return (color, ndpi, imgwidthpx, imgheightpx)
def ccitt_payload_location_from_pil(img):
    """Locate the raw CCITT Group 4 payload of an opened TIFF image.

    img must provide an ``info`` mapping with a 'compression' entry and a
    ``tag_v2`` mapping indexed by the TiffImagePlugin tag constants.

    Returns a tuple ``(offset, length)`` taken from the single entries of the
    StripOffsets and StripByteCounts tags.

    Raises ValueError if the compression is not 'group4' and
    NotImplementedError if either tag does not hold exactly one entry.
    """
    # If Pillow is passed an invalid compression argument it will ignore it;
    # make sure the image actually got compressed.
    compression = img.info['compression']
    if compression != 'group4':
        raise ValueError("Image not compressed with CCITT Group 4 but with: %s" % compression)

    # Read the TIFF tags to find the offset(s) of the compressed data strips.
    # The RowsPerStrip tag is intentionally not looked up: its value is not
    # needed here, and TIFF 6.0 defines a default for it, so a file may omit
    # the tag. Requiring it would raise KeyError for such files.
    strip_offsets = img.tag_v2[TiffImagePlugin.STRIPOFFSETS]
    strip_bytes = img.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS]

    # PIL always seems to create a single strip even for very large TIFFs when
    # it saves images, so assume we only have to read a single strip.
    # A test ~10 GPixel image was still encoded as a single strip. Just to be
    # safe, raise an error unless there is exactly one offset and one length.
    if len(strip_offsets) != 1 or len(strip_bytes) != 1:
        raise NotImplementedError("Transcoding multiple strips not supported")

    (offset, ), (length, ) = strip_offsets, strip_bytes
    return offset, length
def transcode_monochrome(imgdata): def transcode_monochrome(imgdata):
"""Convert the open PIL.Image imgdata to compressed CCITT Group4 data""" """Convert the open PIL.Image imgdata to compressed CCITT Group4 data"""
from PIL import TiffImagePlugin
logging.debug("Converting monochrome to CCITT Group4") logging.debug("Converting monochrome to CCITT Group4")
# Convert the image to Group 4 in memory. If libtiff is not installed and # Convert the image to Group 4 in memory. If libtiff is not installed and
@ -707,27 +733,11 @@ def transcode_monochrome(imgdata):
newimgio.seek(0) newimgio.seek(0)
newimg = Image.open(newimgio) newimg = Image.open(newimgio)
# If Pillow is passed an invalid compression argument it will ignore it; offset, length = ccitt_payload_location_from_pil(newimg)
# make sure the image actually got compressed.
if newimg.info['compression'] != 'group4':
raise ValueError("Image not compressed as expected")
# Read the TIFF tags to find the offset(s) of the compressed data strips. newimgio.seek(offset)
strip_offsets = newimg.tag_v2[TiffImagePlugin.STRIPOFFSETS] return newimgio.read(length)
strip_bytes = newimg.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS]
rows_per_strip = newimg.tag_v2[TiffImagePlugin.ROWSPERSTRIP]
# PIL always seems to create a single strip even for very large TIFFs when
# it saves images, so assume we only have to read a single strip.
# A test ~10 GPixel image was still encoded as a single strip. Just to be
# safe check throw an error if there is more than one offset.
if len(strip_offsets) > 1:
raise NotImplementedError("Transcoding multiple strips not supported")
newimgio.seek(strip_offsets[0])
ccittdata = newimgio.read(strip_bytes[0])
return ccittdata
def parse_png(rawdata): def parse_png(rawdata):
pngidat = b"" pngidat = b""
@ -786,7 +796,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
if color == Colorspace['RGBA']: if color == Colorspace['RGBA']:
raise JpegColorspaceError("jpeg can't have an alpha channel") raise JpegColorspaceError("jpeg can't have an alpha channel")
im.close() im.close()
return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [])] return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [], False)]
# We can directly embed the IDAT chunk of PNG images if the PNG is not # We can directly embed the IDAT chunk of PNG images if the PNG is not
# interlaced # interlaced
@ -799,7 +809,27 @@ def read_images(rawdata, colorspace, first_frame_only=False):
color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata( color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
imgdata, imgformat, default_dpi, colorspace, rawdata) imgdata, imgformat, default_dpi, colorspace, rawdata)
pngidat, palette = parse_png(rawdata) pngidat, palette = parse_png(rawdata)
return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx, palette)] im.close()
return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx, palette, False)]
# We can directly copy the data out of a CCITT Group 4 encoded TIFF, if it
# only contains a single strip
if imgformat == ImageFormat.TIFF \
and imgdata.info['compression'] == "group4" \
and len(imgdata.tag_v2[TiffImagePlugin.STRIPOFFSETS]) == 1:
photo = imgdata.tag_v2[TiffImagePlugin.PHOTOMETRIC_INTERPRETATION]
inverted = False
if photo == 0:
inverted = True
elif photo != 1:
raise ValueError("unsupported photometric interpretation for group4 tiff: %d" % photo)
color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
imgdata, imgformat, default_dpi, colorspace, rawdata)
offset, length = ccitt_payload_location_from_pil(imgdata)
im.seek(offset)
rawdata = im.read(length)
im.close()
return [(color, ndpi, ImageFormat.CCITTGroup4, rawdata, imgwidthpx, imgheightpx, [], inverted)]
# Everything else has to be encoded # Everything else has to be encoded
@ -826,7 +856,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
ccittdata = transcode_monochrome(imgdata) ccittdata = transcode_monochrome(imgdata)
imgformat = ImageFormat.CCITTGroup4 imgformat = ImageFormat.CCITTGroup4
result.append((color, ndpi, imgformat, ccittdata, result.append((color, ndpi, imgformat, ccittdata,
imgwidthpx, imgheightpx, [])) imgwidthpx, imgheightpx, [], False))
img_page_count += 1 img_page_count += 1
continue continue
except Exception as e: except Exception as e:
@ -845,7 +875,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
if color in [Colorspace.CMYK, Colorspace["CMYK;I"]]: if color in [Colorspace.CMYK, Colorspace["CMYK;I"]]:
imggz = zlib.compress(newimg.tobytes()) imggz = zlib.compress(newimg.tobytes())
result.append((color, ndpi, imgformat, imggz, imgwidthpx, result.append((color, ndpi, imgformat, imggz, imgwidthpx,
imgheightpx, [])) imgheightpx, [], False))
else: else:
# cheapo version to retrieve a PNG encoding of the payload is to # cheapo version to retrieve a PNG encoding of the payload is to
# just save it with PIL. In the future this could be replaced by # just save it with PIL. In the future this could be replaced by
@ -855,7 +885,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
pngidat, palette = parse_png(pngbuffer.getvalue()) pngidat, palette = parse_png(pngbuffer.getvalue())
imgformat = ImageFormat.PNG imgformat = ImageFormat.PNG
result.append((color, ndpi, imgformat, pngidat, imgwidthpx, result.append((color, ndpi, imgformat, pngidat, imgwidthpx,
imgheightpx, palette)) imgheightpx, palette, False))
img_page_count += 1 img_page_count += 1
# the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
# close() method # close() method
@ -1170,8 +1200,8 @@ def convert(*images, **kwargs):
# name so we now try treating it as raw image content # name so we now try treating it as raw image content
rawdata = img rawdata = img
for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, palette \ for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, \
in read_images( palette, inverted in read_images(
rawdata, kwargs['colorspace'], kwargs['first_frame_only']): rawdata, kwargs['colorspace'], kwargs['first_frame_only']):
pagewidth, pageheight, imgwidthpdf, imgheightpdf = \ pagewidth, pageheight, imgwidthpdf, imgheightpdf = \
kwargs['layout_fun'](imgwidthpx, imgheightpx, ndpi) kwargs['layout_fun'](imgwidthpx, imgheightpx, ndpi)
@ -1195,7 +1225,8 @@ def convert(*images, **kwargs):
imgypdf = (pageheight - imgheightpdf)/2.0 imgypdf = (pageheight - imgheightpdf)/2.0
pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat, pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat,
imgdata, imgwidthpdf, imgheightpdf, imgxpdf, imgdata, imgwidthpdf, imgheightpdf, imgxpdf,
imgypdf, pagewidth, pageheight, userunit, palette) imgypdf, pagewidth, pageheight, userunit,
palette, inverted)
if kwargs['outputstream']: if kwargs['outputstream']:
pdf.tostream(kwargs['outputstream']) pdf.tostream(kwargs['outputstream'])

View file

@ -592,6 +592,9 @@ def test_suite():
if imgprops.DecodeParms: if imgprops.DecodeParms:
if orig_img.format == 'PNG': if orig_img.format == 'PNG':
pngidat, palette = img2pdf.parse_png(orig_imgdata) pngidat, palette = img2pdf.parse_png(orig_imgdata)
elif orig_img.format == 'TIFF' and orig_img.info['compression'] == "group4":
offset, length = img2pdf.ccitt_payload_location_from_pil(orig_img)
pngidat = orig_imgdata[offset:offset+length]
else: else:
pngbuffer = BytesIO() pngbuffer = BytesIO()
orig_img.save(pngbuffer, format="png") orig_img.save(pngbuffer, format="png")