Also do not re-encode CCITT Group 4 encoded TIFF images
Add an additional property of input images: inverted (closes: #47)
This commit is contained in:
parent
36c5034db5
commit
f5d8d86dff
3 changed files with 75 additions and 39 deletions
14
README.md
14
README.md
|
@ -29,16 +29,18 @@ input file format and image color space.
|
||||||
| JPEG | any | direct |
|
| JPEG | any | direct |
|
||||||
| JPEG2000 | any | direct |
|
| JPEG2000 | any | direct |
|
||||||
| PNG (non-interlaced) | any | direct |
|
| PNG (non-interlaced) | any | direct |
|
||||||
|
| TIFF (CCITT Group 4) | monochrome | direct |
|
||||||
| any | any except CMYK and monochrome | PNG Paeth |
|
| any | any except CMYK and monochrome | PNG Paeth |
|
||||||
| any | monochrome | CCITT Group 4 |
|
| any | monochrome | CCITT Group 4 |
|
||||||
| any | CMYK | flate |
|
| any | CMYK | flate |
|
||||||
|
|
||||||
For JPEG, JPEG2000 and non-interlaced PNG input, img2pdf directly embeds the
|
For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4
|
||||||
image data into the PDF without re-encoding it. It thus treats the PDF format
|
encoded data, img2pdf directly embeds the image data into the PDF without
|
||||||
merely as a container format for the image data. In these cases, img2pdf only
|
re-encoding it. It thus treats the PDF format merely as a container format for
|
||||||
increases the filesize by the size of the PDF container (typically around 500
|
the image data. In these cases, img2pdf only increases the filesize by the size
|
||||||
to 700 bytes). Since data is only copied and not re-encoded, img2pdf is also
|
of the PDF container (typically around 500 to 700 bytes). Since data is only
|
||||||
typically faster than other solutions for these input formats.
|
copied and not re-encoded, img2pdf is also typically faster than other
|
||||||
|
solutions for these input formats.
|
||||||
|
|
||||||
For all other input types, img2pdf first has to transform the pixel data to
|
For all other input types, img2pdf first has to transform the pixel data to
|
||||||
make it compatible with PDF. In most cases, the PNG Paeth filter is applied to
|
make it compatible with PDF. In most cases, the PNG Paeth filter is applied to
|
||||||
|
|
|
@ -22,7 +22,7 @@ import sys
|
||||||
import os
|
import os
|
||||||
import zlib
|
import zlib
|
||||||
import argparse
|
import argparse
|
||||||
from PIL import Image
|
from PIL import Image, TiffImagePlugin
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from jp2 import parsejp2
|
from jp2 import parsejp2
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
@ -62,7 +62,7 @@ PageOrientation = Enum('PageOrientation', 'portrait landscape')
|
||||||
|
|
||||||
Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other')
|
Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other')
|
||||||
|
|
||||||
ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG other')
|
ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG TIFF other')
|
||||||
|
|
||||||
PageMode = Enum('PageMode', 'none outlines thumbs')
|
PageMode = Enum('PageMode', 'none outlines thumbs')
|
||||||
|
|
||||||
|
@ -374,7 +374,7 @@ class pdfdoc(object):
|
||||||
|
|
||||||
def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata,
|
def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata,
|
||||||
imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth,
|
imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth,
|
||||||
pageheight, userunit=None, palette=None):
|
pageheight, userunit=None, palette=None, inverted=False):
|
||||||
if self.with_pdfrw:
|
if self.with_pdfrw:
|
||||||
from pdfrw import PdfDict, PdfName, PdfObject, PdfString
|
from pdfrw import PdfDict, PdfName, PdfObject, PdfString
|
||||||
from pdfrw.py23_diffs import convert_load
|
from pdfrw.py23_diffs import convert_load
|
||||||
|
@ -440,8 +440,13 @@ class pdfdoc(object):
|
||||||
|
|
||||||
if imgformat is ImageFormat.CCITTGroup4:
|
if imgformat is ImageFormat.CCITTGroup4:
|
||||||
decodeparms = PdfDict()
|
decodeparms = PdfDict()
|
||||||
|
# The default for the K parameter is 0 which indicates Group 3 1-D
|
||||||
|
# encoding. We set it to -1 because we want Group 4 encoding.
|
||||||
decodeparms[PdfName.K] = -1
|
decodeparms[PdfName.K] = -1
|
||||||
decodeparms[PdfName.BlackIs1] = PdfObject('true')
|
if inverted:
|
||||||
|
decodeparms[PdfName.BlackIs1] = PdfObject('false')
|
||||||
|
else:
|
||||||
|
decodeparms[PdfName.BlackIs1] = PdfObject('true')
|
||||||
decodeparms[PdfName.Columns] = imgwidthpx
|
decodeparms[PdfName.Columns] = imgwidthpx
|
||||||
decodeparms[PdfName.Rows] = imgheightpx
|
decodeparms[PdfName.Rows] = imgheightpx
|
||||||
image[PdfName.DecodeParms] = [decodeparms]
|
image[PdfName.DecodeParms] = [decodeparms]
|
||||||
|
@ -685,11 +690,32 @@ def get_imgmetadata(imgdata, imgformat, default_dpi, colorspace, rawdata=None):
|
||||||
return (color, ndpi, imgwidthpx, imgheightpx)
|
return (color, ndpi, imgwidthpx, imgheightpx)
|
||||||
|
|
||||||
|
|
||||||
|
def ccitt_payload_location_from_pil(img):
    """Locate the raw CCITT Group 4 payload inside a single-strip TIFF.

    img is an open PIL image backed by a TIFF file; only its ``info``
    mapping and its ``tag_v2`` tag directory are consulted.

    Returns a tuple ``(offset, length)``: the byte offset of the one and
    only data strip inside the TIFF file and the size of that strip in
    bytes.

    Raises ValueError if the image is not (or not known to be) compressed
    with CCITT Group 4, and NotImplementedError if the image data is split
    over more than one strip.
    """
    # If Pillow is passed an invalid compression argument it will ignore it;
    # make sure the image actually got compressed. .get() is used so that an
    # image without any compression information is rejected with the same
    # ValueError instead of leaking a KeyError.
    compression = img.info.get('compression')
    if compression != 'group4':
        raise ValueError(
            "Image not compressed with CCITT Group 4 but with: %s"
            % compression)

    # Read the TIFF tags to find the offset(s) of the compressed data strips.
    # ROWSPERSTRIP is deliberately not read here: its value is not needed and
    # the tag is optional, so looking it up could fail on valid files.
    strip_offsets = img.tag_v2[TiffImagePlugin.STRIPOFFSETS]
    strip_bytes = img.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS]

    # PIL always seems to create a single strip even for very large TIFFs when
    # it saves images, so assume we only have to read a single strip.
    # A test ~10 GPixel image was still encoded as a single strip. Just to be
    # safe, throw an error if there is more than one offset or byte count.
    if len(strip_offsets) != 1 or len(strip_bytes) != 1:
        raise NotImplementedError("Transcoding multiple strips not supported")

    (offset, ), (length, ) = strip_offsets, strip_bytes

    return offset, length
|
||||||
|
|
||||||
|
|
||||||
def transcode_monochrome(imgdata):
|
def transcode_monochrome(imgdata):
|
||||||
"""Convert the open PIL.Image imgdata to compressed CCITT Group4 data"""
|
"""Convert the open PIL.Image imgdata to compressed CCITT Group4 data"""
|
||||||
|
|
||||||
from PIL import TiffImagePlugin
|
|
||||||
|
|
||||||
logging.debug("Converting monochrome to CCITT Group4")
|
logging.debug("Converting monochrome to CCITT Group4")
|
||||||
|
|
||||||
# Convert the image to Group 4 in memory. If libtiff is not installed and
|
# Convert the image to Group 4 in memory. If libtiff is not installed and
|
||||||
|
@ -707,27 +733,11 @@ def transcode_monochrome(imgdata):
|
||||||
newimgio.seek(0)
|
newimgio.seek(0)
|
||||||
newimg = Image.open(newimgio)
|
newimg = Image.open(newimgio)
|
||||||
|
|
||||||
# If Pillow is passed an invalid compression argument it will ignore it;
|
offset, length = ccitt_payload_location_from_pil(newimg)
|
||||||
# make sure the image actually got compressed.
|
|
||||||
if newimg.info['compression'] != 'group4':
|
|
||||||
raise ValueError("Image not compressed as expected")
|
|
||||||
|
|
||||||
# Read the TIFF tags to find the offset(s) of the compressed data strips.
|
newimgio.seek(offset)
|
||||||
strip_offsets = newimg.tag_v2[TiffImagePlugin.STRIPOFFSETS]
|
return newimgio.read(length)
|
||||||
strip_bytes = newimg.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS]
|
|
||||||
rows_per_strip = newimg.tag_v2[TiffImagePlugin.ROWSPERSTRIP]
|
|
||||||
|
|
||||||
# PIL always seems to create a single strip even for very large TIFFs when
|
|
||||||
# it saves images, so assume we only have to read a single strip.
|
|
||||||
# A test ~10 GPixel image was still encoded as a single strip. Just to be
|
|
||||||
# safe check throw an error if there is more than one offset.
|
|
||||||
if len(strip_offsets) > 1:
|
|
||||||
raise NotImplementedError("Transcoding multiple strips not supported")
|
|
||||||
|
|
||||||
newimgio.seek(strip_offsets[0])
|
|
||||||
ccittdata = newimgio.read(strip_bytes[0])
|
|
||||||
|
|
||||||
return ccittdata
|
|
||||||
|
|
||||||
def parse_png(rawdata):
|
def parse_png(rawdata):
|
||||||
pngidat = b""
|
pngidat = b""
|
||||||
|
@ -786,7 +796,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
|
||||||
if color == Colorspace['RGBA']:
|
if color == Colorspace['RGBA']:
|
||||||
raise JpegColorspaceError("jpeg can't have an alpha channel")
|
raise JpegColorspaceError("jpeg can't have an alpha channel")
|
||||||
im.close()
|
im.close()
|
||||||
return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [])]
|
return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [], False)]
|
||||||
|
|
||||||
# We can directly embed the IDAT chunk of PNG images if the PNG is not
|
# We can directly embed the IDAT chunk of PNG images if the PNG is not
|
||||||
# interlaced
|
# interlaced
|
||||||
|
@ -799,7 +809,27 @@ def read_images(rawdata, colorspace, first_frame_only=False):
|
||||||
color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
|
color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
|
||||||
imgdata, imgformat, default_dpi, colorspace, rawdata)
|
imgdata, imgformat, default_dpi, colorspace, rawdata)
|
||||||
pngidat, palette = parse_png(rawdata)
|
pngidat, palette = parse_png(rawdata)
|
||||||
return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx, palette)]
|
im.close()
|
||||||
|
return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx, palette, False)]
|
||||||
|
|
||||||
|
# We can directly copy the data out of a CCITT Group 4 encoded TIFF, if it
|
||||||
|
# only contains a single strip
|
||||||
|
if imgformat == ImageFormat.TIFF \
|
||||||
|
and imgdata.info['compression'] == "group4" \
|
||||||
|
and len(imgdata.tag_v2[TiffImagePlugin.STRIPOFFSETS]) == 1:
|
||||||
|
photo = imgdata.tag_v2[TiffImagePlugin.PHOTOMETRIC_INTERPRETATION]
|
||||||
|
inverted = False
|
||||||
|
if photo == 0:
|
||||||
|
inverted = True
|
||||||
|
elif photo != 1:
|
||||||
|
raise ValueError("unsupported photometric interpretation for group4 tiff: %d" % photo)
|
||||||
|
color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
|
||||||
|
imgdata, imgformat, default_dpi, colorspace, rawdata)
|
||||||
|
offset, length = ccitt_payload_location_from_pil(imgdata)
|
||||||
|
im.seek(offset)
|
||||||
|
rawdata = im.read(length)
|
||||||
|
im.close()
|
||||||
|
return [(color, ndpi, ImageFormat.CCITTGroup4, rawdata, imgwidthpx, imgheightpx, [], inverted)]
|
||||||
|
|
||||||
# Everything else has to be encoded
|
# Everything else has to be encoded
|
||||||
|
|
||||||
|
@ -826,7 +856,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
|
||||||
ccittdata = transcode_monochrome(imgdata)
|
ccittdata = transcode_monochrome(imgdata)
|
||||||
imgformat = ImageFormat.CCITTGroup4
|
imgformat = ImageFormat.CCITTGroup4
|
||||||
result.append((color, ndpi, imgformat, ccittdata,
|
result.append((color, ndpi, imgformat, ccittdata,
|
||||||
imgwidthpx, imgheightpx, []))
|
imgwidthpx, imgheightpx, [], False))
|
||||||
img_page_count += 1
|
img_page_count += 1
|
||||||
continue
|
continue
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -845,7 +875,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
|
||||||
if color in [Colorspace.CMYK, Colorspace["CMYK;I"]]:
|
if color in [Colorspace.CMYK, Colorspace["CMYK;I"]]:
|
||||||
imggz = zlib.compress(newimg.tobytes())
|
imggz = zlib.compress(newimg.tobytes())
|
||||||
result.append((color, ndpi, imgformat, imggz, imgwidthpx,
|
result.append((color, ndpi, imgformat, imggz, imgwidthpx,
|
||||||
imgheightpx, []))
|
imgheightpx, [], False))
|
||||||
else:
|
else:
|
||||||
# cheapo version to retrieve a PNG encoding of the payload is to
|
# cheapo version to retrieve a PNG encoding of the payload is to
|
||||||
# just save it with PIL. In the future this could be replaced by
|
# just save it with PIL. In the future this could be replaced by
|
||||||
|
@ -855,7 +885,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
|
||||||
pngidat, palette = parse_png(pngbuffer.getvalue())
|
pngidat, palette = parse_png(pngbuffer.getvalue())
|
||||||
imgformat = ImageFormat.PNG
|
imgformat = ImageFormat.PNG
|
||||||
result.append((color, ndpi, imgformat, pngidat, imgwidthpx,
|
result.append((color, ndpi, imgformat, pngidat, imgwidthpx,
|
||||||
imgheightpx, palette))
|
imgheightpx, palette, False))
|
||||||
img_page_count += 1
|
img_page_count += 1
|
||||||
# the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
|
# the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
|
||||||
# close() method
|
# close() method
|
||||||
|
@ -1170,8 +1200,8 @@ def convert(*images, **kwargs):
|
||||||
# name so we now try treating it as raw image content
|
# name so we now try treating it as raw image content
|
||||||
rawdata = img
|
rawdata = img
|
||||||
|
|
||||||
for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, palette \
|
for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, \
|
||||||
in read_images(
|
palette, inverted in read_images(
|
||||||
rawdata, kwargs['colorspace'], kwargs['first_frame_only']):
|
rawdata, kwargs['colorspace'], kwargs['first_frame_only']):
|
||||||
pagewidth, pageheight, imgwidthpdf, imgheightpdf = \
|
pagewidth, pageheight, imgwidthpdf, imgheightpdf = \
|
||||||
kwargs['layout_fun'](imgwidthpx, imgheightpx, ndpi)
|
kwargs['layout_fun'](imgwidthpx, imgheightpx, ndpi)
|
||||||
|
@ -1195,7 +1225,8 @@ def convert(*images, **kwargs):
|
||||||
imgypdf = (pageheight - imgheightpdf)/2.0
|
imgypdf = (pageheight - imgheightpdf)/2.0
|
||||||
pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat,
|
pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat,
|
||||||
imgdata, imgwidthpdf, imgheightpdf, imgxpdf,
|
imgdata, imgwidthpdf, imgheightpdf, imgxpdf,
|
||||||
imgypdf, pagewidth, pageheight, userunit, palette)
|
imgypdf, pagewidth, pageheight, userunit,
|
||||||
|
palette, inverted)
|
||||||
|
|
||||||
if kwargs['outputstream']:
|
if kwargs['outputstream']:
|
||||||
pdf.tostream(kwargs['outputstream'])
|
pdf.tostream(kwargs['outputstream'])
|
||||||
|
|
|
@ -592,6 +592,9 @@ def test_suite():
|
||||||
if imgprops.DecodeParms:
|
if imgprops.DecodeParms:
|
||||||
if orig_img.format == 'PNG':
|
if orig_img.format == 'PNG':
|
||||||
pngidat, palette = img2pdf.parse_png(orig_imgdata)
|
pngidat, palette = img2pdf.parse_png(orig_imgdata)
|
||||||
|
elif orig_img.format == 'TIFF' and orig_img.info['compression'] == "group4":
|
||||||
|
offset, length = img2pdf.ccitt_payload_location_from_pil(orig_img)
|
||||||
|
pngidat = orig_imgdata[offset:offset+length]
|
||||||
else:
|
else:
|
||||||
pngbuffer = BytesIO()
|
pngbuffer = BytesIO()
|
||||||
orig_img.save(pngbuffer, format="png")
|
orig_img.save(pngbuffer, format="png")
|
||||||
|
|
Loading…
Reference in a new issue