Add support for JBIG2 (generic coding) #184
5 changed files with 110 additions and 20 deletions
16
README.md
16
README.md
|
@ -28,17 +28,19 @@ The following table shows how img2pdf handles different input depending on the
|
|||
input file format and image color space.
|
||||
|
||||
| Format | Colorspace | Result |
|
||||
| ------------------------------------- | ------------------------------ | ------------- |
|
||||
| ------------------------------------- | ------------------------------------ | ------------- |
|
||||
| JPEG | any | direct |
|
||||
| JPEG2000 | any | direct |
|
||||
| PNG (non-interlaced, no transparency) | any | direct |
|
||||
| TIFF (CCITT Group 4) | monochrome | direct |
|
||||
| any | any except CMYK and monochrome | PNG Paeth |
|
||||
| any | monochrome | CCITT Group 4 |
|
||||
| TIFF (CCITT Group 4) | 1-bit monochrome | direct |
|
||||
| JBIG2 (single-page generic coding) | 1-bit monochrome | direct |
|
||||
|
||||
| any | any except CMYK and 1-bit monochrome | PNG Paeth |
|
||||
| any | 1-bit monochrome | CCITT Group 4 |
|
||||
| any | CMYK | flate |
|
||||
|
||||
For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4
|
||||
encoded data, img2pdf directly embeds the image data into the PDF without
|
||||
For JPEG, JPEG2000, non-interlaced PNG, TIFF images with CCITT Group 4
|
||||
encoded data, and JBIG2 with single-page generic coding (e.g. using `jbig2enc`),
|
||||
img2pdf directly embeds the image data into the PDF without
|
||||
re-encoding it. It thus treats the PDF format merely as a container format for
|
||||
the image data. In these cases, img2pdf only increases the filesize by the size
|
||||
of the PDF container (typically around 500 to 700 bytes). Since data is only
|
||||
|
@ -47,7 +49,7 @@ solutions for these input formats.
|
|||
|
||||
For all other input types, img2pdf first has to transform the pixel data to
|
||||
make it compatible with PDF. In most cases, the PNG Paeth filter is applied to
|
||||
the pixel data. For monochrome input, CCITT Group 4 is used instead. Only for
|
||||
the pixel data. For 1-bit monochrome input, CCITT Group 4 is used instead. Only for
|
||||
CMYK input no filter is applied before finally applying flate compression.
|
||||
|
||||
Usage
|
||||
|
|
|
@ -128,7 +128,7 @@ PageOrientation = Enum("PageOrientation", "portrait landscape")
|
|||
Colorspace = Enum("Colorspace", "RGB RGBA L LA 1 CMYK CMYK;I P PA other")
|
||||
|
||||
ImageFormat = Enum(
|
||||
"ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF other"
|
||||
"ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF JBIG2 other"
|
||||
)
|
||||
|
||||
PageMode = Enum("PageMode", "none outlines thumbs")
|
||||
|
@ -918,6 +918,11 @@ class pdfdoc(object):
|
|||
self.output_version = "1.5" # jpeg2000 needs pdf 1.5
|
||||
elif imgformat is ImageFormat.CCITTGroup4:
|
||||
ofilter = [PdfName.CCITTFaxDecode]
|
||||
elif imgformat is ImageFormat.JBIG2:
|
||||
ofilter = PdfName.JBIG2Decode
|
||||
# JBIG2Decode requires PDF 1.4
|
||||
if self.output_version < "1.4":
|
||||
self.output_version = "1.4"
|
||||
else:
|
||||
ofilter = PdfName.FlateDecode
|
||||
|
||||
|
@ -1308,6 +1313,25 @@ def get_imgmetadata(
|
|||
if vdpi is None:
|
||||
vdpi = default_dpi
|
||||
ndpi = (hdpi, vdpi)
|
||||
elif imgformat == ImageFormat.JBIG2:
|
||||
imgwidthpx, imgheightpx, xres, yres = struct.unpack('>IIII', rawdata[24:40])
|
||||
INCH_PER_METER = 39.370079
|
||||
if xres == 0:
|
||||
hdpi = default_dpi
|
||||
elif xres < 1000:
|
||||
# If xres is very small, it's likely accidentally expressed in dpi instead
|
||||
# of dpm. See e.g. https://github.com/agl/jbig2enc/issues/86
|
||||
hdpi = xres
|
||||
else:
|
||||
hdpi = int(float(xres) / INCH_PER_METER)
|
||||
if yres == 0:
|
||||
vdpi = default_dpi
|
||||
elif yres < 1000:
|
||||
vdpi = yres
|
||||
else:
|
||||
vdpi = int(float(yres) / INCH_PER_METER)
|
||||
ndpi = (hdpi, vdpi)
|
||||
ics = "1"
|
||||
else:
|
||||
imgwidthpx, imgheightpx = imgdata.size
|
||||
|
||||
|
@ -1334,7 +1358,7 @@ def get_imgmetadata(
|
|||
|
||||
# GIF and PNG files with transparency are supported
|
||||
if imgformat in [ImageFormat.PNG, ImageFormat.GIF, ImageFormat.JPEG2000] and (
|
||||
ics in ["RGBA", "LA"] or "transparency" in imgdata.info
|
||||
ics in ["RGBA", "LA"] or (imgdata is not None and "transparency" in imgdata.info)
|
||||
):
|
||||
# Must check the IHDR chunk for the bit depth, because PIL would lossily
|
||||
# convert 16-bit RGBA/LA images to 8-bit.
|
||||
|
@ -1350,7 +1374,7 @@ def get_imgmetadata(
|
|||
raise AlphaChannelError(
|
||||
"Refusing to work with multiple >8bit channels."
|
||||
)
|
||||
elif ics in ["LA", "PA", "RGBA"] or "transparency" in imgdata.info:
|
||||
elif ics in ["LA", "PA", "RGBA"] or (imgdata is not None and "transparency" in imgdata.info):
|
||||
raise AlphaChannelError("This function must not be called on images with alpha")
|
||||
|
||||
# Since commit 07a96209597c5e8dfe785c757d7051ce67a980fb or release 4.1.0
|
||||
|
@ -1455,7 +1479,7 @@ def get_imgmetadata(
|
|||
logger.debug("input colorspace = %s", color.name)
|
||||
|
||||
iccp = None
|
||||
if "icc_profile" in imgdata.info:
|
||||
if imgdata is not None and "icc_profile" in imgdata.info:
|
||||
iccp = imgdata.info.get("icc_profile")
|
||||
# GIMP saves bilevel TIFF images and palette PNG images with only black and
|
||||
# white in the palette with an RGB ICC profile which is useless
|
||||
|
@ -1805,8 +1829,6 @@ def parse_miff(data):
|
|||
results.extend(parse_miff(rest[lenpal + lenimgdata :]))
|
||||
return results
|
||||
# fmt: on
|
||||
|
||||
|
||||
def read_images(
|
||||
rawdata, colorspace, first_frame_only=False, rot=None, include_thumbnails=False
|
||||
):
|
||||
|
@ -1820,7 +1842,41 @@ def read_images(
|
|||
if rawdata[:12] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
|
||||
# image is jpeg2000
|
||||
imgformat = ImageFormat.JPEG2000
|
||||
if rawdata[:14].lower() == b"id=imagemagick":
|
||||
ooBJ3u
commented
I wasn't sure why this was `if` instead of `elif`. Won't that make it so JPEG2000 still crashes? I fixed it but wanted to double-check.
|
||||
elif rawdata[:8] == b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a":
|
||||
# For now we only support single-page generic coding of JBIG2, for example as generated by
|
||||
# https://github.com/agl/jbig2enc
|
||||
#
|
||||
# In fact, you can pipe an example image like `src/tests/input/mono.png` directly into img2pdf:
|
||||
# jbig2 src/tests/input/mono.png | img2pdf -o src/tests/output/mono.png.pdf
|
||||
#
|
||||
# For this we assume that the first 13 bytes are the JBIG2 file header describing a document with one page,
|
||||
# followed by a "page information" segment describing the dimensions of that page.
|
||||
#
|
||||
# The following annotated `hexdump -C 042.jb2` shows the first 40 bytes that we inspect directly.
|
||||
# The first 24 bytes (until "||") have to match exactly, while the following 16 bytes are read by get_imgmetadata.
|
||||
#
|
||||
# 97 4a 42 32 0d 0a 1a 0a 01 00 00 00 01 00 00 00
|
||||
# \_____________________/ | \_________/ \______
|
||||
# magic-bytes org/unk pages seg-num
|
||||
#
|
||||
# 00 30 00 01 00 00 00 13 || 00 00 00 73 00 00 00 30
|
||||
# _/ | | | \_________/ || \_________/ \_________/
|
||||
# type refs page seg-size || width-px height-px
|
||||
#
|
||||
# 00 00 00 48 00 00 00 48
|
||||
# \_________/ \_________/
|
||||
# xres yres
|
||||
#
|
||||
# For more information on the data format, see:
|
||||
# * https://github.com/agl/jbig2enc/blob/ea05019/fcd14492.pdf
|
||||
# For more information about the generic coding, see:
|
||||
# * https://github.com/agl/jbig2enc/blob/ea05019/src/jbig2enc.cc#L898
|
||||
imgformat = ImageFormat.JBIG2
|
||||
if rawdata[:24] != b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a\x01\x00\x00\x00\x01\x00\x00\x00\x00\x30\x00\x01\x00\x00\x00\x13":
|
||||
raise ImageOpenError(
|
||||
"Unsupported JBIG2 format; only single-page generic coding is supported (e.g. from `jbig2enc`)"
|
||||
)
|
||||
elif rawdata[:14].lower() == b"id=imagemagick":
|
||||
phmccarty
commented
One question about the style of the code here:
Do you think it would be better to use hex instead of the character value for the 5th and 16th bytes (`1` and `3`)? IMO, I like that better for the consistency, but because the value is accurate as-is, I'm not opposed to leaving it.
ooBJ3u
commented
This is how Python prints the bytearray by default, so I figured that is fine.
phmccarty
commented
Okay, that makes sense then. No objection from me.
|
||||
# image is in MIFF format
|
||||
# this is useful for 16 bit CMYK because PNG cannot do CMYK and thus
|
||||
# we need PIL but PIL cannot do 16 bit
|
||||
|
@ -2066,6 +2122,28 @@ def read_images(
|
|||
)
|
||||
]
|
||||
|
||||
if imgformat == ImageFormat.JBIG2:
|
||||
color, ndpi, imgwidthpx, imgheightpx, rotation, iccp = get_imgmetadata(
|
||||
imgdata, imgformat, default_dpi, colorspace, rawdata, rot
|
||||
)
|
||||
streamdata = rawdata[13:] # Strip file header
|
||||
return [
|
||||
(
|
||||
color,
|
||||
ndpi,
|
||||
imgformat,
|
||||
streamdata,
|
||||
None,
|
||||
imgwidthpx,
|
||||
imgheightpx,
|
||||
[],
|
||||
False,
|
||||
1,
|
||||
rotation,
|
||||
iccp,
|
||||
)
|
||||
]
|
||||
|
||||
if imgformat == ImageFormat.MIFF:
|
||||
return parse_miff(rawdata)
|
||||
|
||||
|
|
|
@ -6987,6 +6987,11 @@ def test_general(general_input, engine):
|
|||
assert x.Root.Type == "/Catalog"
|
||||
assert sorted(x.Root.Pages.keys()) == ["/Count", "/Kids", "/Type"]
|
||||
assert x.Root.Pages.Type == "/Pages"
|
||||
if f.endswith(".jb2"):
|
||||
# PIL doesn't support .jb2, so we load the original .png, which
|
||||
# was converted to the .jb2 using `jbig2enc`.
|
||||
orig_img = Image.open(f.replace(".jb2", ".png"))
|
||||
else:
|
||||
orig_img = Image.open(f)
|
||||
for pagenum in range(len(x.Root.Pages.Kids)):
|
||||
# retrieve the original image frame that this page was
|
||||
|
@ -6995,6 +7000,8 @@ def test_general(general_input, engine):
|
|||
cur_page = x.Root.Pages.Kids[pagenum]
|
||||
|
||||
ndpi = orig_img.info.get("dpi", (96.0, 96.0))
|
||||
if ndpi[0] <= 0.001 or ndpi[1] <= 0.001:
|
||||
ndpi = (96.0, 96.0)
|
||||
# In python3, the returned dpi value for some tiff images will
|
||||
# not be an integer but a float. To make the behaviour of
|
||||
# img2pdf the same between python2 and python3, we convert that
|
||||
|
@ -7044,6 +7051,7 @@ def test_general(general_input, engine):
|
|||
"/JPXDecode",
|
||||
"/FlateDecode",
|
||||
pikepdf.Array([pikepdf.Name.CCITTFaxDecode]),
|
||||
"/JBIG2Decode",
|
||||
]
|
||||
|
||||
# test if the image has correct size
|
||||
|
@ -7053,6 +7061,8 @@ def test_general(general_input, engine):
|
|||
# verbatim into the PDF
|
||||
if imgprops.Filter in ["/DCTDecode", "/JPXDecode"]:
|
||||
assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata
|
||||
elif imgprops.Filter == "/JBIG2Decode":
|
||||
assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata[13:] # Strip file header
|
||||
elif imgprops.Filter == pikepdf.Array([pikepdf.Name.CCITTFaxDecode]):
|
||||
tiff_header = tiff_header_for_ccitt(
|
||||
int(imgprops.Width), int(imgprops.Height), int(imgprops.Length), 4
|
||||
|
|
BIN
src/tests/input/mono.jb2
Normal file
BIN
src/tests/input/mono.jb2
Normal file
Binary file not shown.
BIN
src/tests/output/mono.jb2.pdf
Normal file
BIN
src/tests/output/mono.jb2.pdf
Normal file
Binary file not shown.
Loading…
Reference in a new issue
the other entries seem to use the term
monochrome
for 1 bit per pixel images. Monochrome is also often used for greyscale images, however. See e.g. https://en.wikipedia.org/wiki/Monochrome
Bi-level is pretty standard terminology, though "binary images" or perhaps even "1-bit images" might be clearer. https://en.wikipedia.org/wiki/Binary_image
I'm fine with choosing another term, all I mean is the table should be consistent.
Apologies for the delay. I've updated the README to consistently say "1-bit monochrome" (to differentiate it from the other meaning of "monochrome", i.e. grayscale). Does this look good?