Add support for JBIG2 (generic coding) #184

Open
ooBJ3u wants to merge 2 commits from ooBJ3u/img2pdf:main into main
5 changed files with 114 additions and 20 deletions

View file

@ -27,18 +27,20 @@ software, because the raw pixel data never has to be loaded into memory.
The following table shows how img2pdf handles different input depending on the The following table shows how img2pdf handles different input depending on the
input file format and image color space. input file format and image color space.
| Format | Colorspace | Result | | Format | Colorspace | Result |
| ------------------------------------- | ------------------------------ | ------------- | | ------------------------------------- | ------------------------------------ | ------------- |
| JPEG | any | direct | | JPEG | any | direct |
| JPEG2000 | any | direct | | JPEG2000 | any | direct |
| PNG (non-interlaced, no transparency) | any | direct | | PNG (non-interlaced, no transparency) | any | direct |
| TIFF (CCITT Group 4) | monochrome | direct | | TIFF (CCITT Group 4) | 1-bit monochrome | direct |
| any | any except CMYK and monochrome | PNG Paeth | | JBIG2 (single-page generic coding) | 1-bit monochrome | direct |
| any | monochrome | CCITT Group 4 | | any | any except CMYK and 1-bit monochrome | PNG Paeth |
| any | CMYK | flate | | any | 1-bit monochrome | CCITT Group 4 |
| any | CMYK | flate |
For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4 For JPEG, JPEG2000, non-interlaced PNG, TIFF images with CCITT Group 4
encoded data, img2pdf directly embeds the image data into the PDF without encoded data, and JBIG2 with single-page generic coding (e.g. using `jbig2enc`),
img2pdf directly embeds the image data into the PDF without
re-encoding it. It thus treats the PDF format merely as a container format for re-encoding it. It thus treats the PDF format merely as a container format for
the image data. In these cases, img2pdf only increases the filesize by the size the image data. In these cases, img2pdf only increases the filesize by the size
of the PDF container (typically around 500 to 700 bytes). Since data is only of the PDF container (typically around 500 to 700 bytes). Since data is only
@ -47,7 +49,7 @@ solutions for these input formats.
For all other input types, img2pdf first has to transform the pixel data to For all other input types, img2pdf first has to transform the pixel data to
make it compatible with PDF. In most cases, the PNG Paeth filter is applied to make it compatible with PDF. In most cases, the PNG Paeth filter is applied to
the pixel data. For monochrome input, CCITT Group 4 is used instead. Only for the pixel data. For 1-bit monochrome input, CCITT Group 4 is used instead. Only for
CMYK input no filter is applied before finally applying flate compression. CMYK input no filter is applied before finally applying flate compression.
Usage Usage

View file

@ -128,7 +128,7 @@ PageOrientation = Enum("PageOrientation", "portrait landscape")
Colorspace = Enum("Colorspace", "RGB RGBA L LA 1 CMYK CMYK;I P PA other") Colorspace = Enum("Colorspace", "RGB RGBA L LA 1 CMYK CMYK;I P PA other")
ImageFormat = Enum( ImageFormat = Enum(
"ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF other" "ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF JBIG2 other"
) )
PageMode = Enum("PageMode", "none outlines thumbs") PageMode = Enum("PageMode", "none outlines thumbs")
@ -918,6 +918,11 @@ class pdfdoc(object):
self.output_version = "1.5" # jpeg2000 needs pdf 1.5 self.output_version = "1.5" # jpeg2000 needs pdf 1.5
elif imgformat is ImageFormat.CCITTGroup4: elif imgformat is ImageFormat.CCITTGroup4:
ofilter = [PdfName.CCITTFaxDecode] ofilter = [PdfName.CCITTFaxDecode]
elif imgformat is ImageFormat.JBIG2:
ofilter = PdfName.JBIG2Decode
# JBIG2Decode requires PDF 1.4
if self.output_version < "1.4":
self.output_version = "1.4"
else: else:
ofilter = PdfName.FlateDecode ofilter = PdfName.FlateDecode
@ -1308,6 +1313,25 @@ def get_imgmetadata(
if vdpi is None: if vdpi is None:
vdpi = default_dpi vdpi = default_dpi
ndpi = (hdpi, vdpi) ndpi = (hdpi, vdpi)
elif imgformat == ImageFormat.JBIG2:
imgwidthpx, imgheightpx, xres, yres = struct.unpack('>IIII', rawdata[24:40])
INCH_PER_METER = 39.370079
if xres == 0:
hdpi = default_dpi
elif xres < 1000:
# If xres is very small, it's likely accidentally expressed in dpi instead
# of dpm. See e.g. https://github.com/agl/jbig2enc/issues/86
hdpi = xres
else:
hdpi = int(float(xres) / INCH_PER_METER)
if yres == 0:
vdpi = default_dpi
elif yres < 1000:
vdpi = yres
else:
vdpi = int(float(yres) / INCH_PER_METER)
ndpi = (hdpi, vdpi)
ics = "1"
else: else:
imgwidthpx, imgheightpx = imgdata.size imgwidthpx, imgheightpx = imgdata.size
@ -1334,7 +1358,7 @@ def get_imgmetadata(
# GIF and PNG files with transparency are supported # GIF and PNG files with transparency are supported
if imgformat in [ImageFormat.PNG, ImageFormat.GIF, ImageFormat.JPEG2000] and ( if imgformat in [ImageFormat.PNG, ImageFormat.GIF, ImageFormat.JPEG2000] and (
ics in ["RGBA", "LA"] or "transparency" in imgdata.info ics in ["RGBA", "LA"] or (imgdata is not None and "transparency" in imgdata.info)
): ):
# Must check the IHDR chunk for the bit depth, because PIL would lossily # Must check the IHDR chunk for the bit depth, because PIL would lossily
# convert 16-bit RGBA/LA images to 8-bit. # convert 16-bit RGBA/LA images to 8-bit.
@ -1350,7 +1374,7 @@ def get_imgmetadata(
raise AlphaChannelError( raise AlphaChannelError(
"Refusing to work with multiple >8bit channels." "Refusing to work with multiple >8bit channels."
) )
elif ics in ["LA", "PA", "RGBA"] or "transparency" in imgdata.info: elif ics in ["LA", "PA", "RGBA"] or (imgdata is not None and "transparency" in imgdata.info):
raise AlphaChannelError("This function must not be called on images with alpha") raise AlphaChannelError("This function must not be called on images with alpha")
# Since commit 07a96209597c5e8dfe785c757d7051ce67a980fb or release 4.1.0 # Since commit 07a96209597c5e8dfe785c757d7051ce67a980fb or release 4.1.0
@ -1455,7 +1479,7 @@ def get_imgmetadata(
logger.debug("input colorspace = %s", color.name) logger.debug("input colorspace = %s", color.name)
iccp = None iccp = None
if "icc_profile" in imgdata.info: if imgdata is not None and "icc_profile" in imgdata.info:
iccp = imgdata.info.get("icc_profile") iccp = imgdata.info.get("icc_profile")
# GIMP saves bilevel TIFF images and palette PNG images with only black and # GIMP saves bilevel TIFF images and palette PNG images with only black and
# white in the palette with an RGB ICC profile which is useless # white in the palette with an RGB ICC profile which is useless
@ -1805,8 +1829,6 @@ def parse_miff(data):
results.extend(parse_miff(rest[lenpal + lenimgdata :])) results.extend(parse_miff(rest[lenpal + lenimgdata :]))
return results return results
# fmt: on # fmt: on
def read_images( def read_images(
rawdata, colorspace, first_frame_only=False, rot=None, include_thumbnails=False rawdata, colorspace, first_frame_only=False, rot=None, include_thumbnails=False
): ):
@ -1820,7 +1842,45 @@ def read_images(
if rawdata[:12] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A": if rawdata[:12] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
# image is jpeg2000 # image is jpeg2000
imgformat = ImageFormat.JPEG2000 imgformat = ImageFormat.JPEG2000
if rawdata[:14].lower() == b"id=imagemagick": elif rawdata[:8] == b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a":
# For now we only support single-page generic coding of JBIG2, for example as generated by
# https://github.com/agl/jbig2enc
#
# In fact, you can pipe an example image like `src/tests/input/mono.png` directly into img2pdf:
# jbig2 src/tests/input/mono.png | img2pdf -o src/tests/output/mono.png.pdf
#
# For this we assume that the first 13 bytes are the JBIG2 file header describing a document with one page,
# followed by a "page information" segment describing the dimensions of that page.
#
# The following annotated `hexdump -C 042.jb2` shows the first 40 bytes that we inspect directly.
# The first 24 bytes (until "||") have to match exactly, while the following 16 bytes are read by get_imgmetadata.
#
# 97 4a 42 32 0d 0a 1a 0a 01 00 00 00 01 00 00 00
# \_____________________/ | \_________/ \______
# magic-bytes org/unk pages seg-num
#
# 00 30 00 01 00 00 00 13 || 00 00 00 73 00 00 00 30
# _/ | | | \_________/ || \_________/ \_________/
# type refs page seg-size || width-px height-px
#
# 00 00 00 48 00 00 00 48
# \_________/ \_________/
# xres yres
#
# For more information on the data format, see:
# * https://github.com/agl/jbig2enc/blob/ea05019/fcd14492.pdf
# For more information about the generic coding, see:
# * https://github.com/agl/jbig2enc/blob/ea05019/src/jbig2enc.cc#L898
imgformat = ImageFormat.JBIG2
if rawdata[:24] != b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a\x01\x00\x00\x00\x01\x00\x00\x00\x00\x30\x00\x01\x00\x00\x00\x13":
raise ImageOpenError(
"Unsupported JBIG2 format; only single-page generic coding is supported (e.g. from `jbig2enc`)."
)
if rawdata[-22:] != b"\x00\x00\x00\x021\x00\x01\x00\x00\x00\x00\x00\x00\x00\x033\x00\x01\x00\x00\x00\x00":
Review

One question about the style of the code here:

Do you think it would be better to use hex instead of the character value for the 5th and 16th bytes (1 and 3)? IMO, I like that better for the consistency, but because the value is accurate as-is, I'm not opposed to leaving it.

One question about the style of the code here: Do you think it would be better to use hex instead of the character value for the 5th and 16th bytes (`1` and `3`)? IMO, I like that better for the consistency, but because the value is accurate as-is, I'm not opposed to leaving it.
Review

This is how Python prints the bytearray by default, so I figured that is fine.

This is how Python prints the bytearray by default, so I figured that is fine.
Review

Okay, that makes sense then. No objection from me.

Okay, that makes sense then. No objection from me.
raise ImageOpenError(
"Unsupported JBIG2 format; we expect end-of-page and end-of-file segments at the end (e.g. from `jbig2enc`)."
)
elif rawdata[:14].lower() == b"id=imagemagick":
# image is in MIFF format # image is in MIFF format
# this is useful for 16 bit CMYK because PNG cannot do CMYK and thus # this is useful for 16 bit CMYK because PNG cannot do CMYK and thus
# we need PIL but PIL cannot do 16 bit # we need PIL but PIL cannot do 16 bit
@ -2066,6 +2126,28 @@ def read_images(
) )
] ]
if imgformat == ImageFormat.JBIG2:
color, ndpi, imgwidthpx, imgheightpx, rotation, iccp = get_imgmetadata(
imgdata, imgformat, default_dpi, colorspace, rawdata, rot
)
streamdata = rawdata[13:-22] # Strip file header and footer
return [
(
color,
ndpi,
imgformat,
streamdata,
None,
imgwidthpx,
imgheightpx,
[],
False,
1,
rotation,
iccp,
)
]
if imgformat == ImageFormat.MIFF: if imgformat == ImageFormat.MIFF:
return parse_miff(rawdata) return parse_miff(rawdata)

View file

@ -6987,7 +6987,12 @@ def test_general(general_input, engine):
assert x.Root.Type == "/Catalog" assert x.Root.Type == "/Catalog"
assert sorted(x.Root.Pages.keys()) == ["/Count", "/Kids", "/Type"] assert sorted(x.Root.Pages.keys()) == ["/Count", "/Kids", "/Type"]
assert x.Root.Pages.Type == "/Pages" assert x.Root.Pages.Type == "/Pages"
orig_img = Image.open(f) if f.endswith(".jb2"):
# PIL doesn't support .jb2, so we load the original .png, which
# was converted to the .jb2 using `jbig2enc`.
orig_img = Image.open(f.replace(".jb2", ".png"))
else:
orig_img = Image.open(f)
for pagenum in range(len(x.Root.Pages.Kids)): for pagenum in range(len(x.Root.Pages.Kids)):
# retrieve the original image frame that this page was # retrieve the original image frame that this page was
# generated from # generated from
@ -6995,6 +7000,8 @@ def test_general(general_input, engine):
cur_page = x.Root.Pages.Kids[pagenum] cur_page = x.Root.Pages.Kids[pagenum]
ndpi = orig_img.info.get("dpi", (96.0, 96.0)) ndpi = orig_img.info.get("dpi", (96.0, 96.0))
if ndpi[0] <= 0.001 or ndpi[1] <= 0.001:
ndpi = (96.0, 96.0)
# In python3, the returned dpi value for some tiff images will # In python3, the returned dpi value for some tiff images will
# not be an integer but a float. To make the behaviour of # not be an integer but a float. To make the behaviour of
# img2pdf the same between python2 and python3, we convert that # img2pdf the same between python2 and python3, we convert that
@ -7044,6 +7051,7 @@ def test_general(general_input, engine):
"/JPXDecode", "/JPXDecode",
"/FlateDecode", "/FlateDecode",
pikepdf.Array([pikepdf.Name.CCITTFaxDecode]), pikepdf.Array([pikepdf.Name.CCITTFaxDecode]),
"/JBIG2Decode",
] ]
# test if the image has correct size # test if the image has correct size
@ -7053,6 +7061,8 @@ def test_general(general_input, engine):
# verbatim into the PDF # verbatim into the PDF
if imgprops.Filter in ["/DCTDecode", "/JPXDecode"]: if imgprops.Filter in ["/DCTDecode", "/JPXDecode"]:
assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata
elif imgprops.Filter == "/JBIG2Decode":
assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata[13:-22] # Strip file header and footer.
elif imgprops.Filter == pikepdf.Array([pikepdf.Name.CCITTFaxDecode]): elif imgprops.Filter == pikepdf.Array([pikepdf.Name.CCITTFaxDecode]):
tiff_header = tiff_header_for_ccitt( tiff_header = tiff_header_for_ccitt(
int(imgprops.Width), int(imgprops.Height), int(imgprops.Length), 4 int(imgprops.Width), int(imgprops.Height), int(imgprops.Length), 4

BIN
src/tests/input/mono.jb2 Normal file

Binary file not shown.

Binary file not shown.