From e2369eb59ac0a197f211e879965f7b0e12450fbe Mon Sep 17 00:00:00 2001
From: ooBJ3u <ooBJ3u4z4250OWhL7aOB@proton.me>
Date: Wed, 25 Sep 2024 00:00:00 +0000
Subject: [PATCH] Add support for JBIG2 (generic coding)

Implements the proposal detailed at
https://gitlab.mister-muffin.de/josch/img2pdf/issues/112#issuecomment-1304

This is a limited implementation of JBIG2, which can be extended to
support multiple pages, symbol tables, and other features of the format
in the future.

Added a test case based on mono.tif.

Updated the README.md based on
https://gitlab.mister-muffin.de/josch/img2pdf/pulls/184/files#issuecomment-1334
---
 README.md                     |  26 +++++-----
 src/img2pdf.py                |  92 +++++++++++++++++++++++++++++++---
 src/img2pdf_test.py           |  12 ++++-
 src/tests/input/mono.jb2      | Bin 0 -> 205 bytes
 src/tests/output/mono.jb2.pdf | Bin 0 -> 1171 bytes
 5 files changed, 110 insertions(+), 20 deletions(-)
 create mode 100644 src/tests/input/mono.jb2
 create mode 100644 src/tests/output/mono.jb2.pdf

diff --git a/README.md b/README.md
index 8d33a36..2dca1d0 100644
--- a/README.md
+++ b/README.md
@@ -27,18 +27,20 @@ software, because the raw pixel data never has to be loaded into memory.
 The following table shows how img2pdf handles different input depending on the
 input file format and image color space.
 
-| Format                                | Colorspace                     | Result        |
-| ------------------------------------- | ------------------------------ | ------------- |
-| JPEG                                  | any                            | direct        |
-| JPEG2000                              | any                            | direct        |
-| PNG (non-interlaced, no transparency) | any                            | direct        |
-| TIFF (CCITT Group 4)                  | monochrome                     | direct        |
-| any                                   | any except CMYK and monochrome | PNG Paeth     |
-| any                                   | monochrome                     | CCITT Group 4 |
-| any                                   | CMYK                           | flate         |
+| Format                                | Colorspace                           | Result        |
+| ------------------------------------- | ------------------------------------ | ------------- |
+| JPEG                                  | any                                  | direct        |
+| JPEG2000                              | any                                  | direct        |
+| PNG (non-interlaced, no transparency) | any                                  | direct        |
+| TIFF (CCITT Group 4)                  | 1-bit monochrome                     | direct        |
+| JBIG2 (single-page generic coding)    | 1-bit monochrome                     | direct        |
+| any                                   | any except CMYK and 1-bit monochrome | PNG Paeth     |
+| any                                   | 1-bit monochrome                     | CCITT Group 4 |
+| any                                   | CMYK                                 | flate         |
 
-For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4
-encoded data, img2pdf directly embeds the image data into the PDF without
+For JPEG, JPEG2000, non-interlaced PNG, TIFF images with CCITT Group 4
+encoded data, and JBIG2 with single-page generic coding (e.g. using `jbig2enc`),
+img2pdf directly embeds the image data into the PDF without
 re-encoding it. It thus treats the PDF format merely as a container format for
 the image data. In these cases, img2pdf only increases the filesize by the size
 of the PDF container (typically around 500 to 700 bytes). Since data is only
@@ -47,7 +49,7 @@ solutions for these input formats.
 
 For all other input types, img2pdf first has to transform the pixel data to
 make it compatible with PDF. In most cases, the PNG Paeth filter is applied to
-the pixel data. For monochrome input, CCITT Group 4 is used instead. Only for
+the pixel data. For 1-bit monochrome input, CCITT Group 4 is used instead. Only for
 CMYK input no filter is applied before finally applying flate compression.
 
 Usage
diff --git a/src/img2pdf.py b/src/img2pdf.py
index f89670b..2c90c20 100755
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -128,7 +128,7 @@ PageOrientation = Enum("PageOrientation", "portrait landscape")
 Colorspace = Enum("Colorspace", "RGB RGBA L LA 1 CMYK CMYK;I P PA other")
 
 ImageFormat = Enum(
-    "ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF other"
+    "ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF JBIG2 other"
 )
 
 PageMode = Enum("PageMode", "none outlines thumbs")
@@ -918,6 +918,11 @@ class pdfdoc(object):
             self.output_version = "1.5"  # jpeg2000 needs pdf 1.5
         elif imgformat is ImageFormat.CCITTGroup4:
             ofilter = [PdfName.CCITTFaxDecode]
+        elif imgformat is ImageFormat.JBIG2:
+            ofilter = PdfName.JBIG2Decode
+            # JBIG2Decode requires PDF 1.4
+            if self.output_version < "1.4":
+                self.output_version = "1.4"
         else:
             ofilter = PdfName.FlateDecode
 
@@ -1308,6 +1313,25 @@ def get_imgmetadata(
         if vdpi is None:
             vdpi = default_dpi
         ndpi = (hdpi, vdpi)
+    elif imgformat == ImageFormat.JBIG2:
+        imgwidthpx, imgheightpx, xres, yres = struct.unpack('>IIII', rawdata[24:40])
+        INCH_PER_METER = 39.370079
+        if xres == 0:
+            hdpi = default_dpi
+        elif xres < 1000:
+            # If xres is very small, it's likely accidentally expressed in dpi instead
+            # of dpm. See e.g. https://github.com/agl/jbig2enc/issues/86
+            hdpi = xres
+        else:
+            hdpi = int(float(xres) / INCH_PER_METER)
+        if yres == 0:
+            vdpi = default_dpi
+        elif yres < 1000:
+            vdpi = yres
+        else:
+            vdpi = int(float(yres) / INCH_PER_METER)
+        ndpi = (hdpi, vdpi)
+        ics = "1"
     else:
         imgwidthpx, imgheightpx = imgdata.size
 
@@ -1334,7 +1358,7 @@ def get_imgmetadata(
 
     # GIF and PNG files with transparency are supported
     if imgformat in [ImageFormat.PNG, ImageFormat.GIF, ImageFormat.JPEG2000] and (
-        ics in ["RGBA", "LA"] or "transparency" in imgdata.info
+        ics in ["RGBA", "LA"] or (imgdata is not None and "transparency" in imgdata.info)
     ):
         # Must check the IHDR chunk for the bit depth, because PIL would lossily
         # convert 16-bit RGBA/LA images to 8-bit.
@@ -1350,7 +1374,7 @@ def get_imgmetadata(
                 raise AlphaChannelError(
                     "Refusing to work with multiple >8bit channels."
                 )
-    elif ics in ["LA", "PA", "RGBA"] or "transparency" in imgdata.info:
+    elif ics in ["LA", "PA", "RGBA"] or (imgdata is not None and "transparency" in imgdata.info):
         raise AlphaChannelError("This function must not be called on images with alpha")
 
     # Since commit 07a96209597c5e8dfe785c757d7051ce67a980fb or release 4.1.0
@@ -1455,7 +1479,7 @@ def get_imgmetadata(
         logger.debug("input colorspace = %s", color.name)
 
     iccp = None
-    if "icc_profile" in imgdata.info:
+    if imgdata is not None and "icc_profile" in imgdata.info:
         iccp = imgdata.info.get("icc_profile")
     # GIMP saves bilevel TIFF images and palette PNG images with only black and
     # white in the palette with an RGB ICC profile which is useless
@@ -1805,8 +1829,6 @@ def parse_miff(data):
                 results.extend(parse_miff(rest[lenpal + lenimgdata :]))
     return results
 # fmt: on
-
-
 def read_images(
     rawdata, colorspace, first_frame_only=False, rot=None, include_thumbnails=False
 ):
@@ -1820,7 +1842,41 @@ def read_images(
         if rawdata[:12] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
             # image is jpeg2000
             imgformat = ImageFormat.JPEG2000
-        if rawdata[:14].lower() == b"id=imagemagick":
+        elif rawdata[:8] == b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a":
+            # For now we only support single-page generic coding of JBIG2, for example as generated by
+            # https://github.com/agl/jbig2enc
+            #
+            # In fact, you can pipe an example image like `src/tests/input/mono.png` directly into img2pdf:
+            #   jbig2 src/tests/input/mono.png | img2pdf -o src/tests/output/mono.png.pdf
+            #
+            # For this we assume that the first 13 bytes are the JBIG2 file header describing a document with one page,
+            # followed by a "page information" segment describing the dimensions of that page.
+            # 
+            # The following annotated `hexdump -C 042.jb2` shows the first 40 bytes that we inspect directly.
+            # The first 24 bytes (until "||") have to match exactly, while the following 16 bytes are read by get_imgmetadata.
+            #
+            # 97 4a 42 32 0d 0a 1a 0a  01 00 00 00 01 00 00 00
+            # \_____________________/  |  \_________/ \______
+            #       magic-bytes     org/unk  pages     seg-num
+            #
+            # 00 30 00 01 00 00 00 13  || 00 00 00 73 00 00 00 30
+            # _/ |  |   | \_________/  || \_________/ \_________/
+            # type refs page seg-size  ||  width-px    height-px
+            #
+            # 00 00 00 48 00 00 00 48
+            # \_________/ \_________/
+            #     xres       yres
+            # 
+            # For more information on the data format, see:
+            # * https://github.com/agl/jbig2enc/blob/ea05019/fcd14492.pdf
+            # For more information about the generic coding, see:
+            # * https://github.com/agl/jbig2enc/blob/ea05019/src/jbig2enc.cc#L898
+            imgformat = ImageFormat.JBIG2
+            if rawdata[:24] != b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a\x01\x00\x00\x00\x01\x00\x00\x00\x00\x30\x00\x01\x00\x00\x00\x13":
+                raise ImageOpenError(
+                    "Unsupported JBIG2 format; only single-page generic coding is supported (e.g. from `jbig2enc`)"
+                )
+        elif rawdata[:14].lower() == b"id=imagemagick":
             # image is in MIFF format
             # this is useful for 16 bit CMYK because PNG cannot do CMYK and thus
             # we need PIL but PIL cannot do 16 bit
@@ -2066,6 +2122,28 @@ def read_images(
                     )
                 ]
 
+    if imgformat == ImageFormat.JBIG2:
+        color, ndpi, imgwidthpx, imgheightpx, rotation, iccp = get_imgmetadata(
+            imgdata, imgformat, default_dpi, colorspace, rawdata, rot
+        )
+        streamdata = rawdata[13:] # Strip file header
+        return [
+            (
+                color,
+                ndpi,
+                imgformat,
+                streamdata,
+                None,
+                imgwidthpx,
+                imgheightpx,
+                [],
+                False,
+                1,
+                rotation,
+                iccp,
+            )
+        ]
+
     if imgformat == ImageFormat.MIFF:
         return parse_miff(rawdata)
 
diff --git a/src/img2pdf_test.py b/src/img2pdf_test.py
index 5d9ce85..9238c75 100755
--- a/src/img2pdf_test.py
+++ b/src/img2pdf_test.py
@@ -6987,7 +6987,12 @@ def test_general(general_input, engine):
     assert x.Root.Type == "/Catalog"
     assert sorted(x.Root.Pages.keys()) == ["/Count", "/Kids", "/Type"]
     assert x.Root.Pages.Type == "/Pages"
-    orig_img = Image.open(f)
+    if f.endswith(".jb2"):
+        # PIL doesn't support .jb2, so we load the original .png, which
+        # was converted to the .jb2 using `jbig2enc`.
+        orig_img = Image.open(f.replace(".jb2", ".png"))
+    else:
+        orig_img = Image.open(f)
     for pagenum in range(len(x.Root.Pages.Kids)):
         # retrieve the original image frame that this page was
         # generated from
@@ -6995,6 +7000,8 @@ def test_general(general_input, engine):
         cur_page = x.Root.Pages.Kids[pagenum]
 
         ndpi = orig_img.info.get("dpi", (96.0, 96.0))
+        if ndpi[0] <= 0.001 or ndpi[1] <= 0.001:
+            ndpi = (96.0, 96.0)
         # In python3, the returned dpi value for some tiff images will
         # not be an integer but a float. To make the behaviour of
         # img2pdf the same between python2 and python3, we convert that
@@ -7044,6 +7051,7 @@ def test_general(general_input, engine):
             "/JPXDecode",
             "/FlateDecode",
             pikepdf.Array([pikepdf.Name.CCITTFaxDecode]),
+            "/JBIG2Decode",
         ]
 
         # test if the image has correct size
@@ -7053,6 +7061,8 @@ def test_general(general_input, engine):
         # verbatim into the PDF
         if imgprops.Filter in ["/DCTDecode", "/JPXDecode"]:
             assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata
+        elif imgprops.Filter == "/JBIG2Decode":
+            assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata[13:] # Strip file header
         elif imgprops.Filter == pikepdf.Array([pikepdf.Name.CCITTFaxDecode]):
             tiff_header = tiff_header_for_ccitt(
                 int(imgprops.Width), int(imgprops.Height), int(imgprops.Length), 4
diff --git a/src/tests/input/mono.jb2 b/src/tests/input/mono.jb2
new file mode 100644
index 0000000000000000000000000000000000000000..2f236f6ed9c33a9d7edff363d5247acbe143a24d
GIT binary patch
literal 205
zcmbR4<z&RmCB?<az`y{)3<h9I7|1FH^B5RBfV2l#j)74PEZc}A3uJ--^Z&p9ng0Fz
zw_4BGbFaX+?ems}ZrN46cxpqR3IF2*`{bBjdzQ+uZ=8QQ^lNwE)|&jimCtV<30t=2
zh+f>M?TmhEb81fi)6nE&&kKGm+Vx)Kn1q_coY~6pK8e%hma%Vge7Mx%&y7Hp%e$||
V&ddn@zXoU{lOe>xAj^#*OaS8}J@^0s

literal 0
HcmV?d00001

diff --git a/src/tests/output/mono.jb2.pdf b/src/tests/output/mono.jb2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f60bc18f84ef7cc560cda922077cc3c9f4dc94f4
GIT binary patch
literal 1171
zcmaJ>O=uHA6yBzX!W>jk!Gn|mZAlI$yPMs%ky2_K+eBNNG*(5ave})aTaul)JCRm<
z(5r$s1+nNs?8RObrJz(i6fYh;6h!c(Ie73O2zt`gH?!NgZNXV~XXefO`QG>5PFF5H
zazu%1sOwAP{W?;xjA?Ne_4gwTBQfj}gW1$c8;sy+`lu?aNkvgpvZkg~?R?DF8mHF(
z6>=`M>Lzh<)UKA*npMJbT#qX;G%$dOWATXUw#Yf7Ogx+f>7*az^n8r~AcGkdS`Ha<
zl`W8I-C<bqjZWH@hc94_yTQ|8N5X%|4OiLf#$m)^9@hUs7$=r(4AD98D8r*DI@bC&
z-R2Az^uvfJiAU?M326j$3QEsS6laOa!ZIPxR%P7Nip51hPH0N_1aRCT*c@FL_84)>
zYzAv7uH!K%Ulm<OfHW?{5U{nrV7+BBt0=)Cq$$etngq5%bvUqam?4{aIpPk}YK=NP
z?SSuLTA^;CW|&Z_G`V7%WYjh0ecngx3WF9%oEXZEs%c_Ui}(i_VwYzaQ0Z@j6qJ<e
z?+f)J6SdD)dG3NNpNoJ>Uv$>CAaTevl(_^$mfGOE7hX^7Bq;-Ph6^RBty`SlfaT_?
zkV{hg#-EKyv)OzSPi9y5{91XiH1+)DwMTcZ-_~|F-n{ONe9P7kc07H!KK1if{>1`a
zo&Wl2?ab2hTKwXVm9|seiwp0X(b%pIr_k7c^ZUNH2f7Cr?;pA}X58&u>UehS^W(wa
zAM=OTUwxRqH&fVHhNuxmVBzuGlm2pNA?93CLNe|}yqx~PJ-VLMaS69%(3j!}J1v=#
z?A<9#LDTaQ(o=hO+G^0`+x2uEu8K`t<{EZ|xUE|w>y#+wS4r^ePEyJ+|G|TO!M;i$
U1<|2~%S28o4ep?>;}aw3FPi`>t^fc4

literal 0
HcmV?d00001