From a2e2998fb13ba81f8dc004e2a966b3e8b0ce1658 Mon Sep 17 00:00:00 2001
From: ooBJ3u <ooBJ3u4z4250OWhL7aOB@proton.me>
Date: Wed, 30 Oct 2024 00:00:00 +0000
Subject: [PATCH] Strip end-of-page and end-of-file segments from JBIG2

As noted by @phmccarty in
https://gitlab.mister-muffin.de/josch/img2pdf/pulls/184#issuecomment-2709
and subsequent comments, we were not properly stripping end-of-page and
end-of-file segments. These are valid segments in a JBIG2 file, but not
when embedded in PDF.

From the PDF spec:
> The JBIG2 file header, end-of-page segments, and end-of-file segment
> shall not be used in PDF.

We were already stripping out the JBIG2 file header, but not yet the
end-of-page and end-of-file segments.

To do this, we expand the approach that we were already taking, of
only supporting a narrow subset of JBIG2 files. We assert that the input
file ends with the exact 22 bytes of the end-of-page and end-of-file
segments (as emitted by e.g. `jbig2enc`), and then we strip them.

We validated that the issue raised by @phmccarty is indeed resolved by
running the following commands before and after applying this commit:

```sh
src/img2pdf.py src/tests/input/mono.jb2 > test.pdf
pdfimages -tiff test.pdf img
```

Before this commit, `pdfimages` reported "Syntax Error (1143): Unknown
segment type in JBIG2 stream". After this commit, the error is gone.
---
 src/img2pdf.py      | 8 ++++++--
 src/img2pdf_test.py | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/img2pdf.py b/src/img2pdf.py
index 4b540ed..e989ff7 100755
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -1874,7 +1874,11 @@ def read_images(
             imgformat = ImageFormat.JBIG2
             if rawdata[:24] != b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a\x01\x00\x00\x00\x01\x00\x00\x00\x00\x30\x00\x01\x00\x00\x00\x13":
                 raise ImageOpenError(
-                    "Unsupported JBIG2 format; only single-page generic coding is supported (e.g. from `jbig2enc`)"
+                    "Unsupported JBIG2 format; only single-page generic coding is supported (e.g. from `jbig2enc`)."
+                )
+            if rawdata[-22:] != b"\x00\x00\x00\x021\x00\x01\x00\x00\x00\x00\x00\x00\x00\x033\x00\x01\x00\x00\x00\x00":
+                raise ImageOpenError(
+                    "Unsupported JBIG2 format; we expect end-of-page and end-of-file segments at the end (e.g. from `jbig2enc`)."
                 )
         elif rawdata[:14].lower() == b"id=imagemagick":
             # image is in MIFF format
@@ -2126,7 +2130,7 @@ def read_images(
         color, ndpi, imgwidthpx, imgheightpx, rotation, iccp = get_imgmetadata(
             imgdata, imgformat, default_dpi, colorspace, rawdata, rot
         )
-        streamdata = rawdata[13:] # Strip file header
+        streamdata = rawdata[13:-22] # Strip file header and footer
         return [
             (
                 color,
diff --git a/src/img2pdf_test.py b/src/img2pdf_test.py
index 3983a42..6c3d321 100755
--- a/src/img2pdf_test.py
+++ b/src/img2pdf_test.py
@@ -7084,7 +7084,7 @@ def test_general(general_input, engine):
         if imgprops.Filter in ["/DCTDecode", "/JPXDecode"]:
             assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata
         elif imgprops.Filter == "/JBIG2Decode":
-            assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata[13:] # Strip file header
+            assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata[13:-22] # Strip file header and footer.
         elif imgprops.Filter == pikepdf.Array([pikepdf.Name.CCITTFaxDecode]):
             tiff_header = tiff_header_for_ccitt(
                 int(imgprops.Width), int(imgprops.Height), int(imgprops.Length), 4