forked from josch/img2pdf
Strip end-of-page and end-of-file segments from JBIG2
As noted by @phmccarty in josch/img2pdf#184 (comment) and subsequent comments, we were not properly stripping end-of-page and end-of-file segments. These are valid segments in a standalone JBIG2 file, but not when the stream is embedded in a PDF. From the PDF spec:

> The JBIG2 file header, end-of-page segments, and end-of-file segment
> shall not be used in PDF.

We were already stripping out the JBIG2 file header, but not yet the end-of-page and end-of-file segments. For this, I'm extending the approach that we were already taking, of supporting only a narrow subset of JBIG2 files: we check that the input file ends with such a footer (an end-of-page segment followed by an end-of-file segment), and then we strip it.

We validated that the issue raised by @phmccarty is indeed resolved by running the following commands before and after applying this commit:

```sh
src/img2pdf.py src/tests/input/mono.jb2 > test.pdf
pdfimages -tiff test.pdf img
```

Before this commit, this returned "Syntax Error (1143): Unknown segment type in JBIG2 stream". After this commit, the error is gone.
This commit is contained in:
parent
e2369eb59a
commit
244600065d
2 changed files with 7 additions and 3 deletions
|
@ -1874,7 +1874,11 @@ def read_images(
|
||||||
imgformat = ImageFormat.JBIG2
|
imgformat = ImageFormat.JBIG2
|
||||||
if rawdata[:24] != b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a\x01\x00\x00\x00\x01\x00\x00\x00\x00\x30\x00\x01\x00\x00\x00\x13":
|
if rawdata[:24] != b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a\x01\x00\x00\x00\x01\x00\x00\x00\x00\x30\x00\x01\x00\x00\x00\x13":
|
||||||
raise ImageOpenError(
|
raise ImageOpenError(
|
||||||
"Unsupported JBIG2 format; only single-page generic coding is supported (e.g. from `jbig2enc`)"
|
"Unsupported JBIG2 format; only single-page generic coding is supported (e.g. from `jbig2enc`)."
|
||||||
|
)
|
||||||
|
if rawdata[-22:] != b"\x00\x00\x00\x021\x00\x01\x00\x00\x00\x00\x00\x00\x00\x033\x00\x01\x00\x00\x00\x00":
|
||||||
|
raise ImageOpenError(
|
||||||
|
"Unsupported JBIG2 format; we expect end-of-page and end-of-file segments at the end (e.g. from `jbig2enc`)."
|
||||||
)
|
)
|
||||||
elif rawdata[:14].lower() == b"id=imagemagick":
|
elif rawdata[:14].lower() == b"id=imagemagick":
|
||||||
# image is in MIFF format
|
# image is in MIFF format
|
||||||
|
@ -2126,7 +2130,7 @@ def read_images(
|
||||||
color, ndpi, imgwidthpx, imgheightpx, rotation, iccp = get_imgmetadata(
|
color, ndpi, imgwidthpx, imgheightpx, rotation, iccp = get_imgmetadata(
|
||||||
imgdata, imgformat, default_dpi, colorspace, rawdata, rot
|
imgdata, imgformat, default_dpi, colorspace, rawdata, rot
|
||||||
)
|
)
|
||||||
streamdata = rawdata[13:] # Strip file header
|
streamdata = rawdata[13:-22] # Strip file header and footer
|
||||||
return [
|
return [
|
||||||
(
|
(
|
||||||
color,
|
color,
|
||||||
|
|
|
@ -7062,7 +7062,7 @@ def test_general(general_input, engine):
|
||||||
if imgprops.Filter in ["/DCTDecode", "/JPXDecode"]:
|
if imgprops.Filter in ["/DCTDecode", "/JPXDecode"]:
|
||||||
assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata
|
assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata
|
||||||
elif imgprops.Filter == "/JBIG2Decode":
|
elif imgprops.Filter == "/JBIG2Decode":
|
||||||
assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata[13:] # Strip file header
|
assert cur_page.Resources.XObject.Im0.read_raw_bytes() == orig_imgdata[13:-22] # Strip file header and footer.
|
||||||
elif imgprops.Filter == pikepdf.Array([pikepdf.Name.CCITTFaxDecode]):
|
elif imgprops.Filter == pikepdf.Array([pikepdf.Name.CCITTFaxDecode]):
|
||||||
tiff_header = tiff_header_for_ccitt(
|
tiff_header = tiff_header_for_ccitt(
|
||||||
int(imgprops.Width), int(imgprops.Height), int(imgprops.Length), 4
|
int(imgprops.Width), int(imgprops.Height), int(imgprops.Length), 4
|
||||||
|
|
Loading…
Reference in a new issue