Support JPEG2000 images with transparency

Closes: #173
2023-08-05 14:59:05 +02:00 · 2023-08-05 14:59:05 +02:00 · acc25a4926
commit acc25a4926
parent f597887088
3 changed files with 183 additions and 15 deletions
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -827,8 +827,10 @@ class pdfdoc(object):
        artborder=None,
        iccp=None,
    ):
-        assert (color != Colorspace.RGBA and color != Colorspace.LA) or (
-            imgformat == ImageFormat.PNG and smaskdata is not None
+        assert (
+            color not in [Colorspace.RGBA, Colorspace.LA]
+            or (imgformat == ImageFormat.PNG and smaskdata is not None)
+            or imgformat == ImageFormat.JPEG2000
        )

        if self.engine == Engine.pikepdf:
@@ -852,7 +854,13 @@ class pdfdoc(object):
        if color == Colorspace["1"] or color == Colorspace.L or color == Colorspace.LA:
            colorspace = PdfName.DeviceGray
        elif color == Colorspace.RGB or color == Colorspace.RGBA:
-            colorspace = PdfName.DeviceRGB
+            if color == Colorspace.RGBA and imgformat == ImageFormat.JPEG2000:
+                # there is no DeviceRGBA and for JPXDecode it is okay to have
+                # no colorspace as the pdf reader is supposed to get this info
+                # from the jpeg2000 payload itself
+                colorspace = None
+            else:
+                colorspace = PdfName.DeviceRGB
        elif color == Colorspace.CMYK or color == Colorspace["CMYK;I"]:
            colorspace = PdfName.DeviceCMYK
        elif color == Colorspace.P:
@@ -923,7 +931,8 @@ class pdfdoc(object):
        image[PdfName.Filter] = ofilter
        image[PdfName.Width] = imgwidthpx
        image[PdfName.Height] = imgheightpx
-        image[PdfName.ColorSpace] = colorspace
+        if colorspace is not None:
+            image[PdfName.ColorSpace] = colorspace
        image[PdfName.BitsPerComponent] = depth

        smask = None
@@ -1292,7 +1301,7 @@ def get_imgmetadata(
    if imgformat == ImageFormat.JPEG2000 and rawdata is not None and imgdata is None:
        # this codepath gets called if the PIL installation is not able to
        # handle JPEG2000 files
-        imgwidthpx, imgheightpx, ics, hdpi, vdpi = parsejp2(rawdata)
+        imgwidthpx, imgheightpx, ics, hdpi, vdpi, channels, bpp = parsejp2(rawdata)

        if hdpi is None:
            hdpi = default_dpi
@@ -1312,7 +1321,7 @@ def get_imgmetadata(
        ics = imgdata.mode

    # GIF and PNG files with transparency are supported
-    if (imgformat == ImageFormat.PNG or imgformat == ImageFormat.GIF) and (
+    if imgformat in [ImageFormat.PNG, ImageFormat.GIF, ImageFormat.JPEG2000] and (
        ics in ["RGBA", "LA"] or "transparency" in imgdata.info
    ):
        # Must check the IHDR chunk for the bit depth, because PIL would lossily
@@ -1828,10 +1837,13 @@ def read_images(
            raise JpegColorspaceError("jpeg can't be monochrome")
        if color == Colorspace["P"]:
            raise JpegColorspaceError("jpeg can't have a color palette")
-        if color == Colorspace["RGBA"]:
+        if color == Colorspace["RGBA"] and imgformat != ImageFormat.JPEG2000:
            raise JpegColorspaceError("jpeg can't have an alpha channel")
        logger.debug("read_images() embeds a JPEG")
        cleanup()
+        depth = 8
+        if imgformat == ImageFormat.JPEG2000:
+            _, _, _, _, _, _, depth = parsejp2(rawdata)
        return [
            (
                color,
@ -1843,7 +1855,7 @@ def read_images(
                imgheightpx,
                [],
                False,
-                8,
+                depth,
                rotation,
                iccp,
            )
--- a/src/img2pdf_test.py
+++ b/src/img2pdf_test.py
@@ -361,6 +361,8 @@ def compare(im1, im2, exact, icc, cmyk):
                + [
                    "-metric",
                    "AE",
+                    "-alpha",
+                    "off",
                    im1,
                    im2,
                    "null:",
@@ -1216,6 +1218,74 @@ def jpg_2000_img(tmp_path_factory, tmp_normal_png):
    in_img.unlink()


+@pytest.fixture(scope="session")
+def jpg_2000_rgba8_img(tmp_path_factory, tmp_alpha_png):
+    in_img = tmp_path_factory.mktemp("jpg_2000_rgba8") / "in.jp2"
+    subprocess.check_call(CONVERT + [str(tmp_alpha_png), "-depth", "8", str(in_img)])
+    identify = json.loads(subprocess.check_output(CONVERT + [str(in_img), "json:"]))
+    assert len(identify) == 1
+    # somewhere between imagemagick 6.9.7.4 and 6.9.9.34, the json output was
+    # put into an array, here we cater for the older version containing just
+    # the bare dictionary
+    if "image" in identify:
+        identify = [identify]
+    assert "image" in identify[0]
+    assert identify[0]["image"].get("format") == "JP2", str(identify)
+    assert identify[0]["image"].get("mimeType") == "image/jp2", str(identify)
+    assert identify[0]["image"].get("geometry") == {
+        "width": 60,
+        "height": 60,
+        "x": 0,
+        "y": 0,
+    }, str(identify)
+    assert identify[0]["image"].get("colorspace") == "sRGB", str(identify)
+    assert identify[0]["image"].get("type") == "TrueColorAlpha", str(identify)
+    assert identify[0]["image"].get("depth") == 8, str(identify)
+    assert identify[0]["image"].get("pageGeometry") == {
+        "width": 60,
+        "height": 60,
+        "x": 0,
+        "y": 0,
+    }, str(identify)
+    assert identify[0]["image"].get("compression") == "JPEG2000", str(identify)
+    yield in_img
+    in_img.unlink()
+
+
+@pytest.fixture(scope="session")
+def jpg_2000_rgba16_img(tmp_path_factory, tmp_alpha_png):
+    in_img = tmp_path_factory.mktemp("jpg_2000_rgba16") / "in.jp2"
+    subprocess.check_call(CONVERT + [str(tmp_alpha_png), str(in_img)])
+    identify = json.loads(subprocess.check_output(CONVERT + [str(in_img), "json:"]))
+    assert len(identify) == 1
+    # somewhere between imagemagick 6.9.7.4 and 6.9.9.34, the json output was
+    # put into an array, here we cater for the older version containing just
+    # the bare dictionary
+    if "image" in identify:
+        identify = [identify]
+    assert "image" in identify[0]
+    assert identify[0]["image"].get("format") == "JP2", str(identify)
+    assert identify[0]["image"].get("mimeType") == "image/jp2", str(identify)
+    assert identify[0]["image"].get("geometry") == {
+        "width": 60,
+        "height": 60,
+        "x": 0,
+        "y": 0,
+    }, str(identify)
+    assert identify[0]["image"].get("colorspace") == "sRGB", str(identify)
+    assert identify[0]["image"].get("type") == "TrueColorAlpha", str(identify)
+    assert identify[0]["image"].get("depth") == 16, str(identify)
+    assert identify[0]["image"].get("pageGeometry") == {
+        "width": 60,
+        "height": 60,
+        "x": 0,
+        "y": 0,
+    }, str(identify)
+    assert identify[0]["image"].get("compression") == "JPEG2000", str(identify)
+    yield in_img
+    in_img.unlink()
+
+
@pytest.fixture(scope="session")
 def png_rgb8_img(tmp_normal_png):
    in_img = tmp_normal_png
@@ -4068,6 +4138,60 @@ def jpg_2000_pdf(tmp_path_factory, jpg_2000_img, request):
    out_pdf.unlink()


+@pytest.fixture(scope="session", params=["internal", "pikepdf"])
+def jpg_2000_rgba8_pdf(tmp_path_factory, jpg_2000_rgba8_img, request):
+    out_pdf = tmp_path_factory.mktemp("jpg_2000_rgba8_pdf") / "out.pdf"
+    subprocess.check_call(
+        [
+            img2pdfprog,
+            "--producer=",
+            "--nodate",
+            "--engine=" + request.param,
+            "--output=" + str(out_pdf),
+            jpg_2000_rgba8_img,
+        ]
+    )
+    with pikepdf.open(str(out_pdf)) as p:
+        assert (
+            p.pages[0].Contents.read_bytes()
+            == b"q\n45.0000 0 0 45.0000 0.0000 0.0000 cm\n/Im0 Do\nQ"
+        )
+        assert p.pages[0].Resources.XObject.Im0.BitsPerComponent == 8
+        assert not hasattr(p.pages[0].Resources.XObject.Im0, "ColorSpace")
+        assert p.pages[0].Resources.XObject.Im0.Filter == "/JPXDecode"
+        assert p.pages[0].Resources.XObject.Im0.Height == 60
+        assert p.pages[0].Resources.XObject.Im0.Width == 60
+    yield out_pdf
+    out_pdf.unlink()
+
+
+@pytest.fixture(scope="session", params=["internal", "pikepdf"])
+def jpg_2000_rgba16_pdf(tmp_path_factory, jpg_2000_rgba16_img, request):
+    out_pdf = tmp_path_factory.mktemp("jpg_2000_rgba16_pdf") / "out.pdf"
+    subprocess.check_call(
+        [
+            img2pdfprog,
+            "--producer=",
+            "--nodate",
+            "--engine=" + request.param,
+            "--output=" + str(out_pdf),
+            jpg_2000_rgba16_img,
+        ]
+    )
+    with pikepdf.open(str(out_pdf)) as p:
+        assert (
+            p.pages[0].Contents.read_bytes()
+            == b"q\n45.0000 0 0 45.0000 0.0000 0.0000 cm\n/Im0 Do\nQ"
+        )
+        assert p.pages[0].Resources.XObject.Im0.BitsPerComponent == 16
+        assert not hasattr(p.pages[0].Resources.XObject.Im0, "ColorSpace")
+        assert p.pages[0].Resources.XObject.Im0.Filter == "/JPXDecode"
+        assert p.pages[0].Resources.XObject.Im0.Height == 60
+        assert p.pages[0].Resources.XObject.Im0.Width == 60
+    yield out_pdf
+    out_pdf.unlink()
+
+
@pytest.fixture(scope="session", params=["internal", "pikepdf"])
 def png_rgb8_pdf(tmp_path_factory, png_rgb8_img, request):
    out_pdf = tmp_path_factory.mktemp("png_rgb8_pdf") / "out.pdf"
@@ -5461,6 +5585,39 @@ def test_jpg_2000(tmp_path_factory, jpg_2000_img, jpg_2000_pdf):
    compare_pdfimages_jp2(tmpdir, jpg_2000_img, jpg_2000_pdf)


+@pytest.mark.skipif(
+    sys.platform in ["win32"],
+    reason="test utilities not available on Windows and MacOS",
+)
+@pytest.mark.skipif(
+    not HAVE_JP2, reason="requires imagemagick with support for jpeg2000"
+)
+def test_jpg_2000_rgba8(tmp_path_factory, jpg_2000_rgba8_img, jpg_2000_rgba8_pdf):
+    tmpdir = tmp_path_factory.mktemp("jpg_2000_rgba8")
+    compare_ghostscript(tmpdir, jpg_2000_rgba8_img, jpg_2000_rgba8_pdf)
+    compare_poppler(tmpdir, jpg_2000_rgba8_img, jpg_2000_rgba8_pdf)
+    # compare_mupdf(tmpdir, jpg_2000_rgba8_img, jpg_2000_rgba8_pdf)
+    compare_pdfimages_jp2(tmpdir, jpg_2000_rgba8_img, jpg_2000_rgba8_pdf)
+
+
+@pytest.mark.skipif(
+    sys.platform in ["win32"],
+    reason="test utilities not available on Windows and MacOS",
+)
+@pytest.mark.skipif(
+    not HAVE_JP2, reason="requires imagemagick with support for jpeg2000"
+)
+def test_jpg_2000_rgba16(tmp_path_factory, jpg_2000_rgba16_img, jpg_2000_rgba16_pdf):
+    tmpdir = tmp_path_factory.mktemp("jpg_2000_rgba16")
+    compare_ghostscript(
+        tmpdir, jpg_2000_rgba16_img, jpg_2000_rgba16_pdf, gsdevice="tiff48nc"
+    )
+    # poppler outputs 8-bit RGB so the comparison will not be exact
+    # compare_poppler(tmpdir, jpg_2000_rgba16_img, jpg_2000_rgba16_pdf, exact=False)
+    # compare_mupdf(tmpdir, jpg_2000_rgba16_img, jpg_2000_rgba16_pdf)
+    compare_pdfimages_jp2(tmpdir, jpg_2000_rgba16_img, jpg_2000_rgba16_pdf)
+
+
@pytest.mark.skipif(
    sys.platform in ["win32"],
    reason="test utilities not available on Windows and MacOS",
--- a/src/jp2.py
+++ b/src/jp2.py
@@ -37,9 +37,8 @@ def getBox(data, byteStart, noBytes):


 def parse_ihdr(data):
-    height = struct.unpack(">I", data[0:4])[0]
-    width = struct.unpack(">I", data[4:8])[0]
-    return width, height
+    height, width, channels, bpp = struct.unpack(">IIHB", data[:11])
+    return width, height, channels, bpp+1


 def parse_colr(data):
@@ -85,13 +84,13 @@ def parse_jp2h(data):
    while byteStart < noBytes and boxLengthValue != 0:
        boxLengthValue, boxType, byteEnd, boxContents = getBox(data, byteStart, noBytes)
        if boxType == b"ihdr":
-            width, height = parse_ihdr(boxContents)
+            width, height, channels, bpp = parse_ihdr(boxContents)
        elif boxType == b"colr":
            colorspace = parse_colr(boxContents)
        elif boxType == b"res ":
            hdpi, vdpi = parse_res(boxContents)
        byteStart = byteEnd
-    return (width, height, colorspace, hdpi, vdpi)
+    return (width, height, colorspace, hdpi, vdpi, channels, bpp)


 def parsejp2(data):
@@ -102,7 +101,7 @@ def parsejp2(data):
    while byteStart < noBytes and boxLengthValue != 0:
        boxLengthValue, boxType, byteEnd, boxContents = getBox(data, byteStart, noBytes)
        if boxType == b"jp2h":
-            width, height, colorspace, hdpi, vdpi = parse_jp2h(boxContents)
+            width, height, colorspace, hdpi, vdpi, channels, bpp = parse_jp2h(boxContents)
            break
        byteStart = byteEnd
    if not width:
@@ -112,7 +111,7 @@ def parsejp2(data):
    if not colorspace:
        raise Exception("no colorspace in jp2 header")
    # retrieving the dpi is optional so we do not error out if not present
-    return (width, height, colorspace, hdpi, vdpi)
+    return (width, height, colorspace, hdpi, vdpi, channels, bpp)


 if __name__ == "__main__":