support for MIFF which allows 16 bit CMYK images

closes: #144
2022-06-26 16:48:10 +01:00 · 2022-06-26 16:48:10 +01:00 · bad6fcae39
commit bad6fcae39
parent d9b90499f3
2 changed files with 269 additions and 4 deletions
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@ -45,6 +45,7 @@ import struct
 import platform
 import hashlib
 from itertools import chain
+import re

 logger = logging.getLogger(__name__)

@ -125,7 +126,9 @@ PageOrientation = Enum("PageOrientation", "portrait landscape")

 Colorspace = Enum("Colorspace", "RGB RGBA L LA 1 CMYK CMYK;I P PA other")

-ImageFormat = Enum("ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO other")
+ImageFormat = Enum(
+    "ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF other"
+)

 PageMode = Enum("PageMode", "none outlines thumbs")

@ -1533,6 +1536,166 @@ def parse_png(rawdata):
    return pngidat, palette


+
+# Matches one "key=value" field of a MIFF header (used with re.findall() by
+# parse_miff).  The key is at least two characters long, starts with a
+# character that is neither a control character nor a space and extends up
+# to the first equal sign.  The value is a non-empty sequence of characters
+# that are neither control characters, spaces nor braces, of groups enclosed
+# in braces, or a mix of both.
+miff_re = re.compile(
+    r"""
+    [^\x00-\x20\x7f-\x9f] # the field name must not start with a control char or space
+    [^=]+                 # the field name can even contain spaces
+    =                     # field name and value are separated by an equal sign
+    (?:
+        [^\x00-\x20\x7f-\x9f{}] # either chars that are not braces and not control chars
+        |{[^}]*}                # or any kind of char surrounded by braces
+    )+""",
+    re.VERBOSE,
+)
+
+# https://imagemagick.org/script/miff.php
+def parse_miff(data):
+    """Parse the image(s) contained in the MIFF byte string data.
+
+    The textual key=value header ends at the first colon that is followed by
+    a 0x1a byte.  For PseudoClass images the colormap comes next and then,
+    for both classes, the pixel data which is taken as-is (compressed MIFF
+    is not supported).  If more bytes follow, they are parsed recursively as
+    another MIFF image.
+
+    Returns a list with one tuple per image: (colorspace, dpi, image format,
+    zlib compressed pixel data, smask, width, height, palette, inverted,
+    depth, rotation, icc profile).  The list is empty if a DirectClass image
+    announces an embedded profile because the length of the profile data
+    cannot be determined.
+
+    Raises ImageOpenError if the header is malformed, lacks a required field
+    or if there is less data than the header announces.
+    """
+    header, separator, rest = data.partition(b":\x1a")
+    if not separator:
+        raise ImageOpenError("invalid MIFF: end of header not found")
+    header = header.decode("ISO-8859-1")
+    if not header.lower().startswith("id=imagemagick"):
+        raise ImageOpenError("invalid MIFF: header must start with id=ImageMagick")
+    hdata = {}
+    for i, line in enumerate(re.findall(miff_re, header)):
+        k, v = line.split("=", 1)
+        k = k.lower()
+        if i == 0 and (k != "id" or v.lower() != "imagemagick"):
+            raise ImageOpenError("invalid MIFF: first field must be id=ImageMagick")
+        # Values we do not understand are only reported here (through the
+        # module logger, so that nothing is written to stdout).  Whether a
+        # required field is missing is checked after the loop.
+        match k:
+            case "class":
+                if v in ("DirectClass", "PseudoClass"):
+                    hdata["class"] = v
+                else:
+                    logger.warning("MIFF: cannot understand class %s", v)
+            case "colorspace":
+                # theoretically RGBA and CMYKA should be supported as well
+                # please teach me how to create such a MIFF file
+                if v in ("sRGB", "CMYK", "Gray"):
+                    hdata["colorspace"] = v
+                else:
+                    logger.warning("MIFF: cannot understand colorspace %s", v)
+            case "depth":
+                if v in ("8", "16", "32"):
+                    hdata["depth"] = int(v)
+                else:
+                    logger.warning("MIFF: cannot understand depth %s", v)
+            case "colors":
+                hdata["colors"] = int(v)
+            case "matte":
+                if v in ("True", "False"):
+                    hdata["matte"] = v == "True"
+                else:
+                    logger.warning("MIFF: cannot understand matte %s", v)
+            case "columns" | "rows":
+                hdata[k] = int(v)
+            case "compression":
+                logger.warning("MIFF: compression not yet supported")
+            case "profile":
+                if v not in ("icc", "exif"):
+                    raise ImageOpenError("invalid MIFF: unsupported profile %s" % v)
+                hdata["profile"] = v
+            case "resolution":
+                dpix, dpiy = v.split("x", 1)
+                hdata["resolution"] = (float(dpix), float(dpiy))
+
+    for required in ("class", "depth", "columns", "rows"):
+        if required not in hdata:
+            raise ImageOpenError("invalid MIFF: no usable %s in header" % required)
+    # a header without a matte field is treated as having no alpha channel
+    matte = hdata.get("matte", False)
+
+    if hdata["class"] == "DirectClass":
+        if hdata.get("colors", 0) != 0:
+            raise ImageOpenError("invalid MIFF: DirectClass image with a colormap")
+        match hdata.get("colorspace"):
+            case "sRGB":
+                numchannels, colorspace = 3, Colorspace.RGB
+            case "CMYK":
+                numchannels, colorspace = 4, Colorspace.CMYK
+            case "Gray":
+                numchannels, colorspace = 1, Colorspace.L
+            case _:
+                raise ImageOpenError("invalid MIFF: no usable colorspace in header")
+        if matte:
+            numchannels += 1
+        if hdata.get("profile"):
+            # there is no key encoding the length of icc or exif data
+            # according to the docs, the profile-icc key is supposed to do this
+            logger.error(
+                "MIFF: cannot skip embedded %s profile of unknown length",
+                hdata["profile"],
+            )
+            return []
+        lenpal = 0
+        palette = []
+        lenimgdata = (
+            hdata["depth"] // 8 * numchannels * hdata["columns"] * hdata["rows"]
+        )
+    else:
+        # PseudoClass: the colormap precedes the pixel data
+        if "colors" not in hdata:
+            raise ImageOpenError("invalid MIFF: PseudoClass image without colors")
+        colorspace = Colorspace.RGB
+        numchannels = 2 if matte else 1
+        lenpal = 3 * hdata["colors"] * hdata["depth"] // 8
+        palette = rest[:lenpal]
+        lenimgdata = numchannels * hdata["rows"] * hdata["columns"]
+
+    end = lenpal + lenimgdata
+    if len(rest) < end:
+        raise ImageOpenError(
+            "invalid MIFF: header announces %d bytes of data but only %d are left"
+            % (end, len(rest))
+        )
+    results = [
+        (
+            colorspace,
+            hdata.get("resolution") or (default_dpi, default_dpi),
+            ImageFormat.MIFF,
+            zlib.compress(rest[lenpal:end]),
+            None,  # smask (FIXME: allow alpha channel smask)
+            hdata["columns"],
+            hdata["rows"],
+            palette,
+            False,  # inverted
+            hdata["depth"],
+            0,  # rotation
+            None,  # icc profile
+        )
+    ]
+    if len(rest) > end:
+        # another image is here
+        if rest[end : end + 14].lower() != b"id=imagemagick":
+            raise ImageOpenError("invalid MIFF: unexpected data after the image")
+        results.extend(parse_miff(rest[end:]))
+    return results
+
+
 def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
    im = BytesIO(rawdata)
    im.seek(0)
@ -1541,13 +1704,19 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
        imgdata = Image.open(im)
    except IOError as e:
        # test if it is a jpeg2000 image
-        if rawdata[:12] != b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
+        if rawdata[:12] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
+            # image is jpeg2000
+            imgformat = ImageFormat.JPEG2000
+        elif rawdata[:14].lower() == b"id=imagemagick":
+            # image is in MIFF format
+            # this is useful for 16 bit CMYK because PNG cannot do CMYK and thus
+            # we need PIL but PIL cannot do 16 bit
+            imgformat = ImageFormat.MIFF
+        else:
            raise ImageOpenError(
                "cannot read input image (not jpeg2000). "
                "PIL: error reading image: %s" % e
            )
-        # image is jpeg2000
-        imgformat = ImageFormat.JPEG2000
    else:
        logger.debug("PIL format = %s", imgdata.format)
        imgformat = None
@ -1710,6 +1879,10 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
                    )
                ]

+
+    if imgformat == ImageFormat.MIFF:
+        return parse_miff(rawdata)
+
    # If our input is not JPEG or PNG, then we might have a format that
    # supports multiple frames (like TIFF or GIF), so we need a loop to
    # iterate through all frames of the image.
@ -2344,6 +2517,10 @@ def convert(*images, **kwargs):
                rawdata = f.read()
                f.close()

+        #md5 = hashlib.md5(rawdata).hexdigest()
+        #with open("./testdata/" + md5, "wb") as f:
+        #    f.write(rawdata)
+
        for (
            color,
            ndpi,
--- a/src/img2pdf_test.py
+++ b/src/img2pdf_test.py
@ -3875,6 +3875,51 @@ def tiff_ccitt_nometa2_img(tmp_path_factory, tmp_gray1_png):
    yield in_img
    in_img.unlink()

+@pytest.fixture(scope="session")
+def miff_cmyk16_img(tmp_path_factory, tmp_normal_png):
+    """Yield the path to a 16 bit CMYK MIFF version of tmp_normal_png.
+
+    The file is produced with the CONVERT command and its properties, as
+    reported by the "json:" output of that command, are verified before the
+    path is handed out.  The file is deleted at the end of the session.
+    """
+    in_img = tmp_path_factory.mktemp("miff_cmyk16") / "in.miff"
+    convert_args = [
+        str(tmp_normal_png),
+        "-depth",
+        "16",
+        "-colorspace",
+        "cmyk",
+        str(in_img),
+    ]
+    subprocess.check_call(CONVERT + convert_args)
+    json_output = subprocess.check_output(CONVERT + [str(in_img), "json:"])
+    identify = json.loads(json_output)
+    assert len(identify) == 1
+    # since some version between imagemagick 6.9.7.4 and 6.9.9.34 the json
+    # output is an array, so we wrap the bare dictionary of older versions to
+    # handle both the same way
+    if "image" in identify:
+        identify = [identify]
+    report = identify[0]
+    assert "image" in report
+    image = report["image"]
+    full_frame = {"width": 60, "height": 60, "x": 0, "y": 0}
+    assert image.get("format") == "MIFF", str(identify)
+    assert image.get("geometry") == full_frame, str(identify)
+    assert image.get("colorspace") == "CMYK", str(identify)
+    assert image.get("type") == "ColorSeparation", str(identify)
+    # reports with a version below "1.0" are looked up as "endianess"
+    endian = "endianess" if report.get("version", "0") < "1.0" else "endianness"
+    # FIXME: should be LSB
+    assert image.get(endian) in ["Undefined", "LSB"], str(identify)
+    assert image.get("depth") == 16, str(identify)
+    assert image.get("baseDepth") == 16, str(identify)
+    assert image.get("pageGeometry") == full_frame, str(identify)
+    yield in_img
+    in_img.unlink()

@pytest.fixture(scope="session")
 def png_icc_img(tmp_icc_png):
@ -5261,6 +5306,35 @@ def tiff_ccitt_nometa2_pdf(tmp_path_factory, tiff_ccitt_nometa2_img, request):
    out_pdf.unlink()


+
+@pytest.fixture(scope="session", params=["internal", "pikepdf"])
+def miff_cmyk16_pdf(tmp_path_factory, miff_cmyk16_img, request):
+    """Yield the path to the PDF that img2pdf creates from miff_cmyk16_img.
+
+    The conversion runs once per engine listed in params.  The page content
+    stream and the properties of the embedded image are verified before the
+    path is handed out.  The file is deleted at the end of the session.
+    """
+    out_pdf = tmp_path_factory.mktemp("miff_cmyk16_pdf") / "out.pdf"
+    cmdline = [
+        img2pdfprog,
+        "--producer=",
+        "--nodate",
+        "--engine=" + request.param,
+        "--output=" + str(out_pdf),
+        str(miff_cmyk16_img),
+    ]
+    subprocess.check_call(cmdline)
+    with pikepdf.open(str(out_pdf)) as p:
+        page = p.pages[0]
+        expected_content = b"q\n45.0000 0 0 45.0000 0.0000 0.0000 cm\n/Im0 Do\nQ"
+        assert page.Contents.read_bytes() == expected_content
+        image = page.Resources.XObject.Im0
+        assert image.BitsPerComponent == 16
+        assert image.ColorSpace == "/DeviceCMYK"
+        assert image.Filter == "/FlateDecode"
+        assert image.Height == 60
+        assert image.Width == 60
+    yield out_pdf
+    out_pdf.unlink()
+
+
+
 ###############################################################################
 #                                  TEST CASES                                 #
 ###############################################################################
@ -6123,6 +6197,20 @@ def test_tiff_ccitt_nometa2(
    compare_pdfimages_tiff(tmpdir, tiff_ccitt_nometa2_img, tiff_ccitt_nometa2_pdf)


+@pytest.mark.skipif(
+    sys.platform in ["win32"],
+    reason="test utilities not available on Windows and MacOS",
+)
+def test_miff_cmyk16(
+    tmp_path_factory, miff_cmyk16_img, tiff_cmyk16_img, miff_cmyk16_pdf
+):
+    """Compare miff_cmyk16_pdf against tiff_cmyk16_img (inexact comparison).
+
+    The comparison is done with the ghostscript and the mupdf helper.
+    """
+    workdir = tmp_path_factory.mktemp("miff_cmyk16")
+    compare_ghostscript(
+        workdir, tiff_cmyk16_img, miff_cmyk16_pdf, gsdevice="tiff32nc", exact=False
+    )
+    # not testing with poppler as it cannot write CMYK images
+    compare_mupdf(workdir, tiff_cmyk16_img, miff_cmyk16_pdf, exact=False, cmyk=True)
+    # NOTE: the compare_pdfimages_tiff check is disabled for this input
+
+
+
+
 # we define some variables so that the table below can be narrower
 psl = (972, 504)  # --pagesize landscape
 psp = (504, 972)  # --pagesize portrait