# Module-level names the MIFF parser below relies on.
logger = logging.getLogger(__name__)

PageOrientation = Enum("PageOrientation", "portrait landscape")

Colorspace = Enum("Colorspace", "RGB RGBA L LA 1 CMYK CMYK;I P PA other")

ImageFormat = Enum(
    "ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF other"
)

PageMode = Enum("PageMode", "none outlines thumbs")


# Matches one "key=value" field of the textual MIFF header.
miff_re = re.compile(
    r"""
    [^\x00-\x20\x7f-\x9f]         # the field name must not start with a control char or space
    [^=]+                         # the field name can even contain spaces
    =                             # field name and value are separated by an equal sign
    (?:
        [^\x00-\x20\x7f-\x9f{}]   # either chars that are not braces and not control chars
        |{[^}]*}                  # or any kind of char surrounded by braces
    )+""",
    re.VERBOSE,
)

# Every image in a MIFF stream starts with this key/value pair (any case).
_MIFF_MAGIC = "id=imagemagick"
# Byte sequence that terminates the textual header.
_MIFF_HEADER_END = b":\x1a"
# Supported DirectClass colorspaces: header value -> (channels, Colorspace).
# theoretically RGBA and CMYKA should be supported as well
_MIFF_COLORSPACES = {
    "sRGB": (3, Colorspace.RGB),
    "CMYK": (4, Colorspace.CMYK),
    "Gray": (1, Colorspace.L),
}


def _parse_miff_header(header):
    """Return a dict with the understood key/value pairs of a MIFF header.

    Keys are lower-cased.  Values that are not understood for the keys
    ``class``, ``colorspace``, ``depth`` and ``matte`` are logged and left
    out of the result, so the caller can tell that they are missing.

    Raises ValueError if the header does not start with ``id=ImageMagick``,
    announces compressed pixel data, names an unknown profile or contains a
    malformed number.
    """
    if not header.lower().startswith(_MIFF_MAGIC):
        raise ValueError("not a MIFF image: header does not start with id=ImageMagick")
    hdata = {}
    for index, field in enumerate(miff_re.findall(header)):
        key, value = field.split("=", 1)
        key = key.lower()
        if index == 0 and (key != "id" or value.lower() != "imagemagick"):
            raise ValueError("first MIFF header field must be id=ImageMagick")
        if key == "class":
            if value in ("DirectClass", "PseudoClass"):
                hdata["class"] = value
            else:
                logger.warning("cannot understand class %s", value)
        elif key == "colorspace":
            if value in _MIFF_COLORSPACES:
                hdata["colorspace"] = value
            else:
                logger.warning("cannot understand colorspace %s", value)
        elif key == "depth":
            if value in ("8", "16", "32"):
                hdata["depth"] = int(value)
            else:
                logger.warning("cannot understand depth %s", value)
        elif key in ("colors", "columns", "rows"):
            hdata[key] = int(value)
        elif key == "matte":
            if value in ("True", "False"):
                hdata["matte"] = value == "True"
            else:
                logger.warning("cannot understand matte %s", value)
        elif key == "compression":
            # The payload is sliced as raw samples further down, so reading
            # compressed data would silently yield garbage.
            if value.lower() not in ("none", "undefined"):
                raise ValueError("MIFF compression %s is not yet supported" % value)
        elif key == "profile":
            if value not in ("icc", "exif"):
                raise ValueError("unknown MIFF profile %s" % value)
            hdata["profile"] = value
        elif key == "resolution":
            dpix, dpiy = value.split("x", 1)
            hdata["resolution"] = (float(dpix), float(dpiy))
    return hdata


def _parse_miff_frame(data, offset):
    """Parse the single MIFF image that starts at ``data[offset]``.

    Returns a tuple ``(frame, end)`` where ``frame`` is the result tuple
    described in parse_miff() and ``end`` is the offset of the first byte
    after this image's pixel data.

    Raises ValueError if the header is malformed, required keys are missing
    or the data is shorter than the header promises.
    """
    header_end = data.find(_MIFF_HEADER_END, offset)
    if header_end < 0:
        raise ValueError("MIFF header terminator not found")
    hdata = _parse_miff_header(data[offset:header_end].decode("ISO-8859-1"))
    for required in ("depth", "columns", "rows"):
        if required not in hdata:
            raise ValueError("MIFF header lacks a usable %s value" % required)

    payload = header_end + len(_MIFF_HEADER_END)
    depth = hdata["depth"]
    columns = hdata["columns"]
    rows = hdata["rows"]
    # The matte key is optional: a missing key means "no alpha channel"
    # instead of a KeyError.
    matte = hdata.get("matte", False)
    imgclass = hdata.get("class")

    if imgclass == "DirectClass":
        if hdata.get("colors", 0) != 0:
            raise ValueError("DirectClass MIFF image with a non-zero colors value")
        if "colorspace" not in hdata:
            raise ValueError("MIFF header lacks a supported colorspace")
        numchannels, colorspace = _MIFF_COLORSPACES[hdata["colorspace"]]
        if matte:
            # NOTE(review): the extra channel is counted for the data length
            # but the reported colorspace stays unchanged and no smask is
            # produced -- confirm what the consumer of the tuple expects.
            numchannels += 1
        if hdata.get("profile"):
            # there is no key encoding the length of icc or exif data
            # according to the docs, the profile-icc key is supposed to do this
            raise ValueError(
                "MIFF images with an embedded %s profile are not supported"
                % hdata["profile"]
            )
        lenpal = 0
        lenimgdata = depth // 8 * numchannels * columns * rows
        palette = []
    elif imgclass == "PseudoClass":
        if "colors" not in hdata:
            raise ValueError("PseudoClass MIFF image without a colors value")
        numchannels = 2 if matte else 1
        lenpal = 3 * hdata["colors"] * depth // 8
        lenimgdata = numchannels * rows * columns
        colorspace = Colorspace.RGB
        palette = data[payload : payload + lenpal]
    else:
        raise ValueError("MIFF header lacks a supported class")

    end = payload + lenpal + lenimgdata
    if len(data) < end:
        raise ValueError(
            "truncated MIFF image: need %d bytes after the header but only "
            "%d are available" % (lenpal + lenimgdata, len(data) - payload)
        )
    frame = (
        colorspace,
        # default_dpi is the module-wide fallback resolution
        hdata.get("resolution") or (default_dpi, default_dpi),
        ImageFormat.MIFF,
        zlib.compress(data[payload + lenpal : end]),
        None,  # smask (FIXME: allow alpha channel smask)
        columns,
        rows,
        palette,
        False,  # inverted
        depth,
        0,  # rotation
        None,  # icc profile
    )
    return frame, end


# https://imagemagick.org/script/miff.php
def parse_miff(data):
    """Parse all images contained in the MIFF byte string ``data``.

    A MIFF stream is a sequence of images, each consisting of a textual
    header terminated by ``:\\x1a`` followed by the uncompressed pixel data
    (preceded by the palette for PseudoClass images).

    Returns a list with one 12-tuple per image:
    ``(colorspace, (dpi_x, dpi_y), ImageFormat.MIFF, zlib-compressed pixel
    data, smask, width, height, palette, inverted, depth, rotation,
    icc profile)``.

    Raises ValueError if the data is not valid MIFF or uses a feature that
    is not supported (compression, embedded profiles, unknown class or
    colorspace).
    """
    results = []
    offset = 0
    # Iterate over the concatenated images instead of recursing on copied
    # tail slices; at least one image is always required.
    while True:
        frame, offset = _parse_miff_frame(data, offset)
        results.append(frame)
        if offset >= len(data):
            return results
@pytest.fixture(scope="session")
def miff_cmyk16_img(tmp_path_factory, tmp_normal_png):
    """Create a 16 bit CMYK MIFF file from tmp_normal_png and verify it.

    The file is written with the external CONVERT command and its properties
    are cross-checked through CONVERT's json output before it is handed to
    the tests.  The file is removed once the session is over.
    """
    in_img = tmp_path_factory.mktemp("miff_cmyk16") / "in.miff"
    convert_args = [
        str(tmp_normal_png),
        "-depth",
        "16",
        "-colorspace",
        "cmyk",
        str(in_img),
    ]
    subprocess.check_call(CONVERT + convert_args)
    json_output = subprocess.check_output(CONVERT + [str(in_img), "json:"])
    identify = json.loads(json_output)
    assert len(identify) == 1
    # somewhere between imagemagick 6.9.7.4 and 6.9.9.34, the json output was
    # put into an array, here we cater for the older version containing just
    # the bare dictionary
    if "image" in identify:
        identify = [identify]
    assert "image" in identify[0]
    image = identify[0]["image"]
    whole_image = {"width": 60, "height": 60, "x": 0, "y": 0}
    assert image.get("format") == "MIFF", str(identify)
    assert image.get("geometry") == whole_image, str(identify)
    assert image.get("colorspace") == "CMYK", str(identify)
    assert image.get("type") == "ColorSeparation", str(identify)
    # the key is spelled differently depending on the reported json version
    if identify[0].get("version", "0") < "1.0":
        endian = "endianess"
    else:
        endian = "endianness"
    # FIXME: should be LSB
    assert image.get(endian) in ["Undefined", "LSB"], str(identify)
    assert image.get("depth") == 16, str(identify)
    assert image.get("baseDepth") == 16, str(identify)
    assert image.get("pageGeometry") == whole_image, str(identify)
    yield in_img
    in_img.unlink()


@pytest.fixture(scope="session", params=["internal", "pikepdf"])
def miff_cmyk16_pdf(tmp_path_factory, miff_cmyk16_img, request):
    """Convert the 16 bit CMYK MIFF image to PDF with each engine.

    The resulting PDF is opened with pikepdf to check the page content
    stream and the properties of the embedded image before it is handed to
    the tests.  The file is removed once the session is over.
    """
    out_pdf = tmp_path_factory.mktemp("miff_cmyk16_pdf") / "out.pdf"
    command = [
        img2pdfprog,
        "--producer=",
        "--nodate",
        "--engine=" + request.param,
        "--output=" + str(out_pdf),
        str(miff_cmyk16_img),
    ]
    subprocess.check_call(command)
    with pikepdf.open(str(out_pdf)) as p:
        page = p.pages[0]
        expected_content = b"q\n45.0000 0 0 45.0000 0.0000 0.0000 cm\n/Im0 Do\nQ"
        assert page.Contents.read_bytes() == expected_content
        embedded = page.Resources.XObject.Im0
        assert embedded.BitsPerComponent == 16
        assert embedded.ColorSpace == "/DeviceCMYK"
        assert embedded.Filter == "/FlateDecode"
        assert embedded.Height == 60
        assert embedded.Width == 60
    yield out_pdf
    out_pdf.unlink()


###############################################################################
#                                 TEST CASES                                  #
###############################################################################


@pytest.mark.skipif(
    sys.platform == "win32",
    reason="test utilities not available on Windows and MacOS",
)
def test_miff_cmyk16(
    tmp_path_factory, miff_cmyk16_img, tiff_cmyk16_img, miff_cmyk16_pdf
):
    """Compare the PDF made from the MIFF input against the CMYK TIFF image.

    The rendering is checked with ghostscript and mupdf, both inexactly.
    """
    tmpdir = tmp_path_factory.mktemp("miff_cmyk16")
    compare_ghostscript(
        tmpdir,
        tiff_cmyk16_img,
        miff_cmyk16_pdf,
        gsdevice="tiff32nc",
        exact=False,
    )
    # not testing with poppler as it cannot write CMYK images
    compare_mupdf(
        tmpdir,
        tiff_cmyk16_img,
        miff_cmyk16_pdf,
        exact=False,
        cmyk=True,
    )
    # NOTE(review): a compare_pdfimages_tiff() check was left disabled here


# we define some variables so that the table below can be narrower
psl = (972, 504)  # --pagesize landscape
psp = (504, 972)  # --pagesize portrait