Compare commits

..

1 commit
main ... main

Author SHA1 Message Date
homocomputeris
8b74dbb91b Add B and JB paper sizes 2022-03-30 23:47:40 +03:00
9 changed files with 282 additions and 1276 deletions

View file

@ -1,3 +0,0 @@
Johannes Schauer Marin Rodrigues <josch@mister-muffin.de>
Johannes Schauer Marin Rodrigues <josch@mister-muffin.de> <j.schauer@email.de>
Johannes Schauer Marin Rodrigues <josch@mister-muffin.de> <josch@pyneo.org>

View file

@ -2,29 +2,6 @@
CHANGES CHANGES
======= =======
0.5.1 (2023-11-26)
------------------
- no default ICC profile location for PDF/A-1b on Windows
- workaround for PNG input without dpi units but non-square dpi aspect ratio
0.5.0 (2023-10-28)
------------------
- support MIFF for 16 bit CMYK input
- accept pathlib.Path objects as input
- don't store RGB ICC profiles from bilevel or grayscale TIFF, PNG and JPEG
- thumbnails are no longer included by default and --include-thumbnails has to
be used if you want them
- support for pikepdf (>= 6.2.0)
0.4.4 (2022-04-07)
------------------
- --viewer-page-layout support for twopageright and twopageleft
- Add B and JB paper sizes
- support for pikepdf (>= 5.0.0) and Pillow (>= 9.1.0)
0.4.3 (2021-10-24) 0.4.3 (2021-10-24)
------------------ ------------------

39
HACKING
View file

@ -27,41 +27,6 @@ Making a new release
- Build and upload to pypi: - Build and upload to pypi:
$ rm -rf dist/* $ rm dist/*
$ python3 setup.py sdist $ python3 setup.py sdist
$ twine upload dist/* $ twine upload --sign dist/*
Using debbisect to find regressions
-----------------------------------
$ debbisect --cache=./cache --depends="git,ca-certificates,python3,
ghostscript,imagemagick,mupdf-tools,poppler-utils,python3-pil,
python3-pytest,python3-numpy,python3-scipy,python3-pikepdf" \
--verbose 2023-09-16 2023-10-24 \
'chroot "$1" sh -c "
git clone https://gitlab.mister-muffin.de/josch/img2pdf.git
&& cd img2pdf
&& pytest 'src/img2pdf_test.py::test_jpg_2000_rgba8[internal]"'
Using debbisect cache
---------------------
$ mmdebstrap --variant=apt --aptopt='Acquire::Check-Valid-Until "false"' \
--include=git,ca-certificates,python3,ghostscript,imagemagick \
--include=mupdf-tools,poppler-utils,python3-pil,python3-pytest \
--include=python3-numpy,python3-scipy,python3-pikepdf \
--hook-dir=/usr/share/mmdebstrap/hooks/file-mirror-automount \
--setup-hook='mkdir -p "$1/home/josch/git/devscripts/cache/pool/"' \
--setup-hook='mount -o ro,bind /home/josch/git/devscripts/cache/pool/ "$1/home/josch/git/devscripts/cache/pool/"' \
--chrooted-customize-hook=bash
unstable /dev/null
file:///home/josch/git/devscripts/cache/archive/debian/20231022T090139Z/
Bisecting imagemagick
---------------------
$ git clean -fdx && git reset --hard
$ ./configure --prefix=$(pwd)/prefix
$ make -j$(nproc)
$ make install
$ LD_LIBRARY_PATH=$(pwd)/prefix/lib prefix/bin/compare ...

View file

@ -28,10 +28,10 @@ The following table shows how img2pdf handles different input depending on the
input file format and image color space. input file format and image color space.
| Format | Colorspace | Result | | Format | Colorspace | Result |
| ------------------------------------- | ------------------------------ | ------------- | | -------------------- | ------------------------------ | ------------- |
| JPEG | any | direct | | JPEG | any | direct |
| JPEG2000 | any | direct | | JPEG2000 | any | direct |
| PNG (non-interlaced, no transparency) | any | direct | | PNG (non-interlaced) | any | direct |
| TIFF (CCITT Group 4) | monochrome | direct | | TIFF (CCITT Group 4) | monochrome | direct |
| any | any except CMYK and monochrome | PNG Paeth | | any | any except CMYK and monochrome | PNG Paeth |
| any | monochrome | CCITT Group 4 | | any | monochrome | CCITT Group 4 |
@ -72,6 +72,11 @@ Bugs
when embedded into the PDF cannot be read by the Adobe Acrobat Reader, when embedded into the PDF cannot be read by the Adobe Acrobat Reader,
please contact me. please contact me.
- I have not yet figured out how to determine the colorspace of JPEG2000
files. Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000
files with other colorspaces, you must explicitly specify it using the
`--colorspace` option.
- An error is produced if the input image is broken. This commonly happens if - An error is produced if the input image is broken. This commonly happens if
the input image has an invalid EXIF Orientation value of zero. Even though the input image has an invalid EXIF Orientation value of zero. Even though
only nine different values from 1 to 9 are permitted, Android phones and only nine different values from 1 to 9 are permitted, Android phones and
@ -117,9 +122,10 @@ You can then test the converter using:
$ ve/bin/img2pdf -o test.pdf src/tests/test.jpg $ ve/bin/img2pdf -o test.pdf src/tests/test.jpg
If you don't want to setup Python on Windows, then head to the For Microsoft Windows users, PyInstaller based .exe files are produced by
[releases](/josch/img2pdf/releases) section and download the latest appveyor. If you don't want to install Python before using img2pdf you can head
`img2pdf.exe`. to appveyor and click on "Artifacts" to download the latest version:
https://ci.appveyor.com/project/josch/img2pdf
GUI GUI
--- ---
@ -146,10 +152,6 @@ The package can also be used as a library:
with open("name.pdf","wb") as f1, open("test.jpg") as f2: with open("name.pdf","wb") as f1, open("test.jpg") as f2:
f1.write(img2pdf.convert(f2)) f1.write(img2pdf.convert(f2))
# opening using pathlib
with open("name.pdf","wb") as f:
f.write(img2pdf.convert(pathlib.Path('test.jpg')))
# using in-memory image data # using in-memory image data
with open("name.pdf","wb") as f: with open("name.pdf","wb") as f:
f.write(img2pdf.convert("\x89PNG...") f.write(img2pdf.convert("\x89PNG...")
@ -192,11 +194,6 @@ The package can also be used as a library:
with open("name.pdf","wb") as f: with open("name.pdf","wb") as f:
f.write(img2pdf.convert(glob.glob("/path/to/*.jpg"))) f.write(img2pdf.convert(glob.glob("/path/to/*.jpg")))
# convert all files matching a glob using pathlib.Path
from pathlib import Path
with open("name.pdf","wb") as f:
f.write(img2pdf.convert(*Path("/path").glob("**/*.jpg")))
# ignore invalid rotation values in the input images # ignore invalid rotation values in the input images
with open("name.pdf","wb") as f: with open("name.pdf","wb") as f:
f.write(img2pdf.convert('test.jpg'), rotation=img2pdf.Rotation.ifvalid) f.write(img2pdf.convert('test.jpg'), rotation=img2pdf.Rotation.ifvalid)
@ -308,14 +305,3 @@ Tesseract might not do a lossless conversion. For example it converts CMYK
input to RGB and removes the alpha channel from images with transparency. For input to RGB and removes the alpha channel from images with transparency. For
multipage TIFF or animated GIF, it will only convert the first frame. multipage TIFF or animated GIF, it will only convert the first frame.
Comparison to econvert from ExactImage
--------------------------------------
Like pdflatex and podofoimg2pdf, econvert is able to embed JPEG images into PDF
directly without re-encoding but when given other file formats, it stores them
just using flate compression, which unnecessarily increases the filesize.
Furthermore, it throws an error with CMYK TIF input. It also doesn't store CMYK
jpeg files as CMYK but converts them to RGB, so it's not lossless. When trying
to feed it 16bit files, it errors out with Unhandled bps/spp combination. It
also seems to choose JPEG encoding when using it on some file types (like
palette images) making it again not lossless for that input as well.

View file

@ -26,8 +26,7 @@ build: off
after_test: after_test:
- "%PYTHON%\\python.exe setup.py bdist_wheel" - "%PYTHON%\\python.exe setup.py bdist_wheel"
- "%PYTHON%\\python.exe -m PyInstaller --clean --onefile --console --nowindowed --name img2pdf src/img2pdf.py" - "%PYTHON%\\python.exe -m PyInstaller --clean --onefile --noconsole src/img2pdf.py"
#- "%PYTHON%\\python.exe -m PyInstaller --clean --onefile --noconsole --windowed --name img2pdf_windowed src/img2pdf.py"
artifacts: artifacts:
- path: dist\* - path: dist\*

View file

@ -1,7 +1,7 @@
import sys import sys
from setuptools import setup from setuptools import setup
VERSION = "0.5.1" VERSION = "0.4.3"
INSTALL_REQUIRES = ( INSTALL_REQUIRES = (
"Pillow", "Pillow",

View file

@ -22,22 +22,12 @@ import sys
import os import os
import zlib import zlib
import argparse import argparse
from PIL import Image, TiffImagePlugin, GifImagePlugin, ImageCms from PIL import Image, TiffImagePlugin
if hasattr(GifImagePlugin, "LoadingStrategy"):
# Pillow 9.0.0 started emitting all frames but the first as RGB instead of
# P to make sure that more than 256 colors can be represented. But palette
# images compress far better than RGB images in PDF so we instruct Pillow
# to only emit RGB frames if the palette differs and return P otherwise.
# This works since Pillow 9.1.0.
GifImagePlugin.LOADING_STRATEGY = (
GifImagePlugin.LoadingStrategy.RGB_AFTER_DIFFERENT_PALETTE_ONLY
)
# TiffImagePlugin.DEBUG = True # TiffImagePlugin.DEBUG = True
from PIL.ExifTags import TAGS from PIL.ExifTags import TAGS
from datetime import datetime, timezone from datetime import datetime
import jp2 from jp2 import parsejp2
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
import logging import logging
@ -45,8 +35,6 @@ import struct
import platform import platform
import hashlib import hashlib
from itertools import chain from itertools import chain
import re
import io
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -62,7 +50,7 @@ try:
except ImportError: except ImportError:
have_pikepdf = False have_pikepdf = False
__version__ = "0.5.1" __version__ = "0.4.3"
default_dpi = 96.0 default_dpi = 96.0
papersizes = { papersizes = {
"letter": "8.5inx11in", "letter": "8.5inx11in",
@ -127,16 +115,11 @@ PageOrientation = Enum("PageOrientation", "portrait landscape")
Colorspace = Enum("Colorspace", "RGB RGBA L LA 1 CMYK CMYK;I P PA other") Colorspace = Enum("Colorspace", "RGB RGBA L LA 1 CMYK CMYK;I P PA other")
ImageFormat = Enum( ImageFormat = Enum("ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO other")
"ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF other"
)
PageMode = Enum("PageMode", "none outlines thumbs") PageMode = Enum("PageMode", "none outlines thumbs")
PageLayout = Enum( PageLayout = Enum("PageLayout", "single onecolumn twocolumnright twocolumnleft twopageright twopageleft")
"PageLayout",
"single onecolumn twocolumnright twocolumnleft twopageright twopageleft",
)
Magnification = Enum("Magnification", "fit fith fitbh") Magnification = Enum("Magnification", "fit fith fitbh")
@ -434,28 +417,6 @@ class ExifOrientationError(Exception):
pass pass
class temp_attr:
    """Context manager that temporarily overrides one attribute of an object.

    On entry the attribute ``field`` of ``obj`` is set to ``value``. On exit
    the previous value is put back; if the attribute did not exist before
    entering, it is deleted again instead.

    ``__enter__`` returns nothing, so ``with temp_attr(...) as x`` binds
    ``x`` to ``None``. Exceptions raised inside the ``with`` body are not
    suppressed.
    """

    def __init__(self, obj, field, value):
        self.obj, self.field, self.value = obj, field, value

    def __enter__(self):
        target, name = self.obj, self.field
        # remember whether there is anything to restore later
        self.exists = hasattr(target, name)
        if self.exists:
            self.old_value = getattr(target, name)
        logger.debug(f"setting {self.obj}.{self.field} = {self.value}")
        setattr(target, name, self.value)

    def __exit__(self, exctype, excinst, exctb):
        if not self.exists:
            # the attribute was created by __enter__, so remove it again
            delattr(self.obj, self.field)
            return
        setattr(self.obj, self.field, self.old_value)
# without pdfrw this function is a no-op # without pdfrw this function is a no-op
def my_convert_load(string): def my_convert_load(string):
return string return string
@ -722,7 +683,7 @@ class pdfdoc(object):
self.writer.docinfo = PdfDict(indirect=True) self.writer.docinfo = PdfDict(indirect=True)
def datetime_to_pdfdate(dt): def datetime_to_pdfdate(dt):
return dt.astimezone(tz=timezone.utc).strftime("%Y%m%d%H%M%SZ") return dt.strftime("%Y%m%d%H%M%SZ")
for k in ["Title", "Author", "Creator", "Producer", "Subject"]: for k in ["Title", "Author", "Creator", "Producer", "Subject"]:
v = locals()[k.lower()] v = locals()[k.lower()]
@ -732,7 +693,7 @@ class pdfdoc(object):
v = PdfString.encode(v) v = PdfString.encode(v)
self.writer.docinfo[getattr(PdfName, k)] = v self.writer.docinfo[getattr(PdfName, k)] = v
now = datetime.now().astimezone() now = datetime.now()
for k in ["CreationDate", "ModDate"]: for k in ["CreationDate", "ModDate"]:
v = locals()[k.lower()] v = locals()[k.lower()]
if v is None and nodate: if v is None and nodate:
@ -752,7 +713,7 @@ class pdfdoc(object):
) )
def datetime_to_xmpdate(dt): def datetime_to_xmpdate(dt):
return dt.astimezone(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
self.xmp = b"""<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?> self.xmp = b"""<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'> <x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>
@ -827,10 +788,8 @@ class pdfdoc(object):
artborder=None, artborder=None,
iccp=None, iccp=None,
): ):
assert ( assert (color != Colorspace.RGBA and color != Colorspace.LA) or (
color not in [Colorspace.RGBA, Colorspace.LA] imgformat == ImageFormat.PNG and smaskdata is not None
or (imgformat == ImageFormat.PNG and smaskdata is not None)
or imgformat == ImageFormat.JPEG2000
) )
if self.engine == Engine.pikepdf: if self.engine == Engine.pikepdf:
@ -854,12 +813,6 @@ class pdfdoc(object):
if color == Colorspace["1"] or color == Colorspace.L or color == Colorspace.LA: if color == Colorspace["1"] or color == Colorspace.L or color == Colorspace.LA:
colorspace = PdfName.DeviceGray colorspace = PdfName.DeviceGray
elif color == Colorspace.RGB or color == Colorspace.RGBA: elif color == Colorspace.RGB or color == Colorspace.RGBA:
if color == Colorspace.RGBA and imgformat == ImageFormat.JPEG2000:
# there is no DeviceRGBA and for JPXDecode it is okay to have
# no colorspace as the pdf reader is supposed to get this info
# from the jpeg2000 payload itself
colorspace = None
else:
colorspace = PdfName.DeviceRGB colorspace = PdfName.DeviceRGB
elif color == Colorspace.CMYK or color == Colorspace["CMYK;I"]: elif color == Colorspace.CMYK or color == Colorspace["CMYK;I"]:
colorspace = PdfName.DeviceCMYK colorspace = PdfName.DeviceCMYK
@ -931,7 +884,6 @@ class pdfdoc(object):
image[PdfName.Filter] = ofilter image[PdfName.Filter] = ofilter
image[PdfName.Width] = imgwidthpx image[PdfName.Width] = imgwidthpx
image[PdfName.Height] = imgheightpx image[PdfName.Height] = imgheightpx
if colorspace is not None:
image[PdfName.ColorSpace] = colorspace image[PdfName.ColorSpace] = colorspace
image[PdfName.BitsPerComponent] = depth image[PdfName.BitsPerComponent] = depth
@ -1182,16 +1134,8 @@ class pdfdoc(object):
[initial_page, PdfName.XYZ, NullObject, NullObject, 0] [initial_page, PdfName.XYZ, NullObject, NullObject, 0]
) )
# The /OpenAction array must contain the page as an indirect object. # the /OpenAction array must contain the page as an indirect object
# This changed some time after 4.2.0 and on or before 5.0.0 and current
# versions require to use .obj or otherwise we get:
# TypeError: Can't convert ObjectHelper (or subclass) to Object
# implicitly. Use .obj to get access the underlying object.
# See https://github.com/pikepdf/pikepdf/issues/313 for details.
if self.engine == Engine.pikepdf: if self.engine == Engine.pikepdf:
if isinstance(initial_page, pikepdf.Page):
initial_page = self.writer.make_indirect(initial_page.obj)
else:
initial_page = self.writer.make_indirect(initial_page) initial_page = self.writer.make_indirect(initial_page)
if self.magnification == Magnification.fit: if self.magnification == Magnification.fit:
@ -1269,11 +1213,8 @@ class pdfdoc(object):
# now write out the PDF # now write out the PDF
if self.engine == Engine.pikepdf: if self.engine == Engine.pikepdf:
kwargs = {}
if pikepdf.__version__ >= "6.2.0":
kwargs["deterministic_id"] = True
self.writer.save( self.writer.save(
outputstream, min_version=self.output_version, linearize=True, **kwargs outputstream, min_version=self.output_version, linearize=True
) )
elif self.engine == Engine.pdfrw: elif self.engine == Engine.pdfrw:
self.writer.trailer.Info = self.writer.docinfo self.writer.trailer.Info = self.writer.docinfo
@ -1301,7 +1242,7 @@ def get_imgmetadata(
if imgformat == ImageFormat.JPEG2000 and rawdata is not None and imgdata is None: if imgformat == ImageFormat.JPEG2000 and rawdata is not None and imgdata is None:
# this codepath gets called if the PIL installation is not able to # this codepath gets called if the PIL installation is not able to
# handle JPEG2000 files # handle JPEG2000 files
imgwidthpx, imgheightpx, ics, hdpi, vdpi, channels, bpp = jp2.parse(rawdata) imgwidthpx, imgheightpx, ics, hdpi, vdpi = parsejp2(rawdata)
if hdpi is None: if hdpi is None:
hdpi = default_dpi hdpi = default_dpi
@ -1311,19 +1252,7 @@ def get_imgmetadata(
else: else:
imgwidthpx, imgheightpx = imgdata.size imgwidthpx, imgheightpx = imgdata.size
ndpi = imgdata.info.get("dpi") ndpi = imgdata.info.get("dpi", (default_dpi, default_dpi))
if ndpi is None:
# the PNG plugin of PIL adds the undocumented "aspect" field instead of
# the "dpi" field if the PNG pHYs chunk unit is not set to meters
if imgformat == ImageFormat.PNG and imgdata.info.get("aspect") is not None:
aspect = imgdata.info["aspect"]
# make sure not to go below the default dpi
if aspect[0] > aspect[1]:
ndpi = (default_dpi * aspect[0] / aspect[1], default_dpi)
else:
ndpi = (default_dpi, default_dpi * aspect[1] / aspect[0])
else:
ndpi = (default_dpi, default_dpi)
# In python3, the returned dpi value for some tiff images will # In python3, the returned dpi value for some tiff images will
# not be an integer but a float. To make the behaviour of # not be an integer but a float. To make the behaviour of
# img2pdf the same between python2 and python3, we convert that # img2pdf the same between python2 and python3, we convert that
@ -1333,7 +1262,7 @@ def get_imgmetadata(
ics = imgdata.mode ics = imgdata.mode
# GIF and PNG files with transparency are supported # GIF and PNG files with transparency are supported
if imgformat in [ImageFormat.PNG, ImageFormat.GIF, ImageFormat.JPEG2000] and ( if (imgformat == ImageFormat.PNG or imgformat == ImageFormat.GIF) and (
ics in ["RGBA", "LA"] or "transparency" in imgdata.info ics in ["RGBA", "LA"] or "transparency" in imgdata.info
): ):
# Must check the IHDR chunk for the bit depth, because PIL would lossily # Must check the IHDR chunk for the bit depth, because PIL would lossily
@ -1343,10 +1272,6 @@ def get_imgmetadata(
if depth > 8: if depth > 8:
logger.warning("Image with transparency and a bit depth of %d." % depth) logger.warning("Image with transparency and a bit depth of %d." % depth)
logger.warning("This is unsupported due to PIL limitations.") logger.warning("This is unsupported due to PIL limitations.")
logger.warning(
"If you accept a lossy conversion, you can manually convert "
"your images to 8 bit using `convert -depth 8` from imagemagick"
)
raise AlphaChannelError( raise AlphaChannelError(
"Refusing to work with multiple >8bit channels." "Refusing to work with multiple >8bit channels."
) )
@ -1457,53 +1382,6 @@ def get_imgmetadata(
iccp = None iccp = None
if "icc_profile" in imgdata.info: if "icc_profile" in imgdata.info:
iccp = imgdata.info.get("icc_profile") iccp = imgdata.info.get("icc_profile")
# GIMP saves bilevel TIFF images and palette PNG images with only black and
# white in the palette with an RGB ICC profile which is useless
# https://gitlab.gnome.org/GNOME/gimp/-/issues/3438
# and produces an error in Adobe Acrobat, so we ignore it with a warning.
# imagemagick also used to (wrongly) include an RGB ICC profile for bilevel
# images: https://github.com/ImageMagick/ImageMagick/issues/2070
if iccp is not None and (
(color == Colorspace["1"] and imgformat == ImageFormat.TIFF)
or (
imgformat == ImageFormat.PNG
and color == Colorspace.P
and rawdata is not None
and parse_png(rawdata)[1]
in [b"\x00\x00\x00\xff\xff\xff", b"\xff\xff\xff\x00\x00\x00"]
)
):
with io.BytesIO(iccp) as f:
prf = ImageCms.ImageCmsProfile(f)
if (
prf.profile.model == "sRGB"
and prf.profile.manufacturer == "GIMP"
and prf.profile.profile_description == "GIMP built-in sRGB"
):
if imgformat == ImageFormat.TIFF:
logger.warning(
"Ignoring RGB ICC profile in bilevel TIFF produced by GIMP."
)
elif imgformat == ImageFormat.PNG:
logger.warning(
"Ignoring RGB ICC profile in 2-color palette PNG produced by GIMP."
)
logger.warning("https://gitlab.gnome.org/GNOME/gimp/-/issues/3438")
iccp = None
# SmartAlbums old version (found 2.2.6) exports JPG with only 1 component
# with an RGB ICC profile which is useless.
# This produces an error in Adobe Acrobat, so we ignore it with a warning.
# Update: Found another case, the JPG is created by Adobe PhotoShop, so we
# don't check software anymore.
if iccp is not None and (
(color == Colorspace["L"] and imgformat == ImageFormat.JPEG)
):
with io.BytesIO(iccp) as f:
prf = ImageCms.ImageCmsProfile(f)
if prf.profile.xcolor_space not in ("GRAY"):
logger.warning("Ignoring non-GRAY ICC profile in Grayscale JPG")
iccp = None
logger.debug("width x height = %dpx x %dpx", imgwidthpx, imgheightpx) logger.debug("width x height = %dpx x %dpx", imgwidthpx, imgheightpx)
@ -1560,29 +1438,27 @@ def transcode_monochrome(imgdata):
# into putting everything into a single strip. Thanks to Andrew Murray for # into putting everything into a single strip. Thanks to Andrew Murray for
# the hack. # the hack.
# #
# Since version 8.4.0 Pillow allows us to modify the strip size explicitly # This can be dropped once this gets merged:
tmp_strip_size = (imgdata.size[0] + 7) // 8 * imgdata.size[1] # https://github.com/python-pillow/Pillow/pull/5744
if hasattr(TiffImagePlugin, "STRIP_SIZE"):
# we are using Pillow 8.4.0 or later
with temp_attr(TiffImagePlugin, "STRIP_SIZE", tmp_strip_size):
im.save(newimgio, format="TIFF", compression="group4")
else:
# only needed for Pillow 8.3.x but works for versions before that as
# well
pillow__getitem__ = TiffImagePlugin.ImageFileDirectory_v2.__getitem__ pillow__getitem__ = TiffImagePlugin.ImageFileDirectory_v2.__getitem__
def __getitem__(self, tag): def __getitem__(self, tag):
overrides = { overrides = {
TiffImagePlugin.ROWSPERSTRIP: imgdata.size[1], TiffImagePlugin.ROWSPERSTRIP: imgdata.size[1],
TiffImagePlugin.STRIPBYTECOUNTS: [tmp_strip_size], TiffImagePlugin.STRIPBYTECOUNTS: [
(imgdata.size[0] + 7) // 8 * imgdata.size[1]
],
TiffImagePlugin.STRIPOFFSETS: [0], TiffImagePlugin.STRIPOFFSETS: [0],
} }
return overrides.get(tag, pillow__getitem__(self, tag)) return overrides.get(tag, pillow__getitem__(self, tag))
with temp_attr( # use try/finally to make sure that __getitem__ is reset even if save()
TiffImagePlugin.ImageFileDirectory_v2, "__getitem__", __getitem__ # raises an exception
): try:
TiffImagePlugin.ImageFileDirectory_v2.__getitem__ = __getitem__
im.save(newimgio, format="TIFF", compression="group4") im.save(newimgio, format="TIFF", compression="group4")
finally:
TiffImagePlugin.ImageFileDirectory_v2.__getitem__ = pillow__getitem__
# Open new image in memory # Open new image in memory
newimgio.seek(0) newimgio.seek(0)
@ -1612,204 +1488,7 @@ def parse_png(rawdata):
return pngidat, palette return pngidat, palette
miff_re = re.compile( def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
r"""
[^\x00-\x20\x7f-\x9f] # the field name must not start with a control char or space
[^=]+ # the field name can even contain spaces
= # field name and value are separated by an equal sign
(?:
[^\x00-\x20\x7f-\x9f{}] # either chars that are not braces and not control chars
|{[^}]*} # or any kind of char surrounded by braces
)+""",
re.VERBOSE,
)
# https://imagemagick.org/script/miff.php
# turn off black formatting until python 3.10 is available on more platforms
# and we can use match/case
# fmt: off
def parse_miff(data):
    """Parse ImageMagick MIFF data and return one result tuple per image.

    The input is split at the first ``b":\\x1a"`` into a textual header
    (decoded as ISO-8859-1) and the binary remainder. Header ``key=value``
    fields are extracted with ``miff_re`` and the recognised ones are
    collected in a dict. The pixel data is then taken from the start of the
    remainder. If bytes are left over after the pixel data, they must begin
    with ``id=imagemagick`` and are parsed recursively as a further image.

    Each appended tuple has the fields, in order: colorspace, resolution
    (falling back to ``(default_dpi, default_dpi)``), image format, image
    data, smask (always None), columns, rows, palette, inverted (always
    False), depth, rotation (always 0), icc profile (always None).

    For ``DirectClass`` images, 8 bit sRGB data is re-encoded through
    ``to_png_data`` and reported as PNG; everything else is zlib-compressed
    and reported as MIFF. For ``PseudoClass`` images the leading palette
    bytes are returned as the palette and the remaining pixel bytes are
    zlib-compressed.

    Unrecognised values and unsupported features are only reported with
    ``print``. Malformed input trips one of the ``assert`` statements, and
    ``hdata["class"]`` / ``hdata["colorspace"]`` are indexed without a prior
    presence check.

    NOTE(review): the block nesting here was reconstructed from a listing
    without indentation -- confirm against the repository before relying
    on it.
    """
    results = []
    # header and binary payload are separated by a colon followed by 0x1a
    header, rest = data.split(b":\x1a", 1)
    header = header.decode("ISO-8859-1")
    assert header.lower().startswith("id=imagemagick")
    hdata = {}
    for i, line in enumerate(re.findall(miff_re, header)):
        if not line:
            continue
        k, v = line.split("=", 1)
        # the very first field has to be the id=ImageMagick signature
        if i == 0:
            assert k.lower() == "id"
            assert v.lower() == "imagemagick"
        #match k.lower():
        # case "class":
        if k.lower() == "class":
            #match v:
            # case "DirectClass" | "PseudoClass":
            if v in ["DirectClass", "PseudoClass"]:
                hdata["class"] = v
            # case _:
            else:
                print("cannot understand class", v)
        # case "colorspace":
        elif k.lower() == "colorspace":
            # theoretically RGBA and CMYKA should be supported as well
            # please teach me how to create such a MIFF file
            #match v:
            # case "sRGB" | "CMYK" | "Gray":
            if v in ["sRGB", "CMYK", "Gray"]:
                hdata["colorspace"] = v
            # case _:
            else:
                print("cannot understand colorspace", v)
        # case "depth":
        elif k.lower() == "depth":
            #match v:
            # case "8" | "16" | "32":
            if v in ["8", "16", "32"]:
                hdata["depth"] = int(v)
            # case _:
            else:
                print("cannot understand depth", v)
        # case "colors":
        elif k.lower() == "colors":
            hdata["colors"] = int(v)
        # case "matte":
        elif k.lower() == "matte":
            #match v:
            # case "True":
            if v == "True":
                hdata["matte"] = True
            # case "False":
            elif v == "False":
                hdata["matte"] = False
            # case _:
            else:
                print("cannot understand matte", v)
        # case "columns" | "rows":
        elif k.lower() in ["columns", "rows"]:
            hdata[k.lower()] = int(v)
        # case "compression":
        elif k.lower() == "compression":
            print("compression not yet supported")
        # case "profile":
        elif k.lower() == "profile":
            assert v in ["icc", "exif"]
            hdata["profile"] = v
        # case "resolution":
        elif k.lower() == "resolution":
            dpix, dpiy = v.split("x", 1)
            hdata["resolution"] = (float(dpix), float(dpiy))
    # without these three fields the payload length cannot be computed
    assert "depth" in hdata
    assert "columns" in hdata
    assert "rows" in hdata
    #match hdata["class"]:
    # case "DirectClass":
    if hdata["class"] == "DirectClass":
        if "colors" in hdata:
            assert hdata["colors"] == 0
        #match hdata["colorspace"]:
        # case "sRGB":
        if hdata["colorspace"] == "sRGB":
            numchannels = 3
            colorspace = Colorspace.RGB
        # case "CMYK":
        elif hdata["colorspace"] == "CMYK":
            numchannels = 4
            colorspace = Colorspace.CMYK
        # case "Gray":
        elif hdata["colorspace"] == "Gray":
            numchannels = 1
            colorspace = Colorspace.L
        # a matte channel adds one more sample per pixel
        if hdata.get("matte"):
            numchannels += 1
        if hdata.get("profile"):
            # there is no key encoding the length of icc or exif data
            # according to the docs, the profile-icc key is supposed to do this
            print("FAIL: exif")
        else:
            # bytes per sample * samples per pixel * number of pixels
            lenimgdata = (
                hdata["depth"] // 8 * numchannels * hdata["columns"] * hdata["rows"]
            )
            assert len(rest) >= lenimgdata, (
                len(rest),
                hdata["depth"],
                numchannels,
                hdata["columns"],
                hdata["rows"],
                lenimgdata,
            )
            if colorspace == Colorspace.RGB and hdata["depth"] == 8:
                # 8 bit RGB is handed to PIL and re-encoded via to_png_data
                newimg = Image.frombytes("RGB", (hdata["columns"], hdata["rows"]), rest[:lenimgdata])
                imgdata, palette, depth = to_png_data(newimg)
                assert palette == b""
                assert depth == hdata["depth"]
                imgfmt = ImageFormat.PNG
            else:
                # everything else keeps its raw samples, zlib-compressed
                imgdata = zlib.compress(rest[:lenimgdata])
                imgfmt = ImageFormat.MIFF
            results.append(
                (
                    colorspace,
                    hdata.get("resolution") or (default_dpi, default_dpi),
                    imgfmt,
                    imgdata,
                    None,  # smask
                    hdata["columns"],
                    hdata["rows"],
                    [],  # palette
                    False,  # inverted
                    hdata["depth"],
                    0,  # rotation
                    None,  # icc profile
                )
            )
            if len(rest) > lenimgdata:
                # another image is here
                assert rest[lenimgdata:][:14].lower() == b"id=imagemagick"
                results.extend(parse_miff(rest[lenimgdata:]))
    # case "PseudoClass":
    elif hdata["class"] == "PseudoClass":
        assert "colors" in hdata
        if hdata.get("matte"):
            numchannels = 2
        else:
            numchannels = 1
        # the palette (three components per colour) precedes the pixel data
        lenpal = 3 * hdata["colors"] * hdata["depth"] // 8
        lenimgdata = numchannels * hdata["rows"] * hdata["columns"]
        assert len(rest) >= lenpal + lenimgdata, (len(rest), lenpal, lenimgdata)
        results.append(
            (
                Colorspace.RGB,
                hdata.get("resolution") or (default_dpi, default_dpi),
                ImageFormat.MIFF,
                zlib.compress(rest[lenpal : lenpal + lenimgdata]),
                None,  # FIXME: allow alpha channel smask
                hdata["columns"],
                hdata["rows"],
                rest[:lenpal],  # palette
                False,  # inverted
                hdata["depth"],
                0,  # rotation
                None,  # icc profile
            )
        )
        if len(rest) > lenpal + lenimgdata:
            # another image is here
            assert rest[lenpal + lenimgdata :][:14].lower() == b"id=imagemagick", (
                len(rest),
                lenpal,
                lenimgdata,
            )
            results.extend(parse_miff(rest[lenpal + lenimgdata :]))
    return results
# fmt: on
def read_images(
rawdata, colorspace, first_frame_only=False, rot=None, include_thumbnails=False
):
im = BytesIO(rawdata) im = BytesIO(rawdata)
im.seek(0) im.seek(0)
imgdata = None imgdata = None
@ -1817,19 +1496,13 @@ def read_images(
imgdata = Image.open(im) imgdata = Image.open(im)
except IOError as e: except IOError as e:
# test if it is a jpeg2000 image # test if it is a jpeg2000 image
if rawdata[:12] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A": if rawdata[:12] != b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
# image is jpeg2000
imgformat = ImageFormat.JPEG2000
if rawdata[:14].lower() == b"id=imagemagick":
# image is in MIFF format
# this is useful for 16 bit CMYK because PNG cannot do CMYK and thus
# we need PIL but PIL cannot do 16 bit
imgformat = ImageFormat.MIFF
else:
raise ImageOpenError( raise ImageOpenError(
"cannot read input image (not jpeg2000). " "cannot read input image (not jpeg2000). "
"PIL: error reading image: %s" % e "PIL: error reading image: %s" % e
) )
# image is jpeg2000
imgformat = ImageFormat.JPEG2000
else: else:
logger.debug("PIL format = %s", imgdata.format) logger.debug("PIL format = %s", imgdata.format)
imgformat = None imgformat = None
@ -1863,13 +1536,10 @@ def read_images(
raise JpegColorspaceError("jpeg can't be monochrome") raise JpegColorspaceError("jpeg can't be monochrome")
if color == Colorspace["P"]: if color == Colorspace["P"]:
raise JpegColorspaceError("jpeg can't have a color palette") raise JpegColorspaceError("jpeg can't have a color palette")
if color == Colorspace["RGBA"] and imgformat != ImageFormat.JPEG2000: if color == Colorspace["RGBA"]:
raise JpegColorspaceError("jpeg can't have an alpha channel") raise JpegColorspaceError("jpeg can't have an alpha channel")
logger.debug("read_images() embeds a JPEG") logger.debug("read_images() embeds a JPEG")
cleanup() cleanup()
depth = 8
if imgformat == ImageFormat.JPEG2000:
*_, depth = jp2.parse(rawdata)
return [ return [
( (
color, color,
@ -1881,7 +1551,7 @@ def read_images(
imgheightpx, imgheightpx,
[], [],
False, False,
depth, 8,
rotation, rotation,
iccp, iccp,
) )
@ -1898,77 +1568,6 @@ def read_images(
if imgformat == ImageFormat.MPO: if imgformat == ImageFormat.MPO:
result = [] result = []
img_page_count = 0 img_page_count = 0
assert len(imgdata._MpoImageFile__mpoffsets) == len(imgdata.mpinfo[0xB002])
num_frames = len(imgdata.mpinfo[0xB002])
# An MPO file can be a main image together with one or more thumbnails
# if that is the case, then we only include all frames if the
# --include-thumbnails option is given. If it is not, such an MPO file
# will be embedded as is, so including its thumbnails but showing up
# as a single image page in the resulting PDF.
num_main_frames = 0
num_thumbnail_frames = 0
for i, mpent in enumerate(imgdata.mpinfo[0xB002]):
# check only the first frame for being the main image
if (
i == 0
and mpent["Attribute"]["DependentParentImageFlag"]
and not mpent["Attribute"]["DependentChildImageFlag"]
and mpent["Attribute"]["RepresentativeImageFlag"]
and mpent["Attribute"]["MPType"] == "Baseline MP Primary Image"
):
num_main_frames += 1
elif (
not mpent["Attribute"]["DependentParentImageFlag"]
and mpent["Attribute"]["DependentChildImageFlag"]
and not mpent["Attribute"]["RepresentativeImageFlag"]
and mpent["Attribute"]["MPType"]
in [
"Large Thumbnail (VGA Equivalent)",
"Large Thumbnail (Full HD Equivalent)",
]
):
num_thumbnail_frames += 1
logger.debug(f"number of frames: {num_frames}")
logger.debug(f"number of main frames: {num_main_frames}")
logger.debug(f"number of thumbnail frames: {num_thumbnail_frames}")
# this MPO file is a main image plus zero or more thumbnails
# embed as-is unless the --include-thumbnails option was given
if num_frames == 1 or (
not include_thumbnails
and num_main_frames == 1
and num_thumbnail_frames + 1 == num_frames
):
color, ndpi, imgwidthpx, imgheightpx, rotation, iccp = get_imgmetadata(
imgdata, imgformat, default_dpi, colorspace, rawdata, rot
)
if color == Colorspace["1"]:
raise JpegColorspaceError("jpeg can't be monochrome")
if color == Colorspace["P"]:
raise JpegColorspaceError("jpeg can't have a color palette")
if color == Colorspace["RGBA"]:
raise JpegColorspaceError("jpeg can't have an alpha channel")
logger.debug("read_images() embeds an MPO verbatim")
cleanup()
return [
(
color,
ndpi,
ImageFormat.JPEG,
rawdata,
None,
imgwidthpx,
imgheightpx,
[],
False,
8,
rotation,
iccp,
)
]
# If the control flow reaches here, the MPO has more than a single
# frame but was not detected to be a main image followed by multiple
# thumbnails. We thus treat this MPO as we do other multi-frame images
# and include all its frames as individual pages.
for offset, mpent in zip( for offset, mpent in zip(
imgdata._MpoImageFile__mpoffsets, imgdata.mpinfo[0xB002] imgdata._MpoImageFile__mpoffsets, imgdata.mpinfo[0xB002]
): ):
@ -2066,9 +1665,6 @@ def read_images(
) )
] ]
if imgformat == ImageFormat.MIFF:
return parse_miff(rawdata)
# If our input is not JPEG or PNG, then we might have a format that # If our input is not JPEG or PNG, then we might have a format that
# supports multiple frames (like TIFF or GIF), so we need a loop to # supports multiple frames (like TIFF or GIF), so we need a loop to
# iterate through all frames of the image. # iterate through all frames of the image.
@ -2234,16 +1830,7 @@ def read_images(
) )
) )
else: else:
if color in [Colorspace.P, Colorspace.PA] and iccp is not None: if (
# PDF does not support palette images with icc profile
if color == Colorspace.P:
newcolor = Colorspace.RGB
newimg = newimg.convert(mode="RGB")
elif color == Colorspace.PA:
newcolor = Colorspace.RGBA
newimg = newimg.convert(mode="RGBA")
smaskidat = None
elif (
color == Colorspace.RGBA color == Colorspace.RGBA
or color == Colorspace.LA or color == Colorspace.LA
or color == Colorspace.PA or color == Colorspace.PA
@ -2257,21 +1844,25 @@ def read_images(
newcolor = color newcolor = color
l, a = newimg.split() l, a = newimg.split()
newimg = l newimg = l
elif color == Colorspace.PA or (
color == Colorspace.P and "transparency" in newimg.info
):
newcolor = color
a = newimg.convert(mode="RGBA").split()[-1]
else: else:
newcolor = Colorspace.RGBA newcolor = Colorspace.RGBA
r, g, b, a = newimg.convert(mode="RGBA").split() r, g, b, a = newimg.convert(mode="RGBA").split()
newimg = Image.merge("RGB", (r, g, b)) newimg = Image.merge("RGB", (r, g, b))
smaskidat, *_ = to_png_data(a) smaskidat, _, _ = to_png_data(a)
logger.warning( logger.warning(
"Image contains an alpha channel. Computing a separate " "Image contains an alpha channel which will be stored "
"soft mask (/SMask) image to store transparency in PDF." "as a separate soft mask (/SMask) image in PDF."
) )
elif color in [Colorspace.P, Colorspace.PA] and iccp is not None:
# PDF does not support palette images with icc profile
if color == Colorspace.P:
newcolor = Colorspace.RGB
newimg = newimg.convert(mode="RGB")
elif color == Colorspace.PA:
newcolor = Colorspace.RGBA
newimg = newimg.convert(mode="RGBA")
smaskidat = None
else: else:
newcolor = color newcolor = color
smaskidat = None smaskidat = None
@ -2613,6 +2204,7 @@ def find_scale(pagewidth, pageheight):
# as a binary string representing the image content or as filenames to the # as a binary string representing the image content or as filenames to the
# images. # images.
def convert(*images, **kwargs): def convert(*images, **kwargs):
_default_kwargs = dict( _default_kwargs = dict(
engine=None, engine=None,
title=None, title=None,
@ -2642,7 +2234,6 @@ def convert(*images, **kwargs):
artborder=None, artborder=None,
pdfa=None, pdfa=None,
rotation=None, rotation=None,
include_thumbnails=False,
) )
for kwname, default in _default_kwargs.items(): for kwname, default in _default_kwargs.items():
if kwname not in kwargs: if kwname not in kwargs:
@ -2686,16 +2277,11 @@ def convert(*images, **kwargs):
for img in images: for img in images:
# img is allowed to be a path, a binary string representing image data # img is allowed to be a path, a binary string representing image data
# or a file-like object (really anything that implements read()) # or a file-like object (really anything that implements read())
# or a pathlib.Path object (really anything that implements read_bytes())
rawdata = None
for fun in "read", "read_bytes":
try: try:
rawdata = getattr(img, fun)() rawdata = img.read()
except AttributeError: except AttributeError:
pass
if rawdata is None:
if not isinstance(img, (str, bytes)): if not isinstance(img, (str, bytes)):
raise TypeError("Neither read(), read_bytes() nor is str or bytes") raise TypeError("Neither implements read() nor is str or bytes")
# the thing doesn't have a read() function, so try if we can treat # the thing doesn't have a read() function, so try if we can treat
# it as a file name # it as a file name
try: try:
@ -2713,10 +2299,6 @@ def convert(*images, **kwargs):
rawdata = f.read() rawdata = f.read()
f.close() f.close()
# md5 = hashlib.md5(rawdata).hexdigest()
# with open("./testdata/" + md5, "wb") as f:
# f.write(rawdata)
for ( for (
color, color,
ndpi, ndpi,
@ -2735,7 +2317,6 @@ def convert(*images, **kwargs):
kwargs["colorspace"], kwargs["colorspace"],
kwargs["first_frame_only"], kwargs["first_frame_only"],
kwargs["rotation"], kwargs["rotation"],
kwargs["include_thumbnails"],
): ):
pagewidth, pageheight, imgwidthpdf, imgheightpdf = kwargs["layout_fun"]( pagewidth, pageheight, imgwidthpdf, imgheightpdf = kwargs["layout_fun"](
imgwidthpx, imgheightpx, ndpi imgwidthpx, imgheightpx, ndpi
@ -3111,7 +2692,7 @@ def valid_date(string):
else: else:
try: try:
return parser.parse(string) return parser.parse(string)
except: except TypeError:
pass pass
# as a last resort, try the local date utility # as a last resort, try the local date utility
try: try:
@ -3124,7 +2705,7 @@ def valid_date(string):
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
pass pass
else: else:
return datetime.fromtimestamp(int(utime)) return datetime.utcfromtimestamp(int(utime))
raise argparse.ArgumentTypeError("cannot parse date: %s" % string) raise argparse.ArgumentTypeError("cannot parse date: %s" % string)
@ -3826,35 +3407,7 @@ def gui():
app.mainloop() app.mainloop()
def file_is_icc(fname): def main(argv=sys.argv):
with open(fname, "rb") as f:
data = f.read(40)
if len(data) < 40:
return False
return data[36:] == b"acsp"
def validate_icc(fname):
    """Check that *fname* is an ICC profile and return it unchanged.

    Used as an argparse ``type=`` callback, so every failure is reported
    as ``argparse.ArgumentTypeError``.  That lets argparse print a usage
    error instead of a traceback.

    Raises:
        argparse.ArgumentTypeError: if the file cannot be opened or read,
            or if ``file_is_icc`` rejects its content.
    """
    try:
        is_icc = file_is_icc(fname)
    except OSError as e:
        # argparse only converts ArgumentTypeError, TypeError and ValueError
        # into usage errors, so a missing or unreadable file has to be
        # translated here.
        raise argparse.ArgumentTypeError(
            'cannot read ICC profile "%s": %s' % (fname, e)
        ) from e
    if not is_icc:
        raise argparse.ArgumentTypeError('"%s" is not an ICC profile' % fname)
    return fname
def get_default_icc_profile():
    """Return the path of the default sRGB ICC profile.

    The known install locations are tried in order.  The first one that
    exists on disk and is accepted by ``file_is_icc`` is returned.  If none
    qualifies, the first location is returned anyway, without checking that
    it exists.
    """
    fallback = "/usr/share/color/icc/sRGB.icc"
    candidates = (
        fallback,
        "/usr/share/color/icc/OpenICC/sRGB.icc",
        "/usr/share/color/icc/colord/sRGB.icc",
    )
    for candidate in candidates:
        # the existence check comes first so file_is_icc never opens a
        # missing path
        if os.path.exists(candidate) and file_is_icc(candidate):
            return candidate
    return fallback
def get_main_parser():
rendered_papersizes = "" rendered_papersizes = ""
for k, v in sorted(papersizes.items()): for k, v in sorted(papersizes.items()):
rendered_papersizes += " %-8s %s\n" % (papernames[k], v) rendered_papersizes += " %-8s %s\n" % (papernames[k], v)
@ -3895,9 +3448,7 @@ Paper sizes:
the value in the second column has the same effect as giving the short hand the value in the second column has the same effect as giving the short hand
in the first column. Appending ^T (a caret/circumflex followed by the letter in the first column. Appending ^T (a caret/circumflex followed by the letter
T) turns the paper size from portrait into landscape. The postfix thus T) turns the paper size from portrait into landscape. The postfix thus
symbolizes the transpose. Note that on Windows cmd.exe the caret symbol is symbolizes the transpose. The values are case insensitive.
the escape character, so you need to put quotes around the option value.
The values are case insensitive.
%s %s
@ -3964,7 +3515,7 @@ Examples:
while preserving its aspect ratio and a print border of 2 cm on the top and while preserving its aspect ratio and a print border of 2 cm on the top and
bottom and 2.5 cm on the left and right hand side. bottom and 2.5 cm on the left and right hand side.
$ img2pdf --output out.pdf --pagesize "A4^T" --border 2cm:2.5cm *.jpg $ img2pdf --output out.pdf --pagesize A4^T --border 2cm:2.5cm *.jpg
On each A4 page, fit images into a 10 cm times 15 cm rectangle but keep the On each A4 page, fit images into a 10 cm times 15 cm rectangle but keep the
original image size if the image is smaller than that. original image size if the image is smaller than that.
@ -4099,17 +3650,6 @@ RGB.""",
"input image be converted into a page in the resulting PDF.", "input image be converted into a page in the resulting PDF.",
) )
outargs.add_argument(
"--include-thumbnails",
action="store_true",
help="Some multi-frame formats like MPO carry a main image and "
"one or more scaled-down copies of the main image (thumbnails). "
"In such a case, img2pdf will only include the main image and "
"not create additional pages for each of the thumbnails. If this "
"option is set, img2pdf will instead create one page per frame and "
"thus store each thumbnail on its own page.",
)
outargs.add_argument( outargs.add_argument(
"--pillow-limit-break", "--pillow-limit-break",
action="store_true", action="store_true",
@ -4121,28 +3661,13 @@ RGB.""",
% Image.MAX_IMAGE_PIXELS, % Image.MAX_IMAGE_PIXELS,
) )
if sys.platform == "win32":
# on Windows, there are no default paths to search for an ICC profile
# so make the argument required instead of optional
outargs.add_argument(
"--pdfa",
type=validate_icc,
help="Output a PDF/A-1b compliant document. The argument to this "
"option is the path to the ICC profile that will be embedded into "
"the resulting PDF.",
)
else:
outargs.add_argument( outargs.add_argument(
"--pdfa", "--pdfa",
nargs="?", nargs="?",
const=get_default_icc_profile(), const="/usr/share/color/icc/sRGB.icc",
default=None, default=None,
type=validate_icc,
help="Output a PDF/A-1b compliant document. By default, this will " help="Output a PDF/A-1b compliant document. By default, this will "
"embed either /usr/share/color/icc/sRGB.icc, " "embed /usr/share/color/icc/sRGB.icc as the color profile.",
"/usr/share/color/icc/OpenICC/sRGB.icc or "
"/usr/share/color/icc/colord/sRGB.icc as the color profile, whichever "
"is found to exist first.",
) )
sizeargs = parser.add_argument_group( sizeargs = parser.add_argument_group(
@ -4432,11 +3957,8 @@ and left/right, respectively. It is not possible to specify asymmetric borders.
action="store_true", action="store_true",
help="Instruct the PDF viewer to open the PDF in fullscreen mode", help="Instruct the PDF viewer to open the PDF in fullscreen mode",
) )
return parser
args = parser.parse_args(argv[1:])
def main(argv=sys.argv):
args = get_main_parser().parse_args(argv[1:])
if args.verbose: if args.verbose:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
@ -4460,11 +3982,7 @@ def main(argv=sys.argv):
elif len(args.images) == 0 and len(args.from_file) == 0: elif len(args.images) == 0 and len(args.from_file) == 0:
# if no positional arguments were supplied, read a single image from # if no positional arguments were supplied, read a single image from
# standard input # standard input
print( logger.info("reading image from standard input")
"Reading image from standard input...\n"
"Re-run with -h or --help for usage information.",
file=sys.stderr,
)
try: try:
images = [sys.stdin.buffer.read()] images = [sys.stdin.buffer.read()]
except KeyboardInterrupt: except KeyboardInterrupt:
@ -4525,7 +4043,6 @@ def main(argv=sys.argv):
artborder=args.art_border, artborder=args.art_border,
pdfa=args.pdfa, pdfa=args.pdfa,
rotation=args.rotation, rotation=args.rotation,
include_thumbnails=args.include_thumbnails,
) )
except Exception as e: except Exception as e:
logger.error("error: " + str(e)) logger.error("error: " + str(e))

File diff suppressed because it is too large Load diff

View file

@ -37,8 +37,9 @@ def getBox(data, byteStart, noBytes):
def parse_ihdr(data): def parse_ihdr(data):
height, width, channels, bpp = struct.unpack(">IIHB", data[:11]) height = struct.unpack(">I", data[0:4])[0]
return width, height, channels, bpp + 1 width = struct.unpack(">I", data[4:8])[0]
return width, height
def parse_colr(data): def parse_colr(data):
@ -84,13 +85,13 @@ def parse_jp2h(data):
while byteStart < noBytes and boxLengthValue != 0: while byteStart < noBytes and boxLengthValue != 0:
boxLengthValue, boxType, byteEnd, boxContents = getBox(data, byteStart, noBytes) boxLengthValue, boxType, byteEnd, boxContents = getBox(data, byteStart, noBytes)
if boxType == b"ihdr": if boxType == b"ihdr":
width, height, channels, bpp = parse_ihdr(boxContents) width, height = parse_ihdr(boxContents)
elif boxType == b"colr": elif boxType == b"colr":
colorspace = parse_colr(boxContents) colorspace = parse_colr(boxContents)
elif boxType == b"res ": elif boxType == b"res ":
hdpi, vdpi = parse_res(boxContents) hdpi, vdpi = parse_res(boxContents)
byteStart = byteEnd byteStart = byteEnd
return (width, height, colorspace, hdpi, vdpi, channels, bpp) return (width, height, colorspace, hdpi, vdpi)
def parsejp2(data): def parsejp2(data):
@ -101,9 +102,7 @@ def parsejp2(data):
while byteStart < noBytes and boxLengthValue != 0: while byteStart < noBytes and boxLengthValue != 0:
boxLengthValue, boxType, byteEnd, boxContents = getBox(data, byteStart, noBytes) boxLengthValue, boxType, byteEnd, boxContents = getBox(data, byteStart, noBytes)
if boxType == b"jp2h": if boxType == b"jp2h":
width, height, colorspace, hdpi, vdpi, channels, bpp = parse_jp2h( width, height, colorspace, hdpi, vdpi = parse_jp2h(boxContents)
boxContents
)
break break
byteStart = byteEnd byteStart = byteEnd
if not width: if not width:
@ -113,41 +112,13 @@ def parsejp2(data):
if not colorspace: if not colorspace:
raise Exception("no colorspace in jp2 header") raise Exception("no colorspace in jp2 header")
# retrieving the dpi is optional so we do not error out if not present # retrieving the dpi is optional so we do not error out if not present
return (width, height, colorspace, hdpi, vdpi, channels, bpp) return (width, height, colorspace, hdpi, vdpi)
def parsej2k(data):
    """Read image size and component count from a raw j2k header.

    The fixed-size header fields are read from bytes 4..41 of *data* and
    are followed by three bytes per component.  Only input whose per
    component sample size bytes are exactly ``[7, 7, 7]`` is accepted; for
    that input the reported bits per component is 8.
    NOTE(review): the field names follow the original code (lsiz, rsiz,
    xsiz, ...), presumably the SIZ marker segment of a JPEG 2000
    codestream -- confirm against the specification.

    Returns:
        A 7-tuple ``(width, height, None, None, None, channels, 8)`` where
        width and height are ``xsiz - xosiz`` and ``ysiz - yosiz``.  The
        three ``None`` slots are where ``parsejp2`` returns colorspace,
        hdpi and vdpi.

    Raises:
        struct.error: if *data* is too short for the header fields.
        ValueError: if the component sample sizes are not ``[7, 7, 7]``.
    """
    (
        _lsiz,
        _rsiz,
        xsiz,
        ysiz,
        xosiz,
        yosiz,
        _,
        _,
        _,
        _,
        csiz,
    ) = struct.unpack(">HHIIIIIIIIH", data[4:42])
    ssiz = []
    for i in range(csiz):
        # each component contributes three bytes; only the first is checked
        sample_size, _xrsiz, _yrsiz = struct.unpack(
            "BBB", data[42 + 3 * i : 42 + 3 * (i + 1)]
        )
        ssiz.append(sample_size)
    # An explicit raise instead of assert: asserts are stripped under -O,
    # which would let unsupported input through silently.
    if ssiz != [7, 7, 7]:
        raise ValueError(
            "unsupported j2k component sample sizes %r (expected [7, 7, 7])" % ssiz
        )
    return xsiz - xosiz, ysiz - yosiz, None, None, None, csiz, 8
def parse(data):
    """Dispatch *data* to the matching JPEG 2000 header parser.

    Input that starts with the four bytes ``ff 4f ff 51`` is handed to
    ``parsej2k``; everything else goes to ``parsejp2``.  The chosen
    parser's result is returned unchanged.
    """
    parser = parsej2k if data[:4] == b"\xff\x4f\xff\x51" else parsejp2
    return parser(data)
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
width, height, colorspace, hdpi, vdpi, channels, bpp = parse( width, height, colorspace = parsejp2(open(sys.argv[1]).read())
open(sys.argv[1], "rb").read() sys.stdout.write("width = %d" % width)
) sys.stdout.write("height = %d" % height)
print("width = %d" % width) sys.stdout.write("colorspace = %s" % colorspace)
print("height = %d" % height)
print("colorspace = %s" % colorspace)
print("hdpi = %s" % hdpi)
print("vdpi = %s" % vdpi)
print("channels = %s" % channels)
print("bpp = %s" % bpp)