src/tests/__init__.py: use pikepdf instead of pdfrw

This commit is contained in:
Johannes 'josch' Schauer 2020-05-28 12:43:08 +02:00
parent c808061b4b
commit 997fe8efd8
Signed by untrusted user: josch
GPG key ID: F2CBA5C78FBD83E1

View file

@ -7,57 +7,11 @@ import sys
import zlib import zlib
from PIL import Image from PIL import Image
from io import StringIO, BytesIO, TextIOWrapper from io import StringIO, BytesIO, TextIOWrapper
import pikepdf
import decimal
HERE = os.path.dirname(__file__) HERE = os.path.dirname(__file__)
PdfReaderIO = StringIO
# Recompressing the image stream makes the comparison robust against
# output-preserving changes in the zlib-compressed bitstream
# (e.g. between different zlib implementations/versions/releases).
# Without this, some img2pdf 0.3.2 tests fail on Fedora 29/aarch64.
# See also:
# https://gitlab.mister-muffin.de/josch/img2pdf/issues/51
# https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/R7GD4L5Z6HELCDAL2RDESWR2F3ZXHWVX/
def recompress_last_stream(bs):
    """Return ``bs`` with its last stream re-deflated by the local zlib.

    The function locates the last ``/Length <n> `` entry in the serialized
    PDF, takes the ``n`` bytes following the next ``\\nstream\\n`` marker,
    inflates and deflates them again, and writes back both the new length
    and the new payload.  The number following the last ``\\nstartxref\\n``
    marker is shifted by the resulting change in size.

    Raises ValueError when one of the markers is missing and zlib.error
    when the located payload is not valid zlib data.
    """
    length_key = b'/Length'
    stream_marker = b'\nstream\n'
    startxref_marker = b'\nstartxref\n'

    # The decimal length sits between "/Length " and the next space.
    len_begin = bs.rindex(length_key) + len(length_key) + 1
    len_end = bs.index(b' ', len_begin)
    old_size = int(bs[len_begin:len_end])

    # Payload boundaries, derived from the declared length.
    data_begin = bs.index(stream_marker, len_end) + len(stream_marker)
    data_end = data_begin + old_size

    # The startxref value runs up to the end of its line.
    xref_begin = bs.rindex(startxref_marker) + len(startxref_marker)
    xref_end = bs.index(b'\n', xref_begin)
    old_xref = int(bs[xref_begin:xref_end])

    fresh = zlib.compress(zlib.decompress(bs[data_begin:data_end]))
    fresh_size = str(len(fresh)).encode('ascii')

    # Both the length digits and the payload may have changed in size.
    shift = (len(fresh_size) + len(fresh)) - ((len_end - len_begin) + old_size)
    new_xref = str(old_xref + shift).encode('ascii')

    head = bs[:len_begin] + fresh_size + bs[len_end:data_begin]
    tail = bs[data_end:xref_begin] + new_xref + bs[xref_end:]
    return head + fresh + tail
def compare_pdf(outx, outy):
    """Return True when two serialized PDFs are equivalent, else False.

    When ``outx`` contains ``/FlateDecode``, both inputs are first passed
    through ``recompress_last_stream`` so that differences caused only by
    the zlib implementation do not count as a mismatch.  Otherwise the raw
    bytes are compared directly.

    On a mismatch the compared data is written to stderr to ease debugging.
    """
    if b'/FlateDecode' in outx:
        x = recompress_last_stream(outx)
        y = recompress_last_stream(outy)
        if x != y:
            print('original outx:\n{}\nouty:\n{}\n'.format(outx, outy), file=sys.stderr)
            print('recompressed outx:\n{}\nouty:\n{}\n'.format(x, y), file=sys.stderr)
            return False
    elif outx != outy:
        print('original outx:\n{}\nouty:\n{}\n'.format(outx, outy), file=sys.stderr)
        # A mismatch of uncompressed output must fail the comparison too;
        # previously this branch fell through and reported equality.
        return False
    return True
# convert +set date:create +set date:modify -define png:exclude-chunk=time # convert +set date:create +set date:modify -define png:exclude-chunk=time
# we define some variables so that the table below can be narrower # we define some variables so that the table below can be narrower
@ -535,34 +489,24 @@ def test_suite():
assert os.path.isfile(outputf) assert os.path.isfile(outputf)
def handle(self, f=inputf, out=outputf, with_pdfrw=with_pdfrw): def handle(self, f=inputf, out=outputf, with_pdfrw=with_pdfrw):
try:
from pdfrw import PdfReader, PdfName, PdfWriter
from pdfrw.py23_diffs import convert_load, convert_store
except ImportError:
# the test requires pdfrw
self.skipTest("this test requires pdfrw")
return
with open(f, "rb") as inf: with open(f, "rb") as inf:
orig_imgdata = inf.read() orig_imgdata = inf.read()
output = img2pdf.convert(orig_imgdata, nodate=True, output = img2pdf.convert(orig_imgdata, nodate=True,
with_pdfrw=with_pdfrw) with_pdfrw=with_pdfrw)
x = PdfReader(PdfReaderIO(convert_load(output))) x = pikepdf.open(BytesIO(output))
self.assertEqual(sorted(x.keys()), [PdfName.Info, PdfName.Root, self.assertIn(x.Root.Pages.Count, (1, 2))
PdfName.Size])
self.assertIn(x.Root.Pages.Count, ('1', '2'))
if len(x.Root.Pages.Kids) == '1': if len(x.Root.Pages.Kids) == '1':
self.assertEqual(x.Size, '7') self.assertEqual(x.Size, '7')
self.assertEqual(len(x.Root.Pages.Kids), 1) self.assertEqual(len(x.Root.Pages.Kids), 1)
elif len(x.Root.Pages.Kids) == '2': elif len(x.Root.Pages.Kids) == '2':
self.assertEqual(x.Size, '10') self.assertEqual(x.Size, '10')
self.assertEqual(len(x.Root.Pages.Kids), 2) self.assertEqual(len(x.Root.Pages.Kids), 2)
self.assertEqual(x.Info, {}) self.assertEqual(sorted(x.Root.keys()), ["/Pages",
self.assertEqual(sorted(x.Root.keys()), [PdfName.Pages, "/Type"])
PdfName.Type]) self.assertEqual(x.Root.Type, "/Catalog")
self.assertEqual(x.Root.Type, PdfName.Catalog)
self.assertEqual(sorted(x.Root.Pages.keys()), self.assertEqual(sorted(x.Root.Pages.keys()),
[PdfName.Count, PdfName.Kids, PdfName.Type]) ["/Count", "/Kids", "/Type"])
self.assertEqual(x.Root.Pages.Type, PdfName.Pages) self.assertEqual(x.Root.Pages.Type, "/Pages")
orig_img = Image.open(f) orig_img = Image.open(f)
for pagenum in range(len(x.Root.Pages.Kids)): for pagenum in range(len(x.Root.Pages.Kids)):
# retrieve the original image frame that this page was # retrieve the original image frame that this page was
@ -583,57 +527,54 @@ def test_suite():
def format_float(f): def format_float(f):
if int(f) == f: if int(f) == f:
return str(int(f)) return int(f)
else: else:
return ("%.4f" % f).rstrip("0") return decimal.Decimal("%.4f" % f)
self.assertEqual(sorted(cur_page.keys()), self.assertEqual(sorted(cur_page.keys()),
[PdfName.Contents, PdfName.MediaBox, ["/Contents", "/MediaBox",
PdfName.Parent, PdfName.Resources, "/Parent", "/Resources",
PdfName.Type]) "/Type"])
self.assertEqual(cur_page.MediaBox, self.assertEqual(cur_page.MediaBox,
['0', '0', format_float(pagewidth), pikepdf.Array([0, 0, format_float(pagewidth),
format_float(pageheight)]) format_float(pageheight)]))
self.assertEqual(cur_page.Parent, x.Root.Pages) self.assertEqual(cur_page.Parent, x.Root.Pages)
self.assertEqual(cur_page.Type, PdfName.Page) self.assertEqual(cur_page.Type, "/Page")
self.assertEqual(cur_page.Resources.keys(), self.assertEqual(cur_page.Resources.keys(),
[PdfName.XObject]) {"/XObject"})
self.assertEqual(cur_page.Resources.XObject.keys(), self.assertEqual(cur_page.Resources.XObject.keys(),
[PdfName.Im0]) {"/Im0"})
self.assertEqual(cur_page.Contents.keys(),
[PdfName.Length])
self.assertEqual(cur_page.Contents.Length, self.assertEqual(cur_page.Contents.Length,
str(len(cur_page.Contents.stream))) len(cur_page.Contents.read_bytes()))
self.assertEqual(cur_page.Contents.stream, self.assertEqual(cur_page.Contents.read_bytes(),
"q\n%.4f 0 0 %.4f 0.0000 0.0000 cm\n" b"q\n%.4f 0 0 %.4f 0.0000 0.0000 cm\n"
"/Im0 Do\nQ" % (pagewidth, pageheight)) b"/Im0 Do\nQ" % (pagewidth, pageheight))
imgprops = cur_page.Resources.XObject.Im0 imgprops = cur_page.Resources.XObject.Im0
# test if the filter is valid: # test if the filter is valid:
self.assertIn( self.assertIn(
imgprops.Filter, [PdfName.DCTDecode, PdfName.JPXDecode, imgprops.Filter, ["/DCTDecode", "/JPXDecode",
PdfName.FlateDecode, "/FlateDecode",
[PdfName.CCITTFaxDecode]]) pikepdf.Array([ pikepdf.Name.CCITTFaxDecode ])])
# test if the image has correct size # test if the image has correct size
self.assertEqual(imgprops.Width, str(orig_img.size[0])) self.assertEqual(imgprops.Width, orig_img.size[0])
self.assertEqual(imgprops.Height, str(orig_img.size[1])) self.assertEqual(imgprops.Height, orig_img.size[1])
# if the input file is a jpeg then it should've been copied # if the input file is a jpeg then it should've been copied
# verbatim into the PDF # verbatim into the PDF
if imgprops.Filter in [PdfName.DCTDecode, if imgprops.Filter in ["/DCTDecode",
PdfName.JPXDecode]: "/JPXDecode"]:
self.assertEqual( self.assertEqual(
cur_page.Resources.XObject.Im0.stream, cur_page.Resources.XObject.Im0.read_raw_bytes(),
convert_load(orig_imgdata)) orig_imgdata)
elif imgprops.Filter == [PdfName.CCITTFaxDecode]: elif imgprops.Filter == pikepdf.Array([ pikepdf.Name.CCITTFaxDecode ]):
tiff_header = tiff_header_for_ccitt( tiff_header = tiff_header_for_ccitt(
int(imgprops.Width), int(imgprops.Height), int(imgprops.Width), int(imgprops.Height),
int(imgprops.Length), 4) int(imgprops.Length), 4)
imgio = BytesIO() imgio = BytesIO()
imgio.write(tiff_header) imgio.write(tiff_header)
imgio.write(convert_store( imgio.write(cur_page.Resources.XObject.Im0.read_raw_bytes())
cur_page.Resources.XObject.Im0.stream))
imgio.seek(0) imgio.seek(0)
im = Image.open(imgio) im = Image.open(imgio)
self.assertEqual(im.tobytes(), orig_img.tobytes()) self.assertEqual(im.tobytes(), orig_img.tobytes())
@ -641,13 +582,12 @@ def test_suite():
im.close() im.close()
except AttributeError: except AttributeError:
pass pass
elif imgprops.Filter == "/FlateDecode":
elif imgprops.Filter == PdfName.FlateDecode:
# otherwise, the data is flate encoded and has to be equal # otherwise, the data is flate encoded and has to be equal
# to the pixel data of the input image # to the pixel data of the input image
imgdata = zlib.decompress( imgdata = zlib.decompress(
convert_store(cur_page.Resources.XObject.Im0.stream)) cur_page.Resources.XObject.Im0.read_raw_bytes())
if imgprops.DecodeParms: if hasattr(imgprops, "DecodeParms"):
if orig_img.format == 'PNG': if orig_img.format == 'PNG':
pngidat, palette = img2pdf.parse_png(orig_imgdata) pngidat, palette = img2pdf.parse_png(orig_imgdata)
elif orig_img.format == 'TIFF' \ elif orig_img.format == 'TIFF' \
@ -664,11 +604,11 @@ def test_suite():
self.assertEqual(zlib.decompress(pngidat), imgdata) self.assertEqual(zlib.decompress(pngidat), imgdata)
else: else:
colorspace = imgprops.ColorSpace colorspace = imgprops.ColorSpace
if colorspace == PdfName.DeviceGray: if colorspace == "/DeviceGray":
colorspace = 'L' colorspace = 'L'
elif colorspace == PdfName.DeviceRGB: elif colorspace == "/DeviceRGB":
colorspace = 'RGB' colorspace = 'RGB'
elif colorspace == PdfName.DeviceCMYK: elif colorspace == "/DeviceCMYK":
colorspace = 'CMYK' colorspace = 'CMYK'
else: else:
raise Exception("invalid colorspace") raise Exception("invalid colorspace")
@ -689,18 +629,17 @@ def test_suite():
im.close() im.close()
except AttributeError: except AttributeError:
pass pass
else:
raise Exception("unknown filter")
# now use pdfrw to parse and then write out both pdfs and check the # now use pdfrw to parse and then write out both pdfs and check the
# result for equality # result for equality
y = PdfReader(out) y = pikepdf.open(out)
outx = BytesIO() outx = BytesIO()
outy = BytesIO() outy = BytesIO()
xwriter = PdfWriter() x.save(outx, compress_streams = False, static_id=True)
ywriter = PdfWriter() y.save(outy, compress_streams = False, static_id=True)
xwriter.trailer = x self.assertEqual(outx.getvalue(), outy.getvalue())
ywriter.trailer = y
xwriter.write(outx)
ywriter.write(outy)
self.assertEqual(compare_pdf(outx.getvalue(), outy.getvalue()), True)
# the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
# close() method # close() method
try: try: