From a3046ca7712d9f99692ad06273b08718a7c12c07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Bu=CC=88nemann?= Date: Sat, 5 Apr 2014 01:55:03 +0200 Subject: [PATCH 01/53] Add pillow 2.4.0 support Pillow 2.4.0 added support for JPEG2000 using OpenJPEG 2.0. Because Pillow calls the format JPEG2000 instead of JP2, we need to rename it to enable the optimized code path. Should still be backwards compatible. --- src/img2pdf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 9753a7b..2695125 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -133,7 +133,7 @@ class pdfdoc(object): # either embed the whole jpeg or deflate the bitmap representation if imgformat is "JPEG": ofilter = [ "/DCTDecode" ] - elif imgformat is "JP2": + elif imgformat is "JPEG2000": ofilter = [ "/JPXDecode" ] self.version = 5 # jpeg2000 needs pdf 1.5 else: @@ -218,7 +218,7 @@ def convert(images, dpi, title=None, author=None, creator=None, producer=None, exit(1) # image is jpeg2000 width, height, ics = parsejp2(rawdata) - imgformat = "JP2" + imgformat = "JPEG2000" if dpi: ndpi = dpi, dpi @@ -256,7 +256,7 @@ def convert(images, dpi, title=None, author=None, creator=None, producer=None, # depending on the input format, determine whether to pass the raw # image or the zlib compressed color information - if imgformat is "JPEG" or imgformat is "JP2": + if imgformat is "JPEG" or imgformat is "JPEG2000": if color == '1': error_out("jpeg can't be monochrome") exit(1) From acfb3270ab983dbe389731f672ea60e09f4213c1 Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 27 Jun 2014 22:31:21 +0200 Subject: [PATCH 02/53] fix markdown in readme --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 6061d8e..ef0b405 100644 --- a/README.md +++ b/README.md @@ -103,25 +103,25 @@ Installation You can install the package using: - $ pip install img2pdf + $ pip install img2pdf If you want to install from source code simply use: - $ cd img2pdf/ - $ pip install . + $ cd img2pdf/ + $ pip install . To test the console script without installing the package on your system, simply use virtualenv: - $ cd img2pdf/ - $ virtualenv ve - $ ve/bin/pip install . + $ cd img2pdf/ + $ virtualenv ve + $ ve/bin/pip install . You can then test the converter using: - $ ve/bin/img2pdf -o test.pdf src/tests/test.jpg + $ ve/bin/img2pdf -o test.pdf src/tests/test.jpg Note that the package can also be used as a library as follows: - import img2pdf - pdf_bytes = img2pdf('test.jpg', dpi=150) + import img2pdf + pdf_bytes = img2pdf('test.jpg', dpi=150) From 563bcb03727834ef4367883d02c98fd5eb86d925 Mon Sep 17 00:00:00 2001 From: Nicolas ELIE Date: Sat, 26 Jul 2014 16:12:40 +0200 Subject: [PATCH 03/53] Added python shebang Added python shebang to be able to launch script directly --- src/img2pdf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/img2pdf.py b/src/img2pdf.py index 2695125..90795f0 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python2 + # Copyright (C) 2012-2014 Johannes 'josch' Schauer # # This program is free software: you can redistribute it and/or From 9b35f5cf6b32d5668c1bd8b843be057684ca118f Mon Sep 17 00:00:00 2001 From: xiota Date: Mon, 4 Aug 2014 11:25:07 -0400 Subject: [PATCH 04/53] add options to specify pdf dimensions in points add options specify output pdf dimensions in points: -x width; -y height. --- src/img2pdf.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 90795f0..a8b949e 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -117,7 +117,7 @@ class pdfdoc(object): obj.identifier = newid self.objects.append(obj) - def addimage(self, color, width, height, dpi, imgformat, imgdata): + def addimage(self, color, width, height, imgformat, imgdata, pdf_x, pdf_y): if color == 'L': color = "/DeviceGray" elif color == 'RGB': @@ -126,9 +126,6 @@ class pdfdoc(object): error_out("unsupported color space: %s"%color) exit(1) - # pdf units = 1/72 inch - pdf_x, pdf_y = 72.0*width/dpi[0], 72.0*height/dpi[1] - if pdf_x < 3.00 or pdf_y < 3.00: warning_out("pdf width or height is below 3.00 - decrease the dpi") @@ -200,7 +197,7 @@ class pdfdoc(object): result += "%%EOF\n" return result -def convert(images, dpi, title=None, author=None, creator=None, producer=None, +def convert(images, dpi, pdf_x, pdf_y, title=None, author=None, creator=None, producer=None, creationdate=None, moddate=None, subject=None, keywords=None, colorspace=None, verbose=False): @@ -270,7 +267,15 @@ def convert(images, dpi, title=None, author=None, creator=None, producer=None, color = 'L' imgdata = zlib.compress(imgdata.tostring()) - pdf.addimage(color, width, height, ndpi, imgformat, imgdata) + # pdf units = 1/72 inch + if not pdf_x and not pdf_y: + pdf_x, pdf_y = 72.0*width/ndpi[0], 72.0*height/ndpi[1] + elif not pdf_y: + pdf_y = pdf_x*height/width + elif not pdf_x: + pdf_x = pdf_y*width/height + + pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y) im.close() @@ -298,6 +303,12 @@ parser.add_argument( parser.add_argument( '-d', '--dpi', metavar='dpi', type=positive_float, help='dpi for pdf output (default: 96.0)') +parser.add_argument( + '-x', metavar='pdf_x', type=positive_float, + help='output width in points') +parser.add_argument( + '-y', metavar='pdf_y', type=positive_float, + help='output height in points') parser.add_argument( '-t', '--title', metavar='title', type=str, help='title for metadata') @@ -332,9 +343,10 @@ def main(args=None): if args is None: args = sys.argv[1:] args = parser.parse_args(args) + args.output.write( convert( - args.images, args.dpi, args.title, args.author, + args.images, args.dpi, args.x, args.y, args.title, args.author, args.creator, args.producer, args.creationdate, args.moddate, args.subject, args.keywords, args.colorspace, args.verbose)) From 0bdf6472f877e1934f35eefe259dbb0b8545b35e Mon Sep 17 00:00:00 2001 From: josch Date: Sun, 24 Aug 2014 17:15:43 +0200 Subject: [PATCH 05/53] fix regression introduced by 9b35f5cf --- src/img2pdf.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index a8b949e..ae6a9fd 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -197,7 +197,7 @@ class pdfdoc(object): result += "%%EOF\n" return result -def convert(images, dpi, pdf_x, pdf_y, title=None, author=None, creator=None, producer=None, +def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=None, creationdate=None, moddate=None, subject=None, keywords=None, colorspace=None, verbose=False): @@ -268,12 +268,12 @@ def convert(images, dpi, pdf_x, pdf_y, title=None, author=None, creator=None, pr imgdata = zlib.compress(imgdata.tostring()) # pdf units = 1/72 inch - if not pdf_x and not pdf_y: + if not x and not y: pdf_x, pdf_y = 72.0*width/ndpi[0], 72.0*height/ndpi[1] - elif not pdf_y: - pdf_y = pdf_x*height/width - elif not pdf_x: - pdf_x = pdf_y*width/height + elif not y: + pdf_x, pdf_y = x, x*height/width + elif not x: + pdf_x, pdf_y = y*width/height, y pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y) From 43bbb27f00644c9f8a5f51f24045b89cb4c54f9f Mon Sep 17 00:00:00 2001 From: josch Date: Sun, 7 Sep 2014 07:57:29 +0200 Subject: [PATCH 06/53] prepare 0.1.1 release --- setup.cfg | 2 ++ setup.py | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..b88034e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md diff --git a/setup.py b/setup.py index 2b490dc..5fb12aa 100644 --- a/setup.py +++ b/setup.py @@ -2,8 +2,9 @@ from setuptools import setup setup ( name='img2pdf', - version='0.1.0', + version='0.1.1', author = "Johannes 'josch' Schauer", + author_email = 'j.schauer@email.de' description = "Convert images to PDF via direct JPEG inclusion.", long_description = open('README.md').read(), license = "LGPL", @@ -20,7 +21,8 @@ setup ( 'Programming Language :: Python', 'Natural Language :: English', 'Operating System :: OS Independent'], - url = 'http://pypi.python.org/pypi/img2pdf', + url = 'https://github.com/josch/img2pdf', + download_url = 'https://github.com/josch/img2pdf/archive/0.1.1.tar.gz', package_dir={"": "src"}, py_modules=['img2pdf', 'jp2'], include_package_data = True, From 2afa1e4ca51c007ffc973a7430ed352caf03d02c Mon Sep 17 00:00:00 2001 From: josch Date: Sun, 7 Sep 2014 07:58:40 +0200 Subject: [PATCH 07/53] fix typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5fb12aa..ec6f861 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup ( name='img2pdf', version='0.1.1', author = "Johannes 'josch' Schauer", - author_email = 'j.schauer@email.de' + author_email = 'j.schauer@email.de', description = "Convert images to PDF via direct JPEG inclusion.", long_description = open('README.md').read(), license = "LGPL", From 439d9761292bd7660fe65acf91ed77308743ad56 Mon Sep 17 00:00:00 2001 From: josch Date: Sun, 7 Sep 2014 07:59:12 +0200 Subject: [PATCH 08/53] prepare 0.1.2 release --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ec6f861..59412e3 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup ( name='img2pdf', - version='0.1.1', + version='0.1.2', author = "Johannes 'josch' Schauer", author_email = 'j.schauer@email.de', description = "Convert images to PDF via direct JPEG inclusion.", @@ -22,7 +22,7 @@ setup ( 'Natural Language :: English', 'Operating System :: OS Independent'], url = 'https://github.com/josch/img2pdf', - download_url = 'https://github.com/josch/img2pdf/archive/0.1.1.tar.gz', + download_url = 'https://github.com/josch/img2pdf/archive/0.1.2.tar.gz', package_dir={"": "src"}, py_modules=['img2pdf', 'jp2'], include_package_data = True, From c76f1344a525c009bbdf3747c1595fd78c7a367e Mon Sep 17 00:00:00 2001 From: "Ryan C. Thompson" Date: Wed, 5 Nov 2014 23:46:47 -0800 Subject: [PATCH 09/53] Avoid leaking file descriptors This change prevents img2pdf from opening *all* input files at once, which means it now works with thousands of input files. --- src/img2pdf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index ae6a9fd..16745c7 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -204,7 +204,9 @@ def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=N pdf = pdfdoc(3, title, author, creator, producer, creationdate, moddate, subject, keywords) - for im in images: + for imfilename in images: + debug_out("Reading %s"%imfilename, verbose) + im = open(imfilename, "rb") rawdata = im.read() im.seek(0) try: @@ -295,7 +297,7 @@ def valid_date(string): parser = argparse.ArgumentParser( description='Lossless conversion/embedding of images (in)to pdf') parser.add_argument( - 'images', metavar='infile', type=argparse.FileType('rb'), + 'images', metavar='infile', type=str, nargs='+', help='input file(s)') parser.add_argument( '-o', '--output', metavar='out', type=argparse.FileType('wb'), From b726afbb5ad07144df381cdd859b4c8f7010c472 Mon Sep 17 00:00:00 2001 From: "Ryan C. Thompson" Date: Wed, 5 Nov 2014 23:47:42 -0800 Subject: [PATCH 10/53] Convert unrecognized colorspaces to RGB Instead of crashing on an unrecognized colorspace, we now do imgdata.convert('RGB'). --- src/img2pdf.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/img2pdf.py b/src/img2pdf.py index 16745c7..24e92a3 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -265,8 +265,15 @@ def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=N else: # because we do not support /CCITTFaxDecode if color == '1': + debug_out("Converting colorspace 1 to L", verbose) imgdata = imgdata.convert('L') color = 'L' + elif color in ("RGB", "L"): + debug_out("Colorspace is OK: %s"%color, verbose) + else: + debug_out("Converting colorspace %s to RGB"%color, verbose) + imgdata = imgdata.convert('RGB') + color = imgdata.mode imgdata = zlib.compress(imgdata.tostring()) # pdf units = 1/72 inch From d09cd0f1973724f1a1eb6656176762398819a539 Mon Sep 17 00:00:00 2001 From: "Ryan C. Thompson" Date: Thu, 6 Nov 2014 00:53:16 -0800 Subject: [PATCH 11/53] Use "with" to open and close input files --- src/img2pdf.py | 82 ++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 24e92a3..fd845af 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -206,51 +206,51 @@ def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=N for imfilename in images: debug_out("Reading %s"%imfilename, verbose) - im = open(imfilename, "rb") - rawdata = im.read() - im.seek(0) - try: - imgdata = Image.open(im) - except IOError as e: - # test if it is a jpeg2000 image - if rawdata[:12] != "\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A": - error_out("cannot read input image (not jpeg2000)") - error_out("PIL: %s"%e) - exit(1) - # image is jpeg2000 - width, height, ics = parsejp2(rawdata) - imgformat = "JPEG2000" + with open(imfilename, "rb") as im: + rawdata = im.read() + im.seek(0) + try: + imgdata = Image.open(im) + except IOError as e: + # test if it is a jpeg2000 image + if rawdata[:12] != "\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A": + error_out("cannot read input image (not jpeg2000)") + error_out("PIL: %s"%e) + exit(1) + # image is jpeg2000 + width, height, ics = parsejp2(rawdata) + imgformat = "JPEG2000" - if dpi: - ndpi = dpi, dpi - debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) - else: - ndpi = (96, 96) # TODO: read real dpi - debug_out("input dpi = %d x %d"%ndpi, verbose) + if dpi: + ndpi = dpi, dpi + debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) + else: + ndpi = (96, 96) # TODO: read real dpi + debug_out("input dpi = %d x %d"%ndpi, verbose) - if colorspace: - color = colorspace - debug_out("input colorspace (forced) = %s"%(ics)) + if colorspace: + color = colorspace + debug_out("input colorspace (forced) = %s"%(ics)) + else: + color = ics + debug_out("input colorspace = %s"%(ics), verbose) else: - color = ics - debug_out("input colorspace = %s"%(ics), verbose) - else: - width, height = imgdata.size - imgformat = imgdata.format + width, height = imgdata.size + imgformat = imgdata.format - if dpi: - ndpi = dpi, dpi - debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) - else: - ndpi = imgdata.info.get("dpi", (96, 96)) - debug_out("input dpi = %d x %d"%ndpi, verbose) + if dpi: + ndpi = dpi, dpi + debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) + else: + ndpi = imgdata.info.get("dpi", (96, 96)) + debug_out("input dpi = %d x %d"%ndpi, verbose) - if colorspace: - color = colorspace - debug_out("input colorspace (forced) = %s"%(color), verbose) - else: - color = imgdata.mode - debug_out("input colorspace = %s"%(color), verbose) + if colorspace: + color = colorspace + debug_out("input colorspace (forced) = %s"%(color), verbose) + else: + color = imgdata.mode + debug_out("input colorspace = %s"%(color), verbose) debug_out("width x height = %d x %d"%(width,height), verbose) debug_out("imgformat = %s"%imgformat, verbose) @@ -286,8 +286,6 @@ def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=N pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y) - im.close() - return pdf.tostring() From d8a11d839c83656fae9b47fc84f1131b1f9186a2 Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 10 Nov 2014 10:13:52 +0100 Subject: [PATCH 12/53] fix problem with conversion of closed file --- src/img2pdf.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index fd845af..b6880cf 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -252,29 +252,29 @@ def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=N color = imgdata.mode debug_out("input colorspace = %s"%(color), verbose) - debug_out("width x height = %d x %d"%(width,height), verbose) - debug_out("imgformat = %s"%imgformat, verbose) + debug_out("width x height = %d x %d"%(width,height), verbose) + debug_out("imgformat = %s"%imgformat, verbose) - # depending on the input format, determine whether to pass the raw - # image or the zlib compressed color information - if imgformat is "JPEG" or imgformat is "JPEG2000": - if color == '1': - error_out("jpeg can't be monochrome") - exit(1) - imgdata = rawdata - else: - # because we do not support /CCITTFaxDecode - if color == '1': - debug_out("Converting colorspace 1 to L", verbose) - imgdata = imgdata.convert('L') - color = 'L' - elif color in ("RGB", "L"): - debug_out("Colorspace is OK: %s"%color, verbose) + # depending on the input format, determine whether to pass the raw + # image or the zlib compressed color information + if imgformat is "JPEG" or imgformat is "JPEG2000": + if color == '1': + error_out("jpeg can't be monochrome") + exit(1) + imgdata = rawdata else: - debug_out("Converting colorspace %s to RGB"%color, verbose) - imgdata = imgdata.convert('RGB') - color = imgdata.mode - imgdata = zlib.compress(imgdata.tostring()) + # because we do not support /CCITTFaxDecode + if color == '1': + debug_out("Converting colorspace 1 to L", verbose) + imgdata = imgdata.convert('L') + color = 'L' + elif color in ("RGB", "L"): + debug_out("Colorspace is OK: %s"%color, verbose) + else: + debug_out("Converting colorspace %s to RGB"%color, verbose) + imgdata = imgdata.convert('RGB') + color = imgdata.mode + imgdata = zlib.compress(imgdata.tostring()) # pdf units = 1/72 inch if not x and not y: From d217f1403ffa2cb39ffa3e138272ea10d74a5c89 Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 10 Nov 2014 10:14:33 +0100 Subject: [PATCH 13/53] prepare 0.1.3 release --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 59412e3..81fcf31 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup ( name='img2pdf', - version='0.1.2', + version='0.1.3', author = "Johannes 'josch' Schauer", author_email = 'j.schauer@email.de', description = "Convert images to PDF via direct JPEG inclusion.", @@ -22,7 +22,7 @@ setup ( 'Natural Language :: English', 'Operating System :: OS Independent'], url = 'https://github.com/josch/img2pdf', - download_url = 'https://github.com/josch/img2pdf/archive/0.1.2.tar.gz', + download_url = 'https://github.com/josch/img2pdf/archive/0.1.3.tar.gz', package_dir={"": "src"}, py_modules=['img2pdf', 'jp2'], include_package_data = True, From b143867a720501a7bf29d7684e8b058a16b1f578 Mon Sep 17 00:00:00 2001 From: josch Date: Wed, 7 Jan 2015 15:56:24 +0100 Subject: [PATCH 14/53] add Python 3 support --- setup.py | 2 ++ src/img2pdf.py | 42 ++++++++++++++++++++++-------------------- src/jp2.py | 7 ++++--- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/setup.py b/setup.py index 81fcf31..c518a76 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,8 @@ setup ( 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: Implementation :: CPython', 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)', 'Programming Language :: Python', diff --git a/src/img2pdf.py b/src/img2pdf.py index b6880cf..ae61e53 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -38,17 +38,19 @@ def warning_out(message): def parse(cont, indent=1): if type(cont) is dict: - return "<<\n"+"\n".join( - [4 * indent * " " + "%s %s" % (k, parse(v, indent+1)) - for k, v in cont.items()])+"\n"+4*(indent-1)*" "+">>" + return b"<<\n"+b"\n".join( + [4 * indent * b" " + k.encode("utf8") + b" " + parse(v, indent+1) + for k, v in cont.items()])+b"\n"+4*(indent-1)*b" "+b">>" elif type(cont) is int or type(cont) is float: - return str(cont) + return str(cont).encode("utf8") elif isinstance(cont, obj): - return "%d 0 R"%cont.identifier + return ("%d 0 R"%cont.identifier).encode("utf8") elif type(cont) is str: + return cont.encode("utf8") + elif type(cont) is bytes: return cont elif type(cont) is list: - return "[ "+" ".join([parse(c, indent) for c in cont])+" ]" + return b"[ "+b" ".join([parse(c, indent) for c in cont])+b" ]" class obj(object): def __init__(self, content, stream=None): @@ -58,11 +60,11 @@ class obj(object): def tostring(self): if self.stream: return ( - "%d 0 obj " % self.identifier + + ("%d 0 obj " % self.identifier).encode("utf8") + parse(self.content) + - "\nstream\n" + self.stream + "\nendstream\nendobj\n") + b"\nstream\n" + self.stream + b"\nendstream\nendobj\n") else: - return "%d 0 obj "%self.identifier+parse(self.content)+" endobj\n" + return ("%d 0 obj "%self.identifier).encode("utf8")+parse(self.content)+b" endobj\n" class pdfdoc(object): @@ -149,7 +151,7 @@ class pdfdoc(object): "/Length": len(imgdata) }, imgdata) - text = "q\n%f 0 0 %f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y) + text = ("q\n%f 0 0 %f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)).encode('utf8') content = obj({ "/Length": len(text) @@ -178,23 +180,23 @@ class pdfdoc(object): xreftable = list() - result = "%%PDF-1.%d\n"%self.version + result = ("%%PDF-1.%d\n"%self.version).encode("utf8") - xreftable.append("0000000000 65535 f \n") + xreftable.append(b"0000000000 65535 f \n") for o in self.objects: - xreftable.append("%010d 00000 n \n"%len(result)) + xreftable.append(("%010d 00000 n \n"%len(result)).encode("utf8")) result += o.tostring() xrefoffset = len(result) - result += "xref\n" - result += "0 %d\n"%len(xreftable) + result += b"xref\n" + result += ("0 %d\n"%len(xreftable)).encode("utf8") for x in xreftable: result += x - result += "trailer\n" - result += parse({"/Size": len(xreftable), "/Info": self.info, "/Root": self.catalog})+"\n" - result += "startxref\n" - result += "%d\n"%xrefoffset - result += "%%EOF\n" + result += b"trailer\n" + result += parse({"/Size": len(xreftable), "/Info": self.info, "/Root": self.catalog})+b"\n" + result += b"startxref\n" + result += ("%d\n"%xrefoffset).encode("utf8") + result += b"%%EOF\n" return result def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=None, diff --git a/src/jp2.py b/src/jp2.py index addfe5d..4f960fe 100644 --- a/src/jp2.py +++ b/src/jp2.py @@ -20,6 +20,7 @@ # along with this program. If not, see . import struct +import sys def getBox(data, byteStart, noBytes): boxLengthValue = struct.unpack(">I", data[byteStart:byteStart+4])[0] @@ -85,6 +86,6 @@ def parsejp2(data): if __name__ == "__main__": import sys width, height, colorspace = parsejp2(open(sys.argv[1]).read()) - print "width = %d"%width - print "height = %d"%height - print "colorspace = %s"%colorspace + sys.stdout.write("width = %d"%width) + sys.stdout.write("height = %d"%height) + sys.stdout.write("colorspace = %s"%colorspace) From 700e62f1d8a7b4100779f862861aaa54cac4e9b1 Mon Sep 17 00:00:00 2001 From: josch Date: Wed, 7 Jan 2015 16:23:52 +0100 Subject: [PATCH 15/53] make output reproducible by sorting and --nodate option --- src/img2pdf.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index ae61e53..395ee31 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -40,7 +40,7 @@ def parse(cont, indent=1): if type(cont) is dict: return b"<<\n"+b"\n".join( [4 * indent * b" " + k.encode("utf8") + b" " + parse(v, indent+1) - for k, v in cont.items()])+b"\n"+4*(indent-1)*b" "+b">>" + for k, v in sorted(cont.items())])+b"\n"+4*(indent-1)*b" "+b">>" elif type(cont) is int or type(cont) is float: return str(cont).encode("utf8") elif isinstance(cont, obj): @@ -70,7 +70,7 @@ class pdfdoc(object): def __init__(self, version=3, title=None, author=None, creator=None, producer=None, creationdate=None, moddate=None, subject=None, - keywords=None): + keywords=None, nodate=False): self.version = version # default pdf version 1.3 now = datetime.now() self.objects = [] @@ -86,11 +86,11 @@ class pdfdoc(object): info["/Producer"] = "("+producer+")" if creationdate: info["/CreationDate"] = "(D:"+creationdate.strftime("%Y%m%d%H%M%S")+")" - else: + elif not nodate: info["/CreationDate"] = "(D:"+now.strftime("%Y%m%d%H%M%S")+")" if moddate: info["/ModDate"] = "(D:"+moddate.strftime("%Y%m%d%H%M%S")+")" - else: + elif not nodate: info["/ModDate"] = "(D:"+now.strftime("%Y%m%d%H%M%S")+")" if subject: info["/Subject"] = "("+subject+")" @@ -345,6 +345,8 @@ parser.add_argument( parser.add_argument( '-C', '--colorspace', metavar='colorspace', type=str, help='force PIL colorspace (one of: RGB, L, 1)') +parser.add_argument( + '-D', '--nodate', help='do not add timestamps', action="store_true") parser.add_argument( '-v', '--verbose', help='verbose mode', action="store_true") From 5c7753d6c4f8a82ccc104b001b1f7e09cd43d406 Mon Sep 17 00:00:00 2001 From: lukahn Date: Thu, 15 Jan 2015 16:15:05 +1100 Subject: [PATCH 16/53] Fix for adding custom resolutions --- src/img2pdf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/img2pdf.py b/src/img2pdf.py index 395ee31..45601b4 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -285,6 +285,9 @@ def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=N pdf_x, pdf_y = x, x*height/width elif not x: pdf_x, pdf_y = y*width/height, y + else: + pdf_x = x + pdf_y = y pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y) From d9054ddfd356bb9413c4ac2e1eccabeea8da1bb4 Mon Sep 17 00:00:00 2001 From: lukahn Date: Thu, 15 Jan 2015 16:19:18 +1100 Subject: [PATCH 17/53] Updated library section to reflect latest implementation --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ef0b405..9b219a5 100644 --- a/README.md +++ b/README.md @@ -124,4 +124,7 @@ You can then test the converter using: Note that the package can also be used as a library as follows: import img2pdf - pdf_bytes = img2pdf('test.jpg', dpi=150) + pdf_bytes = img2pdf.convert(['test.jpg'], dpi=150, x=0, y=0) + + file = open("name.pdf","wb") + file.write(pdf_bytes) From 454952724dd8efdac2b5bc03bcb1b4eea77ea74d Mon Sep 17 00:00:00 2001 From: josch Date: Wed, 21 Jan 2015 11:00:41 +0100 Subject: [PATCH 18/53] prepare 0.1.4 release --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c518a76..88554a6 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup ( name='img2pdf', - version='0.1.3', + version='0.1.4', author = "Johannes 'josch' Schauer", author_email = 'j.schauer@email.de', description = "Convert images to PDF via direct JPEG inclusion.", @@ -24,7 +24,7 @@ setup ( 'Natural Language :: English', 'Operating System :: OS Independent'], url = 'https://github.com/josch/img2pdf', - download_url = 'https://github.com/josch/img2pdf/archive/0.1.3.tar.gz', + download_url = 'https://github.com/josch/img2pdf/archive/0.1.4.tar.gz', package_dir={"": "src"}, py_modules=['img2pdf', 'jp2'], include_package_data = True, From e4db4e9e8e129daa2b4b55741cd825bc6fd3c454 Mon Sep 17 00:00:00 2001 From: Erik Jensen Date: Sun, 15 Feb 2015 00:03:16 -0800 Subject: [PATCH 19/53] Enable support for CMYK images CMYK TIFFs and JPEGs both work. CMYK JPEG2000 images have not been tested. Adobe Photoshop and some other software generate inverted CMYK JPEGs. The image is assumed to be inverted if the "Adobe" (APP14) tag is present. Images can be forced inverted with `-C "CMYK;I"`, and forced not inverted with `-C CMYK`. --- src/img2pdf.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 45601b4..f923e8b 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -121,9 +121,11 @@ class pdfdoc(object): def addimage(self, color, width, height, imgformat, imgdata, pdf_x, pdf_y): if color == 'L': - color = "/DeviceGray" + colorspace = "/DeviceGray" elif color == 'RGB': - color = "/DeviceRGB" + colorspace = "/DeviceRGB" + elif color == 'CMYK' or color == 'CMYK;I': + colorspace = "/DeviceCMYK" else: error_out("unsupported color space: %s"%color) exit(1) @@ -145,12 +147,16 @@ class pdfdoc(object): "/Filter": ofilter, "/Width": width, "/Height": height, - "/ColorSpace": color, + "/ColorSpace": colorspace, # hardcoded as PIL doesnt provide bits for non-jpeg formats "/BitsPerComponent": 8, "/Length": len(imgdata) }, imgdata) + if color == 'CMYK;I': + # Inverts all four channels + image.content['/Decode'] = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0] + text = ("q\n%f 0 0 %f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)).encode('utf8') content = obj({ @@ -252,6 +258,16 @@ def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=N debug_out("input colorspace (forced) = %s"%(color), verbose) else: color = imgdata.mode + if color == "CMYK" and imgformat == "JPEG": + # Adobe inverts CMYK JPEGs for some reason, and others + # have followed suit as well. Some software assumes the + # JPEG is inverted if the Adobe tag (APP14), while other + # software assumes all CMYK JPEGs are inverted. I don't + # have enough experience with these to know which is + # better for images currently in the wild, so I'm going + # with the first approach for now. + if "adobe" in imgdata.info: + color = "CMYK;I" debug_out("input colorspace = %s"%(color), verbose) debug_out("width x height = %d x %d"%(width,height), verbose) @@ -270,7 +286,7 @@ def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=N debug_out("Converting colorspace 1 to L", verbose) imgdata = imgdata.convert('L') color = 'L' - elif color in ("RGB", "L"): + elif color in ("RGB", "L", "CMYK", "CMYK;I"): debug_out("Colorspace is OK: %s"%color, verbose) else: debug_out("Converting colorspace %s to RGB"%color, verbose) @@ -347,7 +363,7 @@ parser.add_argument( help='keywords for metadata') parser.add_argument( '-C', '--colorspace', metavar='colorspace', type=str, - help='force PIL colorspace (one of: RGB, L, 1)') + help='force PIL colorspace (one of: RGB, L, 1, CMYK, CMYK;I)') parser.add_argument( '-D', '--nodate', help='do not add timestamps', action="store_true") parser.add_argument( From c68ce6a48d9bf652bd582cd585d57b83c87a2903 Mon Sep 17 00:00:00 2001 From: Erik Jensen Date: Sun, 15 Feb 2015 14:19:57 -0800 Subject: [PATCH 20/53] Add CMYK test images --- src/tests/test-CMYK.jpg | Bin 0 -> 4788 bytes src/tests/test-CMYK.tif | Bin 0 -> 22286 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/tests/test-CMYK.jpg create mode 100644 src/tests/test-CMYK.tif diff --git a/src/tests/test-CMYK.jpg b/src/tests/test-CMYK.jpg new file mode 100644 index 0000000000000000000000000000000000000000..44213a815e7d16383cdba1630362cc4b1a678cef GIT binary patch literal 4788 zcmcJRcQjmI*T?TL#t!A6rx5+f{=)K<@fyY{+>VI_gT+A>+W^#+WYMNy?3qiIp=u#_yfR#);H1zKp+rc zcrt+FEdZwN5$NU#cmO9xbNm)S0W>FJ5a^Gm`D0+(Kb{T@rUgUjAdtTwdIl(jo`D_$ zfu4dgF#fTV8Ph37rav!#iu`LS4IM2l9V0!2{_m9klRJI}utNbkAOs8&258_QS~v(S z3>?1%5CA}XBJITeQT(@^sG)<<(1ICH!n!N~06x(Qp``;u>Hp+_P9k&=dMLmEXJuq# z=in3)4sVlJxyZ#Wg0$DR%%k$?IJgpODH9&S*}3;CDko2>ocZ$y{bV=>maLShJrm|j429)kh@z!?f}TRkd`rxIEE3o2u=Hu58?Zi`AWlx`_W5NdPxayT zNq>cQvOs9>1H%cCrcK#j`{ROHh?16?RkHf4BE;PA*1pdWCC5DBs**Bs|FBn0->16Q zUdqhzrV_6#Izv1N`^%th%(2E}f~~2utCfI~h#_Y5&FbX$5aMZsLAWOg9I8vfuU?Cd ziW73i7n-&j2AkVQa~Av^f(N#5S(8Y5*0=2C?=qX=D_V7#m!?d?;E>^oF~9pYQVjh0 zGO}B~d}9`lyHDQVHgK&Mn;2O!%#B%AI14a0_PQr6s`ckSlL9C&axgE#7WQ3aM2+8(I;Iy=GZ-l+TE1Fz*R+H2EaE4yP6Hxs zqVh8|AnYRew2GB2>;0SWRlM@keXuu$cma(+C>jb9kc^Wx zpLvs{wzq8?w=tqQ9W8QpjAxa_)_%=wMy6;FTAUOfg(acc>@!t1M# zjIDCP=_Z_c;fgTtI+>qWwx3l!n8@~B38_YR@9?n3LnarRd%hYk3r1gs5*t|id@mhM zVTj;#xEXtIy=4q?NY;XbEwM!sZ>hop?wVk72N~u=1K@DDB-)(=!f{I{=uvr9!9lJ& zM0_)IPLpMKFXY?H`snW>XpaH=&yCwAgEIQOq`lHfHbkO29Sv*|J}XNIYaKKXN9Ki0)Fh>i=uUo?i~g0Ib;Kwg(Zn~tN0uo!3{)@Dqicc zF(KZ*+b@nwmXrIhZgzzZdUu+IbLZgiY?!zPB8w&(W_<#KH9jZ1WReV=XPtj+S8I!^ zsa-Q33y#cGQX?15Z&3&{`<9^&T?G6aeeevfYA6g$^ubW|@|on@ZO7I?p@gxd&EAzMOryl>6>` ztqlfG`2N5X%2UFD^2YKGHWfmlD@FB{Gcsrl>m|9%&U4g@mF4Ygx1Q3Z_1K!$myBde z9S(V;0bWOZ|LNZI`M1ADizEoD(JDTWpU-?}XTpWIjVn%q*Bn|KOx&R~Xe15?e^OFy zKx`NkhpD#{EAcDF_rJLqX^$yGi*B%^)pkKtFs;oDqUY;rb34s*==L{m3Ivmyw9a?C zLl3F0oqsuOdXV1+4;X%~Ky2aXFL>jA)yn;dm0yhJl5@+FzwPoo0m7>U-=XL|amc$g z+|54iH*N67lylF^X&jcOPqF%V#9S0p@Q&YJe>rBC+7XBHdhZpU>iLqm zLy?Ly>7QH3FU_xrMweZZudN=68Ii=T-A*Q)RX1=TaDN1T`kM2Tb&m>}@y$5%7oc5g z>7`)Yp#m?5JIA_y#x+t|&$&r`LYC8Ix76l zQ{GWYof7xYvE5c7i`Z@lv0jbvSW6;oVl}#n6A_x)k5ua5hQRXC?IDEiFaXvgZvs+wKEM&B2lswv$4be zR^xjGc3G<&RO`qExd~h;`R?nT9%xZc5&I{T!2+>Cz9kb_r6(KKpH{+R${OwWF4L2JZeogDU&S;@lW@ujYtBv(@En9>IcE+uc;a_p(`YHM_)eRV|Y z1=2TlF^@cVXs}y`;Uaim@n-1&FXH z=Pp9=>#R3*czrp zX$98d+A~Du-lcuHzN(kSc6#NLsr>H`ec9C$Ez-ZXih3n6Us0y#F~ToQKGc)F?!a|e zx-A`wwORNhL_HbRYqWs(`IW&rvxtDIp|dLX4+TU*z9W6SX5ue9|D-(PfXvp5@+(y3 z`vxq?Pe0Xd#wsmxRc24R1X+@weYzORG9B|c8~Mtuc*GB#oc59b0b_m_S~+k+DUx75 zzp*OxY*S@ODD57gge3!EgpbTU*ZI-?Y#_i0ovN!fX27| zkN+gjrPMknkGyc(DN(0h4dx8uq?13pUR{%o*{~jzZ?YB{O{Z%^2iGX&r6{R~U3+h8 zoV2Ut|NUCTq|-lPgg^6dq*Cd0(}Yj@Bt~dy6lV7~S!#MExHY(0e~FJ#6i;`mj2(vb z+Mke-$x5F~&gFUH@ZIR$u|QkTHz9%dlvKx=b`&TK>!++fdDo(N^W26jZ-jTMti0p7lWI-gFn{onG~OyD{?RTR&qh#Yd{e+6bRc z%|GvBobALg$FB)n#~6@2*4WBBonEHcFpw|gKgPP{D^kH=Am~X%u11unxl#UMu{9fo zNiDatb3c5Cg3b%FUWuskbCvAjd9v}SQAtslM8bSmG?IrUIatmKitYF`#RI2?+-OLk zbm^p5_C~mW!!Kiz=0i1?2Zi20(`Q5Fx2nmkY4u#UV4Jmnk<&4H?6AL(3j=jLgn8CY zS`f;soCk|-YrazL1BoL3k{|Z)^*P&1ka)X!#LiC#vw2&~D}F&D1}AFGcW8*>y9pYt zfeQe9rrQtnc7G?6-h5dW|DAZrpLzK5(dMK3FIL+)|3W06dmedIz_isR^G3Y5Gnsgh z+@_j1p=TC&a{kUj;sMZSjeiiEk<4D$xa9EV)2Z-Rptb*;*-8oAYYTC?<}C**&m|=+v>Cf`|3{VW;RxpS6B{c@hK?0UT1#dV7vL*Pk65q#bUpX& zfeRb9sivc0nK-`D=TP12QARHfH~66&PUS8pOPu9^^%);>skqpCWr|aa?bBOXHO*+5 z%h&t;jgd_TK0RuGo$?>~)Ks|kTbS9f;-;lA?|{oxKtdffp5W7KPr^i9dRJ&{B{JS+ zXg!T&h>~qlyjxZ}rYa_Zi+k)W*R>Su@zf&DN-Ui)HSfMx80u+SceG?x(BzH_QBwuU zU{u^!n#eu{HW$4G$vGO-S0P2-uzNx&8i2^XM=moD9rYdBXbJ}QHAYk?nXL{nsmtCd zSR=S0TYeJ20vgqd)}m)79|O|GJ(3M}%>-WUlDYar_Ph_v_?lzDCQqU4uiI-^ej+*^`kq`ly^!X6?M8+308<} zajVOD=7vu#;!wz8kptrMD^E5*K`tdlf?C<0q)$CJMKv;te6$p!i5|KJ<#)l{gJeRM z9Ki0@J6zW*+2aIl>f5ZG*sv8~z0SmPW4!mFV^fSIRqC^;T>ht7g`2LJ`h_>1p1#fX;)CyU9n~pa&BQPVscafCdwfDfb{i(@D@_TKwHViN zgXD?S)#??5#zq{G%VHE|1x9DuXG%UUrE!=O#%;U^*&G5yxhvy&VjXN9>ahrFT{`CV zxVZLBGLDPH7JLmE&eAJ)y$%Wm7NfpOMeh!jW+V4i2;u~;aOu#{t?d;;uIxhX$XEU8 z7uJc2w`Tk=`}ykGV2B&OSsn=3!>SG~(`AY?0G!ExNr$23v^$;gAe-$j?HW-%YKiT; zw~|yJhf+lkYEAG9y7K)~wXJ;WaiNjio TaQO8<_i^^(lQSp%c;6k(aE1tbnY5tbnY5tbnY5tbnY5tbnY5tbnY5tbnY5tbnY* zOJEQ(QtERs`n;ab+iuXR!b)cj8-dKz% zit@jx#i!GmKMV02Sl(XD%B$pSezaSw7po|Gs(x&zut$&xhsmd@;w^xMpq4csv#xo98Ps#`5WQeQhnd zJeB#L=4$O9P9A+@(%;~W)I-WW-aJaI(oHy;-*xI@}GSK@@)0OGW@4&b=AYjlM z+_D71&zpj&1(P*TB-7Z_vSZ$}^YKLTY%myPyzHBIuUJV^{U@KZ_q6sXRL<54u8&}9 z!DL-mm}`w3Pwb*&?cK36*Qr1~v<|Qf6xK8hrWQ<=saU=9e0pMzj#XY(#&h>{jMf3R zr@?ICm|8GdJlS(o>W_3R`2Jj|i`D`5Lg|3Gm|8Gdyqai#Oa4U1y0x^F=kDnktpjXN zgW13_wP3P%vbE3HD|D>=yIL1)5r@_RHcz+LW0+bnS-iZ56Y*v`7QD9$^=#5Qz~<@p zdJIzwCX1JMOG9JMcf)KZ^<_s}yDtz7@TQ`V6)I;Z!ZEe5EH`aC?U}*;SIqdAr_`&- zzW1JhHFQ|(SzKIfhjKN}YH$T}IT=$6%W~7UOAGcz*tH+4c<>qg;jwqO!#>LGRaJKA zyvEs>c}+%k#?->H+_asf8486+-;vkrwY{z1oW|1D?fy`Q2VR3UTa1g1i@_9SU~0i+ z<&#g%ZA(eK>(%8KGN~_E4ZL>x)CZmLe7m_=z8jX?xtLloSt5DC{_^UTcaj(PJr#sK zY4{A*p4nenU344PkjhF*M8aj~&@8ZpK?oBuwP3OkQp+M>&+CEuHRQ+(o#k-efYk`? z%s~IkqVZUsCmqx$+9h?3kTSnhBG3x zUTr_KK?F%Nz!qU(&B4@y$->Idiu3%QC;hbEbE(tktbBR>Gy`m&uh(OkS}<8&=MB#$ ztryOhh{U8BV2d!Y=3r{UWMSoJ#qs<7%zLR`wescl-$*%QYQbdj>W47~XM!%rD^L)J?!_^FQzzd8p&r#SgAqPnJcaBpePCI0sa` zXi;`u51zZx_m8(3XJQQo%!TvsqL;xj?xGxwd4>k#?uKbFu zQ9rB2(Ik(YFtxBOw{5GG@2XjzU0RG8}>qAe=E`5y{dJ~R)Tqsgw4^3X6Yj< zAS)m%AS)m%AS)m%AS)m%aI;k)LWtrgu$=gk_EL0vjb{5a8_scu+6nRN{oohU?Vx6t zDS_ Date: Mon, 16 Feb 2015 07:15:10 +0100 Subject: [PATCH 21/53] make 2nd, 3rd and 4th argument optional by supplying None as default --- README.md | 2 +- src/img2pdf.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9b219a5..f97a1ee 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ You can then test the converter using: Note that the package can also be used as a library as follows: import img2pdf - pdf_bytes = img2pdf.convert(['test.jpg'], dpi=150, x=0, y=0) + pdf_bytes = img2pdf.convert(['test.jpg']) file = open("name.pdf","wb") file.write(pdf_bytes) diff --git a/src/img2pdf.py b/src/img2pdf.py index f923e8b..a6a1adc 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -205,9 +205,9 @@ class pdfdoc(object): result += b"%%EOF\n" return result -def convert(images, dpi, x, y, title=None, author=None, creator=None, producer=None, - creationdate=None, moddate=None, subject=None, keywords=None, - colorspace=None, verbose=False): +def convert(images, dpi=None, x=None, y=None, title=None, author=None, + creator=None, producer=None, creationdate=None, moddate=None, + subject=None, keywords=None, colorspace=None, verbose=False): pdf = pdfdoc(3, title, author, creator, producer, creationdate, moddate, subject, keywords) From 3fdd82420195708a10fb041d9a1e3b213e97223a Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 16 Feb 2015 07:39:07 +0100 Subject: [PATCH 22/53] support file objects as input --- src/img2pdf.py | 158 ++++++++++---------- src/tests/{test-CMYK.jpg => input/CMYK.jpg} | Bin src/tests/{test-CMYK.tif => input/CMYK.tif} | Bin src/tests/{test.jpg => input/normal.jpg} | Bin src/tests/{test.png => input/normal.png} | Bin src/tests/test.pdf | Bin 3289 -> 0 bytes src/tests/test_img2pdf.py | 20 --- 7 files changed, 83 insertions(+), 95 deletions(-) rename src/tests/{test-CMYK.jpg => input/CMYK.jpg} (100%) rename src/tests/{test-CMYK.tif => input/CMYK.tif} (100%) rename src/tests/{test.jpg => input/normal.jpg} (100%) rename src/tests/{test.png => input/normal.png} (100%) delete mode 100644 src/tests/test.pdf delete mode 100644 src/tests/test_img2pdf.py diff --git a/src/img2pdf.py b/src/img2pdf.py index a6a1adc..abd2c81 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -24,6 +24,10 @@ import struct from PIL import Image from datetime import datetime from jp2 import parsejp2 +try: + from cStringIO import cStringIO +except ImportError: + from io import BytesIO as cStringIO # XXX: Switch to use logging module. def debug_out(message, verbose=True): @@ -214,85 +218,89 @@ def convert(images, dpi=None, x=None, y=None, title=None, author=None, for imfilename in images: debug_out("Reading %s"%imfilename, verbose) - with open(imfilename, "rb") as im: - rawdata = im.read() - im.seek(0) - try: - imgdata = Image.open(im) - except IOError as e: - # test if it is a jpeg2000 image - if rawdata[:12] != "\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A": - error_out("cannot read input image (not jpeg2000)") - error_out("PIL: %s"%e) - exit(1) - # image is jpeg2000 - width, height, ics = parsejp2(rawdata) - imgformat = "JPEG2000" + try: + rawdata = imfilename.read() + im = cStringIO(rawdata) + except: + with open(imfilename, "rb") as im: + rawdata = im.read() + im = cStringIO(rawdata) + try: + imgdata = Image.open(im) + except IOError as e: + # test if it is a jpeg2000 image + if rawdata[:12] != "\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A": + error_out("cannot read input image (not jpeg2000)") + error_out("PIL: %s"%e) + exit(1) + # image is jpeg2000 + width, height, ics = parsejp2(rawdata) + imgformat = "JPEG2000" - if dpi: - ndpi = dpi, dpi - debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) - else: - ndpi = (96, 96) # TODO: read real dpi - debug_out("input dpi = %d x %d"%ndpi, verbose) - - if colorspace: - color = colorspace - debug_out("input colorspace (forced) = %s"%(ics)) - else: - color = ics - debug_out("input colorspace = %s"%(ics), verbose) + if dpi: + ndpi = dpi, dpi + debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) else: - width, height = imgdata.size - imgformat = imgdata.format + ndpi = (96, 96) # TODO: read real dpi + debug_out("input dpi = %d x %d"%ndpi, verbose) - if dpi: - ndpi = dpi, dpi - debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) - else: - ndpi = imgdata.info.get("dpi", (96, 96)) - debug_out("input dpi = %d x %d"%ndpi, verbose) - - if colorspace: - color = colorspace - debug_out("input colorspace (forced) = %s"%(color), verbose) - else: - color = imgdata.mode - if color == "CMYK" and imgformat == "JPEG": - # Adobe inverts CMYK JPEGs for some reason, and others - # have followed suit as well. Some software assumes the - # JPEG is inverted if the Adobe tag (APP14), while other - # software assumes all CMYK JPEGs are inverted. I don't - # have enough experience with these to know which is - # better for images currently in the wild, so I'm going - # with the first approach for now. - if "adobe" in imgdata.info: - color = "CMYK;I" - debug_out("input colorspace = %s"%(color), verbose) - - debug_out("width x height = %d x %d"%(width,height), verbose) - debug_out("imgformat = %s"%imgformat, verbose) - - # depending on the input format, determine whether to pass the raw - # image or the zlib compressed color information - if imgformat is "JPEG" or imgformat is "JPEG2000": - if color == '1': - error_out("jpeg can't be monochrome") - exit(1) - imgdata = rawdata + if colorspace: + color = colorspace + debug_out("input colorspace (forced) = %s"%(ics)) else: - # because we do not support /CCITTFaxDecode - if color == '1': - debug_out("Converting colorspace 1 to L", verbose) - imgdata = imgdata.convert('L') - color = 'L' - elif color in ("RGB", "L", "CMYK", "CMYK;I"): - debug_out("Colorspace is OK: %s"%color, verbose) - else: - debug_out("Converting colorspace %s to RGB"%color, verbose) - imgdata = imgdata.convert('RGB') - color = imgdata.mode - imgdata = zlib.compress(imgdata.tostring()) + color = ics + debug_out("input colorspace = %s"%(ics), verbose) + else: + width, height = imgdata.size + imgformat = imgdata.format + + if dpi: + ndpi = dpi, dpi + debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) + else: + ndpi = imgdata.info.get("dpi", (96, 96)) + debug_out("input dpi = %d x %d"%ndpi, verbose) + + if colorspace: + color = colorspace + debug_out("input colorspace (forced) = %s"%(color), verbose) + else: + color = imgdata.mode + if color == "CMYK" and imgformat == "JPEG": + # Adobe inverts CMYK JPEGs for some reason, and others + # have followed suit as well. Some software assumes the + # JPEG is inverted if the Adobe tag (APP14), while other + # software assumes all CMYK JPEGs are inverted. I don't + # have enough experience with these to know which is + # better for images currently in the wild, so I'm going + # with the first approach for now. + if "adobe" in imgdata.info: + color = "CMYK;I" + debug_out("input colorspace = %s"%(color), verbose) + + debug_out("width x height = %d x %d"%(width,height), verbose) + debug_out("imgformat = %s"%imgformat, verbose) + + # depending on the input format, determine whether to pass the raw + # image or the zlib compressed color information + if imgformat is "JPEG" or imgformat is "JPEG2000": + if color == '1': + error_out("jpeg can't be monochrome") + exit(1) + imgdata = rawdata + else: + # because we do not support /CCITTFaxDecode + if color == '1': + debug_out("Converting colorspace 1 to L", verbose) + imgdata = imgdata.convert('L') + color = 'L' + elif color in ("RGB", "L", "CMYK", "CMYK;I"): + debug_out("Colorspace is OK: %s"%color, verbose) + else: + debug_out("Converting colorspace %s to RGB"%color, verbose) + imgdata = imgdata.convert('RGB') + color = imgdata.mode + imgdata = zlib.compress(imgdata.tostring()) # pdf units = 1/72 inch if not x and not y: diff --git a/src/tests/test-CMYK.jpg b/src/tests/input/CMYK.jpg similarity index 100% rename from src/tests/test-CMYK.jpg rename to src/tests/input/CMYK.jpg diff --git a/src/tests/test-CMYK.tif b/src/tests/input/CMYK.tif similarity index 100% rename from src/tests/test-CMYK.tif rename to src/tests/input/CMYK.tif diff --git a/src/tests/test.jpg b/src/tests/input/normal.jpg similarity index 100% rename from src/tests/test.jpg rename to src/tests/input/normal.jpg diff --git a/src/tests/test.png b/src/tests/input/normal.png similarity index 100% rename from src/tests/test.png rename to src/tests/input/normal.png diff --git a/src/tests/test.pdf b/src/tests/test.pdf deleted file mode 100644 index c3a315457ac3bae554504be32df1cbd739e67704..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3289 zcmb7H2~-p37M^S@Nq~feO;85}0TszY5D*oTumlK%fVBl{31Pq>B#?wnY*_@v1w|I2 zC{#d@Mckj-Csh_tRjg8~f`GLkXieGPmXhbpMiVz8yl4Lj@ z3Vbjat7#yOzMA3y2lg=oN9Wglox$ z5=BCY@5NMa&9S7>9pK?AfefHYAipK=aiN zN(h&$o)%85vJfAGRyaxCWR207@Cb0@2XKXrbXP z@}Jd8grX0?qYwa+fFb|@;(5N>3i%b+uk!uIb&*qj-syFuAe*#Rz>0#ZS!rNbWJc$c zBQivAvC+8l@+u$4YjGejwG(s3qU|w1VU%&MfKjU+;^4*E&eVZCnRoD2cGv(s6##*OCRM%&o z*%`SfCeOC*C|zCKy_wx_df%Wt)aSJ%L3tIEw*W~=xAfye45Fgy&gpw+oFo_>i0C`D zwSwx;mcCXYp}o0-{7Mb^7Kcps$V=GmNp1b0z34>abK6ACv(YudmahiGA3FutxtAOD zC`&{9Q^nw=jGK6AqNdpkU~GS8i2^81>n2!s^KYL+oV2JnUX9Nw4;$rzokn<;)`K$l4w?Js z7uIrupmoPunK^=2D17+U$Vbulnun@Pc9mEcF(z?_FU6Nl%|sHb^KbV4f^!bx*jw1- z2Rol^0_aOF;GceBTXKngyqzSI-AD*(Hb-)yRsEDjk5?Sb9JnFs);)hWM|P&^5@j^T zanK2WuDDRVJEddAHlL+B{fp`L@47ZPk^448KXb!n<(>%2TPxii@&wV!Wfjd@AOczh zBnNqM8}F?VZ}`P={I9~$I6?P=*tYG3<$1BKyFvvQwOgv!>>hIDm*@{-yS|f;@8385 zIxX1bhwr{I`t({mDcr0#!sS35GwkMG zSlj4o1tWKUQTpVVZ?9cJeZg_vRw5~AZ1&la`1gy zb{Vm+U(AHi+Pb1OHgniTCXK;pcVl82S2Ef@7InGhvxSEt;v*(_1oUIqUPbC2k<1rP!nL zWtC0Pc~5uty_!F+pN@3c^2*pGhiJOCDRCzHZR0OP6@eB%{wDj0)5NH|XMTIeM4U{Z zXT7s-DV<&=KKPz6T;9{1WZ~UOu0U>|Spgd$+A|7~8x40|$vsvkr}-kexKYL(fOGRk z>RM6zjS|HB5?$QE{ooQdadq6$;uo%yg3;eMU1<|}Z?!yIK}+R|s3zM^RUh9ou^CHZ z?Jr0T09;~ajgbh< z{2K~EG^YY^i8DY-{FyGn_=ge`rWHW%$dujJ(|!#Fw$FH;m`Kdi_vU3)HVve{eLWP5 zv*D-3*-eLs$%%K8KO4bc9#||xu3WxSV z=y+TRInPua$n5>*d`G=SMKL<;ldvt4Hj@U6J(cU~ZqIw@o&0U0-iBn#Pv2zYIMwzx z%YQ2uNcPdaV```!Y*Y)RHIvE%ZiLWynKk6;RM(h^aZ&k!(ucTZ*{-Sg(j)8kG-SWo z8%8ib<0sFxh;S$BTGiOsj@N(XHGRF=NAL69-8qihJl;?!x^e95q>|p=p{P8LTWMbr z@>}Gd7_fAd*gSk@57KD1b zd>QH*yU}7vQI&yq+yR&Ieg7f_x>poS)*I1ZP2(~k>8S%>ZG!*$7TeMEXujFpx8&v+ z&ec~=qgmdN-uCmVO9DSH1t_BsA#>{CrmB8iv{ z&rYBX+l5A Date: Mon, 16 Feb 2015 14:49:53 +0100 Subject: [PATCH 23/53] pass on --nodate option --- src/img2pdf.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index abd2c81..5c1a87c 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -211,10 +211,11 @@ class pdfdoc(object): def convert(images, dpi=None, x=None, y=None, title=None, author=None, creator=None, producer=None, creationdate=None, moddate=None, - subject=None, keywords=None, colorspace=None, verbose=False): + subject=None, keywords=None, colorspace=None, nodate=False, + verbose=False): pdf = pdfdoc(3, title, author, creator, producer, creationdate, - moddate, subject, keywords) + moddate, subject, keywords, nodate) for imfilename in images: debug_out("Reading %s"%imfilename, verbose) @@ -386,7 +387,8 @@ def main(args=None): convert( args.images, args.dpi, args.x, args.y, args.title, args.author, args.creator, args.producer, args.creationdate, args.moddate, - args.subject, args.keywords, args.colorspace, args.verbose)) + args.subject, args.keywords, args.colorspace, args.nodate, + args.verbose)) if __name__ == '__main__': main() From 8887fe21f4ecb5c06bbb1b098f43c12d7851044e Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 16 Feb 2015 19:09:34 +0100 Subject: [PATCH 24/53] print floats with four decimal places --- src/img2pdf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 5c1a87c..28bba81 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -45,8 +45,10 @@ def parse(cont, indent=1): return b"<<\n"+b"\n".join( [4 * indent * b" " + k.encode("utf8") + b" " + parse(v, indent+1) for k, v in sorted(cont.items())])+b"\n"+4*(indent-1)*b" "+b">>" - elif type(cont) is int or type(cont) is float: + elif type(cont) is int: return str(cont).encode("utf8") + elif type(cont) is float: + return ("%0.4f"%cont).encode("utf8") elif isinstance(cont, obj): return ("%d 0 R"%cont.identifier).encode("utf8") elif type(cont) is str: From 90e954dc0ae739b19e0e5796287e956179f4778d Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 16 Feb 2015 19:18:46 +0100 Subject: [PATCH 25/53] make sure dpi are read as integers and divided using float division --- src/img2pdf.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 28bba81..9f5629f 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -262,6 +262,12 @@ def convert(images, dpi=None, x=None, y=None, title=None, author=None, debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) else: ndpi = imgdata.info.get("dpi", (96, 96)) + # in python3, the returned dpi value for some tiff images will + # not be an integer but a float. To make the behaviour of + # img2pdf the same between python2 and python3, we convert that + # float into an integer by rounding + # search online for the 72.009 dpi problem for more info + ndpi = (int(round(ndpi[0])),int(round(ndpi[1]))) debug_out("input dpi = %d x %d"%ndpi, verbose) if colorspace: @@ -307,11 +313,11 @@ def convert(images, dpi=None, x=None, y=None, title=None, author=None, # pdf units = 1/72 inch if not x and not y: - pdf_x, pdf_y = 72.0*width/ndpi[0], 72.0*height/ndpi[1] + pdf_x, pdf_y = 72.0*width/float(ndpi[0]), 72.0*height/float(ndpi[1]) elif not y: - pdf_x, pdf_y = x, x*height/width + pdf_x, pdf_y = x, x*height/float(width) elif not x: - pdf_x, pdf_y = y*width/height, y + pdf_x, pdf_y = y*width/float(height), y else: pdf_x = x pdf_y = y From e810f3baf4e70236f56df0b990c8eabe6c17a822 Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 16 Feb 2015 19:19:49 +0100 Subject: [PATCH 26/53] close cStringIO and PIL.Image --- src/img2pdf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 9f5629f..ab689b6 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -309,7 +309,10 @@ def convert(images, dpi=None, x=None, y=None, title=None, author=None, debug_out("Converting colorspace %s to RGB"%color, verbose) imgdata = imgdata.convert('RGB') color = imgdata.mode - imgdata = zlib.compress(imgdata.tostring()) + img = imgdata.tobytes() + imgdata.close() + imgdata = zlib.compress(img) + im.close() # pdf units = 1/72 inch if not x and not y: From 4d005c8443843cd15ccec7f3a6c3f7df4fff45f7 Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 16 Feb 2015 19:20:06 +0100 Subject: [PATCH 27/53] write to sys.stdout.buffer and fall back to sys.stdout --- src/img2pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index ab689b6..21067b1 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -347,7 +347,7 @@ parser.add_argument( nargs='+', help='input file(s)') parser.add_argument( '-o', '--output', metavar='out', type=argparse.FileType('wb'), - default=sys.stdout, help='output file (default: stdout)') + default=getattr(sys.stdout, "buffer", sys.stdout), help='output file (default: stdout)') parser.add_argument( '-d', '--dpi', metavar='dpi', type=positive_float, help='dpi for pdf output (default: 96.0)') From aabd104114f8b285dc93788e0f6888b10db72099 Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 16 Feb 2015 19:23:21 +0100 Subject: [PATCH 28/53] update tests handling --- src/tests/__init__.py | 100 +++++++++++++++++++++++++++++++- src/tests/output/CMYK.jpg.pdf | Bin 0 -> 5741 bytes src/tests/output/CMYK.tif.pdf | Bin 0 -> 1861 bytes src/tests/output/normal.jpg.pdf | Bin 0 -> 3228 bytes src/tests/output/normal.png.pdf | Bin 0 -> 1710 bytes 5 files changed, 98 insertions(+), 2 deletions(-) create mode 100644 src/tests/output/CMYK.jpg.pdf create mode 100644 src/tests/output/CMYK.tif.pdf create mode 100644 src/tests/output/normal.jpg.pdf create mode 100644 src/tests/output/normal.png.pdf diff --git a/src/tests/__init__.py b/src/tests/__init__.py index 8fd6866..e3eb235 100644 --- a/src/tests/__init__.py +++ b/src/tests/__init__.py @@ -1,7 +1,103 @@ import unittest -import test_img2pdf + +import datetime +import os +import unittest +import img2pdf +import zlib +from PIL import Image + +HERE = os.path.dirname(__file__) + +#convert +set date:create +set date:modify -define png:exclude-chunk=time def test_suite(): + class TestImg2Pdf(unittest.TestCase): + pass + + for test_name in os.listdir(os.path.join(HERE, "input")): + inputf = os.path.join(HERE, "input", test_name) + if not os.path.isfile(inputf): + continue + outputf = os.path.join(HERE, "output", test_name+".pdf") + assert os.path.isfile(outputf) + def handle(self, f=inputf, out=outputf): + with open(f, "rb") as inf: + orig_imgdata = inf.read() + pdf = img2pdf.convert([f], nodate=True) + imgdata = b"" + instream = False + imgobj = False + colorspace = None + imgfilter = None + width = None + height = None + length = None + # ugly workaround to parse the created pdf + for line in pdf.split(b'\n'): + if instream: + if line == b"endstream": + break + else: + imgdata += line + b'\n' + else: + if imgobj and line == b"stream": + instream = True + elif b"/Subtype /Image" in line: + imgobj = True + elif b"/Width" in line: + width = int(line.split()[-1]) + elif b"/Height" in line: + height = int(line.split()[-1]) + elif b"/Length" in line: + length = int(line.split()[-1]) + elif b"/Filter" in line: + imgfilter = line.split()[-2] + elif b"/ColorSpace" in line: + colorspace = line.split()[-1] + # remove trailing \n + imgdata = imgdata[:-1] + # test if the length field is correct + self.assertEqual(len(imgdata), length) + # test if the filter is valid: + self.assertIn(imgfilter, [b"/DCTDecode", b"/JPXDecode", b"/FlateDecode"]) + # test if the colorspace is valid + self.assertIn(colorspace, [b"/DeviceGray", b"/DeviceRGB", b"/DeviceCMYK"]) + # test if the image has correct size + orig_img = Image.open(f) + self.assertEqual(width, orig_img.size[0]) + self.assertEqual(height, orig_img.size[1]) + # if the input file is a jpeg then it should've been copied + # verbatim into the PDF + if imgfilter in [b"/DCTDecode", b"/JPXDecode"]: + self.assertEqual(imgdata, orig_imgdata) + elif imgfilter == b"/FlateDecode": + # otherwise, the data is flate encoded and has to be equal to + # the pixel data of the input image + imgdata = zlib.decompress(imgdata) + if colorspace == b"/DeviceGray": + colorspace = 'L' + elif colorspace == b"/DeviceRGB": + colorspace = 'RGB' + elif colorspace == b"/DeviceCMYK": + colorspace = 'CMYK' + else: + raise Exception("invalid colorspace") + im = Image.frombytes(colorspace, (width, height), imgdata) + if orig_img.mode == '1': + orig_img = orig_img.convert("L") + elif orig_img.mode not in ("RGB", "L", "CMYK", "CMYK;I"): + orig_img = orig_img.convert("RGB") + self.assertEqual(im.tobytes(), orig_img.tobytes()) + im.close() + # lastly, make sure that the generated pdf matches bit by bit the + # expected pdf + with open(out, "rb") as outf: + out = outf.read() + self.assertEqual(pdf, out) + orig_img.close() + setattr(TestImg2Pdf, "test_%s"%test_name, handle) + return unittest.TestSuite(( - unittest.makeSuite(test_img2pdf.TestImg2Pdf), + unittest.makeSuite(TestImg2Pdf), )) diff --git a/src/tests/output/CMYK.jpg.pdf b/src/tests/output/CMYK.jpg.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e2f9380ca45b6f9e80f8b3e190237648ac10e65f GIT binary patch literal 5741 zcmcIoc|27A*T2IUV`4J4gkh3wg&E8c62>}q#uf@=Fv4UkAxbI6o-J#3gHgT}(nzaN zwq(f?Es7}8LX^rg()aoLJ->gR<@KEVn$Mlj`JB%=@AJO*o_k+skByC|8u)h8vXoR!6`6Z*j9>@~>`!Ldij)At?2KAXt*UDV_$@2)0Kw`@mqZ z%4oKr)HOHHe*)Ne(%3tc0zyB`<V`^J z_c!!ECzVYqHZs_fzy{8(30RPWd_(S}BoSV$<1>=_6PhW(A1je6twgZ$5o6bMt< z2yRG)*}~ZL{@L}1=nY6)0~k<3Lu|-2JT>qbHE1JY%}svs)Bq~Y{+K6;eYp(D;S>@X zZ|P#bxn@WvQN77*{xF-2p*K(e{n;OijVS@4WE$I#iXoo($LXIXQ!>TZFO;3xX2ic~ z(9ry&un+SJ{e3H$1#S@doAFIT|6Gqvl@rCA%^sVsE%dFAuD=BM3C3o|00;yEOxXvp z{sDmNvu{K);0>^aWBnO`2RPWvAkfChu_0j2jgbos<^)5yAdufbZXPIvn}-_$fo_BH zz&3>ak9Ql4cVn{QWOFMA7bhndj2puJ+vI;VtltHMpa2>O0fVFg4g`o30Rl?{>kj}V z0C2LQv4;)BzlDv43&Oz(hOw6o`2hgT#tPx&0zHByNX)>+lTkf5>`lwcxlmUAlDV0+ORSYCUPh63X-G`-b@0m{t)X+LfsVmD9D+qepo=Oqykwo;cTWS=n^{s} z?N>~Pch@f{|5zD7Wg;&&RL!z<9u^`edOxgCdIqztL zv2k6x_2RUNXPxYD-&@n1m>Kmw0N>?yQo>YjkKXES#J%&#(*>hDgB~vKW$+s9J=lk* z5B#jHcz2KTR0{9IqKP9N;a_+6X-m2eJuwQY>W)Oql+G5OwY&k_gZvJ>;{Y+|(_XD6 zRM(YU*yH;4s+hg3U8g%W%E!eYJ?V7m>ovCp7Fi~&KW)V*D8JG0mwjIJot?qAdF{lB zUEL~k%}T3$-zKXlHME;1NT&!1y&?_Uy1=XF${q$7URfSN-iOAn7WM>7DV$Qa9($ai zy}amnYQFE_Xz+yoiJcRWQZtTUWR81@J5J<3^~sT*nO2)lC5e0@_-gb$ zzPBGN-4~g*UD$sVC)5XSIjvKv5Z$qQv*f6ilFIzOvOC{6kE{chiF0w{((h)ko7?4p z(=9}Dk7MC}HA>$PFW#-VGMwfAHl&hp`-_-BEad6b)wWOOGpOhz(ArA^9sUP?ju30X z=?E*K_Bz`bOpmgSuwZ#Q8X*uNcgSzOiy9wT1&hIZTT68NJi!+ytpPFB6-dCB{f_OfAc@Rf0=?dAN`&yaa zlZ8HHb|L1nk@De7Z5JPW)wU8>>h6Jq52*O#XafioSgIFRwx= z1=$f*$=gjfSqn4Fo}uDn;rcSTZDMpqiRBcOoe*KQTlVFBPd)*oM^=HyBI2S{OVWax zwymXK63zOIG@GLJ;0o~X7Z547t`Y1^chFj8b$U$@n%P(zF7X>q#eFET05l+KI@;N9 z#RIail{~u~HetFm>TZE$vs8f4?bk9juB03Jq->Un($uv4JG2|N5}2-{2;3Ozqo;c<#U}+7($id}*9QrRly;_Q+tGxR1MkTddTV(bo1g ze?p7Qz-hAzCqE44SId_`ZU<297gv(PQwGhFt`Li^&j#S!a!^*U5v%D(%bN+0m~!r6 zkraz~n>G7gsg36_36=oZkW=skL@}&uIwRAv)Y?*^s@;9kC6`1kQ+P0bZ93=q*J=l1 z665O?GF0rMFy4X} z9{~_|P3qjb0-VYB*#-l35cpSdz0ZagO_yUdueZQT9=UX zCPm>o&9&u8Z_O`!_At|bq5&;DCPdI)0u6yV9mbGtpSD{&9o$D~e(a^ru&7FFdA`(h zZOF6bFP8;x)-zg{>H9L|honh0zr-Ka=x--gUq_3gy)so#d)$kIh~p4n290jHjF-FMN)tk{{H5 z8B~n2=$x3!E6yv6CX^gdt*-2e=~GCYJDtqfqhsR2*!c?h?r$y0GW=PDrC#RAeRk%lAHUUe{jjiX-Dn`H-K=(LAmvla zty=7vvb$AgH1X~Q(I4F+M#-|RT~WlW8D&;dzJ@%F|K?dNrpx9NKd;k~)-@&T1-`O7 z?KK)-G*d_90<#?#HCekI7lQ?kM0n37Fc#4 z=gNpwHRfj>S?^pDl;(Bb^fh=A8(RDlEo_(#QfR~wJce|@rMpUF;bJ3^9wWg4STPUd zx`F)EiT%)|cLJdXVs-Rr{oV4Od($doy@WcqYo4Fs%ngs1ha;RT8|@PvUr$55Y+7F9 zr)&EkmfL|1M3+tRIcGT^yqh)36x{yyjiu_ZmmOJ^!wq|Xd?;*JAb!GIst~2`TfD4e z`J6!*@ZYv|DbnYcjF2J2bgJ}#7kTBh>~UmJMb92h=W9~CL%w1tK4YJeL`%)u8 z@}{4+0a9{rGp}bY^t_hkUVJxv;*H|vk!LmY!U$Wn#4u!eAQp-3biA({<=0%N`j*w)&chX? zufG$}{k-yOYsEQ`TI1%Od)#qE#;skK?|ki3w?HAIntBJTUL&Z)(`rmKm!Sg}Lbq6a z5T_IF`@+&ZVF?%b)i>g)+ty-~wBuhYm1c|ALBvx^e1x8EK~`tI?LnV7uS;I`A7f*% za_Mg6Cwn37&g{y_YiBHq_P7_{dp3IM3DuGOID~o*r!~O)MSYOx-8Q>7e${yKT(92p zW5-+1>>X^;w_|1&3MtTIs&|=CVhLJ)s9mp8bRG<=-))0VJic9!yL~p`*?ix}&z-R9 zL5gC7y;)NFP@W3KJj;z|Lh>N|9npm8JttV&;`SiLfrq7*ca!dwhaCcgf#6#aIl56~ zYqPwyB74C>UhSNgmX+gQ1_`;r_HQF90z4Jk#BR-Bzl_65GnvG%STj|4f{X10O7;t- zJ{H*4)kC*TX!f3ebuMndqv zFjHMMVvI?F1x^s&r&NqiZ@PN8k^+gU{h`|H?N8a$&X9k$@KZ>V3-0i&T3k*|J|
YAds*3(9ebI7w*qCiTs63-ghtZXFl(T zR;9;sMJ>s-tI17T@xw+|RQCIK91;tF-s^6u*f5q*!R2X}k8idee+ZiUzt`-K&2pK# z&Q6hsDx46(B9ET$?C_ks>4kY=I%rjKeoXe!NkQM*O3Rw^{OC%)@ON5tdV2h?RED`! zh13@j61&3QH;8OJTF($@sGX^LCXCPZNC^+SN}s>*t-^7wkND`8gPj7;jkgM%wtN;I zz$gAT*>=^@x}B%H)aD&iE&IbW5+(ZYK@E>anLRL_mxRu{m3mlAiX+G1kP+Mv7E{~C*xEg3IgVOHI zBt{*0USMvwd!W_SeiXwKrQCo$TT=W)OIAMd)J=DE>-0(QJ2t26WYZZVlceQ>FtTOM z&uP2-dQxJDwiZZlvUP_Zh7|h2&+SUl1$Lji?lE@F)!3zpBfo1! zw@>RSpWPZUb;b`5zbvXhnxFQ1zhPXTs`1|5LL(^ZDJ{>6#9HKE*>Gf7qJ zfJ3f&$y&$bN6GulQ_6B7&p#`NyQ-g7CZL_VRSmR0sk>>}`gEZ}m=_jYOEVP}Lw35y z-4N_A7f*3(2KP0nJvu2-q;zhu)mA2nOQE>irdkooy=d#h*q^ZW!0_138@zX^sEvB9e*`>t0~YN4=tHa{AO%`0a={0wwTiu_P{bAs{EzL6m_v&iey zF$W1fzEDXI;yFkLWZDHxvi~A_v|Q*E%Au~w&Ry{2XL*CXgMJXS2LFWsk&EsN%DZwc zE37UbNH zPH4s?kz^_cPZufLVqntTHg~0Mq{h;UeX!qcSZi}NIrSwqWRDC@rqZh*Lold|PprHr zITn6~%SQYQ00#8nVzKbDH01xjRwfV|Wx`zFsXZo>{jb)^bwAH>)fsHPS{v@7n8hLM zoD>(4)kKW?#BmWLP?UJ%7?X@K)N4mUPe%M?mBe6`rTWL3$1c8_P7}6f3^@2OvV^5- z(T4|eWp4=H&^d`5s!1n48j#ZuWhII-1!=yRAoB(fjuC*b`%chXVYK9Ei2^{+ZrL__c}~dX_VT?f|eO?*SK) zQ^aZe`72q1XF2C;b)u#Pzxv@6JA`qkyuWg4-AdcfNgX% zHwA^mV%4xHAJjj({ixqdsGxszIJUnH*`mX!Y5bc`O^w}e{4aLus{dw(Ro~C<1pjRp zO7o-ykZG{Lo5E&6K2+2O{y#loJ1Ujk>iuVKPl+P4Q~2HR^rVIU&IyafsKNH^F|sy> F{Ra)2SjqqZ literal 0 HcmV?d00001 diff --git a/src/tests/output/CMYK.tif.pdf b/src/tests/output/CMYK.tif.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2441e99c1d4fcdd1563796ed766f67ecb5fddf03 GIT binary patch literal 1861 zcmZWq2~bm46mCZw*>wACZ%{}kl`_8%V&wVH_g3CVa zejI=WF(8*34%*lNATrQi6b8fxO%Rl<1bkv6ArS3EfhiGO24H6gLL3?r0LWd`1D&{0 z92mqZgGekI9}EI<@-$$GdLl)BwKQ^<|EA%<5C`UirYjUYAR3F}$c;fVVvvc&n(fCR z12nZ%uBrpPC_E%mojV!gbE9}+h#-|bg+zZ(YB&@It78hB$i{$X|DGy`?CccORSQ~K z3b_I`0dk(|0uDeN2FwIaaflDVhj|c%4g3f!b4FOpr;54C3S$Fk`QZdE;0++FMQeqQ7qM<{-d+f>F{Dor%{8oL(&8r<& zK3dq&>v891Al2u>d}2rOVx7h_*+9+Q7HMyv=EtZM{o~g3u~AvOMedb7#xu{ab#6^? z9F%3UhZ^@x^=@&J)mAZQW?v9qO!kvjwaSuh8*}XPi%kU-d z?9oft4(se7(v!|*^oeiQE_L`u8`s$=g4 zG0HuV#M77FhS7cV^CZFNt}(X z4fE@s@_-K7#cSZyV`^#lw{s}v~TE10~&|l<6>^5nevMRH73%`~%S0DeAXo03Z z-aYpqJ5!oAF&zFUIsJ@ic1W;3xps(16!z7wF0j}TO{f;>>W&FMG-PJd@p0>ejV+7c zbXid*&JS#JbJ<~$lV;?nk_nv=~@4kD!IpzANt}JTItmL|r!NB`C%z)Xn zV}ZES|L_-Q1aY~Vu^9ySP(#hrAoBep{GcKDE+zQ${$#$=pVZo73=g6M7|;TccWUJT z&CSekW*{A0!SJAB3*@X|=14yIP{Xh|i_b6|4(X%+^D{O1%nxghMP9gNKbS{hML;}Y t`LQE%=v+|FU-j0Jxm@H4Td@zZ;vhtVqLEX0utF2o9ESr?C}&R>;2+CI;0*u( literal 0 HcmV?d00001 diff --git a/src/tests/output/normal.jpg.pdf b/src/tests/output/normal.jpg.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f5402f9828984f8195cf54b8d8d11faf2f8c7bf7 GIT binary patch literal 3228 zcmb7H2~-p37M^S@Nq~d|0)jdq2rft#f}p67ge5>A5UdrfC4>QkkU$bPu>~R^E-11H zML_{U7IA-SpHx{qRk2E`3If)GphYPP6_tvRHwjSb>p6Yr-I?>x%>D1Z|M%TH=gvRY ze2#}5Z50Dc1F4`iEDCgS!Gf^Z^M#R+9HhhAz!^nQax4VevxN#_v@{aCemw|D!eIxj z+GM^XTN*D>fV3GWA91)G3;`MHFyI&S18L&V3!^sqYZ!?Fk|^Y$!zTzlC|oRLNt57+ zRQRCLn5(F;fDTSG*VzU5LK*BiAA2Aqm&VIPFris`8j7F$hebgm#eAJc&yAsi%s*FE ztJbgAK%c$X&!j@lU>*TA&iMpVbtnl7WwRxCr%gVom7(!pc#cpMgo)6+L3=;;&naX5l8fk+}785!XXC?>{a6GO5Q zSses{f_=~!T?|H-tcTMh|KFl&0f;&P6_BG4L;y)dpoj?74ZsutP-+JAL`xf^gNCQb ze^sk3R6PJ5g#eHQ6afGbFLP(SkYDqB%Rg>h7ctS}l~zL%*ru)omgiMW%K|zg(%YXM zl_P=*DWmG)O)ie#^b>TJF(-yI9Lv5Z1)H!D_ z0XmcM8^PbxGW0s#$nLu; zLe1vQgzmS!{lBRI`Pqrg!b5s>FK*@s5fd+7;U>q?ChF-KeiYB0I<+w)wwbK9TUk%) z*IX)m`PjA*c=h_QRi9rM{m#{!cb?;?yf2r3IQ6n;Q+nj-%8`vqlCN_NKCwA7#nO98Ci3|&@25kfpRds=ghV; znjc5@R$~e6%kB>>*HUbCP;!P|!(mVB8VdHI6HG2_7dK8vRRvkS=?{DA6jb9@O6gLU zhWM9?!AS|#qyR4E1_bjT43KjMu5ba-XcQ8wrL8-YQv?!)1~5cJl7*Y%Jb_H3GuZ*c zxI;#umeW-3y4*mbE6U@L8py!#uE^f@kYIDsgL*yM1wK zEmr`oJJHO_623v&g>iF(jDP+_{e$fkffjx&BOxoT!EmRetYx9=C6b1>K5(l$59 z`E&!oSab>h{L5uUS2!nINpktEctN8Dk_WBqr7n27{E)Kmmbg>@;{7c7xrQs$;b_Nx zC;WxNe94~Vw&mNs7wh#dWY~Y`*x*F&*%0-@6_=5HN|3WwwkP-*qM64on6^atHwj4& ziozEDdkTN>)k*xXqL5f&=cAaG9r>j>G0nR}gqL-jDp&6ra11Om?8kO|rx-o3fAVc= zkm(QK9ix1Hqm>k9-W|T?U@R-=@rjK;@@>P+=_Vvb!qqj?w|5cBr>7tBezKo#@D=9_2Xd$#VT zaBki(e6qA1>}ql%CygCo6fTP!^D{awaCEFr-C35h@>;XXC{ViPR@nj0ZNm>zcsq?UKgHH!@vQ`06nmEZh3Wqdnm<=x6f zmpI*Y5c}$6n4P&`kN+ct{jG6t+l%XFZ0=|mt1D|~L;2%W<969-#5kfg*O$tm%pyC z66ji zx85p3d@Rz(9XbFm;t*HG9xr_5GAS_y<=OjB=&*4 z2(ip%cR@WGbz|4K2 z5JU?a0GBu&q{f}=5RU#?V#1^f=pLG|`)1O&E^pZjz6T}(^ZcVlNri14sb_x|)$)AU zSxIKYkwJ38y`;@U_^W*j<;WFFS7;3dKe9@&I)B&%Yphj>Ha3vZnwPrnV%gx7{)o1x z`H=He*+FIZv5RfBmSu(L(9c4*N?T0pEcceLtGPQDp?mzdv07V_Szy{GTgQo(_ZfcM zctDc3{yj5e-5^ReklIKp^}iKN=PRqolPNCIW255IgT;?=OEO(j9;8Lo?5)duw=a}n za?V$gZ5i%H)VHp(uO6-a+H>+|qqo5qdpffmb@}{(5On>>H;F~v-2;(1T-V~B0_3-D zm%m>)<(eIQJcDZEseRe3bXVmg>vj7HbxIY`mOs|I`zZOjYv(R*I+L|R#u0{itobU$ zC1#`LqJjz|-PnU`M)&`V6yR1?C|yrsyqUzML$WgmzupAj`V`vH4d_0Zy!YhBXzukl zPQw{#hG?B#z&q2#fv8%-IQ-4^$w=YWErmDLCzU_1TwC$U>+f0%{5m+j? z8mry_&j{$qWHOjw1o%nk3~Hu8$tRs798WFtbTr25zv&nZczgO^b`JEv+0mU>JAJZK z$b{l(NQRyJhu}&gq@WuA>@P&1R0{tYe0mm$cR(-&&Hh*@Q)oCb;6<_4*6#iu*#7_^ CN@czP literal 0 HcmV?d00001 diff --git a/src/tests/output/normal.png.pdf b/src/tests/output/normal.png.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e1c13c460d9b40b7af382b44d6306479d4337e57 GIT binary patch literal 1710 zcmY!laB~UWtMsM3Z-BO0hz;f-%S#g;->Rz~&PWcr;6AggS(9l%R001yi`KAR#+Du+yQGJ|rkWK0xvS$eqX@@JY=}FUe3aF$SvPDlREX zP0Zyg{ z!xoV6b`6;Qec=1Th zOwT9*Itii{>V69&O9(%>G^qrf1Uz#=9tVdMBp{&S2oVa;Oab~D=uISVSIl`k%ig0X zP~up3&}V)8OAiz>oTH_7ih2eH z&*(@pWxYNF94t_neYR?E-0ZW< z!dADpxBsm9ci_N*6DLkQdGh4arAfyh@7%LzO^B9k$&JGrKAPqRfByX0vul@@0EdQa ziqXwy&(dt<($n|G%I4^ZIp5wCyzWG8S@;HVaq*y)BG)}sCT-oiwZLMIkJ|nF_q&fK zX-(~NQSw_}oRzgob{$8p(4|d$!ECFhoKEd^Yi9jgRckdjFQC4zZr`3gcdlQ*9(nt% z#E~_J4VO`Qyie^)b%nthYtJvZSY{XKNiz+ITbPTZPSwLsL&b%~-$nnwfG_4}X%) zjU2Pze?NWIf6=iaLMLv$`9zNm`{S;$8OrG>%|E|>{rd0Uznh1>c>8wl%$c0+hjZ3V z-|G0bxT3uLd4a{TW5+b^PrR7n^83&98);bqyJf?)rmkMS`t94?*=O7OIOO`jCvSUk z>f4VWKdN&s%O9C_80d^rj~>|Ni~Er8kCq+bUyQ z+r8VjfB*LF+KwOJzJ;y6`sk(BRIa((eR&^6Z3uNy68x31`|i68ldAgq|G$1k=}nK1 zi@U%p{`O33YwN4~oCSqCVy~yXE#rB$kTE#R+1Yvd^5t*VY>$kRyHd5ro_E*IorODN ztmgIwIBHGJ%gNdD;jQ`grlU5twyaEyyYJfB+xPRGKX}mbU;KsUpp_<4z0uLp2NO0t zjn9qX76YZ|hW|6{PXs)$!7K>O5Cs8J`BPDpn#N_IV9o_95+Ojr%+%D_R3S|PA!exn zmQu(=h?xP+0|6v4Lt}FcF=Jz3frX~d#0W#3p@|uix{{*A%$(FBE?BMNnU|KY0CGRH mQVGh>2i6q`esE@0D$ony5;L)=1neh66AL3QRaIAiH!c7;X>-E> literal 0 HcmV?d00001 From e8dc019bd59ef038e2b54848359fef8c2673988f Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 16 Feb 2015 19:29:20 +0100 Subject: [PATCH 29/53] update CHANGES.rst --- CHANGES.rst | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 69a9b36..9e1c288 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,7 +2,34 @@ CHANGES ======= -1.0.0 (unreleased) +0.1.5 +----- + +- Enable support for CMYK images +- Rework test suite +- support file objects as input + +0.1.4 +----- + +- add Python 3 support +- make output reproducible by sorting and --nodate option + +0.1.3 +----- + +- Avoid leaking file descriptors +- Convert unrecognized colorspaces to RGB + +0.1.1 +----- + +- allow running src/img2pdf.py standalone +- license change from GPL to LGPL +- Add pillow 2.4.0 support +- add options to specify pdf dimensions in points + +0.1.0 (unreleased) ------------------ - Initial PyPI release. From 4e6a4b937807a2bb59aeecec01304d916137add4 Mon Sep 17 00:00:00 2001 From: josch Date: Mon, 16 Feb 2015 19:29:59 +0100 Subject: [PATCH 30/53] release 0.1.5 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 88554a6..92ca84e 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup ( name='img2pdf', - version='0.1.4', + version='0.1.5', author = "Johannes 'josch' Schauer", author_email = 'j.schauer@email.de', description = "Convert images to PDF via direct JPEG inclusion.", @@ -24,7 +24,7 @@ setup ( 'Natural Language :: English', 'Operating System :: OS Independent'], url = 'https://github.com/josch/img2pdf', - download_url = 'https://github.com/josch/img2pdf/archive/0.1.4.tar.gz', + download_url = 'https://github.com/josch/img2pdf/archive/0.1.5.tar.gz', package_dir={"": "src"}, py_modules=['img2pdf', 'jp2'], include_package_data = True, From 8f757bc3bb3d356343d77769c88a67a327e3d532 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Thu, 5 Mar 2015 23:23:16 +0100 Subject: [PATCH 31/53] README: fix a typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f97a1ee..cfd3908 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Background ---------- PDF is able to embed JPEG and JPEG2000 images as they are without re-encoding -them (and hence loosing quality) but I was missing a tool to do this +them (and hence losing quality) but I was missing a tool to do this automatically, thus I wrote this piece of python code. If you know how to embed JPEG and JPEG2000 images into a PDF container without From 2cb8e55f1d9edb67a835560b582592fe0dfa0193 Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 6 Mar 2015 19:16:36 +0100 Subject: [PATCH 32/53] output pdf image size with four significant digits --- src/img2pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 21067b1..c8ef93c 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -163,7 +163,7 @@ class pdfdoc(object): # Inverts all four channels image.content['/Decode'] = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0] - text = ("q\n%f 0 0 %f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)).encode('utf8') + text = ("q\n%0.4f 0 0 %0.4f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)).encode('utf8') content = obj({ "/Length": len(text) From ad8567d352a138adf1ce8d63c2f7f42651a8ab80 Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 6 Mar 2015 19:19:21 +0100 Subject: [PATCH 33/53] add more details to code comment --- src/img2pdf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index c8ef93c..a406041 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -244,7 +244,8 @@ def convert(images, dpi=None, x=None, y=None, title=None, author=None, ndpi = dpi, dpi debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) else: - ndpi = (96, 96) # TODO: read real dpi + # TODO: read real dpi from input jpeg2000 image + ndpi = (96, 96) debug_out("input dpi = %d x %d"%ndpi, verbose) if colorspace: From 53968c31e0d3115865d6d59142dd693b2b147aea Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 6 Mar 2015 19:29:24 +0100 Subject: [PATCH 34/53] remove -x and -y and replace by -s/--pagesize. Change short option for --subject to -S. --- CHANGES.rst | 6 ++++++ src/img2pdf.py | 54 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9e1c288..29dc9cb 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,6 +2,12 @@ CHANGES ======= +0.1.6 +----- + + - replace -x and -y option by combined option -s (or --pagesize) and use -S + for --subject + 0.1.5 ----- diff --git a/src/img2pdf.py b/src/img2pdf.py index a406041..fb197b4 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -211,7 +211,7 @@ class pdfdoc(object): result += b"%%EOF\n" return result -def convert(images, dpi=None, x=None, y=None, title=None, author=None, +def convert(images, dpi=None, pagesize=(None, None), title=None, author=None, creator=None, producer=None, creationdate=None, moddate=None, subject=None, keywords=None, colorspace=None, nodate=False, verbose=False): @@ -316,15 +316,15 @@ def convert(images, dpi=None, x=None, y=None, title=None, author=None, im.close() # pdf units = 1/72 inch - if not x and not y: + if not pagesize[0] and not pagesize[1]: pdf_x, pdf_y = 72.0*width/float(ndpi[0]), 72.0*height/float(ndpi[1]) - elif not y: - pdf_x, pdf_y = x, x*height/float(width) - elif not x: - pdf_x, pdf_y = y*width/float(height), y + elif not pagesize[1]: + pdf_x, pdf_y = pagesize[0], pagesize[0]*height/float(width) + elif not pagesize[0]: + pdf_x, pdf_y = pagesize[1]*width/float(height), pagesize[1] else: - pdf_x = x - pdf_y = y + pdf_x = pagesize[0] + pdf_y = pagesize[1] pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y) @@ -341,6 +341,23 @@ def positive_float(string): def valid_date(string): return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S") +def valid_size(string): + tokens = string.split('x') + if len(tokens) != 2: + msg = "input size needs to be of the format Ax, xB or AxB with A and B being integers" + raise argparse.ArgumentTypeError(msg) + x = tokens[0] + y = tokens[1] + if x == '': + x = None + else: + x = int(x) + if y == '': + y = None + else: + y = int(y) + return (x,y) + parser = argparse.ArgumentParser( description='Lossless conversion/embedding of images (in)to pdf') parser.add_argument( @@ -349,15 +366,16 @@ parser.add_argument( parser.add_argument( '-o', '--output', metavar='out', type=argparse.FileType('wb'), default=getattr(sys.stdout, "buffer", sys.stdout), help='output file (default: stdout)') -parser.add_argument( + +sizeopts = parser.add_mutually_exclusive_group() +sizeopts.add_argument( '-d', '--dpi', metavar='dpi', type=positive_float, - help='dpi for pdf output (default: 96.0)') -parser.add_argument( - '-x', metavar='pdf_x', type=positive_float, - help='output width in points') -parser.add_argument( - '-y', metavar='pdf_y', type=positive_float, - help='output height in points') + help='dpi for pdf output. If input image does not specify dpi the default is 96.0. Must not be specified together with -s/--pagesize.') +sizeopts.add_argument( + '-s', '--pagesize', metavar='size', type=valid_size, + default=(None, None), + help='size of the pages in the pdf output in format AxB with A and B being width and height of the page in points. You can omit either one of them. Must not be specified together with -d/--dpi.') + parser.add_argument( '-t', '--title', metavar='title', type=str, help='title for metadata') @@ -377,7 +395,7 @@ parser.add_argument( '-m', '--moddate', metavar='moddate', type=valid_date, help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format') parser.add_argument( - '-s', '--subject', metavar='subject', type=str, + '-S', '--subject', metavar='subject', type=str, help='subject for metadata') parser.add_argument( '-k', '--keywords', metavar='kw', type=str, nargs='+', @@ -397,7 +415,7 @@ def main(args=None): args.output.write( convert( - args.images, args.dpi, args.x, args.y, args.title, args.author, + args.images, args.dpi, args.pagesize, args.title, args.author, args.creator, args.producer, args.creationdate, args.moddate, args.subject, args.keywords, args.colorspace, args.nodate, args.verbose)) From 5d7975164dd31b9cb26822c323e4202dcc8a2735 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Fri, 6 Mar 2015 22:51:58 +0100 Subject: [PATCH 35/53] remove unused imports --- src/img2pdf.py | 1 - src/jp2.py | 1 - src/tests/__init__.py | 2 -- 3 files changed, 4 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index fb197b4..db8778f 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -20,7 +20,6 @@ import sys import zlib import argparse -import struct from PIL import Image from datetime import datetime from jp2 import parsejp2 diff --git a/src/jp2.py b/src/jp2.py index 4f960fe..c897e5f 100644 --- a/src/jp2.py +++ b/src/jp2.py @@ -20,7 +20,6 @@ # along with this program. If not, see . import struct -import sys def getBox(data, byteStart, noBytes): boxLengthValue = struct.unpack(">I", data[byteStart:byteStart+4])[0] diff --git a/src/tests/__init__.py b/src/tests/__init__.py index e3eb235..3508baf 100644 --- a/src/tests/__init__.py +++ b/src/tests/__init__.py @@ -1,8 +1,6 @@ import unittest -import datetime import os -import unittest import img2pdf import zlib from PIL import Image From 9389d81a1406ff62843ca193ef9bef55be36a095 Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 6 Mar 2015 23:55:32 +0100 Subject: [PATCH 36/53] fix testsuite after commit 2cb8e55f broke it - thanks Jakub Wilk! --- src/tests/output/CMYK.jpg.pdf | Bin 5741 -> 5737 bytes src/tests/output/CMYK.tif.pdf | Bin 1861 -> 1857 bytes src/tests/output/normal.jpg.pdf | Bin 3228 -> 3224 bytes src/tests/output/normal.png.pdf | Bin 1710 -> 1706 bytes 4 files changed, 0 insertions(+), 0 deletions(-) diff --git a/src/tests/output/CMYK.jpg.pdf b/src/tests/output/CMYK.jpg.pdf index e2f9380ca45b6f9e80f8b3e190237648ac10e65f..2a000220f134e12c366767859133f24a6dc5205b 100644 GIT binary patch delta 80 zcmaE>^HOI+BBQ*qC6}EYS8+*EYGN)|A(x?{sh$B4D1e}eh2G{C#%d8!V+D|yfkK`F a7nosSYGPqNSwt*=)4-BTRn^tsjSB!;@eQ{ti3O0ixq-1-MATdXBAlnd Z1yNvXVre*8KrDdM(3neA)z#mP3jm}F6N3N% diff --git a/src/tests/output/CMYK.tif.pdf b/src/tests/output/CMYK.tif.pdf index 2441e99c1d4fcdd1563796ed766f67ecb5fddf03..54c0b4e63f151d91a813151fbfdcfdb53ac9699b 100644 GIT binary patch delta 80 zcmX@gcaU#GBBQ*qC6}EYS8+*EYGN)|A(x?{sh$B4D1e}eh2G{C#$Z-aV+D|yfkK`F a7nosSXl7tO`3+kDr;#O>s;aBM8y5gEf)dXF delta 84 zcmX@eca(2KBBQd2F_)bkS8+*EYGN)|A(x?{sh$BCC>Q{ti3O0ixq&g5Rn%MoBAlnd Z1yNvVW@tG11zP~8u`!pbs;j>n7XWx`6Mz5! diff --git a/src/tests/output/normal.jpg.pdf b/src/tests/output/normal.jpg.pdf index f5402f9828984f8195cf54b8d8d11faf2f8c7bf7..1b891a0f03f8bfdf9288d1c8fd31ab2928a36adb 100644 GIT binary patch delta 82 zcmbOuIYV+oBBQ*qC6}EYS8+*EYGN)|A(x?{sh$B4D1e}eh2G{C##LOR#tI-Y1BE;V cE-=Hu$kNQ{ti3O0ixq)#Nm#Db{L^w}@ a3!=ct(%fRQJ&zx^F_6!ts_N?R#svV90TXKg diff --git a/src/tests/output/normal.png.pdf b/src/tests/output/normal.png.pdf index e1c13c460d9b40b7af382b44d6306479d4337e57..55386347f0c332b9debeab813467b153e893413d 100644 GIT binary patch delta 80 zcmZ3-yNY*1BBQ*qC6}EYS8+*EYGN)|A(x?{sh$B4D1e}eh2G{C#yu>e#tI-Y1BE;V aE-=Hu(8SbiawuB>r?~}}s;aBM8y5g7z7lo- delta 84 zcmZ3*yN-85BBQd2F_)bkS8+*EYGN)|A(x?{sh$BCC>Q{ti3O0ixq)#Hi>SE*L^w}@ Z3!=c##LQrFAX@;Zg%Ovks;j>n7XWbQ6DI%w From 23b77d254474cf159d3cb022c5bdc98f2d6b35d5 Mon Sep 17 00:00:00 2001 From: josch Date: Sat, 7 Mar 2015 02:58:44 +0100 Subject: [PATCH 37/53] README.md: add Debian/Ubuntu dependencies --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index cfd3908..d658d5e 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,14 @@ Like -C L for DeviceGray. Installation ------------ +On a Debian/Ubuntu based OS, the following dependencies are needed: + + apt-get install python python-pil python-setuptools + +Or for Python 3: + + apt-get install python3 python3-pil python3-setuptools + You can install the package using: $ pip install img2pdf From 486361e71694c9e01ebc723432c4ab83ce42f572 Mon Sep 17 00:00:00 2001 From: josch Date: Sat, 7 Mar 2015 02:59:12 +0100 Subject: [PATCH 38/53] cater for python-pil versions without close() attribute --- src/img2pdf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index db8778f..0029481 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -310,7 +310,11 @@ def convert(images, dpi=None, pagesize=(None, None), title=None, author=None, imgdata = imgdata.convert('RGB') color = imgdata.mode img = imgdata.tobytes() - imgdata.close() + # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the close() method + try: + imgdata.close() + except AttributeError: + pass imgdata = zlib.compress(img) im.close() From b90e385efedff229c1de21d9d365b65422c32528 Mon Sep 17 00:00:00 2001 From: josch Date: Sat, 7 Mar 2015 03:01:02 +0100 Subject: [PATCH 39/53] cater for python-pil versions without close() attribute (the 2nd fix) --- src/tests/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/tests/__init__.py b/src/tests/__init__.py index 3508baf..08ff69a 100644 --- a/src/tests/__init__.py +++ b/src/tests/__init__.py @@ -93,7 +93,11 @@ def test_suite(): with open(out, "rb") as outf: out = outf.read() self.assertEqual(pdf, out) - orig_img.close() + # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the close() method + try: + orig_img.close() + except AttributeError: + pass setattr(TestImg2Pdf, "test_%s"%test_name, handle) return unittest.TestSuite(( From d5fc324b7bd4087764e7e713093412587c2bdd60 Mon Sep 17 00:00:00 2001 From: josch Date: Sat, 7 Mar 2015 03:02:00 +0100 Subject: [PATCH 40/53] cater for python-pil versions without close() attribute (the 3rd fix) --- src/tests/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/tests/__init__.py b/src/tests/__init__.py index 08ff69a..15c9328 100644 --- a/src/tests/__init__.py +++ b/src/tests/__init__.py @@ -87,7 +87,11 @@ def test_suite(): elif orig_img.mode not in ("RGB", "L", "CMYK", "CMYK;I"): orig_img = orig_img.convert("RGB") self.assertEqual(im.tobytes(), orig_img.tobytes()) - im.close() + # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the close() method + try: + im.close() + except AttributeError: + pass # lastly, make sure that the generated pdf matches bit by bit the # expected pdf with open(out, "rb") as outf: From 36fb9173fe6153f64edb300840adf88eb3f3d8fc Mon Sep 17 00:00:00 2001 From: josch Date: Sat, 7 Mar 2015 03:20:14 +0100 Subject: [PATCH 41/53] add --version option and __version__ module variable and use ~git version suffix --- setup.py | 6 ++++-- src/img2pdf.py | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 92ca84e..1ad815c 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,10 @@ from setuptools import setup +VERSION="0.1.6~git" + setup ( name='img2pdf', - version='0.1.5', + version=VERSION, author = "Johannes 'josch' Schauer", author_email = 'j.schauer@email.de', description = "Convert images to PDF via direct JPEG inclusion.", @@ -24,7 +26,7 @@ setup ( 'Natural Language :: English', 'Operating System :: OS Independent'], url = 'https://github.com/josch/img2pdf', - download_url = 'https://github.com/josch/img2pdf/archive/0.1.5.tar.gz', + download_url = 'https://github.com/josch/img2pdf/archive/'+VERSION+'.tar.gz', package_dir={"": "src"}, py_modules=['img2pdf', 'jp2'], include_package_data = True, diff --git a/src/img2pdf.py b/src/img2pdf.py index 0029481..aafadb3 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -17,6 +17,8 @@ # License along with this program. If not, see # . +__version__ = "0.1.6~git" + import sys import zlib import argparse @@ -410,6 +412,9 @@ parser.add_argument( '-D', '--nodate', help='do not add timestamps', action="store_true") parser.add_argument( '-v', '--verbose', help='verbose mode', action="store_true") +parser.add_argument( + '-V', '--version', action='version', version='%(prog)s '+__version__, + help="Print version information and exit") def main(args=None): if args is None: From 592cdc1cdb8eba77c2cb5ab566e29dbc14f2838c Mon Sep 17 00:00:00 2001 From: xiota Date: Fri, 20 Mar 2015 03:37:30 -0700 Subject: [PATCH 42/53] Changes to pdf page size handling Changes to `valid_size()` * accept common page sizes, such as letter and a4. * parse dimensions of format: AuxBv#, where A is width, u is units, B is height, v is units, # are options. * accept units: in, cm, mm, pt Changes to `convert()`: * resize pages based on dpi calculations * default resize images into page size (like default resize in imagemagick) * implement exact resizing (ignore dpi; equiv to ! in imagemagick) Created `get_ndpi()`: * provides dpi for page resizing * implement fill resizing (equiv to ^ in imagemagick) Other changes: * default dpi in global variable --- src/img2pdf.py | 258 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 208 insertions(+), 50 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index aafadb3..b9ffd8c 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -18,7 +18,9 @@ # . __version__ = "0.1.6~git" +default_dpi = 96.0 +import re import sys import zlib import argparse @@ -212,10 +214,12 @@ class pdfdoc(object): result += b"%%EOF\n" return result -def convert(images, dpi=None, pagesize=(None, None), title=None, author=None, - creator=None, producer=None, creationdate=None, moddate=None, - subject=None, keywords=None, colorspace=None, nodate=False, - verbose=False): +def convert(images, dpi=None, pagesize=(None, None, None), title=None, + author=None, creator=None, producer=None, creationdate=None, + moddate=None, subject=None, keywords=None, colorspace=None, + nodate=False, verbose=False): + + pagesize_options = pagesize[2] pdf = pdfdoc(3, title, author, creator, producer, creationdate, moddate, subject, keywords, nodate) @@ -241,13 +245,9 @@ def convert(images, dpi=None, pagesize=(None, None), title=None, author=None, width, height, ics = parsejp2(rawdata) imgformat = "JPEG2000" - if dpi: - ndpi = dpi, dpi - debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) - else: - # TODO: read real dpi from input jpeg2000 image - ndpi = (96, 96) - debug_out("input dpi = %d x %d"%ndpi, verbose) + # TODO: read real dpi from input jpeg2000 image + ndpi = (default_dpi, default_dpi) + debug_out("input dpi = %d x %d" % ndpi, verbose) if colorspace: color = colorspace @@ -259,18 +259,14 @@ def convert(images, dpi=None, pagesize=(None, None), title=None, author=None, width, height = imgdata.size imgformat = imgdata.format - if dpi: - ndpi = dpi, dpi - debug_out("input dpi (forced) = %d x %d"%ndpi, verbose) - else: - ndpi = imgdata.info.get("dpi", (96, 96)) - # in python3, the returned dpi value for some tiff images will - # not be an integer but a float. To make the behaviour of - # img2pdf the same between python2 and python3, we convert that - # float into an integer by rounding - # search online for the 72.009 dpi problem for more info - ndpi = (int(round(ndpi[0])),int(round(ndpi[1]))) - debug_out("input dpi = %d x %d"%ndpi, verbose) + ndpi = imgdata.info.get("dpi", (default_dpi, default_dpi)) + # in python3, the returned dpi value for some tiff images will + # not be an integer but a float. To make the behaviour of + # img2pdf the same between python2 and python3, we convert that + # float into an integer by rounding + # search online for the 72.009 dpi problem for more info + ndpi = (int(round(ndpi[0])),int(round(ndpi[1]))) + debug_out("input dpi = %d x %d" % ndpi, verbose) if colorspace: color = colorspace @@ -292,6 +288,13 @@ def convert(images, dpi=None, pagesize=(None, None), title=None, author=None, debug_out("width x height = %d x %d"%(width,height), verbose) debug_out("imgformat = %s"%imgformat, verbose) + if dpi: + ndpi = dpi, dpi + debug_out("input dpi (forced) = %d x %d" % ndpi, verbose) + elif pagesize_options: + ndpi = get_ndpi(width, height, pagesize) + debug_out("calculated dpi (based on pagesize) = %d x %d" % ndpi, verbose) + # depending on the input format, determine whether to pass the raw # image or the zlib compressed color information if imgformat is "JPEG" or imgformat is "JPEG2000": @@ -320,21 +323,43 @@ def convert(images, dpi=None, pagesize=(None, None), title=None, author=None, imgdata = zlib.compress(img) im.close() - # pdf units = 1/72 inch - if not pagesize[0] and not pagesize[1]: - pdf_x, pdf_y = 72.0*width/float(ndpi[0]), 72.0*height/float(ndpi[1]) - elif not pagesize[1]: - pdf_x, pdf_y = pagesize[0], pagesize[0]*height/float(width) - elif not pagesize[0]: - pdf_x, pdf_y = pagesize[1]*width/float(height), pagesize[1] + if pagesize_options and pagesize_options['exact'][1]: + # output size exactly to specified dimensions + # pagesize[0], pagesize[1] already checked in valid_size() + pdf_x, pdf_y = pagesize[0], pagesize[1] else: - pdf_x = pagesize[0] - pdf_y = pagesize[1] + # output size based on dpi; point = 1/72 inch + pdf_x, pdf_y = 72.0*width/float(ndpi[0]), 72.0*height/float(ndpi[1]) pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y) return pdf.tostring() +def get_ndpi(width, height, pagesize): + pagesize_options = pagesize[2] + + if pagesize_options and pagesize_options['fill'][1]: + if width/height < pagesize[0]/pagesize[1]: + tmp_dpi = 72.0*width/pagesize[0] + else: + tmp_dpi = 72.0*height/pagesize[1] + elif pagesize[0] and pagesize[1]: + # if both height and width given with no specific pagesize_option, + # resize to fit "into" page + if width/height < pagesize[0]/pagesize[1]: + tmp_dpi = 72.0*height/pagesize[1] + else: + tmp_dpi = 72.0*width/pagesize[0] + elif pagesize[0]: + # if width given, calculate dpi based on width + tmp_dpi = 72.0*width/pagesize[0] + elif pagesize[1]: + # if height given, calculate dpi based on height + tmp_dpi = 72.0*height/pagesize[1] + else: + tmp_dpi = default_dpi + + return tmp_dpi, tmp_dpi def positive_float(string): value = float(string) @@ -346,22 +371,142 @@ def positive_float(string): def valid_date(string): return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S") +def get_standard_papersize(string): + papersizes = { + "11x17" : "792x792^", # "792x1224", + "ledger" : "792x792^", # "1224x792", + "legal" : "612x612^", # "612x1008", + "letter" : "612x612^", # "612x792", + "arche" : "2592x2592^", # "2592x3456", + "archd" : "1728x1728^", # "1728x2592", + "archc" : "1296x1296^", # "1296x1728", + "archb" : "864x864^", # "864x1296", + "archa" : "648x648^", # "648x864", + "a0" : "2380x2380^", # "2380x3368", + "a1" : "1684x1684^", # "1684x2380", + "a2" : "1190x1190^", # "1190x1684", + "a3" : "842x842^", # "842x1190", + "a4" : "595x595^", # "595x842", + "a5" : "421x421^", # "421x595", + "a6" : "297x297^", # "297x421", + "a7" : "210x210^", # "210x297", + "a8" : "148x148^", # "148x210", + "a9" : "105x105^", # "105x148", + "a10" : "74x74^", # "74x105", + "b0" : "2836x2836^", # "2836x4008", + "b1" : "2004x2004^", # "2004x2836", + "b2" : "1418x1418^", # "1418x2004", + "b3" : "1002x1002^", # "1002x1418", + "b4" : "709x709^", # "709x1002", + "b5" : "501x501^", # "501x709", + "c0" : "2600x2600^", # "2600x3677", + "c1" : "1837x1837^", # "1837x2600", + "c2" : "1298x1298^", # "1298x1837", + "c3" : "918x918^", # "918x1298", + "c4" : "649x649^", # "649x918", + "c5" : "459x459^", # "459x649", + "c6" : "323x323^", # "323x459", + "flsa" : "612x612^", # "612x936", + "flse" : "612x612^", # "612x936", + "halfletter" : "396x396^", # "396x612", + "tabloid" : "792x792^", # "792x1224", + "statement" : "396x396^", # "396x612", + "executive" : "540x540^", # "540x720", + "folio" : "612x612^", # "612x936", + "quarto" : "610x610^", # "610x780" + } + + string = string.lower() + return papersizes.get(string, string) + def valid_size(string): - tokens = string.split('x') - if len(tokens) != 2: - msg = "input size needs to be of the format Ax, xB or AxB with A and B being integers" + # conversion factors from units to points + units = { + 'in' : 72.0, + 'cm' : 72.0/2.54, + 'mm' : 72.0/25.4, + 'pt' : 1.0 + } + + pagesize_options = { + 'exact' : ['\!', False], + 'shrink' : ['\>', False], + 'enlarge' : ['\<', False], + 'fill' : ['\^', False], + 'percent' : ['\%', False], + 'count' : ['\@', False], + } + + string = get_standard_papersize(string) + + pattern = re.compile(r""" + ([0-9]*\.?[0-9]*) # tokens.group(1) == width; may be empty + ([a-z]*) # tokens.group(2) == units; may be empty + x + ([0-9]*\.?[0-9]*) # tokens.group(3) == height; may be empty + ([a-zA-Z]*) # tokens.group(4) == units; may be empty + ([^0-9a-zA-Z]*) # tokens.group(5) == extra options + """, re.VERBOSE) + + tokens = pattern.match(string) + + # tokens.group(0) should match entire input string + if tokens.group(0) != string: + msg = ('Input size needs to be of the format AuxBv#, ' + 'where A is width, B is height, u and v are units, ' + '# are options. ' + 'You may omit either width or height, but not both. ' + 'Units may be specified as (in, cm, mm, pt). ' + 'You may omit units, which will default to pt. ' + 'Available options include (! = exact ; ^ = fill ; default = into).') raise argparse.ArgumentTypeError(msg) - x = tokens[0] - y = tokens[1] - if x == '': - x = None - else: - x = int(x) - if y == '': - y = None - else: - y = int(y) - return (x,y) + + # temporary list to loop through to process width and height + pagesize_size = { + 'x' : [0, tokens.group(1), tokens.group(2)], + 'y' : [0, tokens.group(3), tokens.group(4)] + } + + for key, value in pagesize_size.items(): + try: + value[0] = float(value[1]) + value[0] *= units[value[2]] # convert to points + except ValueError, e: + # assign None if width or height not provided + value[0] = None + except KeyError, e: + # if units unrecognized, raise error + # otherwise default to pt because units not provided + if value[2]: + msg = "unrecognized unit '%s'." % value[2] + raise argparse.ArgumentTypeError(msg) + + x = pagesize_size['x'][0] + y = pagesize_size['y'][0] + + # parse options for resize methods + if tokens.group(5): + for key, value in pagesize_options.items(): + if re.search(value[0], tokens.group(5)): + value[1] = True + + if pagesize_options['fill'][1]: + # if either width or height is not given, try to fill in missing value + if not x: + x = y + elif not y: + y = x + + if pagesize_options['exact'][1]: + if not x or not y: + msg = ('exact size requires both width and height.') + raise argparse.ArgumentTypeError(msg) + + if not x and not y: + msg = ('width and height cannot both be omitted.') + raise argparse.ArgumentTypeError(msg) + + return (x, y, pagesize_options) parser = argparse.ArgumentParser( description='Lossless conversion/embedding of images (in)to pdf') @@ -370,16 +515,29 @@ parser.add_argument( nargs='+', help='input file(s)') parser.add_argument( '-o', '--output', metavar='out', type=argparse.FileType('wb'), - default=getattr(sys.stdout, "buffer", sys.stdout), help='output file (default: stdout)') + default=getattr(sys.stdout, "buffer", sys.stdout), + help='output file (default: stdout)') sizeopts = parser.add_mutually_exclusive_group() sizeopts.add_argument( '-d', '--dpi', metavar='dpi', type=positive_float, - help='dpi for pdf output. If input image does not specify dpi the default is 96.0. Must not be specified together with -s/--pagesize.') + help=('dpi for pdf output. ' + 'If input image does not specify dpi the default is %.2f. ' + 'Must not be used with -s/--pagesize.') % default_dpi +) + sizeopts.add_argument( '-s', '--pagesize', metavar='size', type=valid_size, - default=(None, None), - help='size of the pages in the pdf output in format AxB with A and B being width and height of the page in points. You can omit either one of them. Must not be specified together with -d/--dpi.') + default=(None, None, None), + help=('size of the pdf pages in format AuxBv#, ' + 'where A is width, B is height, u and v are units, # are options. ' + 'You may omit either width or height, but not both. ' + 'Some common page sizes, such as letter and a4, are also recognized. ' + 'Units may be specified as (in, cm, mm, pt). ' + 'Units default to pt when absent. ' + 'Available options include (! = exact ; ^ = fill ; default = into). ' + 'Must not be used with -d/--dpi.') +) parser.add_argument( '-t', '--title', metavar='title', type=str, From be21c4bbf36e821e886614cb7bb3968f6c36e606 Mon Sep 17 00:00:00 2001 From: xiota Date: Fri, 20 Mar 2015 11:30:19 -0700 Subject: [PATCH 43/53] general editing --- README.md | 90 +++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index d658d5e..ac5b762 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,16 @@ img2pdf ======= -Lossless conversion of images to PDF without unnecessarily re-encoding JPEG and -JPEG2000 files. Thus, no loss of quality and no unnecessary large output file. +Losslessly convert images to PDF without unnecessarily re-encoding JPEG and +JPEG2000 files. Image quality is retained without unnecessarily increasing +file size. Background ---------- -PDF is able to embed JPEG and JPEG2000 images as they are without re-encoding -them (and hence losing quality) but I was missing a tool to do this -automatically, thus I wrote this piece of python code. +Quality loss can be avoided when converting JPEG and JPEG2000 images to +PDF by embedding them without re-encoding. I wrote this piece of python code. +because I was missing a tool to do this automatically. If you know how to embed JPEG and JPEG2000 images into a PDF container without recompression, using existing tools, please contact me so that I can put this @@ -18,43 +19,41 @@ code into the garbage bin :D Functionality ------------- -The program will take image filenames from commandline arguments and output a -PDF file with them embedded into it. If the input image is a JPEG or JPEG2000 -file, it will be included as-is without any processing. If it is in any other -format, the image will be included as zip-encoded RGB. As a result, this tool -will be able to lossless wrap any image into a PDF container while performing -better (in terms of quality/filesize ratio) than existing tools in case the -input image is a JPEG or JPEG2000 file. +This program will take a list of images and produce a PDF file with the +images embedded in it. JPEG and JPEG2000 images will be included without +recompression. Images in other formats will be included with zip/flate +encoding. As a result, this tool is able to losslessly wrap any image +into a PDF container with a quality-filesize ratio that is typically better +than that of existing tools. -For example, imagemagick will re-encode the input JPEG image and thus change -its content: +For example, imagemagick will re-encode the input JPEG image (thus changing +its content): $ convert img.jpg img.pdf $ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression $ compare -metric AE img.jpg img.extr-000.ppm null: 1.6301e+06 -If one wants to do a lossless conversion from any format to PDF with -imagemagick then one has to use zip-encoding: +If one wants to losslessly convert from any format to PDF with +imagemagick, one has to use zip compression: $ convert input.jpg -compress Zip output.pdf $ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression $ compare -metric AE img.jpg img.extr-000.ppm null: 0 -The downside is, that using imagemagick like this will make the resulting PDF -files a few times bigger than the input JPEG or JPEG2000 file and can also not -output a multipage PDF. +However, this approach will result in PDF files that are a few times larger +than the input JPEG or JPEG2000 file. -img2pdf is able to output a PDF with multiple pages if more than one input -image is given, losslessly embed JPEG and JPEG2000 files into a PDF container -without adding more overhead than the PDF structure itself and will save all -other graphics formats using lossless zip-compression. +img2pdf is able to losslessly embed JPEG and JPEG2000 files into a PDF +container without additional overhead (aside from the PDF structure itself), +save other graphics formats using lossless zip compression, +and produce multi-page PDF files when more than one input image is given. -Another nifty advantage: Since no re-encoding is done in case of JPEG images, -the conversion is many (ten to hundred) times faster with img2pdf compared to -imagemagick. While a run of above convert command with a 2.8MB JPEG takes 27 -seconds (on average) on my machine, conversion using img2pdf takes just a +Also, since JPEG and JPEG2000 images are not reencoded, conversion with +img2pdf is several (ten to hundred) times faster than with imagemagick. +While the above convert command with a 2.8MB JPEG took 27 seconds +(on average) on my machine, conversion using img2pdf took just a fraction of a second. Commandline Arguments @@ -81,27 +80,26 @@ More help is available with the -h or --help option. Bugs ---- -If you find a JPEG or JPEG2000 file that, when embedded can not be read by the -Adobe Acrobat Reader, please contact me. +If you find a JPEG or JPEG2000 file that, when embedded cannot be read +by the Adobe Acrobat Reader, please contact me. -For lossless conversion of other formats than JPEG or JPEG2000 files, zip/flate -encoding is used. This choice is based on a number of tests I did on images. -I converted them into PDF using imagemagick and all compressions it has to -offer and then compared the output size of the lossless variants. In all my -tests, zip/flate encoding performed best. You can verify my findings using the -test_comp.sh script with any input image given as a commandline argument. If -you find an input file that is outperformed by another lossless compression, -contact me. +For lossless conversion of formats other than JPEG or JPEG2000, zip/flate +encoding is used. This choice is based on tests I did with a number of images. +I converted them into PDF using the lossless variants of the compression +formats offered by imagemagick. In all my tests, zip/flate encoding performed +best. You can verify my findings using the test_comp.sh script with any input +image given as a commandline argument. If you find an input file that is +outperformed by another lossless compression method, contact me. -I have not yet figured out how to read the colorspace from jpeg2000 files. -Therefor jpeg2000 files use DeviceRGB per default. If your jpeg2000 files are -of any other colorspace you must force it using the --colorspace option. -Like -C L for DeviceGray. +I have not yet figured out how to determine the colorspace of JPEG2000 files. +Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 files with +other colorspaces, you must force it using the `--colorspace` option. Installation ------------ -On a Debian/Ubuntu based OS, the following dependencies are needed: +On a Debian- and Ubuntu-based systems, dependencies may be installed +with the following command: apt-get install python python-pil python-setuptools @@ -109,17 +107,17 @@ Or for Python 3: apt-get install python3 python3-pil python3-setuptools -You can install the package using: +You can then install the package using: $ pip install img2pdf -If you want to install from source code simply use: +If you prefer to install from source code use: $ cd img2pdf/ $ pip install . To test the console script without installing the package on your system, -simply use virtualenv: +use virtualenv: $ cd img2pdf/ $ virtualenv ve @@ -129,7 +127,7 @@ You can then test the converter using: $ ve/bin/img2pdf -o test.pdf src/tests/test.jpg -Note that the package can also be used as a library as follows: +The package can also be used as a library: import img2pdf pdf_bytes = img2pdf.convert(['test.jpg']) From 53fdf81c37ee080694606a5456b38be0d23e6012 Mon Sep 17 00:00:00 2001 From: xiota Date: Fri, 20 Mar 2015 12:24:13 -0700 Subject: [PATCH 44/53] Usage * rename "Commandline Options" to "Usage" * General Notes * Controlling Page Size * Colorspace --- README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index ac5b762..64ec83f 100644 --- a/README.md +++ b/README.md @@ -56,26 +56,68 @@ While the above convert command with a 2.8MB JPEG took 27 seconds (on average) on my machine, conversion using img2pdf took just a fraction of a second. -Commandline Arguments ---------------------- -At least one input file argument must be given as img2pdf needs to seek in the -file descriptor which would not be possible for stdin. +Usage +----- -Specify the dpi with the -d or --dpi options instead of reading it from the -image or falling back to 96.0. +#### General Notes -Specify the output file with -o or --output. By default output will be done to -stdout. +The images must be provided as files because img2pdf needs to seek +in the file descriptor. Input cannot be piped through stdin. -Specify metadata using the --title, --author, --creator, --producer, ---creationdate, --moddate, --subject and --keywords options (or their short -forms). +If no output file is specified with the `-o`/`--output` option, +output will be to stdout. -Specify -C or --colorspace to force a colorspace using PIL short handles like -'RGB', 'L' or '1'. +Descriptions of the options should be self explanatory. +They are available by running: -More help is available with the -h or --help option. + img2pdf --help + + +#### Controlling Page Size + +The PDF page size can be manipulated. By default, the image will be sized "into" the given dimensions with the aspect ratio retained. For instance, to size an image into a page that is at most 500pt x 500pt, use: + + img2pdf -s 500x500 -o output.pdf input.jpg + +To "fill" out a page that is at least 500pt x 500pt, follow the dimensions with a `^`: + + img2pdf -s 500x500^ -o output.pdf input.jpg + +To output pages that are exactly 500pt x 500pt, follow the dimensions with an `!`: + + img2pdf -s 500x500\! -o output.pdf input.jpg + +Notice that the default unit is points. Units may be also be specified and mixed: + + img2pdf -s 8.5inx27.94cm -o output.pdf input.jpg + +If either width or height is omitted, the other will be calculated +to preserve aspect ratio. + + img2pdf -s x280mm -o output1.pdf input.jpg + img2pdf -s 280mmx -o output2.pdf input.jpg + +Some standard page sizes are recognized: + + img2pdf -s letter -o output1.pdf input.jpg + img2pdf -s a4 -o output2.pdf input.jpg + +#### Colorspace + +Currently, the colorspace must be forced for JPEG 2000 images that are +not in the RGB colorspace. Available colorspace options are based on +Python Imaging Library (PIL) short handles. + + * `RGB` = RGB color + * `L` = Grayscale + * `1` = Black and white (internally converted to grayscale) + * `CMYK` = CMYK color + * `CMYK;I` = CMYK color with inversion + +For example, to encode a grayscale JPEG2000 image, use: + + img2pdf -C L -o output.pdf input.jp2 Bugs ---- From 317a0ee7f217a4820435f947eaf8242232c6c68e Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 13 Mar 2015 11:43:38 +0100 Subject: [PATCH 45/53] do not encode as utf8 as pdf is ascii, add safer handling across py2/py3 --- src/img2pdf.py | 137 +++++++++++++++++++++++++++---------------------- 1 file changed, 75 insertions(+), 62 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index b9ffd8c..91348af 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -46,20 +46,22 @@ def warning_out(message): def parse(cont, indent=1): if type(cont) is dict: return b"<<\n"+b"\n".join( - [4 * indent * b" " + k.encode("utf8") + b" " + parse(v, indent+1) + [4 * indent * b" " + k + b" " + parse(v, indent+1) for k, v in sorted(cont.items())])+b"\n"+4*(indent-1)*b" "+b">>" elif type(cont) is int: - return str(cont).encode("utf8") + return str(cont).encode() elif type(cont) is float: - return ("%0.4f"%cont).encode("utf8") + return ("%0.4f"%cont).encode() elif isinstance(cont, obj): - return ("%d 0 R"%cont.identifier).encode("utf8") - elif type(cont) is str: - return cont.encode("utf8") - elif type(cont) is bytes: + return ("%d 0 R"%cont.identifier).encode() + elif type(cont) is str or type(cont) is bytes: + if type(cont) is str and type(cont) is not bytes: + raise Exception("parse must be passed a bytes object in py3") return cont elif type(cont) is list: return b"[ "+b" ".join([parse(c, indent) for c in cont])+b" ]" + else: + raise Exception("cannot handle type %s"%type(cont)) class obj(object): def __init__(self, content, stream=None): @@ -69,11 +71,11 @@ class obj(object): def tostring(self): if self.stream: return ( - ("%d 0 obj " % self.identifier).encode("utf8") + + ("%d 0 obj " % self.identifier).encode() + parse(self.content) + b"\nstream\n" + self.stream + b"\nendstream\nendobj\n") else: - return ("%d 0 obj "%self.identifier).encode("utf8")+parse(self.content)+b" endobj\n" + return ("%d 0 obj "%self.identifier).encode()+parse(self.content)+b" endobj\n" class pdfdoc(object): @@ -86,39 +88,39 @@ class pdfdoc(object): info = {} if title: - info["/Title"] = "("+title+")" + info[b"/Title"] = b"("+title+b")" if author: - info["/Author"] = "("+author+")" + info[b"/Author"] = b"("+author+b")" if creator: - info["/Creator"] = "("+creator+")" + info[b"/Creator"] = b"("+creator+b")" if producer: - info["/Producer"] = "("+producer+")" + info[b"/Producer"] = b"("+producer+b")" if creationdate: - info["/CreationDate"] = "(D:"+creationdate.strftime("%Y%m%d%H%M%S")+")" + info[b"/CreationDate"] = b"(D:"+creationdate.strftime("%Y%m%d%H%M%S").encode()+b")" elif not nodate: - info["/CreationDate"] = "(D:"+now.strftime("%Y%m%d%H%M%S")+")" + info[b"/CreationDate"] = b"(D:"+now.strftime("%Y%m%d%H%M%S").encode()+b")" if moddate: - info["/ModDate"] = "(D:"+moddate.strftime("%Y%m%d%H%M%S")+")" + info[b"/ModDate"] = b"(D:"+moddate.strftime("%Y%m%d%H%M%S").encode()+b")" elif not nodate: - info["/ModDate"] = "(D:"+now.strftime("%Y%m%d%H%M%S")+")" + info[b"/ModDate"] = b"(D:"+now.strftime("%Y%m%d%H%M%S").encode()+b")" if subject: - info["/Subject"] = "("+subject+")" + info[b"/Subject"] = b"("+subject+b")" if keywords: - info["/Keywords"] = "("+",".join(keywords)+")" + info[b"/Keywords"] = b"("+b",".join(keywords)+b")" self.info = obj(info) # create an incomplete pages object so that a /Parent entry can be # added to each page self.pages = obj({ - "/Type": "/Pages", - "/Kids": [], - "/Count": 0 + b"/Type": b"/Pages", + b"/Kids": [], + b"/Count": 0 }) self.catalog = obj({ - "/Pages": self.pages, - "/Type": "/Catalog" + b"/Pages": self.pages, + b"/Type": b"/Catalog" }) self.addobj(self.catalog) self.addobj(self.pages) @@ -130,11 +132,11 @@ class pdfdoc(object): def addimage(self, color, width, height, imgformat, imgdata, pdf_x, pdf_y): if color == 'L': - colorspace = "/DeviceGray" + colorspace = b"/DeviceGray" elif color == 'RGB': - colorspace = "/DeviceRGB" + colorspace = b"/DeviceRGB" elif color == 'CMYK' or color == 'CMYK;I': - colorspace = "/DeviceCMYK" + colorspace = b"/DeviceCMYK" else: error_out("unsupported color space: %s"%color) exit(1) @@ -144,47 +146,47 @@ class pdfdoc(object): # either embed the whole jpeg or deflate the bitmap representation if imgformat is "JPEG": - ofilter = [ "/DCTDecode" ] + ofilter = [ b"/DCTDecode" ] elif imgformat is "JPEG2000": - ofilter = [ "/JPXDecode" ] + ofilter = [ b"/JPXDecode" ] self.version = 5 # jpeg2000 needs pdf 1.5 else: - ofilter = [ "/FlateDecode" ] + ofilter = [ b"/FlateDecode" ] image = obj({ - "/Type": "/XObject", - "/Subtype": "/Image", - "/Filter": ofilter, - "/Width": width, - "/Height": height, - "/ColorSpace": colorspace, + b"/Type": b"/XObject", + b"/Subtype": b"/Image", + b"/Filter": ofilter, + b"/Width": width, + b"/Height": height, + b"/ColorSpace": colorspace, # hardcoded as PIL doesnt provide bits for non-jpeg formats - "/BitsPerComponent": 8, - "/Length": len(imgdata) + b"/BitsPerComponent": 8, + b"/Length": len(imgdata) }, imgdata) if color == 'CMYK;I': # Inverts all four channels - image.content['/Decode'] = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0] + image.content[b'/Decode'] = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0] - text = ("q\n%0.4f 0 0 %0.4f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)).encode('utf8') + text = ("q\n%0.4f 0 0 %0.4f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)).encode() content = obj({ - "/Length": len(text) + b"/Length": len(text) }, text) page = obj({ - "/Type": "/Page", - "/Parent": self.pages, - "/Resources": { - "/XObject": { - "/Im0": image + b"/Type": b"/Page", + b"/Parent": self.pages, + b"/Resources": { + b"/XObject": { + b"/Im0": image } }, - "/MediaBox": [0, 0, pdf_x, pdf_y], - "/Contents": content + b"/MediaBox": [0, 0, pdf_x, pdf_y], + b"/Contents": content }) - self.pages.content["/Kids"].append(page) - self.pages.content["/Count"] += 1 + self.pages.content[b"/Kids"].append(page) + self.pages.content[b"/Count"] += 1 self.addobj(page) self.addobj(content) self.addobj(image) @@ -195,22 +197,22 @@ class pdfdoc(object): xreftable = list() - result = ("%%PDF-1.%d\n"%self.version).encode("utf8") + result = ("%%PDF-1.%d\n"%self.version).encode() xreftable.append(b"0000000000 65535 f \n") for o in self.objects: - xreftable.append(("%010d 00000 n \n"%len(result)).encode("utf8")) + xreftable.append(("%010d 00000 n \n"%len(result)).encode()) result += o.tostring() xrefoffset = len(result) result += b"xref\n" - result += ("0 %d\n"%len(xreftable)).encode("utf8") + result += ("0 %d\n"%len(xreftable)).encode() for x in xreftable: result += x result += b"trailer\n" - result += parse({"/Size": len(xreftable), "/Info": self.info, "/Root": self.catalog})+b"\n" + result += parse({b"/Size": len(xreftable), b"/Info": self.info, b"/Root": self.catalog})+b"\n" result += b"startxref\n" - result += ("%d\n"%xrefoffset).encode("utf8") + result += ("%d\n"%xrefoffset).encode() result += b"%%EOF\n" return result @@ -508,6 +510,17 @@ def valid_size(string): return (x, y, pagesize_options) +# in python3, the received argument will be a unicode str() object which needs +# to be encoded into a bytes() object +# in python2, the received argument will be a binary str() object which needs +# no encoding +# we check whether we use python2 or python3 by checking whether the argument +# is both, type str and type bytes (only the case in python2) +def pdf_embedded_string(string): + if type(string) is str and type(string) is not bytes: + string = string.encode("utf8") + return string + parser = argparse.ArgumentParser( description='Lossless conversion/embedding of images (in)to pdf') parser.add_argument( @@ -540,16 +553,16 @@ sizeopts.add_argument( ) parser.add_argument( - '-t', '--title', metavar='title', type=str, + '-t', '--title', metavar='title', type=pdf_embedded_string, help='title for metadata') parser.add_argument( - '-a', '--author', metavar='author', type=str, + '-a', '--author', metavar='author', type=pdf_embedded_string, help='author for metadata') parser.add_argument( - '-c', '--creator', metavar='creator', type=str, + '-c', '--creator', metavar='creator', type=pdf_embedded_string, help='creator for metadata') parser.add_argument( - '-p', '--producer', metavar='producer', type=str, + '-p', '--producer', metavar='producer', type=pdf_embedded_string, help='producer for metadata') parser.add_argument( '-r', '--creationdate', metavar='creationdate', type=valid_date, @@ -558,13 +571,13 @@ parser.add_argument( '-m', '--moddate', metavar='moddate', type=valid_date, help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format') parser.add_argument( - '-S', '--subject', metavar='subject', type=str, + '-S', '--subject', metavar='subject', type=pdf_embedded_string, help='subject for metadata') parser.add_argument( - '-k', '--keywords', metavar='kw', type=str, nargs='+', + '-k', '--keywords', metavar='kw', type=pdf_embedded_string, nargs='+', help='keywords for metadata') parser.add_argument( - '-C', '--colorspace', metavar='colorspace', type=str, + '-C', '--colorspace', metavar='colorspace', type=pdf_embedded_string, help='force PIL colorspace (one of: RGB, L, 1, CMYK, CMYK;I)') parser.add_argument( '-D', '--nodate', help='do not add timestamps', action="store_true") From 5a1f0701a31600881de1a346d920d60900286008 Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 13 Mar 2015 13:05:23 +0100 Subject: [PATCH 46/53] to allow non-ascii characters, encode strings as utf-16-be (big endian) and escape backslashes and parenthesis --- CHANGES.rst | 1 + src/img2pdf.py | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 29dc9cb..9884655 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,6 +7,7 @@ CHANGES - replace -x and -y option by combined option -s (or --pagesize) and use -S for --subject + - correctly encode and escape non-ascii metadata 0.1.5 ----- diff --git a/src/img2pdf.py b/src/img2pdf.py index 91348af..8b7a1fb 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -518,7 +518,15 @@ def valid_size(string): # is both, type str and type bytes (only the case in python2) def pdf_embedded_string(string): if type(string) is str and type(string) is not bytes: - string = string.encode("utf8") + # py3 + pass + else: + # py2 + string = string.decode("utf8") + string = b"\xfe\xff"+string.encode("utf-16-be") + string = string.replace(b'\\', b'\\\\') + string = string.replace(b'(', b'\\(') + string = string.replace(b')', b'\\)') return string parser = argparse.ArgumentParser( From 81647dd06fff9f30b1143b1af0917d0b0e7e19d4 Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 13 Mar 2015 13:09:02 +0100 Subject: [PATCH 47/53] use common variable for datetime format string --- src/img2pdf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 8b7a1fb..5c30b2b 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -95,14 +95,17 @@ class pdfdoc(object): info[b"/Creator"] = b"("+creator+b")" if producer: info[b"/Producer"] = b"("+producer+b")" + + datetime_formatstring = "%Y%m%d%H%M%S" if creationdate: - info[b"/CreationDate"] = b"(D:"+creationdate.strftime("%Y%m%d%H%M%S").encode()+b")" + info[b"/CreationDate"] = b"(D:"+creationdate.strftime(datetime_formatstring).encode()+b")" elif not nodate: - info[b"/CreationDate"] = b"(D:"+now.strftime("%Y%m%d%H%M%S").encode()+b")" + info[b"/CreationDate"] = b"(D:"+now.strftime(datetime_formatstring).encode()+b")" if moddate: - info[b"/ModDate"] = b"(D:"+moddate.strftime("%Y%m%d%H%M%S").encode()+b")" + info[b"/ModDate"] = b"(D:"+moddate.strftime(datetime_formatstring).encode()+b")" elif not nodate: - info[b"/ModDate"] = b"(D:"+now.strftime("%Y%m%d%H%M%S").encode()+b")" + info[b"/ModDate"] = b"(D:"+now.strftime(datetime_formatstring).encode()+b")" + if subject: info[b"/Subject"] = b"("+subject+b")" if keywords: From f3674907d622ae03a12d744faf42bbd870ac6dba Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 13 Mar 2015 14:29:53 +0100 Subject: [PATCH 48/53] store times in UTC and understand YYYY-MM-DD, YYYY-MM-DDTHH:MM, YYYY-MM-DDTHH:MM:SS and everything understood by dateutil module and date --date --- CHANGES.rst | 2 ++ src/img2pdf.py | 55 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9884655..ec2a745 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -8,6 +8,8 @@ CHANGES - replace -x and -y option by combined option -s (or --pagesize) and use -S for --subject - correctly encode and escape non-ascii metadata + - explicitly store date in UTC and allow parsing all date formats understood + by dateutil and `date --date` 0.1.5 ----- diff --git a/src/img2pdf.py b/src/img2pdf.py index 5c30b2b..ffa95e1 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -43,6 +43,9 @@ def error_out(message): def warning_out(message): sys.stderr.write("W: "+message+"\n") +def datetime_to_pdfdate(dt): + return dt.strftime("%Y%m%d%H%M%SZ") + def parse(cont, indent=1): if type(cont) is dict: return b"<<\n"+b"\n".join( @@ -95,17 +98,14 @@ class pdfdoc(object): info[b"/Creator"] = b"("+creator+b")" if producer: info[b"/Producer"] = b"("+producer+b")" - - datetime_formatstring = "%Y%m%d%H%M%S" if creationdate: - info[b"/CreationDate"] = b"(D:"+creationdate.strftime(datetime_formatstring).encode()+b")" + info[b"/CreationDate"] = b"(D:"+datetime_to_pdfdate(creationdate).encode()+b")" elif not nodate: - info[b"/CreationDate"] = b"(D:"+now.strftime(datetime_formatstring).encode()+b")" + info[b"/CreationDate"] = b"(D:"+datetime_to_pdfdate(now).encode()+b")" if moddate: - info[b"/ModDate"] = b"(D:"+moddate.strftime(datetime_formatstring).encode()+b")" + info[b"/ModDate"] = b"(D:"+datetime_to_pdfdate(moddate).encode()+b")" elif not nodate: - info[b"/ModDate"] = b"(D:"+now.strftime(datetime_formatstring).encode()+b")" - + info[b"/ModDate"] = b"(D:"+datetime_to_pdfdate(now).encode()+b")" if subject: info[b"/Subject"] = b"("+subject+b")" if keywords: @@ -374,7 +374,42 @@ def positive_float(string): return value def valid_date(string): - return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S") + # first try parsing in ISO8601 format + try: + return datetime.strptime(string, "%Y-%m-%d") + except ValueError: + pass + try: + return datetime.strptime(string, "%Y-%m-%dT%H:%M") + except ValueError: + pass + try: + return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S") + except ValueError: + pass + # then try dateutil + try: + from dateutil import parser + except ImportError: + pass + else: + try: + return parser.parse(string) + except TypeError: + pass + # as a last resort, try the local date utility + try: + import subprocess + except ImportError: + pass + else: + try: + utime = subprocess.check_output(["date", "--date", string, "+%s"]) + except subprocess.CalledProcessError: + pass + else: + return datetime.utcfromtimestamp(int(utime)) + raise argparse.ArgumentTypeError("cannot parse date: %s"%string) def get_standard_papersize(string): papersizes = { @@ -577,10 +612,10 @@ parser.add_argument( help='producer for metadata') parser.add_argument( '-r', '--creationdate', metavar='creationdate', type=valid_date, - help='creation date for metadata in YYYY-MM-DDTHH:MM:SS format') + help='UTC creation date for metadata in YYYY-MM-DD or YYYY-MM-DDTHH:MM or YYYY-MM-DDTHH:MM:SS format or any format understood by python dateutil module or any format understood by `date --date`') parser.add_argument( '-m', '--moddate', metavar='moddate', type=valid_date, - help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format') + help='UTC modification date for metadata in YYYY-MM-DD or YYYY-MM-DDTHH:MM or YYYY-MM-DDTHH:MM:SS format or any format understood by python dateutil module or any format understood by `date --date`') parser.add_argument( '-S', '--subject', metavar='subject', type=pdf_embedded_string, help='subject for metadata') From 4968d5862117446b040094d20dae551685fa3ef0 Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 13 Mar 2015 14:33:45 +0100 Subject: [PATCH 49/53] fix typo --- src/img2pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index ffa95e1..c3cf5e1 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -162,7 +162,7 @@ class pdfdoc(object): b"/Width": width, b"/Height": height, b"/ColorSpace": colorspace, - # hardcoded as PIL doesnt provide bits for non-jpeg formats + # hardcoded as PIL doesn't provide bits for non-jpeg formats b"/BitsPerComponent": 8, b"/Length": len(imgdata) }, imgdata) From 18a41fc8df615007624c00322d1d023091646256 Mon Sep 17 00:00:00 2001 From: josch Date: Fri, 13 Mar 2015 14:43:24 +0100 Subject: [PATCH 50/53] avoid catch-all except --- src/img2pdf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index c3cf5e1..657a63c 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -233,11 +233,10 @@ def convert(images, dpi=None, pagesize=(None, None, None), title=None, debug_out("Reading %s"%imfilename, verbose) try: rawdata = imfilename.read() - im = cStringIO(rawdata) - except: + except AttributeError: with open(imfilename, "rb") as im: rawdata = im.read() - im = cStringIO(rawdata) + im = cStringIO(rawdata) try: imgdata = Image.open(im) except IOError as e: From 33ee44e50de9ab9be15ed37cc0220a01defd9213 Mon Sep 17 00:00:00 2001 From: josch Date: Sun, 12 Apr 2015 20:37:57 +0200 Subject: [PATCH 51/53] restore python3 compatibility --- src/img2pdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/img2pdf.py b/src/img2pdf.py index 657a63c..0293e7b 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -510,10 +510,10 @@ def valid_size(string): try: value[0] = float(value[1]) value[0] *= units[value[2]] # convert to points - except ValueError, e: + except ValueError: # assign None if width or height not provided value[0] = None - except KeyError, e: + except KeyError: # if units unrecognized, raise error # otherwise default to pt because units not provided if value[2]: From b7aa09834cb2ce3e05805d5a4fa60fa8a931e322 Mon Sep 17 00:00:00 2001 From: josch Date: Sat, 9 May 2015 07:39:50 +0200 Subject: [PATCH 52/53] README.md: update treatment of non-jpeg files --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 64ec83f..8d2053b 100644 --- a/README.md +++ b/README.md @@ -19,12 +19,15 @@ code into the garbage bin :D Functionality ------------- -This program will take a list of images and produce a PDF file with the -images embedded in it. JPEG and JPEG2000 images will be included without +This program will take a list of images and produce a PDF file with the images +embedded in it. JPEG and JPEG2000 images will be included without recompression. Images in other formats will be included with zip/flate -encoding. As a result, this tool is able to losslessly wrap any image -into a PDF container with a quality-filesize ratio that is typically better -than that of existing tools. +encoding which usually leads to an increase in the resulting size because +formats like png compress better than PDF which just zip/flate compresses the +RGB data. As a result, this tool is able to losslessly wrap images into a PDF +container with a quality-filesize ratio that is typically better (in case of +JPEG and JPEG2000 images) or equal (in case of other formats) than that of +existing tools. For example, imagemagick will re-encode the input JPEG image (thus changing its content): @@ -51,10 +54,7 @@ save other graphics formats using lossless zip compression, and produce multi-page PDF files when more than one input image is given. Also, since JPEG and JPEG2000 images are not reencoded, conversion with -img2pdf is several (ten to hundred) times faster than with imagemagick. -While the above convert command with a 2.8MB JPEG took 27 seconds -(on average) on my machine, conversion using img2pdf took just a -fraction of a second. +img2pdf is several times faster than with other tools. Usage From fdee171d404f2b8891794fb503866c461d77a5d0 Mon Sep 17 00:00:00 2001 From: josch Date: Sat, 9 May 2015 20:52:07 +0200 Subject: [PATCH 53/53] README.md: add more wishlist bugs --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 8d2053b..74f24b6 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,19 @@ I have not yet figured out how to determine the colorspace of JPEG2000 files. Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 files with other colorspaces, you must force it using the `--colorspace` option. +It might be possible to store transparency using masks but it is not clear +what the utility of such a functionality would be. + +Most vector graphic formats can be losslessly turned into PDF (minus some of +the features unsupported by PDF) but img2pdf will currently turn vector +graphics into their lossy raster representations. + +Acrobat is able to store a hint for the PDF reader of how to present the PDF +when opening it. Things like automatic fullscreen or the zoom level can be +configured. + +It would be nice if a single input image could be read from standard input. + Installation ------------