img2pdf/src/img2pdf.py

393 lines
14 KiB
Python
Raw Normal View History

#!/usr/bin/env python2
2014-03-30 06:10:12 +00:00
# Copyright (C) 2012-2014 Johannes 'josch' Schauer <j.schauer at email.de>
2013-05-02 06:17:13 +00:00
#
2014-03-30 06:10:12 +00:00
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation, either
# version 3 of the License, or (at your option) any later
# version.
2013-05-02 06:17:13 +00:00
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
2014-03-30 06:10:12 +00:00
# You should have received a copy of the GNU General Public
# License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
2012-03-29 09:08:32 +00:00
import sys
import zlib
import argparse
2012-03-29 09:53:57 +00:00
import struct
2014-03-01 04:51:53 +00:00
from PIL import Image
2012-03-29 09:08:32 +00:00
from datetime import datetime
2013-10-21 13:55:47 +00:00
from jp2 import parsejp2
2015-02-16 06:39:07 +00:00
try:
from cStringIO import cStringIO
except ImportError:
from io import BytesIO as cStringIO
2012-03-29 09:08:32 +00:00
2014-03-01 04:51:53 +00:00
# XXX: Switch to use logging module.
def debug_out(message, verbose=True):
    """Write *message* to stderr with a ``D: `` prefix when *verbose* is truthy."""
    if not verbose:
        return
    line = "".join(("D: ", message, "\n"))
    sys.stderr.write(line)
def error_out(message):
    """Write *message* to stderr with an ``E: `` prefix and a trailing newline."""
    line = "".join(("E: ", message, "\n"))
    sys.stderr.write(line)
def warning_out(message):
    """Write *message* to stderr with a ``W: `` prefix and a trailing newline."""
    line = "".join(("W: ", message, "\n"))
    sys.stderr.write(line)
2012-03-29 09:08:32 +00:00
def parse(cont, indent=1):
    """Serialize a Python value into PDF object syntax and return it as bytes.

    Supported values:
      * dict        -> ``<< ... >>`` dictionary, keys sorted, one entry per
                       line, indented by 4 spaces per nesting level
      * bool        -> ``true`` / ``false``
      * int, float  -> their ``str()`` representation
      * bytes       -> emitted unchanged
      * str         -> UTF-8 encoded, emitted verbatim (callers supply names
                       such as ``/Type`` or already-parenthesised strings)
      * list, tuple -> ``[ a b c ]``
      * obj         -> indirect reference ``N 0 R`` using ``cont.identifier``

    ``indent`` is the nesting depth used for dictionary indentation.

    Raises TypeError for any other type instead of silently returning None
    (which previously surfaced later as an unrelated concatenation error).
    """
    if isinstance(cont, dict):
        pad = 4 * indent * b" "
        entries = [
            pad + key.encode("utf8") + b" " + parse(value, indent + 1)
            for key, value in sorted(cont.items())
        ]
        closing_pad = 4 * (indent - 1) * b" "
        return b"<<\n" + b"\n".join(entries) + b"\n" + closing_pad + b">>"
    # bool must be tested before int: bool is a subclass of int and
    # str(True) would yield "True", which is not a PDF keyword
    if isinstance(cont, bool):
        return b"true" if cont else b"false"
    if isinstance(cont, (int, float)):
        return str(cont).encode("utf8")
    # bytes before str: on Python 2 both are the same type and the value
    # must be passed through without an implicit ASCII decode
    if isinstance(cont, bytes):
        return cont
    if isinstance(cont, str):
        return cont.encode("utf8")
    if isinstance(cont, (list, tuple)):
        return b"[ " + b" ".join([parse(c, indent) for c in cont]) + b" ]"
    if isinstance(cont, obj):
        return ("%d 0 R" % cont.identifier).encode("utf8")
    raise TypeError(
        "cannot serialize value of type %s to PDF" % type(cont).__name__)
2012-03-29 09:08:32 +00:00
class obj(object):
    """One indirect PDF object: a body value plus optional raw stream bytes.

    The ``identifier`` attribute (the PDF object number) is not set here; it
    is assigned from outside (``pdfdoc.addobj`` does so) and must exist
    before ``tostring`` is called.
    """

    def __init__(self, content, stream=None):
        self.stream = stream
        self.content = content

    def tostring(self):
        """Return the serialized ``N 0 obj ... endobj`` form as bytes."""
        if not self.stream:
            # no (or empty) stream: header, body and terminator on one line
            header = ("%d 0 obj " % self.identifier).encode("utf8")
            return header + parse(self.content) + b" endobj\n"
        pieces = [
            ("%d 0 obj " % self.identifier).encode("utf8"),
            parse(self.content),
            b"\nstream\n",
            self.stream,
            b"\nendstream\nendobj\n",
        ]
        return b"".join(pieces)
2012-03-29 09:08:32 +00:00
class pdfdoc(object):
    """Minimal in-memory PDF writer that puts one image on each page.

    Objects are kept in ``self.objects`` in insertion order; an object's
    1-based position in that list is its PDF object number.
    """

    def __init__(self, version=3, title=None, author=None, creator=None,
                 producer=None, creationdate=None, moddate=None, subject=None,
                 keywords=None, nodate=False):
        """Create the catalog, the (initially empty) page tree and the info dict.

        version      -- PDF minor version; 3 produces a ``%PDF-1.3`` header
        title, author, creator, producer, subject -- optional metadata strings
        creationdate, moddate -- optional datetime objects; when omitted the
                        current time is used unless ``nodate`` is true
        keywords     -- optional iterable of strings, joined with ","
        nodate       -- when true, no default timestamps are written
        """
        self.version = version  # default pdf version 1.3
        self.objects = []
        now = datetime.now()

        info = {}
        if title:
            info["/Title"] = self._literal(title)
        if author:
            info["/Author"] = self._literal(author)
        if creator:
            info["/Creator"] = self._literal(creator)
        if producer:
            info["/Producer"] = self._literal(producer)
        if creationdate:
            info["/CreationDate"] = self._date(creationdate)
        elif not nodate:
            info["/CreationDate"] = self._date(now)
        if moddate:
            info["/ModDate"] = self._date(moddate)
        elif not nodate:
            info["/ModDate"] = self._date(now)
        if subject:
            info["/Subject"] = self._literal(subject)
        if keywords:
            info["/Keywords"] = self._literal(",".join(keywords))
        self.info = obj(info)

        # create an incomplete pages object so that a /Parent entry can be
        # added to each page
        self.pages = obj({
            "/Type": "/Pages",
            "/Kids": [],
            "/Count": 0
        })

        self.catalog = obj({
            "/Pages": self.pages,
            "/Type": "/Catalog"
        })
        self.addobj(self.catalog)
        self.addobj(self.pages)

    @staticmethod
    def _literal(text):
        """Wrap *text* as a PDF literal string, escaping ``\\``, ``(`` and ``)``.

        Without escaping, an unbalanced parenthesis or a backslash in user
        supplied metadata would corrupt the info dictionary.
        """
        escaped = (text.replace("\\", "\\\\")
                       .replace("(", "\\(")
                       .replace(")", "\\)"))
        return "(" + escaped + ")"

    @staticmethod
    def _date(moment):
        """Format a datetime as a PDF date string ``(D:YYYYMMDDHHMMSS)``."""
        return "(D:" + moment.strftime("%Y%m%d%H%M%S") + ")"

    def addobj(self, obj):
        """Append *obj* to the document and give it the next object number."""
        obj.identifier = len(self.objects) + 1
        self.objects.append(obj)

    def addimage(self, color, width, height, imgformat, imgdata, pdf_x, pdf_y):
        """Add a page of size ``pdf_x`` x ``pdf_y`` points showing one image.

        color     -- one of 'L', 'RGB', 'CMYK', 'CMYK;I'; anything else prints
                     an error and exits with status 1
        width, height -- image size in pixels
        imgformat -- "JPEG" and "JPEG2000" data is embedded as-is; any other
                     value means *imgdata* is zlib-deflated raw samples
        imgdata   -- bytes of the image stream
        """
        colorspaces = {
            'L': "/DeviceGray",
            'RGB': "/DeviceRGB",
            'CMYK': "/DeviceCMYK",
            'CMYK;I': "/DeviceCMYK",
        }
        colorspace = colorspaces.get(color)
        if colorspace is None:
            error_out("unsupported color space: %s" % color)
            sys.exit(1)

        if pdf_x < 3.00 or pdf_y < 3.00:
            warning_out("pdf width or height is below 3.00 - decrease the dpi")

        # either embed the whole jpeg or deflate the bitmap representation
        # (compare by value: identity comparison of strings only works by
        # accident of interning)
        if imgformat == "JPEG":
            ofilter = ["/DCTDecode"]
        elif imgformat == "JPEG2000":
            ofilter = ["/JPXDecode"]
            self.version = 5  # jpeg2000 needs pdf 1.5
        else:
            ofilter = ["/FlateDecode"]

        image = obj({
            "/Type": "/XObject",
            "/Subtype": "/Image",
            "/Filter": ofilter,
            "/Width": width,
            "/Height": height,
            "/ColorSpace": colorspace,
            # hardcoded as PIL doesnt provide bits for non-jpeg formats
            "/BitsPerComponent": 8,
            "/Length": len(imgdata)
        }, imgdata)

        if color == 'CMYK;I':
            # Inverts all four channels
            image.content['/Decode'] = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]

        # content stream: scale the unit square to the page size, paint image
        text = ("q\n%f 0 0 %f 0 0 cm\n/Im0 Do\nQ" % (pdf_x, pdf_y)).encode('utf8')

        content = obj({
            "/Length": len(text)
        }, text)

        page = obj({
            "/Type": "/Page",
            "/Parent": self.pages,
            "/Resources": {
                "/XObject": {
                    "/Im0": image
                }
            },
            "/MediaBox": [0, 0, pdf_x, pdf_y],
            "/Contents": content
        })
        self.pages.content["/Kids"].append(page)
        self.pages.content["/Count"] += 1
        self.addobj(page)
        self.addobj(content)
        self.addobj(image)

    def tostring(self):
        """Return the complete PDF file as bytes.

        The info dictionary is written as the last object. It is numbered on
        the fly rather than appended to ``self.objects``, so calling this
        method more than once yields the same output each time.
        """
        self.info.identifier = len(self.objects) + 1
        objects = self.objects + [self.info]

        header = ("%%PDF-1.%d\n" % self.version).encode("utf8")
        chunks = [header]
        offset = len(header)  # byte offset of the next object in the file

        xreftable = [b"0000000000 65535 f \n"]
        for o in objects:
            xreftable.append(("%010d 00000 n \n" % offset).encode("utf8"))
            data = o.tostring()
            chunks.append(data)
            offset += len(data)

        xrefoffset = offset
        chunks.append(b"xref\n")
        chunks.append(("0 %d\n" % len(xreftable)).encode("utf8"))
        chunks.extend(xreftable)
        chunks.append(b"trailer\n")
        chunks.append(parse({
            "/Size": len(xreftable),
            "/Info": self.info,
            "/Root": self.catalog,
        }) + b"\n")
        chunks.append(b"startxref\n")
        chunks.append(("%d\n" % xrefoffset).encode("utf8"))
        chunks.append(b"%%EOF\n")
        # join once instead of repeatedly re-copying a growing bytes object
        return b"".join(chunks)
2012-03-29 09:08:32 +00:00
def _read_input(source):
    """Return the raw bytes of *source*, a file-like object or a path."""
    try:
        return source.read()
    except AttributeError:
        # no .read(): treat it as a filename
        with open(source, "rb") as handle:
            return handle.read()


def convert(images, dpi=None, x=None, y=None, title=None, author=None,
            creator=None, producer=None, creationdate=None, moddate=None,
            subject=None, keywords=None, colorspace=None, verbose=False,
            nodate=False):
    """Build a PDF with one page per input image and return it as bytes.

    images     -- iterable of file names or objects with a ``read()`` method
    dpi        -- force this resolution instead of the image's own
                  (fallback when the image has none: 96)
    x, y       -- page width / height in points; if only one is given the
                  other is derived from the image's aspect ratio
    title .. keywords, creationdate, moddate, nodate -- passed to ``pdfdoc``
    colorspace -- force this PIL mode name instead of the detected one
    verbose    -- emit debug messages on stderr

    JPEG and JPEG2000 data is embedded unchanged; every other format is
    decoded and its samples are zlib-compressed. Unreadable input or an
    unsupported combination prints an error and exits with status 1.
    """
    pdf = pdfdoc(3, title, author, creator, producer, creationdate,
                 moddate, subject, keywords, nodate)

    for imfilename in images:
        debug_out("Reading %s" % imfilename, verbose)
        rawdata = _read_input(imfilename)
        im = cStringIO(rawdata)
        try:
            imgdata = Image.open(im)
        except IOError as e:
            # test if it is a jpeg2000 image (bytes literal so that the
            # comparison also works on Python 3)
            if rawdata[:12] != b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
                error_out("cannot read input image (not jpeg2000)")
                error_out("PIL: %s" % e)
                sys.exit(1)
            # image is jpeg2000
            width, height, ics = parsejp2(rawdata)
            imgformat = "JPEG2000"

            if dpi:
                ndpi = dpi, dpi
                debug_out("input dpi (forced) = %d x %d" % ndpi, verbose)
            else:
                ndpi = (96, 96)  # TODO: read real dpi
                debug_out("input dpi = %d x %d" % ndpi, verbose)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s" % (color), verbose)
            else:
                color = ics
                debug_out("input colorspace = %s" % (ics), verbose)
        else:
            width, height = imgdata.size
            imgformat = imgdata.format

            if dpi:
                ndpi = dpi, dpi
                debug_out("input dpi (forced) = %d x %d" % ndpi, verbose)
            else:
                ndpi = imgdata.info.get("dpi", (96, 96))
                if not (ndpi[0] > 0 and ndpi[1] > 0):
                    # a zero dpi in the metadata would divide by zero below
                    warning_out("invalid dpi in image metadata - using 96")
                    ndpi = (96, 96)
                debug_out("input dpi = %d x %d" % ndpi, verbose)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s" % (color), verbose)
            else:
                color = imgdata.mode
                if color == "CMYK" and imgformat == "JPEG":
                    # Adobe inverts CMYK JPEGs for some reason, and others
                    # have followed suit as well. Some software assumes the
                    # JPEG is inverted if the Adobe tag (APP14), while other
                    # software assumes all CMYK JPEGs are inverted. I don't
                    # have enough experience with these to know which is
                    # better for images currently in the wild, so I'm going
                    # with the first approach for now.
                    if "adobe" in imgdata.info:
                        color = "CMYK;I"
                debug_out("input colorspace = %s" % (color), verbose)

        debug_out("width x height = %d x %d" % (width, height), verbose)
        debug_out("imgformat = %s" % imgformat, verbose)

        # depending on the input format, determine whether to pass the raw
        # image or the zlib compressed color information
        if imgformat == "JPEG" or imgformat == "JPEG2000":
            if color == '1':
                error_out("jpeg can't be monochrome")
                sys.exit(1)
            imgdata = rawdata
        else:
            # because we do not support /CCITTFaxDecode
            if color == '1':
                debug_out("Converting colorspace 1 to L", verbose)
                imgdata = imgdata.convert('L')
                color = 'L'
            elif color in ("RGB", "L", "CMYK", "CMYK;I"):
                debug_out("Colorspace is OK: %s" % color, verbose)
            else:
                debug_out("Converting colorspace %s to RGB" % color, verbose)
                imgdata = imgdata.convert('RGB')
                color = imgdata.mode
            # tostring() was renamed to tobytes() in Pillow and later
            # removed; fall back to the old name only when needed
            to_bytes = getattr(imgdata, "tobytes", None) or imgdata.tostring
            imgdata = zlib.compress(to_bytes())

        # pdf units = 1/72 inch
        if not x and not y:
            pdf_x, pdf_y = 72.0*width/ndpi[0], 72.0*height/ndpi[1]
        elif not y:
            # float() keeps integer arguments from floor-dividing on Python 2
            pdf_x, pdf_y = x, float(x)*height/width
        elif not x:
            pdf_x, pdf_y = float(y)*width/height, y
        else:
            pdf_x = x
            pdf_y = y

        pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y)

    return pdf.tostring()
2012-03-29 09:08:32 +00:00
def positive_float(string):
    """argparse ``type=`` callable: parse *string* as a finite float > 0.

    Raises argparse.ArgumentTypeError when the value is zero, negative,
    NaN or infinite. A string that is not a number at all raises ValueError
    from ``float()``, which argparse also reports as an invalid value.
    """
    import math

    value = float(string)
    # NaN compares false against everything, so "value <= 0" alone would
    # let it (and +inf) through
    if math.isnan(value) or math.isinf(value):
        raise argparse.ArgumentTypeError("%r is not a finite number" % string)
    if value <= 0:
        msg = "%r is not positive" % string
        raise argparse.ArgumentTypeError(msg)
    return value
def valid_date(string):
    """argparse ``type=`` callable: parse ``YYYY-MM-DDTHH:MM:SS`` into a datetime.

    A string that does not match the format raises ValueError from
    ``datetime.strptime``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    parsed = datetime.strptime(string, timestamp_format)
    return parsed
# Command line interface definition. The parser is built at import time so
# that main() (and anything importing this module) can reuse it.
parser = argparse.ArgumentParser(
    description='Lossless conversion/embedding of images (in)to pdf')
parser.add_argument(
    'images', metavar='infile', type=str,
    nargs='+', help='input file(s)')
parser.add_argument(
    '-o', '--output', metavar='out', type=argparse.FileType('wb'),
    # the PDF is written as bytes: on Python 3 sys.stdout is a text stream
    # and only its .buffer accepts bytes; Python 2 has no .buffer
    default=getattr(sys.stdout, "buffer", sys.stdout),
    help='output file (default: stdout)')
parser.add_argument(
    '-d', '--dpi', metavar='dpi', type=positive_float,
    help='dpi for pdf output (default: 96.0)')
parser.add_argument(
    '-x', metavar='pdf_x', type=positive_float,
    help='output width in points')
parser.add_argument(
    '-y', metavar='pdf_y', type=positive_float,
    help='output height in points')
parser.add_argument(
    '-t', '--title', metavar='title', type=str,
    help='title for metadata')
parser.add_argument(
    '-a', '--author', metavar='author', type=str,
    help='author for metadata')
parser.add_argument(
    '-c', '--creator', metavar='creator', type=str,
    help='creator for metadata')
parser.add_argument(
    '-p', '--producer', metavar='producer', type=str,
    help='producer for metadata')
parser.add_argument(
    '-r', '--creationdate', metavar='creationdate', type=valid_date,
    help='creation date for metadata in YYYY-MM-DDTHH:MM:SS format')
parser.add_argument(
    '-m', '--moddate', metavar='moddate', type=valid_date,
    help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format')
parser.add_argument(
    '-s', '--subject', metavar='subject', type=str,
    help='subject for metadata')
parser.add_argument(
    '-k', '--keywords', metavar='kw', type=str, nargs='+',
    help='keywords for metadata')
parser.add_argument(
    '-C', '--colorspace', metavar='colorspace', type=str,
    help='force PIL colorspace (one of: RGB, L, 1, CMYK, CMYK;I)')
parser.add_argument(
    '-D', '--nodate', help='do not add timestamps', action="store_true")
parser.add_argument(
    '-v', '--verbose', help='verbose mode', action="store_true")
def main(args=None):
    """Command line entry point.

    args -- list of argument strings; defaults to ``sys.argv[1:]``.

    Parses the arguments with the module-level ``parser``, converts the
    given images and writes the resulting PDF to the selected output.
    """
    argv = sys.argv[1:] if args is None else args
    options = parser.parse_args(argv)
    # NOTE(review): options.nodate is parsed but not forwarded by this call,
    # so -D/--nodate has no effect on the output.
    pdf_bytes = convert(
        options.images, options.dpi, options.x, options.y,
        options.title, options.author, options.creator, options.producer,
        options.creationdate, options.moddate, options.subject,
        options.keywords, options.colorspace, options.verbose)
    options.output.write(pdf_bytes)


if __name__ == '__main__':
    main()