1
0
Fork 0
forked from josch/img2pdf
img2pdf/img2pdf.py

275 lines
9.9 KiB
Python
Raw Normal View History

#!/usr/bin/env python
#
# Copyright (C) 2012-2013 Johannes 'josch' Schauer <j.schauer at email.de>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import Image
import sys
import zlib
import argparse
2012-03-29 09:53:57 +00:00
import struct
2012-03-29 09:08:32 +00:00
from datetime import datetime
2013-10-21 13:55:47 +00:00
from jp2 import parsejp2
2012-03-29 09:08:32 +00:00
def parse(cont, indent=1):
    """Serialize a Python value into PDF object syntax.

    Supported values:
      * dict        -> "<< key value ... >>" dictionary, one entry per line,
                       nested dictionaries indented by four more spaces
      * bool        -> "true" / "false"
      * int, float  -> their str() representation
      * str         -> emitted verbatim (the caller supplies names such as
                       "/Type" or already parenthesized literal strings)
      * list, tuple -> "[ item item ... ]" array
      * obj         -> indirect reference "N 0 R" using the identifier that
                       was assigned by obj.tostring()

    `indent` is the nesting depth used to indent dictionary entries.

    Raises TypeError for any other value instead of silently returning None,
    which would otherwise end up in the output as the text "None".
    """
    if isinstance(cont, dict):
        pad = 4*indent*" "
        entries = [pad+"%s %s"%(k, parse(v, indent+1)) for k, v in cont.items()]
        return "<<\n"+"\n".join(entries)+"\n"+4*(indent-1)*" "+">>"
    # bool must be tested before int because bool is a subclass of int
    if isinstance(cont, bool):
        return "true" if cont else "false"
    if isinstance(cont, (int, float)):
        return str(cont)
    if isinstance(cont, str):
        return cont
    if isinstance(cont, (list, tuple)):
        return "[ "+" ".join([parse(c, indent) for c in cont])+" ]"
    if isinstance(cont, obj):
        return "%d 0 R"%cont.get_identifier()
    raise TypeError("cannot serialize %r of type %s into pdf syntax"
                    % (cont, type(cont).__name__))
class obj(object):
    """One indirect PDF object: a content value plus an optional stream.

    `content` is whatever parse() can serialize (in this file always a dict).
    `stream` is the raw stream payload, or None for an object without a
    stream part.
    """

    def __init__(self, content, stream=None):
        self.content = content
        self.stream = stream

    def tostring(self, identifier):
        """Assign `identifier` to this object and return its serialized form.

        The identifier is remembered so that other objects serialized later
        can reference this one through get_identifier().
        """
        self.identifier = identifier
        head = "%d 0 obj "%identifier+parse(self.content)
        # compare against None so that an empty (zero length) stream is still
        # written as a stream object instead of a bare dictionary
        if self.stream is not None:
            return head+"\nstream\n"+self.stream+"\nendstream\nendobj\n"
        return head+" endobj\n"

    def get_identifier(self):
        """Return the identifier set by tostring().

        Raises Exception if no identifier has been assigned yet.
        """
        if not hasattr(self, 'identifier'):
            raise Exception("no id set yet, call tostring() on obj first")
        return self.identifier
def main(images, dpi, title=None, author=None, creator=None, producer=None,
         creationdate=None, moddate=None, subject=None, keywords=None,
         colorspace=None, verbose=False):
    """Build a PDF document with one page per input image and return it.

    images       -- iterable of open file objects; each one is read, rewound
                    with seek(0), handed to PIL and closed once processed
    dpi          -- resolution applied to every image; if falsy the dpi
                    stored in the image is used, falling back to 96
    title, author, creator, producer, subject
                 -- optional strings for the document information dictionary
    creationdate, moddate
                 -- optional datetime objects; the current time is used when
                    they are not given
    keywords     -- optional list of strings, joined with ","
    colorspace   -- optional PIL mode name ("RGB", "L" or "1") overriding the
                    colorspace detected from the image
    verbose      -- write debug messages to stderr

    Returns the complete PDF as a string.  An unreadable image or an
    unsupported colorspace is reported on stderr and terminates the process
    through sys.exit(1).
    """
    version = 3 # default pdf version 1.3

    now = datetime.now()

    def debug_out(message):
        if verbose:
            sys.stderr.write("D: "+message+"\n")

    def error_out(message):
        sys.stderr.write("E: "+message+"\n")

    def warning_out(message):
        sys.stderr.write("W: "+message+"\n")

    def pdf_string(text):
        # backslashes and parentheses are special inside a pdf literal string
        # and must be escaped, otherwise e.g. a title containing ")" ends the
        # string early and corrupts the info dictionary
        escaped = text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
        return "("+escaped+")"

    def pdf_date(moment):
        return "(D:"+moment.strftime("%Y%m%d%H%M%S")+")"

    info = dict()
    if title:
        info["/Title"] = pdf_string(title)
    if author:
        info["/Author"] = pdf_string(author)
    if creator:
        info["/Creator"] = pdf_string(creator)
    if producer:
        info["/Producer"] = pdf_string(producer)
    info["/CreationDate"] = pdf_date(creationdate if creationdate else now)
    info["/ModDate"] = pdf_date(moddate if moddate else now)
    if subject:
        info["/Subject"] = pdf_string(subject)
    if keywords:
        info["/Keywords"] = pdf_string(",".join(keywords))

    info = obj(info)

    pagestuples = list()

    # create an incomplete pages object so that a /Parent entry can be added to each page
    pages = obj({
        "/Type": "/Pages"
    })

    for im in images:
        rawdata = im.read()
        im.seek(0)
        try:
            imgdata = Image.open(im)
        except IOError as e:
            # test if it is a jpeg2000 image
            if rawdata[:12] != "\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
                error_out("cannot read input image (not jpeg2000)")
                error_out("PIL: %s"%e)
                sys.exit(1)
            # image is jpeg2000
            width, height, ics = parsejp2(rawdata)
            imgformat = "JP2"

            if colorspace:
                color = colorspace
            else:
                color = ics
                debug_out("input colorspace = %s"%(ics))

            if dpi:
                dpi_x, dpi_y = dpi, dpi
            else:
                dpi_x, dpi_y = (96, 96) # TODO: read real dpi
        else:
            width, height = imgdata.size
            imgformat = imgdata.format

            if dpi:
                dpi_x, dpi_y = dpi, dpi
            else:
                dpi_x, dpi_y = imgdata.info.get("dpi", (96, 96))
                debug_out("input dpi = %d x %d"%(dpi_x,dpi_y))
                # the page size is computed by dividing through the dpi, so a
                # zero (or negative) value stored in the image cannot be used
                if dpi_x <= 0 or dpi_y <= 0:
                    warning_out("image has an invalid dpi - using 96 instead")
                    dpi_x, dpi_y = (96, 96)

            if colorspace:
                color = colorspace
            else:
                color = imgdata.mode
                debug_out("input colorspace = %s"%(color))

        debug_out("width x height = %d x %d"%(width,height))

        if color == 'L':
            color = "/DeviceGray"
        elif color == 'RGB':
            color = "/DeviceRGB"
        elif color == '1':
            # TODO: /CCITTFaxDecode monochrome images
            # jpeg and jpeg2000 data is embedded unmodified below, so only
            # images that get deflated need (and, for jpeg2000, are able) to
            # be converted through PIL
            if imgformat not in ("JPEG", "JP2"):
                imgdata = imgdata.convert('L')
            color = "/DeviceGray"
        else:
            error_out("unsupported color space: %s"%color)
            sys.exit(1)

        pdf_x, pdf_y = 72.0*width/dpi_x, 72.0*height/dpi_y # pdf units = 1/72 inch

        if pdf_x < 3.00 or pdf_y < 3.00:
            warning_out("pdf width or height is below 3.00 - decrease the dpi")

        # either embed the whole jpeg or deflate the bitmap representation
        # (compare by value: identity of equal strings is not guaranteed)
        if imgformat == "JPEG":
            ofilter = [ "/DCTDecode" ]
            imgdata = rawdata
        elif imgformat == "JP2":
            ofilter = [ "/JPXDecode" ]
            imgdata = rawdata
            version = 5 # jpeg2000 needs pdf 1.5
        else:
            ofilter = [ "/FlateDecode" ]
            imgdata = zlib.compress(imgdata.tostring())
        im.close()

        image = obj({
            "/Type": "/XObject",
            "/Subtype": "/Image",
            "/Filter": ofilter,
            "/Width": width,
            "/Height": height,
            "/ColorSpace": color,
            "/BitsPerComponent": 8, # hardcoded as PIL doesnt provide bits for non-jpeg formats
            "/Length": len(imgdata)
        }, imgdata)

        # scale the unit square image to the page size and paint it
        text = "q\n%f 0 0 %f 0 0 cm\n/Im0 Do\nQ"%(pdf_x, pdf_y)

        content = obj({
            "/Length": len(text)
        }, text)

        page = obj({
            "/Type": "/Page",
            "/Parent": pages,
            "/Resources": {
                "/XObject": {
                    "/Im0": image
                }
            },
            "/MediaBox": [0, 0, pdf_x, pdf_y],
            "/Contents": content
        })

        pagestuples.append((image, content, page))

    # complete pages object with page information
    pages.content["/Kids"] = [ pagetuple[2] for pagetuple in pagestuples ]
    pages.content["/Count"] = len(pagestuples)

    catalog = obj({
        "/Pages": pages,
        "/Type": "/Catalog"
    })

    # objects are serialized from the highest identifier down to 1 and the
    # list is reversed afterwards; an object must be serialized (or have its
    # identifier set) before anything referencing it is serialized
    objects = list()
    objects.append(info.tostring(3*(len(pagestuples)+1)))
    pages.identifier = 2 # manually set it because each page references to it
    for i, (image, content, page) in enumerate(reversed(pagestuples)):
        objects.append(image.tostring(3*(len(pagestuples)-i+1)-1))
        objects.append(content.tostring(3*(len(pagestuples)-i+1)-2))
        objects.append(page.tostring(3*(len(pagestuples)-i+1)-3))
    objects.append(pages.tostring(2))
    objects.append(catalog.tostring(1))
    objects.reverse()

    # collect the pieces and join them once at the end; the running offset is
    # the byte position of each object as required by the xref table
    header = "%%PDF-1.%d\n"%version
    parts = [header]
    offset = len(header)

    xreftable = list()
    xreftable.append("0000000000 65535 f \n")
    for o in objects:
        xreftable.append("%010d 00000 n \n"%offset)
        parts.append(o)
        offset += len(o)

    xrefoffset = offset
    parts.append("xref\n")
    parts.append("0 %d\n"%len(xreftable))
    parts.extend(xreftable)

    parts.append("trailer\n")
    parts.append(parse({"/Size": len(xreftable), "/Info": info, "/Root": catalog})+"\n")
    parts.append("startxref\n")
    parts.append("%d\n"%xrefoffset)
    # no string formatting here: the end-of-file marker really is "%%EOF"
    parts.append("%%EOF\n")
    return "".join(parts)
# command line entry point: parse the arguments, convert the given images and
# write the resulting pdf to the chosen output
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='lossless conversion/embedding of images (in)to pdf')
    parser.add_argument('images', metavar='infile', type=argparse.FileType('rb'),
                        nargs='+', help='input file(s)')
    parser.add_argument('-o', '--output', metavar='out', type=argparse.FileType('wb'),
                        default=sys.stdout, help='output file (default: stdout)')

    def positive_float(string):
        # argparse type: a finite float greater than zero; a ValueError from
        # float() is turned into a usage error by argparse itself
        value = float(string)
        if value <= 0:
            msg = "%r is not positive"%string
            raise argparse.ArgumentTypeError(msg)
        # "nan" and "inf" are accepted by float() and pass the check above
        # but cannot be used as a resolution (nan is unequal to itself)
        if value != value or value == float("inf"):
            msg = "%r is not a finite number"%string
            raise argparse.ArgumentTypeError(msg)
        return value

    parser.add_argument('-d', '--dpi', metavar='dpi', type=positive_float, help='dpi for pdf output (default: 96.0)')
    parser.add_argument('-t', '--title', metavar='title', type=str, help='title for metadata')
    parser.add_argument('-a', '--author', metavar='author', type=str, help='author for metadata')
    parser.add_argument('-c', '--creator', metavar='creator', type=str, help='creator for metadata')
    parser.add_argument('-p', '--producer', metavar='producer', type=str, help='producer for metadata')

    def valid_date(string):
        # argparse type: parse a YYYY-MM-DDTHH:MM:SS timestamp into a datetime
        return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S")

    parser.add_argument('-r', '--creationdate', metavar='creationdate',
                        type=valid_date, help='creation date for metadata in YYYY-MM-DDTHH:MM:SS format')
    parser.add_argument('-m', '--moddate', metavar='moddate',
                        type=valid_date, help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format')
    parser.add_argument('-s', '--subject', metavar='subject', type=str, help='subject for metadata')
    parser.add_argument('-k', '--keywords', metavar='kw', type=str, nargs='+', help='keywords for metadata')
    parser.add_argument('-C', '--colorspace', metavar='colorspace', type=str, help='force PIL colorspace (one of: RGB, L, 1)')
    parser.add_argument('-v', '--verbose', help='verbose mode', action="store_true")
    args = parser.parse_args()
    args.output.write(main(args.images, args.dpi, args.title, args.author,
                           args.creator, args.producer, args.creationdate, args.moddate,
                           args.subject, args.keywords, args.colorspace, args.verbose))