You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
img2pdf/img2pdf.py

265 lines
9.5 KiB
Python

12 years ago
#!/usr/bin/env python
#
# Copyright (C) 2012-2013 Johannes 'josch' Schauer <j.schauer at email.de>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
12 years ago
import Image
import sys
import zlib
import argparse
12 years ago
import struct
12 years ago
from datetime import datetime
from jp2 import parsejp2
12 years ago
def parse(cont, indent=1):
    """Serialize a Python value into PDF object syntax.

    Supported values:
      * dict  -> a PDF dictionary ``<< ... >>`` with one "key value" pair per
        line; nested values are indented by four spaces per level
      * bool  -> the PDF keywords ``true`` / ``false``
      * int, float -> their ``str()`` representation
      * str   -> emitted verbatim (the caller supplies names such as
        ``/Type`` or already-delimited literal strings)
      * list  -> a PDF array ``[ a b c ]``
      * obj   -> an indirect reference ``N 0 R`` using the identifier that
        was assigned to the object by ``obj.tostring()``

    ``indent`` is the nesting depth used to pretty-print dictionaries.

    Raises TypeError for any other type instead of silently returning None.
    """
    if isinstance(cont, dict):
        pad = 4*indent*" "
        entries = [pad+"%s %s" % (k, parse(v, indent+1))
                   for k, v in cont.items()]
        return "<<\n"+"\n".join(entries)+"\n"+4*(indent-1)*" "+">>"
    # bool is a subclass of int, so it has to be recognised before the
    # numeric branch; str(True) would not be a valid PDF token
    if isinstance(cont, bool):
        return "true" if cont else "false"
    if isinstance(cont, (int, float)):
        return str(cont)
    if isinstance(cont, str):
        return cont
    if isinstance(cont, list):
        return "[ "+" ".join([parse(c, indent) for c in cont])+" ]"
    if isinstance(cont, obj):
        return "%d 0 R" % cont.get_identifier()
    raise TypeError("cannot serialize %r to PDF syntax" % type(cont))
class obj(object):
    """A single indirect PDF object.

    ``content`` is the value handed to ``parse()`` when the object is
    serialized (a dict in every use within this file).  ``stream`` is the
    optional stream payload emitted between the ``stream`` and ``endstream``
    keywords; ``None`` means the object has no stream part.
    """

    def __init__(self, content, stream=None):
        self.content = content
        self.stream = stream

    def tostring(self, identifier):
        """Serialize the object under object number ``identifier``.

        The identifier is stored on the instance so that later references to
        this object (see ``get_identifier``) resolve to the same number.
        """
        self.identifier = identifier
        head = "%d 0 obj " % identifier+parse(self.content)
        # compare against None so that an empty (zero length) stream is
        # still written out as a stream object
        if self.stream is not None:
            return head+"\nstream\n"+self.stream+"\nendstream\nendobj\n"
        return head+" endobj\n"

    def get_identifier(self):
        """Return the object number assigned to this object.

        Raises Exception if no identifier has been set yet, i.e. neither
        ``tostring()`` was called nor ``identifier`` assigned directly.
        """
        if not hasattr(self, 'identifier'):
            raise Exception("no id set yet, call tostring() on obj first")
        return self.identifier
def _pdf_string(text):
    """Wrap text in a PDF literal string, escaping backslashes and parentheses."""
    escaped = text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
    return "("+escaped+")"


def main(images, dpi, title=None, author=None, creator=None, producer=None,
         creationdate=None, moddate=None, subject=None, keywords=None,
         colorspace=None, verbose=False):
    """Build a PDF document with one page per input image and return it.

    Parameters:
      images       -- iterable of open file objects; each one is read(),
                      seek()ed back to 0, handed to PIL and finally close()d
      dpi          -- resolution applied to every image; when falsy the
                      resolution stored in the image is used, falling back
                      to 96
      title, author, creator, producer, subject
                   -- optional strings for the document information
                      dictionary
      creationdate, moddate
                   -- optional datetime objects; the current time is used
                      when omitted
      keywords     -- optional list of strings, joined with ","
      colorspace   -- optional PIL mode name ('RGB', 'L' or '1') overriding
                      the mode detected for each image
      verbose      -- when true, debug messages are written to stderr

    JPEG and JPEG2000 inputs are embedded unchanged (/DCTDecode and
    /JPXDecode), every other format is stored as a zlib-compressed bitmap
    (/FlateDecode).

    Returns the complete PDF file as a string.  Terminates the process with
    exit status 1 if an image cannot be read or has an unsupported color
    space.
    """
    version = 3  # default pdf version 1.3

    now = datetime.now()

    def debug_out(message):
        if verbose:
            sys.stderr.write("D: "+message+"\n")

    def fail(message):
        # diagnostics go to stderr like debug_out(): stdout is the default
        # destination of the generated PDF
        sys.stderr.write(message+"\n")
        sys.exit(1)

    date_format = "%Y%m%d%H%M%S"

    info = dict()
    if title:
        info["/Title"] = _pdf_string(title)
    if author:
        info["/Author"] = _pdf_string(author)
    if creator:
        info["/Creator"] = _pdf_string(creator)
    if producer:
        info["/Producer"] = _pdf_string(producer)
    info["/CreationDate"] = "(D:"+(creationdate or now).strftime(date_format)+")"
    info["/ModDate"] = "(D:"+(moddate or now).strftime(date_format)+")"
    if subject:
        info["/Subject"] = _pdf_string(subject)
    if keywords:
        info["/Keywords"] = _pdf_string(",".join(keywords))

    info = obj(info)

    pagestuples = list()

    # create an incomplete pages object so that a /Parent entry can be added to each page
    pages = obj({
        "/Type": "/Pages"
    })

    for im in images:
        rawdata = im.read()
        im.seek(0)
        try:
            imgdata = Image.open(im)
        except IOError as e:
            # test if it is a jpeg2000 image
            if rawdata[:12] != "\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
                fail("cannot read input image (not jpeg2000)\nPIL: %s" % e)
            # image is jpeg2000; PIL could not open it so there is no PIL
            # image object for this file
            imgdata = None
            width, height, ics = parsejp2(rawdata)
            imgformat = "JP2"
            color = colorspace if colorspace else ics
            if dpi:
                dpi_x, dpi_y = dpi, dpi
            else:
                dpi_x, dpi_y = (96, 96)  # TODO: read real dpi
        else:
            width, height = imgdata.size
            if dpi:
                dpi_x, dpi_y = dpi, dpi
            else:
                dpi_x, dpi_y = imgdata.info.get("dpi", (96, 96))
                # a zero or negative resolution in the image metadata would
                # make the page size computation below divide by zero or
                # yield a negative page size
                if dpi_x <= 0 or dpi_y <= 0:
                    dpi_x, dpi_y = (96, 96)
            imgformat = imgdata.format
            color = colorspace if colorspace else imgdata.mode

        debug_out("width x height = %d x %d" % (width, height))

        # NOTE(review): a forced colorspace only changes the /ColorSpace
        # label below; pixel data is converted for '1' only
        needs_gray = False
        if color == 'L':
            color = "/DeviceGray"
        elif color == 'RGB':
            color = "/DeviceRGB"
        elif color == '1':
            # TODO: /CCITTFaxDecode monochrome images
            needs_gray = True
            color = "/DeviceGray"
        else:
            fail("unsupported color space: %s" % color)

        pdf_x, pdf_y = 72.0*width/dpi_x, 72.0*height/dpi_y  # pdf units = 1/72 inch

        # either embed the whole jpeg or deflate the bitmap representation
        # (string equality, not identity: "is" only works by accident of
        # interning)
        if imgformat == "JPEG":
            ofilter = ["/DCTDecode"]
            payload = rawdata
        elif imgformat == "JP2":
            ofilter = ["/JPXDecode"]
            payload = rawdata
            version = 5  # jpeg2000 needs pdf 1.5
        else:
            ofilter = ["/FlateDecode"]
            # the grayscale conversion is only needed when the bitmap itself
            # is stored; embedded jpeg/jpeg2000 data is passed through as is
            bitmap = imgdata.convert('L') if needs_gray else imgdata
            payload = zlib.compress(bitmap.tostring())
        im.close()

        image = obj({
            "/Type": "/XObject",
            "/Subtype": "/Image",
            "/Filter": ofilter,
            "/Width": width,
            "/Height": height,
            "/ColorSpace": color,
            "/BitsPerComponent": 8,  # hardcoded as PIL doesnt provide bits for non-jpeg formats
            "/Length": len(payload)
        }, payload)

        # scale the unit square image to the page size and paint it
        text = "q\n%f 0 0 %f 0 0 cm\n/Im0 Do\nQ" % (pdf_x, pdf_y)

        content = obj({
            "/Length": len(text)
        }, text)

        page = obj({
            "/Type": "/Page",
            "/Parent": pages,
            "/Resources": {
                "/XObject": {
                    "/Im0": image
                }
            },
            "/MediaBox": [0, 0, pdf_x, pdf_y],
            "/Contents": content
        })

        pagestuples.append((image, content, page))

    # complete pages object with page information
    pages.content["/Kids"] = [pagetuple[2] for pagetuple in pagestuples]
    pages.content["/Count"] = len(pagestuples)

    catalog = obj({
        "/Pages": pages,
        "/Type": "/Catalog"
    })

    # Objects are serialized back to front (info first, catalog last) and the
    # list is reversed afterwards.  This way every object already carries its
    # identifier by the time another object that references it is serialized.
    # Resulting numbering: 1 catalog, 2 pages, then page/content/image
    # triples per input image, and the info dictionary last.
    pagecount = len(pagestuples)
    objects = list()
    objects.append(info.tostring(3*(pagecount+1)))

    pages.identifier = 2  # manually set it because each page references to it

    for i, (image, content, page) in enumerate(reversed(pagestuples)):
        base = 3*(pagecount-i+1)
        objects.append(image.tostring(base-1))
        objects.append(content.tostring(base-2))
        objects.append(page.tostring(base-3))

    objects.append(pages.tostring(2))
    objects.append(catalog.tostring(1))
    objects.reverse()

    header = "%%PDF-1.%d\n" % version
    chunks = [header]
    offset = len(header)

    # the cross reference table records the byte offset of every object
    xreftable = ["0000000000 65535 f \n"]
    for o in objects:
        xreftable.append("%010d 00000 n \n" % offset)
        chunks.append(o)
        offset += len(o)

    xrefoffset = offset
    chunks.append("xref\n")
    chunks.append("0 %d\n" % len(xreftable))
    chunks.extend(xreftable)

    chunks.append("trailer\n")
    chunks.append(parse({"/Size": len(xreftable), "/Info": info, "/Root": catalog})+"\n")
    chunks.append("startxref\n")
    chunks.append("%d\n" % xrefoffset)
    # no % formatting is applied here, so this is the literal two-percent
    # end-of-file marker
    chunks.append("%%EOF\n")
    return "".join(chunks)
if __name__ == "__main__":
    # --- argument validators -------------------------------------------
    def positive_float(string):
        """argparse type: parse a float and reject values that are <= 0."""
        value = float(string)
        if value <= 0:
            raise argparse.ArgumentTypeError("%r is not positive" % string)
        return value

    def valid_date(string):
        """argparse type: parse a YYYY-MM-DDTHH:MM:SS timestamp."""
        return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S")

    # --- command line definition ---------------------------------------
    parser = argparse.ArgumentParser(
        description='lossless conversion/embedding of images (in)to pdf')
    add = parser.add_argument

    add('images', metavar='infile', type=argparse.FileType('rb'),
        nargs='+', help='input file(s)')
    add('-o', '--output', metavar='out', type=argparse.FileType('wb'),
        default=sys.stdout, help='output file (default: stdout)')
    add('-d', '--dpi', metavar='dpi', type=positive_float,
        help='dpi for pdf output (default: 96.0)')
    add('-t', '--title', metavar='title', type=str,
        help='title for metadata')
    add('-a', '--author', metavar='author', type=str,
        help='author for metadata')
    add('-c', '--creator', metavar='creator', type=str,
        help='creator for metadata')
    add('-p', '--producer', metavar='producer', type=str,
        help='producer for metadata')
    add('-r', '--creationdate', metavar='creationdate', type=valid_date,
        help='creation date for metadata in YYYY-MM-DDTHH:MM:SS format')
    add('-m', '--moddate', metavar='moddate', type=valid_date,
        help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format')
    add('-s', '--subject', metavar='subject', type=str,
        help='subject for metadata')
    add('-k', '--keywords', metavar='kw', type=str, nargs='+',
        help='keywords for metadata')
    add('-C', '--colorspace', metavar='colorspace', type=str,
        help='force PIL colorspace (one of: RGB, L, 1)')
    add('-v', '--verbose', help='verbose mode', action="store_true")

    # --- run -----------------------------------------------------------
    args = parser.parse_args()
    pdf = main(
        args.images,
        args.dpi,
        title=args.title,
        author=args.author,
        creator=args.creator,
        producer=args.producer,
        creationdate=args.creationdate,
        moddate=args.moddate,
        subject=args.subject,
        keywords=args.keywords,
        colorspace=args.colorspace,
        verbose=args.verbose,
    )
    args.output.write(pdf)