major refactoring

main
josch 11 years ago
parent 50aff1474f
commit 46a510d2b9

@ -29,7 +29,7 @@ def parse(cont, indent=1):
elif type(cont) is int or type(cont) is float: elif type(cont) is int or type(cont) is float:
return str(cont) return str(cont)
elif isinstance(cont, obj): elif isinstance(cont, obj):
return "%d 0 R"%cont.get_identifier() return "%d 0 R"%cont.identifier
elif type(cont) is str: elif type(cont) is str:
return cont return cont
elif type(cont) is list: elif type(cont) is list:
@ -40,117 +40,73 @@ class obj():
self.content = content self.content = content
self.stream = stream self.stream = stream
def tostring(self):
    """Serialize this object as a PDF indirect object definition.

    Reads ``self.identifier`` for the object number, so the identifier
    must have been assigned before this is called. When ``self.stream``
    is truthy the stream payload is emitted between ``stream`` and
    ``endstream`` after the parsed content.
    """
    payload = self.stream
    head = "%d 0 obj "%self.identifier + parse(self.content)
    if payload:
        return head + "\nstream\n" + payload + "\nendstream\nendobj\n"
    return head + " endobj\n"
def get_identifier(self): class pdfdoc():
if not hasattr(self, 'identifier'): objects = list()
raise Exception("no id set yet, call tostring() on obj first")
return self.identifier
def main(images, dpi, title=None, author=None, creator=None, producer=None,
creationdate=None, moddate=None, subject=None, keywords=None,
colorspace=None, verbose=False):
version = 3 # default pdf version 1.3
now = datetime.now()
def debug_out(message):
if verbose:
sys.stderr.write("D: "+message+"\n")
def error_out(message):
sys.stderr.write("E: "+message+"\n")
def warning_out(message):
sys.stderr.write("W: "+message+"\n")
info = dict()
if title:
info["/Title"] = "("+title+")"
if author:
info["/Author"] = "("+author+")"
if creator:
info["/Creator"] = "("+creator+")"
if producer:
info["/Producer"] = "("+producer+")"
if creationdate:
info["/CreationDate"] = "(D:"+creationdate.strftime("%Y%m%d%H%M%S")+")"
else:
info["/CreationDate"] = "(D:"+now.strftime("%Y%m%d%H%M%S")+")"
if moddate:
info["/ModDate"] = "(D:"+moddate.strftime("%Y%m%d%H%M%S")+")"
else:
info["/ModDate"] = "(D:"+now.strftime("%Y%m%d%H%M%S")+")"
if subject:
info["/Subject"] = "("+subject+")"
if keywords:
info["/Keywords"] = "("+",".join(keywords)+")"
info = obj(info)
pagestuples = list()
# create an incomplete pages object so that a /Parent entry can be added to each page
pages = obj({
"/Type": "/Pages"
})
def __init__(self, version=3, title=None, author=None, creator=None, producer=None,
             creationdate=None, moddate=None, subject=None, keywords=None):
    """Create an empty document with info, page-tree and catalog objects.

    version      -- minor PDF version written to the header (3 means PDF 1.3)
    title, author, creator, producer, subject
                 -- optional strings for the info dictionary; omitted when falsy
    creationdate, moddate
                 -- optional objects with a strftime() method; the current
                    time is used when they are falsy
    keywords     -- optional iterable of strings, joined with ","

    The catalog and the page tree are registered right away, in that order.
    The info object is only created here, not registered.
    """
    # Bind a fresh list per instance. Relying on the class-level ``objects``
    # list would share one registry (and its numbering) across all documents.
    self.objects = []
    self.version = version # default pdf version 1.3
    now = datetime.now()
    datefmt = "%Y%m%d%H%M%S"

    # NOTE(review): values are wrapped in "(" ")" verbatim; parentheses or
    # backslashes inside them are not escaped.
    info = dict()
    for key, value in (("/Title", title), ("/Author", author),
                       ("/Creator", creator), ("/Producer", producer)):
        if value:
            info[key] = "("+value+")"
    info["/CreationDate"] = "(D:"+(creationdate or now).strftime(datefmt)+")"
    info["/ModDate"] = "(D:"+(moddate or now).strftime(datefmt)+")"
    if subject:
        info["/Subject"] = "("+subject+")"
    if keywords:
        info["/Keywords"] = "("+",".join(keywords)+")"

    self.info = obj(info)

    # create an incomplete pages object so that a /Parent entry can be added to each page
    self.pages = obj({
        "/Type": "/Pages",
        "/Kids": [],
        "/Count": 0
    })

    self.catalog = obj({
        "/Pages": self.pages,
        "/Type": "/Catalog"
    })
    self.addobj(self.catalog)
    self.addobj(self.pages)
def addobj(self, obj):
    """Register obj with the document and give it the next free identifier.

    Identifiers are 1-based and follow registration order. Registering an
    object that is already in ``self.objects`` is a no-op, so it is neither
    emitted twice nor renumbered.
    """
    # Compare by identity: two distinct objects may hold equal content.
    if any(existing is obj for existing in self.objects):
        return
    obj.identifier = len(self.objects)+1
    self.objects.append(obj)
def addimage(self, color, width, height, dpi, imgformat, imgdata):
if color == 'L': if color == 'L':
color = "/DeviceGray" color = "/DeviceGray"
elif color == 'RGB': elif color == 'RGB':
color = "/DeviceRGB" color = "/DeviceRGB"
elif color == '1':
# TODO: /CCITTFaxDecode monochrome images
imgdata = imgdata.convert('L')
color = "/DeviceGray"
else: else:
error_out("unsupported color space: %s"%color) error_out("unsupported color space: %s"%color)
exit(1) exit(1)
pdf_x, pdf_y = 72.0*width/dpi_x, 72.0*height/dpi_y # pdf units = 1/72 inch pdf_x, pdf_y = 72.0*width/dpi[0], 72.0*height/dpi[1] # pdf units = 1/72 inch
if pdf_x < 3.00 or pdf_y < 3.00: if pdf_x < 3.00 or pdf_y < 3.00:
warning_out("pdf width or height is below 3.00 - decrease the dpi") warning_out("pdf width or height is below 3.00 - decrease the dpi")
@ -158,16 +114,11 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None,
# either embed the whole jpeg or deflate the bitmap representation # either embed the whole jpeg or deflate the bitmap representation
if imgformat is "JPEG": if imgformat is "JPEG":
ofilter = [ "/DCTDecode" ] ofilter = [ "/DCTDecode" ]
imgdata = rawdata
elif imgformat is "JP2": elif imgformat is "JP2":
ofilter = [ "/JPXDecode" ] ofilter = [ "/JPXDecode" ]
imgdata = rawdata self.version = 5 # jpeg2000 needs pdf 1.5
version = 5 # jpeg2000 needs pdf 1.5
else: else:
ofilter = [ "/FlateDecode" ] ofilter = [ "/FlateDecode" ]
imgdata = zlib.compress(imgdata.tostring())
im.close()
image = obj({ image = obj({
"/Type": "/XObject", "/Type": "/XObject",
"/Subtype": "/Image", "/Subtype": "/Image",
@ -187,7 +138,7 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None,
page = obj({ page = obj({
"/Type": "/Page", "/Type": "/Page",
"/Parent": pages, "/Parent": self.pages,
"/Resources": { "/Resources": {
"/XObject": { "/XObject": {
"/Im0": image "/Im0": image
@ -196,50 +147,119 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None,
"/MediaBox": [0, 0, pdf_x, pdf_y], "/MediaBox": [0, 0, pdf_x, pdf_y],
"/Contents": content "/Contents": content
}) })
self.pages.content["/Kids"].append(page)
self.pages.content["/Count"] += 1
self.addobj(page)
self.addobj(content)
self.addobj(image)
def tostring(self):
    """Serialize the whole document and return it as one string.

    Output layout: header line, every registered object in registration
    order, the cross-reference table, the trailer dictionary, the
    ``startxref`` offset and the ``%%EOF`` marker.
    """
    # add info as last object; guarded so that calling tostring() more than
    # once does not register (and emit) the info object again
    if not any(o is self.info for o in self.objects):
        self.addobj(self.info)

    header = "%%PDF-1.%d\n"%self.version
    chunks = [header]
    offset = len(header)

    # entry 0 of the xref table is the head of the free list
    xreftable = ["0000000000 65535 f \n"]
    for o in self.objects:
        # each entry records the offset at which the object starts
        xreftable.append("%010d 00000 n \n"%offset)
        serialized = o.tostring()
        chunks.append(serialized)
        offset += len(serialized)

    xrefoffset = offset
    chunks.append("xref\n")
    chunks.append("0 %d\n"%len(xreftable))
    chunks.extend(xreftable)
    chunks.append("trailer\n")
    chunks.append(parse({"/Size": len(xreftable), "/Info": self.info, "/Root": self.catalog})+"\n")
    chunks.append("startxref\n")
    chunks.append("%d\n"%xrefoffset)
    chunks.append("%%EOF\n")
    return "".join(chunks)
def main(images, dpi, title=None, author=None, creator=None, producer=None,
         creationdate=None, moddate=None, subject=None, keywords=None,
         colorspace=None, verbose=False):
    """Embed every input image as one page of a PDF and return the PDF string.

    images     -- iterable of file-like objects; each is read, rewound with
                  seek(0) and closed once it has been added
    dpi        -- when truthy, used for both axes of every image; otherwise
                  the dpi stored in the image is used, falling back to 96x96
    colorspace -- when truthy, overrides the detected colorspace
    verbose    -- when true, debug messages are written to stderr
    The remaining arguments are document metadata handed to pdfdoc.

    Exits with status 1 if an input can neither be opened by PIL nor
    recognised as JPEG2000, or if a JPEG/JPEG2000 input is monochrome.
    """

    def debug_out(message):
        if verbose:
            sys.stderr.write("D: "+message+"\n")
    def error_out(message):
        sys.stderr.write("E: "+message+"\n")

    # forward the metadata; without it the info dictionary would only ever
    # contain the two dates
    pdf = pdfdoc(title=title, author=author, creator=creator,
                 producer=producer, creationdate=creationdate,
                 moddate=moddate, subject=subject, keywords=keywords)

    for im in images:
        rawdata = im.read()
        im.seek(0)
        try:
            imgdata = Image.open(im)
        except IOError as e:
            # test if it is a jpeg2000 image
            if rawdata[:12] != "\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
                error_out("cannot read input image (not jpeg2000)")
                error_out("PIL: %s"%e)
                sys.exit(1)
            # image is jpeg2000
            width, height, ics = parsejp2(rawdata)
            imgformat = "JP2"

            # keep the per-image resolution in its own name: overwriting the
            # dpi argument would corrupt it for every following image
            if dpi:
                ndpi = dpi, dpi
                debug_out("input dpi (forced) = %d x %d"%ndpi)
            else:
                ndpi = (96, 96) # TODO: read real dpi
                debug_out("input dpi = %d x %d"%ndpi)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s"%(color))
            else:
                color = ics
                debug_out("input colorspace = %s"%(ics))
        else:
            width, height = imgdata.size
            imgformat = imgdata.format

            if dpi:
                ndpi = dpi, dpi
                debug_out("input dpi (forced) = %d x %d"%ndpi)
            else:
                ndpi = imgdata.info.get("dpi", (96, 96))
                debug_out("input dpi = %d x %d"%ndpi)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s"%(color))
            else:
                color = imgdata.mode
                debug_out("input colorspace = %s"%(color))

        debug_out("width x height = %d x %d"%(width,height))
        debug_out("imgformat = %s"%imgformat)

        # depending on the input format, determine whether to pass the raw
        # image or the zlib compressed color information
        if imgformat in ("JPEG", "JP2"):
            if color == '1':
                error_out("jpeg can't be monochrome")
                sys.exit(1)
            imgdata = rawdata
        else:
            # because we do not support /CCITTFaxDecode
            if color == '1':
                imgdata = imgdata.convert('L')
                color = 'L'
            # NOTE(review): Image.tostring() is the old PIL spelling; newer
            # Pillow releases name it tobytes() - confirm the targeted version
            imgdata = zlib.compress(imgdata.tostring())

        pdf.addimage(color, width, height, ndpi, imgformat, imgdata)

        im.close()

    return pdf.tostring()
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description='lossless conversion/embedding of images (in)to pdf') parser = argparse.ArgumentParser(description='lossless conversion/embedding of images (in)to pdf')

Loading…
Cancel
Save