1
0
Fork 0
forked from josch/img2pdf

major refactoring

This commit is contained in:
josch 2013-10-23 12:34:07 +02:00
parent 50aff1474f
commit 46a510d2b9

View file

@ -29,7 +29,7 @@ def parse(cont, indent=1):
elif type(cont) is int or type(cont) is float: elif type(cont) is int or type(cont) is float:
return str(cont) return str(cont)
elif isinstance(cont, obj): elif isinstance(cont, obj):
return "%d 0 R"%cont.get_identifier() return "%d 0 R"%cont.identifier
elif type(cont) is str: elif type(cont) is str:
return cont return cont
elif type(cont) is list: elif type(cont) is list:
@ -40,117 +40,73 @@ class obj():
self.content = content self.content = content
self.stream = stream self.stream = stream
def tostring(self, identifier): def tostring(self):
self.identifier = identifier
if self.stream: if self.stream:
return "%d 0 obj "%identifier+parse(self.content)+"\nstream\n"+self.stream+"\nendstream\nendobj\n" return "%d 0 obj "%self.identifier+parse(self.content)+"\nstream\n"+self.stream+"\nendstream\nendobj\n"
else: else:
return "%d 0 obj "%identifier+parse(self.content)+" endobj\n" return "%d 0 obj "%self.identifier+parse(self.content)+" endobj\n"
def get_identifier(self): class pdfdoc():
if not hasattr(self, 'identifier'): objects = list()
raise Exception("no id set yet, call tostring() on obj first")
return self.identifier
def __init__(self, version=3, title=None, author=None, creator=None, producer=None,
             creationdate=None, moddate=None, subject=None, keywords=None):
    """Set up an empty document: info dictionary, page tree and catalog.

    version      -- minor pdf version, 3 means pdf 1.3
    title, author, creator, producer, subject
                 -- optional strings stored in the info dictionary
    creationdate, moddate
                 -- optional datetime objects, the current time is used
                    when they are omitted
    keywords     -- optional list of strings, stored comma separated

    The catalog gets identifier 1 and the page tree identifier 2 because
    they are the first two objects registered through addobj().
    """
    def pdfstring(text):
        # backslash and parentheses are special inside a pdf literal
        # string, so escape them to keep the info dictionary well-formed
        for char in ("\\", "(", ")"):
            text = text.replace(char, "\\"+char)
        return "("+text+")"

    def pdfdate(stamp):
        return "(D:"+stamp.strftime("%Y%m%d%H%M%S")+")"

    self.version = version # default pdf version 1.3
    # every document needs its own object list, a list defined on the class
    # would be shared (and keep growing) across all pdfdoc instances
    self.objects = []

    now = datetime.now()

    info = dict()
    textfields = (
        ("/Title", title),
        ("/Author", author),
        ("/Creator", creator),
        ("/Producer", producer),
        ("/Subject", subject),
    )
    for key, value in textfields:
        if value:
            info[key] = pdfstring(value)
    if keywords:
        info["/Keywords"] = pdfstring(",".join(keywords))
    # both dates are always written, falling back to the current time
    info["/CreationDate"] = pdfdate(creationdate if creationdate else now)
    info["/ModDate"] = pdfdate(moddate if moddate else now)
    self.info = obj(info)

    # create an incomplete pages object so that a /Parent entry can be added to each page
    self.pages = obj({
        "/Type": "/Pages",
        "/Kids": [],
        "/Count": 0
    })

    self.catalog = obj({
        "/Pages": self.pages,
        "/Type": "/Catalog"
    })
    self.addobj(self.catalog)
    self.addobj(self.pages)
def addobj(self, obj):
    """Register obj with this document and assign it the next identifier.

    Identifiers are handed out sequentially starting at 1, in the order
    in which objects are added.
    """
    # the object list must belong to the instance: a list that only exists
    # on the class would be shared between all documents, so create a
    # private one the first time an object is added
    if "objects" not in self.__dict__:
        self.objects = []
    obj.identifier = len(self.objects)+1
    self.objects.append(obj)
def addimage(self, color, width, height, dpi, imgformat, imgdata):
if color == 'L': if color == 'L':
color = "/DeviceGray" color = "/DeviceGray"
elif color == 'RGB': elif color == 'RGB':
color = "/DeviceRGB" color = "/DeviceRGB"
elif color == '1':
# TODO: /CCITTFaxDecode monochrome images
imgdata = imgdata.convert('L')
color = "/DeviceGray"
else: else:
error_out("unsupported color space: %s"%color) error_out("unsupported color space: %s"%color)
exit(1) exit(1)
pdf_x, pdf_y = 72.0*width/dpi_x, 72.0*height/dpi_y # pdf units = 1/72 inch pdf_x, pdf_y = 72.0*width/dpi[0], 72.0*height/dpi[1] # pdf units = 1/72 inch
if pdf_x < 3.00 or pdf_y < 3.00: if pdf_x < 3.00 or pdf_y < 3.00:
warning_out("pdf width or height is below 3.00 - decrease the dpi") warning_out("pdf width or height is below 3.00 - decrease the dpi")
@ -158,16 +114,11 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None,
# either embed the whole jpeg or deflate the bitmap representation # either embed the whole jpeg or deflate the bitmap representation
if imgformat is "JPEG": if imgformat is "JPEG":
ofilter = [ "/DCTDecode" ] ofilter = [ "/DCTDecode" ]
imgdata = rawdata
elif imgformat is "JP2": elif imgformat is "JP2":
ofilter = [ "/JPXDecode" ] ofilter = [ "/JPXDecode" ]
imgdata = rawdata self.version = 5 # jpeg2000 needs pdf 1.5
version = 5 # jpeg2000 needs pdf 1.5
else: else:
ofilter = [ "/FlateDecode" ] ofilter = [ "/FlateDecode" ]
imgdata = zlib.compress(imgdata.tostring())
im.close()
image = obj({ image = obj({
"/Type": "/XObject", "/Type": "/XObject",
"/Subtype": "/Image", "/Subtype": "/Image",
@ -187,7 +138,7 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None,
page = obj({ page = obj({
"/Type": "/Page", "/Type": "/Page",
"/Parent": pages, "/Parent": self.pages,
"/Resources": { "/Resources": {
"/XObject": { "/XObject": {
"/Im0": image "/Im0": image
@ -196,50 +147,119 @@ def main(images, dpi, title=None, author=None, creator=None, producer=None,
"/MediaBox": [0, 0, pdf_x, pdf_y], "/MediaBox": [0, 0, pdf_x, pdf_y],
"/Contents": content "/Contents": content
}) })
self.pages.content["/Kids"].append(page)
self.pages.content["/Count"] += 1
self.addobj(page)
self.addobj(content)
self.addobj(image)
def tostring(self):
    """Serialize the document and return it as one string.

    The result consists of the %PDF header, every registered object in
    identifier order, the cross reference table, the trailer and the
    startxref/%%EOF footer.
    """
    # add info as last object, but only once: calling tostring() again
    # must not register (and renumber) the info object a second time
    if not any(o is self.info for o in self.objects):
        self.addobj(self.info)

    header = "%%PDF-1.%d\n"%self.version
    chunks = [header]
    # running byte offset of the next object, needed for the xref entries
    offset = len(header)

    xreftable = ["0000000000 65535 f \n"]
    for o in self.objects:
        xreftable.append("%010d 00000 n \n"%offset)
        serialized = o.tostring()
        chunks.append(serialized)
        offset += len(serialized)

    # after the last object, offset is the position of the xref table
    xrefoffset = offset
    chunks.append("xref\n")
    chunks.append("0 %d\n"%len(xreftable))
    chunks.extend(xreftable)
    chunks.append("trailer\n")
    chunks.append(parse({"/Size": len(xreftable), "/Info": self.info, "/Root": self.catalog})+"\n")
    chunks.append("startxref\n")
    chunks.append("%d\n"%xrefoffset)
    chunks.append("%%EOF\n")
    return "".join(chunks)
def main(images, dpi, title=None, author=None, creator=None, producer=None,
         creationdate=None, moddate=None, subject=None, keywords=None,
         colorspace=None, verbose=False):
    """Embed the given images into one pdf document and return it as a string.

    images      -- iterable of opened, seekable image files, each one is
                   closed after it was processed
    dpi         -- resolution forced on every image, if false the value
                   stored in the image (or 96) is used
    title, author, creator, producer, creationdate, moddate, subject,
    keywords    -- document metadata, handed on to pdfdoc
    colorspace  -- color space forced on every image, if false the one
                   of the image is used
    verbose     -- write debug messages to stderr

    Exits with status 1 if an image cannot be read or if a jpeg is
    declared to be monochrome.
    """
    def debug_out(message):
        if verbose:
            sys.stderr.write("D: "+message+"\n")
    def error_out(message):
        sys.stderr.write("E: "+message+"\n")

    # hand the metadata on, otherwise it never reaches the info dictionary
    pdf = pdfdoc(title=title, author=author, creator=creator,
                 producer=producer, creationdate=creationdate,
                 moddate=moddate, subject=subject, keywords=keywords)

    for im in images:
        rawdata = im.read()
        im.seek(0)
        try:
            imgdata = Image.open(im)
        except IOError as e:
            # test if it is a jpeg2000 image
            if rawdata[:12] != b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
                error_out("cannot read input image (not jpeg2000)")
                error_out("PIL: %s"%e)
                sys.exit(1)
            # image is jpeg2000
            width, height, ics = parsejp2(rawdata)
            imgformat = "JP2"
            imgdpi = (96, 96) # TODO: read real dpi
            imgcolor = ics
        else:
            width, height = imgdata.size
            imgformat = imgdata.format
            imgdpi = imgdata.info.get("dpi", (96, 96))
            imgcolor = imgdata.mode

        # keep the per image values in their own variables: overwriting the
        # dpi argument would leak the value of one image into the next one
        if dpi:
            imgdpi = (dpi, dpi)
            debug_out("input dpi (forced) = %d x %d"%imgdpi)
        else:
            debug_out("input dpi = %d x %d"%imgdpi)
        if colorspace:
            color = colorspace
            debug_out("input colorspace (forced) = %s"%(color))
        else:
            color = imgcolor
            debug_out("input colorspace = %s"%(color))

        debug_out("width x height = %d x %d"%(width,height))
        debug_out("imgformat = %s"%imgformat)

        # depending on the input format, determine whether to pass the raw
        # image or the zlib compressed color information
        if imgformat in ("JPEG", "JP2"):
            if color == '1':
                error_out("jpeg can't be monochrome")
                sys.exit(1)
            imgdata = rawdata
        else:
            # because we do not support /CCITTFaxDecode
            if color == '1':
                imgdata = imgdata.convert('L')
                color = 'L'
            imgdata = zlib.compress(imgdata.tostring())

        pdf.addimage(color, width, height, imgdpi, imgformat, imgdata)

        im.close()

    return pdf.tostring()
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description='lossless conversion/embedding of images (in)to pdf') parser = argparse.ArgumentParser(description='lossless conversion/embedding of images (in)to pdf')