#!/usr/bin/env python2

# Copyright (C) 2012-2014 Johannes 'josch' Schauer
#
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation, either
# version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program.  If not, see
# <http://www.gnu.org/licenses/>.

"""Lossless conversion/embedding of images (in)to pdf.

JPEG and JPEG2000 input is embedded verbatim; every other format is
decoded with PIL and stored zlib-compressed.  The module works both as a
library (see ``convert``) and as a command line tool (see ``main``).
"""

__version__ = "0.1.6~git"
default_dpi = 96.0

import re
import sys
import zlib
import argparse
from PIL import Image
from datetime import datetime
from jp2 import parsejp2
try:
    from cStringIO import cStringIO
except ImportError:
    from io import BytesIO as cStringIO


# XXX: Switch to use logging module.
def debug_out(message, verbose=True):
    """Write a "D: "-prefixed line to stderr when `verbose` is true."""
    if verbose:
        sys.stderr.write("D: "+message+"\n")


def error_out(message):
    """Write an "E: "-prefixed line to stderr."""
    sys.stderr.write("E: "+message+"\n")


def warning_out(message):
    """Write a "W: "-prefixed line to stderr."""
    sys.stderr.write("W: "+message+"\n")


def datetime_to_pdfdate(dt):
    """Format a datetime as YYYYMMDDHHMMSS followed by a literal "Z"."""
    return dt.strftime("%Y%m%d%H%M%SZ")


def parse(cont, indent=1):
    """Serialise a Python value into PDF object syntax (bytes).

    Supported inputs are dict (PDF dictionary, keys sorted), int, float
    (four decimals), ``obj`` (indirect reference "N 0 R"), bytes (emitted
    verbatim) and list (PDF array).  `indent` is the nesting depth used to
    indent dictionary entries.

    Raises Exception for a unicode string on Python 3 and for any other
    unsupported type.
    """
    if type(cont) is dict:
        entries = [4 * indent * b" " + k + b" " + parse(v, indent+1)
                   for k, v in sorted(cont.items())]
        return (b"<<\n" + b"\n".join(entries) + b"\n" +
                4*(indent-1)*b" " + b">>")
    elif type(cont) is int:
        return str(cont).encode()
    elif type(cont) is float:
        return ("%0.4f" % cont).encode()
    elif isinstance(cont, obj):
        return ("%d 0 R" % cont.identifier).encode()
    elif type(cont) is str or type(cont) is bytes:
        # str and bytes are the same type on python2 only
        if type(cont) is str and type(cont) is not bytes:
            raise Exception("parse must be passed a bytes object in py3")
        return cont
    elif type(cont) is list:
        return b"[ "+b" ".join([parse(c, indent) for c in cont])+b" ]"
    else:
        raise Exception("cannot handle type %s" % type(cont))


class obj(object):
    """A single indirect PDF object with optional stream data.

    `identifier` is not set here; it is assigned by ``pdfdoc.addobj``.
    """

    def __init__(self, content, stream=None):
        self.content = content
        self.stream = stream

    def tostring(self):
        """Return the serialised "N 0 obj ... endobj" bytes."""
        header = ("%d 0 obj " % self.identifier).encode()
        if self.stream:
            return (header + parse(self.content) +
                    b"\nstream\n" + self.stream + b"\nendstream\nendobj\n")
        else:
            return header + parse(self.content) + b" endobj\n"


class pdfdoc(object):
    """In-memory PDF document holding one image per page."""

    def __init__(self, version=3, title=None, author=None, creator=None,
                 producer=None, creationdate=None, moddate=None, subject=None,
                 keywords=None, nodate=False):
        """Create the catalog, the page tree and the info dictionary.

        `version` is the minor PDF version (3 means PDF 1.3).  The metadata
        strings are concatenated with bytes and therefore must be bytes.
        Unless `nodate` is set, missing creation/modification dates default
        to the current time.
        """
        self.version = version  # default pdf version 1.3
        # datetime_to_pdfdate() marks the timestamp with "Z" (UTC), so the
        # default must be taken in UTC rather than local time
        now = datetime.utcnow()
        self.objects = []

        info = {}
        if title:
            info[b"/Title"] = b"("+title+b")"
        if author:
            info[b"/Author"] = b"("+author+b")"
        if creator:
            info[b"/Creator"] = b"("+creator+b")"
        if producer:
            info[b"/Producer"] = b"("+producer+b")"
        if creationdate:
            info[b"/CreationDate"] = \
                b"(D:"+datetime_to_pdfdate(creationdate).encode()+b")"
        elif not nodate:
            info[b"/CreationDate"] = \
                b"(D:"+datetime_to_pdfdate(now).encode()+b")"
        if moddate:
            info[b"/ModDate"] = \
                b"(D:"+datetime_to_pdfdate(moddate).encode()+b")"
        elif not nodate:
            info[b"/ModDate"] = b"(D:"+datetime_to_pdfdate(now).encode()+b")"
        if subject:
            info[b"/Subject"] = b"("+subject+b")"
        if keywords:
            info[b"/Keywords"] = b"("+b",".join(keywords)+b")"

        self.info = obj(info)

        # create an incomplete pages object so that a /Parent entry can be
        # added to each page
        self.pages = obj({
            b"/Type": b"/Pages",
            b"/Kids": [],
            b"/Count": 0
        })

        self.catalog = obj({
            b"/Pages": self.pages,
            b"/Type": b"/Catalog"
        })
        self.addobj(self.catalog)
        self.addobj(self.pages)

    def addobj(self, obj):
        """Append `obj` to the document and give it the next identifier."""
        newid = len(self.objects)+1
        obj.identifier = newid
        self.objects.append(obj)

    def addimage(self, color, width, height, imgformat, imgdata, pdf_x, pdf_y):
        """Add a page of size `pdf_x` x `pdf_y` showing one image.

        `color` is one of 'L', 'RGB', 'CMYK' or 'CMYK;I'; anything else
        terminates the program with exit status 1.  `imgdata` is the raw
        JPEG/JPEG2000 file content or the zlib-compressed bitmap for every
        other `imgformat`.
        """
        if color == 'L':
            colorspace = b"/DeviceGray"
        elif color == 'RGB':
            colorspace = b"/DeviceRGB"
        elif color == 'CMYK' or color == 'CMYK;I':
            colorspace = b"/DeviceCMYK"
        else:
            error_out("unsupported color space: %s" % color)
            sys.exit(1)

        if pdf_x < 3.00 or pdf_y < 3.00:
            warning_out("pdf width or height is below 3.00\" - decrease the dpi")
        elif pdf_x > 14400.0 or pdf_y > 14400.0:
            warning_out("pdf width or height would be above 200\" - squeezed inside")
            x_scale = 14400.0 / pdf_x
            y_scale = 14400.0 / pdf_y
            scale = min(x_scale, y_scale) * 0.999
            pdf_x *= scale
            pdf_y *= scale

        # either embed the whole jpeg or deflate the bitmap representation
        # (compare by value: identity of string literals is not guaranteed)
        if imgformat == "JPEG":
            ofilter = [b"/DCTDecode"]
        elif imgformat == "JPEG2000":
            ofilter = [b"/JPXDecode"]
            self.version = 5  # jpeg2000 needs pdf 1.5
        else:
            ofilter = [b"/FlateDecode"]
        image = obj({
            b"/Type": b"/XObject",
            b"/Subtype": b"/Image",
            b"/Filter": ofilter,
            b"/Width": width,
            b"/Height": height,
            b"/ColorSpace": colorspace,
            # hardcoded as PIL doesn't provide bits for non-jpeg formats
            b"/BitsPerComponent": 8,
            b"/Length": len(imgdata)
        }, imgdata)

        if color == 'CMYK;I':
            # Inverts all four channels
            image.content[b'/Decode'] = [1.0, 0.0, 1.0, 0.0,
                                         1.0, 0.0, 1.0, 0.0]

        text = ("q\n%0.4f 0 0 %0.4f 0 0 cm\n/Im0 Do\nQ"
                % (pdf_x, pdf_y)).encode()

        content = obj({
            b"/Length": len(text)
        }, text)

        page = obj({
            b"/Type": b"/Page",
            b"/Parent": self.pages,
            b"/Resources": {
                b"/XObject": {
                    b"/Im0": image
                }
            },
            b"/MediaBox": [0, 0, pdf_x, pdf_y],
            b"/Contents": content
        })
        self.pages.content[b"/Kids"].append(page)
        self.pages.content[b"/Count"] += 1
        self.addobj(page)
        self.addobj(content)
        self.addobj(image)

    def tostring(self):
        """Return the complete PDF document as bytes."""
        stream = cStringIO()
        self.tostream(stream)
        return stream.getvalue()

    def tostream(self, stream):
        """Write the complete PDF document to the binary `stream`.

        `stream` must support ``write`` and ``tell``.
        """
        # add info as last object; guard against adding it a second time
        # when the document is serialised more than once
        if self.info not in self.objects:
            self.addobj(self.info)

        xreftable = list()

        stream.write(("%%PDF-1.%d\n" % self.version).encode())

        xreftable.append(b"0000000000 65535 f \n")
        for o in self.objects:
            xreftable.append(("%010d 00000 n \n" % stream.tell()).encode())
            stream.write(o.tostring())

        xrefoffset = stream.tell()
        stream.write(b"xref\n")
        stream.write(("0 %d\n" % len(xreftable)).encode())
        for x in xreftable:
            stream.write(x)
        stream.write(b"trailer\n")
        stream.write(parse({b"/Size": len(xreftable), b"/Info": self.info,
                            b"/Root": self.catalog})+b"\n")
        stream.write(b"startxref\n")
        stream.write(("%d\n" % xrefoffset).encode())
        stream.write(b"%%EOF\n")


def convert(images, dpi=None, pagesize=(None, None, None), title=None,
            author=None, creator=None, producer=None, creationdate=None,
            moddate=None, subject=None, keywords=None, colorspace=None,
            nodate=False, verbose=False, outputstream=None):
    """Convert `images` into a PDF with one image per page.

    Each entry of `images` is either an object with a ``read`` method or a
    file name.  `dpi` forces the resolution; otherwise `pagesize` (a tuple
    as returned by ``valid_size``) or the dpi stored in the image is used.
    `colorspace` overrides the detected colour mode.  The metadata
    arguments are passed through to ``pdfdoc``.

    When `outputstream` is given the document is written to it and the
    stream is returned; otherwise the document is returned as bytes.
    Unreadable or unsupported input terminates the program with status 1.
    """
    pagesize_options = pagesize[2]

    pdf = pdfdoc(3, title, author, creator, producer, creationdate,
                 moddate, subject, keywords, nodate)

    for imfilename in images:
        debug_out("Reading %s" % imfilename, verbose)
        try:
            rawdata = imfilename.read()
        except AttributeError:
            with open(imfilename, "rb") as f:
                rawdata = f.read()
        im = cStringIO(rawdata)
        try:
            imgdata = Image.open(im)
        except IOError as e:
            # test if it is a jpeg2000 image; the signature must be a bytes
            # literal, otherwise the comparison never matches on python3
            if rawdata[:12] != b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
                error_out("cannot read input image (not jpeg2000)")
                error_out("PIL: %s" % e)
                sys.exit(1)
            # image is jpeg2000
            width, height, ics = parsejp2(rawdata)
            imgformat = "JPEG2000"

            # TODO: read real dpi from input jpeg2000 image
            ndpi = (default_dpi, default_dpi)
            debug_out("input dpi = %d x %d" % ndpi, verbose)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s" % (ics), verbose)
            else:
                color = ics
                debug_out("input colorspace = %s" % (ics), verbose)
        else:
            width, height = imgdata.size
            imgformat = imgdata.format

            ndpi = imgdata.info.get("dpi", (default_dpi, default_dpi))
            # in python3, the returned dpi value for some tiff images will
            # not be an integer but a float. To make the behaviour of
            # img2pdf the same between python2 and python3, we convert that
            # float into an integer by rounding
            # search online for the 72.009 dpi problem for more info
            ndpi = (int(round(ndpi[0])), int(round(ndpi[1])))
            debug_out("input dpi = %d x %d" % ndpi, verbose)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s" % (color), verbose)
            else:
                color = imgdata.mode
                if color == "CMYK" and imgformat == "JPEG":
                    # Adobe inverts CMYK JPEGs for some reason, and others
                    # have followed suit as well. Some software assumes the
                    # JPEG is inverted if the Adobe tag (APP14), while other
                    # software assumes all CMYK JPEGs are inverted. I don't
                    # have enough experience with these to know which is
                    # better for images currently in the wild, so I'm going
                    # with the first approach for now.
                    if "adobe" in imgdata.info:
                        color = "CMYK;I"
                debug_out("input colorspace = %s" % (color), verbose)

        debug_out("width x height = %d x %d" % (width, height), verbose)
        debug_out("imgformat = %s" % imgformat, verbose)

        if dpi:
            ndpi = dpi, dpi
            debug_out("input dpi (forced) = %d x %d" % ndpi, verbose)
        elif pagesize_options:
            ndpi = get_ndpi(width, height, pagesize)
            debug_out("calculated dpi (based on pagesize) = %d x %d" % ndpi,
                      verbose)

        # depending on the input format, determine whether to pass the raw
        # image or the zlib compressed color information
        if imgformat == "JPEG" or imgformat == "JPEG2000":
            if color == '1':
                error_out("jpeg can't be monochrome")
                sys.exit(1)
            imgdata = rawdata
        else:
            # because we do not support /CCITTFaxDecode
            if color == '1':
                debug_out("Converting colorspace 1 to L", verbose)
                imgdata = imgdata.convert('L')
                color = 'L'
            elif color in ("RGB", "L", "CMYK", "CMYK;I"):
                debug_out("Colorspace is OK: %s" % color, verbose)
            else:
                debug_out("Converting colorspace %s to RGB" % color, verbose)
                imgdata = imgdata.convert('RGB')
                color = imgdata.mode
            img = imgdata.tobytes()
            # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have
            # the close() method
            try:
                imgdata.close()
            except AttributeError:
                pass
            imgdata = zlib.compress(img)
        im.close()

        if pagesize_options and pagesize_options['exact'][1]:
            # output size exactly to specified dimensions
            # pagesize[0], pagesize[1] already checked in valid_size()
            pdf_x, pdf_y = pagesize[0], pagesize[1]
        else:
            # output size based on dpi; point = 1/72 inch
            pdf_x = 72.0*width/float(ndpi[0])
            pdf_y = 72.0*height/float(ndpi[1])

        pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y)

    if outputstream:
        pdf.tostream(outputstream)
        return outputstream

    return pdf.tostring()


def get_ndpi(width, height, pagesize):
    """Return the (x, y) dpi that scales the image to `pagesize`.

    `width` and `height` are the image dimensions in pixels; `pagesize` is
    a (width, height, options) tuple with sizes in points, as returned by
    ``valid_size``.  With the 'fill' option the image covers the page,
    otherwise it is fitted into it.  Both returned values are identical.
    """
    pagesize_options = pagesize[2]

    def _image_is_narrower():
        # explicit float conversion: on python2, int/int would floor and
        # compare the wrong aspect ratios
        return (float(width)/height <
                float(pagesize[0])/pagesize[1])

    if pagesize_options and pagesize_options['fill'][1]:
        if _image_is_narrower():
            tmp_dpi = 72.0*width/pagesize[0]
        else:
            tmp_dpi = 72.0*height/pagesize[1]
    elif pagesize[0] and pagesize[1]:
        # if both height and width given with no specific pagesize_option,
        # resize to fit "into" page
        if _image_is_narrower():
            tmp_dpi = 72.0*height/pagesize[1]
        else:
            tmp_dpi = 72.0*width/pagesize[0]
    elif pagesize[0]:
        # if width given, calculate dpi based on width
        tmp_dpi = 72.0*width/pagesize[0]
    elif pagesize[1]:
        # if height given, calculate dpi based on height
        tmp_dpi = 72.0*height/pagesize[1]
    else:
        tmp_dpi = default_dpi

    return tmp_dpi, tmp_dpi


def positive_float(string):
    """argparse type: parse `string` as a float greater than zero.

    Raises argparse.ArgumentTypeError when the value is not positive.
    """
    value = float(string)
    if value <= 0:
        msg = "%r is not positive" % string
        raise argparse.ArgumentTypeError(msg)
    return value


def valid_date(string):
    """argparse type: parse `string` into a datetime.

    Tries, in order: three ISO8601 layouts, the optional dateutil module
    and finally the external ``date --date`` utility.

    Raises argparse.ArgumentTypeError when none of them succeeds.
    """
    # first try parsing in ISO8601 format
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M", "%Y-%m-%dT%H:%M:%S"):
        try:
            return datetime.strptime(string, fmt)
        except ValueError:
            pass
    # then try dateutil
    try:
        from dateutil import parser as dateutil_parser
    except ImportError:
        pass
    else:
        try:
            return dateutil_parser.parse(string)
        except (TypeError, ValueError, OverflowError):
            # unparseable input must fall through to the next method
            pass
    # as a last resort, try the local date utility
    try:
        import subprocess
    except ImportError:
        pass
    else:
        try:
            utime = subprocess.check_output(["date", "--date", string, "+%s"])
        except (subprocess.CalledProcessError, OSError):
            # OSError: the date utility is not available at all
            pass
        else:
            return datetime.utcfromtimestamp(int(utime))
    raise argparse.ArgumentTypeError("cannot parse date: %s" % string)


def get_standard_papersize(string):
    """Map a paper size name (case-insensitive) to a size specification.

    Unknown names are returned lower-cased so that they can be parsed as
    an explicit size by ``valid_size``.
    """
    papersizes = {
        "11x17"       : "792x792^",     # "792x1224",
        "ledger"      : "792x792^",     # "1224x792",
        "legal"       : "612x612^",     # "612x1008",
        "letter"      : "612x612^",     # "612x792",
        "arche"       : "2592x2592^",   # "2592x3456",
        "archd"       : "1728x1728^",   # "1728x2592",
        "archc"       : "1296x1296^",   # "1296x1728",
        "archb"       : "864x864^",     # "864x1296",
        "archa"       : "648x648^",     # "648x864",
        "a0"          : "2380x2380^",   # "2380x3368",
        "a1"          : "1684x1684^",   # "1684x2380",
        "a2"          : "1190x1190^",   # "1190x1684",
        "a3"          : "842x842^",     # "842x1190",
        "a4"          : "595x595^",     # "595x842",
        "a5"          : "421x421^",     # "421x595",
        "a6"          : "297x297^",     # "297x421",
        "a7"          : "210x210^",     # "210x297",
        "a8"          : "148x148^",     # "148x210",
        "a9"          : "105x105^",     # "105x148",
        "a10"         : "74x74^",       # "74x105",
        "b0"          : "2836x2836^",   # "2836x4008",
        "b1"          : "2004x2004^",   # "2004x2836",
        "b2"          : "1418x1418^",   # "1418x2004",
        "b3"          : "1002x1002^",   # "1002x1418",
        "b4"          : "709x709^",     # "709x1002",
        "b5"          : "501x501^",     # "501x709",
        "c0"          : "2600x2600^",   # "2600x3677",
        "c1"          : "1837x1837^",   # "1837x2600",
        "c2"          : "1298x1298^",   # "1298x1837",
        "c3"          : "918x918^",     # "918x1298",
        "c4"          : "649x649^",     # "649x918",
        "c5"          : "459x459^",     # "459x649",
        "c6"          : "323x323^",     # "323x459",
        "flsa"        : "612x612^",     # "612x936",
        "flse"        : "612x612^",     # "612x936",
        "halfletter"  : "396x396^",     # "396x612",
        "tabloid"     : "792x792^",     # "792x1224",
        "statement"   : "396x396^",     # "396x612",
        "executive"   : "540x540^",     # "540x720",
        "folio"       : "612x612^",     # "612x936",
        "quarto"      : "610x610^",     # "610x780"
    }

    string = string.lower()
    return papersizes.get(string, string)


def valid_size(string):
    """argparse type: parse a page size of the format AuxBv#.

    A is the width, B the height, u and v are units (in, cm, mm, pt;
    default pt) and # are option characters.  Standard paper size names
    are accepted as well.

    Returns (width, height, options) where the sizes are in points (None
    when omitted) and options maps each option name to a
    [pattern, enabled] list.

    Raises argparse.ArgumentTypeError for malformed input.
    """
    # conversion factors from units to points
    units = {
        'in'  : 72.0,
        'cm'  : 72.0/2.54,
        'mm'  : 72.0/25.4,
        'pt'  : 1.0
    }

    # raw strings: same values as before without invalid escape sequences
    pagesize_options = {
        'exact'   : [r'\!', False],
        'shrink'  : [r'\>', False],
        'enlarge' : [r'\<', False],
        'fill'    : [r'\^', False],
        'percent' : [r'\%', False],
        'count'   : [r'\@', False],
    }

    string = get_standard_papersize(string)

    pattern = re.compile(r"""
            ([0-9]*\.?[0-9]*)   # tokens.group(1) == width; may be empty
            ([a-z]*)            # tokens.group(2) == units; may be empty
            x
            ([0-9]*\.?[0-9]*)   # tokens.group(3) == height; may be empty
            ([a-zA-Z]*)         # tokens.group(4) == units; may be empty
            ([^0-9a-zA-Z]*)     # tokens.group(5) == extra options
            """, re.VERBOSE)

    tokens = pattern.match(string)

    # tokens.group(0) should match entire input string; match() returns
    # None when the mandatory "x" separator is missing altogether
    if tokens is None or tokens.group(0) != string:
        msg = ('Input size needs to be of the format AuxBv#, '
               'where A is width, B is height, u and v are units, '
               '# are options.  '
               'You may omit either width or height, but not both.  '
               'Units may be specified as (in, cm, mm, pt).  '
               'You may omit units, which will default to pt.  '
               'Available options include (! = exact ; ^ = fill ; default = into).')
        raise argparse.ArgumentTypeError(msg)

    # temporary list to loop through to process width and height
    pagesize_size = {
        'x' : [0, tokens.group(1), tokens.group(2)],
        'y' : [0, tokens.group(3), tokens.group(4)]
    }

    for key, value in pagesize_size.items():
        try:
            value[0] = float(value[1])
            value[0] *= units[value[2]]     # convert to points
        except ValueError:
            # assign None if width or height not provided
            value[0] = None
        except KeyError:
            # if units unrecognized, raise error
            # otherwise default to pt because units not provided
            if value[2]:
                msg = "unrecognized unit '%s'." % value[2]
                raise argparse.ArgumentTypeError(msg)

    x = pagesize_size['x'][0]
    y = pagesize_size['y'][0]

    # parse options for resize methods
    if tokens.group(5):
        for key, value in pagesize_options.items():
            if re.search(value[0], tokens.group(5)):
                value[1] = True

    if pagesize_options['fill'][1]:
        # if either width or height is not given, try to fill in missing value
        if not x:
            x = y
        elif not y:
            y = x

    if pagesize_options['exact'][1]:
        if not x or not y:
            msg = ('exact size requires both width and height.')
            raise argparse.ArgumentTypeError(msg)

    if not x and not y:
        msg = ('width and height cannot both be omitted.')
        raise argparse.ArgumentTypeError(msg)

    return (x, y, pagesize_options)


# in python3, the received argument will be a unicode str() object which needs
# to be encoded into a bytes() object
# in python2, the received argument will be a binary str() object which needs
# no encoding
# we check whether we use python2 or python3 by checking whether the argument
# is both, type str and type bytes (only the case in python2)
def pdf_embedded_string(string):
    """argparse type: encode `string` as a PDF text string body.

    The result is UTF-16-BE with a byte order mark and with backslashes
    and parentheses escaped, ready to be wrapped in parentheses.
    """
    if type(string) is str and type(string) is not bytes:
        # py3
        pass
    else:
        # py2
        string = string.decode("utf8")
    string = b"\xfe\xff"+string.encode("utf-16-be")
    string = string.replace(b'\\', b'\\\\')
    string = string.replace(b'(', b'\\(')
    string = string.replace(b')', b'\\)')
    return string


parser = argparse.ArgumentParser(
    description='Lossless conversion/embedding of images (in)to pdf')
parser.add_argument(
    'images', metavar='infile', type=str, nargs='+',
    help='input file(s)')
parser.add_argument(
    '-o', '--output', metavar='out', type=argparse.FileType('wb'),
    default=getattr(sys.stdout, "buffer", sys.stdout),
    help='output file (default: stdout)')

sizeopts = parser.add_mutually_exclusive_group()
sizeopts.add_argument(
    '-d', '--dpi', metavar='dpi', type=positive_float,
    help=('dpi for pdf output. '
          'If input image does not specify dpi the default is %.2f. '
          'Must not be used with -s/--pagesize.') % default_dpi
)

sizeopts.add_argument(
    '-s', '--pagesize', metavar='size', type=valid_size,
    default=(None, None, None),
    help=('size of the pdf pages in format AuxBv#, '
          'where A is width, B is height, u and v are units, # are options. '
          'You may omit either width or height, but not both. '
          'Some common page sizes, such as letter and a4, are also recognized. '
          'Units may be specified as (in, cm, mm, pt). '
          'Units default to pt when absent. '
          'Available options include (! = exact ; ^ = fill ; default = into). '
          'Must not be used with -d/--dpi.')
)

parser.add_argument(
    '-t', '--title', metavar='title', type=pdf_embedded_string,
    help='title for metadata')
parser.add_argument(
    '-a', '--author', metavar='author', type=pdf_embedded_string,
    help='author for metadata')
parser.add_argument(
    '-c', '--creator', metavar='creator', type=pdf_embedded_string,
    help='creator for metadata')
parser.add_argument(
    '-p', '--producer', metavar='producer', type=pdf_embedded_string,
    help='producer for metadata')
parser.add_argument(
    '-r', '--creationdate', metavar='creationdate', type=valid_date,
    help=('UTC creation date for metadata in YYYY-MM-DD or YYYY-MM-DDTHH:MM or '
          'YYYY-MM-DDTHH:MM:SS format or any format understood by python '
          'dateutil module or any format understood by `date --date`'))
parser.add_argument(
    '-m', '--moddate', metavar='moddate', type=valid_date,
    help=('UTC modification date for metadata in YYYY-MM-DD or YYYY-MM-DDTHH:MM or '
          'YYYY-MM-DDTHH:MM:SS format or any format understood by python '
          'dateutil module or any format understood by `date --date`'))
parser.add_argument(
    '-S', '--subject', metavar='subject', type=pdf_embedded_string,
    help='subject for metadata')
parser.add_argument(
    '-k', '--keywords', metavar='kw', type=pdf_embedded_string, nargs='+',
    help='keywords for metadata')
# plain str: the value is compared against PIL mode names such as 'RGB';
# encoding it as a PDF string would make every comparison fail
parser.add_argument(
    '-C', '--colorspace', metavar='colorspace', type=str,
    help='force PIL colorspace (one of: RGB, L, 1, CMYK, CMYK;I)')
parser.add_argument(
    '-D', '--nodate', help='do not add timestamps', action="store_true")
parser.add_argument(
    '-v', '--verbose', help='verbose mode', action="store_true")
parser.add_argument(
    '-V', '--version', action='version', version='%(prog)s '+__version__,
    help="Print version information and exit")


def main(args=None):
    """Command line entry point.

    `args` is the argument list without the program name; it defaults to
    ``sys.argv[1:]``.  The resulting PDF is written to the chosen output.
    """
    if args is None:
        args = sys.argv[1:]
    args = parser.parse_args(args)

    convert(
        args.images, args.dpi, args.pagesize, args.title, args.author,
        args.creator, args.producer, args.creationdate, args.moddate,
        args.subject, args.keywords, args.colorspace, args.nodate,
        args.verbose, outputstream=args.output)


if __name__ == '__main__':
    main()