2014-07-26 14:12:40 +00:00
#!/usr/bin/env python2
2014-03-30 06:10:12 +00:00
# Copyright (C) 2012-2014 Johannes 'josch' Schauer <j.schauer at email.de>
2013-05-02 06:17:13 +00:00
#
2014-03-30 06:10:12 +00:00
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation, either
# version 3 of the License, or (at your option) any later
# version.
2013-05-02 06:17:13 +00:00
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
2014-03-30 06:10:12 +00:00
# You should have received a copy of the GNU General Public
# License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
2012-03-29 09:08:32 +00:00
2015-03-07 02:20:14 +00:00
__version__ = " 0.1.6~git "
2015-03-20 10:37:30 +00:00
default_dpi = 96.0
2015-03-07 02:20:14 +00:00
2015-03-20 10:37:30 +00:00
import re
2012-03-29 09:08:32 +00:00
import sys
import zlib
import argparse
2014-03-01 04:51:53 +00:00
from PIL import Image
2012-03-29 09:08:32 +00:00
from datetime import datetime
2013-10-21 13:55:47 +00:00
from jp2 import parsejp2
2015-02-16 06:39:07 +00:00
try :
from cStringIO import cStringIO
except ImportError :
from io import BytesIO as cStringIO
2012-03-29 09:08:32 +00:00
2014-03-01 04:51:53 +00:00
# XXX: Switch to use logging module.
def debug_out(message, verbose=True):
    """Write *message* to stderr with a ``D: `` prefix unless *verbose* is falsy."""
    if not verbose:
        return
    sys.stderr.write("".join(("D: ", message, "\n")))
def error_out(message):
    """Write *message* to stderr with an ``E: `` prefix and a trailing newline."""
    line = "".join(("E: ", message, "\n"))
    sys.stderr.write(line)
def warning_out(message):
    """Write *message* to stderr with a ``W: `` prefix and a trailing newline."""
    line = "".join(("W: ", message, "\n"))
    sys.stderr.write(line)
2015-03-13 13:29:53 +00:00
def datetime_to_pdfdate(dt):
    """Format *dt* as the body of a PDF date string (``YYYYMMDDHHMMSSZ``).

    The result always ends in ``Z`` (UTC).  Naive datetimes are taken to be
    UTC already and are formatted unchanged.  Timezone-aware datetimes are
    shifted to UTC first so that the ``Z`` designator is truthful.
    """
    offset = dt.utcoffset()
    if offset is not None:
        # aware datetime: normalise to UTC and drop the tzinfo
        dt = (dt - offset).replace(tzinfo=None)
    return dt.strftime("%Y%m%d%H%M%SZ")
2012-03-29 09:08:32 +00:00
def parse(cont, indent=1):
    """Serialise a Python value into PDF object syntax and return it as bytes.

    Supported values (matched on their exact type):

    * ``dict``  -> ``<< ... >>`` with one sorted ``key value`` pair per line,
      indented by ``4 * indent`` spaces; keys must already be bytes
    * ``int``   -> decimal digits
    * ``float`` -> fixed point with four decimals
    * ``bytes`` -> returned unchanged
    * ``list``  -> ``[ a b c ]``
    * ``obj``   -> indirect reference ``N 0 R`` using its ``identifier``

    Raises ``Exception`` for a py3 text string or any other type.
    """
    kind = type(cont)

    if kind is dict:
        pad = 4 * indent * b" "
        entries = []
        for key, value in sorted(cont.items()):
            entries.append(pad + key + b" " + parse(value, indent + 1))
        closing = 4 * (indent - 1) * b" " + b">>"
        return b"<<\n" + b"\n".join(entries) + b"\n" + closing

    if kind is int:
        return str(cont).encode()

    if kind is float:
        return ("%0.4f" % cont).encode()

    # in python2 str is bytes, so this also accepts py2 str
    if kind is bytes:
        return cont

    # only reachable in python3, where str is a text type
    if kind is str:
        raise Exception("parse must be passed a bytes object in py3")

    if kind is list:
        return b"[ " + b" ".join([parse(item, indent) for item in cont]) + b" ]"

    if isinstance(cont, obj):
        return ("%d 0 R" % cont.identifier).encode()

    raise Exception("cannot handle type %s" % type(cont))
2012-03-29 09:08:32 +00:00
2014-03-01 03:57:40 +00:00
class obj(object):
    """A single PDF object: a content value plus an optional stream.

    ``identifier`` is not set here; it is assigned from outside (see
    ``pdfdoc.addobj``) and must exist before ``tostring`` is called.
    """

    def __init__(self, content, stream=None):
        # content: value understood by parse(); stream: raw bytes or None
        self.stream = stream
        self.content = content

    def tostring(self):
        """Return the serialised ``N 0 obj ... endobj`` bytes of this object."""
        header = ("%d 0 obj " % self.identifier).encode()
        body = parse(self.content)
        if not self.stream:
            return header + body + b" endobj\n"
        return (header + body +
                b"\nstream\n" + self.stream + b"\nendstream\nendobj\n")
2012-03-29 09:08:32 +00:00
2014-03-01 03:57:40 +00:00
class pdfdoc(object):
    """In-memory PDF document made of pages that each show one image.

    The document keeps a flat list of ``obj`` instances (``self.objects``).
    The catalog and the page tree are registered on construction, pages are
    added with ``addimage`` and the finished file is produced by
    ``tostring``.
    """

    def __init__(self, version=3, title=None, author=None, creator=None,
                 producer=None, creationdate=None, moddate=None, subject=None,
                 keywords=None, nodate=False):
        """Create an empty document and its /Info dictionary.

        version       -- minor PDF version, written as ``%PDF-1.<version>``
        title, author, creator, producer, subject
                      -- bytes embedded verbatim between parentheses
        creationdate, moddate
                      -- datetime objects; when missing, the current time
                         is used unless *nodate* is true
        keywords      -- sequence of bytes, joined with ``,``
        nodate        -- when true, no automatic timestamps are added
        """
        self.version = version  # default pdf version 1.3
        # datetime_to_pdfdate() labels its output with "Z", so the
        # automatic timestamp has to be taken in UTC, not in local time
        now = datetime.utcnow()
        self.objects = []

        info = {}
        # plain text entries are only written when a value was given
        for key, value in ((b"/Title", title),
                           (b"/Author", author),
                           (b"/Creator", creator),
                           (b"/Producer", producer),
                           (b"/Subject", subject)):
            if value:
                info[key] = b"(" + value + b")"
        # date entries fall back to "now" unless timestamps are disabled
        for key, value in ((b"/CreationDate", creationdate),
                           (b"/ModDate", moddate)):
            if not value:
                if nodate:
                    continue
                value = now
            info[key] = b"(D:" + datetime_to_pdfdate(value).encode() + b")"
        if keywords:
            info[b"/Keywords"] = b"(" + b",".join(keywords) + b")"

        self.info = obj(info)

        # create an incomplete pages object so that a /Parent entry can be
        # added to each page
        self.pages = obj({
            b"/Type": b"/Pages",
            b"/Kids": [],
            b"/Count": 0
        })

        self.catalog = obj({
            b"/Pages": self.pages,
            b"/Type": b"/Catalog"
        })
        self.addobj(self.catalog)
        self.addobj(self.pages)

    def addobj(self, obj):
        """Register *obj* and give it the next free 1-based identifier."""
        obj.identifier = len(self.objects) + 1
        self.objects.append(obj)

    def addimage(self, color, width, height, imgformat, imgdata, pdf_x, pdf_y):
        """Append a page of size pdf_x x pdf_y that shows one image.

        color      -- 'L', 'RGB', 'CMYK' or 'CMYK;I' (inverted CMYK)
        width, height
                   -- image dimensions written to /Width and /Height
        imgformat  -- "JPEG" and "JPEG2000" data is embedded as is,
                      everything else is expected to be zlib compressed
        imgdata    -- the bytes stored as the image stream
        pdf_x, pdf_y
                   -- page/image size used for the MediaBox and the
                      scaling matrix

        Prints an error and exits with status 1 for any other *color*.
        """
        colorspaces = {
            'L': b"/DeviceGray",
            'RGB': b"/DeviceRGB",
            'CMYK': b"/DeviceCMYK",
            'CMYK;I': b"/DeviceCMYK",
        }
        colorspace = colorspaces.get(color)
        if colorspace is None:
            error_out("unsupported color space: %s" % color)
            sys.exit(1)

        if pdf_x < 3.00 or pdf_y < 3.00:
            warning_out("pdf width or height is below 3.00 - decrease the dpi")

        # either embed the whole jpeg or deflate the bitmap representation
        # (compare by value: identity of equal strings is not guaranteed)
        if imgformat == "JPEG":
            ofilter = [b"/DCTDecode"]
        elif imgformat == "JPEG2000":
            ofilter = [b"/JPXDecode"]
            self.version = 5  # jpeg2000 needs pdf 1.5
        else:
            ofilter = [b"/FlateDecode"]

        image = obj({
            b"/Type": b"/XObject",
            b"/Subtype": b"/Image",
            b"/Filter": ofilter,
            b"/Width": width,
            b"/Height": height,
            b"/ColorSpace": colorspace,
            # hardcoded as PIL doesn't provide bits for non-jpeg formats
            b"/BitsPerComponent": 8,
            b"/Length": len(imgdata)
        }, imgdata)

        if color == 'CMYK;I':
            # Inverts all four channels
            image.content[b'/Decode'] = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]

        # content stream: scale the unit square to the page and paint /Im0
        text = ("q\n%0.4f 0 0 %0.4f 0 0 cm\n/Im0 Do\nQ" % (pdf_x, pdf_y)).encode()

        content = obj({
            b"/Length": len(text)
        }, text)

        page = obj({
            b"/Type": b"/Page",
            b"/Parent": self.pages,
            b"/Resources": {
                b"/XObject": {
                    b"/Im0": image
                }
            },
            b"/MediaBox": [0, 0, pdf_x, pdf_y],
            b"/Contents": content
        })
        self.pages.content[b"/Kids"].append(page)
        self.pages.content[b"/Count"] += 1
        self.addobj(page)
        self.addobj(content)
        self.addobj(image)

    def tostring(self):
        """Serialise the whole document and return it as bytes."""
        # add info as last object; only once, so that calling tostring()
        # repeatedly does not keep appending duplicates
        if self.info not in self.objects:
            self.addobj(self.info)

        xreftable = [b"0000000000 65535 f \n"]
        result = ("%%PDF-1.%d\n" % self.version).encode()

        for o in self.objects:
            # each xref entry stores the byte offset at which the object starts
            xreftable.append(("%010d 00000 n \n" % len(result)).encode())
            result += o.tostring()

        xrefoffset = len(result)
        result += b"xref\n"
        result += ("0 %d\n" % len(xreftable)).encode()
        result += b"".join(xreftable)
        result += b"trailer\n"
        result += parse({
            b"/Size": len(xreftable),
            b"/Info": self.info,
            b"/Root": self.catalog
        }) + b"\n"
        result += b"startxref\n"
        result += ("%d\n" % xrefoffset).encode()
        result += b"%%EOF\n"
        return result
2012-03-29 09:08:32 +00:00
2015-03-20 10:37:30 +00:00
def convert(images, dpi=None, pagesize=(None, None, None), title=None,
            author=None, creator=None, producer=None, creationdate=None,
            moddate=None, subject=None, keywords=None, colorspace=None,
            nodate=False, verbose=False):
    """Convert *images* into a PDF with one page per image; return bytes.

    images     -- iterable whose items are either objects with a ``read()``
                  method or paths that can be passed to ``open()``
    dpi        -- when given, overrides the dpi of every input image
    pagesize   -- ``(width, height, options)`` tuple as returned by
                  ``valid_size``; ``options`` may be None
    colorspace -- when given, overrides the detected colorspace
    verbose    -- print debugging information to stderr

    The remaining arguments are passed on to ``pdfdoc`` unchanged.
    Unreadable images and monochrome jpegs print an error and exit with
    status 1.
    """
    # first twelve bytes of a JPEG2000 file, compared as bytes so that the
    # check also works in python3 where the file content is a bytes object
    jp2_magic = b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A"
    pagesize_options = pagesize[2]

    pdf = pdfdoc(3, title, author, creator, producer, creationdate,
                 moddate, subject, keywords, nodate)

    for imfilename in images:
        debug_out("Reading %s" % imfilename, verbose)
        try:
            rawdata = imfilename.read()
        except AttributeError:
            # not a file-like object: treat it as a path
            with open(imfilename, "rb") as infile:
                rawdata = infile.read()
        im = cStringIO(rawdata)

        try:
            imgdata = Image.open(im)
        except IOError as e:
            # test if it is a jpeg2000 image
            if rawdata[:12] != jp2_magic:
                error_out("cannot read input image (not jpeg2000)")
                error_out("PIL: %s" % e)
                sys.exit(1)
            # image is jpeg2000
            width, height, ics = parsejp2(rawdata)
            imgformat = "JPEG2000"

            # TODO: read real dpi from input jpeg2000 image
            ndpi = (default_dpi, default_dpi)
            debug_out("input dpi = %d x %d" % ndpi, verbose)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s" % (color), verbose)
            else:
                color = ics
                debug_out("input colorspace = %s" % (ics), verbose)
        else:
            width, height = imgdata.size
            imgformat = imgdata.format

            ndpi = imgdata.info.get("dpi", (default_dpi, default_dpi))
            # in python3, the returned dpi value for some tiff images will
            # not be an integer but a float. To make the behaviour of
            # img2pdf the same between python2 and python3, we convert that
            # float into an integer by rounding
            # search online for the 72.009 dpi problem for more info
            ndpi = (int(round(ndpi[0])), int(round(ndpi[1])))
            debug_out("input dpi = %d x %d" % ndpi, verbose)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s" % (color), verbose)
            else:
                color = imgdata.mode
                if color == "CMYK" and imgformat == "JPEG":
                    # Adobe inverts CMYK JPEGs for some reason, and others
                    # have followed suit as well. Some software assumes the
                    # JPEG is inverted if the Adobe tag (APP14), while other
                    # software assumes all CMYK JPEGs are inverted. I don't
                    # have enough experience with these to know which is
                    # better for images currently in the wild, so I'm going
                    # with the first approach for now.
                    if "adobe" in imgdata.info:
                        color = "CMYK;I"
                debug_out("input colorspace = %s" % (color), verbose)

        debug_out("width x height = %d x %d" % (width, height), verbose)
        debug_out("imgformat = %s" % imgformat, verbose)

        if dpi:
            ndpi = dpi, dpi
            debug_out("input dpi (forced) = %d x %d" % ndpi, verbose)
        elif pagesize_options:
            ndpi = get_ndpi(width, height, pagesize)
            debug_out("calculated dpi (based on pagesize) = %d x %d" % ndpi, verbose)

        # depending on the input format, determine whether to pass the raw
        # image or the zlib compressed color information
        if imgformat == "JPEG" or imgformat == "JPEG2000":
            if color == '1':
                error_out("jpeg can't be monochrome")
                sys.exit(1)
            imgdata = rawdata
        else:
            # because we do not support /CCITTFaxDecode
            if color == '1':
                debug_out("Converting colorspace 1 to L", verbose)
                imgdata = imgdata.convert('L')
                color = 'L'
            elif color in ("RGB", "L", "CMYK", "CMYK;I"):
                debug_out("Colorspace is OK: %s" % color, verbose)
            else:
                debug_out("Converting colorspace %s to RGB" % color, verbose)
                imgdata = imgdata.convert('RGB')
                color = imgdata.mode
            img = imgdata.tobytes()
            # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have
            # the close() method
            try:
                imgdata.close()
            except AttributeError:
                pass
            imgdata = zlib.compress(img)
        im.close()

        if pagesize_options and pagesize_options['exact'][1]:
            # output size exactly to specified dimensions
            # pagesize[0], pagesize[1] already checked in valid_size()
            pdf_x, pdf_y = pagesize[0], pagesize[1]
        else:
            # output size based on dpi; point = 1/72 inch
            pdf_x = 72.0 * width / float(ndpi[0])
            pdf_y = 72.0 * height / float(ndpi[1])

        pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y)

    return pdf.tostring()
2012-03-29 09:08:32 +00:00
2015-03-20 10:37:30 +00:00
def get_ndpi(width, height, pagesize):
    """Return the ``(dpi, dpi)`` pair that scales an image onto a page.

    width, height -- image dimensions in pixels
    pagesize      -- ``(page_width, page_height, options)`` with the page
                     dimensions in points (either may be None) and
                     ``options`` a dict as built by ``valid_size`` or None

    With the ``fill`` option the image covers the whole page; with both
    dimensions given and no option it is fitted into the page; with a
    single dimension that dimension alone determines the dpi.  Without any
    dimension ``default_dpi`` is returned.
    """
    page_w, page_h, options = pagesize[0], pagesize[1], pagesize[2]
    fill = bool(options and options['fill'][1])

    if fill or (page_w and page_h):
        # force float division: in python2 "width / height" on two ints
        # truncates and makes the aspect ratio comparison wrong
        narrower = float(width) / height < float(page_w) / page_h
        # fill: the narrower image is scaled by its width (and overflows
        # vertically); fit "into" page: it is scaled by its height
        if narrower == fill:
            tmp_dpi = 72.0 * width / page_w
        else:
            tmp_dpi = 72.0 * height / page_h
    elif page_w:
        # if width given, calculate dpi based on width
        tmp_dpi = 72.0 * width / page_w
    elif page_h:
        # if height given, calculate dpi based on height
        tmp_dpi = 72.0 * height / page_h
    else:
        tmp_dpi = default_dpi

    return tmp_dpi, tmp_dpi
2014-03-01 03:57:40 +00:00
def positive_float(string):
    """argparse type: convert *string* to a positive, finite float.

    Raises ``argparse.ArgumentTypeError`` for zero, negative, NaN and
    infinite values.  A string that is not a number at all raises the
    ``ValueError`` of ``float()``, which argparse reports itself.
    """
    value = float(string)
    # the chained comparison is False for NaN as well, which a plain
    # "value <= 0" test would let through
    if not 0 < value < float("inf"):
        msg = "%r is not a positive finite number" % string
        raise argparse.ArgumentTypeError(msg)
    return value
def valid_date(string):
    """argparse type: parse *string* into a ``datetime``.

    The following parsers are tried in order and the first success wins:

    1. ISO 8601 style ``YYYY-MM-DD``, ``YYYY-MM-DDTHH:MM`` and
       ``YYYY-MM-DDTHH:MM:SS``
    2. the ``dateutil`` module, if it can be imported
    3. the local ``date --date`` utility, whose epoch output is
       interpreted as UTC

    Raises ``argparse.ArgumentTypeError`` when none of them succeeds.
    """
    # first try parsing in ISO8601 format
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M", "%Y-%m-%dT%H:%M:%S"):
        try:
            return datetime.strptime(string, fmt)
        except ValueError:
            pass

    # then try dateutil (imported under a distinct name so that it does not
    # shadow the module level argparse parser)
    try:
        from dateutil import parser as dateutil_parser
    except ImportError:
        pass
    else:
        try:
            return dateutil_parser.parse(string)
        except (TypeError, ValueError, OverflowError):
            # dateutil signals unparsable input with ValueError
            pass

    # as a last resort, try the local date utility
    try:
        import subprocess
    except ImportError:
        pass
    else:
        try:
            utime = subprocess.check_output(["date", "--date", string, "+%s"])
            return datetime.utcfromtimestamp(int(utime))
        except (subprocess.CalledProcessError, OSError,
                ValueError, OverflowError):
            # date rejected the input, is not installed, or printed
            # something that is not a usable timestamp
            pass

    raise argparse.ArgumentTypeError("cannot parse date: %s" % string)
2014-03-01 03:57:40 +00:00
2015-03-20 10:37:30 +00:00
def get_standard_papersize(string):
    """Translate a paper size name into a size specification string.

    The lookup is case insensitive.  A known name yields ``"NxN^"`` where N
    is the value from the table below; any other input is returned
    lowercased so that it can be parsed as an explicit size.
    """
    # name -> N (the full "width x height" in points is given as a comment)
    sides = {
        "11x17": 792,       # 792x1224
        "ledger": 792,      # 1224x792
        "legal": 612,       # 612x1008
        "letter": 612,      # 612x792
        "arche": 2592,      # 2592x3456
        "archd": 1728,      # 1728x2592
        "archc": 1296,      # 1296x1728
        "archb": 864,       # 864x1296
        "archa": 648,       # 648x864
        "a0": 2380,         # 2380x3368
        "a1": 1684,         # 1684x2380
        "a2": 1190,         # 1190x1684
        "a3": 842,          # 842x1190
        "a4": 595,          # 595x842
        "a5": 421,          # 421x595
        "a6": 297,          # 297x421
        "a7": 210,          # 210x297
        "a8": 148,          # 148x210
        "a9": 105,          # 105x148
        "a10": 74,          # 74x105
        "b0": 2836,         # 2836x4008
        "b1": 2004,         # 2004x2836
        "b2": 1418,         # 1418x2004
        "b3": 1002,         # 1002x1418
        "b4": 709,          # 709x1002
        "b5": 501,          # 501x709
        "c0": 2600,         # 2600x3677
        "c1": 1837,         # 1837x2600
        "c2": 1298,         # 1298x1837
        "c3": 918,          # 918x1298
        "c4": 649,          # 649x918
        "c5": 459,          # 459x649
        "c6": 323,          # 323x459
        "flsa": 612,        # 612x936
        "flse": 612,        # 612x936
        "halfletter": 396,  # 396x612
        "tabloid": 792,     # 792x1224
        "statement": 396,   # 396x612
        "executive": 540,   # 540x720
        "folio": 612,       # 612x936
        "quarto": 610,      # 610x780
    }
    name = string.lower()
    if name not in sides:
        return name
    side = sides[name]
    return "%dx%d^" % (side, side)
2015-03-06 18:29:24 +00:00
def valid_size(string):
    """argparse type: parse a page size specification.

    *string* is either a paper size name known to
    ``get_standard_papersize`` or has the format ``AuxBv#`` (width, unit,
    ``x``, height, unit, options).  Either width or height may be omitted,
    units default to pt.

    Returns ``(x, y, pagesize_options)`` where x and y are the dimensions
    in points (None when omitted) and pagesize_options maps each option
    name to ``[regex, enabled]``.

    Raises ``argparse.ArgumentTypeError`` for malformed input, unknown
    units, a missing dimension together with the ``!`` (exact) option, or
    when both dimensions are missing.
    """
    # conversion factors from units to points
    units = {
        'in': 72.0,
        'cm': 72.0 / 2.54,
        'mm': 72.0 / 25.4,
        'pt': 1.0
    }

    pagesize_options = {
        'exact': [r'\!', False],
        'shrink': [r'\>', False],
        'enlarge': [r'\<', False],
        'fill': [r'\^', False],
        'percent': [r'\%', False],
        'count': [r'\@', False],
    }

    string = get_standard_papersize(string)

    pattern = re.compile(r"""
            ([0-9]*\.?[0-9]*)   # tokens.group(1) == width; may be empty
            ([a-z]*)            # tokens.group(2) == units; may be empty
            x
            ([0-9]*\.?[0-9]*)   # tokens.group(3) == height; may be empty
            ([a-zA-Z]*)         # tokens.group(4) == units; may be empty
            ([^0-9a-zA-Z]*)     # tokens.group(5) == extra options
        """, re.VERBOSE)

    tokens = pattern.match(string)

    # match() returns None when the input contains no usable "x" at all;
    # otherwise tokens.group(0) should match the entire input string
    if tokens is None or tokens.group(0) != string:
        msg = ('Input size needs to be of the format AuxBv#, '
               'where A is width, B is height, u and v are units, '
               '# are options. '
               'You may omit either width or height, but not both. '
               'Units may be specified as (in, cm, mm, pt). '
               'You may omit units, which will default to pt. '
               'Available options include (! = exact ; ^ = fill ; default = into).')
        raise argparse.ArgumentTypeError(msg)

    def to_points(number, unit):
        # convert one "number + unit" pair to points; None if no number
        try:
            value = float(number)
        except ValueError:
            # assign None if width or height not provided
            return None
        if not unit:
            # default to pt because units not provided
            return value
        try:
            return value * units[unit]
        except KeyError:
            msg = "unrecognized unit '%s'." % unit
            raise argparse.ArgumentTypeError(msg)

    x = to_points(tokens.group(1), tokens.group(2))
    y = to_points(tokens.group(3), tokens.group(4))

    # parse options for resize methods
    extra = tokens.group(5)
    if extra:
        for option in pagesize_options.values():
            if re.search(option[0], extra):
                option[1] = True

    if pagesize_options['fill'][1]:
        # if either width or height is not given, try to fill in missing value
        if not x:
            x = y
        elif not y:
            y = x

    if pagesize_options['exact'][1]:
        if not x or not y:
            msg = ('exact size requires both width and height.')
            raise argparse.ArgumentTypeError(msg)

    if not x and not y:
        msg = ('width and height cannot both be omitted.')
        raise argparse.ArgumentTypeError(msg)

    return (x, y, pagesize_options)
2015-03-06 18:29:24 +00:00
2015-03-13 10:43:38 +00:00
# in python3, the received argument will be a unicode str() object which needs
# to be encoded into a bytes() object
# in python2, the received argument will be a binary str() object which needs
# no encoding
# we check whether we use python2 or python3 by checking whether the argument
# is both, type str and type bytes (only the case in python2)
def pdf_embedded_string(string):
    """Encode *string* as the body of a PDF literal text string.

    The text is encoded as big endian UTF-16 with a leading byte order
    mark; afterwards every backslash and parenthesis byte is escaped with
    a backslash.  Python2 byte strings are decoded as UTF-8 first.
    """
    is_py3_text = type(string) is str and type(string) is not bytes
    text = string if is_py3_text else string.decode("utf8")

    encoded = b"\xfe\xff" + text.encode("utf-16-be")
    # the backslash has to be escaped first so that the backslashes added
    # for the parentheses are not escaped again
    for special in (b'\\', b'(', b')'):
        encoded = encoded.replace(special, b'\\' + special)
    return encoded
2014-03-01 03:57:40 +00:00
# command line interface; the parsed values are handed to convert() by main()
parser = argparse.ArgumentParser(
    description='Lossless conversion/embedding of images (in)to pdf')
parser.add_argument(
    'images', metavar='infile', type=str,
    nargs='+', help='input file(s)')
parser.add_argument(
    '-o', '--output', metavar='out', type=argparse.FileType('wb'),
    # in python3 binary data must go to sys.stdout.buffer, which python2's
    # sys.stdout does not have
    default=getattr(sys.stdout, "buffer", sys.stdout),
    help='output file (default: stdout)')

# -d/--dpi and -s/--pagesize are mutually exclusive
sizeopts = parser.add_mutually_exclusive_group()
sizeopts.add_argument(
    '-d', '--dpi', metavar='dpi', type=positive_float,
    help=('dpi for pdf output. '
          'If input image does not specify dpi the default is %.2f. '
          'Must not be used with -s/--pagesize.') % default_dpi
)
sizeopts.add_argument(
    '-s', '--pagesize', metavar='size', type=valid_size,
    default=(None, None, None),
    help=('size of the pdf pages in format AuxBv#, '
          'where A is width, B is height, u and v are units, # are options. '
          'You may omit either width or height, but not both. '
          'Some common page sizes, such as letter and a4, are also recognized. '
          'Units may be specified as (in, cm, mm, pt). '
          'Units default to pt when absent. '
          'Available options include (! = exact ; ^ = fill ; default = into). '
          'Must not be used with -d/--dpi.')
)

parser.add_argument(
    '-t', '--title', metavar='title', type=pdf_embedded_string,
    help='title for metadata')
parser.add_argument(
    '-a', '--author', metavar='author', type=pdf_embedded_string,
    help='author for metadata')
parser.add_argument(
    '-c', '--creator', metavar='creator', type=pdf_embedded_string,
    help='creator for metadata')
parser.add_argument(
    '-p', '--producer', metavar='producer', type=pdf_embedded_string,
    help='producer for metadata')
parser.add_argument(
    '-r', '--creationdate', metavar='creationdate', type=valid_date,
    help='UTC creation date for metadata in YYYY-MM-DD or YYYY-MM-DDTHH:MM or YYYY-MM-DDTHH:MM:SS format or any format understood by python dateutil module or any format understood by `date --date`')
parser.add_argument(
    '-m', '--moddate', metavar='moddate', type=valid_date,
    help='UTC modification date for metadata in YYYY-MM-DD or YYYY-MM-DDTHH:MM or YYYY-MM-DDTHH:MM:SS format or any format understood by python dateutil module or any format understood by `date --date`')
parser.add_argument(
    '-S', '--subject', metavar='subject', type=pdf_embedded_string,
    help='subject for metadata')
parser.add_argument(
    '-k', '--keywords', metavar='kw', type=pdf_embedded_string, nargs='+',
    help='keywords for metadata')
parser.add_argument(
    # the colorspace is compared against plain names such as 'RGB', so it
    # must stay a plain string and must not be UTF-16 encoded like the
    # metadata strings above
    '-C', '--colorspace', metavar='colorspace', type=str,
    help='force PIL colorspace (one of: RGB, L, 1, CMYK, CMYK;I)')
parser.add_argument(
    '-D', '--nodate', help='do not add timestamps', action="store_true")
parser.add_argument(
    '-v', '--verbose', help='verbose mode', action="store_true")
parser.add_argument(
    '-V', '--version', action='version', version='%(prog)s ' + __version__,
    help="Print version information and exit")
2014-03-01 03:57:40 +00:00
def main(args=None):
    """Command line entry point.

    Parses *args* (``sys.argv[1:]`` when None), converts the given images
    and writes the resulting PDF to the selected output.
    """
    argv = sys.argv[1:] if args is None else args
    options = parser.parse_args(argv)

    pdf_bytes = convert(
        options.images, options.dpi, options.pagesize, options.title,
        options.author, options.creator, options.producer,
        options.creationdate, options.moddate, options.subject,
        options.keywords, options.colorspace, options.nodate,
        options.verbose)
    options.output.write(pdf_bytes)


if __name__ == '__main__':
    main()