2014-07-26 14:12:40 +00:00
#!/usr/bin/env python2
2014-03-30 06:10:12 +00:00
# Copyright (C) 2012-2014 Johannes 'josch' Schauer <j.schauer at email.de>
2013-05-02 06:17:13 +00:00
#
2014-03-30 06:10:12 +00:00
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation, either
# version 3 of the License, or (at your option) any later
# version.
2013-05-02 06:17:13 +00:00
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
2014-03-30 06:10:12 +00:00
# You should have received a copy of the GNU General Public
# License along with this program. If not, see
# <http://www.gnu.org/licenses/>.
2012-03-29 09:08:32 +00:00
import sys
import zlib
import argparse
2012-03-29 09:53:57 +00:00
import struct
2014-03-01 04:51:53 +00:00
from PIL import Image
2012-03-29 09:08:32 +00:00
from datetime import datetime
2013-10-21 13:55:47 +00:00
from jp2 import parsejp2
2015-02-16 06:39:07 +00:00
try :
from cStringIO import cStringIO
except ImportError :
from io import BytesIO as cStringIO
2012-03-29 09:08:32 +00:00
2014-03-01 04:51:53 +00:00
# XXX: Switch to using the logging module.
def debug_out(message, verbose=True):
    """Write a debug line ("D: <message>") to stderr when verbose is true."""
    if verbose:
        sys.stderr.write("D: " + message + "\n")
def error_out(message):
    """Write an error line ("E: <message>") to stderr."""
    sys.stderr.write("E: " + message + "\n")
def warning_out(message):
    """Write a warning line ("W: <message>") to stderr."""
    sys.stderr.write("W: " + message + "\n")
2012-03-29 09:08:32 +00:00
def parse(cont, indent=1):
    """Serialize a Python value into PDF object syntax and return bytes.

    Supported values:
      dict  -> "<< ... >>" dictionary, keys sorted, one entry per line,
               indented by 4 spaces per nesting level
      bool  -> "true" / "false"
      int   -> decimal integer
      float -> fixed point number with four decimals
      str   -> the text encoded as UTF-8, emitted verbatim
      bytes -> emitted verbatim
      list  -> "[ a b c ]" array
      obj   -> indirect reference "N 0 R"

    indent is the nesting depth of the value and only affects how
    dictionaries are laid out.

    Raises TypeError for any other type instead of silently returning None.
    """
    if isinstance(cont, dict):
        pad = 4 * indent * b" "
        entries = [pad + key.encode("utf8") + b" " + parse(value, indent + 1)
                   for key, value in sorted(cont.items())]
        return (b"<<\n" + b"\n".join(entries) + b"\n" +
                4 * (indent - 1) * b" " + b">>")
    # bool must be tested before int because bool is a subclass of int
    if isinstance(cont, bool):
        return b"true" if cont else b"false"
    if isinstance(cont, int):
        return str(cont).encode("utf8")
    if isinstance(cont, float):
        return ("%0.4f" % cont).encode("utf8")
    if isinstance(cont, str):
        return cont.encode("utf8")
    if isinstance(cont, bytes):
        return cont
    if isinstance(cont, list):
        return b"[ " + b" ".join([parse(c, indent) for c in cont]) + b" ]"
    if isinstance(cont, obj):
        return ("%d 0 R" % cont.identifier).encode("utf8")
    raise TypeError(
        "cannot serialize value of type %s" % type(cont).__name__)
2012-03-29 09:08:32 +00:00
2014-03-01 03:57:40 +00:00
class obj(object):
    """A single indirect PDF object: a content value plus an optional stream.

    The identifier attribute needed for serialization is not set here; it is
    assigned by pdfdoc.addobj when the object is registered with a document.
    """

    def __init__(self, content, stream=None):
        """Store the content (any value parse() accepts) and the raw stream
        bytes, or None for an object without a stream."""
        self.content = content
        self.stream = stream

    def tostring(self):
        """Return the "N 0 obj ... endobj" serialization as bytes."""
        head = (("%d 0 obj " % self.identifier).encode("utf8") +
                parse(self.content))
        # compare against None so that an empty (zero length) stream is still
        # written as a stream instead of degrading to a plain dictionary
        if self.stream is None:
            return head + b" endobj\n"
        return head + b"\nstream\n" + self.stream + b"\nendstream\nendobj\n"
2012-03-29 09:08:32 +00:00
2014-03-01 03:57:40 +00:00
class pdfdoc(object):
    """Minimal PDF writer that places exactly one image on each page.

    The document is kept as a flat list of obj instances (self.objects);
    tostring() serializes them together with the cross-reference table and
    the trailer.
    """

    def __init__(self, version=3, title=None, author=None, creator=None,
                 producer=None, creationdate=None, moddate=None, subject=None,
                 keywords=None, nodate=False):
        """Create the catalog, the page tree and the /Info dictionary.

        version is the minor PDF version (3 means PDF 1.3).  title, author,
        creator, producer and subject are metadata strings and keywords is a
        list of strings that gets joined with commas.  creationdate and
        moddate are datetime objects; when one of them is missing the current
        local time is used instead, unless nodate is true in which case the
        entry is omitted.
        """
        self.version = version  # default pdf version 1.3
        now = datetime.now()
        self.objects = []

        info = {}
        for key, text in (("/Title", title), ("/Author", author),
                          ("/Creator", creator), ("/Producer", producer)):
            if text:
                info[key] = self._pdfstring(text)
        if creationdate:
            info["/CreationDate"] = self._pdfdate(creationdate)
        elif not nodate:
            info["/CreationDate"] = self._pdfdate(now)
        if moddate:
            info["/ModDate"] = self._pdfdate(moddate)
        elif not nodate:
            info["/ModDate"] = self._pdfdate(now)
        if subject:
            info["/Subject"] = self._pdfstring(subject)
        if keywords:
            info["/Keywords"] = self._pdfstring(",".join(keywords))
        self.info = obj(info)

        # create an incomplete pages object so that a /Parent entry can be
        # added to each page
        self.pages = obj({
            "/Type": "/Pages",
            "/Kids": [],
            "/Count": 0
        })

        self.catalog = obj({
            "/Pages": self.pages,
            "/Type": "/Catalog"
        })
        self.addobj(self.catalog)
        self.addobj(self.pages)

    @staticmethod
    def _pdfstring(text):
        """Return text as a PDF literal string "(...)".

        Backslashes and parentheses are escaped so that metadata containing
        them (for example an unbalanced ")") cannot terminate the string
        early and corrupt the /Info dictionary.
        """
        # the backslash has to be handled first, otherwise the backslashes
        # introduced for the parentheses would be escaped a second time
        for char in ("\\", "(", ")"):
            text = text.replace(char, "\\" + char)
        return "(" + text + ")"

    @staticmethod
    def _pdfdate(moment):
        """Return the datetime moment as a PDF date string "(D:YYYYMMDDHHMMSS)"."""
        return "(D:" + moment.strftime("%Y%m%d%H%M%S") + ")"

    def addobj(self, obj):
        """Append obj to the document and give it the next free identifier
        (identifiers start at 1)."""
        newid = len(self.objects) + 1
        obj.identifier = newid
        self.objects.append(obj)

    def addimage(self, color, width, height, imgformat, imgdata, pdf_x, pdf_y):
        """Append a page of pdf_x by pdf_y points showing a single image.

        color is one of 'L', 'RGB', 'CMYK' or 'CMYK;I' (CMYK with all four
        channels inverted).  width and height are the pixel dimensions.
        imgformat selects the stream filter: "JPEG" and "JPEG2000" data is
        embedded as is, anything else must already be zlib compressed.
        imgdata holds the bytes of the image stream.

        An unsupported color value is reported on stderr and terminates the
        program with exit status 1.
        """
        colorspaces = {
            'L': "/DeviceGray",
            'RGB': "/DeviceRGB",
            'CMYK': "/DeviceCMYK",
            'CMYK;I': "/DeviceCMYK",
        }
        colorspace = colorspaces.get(color)
        if colorspace is None:
            error_out("unsupported color space: %s" % color)
            sys.exit(1)

        if pdf_x < 3.00 or pdf_y < 3.00:
            warning_out(
                "pdf width or height is below 3.00 - decrease the dpi")

        # either embed the whole jpeg or deflate the bitmap representation
        # (the formats are compared by value: identity comparison of strings
        # only works by accident of interning)
        if imgformat == "JPEG":
            ofilter = ["/DCTDecode"]
        elif imgformat == "JPEG2000":
            ofilter = ["/JPXDecode"]
            self.version = 5  # jpeg2000 needs pdf 1.5
        else:
            ofilter = ["/FlateDecode"]

        image = obj({
            "/Type": "/XObject",
            "/Subtype": "/Image",
            "/Filter": ofilter,
            "/Width": width,
            "/Height": height,
            "/ColorSpace": colorspace,
            # hardcoded as PIL doesnt provide bits for non-jpeg formats
            "/BitsPerComponent": 8,
            "/Length": len(imgdata)
        }, imgdata)

        if color == 'CMYK;I':
            # Inverts all four channels
            image.content['/Decode'] = [
                1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]

        # content stream: scale the unit square to the page size and paint
        # the image into it
        text = ("q\n%0.4f 0 0 %0.4f 0 0 cm\n/Im0 Do\nQ"
                % (pdf_x, pdf_y)).encode('utf8')

        content = obj({
            "/Length": len(text)
        }, text)

        page = obj({
            "/Type": "/Page",
            "/Parent": self.pages,
            "/Resources": {
                "/XObject": {
                    "/Im0": image
                }
            },
            "/MediaBox": [0, 0, pdf_x, pdf_y],
            "/Contents": content
        })
        self.pages.content["/Kids"].append(page)
        self.pages.content["/Count"] += 1
        self.addobj(page)
        self.addobj(content)
        self.addobj(image)

    def tostring(self):
        """Serialize the whole document and return it as bytes."""
        # register the info object after everything added so far, but only
        # once so that calling tostring() again does not append a duplicate
        if not any(o is self.info for o in self.objects):
            self.addobj(self.info)

        header = ("%%PDF-1.%d\n" % self.version).encode("utf8")
        # collect the pieces and join once instead of growing a bytes object
        chunks = [header]
        offset = len(header)

        # the first entry of the table is the one for object 0, marked free
        xreftable = [b"0000000000 65535 f \n"]
        for o in self.objects:
            xreftable.append(("%010d 00000 n \n" % offset).encode("utf8"))
            data = o.tostring()
            chunks.append(data)
            offset += len(data)

        xrefoffset = offset
        chunks.append(b"xref\n")
        chunks.append(("0 %d\n" % len(xreftable)).encode("utf8"))
        chunks.extend(xreftable)
        chunks.append(b"trailer\n")
        chunks.append(parse({
            "/Size": len(xreftable),
            "/Info": self.info,
            "/Root": self.catalog}) + b"\n")
        chunks.append(b"startxref\n")
        chunks.append(("%d\n" % xrefoffset).encode("utf8"))
        # plain bytes literal (no % formatting): both percent signs are
        # written, giving the "%%EOF" end-of-file marker
        chunks.append(b"%%EOF\n")
        return b"".join(chunks)
2012-03-29 09:08:32 +00:00
2015-03-06 18:29:24 +00:00
def convert(images, dpi=None, pagesize=(None, None), title=None, author=None,
            creator=None, producer=None, creationdate=None, moddate=None,
            subject=None, keywords=None, colorspace=None, nodate=False,
            verbose=False):
    """Convert the given images into a PDF with one page per image.

    images is an iterable of file names or of objects with a read() method
    returning the raw image bytes.  dpi, when given, overrides the
    resolution stored in the images.  pagesize is a (width, height) tuple in
    points; either value may be None, in which case it is derived from the
    other one and the aspect ratio of the image, or from the dpi when both
    are None.  colorspace forces the colorspace instead of using the one
    detected in the image.  The remaining arguments are document metadata
    and are passed on to pdfdoc.  Debug messages are only printed when
    verbose is true.

    Returns the PDF document as bytes.  Unreadable input and unsupported
    colorspaces are reported on stderr and terminate the program with exit
    status 1.
    """
    # signature box that starts every JPEG2000 (jp2) file; must be bytes so
    # that the comparison with the raw data also works on python3
    jp2_signature = b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A"

    pdf = pdfdoc(3, title, author, creator, producer, creationdate,
                 moddate, subject, keywords, nodate)

    for imfilename in images:
        debug_out("Reading %s" % imfilename, verbose)
        # objects without a read() method are treated as file names; only
        # that case is caught so that real read errors are not hidden
        try:
            rawdata = imfilename.read()
        except AttributeError:
            with open(imfilename, "rb") as infile:
                rawdata = infile.read()
        im = cStringIO(rawdata)

        try:
            image = Image.open(im)
        except IOError as e:
            # test if it is a jpeg2000 image
            if rawdata[:12] != jp2_signature:
                error_out("cannot read input image (not jpeg2000)")
                error_out("PIL: %s" % e)
                sys.exit(1)
            # image is jpeg2000
            width, height, ics = parsejp2(rawdata)
            imgformat = "JPEG2000"

            if dpi:
                ndpi = dpi, dpi
                debug_out("input dpi (forced) = %d x %d" % ndpi, verbose)
            else:
                # TODO: read real dpi from input jpeg2000 image
                ndpi = (96, 96)
                debug_out("input dpi = %d x %d" % ndpi, verbose)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s" % (color),
                          verbose)
            else:
                color = ics
                debug_out("input colorspace = %s" % (ics), verbose)
        else:
            width, height = image.size
            imgformat = image.format

            if dpi:
                ndpi = dpi, dpi
                debug_out("input dpi (forced) = %d x %d" % ndpi, verbose)
            else:
                ndpi = image.info.get("dpi", (96, 96))
                # in python3, the returned dpi value for some tiff images will
                # not be an integer but a float. To make the behaviour of
                # img2pdf the same between python2 and python3, we convert that
                # float into an integer by rounding
                # search online for the 72.009 dpi problem for more info
                ndpi = (int(round(ndpi[0])), int(round(ndpi[1])))
                # a stored dpi of zero (or less) would cause a division by
                # zero below, so fall back to the default in that case
                if ndpi[0] <= 0 or ndpi[1] <= 0:
                    ndpi = (96, 96)
                debug_out("input dpi = %d x %d" % ndpi, verbose)

            if colorspace:
                color = colorspace
                debug_out("input colorspace (forced) = %s" % (color),
                          verbose)
            else:
                color = image.mode
                if color == "CMYK" and imgformat == "JPEG":
                    # Adobe inverts CMYK JPEGs for some reason, and others
                    # have followed suit as well. Some software assumes the
                    # JPEG is inverted if the Adobe tag (APP14), while other
                    # software assumes all CMYK JPEGs are inverted. I don't
                    # have enough experience with these to know which is
                    # better for images currently in the wild, so I'm going
                    # with the first approach for now.
                    if "adobe" in image.info:
                        color = "CMYK;I"
                debug_out("input colorspace = %s" % (color), verbose)

        debug_out("width x height = %d x %d" % (width, height), verbose)
        debug_out("imgformat = %s" % imgformat, verbose)

        # depending on the input format, determine whether to pass the raw
        # image or the zlib compressed color information
        if imgformat in ("JPEG", "JPEG2000"):
            if color == '1':
                error_out("jpeg can't be monochrome")
                sys.exit(1)
            imgdata = rawdata
        else:
            # because we do not support /CCITTFaxDecode
            if color == '1':
                debug_out("Converting colorspace 1 to L", verbose)
                image = image.convert('L')
                color = 'L'
            elif color in ("RGB", "L", "CMYK", "CMYK;I"):
                debug_out("Colorspace is OK: %s" % color, verbose)
            else:
                debug_out("Converting colorspace %s to RGB" % color, verbose)
                image = image.convert('RGB')
                color = image.mode
            img = image.tobytes()
            image.close()
            imgdata = zlib.compress(img)
        im.close()

        # pdf units = 1/72 inch
        if not pagesize[0] and not pagesize[1]:
            pdf_x = 72.0 * width / float(ndpi[0])
            pdf_y = 72.0 * height / float(ndpi[1])
        elif not pagesize[1]:
            # only the width is given: keep the aspect ratio of the image
            pdf_x = pagesize[0]
            pdf_y = pagesize[0] * height / float(width)
        elif not pagesize[0]:
            # only the height is given: keep the aspect ratio of the image
            pdf_x = pagesize[1] * width / float(height)
            pdf_y = pagesize[1]
        else:
            pdf_x = pagesize[0]
            pdf_y = pagesize[1]

        pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y)

    return pdf.tostring()
2012-03-29 09:08:32 +00:00
2014-03-01 03:57:40 +00:00
def positive_float(string):
    """argparse type: parse string as a positive, finite float.

    Raises argparse.ArgumentTypeError when the value is zero, negative,
    infinite or not a number.  A string that is not a float at all makes
    float() raise ValueError.
    """
    value = float(string)
    # nan compares unequal to itself; nan and inf would both pass a plain
    # "value <= 0" test and lead to unusable page dimensions later on
    if value != value or value in (float("inf"), float("-inf")):
        msg = "%r is not a finite number" % string
        raise argparse.ArgumentTypeError(msg)
    if value <= 0:
        msg = "%r is not positive" % string
        raise argparse.ArgumentTypeError(msg)
    return value
def valid_date(string):
    """argparse type: parse a YYYY-MM-DDTHH:MM:SS timestamp into a datetime.

    Raises ValueError when string does not match that format.
    """
    return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S")
2015-03-06 18:29:24 +00:00
def valid_size(string):
    """argparse type: parse a page size given as "Ax", "xB" or "AxB".

    Returns a (width, height) tuple of ints in which an omitted dimension is
    None.  Raises argparse.ArgumentTypeError when string does not consist of
    exactly two 'x'-separated fields or when a field is not a positive
    integer.
    """
    msg = ("input size needs to be of the format Ax, xB or AxB with A and B "
           "being positive integers")
    tokens = string.split('x')
    if len(tokens) != 2:
        raise argparse.ArgumentTypeError(msg)
    size = []
    for token in tokens:
        if token == '':
            # dimension left out, to be derived from the image aspect ratio
            size.append(None)
            continue
        try:
            value = int(token)
        except ValueError:
            raise argparse.ArgumentTypeError(msg)
        if value <= 0:
            raise argparse.ArgumentTypeError(msg)
        size.append(value)
    return tuple(size)
2014-03-01 03:57:40 +00:00
# command line interface; the option values are handed to convert() by main()
parser = argparse.ArgumentParser(
    description='Lossless conversion/embedding of images (in)to pdf')
parser.add_argument(
    'images', metavar='infile', type=str,
    nargs='+', help='input file(s)')
parser.add_argument(
    '-o', '--output', metavar='out', type=argparse.FileType('wb'),
    # python3 needs the underlying binary buffer of stdout to write bytes,
    # python2 has no such attribute and writes to stdout directly
    default=getattr(sys.stdout, "buffer", sys.stdout),
    help='output file (default: stdout)')

# the page dimensions are derived either from the dpi or from an explicit
# page size, never from both
sizeopts = parser.add_mutually_exclusive_group()
sizeopts.add_argument(
    '-d', '--dpi', metavar='dpi', type=positive_float,
    help='dpi for pdf output. If input image does not specify dpi the default is 96.0. Must not be specified together with -s/--pagesize.')
sizeopts.add_argument(
    '-s', '--pagesize', metavar='size', type=valid_size,
    default=(None, None),
    help='size of the pages in the pdf output in format AxB with A and B being width and height of the page in points. You can omit either one of them. Must not be specified together with -d/--dpi.')

parser.add_argument(
    '-t', '--title', metavar='title', type=str,
    help='title for metadata')
parser.add_argument(
    '-a', '--author', metavar='author', type=str,
    help='author for metadata')
parser.add_argument(
    '-c', '--creator', metavar='creator', type=str,
    help='creator for metadata')
parser.add_argument(
    '-p', '--producer', metavar='producer', type=str,
    help='producer for metadata')
parser.add_argument(
    '-r', '--creationdate', metavar='creationdate', type=valid_date,
    help='creation date for metadata in YYYY-MM-DDTHH:MM:SS format')
parser.add_argument(
    '-m', '--moddate', metavar='moddate', type=valid_date,
    help='modification date for metadata in YYYY-MM-DDTHH:MM:SS format')
parser.add_argument(
    '-S', '--subject', metavar='subject', type=str,
    help='subject for metadata')
parser.add_argument(
    '-k', '--keywords', metavar='kw', type=str, nargs='+',
    help='keywords for metadata')
parser.add_argument(
    '-C', '--colorspace', metavar='colorspace', type=str,
    help='force PIL colorspace (one of: RGB, L, 1, CMYK, CMYK;I)')
parser.add_argument(
    '-D', '--nodate', help='do not add timestamps', action="store_true")
parser.add_argument(
    '-v', '--verbose', help='verbose mode', action="store_true")
def main(args=None):
    """Parse the command line and write the resulting PDF to the output.

    args is the list of command line arguments without the program name;
    sys.argv[1:] is used when it is None.
    """
    argv = sys.argv[1:] if args is None else args
    args = parser.parse_args(argv)

    pdfbytes = convert(
        args.images, args.dpi, args.pagesize, args.title, args.author,
        args.creator, args.producer, args.creationdate, args.moddate,
        args.subject, args.keywords, args.colorspace, args.nodate,
        args.verbose)
    args.output.write(pdfbytes)
2014-03-14 18:13:03 +00:00
# run the command line interface only when executed as a script
if __name__ == '__main__':
    main()