From 592cdc1cdb8eba77c2cb5ab566e29dbc14f2838c Mon Sep 17 00:00:00 2001
From: xiota <github@mentalfossa.com>
Date: Fri, 20 Mar 2015 03:37:30 -0700
Subject: [PATCH] Changes to pdf page size handling

Changes to `valid_size()`
* accept common page sizes, such as letter and a4.
* parse dimensions of the format AuxBv#, where A is the width, B is the height, u and v are their units, and # are options.
* accept units: in, cm, mm, pt

Changes to `convert()`:
* resize pages based on dpi calculations
* by default, resize images to fit into the page size (like the default resize in imagemagick)
* implement exact resizing (ignores dpi; equivalent to ! in imagemagick)

Created `get_ndpi()`:
* provides dpi for page resizing
* implement fill resizing (equiv to ^ in imagemagick)

Other changes:
* default dpi in global variable
---
 src/img2pdf.py | 258 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 208 insertions(+), 50 deletions(-)

diff --git a/src/img2pdf.py b/src/img2pdf.py
index aafadb3..b9ffd8c 100755
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -18,7 +18,9 @@
 # <http://www.gnu.org/licenses/>.
 
 __version__ = "0.1.6~git"
+default_dpi = 96.0
 
+import re
 import sys
 import zlib
 import argparse
@@ -212,10 +214,12 @@ class pdfdoc(object):
         result += b"%%EOF\n"
         return result
 
-def convert(images, dpi=None, pagesize=(None, None), title=None, author=None,
-            creator=None, producer=None, creationdate=None, moddate=None,
-            subject=None, keywords=None, colorspace=None, nodate=False,
-            verbose=False):
+def convert(images, dpi=None, pagesize=(None, None, None), title=None,
+            author=None, creator=None, producer=None, creationdate=None,
+            moddate=None, subject=None, keywords=None, colorspace=None,
+            nodate=False, verbose=False):
+
+    pagesize_options = pagesize[2]
 
     pdf = pdfdoc(3, title, author, creator, producer, creationdate,
                  moddate, subject, keywords, nodate)
@@ -241,13 +245,9 @@ def convert(images, dpi=None, pagesize=(None, None), title=None, author=None,
             width, height, ics = parsejp2(rawdata)
             imgformat = "JPEG2000"
 
-            if dpi:
-                ndpi = dpi, dpi
-                debug_out("input dpi (forced) = %d x %d"%ndpi, verbose)
-            else:
-                # TODO: read real dpi from input jpeg2000 image
-                ndpi = (96, 96)
-                debug_out("input dpi = %d x %d"%ndpi, verbose)
+            # TODO: read real dpi from input jpeg2000 image
+            ndpi = (default_dpi, default_dpi)
+            debug_out("input dpi = %d x %d" % ndpi, verbose)
 
             if colorspace:
                 color = colorspace
@@ -259,18 +259,14 @@ def convert(images, dpi=None, pagesize=(None, None), title=None, author=None,
             width, height = imgdata.size
             imgformat = imgdata.format
 
-            if dpi:
-                ndpi = dpi, dpi
-                debug_out("input dpi (forced) = %d x %d"%ndpi, verbose)
-            else:
-                ndpi = imgdata.info.get("dpi", (96, 96))
-                # in python3, the returned dpi value for some tiff images will
-                # not be an integer but a float. To make the behaviour of
-                # img2pdf the same between python2 and python3, we convert that
-                # float into an integer by rounding
-                # search online for the 72.009 dpi problem for more info
-                ndpi = (int(round(ndpi[0])),int(round(ndpi[1])))
-                debug_out("input dpi = %d x %d"%ndpi, verbose)
+            ndpi = imgdata.info.get("dpi", (default_dpi, default_dpi))
+            # in python3, the returned dpi value for some tiff images will
+            # not be an integer but a float. To make the behaviour of
+            # img2pdf the same between python2 and python3, we convert that
+            # float into an integer by rounding
+            # search online for the 72.009 dpi problem for more info
+            ndpi = (int(round(ndpi[0])),int(round(ndpi[1])))
+            debug_out("input dpi = %d x %d" % ndpi, verbose)
 
             if colorspace:
                 color = colorspace
@@ -292,6 +288,13 @@ def convert(images, dpi=None, pagesize=(None, None), title=None, author=None,
         debug_out("width x height = %d x %d"%(width,height), verbose)
         debug_out("imgformat = %s"%imgformat, verbose)
 
+        if dpi:
+            ndpi = dpi, dpi
+            debug_out("input dpi (forced) = %d x %d" % ndpi, verbose)
+        elif pagesize_options:
+            ndpi = get_ndpi(width, height, pagesize)
+            debug_out("calculated dpi (based on pagesize) = %d x %d" % ndpi, verbose)
+
         # depending on the input format, determine whether to pass the raw
         # image or the zlib compressed color information
         if imgformat is "JPEG" or imgformat is "JPEG2000":
@@ -320,21 +323,43 @@ def convert(images, dpi=None, pagesize=(None, None), title=None, author=None,
             imgdata = zlib.compress(img)
         im.close()
 
-        # pdf units = 1/72 inch
-        if not pagesize[0] and not pagesize[1]:
-            pdf_x, pdf_y = 72.0*width/float(ndpi[0]), 72.0*height/float(ndpi[1])
-        elif not pagesize[1]:
-            pdf_x, pdf_y = pagesize[0], pagesize[0]*height/float(width)
-        elif not pagesize[0]:
-            pdf_x, pdf_y = pagesize[1]*width/float(height), pagesize[1]
+        if pagesize_options and pagesize_options['exact'][1]:
+            # output size exactly to specified dimensions
+            # pagesize[0], pagesize[1] already checked in valid_size()
+            pdf_x, pdf_y = pagesize[0], pagesize[1]
         else:
-            pdf_x = pagesize[0]
-            pdf_y = pagesize[1]
+            # output size based on dpi; point = 1/72 inch
+            pdf_x, pdf_y = 72.0*width/float(ndpi[0]), 72.0*height/float(ndpi[1])
 
         pdf.addimage(color, width, height, imgformat, imgdata, pdf_x, pdf_y)
 
     return pdf.tostring()
 
+def get_ndpi(width, height, pagesize):
+    """Return the (x, y) dpi that scales a width x height pixel image to
+    pagesize, a (width_pt, height_pt, options) tuple from valid_size()."""
+    page_x, page_y, pagesize_options = pagesize
+    fill = bool(pagesize_options and pagesize_options['fill'][1])
+
+    if page_x and page_y:
+        # Compare aspect ratios by cross-multiplying: width/height is an
+        # integer division for int pixel sizes under python2.
+        narrower = width * page_y < page_x * height
+        # "into" (default) pins the axis that overflows first; "fill"
+        # pins the other one so the page is covered completely.
+        use_width = narrower if fill else not narrower
+    else:
+        # only one side given: calculate dpi based on that side
+        use_width = bool(page_x)
+
+    if use_width:
+        tmp_dpi = 72.0*width/page_x
+    elif page_y:
+        tmp_dpi = 72.0*height/page_y
+    else:
+        tmp_dpi = default_dpi
+
+    return tmp_dpi, tmp_dpi
 
 def positive_float(string):
     value = float(string)
@@ -346,22 +371,142 @@ def positive_float(string):
 def valid_date(string):
     return datetime.strptime(string, "%Y-%m-%dT%H:%M:%S")
 
+def get_standard_papersize(string):
+    """Map a paper name to its size string; return other input lowercased."""
+    papersizes = {  # values use the AxB# syntax parsed by valid_size(); '^' = fill
+        "11x17"       : "792x792^",     # "792x1224",
+        "ledger"      : "792x792^",     # "1224x792",
+        "legal"       : "612x612^",     # "612x1008",
+        "letter"      : "612x612^",     # "612x792",
+        "arche"       : "2592x2592^",   # "2592x3456",
+        "archd"       : "1728x1728^",   # "1728x2592",
+        "archc"       : "1296x1296^",   # "1296x1728",
+        "archb"       : "864x864^",     # "864x1296",
+        "archa"       : "648x648^",     # "648x864",
+        "a0"          : "2380x2380^",   # "2380x3368",
+        "a1"          : "1684x1684^",   # "1684x2380",
+        "a2"          : "1190x1190^",   # "1190x1684",
+        "a3"          : "842x842^",     # "842x1190",
+        "a4"          : "595x595^",     # "595x842",
+        "a5"          : "421x421^",     # "421x595",
+        "a6"          : "297x297^",     # "297x421",
+        "a7"          : "210x210^",     # "210x297",
+        "a8"          : "148x148^",     # "148x210",
+        "a9"          : "105x105^",     # "105x148",
+        "a10"         : "74x74^",       # "74x105",
+        "b0"          : "2836x2836^",   # "2836x4008",
+        "b1"          : "2004x2004^",   # "2004x2836",
+        "b2"          : "1418x1418^",   # "1418x2004",
+        "b3"          : "1002x1002^",   # "1002x1418",
+        "b4"          : "709x709^",     # "709x1002",
+        "b5"          : "501x501^",     # "501x709",
+        "c0"          : "2600x2600^",   # "2600x3677",
+        "c1"          : "1837x1837^",   # "1837x2600",
+        "c2"          : "1298x1298^",   # "1298x1837",
+        "c3"          : "918x918^",     # "918x1298",
+        "c4"          : "649x649^",     # "649x918",
+        "c5"          : "459x459^",     # "459x649",
+        "c6"          : "323x323^",     # "323x459",
+        "flsa"        : "612x612^",     # "612x936",
+        "flse"        : "612x612^",     # "612x936",
+        "halfletter"  : "396x396^",     # "396x612",
+        "tabloid"     : "792x792^",     # "792x1224",
+        "statement"   : "396x396^",     # "396x612",
+        "executive"   : "540x540^",     # "540x720",
+        "folio"       : "612x612^",     # "612x936",
+        "quarto"      : "610x610^",     # "610x780"
+    }  # NOTE(review): trailing comments look like the full WxH in points - confirm
+    string = string.lower()
+    return papersizes.get(string, string)
+
 def valid_size(string):
-    tokens = string.split('x')
-    if len(tokens) != 2:
-        msg = "input size needs to be of the format Ax, xB or AxB with A and B being integers"
+    """Parse a page size of the form AuxBv# or a paper name (a4, letter).
+
+    Returns (width, height, options): sizes are in points (None if
+    omitted); options maps each option name to [regex, enabled].
+    Raises argparse.ArgumentTypeError on malformed or incomplete input.
+    """
+    # conversion factors from units to points
+    units = {
+        'in'  : 72.0,
+        'cm'  : 72.0/2.54,
+        'mm'  : 72.0/25.4,
+        'pt' : 1.0
+    }
+
+    pagesize_options = {
+        'exact'  : [r'\!', False],
+        'shrink'  : [r'\>', False],
+        'enlarge' : [r'\<', False],
+        'fill'    : [r'\^', False],
+        'percent' : [r'\%', False],
+        'count'   : [r'\@', False],
+    }
+
+    string = get_standard_papersize(string)
+
+    pattern = re.compile(r"""
+            ([0-9]*\.?[0-9]*)   # tokens.group(1) == width; may be empty
+            ([a-z]*)            # tokens.group(2) == units; may be empty
+            x
+            ([0-9]*\.?[0-9]*)   # tokens.group(3) == height; may be empty
+            ([a-zA-Z]*)         # tokens.group(4) == units; may be empty
+            ([^0-9a-zA-Z]*)     # tokens.group(5) == extra options
+        """, re.VERBOSE)
+
+    tokens = pattern.match(string)
+    # reject input without an 'x' (no match) or with unparsed leftovers
+    if tokens is None or tokens.group(0) != string:
+        msg = ('Input size needs to be of the format AuxBv#, '
+            'where A is width, B is height, u and v are units, '
+            '# are options.  '
+            'You may omit either width or height, but not both.  '
+            'Units may be specified as (in, cm, mm, pt).  '
+            'You may omit units, which will default to pt.  '
+            'Available options include (! = exact ; ^ = fill ; default = into).')
         raise argparse.ArgumentTypeError(msg)
-    x = tokens[0]
-    y = tokens[1]
-    if x == '':
-        x = None
-    else:
-        x = int(x)
-    if y == '':
-        y = None
-    else:
-        y = int(y)
-    return (x,y)
+
+    # temporary list to loop through to process width and height
+    pagesize_size = {
+        'x' : [0, tokens.group(1), tokens.group(2)],
+        'y' : [0, tokens.group(3), tokens.group(4)]
+    }
+
+    for value in pagesize_size.values():
+        try:
+            value[0] = float(value[1])
+            value[0] *= units[value[2]]     # convert to points
+        except ValueError:
+            # assign None if width or height not provided
+            value[0] = None
+        except KeyError:
+            # if units unrecognized, raise error
+            # otherwise default to pt because units not provided
+            if value[2]:
+                msg = "unrecognized unit '%s'." % value[2]
+                raise argparse.ArgumentTypeError(msg)
+
+    x = pagesize_size['x'][0]
+    y = pagesize_size['y'][0]
+
+    # parse options for resize methods
+    for value in pagesize_options.values():
+        if re.search(value[0], tokens.group(5)):
+            value[1] = True
+
+    if pagesize_options['fill'][1]:
+        # fill needs both sides: mirror the given one onto the missing one
+        x, y = x or y, y or x
+
+    if pagesize_options['exact'][1] and not (x and y):
+        msg = ('exact size requires both width and height.')
+        raise argparse.ArgumentTypeError(msg)
+
+    if not x and not y:
+        msg = ('width and height cannot both be omitted.')
+        raise argparse.ArgumentTypeError(msg)
+
+    return (x, y, pagesize_options)
 
 parser = argparse.ArgumentParser(
     description='Lossless conversion/embedding of images (in)to pdf')
@@ -370,16 +515,29 @@ parser.add_argument(
     nargs='+', help='input file(s)')
 parser.add_argument(
     '-o', '--output', metavar='out', type=argparse.FileType('wb'),
-    default=getattr(sys.stdout, "buffer", sys.stdout), help='output file (default: stdout)')
+    default=getattr(sys.stdout, "buffer", sys.stdout),
+    help='output file (default: stdout)')
 
 sizeopts = parser.add_mutually_exclusive_group()
 sizeopts.add_argument(
     '-d', '--dpi', metavar='dpi', type=positive_float,
-    help='dpi for pdf output. If input image does not specify dpi the default is 96.0. Must not be specified together with -s/--pagesize.')
+    help=('dpi for pdf output. '
+        'If input image does not specify dpi the default is %.2f.  '
+        'Must not be used with -s/--pagesize.') % default_dpi
+)
+
 sizeopts.add_argument(
     '-s', '--pagesize', metavar='size', type=valid_size,
-    default=(None, None),
-    help='size of the pages in the pdf output in format AxB with A and B being width and height of the page in points. You can omit either one of them. Must not be specified together with -d/--dpi.')
+    default=(None, None, None),
+    help=('size of the pdf pages in format AuxBv#, '
+        'where A is width, B is height, u and v are units, # are options. '
+        'You may omit either width or height, but not both.  '
+        'Some common page sizes, such as letter and a4, are also recognized.  '
+        'Units may be specified as (in, cm, mm, pt).  '
+        'Units default to pt when absent.  '
+        'Available options include (! = exact ; ^ = fill ; default = into).  '
+        'Must not be used with -d/--dpi.')
+)
 
 parser.add_argument(
     '-t', '--title', metavar='title', type=str,