#! /usr/bin/env python3
# SPDX-FileCopyrightText: 2021 mara004
# SPDX-License-Identifier: MPL-2.0

"""Extract the images embedded in a PDF document.

Images are located by walking the ``/Resources`` -> ``/XObject`` entries of
each requested page. Images that carry an ``/SMask`` are composited with that
mask before being written out.
"""

import os
from os.path import join
import sys
import argparse

import pikepdf
from pikepdf import PdfImage
from PIL import Image


def find_images(container, depth=0):
    """Collect the image XObjects referenced by *container*.

    *container* is any object that supports ``'/Resources' in container`` and
    ``container['/Resources']`` (a page, or a form XObject on recursion).

    Returns a list whose entries are either a bare image object, or a
    two-element list ``[image, soft_mask]`` when the image has an ``/SMask``.
    Form XObjects are descended into only from the top level (``depth <= 0``),
    so forms nested inside forms are not searched.
    """
    # cf. https://github.com/jbarlow83/OCRmyPDF/blob/master/src/ocrmypdf/pdfinfo/info.py

    if '/Resources' not in container:
        return []
    resources = container['/Resources']
    if '/XObject' not in resources:
        return []

    xobjects = resources['/XObject'].as_dict()
    allow_recursion = depth <= 0

    images = []
    for candidate in xobjects.values():
        subtype = candidate['/Subtype']
        if subtype == '/Image':
            if '/SMask' in candidate:
                images.append([candidate, candidate['/SMask']])
            else:
                images.append(candidate)
        elif allow_recursion and subtype == '/Form':
            images.extend(find_images(candidate, depth=depth + 1))

    return images


def get_raw_images(pdfdoc, page_numbers):
    """Return the images found on the given pages, without duplicates.

    *page_numbers* are used directly as indices into ``pdfdoc.pages``.
    An image that is referenced from several pages is returned only once;
    the order of first appearance is preserved.
    """
    raw_images = []
    for index in page_numbers:
        for img in find_images(pdfdoc.pages[index]):
            # Membership test instead of a set: the entries may be lists
            # (image + soft mask), which are not hashable.
            if img not in raw_images:
                raw_images.append(img)
    return raw_images


def _apply_soft_mask(raw_image, raw_mask):
    """Composite an image with its ``/SMask`` and return a PIL image.

    The base image is pasted onto an all-zero ``LA`` or ``RGBA`` canvas using
    the soft mask as the compositing mask.
    """
    # QUESTION is there a way to losslessly apply the /SMask with PIL ?
    base_image = PdfImage(raw_image).as_pil_image()
    soft_mask = PdfImage(raw_mask).as_pil_image()

    if base_image.size != soft_mask.size:
        print(
            "Warning: Image and /SMask have a different size. \nThis is unexpected.",
            file=sys.stderr,
        )
        soft_mask = soft_mask.resize(base_image.size)

    if base_image.mode in ('L', 'LA'):
        transparency = Image.new('LA', base_image.size, (0, 0))
    else:
        if base_image.mode not in ('RGB', 'RGBA'):
            base_image = base_image.convert('RGB')
        transparency = Image.new('RGBA', base_image.size, (0, 0, 0, 0))

    # imagemagick: ``convert base_image.png \( soft_mask.png \) -compose copy-opacity -composite output.png``
    return Image.composite(base_image, transparency, soft_mask)


def extract_all_images(
    pdfdoc,
    output_directory,
    page_indices = None,
    prefix = None,
    filename = None,
    forward_images = False,
):
    """Extract the images of *pdfdoc* and save or return them.

    Parameters:
        pdfdoc: Opened PDF document; its ``pages`` attribute is indexed.
        output_directory: Directory for the extracted files. It is created
            (including missing parents) if it does not exist.
        page_indices: Indices into ``pdfdoc.pages`` to search. ``None`` or an
            empty list/tuple means all pages.
        prefix: String placed before the serial number (an underscore is
            appended). Takes precedence over *filename*.
        filename: Path of the source document; when *prefix* is ``None`` its
            base name without extension (spaces and underscores replaced by
            hyphens) is used as prefix. If both are ``None``, ``pdf-image_``
            is used.
        forward_images: If true, nothing is written and a list of PIL images
            is returned instead.

    Returns:
        A list of PIL images if *forward_images* is true, otherwise ``True``
        after all files have been written.
    """
    print(output_directory)
    if not os.path.isdir(output_directory):
        # makedirs so that a nested, not yet existing path works as well.
        os.makedirs(output_directory, exist_ok=True)

    if (page_indices is None) or (isinstance(page_indices, (tuple, list)) and len(page_indices) == 0):
        real_indices = list(range(len(pdfdoc.pages)))
    else:
        real_indices = page_indices

    raw_images = get_raw_images(pdfdoc, real_indices)

    # Entries are either PdfImage objects (plain images) or already-rendered
    # PIL images (images that had a soft mask applied).
    pdfimages = []
    for raw in raw_images:
        if isinstance(raw, list):
            pdfimages.append(_apply_soft_mask(raw[0], raw[1]))
        else:
            pdfimages.append(PdfImage(raw))

    n_images = len(pdfimages)
    print(f"found {n_images} image(s)")
    print(pdfimages)

    if forward_images:
        # Soft-masked entries are PIL images already and have no
        # ``as_pil_image`` method, so only convert the PdfImage entries.
        return [
            im if isinstance(im, Image.Image) else im.as_pil_image()
            for im in pdfimages
        ]

    if prefix is not None:
        real_prefix = prefix + '_'
    elif filename is not None:
        # splitext instead of a fixed [:-4] slice, so extensions of any
        # length (or no extension at all) are handled correctly.
        stem = os.path.splitext(os.path.basename(filename))[0]
        real_prefix = stem.replace(' ', '-').replace('_', '-') + '_'
    else:
        real_prefix = 'pdf-image_'

    # Zero-pad the serial numbers to the width of the largest one.
    n_digits = len(str(n_images))

    for i, image in enumerate(pdfimages):
        filepath_prefix = join(output_directory, real_prefix + f"{i+1:0{n_digits}}")
        if isinstance(image, Image.Image):
            image.save(filepath_prefix + '.png', 'PNG')
        else:
            # PdfImage appends a file extension to the prefix itself.
            image.extract_to(fileprefix=filepath_prefix)

    return True


def parse_args(args=None):
    """Parse the command-line arguments.

    *args* is the list of argument strings; ``None`` (the default) makes
    argparse read the current ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description = "Extract images from a PDF document using an algorithm that analyses the page resource tree. Alpha masks will be applied automatically. Quality loss or increased file size can be possible side effects of the PDF writer library or the extraction process."
    )
    parser.add_argument(
        '--input', '-i',
        required = True,
        help = "PDF Document from which to extract images.",
    )
    parser.add_argument(
        '--output-dir', '-o',
        dest = 'output',
        required = True,
        help = "Output directory where the serially numbered images shall be placed.",
    )
    parser.add_argument(
        '--pages', '-p',
        help = "Numbers of the pages in which to search for images. Defaults to all pages. \nPlease see the PageTextParser Guide for information on the syntax."
    )
    parser.add_argument(
        '--prefix', '-r',
        help = "String to prepend to the serial number. By default, the input file name is used to determine a prefix.",
    )
    parser.add_argument(
        '--verbose', '-v',
        action = 'store_true',
        help = "Show debugging information.",
    )
    return parser.parse_args(args)


def parse_page_range(ptext=""):
    """Turn a page specification such as ``"1,3-5"`` into a list of ints.

    Comma-separated items are either a single number or an inclusive range
    ``first-last``. Pages may be repeated and given out of order. An empty
    specification prints an error to stderr and exits with status 1.

    Raises:
        ValueError: if an item is not a valid integer.
    """
    # (C) Charlotte Curtis (PDFStitcher)
    # NOTE(review): the parsed numbers are passed on unchanged and end up as
    # indices into ``pdfdoc.pages`` -- confirm whether the user-facing page
    # numbers are meant to be zero- or one-based.
    if not ptext:
        print('Please specify a valid page range', file=sys.stderr)
        sys.exit(1)

    page_range = []
    for item in ptext.split(','):
        bounds = item.split('-')
        if len(bounds) == 1:
            page_range.append(int(bounds[0]))
        else:
            page_range += list(range(int(bounds[0]), int(bounds[-1]) + 1))
    return page_range


def main(cmdargs=None):
    """Command-line entry point.

    *cmdargs* is the list of argument strings; ``None`` (the default) uses
    the current ``sys.argv[1:]``. Returns the absolute output directory.
    Exits with status 1 if the input file or output directory is missing.
    """
    args = parse_args(cmdargs)

    abs_input = os.path.abspath(args.input)
    abs_output = os.path.abspath(args.output)

    if not os.path.isdir(abs_output):
        print("Output must be an existing directory", file=sys.stderr)
        sys.exit(1)
    if not os.path.isfile(abs_input):
        print("Input must be an existing file", file=sys.stderr)
        sys.exit(1)

    pages = None
    if args.pages is not None:
        pages = parse_page_range(args.pages)

    # Context manager so the document is closed even if extraction fails.
    with pikepdf.Pdf.open(abs_input) as input_file:
        extract_all_images(
            pdfdoc = input_file,
            output_directory = abs_output,
            page_indices = pages,
            prefix = args.prefix,
            filename = abs_input,
        )

    print("Output saved to", abs_output)
    return abs_output


if __name__ == '__main__':
    # ./imageextractor.py -i [input_file] -o [output_directory]
    main()