"""PDFBreak: split a PDF into one file per page and rasterize every page.

The input PDF is burst into single-page PDFs by the external ``pdftk`` tool,
then every page PDF is rendered to an image by the external ``gs``
(Ghostscript) tool.  Both tools must be available on the PATH.
"""
import argparse
import os
import re
import subprocess
import sys
import time
from random import randint as rand

import ConsoleUtils

prog_name = 'PDFBreak'
prog_descript = 'Breaks a pdf into its component pages as separate files.'
prog_version = '0.1.0'
prog_date = '2017/09/27'
prog_auth = 'Chris Diesch '

USAGE = 'pdf-break [OPTIONS...] -i,--input IN_FILE -o,--out-dir OUT_DIR'

parser = argparse.ArgumentParser(prog=prog_name, description=prog_descript, usage=USAGE, add_help=False)
printer = ConsoleUtils.SLPrinter(prog_name)

# The built-in page name pattern; _PAGE_NAME_FORMAT is replaced by the
# user-supplied pattern at start-up, so the default is kept separately.
_DEFAULT_PAGE_NAME_FORMAT = 'page%05d.pdf'
_PAGE_NAME_FORMAT = _DEFAULT_PAGE_NAME_FORMAT

# Clean-up switches.  They are overwritten from the command line in the
# ``__main__`` guard; the defaults keep the module usable when imported.
_RUN_CLEAN = False
_REMOVE_PDF = False
_REMOVE_DATA = False

# Command templates.  Every '%s' is one placeholder filled in by _expand_cmd.
_SPLIT_PAGES_CMD = 'pdftk %s burst output %s'

_CONVERT_PAGE_OUT_ARG = '-sOutputFile=%s'
# NOTE(review): the Ghostscript device is JPEG, but _convert_pages names its
# output files "*.png" and reports "png images".  One of the two is wrong;
# left untouched until the intended image format is confirmed.
_CONVERT_PAGE_TYPE_ARG = '-sDEVICE=jpeg'
_CONVERT_PAGE_DPI_ARG = '-r300'
_CONVERT_PAGES_CMD = 'gs -q -dBATCH -dNOPAUSE %s %s'

_VALID_FORMAT_REGEX = re.compile('^[^%]*%[0-9]*d.*$')


def _expand_cmd(template, *values):
    """Turn a command template into an argv list suitable for subprocess.

    The template is split on whitespace and each '%s' token is replaced, in
    order, by the next entry of ``values``.  A list/tuple value expands to
    several arguments; any other value becomes exactly one argument, so paths
    containing spaces are never broken apart.
    """
    argv = []
    pending = list(values)
    for token in template.split():
        if token != '%s':
            argv.append(token)
            continue
        value = pending.pop(0)
        if isinstance(value, (list, tuple)):
            argv.extend(value)
        else:
            argv.append(value)
    return argv


def _get_format(out_root):
    """Return the page name pattern joined onto the ``out_root`` directory."""
    return os.path.join(out_root, _PAGE_NAME_FORMAT)


def _split_pdf(pdf_file, out_root):
    """Burst ``pdf_file`` into single-page PDFs under ``out_root`` using pdftk.

    After the run the page count is read from ``doc_data.txt`` in ``out_root``
    (presumably written there by pdftk's burst operation) and timing
    statistics are printed.  The data file is removed afterwards when
    _RUN_CLEAN or _REMOVE_DATA is set.

    Raises whatever ``open`` raises when ``doc_data.txt`` is missing, and
    OSError when pdftk cannot be started.
    """
    # An argv list is passed without a shell, so the output pattern must not
    # be wrapped in quotes: they would become part of the file name.
    cmd = _expand_cmd(_SPLIT_PAGES_CMD, pdf_file, _get_format(out_root))

    print('Splitting pages with PDFTK')
    start_time = time.time()
    return_code = subprocess.call(cmd)
    run_time = time.time() - start_time
    if return_code != 0:
        print('Error: PDFTK exited with status %d' % return_code)

    doc_data = os.path.join(out_root, 'doc_data.txt')
    page_cnt = 0
    with open(doc_data) as data_reader:
        for line in data_reader:
            if line.startswith('NumberOfPages:'):
                page_cnt = int(line.partition(':')[2])
                break

    if _RUN_CLEAN or _REMOVE_DATA:
        print('Deleting PDFTK data file: "%s"' % doc_data)
        try:
            os.remove(doc_data)
        except OSError as ex:
            print('Error: Unable to delete file "%s"' % doc_data)
            print(ex)
        else:
            print('Successfully deleted PDFTK data file')

    # Guard against a zero elapsed time on very coarse clocks.
    pages_per_sec = page_cnt / run_time if run_time > 0 else 0.0
    print('Split %d pages in %.4f s (%.1f pg/s)' % (page_cnt, run_time, pages_per_sec))


def _convert_pages(out_root):
    """Render every ``*.pdf`` file directly inside ``out_root`` to an image.

    Every file in ``out_root`` whose name ends in '.pdf' is processed, not
    only the pages produced by the current run.  When _RUN_CLEAN or
    _REMOVE_PDF is set, each PDF is deleted after it has been converted
    successfully; a PDF whose conversion failed is kept.

    Raises OSError when ``out_root`` cannot be listed or ``gs`` cannot be
    started.
    """
    # Sorted so pages are processed in a deterministic order.
    pdf_files = sorted(os.path.join(out_root, f) for f in os.listdir(out_root) if f.endswith('.pdf'))
    num_images = len(pdf_files)
    print('Converting %d PDF files to png images' % num_images)

    run_time = 0.0
    for pdf_path in pdf_files:
        # Swap only the trailing extension; a '.pdf' elsewhere in the path
        # (for example in a directory name) must stay untouched.
        out_img = os.path.splitext(pdf_path)[0] + '.png'
        options = [_CONVERT_PAGE_TYPE_ARG, _CONVERT_PAGE_DPI_ARG, _CONVERT_PAGE_OUT_ARG % out_img]
        cmd = _expand_cmd(_CONVERT_PAGES_CMD, options, pdf_path)

        print('Converting image "%s"' % pdf_path)
        start_time = time.time()
        return_code = subprocess.call(cmd)
        run_time += (time.time() - start_time)

        if return_code != 0:
            # Keep the page PDF: deleting it would lose the page entirely.
            print('Error: gs exited with status %d for "%s"' % (return_code, pdf_path))
            continue

        # should the file be deleted
        if _RUN_CLEAN or _REMOVE_PDF:
            print('Deleting image "%s"' % pdf_path)
            try:
                os.remove(pdf_path)
            except OSError as ex:
                print('Error: Unable to delete "%s"' % pdf_path)
                print(ex)

    printer.write_no_prefix('')

    if num_images == 0:
        print('No PDF pages were found to convert under "%s"' % out_root)
        return

    convert_rate = run_time / num_images
    if 0 < convert_rate < 1:
        # Faster than one image per second: report throughput instead.
        convert_rate = 1 / convert_rate
        print('Converted %d images in %.4f s (%.1f images/s)' % (num_images, run_time, convert_rate))
    else:
        print('Converted %d images in %.4f s (%.1f s/image)' % (num_images, run_time, convert_rate))


def main(input_pdf, out_root):
    """Split ``input_pdf`` into pages under ``out_root``, then convert them."""
    _split_pdf(input_pdf, out_root)
    _convert_pages(out_root)


def _print_version():
    """Print the version banner to the original stdout and exit with 0."""
    sys.stdout = printer.old_stdout
    print('')
    print(prog_name)
    print('Usage: %s' % USAGE)
    print('Version: %s' % prog_version)
    print('Date: %s' % prog_date)
    sys.exit(0)


def _print_help(exit_code=0):
    """Print the full help text to the original stdout and exit.

    ``exit_code`` is the process exit status; it defaults to 0 for the
    ``-h/--help`` flag and is set to a non-zero value when the help is shown
    because of a fatal argument error.
    """
    sys.stdout = printer.old_stdout
    print('')
    print(prog_name)
    print(prog_descript)
    print('Usage %s' % USAGE)
    print('')
    print('Options:')
    print(' Required:')
    print(' -i, --input IN_FILE The input file to split.')
    print(' -o, --out-dir OUT_DIR The parent folder to output the files to.')
    print(' By default the files are named page 1 = %s. This can be' % (_DEFAULT_PAGE_NAME_FORMAT % 1))
    print(' changed by using the -f, --format flag below.')
    print('')
    print(' Input/Output:')
    print(' -f, --format FORMAT A printf-styled format string to name the files (default "%s")'
          % _DEFAULT_PAGE_NAME_FORMAT)
    print(' Example: ')
    print(' page 1 = page_1.pdf would be --format "page_%d.pdf"')
    print(' -p, --clean-pdfs Delete the individual PDF pages after converting to jpg.')
    print(' -d, --clean-data Delete the "doc_data.txt" file produced by PDFTK from the output.')
    print(' -c, --clean-run Delete all intermediate files (same as running with -p and -d).')
    print('')
    print(' Miscellaneous:')
    print(' -h, --help Prints the help message.')
    print(' -v, --version Prints the version information.')
    print('')
    print('Version Info:')
    print(' Version: %s' % prog_version)
    print(' Date: %s' % prog_date)
    print('')
    print('Author: %s' % prog_auth)
    print('')
    sys.exit(exit_code)


def _make_args():
    """Register all command line options on the module-level ``parser``."""
    # Required arguments
    required = parser.add_argument_group('Required')
    required.add_argument('-i', '--input', required=True)
    required.add_argument('-o', '--out-dir', required=True)
    # Output args
    output = parser.add_argument_group('Input/Output')
    output.add_argument('-f', '--format', default=_DEFAULT_PAGE_NAME_FORMAT)
    output.add_argument('-c', '--clean-run', action='store_true')
    output.add_argument('-p', '--clean-pdfs', action='store_true')
    output.add_argument('-d', '--clean-data', action='store_true')
    # Miscellaneous arguments
    misc = parser.add_argument_group('Miscellaneous')
    misc.add_argument('-h', '--help', action=ConsoleUtils.CustomPrintAction, print_fn=_print_help)
    misc.add_argument('-v', '--version', action=ConsoleUtils.CustomPrintAction, print_fn=_print_version)


def _is_valid_format(format_str):
    """Return True when ``format_str`` holds exactly one ``%[digits]d`` field."""
    # We should only have 1 format character
    if format_str.count('%') != 1:
        return False
    # validate with regex
    return _VALID_FORMAT_REGEX.match(format_str) is not None


def check_args():
    """Validate the parsed command line held in the module-level ``args``.

    A missing output directory is created, and an invalid name format is
    replaced by the default.  A missing input file or an output directory
    that cannot be created is fatal: the settings and the help text are
    printed and the process exits with status 1.
    """
    in_file = args.input
    out_dir = args.out_dir
    out_format = args.format
    fatal_error = False

    if not os.path.exists(in_file):
        print('Fatal Error: The given input file does not exist (%s)' % in_file)
        fatal_error = True

    if not os.path.exists(out_dir):
        print('Error: The provided output directory does not exist (%s)' % out_dir)
        print('Attempting to create directory at "%s"' % out_dir)
        try:
            os.makedirs(out_dir)
        except OSError as ex:
            print('Fatal Error: Could not create output directory at "%s"' % out_dir)
            print(ex)
            fatal_error = True
        else:
            print('OK: Created output directory successfully.')

    if not _is_valid_format(out_format):
        print('Error: The provided format is not valid.')
        print('OK: Using default format (%s)' % _DEFAULT_PAGE_NAME_FORMAT)
        args.format = _DEFAULT_PAGE_NAME_FORMAT

    _show_args()

    if fatal_error:
        printer.write_no_prefix('')
        print('Exiting...')
        # _print_help terminates the process; pass the failure status along
        # so a fatal error is not reported as success.
        _print_help(1)


def _show_args():
    """Print the effective settings taken from the module-level ``args``."""
    in_file = args.input
    out_dir = args.out_dir
    page_format = args.format
    # The data file is removed by either the dedicated flag or a clean run.
    remove_data = args.clean_run or args.clean_data
    ex_page_num = rand(1, 1000)

    print('Splitting pdf file: "%s"' % in_file)
    print('Saving split pages under: "%s"' % out_dir)
    print('Remove "doc_data.txt" output from PDFTK: %s' % remove_data)
    print('Saving split pages with name format: "%s"' % page_format)
    print(' Ex: page %d -> "%s"' % (ex_page_num, page_format % ex_page_num))
    printer.write_no_prefix('')


def _setup():
    """Print the program header, register the options and redirect stdout."""
    # The header goes to the real stdout; only later output uses the printer.
    print(ConsoleUtils.get_header(prog_name, prog_version, prog_date, prog_auth))
    _make_args()
    sys.stdout = printer


if __name__ == '__main__':
    _setup()
    # get the arguments
    args = parser.parse_args()
    # are they good?
    check_args()
    # default values
    _RUN_CLEAN = args.clean_run
    _REMOVE_PDF = args.clean_pdfs
    _REMOVE_DATA = args.clean_data
    _PAGE_NAME_FORMAT = args.format
    # non-default values
    input_file = args.input
    output_dir = args.out_dir
    # Run
    main(input_file, output_dir)