# Sleds/imagine-cv/pdf-break.py
#
# 264 lines
# 8.4 KiB
# Python

import sys
import os
import subprocess
import time
import argparse
import re
import ConsoleUtils
from random import randint as rand
# Program metadata shown in the banner, --help and --version output.
prog_name = 'PDFBreak'
prog_descript = 'Breaks a pdf into it\'s component pages as separate files.'
prog_version = '0.1.0'
prog_date = '2017/09/27'
prog_auth = 'Chris Diesch <cdiesch@sequencelogic.net>'
USAGE = 'pdf-break [OPTIONS...] -i,--input IN_FILE -o,--out-dir OUT_DIR'

# add_help=False because -h/--help is registered by hand in _make_args().
parser = argparse.ArgumentParser(prog=prog_name, description=prog_descript, usage=USAGE, add_help=False)
# Output wrapper from the project-local ConsoleUtils module; _setup() installs
# it as sys.stdout.
printer = ConsoleUtils.SLPrinter(prog_name)

# Run-time options.  The __main__ block overwrites these from the parsed
# command line; the values here are the defaults used when the module is
# imported instead of executed.
_PAGE_NAME_FORMAT = 'page%05d.pdf'
_RUN_CLEAN = False
_REMOVE_PDF = False
# Read by _split_pdf(); without a default here it only exists after the
# __main__ block has run, so importing the module and calling _split_pdf()
# would raise NameError.
_REMOVE_DATA = False

# Command template for PDFTK: input PDF, then output name pattern.
_SPLIT_PAGES_CMD = 'pdftk %s burst output %s'

# Ghostscript command template and its per-page arguments.
# NOTE(review): the device below is JPEG, yet _convert_pages() names its
# output files "*.png" -- confirm which image format is actually intended.
_CONVERT_PAGE_OUT_ARG = '-sOutputFile=%s'
_CONVERT_PAGE_TYPE_ARG = '-sDEVICE=jpeg'
_CONVERT_PAGE_DPI_ARG = '-r300'
_CONVERT_PAGES_CMD = 'gs -q -dBATCH -dNOPAUSE %s %s'

# A page-name format is accepted when its '%' is followed by optional digits
# and a 'd' (e.g. "page%05d.pdf"); see _is_valid_format().
_VALID_FORMAT_REGEX = re.compile(r'^[^%]*%[0-9]*d.*$')
def _get_format(out_root):
    """Return the page-name format string joined onto ``out_root``."""
    return os.path.join(out_root, _PAGE_NAME_FORMAT)
def _split_pdf(pdf_file, out_root):
    """Burst ``pdf_file`` into one PDF per page under ``out_root`` using PDFTK.

    After PDFTK finishes, the page count is read from the ``NumberOfPages:``
    line of ``doc_data.txt`` in ``out_root`` and a timing summary is printed.
    The data file is deleted afterwards when ``_RUN_CLEAN`` or
    ``_REMOVE_DATA`` is set.

    :param pdf_file: path of the PDF to split.
    :param out_root: directory that receives the per-page PDF files.
    :raises IOError/OSError: if ``doc_data.txt`` cannot be opened.
    :raises ValueError: if the ``NumberOfPages:`` value is not an integer.
    """
    out_format = os.path.join(out_root, _PAGE_NAME_FORMAT)
    # Tokenise the template *before* substituting the paths.  Formatting first
    # and splitting afterwards breaks any path that contains whitespace, and
    # wrapping a path in quotes does not help: no shell is involved, so the
    # quote characters would be handed to PDFTK as part of the file name.
    substitutions = iter((pdf_file, out_format))
    cmd = [next(substitutions) if token == '%s' else token
           for token in _SPLIT_PAGES_CMD.split()]

    print('Splitting pages with PDFTK')
    start_time = time.time()
    return_code = subprocess.call(cmd)
    run_time = time.time() - start_time
    if return_code != 0:
        print('Error: PDFTK exited with status %d' % return_code)

    # NOTE(review): this expects PDFTK to have written doc_data.txt into
    # out_root -- confirm for the PDFTK version in use.
    doc_data = os.path.join(out_root, 'doc_data.txt')
    page_cnt = 0
    field = 'NumberOfPages:'
    with open(doc_data) as data_reader:
        for line in data_reader:
            if line.startswith(field):
                page_cnt = int(line[len(field):])
                break

    # _REMOVE_DATA is only assigned by the __main__ block, so look it up
    # defensively to stay usable when this module is imported.
    if _RUN_CLEAN or globals().get('_REMOVE_DATA', False):
        print('Deleting PDFTK data file: "%s"' % doc_data)
        try:
            os.remove(doc_data)
        except OSError as ex:
            # Best effort: a leftover data file must not abort the run.
            print('Error: Unable to delete file "%s"' % doc_data)
            print(ex)
        else:
            print('Successfully deleted PDFTK data file')

    # Guard against a zero elapsed time on coarse clocks.
    pages_per_sec = page_cnt / run_time if run_time > 0 else 0.0
    print('Split %d pages in %.4f s (%.1f pg/s)' % (page_cnt, run_time, pages_per_sec))
def _convert_pages(out_root):
    """Render every ``*.pdf`` directly inside ``out_root`` to an image with Ghostscript.

    Files are processed in sorted name order.  Each PDF is converted to a
    sibling file with the same base name and a ``.png`` extension; the source
    PDF is deleted afterwards when ``_RUN_CLEAN`` or ``_REMOVE_PDF`` is set.
    A timing summary is printed at the end.

    NOTE(review): the output is named ``.png`` but ``_CONVERT_PAGE_TYPE_ARG``
    selects the JPEG device -- confirm which format is intended.

    :param out_root: directory containing the per-page PDF files.
    :raises OSError: if ``out_root`` cannot be listed.
    """
    pdf_paths = sorted(os.path.join(out_root, name)
                       for name in os.listdir(out_root) if name.endswith('.pdf'))
    num_images = len(pdf_paths)
    print('Converting %d PDF files to png images' % num_images)

    # Fixed leading part of the Ghostscript command.  The template's two
    # trailing '%s' placeholders (options, input file) are appended per file
    # as separate argv entries so paths containing whitespace stay intact.
    base_cmd = [token for token in _CONVERT_PAGES_CMD.split() if token != '%s']

    run_time = 0.0
    for pdf_path in pdf_paths:
        # Swap only the extension; str.replace would also rewrite any other
        # '.pdf' occurring earlier in the path.
        out_img = os.path.splitext(pdf_path)[0] + '.png'
        cmd = base_cmd + [_CONVERT_PAGE_TYPE_ARG, _CONVERT_PAGE_DPI_ARG,
                          _CONVERT_PAGE_OUT_ARG % out_img, pdf_path]
        print('Converting image "%s"' % pdf_path)
        start_time = time.time()
        subprocess.call(cmd)
        run_time += (time.time() - start_time)
        # should the file be deleted
        if _RUN_CLEAN or _REMOVE_PDF:
            print('Deleting image "%s"' % pdf_path)
            try:
                os.remove(pdf_path)
            except OSError as ex:
                # Best effort: keep converting the remaining pages.
                print('Error: Unable to delete "%s"' % pdf_path)
                print(ex)
    printer.write_no_prefix('')

    # No files, or an elapsed time of zero, would divide by zero below.
    if num_images == 0 or run_time <= 0:
        print('Converted %d images in %.4f s' % (num_images, run_time))
        return

    convert_rate = run_time / num_images
    if convert_rate < 1:
        # Faster than one second per image: report images per second instead.
        convert_rate = 1 / convert_rate
        print('Converted %d images in %.4f s (%.1f images/s)' % (num_images, run_time, convert_rate))
    else:
        print('Converted %d images in %.4f s (%.1f s/image)' % (num_images, run_time, convert_rate))
def main(input_pdf, out_root):
    """Run both stages: split ``input_pdf`` into pages, then convert the pages.

    Both stages work inside the ``out_root`` directory.
    """
    _split_pdf(pdf_file=input_pdf, out_root=out_root)
    _convert_pages(out_root=out_root)
def _print_version():
    """Print the program name, usage, version and date, then exit with status 0."""
    # Write to the real stdout rather than through the prefixing printer.
    sys.stdout = printer.old_stdout
    print('')
    print(prog_name)
    print('Usage: %s' % USAGE)
    print('Version: %s' % prog_version)
    print('Date: %s' % prog_date)
    # sys.exit instead of the bare exit(): the latter is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    sys.exit(0)
def _print_help():
    """Print the full help text to the real stdout, then exit with status 0.

    The default page-name format shown is taken from ``_PAGE_NAME_FORMAT`` so
    the text cannot drift from what the program actually does.
    """
    # Write to the real stdout rather than through the prefixing printer.
    sys.stdout = printer.old_stdout
    help_lines = (
        '',
        prog_name,
        prog_descript,
        'Usage %s' % USAGE,
        '',
        'Options:',
        ' Required:',
        ' -i, --input IN_FILE The input file to split.',
        ' -o, --out-dir OUT_DIR The parent folder to output the files to.',
        ' By default the files are named page 1 = %s. This can be' % (_PAGE_NAME_FORMAT % 1),
        ' changed by using the -f, --format flag below.',
        '',
        ' Input/Output:',
        ' -f, --format FORMAT A printf-styled format string to name the files (default "%s")'
        % _PAGE_NAME_FORMAT,
        ' Example: ',
        ' page 1 = page_1.pdf would be --format "page_%d.pdf"',
        ' -p, --clean-pdfs Delete the individual PDF pages after converting to jpg.',
        ' -d, --clean-data Delete the "doc_data.txt" file produced by PDFTK from the output.',
        ' -c, --clean-run Delete all intermediate files (same as running with -p and -d).',
        '',
        ' Miscellaneous:',
        ' -h, --help Prints the help message.',
        ' -v, --version Prints the version information.',
        '',
        'Version Info:',
        ' Version: %s' % prog_version,
        ' Date: %s' % prog_date,
        '',
        'Author: %s' % prog_auth,
        '',
    )
    for help_line in help_lines:
        print(help_line)
    # sys.exit instead of the bare exit(): the latter is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    sys.exit(0)
def _make_args():
    """Register every command-line option on the module-level ``parser``."""
    # Required arguments
    req_group = parser.add_argument_group('Required')
    for short_flag, long_flag in (('-i', '--input'), ('-o', '--out-dir')):
        req_group.add_argument(short_flag, long_flag, required=True)

    # Output args
    io_group = parser.add_argument_group('Input/Output')
    io_group.add_argument('-f', '--format', default=_PAGE_NAME_FORMAT)
    for short_flag, long_flag in (('-c', '--clean-run'),
                                  ('-p', '--clean-pdfs'),
                                  ('-d', '--clean-data')):
        io_group.add_argument(short_flag, long_flag, action='store_true')

    # Miscellaneous arguments: each one prints its text through a custom action
    misc_group = parser.add_argument_group('Miscellaneous')
    for short_flag, long_flag, print_fn in (('-h', '--help', _print_help),
                                            ('-v', '--version', _print_version)):
        misc_group.add_argument(short_flag, long_flag,
                                action=ConsoleUtils.CustomPrintAction, print_fn=print_fn)
def _is_valid_format(format_str):
    """Return True when ``format_str`` is an acceptable page-name format.

    The string must contain exactly one ``%`` character and must match
    ``_VALID_FORMAT_REGEX``.
    """
    # Exactly one '%' is allowed; the regex is only consulted in that case.
    has_single_percent = format_str.count('%') == 1
    return has_single_percent and _VALID_FORMAT_REGEX.match(format_str) is not None
def check_args():
    """Validate the parsed command-line arguments held in the global ``args``.

    * A missing input file is fatal.
    * A missing output directory is created; failure to create it is fatal.
    * An invalid ``--format`` value is replaced by ``_PAGE_NAME_FORMAT``.

    The effective settings are then printed.  On a fatal error the help text
    is shown and the process exits with status 1.
    """
    in_file = args.input
    out_dir = args.out_dir
    out_format = args.format
    fatal_error = False

    if not os.path.exists(in_file):
        print('Fatal Error: The given input file does not exist (%s)' % in_file)
        fatal_error = True

    if not os.path.exists(out_dir):
        print('Error: The provided output directory does not exist (%s)' % out_dir)
        print('Attempting to create directory at "%s"' % out_dir)
        try:
            os.makedirs(out_dir)
        except OSError as ex:
            print('Fatal Error: Could not create output directory at "%s"' % out_dir)
            print(ex)
            fatal_error = True
        else:
            print('OK: Created output directory successfully.')

    if not _is_valid_format(out_format):
        print('Error: The provided format is not valid.')
        print('OK: Using default format (%s)' % _PAGE_NAME_FORMAT)
        args.format = _PAGE_NAME_FORMAT

    _show_args()

    if fatal_error:
        printer.write_no_prefix('')
        print('Exiting...')
        try:
            _print_help()
        except SystemExit:
            # _print_help() finishes by exiting with status 0; swallow that so
            # the failure is reported with a non-zero status below.
            pass
        sys.exit(1)
def _show_args():
    """Print a summary of the settings taken from the global ``args``.

    The values are read from ``args`` directly: this runs from check_args(),
    before the module-level option variables have been updated from the
    command line, so those would still hold their defaults.
    """
    in_file = args.input
    out_dir = args.out_dir
    name_format = args.format
    # The data file is removed by either --clean-run or --clean-data.
    remove_data = args.clean_run or args.clean_data
    # Random page number, used only to illustrate the naming scheme.
    ex_page_num = rand(1, 1000)
    print('Splitting pdf file: "%s"' % in_file)
    print('Saving split pages under: "%s"' % out_dir)
    print('Remove "doc_data.txt" output from PDFTK: %s' % remove_data)
    print('Saving split pages with name format: "%s"' % name_format)
    print(' Ex: page %d -> "%s"' % (ex_page_num, name_format % ex_page_num))
    printer.write_no_prefix('')
def _setup():
    """Print the program banner, register the CLI options, then redirect stdout.

    The banner is printed before ``sys.stdout`` is replaced by ``printer``.
    """
    banner = ConsoleUtils.get_header(prog_name, prog_version, prog_date, prog_auth)
    print(banner)
    _make_args()
    sys.stdout = printer
if __name__ == '__main__':
    _setup()

    # Parse the command line, then validate it (check_args may replace an
    # invalid args.format with the default).
    args = parser.parse_args()
    check_args()

    # Copy the parsed options into the module-level settings the workers read.
    _RUN_CLEAN, _REMOVE_PDF, _REMOVE_DATA = args.clean_run, args.clean_pdfs, args.clean_data
    _PAGE_NAME_FORMAT = args.format

    # Required values have no defaults.
    input_file, output_dir = args.input, args.out_dir

    # Run
    main(input_file, output_dir)