264 lines
8.4 KiB
Python
264 lines
8.4 KiB
Python
import sys
|
|
import os
|
|
import subprocess
|
|
import time
|
|
import argparse
|
|
import re
|
|
import ConsoleUtils
|
|
|
|
from random import randint as rand
|
|
|
|
# Program metadata shown in the start-up banner and the help/version screens.
prog_name = 'PDFBreak'
prog_descript = "Breaks a pdf into it's component pages as separate files."
prog_version = '0.1.0'
prog_date = '2017/09/27'
prog_auth = 'Chris Diesch <cdiesch@sequencelogic.net>'

# One-line synopsis handed to argparse and echoed by the help/version printers.
USAGE = 'pdf-break [OPTIONS...] -i,--input IN_FILE -o,--out-dir OUT_DIR'

# argparse's built-in -h/--help is switched off (add_help=False) because
# _make_args() registers custom -h/--help and -v/--version actions.
parser = argparse.ArgumentParser(
    prog=prog_name,
    description=prog_descript,
    usage=USAGE,
    add_help=False,
)

# Console writer from the project's ConsoleUtils module; _setup() installs it
# as sys.stdout so later print() calls are routed through it.
printer = ConsoleUtils.SLPrinter(prog_name)
|
|
|
|
|
|
# Default printf-style file name for each burst page; replaced by the
# --format value when the file runs as a script.
_PAGE_NAME_FORMAT = 'page%05d.pdf'

# Clean-up switches; the __main__ block overwrites them from the command line.
_RUN_CLEAN = False    # -c/--clean-run
_REMOVE_PDF = False   # -p/--clean-pdfs
# _split_pdf() reads this name. Without a module-level default, importing the
# module and calling _split_pdf() directly raised NameError.
_REMOVE_DATA = False  # -d/--clean-data

# pdftk command template: first %s is the input PDF, second the output pattern.
_SPLIT_PAGES_CMD = 'pdftk %s burst output %s'

# Ghostscript command pieces assembled by _convert_pages().
_CONVERT_PAGE_OUT_ARG = '-sOutputFile=%s'
# NOTE(review): the device is JPEG but _convert_pages() names its output
# "*.png" -- confirm which format is actually intended.
_CONVERT_PAGE_TYPE_ARG = '-sDEVICE=jpeg'
_CONVERT_PAGE_DPI_ARG = '-r300'
_CONVERT_PAGES_CMD = 'gs -q -dBATCH -dNOPAUSE %s %s'

# A usable page-name format: text without '%', then '%', optional digits and
# 'd', then anything. _is_valid_format() separately enforces a single '%'.
_VALID_FORMAT_REGEX = re.compile(r'^[^%]*%[0-9]*d.*$')
|
|
|
|
|
|
def _get_format(out_root):
    """Return the page-name pattern joined onto ``out_root``.

    :param out_root: Directory the split pages are written under.
    :return: ``out_root`` joined with the module-level ``_PAGE_NAME_FORMAT``.
    """
    return os.path.join(out_root, _PAGE_NAME_FORMAT)
|
|
|
|
|
|
def _split_pdf(pdf_file, out_root):
    """Burst ``pdf_file`` into one PDF per page under ``out_root`` using pdftk.

    After pdftk finishes, the page count is read from the ``doc_data.txt``
    report looked for in ``out_root``. The report is then removed when
    ``_RUN_CLEAN`` or ``_REMOVE_DATA`` is set, and timing figures are printed.

    :param pdf_file: Path of the PDF to split.
    :param out_root: Directory that receives the per-page PDFs.
    :raises OSError: If pdftk cannot be started.
    :raises IOError: If ``doc_data.txt`` cannot be opened afterwards.
    """
    out_format = os.path.join(out_root, _PAGE_NAME_FORMAT)

    # Fill the template token by token instead of formatting then splitting:
    # no shell is involved, so literal quote characters would reach pdftk, and
    # a path containing spaces would be torn into several arguments.
    substitutions = iter((pdf_file, out_format))
    cmd = [next(substitutions) if token == '%s' else token
           for token in _SPLIT_PAGES_CMD.split()]

    print('Splitting pages with PDFTK')

    start_time = time.time()
    process = subprocess.Popen(cmd)
    exit_code = process.wait()
    run_time = time.time() - start_time

    if exit_code != 0:
        print('Error: PDFTK exited with status %d' % exit_code)

    doc_data = os.path.join(out_root, 'doc_data.txt')
    page_cnt = 0
    with open(doc_data) as data_reader:
        # Stream the report; only the first NumberOfPages entry is needed.
        for line in data_reader:
            if line.startswith('NumberOfPages:'):
                page_cnt = int(line[len('NumberOfPages:'):])
                break

    if _RUN_CLEAN or _REMOVE_DATA:
        print('Deleting PDFTK data file: "%s"' % doc_data)
        try:
            os.remove(doc_data)
        except OSError as ex:
            # Best effort: a leftover report must not abort the run.
            print('Error: Unable to delete file "%s"' % doc_data)
            print(ex)
        else:
            print('Successfully deleted PDFTK data file')

    # Guard against a zero elapsed time on coarse clocks.
    pages_per_sec = page_cnt / run_time if run_time > 0 else 0.0
    print('Split %d pages in %.4f s (%.1f pg/s)' % (page_cnt, run_time, pages_per_sec))
|
|
|
|
|
|
def _convert_pages(out_root):
    """Render every ``*.pdf`` directly under ``out_root`` to an image with Ghostscript.

    Each page is converted with its own ``gs`` invocation. When ``_RUN_CLEAN``
    or ``_REMOVE_PDF`` is set, the source PDF is deleted after a successful
    conversion. A timing summary is printed at the end.

    NOTE(review): output files are named ``*.png`` while the configured
    Ghostscript device is ``jpeg`` -- confirm which format is intended.

    :param out_root: Directory containing the per-page PDFs.
    :raises OSError: If ``out_root`` cannot be listed or ``gs`` cannot be started.
    """
    files = [os.path.join(out_root, f) for f in os.listdir(out_root) if f.endswith('.pdf')]
    num_images = len(files)
    print('Converting %d PDF files to png images' % num_images)

    # Fixed leading tokens of the Ghostscript command. The per-file arguments
    # are appended as separate list items so paths with spaces stay intact.
    base_cmd = [token for token in _CONVERT_PAGES_CMD.split() if token != '%s']

    run_time = 0.0
    for pdf_path in files:
        # Swap only the trailing extension; str.replace would also rewrite a
        # '.pdf' that appears inside a directory name.
        out_img = os.path.splitext(pdf_path)[0] + '.png'
        cmd = base_cmd + [
            _CONVERT_PAGE_TYPE_ARG,
            _CONVERT_PAGE_DPI_ARG,
            _CONVERT_PAGE_OUT_ARG % out_img,
            pdf_path,
        ]

        print('Converting image "%s"' % pdf_path)
        start_time = time.time()
        process = subprocess.Popen(cmd)
        exit_code = process.wait()
        run_time += (time.time() - start_time)

        if exit_code != 0:
            # Keep the source page when the conversion failed.
            print('Error: Ghostscript exited with status %d for "%s"' % (exit_code, pdf_path))
            continue

        # should the file be deleted
        if _RUN_CLEAN or _REMOVE_PDF:
            print('Deleting image "%s"' % pdf_path)
            try:
                os.remove(pdf_path)
            except OSError as ex:
                print('Error: Unable to delete "%s"' % pdf_path)
                print(ex)

    printer.write_no_prefix('')

    if num_images == 0 or run_time <= 0:
        # No meaningful rate can be computed; avoid dividing by zero.
        print('Converted %d images in %.4f s' % (num_images, run_time))
        return

    convert_rate = run_time / num_images
    if convert_rate < 1:
        # Under a second per image: report throughput instead of latency.
        print('Converted %d images in %.4f s (%.1f images/s)' % (num_images, run_time, 1 / convert_rate))
    else:
        print('Converted %d images in %.4f s (%.1f s/image)' % (num_images, run_time, convert_rate))
|
|
|
|
|
|
def main(input_pdf, out_root):
    """Run both pipeline stages: burst the PDF, then convert the pages.

    :param input_pdf: Path of the PDF to split.
    :param out_root: Directory that receives the pages and the images.
    """
    _split_pdf(pdf_file=input_pdf, out_root=out_root)
    _convert_pages(out_root=out_root)
|
|
|
|
|
|
def _print_version():
    """Print the program name, usage and version, then exit with status 0.

    ``sys.stdout`` is first pointed at ``printer.old_stdout`` (presumably the
    stream that was active before the printer was installed), so the text is
    written without going through the printer.

    :raises SystemExit: Always, with status 0.
    """
    sys.stdout = printer.old_stdout
    version_lines = (
        '',
        prog_name,
        'Usage: %s' % USAGE,
        'Version: %s' % prog_version,
        'Date: %s' % prog_date,
    )
    for line in version_lines:
        print(line)
    # sys.exit rather than the site-injected exit(), which is meant for the
    # interactive shell and is absent when the site module is not loaded.
    sys.exit(0)
|
|
|
|
|
|
def _print_help():
    """Print the full help screen, then exit with status 0.

    ``sys.stdout`` is first pointed at ``printer.old_stdout`` (presumably the
    stream that was active before the printer was installed), so the text is
    written without going through the printer.

    :raises SystemExit: Always, with status 0.
    """
    sys.stdout = printer.old_stdout
    # The default format shown here must match the module default
    # _PAGE_NAME_FORMAT ('page%05d.pdf'); the previous text advertised a
    # hyphenated name that the program never produced.
    help_lines = (
        '',
        prog_name,
        prog_descript,
        'Usage: %s' % USAGE,
        '',
        'Options:',
        '  Required:',
        '    -i, --input IN_FILE     The input file to split.',
        '    -o, --out-dir OUT_DIR   The parent folder to output the files to.',
        '                            By default the files are named page 1 = page00001.pdf. This can be',
        '                            changed by using the -f, --format flag below.',
        '',
        '  Input/Output:',
        '    -f, --format FORMAT     A printf-styled format string to name the files (default "page%05d.pdf")',
        '                            Example:',
        '                              page 1 = page_1.pdf would be --format "page_%d.pdf"',
        '    -p, --clean-pdfs        Delete the individual PDF pages after converting to jpg.',
        '    -d, --clean-data        Delete the "doc_data.txt" file produced by PDFTK from the output.',
        '    -c, --clean-run         Delete all intermediate files (same as running with -p and -d).',
        '',
        '  Miscellaneous:',
        '    -h, --help              Prints the help message.',
        '    -v, --version           Prints the version information.',
        '',
        'Version Info:',
        '  Version: %s' % prog_version,
        '  Date: %s' % prog_date,
        '',
        'Author: %s' % prog_auth,
        '',
    )
    print('\n'.join(help_lines))
    # sys.exit rather than the site-injected exit(), which is meant for the
    # interactive shell and is absent when the site module is not loaded.
    sys.exit(0)
|
|
|
|
|
|
def _make_args():
    """Register every command-line option on the module-level ``parser``.

    Options are grouped as Required, Input/Output and Miscellaneous. The
    help and version flags use ``ConsoleUtils.CustomPrintAction`` with the
    matching print function from this file.
    """
    # Required arguments
    required = parser.add_argument_group('Required')
    for flags in (('-i', '--input'), ('-o', '--out-dir')):
        required.add_argument(*flags, required=True)

    # Output args
    output = parser.add_argument_group('Input/Output')
    output.add_argument('-f', '--format', default=_PAGE_NAME_FORMAT)
    for flags in (('-c', '--clean-run'), ('-p', '--clean-pdfs'), ('-d', '--clean-data')):
        output.add_argument(*flags, action='store_true')

    # Miscellaneous arguments
    misc = parser.add_argument_group('Miscellaneous')
    print_actions = (
        (('-h', '--help'), _print_help),
        (('-v', '--version'), _print_version),
    )
    for flags, print_fn in print_actions:
        misc.add_argument(*flags, action=ConsoleUtils.CustomPrintAction, print_fn=print_fn)
|
|
|
|
|
|
def _is_valid_format(format_str):
    """Tell whether ``format_str`` is an acceptable page-name format.

    The string must contain exactly one ``%`` character and must match the
    module-level ``_VALID_FORMAT_REGEX``.

    :param format_str: Candidate printf-style format string.
    :return: ``True`` when both checks pass, otherwise ``False``.
    """
    # Exactly one format character is allowed, and it must satisfy the regex.
    has_single_placeholder = format_str.count('%') == 1
    return has_single_placeholder and _VALID_FORMAT_REGEX.match(format_str) is not None
|
|
|
|
|
|
def check_args():
    """Validate the parsed command-line arguments held in the module-level ``args``.

    * A missing input file is a fatal error.
    * A missing output directory is created; failure to create it is fatal.
    * An invalid ``--format`` is replaced with ``_PAGE_NAME_FORMAT``.

    The effective settings are then shown via ``_show_args()``. On a fatal
    error the help screen is printed and the process exits with status 1.

    :raises SystemExit: With status 1 when a fatal error was found.
    """
    in_file = args.input
    out_dir = args.out_dir
    out_format = args.format

    fatal_error = False

    if not os.path.exists(in_file):
        print('Fatal Error: The given input file does not exist (%s)' % in_file)
        fatal_error = True

    if not os.path.exists(out_dir):
        print('Error: The provided output directory does not exist (%s)' % out_dir)
        print('Attempting to create directory at "%s"' % out_dir)
        try:
            os.makedirs(out_dir)
        except OSError as ex:
            print('Fatal Error: Could not create output directory at "%s"' % out_dir)
            print(ex)
            fatal_error = True
        else:
            print('OK: Created output directory successfully.')

    if not _is_valid_format(out_format):
        print('Error: The provided format is not valid.')
        print('OK: Using default format (%s)' % _PAGE_NAME_FORMAT)
        args.format = _PAGE_NAME_FORMAT

    _show_args()

    if fatal_error:
        printer.write_no_prefix('')
        print('Exiting...')
        try:
            _print_help()
        except SystemExit:
            # _print_help() finishes by exiting with status 0; swallow that so
            # the failure is reported with a non-zero status below.
            pass
        sys.exit(1)
|
|
|
|
|
|
def _show_args():
    """Print a summary of the effective settings taken from the module-level ``args``.

    The values are read from ``args`` rather than from ``_RUN_CLEAN`` and
    ``_PAGE_NAME_FORMAT``: ``check_args()`` calls this before the ``__main__``
    block copies the options into those globals, so the globals still hold
    their defaults at this point. A random page number illustrates the
    resulting file name.
    """
    in_file = args.input
    out_dir = args.out_dir
    name_format = args.format
    # doc_data.txt is removed when either -c/--clean-run or -d/--clean-data is given.
    remove_data = args.clean_run or args.clean_data

    ex_page_num = rand(1, 1000)

    print('Splitting pdf file: "%s"' % in_file)
    print('Saving split pages under: "%s"' % out_dir)
    print('Remove "doc_data.txt" output from PDFTK: %s' % remove_data)
    print('Saving split pages with name format: "%s"' % name_format)
    print(' Ex: page %d -> "%s"' % (ex_page_num, name_format % ex_page_num))
    printer.write_no_prefix('')
|
|
|
|
|
|
def _setup():
    """Print the banner, register the CLI options and install the printer.

    The banner is printed before ``sys.stdout`` is replaced with the
    module-level ``printer``; every later ``print`` goes to that object.
    """
    banner = ConsoleUtils.get_header(prog_name, prog_version, prog_date, prog_auth)
    print(banner)
    _make_args()
    sys.stdout = printer
|
|
|
|
|
|
if __name__ == '__main__':
    _setup()

    # Parse the command line, then validate it. check_args() may exit the
    # process or reset args.format to the default.
    args = parser.parse_args()
    check_args()

    # Copy the option values into the module-level settings the workers read.
    _RUN_CLEAN, _REMOVE_PDF, _REMOVE_DATA = args.clean_run, args.clean_pdfs, args.clean_data
    _PAGE_NAME_FORMAT = args.format

    # Required values, passed to main() explicitly.
    input_file, output_dir = args.input, args.out_dir

    # Run
    main(input_file, output_dir)
|
|
|