#!/bin/sh
#
# Run pdftotext on a hierarchy and write to OCR directories
#
# Usage: ocr-all.sh [-no-overwrite] srcdir
#
# A data directory is any directory at most one level below srcdir that
# contains an entry named "isDataDirectory".  For each one, every *.pdf
# directly inside <datadir>/output/FRCapture/images is run through
# "pdftotext -bbox" and the result is written to <datadir>/output/OCR/<name>.ptx.
# The per-directory progress messages are written to
# <datadir>/output/OCR/../_ocr_.log.
#
# Options:
#   -no-overwrite   skip PDFs whose .ptx output already exists
#                   (default: regenerate every output).
#
# Limitation: paths are passed between find and the loops line by line, so
# file names containing newlines are not supported.

# 1 = regenerate existing outputs, 0 = keep them.
OVER=1
if [ "$1" = "-no-overwrite" ]; then
  OVER=0
  echo "Not overwriting files"
  shift
fi

SRCDIR=$1

if [ ! -d "$SRCDIR" ]; then
  echo "Usage: ocr-all.sh [-no-overwrite] srcdir" >&2
  exit 1
fi

echo "Finding directories to process from: ${SRCDIR}"

# -maxdepth goes before the tests: it is a global option and GNU find warns
# when it follows -name.
find "$SRCDIR" -maxdepth 2 -name 'isDataDirectory' | while IFS= read -r marker; do
  printf '%s\n' "$marker"

  # Strip the marker name as a suffix only, so an identical string earlier in
  # the path (e.g. inside SRCDIR) is left untouched.
  prefix=${marker%isDataDirectory}
  dir="${prefix}output/FRCapture/images"
  dst="${prefix}output/OCR"

  banner="*** Processing directory: $dir to $dst  $(date)  ***"
  printf '%s\n' "$banner"

  # The destination must exist BEFORE the log redirection below is opened:
  # the log path is resolved through ${dst}/.., which fails while ${dst} is
  # missing and would skip the whole directory.
  mkdir -p "$dst" || continue

  {
    printf '%s\n' "$banner"
    find "$dir" -maxdepth 1 -name '*.pdf' | while IFS= read -r pdf; do
      ptx="$(basename "$pdf" .pdf).ptx"
      if [ ! -f "${dst}/${ptx}" ] || [ "$OVER" = 1 ]; then
        echo "*** OCR $pdf to $dst/$ptx"
        # The glob has to sit outside the quotes to expand; -f keeps rm quiet
        # when there is nothing to remove yet.
        rm -f -- "${dst}/${ptx}"*
        # stdin is detached so the command cannot read from the pipe that
        # feeds this loop.
        pdftotext -bbox "$pdf" "${dst}/${ptx}" </dev/null
      fi
    done
  } > "${dst}/../_ocr_.log"
done

exit 0