Sleds/posting/ocr-all.sh

46 lines
1.2 KiB
Bash

#!/bin/sh
#
# Run pdftotext on a hierarchy and write to OCR directories
#
# Usage: ocr-all.sh [-no-overwrite] <src-root>
# Parse the command line.
#   OVER   - 1 to regenerate output files that already exist (the default),
#            0 to leave them alone; cleared by a leading "-no-overwrite".
#   SRCDIR - root of the hierarchy to scan; must be an existing directory.
case "$1" in
-no-overwrite)
  OVER=0
  echo "Not overwriting files"
  shift
  ;;
*)
  OVER=1
  ;;
esac
SRCDIR=$1
# Without a valid source directory there is nothing to do: show usage and fail.
[ -d "$SRCDIR" ] || {
  echo "Usage: ocr-all.sh [-no-overwrite] srcdir"
  exit 1
}
# Walk the hierarchy and run pdftotext on every data directory.
#
# A data directory is identified by an entry named "isDataDirectory" found at
# most two levels below SRCDIR. For a marker at <base>/isDataDirectory:
#   PDFs are read from   <base>/output/FRCapture/images   (not recursive)
#   text is written to   <base>/output/OCR/<name>.ptx
#   progress is logged   next to the OCR directory in _ocr_.log
#                        (the log is rewritten on every run)
# When OVER is 1 existing .ptx files are regenerated; otherwise only missing
# ones are produced.
echo "Finding directories to process from: ${SRCDIR}"
# -maxdepth goes before the tests: GNU find warns when it follows -name.
find "$SRCDIR" -maxdepth 2 -name 'isDataDirectory' |
  while IFS= read -r marker; do
    echo "$marker"
    # Derive both paths from the marker's parent directory instead of
    # rewriting the path with sed, which would hit the first match anywhere
    # in the path and mangle paths containing whitespace or glob characters.
    base=$(dirname "$marker")
    dir="${base}/output/FRCapture/images"
    dst="${base}/output/OCR"
    echo "*** Processing directory: $dir to $dst  $(date)  ***"
    # Create the output directory BEFORE opening the log: the log path goes
    # through ${dst}, so the redirection below fails (and the whole directory
    # is skipped) if ${dst} does not exist yet.
    if ! mkdir -p "$dst"; then
      echo "*** Cannot create $dst, skipping" >&2
      continue
    fi
    (
      echo "*** Processing directory: $dir to $dst  $(date)  ***"
      find "$dir" -maxdepth 1 -name '*.pdf' |
        while IFS= read -r pdf; do
          ptx="$(basename "$pdf" .pdf).ptx"
          if [ ! -f "${dst}/${ptx}" ] || [ "$OVER" = 1 ]; then
            echo "*** OCR $pdf to $dst/$ptx"
            # The glob has to sit outside the quotes to expand; -f keeps the
            # first run (nothing to delete yet) from reporting an error.
            rm -f -- "${dst}/${ptx}"*
            pdftotext -bbox "$pdf" "${dst}/${ptx}" ||
              echo "*** FAILED OCR of $pdf"
          fi
        done
    ) > "${dst}/../_ocr_.log"
  done
exit 0