46 lines
1.2 KiB
Bash
46 lines
1.2 KiB
Bash
#!/bin/sh
#
# Run pdftotext on a hierarchy and write to OCR directories
#
# Usage: ocr-all.sh [-no-overwrite] <src-root>
#
# Every directory that holds an 'isDataDirectory' marker (searched at most
# two levels below <src-root>) is processed using this layout:
#   <data-dir>/output/FRCapture/images/*.pdf   input PDFs (not recursive)
#   <data-dir>/output/OCR/<name>.ptx           'pdftotext -bbox' output
#   <data-dir>/output/_ocr_.log                per-directory log, rewritten
#                                              on every run
#
# By default existing .ptx files are regenerated; with -no-overwrite a PDF
# whose .ptx file already exists is skipped.
#
# Exit status: 1 if <src-root> is missing or not a directory, otherwise 0
# (failures of individual pdftotext runs do not change the exit status).

# OVER=1: regenerate existing output; OVER=0: keep existing output.
OVER=1
if [ "${1-}" = "-no-overwrite" ]; then
  OVER=0
  echo "Not overwriting files"
  shift
fi

SRCDIR=${1-}

if [ ! -d "$SRCDIR" ]; then
  echo "Usage: ocr-all.sh [-no-overwrite] srcdir" >&2
  exit 1
fi

echo "Finding directories to process from: ${SRCDIR}"

# -maxdepth is a global find option and has to come before tests like -name.
# 'IFS= read -r' keeps blanks and backslashes in path names intact.
find "$SRCDIR" -maxdepth 2 -name 'isDataDirectory' | while IFS= read -r marker; do
  printf '%s\n' "$marker"

  # The marker path always ends in 'isDataDirectory'; swap that final
  # component for the output tree that sits next to it.
  out="${marker%isDataDirectory}output"
  dir="${out}/FRCapture/images"
  dst="${out}/OCR"

  banner="*** Processing directory: $dir to $dst  $(date)  ***"
  printf '%s\n' "$banner"

  # The destination must exist before the log redirection below is opened:
  # the shell sets up the redirection before running the subshell body.
  if ! mkdir -p "$dst"; then
    echo "*** Cannot create $dst, skipping" >&2
    continue
  fi

  (
    printf '%s\n' "$banner"
    find "$dir" -maxdepth 1 -name '*.pdf' | while IFS= read -r pdf; do
      # <name>.pdf -> <name>.ptx, without the directory part.
      ptx=${pdf##*/}
      ptx="${ptx%.pdf}.ptx"
      if [ ! -f "${dst}/${ptx}" ] || [ "$OVER" = 1 ]; then
        echo "*** OCR $pdf to $dst/$ptx"
        # The glob stays outside the quotes so that the old .ptx file and
        # anything derived from it (<name>.ptx*) is really removed; -f keeps
        # rm quiet when nothing matches.
        rm -f -- "${dst}/${ptx}"*
        pdftotext -bbox "$pdf" "${dst}/${ptx}"
      fi
    done
  ) > "${out}/_ocr_.log"
done

exit 0