366 lines
9.7 KiB
C++
366 lines
9.7 KiB
C++
#include "OCRManager.h"

#include <cerrno>
|
|
|
|
using namespace sequencelogic;
|
|
|
|
// Constructor: the body performs no work.
OCRManager::OCRManager() {}
|
|
|
|
|
|
// Destructor: the body performs no work.
OCRManager::~OCRManager() {}
|
|
|
|
void OCRManager::ValidateOCR()
|
|
{
|
|
system(std::string(SLOCR_LOCATION + " -v").c_str());
|
|
}
|
|
|
|
void OCRManager::RunOCR(std::string inputFile, bool isPretty, int pagesInErrorToAllow, int threadCount, bool singlePageOutput)
|
|
{
|
|
std::vector<std::string> OCROutputs;
|
|
std::vector<Range> ranges;
|
|
ranges = splitPages(inputFile, threadCount);
|
|
|
|
std::cout << "Running on " << inputFile << " with " << threadCount << " OCR Processes." << std::endl;
|
|
pid_t *childPids = NULL;
|
|
pid_t p;
|
|
childPids = (pid_t*) malloc(threadCount * sizeof(pid_t));
|
|
|
|
for (int i = 0; i < ranges.size(); ++i)
|
|
{
|
|
if ((p = fork()) == 0)
|
|
{
|
|
//in a child
|
|
std::string ocrResult = callOCR(inputFile, isPretty, pagesInErrorToAllow, threadCount, ranges[i], singlePageOutput);
|
|
std::cout << "OCR output for pages " << ranges[i].startPage << "-" << ranges[i].endPage << ":\n"
|
|
<< "----------------------------------------------------------------\n"
|
|
<< ocrResult << "\n"
|
|
<< "----------------------------------------------------------------" << std::endl << std::endl;
|
|
exit(0);
|
|
//break;
|
|
}
|
|
else
|
|
childPids[i] = p; //add to the child list
|
|
}
|
|
//wait
|
|
int stillWaiting;
|
|
do {
|
|
stillWaiting = 0;
|
|
for (int i = 0; i < threadCount; ++i)
|
|
{
|
|
if (childPids[i] > 0)
|
|
{
|
|
if (waitpid(childPids[i], NULL, WNOHANG) != 0)
|
|
childPids[i] = 0; //child is done
|
|
else
|
|
stillWaiting = 1; //still waiting
|
|
}
|
|
}
|
|
sleep(5);
|
|
} while (stillWaiting);
|
|
|
|
//clean up
|
|
free(childPids);
|
|
|
|
//now to put the files back together
|
|
std::string dir = inputFile.substr(0, inputFile.rfind("/") + 1);
|
|
glueFRTFiles(dir, isPretty, getFRTFileName(inputFile));
|
|
|
|
|
|
glueImageFiles(threadCount, dir, inputFile, getOutputFile(inputFile), singlePageOutput);
|
|
}
|
|
|
|
int OCRManager::getPageCnt(std::string inputFile)
|
|
{
|
|
int result;
|
|
std::string tmpResult;
|
|
std::stringstream inStr;
|
|
inStr << PAGE_COUNT_CMD << inputFile;
|
|
|
|
tmpResult = runProgram(inStr.str());
|
|
result = atoi(tmpResult.c_str());
|
|
|
|
return result;
|
|
}
|
|
|
|
// Collects the names of entries in dirName whose name contains `filter`.
//
// `files` is cleared first and then filled with bare entry names (no
// directory prefix). Entries whose name starts with '.' are skipped. If the
// directory cannot be opened, `files` stays empty.
//
// Returns the number of names stored in `files`.
size_t OCRManager::getFilesInDirectory(const std::string &dirName, std::vector<std::string> &files, const std::string filter)
{
    files.clear();

    DIR *handle = ::opendir(dirName.c_str());
    if (handle == NULL)
        return files.size();

    for (struct dirent *entry = ::readdir(handle); entry != NULL; entry = ::readdir(handle))
    {
        // Skip dot-entries (".", ".." and hidden files).
        if (entry->d_name[0] == '.')
            continue;

        const std::string name(entry->d_name);
        if (name.find(filter) == std::string::npos)
            continue;

        files.push_back(name);
    }
    ::closedir(handle);

    return files.size();
}
|
|
|
|
// Extracts the page range encoded in a file name of the form
// "<anything>_<start>-<end>.<ext>".
//
//   startPage <- digits following the last '_' (up to the last '-')
//   endPage   <- digits following the last '-' (up to the last '.')
//
// A missing '_' or '-' makes the corresponding field start at the beginning
// of the string, and text that is not a number parses as 0 (atoi semantics).
//
// std::string::substr takes (position, count); the earlier code passed an
// end *index* as the count and only worked because atoi() stops at the first
// non-digit character.
void OCRManager::getPageRange(std::string fileName, int &startPage, int &endPage)
{
    const std::string::size_type npos = std::string::npos;
    const std::string::size_type underscore = fileName.rfind('_');
    const std::string::size_type dash = fileName.rfind('-');
    const std::string::size_type dot = fileName.rfind('.');

    const std::string::size_type startBegin = (underscore == npos) ? 0 : underscore + 1;
    const std::string::size_type startLen =
        (dash != npos && dash >= startBegin) ? dash - startBegin : npos;

    const std::string::size_type endBegin = (dash == npos) ? 0 : dash + 1;
    const std::string::size_type endLen =
        (dot != npos && dot >= endBegin) ? dot - endBegin : npos;

    startPage = atoi(fileName.substr(startBegin, startLen).c_str());
    endPage = atoi(fileName.substr(endBegin, endLen).c_str());
}
|
|
|
|
// Returns the number embedded between the last '_' and the last '.' of an
// image file name (e.g. "doc_12.png" -> 12). If there is no '_' the number is
// read from the start of the name; if the '.' is missing or precedes the '_'
// the rest of the name is used. Non-numeric text yields 0 (atoi semantics).
static int imgNameIndex(const std::string &name)
{
    const std::string::size_type underscore = name.find_last_of('_');
    const std::string::size_type begin = (underscore == std::string::npos) ? 0 : underscore + 1;
    const std::string::size_type dot = name.rfind('.');

    // substr() takes a character count, not an end position.
    const std::string::size_type count =
        (dot != std::string::npos && dot >= begin) ? dot - begin : std::string::npos;

    return atoi(name.substr(begin, count).c_str());
}

// Ordering predicate for std::sort: true when the number embedded in imgLeft
// is smaller than the number embedded in imgRight.
bool imgNameCompare(const std::string &imgLeft, const std::string &imgRight)
{
    return imgNameIndex(imgLeft) < imgNameIndex(imgRight);
}
|
|
|
|
// Returns the first page number encoded in an FRT file name, i.e. the number
// between the last '_' and the last '-' (e.g. "doc_6-10.frt" -> 6). If there
// is no '_' the number is read from the start of the name; if the '-' is
// missing or precedes the '_' the rest of the name is used. Non-numeric text
// yields 0 (atoi semantics).
static int frtNameFirstPage(const std::string &name)
{
    const std::string::size_type underscore = name.find_last_of('_');
    const std::string::size_type begin = (underscore == std::string::npos) ? 0 : underscore + 1;
    const std::string::size_type dash = name.rfind('-');

    // substr() takes a character count, not an end position.
    const std::string::size_type count =
        (dash != std::string::npos && dash >= begin) ? dash - begin : std::string::npos;

    return atoi(name.substr(begin, count).c_str());
}

// Ordering predicate for std::sort: true when the first page encoded in
// frtLeft is smaller than the first page encoded in frtRight.
bool frtNameCompare(const std::string &frtLeft, const std::string &frtRight)
{
    return frtNameFirstPage(frtLeft) < frtNameFirstPage(frtRight);
}
|
|
|
|
void OCRManager::glueFRTFiles(std::string dir, bool prettyPrint, std::string outFile)
|
|
{
|
|
std::cout << "Gluing files in " << dir << std::endl;
|
|
JSONArray pages;
|
|
JSONArray badPages;
|
|
int numPages;
|
|
int numBadPages = 0;
|
|
std::string origSrcFile;
|
|
std::string origSrcType;
|
|
std::string txtSrcType;
|
|
bool fixedUp;
|
|
|
|
std::vector<std::string> frtFiles;
|
|
getFilesInDirectory(dir, frtFiles, ".frt");
|
|
|
|
std::sort(frtFiles.begin(), frtFiles.end(), frtNameCompare);
|
|
//load the JSON objects
|
|
for (int i = 0; i < frtFiles.size(); ++i)
|
|
{
|
|
std::string fileName = dir + frtFiles[i];
|
|
std::cout << " Gluing file " << fileName << std::endl;
|
|
JSONObject currentFile;
|
|
currentFile.LoadFromFile(fileName);
|
|
//need to keep track of the number of bad pages.
|
|
numBadPages += currentFile.getJSONInt("numBadPages");
|
|
//add the pages good and bad.
|
|
JSONArray currentBadPages = currentFile.getJSONArray("badPages");
|
|
JSONArray currentPages = currentFile.getJSONArray("pages");
|
|
//only need this from the first file.
|
|
if (i == 0)
|
|
{
|
|
numPages = currentFile.getJSONInt("numPages");
|
|
origSrcFile = currentFile.getJSONString("originalSourceFile");
|
|
origSrcType = currentFile.getJSONString("originalSourceType");
|
|
txtSrcType = currentFile.getJSONString("textSourceType");
|
|
fixedUp = currentFile.getJSONBool("fixedUp");
|
|
}
|
|
int start = 0;
|
|
int end = 0;
|
|
getPageRange(fileName, start, end);
|
|
for (int j = start - 1; j < end; ++j)
|
|
{
|
|
JSONObject ¤tPage = static_cast<JSONObject&>(currentPages[j]);
|
|
pages.addElement(currentPage);
|
|
}
|
|
//add all the bad pages.
|
|
for (int i = 0; i < currentBadPages.getnumitems(); ++i)
|
|
badPages.addElement(static_cast<JSONObject&>(currentBadPages[i]));
|
|
|
|
//remove the file.
|
|
deleteFile(fileName);
|
|
}
|
|
|
|
JSONObject result;
|
|
result.setJSONValue("numPages", numPages);
|
|
result.setJSONValue("numBadPages", numBadPages);
|
|
result.setJSONValue("originalSourceFile", origSrcFile.c_str());
|
|
result.setJSONValue("originalSourceType", origSrcType.c_str());
|
|
result.setJSONValue("textSourceType", txtSrcType.c_str());
|
|
result.setJSONValue("fixedUp", fixedUp);
|
|
result.setJSONValue("pages", pages);
|
|
result.setJSONValue("badPages", badPages);
|
|
|
|
std::cout << "Saving file: " << outFile << std::endl;
|
|
result.SaveToFile(outFile, prettyPrint);
|
|
|
|
}
|
|
|
|
void OCRManager::glueImageFiles(int threadCount, std::string dir, std::string inFile, std::string outFile, bool singlePageOutput)
|
|
{
|
|
std::string ext = outFile.substr(outFile.rfind('.'));
|
|
std::cout << "Gluing image files into " << outFile << std::endl;
|
|
std::vector<std::string> imgFiles;
|
|
|
|
if (!singlePageOutput)
|
|
getFilesInDirectory(dir, imgFiles, ext);
|
|
else
|
|
getFilesInDirectory(dir, imgFiles, ".png");
|
|
|
|
std::sort(imgFiles.begin(), imgFiles.end(), imgNameCompare);
|
|
|
|
std::stringstream inStr;
|
|
inStr << GHOST_SCRIPT_CMD << outFile << " -dBATCH ";
|
|
|
|
for (int i = 0; i < imgFiles.size(); ++i)
|
|
{ //don't add the source image.
|
|
if(inFile.find(imgFiles[i].c_str()) == std::string::npos)
|
|
inStr << dir + imgFiles[i] << " ";
|
|
}
|
|
inStr << outFile;
|
|
std::string result = runProgram(inStr.str());
|
|
std::cout << result;
|
|
|
|
//remove the old image files.
|
|
if (!singlePageOutput)
|
|
{
|
|
for (int i = 0; i < imgFiles.size(); ++i)
|
|
{ //but not the source.
|
|
if (inFile.find(imgFiles[i].c_str()) == std::string::npos)
|
|
deleteFile(dir + imgFiles[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Builds the per-range output name for inputFile by inserting
// "_<start>-<end>" in front of the extension, e.g.
// ("/data/doc.pdf", 1, 5) -> "/data/doc_1-5.pdf".
//
// The extension starts at the first '.' of the file-name component. Dots in
// directory components (such as "./doc.pdf" or "/home/a.b/doc.pdf") are
// ignored; searching the whole path previously split such paths in the
// wrong place. If the name has no '.', the suffix is appended at the end
// instead of throwing std::out_of_range.
std::string OCRManager::getOutputFile(std::string inputFile, int start, int end)
{
    const std::string::size_type slash = inputFile.rfind('/');
    const std::string::size_type nameBegin = (slash == std::string::npos) ? 0 : slash + 1;

    std::string::size_type dot = inputFile.find('.', nameBegin);
    if (dot == std::string::npos)
        dot = inputFile.size();

    std::stringstream result;
    result << inputFile.substr(0, dot);
    result << "_" << start << "-" << end << inputFile.substr(dot);

    return result.str();
}
|
|
|
|
// Removes fileName from the file system via std::remove(). The result of
// the removal is not checked.
void OCRManager::deleteFile(std::string fileName)
{
    const char *path = fileName.c_str();
    std::remove(path);
}
|
|
|
|
// Builds the name of the merged output for inputFile by inserting "_Fixed"
// in front of the extension, e.g. "/data/doc.pdf" -> "/data/doc_Fixed.pdf".
//
// The extension starts at the first '.' of the file-name component. Dots in
// directory components (such as "./doc.pdf" or "/home/a.b/doc.pdf") are
// ignored; searching the whole path previously split such paths in the
// wrong place. If the name has no '.', "_Fixed" is appended at the end
// instead of throwing std::out_of_range.
std::string OCRManager::getOutputFile(std::string inputFile)
{
    const std::string::size_type slash = inputFile.rfind('/');
    const std::string::size_type nameBegin = (slash == std::string::npos) ? 0 : slash + 1;

    std::string::size_type dot = inputFile.find('.', nameBegin);
    if (dot == std::string::npos)
        dot = inputFile.size();

    std::stringstream result;
    result << inputFile.substr(0, dot);
    result << "_Fixed" << inputFile.substr(dot);

    return result.str();
}
|
|
|
|
std::vector<Range> OCRManager::splitPages(std::string inputFile, int threadCount)
|
|
{
|
|
const int numPages = getPageCnt(inputFile);
|
|
std::cout << "Recognized " << numPages << " pages in " << inputFile << std::endl;
|
|
|
|
std::vector<Range> result;
|
|
//NEED TO ROUND
|
|
double tmpPageInc = (double)numPages / (double)threadCount;
|
|
|
|
int pageInc = (int)ceil(tmpPageInc);
|
|
int pagesLeft = numPages;
|
|
int prevPageEnd = -1;
|
|
//while there are pages to process
|
|
while (pagesLeft > 0)
|
|
{
|
|
//get the starting page
|
|
int pageStart = numPages - pagesLeft;
|
|
|
|
if (pageStart = prevPageEnd)
|
|
++pageStart;
|
|
|
|
int pageEnd = pageStart + pageInc;
|
|
//if the end page is past the last page
|
|
if (pageEnd > numPages)
|
|
pageEnd = numPages; //set it to the last page
|
|
//there is no page 0, but we don't want to affect the page end.
|
|
if (pageStart == 0)
|
|
pageStart = 1;
|
|
|
|
Range r;
|
|
r.startPage = pageStart;
|
|
r.endPage = pageEnd;
|
|
|
|
result.push_back(r);
|
|
|
|
//take note of the pages ran
|
|
int pagesRan = pageEnd - pageStart + 1;
|
|
//the start page was 1, don't forget that.
|
|
if (pageStart == 0)
|
|
pagesRan = pageEnd - pageStart;
|
|
pagesLeft -= pagesRan;
|
|
prevPageEnd = pageEnd;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// Derives the merged FRT file name for imgFile by replacing everything from
// the first '.' of the file-name component onwards with ".frt", e.g.
// "/data/doc.pdf" -> "/data/doc.frt". A name without '.' simply gets ".frt"
// appended.
//
// Only the component after the last '/' is searched for the '.', so dotted
// directories ("./doc.pdf", "/home/a.b/doc.pdf") no longer truncate the
// path.
std::string OCRManager::getFRTFileName(std::string imgFile)
{
    const std::string::size_type slash = imgFile.rfind('/');
    const std::string::size_type nameBegin = (slash == std::string::npos) ? 0 : slash + 1;

    // npos as a count means "to the end", which covers the no-dot case.
    std::string result = imgFile.substr(0, imgFile.find('.', nameBegin));
    result += ".frt";

    return result;
}
|
|
|
|
std::string OCRManager::callOCR(std::string inputFile, bool isPretty, int pagesInErrorToAllow, int threadCount, Range range, bool singlePageOutput)
|
|
{
|
|
std::string result;
|
|
std::stringstream inputStr;
|
|
inputStr << SLOCR_LOCATION << " -i " << inputFile << " -e " << pagesInErrorToAllow << " -r " << range.startPage << "-" << range.endPage;
|
|
|
|
if (isPretty)
|
|
inputStr << " -p";
|
|
if (singlePageOutput)
|
|
inputStr << " -s";
|
|
|
|
result = runProgram(inputStr.str());
|
|
return result;
|
|
}
|
|
|
|
// Runs `command` through popen() and returns everything it wrote to its
// standard output. An empty string is returned if the process could not be
// started.
//
// The read loop is driven by fgets() itself: the former `while (!feof(...))`
// form never terminated when fgets() failed with a read error, because the
// end-of-file indicator is not set in that case.
std::string OCRManager::runProgram(std::string command)
{
    std::string output;

    FILE *stream = popen(command.c_str(), "r");
    if (stream == NULL)
        return output;

    char buffer[256];
    while (fgets(buffer, sizeof(buffer), stream) != NULL)
        output.append(buffer);

    pclose(stream);
    return output;
}