// Sleds/slocr-mp/MultiProcessOCR/OCRManager.cpp
//
// 366 lines
// 9.7 KiB
// C++

#include "OCRManager.h"
#include <cerrno>
using namespace sequencelogic;
// Constructs an OCRManager. The constructor body performs no work.
OCRManager::OCRManager() {}
// Destroys an OCRManager. The destructor body performs no work.
OCRManager::~OCRManager() {}
// Runs the OCR executable named by SLOCR_LOCATION with the "-v" flag
// through system(). The status returned by system() is not inspected.
void OCRManager::ValidateOCR()
{
    const std::string validateCommand(SLOCR_LOCATION + " -v");
    system(validateCommand.c_str());
}
void OCRManager::RunOCR(std::string inputFile, bool isPretty, int pagesInErrorToAllow, int threadCount, bool singlePageOutput)
{
std::vector<std::string> OCROutputs;
std::vector<Range> ranges;
ranges = splitPages(inputFile, threadCount);
std::cout << "Running on " << inputFile << " with " << threadCount << " OCR Processes." << std::endl;
pid_t *childPids = NULL;
pid_t p;
childPids = (pid_t*) malloc(threadCount * sizeof(pid_t));
for (int i = 0; i < ranges.size(); ++i)
{
if ((p = fork()) == 0)
{
//in a child
std::string ocrResult = callOCR(inputFile, isPretty, pagesInErrorToAllow, threadCount, ranges[i], singlePageOutput);
std::cout << "OCR output for pages " << ranges[i].startPage << "-" << ranges[i].endPage << ":\n"
<< "----------------------------------------------------------------\n"
<< ocrResult << "\n"
<< "----------------------------------------------------------------" << std::endl << std::endl;
exit(0);
//break;
}
else
childPids[i] = p; //add to the child list
}
//wait
int stillWaiting;
do {
stillWaiting = 0;
for (int i = 0; i < threadCount; ++i)
{
if (childPids[i] > 0)
{
if (waitpid(childPids[i], NULL, WNOHANG) != 0)
childPids[i] = 0; //child is done
else
stillWaiting = 1; //still waiting
}
}
sleep(5);
} while (stillWaiting);
//clean up
free(childPids);
//now to put the files back together
std::string dir = inputFile.substr(0, inputFile.rfind("/") + 1);
glueFRTFiles(dir, isPretty, getFRTFileName(inputFile));
glueImageFiles(threadCount, dir, inputFile, getOutputFile(inputFile), singlePageOutput);
}
// Returns the page count of inputFile.
// The count is obtained by running PAGE_COUNT_CMD followed directly by the
// file path through runProgram() and converting the captured text with
// atoi(), so text that does not start with a number yields 0.
int OCRManager::getPageCnt(std::string inputFile)
{
    std::stringstream command;
    command << PAGE_COUNT_CMD << inputFile;

    const std::string output = runProgram(command.str());
    return atoi(output.c_str());
}
// Fills `files` with the names of the entries in dirName whose name contains
// `filter` and returns how many were collected.
//
// `files` is cleared first. Entries whose name begins with '.' are skipped.
// The stored names are the bare entry names, without dirName prepended.
// If the directory cannot be opened, `files` stays empty.
size_t OCRManager::getFilesInDirectory(const std::string &dirName, std::vector<std::string> &files, const std::string filter)
{
    files.clear();

    DIR *handle = ::opendir(dirName.c_str());
    if (handle == NULL)
        return files.size();

    for (struct dirent *entry = ::readdir(handle); entry != NULL; entry = ::readdir(handle))
    {
        if (entry->d_name[0] == '.')
            continue;

        const std::string entryName(entry->d_name);
        if (entryName.find(filter) == std::string::npos)
            continue;

        files.push_back(entryName);
    }

    ::closedir(handle);
    return files.size();
}
// Extracts the page range from a file name shaped like
// "<anything>_<start>-<end>.<ext>".
//
//   fileName  - file name, optionally preceded by a directory path.
//   startPage - receives the number that follows the last '_' of the name.
//   endPage   - receives the number that follows the last '-' of the name,
//               or 0 when there is no '-' after the start position.
//
// Numbers are converted with atoi(), so a missing or non-numeric part
// yields 0.
void OCRManager::getPageRange(std::string fileName, int &startPage, int &endPage)
{
    // Look only at the last path component so that '_' or '-' inside a
    // directory name is never mistaken for a range separator.
    const std::string::size_type lastSlash = fileName.rfind('/');
    const std::string baseName = (lastSlash == std::string::npos) ? fileName : fileName.substr(lastSlash + 1);

    const std::string::size_type underscore = baseName.rfind('_');
    const std::string::size_type dash = baseName.rfind('-');

    // atoi() stops at the first character that is not part of the number
    // ('-' or '.'), so no substring length has to be computed.
    const std::string::size_type startPos = (underscore == std::string::npos) ? 0 : underscore + 1;
    startPage = atoi(baseName.c_str() + startPos);

    if (dash != std::string::npos && dash >= startPos)
        endPage = atoi(baseName.c_str() + dash + 1);
    else
        endPage = 0;
}
// Ordering predicate for image file names: returns true when the number that
// follows the last '_' in imgLeft is smaller than the one in imgRight.
//
// The number is read with atoi() starting right after the last '_' (or at
// the start of the name when there is no '_'), so parsing stops at the first
// non-digit and a name without a number counts as 0.
bool imgNameCompare(const std::string &imgLeft, const std::string &imgRight)
{
    // find_last_of() returns npos when '_' is absent; npos + 1 wraps to 0,
    // which makes parsing start at the beginning of the name.
    const std::string::size_type leftPos = imgLeft.find_last_of('_') + 1;
    const std::string::size_type rightPos = imgRight.find_last_of('_') + 1;

    const int leftNumber = atoi(imgLeft.c_str() + leftPos);
    const int rightNumber = atoi(imgRight.c_str() + rightPos);
    return leftNumber < rightNumber;
}
// Ordering predicate for FRT file names: returns true when the number that
// follows the last '_' in frtLeft is smaller than the one in frtRight.
//
// The number is read with atoi() starting right after the last '_' (or at
// the start of the name when there is no '_'), so parsing stops at the '-'
// that follows it and a name without a number counts as 0.
bool frtNameCompare(const std::string &frtLeft, const std::string &frtRight)
{
    // find_last_of() returns npos when '_' is absent; npos + 1 wraps to 0,
    // which makes parsing start at the beginning of the name.
    const std::string::size_type leftPos = frtLeft.find_last_of('_') + 1;
    const std::string::size_type rightPos = frtRight.find_last_of('_') + 1;

    const int leftStart = atoi(frtLeft.c_str() + leftPos);
    const int rightStart = atoi(frtRight.c_str() + rightPos);
    return leftStart < rightStart;
}
void OCRManager::glueFRTFiles(std::string dir, bool prettyPrint, std::string outFile)
{
std::cout << "Gluing files in " << dir << std::endl;
JSONArray pages;
JSONArray badPages;
int numPages;
int numBadPages = 0;
std::string origSrcFile;
std::string origSrcType;
std::string txtSrcType;
bool fixedUp;
std::vector<std::string> frtFiles;
getFilesInDirectory(dir, frtFiles, ".frt");
std::sort(frtFiles.begin(), frtFiles.end(), frtNameCompare);
//load the JSON objects
for (int i = 0; i < frtFiles.size(); ++i)
{
std::string fileName = dir + frtFiles[i];
std::cout << " Gluing file " << fileName << std::endl;
JSONObject currentFile;
currentFile.LoadFromFile(fileName);
//need to keep track of the number of bad pages.
numBadPages += currentFile.getJSONInt("numBadPages");
//add the pages good and bad.
JSONArray currentBadPages = currentFile.getJSONArray("badPages");
JSONArray currentPages = currentFile.getJSONArray("pages");
//only need this from the first file.
if (i == 0)
{
numPages = currentFile.getJSONInt("numPages");
origSrcFile = currentFile.getJSONString("originalSourceFile");
origSrcType = currentFile.getJSONString("originalSourceType");
txtSrcType = currentFile.getJSONString("textSourceType");
fixedUp = currentFile.getJSONBool("fixedUp");
}
int start = 0;
int end = 0;
getPageRange(fileName, start, end);
for (int j = start - 1; j < end; ++j)
{
JSONObject &currentPage = static_cast<JSONObject&>(currentPages[j]);
pages.addElement(currentPage);
}
//add all the bad pages.
for (int i = 0; i < currentBadPages.getnumitems(); ++i)
badPages.addElement(static_cast<JSONObject&>(currentBadPages[i]));
//remove the file.
deleteFile(fileName);
}
JSONObject result;
result.setJSONValue("numPages", numPages);
result.setJSONValue("numBadPages", numBadPages);
result.setJSONValue("originalSourceFile", origSrcFile.c_str());
result.setJSONValue("originalSourceType", origSrcType.c_str());
result.setJSONValue("textSourceType", txtSrcType.c_str());
result.setJSONValue("fixedUp", fixedUp);
result.setJSONValue("pages", pages);
result.setJSONValue("badPages", badPages);
std::cout << "Saving file: " << outFile << std::endl;
result.SaveToFile(outFile, prettyPrint);
}
// Combines the per-range image files found in `dir` into outFile by running
// GHOST_SCRIPT_CMD through runProgram(), and prints whatever that command
// wrote to its standard output.
//
//   threadCount      - not used by this function.
//   dir              - directory to scan; it is prepended to each name as-is.
//   inFile           - path of the original input; any candidate whose name
//                      occurs inside this string is left out and never deleted.
//   outFile          - output path; its extension (from the last '.') selects
//                      the candidates unless singlePageOutput is set.
//   singlePageOutput - when true, ".png" files are selected instead and the
//                      selected files are kept after the command has run.
//
// Candidates are ordered with imgNameCompare. The command line is
// GHOST_SCRIPT_CMD, outFile, " -dBATCH ", each selected path followed by a
// space, and finally outFile once more.
void OCRManager::glueImageFiles(int threadCount, std::string dir, std::string inFile, std::string outFile, bool singlePageOutput)
{
    const std::string ext = outFile.substr(outFile.rfind('.'));
    std::cout << "Gluing image files into " << outFile << std::endl;

    std::vector<std::string> candidates;
    getFilesInDirectory(dir, candidates, singlePageOutput ? std::string(".png") : ext);
    std::sort(candidates.begin(), candidates.end(), imgNameCompare);

    // Full paths of every candidate except the source image.
    std::vector<std::string> gluePaths;
    for (size_t idx = 0; idx < candidates.size(); ++idx)
    {
        const bool isSource = inFile.find(candidates[idx].c_str()) != std::string::npos;
        if (!isSource)
            gluePaths.push_back(dir + candidates[idx]);
    }

    std::stringstream command;
    command << GHOST_SCRIPT_CMD << outFile << " -dBATCH ";
    for (size_t idx = 0; idx < gluePaths.size(); ++idx)
        command << gluePaths[idx] << " ";
    command << outFile;

    const std::string output = runProgram(command.str());
    std::cout << output;

    // Single-page output keeps its image files.
    if (singlePageOutput)
        return;

    //remove the old image files.
    for (size_t idx = 0; idx < gluePaths.size(); ++idx)
        deleteFile(gluePaths[idx]);
}
// Builds the name of a per-range output file by inserting "_<start>-<end>"
// in front of the extension of inputFile, e.g. "dir/doc.pdf", 1, 5 gives
// "dir/doc_1-5.pdf".
//
// The extension starts at the first '.' of the file name itself, so dots in
// the directory part (such as "./doc.pdf") are ignored. When the name has no
// '.', the suffix is appended to the whole path.
std::string OCRManager::getOutputFile(std::string inputFile, int start, int end)
{
    const std::string::size_type lastSlash = inputFile.rfind('/');
    const std::string::size_type nameStart = (lastSlash == std::string::npos) ? 0 : lastSlash + 1;

    std::string::size_type extPos = inputFile.find('.', nameStart);
    if (extPos == std::string::npos)
        extPos = inputFile.size(); // no extension: substr(extPos) is then ""

    std::stringstream result;
    result << inputFile.substr(0, extPos);
    result << "_" << start << "-" << end << inputFile.substr(extPos);
    return result.str();
}
// Removes the file at fileName with std::remove().
// The result of std::remove() is not checked, so a failure is not reported.
void OCRManager::deleteFile(std::string fileName)
{
    const char *path = fileName.c_str();
    std::remove(path);
}
// Builds the name of the merged output file by inserting "_Fixed" in front
// of the extension of inputFile, e.g. "dir/doc.pdf" gives "dir/doc_Fixed.pdf".
//
// The extension starts at the first '.' of the file name itself, so dots in
// the directory part (such as "./doc.pdf") are ignored. When the name has no
// '.', "_Fixed" is appended to the whole path.
std::string OCRManager::getOutputFile(std::string inputFile)
{
    const std::string::size_type lastSlash = inputFile.rfind('/');
    const std::string::size_type nameStart = (lastSlash == std::string::npos) ? 0 : lastSlash + 1;

    std::string::size_type extPos = inputFile.find('.', nameStart);
    if (extPos == std::string::npos)
        extPos = inputFile.size(); // no extension: substr(extPos) is then ""

    std::stringstream result;
    result << inputFile.substr(0, extPos);
    result << "_Fixed" << inputFile.substr(extPos);
    return result.str();
}
// Splits the pages of inputFile into contiguous, 1-based, inclusive ranges,
// one per OCR process.
//
//   inputFile   - document whose page count is obtained with getPageCnt().
//   threadCount - requested number of processes; values below 1 are treated
//                 as 1.
//
// Returns the ranges in ascending page order. Every range spans
// ceil(numPages / threadCount) pages except possibly the last, which is
// shorter, so at most threadCount ranges are produced. The result is empty
// when the page count is not positive.
std::vector<Range> OCRManager::splitPages(std::string inputFile, int threadCount)
{
    const int numPages = getPageCnt(inputFile);
    std::cout << "Recognized " << numPages << " pages in " << inputFile << std::endl;

    std::vector<Range> result;
    if (numPages <= 0)
        return result;

    // Guard against a zero or negative process count, which would otherwise
    // divide by zero or never advance.
    const int workers = (threadCount > 0) ? threadCount : 1;

    // Integer ceiling of numPages / workers.
    const int pagesPerRange = numPages / workers + ((numPages % workers != 0) ? 1 : 0);

    //there is no page 0, so the first range starts at page 1.
    int pageStart = 1;
    while (pageStart <= numPages)
    {
        //if the end page is past the last page, set it to the last page
        int pageEnd = pageStart + pagesPerRange - 1;
        if (pageEnd > numPages)
            pageEnd = numPages;

        Range r;
        r.startPage = pageStart;
        r.endPage = pageEnd;
        result.push_back(r);

        pageStart = pageEnd + 1;
    }
    return result;
}
// Returns the FRT file name that belongs to imgFile: the path with the
// extension of its file name replaced by ".frt", e.g. "dir/doc.pdf" gives
// "dir/doc.frt".
//
// The extension starts at the first '.' of the file name itself, so dots in
// the directory part (such as "./doc.pdf") are ignored. When the name has no
// '.', ".frt" is appended to the whole path.
std::string OCRManager::getFRTFileName(std::string imgFile)
{
    const std::string::size_type lastSlash = imgFile.rfind('/');
    const std::string::size_type nameStart = (lastSlash == std::string::npos) ? 0 : lastSlash + 1;

    // find() yields npos when there is no '.', and substr(0, npos) is then
    // the whole path.
    const std::string::size_type extPos = imgFile.find('.', nameStart);
    return imgFile.substr(0, extPos) + ".frt";
}
// Runs the OCR executable on one page range and returns the text it wrote to
// its standard output, as captured by runProgram().
//
//   inputFile           - passed after "-i".
//   isPretty            - appends "-p" when true.
//   pagesInErrorToAllow - passed after "-e".
//   threadCount         - not used by this function.
//   range               - passed after "-r" as "<startPage>-<endPage>".
//   singlePageOutput    - appends "-s" when true.
std::string OCRManager::callOCR(std::string inputFile, bool isPretty, int pagesInErrorToAllow, int threadCount, Range range, bool singlePageOutput)
{
    std::stringstream command;
    command << SLOCR_LOCATION
            << " -i " << inputFile
            << " -e " << pagesInErrorToAllow
            << " -r " << range.startPage << "-" << range.endPage;

    if (isPretty)
        command << " -p";
    if (singlePageOutput)
        command << " -s";

    return runProgram(command.str());
}
// Runs `command` through popen() and returns everything it wrote to its
// standard output.
//
// Returns an empty string when the command could not be started. The status
// returned by pclose() is not inspected.
std::string OCRManager::runProgram(std::string command)
{
    std::string result;

    FILE *stream = popen(command.c_str(), "r");
    if (stream == NULL)
        return result;

    // Loop on the result of fgets() itself: it returns NULL on end of file
    // and on a read error, whereas testing feof() alone never terminates
    // after a read error.
    char buffer[256];
    while (fgets(buffer, sizeof(buffer), stream) != NULL)
        result.append(buffer);

    pclose(stream);
    return result;
}