#include "OCRManager.h"

using namespace sequencelogic;

// NOTE(review): the template arguments in this file (std::vector<std::string>,
// std::vector<Range>, static_cast<JSONObject &>) were reconstructed from how the
// values are used below -- confirm them against the declarations in OCRManager.h
// and the JSON library headers.

namespace
{
    // Returns the index of the first '.' in the file-name part of 'path' (the
    // part after the last '/'), or path.size() when the file name has no dot.
    // Dots in directory components ("./scan.pdf", "/data/v1.2/scan.pdf") are
    // ignored so they are never mistaken for the start of the extension.
    std::string::size_type extensionPos(const std::string &path)
    {
        const std::string::size_type slash = path.rfind('/');
        const std::string::size_type nameStart = (slash == std::string::npos) ? 0 : slash + 1;
        const std::string::size_type dot = path.find('.', nameStart);
        return (dot == std::string::npos) ? path.size() : dot;
    }

    // Parses the integer that follows the last occurrence of 'marker' in 'text'.
    // When the marker is absent the whole string is parsed. atoi stops at the
    // first non-digit and yields 0 when no digits are present.
    int numberAfterLast(const std::string &text, char marker)
    {
        // rfind() + 1 is 0 when the marker is missing (npos wraps around).
        return atoi(text.c_str() + (text.rfind(marker) + 1));
    }
}

OCRManager::OCRManager()
{
}

OCRManager::~OCRManager()
{
}

// Runs the OCR executable with the "-v" switch through system().
void OCRManager::ValidateOCR()
{
    system(std::string(SLOCR_LOCATION + " -v").c_str());
}

// Splits 'inputFile' into page ranges, runs one forked OCR process per range,
// waits for all of them, then glues the per-range .frt and image files that are
// found in the input file's directory back together.
//
// inputFile           - path of the document to process
// isPretty            - forwarded to the OCR command ("-p") and to the FRT writer
// pagesInErrorToAllow - forwarded to the OCR command ("-e")
// threadCount         - upper bound on the number of OCR processes
// singlePageOutput    - forwarded to the OCR command ("-s"); also selects which
//                       image files are glued afterwards
void OCRManager::RunOCR(std::string inputFile, bool isPretty, int pagesInErrorToAllow, int threadCount, bool singlePageOutput)
{
    const std::vector<Range> ranges = splitPages(inputFile, threadCount);

    std::cout << "Running on " << inputFile << " with " << threadCount << " OCR Processes." << std::endl;

    // One slot per range actually produced. splitPages may return fewer ranges
    // than threadCount, so sizing (and waiting) by threadCount would touch
    // slots that were never filled.
    std::vector<pid_t> childPids(ranges.size(), 0);

    for (std::vector<Range>::size_type i = 0; i < ranges.size(); ++i)
    {
        const pid_t p = fork();
        if (p == 0)
        {
            //in a child
            std::string ocrResult = callOCR(inputFile, isPretty, pagesInErrorToAllow, threadCount, ranges[i], singlePageOutput);
            std::cout << "OCR output for pages " << ranges[i].startPage << "-" << ranges[i].endPage << ":\n"
                      << "----------------------------------------------------------------\n"
                      << ocrResult << "\n"
                      << "----------------------------------------------------------------" << std::endl << std::endl;
            exit(0);
        }

        if (p < 0)
            std::cerr << "Failed to start OCR process for pages " << ranges[i].startPage << "-" << ranges[i].endPage << std::endl;

        childPids[i] = p; //add to the child list (failed forks are skipped by the wait loop)
    }

    //wait: poll every child until all of them have been reaped
    bool stillWaiting;
    do
    {
        stillWaiting = false;
        for (std::vector<pid_t>::size_type i = 0; i < childPids.size(); ++i)
        {
            if (childPids[i] > 0)
            {
                // Non-zero means the child was reaped (or waitpid failed);
                // either way there is nothing left to wait for.
                if (waitpid(childPids[i], NULL, WNOHANG) != 0)
                    childPids[i] = 0; //child is done
                else
                    stillWaiting = true; //still waiting
            }
        }

        // Only pause when there is something left to wait for.
        if (stillWaiting)
            sleep(5);
    } while (stillWaiting);

    //now to put the files back together
    // A bare file name has no directory part; use the current directory so the
    // directory scans below still work.
    const std::string::size_type slash = inputFile.rfind('/');
    const std::string dir = (slash == std::string::npos) ? std::string("./") : inputFile.substr(0, slash + 1);

    glueFRTFiles(dir, isPretty, getFRTFileName(inputFile));
    glueImageFiles(threadCount, dir, inputFile, getOutputFile(inputFile), singlePageOutput);
}

// Runs PAGE_COUNT_CMD followed by 'inputFile' and returns the command's output
// converted with atoi (0 when the output does not start with a number).
// NOTE(review): the file name is appended to the shell command unquoted.
int OCRManager::getPageCnt(std::string inputFile)
{
    std::stringstream inStr;
    inStr << PAGE_COUNT_CMD << inputFile;

    return atoi(runProgram(inStr.str()).c_str());
}

// Replaces the contents of 'files' with the names of the entries in 'dirName'
// that do not start with '.' and contain 'filter' anywhere in their name.
// Returns the number of names collected (0 when the directory cannot be opened).
size_t OCRManager::getFilesInDirectory(const std::string &dirName, std::vector<std::string> &files, const std::string filter)
{
    files.clear();

    DIR *dpdf = ::opendir(dirName.c_str());
    if (dpdf != NULL)
    {
        struct dirent *epdf;
        while ((epdf = ::readdir(dpdf)) != NULL)
        {
            if (epdf->d_name[0] != '.')
            {
                std::string str(epdf->d_name);
                if (str.find(filter) != std::string::npos)
                    files.push_back(str);
            }
        }
        ::closedir(dpdf);
    }

    return files.size();
}

// Extracts the page range from a name of the form "<base>_<start>-<end>.<ext>":
// startPage is the number after the last '_', endPage the number after the last
// '-'. Either value is 0 when no digits follow the marker.
void OCRManager::getPageRange(std::string fileName, int &startPage, int &endPage)
{
    startPage = numberAfterLast(fileName, '_');
    endPage = numberAfterLast(fileName, '-');
}

// Sort predicate: orders image file names by the number after their last '_'.
bool imgNameCompare(const std::string &imgLeft, const std::string &imgRight)
{
    return numberAfterLast(imgLeft, '_') < numberAfterLast(imgRight, '_');
}

// Sort predicate: orders FRT file names by the number after their last '_'
// (the start page of a "<base>_<start>-<end>.frt" name).
bool frtNameCompare(const std::string &frtLeft, const std::string &frtRight)
{
    return numberAfterLast(frtLeft, '_') < numberAfterLast(frtRight, '_');
}

// Merges the per-range ".frt" JSON files found in 'dir' into 'outFile'.
//
// Files are processed in start-page order. From each file the pages inside the
// range encoded in its name are appended to the combined "pages" array, all of
// its "badPages" entries are appended, its "numBadPages" is summed, and the file
// is then deleted. The document-level fields are taken from the first file that
// is merged. Files whose name does not carry a valid "_<start>-<end>" range are
// skipped and left on disk.
void OCRManager::glueFRTFiles(std::string dir, bool prettyPrint, std::string outFile)
{
    std::cout << "Gluing files in " << dir << std::endl;

    JSONArray pages;
    JSONArray badPages;
    int numPages = 0;       // stays 0 when no file is merged
    int numBadPages = 0;
    std::string origSrcFile;
    std::string origSrcType;
    std::string txtSrcType;
    bool fixedUp = false;   // stays false when no file is merged
    bool haveDocumentInfo = false;

    std::vector<std::string> frtFiles;
    getFilesInDirectory(dir, frtFiles, ".frt");
    std::sort(frtFiles.begin(), frtFiles.end(), frtNameCompare);

    //load the JSON objects
    for (std::vector<std::string>::size_type i = 0; i < frtFiles.size(); ++i)
    {
        const std::string fileName = dir + frtFiles[i];

        int start = 0;
        int end = 0;
        getPageRange(fileName, start, end);

        // A name without a usable range (for example the output of an earlier
        // run) would make the page loop below start at index -1.
        if (start < 1 || end < start)
        {
            std::cout << " Skipping file " << fileName << std::endl;
            continue;
        }

        std::cout << " Gluing file " << fileName << std::endl;

        JSONObject currentFile;
        currentFile.LoadFromFile(fileName);

        //need to keep track of the number of bad pages.
        numBadPages += currentFile.getJSONInt("numBadPages");

        //add the pages good and bad.
        JSONArray currentBadPages = currentFile.getJSONArray("badPages");
        JSONArray currentPages = currentFile.getJSONArray("pages");

        //only need this from the first file.
        if (!haveDocumentInfo)
        {
            numPages = currentFile.getJSONInt("numPages");
            origSrcFile = currentFile.getJSONString("originalSourceFile");
            origSrcType = currentFile.getJSONString("originalSourceType");
            txtSrcType = currentFile.getJSONString("textSourceType");
            fixedUp = currentFile.getJSONBool("fixedUp");
            haveDocumentInfo = true;
        }

        // Page numbers are 1-based, array indexes 0-based.
        // NOTE(review): cast target reconstructed as JSONObject & -- confirm.
        for (int j = start - 1; j < end; ++j)
        {
            JSONObject &currentPage = static_cast<JSONObject &>(currentPages[j]);
            pages.addElement(currentPage);
        }

        //add all the bad pages.
        // NOTE(review): cast target reconstructed as JSONObject & -- confirm.
        for (int k = 0; k < currentBadPages.getnumitems(); ++k)
            badPages.addElement(static_cast<JSONObject &>(currentBadPages[k]));

        //remove the file.
        deleteFile(fileName);
    }

    JSONObject result;
    result.setJSONValue("numPages", numPages);
    result.setJSONValue("numBadPages", numBadPages);
    result.setJSONValue("originalSourceFile", origSrcFile.c_str());
    result.setJSONValue("originalSourceType", origSrcType.c_str());
    result.setJSONValue("textSourceType", txtSrcType.c_str());
    result.setJSONValue("fixedUp", fixedUp);
    result.setJSONValue("pages", pages);
    result.setJSONValue("badPages", badPages);

    std::cout << "Saving file: " << outFile << std::endl;
    result.SaveToFile(outFile, prettyPrint);
}

// Combines the image files found in 'dir' into 'outFile' by running
// GHOST_SCRIPT_CMD, then deletes the combined inputs unless singlePageOutput
// is set.
//
// Candidates are the files in 'dir' whose name contains outFile's extension
// (or ".png" when singlePageOutput is set), ordered by the number after their
// last '_'. A file whose name occurs inside 'inFile' is treated as the source
// document and is neither combined nor deleted. 'threadCount' is not used.
// NOTE(review): file names are placed in the shell command unquoted.
void OCRManager::glueImageFiles(int threadCount, std::string dir, std::string inFile, std::string outFile, bool singlePageOutput)
{
    const std::string::size_type dot = outFile.rfind('.');
    if (dot == std::string::npos && !singlePageOutput)
    {
        // Without an extension the filter would be empty and match every file
        // in the directory, all of which would then be deleted.
        std::cerr << "Cannot glue image files: " << outFile << " has no extension" << std::endl;
        return;
    }
    const std::string ext = (dot == std::string::npos) ? std::string() : outFile.substr(dot);

    std::cout << "Gluing image files into " << outFile << std::endl;

    std::vector<std::string> imgFiles;
    if (!singlePageOutput)
        getFilesInDirectory(dir, imgFiles, ext);
    else
        getFilesInDirectory(dir, imgFiles, ".png");
    std::sort(imgFiles.begin(), imgFiles.end(), imgNameCompare);

    std::stringstream inStr;
    inStr << GHOST_SCRIPT_CMD << outFile << " -dBATCH ";
    for (std::vector<std::string>::size_type i = 0; i < imgFiles.size(); ++i)
    {
        //don't add the source image.
        if (inFile.find(imgFiles[i].c_str()) == std::string::npos)
            inStr << dir + imgFiles[i] << " ";
    }
    inStr << outFile;

    std::string result = runProgram(inStr.str());
    std::cout << result;

    //remove the old image files.
    if (!singlePageOutput)
    {
        for (std::vector<std::string>::size_type i = 0; i < imgFiles.size(); ++i)
        {
            //but not the source.
            if (inFile.find(imgFiles[i].c_str()) == std::string::npos)
                deleteFile(dir + imgFiles[i]);
        }
    }
}

// Builds "<base>_<start>-<end><ext>" from 'inputFile', where <ext> starts at
// the first '.' of the file name (dots in directory names are ignored). A name
// without a dot simply gets the suffix appended.
std::string OCRManager::getOutputFile(std::string inputFile, int start, int end)
{
    const std::string::size_type dot = extensionPos(inputFile);

    std::stringstream result;
    result << inputFile.substr(0, dot);
    result << "_" << start << "-" << end << inputFile.substr(dot);

    return result.str();
}

// Removes 'fileName'; the result of std::remove is ignored.
void OCRManager::deleteFile(std::string fileName)
{
    //std::cout << " Deleting file " << fileName << "..." << std::endl;
    std::remove(fileName.c_str());
    //std::cout << " Deleted." << std::endl;
}

// Builds "<base>_Fixed<ext>" from 'inputFile', where <ext> starts at the first
// '.' of the file name (dots in directory names are ignored). A name without a
// dot simply gets "_Fixed" appended.
std::string OCRManager::getOutputFile(std::string inputFile)
{
    const std::string::size_type dot = extensionPos(inputFile);

    std::stringstream result;
    result << inputFile.substr(0, dot);
    result << "_Fixed" << inputFile.substr(dot);

    return result.str();
}

// Divides the pages of 'inputFile' (1-based, as reported by getPageCnt) into at
// most 'threadCount' consecutive ranges of ceil(pages / threadCount) pages each;
// the last range may be shorter. Returns an empty vector when the page count is
// not positive. A threadCount below 1 is treated as 1.
std::vector<Range> OCRManager::splitPages(std::string inputFile, int threadCount)
{
    const int numPages = getPageCnt(inputFile);
    std::cout << "Recognized " << numPages << " pages in " << inputFile << std::endl;

    std::vector<Range> result;
    if (numPages <= 0)
        return result;

    const int workers = (threadCount < 1) ? 1 : threadCount;

    // Round up so that 'workers' ranges always cover every page.
    const int pageInc = (numPages + workers - 1) / workers;

    //there is no page 0, so the first range starts at page 1.
    for (int pageStart = 1; pageStart <= numPages; pageStart += pageInc)
    {
        int pageEnd = pageStart + pageInc - 1;

        //if the end page is past the last page
        if (pageEnd > numPages)
            pageEnd = numPages; //set it to the last page

        Range r;
        r.startPage = pageStart;
        r.endPage = pageEnd;
        result.push_back(r);
    }

    return result;
}

// Returns 'imgFile' with everything from the first '.' of its file name onwards
// replaced by ".frt" (dots in directory names are ignored).
std::string OCRManager::getFRTFileName(std::string imgFile)
{
    std::string result = imgFile.substr(0, extensionPos(imgFile));
    result += ".frt";

    return result;
}

// Runs the OCR executable on one page range and returns what it printed.
// The command line is: SLOCR_LOCATION -i <inputFile> -e <pagesInErrorToAllow>
// -r <start>-<end>, plus " -p" when isPretty and " -s" when singlePageOutput.
// 'threadCount' is not used.
// NOTE(review): the file name is placed in the shell command unquoted.
std::string OCRManager::callOCR(std::string inputFile, bool isPretty, int pagesInErrorToAllow, int threadCount, Range range, bool singlePageOutput)
{
    std::stringstream inputStr;
    inputStr << SLOCR_LOCATION << " -i " << inputFile << " -e " << pagesInErrorToAllow << " -r " << range.startPage << "-" << range.endPage;

    if (isPretty)
        inputStr << " -p";
    if (singlePageOutput)
        inputStr << " -s";

    return runProgram(inputStr.str());
}

// Runs 'command' through popen() and returns everything it wrote to its
// standard output. Returns an empty string when the command cannot be started.
std::string OCRManager::runProgram(std::string command)
{
    std::string result;

    FILE *stream = popen(command.c_str(), "r");
    if (stream)
    {
        const int MAX_BUFFER = 256;
        char buffer[MAX_BUFFER];

        // fgets returns NULL on end-of-file and on a read error, so the loop
        // ends in both cases.
        while (fgets(buffer, MAX_BUFFER, stream) != NULL)
            result.append(buffer);

        pclose(stream);
    }

    return result;
}