431 lines
10 KiB
C++
431 lines
10 KiB
C++
//
|
|
// Copyright (c) 2016, Sequence Logic
|
|
//
|
|
#include "ocrbase.h"

#include "sledsconstants.h"

#include <sys/types.h>
#include <sys/sysinfo.h>
#include <sys/stat.h>
#include <unistd.h>
#include <dirent.h>

#include <algorithm>
#include <cassert>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
|
|
|
|
using namespace sequencelogic;
|
|
|
|
namespace
{
    /**
     * Report the amount of system memory currently in use, in bytes.
     *
     * The value is computed from sysinfo(2) as (RAM in use + swap in use),
     * scaled by the kernel's mem_unit. It describes the whole machine, not
     * just this process.
     *
     * @return bytes of RAM + swap in use, or 0 if sysinfo() fails.
     */
    uint64_t memused()
    {
        struct sysinfo meminfo = {};

        // Without this check a failed call would leave us reading garbage.
        if (sysinfo(&meminfo) != 0)
            return 0;

        // Widen to 64 bits before combining the counters so that the final
        // multiplication by mem_unit cannot overflow a 32-bit unsigned long.
        uint64_t unitsInUse = static_cast<uint64_t>(meminfo.totalram) - meminfo.freeram;
        unitsInUse += static_cast<uint64_t>(meminfo.totalswap) - meminfo.freeswap;

        return unitsInUse * meminfo.mem_unit;
    }
}
|
|
|
|
/**
 * Construct an OCR helper for the given image file.
 *
 * @param inFile path of the image to be processed; copied into imgFile.
 */
COCRBase::COCRBase(std::string &inFile)
{
    imgFile = inFile;

    // The type flags must be set BEFORE building the fixed-image name:
    // getFixedImgName() reads isTIF / isPDF to choose the "_fixed.<ext>" suffix.
    isTIF = isTIFImg(imgFile);
    isPDF = isPDFImg(imgFile);
    fixedImgFile = getFixedImgName(imgFile);
}
|
|
|
|
//TODO: add a flag for error messages
|
|
void COCRBase::printMessage(std::string msg)
|
|
{
|
|
messageLock.lock();
|
|
std::cout << " " << msg << std::endl;
|
|
messageLock.unlock();
|
|
}
|
|
|
|
bool::COCRBase::Validate()
|
|
{
|
|
bool valid = verify();
|
|
|
|
if (valid)
|
|
printMessage(std::string("The license is valid."));
|
|
else
|
|
printMessage(std::string("Error with the license"));
|
|
}
|
|
|
|
int COCRBase::Recognize(const OCROpts &opts)
|
|
{
|
|
int nRetVal = OCR_SUCCESS;
|
|
uint64_t mem = memused();
|
|
nPagesinErrToAllow = opts._numPagesInError;
|
|
|
|
// File or folder?
|
|
std::string tmpName = opts._inFile;
|
|
if (*tmpName.rbegin() == '/')
|
|
tmpName.pop_back();
|
|
|
|
struct stat fStat;
|
|
bool bIsFolder = false;
|
|
if (stat(tmpName.c_str(), &fStat) == 0)
|
|
bIsFolder = S_ISDIR(fStat.st_mode);
|
|
|
|
if (bIsFolder)
|
|
nRetVal = OCRFiles(opts);
|
|
else
|
|
nRetVal = OCRFile(opts);
|
|
|
|
std::stringstream msg;
|
|
mem = memused() - mem;
|
|
msg << "Memory used: " << mem / (1024.0 * 1024 * 1024) << "GB";
|
|
printMessage(msg.str());
|
|
|
|
return nRetVal;
|
|
}
|
|
|
|
std::string COCRBase::getFixedImgName(const std::string inFile)
|
|
{
|
|
std::string result = inFile;
|
|
|
|
if (result.rfind('.') != std::string::npos)
|
|
result.erase(result.rfind('.'));
|
|
|
|
//if we decide to add pdf support in the future
|
|
if (isTIF)
|
|
result += "_fixed.tif";
|
|
//We did
|
|
else if (isPDF)
|
|
result += "_fixed.pdf";
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
 * Build the per-page JSON file name: "<inFile without extension>_<pageNum>.json".
 *
 * @param inFile  source file name; text from the last '.' onwards is dropped
 *                (a name without '.' is used as-is).
 * @param pageNum page number, written in decimal.
 * @return the JSON file name.
 */
std::string COCRBase::getJSONName(const std::string& inFile, int pageNum)
{
    // substr(0, npos) keeps the whole name, so a missing extension no longer
    // makes erase() throw std::out_of_range.
    std::string result = inFile.substr(0, inFile.rfind('.'));

    result += "_";
    // "result += pageNum" would append the int as a single character code,
    // not as its decimal representation.
    result += std::to_string(pageNum);
    result += ".json";

    return result;
}
|
|
|
|
/**
 * Build the XML file name for an input file by replacing its extension
 * with ".xml".
 *
 * @param inFile source file name; text from the last '.' onwards is dropped
 *               (a name without '.' simply gets ".xml" appended).
 * @return the XML file name.
 */
std::string COCRBase::getXMLName(const std::string inFile)
{
    // substr(0, npos) keeps the whole name; the previous erase(npos) threw
    // std::out_of_range when the name had no extension.
    std::string result = inFile.substr(0, inFile.rfind('.'));
    result += ".xml";

    return result;
}
|
|
|
|
std::string COCRBase::getTmpImgName(const std::string &tmpDir, const std::string &inFile)
|
|
{
|
|
std::string result = tmpDir;
|
|
if (result[result.length()-1] == '/')
|
|
result.pop_back();
|
|
|
|
// Check that the temp area exists...
|
|
struct stat fStat;
|
|
if ((stat(result.c_str(), &fStat) != 0) ||
|
|
!S_ISDIR(fStat.st_mode))
|
|
{
|
|
if (mkdir(result.c_str(), 0777) != 0)
|
|
{
|
|
printMessage(std::string("Error creating: ") + result);
|
|
switch (errno)
|
|
{
|
|
case EACCES:
|
|
printMessage("Search permission is denied on a component of the path prefix, or write permission is denied on the parent directory of the directory to be created.");
|
|
break;
|
|
case EEXIST:
|
|
printMessage("The named file exists.");
|
|
break;
|
|
case ELOOP:
|
|
printMessage("A loop exists in symbolic links encountered during resolution of the path argument.");
|
|
break;
|
|
case EMLINK:
|
|
printMessage("The link count of the parent directory would exceed{ LINK_MAX }.");
|
|
break;
|
|
case ENAMETOOLONG:
|
|
printMessage("The length of the path argument exceeds{ PATH_MAX } or a pathname component is longer than{ NAME_MAX }.");
|
|
break;
|
|
case ENOENT:
|
|
printMessage("A component of the path prefix specified by path does not name an existing directory or path is an empty string.");
|
|
break;
|
|
case ENOSPC:
|
|
printMessage("The file system does not contain enough space to hold the contents of the new directory or to extend the parent directory of the new directory.");
|
|
break;
|
|
case ENOTDIR:
|
|
printMessage("A component of the path prefix is not a directory.");
|
|
break;
|
|
case EROFS:
|
|
printMessage("The parent directory resides on a read-only file system.");
|
|
break;
|
|
default:
|
|
{
|
|
std::stringstream msg;
|
|
msg << "Unknown error: " << errno;
|
|
printMessage(msg.str());
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
result += '/';
|
|
|
|
std::string fileName = inFile;
|
|
fileName.erase(fileName.rfind('.'));
|
|
|
|
std::stringstream fileNameStr;
|
|
fileNameStr << fileName.substr(fileName.find_last_of('/') + 1);
|
|
fileNameStr << getpid() << "-" << std::this_thread::get_id();
|
|
fileName = getFixedImgName(fileNameStr.str()); //need the _fixed at the end
|
|
|
|
//save the result and return
|
|
result += fileName;
|
|
|
|
std::stringstream message;
|
|
message << "Tmp File path: " << result;
|
|
printMessage(message.str());
|
|
|
|
return result;
|
|
}
|
|
|
|
std::string COCRBase::getFixedImgName(const std::string inFile, int pageNum)
|
|
{
|
|
std::string result = inFile;
|
|
|
|
result.erase(result.rfind('.'));
|
|
|
|
std::stringstream messageStream;;
|
|
|
|
//if we decide to add pdf support in the future
|
|
if (isTIF)
|
|
messageStream << result << "_" << pageNum << ".tif";
|
|
//We did
|
|
else if (isPDF)
|
|
messageStream << result << "_" << pageNum << ".pdf";
|
|
|
|
result = messageStream.str();
|
|
return result;
|
|
}
|
|
|
|
std::string COCRBase::getFixedImgName(const std::string inFile, int startPage, int endPage)
|
|
{
|
|
std::string result = inFile;
|
|
|
|
result.erase(result.rfind('.'));
|
|
|
|
std::stringstream messageStream;;
|
|
|
|
//if we decide to add pdf support in the future
|
|
if (isTIF)
|
|
messageStream << result << "_" << startPage << "-" << endPage << ".tif";
|
|
//We did
|
|
else if (isPDF)
|
|
messageStream << result << "_" << startPage << "-" << endPage << ".pdf";
|
|
|
|
result = messageStream.str();
|
|
return result;
|
|
}
|
|
|
|
int COCRBase::OCRFile(const OCROpts &opts)
|
|
{
|
|
isPDF = isPDFImg(opts._inFile);
|
|
isTIF = isTIFImg(opts._inFile);
|
|
int nRetVal = OCR_SUCCESS;
|
|
bool cancel;
|
|
|
|
//don't OCR a fixed image!!!
|
|
if (opts._inFile.find("_fixed.") == 0)
|
|
return true;
|
|
|
|
std::string outFile = opts._inFile;
|
|
outFile.erase(outFile.rfind('.'));
|
|
outFile += ".frt";
|
|
|
|
printMessage(std::string("Recognizing '" + opts._inFile + "', into file '" + outFile + "\n"));
|
|
|
|
nRetVal = OCR(opts, outFile);
|
|
|
|
return nRetVal;
|
|
}
|
|
|
|
int COCRBase::OCRFiles(const OCROpts &opts)
|
|
{
|
|
int nRetVal = OCR_SUCCESS;
|
|
|
|
std::vector<std::string> files;
|
|
if (getFilesInDirectory(opts._inFile, files) > 0)
|
|
{
|
|
std::string inDir = opts._inFile;
|
|
if (*inDir.rbegin() != '/')
|
|
inDir += "/";
|
|
for (size_t i = 0; (nRetVal == OCR_SUCCESS) && (i < files.size()); ++i)
|
|
{
|
|
std::string tmpName = files[i];
|
|
tmpName.erase(0, tmpName.rfind('.'));
|
|
std::transform(tmpName.begin(), tmpName.end(), tmpName.begin(), ::tolower);
|
|
if (isTIFImg(tmpName) || isPDFImg(tmpName))
|
|
{
|
|
OCROpts newOpts(opts);
|
|
newOpts._inFile = inDir + files[i];
|
|
nRetVal = OCRFile(newOpts);
|
|
}
|
|
}
|
|
}
|
|
|
|
return nRetVal;
|
|
}
|
|
|
|
/**
 * Tell whether a file name has a TIFF extension (".tif" or ".tiff",
 * compared case-insensitively).
 *
 * @param fileName file name or path; only the text from the last '.' is examined.
 * @return true for a TIFF extension, false otherwise (including no '.').
 */
bool COCRBase::isTIFImg(const std::string &fileName)
{
    const std::string::size_type dotPos = fileName.rfind('.');
    if (dotPos == std::string::npos)
        return false;

    std::string ext = fileName.substr(dotPos);
    // tolower() requires a value representable as unsigned char; passing a
    // plain (possibly negative) char, e.g. a UTF-8 byte, is undefined.
    std::transform(ext.begin(), ext.end(), ext.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    return ext == ".tiff" || ext == ".tif";
}
|
|
|
|
/**
 * Tell whether a file name has a ".pdf" extension (compared
 * case-insensitively).
 *
 * @param fileName file name or path; only the text from the last '.' is examined.
 * @return true for a PDF extension, false otherwise (including no '.').
 */
bool COCRBase::isPDFImg(const std::string &fileName)
{
    const std::string::size_type dotPos = fileName.rfind('.');
    if (dotPos == std::string::npos)
        return false;

    std::string ext = fileName.substr(dotPos);
    // tolower() requires a value representable as unsigned char; passing a
    // plain (possibly negative) char, e.g. a UTF-8 byte, is undefined.
    std::transform(ext.begin(), ext.end(), ext.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    return ext == ".pdf";
}
|
|
|
|
/**
 * List the entries of a directory, skipping every name that starts with '.'
 * (which covers ".", ".." and hidden entries).
 *
 * @param dirName directory to read.
 * @param files   cleared, then filled with the entry names (no path prefix).
 * @return number of names stored in files; 0 if the directory cannot be opened.
 */
size_t COCRBase::getFilesInDirectory(const std::string &dirName, std::vector<std::string> &files)
{
    files.clear();

    DIR *dirHandle = ::opendir(dirName.c_str());
    if (dirHandle == NULL)
        return files.size();

    for (struct dirent *entry = ::readdir(dirHandle); entry != NULL; entry = ::readdir(dirHandle))
    {
        if (entry->d_name[0] == '.')
            continue;

        files.push_back(entry->d_name);
    }

    ::closedir(dirHandle);

    return files.size();
}
|
|
|
|
void COCRBase::copyFile(const std::string& srcFile, const std::string& dstFile)
|
|
{
|
|
//move the file.
|
|
std::stringstream message;
|
|
message << "Copying file " << srcFile << " to " << dstFile;
|
|
printMessage(message.str());
|
|
message.str("");
|
|
|
|
std::ifstream srcStream(srcFile.c_str());
|
|
std::ofstream dstStream(dstFile.c_str());
|
|
|
|
dstStream << srcStream.rdbuf();
|
|
|
|
dstStream.close();
|
|
srcStream.close();
|
|
|
|
std::remove(srcFile.c_str());
|
|
}
|
|
|
|
/**
|
|
* Get a file type string, based on it's extension.
|
|
*/
|
|
std::string COCRBase::getTypeByFileExt(const std::string &fileName)
|
|
{
|
|
std::string retVal = "UNKNOWN";
|
|
|
|
if (isTIFImg(fileName))
|
|
retVal = "TIF_IMG";
|
|
else if (isPDFImg(fileName))
|
|
retVal = "PDF_IMG";
|
|
|
|
return retVal;
|
|
}
|
|
|
|
/**
 * Interpret the decimal digits of n as binary digits and return the value
 * they encode, e.g. 101 -> 5. Each decimal digit is multiplied by the power
 * of two for its position; digits are not checked to be 0 or 1.
 *
 * @param n number whose decimal digits are read as bits.
 * @return the sum of digit * 2^position over all digits of n.
 */
long COCRBase::fromBin(long n)
{
    long value = 0;

    // Walk the digits from least to most significant, doubling the weight each step.
    for (long weight = 1; n != 0; n /= 10, weight *= 2)
        value += (n % 10) * weight;

    return value;
}
|
|
|
|
/**
 * Convert a confidence value into an inverted bucket number.
 *
 * The input is divided by 10 (integer division), capped at 8, and the
 * result is subtracted from 9 - so inputs of 80 and above give 1, and
 * inputs of 0-9 give 9.
 *
 * @param confidence raw confidence value.
 * @return 9 minus the capped bucket.
 */
int COCRBase::convertConfidence(long confidence)
{
    // Bucket = original value divided by 10, never allowed above 8.
    const int bucket = static_cast<int>(confidence / 10);
    const int capped = std::min(bucket, 8);

    return 9 - capped;
}
|
|
|
|
/**
 * Read one bit of a byte.
 *
 * @param byte     value to inspect.
 * @param position bit index, where 0 is the least significant bit.
 * @return true when the selected bit is set.
 */
bool COCRBase::getBit(unsigned char byte, int position)
{
    // Shift the wanted bit down to position 0, then isolate it.
    const int shifted = byte >> position;
    return (shifted & 0x1) != 0;
}
|
|
|
|
/**
 * Find the largest value in vals[start .. stop-1].
 *
 * vals[start] is always read, so it must be a valid element even when
 * stop <= start + 1 (in which case it is the result).
 *
 * @param start index of the first element to consider.
 * @param stop  index one past the last element to consider.
 * @param vals  array of values.
 * @return the maximum of the inspected elements.
 */
int COCRBase::getMax(int start, int stop, int *vals)
{
    int largest = vals[start];

    int idx = start + 1;
    while (idx < stop)
    {
        largest = std::max(largest, vals[idx]);
        ++idx;
    }

    return largest;
}
|
|
|
|
/**
 * Find the smallest value in vals[start .. stop-1].
 *
 * vals[start] is always read, so it must be a valid element even when
 * stop <= start + 1 (in which case it is the result).
 *
 * @param start index of the first element to consider.
 * @param stop  index one past the last element to consider.
 * @param vals  array of values.
 * @return the minimum of the inspected elements.
 */
int COCRBase::getMin(int start, int stop, int *vals)
{
    int smallest = vals[start];

    int idx = start + 1;
    while (idx < stop)
    {
        smallest = std::min(smallest, vals[idx]);
        ++idx;
    }

    return smallest;
}
|
|
|
|
/**
 * Count the whitespace characters in the first numChars elements of chars.
 *
 * NOTE(review): despite the name, this counts separators rather than words
 * (consecutive spaces each count, and a trailing word without a following
 * space does not) - callers presumably rely on that, so it is unchanged.
 *
 * @param chars    character buffer; need not be NUL-terminated.
 * @param numChars number of characters to inspect.
 * @return number of characters for which isspace() is true.
 */
int COCRBase::getNumWords(char chars[], int numChars)
{
    int numWords = 0;

    for (int i = 0; i < numChars; ++i)
    {
        // isspace() requires a value representable as unsigned char; a plain
        // (possibly negative) char from OCR output would be undefined.
        if (std::isspace(static_cast<unsigned char>(chars[i])))
            ++numWords;
    }

    return numWords;
}
|