Sleds/slocr/ocrbase.h

207 lines
4.5 KiB
C++

//
// Copyright (c) 2016, Sequence Logic
//
#ifndef OCR_BASE
#define OCR_BASE
#include <string>
#include <vector>
#include <mutex>
#include <iostream>
#include <string>
#include <fstream>
#include <ctime>
#include <vector>
#include <sstream>
#include "Mutex.h"
#define DEFAULT_THREAD_COUNT 1
#define DEFAULT_TMP_DIR "/tmp/slocr/"
namespace sequencelogic
{
/**
 * Bundle of settings passed to the OCR entry points.
 *
 * A default-constructed instance has both boolean flags off, every page
 * counter/index at 0, DEFAULT_THREAD_COUNT threads, DEFAULT_TMP_DIR as the
 * temporary folder, and empty input-file / single-page-folder names.
 */
struct OCROpts
{
    // Flags (both off by default).
    bool _bPrettyOutput = false;
    bool _bSinglePageSave = false;

    // Numeric settings.
    int _numPagesInError = 0;
    int _nThreadCount = DEFAULT_THREAD_COUNT;
    int _nStartPage = 0;
    int _nEndPage = 0;

    // Paths. _inFile starts out empty and must be filled in by the caller.
    std::string _inFile;
    std::string _tempFolder = DEFAULT_TMP_DIR;
    std::string _singlePageFolder = "";

    // Kept user-provided (not "= default") so the struct remains a
    // non-aggregate, exactly as before; the defaults live on the members.
    OCROpts() {}
};
/**
 * A base class for an OCR engine.
 *
 * Concrete engines derive from this class and implement the pure virtual
 * hooks OCR(), OCRMT() and verify(). The non-virtual helpers declared here
 * are defined out of line in the accompanying source file.
 *
 * All built-in-typed data members carry default member initializers so that
 * an object created through the inline default constructor never exposes
 * indeterminate values.
 */
class COCRBase
{
public:
    /** Creates an engine with every member in its default state. */
    COCRBase() {}

    /**
     * Creates an engine for the given input file.
     * NOTE(review): takes a non-const reference and is not `explicit`; the
     * signature is kept as-is because the definition lives outside this header.
     */
    COCRBase(std::string &inFile);

    virtual ~COCRBase() {}

    /**
     * Recognize/OCR something.
     *
     * @param opts  Options describing the run.
     * @return An integer status; its meaning is defined by the implementation
     *         (not visible in this header).
     */
    int Recognize(const OCROpts &opts);

    /**
     * Validates that the OCR engine is configured correctly.
     */
    bool Validate();

protected:
#pragma region Members
    std::string getXMLName(const std::string inFile);
    void copyFile(const std::string& startFile, const std::string& destFile);

    /**
     * The number of threads the program will utilize.
     * Defaults to a single thread.
     */
    int threadCount = 1;

    /**
     * Specifies whether or not the image is a TIF.
     */
    bool isTIF = false;

    /**
     * Specifies whether or not the image is a PDF.
     */
    bool isPDF = false;

    /**
     * The name of the original image file.
     */
    std::string imgFile;

    /**
     * The name of the fixed image file.
     */
    std::string fixedImgFile;
    std::string fixedImgPageFile;

    /**
     * The number of pages allowed to be in error, before bailing out.
     */
    int nPagesinErrToAllow = 0;

    /**
     * The name of the temporary fixed image file.
     */
    std::string tmpImgFile;
#pragma endregion Members to be inherited by the base classes.

    /**
     * Prints a message to the console in a "thread safe" way.
     */
    void printMessage(std::string msg);

    /**
     * OCR a single file.
     */
    int OCRFile(const OCROpts &opts);

    /**
     * OCR all files in a given directory.
     */
    int OCRFiles(const OCROpts &opts);

    /*
     * The one, and only, interface. For now...
     * Both functions must be implemented by every concrete engine.
     */
    virtual int OCR(const OCROpts &opts, const std::string &outFile) = 0;
    virtual int OCRMT(const std::string &inFile, int threadCount = 2) = 0;

    /**
     * Checks to see if the engine is properly set up.
     */
    virtual bool verify() = 0;

    /**
     * Get a file type string, based on its extension.
     */
    std::string getTypeByFileExt(const std::string &fileName);

    /**
     * Get the value of a binary number in base 10.
     */
    long fromBin(long n);

    /**
     * Converts the 'error' value that Nuance gives into a confidence value
     * between 1-9.
     */
    int convertConfidence(long confidence);

    /**
     * Gets the bit from a byte at the specified position.
     */
    bool getBit(unsigned char byte, int position);

    /**
     * Gets the maximum value between start and stop in the array vals.
     */
    int getMax(int start, int stop, int* vals);

    /**
     * Gets the minimum value between start and stop in the array vals.
     */
    int getMin(int start, int stop, int* vals);

    /**
     * Gets the number of words (separated by spaces) in the given array.
     */
    int getNumWords(char chars[], int numChars);

    /**
     * Returns true if the given fileName is a TIF image.
     */
    bool isTIFImg(const std::string &fileName);

    /**
     * Returns true if the given fileName is a PDF image.
     */
    bool isPDFImg(const std::string &fileName);

    /**
     * Gets the name for the "fixed" image, optionally for a page range.
     */
    std::string getFixedImgName(const std::string inFile);
    std::string getFixedImgName(const std::string inFile, int startPage, int endPage);

    /**
     * Gets the temporary name for the "fixed" image.
     */
    std::string getTmpImgName(const std::string &tmpDir, const std::string &inFile);

    //---------------------FOR TESTING---------------------------------------------------------------------------
    std::string getFixedImgName(const std::string inFile, int pageNum);
    std::string getJSONName(const std::string &inFile, int pageNum);

private:
    /**
     * Get all files in a given directory. Returns the number of files found.
     */
    size_t getFilesInDirectory(const std::string &dirName, std::vector<std::string> &files);

    /**
     * The mutex to prevent messages causing a race condition.
     */
    Mutex messageLock;
};
};
#endif