164 lines
5.7 KiB
Python
164 lines
5.7 KiB
Python
|
|
import json
|
||
|
|
import os
|
||
|
|
import argparse
|
||
|
|
import validictory
|
||
|
|
import concurrent.futures as futures
|
||
|
|
|
||
|
|
# schema = {}
|
||
|
|
# with open('OCROutput.json') as schema_file:
|
||
|
|
# schema = json.loads(schema_file.read())
|
||
|
|
|
||
|
|
|
||
|
|
def validate_lib_config_files(lib_dir):
    """Report doctype folders that a library's config names but that are missing.

    Scans ``<lib_dir>/../../config`` for files ending in ``.conf.json``. For
    every config whose ``libname`` equals the last path component of
    *lib_dir*, the ``containers`` and ``contained`` lists of its
    ``CLASSIFYWALKER`` section are combined, and each doctype that has no
    matching entry directly under *lib_dir* is recorded.

    Args:
        lib_dir: Path to the library directory (a trailing separator is
            tolerated).

    Returns:
        A dict mapping the path of each offending config file to a sorted
        list of the doctype folder paths that do not exist. The dict is
        empty when nothing is missing or when there is no config directory.

    Raises:
        ValueError: If a ``.conf.json`` file does not contain valid JSON.
        KeyError: If a config lacks ``libname``, or a matching config lacks
            ``CLASSIFYWALKER``.
    """
    result = {}
    # normpath drops a trailing separator; without it os.path.split() would
    # yield an empty library name and no config could ever match.
    lib_name = os.path.basename(os.path.normpath(lib_dir))
    print('Validating config files for library at "%s"' % lib_dir)
    cfg_dir = os.path.abspath(os.path.join(lib_dir, os.pardir, os.pardir, 'config'))

    if not os.path.isdir(cfg_dir):
        print('Error: There is no config directory at "%s"' % cfg_dir)
        # Nothing to inspect; listing a missing directory would raise.
        return result

    for cfg_name in os.listdir(cfg_dir):
        if not cfg_name.endswith('.conf.json'):
            continue

        cfg_path = os.path.join(cfg_dir, cfg_name)
        print('Testing config file "%s"' % cfg_path)
        with open(cfg_path) as cfg_data:
            cfg_info = json.load(cfg_data)

        # Configs that belong to other libraries are ignored.
        if cfg_info['libname'] != lib_name:
            continue

        walker_cfg = cfg_info['CLASSIFYWALKER']
        if 'containers' not in walker_cfg or 'contained' not in walker_cfg:
            print('There are no containers/contained types in "%s"' % cfg_path)
            continue

        # A doctype may appear in both lists; check each one once, in a
        # stable order so the report is reproducible.
        doctypes = sorted(set(walker_cfg['containers'] + walker_cfg['contained']))
        missing = [
            os.path.join(lib_dir, doctype)
            for doctype in doctypes
            if not os.path.exists(os.path.join(lib_dir, doctype))
        ]
        if missing:
            result[cfg_path] = missing

    return result
|
||
|
|
|
||
|
|
|
||
|
|
def is_bad_json(json_file):
    """Check whether a file fails to parse as JSON.

    Args:
        json_file: Path of the file to read.

    Returns:
        A ``(json_file, is_bad)`` tuple, where ``is_bad`` is True when the
        file's content could not be decoded as JSON and False otherwise.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(json_file) as reader:
        try:
            # validictory.validate(json.load(reader), schema)
            json.load(reader)
        except (ValueError, RecursionError):
            # ValueError covers malformed JSON (json.JSONDecodeError) and
            # undecodable bytes (UnicodeDecodeError); RecursionError covers
            # pathologically nested documents. Interrupts are not swallowed.
            return json_file, True
    return json_file, False
|
||
|
|
|
||
|
|
|
||
|
|
def test_frts(frt_files):
    """Return the subset of *frt_files* that ``is_bad_json`` flags as bad.

    Each path is checked on a pool of up to 16 worker threads. Paths are
    collected as their checks finish, so the order of the returned list
    follows completion order rather than the order of *frt_files*.
    """
    with futures.ThreadPoolExecutor(max_workers=16) as pool:
        pending = [pool.submit(is_bad_json, path) for path in frt_files]
        outcomes = [finished.result() for finished in futures.as_completed(pending)]
    return [path for path, flagged in outcomes if flagged]
|
||
|
|
|
||
|
|
|
||
|
|
def load_frts(folder):
    """List the ``.frt`` entries of *folder*.

    Returns a list with one path per directory entry whose name ends in
    ``.frt``, each joined onto *folder*. Order follows ``os.listdir``.
    """
    return [
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if entry.endswith('.frt')
    ]
|
||
|
|
|
||
|
|
|
||
|
|
def find_skipped_ocr(folder):
    """Find PDFs in *folder* that have no sibling ``.frt`` file.

    Entries whose names start with ``._`` are ignored. For every other entry
    ending in ``.pdf``, the expected companion is the same path with only the
    trailing ``.pdf`` swapped for ``.frt``.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list of the PDF paths (joined onto *folder*) whose ``.frt``
        companion does not exist.
    """
    pdf_suffix = '.pdf'
    skipped = []
    for name in os.listdir(folder):
        if not name.endswith(pdf_suffix) or name.startswith('._'):
            continue
        pdf_path = os.path.join(folder, name)
        # Swap only the trailing extension. A blanket str.replace() would also
        # rewrite '.pdf' inside directory names or earlier in the file name
        # and then look for a companion file at the wrong location.
        frt_path = pdf_path[:-len(pdf_suffix)] + '.frt'
        if not os.path.exists(frt_path):
            skipped.append(pdf_path)
    return skipped
|
||
|
|
|
||
|
|
|
||
|
|
def search_for_bad_json(library_dir):
    """Collect validation findings for every doctype folder of a library.

    Each directory directly under *library_dir* is treated as a doctype
    folder; other entries are skipped. For every such folder the function
    records whether it is empty, counts its entries by file extension,
    notes ``._*.pdf`` entries, runs ``test_frts`` over its ``.frt`` files and
    runs ``find_skipped_ocr`` over it.

    Args:
        library_dir: Path to the library directory.

    Returns:
        A dict with the keys:
            ``badFrts``: paths returned by ``test_frts``.
            ``fileTypeCounts``: mapping of the text after the last ``.`` in
                each entry name to the number of entries seen (an entry
                without a ``.`` is counted under its whole name).
            ``emptyFolders``: doctype folders with no entries.
            ``badPdfs``: for each entry named ``._<name>.pdf``, the path
                ``<folder>/<name>.pdf``.
            ``notOcrd``: paths returned by ``find_skipped_ocr``.
    """
    bad_frts = []
    file_types = {}
    empty_folders = []
    bad_pdfs = []
    not_ocr = []

    for entry in os.listdir(library_dir):
        doctype_folder = os.path.join(library_dir, entry)
        # Only directories are doctype folders. Testing isdir (rather than
        # "not a file") also skips broken symlinks, which cannot be listed.
        if not os.path.isdir(doctype_folder):
            continue

        print('Validating Folder: "%s"' % doctype_folder)
        file_names = os.listdir(doctype_folder)
        if not file_names:
            empty_folders.append(doctype_folder)

        for file_name in file_names:
            if file_name.startswith('._') and file_name.endswith('.pdf'):
                # Drop only the leading '._' of the file name; replacing
                # '._' across the whole joined path would also mangle any
                # directory or later name segment that contains it.
                # NOTE(review): presumably these are macOS "._" sidecar
                # files shadowing a real PDF -- confirm.
                bad_pdfs.append(os.path.join(doctype_folder, file_name[2:]))

            # Tally entries by the text after the last '.'.
            file_type = file_name[file_name.rfind('.') + 1:]
            file_types[file_type] = file_types.get(file_type, 0) + 1

        bad_frts.extend(test_frts(load_frts(doctype_folder)))
        not_ocr.extend(find_skipped_ocr(doctype_folder))

    return {
        'badFrts': bad_frts,
        'fileTypeCounts': file_types,
        'emptyFolders': empty_folders,
        'badPdfs': bad_pdfs,
        'notOcrd': not_ocr,
    }
|
||
|
|
|
||
|
|
|
||
|
|
def validate_library(data_dict):
    """Summarise raw validation findings into the report structure.

    Args:
        data_dict: Dict with the keys ``badFrts``, ``fileTypeCounts``,
            ``emptyFolders``, ``notOcrd``, ``badPdfs`` and ``badConfig``.

    Returns:
        A dict that echoes each finding next to its count, plus
        ``canIndex``, which is True when the number of ``frt`` files equals
        the number of ``tkn`` files. A file type that never occurred counts
        as zero, so a library without either type yields True.

    Raises:
        KeyError: If *data_dict* lacks one of the required keys.
    """
    type_counts = data_dict['fileTypeCounts']
    # A type with no files has no entry in the counts; treat it as 0 instead
    # of failing with KeyError on libraries that lack frt or tkn files.
    can_index = type_counts.get('frt', 0) == type_counts.get('tkn', 0)

    return {
        'numBadJson': len(data_dict['badFrts']),
        'badJsonFiles': data_dict['badFrts'],
        'canIndex': can_index,
        'fileTypes': type_counts,
        'numEmptyFolders': len(data_dict['emptyFolders']),
        'emptyFolders': data_dict['emptyFolders'],
        'numNotOCRd': len(data_dict['notOcrd']),
        'notOcrd': data_dict['notOcrd'],
        'numBadPdfs': len(data_dict['badPdfs']),
        'badPdfs': data_dict['badPdfs'],
        'numBadConfig': len(data_dict['badConfig']),
        'badConfig': data_dict['badConfig'],
    }
|
||
|
|
|
||
|
|
|
||
|
|
def main(lib_dir):
    """Validate the library at *lib_dir* and write the report next to it.

    Gathers the findings of ``search_for_bad_json`` and
    ``validate_lib_config_files``, summarises them with ``validate_library``
    and dumps the summary as JSON (indent of 3) to
    ``<lib_dir>/libValidation.json``.
    """
    findings = search_for_bad_json(lib_dir)
    findings['badConfig'] = validate_lib_config_files(lib_dir)
    report = validate_library(findings)

    report_path = os.path.join(lib_dir, 'libValidation.json')
    with open(report_path, 'w+') as report_file:
        json.dump(report, report_file, indent=3)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Command-line entry point: LibraryValidator -l <library directory>
    parser = argparse.ArgumentParser(prog='LibraryValidator',
                                     description='Provides metadata about library validation.')
    # The option is required, so argparse itself rejects a missing
    # -l/--library with a usage message. A sentinel default could never be
    # observed, which made the former print_help() branch unreachable.
    parser.add_argument('-l', '--library', type=str, dest='library_dir', required=True,
                        help='path to the library directory to validate')
    args = parser.parse_args()

    # Without nargs the value is already a plain string.
    main(args.library_dir)
|