# Sleds/utils/LibraryValidator.py
#
# 164 lines
# 5.7 KiB
# Python

import json
import os
import argparse
import validictory
import concurrent.futures as futures
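# NOTE: schema validation via validictory is currently disabled; is_bad_json()
# only checks that a file parses as JSON. To re-enable it, load the schema here:
# schema = {}
# with open('OCROutput.json') as schema_file:
#     schema = json.loads(schema_file.read())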
def validate_lib_config_files(lib_dir):
    """Check that every doctype named in the library's config files has a folder.

    The config directory is looked up at ``lib_dir/../../config``. Each
    ``*.conf.json`` file in it whose ``libname`` equals the library's folder
    name is inspected: every entry listed under ``CLASSIFYWALKER.containers``
    and ``CLASSIFYWALKER.contained`` must exist as a path inside ``lib_dir``.

    Args:
        lib_dir: Path to the library directory (a trailing separator is
            tolerated).

    Returns:
        A dict mapping the full path of each matching config file to the
        sorted list of doctype folder paths that do not exist. Config files
        with nothing missing are omitted. An empty dict is returned when the
        config directory itself is missing.

    Raises:
        ValueError: If a ``*.conf.json`` file does not contain valid JSON.
    """
    result = {}
    # normpath first so "path/to/lib/" does not yield an empty library name.
    lib_name = os.path.basename(os.path.normpath(lib_dir))
    print('Validating config files for library at "%s"' % lib_dir)
    cfg_dir = os.path.abspath(os.path.join(lib_dir, os.pardir, os.pardir, 'config'))
    if not os.path.isdir(cfg_dir):
        print('Error: There is no config directory at "%s"' % cfg_dir)
        # Nothing to inspect; listing the directory would only raise.
        return result

    # Sorted so that the printed log and the resulting report are stable.
    for cfg_name in sorted(os.listdir(cfg_dir)):
        if not cfg_name.endswith('.conf.json'):
            continue
        cfg_path = os.path.join(cfg_dir, cfg_name)
        print('Testing config file "%s"' % cfg_path)
        with open(cfg_path) as cfg_data:
            cfg_info = json.load(cfg_data)
        # Config files that are not JSON objects, or that belong to another
        # library, are not this function's concern.
        if not isinstance(cfg_info, dict) or cfg_info.get('libname') != lib_name:
            continue

        walker_cfg = cfg_info.get('CLASSIFYWALKER') or {}
        if 'containers' not in walker_cfg or 'contained' not in walker_cfg:
            print('There are no containers/contained types in "%s"' % cfg_path)
            continue

        doctypes = set(walker_cfg['containers']) | set(walker_cfg['contained'])
        missing = [
            os.path.join(lib_dir, doctype)
            for doctype in sorted(doctypes)
            if not os.path.exists(os.path.join(lib_dir, doctype))
        ]
        if missing:
            result[cfg_path] = missing
    return result
def is_bad_json(json_file):
    """Report whether *json_file* fails to parse as JSON.

    Only JSON syntax is checked; schema validation is currently disabled.

    Args:
        json_file: Path of the file to check.

    Returns:
        A ``(json_file, is_bad)`` tuple where ``is_bad`` is ``True`` when the
        file could not be parsed and ``False`` when it parsed cleanly.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(json_file) as reader:
        try:
            json.load(reader)
        except (ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # RecursionError is raised for pathologically nested documents.
            return json_file, True
    return json_file, False
def test_frts(frt_files):
    """Return the subset of *frt_files* that do not parse as JSON.

    Files are checked concurrently with ``is_bad_json`` on a pool of up to 16
    threads. Results are reported in the same order as the input so that
    repeated runs over the same files produce the same list.

    Args:
        frt_files: Iterable of file paths to check.

    Returns:
        List of the paths that ``is_bad_json`` flagged as bad.
    """
    with futures.ThreadPoolExecutor(max_workers=16) as thread_pool:
        # Executor.map yields results in input order, unlike as_completed.
        checked = thread_pool.map(is_bad_json, frt_files)
        return [frt for frt, is_bad in checked if is_bad]
def load_frts(folder):
    """Collect the paths of all ``.frt`` entries directly inside *folder*.

    Args:
        folder: Directory to list (not searched recursively).

    Returns:
        List of ``folder``-joined paths for every entry whose name ends with
        ``.frt``, in the order ``os.listdir`` returns them.
    """
    return [
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if entry.endswith('.frt')
    ]
def find_skipped_ocr(folder):
    """Find PDFs in *folder* that have no matching ``.frt`` file next to them.

    Entries whose name starts with ``._`` are ignored.

    Args:
        folder: Directory to inspect (not searched recursively).

    Returns:
        List of full paths of ``.pdf`` files for which a file with the same
        base name and a ``.frt`` extension does not exist in *folder*.
    """
    pdf_suffix = '.pdf'
    skipped = []
    for name in os.listdir(folder):
        if not name.endswith(pdf_suffix) or name.startswith('._'):
            continue
        pdf_path = os.path.join(folder, name)
        # Swap only the trailing extension; a plain str.replace would also
        # rewrite any ".pdf" occurring in a directory name or mid-filename.
        frt_path = pdf_path[:-len(pdf_suffix)] + '.frt'
        if not os.path.exists(frt_path):
            skipped.append(pdf_path)
    return skipped
def search_for_bad_json(library_dir):
    """Scan every doctype folder of a library and collect validation findings.

    Each directory directly inside *library_dir* is treated as a doctype
    folder; anything that is not a directory is skipped.

    Args:
        library_dir: Path to the library directory.

    Returns:
        A dict with the keys:

        * ``badFrts`` - ``.frt`` files that failed the ``test_frts`` check.
        * ``fileTypeCounts`` - number of files per extension (text after the
          last ``.`` of the file name; a name without a dot is counted under
          the whole name).
        * ``emptyFolders`` - doctype folders with no entries.
        * ``badPdfs`` - for every ``._*.pdf`` entry, the path of the same
          file name without the ``._`` prefix.
        * ``notOcrd`` - PDFs reported by ``find_skipped_ocr``.
    """
    bad_frts = []
    file_types = {}
    empty_folders = []
    bad_pdfs = []
    not_ocr = []

    for entry in os.listdir(library_dir):
        doctype_folder = os.path.join(library_dir, entry)
        # Only directories can be doctype folders; this also skips broken
        # symlinks and special files that os.listdir could not open.
        if not os.path.isdir(doctype_folder):
            continue
        print('Validating Folder: "%s"' % doctype_folder)

        file_names = os.listdir(doctype_folder)
        if not file_names:
            empty_folders.append(doctype_folder)

        for file_name in file_names:
            if file_name.startswith('._') and file_name.endswith('.pdf'):
                # Strip only the leading "._" of the file name; replacing
                # "._" across the whole path would corrupt directory names.
                bad_pdfs.append(os.path.join(doctype_folder, file_name[2:]))
            # Tally files by extension.
            file_type = file_name[file_name.rfind('.') + 1:]
            file_types[file_type] = file_types.get(file_type, 0) + 1

        bad_frts.extend(test_frts(load_frts(doctype_folder)))
        not_ocr.extend(find_skipped_ocr(doctype_folder))

    return {'badFrts': bad_frts, 'fileTypeCounts': file_types, 'emptyFolders': empty_folders,
            'badPdfs': bad_pdfs, 'notOcrd': not_ocr}
def validate_library(data_dict):
    """Summarise raw validation findings into the report structure.

    Args:
        data_dict: Dict with the keys ``badFrts``, ``fileTypeCounts``,
            ``emptyFolders``, ``notOcrd``, ``badPdfs`` and, optionally,
            ``badConfig`` (treated as empty when absent).

    Returns:
        A dict holding each finding together with its count, plus
        ``canIndex``, which is ``True`` when the number of ``frt`` files
        equals the number of ``tkn`` files. An extension that never occurred
        counts as zero.
    """
    file_types = data_dict['fileTypeCounts']
    # A library without any frt/tkn files simply has a count of 0 for them;
    # indexing the dict directly would raise KeyError in that case.
    can_index = file_types.get('frt', 0) == file_types.get('tkn', 0)
    bad_config = data_dict.get('badConfig', {})
    return {
        'numBadJson': len(data_dict['badFrts']),
        'badJsonFiles': data_dict['badFrts'],
        'canIndex': can_index,
        'fileTypes': file_types,
        'numEmptyFolders': len(data_dict['emptyFolders']),
        'emptyFolders': data_dict['emptyFolders'],
        'numNotOCRd': len(data_dict['notOcrd']),
        'notOcrd': data_dict['notOcrd'],
        'numBadPdfs': len(data_dict['badPdfs']),
        'badPdfs': data_dict['badPdfs'],
        'numBadConfig': len(bad_config),
        'badConfig': bad_config,
    }
def main(lib_dir):
    """Validate the library at *lib_dir* and write the report next to it.

    Runs the folder scan and the config-file check, condenses both into one
    report and stores it as ``libValidation.json`` inside *lib_dir*.

    Args:
        lib_dir: Path to the library directory.
    """
    report_path = os.path.join(lib_dir, 'libValidation.json')

    findings = search_for_bad_json(lib_dir)
    findings.update(badConfig=validate_lib_config_files(lib_dir))
    report = validate_library(findings)

    with open(report_path, 'w+') as report_file:
        json.dump(report, report_file, indent=3)
if __name__ == '__main__':
    # Command-line entry point: LibraryValidator -l path/to/library
    parser = argparse.ArgumentParser(prog='LibraryValidator',
                                     description='Provides metadata about library validation.')
    # The option is required, so argparse itself prints usage and exits when
    # it is missing; no sentinel default or manual help fallback is needed.
    parser.add_argument('-l', '--library', type=str, dest='library_dir', required=True,
                        help='Path to the library directory to validate.')
    args = parser.parse_args()
    main(args.library_dir)