"""Validate an OCR document library and write a JSON report.

The validator walks every doctype folder of a library, looks for ``.frt``
files that are not parseable JSON, PDFs that have no ``.frt`` next to them,
``._*.pdf`` stub files and empty folders, cross-checks the library's config
files, and dumps the findings to ``libValidation.json`` inside the library.
"""
import argparse
import concurrent.futures as futures
import json
import os

try:
    # Only needed by the (currently disabled) schema check in is_bad_json, so
    # a missing install must not prevent the validator from running.
    import validictory
except ImportError:
    validictory = None

# Schema validation is currently disabled. To re-enable it, load the schema
# here and call validictory.validate() in is_bad_json:
# schema = {}
# with open('OCROutput.json') as schema_file:
#     schema = json.loads(schema_file.read())


def validate_lib_config_files(lib_dir):
    """Check that the doctype folders named in the library's configs exist.

    The config directory is looked up at ``<lib_dir>/../../config``. Every
    ``*.conf.json`` file in it whose ``libname`` equals the library's folder
    name is inspected: each entry of ``CLASSIFYWALKER.containers`` and
    ``CLASSIFYWALKER.contained`` must be a folder inside ``lib_dir``.

    Args:
        lib_dir: Path of the library folder.

    Returns:
        A dict mapping config file path -> sorted list of the doctype folder
        paths that the config names but that do not exist. Empty when nothing
        is missing or when there is no config directory.

    Raises:
        ValueError: If a ``*.conf.json`` file is not valid JSON.
    """
    result = {}
    # abspath() strips a trailing separator, which would otherwise make the
    # library name an empty string and match no config at all.
    lib_name = os.path.basename(os.path.abspath(lib_dir))
    print('Validating config files for library at "%s"' % lib_dir)
    cfg_dir = os.path.abspath(
        os.path.join(lib_dir, os.pardir, os.pardir, 'config'))
    if not os.path.exists(cfg_dir):
        print('Error: There is no config directory at "%s"' % cfg_dir)
        # Nothing to inspect; listing the missing directory would raise.
        return result

    for cfg_name in sorted(os.listdir(cfg_dir)):
        if not cfg_name.endswith('.conf.json'):
            continue
        cfg_path = os.path.join(cfg_dir, cfg_name)
        print('Testing config file "%s"' % cfg_path)
        with open(cfg_path) as cfg_data:
            cfg_info = json.load(cfg_data)
        if cfg_info.get('libname') != lib_name:
            continue  # Config belongs to another library.

        walker_cfg = cfg_info.get('CLASSIFYWALKER', {})
        if 'containers' not in walker_cfg or 'contained' not in walker_cfg:
            print('There are no containers/contained types in "%s"' % cfg_path)
            continue

        doctypes = set(walker_cfg['containers'] + walker_cfg['contained'])
        missing = [
            os.path.join(lib_dir, doctype)
            for doctype in sorted(doctypes)
            if not os.path.exists(os.path.join(lib_dir, doctype))
        ]
        if missing:
            result[cfg_path] = missing
    return result


def is_bad_json(json_file):
    """Report whether ``json_file`` fails to parse as JSON.

    Args:
        json_file: Path of the file to check.

    Returns:
        A ``(json_file, is_bad)`` tuple; ``is_bad`` is True when the content
        cannot be decoded or parsed as JSON.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(json_file) as reader:
        try:
            # validictory.validate(json.load(reader), schema)
            json.load(reader)
        except (ValueError, RecursionError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError;
            # RecursionError covers pathologically nested documents.
            return json_file, True
    return json_file, False


def test_frts(frt_files):
    """Return the files in ``frt_files`` that are not valid JSON.

    The files are checked concurrently on a thread pool.

    Args:
        frt_files: Iterable of file paths.

    Returns:
        List of the bad file paths, in the order they appear in ``frt_files``.
    """
    with futures.ThreadPoolExecutor(max_workers=16) as thread_pool:
        # map() yields results in submission order, so the report is stable
        # from run to run.
        checked = list(thread_pool.map(is_bad_json, frt_files))
    return [frt for frt, is_bad in checked if is_bad]


# Despite its name this is not a unit test; keep test runners that collect
# ``test_*`` callables (pytest, nose) from trying to run it.
test_frts.__test__ = False


def load_frts(folder):
    """Return the sorted paths of all ``.frt`` files directly in ``folder``."""
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.endswith('.frt')
    ]


def find_skipped_ocr(folder):
    """Return the PDFs in ``folder`` that have no ``.frt`` file beside them.

    Files starting with ``._`` are ignored here.

    Args:
        folder: Path of the folder to scan (not recursive).

    Returns:
        Sorted list of PDF paths without a same-named ``.frt`` file.
    """
    result = []
    for name in sorted(os.listdir(folder)):
        if not name.endswith('.pdf') or name.startswith('._'):
            continue
        pdf_path = os.path.join(folder, name)
        # Swap only the extension; a plain str.replace would also rewrite
        # '.pdf' occurring in a directory name or earlier in the file name.
        frt_path = os.path.splitext(pdf_path)[0] + '.frt'
        if not os.path.exists(frt_path):
            result.append(pdf_path)
    return result


def search_for_bad_json(library_dir):
    """Scan every doctype folder of a library and collect raw findings.

    Args:
        library_dir: Path of the library; each sub-directory is scanned.

    Returns:
        A dict with the keys:
            ``badFrts``: ``.frt`` files that are not valid JSON.
            ``fileTypeCounts``: count of entries per file extension.
            ``emptyFolders``: doctype folders with no entries.
            ``badPdfs``: for every ``._*.pdf`` entry, the path with the
                ``._`` prefix removed from the file name.
            ``notOcrd``: PDFs without a matching ``.frt`` file.
    """
    bad_frts = []
    file_types = {}
    empty_folders = []
    bad_pdfs = []
    not_ocr = []

    for entry in sorted(os.listdir(library_dir)):
        doctype_folder = os.path.join(library_dir, entry)
        # Only directories can be doctype folders; this also skips broken
        # symlinks and other entries that cannot be listed.
        if not os.path.isdir(doctype_folder):
            continue
        print('Validating Folder: "%s"' % doctype_folder)

        names = sorted(os.listdir(doctype_folder))
        if not names:
            empty_folders.append(doctype_folder)

        for name in names:
            if name.startswith('._') and name.endswith('.pdf'):
                # Strip the prefix from the file name only, never from the
                # directory part of the path.
                bad_pdfs.append(os.path.join(doctype_folder, name[2:]))
            # Text after the last dot; a name without a dot is counted under
            # the whole name because rfind() returns -1.
            file_type = name[name.rfind('.') + 1:]
            file_types[file_type] = file_types.get(file_type, 0) + 1

        bad_frts.extend(test_frts(load_frts(doctype_folder)))
        not_ocr.extend(find_skipped_ocr(doctype_folder))

    return {
        'badFrts': bad_frts,
        'fileTypeCounts': file_types,
        'emptyFolders': empty_folders,
        'badPdfs': bad_pdfs,
        'notOcrd': not_ocr,
    }


def validate_library(data_dict):
    """Turn the raw findings into the final report structure.

    Args:
        data_dict: Dict as returned by ``search_for_bad_json``, optionally
            extended with a ``badConfig`` entry (treated as empty if absent).

    Returns:
        The report dict: each finding list together with its length, the
        file type counts, and ``canIndex``, which is True when the number of
        ``frt`` files equals the number of ``tkn`` files.
    """
    counts = data_dict['fileTypeCounts']
    # A library may contain no frt or tkn files at all; count those as zero
    # instead of failing on the missing key.
    can_index = counts.get('frt', 0) == counts.get('tkn', 0)
    bad_config = data_dict.get('badConfig', {})
    return {
        'numBadJson': len(data_dict['badFrts']),
        'badJsonFiles': data_dict['badFrts'],
        'canIndex': can_index,
        'fileTypes': counts,
        'numEmptyFolders': len(data_dict['emptyFolders']),
        'emptyFolders': data_dict['emptyFolders'],
        'numNotOCRd': len(data_dict['notOcrd']),
        'notOcrd': data_dict['notOcrd'],
        'numBadPdfs': len(data_dict['badPdfs']),
        'badPdfs': data_dict['badPdfs'],
        'numBadConfig': len(bad_config),
        'badConfig': bad_config,
    }


def main(lib_dir):
    """Validate the library at ``lib_dir`` and write ``libValidation.json``.

    The report is written into ``lib_dir`` itself, replacing any previous
    report.
    """
    out_path = os.path.join(lib_dir, 'libValidation.json')
    data_dict = search_for_bad_json(lib_dir)
    data_dict['badConfig'] = validate_lib_config_files(lib_dir)
    out_data = validate_library(data_dict)
    with open(out_path, 'w') as writer:
        json.dump(out_data, writer, indent=3)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='LibraryValidator',
        description='Provides metadata about library validation.')
    # The option is required, so argparse itself prints usage and exits when
    # it is missing; no sentinel default is needed.
    parser.add_argument('-l', '--library', type=str, dest='library_dir',
                        required=True,
                        help='path of the library folder to validate')
    args = parser.parse_args()
    main(args.library_dir)