"""Run the ScoreWalker classifier over every package folder and summarise results.

For each package folder, every ``.frt`` file is classified by invoking the
runner script as a subprocess.  The subprocess output is appended to a
per-document log, and a ``Results.csv`` is written into the folder listing,
per document, the top doctype and whether it was considered important.
"""
import os
import csv
import time
import json
import subprocess

import ConsoleUtils

_printer = ConsoleUtils.SLPrinter('ImportantDocsTest')

runner = r'C:\Users\chris\Documents\Code\Git\scorewalker-utils\RunScoreWalker\sequencelogic-run-walker-classifier.py'
idx_root = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.11.17\14.02\index'
config_file = (r'C:\Users\chris\Documents\Code\Tests\KMeans\L3config'
               r'\SLI Standard Mortgage Library-IMPORTANT_TYPES_2017-11-17.conf.json')

# Passed straight through to the runner as --min-memory / --max-memory /
# --thread-count (memory units are whatever the runner expects).
min_mem = 128
max_mem = 4096
thread_count = 8

_PKG_EXT = '.frt'
_PKG_PATH_FLAG = '--pkg-path'
_LOG_SUFFIX = '_std_out.log'
_RESULT_SUFFIX = '_classification-results.json'
_NOT_IMPORTANT_MARKER = 'The document did not match any important types'


def _package_base(pkg_path):
    """Return ``pkg_path`` without its trailing ``.frt`` extension.

    Only the suffix is removed, so a ``.frt`` occurring elsewhere in the path
    (for example in a directory name) is left untouched.
    """
    if pkg_path.endswith(_PKG_EXT):
        return pkg_path[:-len(_PKG_EXT)]
    return pkg_path


def _package_path(command):
    """Return the package path stored after ``--pkg-path`` in ``command``."""
    return command[command.index(_PKG_PATH_FLAG) + 1]


def _get_commands(folder):
    """Build one runner command line per ``.frt`` file found in ``folder``.

    :param folder: Directory to scan (non-recursively) for ``.frt`` files.
    :return: A list of argument lists suitable for ``subprocess.Popen``.
    """
    commands = []
    for name in os.listdir(folder):
        if not name.endswith(_PKG_EXT):
            continue
        in_file = os.path.join(folder, name)
        out_file = _package_base(in_file) + _RESULT_SUFFIX
        commands.append([
            'python', runner,
            '--index-root', idx_root,
            _PKG_PATH_FLAG, in_file,
            '--config', config_file,
            '--out', out_file,
            '--min-memory', str(min_mem),
            '--max-memory', str(max_mem),
            '--thread-count', str(thread_count),
        ])
    return commands


def _run_process(proc_args, proc_name, log_out_file):
    """Run ``proc_args`` to completion, appending its stdout to ``log_out_file``.

    :param proc_args: Argument list handed to ``subprocess.Popen``.
    :param proc_name: Label used in the progress messages.
    :param log_out_file: Path of the log file that receives the child's stdout.
    """
    print('Running %s' % proc_name)
    start_time = time.time()
    # The context manager closes the log handle even if the child fails to
    # start; otherwise one handle would leak per classified document.
    with open(log_out_file, 'a+') as std_out:
        sub_proc = subprocess.Popen(proc_args, stdout=std_out)
        sub_proc.wait()
    run_time = time.time() - start_time
    print('%s completed in %.4f s.' % (proc_name, run_time))


def _get_doctype(file):
    """Return the top doctype recorded in a classification-results JSON file.

    :param file: Path to the JSON file written by the runner.
    :return: ``topDoctype`` of the first association, or ``'UNKNOWN'`` when
        the ``associations`` list is empty.
    """
    with open(file) as reader:
        class_data = json.load(reader)
    associations = class_data['associations']
    if associations:
        return associations[0]['topDoctype']
    return 'UNKNOWN'


def _is_important(log_file):
    """Return False if the log contains the 'not important' marker line."""
    with open(log_file) as reader:
        return not any(_NOT_IMPORTANT_MARKER in line for line in reader)


def main():
    """Classify every document in every package folder and write Results.csv.

    Reads the module-level ``folders`` list, which is populated by the
    ``__main__`` guard below before this function is called.
    """
    num_folders = len(folders)
    for cur_folder, folder in enumerate(folders, start=1):
        print('Running on package %03d/%d' % (cur_folder, num_folders))
        commands = _get_commands(folder)
        num_docs = len(commands)

        package_bases = []
        class_start = time.time()
        for doc_num, command in enumerate(commands, start=1):
            pkg_path = _package_path(command)
            base = _package_base(pkg_path)
            _printer.write_line_break()
            _run_process(command, 'RunScoreWalker', base + _LOG_SUFFIX)
            print('Classified document %d/%d (%s)'
                  % (doc_num, num_docs, os.path.split(pkg_path)[1]))
            package_bases.append(base)
        class_run = time.time() - class_start

        # Average over the documents actually processed; guard the empty
        # package so an empty folder does not raise ZeroDivisionError.
        rate = class_run / num_docs if num_docs else 0.0
        _printer.write_line_break()
        print('Done classifying documents in %.4fs (%.2f s/document)' % (class_run, rate))
        _printer.write_line_break()

        important_data = []
        for base in package_bases:
            important = _is_important(base + _LOG_SUFFIX)
            doctype = _get_doctype(base + _RESULT_SUFFIX)
            important_data.append({'Package': os.path.split(base)[1],
                                   'Doctype': doctype,
                                   'Important': important})

        result_file = os.path.join(folder, 'Results.csv')
        print('Saving results from package to %s' % result_file)
        with open(result_file, 'w', newline='') as out_file:
            writer = csv.DictWriter(out_file, ['Package', 'Doctype', 'Important'])
            writer.writeheader()
            writer.writerows(important_data)
        print('Done running package')
        _printer.write_line_break(break_char='=')


if __name__ == '__main__':
    root_dir = r'C:\Users\chris\Documents\Code\Tests\Classification-Important-Docs\Big_Test'
    # Every immediate sub-directory of root_dir is treated as one package.
    folders = [os.path.join(root_dir, f) for f in os.listdir(root_dir)
               if os.path.isdir(os.path.join(root_dir, f))]
    # Set in this process's environment before main() starts any runner subprocess.
    os.environ['SEQUENCELOGICHOME'] = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.11.17\17.47\Programs'
    os.environ['isFullDoc'] = 'TRUE'
    main()