# Sleds/scorewalker-utils/RunTest/class-important-docs.py
#
# 130 lines
# 4.2 KiB
# Python

import os
import csv
import time
import json
import subprocess
import ConsoleUtils
# Console helper from the project-local ConsoleUtils module; main() uses it to
# print separator lines between processing stages.
_printer = ConsoleUtils.SLPrinter('ImportantDocsTest')
# Classifier script that _get_commands() launches through ``python``.
runner = r'C:\Users\chris\Documents\Code\Git\scorewalker-utils\RunScoreWalker\sequencelogic-run-walker-classifier.py'
# Passed to the runner as ``--index-root``.
idx_root = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.11.17\14.02\index'
# Passed to the runner as ``--config``; the path is split across two raw
# string literals that Python concatenates into one.
config_file = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3config' \
r'\SLI Standard Mortgage Library-IMPORTANT_TYPES_2017-11-17.conf.json'
# Passed to the runner as ``--min-memory`` / ``--max-memory``.
# NOTE(review): units are not visible here (presumably megabytes) - confirm
# against the runner script.
min_mem = 128
max_mem = 4096
# Passed to the runner as ``--thread-count``.
thread_count = 8
# commands = []
def _get_commands(folder):
    """Build one classifier command line for each ``.frt`` file in *folder*.

    Every command is an argument list suitable for ``subprocess``: it starts
    with ``python`` and the module-level ``runner`` script, followed by the
    index root, the package path, the config file, the output path and the
    memory / thread settings taken from the module-level constants.

    The package path always sits at index 5 of the returned lists; ``main``
    relies on that position.

    :param folder: Directory that is scanned (non-recursively) for ``.frt`` files.
    :return: List of argument lists, one per ``.frt`` file found.
    """
    leading_args = ['python', runner, '--index-root', idx_root]
    tuning_args = [
        '--min-memory', str(min_mem),
        '--max-memory', str(max_mem),
        '--thread-count', str(thread_count),
    ]

    built = []
    for entry in os.listdir(folder):
        if not entry.endswith('.frt'):
            continue
        pkg_path = os.path.join(folder, entry)
        # The results file lives next to the package and shares its base name.
        results_path = pkg_path.replace('.frt', '_classification-results.json')
        per_file_args = [
            '--pkg-path', pkg_path,
            '--config', config_file,
            '--out', results_path,
        ]
        built.append(leading_args + per_file_args + tuning_args)
    return built
def _run_process(proc_args, proc_name, log_out_file):
    """Run a child process to completion, appending its stdout to a log file.

    Progress messages (start and elapsed wall-clock time) are printed to this
    process's own stdout. Only the child's stdout is redirected; its stderr
    is left untouched.

    :param proc_args: Argument list handed to ``subprocess.Popen``.
    :param proc_name: Label used in the printed progress messages.
    :param log_out_file: Path of the log file. It is opened in ``'a+'`` mode,
        so output from earlier runs against the same path is preserved.
    """
    print('Running %s' % proc_name)
    start_time = time.time()
    # The context manager guarantees the log handle is closed as soon as the
    # child has finished (or if launching it fails) instead of relying on
    # garbage collection to release it.
    with open(log_out_file, 'a+') as std_out:
        sub_proc = subprocess.Popen(proc_args, stdout=std_out)
        sub_proc.wait()
    run_time = time.time() - start_time
    print('%s completed in %.4f s.' % (proc_name, run_time))
def _get_doctype(file):
    """Return the top doctype stored in a classification-results JSON file.

    The file must contain a JSON object with an ``associations`` sequence.
    The ``topDoctype`` of the first association is returned; when the
    sequence is empty the string ``'UNKNOWN'`` is returned instead.

    :param file: Path of the JSON results file to read.
    :return: The doctype name, or ``'UNKNOWN'`` if there are no associations.
    """
    with open(file) as handle:
        associations = json.load(handle)['associations']

    if len(associations) == 0:
        return 'UNKNOWN'
    return associations[0]['topDoctype']
def main():
    """Classify every package folder and write a ``Results.csv`` into each.

    Reads the module-level ``folders`` list. For each folder it:

    1. builds one classifier command per ``.frt`` file via ``_get_commands``;
    2. runs each command with ``_run_process``, logging the child's stdout to
       ``<package>_std_out.log`` next to the package;
    3. scans each log for the "did not match any important types" message to
       decide whether the document counts as important;
    4. reads the top doctype from the matching
       ``<package>_classification-results.json`` via ``_get_doctype``;
    5. writes the ``Package`` / ``Doctype`` / ``Important`` rows to
       ``Results.csv`` inside the folder.
    """
    num_folders = len(folders)
    for cur_folder, folder in enumerate(folders, start=1):
        print('Running on package %03d/%d' % (cur_folder, num_folders))
        commands = _get_commands(folder)
        num_docs = len(commands)
        log_files = []
        class_start = time.time()
        for doc_num, command in enumerate(commands, start=1):
            # Index 5 is the value of ``--pkg-path`` in the argument list
            # built by _get_commands.
            pkg_path = command[5]
            log_file = pkg_path.replace('.frt', '_std_out.log')
            _printer.write_line_break()
            _run_process(command, 'RunScoreWalker', log_file)
            print('Classified document %d/%d (%s)' % (doc_num, num_docs, os.path.split(pkg_path)[1]))
            log_files.append(log_file)
        class_run = time.time() - class_start
        # Divide by the number of documents actually processed. The previous
        # code divided by a 1-based counter that had already been advanced
        # past the last document, under-reporting the per-document time.
        rate = class_run / num_docs if num_docs else 0.0
        _printer.write_line_break()
        print('Done classifying documents in %.4fs (%.2f s/document)' % (class_run, rate))
        _printer.write_line_break()

        important_data = []
        for log_file in log_files:
            with open(log_file) as reader:
                # A document is important unless the classifier logged that
                # it matched none of the important types.
                important = not any(
                    'The document did not match any important types' in line
                    for line in reader
                )
            file_name = os.path.split(log_file)[1].replace('_std_out.log', '')
            doctype = _get_doctype(log_file.replace('_std_out.log', '_classification-results.json'))
            important_data.append({'Package': file_name, 'Doctype': doctype, 'Important': important})

        result_file = os.path.join(folder, 'Results.csv')
        print('Saving results from package to %s' % result_file)
        # newline='' lets the csv module control line endings itself.
        with open(result_file, 'w+', newline='') as out_file:
            writer = csv.DictWriter(out_file, ['Package', 'Doctype', 'Important'])
            writer.writeheader()
            writer.writerows(important_data)
        print('Done running package')
        _printer.write_line_break(break_char='=')
if __name__ == '__main__':
    # Script entry point: all locations below are hard-coded for one machine.
    root_dir = r'C:\Users\chris\Documents\Code\Tests\Classification-Important-Docs\Big_Test'

    # Each immediate sub-directory of root_dir is one package folder that
    # main() will process; plain files in root_dir are ignored.
    child_paths = (os.path.join(root_dir, name) for name in os.listdir(root_dir))
    folders = [path for path in child_paths if os.path.isdir(path)]

    # Environment variables set before main() launches the classifier runs.
    os.environ.update({
        'SEQUENCELOGICHOME': r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.11.17\17.47\Programs',
        'isFullDoc': 'TRUE',
    })

    main()