130 lines
4.2 KiB
Python
130 lines
4.2 KiB
Python
import os
|
|
import csv
|
|
import time
|
|
import json
|
|
import subprocess
|
|
import ConsoleUtils
|
|
|
|
# Console helper from the project-local ConsoleUtils module; main() calls its
# write_line_break() to separate progress messages.
# NOTE(review): the string argument looks like a label/tag for the printer --
# confirm against ConsoleUtils.SLPrinter.
_printer = ConsoleUtils.SLPrinter('ImportantDocsTest')
|
|
|
|
# --- Classifier invocation settings (read by _get_commands) -----------------

# Script launched once per document via ``python <runner> ...``.
runner = (
    r'C:\Users\chris\Documents\Code\Git\scorewalker-utils\RunScoreWalker'
    r'\sequencelogic-run-walker-classifier.py'
)

# Value passed as ``--index-root``.
idx_root = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.11.17\14.02\index'

# Value passed as ``--config``.
config_file = (
    r'C:\Users\chris\Documents\Code\Tests\KMeans\L3config'
    r'\SLI Standard Mortgage Library-IMPORTANT_TYPES_2017-11-17.conf.json'
)

# Values passed as ``--min-memory`` / ``--max-memory``.
# NOTE(review): units are presumably megabytes -- confirm against the runner.
min_mem, max_mem = 128, 4096

# Value passed as ``--thread-count``.
thread_count = 8
|
|
|
|
# commands = []
|
|
|
|
|
|
def _get_commands(folder):
    """Build one classifier command line per ``.frt`` file found in *folder*.

    Every command runs the module-level ``runner`` script through ``python``
    and passes the module-level ``idx_root``, ``config_file``, ``min_mem``,
    ``max_mem`` and ``thread_count`` settings. The ``--out`` path is the
    input path with its ``.frt`` extension swapped for
    ``_classification-results.json``.

    :param folder: Directory scanned (non-recursively) for ``.frt`` files.
    :return: List of argument lists, one per ``.frt`` file, in
        ``os.listdir`` order.
    """
    extension = '.frt'
    commands = []

    for name in os.listdir(folder):
        if not name.endswith(extension):
            continue

        in_file = os.path.join(folder, name)
        # Swap only the trailing extension: str.replace() would also rewrite
        # a '.frt' that happens to occur earlier in the path (for example in
        # a directory name) and produce a bogus output location.
        out_file = in_file[:-len(extension)] + '_classification-results.json'

        commands.append([
            'python', runner,
            '--index-root', idx_root,
            '--pkg-path', in_file,
            '--config', config_file,
            '--out', out_file,
            '--min-memory', str(min_mem),
            '--max-memory', str(max_mem),
            '--thread-count', str(thread_count),
        ])

    return commands
|
|
|
|
|
|
def _run_process(proc_args, proc_name, log_out_file):
|
|
print('Running %s' % proc_name)
|
|
|
|
start_time = time.time()
|
|
|
|
std_out = open(log_out_file, 'a+')
|
|
|
|
sub_proc = subprocess.Popen(proc_args, stdout=std_out)
|
|
|
|
sub_proc.wait()
|
|
|
|
run_time = time.time() - start_time
|
|
print('%s completed in %.4f s.' % (proc_name, run_time))
|
|
|
|
|
|
def _get_doctype(file):
|
|
with open(file) as reader:
|
|
class_data = json.load(reader)
|
|
|
|
if len(class_data['associations']) > 0:
|
|
doctype = class_data['associations'][0]['topDoctype']
|
|
else:
|
|
doctype = 'UNKNOWN'
|
|
|
|
return doctype
|
|
|
|
|
|
def _is_important(log_file):
    """Return ``False`` when the classifier log reports no important-type match."""
    marker = 'The document did not match any important types'
    with open(log_file) as reader:
        # Iterate the file lazily; any() stops at the first matching line.
        return not any(marker in line for line in reader)


def main():
    """Classify every package folder and write a ``Results.csv`` into each.

    Reads the module-level ``folders`` list, which must be defined before
    this function is called. For each folder:

    1. every command from ``_get_commands`` is run through ``_run_process``
       with its stdout appended to ``<package>_std_out.log``;
    2. each log is scanned for the "did not match any important types"
       message and the top doctype is read from the document's results JSON;
    3. one row per document (``Package``, ``Doctype``, ``Important``) is
       written to ``Results.csv`` inside the folder.
    """
    extension = '.frt'
    num_folders = len(folders)

    for cur_folder, folder in enumerate(folders, start=1):
        print('Running on package %03d/%d' % (cur_folder, num_folders))
        commands = _get_commands(folder)
        num_docs = len(commands)

        # One (package stem, log path, results-json path) tuple per document.
        runs = []
        class_start = time.time()

        for doc_num, command in enumerate(commands, start=1):
            # Look paths up by flag name instead of a fixed list position so
            # a reordered command line cannot silently pick the wrong value.
            pkg_path = command[command.index('--pkg-path') + 1]
            results_file = command[command.index('--out') + 1]

            # Strip only the trailing extension; replacing every '.frt' would
            # corrupt paths that contain it elsewhere, and a path without the
            # extension must never resolve to the input file itself.
            if pkg_path.endswith(extension):
                stem = pkg_path[:-len(extension)]
            else:
                stem = pkg_path
            log_file = stem + '_std_out.log'

            _printer.write_line_break()
            _run_process(command, 'RunScoreWalker', log_file)
            print('Classified document %d/%d (%s)' % (doc_num, num_docs, os.path.split(pkg_path)[1]))
            runs.append((stem, log_file, results_file))

        class_run = time.time() - class_start
        # Average over the documents actually processed (a running counter
        # that starts at 1 ends at n + 1); an empty folder reports 0.
        rate = class_run / num_docs if num_docs else 0.0
        _printer.write_line_break()
        print('Done classifying documents in %.4fs (%.2f s/document)' % (class_run, rate))
        _printer.write_line_break()

        important_data = []
        for stem, log_file, results_file in runs:
            important = _is_important(log_file)
            doctype = _get_doctype(results_file)
            important_data.append({'Package': os.path.split(stem)[1],
                                   'Doctype': doctype,
                                   'Important': important})

        result_file = os.path.join(folder, 'Results.csv')

        print('Saving results from package to %s' % result_file)

        # newline='' lets the csv module control line endings itself.
        with open(result_file, 'w', newline='') as out_file:
            writer = csv.DictWriter(out_file, ['Package', 'Doctype', 'Important'])
            writer.writeheader()
            writer.writerows(important_data)

        print('Done running package')
        _printer.write_line_break(break_char='=')
|
|
|
|
|
|
if __name__ == '__main__':
    # Environment variables set for this process; child processes started
    # without an explicit ``env`` inherit them.
    # NOTE(review): presumably read by the classifier runner -- confirm.
    os.environ.update({
        'SEQUENCELOGICHOME': r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.11.17\17.47\Programs',
        'isFullDoc': 'TRUE',
    })

    # Every immediate sub-directory of root_dir is one package folder; main()
    # reads the resulting module-level ``folders`` list.
    root_dir = r'C:\Users\chris\Documents\Code\Tests\Classification-Important-Docs\Big_Test'
    folders = list(filter(
        os.path.isdir,
        (os.path.join(root_dir, name) for name in os.listdir(root_dir)),
    ))

    main()
|