# Sleds/scorewalker-utils/RunTest/class-important-docs.py

import os
import csv
import time
import json
import subprocess
import ConsoleUtils

# Console helper; main() uses it to print separator lines between stages.
_printer = ConsoleUtils.SLPrinter('ImportantDocsTest')

# Classifier runner script; _get_commands launches it through ``python`` once
# per package file.
runner = r'C:\Users\chris\Documents\Code\Git\scorewalker-utils\RunScoreWalker\sequencelogic-run-walker-classifier.py'
# Forwarded to the runner as --index-root.
idx_root = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.11.17\14.02\index'
# Forwarded to the runner as --config.
config_file = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3config' \
              r'\SLI Standard Mortgage Library-IMPORTANT_TYPES_2017-11-17.conf.json'
# Forwarded to the runner as --min-memory / --max-memory.
# NOTE(review): units are not visible here; presumably megabytes -- confirm.
min_mem = 128
max_mem = 4096
# Forwarded to the runner as --thread-count.
thread_count = 8

# commands = []


def _get_commands(folder):
    """Build one classifier command line per ``.frt`` package in *folder*.

    Every command runs the runner script through ``python`` and is a flat
    argument list suitable for ``subprocess.Popen``. The module-level
    settings ``runner``, ``idx_root``, ``config_file``, ``min_mem``,
    ``max_mem`` and ``thread_count`` supply all arguments except the package
    path and the output path.

    Args:
        folder: Directory scanned (non-recursively) for files ending in
            ``.frt``.

    Returns:
        A list of argument lists ordered by file name; empty when the folder
        holds no ``.frt`` files. The output path of each command is the
        package path with its trailing ``.frt`` swapped for
        ``_classification-results.json``.
    """
    suffix = '.frt'
    commands = []

    # os.listdir returns entries in arbitrary order; sorting keeps runs
    # reproducible.
    for name in sorted(os.listdir(folder)):
        if not name.endswith(suffix):
            continue

        in_file = os.path.join(folder, name)
        # Swap only the trailing extension: str.replace would also rewrite a
        # '.frt' that happens to appear earlier in the path (e.g. in a
        # directory name) and point the output at a non-existent location.
        out_file = in_file[:-len(suffix)] + '_classification-results.json'

        commands.append(['python', runner,
                         '--index-root', idx_root,
                         '--pkg-path', in_file,
                         '--config', config_file,
                         '--out', out_file,
                         '--min-memory', str(min_mem),
                         '--max-memory', str(max_mem),
                         '--thread-count', str(thread_count),
                         ])

    return commands


def _run_process(proc_args, proc_name, log_out_file):
    """Run a child process to completion, appending its stdout to a log file.

    Prints a start message and, once the child has exited, the elapsed
    wall-clock time. The child's exit status is not inspected and its stderr
    is not redirected.

    Args:
        proc_args: Argument list handed to ``subprocess.Popen``.
        proc_name: Label used in the progress messages.
        log_out_file: Path of the log file. It is opened in append mode, so
            output from earlier runs is preserved.
    """
    print('Running %s' % proc_name)

    start_time = time.time()

    # The context manager guarantees the log handle is closed (and flushed)
    # as soon as the child exits, even if Popen or wait raises.
    with open(log_out_file, 'a+') as std_out:
        sub_proc = subprocess.Popen(proc_args, stdout=std_out)
        sub_proc.wait()

    run_time = time.time() - start_time
    print('%s completed in %.4f s.' % (proc_name, run_time))

def _get_doctype(file):
    """Return the top doctype recorded in a classification-results JSON file.

    Args:
        file: Path to a JSON document that has an ``associations`` entry.

    Returns:
        The ``topDoctype`` of the first association, or ``'UNKNOWN'`` when
        the ``associations`` entry is empty.
    """
    with open(file) as handle:
        payload = json.load(handle)

    matches = payload['associations']
    # Only the first association is consulted; later entries are ignored.
    return matches[0]['topDoctype'] if len(matches) > 0 else 'UNKNOWN'


def _strip_package_suffix(path, suffix='.frt'):
    """Return *path* without a trailing *suffix*; other occurrences are kept."""
    return path[:-len(suffix)] if path.endswith(suffix) else path


def main():
    """Classify every package folder and write a ``Results.csv`` into each.

    For each directory in the module-level ``folders`` list this:

    1. builds the classifier commands with ``_get_commands``;
    2. runs each command, capturing its stdout in ``<package>_std_out.log``
       next to the package file;
    3. records, per document, the top doctype read from the command's
       ``--out`` file and whether the log lacks the "did not match any
       important types" message;
    4. writes those rows to ``Results.csv`` in the folder.

    ``folders`` must exist at module scope before this is called; the
    ``__main__`` guard of this script sets it.
    """
    num_folders = len(folders)
    for cur_folder, folder in enumerate(folders, start=1):
        print('Running on package %03d/%d' % (cur_folder, num_folders))
        commands = _get_commands(folder)
        num_docs = len(commands)

        # One (package path, log path, results path) tuple per document run.
        runs = []
        class_start = time.time()

        for doc_num, command in enumerate(commands, start=1):
            # Look the paths up by flag rather than by position so this keeps
            # working if the argument order of the command ever changes.
            pkg_path = command[command.index('--pkg-path') + 1]
            out_path = command[command.index('--out') + 1]
            # Swap only the trailing extension; a '.frt' elsewhere in the
            # path must not be rewritten.
            log_file = _strip_package_suffix(pkg_path) + '_std_out.log'

            _printer.write_line_break()
            _run_process(command, 'RunScoreWalker', log_file)
            print('Classified document %d/%d (%s)' % (doc_num, num_docs, os.path.split(pkg_path)[1]))
            runs.append((pkg_path, log_file, out_path))

        class_run = time.time() - class_start
        # Average over the documents actually processed; guard the empty
        # folder so it reports 0 instead of dividing by zero.
        rate = class_run / num_docs if num_docs else 0.0
        _printer.write_line_break()
        print('Done classifying documents in %.4fs (%.2f s/document)' % (class_run, rate))
        _printer.write_line_break()

        important_data = []
        for pkg_path, log_file, out_path in runs:
            # A document counts as important unless the classifier logged
            # that it matched none of the important types.
            with open(log_file) as reader:
                important = not any(
                    'The document did not match any important types' in line
                    for line in reader
                )

            file_name = os.path.basename(_strip_package_suffix(pkg_path))
            doctype = _get_doctype(out_path)

            important_data.append({'Package': file_name, 'Doctype': doctype, 'Important': important})

        result_file = os.path.join(folder, 'Results.csv')

        print('Saving results from package to %s' % result_file)

        with open(result_file, 'w', newline='') as out_file:
            writer = csv.DictWriter(out_file, ['Package', 'Doctype', 'Important'])
            writer.writeheader()
            writer.writerows(important_data)

        print('Done running package')
        _printer.write_line_break(break_char='=')


if __name__ == '__main__':
    # Environment read by the child classifier processes.
    # NOTE(review): the meaning of isFullDoc is not visible in this file --
    # confirm against the runner.
    os.environ['SEQUENCELOGICHOME'] = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.11.17\17.47\Programs'
    os.environ['isFullDoc'] = 'TRUE'

    # Every immediate sub-directory of root_dir is processed as one package
    # folder by main(), which reads the module-level ``folders`` list.
    root_dir = r'C:\Users\chris\Documents\Code\Tests\Classification-Important-Docs\Big_Test'
    folders = list(filter(
        os.path.isdir,
        (os.path.join(root_dir, name) for name in os.listdir(root_dir)),
    ))

    main()