"""
|
|
Information
|
|
-----------
|
|
|
|
This script is a tool for running a full test over the ScoreWalker Classification engine. The following tools are called
|
|
in order to achieve this goal:
|
|
|
|
#) Builds WalkerIndexer and WalkerClassifier.
|
|
#) The necessary steps for indexing.
|
|
#) WalkerClassifier to preform classification on the test document.
|
|
#) Several other Python tools to perform analysis on the results.
|
|
#) ValidationWalker to validate the output from the classification engine against the JSON schemas used in
|
|
production.
|
|
|
|
.. moduleauthor:: Chris Diesch <cdiesch@sequencelogic.net>
|
|
|
|
|
|
Commandline Usage
|
|
------------------
|
|
Usage: ``Tester.py [-h, --help] [-b, --build] [-t, --test_dir] {TEST_DIR} [-l, --library_dir] {LIB_DIR}
|
|
[-c, --config] {CFG_FILE} [-o, --out_dir] {OUT_DIR}
|
|
|
|
Required Arguments:
|
|
|
|
``-t TEST_DIR, --test_dir TEST_DIR``
|
|
Where ``TEST_DIR`` is the path to the root folder to load the test data from.
|
|
|
|
``-l LIB_DIR, --library_dir LIB_DIR``
|
|
Where ``LIB_DIR`` is the path to the library to run the test with.
|
|
|
|
``-c CFG_FILE, --config CFG_FILE``
|
|
Where ``CFG_FILE`` is the path to the config file to use for testing.
|
|
|
|
``-o OUT_DIR, --out_dir OUT_DIR``
|
|
Where ``OUT_DIR`` is the path to save the output to.
|
|
|
|
Optional Arguments:
|
|
|
|
``-h, --help``
|
|
Prints the help message.
|
|
|
|
``-b, --build``
|
|
Runs the maven build for the classification tools.
|
|
|
|
|
|
Python Module Usage
|
|
--------------------
|
|
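
The test can also be driven from another script by calling :meth:`main` directly. A minimal
sketch (all paths are illustrative)::

    import Tester

    Tester.main(lib_dir='C:/Libraries/MyLibrary',
                output_directory='C:/Results',
                test_data_root='C:/TestData',
                config_file_loc='C:/TestData/config.json',
                do_build=False)
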
"""

import argparse
import os
import subprocess
import datetime
import time
import shutil
import sys
import csv
import json
import operator

import ConsoleUtils

import AccuracyGraphSetup

program_name = 'TestWalker'
program_description = 'This tool handles running the classification engine and several tools to perform analysis on ' \
                      'the results.'
author = 'Chris Diesch'

# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)

# Error and Warning console values (ANSI colour escapes):
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'

build_date = '2017.07.24'  # datetime.datetime.now().strftime('%Y.%m.%d-%H.%M.%S')
program_version = '1.7.2'
side_bound_char = '|'
line_break_char = '-'
corner_char = '+'
line_break_size = 150
line_break = line_break_char * line_break_size

console_line_break = '-' * line_break_size

DEF_NUM_TOP_DOCS = 20
DEF_TERM_LENGTH = 1
DEF_CONF_THRESHOLD = 60
DEF_MIN_PHRASE_SIZE = 5
DEF_MAX_PHRASE_SIZE = 30

paginate = True

DEF_GRAPH_TITLE = 'Classification Accuracy'

graph_config_file_name = 'graph-config.json'
test_config_file_name = 'test-config.txt'
graph_name = 'Classification Accuracy.pdf'

classification_time_tag = 'Classification Run Time'

clux_file_tag = 'CLUX File'
package_file_tag = 'Package File'

package_name_tag = 'Package Name'
classification_file_tag = 'Classification'
tree_walker_file_tag = 'Tree Walker'
term_walker_file_tag = 'Term Walker'
fp_counter_file_tag = 'False Positive Counts'
accuracy_file_tag = 'Graph Data'
graph_config_file_tag = 'Graph Config'
graph_file_tag = 'Graph'
doctype_graph_tag = 'Doctype Graph'

idx_root_tag = 'Index Root'
test_root_tag = 'Test Root'
logs_root_tag = 'Logs Root'

# Keys into the executables/build_roots dicts populated by load_executables().
tokenizer = 'tokenizer'
phrase_maker = 'phrase maker'
phrase_loader = 'phrase loader'
indexer = 'indexer'
classifier = 'classifier'
diff_utils = 'diff-utils'
walker_validator = 'walker validator'
tree_walker = 'tree walker'
status_counter = 'false positive counter'
doctype_graph = 'doctype graph'
term_walker = 'term walker'
accuracy = 'accuracy'
grapher = 'package graph maker'
term_diff = 'walker term diff'

# Populated by load_executables().
current_dir = ''
phrase_exe_name = ''
build_roots = {}
executables = {}

printer = None


def write_line_break():
    printer.write_no_prefix(console_line_break)


def load_executables():
    """
    Loads the executables needed to perform a test run and analysis.

    Returns:
        ``None``

    """
    global executables, phrase_exe_name, current_dir, build_roots

    current_dir = os.getcwd()

    # NOTE: hard-coded developer path; the phrase maker binaries are expected to live here.
    phrase_exe_root = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Input\Programs'

    walker_validate_root = os.path.abspath(os.path.join(current_dir, os.pardir))
    walker_validate_root = os.path.join(walker_validate_root, 'ValidationWalker')

    tree_walker_root = os.path.abspath(os.path.join(current_dir, os.pardir))
    tree_walker_root = os.path.join(tree_walker_root, 'TreeWalker')

    accuracy_root = os.path.abspath(os.path.join(current_dir, os.pardir))
    accuracy_root = os.path.join(accuracy_root, 'AccuracyCounter')

    diff_utils_root = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir, 'diff-utils'))

    classify_root = os.path.abspath(os.path.join(current_dir, os.pardir))
    classify_root = os.path.abspath(os.path.join(classify_root, os.pardir))
    classify_root = os.path.abspath(os.path.join(classify_root, 'scorewalker'))

    tokenizer_jar_root = os.path.join(classify_root, 'walker-analysis', 'target')
    tokenizer_jar_name = os.path.join(tokenizer_jar_root, 'tokenizer-one-jar.jar')

    phrase_exe_name = os.path.join(phrase_exe_root, 'phrases.exe')

    phrase_loader_name = os.path.abspath(os.path.join(current_dir, os.pardir))
    phrase_loader_name = os.path.join(phrase_loader_name, 'PhraseCountLoader')
    phrase_loader_name = os.path.join(phrase_loader_name, 'PhraseCountLoader.py')

    classify_jar_root = os.path.abspath(os.path.join(classify_root, 'walker-classifier', 'target'))
    classify_jar_name = os.path.join(classify_jar_root, 'walker-classifier-one-jar.jar')

    walker_validator_name = os.path.join(walker_validate_root, 'ValidationWalker.py')

    tree_walker_name = os.path.join(tree_walker_root, 'TreeWalker.py')
    doctype_graph_name = os.path.join(tree_walker_root, 'DoctypeGraph.py')
    fp_count_name = os.path.join(tree_walker_root, 'StatusCounter.py')
    term_walker_name = os.path.join(tree_walker_root, 'TermWalker.py')

    accuracy_name = os.path.join(accuracy_root, 'Accuracy.py')
    graph_maker_name = os.path.join(accuracy_root, 'AccuracyGraph.py')

    index_jar_root = os.path.abspath(os.path.join(classify_root, 'walker-indexer', 'target'))
    index_jar_name = os.path.join(index_jar_root, 'walker-indexer-one-jar.jar')

    walker_term_diff_root = os.path.abspath(os.path.join(classify_root, 'walker-term-diff', 'target'))
    walker_term_diff_name = os.path.join(walker_term_diff_root, 'walker-term-diff-one-jar.jar')

    build_roots = {classifier: classify_root,
                   diff_utils: diff_utils_root}

    executables = {tokenizer: tokenizer_jar_name,
                   term_diff: walker_term_diff_name,
                   phrase_loader: phrase_loader_name,
                   indexer: index_jar_name,
                   classifier: classify_jar_name,
                   walker_validator: walker_validator_name,
                   tree_walker: tree_walker_name,
                   status_counter: fp_count_name,
                   doctype_graph: doctype_graph_name,
                   term_walker: term_walker_name,
                   accuracy: accuracy_name,
                   grapher: graph_maker_name}

    for file in os.listdir(phrase_exe_root):
        executables[file] = os.path.join(phrase_exe_root, file)


def open_sublime(file_path):
    """
    Opens SublimeText on the given file.

    Args:
        ``file_path`` The file to open with SublimeText.

    Returns:
        ``None``

    """
    sublime_cmd = [r'C:\Program Files\Sublime Text 3\sublime_text.exe', file_path]
    subprocess.Popen(sublime_cmd)


def open_log_files(std_out_path, std_err_path):
    """
    Opens the log files and adds a header to them.

    Args:
        ``std_out_path`` -- ``str`` The path to save the standard output to.

        ``std_err_path`` -- ``str`` The path to save the standard error to.

    Returns:
        ``file, file`` -- The writer for the standard out file, the writer for the standard error file.
    """
    log_line_break = '=' * 100
    log_std_err = '%s\n%s\n%s\n' % (log_line_break, '||{:^96}||'.format('STANDARD ERROR'), log_line_break)
    log_std_out = '%s\n%s\n%s\n' % (log_line_break, '||{:^96}||'.format('STANDARD OUT'), log_line_break)

    # Write the headers first (and close), so they are flushed before any process output arrives.
    with open(std_out_path, 'a+') as std_out:
        std_out.write(log_std_out)

    with open(std_err_path, 'a+') as std_err:
        std_err.write(log_std_err)

    std_out = open(std_out_path, 'a+')
    std_err = open(std_err_path, 'a+')

    return std_out, std_err


def run_process(proc_name, proc_cmd, log_dir):
    """
    Runs a process, logs the console output, and gets the time to execute.

    Args:
        ``proc_name`` -- ``str`` The name of the process to run.

        ``proc_cmd`` -- ``list(str)`` The command string for the process.

        ``log_dir`` -- ``str`` The path to save the log files to.

    Returns:
        ``float`` The time (in seconds) to execute the command.

    """
    proc_err_log = os.path.join(log_dir, '%s-std-err.log' % proc_name)
    proc_std_log = os.path.join(log_dir, '%s-std-out.log' % proc_name)

    std_out_writer, std_err_writer = open_log_files(proc_std_log, proc_err_log)

    print('Running %s' % proc_name)

    start_time = time.time()

    process = subprocess.Popen(proc_cmd, shell=True, stdout=std_out_writer, stderr=std_err_writer)
    process.wait()

    run_time = time.time() - start_time

    time.sleep(0.05)
    process.poll()

    return_code = int(process.returncode)
    if return_code != 0:
        print('%s Process completed with return code %d.' % (proc_name, return_code))
        print(' Opening log files...')
        std_out_writer.close()
        std_err_writer.close()

        open_sublime(std_out_writer.name)
        open_sublime(std_err_writer.name)

        sys.exit(return_code)

    std_out_writer.close()
    std_err_writer.close()

    print('%s completed (%.4f s)' % (proc_name, run_time))
    return run_time


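# A typical call looks like the sketch below (the command is illustrative; the run_* wrappers
# that follow build the real ones). A non-zero exit code aborts the whole test run and opens
# both log files in SublimeText.
#
#     run_time = run_process('Walker Indexer', ['java', '-jar', executables[indexer]], log_dir)

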
def run_build(log_dir):
    """
    Runs the maven build for the classification and indexing engines.

    Args:
        ``log_dir`` -- ``str`` The path to save log files.

    Returns:
        ``None``

    """
    mvn_home = os.environ.get('MAVEN_HOME')
    mvn_path = os.path.join(mvn_home, os.path.join('bin', 'mvn.cmd'))
    # Change dir and run maven
    # os.chdir(build_roots[diff_utils])
    # mvn_cmd = [mvn_path, 'clean', 'install']
    # run_process('DiffUtils Maven Build', mvn_cmd, log_dir)

    # Change dir and run maven
    os.chdir(build_roots[classifier])
    mvn_cmd = [mvn_path, '-DskipTests', 'install']

    run_process('ScoreWalker Maven Build', mvn_cmd, log_dir)

    os.chdir(current_dir)


def run_tokenizer(lib_dir, index_dir, log_dir):
    """
    Runs the tokenizer using :meth:`run_process`.

    Args:
        ``lib_dir`` -- ``str`` The path to the library to tokenize.

        ``index_dir`` -- ``str`` The path to the index being used to classify.

        ``log_dir`` -- ``str`` The path to the log directory.

    Returns:
        ``None``

    """
    tokenizer_cmd = ['java', '-jar', executables[tokenizer],
                     '-I', '"'+index_dir+'"',
                     '-L', '"'+lib_dir+'"',
                     '-O']

    run_process('Tokenizer', tokenizer_cmd, log_dir)


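# Schematically, the command above runs as:
#
#     java -jar tokenizer-one-jar.jar -I "<index_dir>" -L "<lib_dir>" -O
#
# The flag meanings are inferred from the call site here; the jar defines the actual contract.

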
def run_walker_term_diff(lib_dir, index_dir, log_dir):
    """Runs the walker term diff over each group of similar doctypes listed in the config."""
    # NOTE: relies on the module-level ``config_file`` set in the __main__ block.
    with open(config_file) as cfg_file:
        too_similar = json.load(cfg_file)

    too_similar = too_similar['CLASSIFYWALKER']
    too_similar = too_similar['similarDoctypes']

    for similar_list in too_similar:
        term_diff_cmd = ['java', '-jar', executables[term_diff],
                         '--libRoot', lib_dir,
                         '--outRoot', index_dir,
                         '--doctypes'] + similar_list

        run_process('Walker Term Diff over doctypes %s' % similar_list, term_diff_cmd, log_dir)


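# The config file is assumed to contain a section shaped like the sketch below (the doctype
# names are illustrative); each inner list is one group of doctypes considered too similar:
#
#     {
#         "CLASSIFYWALKER": {
#             "similarDoctypes": [["DoctypeA", "DoctypeB"], ["DoctypeC", "DoctypeD"]]
#         }
#     }

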
def run_phrase_maker(files, min_phrase_len, max_phrase_len, phrase_file, log_file):
    """Runs the phrase maker over ``files``, appending the phrase counts to ``phrase_file``."""
    folder, doctype = os.path.split(phrase_file)
    split_idx = doctype.rfind('.')
    doctype = doctype[:split_idx]

    phrases_cmd = [phrase_exe_name,
                   '-c', '80',
                   '-p', str(min_phrase_len),
                   '-P', str(max_phrase_len),
                   '-ol'] + files

    with open(log_file, 'a+') as tmp:
        tmp.write('Running Phrase Maker on %s\n%s\n' % (doctype, console_line_break))

    std_out = open(phrase_file, 'a+')
    std_err = open(log_file, 'a+')
    process = subprocess.Popen(phrases_cmd, stdout=std_out, stderr=std_err)

    with open(log_file, 'a+') as tmp:
        tmp.write('%s\n' % console_line_break)

    process.wait()
    std_out.close()
    std_err.close()


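# For a doctype with tokenized pages a.tkn and b.tkn, the command built above is roughly
# (values shown for the default phrase sizes):
#
#     phrases.exe -c 80 -p 5 -P 30 -ol a.tkn b.tkn
#
# stdout (the phrase counts) is appended to the .phrasecount file, stderr to the log file.

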
def run_phrase_loader(phrases_dir, out_file_name, min_phrase_len, log_dir):
    phrase_loader_cmd = ['python', executables[phrase_loader],
                         '-i', phrases_dir,
                         '-o', out_file_name,
                         '-m', str(min_phrase_len)]

    run_time = run_process('Phrase Loader', phrase_loader_cmd, log_dir)
    return run_time


def run_indexer(idx_dir, lib_dir, log_dir):
    index_cmd = ['java', '-jar', executables[indexer],
                 '-I', '"'+idx_dir+'"',
                 '-D', '"'+lib_dir+'"']

    run_time = run_process('Walker Indexer', index_cmd, log_dir)

    write_line_break()

    return run_time


def new_run_classifier(index_dir, config_file_path, dest_file, package, log_dir,
                       min_memory=128, max_memory=4096, thread_count=8):
    # NOTE: hard-coded developer path to the run-walker-classifier wrapper script.
    walker_loc = r'C:\Users\chris\Documents\Code\Git\scorewalker-utils\RunScoreWalker' \
                 r'\sequencelogic-run-walker-classifier.py'

    classify_cmd = ['python', walker_loc,
                    '--index-root', index_dir,
                    '--pkg-path', package,
                    '--config', config_file_path,
                    '--out', dest_file,
                    '--min-memory', str(min_memory),
                    '--max-memory', str(max_memory),
                    '--thread-count', str(thread_count)]

    run_time = run_process('sequencelogic-run-walker-classifier', classify_cmd, log_dir)
    return run_time


def run_classifier(index_dir, config_file_path, dest_file, package, n_lucene, conf, word_len, log_dir):
    # n_lucene, conf, and word_len are currently unused; they are kept for call-site compatibility.
    classify_cmd = ['java', '-jar', executables[classifier],
                    '-D', '"'+dest_file+'"',
                    '-C', '"'+config_file_path+'"',
                    '-I', '"'+index_dir+'"',
                    '-P', '"'+package+'"']

    run_time = run_process('Walker Classifier', classify_cmd, log_dir)

    return run_time


def run_walker_validator(walker_file, log_dir):
    validate_cmd = ['python', executables[walker_validator],
                    '-i', walker_file]

    run_time = run_process('ValidationWalker', validate_cmd, log_dir)

    return run_time


def run_tree_walker(clux_output, engine_output, result_file, log_dir):
    tree_walker_cmd = ['python', executables[tree_walker],
                       '-c', clux_output,
                       '-w', engine_output,
                       '-o', result_file]

    run_time = run_process('Tree Walker', tree_walker_cmd, log_dir)

    return run_time


def run_fp_counter(twk_files, result_file, log_dir):
    fp_count_cmd = ['python', executables[status_counter],
                    '-o', result_file,
                    '--in_files'] + twk_files

    run_time = run_process('Status Counter', fp_count_cmd, log_dir)

    return run_time


def run_doctype_graph(test_dir, data_dir, log_dir):
    doctype_graph_cmd = ['python', executables[doctype_graph],
                         '-i', test_dir,
                         '-o', data_dir]

    run_time = run_process('DoctypeGraph', doctype_graph_cmd, log_dir)
    return run_time


def run_term_walker(tree_walker_output, engine_output, result_file, log_dir):
    term_walker_cmd = ['python', executables[term_walker],
                       '-w', tree_walker_output,
                       '-c', engine_output,
                       '-o', result_file]

    run_time = run_process('Term Walker', term_walker_cmd, log_dir)

    return run_time


def run_accuracy(tree_walker_out, out_file, log_dir):
    accuracy_cmd = ['python', executables[accuracy],
                    '-i', tree_walker_out,
                    '-o', out_file]

    run_time = run_process('Graph Metadata', accuracy_cmd, log_dir)

    return run_time


def run_graph(configuration_file, out_file, log_dir):
    graph_cmd = ['python', executables[grapher],
                 '-o', out_file,
                 '-i', configuration_file]

    run_time = run_process('Graph Maker', graph_cmd, log_dir)

    return run_time


def run_clean_up():
    print('Cleaning up...')

    # for name in executables:
    #     file = executables[name]
    #     if os.path.exists(file):
    #         os.remove(file)

    print('Done cleaning up.')


def make_graph_config(files, tags, dest_file, graph_title=DEF_GRAPH_TITLE):
    # folder, file = os.path.split(dest_file)
    # avg_file = os.path.join(folder, 'global-%s' % file)
    # AccuracyGraphSetup.make_package_graph_config(files, tags, graph_title, dest_file)
    AccuracyGraphSetup.make_avg_cfg(files, graph_title, dest_file)
    # return avg_file


def get_pkg_file_names(parent_dir):
    # Get the appropriate sub-folders.
    if not os.path.exists(parent_dir):
        os.mkdir(parent_dir)

    run_dir = os.path.abspath(parent_dir)
    log_dir = os.path.join(run_dir, 'logs')
    # Make the directories if they don't exist.
    if not os.path.exists(run_dir):
        os.mkdir(run_dir)
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # Get the new full file names.
    classifier_out = os.path.join(run_dir, 'classification-results.json')
    tree_walker_out = os.path.join(run_dir, 'classification-analysis.csv')
    fp_counter_out = os.path.join(run_dir, 'false-positive-counts.csv')
    term_walker_out = os.path.join(run_dir, 'term-analysis.csv')
    accuracy_out = os.path.join(run_dir, 'graph-data.txt')
    graph_out = os.path.join(run_dir, 'accuracy-graph.pdf')
    doctype_graph = os.path.join(run_dir, 'doctype-counts')

    return {classification_file_tag: classifier_out,
            tree_walker_file_tag: tree_walker_out,
            fp_counter_file_tag: fp_counter_out,
            term_walker_file_tag: term_walker_out,
            accuracy_file_tag: accuracy_out,
            graph_file_tag: graph_out,
            logs_root_tag: log_dir,
            doctype_graph_tag: doctype_graph}


def get_root_folder(test_root_path):
    # Runs are grouped into a YYYY.MM.DD folder, with each run in its own HH.MM sub-folder.
    test_root = os.path.join(test_root_path, datetime.datetime.now().strftime('%Y.%m.%d'))
    run_dir = os.path.join(test_root, datetime.datetime.now().strftime('%H.%M'))

    if not os.path.exists(test_root):
        os.mkdir(test_root)

    if not os.path.exists(run_dir):
        os.mkdir(run_dir)

    return run_dir


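# A run started at 17:26 on 2017-10-12 with OUT_DIR C:\Results would therefore write to
# (illustrative):
#
#     C:\Results\2017.10.12\17.26\

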
def get_idx_names(run_dir):
    idx_dir = os.path.join(run_dir, 'index')
    log_dir = os.path.join(run_dir, 'logs')

    # Make the directories if they don't exist.
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    if not os.path.exists(idx_dir):
        os.mkdir(idx_dir)

    # Return the folders.
    return {idx_root_tag: idx_dir,
            test_root_tag: run_dir,
            logs_root_tag: log_dir}


def get_tree_walker_files(root_folder):
    files = []
    for folder in os.listdir(root_folder):
        folder = os.path.join(root_folder, folder)
        if os.path.isdir(folder):
            for file in os.listdir(folder):
                file = os.path.join(folder, file)
                if os.path.isfile(file):
                    if file.endswith('analysis-no-centers.csv') and 'term' not in file:
                        files.append(file)

    return files


def load_packages(test_data_path):
    test_packages_root = os.path.join(test_data_path, 'Test-Files')

    result = []
    for file in os.listdir(test_packages_root):
        file = os.path.join(test_packages_root, file)
        if os.path.isfile(file) and file.endswith('.frt'):
            clux_file = file.replace('.frt', '_true.json')
            if os.path.exists(clux_file):
                result.append({package_file_tag: file, clux_file_tag: clux_file})

    return result


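# load_packages() expects each package in Test-Files to sit next to its ground truth, e.g.
# (the numeric name is illustrative):
#
#     Test-Files/
#         597000945.frt         <- package to classify
#         597000945_true.json   <- expected (CLUX) classification for that package

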
def write_cfg_file(settings_dict, test_settings_file, start_time, files):
    with open(test_settings_file, 'w+') as writer:
        writer.write('Tester.py Settings/Results\n')
        writer.write('Start: %s\n' % start_time)
        writer.write('Ended: %s\n' % datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'))
        writer.write('%s\n' % console_line_break)
        for key, value in settings_dict.items():
            writer.write('%s = %s\n' % (key, value))
        writer.write('%s\nFiles:\n' % console_line_break)
        for key, value in files.items():
            writer.write(' %s: %s\n' % (key, value))


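# write_cfg_file() produces a plain-text summary: a header with the start and end times, one
# 'key = value' line per settings entry, then a 'Files:' section listing each output tag and path.

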
def make_empty_index_files(new_index_root):
    out_files = ['MaleNames.txt', 'FemaleNames.txt', 'Surnames.txt', 'Phrases.txt']
    for file in out_files:
        file = os.path.join(new_index_root, file)
        with open(file, 'w+') as writer:
            writer.write('')


def copy_index_files(files_to_copy, new_index_root):
    print('Copying index files...')
    for item in files_to_copy:
        folder, name = os.path.split(item)
        new_path = os.path.join(new_index_root, name)
        shutil.copy(item, new_path)
    print('Done copying index files.')


def copy_executables(test_dir):
    global executables

    program_dir = os.environ['SEQUENCELOGICHOME']
    # program_dir = os.path.join(test_dir, 'Programs')
    # os.environ['SEQUENCELOGICHOME'] = program_dir
    program_dir = os.path.join(program_dir, 'bin')
    if not os.path.exists(program_dir):
        os.makedirs(program_dir)

    for name in executables:
        if name != walker_validator:
            original_path = executables[name]
            original_folder, original_name = os.path.split(original_path)
            new_path = os.path.join(program_dir, original_name)
            shutil.copy(original_path, new_path)
            executables[name] = new_path


def new_run_indexer(config_file, lib_path, idx_path, log_dir):
    # NOTE: hard-coded developer path to the run-walker-indexer wrapper script.
    index_wrapper_path = \
        r'C:\Users\chris\Documents\Code\Git\scorewalker-utils\RunScoreWalker\sequencelogic-run-walker-indexer.py'

    cmd = ['python', index_wrapper_path,
           '--config', config_file,
           '--indexRoot', idx_path,
           '--libRoot', lib_path]

    idx_time = run_process('sequencelogic-run-walker-indexer', cmd, log_dir)

    return idx_time


def run_phrase_maker_on_folder(folder, phrases_root, min_phrase_len, max_phrase_len, log_dir):
    files = []
    lib_dir, doctype = os.path.split(folder)
    dest_file = os.path.join(phrases_root, '%s.phrasecount' % doctype)
    log_file = os.path.join(log_dir, 'phrase-maker-std-err.txt')
    for file in os.listdir(folder):
        file = os.path.join(folder, file)
        if os.path.isfile(file) and file.endswith('.tkn'):
            files.append(file)

    if len(files) > 1:
        run_phrase_maker(files, min_phrase_len, max_phrase_len, dest_file, log_file)
    else:
        print('There are not enough examples of "%s" (at least two .tkn files are needed).' % doctype)


def generate_phrases_for_library(library_root, index_root, log_dir, min_phrase_length=DEF_MIN_PHRASE_SIZE,
                                 max_phrase_len=DEF_MAX_PHRASE_SIZE):
    phrase_count_root = os.path.join(index_root, 'Phrase Count Source')
    phrase_file = os.path.join(index_root, 'Phrases.txt')

    if not os.path.exists(phrase_count_root):
        os.mkdir(phrase_count_root)

    run_tokenizer(library_root, index_root, log_dir)

    for folder in os.listdir(library_root):
        # Strip a stray private-use character that shows up in some folder names.
        folder = folder.replace('\uf028', '')
        folder = os.path.join(library_root, folder)
        if os.path.isdir(folder):
            run_phrase_maker_on_folder(folder, phrase_count_root, min_phrase_length, max_phrase_len, log_dir)

    run_phrase_loader(phrase_count_root, phrase_file, min_phrase_length, log_dir)

    shutil.rmtree(phrase_count_root)


def run_pre_index_tools(test_data_folder, new_index_root, log_file_dir, lib_dir):
    index_data_dir = os.path.join(test_data_folder, 'Index-Data')

    # Make empty files, they will be overwritten!
    make_empty_index_files(new_index_root)
    # Copy the non-empty files to overwrite the empty ones.
    copy_index_files(get_move_files(index_data_dir), new_index_root)
    # Now we can finally generate phrases for the library.
    generate_phrases_for_library(lib_dir, new_index_root, log_file_dir)

    # Make the FirstPageIndex.
    first_page_index = os.path.join(new_index_root, 'FirstPageIndex')
    tmp_idx = os.path.join(new_index_root, 'temp')
    tmp_first_page_index = os.path.join(tmp_idx, 'FirstPageIndex')
    if not os.path.exists(first_page_index):
        os.mkdir(first_page_index)
    if not os.path.exists(tmp_idx):
        os.mkdir(tmp_idx)
    if not os.path.exists(tmp_first_page_index):
        os.mkdir(tmp_first_page_index)

    # Run the term diff.
    run_walker_term_diff(lib_dir, new_index_root, log_file_dir)


def setup_test(test_path, should_build):
    global printer

    load_executables()

    log_dir = os.path.join(test_path, 'logs')

    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    test_log = os.path.join(log_dir, 'TestLog.log')
    printer = ConsoleUtils.SLLogger(program_name, test_log)
    sys.stdout = printer

    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, author, 150))

    if should_build:
        run_build(log_dir)

    copy_executables(test_path)


def run_package(index_dir, parent_dir, package_path, clux_file, config_file_path, lucene_query_cnt=DEF_NUM_TOP_DOCS,
                conf_to_use=DEF_CONF_THRESHOLD, term_len=DEF_TERM_LENGTH):
    start_time = get_time()
    start_clock = time.time()

    files = get_pkg_file_names(parent_dir)
    package_name = get_package_name(package_path)

    times = []

    print('Testing Package: %s' % package_name)

    log_root = files[logs_root_tag]
    classification_out = files[classification_file_tag]
    tree_walker_out = files[tree_walker_file_tag]
    fp_counter_out = files[fp_counter_file_tag]
    term_walker_out = files[term_walker_file_tag]
    accuracy_out = files[accuracy_file_tag]
    doctype_counts = files[doctype_graph_tag]

    # Get the package settings data to save.
    package_config = os.path.join(parent_dir, test_config_file_name)
    package_data = {'Package File': package_path, 'CLUX File': clux_file, 'Num TopDocs': lucene_query_cnt,
                    'Term Length': term_len, 'Current Minimum Confidence': conf_to_use, 'Log Directory': log_root}

    # Run the classifier.
    classification_time = new_run_classifier(index_dir, config_file_path, classification_out, package_path, log_root)
    times.append(classification_time)
    # Validate output.
    validate_time = run_walker_validator(classification_out, log_root)
    times.append(validate_time)
    # Run TreeWalker.
    tree_walker_time = run_tree_walker(clux_file, classification_out, tree_walker_out, log_root)
    times.append(tree_walker_time)
    # Run FalsePositiveCounter.
    fp_counter_time = run_fp_counter([tree_walker_out], fp_counter_out, log_root)
    times.append(fp_counter_time)
    # # Run TermWalker
    # term_walker_time = run_term_walker(tree_walker_out, classification_out, term_walker_out, log_root)
    # times.append(term_walker_time)
    # Run DoctypeGraph.
    doctype_graph_time = run_doctype_graph(parent_dir, doctype_counts, log_root)
    times.append(doctype_graph_time)
    # make_graphs_from_folder(doctype_counts, log_root)
    # Run Accuracy.
    accuracy_time = run_accuracy(tree_walker_out, accuracy_out, log_root)
    times.append(accuracy_time)

    # Get the total time, save the config data, and return the classification time and graph metadata file.
    total_time = time.time() - start_clock

    write_cfg_file(package_data, package_config, start_time, files)

    print('Finished testing package (%.4f s)' % total_time)
    write_line_break()

    files[classification_time_tag] = classification_time
    return files


def run_packages(index_dir, parent_dir, package_dicts, config_file_path, n_top_docs=DEF_NUM_TOP_DOCS,
                 t_conf=DEF_CONF_THRESHOLD, term_len=DEF_TERM_LENGTH):
    start_time = get_time()

    graph_data_files = []
    graph_tags = []
    package_files = {}
    package_results = {}

    total_classification_time = 0.0
    num_runs = len(package_dicts)

    graph_config = os.path.join(parent_dir, graph_config_file_name)
    test_config = os.path.join(parent_dir, test_config_file_name)

    print('Testing %d packages...' % num_runs)

    for i in range(num_runs):
        print('Testing Package %d/%d' % (i + 1, num_runs))

        file_dict = package_dicts[i]
        package_file = file_dict[package_file_tag]
        clux_file = file_dict[clux_file_tag]

        package_name = get_package_name(package_file)
        run_dir = os.path.join(parent_dir, package_name)

        run_data = run_package(index_dir, run_dir, package_file, clux_file, config_file_path, n_top_docs, t_conf,
                               term_len)

        # Get the data to save.
        package_results[package_name] = run_data
        total_classification_time += run_data[classification_time_tag]
        graph_tags.append(package_name)
        graph_data_files.append(run_data[accuracy_file_tag])
        package_files[i] = package_file

    # avg_classification_time = total_classification_time / num_runs
    # print('Average Classification time: %.4f seconds' % avg_classification_time)

    make_graph_config(graph_data_files, graph_tags, graph_config)

    total_pages = join_class_error_counts(parent_dir)
    page_classification_rate = total_classification_time / total_pages
    package_classification_rate = total_classification_time / num_runs

    print('Classified %d pages from %d files in %.2fs (%.2f s/page | %.2f s/file)' %
          (total_pages, num_runs, total_classification_time,
           page_classification_rate, package_classification_rate))
    printer.write_line_break()

    test_config_data = {classification_time_tag: total_classification_time,
                        graph_config_file_tag: graph_config,
                        'Total Pages': total_pages,
                        'Average Package Size': round(total_pages/num_runs),
                        'Classification Speed (pages)': '%.2f s/page' % page_classification_rate,
                        'Classification Speed (files)': '%.2f s/file' % package_classification_rate}
    test_files = get_test_files(package_results, 'Package')

    write_cfg_file(test_config_data, test_config, start_time, test_files)

    return test_config_data


def test_changing_confidence(config_file, index_dir, parent_dir, package_dicts, min_conf, max_conf, step_size,
                             lucene_query_cnt=DEF_NUM_TOP_DOCS, term_len=DEF_TERM_LENGTH):
    start_time = get_time()

    current_conf = min_conf
    avg_cls_time = 0.0
    graph_config_files = []
    result_data = {}

    num_runs = 0
    print('Running Test With Confidence Values [%.2f%% - %.2f%%]' % (min_conf, max_conf))
    test_config_file = os.path.join(parent_dir, test_config_file_name)

    # Step the confidence threshold up from min_conf to max_conf.
    while current_conf <= max_conf:
        print('Current Confidence: %.2f%%' % current_conf)

        run_dir = os.path.join(parent_dir, 'Confidence Threshold = %.2f%%' % current_conf)
        graph_config = os.path.join(run_dir, graph_config_file_name)
        graph_config_files.append(graph_config)

        return_data = run_packages(index_dir, run_dir, package_dicts, config_file, lucene_query_cnt, current_conf,
                                   term_len)

        result_data[current_conf] = return_data

        cls_time = return_data[classification_time_tag]
        packages_config = return_data[graph_config_file_tag]

        new_title = '%s With Confidence Threshold %.2f%%' % (DEF_GRAPH_TITLE, current_conf * 100)
        AccuracyGraphSetup.change_title(packages_config, graph_config, new_title)

        num_runs += 1
        avg_cls_time += cls_time
        current_conf += step_size

    avg_cls_time = avg_cls_time / num_runs
    print('Average Classification time: %.4f seconds' % avg_cls_time)

    config_result = {classification_time_tag: avg_cls_time, graph_config_file_tag: graph_config,
                     'Minimum Confidence': '%.2f%%' % (min_conf * 100),
                     'Maximum Confidence': '%.2f%%' % (max_conf * 100)}

    files = get_test_files(result_data, 'Confidence')
    write_cfg_file(config_result, test_config_file, start_time, files)

    return config_result


def make_graphs_from_folder(folder, log_dir):
    graph_dict = {}
    print('Making graphs from config files in "%s"' % folder)
    for file in os.listdir(folder):
        file = os.path.join(folder, file)
        if file.endswith(graph_config_file_name):
            out_file = file.replace(graph_config_file_name, graph_name)
            graph_dict[out_file] = file
    make_graphs(graph_dict, log_dir)


def get_test_files(test_files, new_prefix):
    result = {}
    for key, value in test_files.items():
        for sub_key, sub_value in value.items():
            new_key = '%s %s - %s' % (new_prefix, key, sub_key)
            result[new_key] = sub_value
    return result


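# get_test_files() flattens the nested per-run dicts; e.g. (names illustrative) the entry
# {'597000945': {'Classification': 'results.json'}} with prefix 'Package' becomes
# {'Package 597000945 - Classification': 'results.json'}.

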
def get_package_name(package_path):
    par_dir, package_name = os.path.split(package_path)
    # Strip the four-character extension (e.g. '.frt').
    package_name = package_name[:-4]
    return package_name


def get_lib_name(lib_path):
    folder, name = os.path.split(lib_path)
    return name


def get_move_files(index_data_dir):
    result = []
    for file in os.listdir(index_data_dir):
        result.append(os.path.join(index_data_dir, file))

    if os.environ.get('SEQUENCELOGICHOME') is not None:
        dest_folder = os.path.join(os.environ['SEQUENCELOGICHOME'], 'SLSync', 'config', 'data')
        for file in result:
            new_file = os.path.join(dest_folder, os.path.split(file)[1])

            if not os.path.exists(dest_folder):
                os.makedirs(dest_folder)

            shutil.copy(file, new_file)

    return result


def get_graph_config_files(package_data):
    result = {}

    for key, value in package_data.items():
        value = str(value)
        if value.endswith(graph_config_file_name):
            out_file = value.replace(graph_config_file_name, graph_name)
            result[out_file] = value

    return result


def do_make_graphs(configs_by_paths, logs_dir):
    print('Making %d graphs...' % len(configs_by_paths))

    graph_time = 0
    for out_file, config_file in configs_by_paths.items():
        graph_time += run_graph(config_file, out_file, logs_dir)

    print('Done making Graphs (%.4f s)' % graph_time)


def make_graphs(run_data_dict, log_dir, title=None):
    config_data = get_graph_config_files(run_data_dict)
    if title is not None:
        for out, config in config_data.items():
            AccuracyGraphSetup.change_title(config, config, title)
    do_make_graphs(config_data, log_dir)


def join_class_error_counts(test_out_root):
    types = ['false-negative', 'false-positive', 'incorrect', 'correct']
    fields = ['Correct Type', 'Classified Type', 'Number of Occurrences']

    total_pages = 0

    for t in types:
        file_name = 'global-%s-counts.csv' % t
        result_file = os.path.join(test_out_root, file_name)

        result = {}

        for folder in os.listdir(test_out_root):
            folder = os.path.join(test_out_root, folder)
            if os.path.isdir(folder):
                to_read = os.path.join(folder, '%s-counts.csv' % t)
                if os.path.exists(to_read):
                    with open(to_read) as reader:
                        csv_reader = csv.DictReader(reader)
                        for row in csv_reader:
                            key = '%s>>>%s' % (row['Correct Type'], row['Classified Type'])
                            count = int(row['Number of Occurrences'])
                            if key not in result.keys():
                                result[key] = count
                            else:
                                result[key] += count

        result = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
        with open(result_file, 'w+', newline='') as writer:
            csv_file = csv.DictWriter(writer, fieldnames=fields)
            csv_file.writeheader()
            for key, count in result:
                correct_doctype, classified_doctype = key.split('>>>')[:2]
                total_pages += count
                csv_file.writerow({'Correct Type': correct_doctype,
                                   'Classified Type': classified_doctype,
                                   'Number of Occurrences': count})

    return total_pages


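# Each global-<type>-counts.csv written above has this shape (doctypes and counts illustrative):
#
#     Correct Type,Classified Type,Number of Occurrences
#     DoctypeA,DoctypeB,12
#     DoctypeC,DoctypeC,9

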
def run_test_on_lib(lib_dir, output_paths, test_in_root, config_file_path, min_phrase_len=DEF_MIN_PHRASE_SIZE,
                    max_phrase_len=DEF_MAX_PHRASE_SIZE):
    global phrase_exe_name, paginate

    start_time = get_time()

    idx_root = output_paths[idx_root_tag]
    test_out = output_paths[test_root_tag]
    logs_out = output_paths[logs_root_tag]

    packages = load_packages(test_in_root)

    idx_data_root = os.path.join(test_in_root, 'Index-Data')

    get_move_files(idx_data_root)

    new_run_indexer(config_file_path, lib_dir, idx_root, logs_out)

    config_out = os.path.join(test_out, test_config_file_name)

    config_info = {'Library Path': lib_dir, 'Library Name': get_lib_name(lib_dir), 'Test Data Source': test_in_root,
                   'Test Data Result': test_out, 'Minimum Phrase Length': min_phrase_len,
                   'Maximum Phrase Length': max_phrase_len, 'Minimum Phrase Doctype Coverage': '80%',
                   'Paginate': paginate, 'Minimum OCR Confidence': '60%',
                   'Confidence Threshold': '%.2f' % DEF_CONF_THRESHOLD, 'Score Ratio': 'Yes'}

    write_line_break()
    out_path = test_out

    # Run the packages.
    run_data = run_packages(idx_root, out_path, packages, config_file_path)
    config_info['Average Classification Time'] = run_data[classification_time_tag]
    del run_data[classification_time_tag]
    # Make the graphs!
    make_graphs(run_data, logs_out, 'Confidence Threshold: %.2f' % DEF_CONF_THRESHOLD)

    write_cfg_file(config_info, config_out, start_time, run_data)


def get_time():
    return datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')


def main(lib_dir, output_directory, test_data_root, config_file_loc, do_build):
    # Do some set up.
    run_dir = get_root_folder(output_directory)
    os.environ['SEQUENCELOGICHOME'] = os.path.join(run_dir, 'Programs')
    setup_test(run_dir, do_build)

    # Get the output information.
    output_info = get_idx_names(run_dir)
    # Perform a test.
    run_test_on_lib(lib_dir, output_info, test_data_root, config_file_loc)

    # Do any necessary cleanup.
    run_clean_up()
    print('\nTest completed. Exiting...')


# This is where we call the main method from.
if __name__ == '__main__':
    # load_executables()
    # # Set up arguments
    # files = [
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597000945\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597000964\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597000967\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597000990\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001171\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001276\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001454\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001462\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001468\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001474\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001635\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001639\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001648\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001676\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001681\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001685\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001699\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001741\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001785\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001842\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001849\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597001917\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002051\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002121\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002173\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002322\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002344\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002443\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002544\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002550\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002556\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002648\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002662\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002837\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002854\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002924\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002931\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597002954\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003029\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003044\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003068\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003088\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003125\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003128\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003130\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003137\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003186\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003196\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003206\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003210\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003221\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003257\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003261\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003308\\graph-data.txt",
    #     "C:\\Users\\chris\\Documents\\Code\\Tests\\KMeans\\L3Results\\2017.10.12\\17.26\\597003335\\graph-data.txt"]
    #
    # tags = ['Correct & High Confidence',
    #         'Incorrect & Low Confidence',
    #         'Incorrect & High Confidence',
    #         'Correct & Low Confidence',
    #         'Correct Pagination']
    #
    # out_file = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.10.12\17.26\global-graph-config.json'
    # graph_out = \
    #     r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.10.12\17.26\global-classification-accuracy.pdf'
    # make_graph_config(files, tags, out_file)
    #
    # do_make_graphs({graph_out: out_file}, r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Results\2017.10.12\17.26\logs')

    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')

    required_args.add_argument('-t', '--test_dir', required=True,
                               help='The path to the directory containing test data.')
    required_args.add_argument('-o', '--out_dir', required=True, help='The location to write the test data to.')
    required_args.add_argument('-l', '--library_dir', required=True, help='The path to the library root folder.')
    required_args.add_argument('-c', '--config', required=True, help='The path to the config file.')

    optional_args.add_argument('-b', '--build', required=False, action='store_true',
                               help='Use if you want to run a build before testing.')
    optional_args.add_argument('-h', '--help', action='help', help='Prints the help message.')

    # Get the arguments.
    args = parser.parse_args()
    library_dir = args.library_dir
    output_dir = args.out_dir
    test_data_dir = args.test_dir
    build = args.build
    config_file = args.config
    # os.environ['UseFirstPages'] = 'true'
    # Run the program.
    main(library_dir, output_dir, test_data_dir, config_file, build)

    # printer.close()