""" Information ----------- This script is a tool for running a full test over the ScoreWalker Classification engine. The following tools are called in order to achieve this goal: #) Builds WalkerIndexer and WalkerClassifier. #) The necessary steps for indexing. #) WalkerClassifier to preform classification on the test document. #) Several other Python tools to perform analysis on the results. #) ValidationWalker to validate the output from the classification engine against the JSON schemas used in production. .. moduleauthor:: Chris Diesch Commandline Usage ------------------ Usage: ``Tester.py [-h, --help] [-b, --build] [-t, --test_dir] {TEST_DIR} [-l, --library_dir] {LIB_DIR} [-c, --config] {CFG_FILE} [-o, --out_dir] {OUT_DIR} Required Arguments: ``-t TEST_DIR, --test_dir TEST_DIR`` Where ``TEST_DIR`` is the path to the root folder to load the test data from. ``-l LIB_DIR, --library_dir LIB_DIR`` Where ``LIB_DIR`` is the path to the library to run the test with. ``-c CFG_FILE, --config CFG_FILE`` Where ``CFG_FILE`` is the path to the config file to use for testing. ``-o OUT_DIR, --out_dir OUT_DIR`` Where ``OUT_DIR`` is the path to save the output to. Optional Arguments: ``-h, --help`` Prints the help message. ``-b, --build`` Runs the maven build for the classification tools. Python Module Usage -------------------- """ import argparse import os import subprocess import datetime import time import shutil import sys import csv import json import operator import ConsoleUtils import AccuracyGraphSetup program_name = 'TestWalker' program_description = 'This tool handles running the classification engine and several tools to perform analysis on ' \ 'the results.' author = 'Chris Diesch' # The argument parser for the program. parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False) # Error and Warning console values: red_error = '\033[91mError:\033[0m' yellow_warning = '\033[93mWARNING:\033[0m' blue_okay = '\033[94mOK\033[0m' build_date = '2017.07.24' # datetime.datetime.now().strftime('%Y.%m.%d-%H.%M.%S') program_version = '1.7.2' side_bound_char = '|' line_break_char = '-' corner_char = '+' line_break_size = 150 line_break = line_break_char * line_break_size console_line_break = '-' * line_break_size DEF_NUM_TOP_DOCS = 20 DEF_TERM_LENGTH = 1 DEF_CONF_THRESHOLD = 60 DEF_MIN_PHRASE_SIZE = 5 DEF_MAX_PHRASE_SIZE = 30 paginate = True DEF_GRAPH_TITLE = 'Classification Accuracy' graph_config_file_name = 'graph-config.json' test_config_file_name = 'test-config.txt' graph_name = 'Classification Accuracy.pdf' classification_time_tag = 'Classification Run Time' clux_file_tag = 'CLUX File' package_file_tag = 'Package File' package_name_tag = 'Package Name' classification_file_tag = 'Classification' tree_walker_file_tag = 'Tree Walker' term_walker_file_tag = 'Term Walker' fp_counter_file_tag = 'False Positive Counts' accuracy_file_tag = 'Graph Data' graph_config_file_tag = 'Graph Config' graph_file_tag = 'Graph' doctype_graph_tag = 'Doctype Graph' idx_root_tag = 'Index Root' test_root_tag = 'Test Root' logs_root_tag = 'Logs Root' tokenizer = 'tokenizer' phrase_maker = 'phrase maker' phrase_loader = 'phrase loader' indexer = 'indexer' classifier = 'classifier' diff_utils = 'diff-utils' walker_validator = 'walker validator' tree_walker = 'tree walker' status_counter = 'false positive counter' doctype_graph = 'doctype graph' term_walker = 'term walker' accuracy = 'accuracy' grapher = 'package graph maker' term_diff = 'walker term diff' current_dir = '' 
phrase_exe_name = ''
build_roots = {}
executables = {}
printer = None


def write_line_break():
    printer.write_no_prefix(console_line_break)


def load_executables():
    """
    Loads the executables needed to perform a test run and analysis.

    Returns:
        ``None``
    """
    global executables, phrase_exe_name, current_dir, build_roots
    current_dir = os.getcwd()
    phrase_exe_root = r'C:\Users\chris\Documents\Code\Tests\KMeans\L3Input\Programs'
    walker_validate_root = os.path.abspath(os.path.join(current_dir, os.pardir))
    walker_validate_root = os.path.join(walker_validate_root, 'ValidationWalker')
    tree_walker_root = os.path.abspath(os.path.join(current_dir, os.pardir))
    tree_walker_root = os.path.join(tree_walker_root, 'TreeWalker')
    accuracy_root = os.path.abspath(os.path.join(current_dir, os.pardir))
    accuracy_root = os.path.join(accuracy_root, 'AccuracyCounter')
    diff_utils_root = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir, 'diff-utils'))
    classify_root = os.path.abspath(os.path.join(current_dir, os.pardir))
    classify_root = os.path.abspath(os.path.join(classify_root, os.pardir))
    classify_root = os.path.abspath(os.path.join(classify_root, 'scorewalker'))
    tokenizer_jar_root = os.path.join(classify_root, 'walker-analysis', 'target')
    tokenizer_jar_name = os.path.join(tokenizer_jar_root, 'tokenizer-one-jar.jar')
    phrase_exe_name = os.path.join(phrase_exe_root, 'phrases.exe')
    phrase_loader_name = os.path.abspath(os.path.join(current_dir, os.pardir))
    phrase_loader_name = os.path.join(phrase_loader_name, 'PhraseCountLoader')
    phrase_loader_name = os.path.join(phrase_loader_name, 'PhraseCountLoader.py')
    classify_jar_root = os.path.abspath(os.path.join(classify_root, 'walker-classifier', 'target'))
    classify_jar_name = os.path.join(classify_jar_root, 'walker-classifier-one-jar.jar')
    walker_validator_name = os.path.join(walker_validate_root, 'ValidationWalker.py')
    tree_walker_name = os.path.join(tree_walker_root, 'TreeWalker.py')
    doctype_graph_name = os.path.join(tree_walker_root, 'DoctypeGraph.py')
    fp_count_name = os.path.join(tree_walker_root, 'StatusCounter.py')
    term_walker_name = os.path.join(tree_walker_root, 'TermWalker.py')
    accuracy_name = os.path.join(accuracy_root, 'Accuracy.py')
    graph_maker_name = os.path.join(accuracy_root, 'AccuracyGraph.py')
    index_jar_root = os.path.abspath(os.path.join(classify_root, 'walker-indexer', 'target'))
    index_jar_name = os.path.join(index_jar_root, 'walker-indexer-one-jar.jar')
    walker_term_diff_root = os.path.abspath(os.path.join(classify_root, 'walker-term-diff', 'target'))
    walker_term_diff_name = os.path.join(walker_term_diff_root, 'walker-term-diff-one-jar.jar')
    build_roots = {classifier: classify_root,
                   diff_utils: diff_utils_root}
    executables = {tokenizer: tokenizer_jar_name,
                   term_diff: walker_term_diff_name,
                   phrase_loader: phrase_loader_name,
                   indexer: index_jar_name,
                   classifier: classify_jar_name,
                   walker_validator: walker_validator_name,
                   tree_walker: tree_walker_name,
                   status_counter: fp_count_name,
                   doctype_graph: doctype_graph_name,
                   term_walker: term_walker_name,
                   accuracy: accuracy_name,
                   grapher: graph_maker_name}
    for file in os.listdir(phrase_exe_root):
        executables[file] = os.path.join(phrase_exe_root, file)


def open_sublime(file_path):
    """
    Opens SublimeText on the given file.

    Args:
        ``file_path`` The file to open with SublimeText.

    Returns:
        ``None``
    """
    sublime_cmd = [r'C:\Program Files\Sublime Text 3\sublime_text.exe', file_path]
    subprocess.Popen(sublime_cmd)


def open_log_files(std_out_path, std_err_path):
    """
    Opens the log files and adds a header to them.

    Args:
        ``std_out_path`` -- ``str`` The path to save the standard output to.
        ``std_err_path`` -- ``str`` The path to save the standard error to.

    Returns:
        ``file, file`` -- The writer for the standard out file, the writer for the standard error file.
    """
    log_line_break = '=' * 100
    log_std_err = '%s\n%s\n%s\n' % (log_line_break, '||{:^96}||'.format('STANDARD ERROR'), log_line_break)
    log_std_out = '%s\n%s\n%s\n' % (log_line_break, '||{:^96}||'.format('STANDARD OUT'), log_line_break)
    with open(std_out_path, 'a+') as std_out:
        std_out.write(log_std_out)
    with open(std_err_path, 'a+') as std_err:
        std_err.write(log_std_err)
    std_out = open(std_out_path, 'a+')
    std_err = open(std_err_path, 'a+')
    return std_out, std_err


def run_process(proc_name, proc_cmd, log_dir):
    """
    Runs a process, logs the console output, and gets the time to execute.

    Args:
        ``proc_name`` -- ``str`` The name of the process to run.
        ``proc_cmd`` -- ``list(str)`` The command string for the process.
        ``log_dir`` -- ``str`` The path to save the log files to.

    Returns:
        ``float`` The time (in seconds) to execute the command.
    """
    proc_err_log = os.path.join(log_dir, '%s-std-err.log' % proc_name)
    proc_std_log = os.path.join(log_dir, '%s-std-out.log' % proc_name)
    std_out_writer, std_err_writer = open_log_files(proc_std_log, proc_err_log)
    print('Running %s' % proc_name)
    start_time = time.time()
    process = subprocess.Popen(proc_cmd, shell=True, stdout=std_out_writer, stderr=std_err_writer)
    process.wait()
    run_time = time.time() - start_time
    time.sleep(0.05)
    process.poll()
    return_code = int(process.returncode)
    if return_code != 0:
        print('%s Process completed with return code %d.' % (proc_name, return_code))
        print(' Opening log files...')
        std_out_writer.close()
        std_err_writer.close()
        open_sublime(std_out_writer.name)
        open_sublime(std_err_writer.name)
        sys.exit(return_code)
    std_out_writer.close()
    std_err_writer.close()
    print('%s completed (%.4f s)' % (proc_name, run_time))
    return run_time


def run_build(log_dir):
    """
    Runs the Maven build for the classification and indexing engines.

    Args:
        ``log_dir`` -- ``str`` The path to save log files.

    Returns:
        ``None``
    """
    mvn_home = os.environ.get('MAVEN_HOME')
    mvn_path = os.path.join(mvn_home, 'bin', 'mvn.cmd')
    # Change dir and run maven.
    # os.chdir(build_roots[diff_utils])
    # mvn_cmd = [mvn_path, 'clean', 'install']
    # run_process('DiffUtils Maven Build', mvn_cmd, log_dir)
    os.chdir(build_roots[classifier])
    mvn_cmd = [mvn_path, '-DskipTests', 'install']
    run_process('ScoreWalker Maven Build', mvn_cmd, log_dir)
    os.chdir(current_dir)
def run_tokenizer(lib_dir, index_dir, log_dir):
    """
    Runs the tokenizer using :meth:`run_process`.

    Args:
        ``lib_dir`` -- ``str`` The path to the library to tokenize.
        ``index_dir`` -- ``str`` The path to the index being used to classify.
        ``log_dir`` -- ``str`` The path to the log directory.

    Returns:
        ``None``
    """
    tokenizer_cmd = ['java', '-jar', executables[tokenizer],
                     '-I', '"' + index_dir + '"',
                     '-L', '"' + lib_dir + '"',
                     '-O']
    run_process('Tokenizer', tokenizer_cmd, log_dir)


def run_walker_term_diff(lib_dir, index_dir, log_dir):
    # Note: this relies on the module-level ``config_file``, which is set in
    # the command-line block at the bottom of this file.
    with open(config_file) as cfg_file:
        too_similar = json.load(cfg_file)
    too_similar = too_similar['CLASSIFYWALKER']
    too_similar = too_similar['similarDoctypes']
    for similar_list in too_similar:
        term_diff_cmd = ['java', '-jar', executables[term_diff],
                         '--libRoot', lib_dir,
                         '--outRoot', index_dir,
                         '--doctypes'] + similar_list
        run_process('Walker Term Diff over doctypes %s' % similar_list, term_diff_cmd, log_dir)
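# run_walker_term_diff reads the doctype groupings straight out of the test
# config. A minimal sketch of the assumed JSON shape (the doctype names here
# are hypothetical; only the CLASSIFYWALKER/similarDoctypes keys and the
# list-of-lists structure are taken from the code above):
#
#   {
#       "CLASSIFYWALKER": {
#           "similarDoctypes": [
#               ["Invoice", "Receipt"],
#               ["W2", "W4", "1099"]
#           ]
#       }
#   }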
def run_phrase_maker(files, min_phrase_len, max_phrase_len, phrase_file, log_file):
    folder, doctype = os.path.split(phrase_file)
    split_idx = doctype.rfind('.')
    doctype = doctype[:split_idx]
    phrases_cmd = [phrase_exe_name, '-c', '80', '-p', str(min_phrase_len), '-P', str(max_phrase_len), '-ol'] + files
    with open(log_file, 'a+') as tmp:
        tmp.write('Running Phrase Maker on %s\n%s\n' % (doctype, console_line_break))
    std_out = open(phrase_file, 'a+')
    std_err = open(log_file, 'a+')
    process = subprocess.Popen(phrases_cmd, stdout=std_out, stderr=std_err)
    process.wait()
    std_out.close()
    std_err.close()
    with open(log_file, 'a+') as tmp:
        tmp.write('%s\n' % console_line_break)


def run_phrase_loader(phrases_dir, out_file_name, min_phrase_len, log_dir):
    phrase_loader_cmd = ['python', executables[phrase_loader],
                         '-i', phrases_dir,
                         '-o', out_file_name,
                         '-m', str(min_phrase_len)]
    run_time = run_process('Phrase Loader', phrase_loader_cmd, log_dir)
    return run_time


def run_indexer(idx_dir, lib_dir, log_dir):
    index_cmd = ['java', '-jar', executables[indexer],
                 '-I', '"' + idx_dir + '"',
                 '-D', '"' + lib_dir + '"']
    run_time = run_process('Walker Indexer', index_cmd, log_dir)
    write_line_break()
    return run_time


def new_run_classifier(index_dir, config_file_path, dest_file, package, log_dir,
                       min_memory=128, max_memory=4096, thread_count=8):
    walker_loc = r'C:\Users\chris\Documents\Code\Git\scorewalker-utils\RunScoreWalker' \
                 r'\sequencelogic-run-walker-classifier.py'
    classify_cmd = ['python', walker_loc,
                    '--index-root', index_dir,
                    '--pkg-path', package,
                    '--config', config_file_path,
                    '--out', dest_file,
                    '--min-memory', str(min_memory),
                    '--max-memory', str(max_memory),
                    '--thread-count', str(thread_count)]
    run_time = run_process('sequencelogic-run-walker-classifier', classify_cmd, log_dir)
    return run_time


def run_classifier(index_dir, config_file_path, dest_file, package, n_lucene, conf, word_len, log_dir):
    # Note: n_lucene, conf, and word_len are accepted for interface parity but
    # are not passed on the command line.
    classify_cmd = ['java', '-jar', executables[classifier],
                    '-D', '"' + dest_file + '"',
                    '-C', '"' + config_file_path + '"',
                    '-I', '"' + index_dir + '"',
                    '-P', '"' + package + '"']
    run_time = run_process('Walker Classifier', classify_cmd, log_dir)
    return run_time


def run_walker_validator(walker_file, log_dir):
    validate_cmd = ['python', executables[walker_validator], '-i', walker_file]
    run_time = run_process('ValidationWalker', validate_cmd, log_dir)
    return run_time


def run_tree_walker(clux_output, engine_output, result_file, log_dir):
    tree_walker_cmd = ['python', executables[tree_walker],
                       '-c', clux_output,
                       '-w', engine_output,
                       '-o', result_file]
    run_time = run_process('Tree Walker', tree_walker_cmd, log_dir)
    return run_time


def run_fp_counter(twk_files, result_file, log_dir):
    fp_count_cmd = ['python', executables[status_counter], '-o', result_file, '--in_files'] + twk_files
    run_time = run_process('Status Counter', fp_count_cmd, log_dir)
    return run_time


def run_doctype_graph(test_dir, data_dir, log_dir):
    doctype_graph_cmd = ['python', executables[doctype_graph], '-i', test_dir, '-o', data_dir]
    run_time = run_process('DoctypeGraph', doctype_graph_cmd, log_dir)
    return run_time


def run_term_walker(tree_walker_output, engine_output, result_file, log_dir):
    term_walker_cmd = ['python', executables[term_walker],
                       '-w', tree_walker_output,
                       '-c', engine_output,
                       '-o', result_file]
    run_time = run_process('Term Walker', term_walker_cmd, log_dir)
    return run_time


def run_accuracy(tree_walker_out, out_file, log_dir):
    accuracy_cmd = ['python', executables[accuracy], '-i', tree_walker_out, '-o', out_file]
    run_time = run_process('Graph Metadata', accuracy_cmd, log_dir)
    return run_time


def run_graph(configuration_file, out_file, log_dir):
    graph_cmd = ['python', executables[grapher], '-o', out_file, '-i', configuration_file]
    run_time = run_process('Graph Maker', graph_cmd, log_dir)
    return run_time


def run_clean_up():
    print('Cleaning up...')
    # for name in executables:
    #     file = executables[name]
    #     if os.path.exists(file):
    #         os.remove(file)
    print('Done cleaning up.')


def make_graph_config(files, tags, dest_file, graph_title=DEF_GRAPH_TITLE):
    # folder, file = os.path.split(dest_file)
    # avg_file = os.path.join(folder, 'global-%s' % file)
    # AccuracyGraphSetup.make_package_graph_config(files, tags, graph_title, dest_file)
    AccuracyGraphSetup.make_avg_cfg(files, graph_title, dest_file)
    # return avg_file


def get_pkg_file_names(parent_dir):
    # Get the appropriate sub-folders.
    if not os.path.exists(parent_dir):
        os.mkdir(parent_dir)
    run_dir = os.path.abspath(parent_dir)
    log_dir = os.path.join(run_dir, 'logs')
    # Make the directories if they don't exist.
    if not os.path.exists(run_dir):
        os.mkdir(run_dir)
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    # Get the new full file names.
    classifier_out = os.path.join(run_dir, 'classification-results.json')
    tree_walker_out = os.path.join(run_dir, 'classification-analysis.csv')
    fp_counter_out = os.path.join(run_dir, 'false-positive-counts.csv')
    term_walker_out = os.path.join(run_dir, 'term-analysis.csv')
    accuracy_out = os.path.join(run_dir, 'graph-data.txt')
    graph_out = os.path.join(run_dir, 'accuracy-graph.pdf')
    doctype_graph_out = os.path.join(run_dir, 'doctype-counts')
    return {classification_file_tag: classifier_out,
            tree_walker_file_tag: tree_walker_out,
            fp_counter_file_tag: fp_counter_out,
            term_walker_file_tag: term_walker_out,
            accuracy_file_tag: accuracy_out,
            graph_file_tag: graph_out,
            logs_root_tag: log_dir,
            doctype_graph_tag: doctype_graph_out}
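# For reference, a package run directory produced by get_pkg_file_names ends
# up looking roughly like this (the package name itself comes from the .frt
# file name):
#
#   <out_dir>/<package name>/
#       classification-results.json
#       classification-analysis.csv
#       false-positive-counts.csv
#       term-analysis.csv
#       graph-data.txt
#       accuracy-graph.pdf
#       doctype-counts/
#       logs/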
def get_root_folder(test_root_path):
    # Runs are grouped into one folder per day; files for each run are
    # prefixed with HH.MM.
    subfolder = os.path.join(test_root_path, datetime.datetime.now().strftime('%Y.%m.%d'))
    file_prefix = datetime.datetime.now().strftime('%H.%M')
    run_dir = os.path.join(subfolder, file_prefix)
    if not os.path.exists(subfolder):
        os.mkdir(subfolder)
    if not os.path.exists(run_dir):
        os.mkdir(run_dir)
    return run_dir


def get_idx_names(run_dir):
    idx_dir = os.path.join(run_dir, 'index')
    log_dir = os.path.join(run_dir, 'logs')
    # Make the directories if they don't exist.
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    if not os.path.exists(idx_dir):
        os.mkdir(idx_dir)
    # Return the folders.
    return {idx_root_tag: idx_dir,
            test_root_tag: run_dir,
            logs_root_tag: log_dir}


def get_tree_walker_files(root_folder):
    files = []
    for folder in os.listdir(root_folder):
        folder = os.path.join(root_folder, folder)
        if os.path.isdir(folder):
            for file in os.listdir(folder):
                file = os.path.join(folder, file)
                if os.path.isfile(file):
                    if file.endswith('analysis-no-centers.csv') and 'term' not in file:
                        files.append(file)
    return files


def load_packages(test_data_path):
    test_packages_root = os.path.join(test_data_path, 'Test-Files')
    result = []
    for file in os.listdir(test_packages_root):
        file = os.path.join(test_packages_root, file)
        if os.path.isfile(file) and file.endswith('.frt'):
            clux_file = file.replace('.frt', '_true.json')
            if os.path.exists(clux_file):
                result.append({package_file_tag: file, clux_file_tag: clux_file})
    return result
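# load_packages expects the test data folder to be laid out like this (the
# package name is illustrative; Test-Files, Index-Data, and the extensions
# come from the code in this file):
#
#   <test_dir>/
#       Test-Files/
#           597000945.frt        <- package to classify
#           597000945_true.json  <- matching ground-truth CLUX file
#           ...
#       Index-Data/              <- support files copied into the index
#
# Packages without a matching *_true.json file are skipped.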
def write_cfg_file(settings_dict, test_settings_file, start_time, files):
    with open(test_settings_file, 'w+') as writer:
        writer.write('Tester.py Settings/Results\n')
        writer.write('Start: %s\n' % start_time)
        writer.write('Ended: %s\n' % datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'))
        writer.write('%s\n' % console_line_break)
        for key, value in settings_dict.items():
            writer.write('%s = %s\n' % (key, value))
        writer.write('%s\nFiles:\n' % console_line_break)
        for key, value in files.items():
            writer.write('    %s: %s\n' % (key, value))
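# write_cfg_file produces a plain-text summary; an abbreviated example with
# illustrative values:
#
#   Tester.py Settings/Results
#   Start: 2017/10/12 17:26:03
#   Ended: 2017/10/12 18:02:41
#   ---------------------------------------- (150 dashes in practice)
#   Library Path = C:\Data\Library
#   Paginate = True
#   ----------------------------------------
#   Files:
#       Classification: C:\Data\Results\2017.10.12\17.26\597000945\classification-results.json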
def make_empty_index_files(new_index_root):
    out_files = ['MaleNames.txt', 'FemaleNames.txt', 'Surnames.txt', 'Phrases.txt']
    for file in out_files:
        file = os.path.join(new_index_root, file)
        with open(file, 'w+') as writer:
            writer.write('')


def copy_index_files(files_to_copy, new_index_root):
    print('Copying index files...')
    for item in files_to_copy:
        folder, name = os.path.split(item)
        new_path = os.path.join(new_index_root, name)
        shutil.copy(item, new_path)
    print('Done copying index files.')


def copy_executables(test_dir):
    global executables
    program_dir = os.environ['SEQUENCELOGICHOME']
    # program_dir = os.path.join(test_dir, 'Programs')
    # os.environ['SEQUENCELOGICHOME'] = program_dir
    program_dir = os.path.join(program_dir, 'bin')
    if not os.path.exists(program_dir):
        os.makedirs(program_dir)
    for name in executables:
        if name != walker_validator:
            original_path = executables[name]
            original_folder, original_name = os.path.split(original_path)
            new_path = os.path.join(program_dir, original_name)
            shutil.copy(original_path, new_path)
            executables[name] = new_path


def new_run_indexer(config_file, lib_path, idx_path, log_dir):
    index_wrapper_path = \
        r'C:\Users\chris\Documents\Code\Git\scorewalker-utils\RunScoreWalker\sequencelogic-run-walker-indexer.py'
    cmd = ['python', index_wrapper_path,
           '--config', config_file,
           '--indexRoot', idx_path,
           '--libRoot', lib_path]
    idx_time = run_process('sequencelogic-run-walker-indexer', cmd, log_dir)
    return idx_time


def run_phrase_maker_on_folder(folder, phrases_root, min_phrase_len, max_phrase_len, log_dir):
    files = []
    lib_dir, doctype = os.path.split(folder)
    dest_file = os.path.join(phrases_root, '%s.phrasecount' % doctype)
    log_file = os.path.join(log_dir, 'phrase-maker-std-err.txt')
    for file in os.listdir(folder):
        file = os.path.join(folder, file)
        if os.path.isfile(file) and file.endswith('.tkn'):
            files.append(file)
    if len(files) > 1:
        run_phrase_maker(files, min_phrase_len, max_phrase_len, dest_file, log_file)
    else:
        print('There are not enough examples of "%s"' % doctype)


def generate_phrases_for_library(library_root, index_root, log_dir,
                                 min_phrase_length=DEF_MIN_PHRASE_SIZE, max_phrase_len=DEF_MAX_PHRASE_SIZE):
    phrase_count_root = os.path.join(index_root, 'Phrase Count Source')
    phrase_file = os.path.join(index_root, 'Phrases.txt')
    if not os.path.exists(phrase_count_root):
        os.mkdir(phrase_count_root)
    run_tokenizer(library_root, index_root, log_dir)
    for folder in os.listdir(library_root):
        folder = folder.replace('\uf028', '')
        folder = os.path.join(library_root, folder)
        if os.path.isdir(folder):
            run_phrase_maker_on_folder(folder, phrase_count_root, min_phrase_length, max_phrase_len, log_dir)
    run_phrase_loader(phrase_count_root, phrase_file, min_phrase_length, log_dir)
    shutil.rmtree(phrase_count_root)


def run_pre_index_tools(test_data_folder, new_index_root, log_file_dir, lib_dir):
    index_data_dir = os.path.join(test_data_folder, 'Index-Data')
    # Make empty files; they will be overwritten!
    make_empty_index_files(new_index_root)
    # Copy the non-empty files to overwrite the empty ones.
    copy_index_files(get_move_files(index_data_dir), new_index_root)
    # Now we can finally generate phrases for the library.
    generate_phrases_for_library(lib_dir, new_index_root, log_file_dir)
    # Make the FirstPageIndex.
    first_page_index = os.path.join(new_index_root, 'FirstPageIndex')
    tmp_idx = os.path.join(new_index_root, 'temp')
    tmp_first_page_index = os.path.join(tmp_idx, 'FirstPageIndex')
    if not os.path.exists(first_page_index):
        os.mkdir(first_page_index)
    if not os.path.exists(tmp_idx):
        os.mkdir(tmp_idx)
    if not os.path.exists(tmp_first_page_index):
        os.mkdir(tmp_first_page_index)
    # Run the term diff.
    run_walker_term_diff(lib_dir, new_index_root, log_file_dir)


def setup_test(test_path, should_build):
    global printer
    load_executables()
    log_dir = os.path.join(test_path, 'logs')
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    test_log = os.path.join(log_dir, 'TestLog.log')
    printer = ConsoleUtils.SLLogger(program_name, test_log)
    sys.stdout = printer
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, author, 150))
    if should_build:
        run_build(log_dir)
    copy_executables(test_path)


def run_package(index_dir, parent_dir, package_path, clux_file, config_file_path,
                lucene_query_cnt=DEF_NUM_TOP_DOCS, conf_to_use=DEF_CONF_THRESHOLD, term_len=DEF_TERM_LENGTH):
    start_time = get_time()
    start_clock = time.time()
    files = get_pkg_file_names(parent_dir)
    package_name = get_package_name(package_path)
    times = []
    print('Testing Package: %s' % package_name)
    log_root = files[logs_root_tag]
    classification_out = files[classification_file_tag]
    tree_walker_out = files[tree_walker_file_tag]
    fp_counter_out = files[fp_counter_file_tag]
    term_walker_out = files[term_walker_file_tag]
    accuracy_out = files[accuracy_file_tag]
    doctype_counts = files[doctype_graph_tag]
    # Get the package settings data to save.
    package_config = os.path.join(parent_dir, test_config_file_name)
    package_data = {'Package File': package_path,
                    'CLUX File': clux_file,
                    'Num TopDocs': lucene_query_cnt,
                    'Term Length': term_len,
                    'Current Minimum Confidence': conf_to_use,
                    'Log Directory': log_root}
    # Run the classifier.
    classification_time = new_run_classifier(index_dir, config_file_path, classification_out, package_path, log_root)
    times.append(classification_time)
    # Validate output.
    validate_time = run_walker_validator(classification_out, log_root)
    times.append(validate_time)
    # Run TreeWalker.
    tree_walker_time = run_tree_walker(clux_file, classification_out, tree_walker_out, log_root)
    times.append(tree_walker_time)
    # Run FalsePositiveCounter.
    fp_counter_time = run_fp_counter([tree_walker_out], fp_counter_out, log_root)
    times.append(fp_counter_time)
    # # Run TermWalker.
    # term_walker_time = run_term_walker(tree_walker_out, classification_out, term_walker_out, log_root)
    # times.append(term_walker_time)
    # Run DoctypeGraph.
    doctype_graph_time = run_doctype_graph(parent_dir, doctype_counts, log_root)
    times.append(doctype_graph_time)
    # make_graphs_from_folder(doctype_counts, log_root)
    # Run Accuracy.
    accuracy_time = run_accuracy(tree_walker_out, accuracy_out, log_root)
    times.append(accuracy_time)
    # Get the total time, save the config data, and return the classification time and graph metadata file.
    total_time = time.time() - start_clock
    write_cfg_file(package_data, package_config, start_time, files)
    print('Finished testing package (%.4f s)' % total_time)
    write_line_break()
    files[classification_time_tag] = classification_time
    return files


def run_packages(index_dir, parent_dir, package_dicts, config_file_path,
                 n_top_docs=DEF_NUM_TOP_DOCS, t_conf=DEF_CONF_THRESHOLD, term_len=DEF_TERM_LENGTH):
    start_time = get_time()
    graph_data_files = []
    graph_tags = []
    package_files = {}
    package_results = {}
    total_classification_time = 0.0
    num_runs = len(package_dicts)
    graph_config = os.path.join(parent_dir, graph_config_file_name)
    test_config = os.path.join(parent_dir, test_config_file_name)
    print('Testing %d packages...' % num_runs)
    for i in range(num_runs):
        print('Testing Package %d/%d' % (i + 1, num_runs))
        file_dict = package_dicts[i]
        package_file = file_dict[package_file_tag]
        clux_file = file_dict[clux_file_tag]
        package_name = get_package_name(package_file)
        run_dir = os.path.join(parent_dir, package_name)
        run_data = run_package(index_dir, run_dir, package_file, clux_file, config_file_path,
                               n_top_docs, t_conf, term_len)
        # Save the data from the run.
        package_results[package_name] = run_data
        total_classification_time += run_data[classification_time_tag]
        graph_tags.append(package_name)
        graph_data_files.append(run_data[accuracy_file_tag])
        package_files[i] = package_file
    # avg_classification_time = total_classification_time / num_runs
    # print('Average Classification time: %.4f seconds' % avg_classification_time)
    make_graph_config(graph_data_files, graph_tags, graph_config)
    total_pages = join_class_error_counts(parent_dir)
    page_classification_rate = total_classification_time / total_pages
    package_classification_rate = total_classification_time / num_runs
    print('Classified %d pages from %d files in %.2fs (%.2f s/page | %.2f s/file)'
          % (total_pages, num_runs, total_classification_time, page_classification_rate,
             package_classification_rate))
    printer.write_line_break()
    test_config_data = {classification_time_tag: total_classification_time,
                        graph_config_file_tag: graph_config,
                        'Total Pages': total_pages,
                        'Average Package Size': round(total_pages / num_runs),
                        'Classification Speed (pages)': '%.2f s/page' % page_classification_rate,
                        'Classification Speed (files)': '%.2f s/file' % package_classification_rate}
    test_files = get_test_files(package_results, 'Package')
    write_cfg_file(test_config_data, test_config, start_time, test_files)
    return test_config_data
def test_changing_confidence(config_file, index_dir, parent_dir, package_dicts, min_conf, max_conf, step_size,
                             lucene_query_cnt=DEF_NUM_TOP_DOCS, term_len=DEF_TERM_LENGTH):
    start_time = get_time()
    current_conf = min_conf
    avg_cls_time = 0.0
    graph_config_files = []
    result_data = {}
    num_runs = 0
    print('Running Test With Confidence Values [%.2f%% - %.2f%%]' % (min_conf, max_conf))
    test_config_file = os.path.join(parent_dir, test_config_file_name)
    while current_conf <= max_conf:
        print('Current Confidence: %.2f%%' % current_conf)
        run_dir = os.path.join(parent_dir, 'Confidence Threshold = %.2f%%' % current_conf)
        graph_config = os.path.join(run_dir, graph_config_file_name)
        graph_config_files.append(graph_config)
        return_data = run_packages(index_dir, run_dir, package_dicts, config_file,
                                   lucene_query_cnt, current_conf, term_len)
        result_data[current_conf] = return_data
        cls_time = return_data[classification_time_tag]
        packages_config = return_data[graph_config_file_tag]
        new_title = '%s With Confidence Threshold %.2f%%' % (DEF_GRAPH_TITLE, current_conf * 100)
        AccuracyGraphSetup.change_title(packages_config, graph_config, new_title)
        num_runs += 1
        avg_cls_time += cls_time
        current_conf += step_size
    avg_cls_time = avg_cls_time / num_runs
    print('Average Classification time: %.4f seconds' % avg_cls_time)
    config_result = {classification_time_tag: avg_cls_time,
                     graph_config_file_tag: graph_config,
                     'Minimum Confidence': '%.2f%%' % (min_conf * 100),
                     'Maximum Confidence': '%.2f%%' % (max_conf * 100)}
    files = get_test_files(result_data, 'Confidence')
    write_cfg_file(config_result, test_config_file, start_time, files)
    return config_result


def make_graphs_from_folder(folder, log_dir):
    graph_dict = {}
    print('Making graphs from config files in "%s"' % folder)
    for file in os.listdir(folder):
        file = os.path.join(folder, file)
        if file.endswith(graph_config_file_name):
            out_file = file.replace(graph_config_file_name, graph_name)
            graph_dict[out_file] = file
    make_graphs(graph_dict, log_dir)


def get_test_files(test_files, new_prefix):
    result = {}
    for key, value in test_files.items():
        for sub_key, sub_value in value.items():
            new_key = '%s %s - %s' % (new_prefix, key, sub_key)
            result[new_key] = sub_value
    return result
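# get_test_files flattens one level of nesting by prefixing the keys, e.g.
# (hypothetical values):
#
#   get_test_files({'597000945': {'Graph': 'a.pdf'}}, 'Package')
#   -> {'Package 597000945 - Graph': 'a.pdf'}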
def get_package_name(package_path):
    par_dir, package_name = os.path.split(package_path)
    # Drop the '.frt' extension.
    package_name = package_name[:-4]
    return package_name


def get_lib_name(lib_path):
    folder, name = os.path.split(lib_path)
    return name


def get_move_files(index_data_dir):
    result = []
    for file in os.listdir(index_data_dir):
        result.append(os.path.join(index_data_dir, file))
    if os.environ.get('SEQUENCELOGICHOME') is not None:
        dest_folder = os.path.join(os.environ['SEQUENCELOGICHOME'], 'SLSync', 'config', 'data')
        for file in result:
            new_file = os.path.join(dest_folder, os.path.split(file)[1])
            if not os.path.exists(dest_folder):
                os.makedirs(dest_folder)
            shutil.copy(file, new_file)
    return result


def get_graph_config_files(package_data):
    result = {}
    for key, value in package_data.items():
        value = str(value)
        if value.endswith(graph_config_file_name):
            out_file = value.replace(graph_config_file_name, graph_name)
            result[out_file] = value
    return result


def do_make_graphs(configs_by_paths, logs_dir):
    print('Making %d graphs...' % len(configs_by_paths))
    graph_time = 0
    for out_file, config in configs_by_paths.items():
        graph_time += run_graph(config, out_file, logs_dir)
    print('Done making Graphs (%.4f s)' % graph_time)


def make_graphs(run_data_dict, log_dir, title=None):
    config_data = get_graph_config_files(run_data_dict)
    if title is not None:
        for out, config in config_data.items():
            AccuracyGraphSetup.change_title(config, config, title)
    do_make_graphs(config_data, log_dir)


def join_class_error_counts(test_out_root):
    types = ['false-negative', 'false-positive', 'incorrect', 'correct']
    fields = ['Correct Type', 'Classified Type', 'Number of Occurrences']
    total_pages = 0
    for t in types:
        file_name = 'global-%s-counts.csv' % t
        result_file = os.path.join(test_out_root, file_name)
        result = {}
        for folder in os.listdir(test_out_root):
            folder = os.path.join(test_out_root, folder)
            if os.path.isdir(folder):
                to_read = os.path.join(folder, '%s-counts.csv' % t)
                if os.path.exists(to_read):
                    with open(to_read) as reader:
                        csv_reader = csv.DictReader(reader)
                        for row in csv_reader:
                            key = '%s>>>%s' % (row['Correct Type'], row['Classified Type'])
                            count = int(row['Number of Occurrences'])
                            if key not in result.keys():
                                result[key] = count
                            else:
                                result[key] += count
        result = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
        with open(result_file, 'w+', newline='') as writer:
            csv_file = csv.DictWriter(writer, fieldnames=fields)
            csv_file.writeheader()
            for key, count in result:
                correct_doctype, classified_doctype = key.split('>>>')[:2]
                total_pages += count
                csv_file.writerow({'Correct Type': correct_doctype,
                                   'Classified Type': classified_doctype,
                                   'Number of Occurrences': count})
    return total_pages
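# join_class_error_counts merges the per-package count CSVs into one global
# CSV per status. For example (hypothetical rows), two packages whose
# incorrect-counts.csv files each contain
#
#   Correct Type,Classified Type,Number of Occurrences
#   Invoice,Receipt,3        (package one)
#   Invoice,Receipt,2        (package two)
#
# are combined into a single global-incorrect-counts.csv row:
#
#   Invoice,Receipt,5
#
# The returned total is the page count summed across all four statuses.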
def run_test_on_lib(lib_dir, output_paths, test_in_root, config_file_path,
                    min_phrase_len=DEF_MIN_PHRASE_SIZE, max_phrase_len=DEF_MAX_PHRASE_SIZE):
    start_time = get_time()
    idx_root = output_paths[idx_root_tag]
    test_out = output_paths[test_root_tag]
    logs_out = output_paths[logs_root_tag]
    packages = load_packages(test_in_root)
    idx_data_root = os.path.join(test_in_root, 'Index-Data')
    get_move_files(idx_data_root)
    new_run_indexer(config_file_path, lib_dir, idx_root, logs_out)
    config_out = os.path.join(test_out, test_config_file_name)
    config_info = {'Library Path': lib_dir,
                   'Library Name': get_lib_name(lib_dir),
                   'Test Data Source': test_in_root,
                   'Test Data Result': test_out,
                   'Minimum Phrase Length': min_phrase_len,
                   'Maximum Phrase Length': max_phrase_len,
                   'Minimum Phrase Doctype Coverage': '80%',
                   'Paginate': paginate,
                   'Minimum OCR Confidence': '60%',
                   'Confidence Threshold': '%.2f' % DEF_CONF_THRESHOLD,
                   'Score Ratio': 'Yes'}
    write_line_break()
    out_path = test_out
    # Run the packages.
    run_data = run_packages(idx_root, out_path, packages, config_file_path)
    config_info['Average Classification Time'] = run_data[classification_time_tag]
    del run_data[classification_time_tag]
    # Make the graphs!
    make_graphs(run_data, logs_out, 'Confidence Threshold: %.2f' % DEF_CONF_THRESHOLD)
    write_cfg_file(config_info, config_out, start_time, run_data)


def get_time():
    return datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')


def main(lib_dir, output_directory, test_data_root, config_file_loc, do_build):
    # Do some set up.
    run_dir = get_root_folder(output_directory)
    os.environ['SEQUENCELOGICHOME'] = os.path.join(run_dir, 'Programs')
    setup_test(run_dir, do_build)
    # Get the output information.
    output_info = get_idx_names(run_dir)
    # Perform a test.
    run_test_on_lib(lib_dir, output_info, test_data_root, config_file_loc)
    # Do any necessary cleanup.
    run_clean_up()
    print('\nTest completed. Exiting...')


# This is where we call the main method from.
if __name__ == '__main__':
    # (A commented-out scratch block used to live here: it regenerated the
    # global accuracy graph for one 2017.10.12 run by feeding that run's
    # graph-data.txt files to make_graph_config() and do_make_graphs()
    # directly.)
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-t', '--test_dir', required=True,
                               help='The path to the directory containing test data.')
    required_args.add_argument('-o', '--out_dir', required=True,
                               help='The location to write the test data to.')
    required_args.add_argument('-l', '--library_dir', required=True,
                               help='The path to the library root folder.')
    required_args.add_argument('-c', '--config', required=True,
                               help='The path to the config file.')
    optional_args.add_argument('-b', '--build', required=False, action='store_true',
                               help='Use if you want to run a build before testing.')
    optional_args.add_argument('-h', '--help', action='help',
                               help='Prints the help message.')
    # Get the arguments.
    args = parser.parse_args()
    library_dir = args.library_dir
    output_dir = args.out_dir
    test_data_dir = args.test_dir
    build = args.build
    config_file = args.config
    # os.environ['UseFirstPages'] = 'true'
    # Run the program.
    main(library_dir, output_dir, test_data_dir, config_file, build)
    # printer.close()