610 lines
28 KiB
Python
610 lines
28 KiB
Python
"""
|
|
.. role:: py(code)
|
|
:language: python
|
|
|
|
Information
|
|
-----------
|
|
|
|
This is a tool for comparing the results from our classification engine against the CLUX results from an
|
|
already processed document. This tool produces a CSV file which contains several useful pieces of information; the
|
|
most important being the determination of which fields produced by our engine are false positives (returned a match
|
|
with good confidence which is incorrect). This tool is used for improvements in our engines as well as future
|
|
improvements in the library.
|
|
|
|
.. moduleauthor:: Chris Diesch <cdiesch@sequencelogic.net>
|
|
.. moduleauthor:: Dave Gustafson <daveg@sequencelogic.net>
|
|
|
|
|
|
Commandline Usage
|
|
-----------------
|
|
Usage: ``TreeWalker.py [-h] -c/--clux_data {CLUX_FILE} -w/--walker_data {WALKER_FILE} -o/--output {OUT_FILE}``
|
|
|
|
Required Arguments:
|
|
``-c CLUX_FILE, --clux_data CLUX_FILE``
|
|
Where ``CLUX_FILE`` is the path to the CLUX output file containing the "correct" classification results.
|
|
|
|
``-w WALKER_FILE, --walker_data WALKER_FILE``
|
|
Where ``WALKER_FILE`` is the path to the engine's output file.
|
|
|
|
``-o OUT_FILE, --output OUT_FILE``
|
|
Where ``OUT_FILE`` is the path to save the results to.
|
|
|
|
Optional Arguments:
|
|
``-h, --help``
|
|
Prints the help message.
|
|
|
|
*NOTE:* If there is a file at ``OUT_FILE`` a warning will be printed and the file will be overwritten.
|
|
|
|
|
|
Python Module Usage:
|
|
--------------------
|
|
"""
|
|
import argparse
|
|
import json
|
|
import os
|
|
import csv
|
|
import sys
|
|
|
|
import ConsoleUtils
|
|
|
|
# Program metadata. These values are passed to ConsoleUtils.get_header() in the __main__ section to
# build the console banner, and program_name / program_description also configure the argument parser.
program_name = 'TreeWalker'
program_description = 'A program for comparing new classification engine techniques against the correct data.'

author = 'Chris Diesch & Dave Gustafson'
build_date = '2017.07.27'
program_version = '2.0.0'

# Module-level parser so that check_args() can print the help text on a fatal error.
# The built-in help is disabled (add_help=False) because '-h/--help' is registered explicitly in the
# __main__ section, inside the "Optional" argument group.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
|
|
|
|
FALSE_POSITIVE = 'False Positive'
"""
The tag used for false positive results.
"""

FALSE_NEGATIVE = 'False Negative'
"""
The tag used for false negative results.
"""

CORRECT = 'Correct'
"""
The tag used for correct results.
"""

INCORRECT = 'Incorrect'
"""
The tag used for incorrect results.
"""

SKIPPED = 'Skipped'
"""
The tag used for skipped results.
"""

YES = 'Yes'
"""
The tag used for yes in the resulting output.
"""

NO = 'No'
"""
The tag used for no in the resulting output.
"""

FIELD_NAMES = ['Page', 'Similar Pages', 'CLUX Result', 'ScoreWalker Result', 'Raw SW Result', 'Contained',
               'Library Page', 'Was Classified', 'Confidence', 'Paginated', 'Walker Break Point', 'CLUX Break Point',
               'Status', 'Human Review Required', 'Score', 'Confidence Threshold', 'Minimum Good Confidence',
               'OCR Confidence']
"""
The ordered list of CSV column names written by :meth:`save_csv`. Every key of the per-page dicts produced by
:meth:`compare_results` appears in this list.
"""
|
|
|
|
# Path of the ScoreWalker configuration file holding the doctype "containers" table.
# The historical developer path is kept as the default for backward compatibility; set the
# TREEWALKER_CONTAINER_CONFIG environment variable to use a different file.
f = os.environ.get(
    'TREEWALKER_CONTAINER_CONFIG',
    r'C:\Users\chris\Documents\Code\Tests\KMeans\L3config\SLI Standard Mortgage Library-CONTAINERS_2017-11-07.conf.json')


def load_containers(config_path):
    """
    Loads the container definitions from a ScoreWalker configuration file.

    Args:
        ``config_path`` -- ``str`` The path to the JSON configuration file.

    Returns:
        The value stored under ``["CLASSIFYWALKER"]["containers"]`` in the file. :meth:`doctypes_are_same` iterates
        it as a sequence of dicts with a ``"doctype"`` key and a ``"contains"`` list. If the file cannot be opened
        or read, a warning is printed and an empty ``list`` is returned so that importing this module does not fail;
        doctypes are then only matched by name.

    Raises:
        ``KeyError`` if the file is readable but lacks the ``"CLASSIFYWALKER"`` or ``"containers"`` key.
        ``ValueError`` if the file does not contain valid JSON.
    """
    try:
        with open(config_path, 'r') as reader:
            config = json.load(reader)
    except OSError as err:
        # A missing/unreadable config used to abort the import of the whole module.
        print('Warning: Could not read container config "%s" (%s); doctypes will only be matched by name.'
              % (config_path, err))
        return []

    return config['CLASSIFYWALKER']['containers']


containers = load_containers(f)
|
|
|
|
|
|
def load_clux_data(clux_file):
    """
    Loads the data from the CLUX output file at the given path.

    The file is read as JSON. The list of documents is taken from the top-level ``"documents"`` key, or from
    ``"sdc"`` when ``"documents"`` is absent. Pages are numbered consecutively across all documents in file order.

    Args:
        ``clux_file`` -- ``str`` The path to the CLUX output to load.

    Returns:
        ``dict`` A dict with integer keys corresponding to the page index in the package (0-based), and values which
        are dicts with the following key/value pairs:

        - ``"docindex"`` -- ``int`` The index of the document in the package processed by CLUX.
        - ``"pageindex"`` -- ``int`` The index of the current page in the package.
        - ``"doctype"`` -- ``str`` The name of the doctype the given page is in.
          *NOTE:* A document may have individual pages which are misclassified, but during assembly in SLEDS they
          can be corrected. These were still misclassified by the engine.
        - ``"break"`` -- ``bool`` :py:`True` if the page is the first page of its document.
        - ``"matchtype"`` -- ``str`` The ``"ldt"`` value of the page's first match, or ``"Unknown"`` if the page has
          no matches.

    Raises:
        ``KeyError`` if neither ``"documents"`` nor ``"sdc"`` is present, or a document/page lacks an expected key.
    """
    print('Loading data from file %s' % clux_file)
    with open(clux_file) as reader:
        json_data = json.load(reader)

    try:
        json_docs = json_data['documents']
    except KeyError:
        json_docs = json_data['sdc']

    result = {}
    page_num = 0  # Runs across the whole package, not per document.
    for doc_num, doc in enumerate(json_docs):
        doctype = doc['doctype']
        for sub_doc_page, page in enumerate(doc['pages']):
            page_matches = page['matches']
            result[page_num] = {
                'docindex': doc_num,
                'pageindex': page_num,
                'doctype': doctype,
                # The first page of every document is a break point.
                'break': sub_doc_page == 0,
                'matchtype': page_matches[0]['ldt'] if page_matches else 'Unknown',
            }
            page_num += 1

    return result
|
|
|
|
|
|
def load_walker_data(walker_file):
    """
    Loads the data from the classification engine's output.

    Every entry of the file's ``"documentPages"`` list first receives a record of default values; each entry of
    ``"associations"`` then overwrites the record of the page named by its ``"documentPageIndex"``.

    Args:
        ``walker_file`` -- ``str`` The path to the engine's output file.

    Returns:
        ``dict`` A dict keyed by each page's ``"documentPageIndex"``, and values which are dicts with the following
        key/value pairs (defaults, used when a page has no association, are given in parentheses):

        - ``"pageindex"`` -- The index of the page in the package.
        - ``"confidence"`` -- The association's ``"conf"`` value (``0``).
        - ``"luceneResult"`` -- ``str`` ``"yes"`` if the page has an association, ``"no"`` otherwise.
        - ``"doctype"`` -- The association's ``"topDoctype"`` value (``"None"``).
        - ``"extractAs"`` -- The association's ``"extractAs"`` value (``"None"``).
        - ``"contained"`` -- ``bool`` :py:`True` if ``"topDoctype"`` differs from ``"extractAs"`` (:py:`False`).
        - ``"score"`` -- The association's ``"score"`` value (``0``).
        - ``"confThreshold"`` -- The association's ``"confThreshold"`` value (``0.0``).
        - ``"minConf"`` -- Also taken from the association's ``"confThreshold"`` value (``0.0``).
        - ``"libDoc"`` -- The association's ``"imagePath"`` value (``"None"``).
        - ``"goodConf"`` -- The association's ``"goodConf"`` value (:py:`False`).
        - ``"break"`` -- The association's ``"breakPage"`` value (:py:`False`).
        - ``"paginated"`` -- The association's ``"paginated"`` value (:py:`False`).
        - ``"ocrConf"`` -- The association's ``"ocrConfidence"`` value (``0.0``).
        - ``"similarPages"`` -- ``str`` Comma separated 1-based page numbers built from the page's
          ``"similarPages"`` entries (``"None"`` when there are none).
        - ``"matches"`` -- The association's ``"matches"`` value; only present when the page has an association.

    Raises:
        ``KeyError`` if an expected key is missing or an association refers to a page which is not listed in
        ``"documentPages"``.
    """
    print('Loading data from file %s' % walker_file)
    with open(walker_file) as reader:
        json_data = json.load(reader)

    result = {}
    doc_pages = {}  # documentPageIndex -> raw documentPages entry

    # Load all of the doc pages in with default values...
    for doc in json_data['documentPages']:
        page_index = doc['documentPageIndex']
        doc_pages[page_index] = doc
        result[page_index] = {'pageindex': page_index,
                              'minConf': 0.0,
                              'confidence': 0,
                              'confThreshold': 0.0,
                              'luceneResult': 'no',
                              'libDoc': 'None',
                              'doctype': 'None',
                              'score': 0,
                              'goodConf': False,
                              'paginated': False,
                              'ocrConf': 0.0,
                              'break': False,
                              'similarPages': 'None',
                              'contained': False,
                              'extractAs': 'None'}

    # Overwrite default data with found data.
    for assoc in json_data['associations']:
        page_index = assoc['documentPageIndex']
        page_data = result[page_index]
        page_data.update({
            'libDoc': assoc['imagePath'],
            'matches': assoc['matches'],
            'confidence': assoc['conf'],
            'luceneResult': 'yes',
            'doctype': assoc['topDoctype'],
            'goodConf': assoc['goodConf'],
            'score': assoc['score'],
            'confThreshold': assoc['confThreshold'],
            'paginated': assoc['paginated'],
            'ocrConf': assoc['ocrConfidence'],
            'break': assoc['breakPage'],
            # NOTE(review): minConf mirrors confThreshold here -- confirm this is the intended source field.
            'minConf': assoc['confThreshold'],
            'extractAs': assoc['extractAs'],
            'contained': assoc['topDoctype'] != assoc['extractAs'],
        })

        # Look the page up by its documentPageIndex; its position in the documentPages list is not
        # guaranteed to be the same number.
        sim_pages = doc_pages[page_index]['similarPages']
        if sim_pages:
            page_data['similarPages'] = ', '.join('{}'.format(p['pageIndex'] + 1) for p in sim_pages)

    return result
|
|
|
|
|
|
def compare_results(clux_results, walker_results):
    """
    Produces the output of this program. This is where the known results from the CLUX output are compared against
    the predicted results from the engine's output.

    Pages ``0`` to ``len(clux_results) - 1`` are compared, so both dicts must contain every one of those keys.

    Args:
        ``clux_results`` -- ``dict`` The dict returned by :meth:`load_clux_data`.

        ``walker_results`` -- ``dict`` The dict returned by :meth:`load_walker_data`.

    Returns:
        ``dict`` A dict with integer keys corresponding to the index of the page in the package (0-based) and values
        which are dicts with the following key/value pairs:

        - ``"Page"`` -- ``int`` The page number (1-based).
        - ``"CLUX Result"`` -- ``str`` The doctype from CLUX.
        - ``"ScoreWalker Result"`` -- ``str`` The predicted doctype (the walker page's ``"doctype"``).
        - ``"Raw SW Result"`` -- The walker page's ``"extractAs"`` value.
        - ``"Contained"`` -- The walker page's ``"contained"`` value.
        - ``"Was Classified"`` -- ``str`` yes or no if Lucene classified the page.
        - ``"Confidence"`` -- The computed confidence of the predicted result.
        - ``"Human Review Required"`` -- ``bool`` :py:`True` if the page does not have good confidence.
        - ``"Score"`` -- The walker page's ``"score"`` value.
        - ``"Library Page"`` -- The path to the matched page in the library.
        - ``"Confidence Threshold"`` -- The walker page's ``"confThreshold"`` value.
        - ``"Minimum Good Confidence"`` -- The walker page's ``"minConf"`` value.
        - ``"OCR Confidence"`` -- The walker page's ``"ocrConf"`` value.
        - ``"Similar Pages"`` -- ``str`` The walker page's ``"similarPages"`` value.
        - ``"Walker Break Point"`` -- ``str`` ``"Yes"`` if ScoreWalker marked the page as a break point, else
          ``""``.
        - ``"CLUX Break Point"`` -- ``str`` ``"Yes"`` if the page is a break point in the CLUX data, else ``""``.
        - ``"Paginated"`` -- ``str`` ``"Yes"`` or ``"No"``, from the walker page's ``"paginated"`` value.
        - ``"Status"`` -- ``str`` A value corresponding to the accuracy of classification:

          - ``"Skipped"`` If the doctypes differ and either side is an unknown, illegible or blank page doctype.
          - ``"Correct"`` If the predicted result is correct and has good confidence.
          - ``"False Negative"`` If the predicted result is correct with low confidence.
          - ``"False Positive"`` If the predicted result is incorrect and has good confidence.
          - ``"Incorrect"`` If the predicted result is incorrect and has low confidence.

    Raises:
        ``KeyError`` if a page index is missing from either input dict.
    """
    # Lower-cased doctypes which cause a non-matching page to be skipped rather than scored.
    walker_skip_types = ('unknown', 'unknown - illegible', 'blank page')
    clux_skip_types = ('unknown', 'illegible', 'blank page')

    result = {}
    num_pages = len(clux_results)
    for page_idx in range(num_pages):
        print('Comparing CLUX to walker (Page %d/%d).' % (page_idx + 1, num_pages))
        walker_page = walker_results[page_idx]
        clux_page = clux_results[page_idx]

        clux_doctype = clux_page['doctype']
        walker_doctype = walker_page['doctype']
        good_conf = walker_page['goodConf']

        # A page counts as matching if either the top doctype or the "extract as" doctype agrees with CLUX.
        are_same = (doctypes_are_same(clux_doctype, walker_doctype) or
                    doctypes_are_same(clux_doctype, walker_page['extractAs']))

        # Deal with status.
        if not are_same and (walker_doctype.lower() in walker_skip_types or
                             clux_doctype.lower() in clux_skip_types):
            status = SKIPPED
        elif are_same:
            status = CORRECT if good_conf else FALSE_NEGATIVE
        else:
            status = FALSE_POSITIVE if good_conf else INCORRECT

        result[page_idx] = {
            'CLUX Result': clux_doctype,
            'ScoreWalker Result': walker_doctype,
            'Raw SW Result': walker_page['extractAs'],
            'Page': page_idx + 1,
            'Was Classified': walker_page['luceneResult'],
            'Confidence': walker_page['confidence'],
            'Human Review Required': not good_conf,
            'Score': walker_page['score'],
            'Library Page': walker_page['libDoc'],
            'Confidence Threshold': walker_page['confThreshold'],
            'Minimum Good Confidence': walker_page['minConf'],
            'OCR Confidence': walker_page['ocrConf'],
            'Similar Pages': walker_page['similarPages'],
            'Contained': walker_page['contained'],
            # Break points are flagged with YES and otherwise left blank.
            'Walker Break Point': YES if walker_page['break'] else '',
            'CLUX Break Point': YES if clux_page['break'] else '',
            'Paginated': YES if walker_page['paginated'] else NO,
            'Status': status,
        }

    return result
|
|
|
|
|
|
def doctypes_are_same(clux_doctype, walker_doctype, container_defs=None):
    """
    Determines if two doctypes are the same. Since currently there is a difference between the doctypes used by our
    engines and those that were used by humans, we need to do a small comparison.

    All comparisons are case-insensitive. The doctypes are considered equivalent when:

    - the engine returned ``"UNKNOWN - ILLEGIBLE"`` and CLUX returned ``"Illegible"``, or
    - both names are equal, or
    - the engine's doctype is listed in the ``"contains"`` list of a container whose ``"doctype"`` is the CLUX
      doctype.

    Args:
        ``clux_doctype`` -- ``str`` The name of the doctype returned by CLUX.

        ``walker_doctype`` -- ``str`` The name of the doctype returned by the engine.

        ``container_defs`` -- ``list(dict)`` Optional container definitions, each with a ``"doctype"`` name and a
        ``"contains"`` list of doctype names. Defaults to the module level ``containers``.

    Returns:
        ``bool`` :py:`True` if the two doctypes are equivalent, :py:`False` otherwise.
    """
    clux_name = clux_doctype.lower()
    walker_name = walker_doctype.lower()

    # The engine and CLUX use different names for illegible pages.
    if walker_name == 'unknown - illegible' and clux_name == 'illegible':
        return True

    if container_defs is None:
        container_defs = containers

    valid_types = {clux_name}
    for container in container_defs:
        if container['doctype'].lower() == clux_name:
            valid_types.update(contained.lower() for contained in container['contains'])

    return walker_name in valid_types
|
|
|
|
|
|
def save_csv(file_path, page_match_data):
    """
    Writes the comparison results to a CSV file.

    The header row is :attr:`FIELD_NAMES`; one row is written per value of ``page_match_data``, in the dict's
    iteration order. An existing file at ``file_path`` is overwritten.

    Args:
        ``file_path`` -- ``str`` The path to the file to save.

        ``page_match_data`` -- ``dict`` The dict returned by :meth:`compare_results`.

    Returns:
        ``None``
    """
    print('Saving file %s' % file_path)
    with open(file_path, 'w+', newline='') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=FIELD_NAMES)
        csv_writer.writeheader()
        # Only the row dicts are needed; the page-index keys are not written.
        csv_writer.writerows(page_match_data.values())
|
|
|
|
|
|
def load_csv(file_path):
    """
    Loads a TreeWalker file from the given path.

    Args:
        ``file_path`` -- ``str`` The path to the file to load.

    Returns:
        ``dict`` A dict of the rows in the file where keys are the value of each row's ``"Page"`` column (a
        ``str``, exactly as stored in the file) and values are dicts with key/value pairs corresponding to the
        header row key, and column value for the given row. If several rows share a ``"Page"`` value the last one
        wins.

    Raises:
        ``KeyError`` if the file has no ``"Page"`` column.
    """
    # newline='' lets the csv module do its own newline handling, so line breaks embedded in quoted
    # fields are read back unchanged.
    with open(file_path, newline='') as in_file:
        return {row['Page']: row for row in csv.DictReader(in_file)}
|
|
|
|
|
|
def main(clux_data, walker_data, out_file):
    """
    The main entry point of the program. This will compare the files at ``clux_data`` and ``walker_data`` and write
    the result to ``out_file``.

    Args:
        ``clux_data`` -- ``str`` The path to the output from CLUX.

        ``walker_data`` -- ``str`` The path to the output from the classification engine.

        ``out_file`` -- ``str`` The path to the output file to save.

    Returns:
        ``int`` The status of the program; ``0`` when the comparison was written. Failures propagate as
        exceptions from the loading, comparing and saving steps.
    """
    clux_pages = load_clux_data(clux_data)
    walker_pages = load_walker_data(walker_data)
    match_info = compare_results(clux_pages, walker_pages)
    save_csv(out_file, match_info)
    return 0
|
|
|
|
|
|
def check_args(clux_file, walker_file, out_file):
    """
    Makes sure arguments are valid before running the program.

    Args:
        ``clux_file`` -- ``str`` The path to the output from CLUX.

        ``walker_file`` -- ``str`` The path to the output from the engine.

        ``out_file`` -- ``str`` The path to the file to save the results to.

    Returns:
        ``None``

    This function will cause the program to exit with ``status=-1`` (after printing the help message) if the given
    arguments are invalid. Arguments are considered invalid ``iff``:

    - There is no file at ``walker_file``.
    - There is no file at ``clux_file``.

    A path which exists but is not a regular file (e.g. a directory) is also rejected. If something already exists
    at ``out_file`` only a warning is printed.
    """
    fatal_error = False

    # isfile() rather than exists(): a directory cannot be opened as an input file later on.
    if not os.path.isfile(walker_file):
        print('Error: Walker file does not exist: %s' % walker_file)
        fatal_error = True

    if not os.path.isfile(clux_file):
        print('Error: CLUX file does not exist: %s' % clux_file)
        fatal_error = True

    if os.path.exists(out_file):
        print('Warning: File exists, and will be overwritten: %s' % out_file)

    if fatal_error:
        parser.print_help()
        print('Encountered a fatal error, Exiting...')
        # sys.exit() is always available; the bare exit() builtin is only injected by the site module.
        sys.exit(-1)
|
|
|
|
|
|
if __name__ == '__main__':
    # From here on everything written with print() goes through the project's console printer.
    # NOTE(review): SLPrinter presumably prefixes each line with the program name -- confirm in ConsoleUtils.
    printer = ConsoleUtils.SLPrinter(program_name)
    sys.stdout = printer
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, author, 80))

    # Help is registered by hand (the parser was built with add_help=False) so that it shows up in the
    # "Optional" group.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')

    for short_flag, long_flag, help_text in (('-c', '--clux_data', 'The output from CLUX.'),
                                             ('-w', '--walker_data', 'The output from ScoreWalker.'),
                                             ('-o', '--output', 'The path to save the output to.')):
        required_args.add_argument(short_flag, long_flag, required=True, help=help_text)

    optional_args.add_argument('-h', '--help', action='help', help='Prints the help message.')

    args = parser.parse_args()
    clux, walker, output = args.clux_data, args.walker_data, args.output

    print('Output file from classification to compare: "%s"' % walker)
    print('Output file from CLUX to compare: "%s"' % clux)
    print('Saving comparison report to: "%s"' % output)

    # Exits the program if either input file is missing.
    check_args(clux, walker, output)

    main(clux, walker, output)
|