Sleds/scorewalker-utils/TreeWalker/TreeWalker.py

610 lines
28 KiB
Python
Raw Normal View History

2025-03-13 21:28:38 +00:00
"""
.. role:: py(code)
:language: python
Information
-----------
This is a tool for comparing the results from our classification engine results against the CLUX results from an
already processed document. This tool produces a CSV file which contains several useful pieces of information; the
most important being the determination of which fields produced by our engine are false positives (returned a match
with good confidence which is incorrect). This tool is used for improvements in our engines as well as future
improvements in the library.
.. moduleauthor:: Chris Diesch <cdiesch@sequencelogic.net>
.. moduleauthor:: Dave Gustafson <daveg@sequencelogic.net>
Commandline Usage
-----------------
Usage: ``TreeWalker.py [-h] [-c, --clux_data] {CLUX_FILE} [-w, --walker_data] {WALKER_FILE} [-o, --output] {OUT_FILE}``
Required Arguments:
``-c CLUX_FILE, --clux_data CLUX_FILE``
Where ``CLUX_FILE`` is the path to the CLUX output file containing the "correct" classification results.
``-w WALKER_FILE, --walker_data WALKER_FILE``
Where ``WALKER_FILE`` is the path to the engine's output file.
``-o OUT_FILE, --output OUT_FILE``
Where ``OUT_FILE`` is the path to save the results to.
Optional Arguments:
``-h, --help``
Prints the help message.
*NOTE:* If there is a file at ``OUT_FILE`` a warning will be printed and the file will be overwritten.
Python Module Usage:
--------------------
"""
import argparse
import json
import os
import csv
import sys
import ConsoleUtils
# Program metadata used for the console banner and the argparse help text.
program_name = 'TreeWalker'
program_description = 'A program for comparing new classification engine techniques against the correct data.'
author = 'Chris Diesch & Dave Gustafson'
build_date = '2017.07.27'
program_version = '2.0.0'
# Shared parser; the actual arguments are registered in the __main__ block below.
# add_help=False so that -h/--help can be placed in the "Optional" group manually.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
FALSE_POSITIVE = 'False Positive'
"""
The False used for false positive results.
"""
FALSE_NEGATIVE = 'False Negative'
"""
The tag used for false negative results.
"""
CORRECT = 'Correct'
"""
The tag used for correct results.system
"""
INCORRECT = 'Incorrect'
"""
The tag used for incorrect results.
"""
SKIPPED = 'Skipped'
"""
The tag used for skipped results.
"""
YES = 'Yes'
"""
The tag used for yes in the resulting output.
"""
NO = 'No'
"""
The tag used for no in the resulting output.
"""
FIELD_NAMES = ['Page', 'Similar Pages', 'CLUX Result', 'ScoreWalker Result', 'Raw SW Result', 'Contained',
'Library Page', 'Was Classified', 'Confidence', 'Paginated', 'Walker Break Point', 'CLUX Break Point',
'Status', 'Human Review Required', 'Score', 'Confidence Threshold', 'Minimum Good Confidence',
'OCR Confidence']
"""
The list of field names used when generating the output in :meth:`compare_results`.
"""
# Library container configuration, loaded at import time.
# NOTE(review): this is a hard-coded absolute path -- importing this module
# fails outright on any machine where the file is absent.  The
# TREEWALKER_CONFIG environment variable can now override the default
# location (default behavior is unchanged).
f = os.environ.get(
    'TREEWALKER_CONFIG',
    r'C:\Users\chris\Documents\Code\Tests\KMeans\L3config\SLI Standard Mortgage Library-CONTAINERS_2017-11-07.conf.json')
with open(f, 'r') as reader:
    data = json.load(reader)
# Only the container definitions are needed (see doctypes_are_same); drop the rest.
containers = data['CLASSIFYWALKER']['containers']
del data
def load_clux_data(clux_file):
    """
    Loads the data from the CLUX output file at the given path.
    Args:
        ``clux_file`` -- ``str`` The path to the CLUX output to load.
    Returns:
        ``dict`` A dict with integer keys corresponding to the page index in the document (0-based), and values which
        are dicts with the following key/value pairs:
    +------------------+----------------------------------------------------------------------------------------------+
    | Key              | Value ``type``                                                                               |
    +==================+==============================================================================================+
    | ``"doctype"``    | The name of the doctype the given page is in. ``str``                                        |
    |                  |                                                                                              |
    |                  | *NOTE:* A document may have individual pages which are misclassified, but during assembly    |
    |                  | in SLEDS they can be corrected. These were still misclassified by the engine.                |
    +------------------+----------------------------------------------------------------------------------------------+
    | ``"docindex"``   | The index of the document in the package processed by CLUX. ``int``                          |
    +------------------+----------------------------------------------------------------------------------------------+
    | ``"pageindex"``  | The index of the current page in the package. ``int``                                        |
    +------------------+----------------------------------------------------------------------------------------------+
    | ``"break"``      | If this page is the first page of its document. ``bool``                                     |
    +------------------+----------------------------------------------------------------------------------------------+
    | ``"matchtype"``  | The name of the doctype the page matched ('Unknown' when there are no matches). ``str``      |
    +------------------+----------------------------------------------------------------------------------------------+
    """
    result = {}
    page_num = 0
    print('Loading data from file %s' % clux_file)
    with open(clux_file) as reader:
        json_data = json.load(reader)
    try:
        json_docs = json_data['documents']
    except KeyError:
        # Some CLUX outputs nest the document list under 'sdc' instead.
        json_docs = json_data['sdc']
    for doc_num, doc in enumerate(json_docs):
        doctype = doc['doctype']
        for sub_doc_page, page in enumerate(doc['pages']):
            page_data = {'docindex': doc_num,
                         'pageindex': page_num,
                         'doctype': doctype,
                         # The first page of each document is a sub-document break point.
                         'break': sub_doc_page == 0}
            page_matches = page['matches']
            # With no matches, the matched doctype for the page is unknown.
            page_data['matchtype'] = page_matches[0]['ldt'] if page_matches else 'Unknown'
            result[page_num] = page_data
            page_num += 1
    return result
def load_walker_data(walker_file):
    """
    Loads the data from the classification engine's output.
    Args:
        ``walker_file`` -- ``str`` The path to the engine's output file.
    Returns:
        ``dict`` A dict with integer keys corresponding to the page index in the document (0-based), and values which
        are dicts with the following key/value pairs:
    +---------------------+---------------------------------------------------------------------------+
    | Key                 | Value ``type``                                                            |
    +=====================+===========================================================================+
    | ``"pageindex"``     | The index of the page in the package. ``int``                             |
    +---------------------+---------------------------------------------------------------------------+
    | ``"confidence"``    | The computed confidence score from the engine. ``float``                  |
    +---------------------+---------------------------------------------------------------------------+
    | ``"luceneResult"``  | yes or no if Lucene returned a result or not. ``str``                     |
    +---------------------+---------------------------------------------------------------------------+
    | ``"doctype"``       | The name of the doctype returned by the classification engine. ``str``    |
    +---------------------+---------------------------------------------------------------------------+
    | ``"score"``         | The score of the classified page. ``float``                               |
    +---------------------+---------------------------------------------------------------------------+
    | ``"confThreshold"`` | The confidence threshold used by ScoreWalker. ``float``                   |
    +---------------------+---------------------------------------------------------------------------+
    | ``"libDoc"``        | The path to the library element matched. ``str``                          |
    +---------------------+---------------------------------------------------------------------------+
    | ``"minConf"``       | The minimum Confidence for this match to be "good". ``float``             |
    +---------------------+---------------------------------------------------------------------------+
    | ``"goodConf"``      | If the page has good confidence or not. ``bool``                          |
    +---------------------+---------------------------------------------------------------------------+
    | ``"break"``         | If this page is a break page determined by ScoreWalker. ``bool``          |
    +---------------------+---------------------------------------------------------------------------+
    | ``"similarPages"``  | The 1-based indexes of copies of this page in the document. ``str``       |
    +---------------------+---------------------------------------------------------------------------+
    | ``"paginated"``     | If this page was paginated by ScoreWalker or not. ``bool``                |
    +---------------------+---------------------------------------------------------------------------+
    | ``"ocrConf"``       | The confidence of the OCR engine for this page. ``float``                 |
    +---------------------+---------------------------------------------------------------------------+
    | ``"contained"``     | If the page was reported under a container doctype. ``bool``              |
    +---------------------+---------------------------------------------------------------------------+
    | ``"extractAs"``     | The raw doctype the engine extracted the page as. ``str``                 |
    +---------------------+---------------------------------------------------------------------------+
    """
    result = {}
    print('Loading data from file %s' % walker_file)
    with open(walker_file) as reader:
        json_data = json.load(reader)
    # Seed every page with defaults so pages that have no association entry
    # still show up (as unclassified) in the final report.
    for doc in json_data['documentPages']:
        page_index = doc['documentPageIndex']
        result[page_index] = {'pageindex': page_index,
                              'minConf': 0.0,
                              'confidence': 0,
                              'confThreshold': 0.0,
                              'luceneResult': 'no',
                              'libDoc': 'None',
                              'doctype': 'None',
                              'score': 0,
                              'goodConf': False,
                              'paginated': False,
                              'ocrConf': 0.0,
                              'break': False,
                              'similarPages': 'None',
                              'contained': False,
                              'extractAs': 'None'}
    # Overwrite the default data with what the engine actually produced.
    for assoc in json_data['associations']:
        page_index = assoc['documentPageIndex']
        page_data = result[page_index]
        page_data['libDoc'] = assoc['imagePath']
        page_data['matches'] = assoc['matches']
        page_data['confidence'] = assoc['conf']
        page_data['luceneResult'] = 'yes'
        page_data['doctype'] = assoc['topDoctype']
        page_data['goodConf'] = assoc['goodConf']
        page_data['score'] = assoc['score']
        page_data['confThreshold'] = assoc['confThreshold']
        page_data['paginated'] = assoc['paginated']
        page_data['ocrConf'] = assoc['ocrConfidence']
        page_data['break'] = assoc['breakPage']
        # NOTE(review): minConf mirrors confThreshold here; if the engine output
        # carries a separate minimum-confidence field this may be a copy/paste bug.
        page_data['minConf'] = assoc['confThreshold']
        page_data['extractAs'] = assoc['extractAs']
        # A page is "contained" when its reported doctype differs from the raw
        # extraction doctype (i.e. it was folded into a container doctype).
        page_data['contained'] = not (assoc['topDoctype'] == assoc['extractAs'])
        # Similar pages live on the documentPages entries; this assumes list
        # position equals the page index -- TODO confirm against the producer.
        sim_pages = json_data['documentPages'][page_index]['similarPages']
        if len(sim_pages) > 0:
            # Report 1-based page numbers as a comma-separated string.
            page_data['similarPages'] = ', '.join(('{}'.format(p['pageIndex'] + 1)) for p in sim_pages)
    return result
def compare_results(clux_results, walker_results):
    """
    Produces the output of this program. The known (human-verified) results from the CLUX output are compared
    against the predicted results from the engine's output, page by page.
    Args:
        ``clux_results`` -- ``dict`` The dict returned by :meth:`load_clux_data`.
        ``walker_results`` -- ``dict`` The dict returned by :meth:`load_walker_data`.
    Returns:
        ``dict`` A dict with integer keys corresponding to the index of the page in the package (0-based) and
        values which are dicts keyed by the entries of :data:`FIELD_NAMES`. Notable keys:
        - ``"CLUX Result"`` / ``"ScoreWalker Result"`` -- the correct vs. predicted doctype. ``str``
        - ``"Raw SW Result"`` -- the engine's raw (pre-container) doctype. ``str``
        - ``"Status"`` -- one of :data:`CORRECT`, :data:`INCORRECT`, :data:`FALSE_POSITIVE`,
          :data:`FALSE_NEGATIVE`, or :data:`SKIPPED` (unknown/illegible/blank pages). ``str``
        - ``"Human Review Required"`` -- True when the match confidence was not good. ``bool``
        - ``"Walker Break Point"`` / ``"CLUX Break Point"`` -- :data:`YES` or '' for each side's
          sub-document break determination. ``str``
        - ``"Paginated"`` -- :data:`YES` or :data:`NO`. ``str``
    """
    report = {}
    total_pages = len(clux_results)
    # Lower-cased doctype names that exempt a mismatched page from scoring.
    # Note the asymmetry: the walker says 'unknown - illegible', CLUX says 'illegible'.
    walker_skip_names = ('unknown', 'unknown - illegible', 'blank page')
    clux_skip_names = ('unknown', 'illegible', 'blank page')
    for idx in range(total_pages):
        print('Comparing CLUX to walker (Page %d/%d).' % (idx + 1, total_pages))
        wp = walker_results[idx]
        cp = clux_results[idx]
        good_conf = wp['goodConf']
        # The engine agrees with CLUX if either its reported doctype or its raw
        # extraction doctype is equivalent to the verified doctype.
        matched = (doctypes_are_same(cp['doctype'], wp['doctype']) or
                   doctypes_are_same(cp['doctype'], wp['extractAs']))
        row = {'CLUX Result': cp['doctype'],
               'ScoreWalker Result': wp['doctype'],
               'Raw SW Result': wp['extractAs'],
               'Page': idx + 1,
               'Was Classified': wp['luceneResult'],
               'Confidence': wp['confidence'],
               'Human Review Required': not good_conf,
               'Score': wp['score'],
               'Library Page': wp['libDoc'],
               'Confidence Threshold': wp['confThreshold'],
               'Minimum Good Confidence': wp['minConf'],
               'OCR Confidence': wp['ocrConf'],
               'Similar Pages': wp['similarPages'],
               'Contained': wp['contained'],
               'Walker Break Point': YES if wp['break'] else '',
               'CLUX Break Point': YES if cp['break'] else '',
               'Paginated': YES if wp['paginated'] else NO}
        # Classify the outcome of the comparison.
        if not matched and (wp['doctype'].lower() in walker_skip_names or
                            cp['doctype'].lower() in clux_skip_names):
            row['Status'] = SKIPPED
        elif matched:
            row['Status'] = CORRECT if good_conf else FALSE_NEGATIVE
        else:
            row['Status'] = FALSE_POSITIVE if good_conf else INCORRECT
        report[idx] = row
    return report
def doctypes_are_same(clux_doctype, walker_doctype):
    """
    Determines if two doctypes are the same. Since currently there is a difference between the doctypes used by our
    engines and those that were used by humans, we need to do a small comparison: the walker doctype matches if it
    equals the CLUX doctype (case-insensitively) or is listed in the ``contains`` set of a container whose doctype
    matches the CLUX name.
    Args:
        ``clux_doctype`` -- ``str`` The name of the doctype returned by CLUX.
        ``walker_doctype`` -- ``str`` The name of the doctype returned by the engine.
    Returns:
        ``bool`` :py:`True` if the two doctypes are equivalent, :py:`False` otherwise.
    """
    # Special-case the two spellings of an illegible page.
    if walker_doctype == 'UNKNOWN - ILLEGIBLE' and clux_doctype == 'Illegible':
        return True
    walker_name = walker_doctype.lower()
    clux_name = clux_doctype.lower()
    valid_types = [clux_name]
    # A page inside a container doctype may legitimately match any of the
    # doctypes that the container contains.
    for container in containers:
        if container['doctype'].lower() == clux_name:
            valid_types += container['contains']
    return walker_name in {v.lower() for v in valid_types}
def save_csv(file_path, page_match_data):
    """
    Saves the given comparison data to a CSV file with :data:`FIELD_NAMES` as the header row.
    Args:
        ``file_path`` -- ``str`` The path to the file to save.
        ``page_match_data`` -- ``dict`` The dict returned by :meth:`compare_results`.
    Returns:
        ``None``
    """
    print('Saving file %s' % file_path)
    # newline='' keeps the csv module from emitting blank rows on Windows.
    with open(file_path, 'w+', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=FIELD_NAMES)
        writer.writeheader()
        # The dict keys (page indexes) are not written; each row carries its own 'Page' field.
        writer.writerows(page_match_data.values())
def load_csv(file_path):
    """
    Loads a previously saved TreeWalker CSV report from the given path.
    Args:
        ``file_path`` -- ``str`` The path to the file to load.
    Returns:
        ``dict`` A dict of the rows in the file, keyed by each row's 'Page' column value (NOTE: the csv module
        returns this as a ``str``, not an ``int``), where values are dicts mapping header-row names to that
        row's column values.
    """
    with open(file_path) as in_file:
        return {row['Page']: row for row in csv.DictReader(in_file)}
def main(clux_data, walker_data, out_file):
    """
    The main entry point of the program. This will compare the files at ``clux_data`` and ``walker_data`` and write
    the result to ``out_file``.
    Args:
        ``clux_data`` -- ``str`` The path to the output from CLUX.
        ``walker_data`` -- ``str`` The path to the output from the classification engine.
        ``out_file`` -- ``str`` The path to the output file to save.
    Returns:
        ``int`` The status of the program (0 on success).
    """
    clux_pages = load_clux_data(clux_data)
    walker_pages = load_walker_data(walker_data)
    match_info = compare_results(clux_pages, walker_pages)
    save_csv(out_file, match_info)
    # The docstring promises an int status; previously this implicitly returned None.
    return 0
def check_args(clux_file, walker_file, out_file):
    """
    Makes sure arguments are valid before running the program.
    Args:
        ``clux_file`` -- ``str`` The path to the output from CLUX.
        ``walker_file`` -- ``str`` The path to the output from the engine.
        ``out_file`` -- ``str`` The path to the file to save the results to.
    Returns:
        ``None``
    This function will cause the program to exit with ``status=-1`` if the given arguments are invalid.
    Arguments are considered invalid ``iff``:
    - There is no file at ``walker_file``.
    - There is no file at ``clux_file``.
    """
    fatal_error = False
    if not os.path.exists(walker_file):
        print('Error: Walker file does not exist: %s' % walker_file)
        fatal_error = True
    if not os.path.exists(clux_file):
        print('Error: CLUX file does not exist: %s' % clux_file)
        fatal_error = True
    if os.path.exists(out_file):
        # Not fatal: save_csv will simply overwrite the existing file.
        print('Warning: File exists, and will be overwritten: %s' % out_file)
    if fatal_error:
        parser.print_help()
        print('Encountered a fatal error, Exiting...')
        # sys.exit rather than the site-injected exit() builtin, which is not
        # guaranteed to be available outside interactive sessions.
        sys.exit(-1)
if __name__ == '__main__':
    # Route all stdout through the SL printer so every printed line carries the
    # program-name prefix, then emit the banner without that prefix.
    printer = ConsoleUtils.SLPrinter(program_name)
    sys.stdout = printer
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, author, 80))
    # Register the command-line arguments on the module-level parser.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-c', '--clux_data', required=True, help='The output from CLUX.')
    required_args.add_argument('-w', '--walker_data', required=True, help='The output from ScoreWalker.')
    required_args.add_argument('-o', '--output', required=True, help='The path to save the output to.')
    # -h lives in the Optional group because the parser was created with add_help=False.
    optional_args.add_argument('-h', '--help', action='help', help='Prints the help message.')
    args = parser.parse_args()
    clux = args.clux_data
    walker = args.walker_data
    output = args.output
    print('Output file from classification to compare: "%s"' % walker)
    print('Output file from CLUX to compare: "%s"' % clux)
    print('Saving comparison report to: "%s"' % output)
    # Validate paths first; check_args exits with status -1 on a fatal error.
    check_args(clux, walker, output)
    main(clux, walker, output)