307 lines
10 KiB
Python
307 lines
10 KiB
Python
"""
|
|
Information
|
|
-----------
|
|
This is a tool for getting the number of times a specific classification error is made (``doctype A -> doctype B``).
|
|
This tool uses the output from :mod:`TreeWalker` to perform this function. This information is used for determining
|
|
which doctypes need improvement in the library or diagnosing systematic issues in the classification technique used by
|
|
the current classification engine.
|
|
|
|
This tool can read one or many :mod:`TreeWalker` files in a directory and produce a single "count" file from the
|
|
information contained in all of them.
|
|
|
|
.. moduleauthor:: Chris Diesch <cdiesch@sequencelogic.net>
|
|
|
|
|
|
Commandline Usage
|
|
-----------------
|
|
Usage: ``StatusCounter.py [-h] -i [IN_FILES ...] -o OUT_FILE
|
|
[-t, --type] {TYPE_ARG}``
|
|
|
|
Required Arguments:
|
|
|
|
``-i [IN_FILES ...], --in_files [IN_FILES ...]``
Where ``IN_FILES`` is a list of paths to the :mod:`TreeWalker` output files to generate counts from.

*NOTE:* One or several :mod:`TreeWalker` files can be given and this tool will read from all of
them to produce its "count list".
|
|
|
|
``-o OUT_FILE, --out_file OUT_FILE``
|
|
Where ``OUT_FILE`` is the path to save the output to.
|
|
|
|
*NOTE*: If there is a file at ``OUT_FILE`` it will be overwritten.
|
|
|
|
Optional Arguments:
|
|
|
|
``-t {TYPE_ARG}, --type {TYPE_ARG}``
|
|
Where ``TYPE_ARG`` is one of the following:
|
|
- ``"fp"`` -- Count only false positive results.
|
|
- ``"fn"`` -- Count only false negative results.
|
|
- ``"c"`` -- Count only correct results.
|
|
- ``"i"`` -- Count only incorrect results.
|
|
|
|
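*NOTE*: By default this tool will run with ``TYPE_ARG`` as ``"all"``, which counts every result type and writes
one file per type (``false-positive-counts.csv``, ``false-negative-counts.csv``, ``correct-counts.csv`` and
``incorrect-counts.csv``) into the directory containing ``OUT_FILE``.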
|
|
|
|
``-h, --help``
|
|
Prints the help message.
|
|
|
|
|
|
Python Module Usage
|
|
-------------------
|
|
"""
|
|
|
|
# We do all our imports at the top of our program.
|
|
import argparse
|
|
import TreeWalker
|
|
import csv
|
|
import operator
|
|
import datetime
|
|
import sys
|
|
import os
|
|
|
|
import ConsoleUtils
|
|
|
|
|
|
# Program identity, shown in the console header printed by the script entry point.
program_name = 'StatusCounter'
program_version = '2.0.0'
author = 'Chris Diesch'
# Computed when the module is loaded, so it records the load time of this run.
build_date = datetime.datetime.now().strftime('%Y.%m.%d-%H.%M.%S')
program_description = 'Produces a count of the various types of classification results in the given list of files.'

# The built-in help option is switched off; the script entry point registers its own ``-h/--help``.
parser = argparse.ArgumentParser(add_help=False, description=program_description, prog=program_name)

# Console status tags wrapped in terminal color escape codes.
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'

# Column headers of the CSV file written by save_file.
FIELD_NAMES = ['Correct Type', 'Classified Type', 'Number of Occurrences']

# Result types to count (false positive, false negative, correct, incorrect).
# All are off by default; the script entry point switches them on from the ``--type`` argument.
count_fp = count_fn = count_c = count_i = False
|
|
|
|
|
|
def load_type_counts(tree_walker_files):
    """
    Counts how often each ``correct type -> classified type`` pairing occurs, grouped by result status.

    Only the statuses enabled through the module-level flags ``count_fp``, ``count_fn``, ``count_c`` and
    ``count_i`` are tallied; a disabled status produces an empty list.

    Args:
        ``tree_walker_files`` -- ``list(str)`` The list of :mod:`TreeWalker` output files to generate counts from.

    Returns:
        ``dict`` A dict with keys:
            - ``"falsePositives"`` The false positive occurrences.
            - ``"falseNegatives"`` The false negative occurrences.
            - ``"correct"`` The correct occurrences.
            - ``"incorrect"`` The incorrect occurrences.

        Each value is a ``list`` of ``(key, count)`` tuples where ``key`` is formatted as:

            ``"CORRECT_TYPE>>>CLASSIFIED_TYPE"``

        and ``count`` is the number of times that classification occurred.

        ex: A classified as B 13 times, the entry would be:
            - ``("A>>>B", 13)``

    *NOTE*: Each list is sorted by number of occurrences (highest to lowest); pairings with equal counts keep the
    order in which they were first seen.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    fp_counts = Counter()
    fn_counts = Counter()
    correct_counts = Counter()
    incorrect_counts = Counter()

    # (status value, enabled flag, tally) in precedence order: the first entry whose status matches AND whose
    # flag is enabled receives the count, exactly like an if/elif chain over the four statuses.
    tallies = (
        (TreeWalker.FALSE_POSITIVE, count_fp, fp_counts),
        (TreeWalker.FALSE_NEGATIVE, count_fn, fn_counts),
        (TreeWalker.CORRECT, count_c, correct_counts),
        (TreeWalker.INCORRECT, count_i, incorrect_counts),
    )

    for tw_file in tree_walker_files:
        print('Loading false positives from %s' % tw_file)
        tree_data = TreeWalker.load_csv(tw_file)
        for _page_idx, data in tree_data.items():
            status = data['Status']

            classify_type = data['ScoreWalker Result']
            correct_type = data['CLUX Result']
            key = '%s>>>%s' % (correct_type, classify_type)

            for status_value, enabled, tally in tallies:
                if status == status_value and enabled:
                    tally[key] += 1
                    break

    print('Found %d false positives' % len(fp_counts))
    # most_common() returns (key, count) tuples ordered by count, highest first, with a stable order for ties.
    return {'falsePositives': fp_counts.most_common(), 'falseNegatives': fn_counts.most_common(),
            'correct': correct_counts.most_common(), 'incorrect': incorrect_counts.most_common()}
|
|
|
|
|
|
def save_file(doc_counts, out_file_loc):
    """
    Writes one group of classification counts to a CSV file.

    The file starts with a header row built from ``FIELD_NAMES`` and then holds one row per counted pairing.

    Args:
        ``doc_counts`` -- An iterable of ``(key, count)`` pairs, where ``key`` is formatted as
        ``"CORRECT_TYPE>>>CLASSIFIED_TYPE"`` (one of the values produced by :meth:`load_type_counts`).

        ``out_file_loc`` -- ``str`` The path to save the output to.

    Returns:
        ``None``

    *NOTE*: If there is a file at ``out_file_loc`` it will be overwritten.
    """
    print('Writing output to %s' % out_file_loc)
    with open(out_file_loc, 'w+', newline='') as out_handle:
        csv_writer = csv.DictWriter(out_handle, fieldnames=FIELD_NAMES)
        csv_writer.writeheader()
        for pairing, occurrences in doc_counts:
            # The key holds both doctype names joined by '>>>': correct type first, classified type second.
            type_names = pairing.split('>>>')
            row = {
                'Correct Type': type_names[0],
                'Classified Type': type_names[1],
                'Number of Occurrences': occurrences,
            }
            csv_writer.writerow(row)
|
|
|
|
|
|
# This is the main function of the program.
|
|
def main(in_files, out_file):
    """
    The main entry point for this program (invoked ``if __name__ == '__main__'``). This generates the counts from
    ``in_files`` and saves them to the output at ``out_file``.

    Which result types are processed is controlled by the module-level flags ``count_fp``, ``count_fn``,
    ``count_c`` and ``count_i``.

    Args:
        ``in_files`` -- ``list(str)`` The output files from :mod:`TreeWalker` to generate counts from.

        ``out_file`` -- ``str`` The path to the output file.
            *NOTE*: If all four result types are enabled (``"all"`` was used for ``--type``) this will generate
            4 output files named:
                - ``"false-positive-counts.csv"``
                - ``"false-negative-counts.csv"``
                - ``"correct-counts.csv"``
                - ``"incorrect-counts.csv"``

            These files will be output to the parent directory of ``out_file``; the file name part of
            ``out_file`` is ignored in that case.

    Returns:
        ``None``
    """
    if count_i and count_fp and count_fn and count_c:
        # Build every path from the directory itself. Deriving the names by string replacement on a full path
        # would also rewrite a directory name that happens to contain "false-positive".
        out_dir = os.path.dirname(out_file)
        fp_out = os.path.join(out_dir, 'false-positive-counts.csv')
        fn_out = os.path.join(out_dir, 'false-negative-counts.csv')
        correct_out = os.path.join(out_dir, 'correct-counts.csv')
        incorrect_out = os.path.join(out_dir, 'incorrect-counts.csv')
    else:
        fp_out = fn_out = correct_out = incorrect_out = out_file

    # Display the args
    show_args(out_file, in_files)

    count_data = load_type_counts(in_files)

    # (enabled flag, key in count_data, destination path) for each result type, saved in this order.
    outputs = (
        (count_fp, 'falsePositives', fp_out),
        (count_fn, 'falseNegatives', fn_out),
        (count_c, 'correct', correct_out),
        (count_i, 'incorrect', incorrect_out),
    )
    for enabled, data_key, destination in outputs:
        if enabled:
            save_file(count_data[data_key], destination)
|
|
|
|
|
|
def show_args(out_file, in_files):
    """
    Prints the output path for every enabled result type, followed by the list of input files.

    Args:
        ``out_file`` -- ``str`` The output path to display.

        ``in_files`` -- ``list(str)`` The input file paths to display.

    Returns:
        ``None``

    *NOTE*: The input files are written through the module-level ``printer`` when the script entry point has
    created one. When this module is imported instead of run, no ``printer`` exists, so the lines are written
    with ``print`` rather than failing with a ``NameError``.
    """
    announcements = (
        (count_fp, 'Saving false positive counts to: %s'),
        (count_fn, 'Saving false negative counts to: %s'),
        (count_c, 'Saving correct counts to: %s'),
        (count_i, 'Saving incorrect counts to: %s'),
    )
    for enabled, template in announcements:
        if enabled:
            print(template % out_file)

    print('Using data from files:')
    # ``printer`` is only defined when the module runs as a script.
    console = globals().get('printer')
    for path in in_files:
        entry = ' "%s"' % path
        if console is None:
            print(entry)
        else:
            console.write_no_prefix(entry)
|
|
|
|
# This is where we call the main method from.
|
|
if __name__ == '__main__':
    # Route console output through the project's printer and show the program header.
    printer = ConsoleUtils.SLPrinter(program_name)
    sys.stdout = printer
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, author, 80))

    # Set up arguments here.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-i', '--in_files', required=True, nargs='*',
                               help='A list of paths to TreeWalker CSV files.')
    required_args.add_argument('-o', '--out_file', required=True, help='The path to write output to.')

    optional_args.add_argument('-t', '--type', choices=['fp', 'fn', 'c', 'i', 'all'], required=False,
                               default='all',
                               help='Use this to determine what the desired type of classification results to '
                                    'output is: "fp" will only count false positives, "fn" will only count '
                                    'false negatives, "c" will only count correct results, "i" will only count '
                                    'incorrect results and "all" (the default) will count every type.')
    # The parser is created with add_help=False, so the help option is registered explicitly here.
    optional_args.add_argument('-h', '--help', action="help", help='Prints the help message.')

    args = parser.parse_args()

    # Get the arguments
    input_files = args.in_files
    output_file = args.out_file
    count_type = args.type

    # Determine the type to count: "all" enables every flag, any other choice enables only its own flag.
    count_fp = count_type in ('all', 'fp')
    count_fn = count_type in ('all', 'fn')
    count_c = count_type in ('all', 'c')
    count_i = count_type in ('all', 'i')

    main(input_files, output_file)
|