Sleds/scorewalker-utils/TreeWalker/StatusCounter.py

307 lines
10 KiB
Python

"""
Information
-----------
This is a tool for getting the number of times a specific classification error is made (``doctype A -> doctype B``).
This tool uses the output from :mod:`TreeWalker` to perform this function. This information is used for determining
which doctypes need improvement in the library or diagnosing systematic issues in the classification technique used by
the current classification engine.
This tool can read one or many :mod:`TreeWalker` files in a directory and produce a single "count" file from the
information contained in all of them.
.. moduleauthor:: Chris Diesch <cdiesch@sequencelogic.net>
Commandline Usage
-----------------
Usage: ``StatusCounter.py [-h] -i IN_FILE [IN_FILE ...] -o OUT_FILE
[-t {fp,fn,c,i,all}]``
Required Arguments:
``-i IN_FILE [IN_FILE ...], --in_files IN_FILE [IN_FILE ...]``
Where each ``IN_FILE`` is the path to a :mod:`TreeWalker` output (CSV) file to generate counts from.
*NOTE:* One or several :mod:`TreeWalker` files can be given and this tool will read from all of
them to produce its "count list".
``-o OUT_FILE, --out_file OUT_FILE``
Where ``OUT_FILE`` is the path to save the output to.
*NOTE*: If there is a file at ``OUT_FILE`` it will be overwritten.
Optional Arguments:
``-t {TYPE_ARG}, --type {TYPE_ARG}``
Where ``TYPE_ARG`` is one of the following:
- ``"fp"`` -- Count only false positive results.
- ``"fn"`` -- Count only false negative results.
- ``"c"`` -- Count only correct results.
- ``"i"`` -- Count only incorrect results.
*NOTE*: By default this tool will run with ``TYPE_ARG`` as ``"all"``.
``-h, --help``
Prints the help message.
Python Module Usage
-------------------
"""
# We do all our imports at the top of our program.
import argparse
import TreeWalker
import csv
import operator
import datetime
import sys
import os
import ConsoleUtils
# ---- Program identity (used for the console header and the argparse help) ----
program_name = 'StatusCounter'
program_version = '2.0.0'
author = 'Chris Diesch'
program_description = 'Produces a count of the various types of classification results in the given list of files.'
# NOTE: despite its name this is the moment the module was loaded, not a real build timestamp.
build_date = datetime.datetime.now().strftime('%Y.%m.%d-%H.%M.%S')

# The parser is created without the automatic "-h" option; arguments are attached to it elsewhere.
parser = argparse.ArgumentParser(add_help=False, description=program_description, prog=program_name)

# ---- ANSI-coloured console markers (error / warning / okay) ----
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'

# ---- Column headers of the generated CSV files ----
FIELD_NAMES = ['Correct Type', 'Classified Type', 'Number of Occurrences']

# ---- Which result categories get counted; everything is off until the flags are switched on ----
count_fp = count_fn = count_c = count_i = False
def load_type_counts(tree_walker_files):
    """
    Counts how often each ``correct type -> classified type`` pair occurs in the given :mod:`TreeWalker` files.

    Only the result categories switched on through the module-level flags ``count_fp``, ``count_fn``, ``count_c``
    and ``count_i`` are counted; the lists of the disabled categories come back empty.

    Args:
        ``tree_walker_files`` -- ``list(str)`` The list of :mod:`TreeWalker` output files to generate counts from.

    Returns:
        ``dict`` A dict with keys:

            - ``"falsePositives"`` The false positive occurrences.
            - ``"falseNegatives"`` The false negative occurrences.
            - ``"correct"`` The correct occurrences.
            - ``"incorrect"`` The incorrect occurrences.

        Each value is a ``list`` of ``(key, count)`` tuples whose key is formatted as follows:

            ``"CORRECT_TYPE>>>CLASSIFIED_TYPE"``

        and whose count is the number of times that classification occurred.

        ex: A classified as B 13 times, the entry would be:

            - ``("A>>>B", 13)``

    *NOTE*: Every list is sorted by the number of occurrences (highest to lowest).
    """
    fp_counts = {}
    fn_counts = {}
    correct_counts = {}
    incorrect_counts = {}

    def _bucket_for(status):
        # Pick the tally a row belongs to, or None when its category is disabled/unknown.
        # The checks run in a fixed order and the first enabled match wins.
        if count_fp and status == TreeWalker.FALSE_POSITIVE:
            return fp_counts
        if count_fn and status == TreeWalker.FALSE_NEGATIVE:
            return fn_counts
        if count_c and status == TreeWalker.CORRECT:
            return correct_counts
        if count_i and status == TreeWalker.INCORRECT:
            return incorrect_counts
        return None

    for walker_file in tree_walker_files:
        print('Loading false positives from %s' % walker_file)
        # Presumably a mapping of page index -> row dict; only ``.items()`` and the three columns
        # read below are relied upon here.
        tree_data = TreeWalker.load_csv(walker_file)
        for _page_idx, data in tree_data.items():
            status = data['Status']
            classify_type = data['ScoreWalker Result']
            correct_type = data['CLUX Result']

            bucket = _bucket_for(status)
            if bucket is None:
                continue

            key = '%s>>>%s' % (correct_type, classify_type)
            bucket[key] = bucket.get(key, 0) + 1

    # NOTE: this is the number of distinct false positive pairs, not the total number of occurrences.
    print('Found %d false positives' % len(fp_counts))

    # sorted() is stable, so pairs with the same count stay in the order they were first seen.
    by_count = operator.itemgetter(1)
    return {
        'falsePositives': sorted(fp_counts.items(), key=by_count, reverse=True),
        'falseNegatives': sorted(fn_counts.items(), key=by_count, reverse=True),
        'correct': sorted(correct_counts.items(), key=by_count, reverse=True),
        'incorrect': sorted(incorrect_counts.items(), key=by_count, reverse=True),
    }
def save_file(doc_counts, out_file_loc):
    """
    Saves one list of counts in CSV format to a file.

    Args:
        ``doc_counts`` -- ``list(tuple(str, int))`` ONE of the count lists returned by :meth:`load_type_counts`
        (not the whole dict): ``(key, count)`` pairs whose key is ``"CORRECT_TYPE>>>CLASSIFIED_TYPE"``.

        ``out_file_loc`` -- ``str`` The path to save the output to.

    Returns:
        ``None``

    Raises:
        ``IndexError`` If a key does not contain the ``">>>"`` separator.

    *NOTE*: If there is a file at ``out_file_loc`` it will be overwritten.
    """
    print('Writing output to %s' % out_file_loc)

    # The file is only written, so plain 'w' is enough; newline='' leaves line endings to the csv module.
    with open(out_file_loc, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=FIELD_NAMES)
        writer.writeheader()

        for key, count in doc_counts:
            # Split on the first separator only, so a classified type that itself contains ">>>"
            # is written in full instead of being silently truncated.
            doctypes = key.split('>>>', 1)
            correct_type = doctypes[0]
            classified_type = doctypes[1]
            writer.writerow({'Correct Type': correct_type, 'Classified Type': classified_type,
                             'Number of Occurrences': count})
# This is the main function of the program.
def main(in_files, out_file):
    """
    The main entry point for this program (invoked ``if __name__ == '__main__'``). This generates the counts from
    ``in_files`` and saves them to the output at ``out_file``.

    Which result categories are processed is controlled by the module-level flags ``count_fp``, ``count_fn``,
    ``count_c`` and ``count_i``.

    Args:
        ``in_files`` -- ``list(str)`` The output files from :mod:`TreeWalker` to generate counts from.

        ``out_file`` -- ``str`` The path to the output file.

    *NOTE*: If all four categories are enabled (``"all"`` was used for ``--type``) this will generate 4 output
    files named:

        - ``"false-positive-counts.csv"``
        - ``"false-negative-counts.csv"``
        - ``"correct-counts.csv"``
        - ``"incorrect-counts.csv"``

    These files will be output to the parent directory of ``out_file``; ``out_file`` itself is not written.
    Otherwise every enabled category is written to ``out_file``.

    Returns:
        ``None``
    """
    # (enabled flag, key in the load_type_counts() result, file name used in "all" mode), in save order.
    categories = (
        (count_fp, 'falsePositives', 'false-positive-counts.csv'),
        (count_fn, 'falseNegatives', 'false-negative-counts.csv'),
        (count_c, 'correct', 'correct-counts.csv'),
        (count_i, 'incorrect', 'incorrect-counts.csv'),
    )

    split_by_type = count_i and count_fp and count_fn and count_c
    out_dir = os.path.dirname(out_file)

    # Display the args
    show_args(out_file, in_files)

    count_data = load_type_counts(in_files)

    for enabled, data_key, file_name in categories:
        if not enabled:
            continue
        # Each path is joined from the directory and the file name. Deriving the other names by text
        # replacement on the full path would also rewrite a directory that contains "false-positive".
        target = os.path.join(out_dir, file_name) if split_by_type else out_file
        save_file(count_data[data_key], target)
def show_args(out_file, in_files):
    """
    Prints where each enabled result category will be saved and which input files are used.

    Args:
        ``out_file`` -- ``str`` The output path given to the program.

        ``in_files`` -- ``list(str)`` The :mod:`TreeWalker` files the counts are read from.

    Returns:
        ``None``

    *NOTE*: When all four categories are enabled the counts are written to one file per category in the parent
    directory of ``out_file``, so those per-category paths are reported instead of ``out_file``.
    """
    split_by_type = count_fp and count_fn and count_c and count_i
    out_dir = os.path.dirname(out_file)

    def _target(file_name):
        # The path a category is really written to.
        return os.path.join(out_dir, file_name) if split_by_type else out_file

    if count_fp:
        print('Saving false positive counts to: %s' % _target('false-positive-counts.csv'))
    if count_fn:
        print('Saving false negative counts to: %s' % _target('false-negative-counts.csv'))
    if count_c:
        print('Saving correct counts to: %s' % _target('correct-counts.csv'))
    if count_i:
        print('Saving incorrect counts to: %s' % _target('incorrect-counts.csv'))

    print('Using data from files:')
    # ``printer`` only exists when the file runs as a script; fall back to print() so that calling
    # this from an importing module does not raise a NameError.
    console = globals().get('printer')
    write_line = console.write_no_prefix if console is not None else print
    for f in in_files:
        write_line(' "%s"' % f)
# This is where we call the main method from.
if __name__ == '__main__':
    # Route everything printed from here on through the project's console printer and show the header.
    printer = ConsoleUtils.SLPrinter(program_name)
    sys.stdout = printer
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, author, 80))

    # Set up arguments here.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')

    required_args.add_argument('-i', '--in_files', required=True, nargs='*',
                               help='A list of paths to TreeWalker CSV files.')
    required_args.add_argument('-o', '--out_file', required=True, help='The path to write output to.')
    optional_args.add_argument('-t', '--type', choices=['fp', 'fn', 'c', 'i', 'all'], required=False,
                               default='all',
                               help='Use this to determine what the desired type of classification results to '
                                    'output is: "fp" will only count false positives, "fn" only false '
                                    'negatives, "c" only correct results, "i" only incorrect results and '
                                    '"all" (the default) counts every type and writes one file per type to '
                                    'the directory of the output path.')
    # The parser is built with add_help=False, so the help option is registered by hand.
    optional_args.add_argument('-h', '--help', action="help", help='Prints the help message.')

    args = parser.parse_args()

    # Get the arguments
    input_files = args.in_files
    output_file = args.out_file
    count_type = args.type

    # Determine the type to count: "all" switches every category on, otherwise only the chosen one.
    count_fp = count_type in ('all', 'fp')
    count_fn = count_type in ('all', 'fn')
    count_c = count_type in ('all', 'c')
    count_i = count_type in ('all', 'i')

    main(input_files, output_file)