"""
Information
-----------
This is a tool for getting the number of times a specific classification error is made
(``doctype A -> doctype B``). This tool uses the output from :mod:`TreeWalker` to perform this function.

This information is used for determining which doctypes need improvement in the library or diagnosing systematic
issues in the classification technique used by the current classification engine.

This tool can read one or many :mod:`TreeWalker` files and produce "count" output from the information contained in
all of them.

.. moduleauthor:: Chris Diesch

Commandline Usage
-----------------
Usage: ``StatusCounter.py [-h] -i IN_FILE [IN_FILE ...] -o OUT_FILE [-t {fp,fn,c,i,all}]``

Required Arguments:
    ``-i IN_FILE [IN_FILE ...], --in_files IN_FILE [IN_FILE ...]``
        Where each ``IN_FILE`` is the path to a :mod:`TreeWalker` output (CSV) file to generate counts from.

        *NOTE:* One or several :mod:`TreeWalker` files can be given and this tool will read from all of them to
        produce its "count list".

    ``-o OUT_FILE, --out_file OUT_FILE``
        Where ``OUT_FILE`` is the path to save the output to.

        *NOTE*: If there is a file at ``OUT_FILE`` it will be overwritten.

Optional Arguments:
    ``-t {TYPE_ARG}, --type {TYPE_ARG}``
        Where ``TYPE_ARG`` is one of the following:
            - ``"fp"`` -- Count only false positive results.
            - ``"fn"`` -- Count only false negative results.
            - ``"c"`` -- Count only correct results.
            - ``"i"`` -- Count only incorrect results.
            - ``"all"`` -- Count every type of result.

        *NOTE*: By default this tool will run with ``TYPE_ARG`` as ``"all"``. In that mode four files
        (``false-positive-counts.csv``, ``false-negative-counts.csv``, ``correct-counts.csv`` and
        ``incorrect-counts.csv``) are written to the directory containing ``OUT_FILE``.

    ``-h, --help``
        Prints the help message.

Python Module Usage
-------------------
:meth:`main`, :meth:`load_type_counts` and :meth:`show_args` read the module-level flags ``count_fp``,
``count_fn``, ``count_c`` and ``count_i``. They all default to ``False`` (nothing is counted), so set the ones you
need before calling into this module.
"""
# We do all our imports at the top of our program.
import argparse
import csv
import datetime
import operator
import os
import sys

import ConsoleUtils
import TreeWalker

program_name = 'StatusCounter'
program_description = 'Produces a count of the various types of classification results in the given list of files.'
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
# Error and Warning console values:
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'
build_date = datetime.datetime.now().strftime('%Y.%m.%d-%H.%M.%S')
program_version = '2.0.0'
author = 'Chris Diesch'

FIELD_NAMES = ['Correct Type', 'Classified Type', 'Number of Occurrences']

# Which result types to count. These are set from ``--type`` when run as a script.
count_fp = False
count_fn = False
count_c = False
count_i = False


def load_type_counts(tree_walker_files):
    """
    Loads the counts for the requested types.

    Only the types whose module-level flag (``count_fp``, ``count_fn``, ``count_c``, ``count_i``) is set are
    counted; the entries for the other types are left empty.

    Args:
        ``tree_walker_files`` -- ``list(str)`` The list of :mod:`TreeWalker` output files to generate counts from.

    Returns:
        ``dict`` A dict with keys:
            - ``"falsePositives"`` The false positive occurrences.
            - ``"falseNegatives"`` The false negative occurrences.
            - ``"correct"`` The correct occurrences.
            - ``"incorrect"`` The incorrect occurrences.

        Each key contains a list of ``(key, count)`` tuples where ``key`` is formatted as follows:
        ``"CORRECT_TYPE>>>CLASSIFIED_TYPE"`` and ``count`` is the number of times that classification occurred.

        ex: A classified as B 13 times, the result would contain:
            - ``("A>>>B", 13)``

    *NOTE*: This function sorts the results by their number of occurrences (highest to lowest).
    """
    fp_counts = {}
    fn_counts = {}
    correct_counts = {}
    incorrect_counts = {}

    for walker_file in tree_walker_files:
        print('Loading false positives from %s' % walker_file)
        tree_data = TreeWalker.load_csv(walker_file)

        for _page_idx, data in tree_data.items():
            status = data['Status']
            key = '%s>>>%s' % (data['CLUX Result'], data['ScoreWalker Result'])

            # A row is tallied in at most one bucket, and only if that bucket was requested.
            if status == TreeWalker.FALSE_POSITIVE and count_fp:
                fp_counts[key] = fp_counts.get(key, 0) + 1
            elif status == TreeWalker.FALSE_NEGATIVE and count_fn:
                fn_counts[key] = fn_counts.get(key, 0) + 1
            elif status == TreeWalker.CORRECT and count_c:
                correct_counts[key] = correct_counts.get(key, 0) + 1
            elif status == TreeWalker.INCORRECT and count_i:
                incorrect_counts[key] = incorrect_counts.get(key, 0) + 1

    print('Found %d false positives' % len(fp_counts))

    by_count = operator.itemgetter(1)
    # Return the results, most frequent classification first.
    return {'falsePositives': sorted(fp_counts.items(), key=by_count, reverse=True),
            'falseNegatives': sorted(fn_counts.items(), key=by_count, reverse=True),
            'correct': sorted(correct_counts.items(), key=by_count, reverse=True),
            'incorrect': sorted(incorrect_counts.items(), key=by_count, reverse=True)}


def save_file(doc_counts, out_file_loc):
    """
    Saves the output from this tool in CSV format to a file.

    Args:
        ``doc_counts`` -- ``list(tuple(str, int))`` The ``(key, count)`` pairs for a single result type, i.e. one
        of the values of the dict returned by :meth:`load_type_counts`.

        ``out_file_loc`` -- ``str`` The path to save the output to.

    Returns:
        ``None``

    *NOTE*: If there is a file at ``out_file_loc`` it will be overwritten.
    """
    print('Writing output to %s' % out_file_loc)
    # newline='' lets the csv module control line endings itself.
    with open(out_file_loc, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=FIELD_NAMES)
        writer.writeheader()
        for key, count in doc_counts:
            # Split on the first separator only so nothing after it is dropped.
            correct, _, classified = key.partition('>>>')
            writer.writerow({'Correct Type': correct,
                             'Classified Type': classified,
                             'Number of Occurrences': count})


def _output_paths(out_file):
    """
    Works out where each result type is written.

    When every type is being counted, each type gets its own fixed file name in the directory containing
    ``out_file``; otherwise every type maps to ``out_file`` itself.

    Returns:
        ``dict`` The output path for each of the keys ``"fp"``, ``"fn"``, ``"c"`` and ``"i"``.
    """
    if count_fp and count_fn and count_c and count_i:
        out_dir = os.path.dirname(out_file)
        # Build every name from the directory so a directory that happens to contain
        # "false-positive" in its own name is left untouched.
        return {'fp': os.path.join(out_dir, 'false-positive-counts.csv'),
                'fn': os.path.join(out_dir, 'false-negative-counts.csv'),
                'c': os.path.join(out_dir, 'correct-counts.csv'),
                'i': os.path.join(out_dir, 'incorrect-counts.csv')}
    return dict.fromkeys(('fp', 'fn', 'c', 'i'), out_file)


# This is the main function of the program.
def main(in_files, out_file):
    """
    The main entry point for this program (invoked ``if __name__ == '__main__'``). This generates the counts from
    ``in_files`` and saves them to the output at ``out_file``.

    Args:
        ``in_files`` -- ``list(str)`` The output files from :mod:`TreeWalker` to generate counts from.

        ``out_file`` -- ``str`` The path to the output file.

        *NOTE*: If ``"all"`` was used for ``--type`` this will generate 4 output files named:
            - ``"false-positive-counts.csv"``
            - ``"false-negative-counts.csv"``
            - ``"correct-counts.csv"``
            - ``"incorrect-counts.csv"``

        These files will be output to the parent directory of ``out_file``.

    Returns:
        ``int`` The status of the program (``0`` on success).
    """
    out_paths = _output_paths(out_file)

    # Display the args
    show_args(out_file, in_files)

    count_data = load_type_counts(in_files)

    if count_fp:
        save_file(count_data['falsePositives'], out_paths['fp'])
    if count_fn:
        save_file(count_data['falseNegatives'], out_paths['fn'])
    if count_c:
        save_file(count_data['correct'], out_paths['c'])
    if count_i:
        save_file(count_data['incorrect'], out_paths['i'])

    return 0


def show_args(out_file, in_files):
    """
    Prints where each requested result type will be saved and which input files are being read.

    Args:
        ``out_file`` -- ``str`` The output path given to the program.

        ``in_files`` -- ``list(str)`` The :mod:`TreeWalker` files that will be read.

    Returns:
        ``None``
    """
    out_paths = _output_paths(out_file)

    if count_fp:
        print('Saving false positive counts to: %s' % out_paths['fp'])
    if count_fn:
        print('Saving false negative counts to: %s' % out_paths['fn'])
    if count_c:
        print('Saving correct counts to: %s' % out_paths['c'])
    if count_i:
        print('Saving incorrect counts to: %s' % out_paths['i'])

    print('Using data from files:')
    # ``printer`` only exists when this module is run as a script; fall back to ``print`` when imported.
    console = globals().get('printer')
    for f in in_files:
        line = ' "%s"' % f
        if console is None:
            print(line)
        else:
            console.write_no_prefix(line)


# This is where we call the main method from.
if __name__ == '__main__':
    printer = ConsoleUtils.SLPrinter(program_name)
    sys.stdout = printer
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, author, 80))

    # Set up arguments here.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    # nargs='+' so that "-i" with no paths is rejected instead of silently producing empty output.
    required_args.add_argument('-i', '--in_files', required=True, nargs='+',
                               help='A list of paths to TreeWalker CSV files.')
    required_args.add_argument('-o', '--out_file', required=True, help='The path to write output to.')
    optional_args.add_argument('-t', '--type', choices=['fp', 'fn', 'c', 'i', 'all'], required=False,
                               help='Use this to determine what the desired type of classification results to '
                                    'output is. "fp" will only count false positives, "fn" only false negatives, '
                                    '"c" only correct results, "i" only incorrect results and "all" counts every '
                                    'type.\n',
                               default='all')
    optional_args.add_argument('-h', '--help', action="help", help='Prints the help message.')
    args = parser.parse_args()

    # Get the arguments
    input_files = args.in_files
    output_file = args.out_file
    count_type = args.type

    # Determine the type to count
    count_fp = count_type in ('all', 'fp')
    count_fn = count_type in ('all', 'fn')
    count_c = count_type in ('all', 'c')
    count_i = count_type in ('all', 'i')

    sys.exit(main(input_files, output_file))