"""
Information
-----------
This is a tool for viewing the individual terms responsible for false positive matches from the classification
engine recognized by :mod:`TreeWalker`. This tool produces CSV output when called from the commandline and provides
insight into what causes false positives to occur at a term-level. This tool is used to help determine what terms or
groups of terms matter most when classifying a page between two similar doctypes.

.. moduleauthor:: Chris Diesch <cdiesch@sequencelogic.net>


Commandline Usage
-----------------
Usage: ``TermWalker.py [-h] [-c, --classification] {CLASS_FILE} [-w, --tree_walker_file] {TREE_WALKER_FILE}
[-o --output_file] {OUT_FILE}``

Required Arguments:

    ``-c CLASS_FILE, --classificatoin CLASS_FILE``
        Where ``CLASS_FILE`` is the path to the classification engine output.

    ``-w TREE_WALKER_FILE, --tree_walker TREE_WALKER_FILE``
        Where ``TREE_WALKER_FILE`` The path to the output of :mod:`TreeWalker`.

    ``-o OUT_FILE, --out_path OUT_FILE``
        Where ``OUT_FILE`` is the path to save the final output to.

Optional Arguments:
    ``-h, --help``
        Prints the help message.


Python Module Usage:
--------------------
"""
# We do all our imports at the top of our program.
import argparse
import csv
import sys

import ConsoleUtils
import TreeWalker

program_name = 'TermWalker'
program_description = 'Generates a CSV file for analyzing terms which produced false positives.'
author = 'Chris Diesch'
version = '1.0.1'
build = '2017.07.27'

parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)

# Error and Warning console values:
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'

FIELD_NAMES = ['Page', 'Correct Result', 'Classification Result', 'Library Page', 'Confidence',  'Term', 'Term Weight']


def get_false_positive_pages(tree_walker_data):
    """
    Gets the False Positive pages from the given :mod:`TreeWalker` file.

    Args:
        ``tree_walker_data`` -- ``str`` The path to the TreeWalker file to load.

    Returns:
         ``dict`` A dict where the keys are the page indexes of the false positives and the values are the classified
         doctypes for those pages.
    """
    fp_pages = {}
    for key, value in tree_walker_data.items():
        if value['Status'] == TreeWalker.FALSE_POSITIVE:
            fp_pages[int(value['Page']) - 1] = value['CLUX Result']
    return fp_pages


def get_fp_terms(tree_walker_file, walker_file):
    """
    Loads the terms for every match false positive match in the TreeWalker file from the classification output.

    Args:
        ``tree_walker_file`` -- ``str`` The TreeWalker file to load the false positive data from.

        ``walker_file`` -- ``str`` The classification output file used to create ``tree_walker_file``.

    Returns:
        ``dict`` A dict where the keys are ``"PAGE_INDEX.MATCH_NUMBER"``

        (ex: The second match for page 1 = ``"0.1"`` since pages and match numbers are 0 indexed)

        The values for the result are dicts with the following key/value pairs:

    +-----------------+--------------------------------------------------------------+
    | Key             | Value ``type``                                               |
    +=================+==============================================================+
    | ``terms``       | The terms which responsible for this match. ``list(str)``    |
    +-----------------+--------------------------------------------------------------+
    | ``doctype``     | The classified doctype of this match. ``str``                |
    +-----------------+--------------------------------------------------------------+
    | ``correctType`` | The correct doctype for this match. ``str``                  |
    +-----------------+--------------------------------------------------------------+
    | ``score``       | The score for this match. ``int``                            |
    +-----------------+--------------------------------------------------------------+
    | ``libPage``     | The path to the library page for this match. ``str``         |
    +-----------------+--------------------------------------------------------------+
    | ``conf``        | The confidence of this match. ``float``                      |
    +-----------------+--------------------------------------------------------------+
    """
    result = {}

    bad_pages = get_false_positive_pages(TreeWalker.load_csv(tree_walker_file))
    walker_data = TreeWalker.load_walker_data(walker_file)

    for page_index, correct_type in bad_pages.items():
        walker_page = walker_data[page_index]
        matches = walker_page['matches']
        match_num = 0
        for match in matches:
            terms = match['terms']
            result[format('%d.%d' % (page_index, match_num))] = {'terms': terms,
                                                                 'correctType': correct_type,
                                                                 'doctype': match['doctype'],
                                                                 'score': match['rawScore'],
                                                                 'libPage': match['imagePath'],
                                                                 'conf': match['conf']}
            match_num += 1

    print('Found %d false positives in %s' % (len(result), tree_walker_file))
    return result


def write_csv(fp_term_data, out_file_loc):
    """
    Saves the output from this tool as a CSV file.

    Args:
        ``fp_term_data`` -- ``dict`` The data from :meth:`get_fp_terms`.

        ``out_file_loc`` -- ``str`` The path to save the output to.

    Returns:
        ``None``
    """
    print('Saving file at %s' % out_file_loc)

    with open(out_file_loc, 'w+', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=FIELD_NAMES)
        writer.writeheader()

        for key, data in fp_term_data.items():
            page_idx = int(key.split('.')[0])
            writer.writerow({'Page': page_idx + 1, 'Correct Result': data['correctType'],
                             'Classification Result': data['doctype'], 'Library Page': data['libPage'],
                             'Term': 'Page Score: ', 'Term Weight': data['score'], 'Confidence': data['conf']})

            for term in data['terms']:
                writer.writerow({'Page': '', 'Correct Result': '', 'Classification Result': '', 'Library Page': '',
                                 'Term': term['term'], 'Term Weight': term['termScore'],
                                 'Confidence': data['conf']})

            writer.writerow({'Page': '', 'Correct Result': '', 'Classification Result': '', 'Library Page': '',
                             'Term': '', 'Term Weight': '', 'Confidence': ''})


# This is the main function of the program.
def main(classification_file, tree_walker_file, output_file):
    print('Classification file: %s\n'
          'TreeWalker file: %s\n'
          'Output file: %s' % (classification_file, tree_walker_file, output_file))

    false_pos_data = get_fp_terms(tree_walker_file, classification_file)
    write_csv(false_pos_data, output_file_path)


# This is where we call the main method from.
if __name__ == '__main__':
    printer = ConsoleUtils.SLPrinter(program_name)
    sys.stdout = printer
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, version, build, author, 80))
    # Set up arguments here.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-c', '--classification', required=True,
                               help='The path to a classification output file.')
    required_args.add_argument('-w', '--tree_walker', required=True, help='The path to a TreeWalker output file.')
    required_args.add_argument('-o', '--out_path', required=True, help='The path to write output to.')

    optional_args.add_argument('-h', '--help', action="help", help='Prints the help message.')

    args = parser.parse_args()
    # Get the argument.
    walker_file_path = args.classification
    tree_file_path = args.tree_walker
    output_file_path = args.out_path
    # Now we can run...
    main(walker_file_path, tree_file_path, output_file_path)