ScoreWalker/scorewalker-utils/KMeans/Dictionary.py

"""
Dictionary.py
=============

This is a tool for generating a dictionary of tokens in a given list of file. It takes for input a file containing a
list of the files containing the term tokens and a path to a file to write. Every file in this list is read and used
to generate two dicts which are written to the passed output file. The first of these is a list of each token and
it's associated integer 'map'. The second list is each token as well as the number of occurrences across the files
read.

.. moduleauthor:: Chris Diesch <cdiesch@sequencelogic.net>
"""

# We do all our imports at the top of our program.
import argparse
import json
import os
import ConsoleUtils
# Give the program a name.
program_name = 'Dictionary'
# Describe what the program does beiefly.
program_description = 'Creates and loads the dictionary for k-means clustering.'
# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
# Error and Warning console values:
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'
program_header = format('\033[95m%s\033[0m\n'
                        '-----------------------' % program_name)
# Default decision message.
decision_message = '   Is this okay? (Y/N): '
# Constants
library_name = 'False Pos Lib'
dictionary_tag = 'dictionary'
dict_tokens_tag = 'tokens'
dict_token_tag = 'token'
tkn_value_tag = 'value'
tkn_weight_tag = 'weight'
tkn_mapped_tag = 'mapping'
token_count_tag = 'tokenCounts'


# TODO: Tokenize phrases which are important.
# Loads all the tokens from the files. Returns a dict with token values as keys and the count of the tokens as values
# and a dict with token values as keys and the mapped value of the token as values
def load_tokens_from_files(files):
    """
    Loads the tokens from every file in the given list of files. This function loads the tokens into a dictionary and
    guarantees every tokens' uniqueness. It also maps each token to an associated integer value and coutns the number of
    token occurrences.

    :param files: The list of files to load the tokens from.
    :type files: list of str

    .. raw:: html
        <br>

    :return: A dict with token values as keys, and their counts as values, a dict with the token values as keys and
        their mapped integers as values.

    :rtype: tuple(dict[str,int], dict[str,int])
    """
    print('{:^70}'.format('Processing token files...'))
    map_val = 0
    tkns_map = {}
    tkns_count = {}
    num_ran = 0
    num_to_run = len(files)
    for file in files:
        num_ran += 1
        ConsoleUtils.print_progress_bar(num_ran, num_to_run, 50, 70)
        if os.path.exists(file):
            tkns = load_tkns_from_file(file)
            # We may want more than just the tokens at some point...
            for i in range(len(tkns)):
                s = str(tkns[i])
                if s not in tkns_map:
                    # Add the token to the maps and counts...
                    tkns_map[s] = map_val
                    map_val += 1
                    tkns_count[s] = 1
                else:
                    # Update the token count
                    tkns_count[s] += 1
    return tkns_count, tkns_map


# Loads the un-mapped tokens from the tkn file.
def load_tkns_from_file(file_name):
    """
    A helper method for load_tokens_from_files; this function loads all the tokens from a single file into a list.

    :param file_name: The path to the file to load.
    :type file_name: str

    .. raw:: html
        <br>

    :return: A list of every token from the file (not unique, not mapped).
    :rtype: list of str
    """
    with open(file_name) as reader:
        token_json = json.load(reader)
        ordered_tokens = token_json['ordered']
        tokens = ordered_tokens.split(' ')
    return tokens


# Writes a json file out with all the dictionary metadata.
def write_json(lib_name, tokens_map, tokens_count, dict_file):
    """
    Writes the JSON containing the dictionary values (token mappings and counts).

    :param lib_name: The name of the library (or sub-set) the dictionary is being created for.
    :type lib_name: str
    :param tokens_map: A dict which has token values as keys, and their integer mapping as values.
    :type tokens_map: dict[str,int]
    :param tokens_count: A dict which has token values as keys and their counts as values.
    :type tokens_count dict[str,int]
    :param dict_file: The path to the JSON 'dictionary' file to write.
    :type dict_file: str

    .. raw:: html
        <br>

    :return: None.
    """
    final_dict = {dictionary_tag: lib_name, tkn_mapped_tag: tokens_map, token_count_tag: tokens_count}
    with open(dict_file, 'w+') as dict_writer:
        dict_writer.write(json.dumps(final_dict, indent=1))
        dict_writer.close()


# Loads the token files from the txt file containing the list of token files delimited by newlines
def load_tkn_files(file):
    """
    This function reads in the file containing the list of token files to read.

    :param file: The path to the 'master file list' file.
    :type file: str

    .. raw:: html
        <br>

    :return: A list of '.tkn' files which have been scrubbed as necessary.
    :rtype: list of str
    """
    with open(file, 'r+') as tkn_file:
        result = tkn_file.readlines()
    for i in range(len(result)):
        # This line changes depending on what machine you're on....
        result[i] = result[i].replace('Z:\\', '\\\\mount_eng\\eng\\')
        result[i] = result[i].replace('\n', '')

    return result


# This is the main function of the program.
def main(token_file, output_file):
    """
    The main entry point of the tool.

    :param token_file: The file containing the list of '.tkn' files to run against.
    :type token_file: str
    :param output_file: The path to write the JSON 'dictionary' file to.
    :type output_file: str

    .. raw:: html
        <br>

    :return: The status of the program.
    :rtype: int
    """
    library_dir, file_name = os.path.split(token_file)
    lib_dirs, lib_name = os.path.split(library_dir)

    tkn_files = load_tkn_files(token_file)
    tkns_count, tkns_map = load_tokens_from_files(tkn_files)
    num_processed = len(tkn_files)
    num_unique_tkns = len(tkns_map)
    print('{:^70}'.format('Processed %d files. Found %d unique tokens' % (num_processed, num_unique_tkns)))
    print('{:^70}'.format('Saving library to json file...'))
    write_json(lib_name, tkns_map, tkns_count, output_file)
    print('{:^70}'.format('Done!'))


# Makes sure the passed arguments are valid.
def check_args(tkn_file, output_json, auto_overwrite):
    """
    Makes sure arguments are valid before running the program.

    :param tkn_file: The path to the file containing a list of '.tkn' files to run over.
    :type tkn_file: str
    :param output_json: the location of the output JSON 'dictionary' file.
    :type output_json: str
    :param auto_overwrite: Weather or not to overwrite an existing dictionary without asking for permission.
    :type auto_overwrite: bool

    .. raw:: html
        <br>

    :return: None.
    """
    fatal_errors = False
    # Do the checks for the required arguments second, this makes more sense reading the output.
    if os.path.exists(output_json) and not auto_overwrite:
        print('%s The old dictionary will be overwritten.' % yellow_warning)
        yes_or_no(decision_message)

    if not os.path.exists(tkn_file):
        print('%s The token file provided does not exist (%s).' % (red_error, tkn_file))
        fatal_errors = True

    if fatal_errors:
        parser.print_help()
        print('Exiting...')
        exit(0)


# Will ask the user to input yes or no and if they input yes the program will continue to execute. If however, they
# input no, the program will exit with status 0. 0 status is used here because there was no error, the user just chose
# to exit rather than continue executing.
def yes_or_no(message):
    """
    Will prompt the user with the given message and accept either 'y', 'yes', 'n', or 'no' as inputs ignoring case.
    The program will exit (with status 0) if the user enters no and will continue if the user enters yes.

    .. note: The user will be prompted to re-enter input if it is not valid with the following message:
     'Invalid input, enter Y(es) or N(o):' (This message will be printed until valid input is provided).

    :param message: The message to display to the user.
    :type message: str

    .. raw:: html
        <br>

    :return: None.
    """
    decision = input(message)
    if decision.lower() == 'y' or decision.lower() == 'yes':
        return
    elif decision.lower() == 'n' or decision.lower() == 'no':
        exit(0)
    else:
        yes_or_no('   Invalid input, enter Y(es) or N(o): ')


# This is where we call the main method from.
if __name__ == '__main__':
    print(program_header)
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-f', '--tkn_file', required=True,
                               help='The txt file containing a list of token files in the library.')
    required_args.add_argument('-o', '--output_file', required=True,
                               help='The full path of the json file to save the dictionary in.')
    optional_args.add_argument('-w', '--overwrite', required=False, action='store_true',
                               help='If this flag is thrown, if there is an old dictionary with the same name, it will '
                                    'be overwritten without asking.')
    optional_args.add_argument('-h', '--help', action='help', help='Prints the help message.')
    args = parser.parse_args()
    # Save the arguments
    token_file = args.tkn_file
    output_json = args.output_file
    overwrite = args.overwrite
    # Are the arguments valid?
    check_args(token_file, output_json, overwrite)
    # Now we can Run!
    main(token_file, output_json)