ScoreWalker/scorewalker-utils/KMeans/Tokenizer.py
2025-03-13 00:13:53 -06:00

184 lines
7.1 KiB
Python

# We do all our imports at the top of our program.
import argparse
import json
import os
import collections
import ConsoleUtils
# Give the program a name.
program_name = 'Tokenizer'
# Describe what the program does briefly.
program_description = 'Creates the custom tokenized files for running a distance calculation on.'
# The argument parser for the program (add_help=False: -h is registered manually in __main__).
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
# Error and Warning console values (ANSI escape codes for colored terminal output):
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'
# Banner printed at program start-up (magenta program name over a rule).
program_header = '\033[95m%s\033[0m\n-----------------------' % program_name
# Default decision message.
decision_message = ' Is this okay? (Y/N): '
# Constants: JSON keys used in the dictionary file and token records.
dictionary_tag = 'dictionary'
dict_count_tag = 'tokenCounts'
dict_mapping_tag = 'mapping'
tkn_mapped_tag = 'mappedVal'
file_name_tag = 'name'
file_path_tag = 'path'
file_txt_tag = 'text'
# Loads the un-mapped tokens from all the token files in tkn_files (makes sure a file exists
# before trying to read it).
def load_tokens_from_files(tkn_files):
    """Read every existing file in tkn_files and return {path: [tokens]}.

    tkn_files: iterable of file paths to read.
    Returns a dict mapping each existing path to its token list, produced by
    splitting the whole file text on single spaces (matching how the .tkn
    files were written). Missing files are silently skipped, as before.
    """
    print('{:^70}'.format('Loading un-mapped tokens from files...'))
    result = {}
    tokenized_count = 0
    total_to_tokenize = len(tkn_files)
    for file in tkn_files:
        tokenized_count += 1
        # Report progress for every file (not just existing ones) so the bar
        # still reaches 100% when some listed files are missing.
        ConsoleUtils.print_progress_bar(tokenized_count, total_to_tokenize, 50, 70)
        if os.path.exists(file):
            # 'r' instead of 'r+': we only read, so write permission on the
            # token files is no longer required.
            with open(file, 'r') as reader:
                result[file] = reader.read().split(' ')
    return result
# Gets the mapped tokens for all the files in tkn_files based on the mapping in tkn_maps.
def get_tkns_for_files(tkn_files, tkn_maps):
    """Map every loaded token file through tkn_maps.

    tkn_files: dict of {source .tkn path: raw token list}.
    tkn_maps:  dict of {token: mapped value}.
    Returns {output .lev path: generated token/count string}.
    """
    print('{:^70}'.format('Mapping tokens from files...'))
    total_files = len(tkn_files)
    mapped_output = {}
    for progress, (source_path, tokens) in enumerate(tkn_files.items(), start=1):
        ConsoleUtils.print_progress_bar(progress, total_files, 50, 70)
        # The output file sits beside its input, with a .lev extension.
        mapped_output[source_path.replace('.tkn', '.lev')] = generate_tokens(tokens, tkn_maps)
    return mapped_output
# Generates the mapped token values and counts into a nice string to write to a file. It is important to maintain order
# since part of our calculation is to order the tokens by a rule before calculating the distance.
def generate_tokens(token_file, tkn_maps):
    """Build the two-line '.lev' payload for one token file.

    token_file: ordered list of raw tokens from a .tkn file.
    tkn_maps:   dict of {token: mapped value}.
    Returns a two-line string: line one is each distinct token's mapped
    value, line two its occurrence count, both in first-seen order and with
    every value followed by a single space (including a trailing space on
    each line — the established file format).
    Raises KeyError if a token has no entry in tkn_maps.
    """
    # Count occurrences while preserving first-seen order; order matters
    # because the distance step orders tokens by a rule downstream.
    tkn_count = collections.OrderedDict()
    for word in token_file:
        tkn_count[word] = tkn_count.get(word, 0) + 1
    # str.join over generators instead of repeated '+=': the original loop
    # rebuilt both strings every iteration (quadratic for large files).
    result_tkns = ''.join('%s ' % tkn_maps[key] for key in tkn_count)
    result_cnts = ''.join('%d ' % value for value in tkn_count.values())
    return '%s\n%s' % (result_tkns, result_cnts)
# Loads the dictionary of token mappings, weights, and values into a list of dictionaries.
def load_dict(dict_path):
    """Parse the JSON dictionary file at dict_path.

    Returns a (token counts, token mapping) tuple read from the
    dict_count_tag and dict_mapping_tag entries of the JSON document.
    """
    with open(dict_path) as dict_file:
        dict_json = json.load(dict_file)
    return dict_json[dict_count_tag], dict_json[dict_mapping_tag]
# Loads the tokens files from the file containing a list of token files which are \n delimited.
def load_tkn_files(file):
    """Read the newline-delimited list of .tkn file paths from *file*.

    Each line has its newline removed and any mapped-drive prefix 'Z:\\'
    rewritten to the UNC prefix '\\\\mount_eng\\eng\\' so paths resolve even
    when the drive mapping is absent.
    Returns the list of (possibly rewritten) paths.
    """
    # 'r' instead of 'r+': the list file is only read, never written, so
    # write permission should not be required.
    with open(file, 'r') as tkn_file:
        return [line.replace('\n', '').replace('Z:\\', '\\\\mount_eng\\eng\\')
                for line in tkn_file]
# This is the main function of the program.
def main(dict_file, tkn_file, auto_overwrite):
    """Run the tokenizer end to end.

    dict_file:      path to the JSON dictionary (token counts + mapping).
    tkn_file:       path to the newline-delimited list of .tkn files.
    auto_overwrite: when True, existing output files are overwritten
                    without asking the user.
    """
    # Load the information from the dictionary (total token counts and token mapping)
    tkn_count, tkn_map = load_dict(dict_file)
    tkn_file_list = load_tkn_files(tkn_file)
    # Load the token files from the file with the list of token files
    tkn_files = load_tokens_from_files(tkn_file_list)
    # Get all the tokens mapped and get their counts to save as output.
    output_files = get_tkns_for_files(tkn_files, tkn_map)
    # Write all the output files out!
    print('{:^70}'.format('Saving files...'))
    num_files_written = 0
    num_files_to_write = len(output_files)
    for path, text in output_files.items():
        num_files_written += 1
        # BUG FIX: check for an existing file BEFORE opening it. The old code
        # opened with 'w+' first, which truncated the file, so the overwrite
        # prompt came too late (the data was already gone) and — because
        # open() creates the file — the warning fired for every file.
        if os.path.exists(path) and not auto_overwrite:
            # Ask for permission; yes_or_no() exits the program on 'no', so
            # nothing else is needed here.
            print('\r%s File exists, it will be overwritten.' % yellow_warning)
            yes_or_no(decision_message)
        # 'w' suffices (we never read back); the with-block closes the file,
        # so the old redundant output.close() is gone.
        with open(path, 'w') as output:
            output.write(text)
        ConsoleUtils.print_progress_bar(num_files_written, num_files_to_write, 50, 70)
# Checks the arguments to make sure they are valid.
def check_args(dictionary, tkn_file):
    """Verify that both input files exist.

    Prints an error for each missing file; if anything is missing, the
    parser help is shown and the program exits with status -1.
    """
    has_fatal_error = False
    # Validate both paths with one loop; the label slots into the message.
    for label, path in (('dictionary', dictionary), ('token', tkn_file)):
        if not os.path.exists(path):
            print('%s The passed %s file does not exist (%s)' % (red_error, label, path))
            has_fatal_error = True
    if has_fatal_error:
        parser.print_help()
        print('Exiting...')
        exit(-1)
# Will ask the user to input yes or no and if they input yes the program will continue to execute. If however, they
# input no, the program will exit with status 0. 0 status is used here because there was no error, the user just chose
# to exit rather than continue executing.
def yes_or_no(message):
    """Prompt until the user answers yes or no.

    message: the prompt shown to the user on the first ask.
    Returns normally on 'y'/'yes'; exits with status 0 on 'n'/'no' (status 0
    because stopping was the user's choice, not an error). Any other input
    re-prompts.
    """
    # A loop instead of recursion: repeated invalid input can no longer grow
    # the call stack, and .lower() is computed once per answer.
    while True:
        decision = input(message).lower()
        if decision in ('y', 'yes'):
            return
        if decision in ('n', 'no'):
            exit(0)
        message = ' Invalid input, enter Y(es) or N(o): '
# This is where we call the main method from.
if __name__ == '__main__':
    print(program_header)
    # Split the arguments into required and optional groups for nicer help output.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-d', '--dictionary', required=True,
                               help='The json dictionary file containing the token counts and mappings.')
    required_args.add_argument('-f', '--tkn_file', required=True,
                               help='The txt file containing a list of tkn files in the library.')
    optional_args.add_argument('-w', '--overwrite', required=False, action='store_true',
                               help='If this flag is set, old output of this tool will be automatically overwritten.')
    # The parser was built with add_help=False, so -h/--help is registered
    # here alongside the other optionals.
    optional_args.add_argument('-h', '--help', action='help', help='Prints the help message.')
    args = parser.parse_args()
    # Are the args valid? (check_args exits on failure.)
    check_args(args.dictionary, args.tkn_file)
    # Now we can run!
    main(args.dictionary, args.tkn_file, args.overwrite)