# We do all our imports at the top of our program.
import argparse
import json
import os
import collections

import ConsoleUtils

# Give the program a name.
program_name = 'Tokenizer'
# Describe what the program does briefly.
program_description = 'Creates the custom tokenized files for running a distance calculation on.'
# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)

# Error and Warning console values (ANSI color escapes):
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'
program_header = format('\033[95m%s\033[0m\n'
                        '-----------------------' % program_name)
# Default decision message.
decision_message = ' Is this okay? (Y/N): '

# Constants: keys used in the json dictionary file.
dictionary_tag = 'dictionary'
dict_count_tag = 'tokenCounts'
dict_mapping_tag = 'mapping'
tkn_mapped_tag = 'mappedVal'
file_name_tag = 'name'
file_path_tag = 'path'
file_txt_tag = 'text'


def load_tokens_from_files(tkn_files):
    """Load the un-mapped (raw) tokens from every .tkn file in tkn_files.

    tkn_files: list of paths to token files (space-delimited tokens).
    Returns a dict of {path: list of raw token strings}. Paths that do not
    exist are skipped silently (they simply won't appear in the result).
    """
    print('{:^70}'.format('Loading un-mapped tokens from files...'))
    result = {}
    tokenized_count = 0
    total_to_tokenize = len(tkn_files)
    for file in tkn_files:
        tokenized_count += 1
        # Advance the progress bar for every entry (including missing files)
        # so it always reaches 100%.
        ConsoleUtils.print_progress_bar(tokenized_count, total_to_tokenize, 50, 70)
        if os.path.exists(file):
            # 'r' (not 'r+'): we only read; no write access is needed.
            with open(file, 'r') as reader:
                file_txt = reader.read()
            result[file] = file_txt.split(' ')
    return result


# Gets the mapped tokens for all the files in tkn_files based on the mapping in tkn_maps.
def get_tkns_for_files(tkn_files, tkn_maps):
    """Map the raw tokens of every loaded file through tkn_maps.

    tkn_files: dict of {input .tkn path: list of raw token strings}.
    tkn_maps:  dict mapping raw token -> mapped value.
    Returns a dict of {output .lev path: formatted token/count string}.
    """
    print('{:^70}'.format('Mapping tokens from files...'))
    num_ran = 0
    num_total = len(tkn_files)
    result = {}
    for path, token_file in tkn_files.items():
        num_ran += 1
        # The output file sits next to the input, with a .lev extension.
        save_file = path.replace('.tkn', '.lev')
        ConsoleUtils.print_progress_bar(num_ran, num_total, 50, 70)
        result[save_file] = generate_tokens(token_file, tkn_maps)
    return result


def generate_tokens(token_file, tkn_maps):
    """Build the two-line output string: mapped tokens, then their counts.

    It is important to maintain first-seen order, since part of the distance
    calculation orders the tokens by a rule before computing the distance.
    Raises KeyError if a token is missing from tkn_maps.
    """
    # OrderedDict preserves the original first-occurrence order of tokens.
    tkn_count = collections.OrderedDict()
    for word in token_file:
        tkn_count[word] = tkn_count.get(word, 0) + 1
    # join() instead of repeated += (linear, not quadratic). Each entry keeps
    # its trailing space to reproduce the original on-disk format exactly.
    result_tkns = ''.join('%s ' % tkn_maps[key] for key in tkn_count)
    result_cnts = ''.join('%d ' % value for value in tkn_count.values())
    return format('%s\n%s' % (result_tkns, result_cnts))


def load_dict(dict_path):
    """Load the token counts and the token mapping from the json dictionary file."""
    with open(dict_path) as dict_file:
        dict_json = json.load(dict_file)
    tokens_count = dict_json[dict_count_tag]
    tokens_map = dict_json[dict_mapping_tag]
    return tokens_count, tokens_map


def load_tkn_files(file):
    """Load the \n-delimited list of token file paths from *file*.

    Each line has its newline stripped and any 'Z:\\' drive prefix rewritten
    to the '\\\\mount_eng\\eng\\' UNC path.
    """
    # 'r' (not 'r+'): we only read the list; no write access is needed.
    with open(file, 'r') as tkn_file:
        lines = tkn_file.readlines()
    return [line.replace('\n', '').replace('Z:\\', '\\\\mount_eng\\eng\\')
            for line in lines]


# This is the main function of the program.
def main(dict_file, tkn_file, auto_overwrite):
    """Run the tokenizer end to end: load the dictionary and token-file list,
    map every file's tokens, and write the .lev output files.

    auto_overwrite: when True, existing output files are overwritten without
    asking; otherwise the user is prompted (answering 'no' exits the program).
    """
    # Load the information from the dictionary (total token counts and token mapping)
    tkn_count, tkn_map = load_dict(dict_file)
    # Load the token files from the file with the list of token files
    tkn_file_list = load_tkn_files(tkn_file)
    tkn_files = load_tokens_from_files(tkn_file_list)
    # Get all the tokens mapped and get their counts to save as output.
    output_files = get_tkns_for_files(tkn_files, tkn_map)
    # Write all the output files out!
    print('{:^70}'.format('Saving files...'))
    num_files_written = 0
    num_files_to_write = len(output_files)
    for path, text in output_files.items():
        num_files_written += 1
        # Check for an existing file BEFORE opening it: open(path, 'w+')
        # truncates the file, which both destroyed the old contents before
        # the user was asked and made the exists() test always succeed.
        if os.path.exists(path) and not auto_overwrite:
            print('\r%s File exists, it will be overwritten.' % yellow_warning)
            # We don't need anything else because if the user says 'no' the
            # program will exit inside yes_or_no().
            yes_or_no(decision_message)
        with open(path, 'w') as output:
            output.write(text)
        ConsoleUtils.print_progress_bar(num_files_written, num_files_to_write, 50, 70)


def check_args(dictionary, tkn_file):
    """Validate the two input paths; on any missing file print the errors,
    show the parser help, and exit with status -1."""
    fatal_errors = False
    if not os.path.exists(dictionary):
        print('%s The passed dictionary file does not exist (%s)' % (red_error, dictionary))
        fatal_errors = True
    if not os.path.exists(tkn_file):
        print('%s The passed token file does not exist (%s)' % (red_error, tkn_file))
        fatal_errors = True
    if fatal_errors:
        parser.print_help()
        print('Exiting...')
        exit(-1)


# Will ask the user to input yes or no and if they input yes the program will continue to execute. If however, they
# input no, the program will exit with status 0. 0 status is used here because there was no error, the user just chose
# to exit rather than continue executing.
def yes_or_no(message):
    """Prompt until the user answers yes or no.

    'y'/'yes' (any case) returns normally so the caller continues;
    'n'/'no' exits with status 0 (not an error -- the user chose to stop).
    Anything else re-prompts with an explanatory message.
    """
    # A loop instead of recursion: repeated invalid input can no longer blow
    # the recursion limit, and .lower() is computed once per attempt.
    while True:
        decision = input(message).lower()
        if decision in ('y', 'yes'):
            return
        if decision in ('n', 'no'):
            exit(0)
        message = ' Invalid input, enter Y(es) or N(o): '


# This is where we call the main method from.
if __name__ == '__main__':
    print(program_header)
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-d', '--dictionary', required=True,
                               help='The json dictionary file containing the token counts and mappings.')
    required_args.add_argument('-f', '--tkn_file', required=True,
                               help='The txt file containing a list of tkn files in the library.')
    optional_args.add_argument('-w', '--overwrite', required=False, action='store_true',
                               help='If this flag is set, old output of this tool will be automatically overwritten.')
    optional_args.add_argument('-h', '--help', action='help',
                               help='Prints the help message.')
    args = parser.parse_args()
    # Get the args!
    lib_dict = args.dictionary
    token_file = args.tkn_file
    auto_overwrite = args.overwrite
    # Are the args valid?
    check_args(lib_dict, token_file)
    # Now we can run!
    main(lib_dict, token_file, auto_overwrite)