"""
|
|
Dictionary.py
|
|
=============
|
|
|
|
This is a tool for generating a dictionary of tokens in a given list of file. It takes for input a file containing a
|
|
list of the files containing the term tokens and a path to a file to write. Every file in this list is read and used
|
|
to generate two dicts which are written to the passed output file. The first of these is a list of each token and
|
|
it's associated integer 'map'. The second list is each token as well as the number of occurrences across the files
|
|
read.
|
|
|
|
.. moduleauthor:: Chris Diesch <cdiesch@sequencelogic.net>
|
|
"""

# We do all our imports at the top of our program.
import argparse
import json
import os

import ConsoleUtils

# Give the program a name.
program_name = 'Dictionary'
# Describe what the program does briefly.
program_description = 'Creates and loads the dictionary for k-means clustering.'
# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
# Error and warning console values:
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'
program_header = ('\033[95m%s\033[0m\n'
                  '-----------------------' % program_name)
# Default decision message.
decision_message = ' Is this okay? (Y/N): '
# Constants
library_name = 'False Pos Lib'
dictionary_tag = 'dictionary'
dict_tokens_tag = 'tokens'
dict_token_tag = 'token'
tkn_value_tag = 'value'
tkn_weight_tag = 'weight'
tkn_mapped_tag = 'mapping'
token_count_tag = 'tokenCounts'


# TODO: Tokenize phrases which are important.
# Loads all the tokens from the files. Returns a dict with token values as keys and the count of the tokens as
# values, and a dict with token values as keys and the mapped value of the token as values.
def load_tokens_from_files(files):
    """
    Loads the tokens from every file in the given list of files. This function loads the tokens into a dictionary and
    guarantees every token's uniqueness. It also maps each token to an associated integer value and counts the number
    of token occurrences.

    :param files: The list of files to load the tokens from.
    :type files: list of str

    .. raw:: html

        <br>

    :return: A dict with token values as keys and their counts as values, and a dict with the token values as keys
        and their mapped integers as values.
    :rtype: tuple(dict[str,int], dict[str,int])
    """
    print('{:^70}'.format('Processing token files...'))
    map_val = 0
    tkns_map = {}
    tkns_count = {}
    num_ran = 0
    num_to_run = len(files)
    for file in files:
        num_ran += 1
        ConsoleUtils.print_progress_bar(num_ran, num_to_run, 50, 70)
        if os.path.exists(file):
            tkns = load_tkns_from_file(file)
            # We may want more than just the tokens at some point...
            for tkn in tkns:
                s = str(tkn)
                if s not in tkns_map:
                    # Add the token to the maps and counts...
                    tkns_map[s] = map_val
                    map_val += 1
                    tkns_count[s] = 1
                else:
                    # Update the token count.
                    tkns_count[s] += 1
    return tkns_count, tkns_map
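
# A quick illustration of load_tokens_from_files() (the token values are hypothetical): if the files together
# contain the tokens 'foo bar foo', the function returns
#     tkns_count == {'foo': 2, 'bar': 1}    and    tkns_map == {'foo': 0, 'bar': 1}
# where tokens are mapped to integers in the order they are first seen.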


# Loads the un-mapped tokens from the tkn file.
def load_tkns_from_file(file_name):
    """
    A helper method for load_tokens_from_files; this function loads all the tokens from a single file into a list.

    :param file_name: The path to the file to load.
    :type file_name: str

    .. raw:: html

        <br>

    :return: A list of every token from the file (not unique, not mapped).
    :rtype: list of str
    """
    with open(file_name) as reader:
        token_json = json.load(reader)
        ordered_tokens = token_json['ordered']
        tokens = ordered_tokens.split(' ')
    return tokens
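
# For reference, each '.tkn' file is expected to be JSON with an 'ordered' key holding a single space-delimited
# string of tokens, for example (contents are purely illustrative):
#
#     {"ordered": "invoice total amount due"}
#
# which load_tkns_from_file() would return as ['invoice', 'total', 'amount', 'due'].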


# Writes a json file out with all the dictionary metadata.
def write_json(lib_name, tokens_map, tokens_count, dict_file):
    """
    Writes the JSON containing the dictionary values (token mappings and counts).

    :param lib_name: The name of the library (or sub-set) the dictionary is being created for.
    :type lib_name: str
    :param tokens_map: A dict which has token values as keys, and their integer mapping as values.
    :type tokens_map: dict[str,int]
    :param tokens_count: A dict which has token values as keys and their counts as values.
    :type tokens_count: dict[str,int]
    :param dict_file: The path to the JSON 'dictionary' file to write.
    :type dict_file: str

    .. raw:: html

        <br>

    :return: None.
    """
    final_dict = {dictionary_tag: lib_name, tkn_mapped_tag: tokens_map, token_count_tag: tokens_count}
    with open(dict_file, 'w+') as dict_writer:
        dict_writer.write(json.dumps(final_dict, indent=1))
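
# The resulting 'dictionary' JSON looks roughly like this (the keys come from the tag constants above; the values
# shown are hypothetical):
#
#     {
#      "dictionary": "false_pos",
#      "mapping": {"foo": 0, "bar": 1},
#      "tokenCounts": {"foo": 2, "bar": 1}
#     }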


# Loads the token files from the txt file containing the list of token files delimited by newlines.
def load_tkn_files(file):
    """
    This function reads in the file containing the list of token files to read.

    :param file: The path to the 'master file list' file.
    :type file: str

    .. raw:: html

        <br>

    :return: A list of '.tkn' files which have been scrubbed as necessary.
    :rtype: list of str
    """
    with open(file, 'r+') as tkn_file:
        result = tkn_file.readlines()
        for i in range(len(result)):
            # This line changes depending on what machine you're on...
            result[i] = result[i].replace('Z:\\', '\\\\mount_eng\\eng\\')
            result[i] = result[i].replace('\n', '')

    return result
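
# The 'master file list' passed to load_tkn_files() is a plain text file with one token-file path per line, for
# example (the paths are hypothetical):
#
#     Z:\libraries\false_pos\doc_0001.tkn
#     Z:\libraries\false_pos\doc_0002.tkn
#
# Any 'Z:\' drive prefix is rewritten to the '\\mount_eng\eng\' mount before the files are read.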


# This is the main function of the program.
def main(token_file, output_file):
    """
    The main entry point of the tool.

    :param token_file: The file containing the list of '.tkn' files to run against.
    :type token_file: str
    :param output_file: The path to write the JSON 'dictionary' file to.
    :type output_file: str

    .. raw:: html

        <br>

    :return: None.
    """
    library_dir, file_name = os.path.split(token_file)
    lib_dirs, lib_name = os.path.split(library_dir)

    tkn_files = load_tkn_files(token_file)
    tkns_count, tkns_map = load_tokens_from_files(tkn_files)
    num_processed = len(tkn_files)
    num_unique_tkns = len(tkns_map)
    print('{:^70}'.format('Processed %d files. Found %d unique tokens' % (num_processed, num_unique_tkns)))
    print('{:^70}'.format('Saving library to json file...'))
    write_json(lib_name, tkns_map, tkns_count, output_file)
    print('{:^70}'.format('Done!'))
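
# Note that the library name written into the JSON is taken from the directory holding the token list file: for a
# hypothetical token_file of '\\mount_eng\eng\libraries\false_pos\tkn_file_list.txt', the two os.path.split() calls
# in main() yield lib_name == 'false_pos'.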


# Makes sure the passed arguments are valid.
def check_args(tkn_file, output_json, auto_overwrite):
    """
    Makes sure arguments are valid before running the program.

    :param tkn_file: The path to the file containing a list of '.tkn' files to run over.
    :type tkn_file: str
    :param output_json: The location of the output JSON 'dictionary' file.
    :type output_json: str
    :param auto_overwrite: Whether or not to overwrite an existing dictionary without asking for permission.
    :type auto_overwrite: bool

    .. raw:: html

        <br>

    :return: None.
    """
    fatal_errors = False
    # Do the checks for the required arguments second; the output reads better that way.
    if os.path.exists(output_json) and not auto_overwrite:
        print('%s The old dictionary will be overwritten.' % yellow_warning)
        yes_or_no(decision_message)

    if not os.path.exists(tkn_file):
        print('%s The token file provided does not exist (%s).' % (red_error, tkn_file))
        fatal_errors = True

    if fatal_errors:
        parser.print_help()
        print('Exiting...')
        exit(0)


# Asks the user to input yes or no. If they input yes, the program continues to execute; if they input no, the
# program exits with status 0. Status 0 is used here because there was no error; the user simply chose to exit
# rather than continue executing.
def yes_or_no(message):
    """
    Prompts the user with the given message and accepts 'y', 'yes', 'n', or 'no' as input, ignoring case.
    The program will exit (with status 0) if the user enters no and will continue if the user enters yes.

    .. note:: The user will be prompted to re-enter input if it is not valid, with the following message:
        'Invalid input, enter Y(es) or N(o):' (this message will be printed until valid input is provided).

    :param message: The message to display to the user.
    :type message: str

    .. raw:: html

        <br>

    :return: None.
    """
    decision = input(message).lower()
    if decision == 'y' or decision == 'yes':
        return
    elif decision == 'n' or decision == 'no':
        exit(0)
    else:
        yes_or_no(' Invalid input, enter Y(es) or N(o): ')


# This is where we call the main method from.
if __name__ == '__main__':
    print(program_header)
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-f', '--tkn_file', required=True,
                               help='The txt file containing a list of token files in the library.')
    required_args.add_argument('-o', '--output_file', required=True,
                               help='The full path of the json file to save the dictionary in.')
    optional_args.add_argument('-w', '--overwrite', required=False, action='store_true',
                               help='If this flag is set and an old dictionary with the same name exists, it will '
                                    'be overwritten without asking.')
    optional_args.add_argument('-h', '--help', action='help', help='Prints the help message.')
    args = parser.parse_args()
    # Save the arguments.
    token_file = args.tkn_file
    output_json = args.output_file
    overwrite = args.overwrite
    # Are the arguments valid?
    check_args(token_file, output_json, overwrite)
    # Now we can run!
    main(token_file, output_json)