# We do all our imports at the top of our program.
import argparse
import os
import json
import sys
import time

import ConsoleUtils


# Give the program a name.
program_name = 'PhraseCountLoader'

# Describe what the program does briefly.
program_description = 'Loads and converts ".phrasecount" files for use with WalkerIndexer.'

# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)

# Route all print() output through the ConsoleUtils printer.
printer = ConsoleUtils.SLPrinter(program_name)
sys.stdout = printer

build_date = '2017/11/20'
program_version = '1.0.2'
prog_auth = 'Chris Diesch <cdiesch@sequencelogic.net>'

# Program Constants
phrase_start = ' '
phrase_split = ' occurs '
count_end = ' times.'
old_delimiter = '.'
new_delimiter = '_'

# Default values
default_max_phrase_doctypes = 1
default_min_phrase_len = 5
min_sub_phrase_len = 2

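# Note (assumed from the parsing in load_phrasecount_file below, not from a format
# specification): an input line is expected to look roughly like
#     "<phrase_start>machine.learning.model occurs 12 times."
# which becomes the phrase key "machine_learning_model" with a count of 12.
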
def get_phrase_length(phrase):
    # A phrase's length is its number of terms, i.e. the delimiter count plus one.
    return phrase.count(new_delimiter) + 1


def make_filter(min_length):
    # Build a predicate that accepts phrases with at least min_length terms.
    def long_enough(phrase):
        return get_phrase_length(phrase) >= min_length
    return long_enough


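# Illustrative use (hypothetical values): make_filter(3) returns a predicate for which
# 'alpha_beta_gamma' (three terms) passes and 'alpha_beta' (two terms) does not.
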
def load_phrases(phrase_path_root):
    # Load every ".phrasecount" file under phrase_path_root, keyed by doctype name.
    phrase_count_by_doctype = {}
    master_set = set()
    sub_phrase_set = set()
    files = os.listdir(phrase_path_root)

    total_files = len(files)
    file_num = 0

    start_time = time.time()
    print('Analyzing %d files' % total_files)
    for pc_file in files:
        file_num += 1
        # print('%03d/%d: Running on file: %s' % (file_num, total_files, pc_file))
        if not pc_file.endswith('.phrasecount'):
            continue

        file_path = os.path.join(phrase_path_root, pc_file)
        split_idx = pc_file.rfind('.')
        doctype_name = pc_file[:split_idx]

        doctype_set, counts, sub_phrase_set = load_phrasecount_file(file_path, sub_phrase_set)
        # Save it to the master set (sets guarantee uniqueness)
        master_set = master_set.union(doctype_set) - sub_phrase_set

        phrase_count_by_doctype[doctype_name] = counts

    run_time = time.time() - start_time
    rate = len(files) / run_time
    print('Analyzed %d files in %.4f s (%.1f files/s)' % (len(files), run_time, rate))
    return phrase_count_by_doctype, master_set


def load_phrasecount_file(file_path, sub_phrases):
    # Parse a single ".phrasecount" file, returning the set of phrases it contains,
    # their counts, and the updated set of sub-phrases to exclude later.
    result = set()
    phrase_counts = {}
    with open(file_path) as pc_reader:
        lines = pc_reader.readlines()

    for line in lines:
        if line.startswith(phrase_start) and '=' not in line:
            line = line.replace(old_delimiter, new_delimiter).replace('\n', '')[len(phrase_start):-len(count_end)]
            line_data = line.split(phrase_split)

            phrase = line_data[0]
            count = int(line_data[1])

            phrase_terms = phrase.split('_')

            phrase_len = len(phrase_terms)
            num_different_terms = len(set(phrase_terms))

            if phrase_len > 2 >= num_different_terms:
                print('Not including phrase for having too few different terms: %s' % phrase)
                continue

            result.add(phrase)
            phrase_counts[phrase] = count
            # Add the new sub phrases
            tmp_sub_phrases = make_sub_phrases(phrase)
            sub_phrases = sub_phrases.union(tmp_sub_phrases)

    return result, phrase_counts, sub_phrases


# TODO: Add filtering to the middle (currently only removes from front or back, not both).
def make_sub_phrases(phrase):
    # Generate the shorter leading and trailing sub-phrases of a phrase so they can
    # be excluded from the master set.
    result = set()

    remove_back_phrase = phrase
    remove_front_phrase = phrase

    # It's not a phrase if it is less than 2 tokens long...
    long_enough = make_filter(min_sub_phrase_len)

    while long_enough(remove_front_phrase) and long_enough(remove_back_phrase):
        # Drop the last term to get the next back-trimmed sub-phrase.
        split_idx = remove_back_phrase.rfind(new_delimiter)
        remove_back_phrase = remove_back_phrase[:split_idx]
        result.add(remove_back_phrase)

        # Drop the first term to get the next front-trimmed sub-phrase.
        split_idx = remove_front_phrase.find(new_delimiter)
        remove_front_phrase = remove_front_phrase[split_idx + 1:]
        result.add(remove_front_phrase)

    return result


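# Illustrative sketch (assuming min_sub_phrase_len of 2, the module default):
# make_sub_phrases is intended to yield the shorter leading and trailing cuts of a
# phrase, e.g. for 'a_b_c_d' sub-phrases such as 'a_b_c', 'a_b', 'b_c_d' and 'c_d'.
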
def filter_phrases_by_occurrences(phrase_data, phrase_set, max_occurrences):
    # Keep only the phrases that appear in at most max_occurrences doctypes.
    result_set = set()

    doc_counts = phrase_data['numDoctypes']

    for phrase in phrase_set:
        if doc_counts[phrase] <= max_occurrences:
            result_set.add(phrase)

    return result_set


def save_phrases_file(phrases, out_file_path):
    # Write the filtered phrases to the output file, one phrase per line.
    print('Saving file %s' % out_file_path)

    with open(out_file_path, 'w+') as writer:
        for phrase in phrases:
            writer.write('%s\n' % phrase)


def save_phrase_metadata(phrase_metadata, phrases, save_loc):
    # Write a JSON file with each phrase's total count, doctype count, and doctype list.
    print('Saving metadata file to %s' % save_loc)

    out_data = {}
    counts = phrase_metadata['counts']
    doctypes = phrase_metadata['doctypes']
    doctype_counts = phrase_metadata['numDoctypes']

    for phrase in phrases:
        out_data[phrase] = {'numOccurrences': counts[phrase],
                            'numDoctypes': doctype_counts[phrase],
                            'doctypes': doctypes[phrase]}

    with open(save_loc, 'w+') as writer:
        json.dump(out_data, writer, indent=3)


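# Illustrative metadata entry written by save_phrase_metadata (hypothetical phrase,
# counts, and doctype name):
#     "machine_learning_model": {"numOccurrences": 12, "numDoctypes": 1, "doctypes": ["contracts"]}
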
def get_phrase_counts_over_doctypes(counts_by_doctype, master_set):
    # For each phrase, total its occurrences across doctypes and record which
    # doctypes it appears in.
    total_counts = {}
    num_doctypes = {}
    phrase_doctypes = {}
    cur_phrase_num = 1
    total_phrases = len(master_set)
    print('Analyzing %d phrases...' % total_phrases)
    for phrase in master_set:
        total_counts[phrase] = 0
        num_doctypes[phrase] = 0
        doctypes = []

        for doctype in counts_by_doctype:
            doc_counts = counts_by_doctype[doctype]

            if phrase in doc_counts:
                total_counts[phrase] += doc_counts[phrase]
                num_doctypes[phrase] += 1
                doctypes.append(doctype)

        phrase_doctypes[phrase] = doctypes
        cur_phrase_num += 1

    return {'counts': total_counts, 'numDoctypes': num_doctypes, 'doctypes': phrase_doctypes}


def show_args(phrases_root, out_file, min_phrase_len):
    # Display the arguments the program is running with.
    print('Loading ".phrasecount" files from: %s' % phrases_root)
    print('Saving results to: %s' % out_file)
    print('Minimum Phrase Length: %d' % min_phrase_len)


def check_args(phrases_folder, output_file, min_phrase_len):
    # Validate the arguments: warn on recoverable problems, exit on fatal ones.
    fatal_errors = False

    if os.path.exists(output_file):
        print('Warning: The file at %s will be overwritten.' % output_file)

    if min_phrase_len < 2:
        print('Warning: Minimum Phrase Length must be >= 2. (%d)' % min_phrase_len)
        print(' OK: Using default value of %d' % default_min_phrase_len)

    if not os.path.exists(phrases_folder):
        print('Error: The folder at %s does not exist.' % phrases_folder)
        fatal_errors = True

    if fatal_errors:
        print('Encountered Fatal Error, exiting...')
        parser.print_help()
        exit(-1)


# This is the main function of the program.
def main(phrases_root, out_file, max_doctypes):
    folder, file = os.path.split(out_file)
    metadata_out = os.path.join(folder, 'phrase-metadata.json')

    doctype_phrase_counts, all_phrases = load_phrases(phrases_root)

    num_phrases = len(all_phrases)
    print('Loaded %d phrases from %s' % (num_phrases, phrases_root))

    data = get_phrase_counts_over_doctypes(doctype_phrase_counts, all_phrases)

    filter_start = time.time()
    filtered_phrases = filter_phrases_by_occurrences(data, all_phrases, max_doctypes)
    filter_run = time.time() - filter_start

    num_filtered = len(filtered_phrases)
    try:
        rate = num_phrases / filter_run
    except ZeroDivisionError:
        rate = 0
    print('Analyzed %d phrases in %.4f s (%.1f phrase/s)' % (num_phrases, filter_run, rate))
    print(' %d/%d phrases occur in %d doctypes or fewer' % (num_filtered, num_phrases, max_doctypes))
    # print('Filtered %d phrases to %d phrases which occur in %d doctypes or fewer.' %
    #       (num_phrases, num_filtered, max_doctypes))

    save_phrases_file(filtered_phrases, out_file)

    save_phrase_metadata(data, filtered_phrases, metadata_out)


# This is where we call the main method from.
if __name__ == '__main__':
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, prog_auth))
    # Set up arguments here.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-i', '--in_dir', required=True,
                               help='The path to the folder containing the ".phrasecount" files.')
    required_args.add_argument('-o', '--dest_file', required=True, help='The path to put the output of this tool.')

    optional_args.add_argument('-m', '--min_length', required=False, type=int, default=default_min_phrase_len,
                               help='The minimum number of terms considered a phrase.')
    optional_args.add_argument('-d', '--doctype_count', required=False, type=int, default=default_max_phrase_doctypes,
                               help='The maximum number of doctypes a phrase is allowed to occur in.')
    optional_args.add_argument('-h', '--help', action="help", help='Prints the help message.')

    # Parse the arguments; we don't need to check for required arguments, since the ArgumentParser class does that.
    args = parser.parse_args()

    # Get the arguments.
    phrasecount_dir = args.in_dir
    out_path = args.dest_file
    min_len = args.min_length
    min_sub_phrase_len = min_len - 1
    max_doctype_occurrences = args.doctype_count
    # Display the arguments
    show_args(phrasecount_dir, out_path, min_len)
    check_args(phrasecount_dir, out_path, min_len)

    # Now we can run...
    try:
        main(phrasecount_dir, out_path, max_doctype_occurrences)
    except Exception as ex:
        printer.write_line_break(break_char=' ')
        print('Encountered Error: %s' % type(ex).__name__)
        print(' Message: %s' % str(ex))
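
# Example invocation (hypothetical paths; the script file name is assumed):
#   python PhraseCountLoader.py -i ./phrasecounts -o ./output/phrases.txt -m 5 -d 1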