# Sleds/scorewalker-utils/PhraseCountLoader/PhraseCountLoader.py
# We do all our imports at the top of our program.
import argparse
import os
import json
import sys
import time
import ConsoleUtils
# Give the program a name.
program_name = 'PhraseCountLoader'
# Describe what the program does briefly.
program_description = 'Loads and converts ".phrasecount" files for use with WalkerIndexer.'
# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
printer = ConsoleUtils.SLPrinter(program_name)
sys.stdout = printer
build_date = '2017/11/20'
program_version = '1.0.2'
prog_auth = 'Chris Diesch <cdiesch@sequencelogic.net>'
# Program Constants
phrase_start = ' '
phrase_split = ' occurs '
count_end = ' times.'
old_delimiter = '.'
new_delimiter = '_'
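# Based on the parsing constants above, input lines presumably look like
# "<phrase_start><phrase> occurs <count> times.", with '.' separating the
# terms of a phrase; the loader rewrites that delimiter to '_'.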
# Default values
default_max_phrase_doctypes = 1
default_min_phrase_len = 5
min_sub_phrase_len = 2


def get_phrase_length(phrase):
    return phrase.count(new_delimiter) + 1


def make_filter(min_length):
    def long_enough(phrase):
        return get_phrase_length(phrase) >= min_length
    return long_enough
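
# For example, make_filter(3) returns a predicate that accepts 'a_b_c' (three
# terms) but rejects 'a_b' (two terms).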


def load_phrases(phrase_path_root):
    phrase_count_by_doctype = {}
    master_set = set()
    sub_phrase_set = set()
    files = os.listdir(phrase_path_root)
    total_files = len(files)
    file_num = 0
    start_time = time.time()
    print('Analyzing %d files' % total_files)
    for pc_file in files:
        file_num += 1
        # print('%03d/%d: Running on file: %s' % (file_num, total_files, pc_file))
        if not pc_file.endswith('.phrasecount'):
            continue
        file_path = os.path.join(phrase_path_root, pc_file)
        # The doctype name is the file name minus its ".phrasecount" extension.
        split_idx = pc_file.rfind('.')
        doctype_name = pc_file[:split_idx]
        doctype_set, counts, sub_phrase_set = load_phrasecount_file(file_path, sub_phrase_set)
        # Save it to the master set (sets guarantee uniqueness), dropping phrases
        # that are merely sub-phrases of longer ones.
        master_set = master_set.union(doctype_set) - sub_phrase_set
        phrase_count_by_doctype[doctype_name] = counts
    run_time = time.time() - start_time
    # Guard against a zero run time (e.g. an empty folder).
    rate = total_files / run_time if run_time > 0 else 0
    print('Analyzed %d files in %.4f s (%.1f files/s)' % (total_files, run_time, rate))
    return phrase_count_by_doctype, master_set


def load_phrasecount_file(file_path, sub_phrases):
    result = set()
    phrase_counts = {}
    with open(file_path) as pc_reader:
        lines = pc_reader.readlines()
        for line in lines:
            if line.startswith(phrase_start) and '=' not in line:
                line = line.replace(old_delimiter, new_delimiter).replace('\n', '')[len(phrase_start):-len(count_end)]
                line_data = line.split(phrase_split)
                phrase = line_data[0]
                count = int(line_data[1])
                phrase_terms = phrase.split('_')
                phrase_len = len(phrase_terms)
                num_different_terms = len(set(phrase_terms))
                # Skip phrases longer than two terms that are built from at most
                # two distinct terms (e.g. 'a_b_a_b').
                if phrase_len > 2 >= num_different_terms:
                    print('Not including phrase for having too few different terms: %s' % phrase)
                    continue
                result.add(phrase)
                phrase_counts[phrase] = count
                # Add the new sub phrases
                tmp_sub_phrases = make_sub_phrases(phrase)
                sub_phrases = sub_phrases.union(tmp_sub_phrases)
    return result, phrase_counts, sub_phrases
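
# For example, assuming phrase_start is the single space defined above, the line
# " foo.bar occurs 12 times." yields the phrase 'foo_bar' with a count of 12.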


# TODO: Add filtering to the middle (currently only removes from front or back, not both).
def make_sub_phrases(phrase):
    result = set()
    remove_back_phrase = phrase
    remove_front_phrase = phrase
    # It's not a phrase if it is less than 2 tokens long...
    long_enough = make_filter(min_sub_phrase_len)
    while True:
        # Drop the last term from one copy and the first term from the other.
        remove_back_phrase = remove_back_phrase[:remove_back_phrase.rfind(new_delimiter)]
        remove_front_phrase = remove_front_phrase[remove_front_phrase.find(new_delimiter) + 1:]
        if not (long_enough(remove_back_phrase) and long_enough(remove_front_phrase)):
            break
        result.add(remove_back_phrase)
        result.add(remove_front_phrase)
    return result
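
# For example, with min_sub_phrase_len = 2, make_sub_phrases('a_b_c_d') yields
# {'a_b_c', 'b_c_d', 'a_b', 'c_d'}: each pass trims one term off the back of
# one copy and one term off the front of the other.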


def filter_phrases_by_occurrences(phrase_data, phrase_set, max_occurrences):
    result_set = set()
    doc_counts = phrase_data['numDoctypes']
    for phrase in phrase_set:
        # Keep a phrase only if it occurs in at most max_occurrences doctypes.
        if doc_counts[phrase] <= max_occurrences:
            result_set.add(phrase)
    return result_set


def save_phrases_file(phrases, out_file_path):
    print('Saving file %s' % out_file_path)
    with open(out_file_path, 'w+') as writer:
        for phrase in phrases:
            writer.write('%s\n' % phrase)


def save_phrase_metadata(phrase_metadata, phrases, save_loc):
    print('Saving metadata file to %s' % save_loc)
    out_data = {}
    counts = phrase_metadata['counts']
    doctypes = phrase_metadata['doctypes']
    doctype_counts = phrase_metadata['numDoctypes']
    for phrase in phrases:
        out_data[phrase] = {'numOccurrences': counts[phrase],
                            'numDoctypes': doctype_counts[phrase],
                            'doctypes': doctypes[phrase]}
    with open(save_loc, 'w+') as writer:
        json.dump(out_data, writer, indent=3)
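
# The metadata file maps each saved phrase to its statistics; with illustrative
# values it looks like:
#   {"foo_bar": {"numOccurrences": 12, "numDoctypes": 1, "doctypes": ["invoice"]}}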


def get_phrase_counts_over_doctypes(counts_by_doctype, master_set):
    total_counts = {}
    num_doctypes = {}
    phrase_doctypes = {}
    total_phrases = len(master_set)
    print('Analyzing %d phrases...' % total_phrases)
    for phrase in master_set:
        total_counts[phrase] = 0
        num_doctypes[phrase] = 0
        doctypes = []
        for doctype in counts_by_doctype:
            doc_counts = counts_by_doctype[doctype]
            if phrase in doc_counts:
                total_counts[phrase] += doc_counts[phrase]
                num_doctypes[phrase] += 1
                doctypes.append(doctype)
        phrase_doctypes[phrase] = doctypes
    return {'counts': total_counts, 'numDoctypes': num_doctypes, 'doctypes': phrase_doctypes}
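
# Sketch of the returned structure for a hypothetical two-doctype input:
#   counts_by_doctype = {'invoice': {'foo_bar': 3}, 'receipt': {'foo_bar': 2}}
#   get_phrase_counts_over_doctypes(counts_by_doctype, {'foo_bar'})
#   -> {'counts': {'foo_bar': 5}, 'numDoctypes': {'foo_bar': 2},
#       'doctypes': {'foo_bar': ['invoice', 'receipt']}}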


def show_args(phrases_root, out_file, min_phrase_len):
    print('Loading ".phrasecount" files from: %s' % phrases_root)
    print('Saving results to: %s' % out_file)
    print('Minimum Phrase Length: %d' % min_phrase_len)


def check_args(phrases_folder, output_file, min_phrase_len):
    fatal_errors = False
    if os.path.exists(output_file):
        print('Warning: The file at %s will be overwritten.' % output_file)
    if min_phrase_len < 2:
        print('Warning: Minimum Phrase Length must be >= 2. (%d)' % min_phrase_len)
        print(' OK: Using default value of %d' % default_min_phrase_len)
        # Actually apply the fallback announced above.
        min_phrase_len = default_min_phrase_len
    if not os.path.exists(phrases_folder):
        print('Error: The folder at %s does not exist.' % phrases_folder)
        fatal_errors = True
    if fatal_errors:
        print('Encountered Fatal Error, exiting...')
        parser.print_help()
        sys.exit(-1)
    return min_phrase_len


# This is the main function of the program.
def main(phrases_root, out_file, max_doctypes):
    folder, _ = os.path.split(out_file)
    metadata_out = os.path.join(folder, 'phrase-metadata.json')
    doctype_phrase_counts, all_phrases = load_phrases(phrases_root)
    num_phrases = len(all_phrases)
    print('Loaded %d phrases from %s' % (num_phrases, phrases_root))
    data = get_phrase_counts_over_doctypes(doctype_phrase_counts, all_phrases)
    filter_start = time.time()
    filtered_phrases = filter_phrases_by_occurrences(data, all_phrases, max_doctypes)
    filter_run = time.time() - filter_start
    num_filtered = len(filtered_phrases)
    try:
        rate = num_phrases / filter_run
    except ZeroDivisionError:
        rate = 0
    print('Analyzed %d phrases in %.4f s (%.1f phrase/s)' % (num_phrases, filter_run, rate))
    print(' %d/%d phrases occur in %d doctypes or fewer' % (num_filtered, num_phrases, max_doctypes))
    # print('Filtered %d phrases to %d phrases which occur in %d doctypes or fewer.' %
    #       (num_phrases, num_filtered, max_doctypes))
    save_phrases_file(filtered_phrases, out_file)
    save_phrase_metadata(data, filtered_phrases, metadata_out)


# This is where we call the main method from.
if __name__ == '__main__':
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, prog_auth))
    # Set up arguments here.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-i', '--in_dir', required=True,
                               help='The path to the folder containing the ".phrasecount" files.')
    required_args.add_argument('-o', '--dest_file', required=True, help='The path to put the output of this tool.')
    optional_args.add_argument('-m', '--min_length', required=False, type=int, default=default_min_phrase_len,
                               help='The minimum number of terms considered a phrase.')
    optional_args.add_argument('-d', '--doctype_count', required=False, type=int, default=default_max_phrase_doctypes,
                               help='The maximum number of doctypes a phrase is allowed to occur in.')
    optional_args.add_argument('-h', '--help', action='help', help='Prints the help message.')
    # Get the arguments; ArgumentParser already checks that the required ones are present.
    args = parser.parse_args()
    phrasecount_dir = args.in_dir
    out_path = args.dest_file
    min_len = args.min_length
    max_doctype_occurrences = args.doctype_count
    # Display and validate the arguments. check_args may fall back to the
    # default minimum phrase length, so capture its return value.
    show_args(phrasecount_dir, out_path, min_len)
    min_len = check_args(phrasecount_dir, out_path, min_len)
    # A sub-phrase may be one term shorter than a full phrase, but never
    # shorter than two terms.
    min_sub_phrase_len = max(min_len - 1, 2)
    # Now we can run...
    try:
        main(phrasecount_dir, out_path, max_doctype_occurrences)
    except Exception as ex:
        printer.write_line_break(break_char=' ')
        print('Encountered Error: %s' % type(ex).__name__)
        print(' Message: %s' % str(ex))
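
# Example invocation (paths are illustrative):
#   python PhraseCountLoader.py -i ./phrasecounts -o ./output/phrases.txt -m 5 -d 1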