# We do all our imports at the top of our program.
import argparse
import os
import json
import sys
import time

import ConsoleUtils

# Give the program a name.
program_name = 'PhraseCountLoader'
# Describe what the program does briefly.
program_description = 'Loads and converts ".phrasecount" files for use with WalkerIndexer.'

# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
printer = ConsoleUtils.SLPrinter(program_name)
sys.stdout = printer

build_date = '2017/11/20'
program_version = '1.0.2'
prog_auth = 'Chris Diesch '

# Program Constants
# A phrase line is expected to look like '<phrase_start><phrase> occurs <count> times.';
# '.' delimiters inside phrases are rewritten to '_'.
phrase_start = ' '
phrase_split = ' occurs '
count_end = ' times.'
old_delimiter = '.'
new_delimiter = '_'

# Default values
default_max_phrase_doctypes = 1
default_min_phrase_len = 5
min_sub_phrase_len = 2


def get_phrase_length(phrase):
    # A phrase of N terms contains N - 1 delimiters.
    return phrase.count(new_delimiter) + 1


def make_filter(min_length):
    def long_enough(phrase):
        return get_phrase_length(phrase) >= min_length
    return long_enough


def load_phrases(phrase_path_root):
    phrase_count_by_doctype = {}
    master_set = set()
    sub_phrase_set = set()
    files = os.listdir(phrase_path_root)
    total_files = len(files)
    file_num = 0
    start_time = time.time()
    print('Analyzing %d files' % total_files)
    for pc_file in files:
        file_num += 1
        # print('%03d/%d: Running on file: %s' % (file_num, total_files, pc_file))
        if not pc_file.endswith('.phrasecount'):
            continue
        file_path = os.path.join(phrase_path_root, pc_file)
        split_idx = pc_file.rfind('.')
        doctype_name = pc_file[:split_idx]
        doctype_set, counts, sub_phrase_set = load_phrasecount_file(file_path, sub_phrase_set)
        # Save it to the master set (sets guarantee uniqueness), dropping anything
        # that is a sub-phrase of a longer phrase seen so far.
        master_set = master_set.union(doctype_set) - sub_phrase_set
        phrase_count_by_doctype[doctype_name] = counts
    run_time = time.time() - start_time
    rate = len(files) / run_time
    print('Analyzed %d files in %.4f s (%.1f files/s)' % (len(files), run_time, rate))
    return phrase_count_by_doctype, master_set


def load_phrasecount_file(file_path, sub_phrases):
    result = set()
    phrase_counts = {}
    with open(file_path) as pc_reader:
        lines = pc_reader.readlines()
    for line in lines:
        if line.startswith(phrase_start) and '=' not in line:
            # Normalize delimiters, then strip the leading marker and trailing ' times.'.
            line = line.replace(old_delimiter, new_delimiter).replace('\n', '')[len(phrase_start):-len(count_end)]
            line_data = line.split(phrase_split)
            phrase = line_data[0]
            count = int(line_data[1])
            phrase_terms = phrase.split('_')
            phrase_len = len(phrase_terms)
            num_different_terms = len(set(phrase_terms))
            # Skip phrases longer than two terms that are built from at most two distinct terms.
            if phrase_len > 2 >= num_different_terms:
                print('Not including phrase for having too few different terms: %s' % phrase)
                continue
            result.add(phrase)
            phrase_counts[phrase] = count
            # Add the new sub phrases
            tmp_sub_phrases = make_sub_phrases(phrase)
            sub_phrases = sub_phrases.union(tmp_sub_phrases)
    return result, phrase_counts, sub_phrases


# TODO: Add filtering to the middle (currently only removes from front or back, not both).
def make_sub_phrases(phrase):
    result = set()
    remove_back_phrase = phrase
    remove_front_phrase = phrase
    # It's not a phrase if it is less than 2 tokens long...
    long_enough = make_filter(min_sub_phrase_len)
    while long_enough(remove_front_phrase) and long_enough(remove_back_phrase):
        # Drop the last term to form a prefix sub-phrase.
        split_idx = remove_back_phrase.rfind(new_delimiter)
        remove_back_phrase = remove_back_phrase[:split_idx]
        result.add(remove_back_phrase)
        # Drop the first term to form a suffix sub-phrase.
        split_idx = remove_front_phrase.find(new_delimiter)
        remove_front_phrase = remove_front_phrase[split_idx + 1:]
        result.add(remove_front_phrase)
    return result
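# A quick illustration of what make_sub_phrases() yields (the input is
# hypothetical, and it assumes min_sub_phrase_len has been set to 3 rather
# than its module-level default of 2):
#
#   make_sub_phrases('a_b_c_d')
#   # -> {'a_b_c', 'b_c_d', 'a_b', 'c_d'}
#
# Each pass peels one term off the back (a prefix) and one off the front
# (a suffix), stopping once either side gets shorter than min_sub_phrase_len.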
def filter_phrases_by_occurrences(phrase_data, phrase_set, max_occurrences):
    result_set = set()
    doc_counts = phrase_data['numDoctypes']
    for phrase in phrase_set:
        # Keep only phrases that occur in at most max_occurrences doctypes.
        if doc_counts[phrase] <= max_occurrences:
            result_set.add(phrase)
    return result_set


def save_phrases_file(phrases, out_file_path):
    print('Saving file %s' % out_file_path)
    with open(out_file_path, 'w+') as writer:
        for phrase in phrases:
            writer.write('%s\n' % phrase)


def save_phrase_metadata(phrase_metadata, phrases, save_loc):
    print('Saving metadata file to %s' % save_loc)
    out_data = {}
    counts = phrase_metadata['counts']
    doctypes = phrase_metadata['doctypes']
    doctype_counts = phrase_metadata['numDoctypes']
    for phrase in phrases:
        out_data[phrase] = {'numOccurrences': counts[phrase],
                            'numDoctypes': doctype_counts[phrase],
                            'doctypes': doctypes[phrase]}
    with open(save_loc, 'w+') as writer:
        json.dump(out_data, writer, indent=3)


def get_phrase_counts_over_doctypes(counts_by_doctype, master_set):
    total_counts = {}
    num_doctypes = {}
    phrase_doctypes = {}
    cur_phrase_num = 1
    total_phrases = len(master_set)
    print('Analyzing %d phrases...' % total_phrases)
    for phrase in master_set:
        total_counts[phrase] = 0
        num_doctypes[phrase] = 0
        doctypes = []
        for doctype in counts_by_doctype:
            doc_counts = counts_by_doctype[doctype]
            if phrase in doc_counts:
                total_counts[phrase] += doc_counts[phrase]
                num_doctypes[phrase] += 1
                doctypes.append(doctype)
        phrase_doctypes[phrase] = doctypes
        cur_phrase_num += 1
    return {'counts': total_counts, 'numDoctypes': num_doctypes, 'doctypes': phrase_doctypes}


def show_args(phrases_root, out_file, min_phrase_len):
    print('Loading ".phrasecount" files from: %s' % phrases_root)
    print('Saving results to: %s' % out_file)
    print('Minimum Phrase Length: %d' % min_phrase_len)


def check_args(phrases_folder, output_file, min_phrase_len):
    fatal_errors = False
    if os.path.exists(output_file):
        print('Warning: The file at %s will be overwritten.' % output_file)
    if min_phrase_len < 2:
        print('Warning: Minimum Phrase Length must be >= 2. (%d)' % min_phrase_len)
        print('    OK: Using default value of %d' % default_min_phrase_len)
        # Actually fall back to the default, as the message promises.
        min_phrase_len = default_min_phrase_len
    if not os.path.exists(phrases_folder):
        print('Error: The folder at %s does not exist.' % phrases_folder)
        fatal_errors = True
    if fatal_errors:
        print('Encountered Fatal Error, exiting...')
        parser.print_help()
        exit(-1)
    return min_phrase_len
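# For reference, each entry that save_phrase_metadata() writes to the JSON
# metadata file has this shape (the values below are made up):
#
#   "some_phrase": {
#      "numOccurrences": 12,
#      "numDoctypes": 1,
#      "doctypes": ["report"]
#   }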
# This is the main function of the program.
def main(phrases_root, out_file, max_doctypes):
    folder, file = os.path.split(out_file)
    metadata_out = os.path.join(folder, 'phrase-metadata.json')
    doctype_phrase_counts, all_phrases = load_phrases(phrases_root)
    num_phrases = len(all_phrases)
    print('Loaded %d phrases from %s' % (num_phrases, phrases_root))
    data = get_phrase_counts_over_doctypes(doctype_phrase_counts, all_phrases)
    filter_start = time.time()
    filtered_phrases = filter_phrases_by_occurrences(data, all_phrases, max_doctypes)
    filter_run = time.time() - filter_start
    num_filtered = len(filtered_phrases)
    try:
        rate = num_phrases / filter_run
    except ZeroDivisionError:
        rate = 0
    print('Analyzed %d phrases in %.4f s (%.1f phrase/s)' % (num_phrases, filter_run, rate))
    print('    %d/%d phrases occur in %d doctypes or fewer' % (num_filtered, num_phrases, max_doctypes))
    # print('Filtered %d phrases to %d phrases which occur in %d doctypes or fewer.' %
    #       (num_phrases, num_filtered, max_doctypes))
    save_phrases_file(filtered_phrases, out_file)
    save_phrase_metadata(data, filtered_phrases, metadata_out)


# This is where we call the main method from.
if __name__ == '__main__':
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, prog_auth))
    # Set up arguments here.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-i', '--in_dir', required=True,
                               help='The path to the folder containing the ".phrasecount" files.')
    required_args.add_argument('-o', '--dest_file', required=True,
                               help='The path to put the output of this tool.')
    optional_args.add_argument('-m', '--min_length', required=False, type=int, default=default_min_phrase_len,
                               help='The minimum number of terms considered a phrase.')
    optional_args.add_argument('-d', '--doctype_count', required=False, type=int,
                               default=default_max_phrase_doctypes,
                               help='The maximum number of doctypes a phrase is allowed to occur in.')
    optional_args.add_argument('-h', '--help', action="help", help='Prints the help message.')
    # Get the arguments. We don't need to check for required arguments, since the ArgumentParser class does that.
    args = parser.parse_args()
    phrasecount_dir = args.in_dir
    out_path = args.dest_file
    min_len = args.min_length
    max_doctype_occurrences = args.doctype_count
    # Display and validate the arguments; check_args falls back to the default
    # minimum length if the given one is too small.
    show_args(phrasecount_dir, out_path, min_len)
    min_len = check_args(phrasecount_dir, out_path, min_len)
    # A sub-phrase may be one term shorter than the minimum phrase length.
    min_sub_phrase_len = min_len - 1
    # Now we can run...
    try:
        main(phrasecount_dir, out_path, max_doctype_occurrences)
    except Exception as ex:
        printer.write_line_break(break_char=' ')
        print('Encountered Error: %s' % type(ex).__name__)
        print('    Message: %s' % str(ex))
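# Example invocation (the script filename and paths here are hypothetical):
#
#   python PhraseCountLoader.py -i ./phrasecounts -o ./out/phrases.txt -d 1
#
# This reads every "*.phrasecount" file under ./phrasecounts, keeps the phrases
# that occur in at most 1 doctype, and writes phrases.txt plus a
# phrase-metadata.json file next to it.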