# Sleds/scorewalker-utils/PhraseCountLoader/PhraseCountLoader.py
# We do all our imports at the top of our program.
import argparse
import os
import json
import sys
import time
import ConsoleUtils
# Give the program a name.
program_name = 'PhraseCountLoader'
# Describe what the program does briefly.
program_description = 'Loads and converts ".phrasecount" files for use with WalkerIndexer.'
# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
printer = ConsoleUtils.SLPrinter(program_name)
sys.stdout = printer
build_date = '2017/11/20'
program_version = '1.0.2'
prog_auth = 'Chris Diesch <cdiesch@sequencelogic.net>'
# Program Constants
phrase_start = ' '
phrase_split = ' occurs '
count_end = ' times.'
old_delimiter = '.'
new_delimiter = '_'
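# Based on the parsing constants above, input lines presumably look like
# "<phrase_start><phrase> occurs <count> times.", with '.' separating the
# terms of a phrase; the loader rewrites that delimiter to '_'.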
# Default values
default_max_phrase_doctypes = 1
default_min_phrase_len = 5
min_sub_phrase_len = 2


def get_phrase_length(phrase):
    return phrase.count(new_delimiter) + 1


def make_filter(min_length):
    def long_enough(phrase):
        return get_phrase_length(phrase) >= min_length
    return long_enough
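
# For example, make_filter(3) returns a predicate that accepts 'a_b_c' (three
# terms) but rejects 'a_b' (two terms).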


def load_phrases(phrase_path_root):
    phrase_count_by_doctype = {}
    master_set = set()
    sub_phrase_set = set()
    files = os.listdir(phrase_path_root)
    total_files = len(files)
    file_num = 0
    start_time = time.time()
    print('Analyzing %d files' % total_files)
    for pc_file in files:
        file_num += 1
        # print('%03d/%d: Running on file: %s' % (file_num, total_files, pc_file))
        if not pc_file.endswith('.phrasecount'):
            continue
        file_path = os.path.join(phrase_path_root, pc_file)
        # The doctype name is the file name minus its ".phrasecount" extension.
        split_idx = pc_file.rfind('.')
        doctype_name = pc_file[:split_idx]
        doctype_set, counts, sub_phrase_set = load_phrasecount_file(file_path, sub_phrase_set)
        # Save it to the master set (sets guarantee uniqueness), dropping phrases
        # that are merely sub-phrases of longer ones.
        master_set = master_set.union(doctype_set) - sub_phrase_set
        phrase_count_by_doctype[doctype_name] = counts
    run_time = time.time() - start_time
    # Guard against a zero run time (e.g. an empty folder).
    rate = total_files / run_time if run_time > 0 else 0
    print('Analyzed %d files in %.4f s (%.1f files/s)' % (total_files, run_time, rate))
    return phrase_count_by_doctype, master_set


def load_phrasecount_file(file_path, sub_phrases):
    result = set()
    phrase_counts = {}
    with open(file_path) as pc_reader:
        lines = pc_reader.readlines()
        for line in lines:
            if line.startswith(phrase_start) and '=' not in line:
                line = line.replace(old_delimiter, new_delimiter).replace('\n', '')[len(phrase_start):-len(count_end)]
                line_data = line.split(phrase_split)
                phrase = line_data[0]
                count = int(line_data[1])
                phrase_terms = phrase.split('_')
                phrase_len = len(phrase_terms)
                num_different_terms = len(set(phrase_terms))
                # Skip phrases longer than two terms that are built from at most
                # two distinct terms (e.g. 'a_b_a_b').
                if phrase_len > 2 >= num_different_terms:
                    print('Not including phrase for having too few different terms: %s' % phrase)
                    continue
                result.add(phrase)
                phrase_counts[phrase] = count
                # Add the new sub phrases
                tmp_sub_phrases = make_sub_phrases(phrase)
                sub_phrases = sub_phrases.union(tmp_sub_phrases)
    return result, phrase_counts, sub_phrases
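
# For example, assuming phrase_start is the single space defined above, the line
# " foo.bar occurs 12 times." yields the phrase 'foo_bar' with a count of 12.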


# TODO: Add filtering to the middle (currently only removes from front or back, not both).
def make_sub_phrases(phrase):
    result = set()
    remove_back_phrase = phrase
    remove_front_phrase = phrase
    # It's not a phrase if it is less than 2 tokens long...
    long_enough = make_filter(min_sub_phrase_len)
    while True:
        # Drop the last term from one copy and the first term from the other.
        remove_back_phrase = remove_back_phrase[:remove_back_phrase.rfind(new_delimiter)]
        remove_front_phrase = remove_front_phrase[remove_front_phrase.find(new_delimiter) + 1:]
        if not (long_enough(remove_back_phrase) and long_enough(remove_front_phrase)):
            break
        result.add(remove_back_phrase)
        result.add(remove_front_phrase)
    return result
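
# For example, with min_sub_phrase_len = 2, make_sub_phrases('a_b_c_d') yields
# {'a_b_c', 'b_c_d', 'a_b', 'c_d'}: each pass trims one term off the back of
# one copy and one term off the front of the other.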


def filter_phrases_by_occurrences(phrase_data, phrase_set, max_occurrences):
    result_set = set()
    doc_counts = phrase_data['numDoctypes']
    for phrase in phrase_set:
        # Keep a phrase only if it occurs in at most max_occurrences doctypes.
        if doc_counts[phrase] <= max_occurrences:
            result_set.add(phrase)
    return result_set


def save_phrases_file(phrases, out_file_path):
    print('Saving file %s' % out_file_path)
    with open(out_file_path, 'w+') as writer:
        for phrase in phrases:
            writer.write('%s\n' % phrase)


def save_phrase_metadata(phrase_metadata, phrases, save_loc):
    print('Saving metadata file to %s' % save_loc)
    out_data = {}
    counts = phrase_metadata['counts']
    doctypes = phrase_metadata['doctypes']
    doctype_counts = phrase_metadata['numDoctypes']
    for phrase in phrases:
        out_data[phrase] = {'numOccurrences': counts[phrase],
                            'numDoctypes': doctype_counts[phrase],
                            'doctypes': doctypes[phrase]}
    with open(save_loc, 'w+') as writer:
        json.dump(out_data, writer, indent=3)
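
# The metadata file maps each saved phrase to its statistics; with illustrative
# values it looks like:
#   {"foo_bar": {"numOccurrences": 12, "numDoctypes": 1, "doctypes": ["invoice"]}}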


def get_phrase_counts_over_doctypes(counts_by_doctype, master_set):
    total_counts = {}
    num_doctypes = {}
    phrase_doctypes = {}
    total_phrases = len(master_set)
    print('Analyzing %d phrases...' % total_phrases)
    for phrase in master_set:
        total_counts[phrase] = 0
        num_doctypes[phrase] = 0
        doctypes = []
        for doctype in counts_by_doctype:
            doc_counts = counts_by_doctype[doctype]
            if phrase in doc_counts:
                total_counts[phrase] += doc_counts[phrase]
                num_doctypes[phrase] += 1
                doctypes.append(doctype)
        phrase_doctypes[phrase] = doctypes
    return {'counts': total_counts, 'numDoctypes': num_doctypes, 'doctypes': phrase_doctypes}
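
# Sketch of the returned structure for a hypothetical two-doctype input:
#   counts_by_doctype = {'invoice': {'foo_bar': 3}, 'receipt': {'foo_bar': 2}}
#   get_phrase_counts_over_doctypes(counts_by_doctype, {'foo_bar'})
#   -> {'counts': {'foo_bar': 5}, 'numDoctypes': {'foo_bar': 2},
#       'doctypes': {'foo_bar': ['invoice', 'receipt']}}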


def show_args(phrases_root, out_file, min_phrase_len):
    print('Loading ".phrasecount" files from: %s' % phrases_root)
    print('Saving results to: %s' % out_file)
    print('Minimum Phrase Length: %d' % min_phrase_len)


def check_args(phrases_folder, output_file, min_phrase_len):
    fatal_errors = False
    if os.path.exists(output_file):
        print('Warning: The file at %s will be overwritten.' % output_file)
    if min_phrase_len < 2:
        print('Warning: Minimum Phrase Length must be >= 2. (%d)' % min_phrase_len)
        print(' OK: Using default value of %d' % default_min_phrase_len)
        # Actually apply the fallback announced above.
        min_phrase_len = default_min_phrase_len
    if not os.path.exists(phrases_folder):
        print('Error: The folder at %s does not exist.' % phrases_folder)
        fatal_errors = True
    if fatal_errors:
        print('Encountered Fatal Error, exiting...')
        parser.print_help()
        sys.exit(-1)
    return min_phrase_len


# This is the main function of the program.
def main(phrases_root, out_file, max_doctypes):
    folder, _ = os.path.split(out_file)
    metadata_out = os.path.join(folder, 'phrase-metadata.json')
    doctype_phrase_counts, all_phrases = load_phrases(phrases_root)
    num_phrases = len(all_phrases)
    print('Loaded %d phrases from %s' % (num_phrases, phrases_root))
    data = get_phrase_counts_over_doctypes(doctype_phrase_counts, all_phrases)
    filter_start = time.time()
    filtered_phrases = filter_phrases_by_occurrences(data, all_phrases, max_doctypes)
    filter_run = time.time() - filter_start
    num_filtered = len(filtered_phrases)
    try:
        rate = num_phrases / filter_run
    except ZeroDivisionError:
        rate = 0
    print('Analyzed %d phrases in %.4f s (%.1f phrase/s)' % (num_phrases, filter_run, rate))
    print(' %d/%d phrases occur in %d doctypes or fewer' % (num_filtered, num_phrases, max_doctypes))
    # print('Filtered %d phrases to %d phrases which occur in %d doctypes or fewer.' %
    #       (num_phrases, num_filtered, max_doctypes))
    save_phrases_file(filtered_phrases, out_file)
    save_phrase_metadata(data, filtered_phrases, metadata_out)


# This is where we call the main method from.
if __name__ == '__main__':
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, prog_auth))
    # Set up arguments here.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-i', '--in_dir', required=True,
                               help='The path to the folder containing the ".phrasecount" files.')
    required_args.add_argument('-o', '--dest_file', required=True, help='The path to put the output of this tool.')
    optional_args.add_argument('-m', '--min_length', required=False, type=int, default=default_min_phrase_len,
                               help='The minimum number of terms considered a phrase.')
    optional_args.add_argument('-d', '--doctype_count', required=False, type=int, default=default_max_phrase_doctypes,
                               help='The maximum number of doctypes a phrase is allowed to occur in.')
    optional_args.add_argument('-h', '--help', action='help', help='Prints the help message.')
    # Get the arguments; ArgumentParser already checks that the required ones are present.
    args = parser.parse_args()
    phrasecount_dir = args.in_dir
    out_path = args.dest_file
    min_len = args.min_length
    max_doctype_occurrences = args.doctype_count
    # Display and validate the arguments. check_args may fall back to the
    # default minimum phrase length, so capture its return value.
    show_args(phrasecount_dir, out_path, min_len)
    min_len = check_args(phrasecount_dir, out_path, min_len)
    # A sub-phrase may be one term shorter than a full phrase, but never
    # shorter than two terms.
    min_sub_phrase_len = max(min_len - 1, 2)
    # Now we can run...
    try:
        main(phrasecount_dir, out_path, max_doctype_occurrences)
    except Exception as ex:
        printer.write_line_break(break_char=' ')
        print('Encountered Error: %s' % type(ex).__name__)
        print(' Message: %s' % str(ex))
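
# Example invocation (paths are illustrative):
#   python PhraseCountLoader.py -i ./phrasecounts -o ./output/phrases.txt -m 5 -d 1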