# We do all our imports at the top of our program.
import argparse
import os
import json
import sys
import time

import ConsoleUtils


# Give the program a name.
program_name = 'PhraseCountLoader'

# Describe what the program does briefly.
program_description = 'Loads and converts ".phrasecount" files for use with WalkerIndexer.'

# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)

# Route all print() output through the ConsoleUtils printer.
printer = ConsoleUtils.SLPrinter(program_name)
sys.stdout = printer

build_date = '2017/11/20'
program_version = '1.0.2'
prog_auth = 'Chris Diesch <cdiesch@sequencelogic.net>'

# Program Constants
phrase_start = ' '
phrase_split = ' occurs '
count_end = ' times.'
old_delimiter = '.'
new_delimiter = '_'

# Default values
default_max_phrase_doctypes = 1
default_min_phrase_len = 5
min_sub_phrase_len = 2

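# Note (assumed from the parsing in load_phrasecount_file below, not from a format
# specification): an input line is expected to look roughly like
#     "<phrase_start>machine.learning.model occurs 12 times."
# which becomes the phrase key "machine_learning_model" with a count of 12.
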
def get_phrase_length(phrase):
    # A phrase's length is its number of terms, i.e. the delimiter count plus one.
    return phrase.count(new_delimiter) + 1


def make_filter(min_length):
    # Build a predicate that accepts phrases with at least min_length terms.
    def long_enough(phrase):
        return get_phrase_length(phrase) >= min_length
    return long_enough


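# Illustrative use (hypothetical values): make_filter(3) returns a predicate for which
# 'alpha_beta_gamma' (three terms) passes and 'alpha_beta' (two terms) does not.
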
def load_phrases(phrase_path_root):
    # Load every ".phrasecount" file under phrase_path_root, keyed by doctype name.
    phrase_count_by_doctype = {}
    master_set = set()
    sub_phrase_set = set()
    files = os.listdir(phrase_path_root)

    total_files = len(files)
    file_num = 0

    start_time = time.time()
    print('Analyzing %d files' % total_files)
    for pc_file in files:
        file_num += 1
        # print('%03d/%d: Running on file: %s' % (file_num, total_files, pc_file))
        if not pc_file.endswith('.phrasecount'):
            continue

        file_path = os.path.join(phrase_path_root, pc_file)
        split_idx = pc_file.rfind('.')
        doctype_name = pc_file[:split_idx]

        doctype_set, counts, sub_phrase_set = load_phrasecount_file(file_path, sub_phrase_set)
        # Save it to the master set (sets guarantee uniqueness)
        master_set = master_set.union(doctype_set) - sub_phrase_set

        phrase_count_by_doctype[doctype_name] = counts

    run_time = time.time() - start_time
    rate = len(files) / run_time
    print('Analyzed %d files in %.4f s (%.1f files/s)' % (len(files), run_time, rate))
    return phrase_count_by_doctype, master_set


def load_phrasecount_file(file_path, sub_phrases):
    # Parse a single ".phrasecount" file, returning the set of phrases it contains,
    # their counts, and the updated set of sub-phrases to exclude later.
    result = set()
    phrase_counts = {}
    with open(file_path) as pc_reader:
        lines = pc_reader.readlines()

    for line in lines:
        if line.startswith(phrase_start) and '=' not in line:
            line = line.replace(old_delimiter, new_delimiter).replace('\n', '')[len(phrase_start):-len(count_end)]
            line_data = line.split(phrase_split)

            phrase = line_data[0]
            count = int(line_data[1])

            phrase_terms = phrase.split('_')

            phrase_len = len(phrase_terms)
            num_different_terms = len(set(phrase_terms))

            if phrase_len > 2 >= num_different_terms:
                print('Not including phrase for having too few different terms: %s' % phrase)
                continue

            result.add(phrase)
            phrase_counts[phrase] = count
            # Add the new sub phrases
            tmp_sub_phrases = make_sub_phrases(phrase)
            sub_phrases = sub_phrases.union(tmp_sub_phrases)

    return result, phrase_counts, sub_phrases


# TODO: Add filtering to the middle (currently only removes from front or back, not both).
def make_sub_phrases(phrase):
    # Generate the shorter leading and trailing sub-phrases of a phrase so they can
    # be excluded from the master set.
    result = set()

    remove_back_phrase = phrase
    remove_front_phrase = phrase

    # It's not a phrase if it is less than 2 tokens long...
    long_enough = make_filter(min_sub_phrase_len)

    while long_enough(remove_front_phrase) and long_enough(remove_back_phrase):
        # Drop the last term to get the next back-trimmed sub-phrase.
        split_idx = remove_back_phrase.rfind(new_delimiter)
        remove_back_phrase = remove_back_phrase[:split_idx]
        result.add(remove_back_phrase)

        # Drop the first term to get the next front-trimmed sub-phrase.
        split_idx = remove_front_phrase.find(new_delimiter)
        remove_front_phrase = remove_front_phrase[split_idx + 1:]
        result.add(remove_front_phrase)

    return result


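# Illustrative sketch (assuming min_sub_phrase_len of 2, the module default):
# make_sub_phrases is intended to yield the shorter leading and trailing cuts of a
# phrase, e.g. for 'a_b_c_d' sub-phrases such as 'a_b_c', 'a_b', 'b_c_d' and 'c_d'.
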
def filter_phrases_by_occurrences(phrase_data, phrase_set, max_occurrences):
    # Keep only the phrases that appear in at most max_occurrences doctypes.
    result_set = set()

    doc_counts = phrase_data['numDoctypes']

    for phrase in phrase_set:
        if doc_counts[phrase] <= max_occurrences:
            result_set.add(phrase)

    return result_set


def save_phrases_file(phrases, out_file_path):
    # Write the filtered phrases to the output file, one phrase per line.
    print('Saving file %s' % out_file_path)

    with open(out_file_path, 'w+') as writer:
        for phrase in phrases:
            writer.write('%s\n' % phrase)


def save_phrase_metadata(phrase_metadata, phrases, save_loc):
    # Write a JSON file with each phrase's total count, doctype count, and doctype list.
    print('Saving metadata file to %s' % save_loc)

    out_data = {}
    counts = phrase_metadata['counts']
    doctypes = phrase_metadata['doctypes']
    doctype_counts = phrase_metadata['numDoctypes']

    for phrase in phrases:
        out_data[phrase] = {'numOccurrences': counts[phrase],
                            'numDoctypes': doctype_counts[phrase],
                            'doctypes': doctypes[phrase]}

    with open(save_loc, 'w+') as writer:
        json.dump(out_data, writer, indent=3)


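# Illustrative metadata entry written by save_phrase_metadata (hypothetical phrase,
# counts, and doctype name):
#     "machine_learning_model": {"numOccurrences": 12, "numDoctypes": 1, "doctypes": ["contracts"]}
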
def get_phrase_counts_over_doctypes(counts_by_doctype, master_set):
    # For each phrase, total its occurrences across doctypes and record which
    # doctypes it appears in.
    total_counts = {}
    num_doctypes = {}
    phrase_doctypes = {}
    cur_phrase_num = 1
    total_phrases = len(master_set)
    print('Analyzing %d phrases...' % total_phrases)
    for phrase in master_set:
        total_counts[phrase] = 0
        num_doctypes[phrase] = 0
        doctypes = []

        for doctype in counts_by_doctype:
            doc_counts = counts_by_doctype[doctype]

            if phrase in doc_counts:
                total_counts[phrase] += doc_counts[phrase]
                num_doctypes[phrase] += 1
                doctypes.append(doctype)

        phrase_doctypes[phrase] = doctypes
        cur_phrase_num += 1

    return {'counts': total_counts, 'numDoctypes': num_doctypes, 'doctypes': phrase_doctypes}


def show_args(phrases_root, out_file, min_phrase_len):
    # Display the arguments the program is running with.
    print('Loading ".phrasecount" files from: %s' % phrases_root)
    print('Saving results to: %s' % out_file)
    print('Minimum Phrase Length: %d' % min_phrase_len)


def check_args(phrases_folder, output_file, min_phrase_len):
    # Validate the arguments: warn on recoverable problems, exit on fatal ones.
    fatal_errors = False

    if os.path.exists(output_file):
        print('Warning: The file at %s will be overwritten.' % output_file)

    if min_phrase_len < 2:
        print('Warning: Minimum Phrase Length must be >= 2. (%d)' % min_phrase_len)
        print(' OK: Using default value of %d' % default_min_phrase_len)

    if not os.path.exists(phrases_folder):
        print('Error: The folder at %s does not exist.' % phrases_folder)
        fatal_errors = True

    if fatal_errors:
        print('Encountered Fatal Error, exiting...')
        parser.print_help()
        exit(-1)


# This is the main function of the program.
def main(phrases_root, out_file, max_doctypes):
    folder, file = os.path.split(out_file)
    metadata_out = os.path.join(folder, 'phrase-metadata.json')

    doctype_phrase_counts, all_phrases = load_phrases(phrases_root)

    num_phrases = len(all_phrases)
    print('Loaded %d phrases from %s' % (num_phrases, phrases_root))

    data = get_phrase_counts_over_doctypes(doctype_phrase_counts, all_phrases)

    filter_start = time.time()
    filtered_phrases = filter_phrases_by_occurrences(data, all_phrases, max_doctypes)
    filter_run = time.time() - filter_start

    num_filtered = len(filtered_phrases)
    try:
        rate = num_phrases / filter_run
    except ZeroDivisionError:
        rate = 0
    print('Analyzed %d phrases in %.4f s (%.1f phrase/s)' % (num_phrases, filter_run, rate))
    print(' %d/%d phrases occur in %d doctypes or fewer' % (num_filtered, num_phrases, max_doctypes))
    # print('Filtered %d phrases to %d phrases which occur in %d doctypes or fewer.' %
    #       (num_phrases, num_filtered, max_doctypes))

    save_phrases_file(filtered_phrases, out_file)

    save_phrase_metadata(data, filtered_phrases, metadata_out)


# This is where we call the main method from.
if __name__ == '__main__':
    printer.write_no_prefix(ConsoleUtils.get_header(program_name, program_version, build_date, prog_auth))
    # Set up arguments here.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')
    required_args.add_argument('-i', '--in_dir', required=True,
                               help='The path to the folder containing the ".phrasecount" files.')
    required_args.add_argument('-o', '--dest_file', required=True, help='The path to put the output of this tool.')

    optional_args.add_argument('-m', '--min_length', required=False, type=int, default=default_min_phrase_len,
                               help='The minimum number of terms considered a phrase.')
    optional_args.add_argument('-d', '--doctype_count', required=False, type=int, default=default_max_phrase_doctypes,
                               help='The maximum number of doctypes a phrase is allowed to occur in.')
    optional_args.add_argument('-h', '--help', action="help", help='Prints the help message.')

    # Parse the arguments; we don't need to check for required arguments, since the ArgumentParser class does that.
    args = parser.parse_args()

    # Get the arguments.
    phrasecount_dir = args.in_dir
    out_path = args.dest_file
    min_len = args.min_length
    min_sub_phrase_len = min_len - 1
    max_doctype_occurrences = args.doctype_count
    # Display the arguments
    show_args(phrasecount_dir, out_path, min_len)
    check_args(phrasecount_dir, out_path, min_len)

    # Now we can run...
    try:
        main(phrasecount_dir, out_path, max_doctype_occurrences)
    except Exception as ex:
        printer.write_line_break(break_char=' ')
        print('Encountered Error: %s' % type(ex).__name__)
        print(' Message: %s' % str(ex))
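
# Example invocation (hypothetical paths; the script file name is assumed):
#   python PhraseCountLoader.py -i ./phrasecounts -o ./output/phrases.txt -m 5 -d 1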