"""
.. role:: py(code)
    :language: python

Information
-----------
This is the script responsible for running WalkerIndexer, the indexing tool used for the ScoreWalker classification
engine. This script handles performing all the necessary setup and runs any "pre-index" tools before running
WalkerIndexer.

This tool pipes the console logs from the scripts/programs it calls to the current standard output. Logging of this
information is handled by the Java code calling this script.

Currently this program has 5 steps to its execution:

    #) Copying the data files ``"MaleNames.txt", "FemaleNames.txt" and "Surnames.txt"`` to the destination index
       directory.
    #) Running ``"Tokenizer"`` on any new library elements.
    #) Running ``"phrases"`` on the ``tkn`` files generated by ``"Tokenizer"``.
    #) Running ``"PhraseCountLoader.py"`` on the ``phrasecount`` files generated by ``"phrases"``.
    #) Running WalkerIndexer.

This tool is only intended to be run from the commandline in production, however, the methods have been documented to
allow for an easier understanding of how this tool works.

Commandline Usage
------------------
Usage ``sequencelogic-run-walker-indexer.py [-h, --help] [-c, --config] {CFG_FILE} [-i, --indexRoot] {IDX_ROOT}
[-l, --libRoot] {LIB_ROOT}``

Required Arguments:
    ``-c CFG_FILE, --config CFG_FILE``
        Where ``CFG_FILE`` is the path to the config file being used for WalkerIndexer.

    ``-i IDX_ROOT, --indexRoot IDX_ROOT``
        Where ``IDX_ROOT`` is the path to save the index generated by WalkerIndexer to.

    ``-l LIB_ROOT, --libRoot LIB_ROOT``
        Where ``LIB_ROOT`` is the path to the library for WalkerIndexer to index.

Optional Arguments:
    ``-h, --help``
        Prints the help message

    ``-v, --version``
        Prints the version of WalkerIndexer being used as well as a description of what the version does and requires.

Python Module Usage
-------------------
"""
# TODO: Add a link to the referenced documentation

# We do all our imports at the top of our program.
import argparse
import datetime
import time
import os
import subprocess
import shutil
import sys
import json

import ConsoleUtils

# Give the program a name.
program_name = 'Run WalkerIndexer'
# Describe what the program does briefly.
program_description = 'Does the necessary steps to run WalkerIndexer.'
# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
# Error and Warning console values:
usage = 'sequencelogic-run-walker-indexer.py ' \
        '-i,--indexRoot {IDX_PATH} ' \
        '-c,--config {CFG_FILE} ' \
        '-l,--libRoot {LIB_PATH} ' \
        '[options]'
build_date = '2017/11/21'
program_version = '2.2.3'
author = 'Chris Diesch '

# Paths resolved by env_is_good(); they stay empty until that function has run.
sl_home = ''
male_name_file = ''
female_name_file = ''
last_name_file = ''
phrase_exe = ''
tokenizer_jar = ''
term_diff_jar = ''
indexer_jar = ''
phrase_loader = ''

# Command line flags and values handed to the "phrases" executable.
doctype_coverage_arg = '-c'
min_phrase_len_arg = '-p'
max_phrase_len_arg = '-P'
doctype_coverage = str(70)
min_phrase_len = str(3)
max_phrase_len = str(8)
# Doctypes with fewer ".tkn" examples than this are skipped by run_phrase().
min_num_examples = 2

# Command line flags handed to the jars / PhraseCountLoader.py.
idx_arg = '--indexDir'
lib_arg = '--dataDir'
cfg_arg = '--configFile'
phrase_root_arg = '--in_dir'
phrase_file_arg = '--dest_file'

# Default written to the "LOGALL" environmental variable when it is not already set.
log_all = False
printer = ConsoleUtils.SLPrinter(program_name)
walker_config_data = {}


def _print_version():
    """
    Prints the version information for this tool to the original standard output.

    Returns:
        ``None``
    """
    sys.stdout = printer.old_stdout
    print('')
    print(program_name)
    print('Version:')
    print(' Version: %s' % program_version)
    print(' Date: %s' % build_date)
    print(' Author: %s' % author)


def _print_help():
    """
    Prints the full help message (usage, arguments, requirements and notes) to the original standard output.

    Returns:
        ``None``
    """
    sys.stdout = printer.old_stdout
    print('')
    print(program_name)
    print('Usage: %s' % usage)
    print('')
    # What is special about this version?
    print('Description:')
    print(' Creates and moves the necessary metadata to run WalkerIndexer v2.2.1 with Phrasing, Name Redaction,\n'
          ' Data Redaction, Container/Contained Doctypes, Similar Doctype Differencing, and Duplicate Page\n'
          ' Recognition enabled.\n'
          ' This process requires certain programs, files and environmental variables to exist/be set, since \n'
          ' WalkerIndexer only handles generating the Lucene index. These requirements are listed below.')
    print('')
    print('Arguments:')
    print(' -i,--indexRoot {IDX_PATH} The path to the save the generated index to.')
    print(' -c,--config {CFG_FILE} The path to the configuration file to use while indexing.')
    print(' -l,--libRoot {LIB_PATH} The path to the library to index.')
    print('')
    print('Miscellaneous:')
    print(' -h,--help Prints the help message.')
    print(' -v,--version Prints the version information.')
    print('')
    print('Version:')
    print(' Version: %s' % program_version)
    print(' Date: %s' % build_date)
    print('')
    print('Requires:')
    print(' - "SEQUENCELOGICHOME" environmental variable to be set.')
    print(' - The folder at "${SEQUENCELOGIC}/bin" (SL bin) to exist and be the home for SequenceLogic' + '\n'
          ' programs.')
    print(' - The Phrases ("phrases") program to be in "${SEQUENCELOGICHOME}/bin".')
    print(' - The Phrase Count Loader ("PhraseCountLoader.py") script to be in the SL bin directory.')
    print(' - The Tokenizer ("tokenizer-one-jar.jar") program to be in the SL bin directory.')
    print(' - The Walker Term Diff ("walker-term-diff-one-jar.jar") program to be in the SL bin directory.')
    print(' - The files "MaleNames.txt", "FemaleNames.txt", & "Surnames.txt" to be in\n'
          ' "${SEQUENCELOGICHOME}/config/data".')
    print('')
    print('Notes:')
    print(' - If "SEQUENCELOGICHOME" is not set, it will be set to "/sequencelogic" by default.')
    print(' - If any of the files "MaleNames.txt", "FemaleNames.txt", or "Surnames.txt" is not in\n'
          ' "${SEQUENCELOGIC}/config/data"; the missing file(s) will have an empty file with the same name\n '
          ' created (this will cause name redaction to not work).')
    print(' - An environmental variable is set by this script for running the indexer:')
    print(' - "LOGALL" is set to "false" by default and causes additional logging to occur while indexing.')
    print('')
    print('Author: %s' % author)


def _relay_output(stream):
    """
    Forwards every line read from a child process pipe to the printer, stripped of its line terminators.

    Args:
        ``stream`` -- A binary pipe (such as ``Popen.stdout``) to read until end of file.

    Returns:
        ``None``
    """
    for raw_line in iter(stream.readline, b''):
        printer.write_no_prefix(' %s' % raw_line.decode('utf-8').replace('\n', '').replace('\r', ''))


def copy_files(idx_root):
    """
    Copies the name files located by :meth:`env_is_good` (``${SEQUENCELOGICHOME}/SLSync/config/data``) to the index's
    output directory.

    Args:
        ``idx_root`` -- ``str`` The path to save the index to.

    Returns:
        ``None``
    """
    for name_file in (male_name_file, female_name_file, last_name_file):
        shutil.copy(name_file, idx_root)


def run_walker_term_diff(similar_types, lib_root, idx_root):
    """
    Runs WalkerTermDiff from ${SEQUENCELOGICHOME}/bin to generate the unique terms json file between doctypes which
    are known to be too similar to one another.

    Args:
        ``similar_types`` -- ``list(str)`` The list of doctypes which are too similar to one another.

        ``lib_root`` -- ``str`` The path to the library being used to create an index.

        ``idx_root`` -- ``str`` The path to the index directory being created.

    Returns:
        ``None``
    """
    term_diff_cmd = ['java', '-jar', term_diff_jar,
                     '--libRoot', lib_root,
                     '--outRoot', idx_root,
                     '--doctypes'] + similar_types
    print('Running WalkerTermDiff')
    start_time = time.time()
    term_diff_proc = subprocess.Popen(term_diff_cmd, stdout=subprocess.PIPE, stderr=sys.stderr)
    _relay_output(term_diff_proc.stdout)
    term_diff_proc.wait()
    elapsed = time.time() - start_time
    printer.write_line_break()
    # The original message said "Phrase runner", a copy-paste slip from another step.
    print('WalkerTermDiff completed in %.4f s.' % elapsed)
    printer.write_line_break()


def run_phrase_loader(phrase_dir, out_file):
    """
    Runs PhraseCountLoader.py from ${SEQUENCELOGICHOME}/bin to generate the "Phrases.txt" file used by WalkerIndexer,
    then deletes ``phrase_dir``.

    Args:
        ``phrase_dir`` -- ``str`` The output directory used when running phrases.

        ``out_file`` -- ``str`` The path to save the output to (should be ``"Phrases.txt"`` in the index directory).

    Returns:
        ``None``
    """
    phrase_loader_cmd = ['python', phrase_loader, phrase_root_arg, phrase_dir, phrase_file_arg, out_file]
    print('Running Phrase Loader')
    start_time = time.time()
    phrase_loader_proc = subprocess.Popen(phrase_loader_cmd, stdout=subprocess.PIPE, stderr=sys.stderr)
    _relay_output(phrase_loader_proc.stdout)
    phrase_loader_proc.wait()
    elapsed = time.time() - start_time
    # Remove the unneeded files
    shutil.rmtree(phrase_dir)
    printer.write_line_break()
    print('Phrase Loader completed in %.4f s.' % elapsed)
    printer.write_line_break()


def run_tokenizer(lib_dir, idx_dir):
    """
    Runs Tokenizer on the library to generate the token files used by phrases.

    Args:
        ``lib_dir`` -- ``str`` The path to the library to generate ``tkn`` files for.

        ``idx_dir`` -- ``str`` The path to save the index to.

    Returns:
        ``None``
    """
    tokenizer_cmd = ['java', '-jar', tokenizer_jar, idx_arg, idx_dir, '--libDir', lib_dir]
    print('Running Tokenizer')
    start_time = time.time()
    tokenizer_proc = subprocess.Popen(tokenizer_cmd, stdout=subprocess.PIPE)
    _relay_output(tokenizer_proc.stdout)
    tokenizer_proc.wait()
    elapsed = time.time() - start_time
    printer.write_line_break()
    print('Tokenizer completed in %.4f s.' % elapsed)


def run_phrase_maker(files, out_file):
    """
    Runs phrases on the given list of files, appending its standard output to ``out_file``.

    Lines written by phrases to standard error are relayed to the printer (except those containing
    ``"Processing"``). When a line reports a zero size file, that file is deleted.

    Args:
        ``files`` -- ``list(str)`` The list of files to generate phrases from.

        ``out_file`` -- ``str`` The path to save the output to.

    Returns:
        ``None``
    """
    phrase_cmd = [phrase_exe,
                  doctype_coverage_arg, doctype_coverage,
                  min_phrase_len_arg, min_phrase_len,
                  max_phrase_len_arg, max_phrase_len] + files
    start_time = time.time()
    # The context manager guarantees the output file is closed once phrases has finished.
    with open(out_file, 'a+') as out:
        phrase_proc = subprocess.Popen(phrase_cmd, stdout=out, stderr=subprocess.PIPE)
        for line in iter(phrase_proc.stderr.readline, b''):
            txt = line.decode('utf-8').replace('\n', '').replace('\r', '')
            if txt == '':
                continue
            if 'Processing' not in txt:
                printer.write_no_prefix('%s [PhraseMaker] %s' %
                                        (datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'), txt))
            # NOTE(review): the original nesting of this check was lost with the file's indentation; it is
            # treated here as independent of the "Processing" filter - confirm.
            if 'is zero size' in txt:
                # presumably the message carries a 5 character prefix before the file path - TODO confirm
                start_idx = 5
                end_idx = txt.rfind('.tkn') + 4
                empty_file = txt[start_idx:end_idx]
                print('Deleting emtpy file detected by PhraseMaker %s' % empty_file)
                try:
                    os.remove(empty_file)
                except OSError as ex:
                    print('Failed to delete file:')
                    print('Message: %s' % str(ex))
        phrase_proc.wait()
    run_time = time.time() - start_time
    # Guard against a zero elapsed time on clocks with a coarse resolution.
    rate = len(files) / run_time if run_time > 0 else 0.0
    print('Analyzed phrases on %d documents in %.4f s (%.1f documents/s)' % (len(files), run_time, rate))


def run_phrase(phrase_dir, lib_dir):
    """
    Runs phrase on every doctype in the library using :meth:`run_phrase_maker`. Phrases for a doctype are saved into
    a file in ``phrase_dir`` called ``"DOCTYPE.phrasecount"``.

    Args:
        ``phrase_dir`` -- ``str`` The path to save the outputs from phrase to.

        ``lib_dir`` -- ``str`` The path to the library to generate the phrases from.

    Returns:
        ``None``
    """
    print('Generating phrases in library')
    for folder in os.listdir(lib_dir):
        folder = folder.replace('\uf028', '')
        # Get the output file name
        doctype = folder
        out_file_name = os.path.join(phrase_dir, '%s.phrasecount' % doctype)
        folder = os.path.join(lib_dir, folder)
        # NOTE(review): the original indentation was lost; the example count check below is assumed to apply only
        # to doctype folders (directories other than "templates") - confirm.
        if not os.path.isdir(folder) or doctype.lower() == 'templates':
            continue
        # Get all the token files
        candidates = (os.path.join(folder, entry) for entry in os.listdir(folder))
        files = [path for path in candidates if path.endswith('.tkn') and os.path.isfile(path)]
        num_examples = len(files)
        # Only run the phrase maker if we have enough examples
        if num_examples >= min_num_examples:
            printer.write_line_break()
            print('Running phrases on %s' % doctype)
            run_phrase_maker(files, out_file_name)
            printer.write_line_break()
        else:
            print('There are not enough examples of doctype "%s" in the library to generate phrases '
                  '(examples required: %d, examples in library: %d)' % (doctype, min_num_examples, num_examples))
            printer.write_line_break()


def run_walker_indexer(idx_dir, lib_dir, cfg_path):
    """
    Runs WalkerIndexer.

    Args:
        ``idx_dir`` -- ``str`` The path to save the index to.

        ``lib_dir`` -- ``str`` The path to the library to run WalkerIndexer on.

        ``cfg_path`` -- ``str`` The config file for WalkerIndexer to use.

    Returns:
        ``None``

    Raises:
        ``subprocess.CalledProcessError`` if WalkerIndexer exits with a non-zero status.
    """
    index_cmd = ['java', '-jar', indexer_jar, idx_arg, idx_dir, lib_arg, lib_dir, cfg_arg, cfg_path]
    print('Running WalkerIndexer')
    start_time = time.time()
    index_proc = subprocess.Popen(index_cmd, stdout=subprocess.PIPE, stderr=sys.stderr)
    _relay_output(index_proc.stdout)
    # wait() already sets returncode, so no additional poll() is needed.
    index_proc.wait()
    if index_proc.returncode != 0:
        printer.write_line_break(break_char=' ')
        print('ERROR: WalkerIndexer has crashed')
        raise subprocess.CalledProcessError(index_proc.returncode, index_proc.args)
    elapsed = time.time() - start_time
    printer.write_line_break()
    print('WalkerIndexer completed with status %d in %.4f s.' % (index_proc.returncode, elapsed))
    printer.write_line_break()


def _ensure_name_file(directory, file_name):
    """
    Returns the path of ``file_name`` inside ``directory``, creating an empty file (with a warning) if it is missing.

    Args:
        ``directory`` -- ``str`` The directory expected to contain the name file.

        ``file_name`` -- ``str`` The name of the file (for example ``"MaleNames.txt"``).

    Returns:
        ``str`` The full path to the name file.
    """
    path = os.path.join(directory, file_name)
    if not os.path.exists(path):
        print('Warning: There is no "%s" file on this system. Name Redaction will not work correctly.' % file_name)
        # Make an empty file
        open(path, 'w+').close()
    return path


def env_is_good():
    """
    Determines if this system can run WalkerIndexer or not.

    As a side effect this sets default values for the "LOGALL" and "SEQUENCELOGICHOME" environmental variables when
    they are missing, and fills in the module level paths of the programs and data files used by the other steps.

    Returns:
        :py:`True` if the system can run WalkerIndexer :py:`False` otherwise.
    """
    global sl_home, male_name_file, female_name_file, last_name_file, phrase_loader, tokenizer_jar, indexer_jar, \
        phrase_exe, term_diff_jar

    # Is LOGALL set?
    if os.environ.get('LOGALL') is None:
        print('No value set for environmental variable "LOGALL", setting LOGALL = "%s"' % str(log_all))
        os.environ['LOGALL'] = str(log_all)

    # Eventually use this to find pages in the library which are too similar and do something with
    # them before indexing.
    # Is SAMEPAGES set?
    # if os.environ.get('SAMEPAGES') is None:
    #     print('No value set for environmental variable "SAMEPAGES", setting SAMEPAGES = "true"')
    #     os.environ['SAMEPAGES'] = 'true'

    # Check for SEQUENCELOGICHOME
    if os.environ.get('SEQUENCELOGICHOME') is None:
        print('Warning: The environmental variable "SEQUENCELOGICHOME" is not set')
        print('Setting SEQUENCELOGICHOME = "/sequencelogic" (default value)')
        os.environ['SEQUENCELOGICHOME'] = '/sequencelogic'

    # Does sl_home exist?
    sl_home = os.environ.get('SEQUENCELOGICHOME')
    if not os.path.exists(sl_home):
        print('Error: The environmental variable "SEQUENCELOGICHOME" does not point to a valid directory')
        print(' ${SEQUENCELOGICHOME} = "%s"' % sl_home)
        return False

    # Does sl_bin exist?
    sl_bin = os.path.join(sl_home, 'bin')
    if not os.path.exists(sl_bin):
        print('Error: There is no "bin" directory in "SEQUENCELOGICHOME"')
        print(' ${SEQUENCELOGICHOME}\\bin = "%s"' % sl_bin)
        return False

    # Is phrases on this system?
    phrase_exe = os.path.join(sl_bin, 'phrases')
    if not os.path.exists(phrase_exe):
        print('Error: There is no Phrases program on this system')
        print(' Expected at "%s"' % phrase_exe)
        # Fall back to the ".exe" name before giving up.
        phrase_exe += '.exe'
        if os.path.exists(phrase_exe):
            print(' OK: Found phrases.exe instead.')
        else:
            return False

    # Is Tokenizer on this system?
    tokenizer_jar = os.path.join(sl_bin, 'tokenizer-one-jar.jar')
    if not os.path.exists(tokenizer_jar):
        print('Error: There is no Tokenizer program on this system')
        print(' Expected at "%s"' % tokenizer_jar)
        return False

    # Is WalkerIndexer on this system?
    indexer_jar = os.path.join(sl_bin, 'walker-indexer-one-jar.jar')
    if not os.path.exists(indexer_jar):
        print('Error: There is no WalkerIndexer on this system.')
        print(' Expected at "%s"' % indexer_jar)
        return False

    # Is PhraseCountLoader on this system?
    phrase_loader = os.path.join(sl_bin, 'PhraseCountLoader.py')
    if not os.path.exists(phrase_loader):
        print('Error: There is no PhraseCountLoader on this system')
        print(' Expected at "%s"' % phrase_loader)
        return False

    # Is WalkerTermDiff on this system?
    term_diff_jar = os.path.join(sl_bin, 'walker-term-diff-one-jar.jar')
    if not os.path.exists(term_diff_jar):
        print('Error: There is no WalkerTermDiff on this system.')
        print(' Expected at "%s"' % term_diff_jar)
        return False

    # Check for index data
    index_files_home = os.path.join(sl_home, 'SLSync', 'config', 'data')
    # Does it exist?
    if not os.path.exists(index_files_home):
        print('Error: There is no "data" directory on this system ${SEQUENCELOGICHOME}\\SLSync\\config\\data')
        print(' Expected at "%s"' % index_files_home)
        return False

    # Missing name files are not fatal: an empty placeholder is created for each one instead.
    male_name_file = _ensure_name_file(index_files_home, 'MaleNames.txt')
    female_name_file = _ensure_name_file(index_files_home, 'FemaleNames.txt')
    last_name_file = _ensure_name_file(index_files_home, 'Surnames.txt')
    return True


# This is the main function of the program.
def main(lib_root, idx_root, config_path):
    """
    Runs every step needed to build the index: copies the name files, runs Tokenizer, phrases and the Phrase Count
    Loader, and finally WalkerIndexer.

    The process exits with status 0 when the config file disables ScoreWalker, and with status -1 when the
    ``"CLASSIFYWALKER"`` section (or its ``"enabled"`` key) is missing.

    Args:
        ``lib_root`` -- ``str`` The path to the library to index.

        ``idx_root`` -- ``str`` The path to save the index to.

        ``config_path`` -- ``str`` The path to the JSON config file to use.

    Returns:
        ``None``
    """
    global walker_config_data
    with open(config_path) as cfg:
        config_data = json.load(cfg)

    try:
        walker_config_data = config_data['CLASSIFYWALKER']
        if not walker_config_data['enabled']:
            print('ScoreWalker is not enabled in this config file. It will not be indexed')
            print('Exiting...')
            sys.exit(0)
    except KeyError:
        print('Error: The config file at "%s" has no "CLASSIFYWALKER" field.' % config_path)
        sys.exit(-1)

    print('Creating Index')
    start_time = time.time()
    phrase_root = os.path.join(idx_root, 'phrases')
    if not os.path.exists(phrase_root):
        os.mkdir(phrase_root)
    phrase_file = os.path.join(idx_root, 'Phrases.txt')
    # Start from an empty Phrases.txt.
    with open(phrase_file, 'w+') as tmp:
        tmp.write('')

    # Copy the files
    copy_files(idx_root)
    # Run the tokenizer
    run_tokenizer(lib_root, idx_root)
    # Run the term diff
    # Run the phrase maker
    run_phrase(phrase_root, lib_root)
    # Run the loader
    run_phrase_loader(phrase_root, phrase_file)
    # Now we can index
    run_walker_indexer(idx_root, lib_root, config_path)
    # Get the run time
    run_time = time.time() - start_time
    printer.write_no_prefix('Indexing successful!')
    print('Indexing completed in %.4f s.' % run_time)
    print('')


def check_args(lib_root, idx_root, cfg_file):
    """
    Checks the arguments to make sure they are valid, and WalkerIndexer can be run. This function will exit if either
    the arguments are invalid or :meth:`env_is_good` returns :py:`False`.

    On success the "UseFirstPages" environmental variable is set and the run information is displayed with
    :meth:`show_args`.

    Args:
        ``lib_root`` -- ``str`` The path to the library to index.

        ``idx_root`` -- ``str`` The path to save the index to.

        ``cfg_file`` -- ``str`` The path to the config file to use.

    Returns:
        ``None``
    """
    # We only exit if a required argument was bad, we can handle optional arguments.
    fatal_errors = False

    # Check the library root
    if not os.path.exists(lib_root):
        print('Error: Library Path does not exist: "%s"' % lib_root)
        fatal_errors = True

    # Check the index root
    if not os.path.exists(idx_root):
        print('Warning: Index Path does not exist, creating index path at "%s"' % idx_root)
        try:
            os.mkdir(idx_root)
            print('OK: Created index root successfully at "%s"' % idx_root)
        except OSError:
            # OSError also covers permission problems, not only a missing parent directory.
            print('Error: Failed to create index at "%s"' % idx_root)
            fatal_errors = True

    # Check the config file.
    # NOTE(review): main() opens the config file unconditionally, so a missing file still fails later - confirm
    # which behaviour is intended.
    if not os.path.exists(cfg_file):
        print('Warning: There is no config file at "%s"' % cfg_file)
        print(' OK: WalkerIndexer will run without container/contained logic in place '
              '(See WalkerIndexer documentation for more information).')

    if fatal_errors or not env_is_good():
        parser.print_help()
        print('Exiting...')
        # NOTE(review): status 0 is kept from the original even though this is a failure path - confirm the
        # caller does not need a non-zero status here.
        sys.exit(0)

    os.environ['UseFirstPages'] = str(os.path.exists(os.path.join(idx_root, 'FirstPageIndex')))
    show_args(lib_root, idx_root, cfg_file)


# prints the argument values passed in.
def show_args(lib_root, idx_root, config):
    """
    Displays the run time information for this tool.

    Args:
        ``lib_root`` -- ``str`` The path to the library to index.

        ``idx_root`` -- ``str`` The path to save the index to.

        ``config`` -- ``str`` The path to the config file to use.

    Returns:
        ``None``
    """
    # print the arguments with a brief summation of what they mean.
    print('Arguments:')
    print(' Running indexer on library at: "%s"' % lib_root)
    print(' Saving index at: "%s"' % idx_root)
    print(' Using configuration from: "%s"' % config)
    printer.write_line_break()
    print('Environmental Variables:')
    print(' SEQUENCELOGICHOME = "%s"' % sl_home)
    # Report the value actually in the environment; the module default is only a fallback.
    print(' LOGALL = "%s"' % os.environ.get('LOGALL', str(log_all)))
    printer.write_line_break()
    print('Program execution order:')
    print(' - Tokenizer [jar]')
    print(' - Walker Term Diff [jar] (IFF there are similar doctypes defined in the config file)')
    print(' - Phrase Maker [binary executable]')
    print(' - Phrase Count Loader [py script]')
    print(' - Walker Indexer [jar]')
    printer.write_line_break()
    print('Program Configurations:')
    print(' Tokenizer Config:')
    print(' Tokenizer executable location: "%s"' % tokenizer_jar)
    print(' Index root directory: "%s"' % idx_root)
    print(' Library root directory: "%s"' % lib_root)
    print(' Walker Term Diff Config:')
    print(' Walker Term Diff executable location: "%s"' % term_diff_jar)
    print(' Index root directory: "%s"' % idx_root)
    print(' Library root directory: "%s"' % lib_root)
    print(' Phrase Maker Config:')
    print(' Phrase Maker executable location: "%s"' % phrase_exe)
    print(' Minimum phrase length: %s' % min_phrase_len)
    print(' Maximum phrase length: %s' % max_phrase_len)
    print(' Minimum doctype coverage: %s%%' % doctype_coverage)
    print(' Minimum number of examples to generate a phrase: %d' % min_num_examples)
    print(' Phrase Count Loader Config:')
    print(' Phrase Count Loader executable location: "%s"' % phrase_loader)
    print(' Maximum number of doctype occurrences: %d' % 1)
    print(' Filter sub-phrases over doctype: %s' % 'YES')
    print(' Walker Indexer Config:')
    print(' Walker Indexer executable location: "%s"' % indexer_jar)
    print(' Index root directory: "%s"' % idx_root)
    print(' Library root directory: "%s"' % lib_root)
    printer.write_line_break()
    printer.write_no_prefix('')


# Set up arguments here; more groups can be made if required.
def make_args():
    """
    Registers the required and optional commandline arguments on the module level parser.

    Returns:
        ``None``
    """
    # A group for required arguments
    required_args = parser.add_argument_group('Required')
    # A group for optional arguments
    optional_args = parser.add_argument_group('Optional')

    required_args.add_argument('-c', '--config', required=True)
    required_args.add_argument('-i', '--indexRoot', required=True)
    required_args.add_argument('-l', '--libRoot', required=True)

    optional_args.add_argument('-h', '--help', action=ConsoleUtils.CustomPrintAction, print_fn=_print_help)
    optional_args.add_argument('-v', '--version', action=ConsoleUtils.CustomPrintAction, print_fn=_print_version)


# This is where we call the main method from.
if __name__ == '__main__':
    print(ConsoleUtils.get_header(program_name, program_version, build_date, author))
    make_args()
    args = parser.parse_args()
    # Get the argument.
    library_root = args.libRoot
    index_root = args.indexRoot
    config_file = args.config
    # Set the printer up as the new std out.
    sys.stdout = printer
    # Check the arguments and environment
    check_args(library_root, index_root, config_file)
    # If we got here the args and env are all good
    try:
        main(library_root, index_root, config_file)
    except Exception as ex:
        # NOTE(review): the error is only reported here, so the process still ends with status 0 - confirm the
        # caller does not rely on the exit status to detect a failed indexing run.
        printer.write_line_break(break_char=' ')
        print('Encountered a fatal error while attempting to index: %s' % type(ex).__name__)
        print(' Message: %s' % str(ex))