755 lines
26 KiB
Python
755 lines
26 KiB
Python
|
|
"""
|
||
|
|
.. role:: py(code)
|
||
|
|
:language: python
|
||
|
|
|
||
|
|
Information
|
||
|
|
-----------
|
||
|
|
This is the script responsible for running WalkerIndexer, the indexing tool used for the ScoreWalker classification
|
||
|
|
engine. This script handles performing all the necessary setup and runs any "pre-index" tools before running
|
||
|
|
WalkerIndexer. This tool pipes the console logs from the scripts/programs it calls to the current standard output.
|
||
|
|
Logging of this information is handled by the Java code calling this script.
|
||
|
|
|
||
|
|
Currently, this program has 5 steps to its execution:
|
||
|
|
|
||
|
|
#) Copying the data files ``"MaleNames.txt", "FemaleNames.txt" and "Surnames.txt"`` to the destination index
|
||
|
|
directory.
|
||
|
|
|
||
|
|
#) Running ``"Tokenizer"`` on any new library elements.
|
||
|
|
|
||
|
|
#) Running ``"phrases"`` on the ``tkn`` files generated by ``"Tokenizer"``.
|
||
|
|
|
||
|
|
#) Running ``"PhraseCountLoader.py"`` on the ``phrasecount`` files generated by ``"phrases"``.
|
||
|
|
|
||
|
|
#) Running WalkerIndexer.
|
||
|
|
|
||
|
|
This tool is only intended to be run from the commandline in production, however, the methods have been documented to
|
||
|
|
allow for an easier understanding of how this tool works.
|
||
|
|
|
||
|
|
|
||
|
|
Commandline Usage
|
||
|
|
------------------
|
||
|
|
Usage ``sequencelogic-run-walker-indexer.py [-h, --help] [-c, --config] {CFG_FILE} [-i, --indexRoot] {IDX_ROOT}
|
||
|
|
[-l, --libRoot] {LIB_ROOT}``
|
||
|
|
|
||
|
|
Required Arguments:
|
||
|
|
``-c CFG_FILE, --config CFG_FILE``
|
||
|
|
Where ``CFG_FILE`` is the path to the config file being used for WalkerIndexer.
|
||
|
|
|
||
|
|
``-i IDX_ROOT, --indexRoot IDX_ROOT``
|
||
|
|
Where ``IDX_ROOT`` is the path to save the index generated by WalkerIndexer to.
|
||
|
|
|
||
|
|
``-l LIB_ROOT, --libRoot LIB_ROOT``
|
||
|
|
Where ``LIB_ROOT`` is the path to the library for WalkerIndexer to index.
|
||
|
|
|
||
|
|
Optional Arguments:
|
||
|
|
``-h, --help``
|
||
|
|
Prints the help message
|
||
|
|
|
||
|
|
``-v, --version``
|
||
|
|
Prints the version of WalkerIndexer being used as well as a description of what the version does and requires.
|
||
|
|
|
||
|
|
|
||
|
|
Python Module Usage
|
||
|
|
-------------------
|
||
|
|
"""
|
||
|
|
# TODO: Add a link to the referenced documentation
|
||
|
|
# We do all our imports at the top of our program.
|
||
|
|
import argparse
|
||
|
|
import datetime
|
||
|
|
import time
|
||
|
|
import os
|
||
|
|
import subprocess
|
||
|
|
import shutil
|
||
|
|
import sys
|
||
|
|
import json
|
||
|
|
|
||
|
|
import ConsoleUtils
|
||
|
|
|
||
|
|
# Give the program a name.
program_name = 'Run WalkerIndexer'
# Describe what the program does briefly.
program_description = 'Does the necessary steps to run WalkerIndexer.'
# The argument parser for the program.
parser = argparse.ArgumentParser(prog=program_name, description=program_description, add_help=False)
# Error and Warning console values:
usage = 'sequencelogic-run-walker-indexer.py ' \
        '-i,--indexRoot {IDX_PATH} ' \
        '-c,--config {CFG_FILE} ' \
        '-l,--libRoot {LIB_PATH} ' \
        '[options]'

build_date = '2017/11/21'
program_version = '2.2.3'
author = 'Chris Diesch <cdiesch@sequencelogic.net>'

# Root of the SequenceLogic installation; resolved from SEQUENCELOGICHOME by env_is_good().
sl_home = ''

# Paths to the name-data files used for Name Redaction; resolved by env_is_good().
male_name_file = ''
female_name_file = ''
last_name_file = ''

# Paths to the external programs this script drives; resolved by env_is_good().
phrase_exe = ''
tokenizer_jar = ''
term_diff_jar = ''
indexer_jar = ''
phrase_loader = ''

# Commandline flags understood by the "phrases" executable.
doctype_coverage_arg = '-c'
min_phrase_len_arg = '-p'
max_phrase_len_arg = '-P'

# Values passed with the flags above ("phrases" takes them as strings).
doctype_coverage = str(70)
min_phrase_len = str(3)
max_phrase_len = str(8)

# Minimum number of example documents a doctype needs before phrases are generated for it.
min_num_examples = 2

# Commandline flags shared by the Tokenizer and WalkerIndexer jars.
idx_arg = '--indexDir'
lib_arg = '--dataDir'
cfg_arg = '--configFile'

# Commandline flags for PhraseCountLoader.py.
phrase_root_arg = '--in_dir'
phrase_file_arg = '--dest_file'

# Default for the LOGALL environmental variable (extra logging while indexing).
log_all = False

# Printer used to prefix/forward this script's console output (installed as sys.stdout in __main__).
printer = ConsoleUtils.SLPrinter(program_name)

# The "CLASSIFYWALKER" section of the config file; populated in main().
walker_config_data = {}
|
||
|
|
|
||
|
|
|
||
|
|
def _print_version():
    """Print the program's version information (version, build date, author) to the real stdout."""
    # Restore the original stdout so this output is not prefixed by the SLPrinter.
    sys.stdout = printer.old_stdout
    print('')
    print(program_name)
    print('Version:')
    print('    Version: %s' % program_version)
    print('    Date: %s' % build_date)
    print('    Author: %s' % author)
|
||
|
|
|
||
|
|
|
||
|
|
def _print_help():
    """Print the full help message (usage, description, arguments, requirements, notes) to the real stdout."""
    # Restore the original stdout so the help text is not prefixed by the SLPrinter.
    sys.stdout = printer.old_stdout
    print('')
    print(program_name)
    print('Usage: %s' % usage)
    print('')
    # What is special about this version?
    print('Description:')
    print('    Creates and moves the necessary metadata to run WalkerIndexer v2.2.1 with Phrasing, Name Redaction,\n'
          '    Data Redaction, Container/Contained Doctypes, Similar Doctype Differencing, and Duplicate Page\n'
          '    Recognition enabled.\n'
          '    This process requires certain programs, files and environmental variables to exist/be set, since \n'
          '    WalkerIndexer only handles generating the Lucene index. These requirements are listed below.')
    print('')
    print('Arguments:')
    print('    -i,--indexRoot {IDX_PATH} The path to the save the generated index to.')
    print('    -c,--config {CFG_FILE} The path to the configuration file to use while indexing.')
    print('    -l,--libRoot {LIB_PATH} The path to the library to index.')
    print('')
    print('Miscellaneous:')
    print('    -h,--help Prints the help message.')
    print('    -v,--version Prints the version information.')
    print('')
    print('Version:')
    print('    Version: %s' % program_version)
    print('    Date: %s' % build_date)
    print('')
    print('Requires:')
    print('    - "SEQUENCELOGICHOME" environmental variable to be set.')
    print('    - The folder at "${SEQUENCELOGIC}/bin" (SL bin) to exist and be the home for SequenceLogic' + '\n'
          '      programs.')
    print('    - The Phrases ("phrases") program to be in "${SEQUENCELOGICHOME}/bin".')
    print('    - The Phrase Count Loader ("PhraseCountLoader.py") script to be in the SL bin directory.')
    print('    - The Tokenizer ("tokenizer-one-jar.jar") program to be in the SL bin directory.')
    print('    - The Walker Term Diff ("walker-term-diff-one-jar.jar") program to be in the SL bin directory.')
    print('    - The files "MaleNames.txt", "FemaleNames.txt", & "Surnames.txt" to be in\n'
          '      "${SEQUENCELOGICHOME}/config/data".')
    print('')
    print('Notes:')
    print('    - If "SEQUENCELOGICHOME" is not set, it will be set to "/sequencelogic" by default.')
    print('    - If any of the files "MaleNames.txt", "FemaleNames.txt", or "Surnames.txt" is not in\n'
          '      "${SEQUENCELOGIC}/config/data"; the missing file(s) will have an empty file with the same name\n '
          '      created (this will cause name redaction to not work).')
    print('    - An environmental variable is set by this script for running the indexer:')
    print('    - "LOGALL" is set to "false" by default and causes additional logging to occur while indexing.')
    print('')
    print('Author: %s' % author)
|
||
|
|
|
||
|
|
|
||
|
|
def copy_files(idx_root):
    """
    Copies the name-data files (male names, female names, and surnames) from their default location
    ${SEQUENCELOGICHOME}/SLSync/config/data to the index's output directory.

    Args:
        ``idx_root`` -- ``str`` The path to save the index to.

    Returns:
        ``None``

    """

    # The three file paths were resolved (and created if missing) by env_is_good().
    for name_file in (male_name_file, female_name_file, last_name_file):
        shutil.copy(name_file, idx_root)
|
||
|
|
|
||
|
|
|
||
|
|
def run_walker_term_diff(similar_types, lib_root, idx_root):
    """
    Runs WalkerTermDiff from ${SEQUENCELOGICHOME}/bin to generate the unique terms json file between doctypes which are
    known to be too similar to one another.

    Args:
        ``similar_types`` -- ``list(str)`` The doctypes which are too similar to one another.

        ``lib_root`` -- ``str`` The path to the library being used to create an index.

        ``idx_root`` -- ``str`` The path to the index directory being created.

    Returns:
        ``None``

    """

    # Build the java commandline; the similar doctype names trail the "--doctypes" flag.
    term_diff_cmd = ['java', '-jar', term_diff_jar,
                     '--libRoot', lib_root,
                     '--outRoot', idx_root,
                     '--doctypes'] + similar_types

    print('Running WalkerTermDiff')
    start_time = time.time()

    # Pipe the jar's stdout through the printer; its stderr goes straight to ours.
    term_diff_proc = subprocess.Popen(term_diff_cmd, stdout=subprocess.PIPE, stderr=sys.stderr)
    for line in iter(term_diff_proc.stdout.readline, b''):
        printer.write_no_prefix('    %s' % line.decode('utf-8').replace('\n', '').replace('\r', ''))

    term_diff_proc.wait()
    run_time = time.time() - start_time

    printer.write_line_break()
    # Fixed: the completion message previously said "Phrase runner" (copy/paste slip) -- this ran WalkerTermDiff.
    print('WalkerTermDiff completed in %.4f s.' % run_time)
    printer.write_line_break()
|
||
|
|
|
||
|
|
|
||
|
|
def run_phrase_loader(phrase_dir, out_file):
    """
    Runs PhraseCountLoader.py from ${SEQUENCELOGICHOME}/bin to generate the "Phrases.txt" file used by
    WalkerIndexer, then removes the intermediate phrase directory.

    Args:
        ``phrase_dir`` -- ``str`` The output directory used when running phrases.

        ``out_file`` -- ``str`` The path to save the output to (should be ``"Phrases.txt"`` in the index directory).

    Returns:
        ``None``

    """

    loader_cmd = ['python', phrase_loader,
                  phrase_root_arg, phrase_dir,
                  phrase_file_arg, out_file]

    print('Running Phrase Loader')
    started = time.time()

    # Forward the loader's stdout through the printer, one cleaned-up line at a time.
    loader_proc = subprocess.Popen(loader_cmd, stdout=subprocess.PIPE, stderr=sys.stderr)
    for raw_line in iter(loader_proc.stdout.readline, b''):
        cleaned = raw_line.decode('utf-8').replace('\n', '').replace('\r', '')
        printer.write_no_prefix('    %s' % cleaned)

    loader_proc.wait()

    elapsed = time.time() - started
    # Remove the unneeded files
    shutil.rmtree(phrase_dir)
    printer.write_line_break()
    print('Phrase Loader completed in %.4f s.' % elapsed)
    printer.write_line_break()
|
||
|
|
|
||
|
|
|
||
|
|
def run_tokenizer(lib_dir, idx_dir):
    """
    Runs Tokenizer on the library to generate the token files used by phrases.

    Args:
        ``lib_dir`` -- ``str`` The path to the library to generate ``tkn`` files for.

        ``idx_dir`` -- ``str`` The path to save the index to.

    Returns:
        ``None``

    """

    cmd = ['java', '-jar', tokenizer_jar,
           idx_arg, idx_dir,
           '--libDir', lib_dir]

    print('Running Tokenizer')
    started = time.time()

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)

    # Echo the jar's output through the printer with line endings stripped.
    for raw in iter(proc.stdout.readline, b''):
        printer.write_no_prefix('    %s' % raw.decode('utf-8').replace('\n', '').replace('\r', ''))

    proc.wait()

    elapsed = time.time() - started
    printer.write_line_break()
    print('Tokenizer completed in %.4f s.' % elapsed)
|
||
|
|
|
||
|
|
|
||
|
|
def run_phrase_maker(files, out_file):
    """
    Runs phrases on the given list of files, appending the phrase output to ``out_file``.

    Phrases writes its results to stdout (captured into ``out_file``) and its diagnostics to stderr, which is
    echoed through the printer. Token files that phrases reports as zero size are deleted.

    Args:
        ``files`` -- ``list(str)`` The list of files to generate phrases from.

        ``out_file`` -- ``str`` The path to save the output to.

    Returns:
        ``None``

    """

    phrase_cmd = [phrase_exe,
                  doctype_coverage_arg, doctype_coverage,
                  min_phrase_len_arg, min_phrase_len,
                  max_phrase_len_arg, max_phrase_len] + files
    start_time = time.time()
    # Use a context manager so the output file handle is closed even if something below raises
    # (the original code opened it and never closed it).
    with open(out_file, 'a+') as out:
        phrase_proc = subprocess.Popen(phrase_cmd, stdout=out, stderr=subprocess.PIPE)

        for line in iter(phrase_proc.stderr.readline, b''):
            txt = line.decode('utf-8').replace('\n', '').replace('\r', '')
            if txt == '':
                continue
            # "Processing ..." lines are progress noise; everything else is worth echoing.
            if 'Processing' not in txt:
                printer.write_no_prefix('%s [PhraseMaker] %s' %
                                        (datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S'), txt))
            if 'is zero size' in txt:
                # The path is sliced out of the message between a 5-char prefix and the ".tkn" suffix
                # -- assumes the message shape 'File <path>.tkn is zero size'; TODO confirm.
                start_idx = 5
                end_idx = txt.rfind('.tkn') + 4
                empty_file = txt[start_idx:end_idx]
                # Fixed typo in the log message ("emtpy" -> "empty").
                print('Deleting empty file detected by PhraseMaker %s' % empty_file)
                try:
                    os.remove(empty_file)
                except OSError as ex:
                    # Best-effort cleanup: log the failure and keep processing.
                    print('Failed to delete file:')
                    print('    Message: %s' % str(ex))

        phrase_proc.wait()
    run_time = time.time() - start_time
    # Guard against a zero elapsed time when computing the throughput.
    rate = len(files) / run_time if run_time > 0 else float('inf')
    print('Analyzed phrases on %d documents in %.4f s (%.1f documents/s)' % (len(files), run_time, rate))
|
||
|
|
|
||
|
|
|
||
|
|
def run_phrase(phrase_dir, lib_dir):
    """
    Runs phrase on every doctype in the library using :meth:`run_phrase_maker`. Phrases for a doctype are saved into a
    file in ``phrase_dir`` called ``"DOCTYPE.phrasecount"``.

    Args:
        ``phrase_dir`` -- ``str`` The path to save the outputs from phrase to.

        ``lib_dir`` -- ``str`` The path to the library to generate the phrases from.

    Returns:
        ``None``

    """

    print('Generating phrases in library')

    for entry in os.listdir(lib_dir):
        # Strip the private-use glyph some folder names carry.
        entry = entry.replace('\uf028', '')
        doctype = entry
        out_file_name = os.path.join(phrase_dir, '%s.phrasecount' % doctype)
        doctype_dir = os.path.join(lib_dir, entry)

        # Skip plain files and the special "templates" folder.
        if not os.path.isdir(doctype_dir) or doctype.lower() == 'templates':
            continue

        # Collect every token file in this doctype's folder.
        joined = (os.path.join(doctype_dir, name) for name in os.listdir(doctype_dir))
        token_files = [path for path in joined if path.endswith('.tkn') and os.path.isfile(path)]
        num_examples = len(token_files)

        # Only run the phrase maker if we have enough examples
        if num_examples >= min_num_examples:
            printer.write_line_break()
            print('Running phrases on %s' % doctype)
            run_phrase_maker(token_files, out_file_name)
            printer.write_line_break()
        else:
            print('There are not enough examples of doctype "%s" in the library to generate phrases '
                  '(examples required: %d, examples in library: %d)' %
                  (doctype, min_num_examples, num_examples))
            printer.write_line_break()
|
||
|
|
|
||
|
|
|
||
|
|
def run_walker_indexer(idx_dir, lib_dir, cfg_path):
    """
    Runs WalkerIndexer.

    Args:
        ``idx_dir`` -- ``str`` The path to save the index to.

        ``lib_dir`` -- ``str`` The path to the library to run WalkerIndexer on.

        ``cfg_path`` -- ``str`` The config file for WalkerIndexer to use.

    Returns:
        ``None``

    Raises:
        ``subprocess.CalledProcessError`` if WalkerIndexer exits with a non-zero status.

    """

    index_cmd = ['java', '-jar', indexer_jar,
                 idx_arg, idx_dir,
                 lib_arg, lib_dir,
                 cfg_arg, cfg_path]

    print('Running WalkerIndexer')
    start_time = time.time()

    index_proc = subprocess.Popen(index_cmd, stdout=subprocess.PIPE, stderr=sys.stderr)

    # Echo the indexer's stdout through the printer.
    for line in iter(index_proc.stdout.readline, b''):
        printer.write_no_prefix('    %s' % line.decode('utf-8').replace('\n', '').replace('\r', ''))

    # wait() blocks until exit and sets returncode; the poll() call the original made right
    # after wait() was redundant and has been removed.
    index_proc.wait()

    if index_proc.returncode != 0:
        printer.write_line_break(break_char=' ')
        print('ERROR: WalkerIndexer has crashed')
        raise subprocess.CalledProcessError(index_proc.returncode, index_proc.args)

    end_time = time.time() - start_time
    printer.write_line_break()
    print('WalkerIndexer completed with status %d in %.4f s.' % (index_proc.returncode, end_time))
    printer.write_line_break()
|
||
|
|
|
||
|
|
|
||
|
|
def env_is_good():
    """
    Determines if this system can run WalkerIndexer or not.

    As a side effect this resolves (and stores in module globals) the paths to the external programs and the
    name-data files, sets "LOGALL" and "SEQUENCELOGICHOME" in the environment when they are missing, and creates
    empty name-data files when they do not exist.

    Returns:
        :py:`True` if the system can run WalkerIndexer :py:`False` otherwise.

    """
    global sl_home, male_name_file, female_name_file, last_name_file, phrase_loader, tokenizer_jar, indexer_jar, \
        phrase_exe, term_diff_jar

    # Is LOGALL set?
    if os.environ.get('LOGALL') is None:
        print('No value set for environmental variable "LOGALL", setting LOGALL = "%s"' % str(log_all))
        os.environ['LOGALL'] = str(log_all)

    # Eventually use this to find pages in the library which are too similar and do something with
    # them before indexing.

    # Is SAMEPAGES set?
    # if os.environ.get('SAMEPAGES') is None:
    #     print('No value set for environmental variable "SAMEPAGES", setting SAMEPAGES = "true"')
    #     os.environ['SAMEPAGES'] = 'true'

    # Check for SEQUENCELOGICHOME; default it when it is unset.
    if os.environ.get('SEQUENCELOGICHOME') is None:
        print('Warning: The environmental variable "SEQUENCELOGICHOME" is not set')
        print('Setting SEQUENCELOGICHOME = "/sequencelogic" (default value)')
        os.environ['SEQUENCELOGICHOME'] = '/sequencelogic'

    # Does sl_home exist?
    sl_home = os.environ.get('SEQUENCELOGICHOME')
    if not os.path.exists(sl_home):
        print('Error: The environmental variable "SEQUENCELOGICHOME" does not point to a valid directory')
        print('    ${SEQUENCELOGICHOME} = "%s"' % sl_home)
        return False

    # Does sl_bin exist?
    sl_bin = os.path.join(sl_home, 'bin')
    if not os.path.exists(sl_bin):
        print('Error: There is no "bin" directory in "SEQUENCELOGICHOME"')
        print('    ${SEQUENCELOGICHOME}\\bin = "%s"' % sl_bin)
        return False

    # Is phrases on this system?
    phrase_exe = os.path.join(sl_bin, 'phrases')
    if not os.path.exists(phrase_exe):
        print('Error: There is no Phrases program on this system')
        print('    Expected at "%s"' % phrase_exe)
        # Fall back to the Windows executable name before giving up.
        phrase_exe += '.exe'
        if os.path.exists(phrase_exe):
            print('    OK: Found phrases.exe instead.')
        else:
            return False

    # Is Tokenizer on this system?
    tokenizer_jar = os.path.join(sl_bin, 'tokenizer-one-jar.jar')
    if not os.path.exists(tokenizer_jar):
        print('Error: There is no Tokenizer program on this system')
        print('    Expected at "%s"' % tokenizer_jar)
        return False

    # Is WalkerIndexer on this system?
    indexer_jar = os.path.join(sl_bin, 'walker-indexer-one-jar.jar')
    if not os.path.exists(indexer_jar):
        print('Error: There is no WalkerIndexer on this system.')
        print('    Expected at "%s"' % indexer_jar)
        return False

    # Is PhraseCountLoader on this system?
    phrase_loader = os.path.join(sl_bin, 'PhraseCountLoader.py')
    if not os.path.exists(phrase_loader):
        print('Error: There is no PhraseCountLoader on this system')
        print('    Expected at "%s"' % phrase_loader)
        return False

    # Is WalkerTermDiff on this system?
    term_diff_jar = os.path.join(sl_bin, 'walker-term-diff-one-jar.jar')
    if not os.path.exists(term_diff_jar):
        print('Error: There is no WalkerTermDiff on this system.')
        print('    Expected at "%s"' % term_diff_jar)
        return False

    # Check for index data: ${SEQUENCELOGICHOME}/SLSync/config/data.
    index_files_home = os.path.join(sl_home, 'SLSync')
    index_files_home = os.path.join(index_files_home, 'config')
    index_files_home = os.path.join(index_files_home, 'data')
    # Does it exist?
    if not os.path.exists(index_files_home):
        print('Error: There is no "data" directory on this system ${SEQUENCELOGICHOME}\\SLSync\\config\\data')
        print('    Expected at "%s"' % index_files_home)
        return False

    # Check MaleNames.txt; create an empty placeholder when it is missing.
    male_name_file = os.path.join(index_files_home, 'MaleNames.txt')
    if not os.path.exists(male_name_file):
        print('Warning: There is no "MaleNames.txt" file on this system. Name Redaction will not work correctly.')
        # Make an empty file
        open(male_name_file, 'w+').close()

    # Check FemaleNames.txt; create an empty placeholder when it is missing.
    female_name_file = os.path.join(index_files_home, 'FemaleNames.txt')
    if not os.path.exists(female_name_file):
        print('Warning: There is no "FemaleNames.txt" file on this system. Name Redaction will not work correctly.')
        # Make an empty file
        open(female_name_file, 'w+').close()

    # Check Surnames.txt; create an empty placeholder when it is missing.
    last_name_file = os.path.join(index_files_home, 'Surnames.txt')
    if not os.path.exists(last_name_file):
        print('Warning: There is no "Surnames.txt" file on this system. Name Redaction will not work correctly.')
        # Make an empty file
        open(last_name_file, 'w+').close()

    return True
|
||
|
|
|
||
|
|
|
||
|
|
# This is the main function of the program.
|
||
|
|
def main(lib_root, idx_root, config_path):
    """
    Performs the full indexing pipeline: reads the config, copies the name files, tokenizes the library,
    generates and loads phrases, and finally runs WalkerIndexer.

    Args:
        ``lib_root`` -- ``str`` The path to the library to index.

        ``idx_root`` -- ``str`` The path to save the index to.

        ``config_path`` -- ``str`` The path to the config file to use.

    Returns:
        ``None``

    """
    global walker_config_data

    # Load the JSON config file.
    with open(config_path) as cfg:
        config_data = json.load(cfg)

    # Pull out the ScoreWalker section and bail out early when it is missing or disabled.
    try:
        walker_config_data = config_data['CLASSIFYWALKER']
        if not walker_config_data['enabled']:
            print('ScoreWalker is not enabled in this config file. It will not be indexed')
            print('Exiting...')
            exit(0)
    except KeyError:
        print('Error: The config file at "%s" has no "CLASSIFYWALKER" field.' % config_path)
        exit(-1)

    print('Creating Index')
    pipeline_start = time.time()

    # Make sure the intermediate phrase directory exists.
    phrase_root = os.path.join(idx_root, 'phrases')
    if not os.path.exists(phrase_root):
        os.mkdir(phrase_root)

    # Truncate/create the Phrases.txt file the loader will fill in.
    phrase_file = os.path.join(idx_root, 'Phrases.txt')
    with open(phrase_file, 'w+') as tmp:
        tmp.write('')

    # Step 1: copy the name-data files into the index directory.
    copy_files(idx_root)
    # Step 2: tokenize the library.
    run_tokenizer(lib_root, idx_root)
    # (Walker Term Diff is not invoked here yet.)
    # Step 3: generate phrases from the token files.
    run_phrase(phrase_root, lib_root)
    # Step 4: load the phrase counts into Phrases.txt.
    run_phrase_loader(phrase_root, phrase_file)
    # Step 5: build the index.
    run_walker_indexer(idx_root, lib_root, config_path)

    # Report the total run time.
    elapsed = time.time() - pipeline_start
    printer.write_no_prefix('Indexing successful!')
    print('Indexing completed in %.4f s.' % elapsed)
    print('')
|
||
|
|
|
||
|
|
|
||
|
|
def check_args(lib_root, idx_root, cfg_file):
    """
    Checks the arguments to make sure they are valid, and WalkerIndexer can be run. This function will exit if either
    the arguments are invalid or :meth:`env_is_good` returns :py:`False`.

    Args:
        ``lib_root`` -- ``str`` The path to the library to index.

        ``idx_root`` -- ``str`` The path to save the index to.

        ``cfg_file`` -- ``str`` The path to the config file to use.

    Returns:
        ``None``

    """

    # We only exit if a required argument was bad, we can handle optional arguments.
    fatal_errors = False

    # Check the library root
    if not os.path.exists(lib_root):
        print('Error: Library Path does not exist: "%s"' % lib_root)
        fatal_errors = True

    # Check the index root; create it when it is missing.
    if not os.path.exists(idx_root):
        print('Warning: Index Path does not exist, creating index path at "%s"' % idx_root)
        try:
            os.mkdir(idx_root)
            print('OK: Created index root successfully at "%s"' % idx_root)
        except OSError:
            # Catch OSError (not just FileNotFoundError) so permission problems and other
            # filesystem failures are reported as errors instead of crashing the script.
            print('Error: Failed to create index at "%s"' % idx_root)
            fatal_errors = True

    # Check the config file.
    if not os.path.exists(cfg_file):
        print('Warning: There is no config file at "%s"' % cfg_file)
        print('    OK: WalkerIndexer will run without container/contained logic in place '
              '(See WalkerIndexer documentation for more information).')

    if fatal_errors or not env_is_good():
        parser.print_help()
        print('Exiting...')
        # NOTE(review): this exits with status 0 even on fatal errors; the Java caller may rely
        # on that, so it is left unchanged -- confirm before altering.
        exit(0)

    # Tell the indexer whether a first-page index already exists under the index root.
    os.environ['UseFirstPages'] = str(os.path.exists(os.path.join(idx_root, 'FirstPageIndex')))

    show_args(lib_root, idx_root, cfg_file)
|
||
|
|
|
||
|
|
|
||
|
|
# prints the argument values passed in.
|
||
|
|
def show_args(lib_root, idx_root, config):
    """
    Displays the run time information for this tool.

    Args:
        ``lib_root`` -- ``str`` The path to the library to index.

        ``idx_root`` -- ``str`` The path to save the index to.

        ``config`` -- ``str`` The path to the config file to use.

    Returns:
        ``None``

    """

    # print the arguments with a brief summation of what they mean.
    print('Arguments:')
    print('    Running indexer on library at: "%s"' % lib_root)
    print('    Saving index at: "%s"' % idx_root)
    print('    Using configuration from: "%s"' % config)

    printer.write_line_break()

    # Environmental variables affecting the run (resolved earlier by env_is_good()).
    print('Environmental Variables:')
    print('    SEQUENCELOGICHOME = "%s"' % sl_home)
    print('    LOGALL = "%s"' % str(log_all))

    printer.write_line_break()

    print('Program execution order:')
    print('    - Tokenizer [jar]')
    print('    - Walker Term Diff [jar] (IFF there are similar doctypes defined in the config file)')
    print('    - Phrase Maker [binary executable]')
    print('    - Phrase Count Loader [py script]')
    print('    - Walker Indexer [jar]')

    printer.write_line_break()

    # Per-program configuration summary.
    print('Program Configurations:')
    print('    Tokenizer Config:')
    print('        Tokenizer executable location: "%s"' % tokenizer_jar)
    print('        Index root directory: "%s"' % idx_root)
    print('        Library root directory: "%s"' % lib_root)
    print('    Walker Term Diff Config:')
    print('        Walker Term Diff executable location: "%s"' % term_diff_jar)
    print('        Index root directory: "%s"' % idx_root)
    print('        Library root directory: "%s"' % lib_root)
    print('    Phrase Maker Config:')
    print('        Phrase Maker executable location: "%s"' % phrase_exe)
    print('        Minimum phrase length: %s' % min_phrase_len)
    print('        Maximum phrase length: %s' % max_phrase_len)
    print('        Minimum doctype coverage: %s%%' % doctype_coverage)
    print('        Minimum number of examples to generate a phrase: %d' % min_num_examples)
    print('    Phrase Count Loader Config:')
    print('        Phrase Count Loader executable location: "%s"' % phrase_loader)
    print('        Maximum number of doctype occurrences: %d' % 1)
    print('        Filter sub-phrases over doctype: %s' % 'YES')
    print('    Walker Indexer Config:')
    print('        Walker Indexer executable location: "%s"' % indexer_jar)
    print('        Index root directory: "%s"' % idx_root)
    print('        Library root directory: "%s"' % lib_root)

    printer.write_line_break()
    printer.write_no_prefix('')
|
||
|
|
|
||
|
|
|
||
|
|
# Set up arguments here; more groups can be made if required.
|
||
|
|
def make_args():
    """Registers the commandline arguments on the module-level parser."""
    # Required arguments get their own group in the help output...
    required_group = parser.add_argument_group('Required')
    # ...as do the optional ones.
    optional_group = parser.add_argument_group('Optional')

    # The three required path/file arguments.
    for short_flag, long_flag in (('-c', '--config'),
                                  ('-i', '--indexRoot'),
                                  ('-l', '--libRoot')):
        required_group.add_argument(short_flag, long_flag, required=True)

    # Custom actions print our own help/version text instead of argparse's defaults.
    optional_group.add_argument('-h', '--help',
                                action=ConsoleUtils.CustomPrintAction, print_fn=_print_help)
    optional_group.add_argument('-v', '--version',
                                action=ConsoleUtils.CustomPrintAction, print_fn=_print_version)
|
||
|
|
|
||
|
|
|
||
|
|
# This is where we call the main method from.
|
||
|
|
if __name__ == '__main__':
    # Banner with program name/version/date/author.
    print(ConsoleUtils.get_header(program_name, program_version, build_date, author))
    make_args()
    args = parser.parse_args()

    # Get the argument.
    library_root = args.libRoot
    index_root = args.indexRoot
    config_file = args.config
    # Set the printer up as the new std out.
    sys.stdout = printer
    # Check the arguments and environment
    check_args(library_root, index_root, config_file)
    # If we got here the args and env are all good
    try:
        main(library_root, index_root, config_file)
    except Exception as ex:
        # Top-level boundary: report any failure through the console before the script ends.
        printer.write_line_break(break_char=' ')
        print('Encountered a fatal error while attempting to index: %s' % type(ex).__name__)
        print('    Message: %s' % str(ex))
|