# Sleds/scorewalker-utils/RunScoreWalker/sequencelogic-run-walker-classifier.py
#
# 254 lines
# 8.8 KiB
# Python

import os
import sys
import time
import json
import argparse
import subprocess
import ConsoleUtils
# Tool metadata shown in the banner, the help text and the version output.
_name = 'Run WalkerClassifier'
_description = 'Runs WalkerClassifier with the correct settings/arguments for the environment'
_version = '1.0.0'
_date = '2017.10.30'
_author = 'Chris Diesch <cdiesch@sequencelogic.net>'
# One-line usage summary. The package option is "-p" (matching the parser and
# the detailed help text), not "-f".
_usage = 'sequencelogic-run-walker-classifier.py ' \
         '-i,--index-root {INDEX_ROOT} ' \
         '-c,--config {CONFIG_FILE} ' \
         '-o,--out {OUT_FILE} ' \
         '-p,--pkg-path {PKG_PATH}' \
         ' [OPTIONS...]'

# The built-in argparse help is disabled because this script registers its own
# "-h/--help" option.
_parser = argparse.ArgumentParser(prog=_name, description=_description, add_help=False)

# From here on every print() in this module is routed through the printer.
_printer = ConsoleUtils.SLPrinter(_name)
sys.stdout = _printer

# Defaults for the JVM options (memory values are in MB, see the help text).
_DEF_THREAD_COUNT = 8
_DEF_MAX_MEM = 4096
_DEF_MIN_MEM = 128

# Filled in once the environment and the command line have been processed.
_walker_classify_jar = ''
args = None

# Environment variables that receive a default when they are not already set.
_DEF_ENV_VALS = {'LOG_LVL': '0',
                 'SEQUENCELOGICHOME': '/sequencelogic',
                 'FULL_DOC': 'FALSE',
                 'IMPORTANT_ONLY': 'FALSE'}
def _print_help():
    """Print the full help message for this tool.

    ``sys.stdout`` is first switched to ``_printer.old_stdout`` (presumably the
    stream that was active before the printer was installed), so the help text
    is not written through the printer. The stream is not switched back.

    The thread-count flag is documented as ``-T`` to match the option that the
    parser actually registers.
    """
    sys.stdout = _printer.old_stdout
    print(_name)
    print(_description)
    print('Usage: %s' % _usage)
    print(
        '\n'
        'Arguments:\n'
        ' -p, --pkg-path {PKG_PATH} The path to the package file to classify.\n'
        ' -c, --config {CONFIG_FILE} The path to the configuration file to use while classifying.\n'
        ' -i, --index-root {INDEX_ROOT} The path to the index to classify the package against.\n'
        ' -o, --out {OUT_PATH} The path to save the classification results to.\n'
        '\n'
        'Options:\n'
        '\n'
        ' JVM:\n'
        ' -m, --min-memory {MIN_MEM} The minimum amount of memory (in MB) to allow the JVM (default: 128).\n'
        ' -M, --max-memory {MAX_MEM} The maximum amount of memory (in MB) to allow the JVM (default: 4096).\n'
        ' -T, --thread-count {NUM_THREADS} The maximum number of threads to use while classifying (default: 8).\n'
        '\n'
        ' Miscellaneous:\n'
        ' -h, --help Displays the help message.\n'
        ' -v, --version Displays the version information.\n'
        '\n'
        'Notes:\n'
        ' This tool sets several environmental variables which are used by the ScoreWalker classification engine.\n'
        ' these variables are set/used as follows:\n'
        ' - SEQUENCELOGICHOME: The path to the sequence logic home directory (default: "/sequencelogic").\n'
        ' - FULL_DOC: Whether ScoreWalker should treat the given package as a single sub-document or not.\n'
        ' This is determined by the existence/value of the "singleSDC" flag in the configuration file\n'
        ' (default: "FALSE").\n'
        ' - IMPORTANT_ONLY: Whether or not the package should be ignored if it does not have enough matches of\n'
        ' a given set of important document types. Packages which do not meet this criteria are set to\n'
        ' "MISCELLANEOUS" and have their confidence set high (default: "FALSE").\n'
        '\n')
    print('Version Information:')
    print(' Version: %s' % _version)
    print(' Date: %s' % _date)
    print('')
    print('Author: %s' % _author)
def _print_version():
    """Print the tool name, version, release date and author.

    ``sys.stdout`` is first switched to ``_printer.old_stdout``, the same
    stream ``_print_help`` uses. The saved stream belongs to the printer; the
    argument parser has no ``old_stdout`` attribute, so reading it from
    ``_parser`` would raise ``AttributeError``.
    """
    sys.stdout = _printer.old_stdout
    print(_name)
    print('Version: %s' % _version)
    print('Date: %s' % _date)
    print('Author: %s' % _author)
def _run_process(proc_args, proc_name):
    """Run an external command, relay its output and report its run time.

    The child's stdout is read line by line and forwarded through
    ``_printer.write_no_prefix``; its stderr is connected directly to
    ``sys.stderr``.

    Args:
        proc_args: The command line as a list of strings (program first).
        proc_name: Human-readable name of the process, used in messages.

    Returns:
        The exit code of the child process. Earlier versions returned
        ``None``, so callers that ignore the result are unaffected.
    """
    print('Running %s' % proc_name)
    start_time = time.time()
    sub_proc = subprocess.Popen(proc_args, stdout=subprocess.PIPE, stderr=sys.stderr)
    try:
        for line in iter(sub_proc.stdout.readline, b''):
            # 'replace' keeps a stray non-UTF-8 byte in the child's output
            # from aborting the wrapper while the child is still running.
            text = line.decode('utf-8', 'replace').replace(os.linesep, '')
            _printer.write_no_prefix(' %s' % text)
    finally:
        # Release the pipe's file descriptor even if relaying fails.
        sub_proc.stdout.close()
    exit_code = sub_proc.wait()
    run_time = time.time() - start_time
    _printer.write_line_break()
    print('%s completed in %.4f s.' % (proc_name, run_time))
    if exit_code != 0:
        # Surface failures instead of silently discarding the exit status.
        print('Warning: %s exited with code %d' % (proc_name, exit_code))
    _printer.write_line_break()
    return exit_code
def _get_jvm_args(init_mem, max_mem, thread_count, jar_path):
    """Build the ``java`` command prefix used to launch a jar.

    Args:
        init_mem: Initial heap size in MB (emitted as ``-Xms<n>m``).
        max_mem: Maximum heap size in MB (emitted as ``-Xmx<n>m``).
        thread_count: Value for the common ForkJoinPool parallelism property.
        jar_path: Path of the jar passed to ``-jar``.

    Returns:
        A new list of command-line tokens, starting with ``'java'`` and ending
        with ``jar_path``.
    """
    parallelism_flag = '-Djava.util.concurrent.ForkJoinPool.common.parallelism=%d' % thread_count
    heap_flags = ['-Xms%dm' % init_mem, '-Xmx%dm' % max_mem]
    return ['java'] + heap_flags + [parallelism_flag, '-jar', jar_path]
def _make_args():
    """Register every command-line option on the module-level ``_parser``.

    The four path options are marked as required: the rest of the script
    passes their values straight to ``open``, ``os.path.exists`` and the child
    command line, all of which fail with ``TypeError`` on ``None``. Marking
    them required lets argparse report the missing option instead.

    The version flag is registered as ``--version`` (the spelling shown in the
    help text). The historical ``--versions`` spelling and the ``versions``
    destination are kept for backward compatibility.
    """
    # _parser.add_argument('-t', '--timeout', type=float, default=3600)
    # Required arguments
    _parser.add_argument('-o', '--out', required=True)
    _parser.add_argument('-p', '--pkg-path', required=True)
    _parser.add_argument('-i', '--index-root', required=True)
    _parser.add_argument('-c', '--config', required=True)
    # JVM Options
    _parser.add_argument('-T', '--thread-count', type=int, default=_DEF_THREAD_COUNT)
    _parser.add_argument('-M', '--max-memory', type=int, default=_DEF_MAX_MEM)
    _parser.add_argument('-m', '--min-memory', type=int, default=_DEF_MIN_MEM)
    # Misc
    _parser.add_argument('-h', '--help', action=ConsoleUtils.CustomPrintAction, print_fn=_print_help)
    _parser.add_argument('-v', '--version', '--versions', dest='versions',
                         action=ConsoleUtils.CustomPrintAction, print_fn=_print_version)
def _load_config():
    """Load the ``CLASSIFYWALKER`` section of the JSON configuration file.

    The file path is taken from ``args.config``. If the section's ``enabled``
    flag is falsy, a message is printed and the process exits with status 0.

    Returns:
        The ``CLASSIFYWALKER`` section. An empty dict is returned when the
        file cannot be read, is not valid JSON, or has no ``CLASSIFYWALKER``
        section, so callers always receive a usable mapping. If only the
        ``enabled`` key is missing, the error is reported and the section is
        still returned.
    """
    # Bound up front so the final return is valid on every error path.
    walker_cfg = {}
    try:
        with open(args.config) as reader:
            config_data = json.load(reader)
        walker_cfg = config_data['CLASSIFYWALKER']
        if not walker_cfg['enabled']:
            print('ScoreWalker is disabled in this configuration file')
            print('Exiting...')
            sys.exit(0)
    except (KeyError, ValueError) as ex:
        # KeyError: a required key is absent. ValueError: malformed JSON
        # (json's decode error is a ValueError subclass).
        print('Error parsing JSON:')
        print(' %s' % str(ex))
    except IOError as ex:
        print('Error reading config file %s:' % args.config)
        print(' %s' % str(ex))
    return walker_cfg
def run_walker_classifier(package_file, index_root, config_file, out_file, thread_count, init_mem, max_mem):
    """Launch the Walker Classifier jar with the given inputs.

    Args:
        package_file: Value passed as ``--packageFile``.
        index_root: Value passed as ``--indexDir``.
        config_file: Value passed as ``--configFile``.
        out_file: Value passed as ``--destDir``.
        thread_count: Thread count forwarded to the JVM arguments.
        init_mem: Initial JVM memory forwarded to the JVM arguments.
        max_mem: Maximum JVM memory forwarded to the JVM arguments.
    """
    command = _get_jvm_args(init_mem, max_mem, thread_count, _walker_classify_jar)
    classifier_options = [
        '--packageFile', package_file,
        '--indexDir', index_root,
        '--configFile', config_file,
        '--destDir', out_file,
    ]
    command.extend(classifier_options)
    _run_process(command, 'Walker Classifier')
def _setup_env():
    """Prepare environment variables and locate the classifier jar.

    Steps, in order:
      1. Load the walker configuration section.
      2. Give every variable in ``_DEF_ENV_VALS`` its default if it is unset.
      3. Set ``IMPORTANT_ONLY`` when the configuration has ``importantTypes``.
      4. Set ``FULL_DOC`` when the configuration's ``singleSDC`` is truthy.
      5. Build the jar path under ``SEQUENCELOGICHOME``/bin and store it in
         the module-level ``_walker_classify_jar``; exit with status -2 if the
         jar does not exist.
    """
    global _walker_classify_jar
    walker_cfg = _load_config()

    print('Configuring environmental variables')
    for name in _DEF_ENV_VALS:
        if name in os.environ:
            continue
        fallback = _DEF_ENV_VALS[name]
        print('No value set for %s, using default of %s' % (name, fallback))
        os.environ[name] = fallback

    uses_important_types = 'importantTypes' in walker_cfg
    if uses_important_types:
        flag_text = str(uses_important_types)
        print('Configuration uses important types, setting env variable "IMPORTANT_ONLY" = %s' % flag_text)
        os.environ['IMPORTANT_ONLY'] = flag_text

    # A missing "singleSDC" key is treated the same as a false value.
    is_full_doc = walker_cfg.get('singleSDC', False)
    if is_full_doc:
        full_doc_text = str(is_full_doc)
        print('Setting env variable "FULL_DOC" = %s' % full_doc_text)
        os.environ['FULL_DOC'] = full_doc_text
    print('Environmental variables configured')

    home_dir = os.environ['SEQUENCELOGICHOME']
    _walker_classify_jar = os.path.join(home_dir, 'bin', 'walker-classifier-one-jar.jar')
    if os.path.exists(_walker_classify_jar):
        return
    print('Fatal Error: Walker Classifier jar is not in the expected location (%s)' % _walker_classify_jar)
    print('Exiting...')
    exit(-2)
def _check_args():
    """Validate the paths given on the command line.

    A missing package file or index root is fatal: every such problem is
    reported, the parser's help is printed, and the process exits with
    status -1. A missing configuration file only produces a warning.

    An option that was not supplied at all (value ``None``) is treated the
    same as a path that does not exist, rather than letting
    ``os.path.exists(None)`` raise ``TypeError``.
    """
    fatal_error = False
    if not (args.pkg_path and os.path.exists(args.pkg_path)):
        print('Fatal Error: The given package file does not exist (%s)' % args.pkg_path)
        fatal_error = True
    if not (args.index_root and os.path.exists(args.index_root)):
        print('Fatal Error: The given index root does not exist (%s)' % args.index_root)
        fatal_error = True
    if not (args.config and os.path.exists(args.config)):
        print('Warning: There is no config file at the given location (%s)' % args.config)
        print(' OK: WalkerClassifier will run with default values (Accuracy will be lost)')
    if fatal_error:
        _parser.print_help()
        print('Exiting...')
        sys.exit(-1)
def _show_args():
    """Print a summary of the classifier and JVM settings held in ``args``."""
    print('Walker Classifier Arguments:')
    classifier_rows = (
        (' Classifying package file at: %s', args.pkg_path),
        (' Using configuration file at: %s', args.config),
        (' Using index located at: %s', args.index_root),
        (' Saving results to file at: %s', args.out),
    )
    for template, value in classifier_rows:
        print(template % value)
    _printer.write_no_prefix('')

    print('JVM Arguments:')
    jvm_rows = (
        (' Using %d threads', args.thread_count),
        (' Initial memory: %sm', args.min_memory),
        (' Maximum memory: %sm', args.max_memory),
    )
    for template, value in jvm_rows:
        print(template % value)
    _printer.write_line_break(break_char=' ')
def _setup():
    """Print the banner, parse the command line and prepare the run.

    The parsed namespace is stored in the module-level ``args``. After that
    the environment is configured, the arguments are validated, and the
    effective settings are printed, in that order.
    """
    global args
    banner = ConsoleUtils.get_header(_name, _version, _date, _author)
    _printer.write_no_prefix(banner)
    _make_args()
    args = _parser.parse_args()
    for step in (_setup_env, _check_args, _show_args):
        step()
def main():
    """Script entry point: run the setup, then launch the classifier."""
    _setup()
    run_walker_classifier(
        package_file=args.pkg_path,
        index_root=args.index_root,
        config_file=args.config,
        out_file=args.out,
        thread_count=args.thread_count,
        init_mem=args.min_memory,
        max_mem=args.max_memory,
    )


# Only run when executed as a script, not when imported.
if __name__ == '__main__':
    main()