254 lines
8.8 KiB
Python
254 lines
8.8 KiB
Python
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
import argparse
|
|
import subprocess
|
|
import ConsoleUtils
|
|
|
|
# Script metadata, shown in the console header, the help text and the version text.
_name = 'Run WalkerClassifier'
_description = 'Runs WalkerClassifier with the correct settings/arguments for the environment'
_version = '1.0.0'
_date = '2017.10.30'
_author = 'Chris Diesch <cdiesch@sequencelogic.net>'

# One-line usage summary printed by the help text.
# The package path flag is "-p": that is the short option registered on the argument parser,
# so the summary must not advertise any other letter for --pkg-path.
_usage = 'sequencelogic-run-walker-classifier.py ' \
         '-i,--index-root {INDEX_ROOT} ' \
         '-c,--config {CONFIG_FILE} ' \
         '-o,--out {OUT_FILE} ' \
         '-p,--pkg-path {PKG_PATH}' \
         ' [OPTIONS...]'
|
|
|
|
# Command-line parser. argparse's built-in -h/--help is switched off (add_help=False).
_parser = argparse.ArgumentParser(
    prog=_name,
    description=_description,
    add_help=False,
)

# From here on everything written to sys.stdout goes through the project's console printer.
_printer = ConsoleUtils.SLPrinter(_name)
sys.stdout = _printer
|
|
|
|
# Defaults for the JVM-related command-line options (memory sizes are passed to java as "<n>m").
_DEF_THREAD_COUNT = 8
_DEF_MAX_MEM = 4096
_DEF_MIN_MEM = 128

# Module-level state that is filled in during setup:
#   _walker_classify_jar - path to the classifier jar
#   args                 - the parsed command-line namespace
_walker_classify_jar = ''
args = None

# Environment variables that receive a default value when they are not already set.
_DEF_ENV_VALS = {
    'LOG_LVL': '0',
    'SEQUENCELOGICHOME': '/sequencelogic',
    'FULL_DOC': 'FALSE',
    'IMPORTANT_ONLY': 'FALSE',
}
|
|
|
|
|
|
def _print_help():
    """Print the hand-written help text (usage, arguments, options, notes, version, author).

    ``sys.stdout`` is first pointed at ``_printer.old_stdout`` so the text is not written through
    the console printer (presumably the stream that was active before the printer was installed).

    The option letters listed here must match the ones registered on the parser: the thread count
    option is ``-T`` (upper case), not ``-t``.
    """
    sys.stdout = _printer.old_stdout

    print(_name)
    print(_description)
    print('Usage: %s' % _usage)
    print(
        '\n'
        'Arguments:\n'
        ' -p, --pkg-path {PKG_PATH} The path to the package file to classify.\n'
        ' -c, --config {CONFIG_FILE} The path to the configuration file to use while classifying.\n'
        ' -i, --index-root {INDEX_ROOT} The path to the index to classify the package against.\n'
        ' -o, --out {OUT_PATH} The path to save the classification results to.\n'
        '\n'
        'Options:\n'
        '\n'
        ' JVM:\n'
        ' -m, --min-memory {MIN_MEM} The minimum amount of memory (in MB) to allow the JVM (default: 128).\n'
        ' -M, --max-memory {MAX_MEM} The maximum amount of memory (in MB) to allow the JVM (default: 4096).\n'
        ' -T, --thread-count {NUM_THREADS} The maximum number of threads to use while classifying (default: 8).\n'
        '\n'
        ' Miscellaneous:\n'
        ' -h, --help Displays the help message.\n'
        ' -v, --version Displays the version information.\n'
        '\n'
        'Notes:\n'
        ' This tool sets several environmental variables which are used by the ScoreWalker classification engine.\n'
        ' these variables are set/used as follows:\n'
        ' - SEQUENCELOGICHOME: The path to the sequence logic home directory (default: "/sequencelogic").\n'
        ' - FULL_DOC: Whether ScoreWalker should treat the given package as a single sub-document or not.\n'
        ' This is determined by the existence/value of the "singleSDC" flag in the configuration file\n'
        ' (default: "FALSE").\n'
        ' - IMPORTANT_ONLY: Whether or not the package should be ignored if it does not have enough matches of\n'
        ' a given set of important document types. Packages which do not meet this criteria are set to\n'
        ' "MISCELLANEOUS" and have their confidence set high (default: "FALSE").\n'
        '\n')

    print('Version Information:')
    print(' Version: %s' % _version)
    print(' Date: %s' % _date)
    print('')
    print('Author: %s' % _author)
|
|
|
|
|
|
def _print_version():
    """Print the tool name, version, release date and author.

    ``sys.stdout`` is first pointed at ``_printer.old_stdout``, exactly as the help printer does.
    The saved stream lives on the console printer; ``_parser`` is an ``argparse.ArgumentParser``
    and has no ``old_stdout`` attribute, so it must not be used here.
    """
    sys.stdout = _printer.old_stdout

    print(_name)
    print('Version: %s' % _version)
    print('Date: %s' % _date)
    print('Author: %s' % _author)
|
|
|
|
|
|
def _run_process(proc_args, proc_name):
    """Run a child process, echo its stdout line by line and report the elapsed time.

    Args:
        proc_args: Argument list handed to ``subprocess.Popen`` (program name first).
        proc_name: Human-readable name used in the progress messages.

    Returns:
        The exit code of the child process. A non-zero code is also reported on the console so a
        failed run does not pass unnoticed.
    """
    print('Running %s' % proc_name)

    start_time = time.time()

    # Only stdout is captured; the child's stderr goes to this process's stderr.
    sub_proc = subprocess.Popen(proc_args, stdout=subprocess.PIPE, stderr=sys.stderr)
    try:
        # readline() returns b'' at end of stream, which ends the iteration.
        for raw_line in iter(sub_proc.stdout.readline, b''):
            # 'replace' keeps the wrapper alive if the child emits bytes that are not valid UTF-8;
            # stripping both '\r' and '\n' handles either line-ending convention on any platform.
            text = raw_line.decode('utf-8', 'replace').rstrip('\r\n')
            _printer.write_no_prefix(' %s' % text)
    finally:
        # Release the pipe's file descriptor even if echoing a line failed.
        sub_proc.stdout.close()

    exit_code = sub_proc.wait()

    run_time = time.time() - start_time
    _printer.write_line_break()
    print('%s completed in %.4f s.' % (proc_name, run_time))
    if exit_code != 0:
        print('Warning: %s exited with code %d' % (proc_name, exit_code))
    _printer.write_line_break()

    return exit_code
|
|
|
|
|
|
def _get_jvm_args(init_mem, max_mem, thread_count, jar_path):
    """Build the ``java`` command prefix used to launch a jar.

    Args:
        init_mem: Initial heap size, rendered as ``-Xms<n>m``.
        max_mem: Maximum heap size, rendered as ``-Xmx<n>m``.
        thread_count: Value for the ForkJoinPool common-pool parallelism system property.
        jar_path: Path of the jar to run with ``-jar``.

    Returns:
        A new list: ``java``, the three JVM flags, ``-jar`` and the jar path, in that order.
    """
    jvm_flags = [
        '-Xms%dm' % init_mem,
        '-Xmx%dm' % max_mem,
        '-Djava.util.concurrent.ForkJoinPool.common.parallelism=%d' % thread_count,
    ]
    return ['java'] + jvm_flags + ['-jar', jar_path]
|
|
|
|
|
|
def _make_args():
    """Register every command-line option on the module-level parser."""
    register = _parser.add_argument

    # _parser.add_argument('-t', '--timeout', type=float, default=3600)

    # Arguments the tool expects on every run (argparse is not told to enforce them).
    path_options = (
        ('-o', '--out'),
        ('-p', '--pkg-path'),
        ('-i', '--index-root'),
        ('-c', '--config'),
    )
    for short_flag, long_flag in path_options:
        register(short_flag, long_flag)

    # JVM options: integers with module-level defaults.
    jvm_options = (
        ('-T', '--thread-count', _DEF_THREAD_COUNT),
        ('-M', '--max-memory', _DEF_MAX_MEM),
        ('-m', '--min-memory', _DEF_MIN_MEM),
    )
    for short_flag, long_flag, default_value in jvm_options:
        register(short_flag, long_flag, type=int, default=default_value)

    # Miscellaneous: options that print a fixed text through the project's custom action.
    print_options = (
        ('-h', '--help', _print_help),
        ('-v', '--versions', _print_version),
    )
    for short_flag, long_flag, printer_fn in print_options:
        register(short_flag, long_flag, action=ConsoleUtils.CustomPrintAction, print_fn=printer_fn)
|
|
|
|
|
|
def _load_config():
    """Load the "CLASSIFYWALKER" section of the JSON configuration file named by ``args.config``.

    Exits the program with status 0 when the section's "enabled" flag is falsy.

    Returns:
        The "CLASSIFYWALKER" section. An empty dict is returned (after printing the problem) when
        the file cannot be read, is not valid JSON, or has no "CLASSIFYWALKER" section, so callers
        always receive a mapping. If only the "enabled" key is missing, the problem is printed and
        the section is still returned.
    """
    walker_cfg = {}

    try:
        with open(args.config) as reader:
            config_data = json.load(reader)
    except (IOError, TypeError) as ex:
        # TypeError covers the case where no --config was given (the path is None).
        print('Error reading config file %s:' % args.config)
        print(' %s' % str(ex))
        return walker_cfg
    except ValueError as ex:
        # json reports malformed documents as ValueError (JSONDecodeError is a subclass).
        print('Error parsing JSON:')
        print(' %s' % str(ex))
        return walker_cfg

    try:
        walker_cfg = config_data['CLASSIFYWALKER']
        enabled = walker_cfg['enabled']
    except KeyError as ex:
        print('Error parsing JSON:')
        print(' %s' % str(ex))
        return walker_cfg

    if not enabled:
        print('ScoreWalker is disabled in this configuration file')
        print('Exiting...')
        # sys.exit rather than the site-provided exit(), which is meant for interactive use.
        sys.exit(0)

    return walker_cfg
|
|
|
|
|
|
def run_walker_classifier(package_file, index_root, config_file, out_file, thread_count, init_mem, max_mem):
    """Launch the Walker Classifier jar on one package.

    Args:
        package_file: Value passed as ``--packageFile``.
        index_root: Value passed as ``--indexDir``.
        config_file: Value passed as ``--configFile``.
        out_file: Value passed as ``--destDir``.
        thread_count: Thread count handed to the JVM argument builder.
        init_mem: Initial JVM memory handed to the JVM argument builder.
        max_mem: Maximum JVM memory handed to the JVM argument builder.
    """
    classifier_options = [
        '--packageFile', package_file,
        '--indexDir', index_root,
        '--configFile', config_file,
        '--destDir', out_file,
    ]
    # The JVM prefix ends with "-jar <jar>", so the classifier's own options follow it directly.
    command = _get_jvm_args(init_mem, max_mem, thread_count, _walker_classify_jar) + classifier_options

    _run_process(command, 'Walker Classifier')
|
|
|
|
|
|
def _setup_env():
    """Prepare the environment variables and locate the classifier jar.

    Steps:
      1. Load the walker section of the configuration file.
      2. Give every variable in ``_DEF_ENV_VALS`` its default when it is not already set.
      3. Set IMPORTANT_ONLY when the configuration has an "importantTypes" key.
      4. Set FULL_DOC when the configuration's "singleSDC" value is truthy.
      5. Derive the jar path from SEQUENCELOGICHOME and exit with status -2 if it does not exist.

    Side effects:
        Mutates ``os.environ`` and the module-level ``_walker_classify_jar``.
    """
    global _walker_classify_jar

    walker_cfg = _load_config()

    print('Configuring environmental variables')
    for var, def_val in _DEF_ENV_VALS.items():
        if os.environ.get(var) is None:
            print('No value set for %s, using default of %s' % (var, def_val))
            os.environ[var] = def_val

    # Only the presence of the key matters here, not its value.
    important_types = 'importantTypes' in walker_cfg
    if important_types:
        print('Configuration uses important types, setting env variable "IMPORTANT_ONLY" = %s' % str(important_types))
        os.environ['IMPORTANT_ONLY'] = str(important_types)

    # A missing "singleSDC" entry is treated as False.
    is_full_doc = walker_cfg.get('singleSDC', False)
    if is_full_doc:
        print('Setting env variable "FULL_DOC" = %s' % str(is_full_doc))
        os.environ['FULL_DOC'] = str(is_full_doc)

    print('Environmental variables configured')

    _walker_classify_jar = os.path.join(os.environ['SEQUENCELOGICHOME'], 'bin', 'walker-classifier-one-jar.jar')

    if not os.path.exists(_walker_classify_jar):
        print('Fatal Error: Walker Classifier jar is not in the expected location (%s)' % _walker_classify_jar)
        print('Exiting...')
        # sys.exit rather than the site-provided exit(), which is meant for interactive use.
        sys.exit(-2)
|
|
|
|
|
|
def _check_args():
    """Validate the parsed command-line arguments.

    Fatal problems (the parser's help is printed and the program exits with status -1):
      - the package path or the index root was not given or does not exist,
      - no config path or no output path was given (both are always passed on to the classifier).

    A config path that was given but does not exist only produces a warning.

    Options that were not supplied are ``None``; they are checked explicitly because
    ``os.path.exists(None)`` raises ``TypeError`` on Python 3.
    """
    fatal_error = False

    if args.pkg_path is None or not os.path.exists(args.pkg_path):
        print('Fatal Error: The given package file does not exist (%s)' % args.pkg_path)
        fatal_error = True

    if args.index_root is None or not os.path.exists(args.index_root):
        print('Fatal Error: The given index root does not exist (%s)' % args.index_root)
        fatal_error = True

    if args.config is None:
        print('Fatal Error: No config file was given (-c/--config)')
        fatal_error = True
    elif not os.path.exists(args.config):
        print('Warning: There is no config file at the given location (%s)' % args.config)
        print(' OK: WalkerClassifier will run with default values (Accuracy will be lost)')

    if args.out is None:
        print('Fatal Error: No output path was given (-o/--out)')
        fatal_error = True

    if fatal_error:
        _parser.print_help()
        print('Exiting...')
        # sys.exit rather than the site-provided exit(), which is meant for interactive use.
        sys.exit(-1)
|
|
|
|
|
|
def _show_args():
    """Echo the classifier arguments and the JVM arguments that will be used for this run."""
    classifier_lines = (
        (' Classifying package file at: %s', args.pkg_path),
        (' Using configuration file at: %s', args.config),
        (' Using index located at: %s', args.index_root),
        (' Saving results to file at: %s', args.out),
    )
    jvm_lines = (
        (' Using %d threads', args.thread_count),
        (' Initial memory: %sm', args.min_memory),
        (' Maximum memory: %sm', args.max_memory),
    )

    print('Walker Classifier Arguments:')
    for template, value in classifier_lines:
        print(template % value)

    # Empty line between the two sections, written without the printer's prefix.
    _printer.write_no_prefix('')

    print('JVM Arguments:')
    for template, value in jvm_lines:
        print(template % value)

    _printer.write_line_break(break_char=' ')
|
|
|
|
|
|
def _setup():
    """Print the header, parse the command line into the module-level ``args`` and prepare the run.

    After parsing, the steps run in a fixed order: environment setup, argument validation, then
    the argument summary.
    """
    global args

    header = ConsoleUtils.get_header(_name, _version, _date, _author)
    _printer.write_no_prefix(header)

    _make_args()
    args = _parser.parse_args()

    # Order matters: each step may terminate the program before the next one runs.
    for step in (_setup_env, _check_args, _show_args):
        step()
|
|
|
|
|
|
def main():
    """Script entry point: run the setup, then launch the classifier with the parsed arguments."""
    _setup()

    run_walker_classifier(
        package_file=args.pkg_path,
        index_root=args.index_root,
        config_file=args.config,
        out_file=args.out,
        thread_count=args.thread_count,
        init_mem=args.min_memory,
        max_mem=args.max_memory,
    )


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|