"""Command-line launcher for the WalkerClassifier jar.

Parses the command line, reads the ``CLASSIFYWALKER`` section of the JSON
configuration file, exports the environment variables listed in
``_DEF_ENV_VALS`` and then runs the classifier jar in a Java subprocess,
echoing its standard output through the console printer.
"""
import os
import sys
import time
import json
import argparse
import subprocess

import ConsoleUtils

_name = 'Run WalkerClassifier'
_description = 'Runs WalkerClassifier with the correct settings/arguments for the environment'
_version = '1.0.0'
_date = '2017.10.30'
_author = 'Chris Diesch '
# NOTE: the package path flag is "-p" (see _make_args); the usage text used to say "-f".
_usage = 'sequencelogic-run-walker-classifier.py ' \
         '-i,--index-root {INDEX_ROOT} ' \
         '-c,--config {CONFIG_FILE} ' \
         '-o,--out {OUT_FILE} ' \
         '-p,--pkg-path {PKG_PATH}' \
         ' [OPTIONS...]'

# add_help=False because -h is registered by hand in _make_args with a custom print function.
_parser = argparse.ArgumentParser(prog=_name, description=_description, add_help=False)
# Everything printed from here on goes through the project printer.
_printer = ConsoleUtils.SLPrinter(_name)
sys.stdout = _printer

_DEF_THREAD_COUNT = 8
_DEF_MAX_MEM = 4096
_DEF_MIN_MEM = 128

# Filled in by _setup_env() once SEQUENCELOGICHOME is known.
_walker_classify_jar = ''
# Parsed command-line namespace; filled in by _setup().
args = None

# Environment variables exported (when not already set) before the jar is started.
_DEF_ENV_VALS = {'LOG_LVL': '0',
                 'SEQUENCELOGICHOME': '/sequencelogic',
                 'FULL_DOC': 'FALSE',
                 'IMPORTANT_ONLY': 'FALSE'}


def _print_help():
    """Print the full help text to the printer's saved (original) stdout."""
    sys.stdout = _printer.old_stdout
    print(_name)
    print(_description)
    print('Usage: %s' % _usage)
    print(
        '\n'
        'Arguments:\n'
        ' -p, --pkg-path {PKG_PATH} The path to the package file to classify.\n'
        ' -c, --config {CONFIG_FILE} The path to the configuration file to use while classifying.\n'
        ' -i, --index-root {INDEX_ROOT} The path to the index to classify the package against.\n'
        ' -o, --out {OUT_PATH} The path to save the classification results to.\n'
        '\n'
        'Options:\n'
        '\n'
        ' JVM:\n'
        ' -m, --min-memory {MIN_MEM} The minimum amount of memory (in MB) to allow the JVM (default: 128).\n'
        ' -M, --max-memory {MAX_MEM} The maximum amount of memory (in MB) to allow the JVM (default: 4096).\n'
        ' -T, --thread-count {NUM_THREADS} The maximum number of threads to use while classifying (default: 8).\n'
        '\n'
        ' Miscellaneous:\n'
        ' -h, --help Displays the help message.\n'
        ' -v, --version Displays the version information.\n'
        '\n'
        'Notes:\n'
        ' This tool sets several environmental variables which are used by the ScoreWalker classification engine.\n'
        ' these variables are set/used as follows:\n'
        ' - SEQUENCELOGICHOME: The path to the sequence logic home directory (default: "/sequencelogic").\n'
        ' - FULL_DOC: Whether ScoreWalker should treat the given package as a single sub-document or not.\n'
        ' This is determined by the existence/value of the "singleSDC" flag in the configuration file\n'
        ' (default: "FALSE").\n'
        ' - IMPORTANT_ONLY: Whether or not the package should be ignored if it does not have enough matches of\n'
        ' a given set of important document types. Packages which do not meet this criteria are set to\n'
        ' "MISCELLANEOUS" and have their confidence set high (default: "FALSE").\n'
        '\n')
    print('Version Information:')
    print(' Version: %s' % _version)
    print(' Date: %s' % _date)
    print('')
    print('Author: %s' % _author)


def _print_version():
    """Print the version banner to the printer's saved (original) stdout."""
    # The saved stream lives on the printer (as used by _print_help), not on the
    # argparse parser, which has no ``old_stdout`` attribute.
    sys.stdout = _printer.old_stdout
    print(_name)
    print('Version: %s' % _version)
    print('Date: %s' % _date)
    print('Author: %s' % _author)


def _run_process(proc_args, proc_name):
    """Run a subprocess, echo its stdout line by line, and report its run time.

    :param proc_args: The argument list handed to ``subprocess.Popen``.
    :param proc_name: A human readable name used in the log messages.
    :return: The return code of the subprocess.
    """
    print('Running %s' % proc_name)
    start_time = time.time()
    sub_proc = subprocess.Popen(proc_args, stdout=subprocess.PIPE, stderr=sys.stderr)
    try:
        for line in iter(sub_proc.stdout.readline, b''):
            # 'replace' keeps one undecodable byte from aborting the whole run;
            # rstrip drops the trailing newline whichever convention the child uses.
            text = line.decode('utf-8', 'replace').rstrip('\r\n')
            _printer.write_no_prefix(' %s' % text)
    finally:
        # Release the pipe and reap the child even if echoing failed.
        sub_proc.stdout.close()
        return_code = sub_proc.wait()

    run_time = time.time() - start_time
    _printer.write_line_break()
    print('%s completed in %.4f s.' % (proc_name, run_time))
    if return_code != 0:
        print('Warning: %s exited with a non-zero return code (%d)' % (proc_name, return_code))
    _printer.write_line_break()
    return return_code


def _get_jvm_args(init_mem, max_mem, thread_count, jar_path):
    """Build the ``java`` command prefix used to launch a jar.

    :param init_mem: Initial heap size in MB (``-Xms``).
    :param max_mem: Maximum heap size in MB (``-Xmx``).
    :param thread_count: Value for the common ForkJoinPool parallelism property.
    :param jar_path: Path of the jar passed to ``-jar``.
    :return: The command as a list of strings.
    """
    thread_count_arg = '-Djava.util.concurrent.ForkJoinPool.common.parallelism=%d' % thread_count
    return ['java',
            '-Xms%dm' % init_mem,
            '-Xmx%dm' % max_mem,
            thread_count_arg,
            '-jar', jar_path]


def _make_args():
    """Register every command-line argument on the module level parser."""
    # Required arguments (validated in _check_args rather than with required=True,
    # so that -h/-v can be used on their own).
    _parser.add_argument('-o', '--out')
    _parser.add_argument('-p', '--pkg-path')
    _parser.add_argument('-i', '--index-root')
    _parser.add_argument('-c', '--config')
    # JVM Options
    _parser.add_argument('-T', '--thread-count', type=int, default=_DEF_THREAD_COUNT)
    _parser.add_argument('-M', '--max-memory', type=int, default=_DEF_MAX_MEM)
    _parser.add_argument('-m', '--min-memory', type=int, default=_DEF_MIN_MEM)
    # Misc
    _parser.add_argument('-h', '--help', action=ConsoleUtils.CustomPrintAction, print_fn=_print_help)
    # '--versions' stays first so the derived dest is unchanged; '--version' is the
    # spelling shown in the help text and is now accepted explicitly.
    _parser.add_argument('-v', '--versions', '--version', action=ConsoleUtils.CustomPrintAction,
                         print_fn=_print_version)


def _load_config():
    """Load the ``CLASSIFYWALKER`` section of the configuration file.

    Exits with status 0 when the section has a false ``enabled`` value.  Read or
    parse problems are reported and whatever was loaded so far is returned, so a
    missing or unusable file yields an empty dict instead of a crash.

    :return: The ``CLASSIFYWALKER`` mapping, or ``{}`` when it could not be loaded.
    """
    walker_cfg = {}
    if not args.config:
        # No path given at all; _check_args reports this as a fatal error.
        return walker_cfg

    try:
        with open(args.config) as reader:
            config_data = json.load(reader)

        walker_cfg = config_data['CLASSIFYWALKER']

        if not walker_cfg['enabled']:
            print('ScoreWalker is disabled in this configuration file')
            print('Exiting...')
            sys.exit(0)

    except KeyError as ex:
        print('Error parsing JSON:')
        print(' %s' % str(ex))

    except ValueError as ex:
        # Malformed JSON (json.JSONDecodeError is a ValueError on Python 3).
        print('Error parsing JSON:')
        print(' %s' % str(ex))

    except IOError as ex:
        print('Error reading config file %s:' % args.config)
        print(' %s' % str(ex))

    return walker_cfg


def run_walker_classifier(package_file, index_root, config_file, out_file, thread_count, init_mem, max_mem):
    """Run the Walker Classifier jar on a package.

    :param package_file: Passed to the jar as ``--packageFile``.
    :param index_root: Passed to the jar as ``--indexDir``.
    :param config_file: Passed to the jar as ``--configFile``.
    :param out_file: Passed to the jar as ``--destDir``.
    :param thread_count: Thread count for the JVM (see ``_get_jvm_args``).
    :param init_mem: Initial JVM memory in MB.
    :param max_mem: Maximum JVM memory in MB.
    :return: The return code of the Java process.
    """
    class_args = _get_jvm_args(init_mem, max_mem, thread_count, _walker_classify_jar)
    class_args += ['--packageFile', package_file,
                   '--indexDir', index_root,
                   '--configFile', config_file,
                   '--destDir', out_file]
    return _run_process(class_args, 'Walker Classifier')


def _setup_env():
    """Export the environment variables for the classifier and locate its jar.

    Variables in ``_DEF_ENV_VALS`` that are not already set get their default.
    ``IMPORTANT_ONLY`` and ``FULL_DOC`` are overridden from the configuration.
    Exits with status -2 when the jar is not found under ``SEQUENCELOGICHOME/bin``.
    """
    global _walker_classify_jar
    walker_cfg = _load_config()

    print('Configuring environmental variables')
    for var, def_val in _DEF_ENV_VALS.items():
        if os.environ.get(var) is None:
            print('No value set for %s, using default of %s' % (var, def_val))
            os.environ[var] = def_val

    # Only the presence of the key matters here, not its value.
    important_types = 'importantTypes' in walker_cfg
    if important_types:
        print('Configuration uses important types, setting env variable "IMPORTANT_ONLY" = %s'
              % str(important_types))
        os.environ['IMPORTANT_ONLY'] = str(important_types)

    is_full_doc = walker_cfg.get('singleSDC', False)
    if is_full_doc:
        print('Setting env variable "FULL_DOC" = %s' % str(is_full_doc))
        os.environ['FULL_DOC'] = str(is_full_doc)

    print('Environmental variables configured')
    _walker_classify_jar = os.path.join(os.environ['SEQUENCELOGICHOME'], 'bin', 'walker-classifier-one-jar.jar')

    if not os.path.exists(_walker_classify_jar):
        print('Fatal Error: Walker Classifier jar is not in the expected location (%s)' % _walker_classify_jar)
        print('Exiting...')
        sys.exit(-2)


def _check_args():
    """Validate the parsed arguments; exit with status -1 on a fatal problem.

    A configuration path that does not exist is only a warning.  An omitted
    required argument is fatal instead of raising ``TypeError`` further on.
    """
    fatal_error = False

    if not args.pkg_path or not os.path.exists(args.pkg_path):
        print('Fatal Error: The given package file does not exist (%s)' % args.pkg_path)
        fatal_error = True

    if not args.index_root or not os.path.exists(args.index_root):
        print('Fatal Error: The given index root does not exist (%s)' % args.index_root)
        fatal_error = True

    if not args.out:
        print('Fatal Error: No output path was given (-o, --out)')
        fatal_error = True

    if not args.config:
        print('Fatal Error: No configuration file was given (-c, --config)')
        fatal_error = True
    elif not os.path.exists(args.config):
        print('Warning: There is no config file at the given location (%s)' % args.config)
        print(' OK: WalkerClassifier will run with default values (Accuracy will be lost)')

    if fatal_error:
        _parser.print_help()
        print('Exiting...')
        sys.exit(-1)


def _show_args():
    """Print a summary of the classifier and JVM arguments in use."""
    print('Walker Classifier Arguments:')
    print(' Classifying package file at: %s' % args.pkg_path)
    print(' Using configuration file at: %s' % args.config)
    print(' Using index located at: %s' % args.index_root)
    print(' Saving results to file at: %s' % args.out)
    _printer.write_no_prefix('')
    print('JVM Arguments:')
    print(' Using %d threads' % args.thread_count)
    print(' Initial memory: %sm' % args.min_memory)
    print(' Maximum memory: %sm' % args.max_memory)
    _printer.write_line_break(break_char=' ')


def _setup():
    """Print the header, parse the command line, then configure and validate."""
    global args
    _printer.write_no_prefix(ConsoleUtils.get_header(_name, _version, _date, _author))
    _make_args()
    args = _parser.parse_args()

    _setup_env()
    _check_args()
    _show_args()


def main():
    """Script entry point: set everything up and run the classifier.

    :return: The return code of the Java process.
    """
    _setup()
    return run_walker_classifier(args.pkg_path, args.index_root, args.config, args.out,
                                 args.thread_count, args.min_memory, args.max_memory)


if __name__ == '__main__':
    main()