Sleds/scorewalker-utils/LibraryWalker/LibraryWalker.py

142 lines
5.3 KiB
Python

import shutil
import os
import argparse
import pandas
# Name and one-line summary handed to argparse for the usage/help output.
program_name = 'LibraryWalker'
program_description = 'Creates a subset of the main library with only false positives.'

# The automatic -h/--help option is disabled here; the script entry point
# registers its own help flag on this parser.
parser = argparse.ArgumentParser(
    prog=program_name,
    description=program_description,
    add_help=False,
)

# ANSI-coloured status tags used in console messages (\033[0m resets the colour).
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'

# Coloured program title followed by an underline of dashes.
program_header = (
    '\033[95m%s\033[0m\n'
    '-----------------------'
) % program_name

# Prompt shown when asking the user to confirm an action.
decision_message = ' Is this okay? (Y/N): '
def copy_frts(src, dst):
    """Copy ``src`` to ``dst`` with ``shutil.copy2``, but only for ``.frt`` files.

    Any path that does not end in ``.frt`` is silently skipped. The function
    always returns ``None``; it is passed to ``shutil.copytree`` as its
    ``copy_function`` so that only ``.frt`` files land in the copied tree.
    """
    is_frt = src.endswith('.frt')
    if not is_frt:
        return
    shutil.copy2(src, dst)
def load_false_positives(file_name):
    """Collect the distinct doctype names involved in false positives.

    The CSV at ``file_name`` is read without a header row; the column names
    are supplied here. For every row whose ``Status`` is ``'FalsePos'`` and
    whose ``WalkerType`` is not the placeholder ``'no results returned'``,
    the ``WalkerType`` name and then the ``CluxDocType`` name are recorded,
    each at most once.

    Names are converted to ``str`` *before* the duplicate check, so values
    that pandas parses as numbers are de-duplicated the same way as text.

    :param file_name: path to the analysis CSV.
    :return: dict mapping consecutive integers (0, 1, 2, ...) to doctype
        names in first-seen order; empty if the file does not exist.
    """
    col_names = ['Doc Index', 'Page Index', 'CluxDocType', 'WalkerType', 'Status', 'Scored', 'WalkerPageType',
                 'Num Char', 'Raw Score', 'Self Score', 'Sum Match Scores', 'Top Doc Agree']
    false_pos_fields = {}
    # If there is no file there is nothing to collect.
    if not os.path.exists(file_name):
        return false_pos_fields

    data = pandas.read_csv(file_name, names=col_names)
    print('Opened file %s\n' % file_name)

    # Set of names already stored, for O(1) duplicate checks.
    seen = set()

    def remember(name):
        """Store ``name`` under the next free integer key unless already stored."""
        if name in seen:
            return
        seen.add(name)
        false_pos_fields[len(false_pos_fields)] = name

    rows = zip(data.Status.tolist(), data.WalkerType.tolist(), data.CluxDocType.tolist())
    for status, walker_type, clux_type in rows:
        doc_name = str(walker_type)
        if status != 'FalsePos' or doc_name == 'no results returned' or doc_name in seen:
            continue
        remember(doc_name)
        remember(str(clux_type))

    # NOTE: this is the number of distinct doctype names stored, not the
    # number of false-positive rows in the file.
    print('Found %d false positives.' % len(false_pos_fields))
    return false_pos_fields
def create_library(fields: dict, src_library, new_library):
    """Copy the listed doctype folders from ``src_library`` into ``new_library``.

    ``new_library`` is created if it does not exist. For each doctype name in
    ``fields`` whose folder exists under ``src_library`` and is not already
    present under ``new_library``, the folder tree is copied with
    ``copy_frts`` as the per-file copy function. Doctypes with no folder in
    ``src_library`` are reported and, if there are any, written one per line
    to ``missing_doctypes.txt`` inside ``new_library``.

    :param fields: mapping whose values are doctype folder names.
    :param src_library: directory holding the original doctype folders.
    :param new_library: directory that receives the copied folders.
    """
    print('Attempting to move false positives to the new library...')
    # We need somewhere to put stuff...
    if not os.path.exists(new_library):
        print('Error directory for new library does not exist:\n '
              'Creating directory %s' % new_library)
        os.makedirs(new_library)

    missing_doctypes = []
    for doctype in fields.values():
        old_lib_folder = os.path.join(src_library, doctype)
        new_lib_folder = os.path.join(new_library, doctype)
        print('Attempting to move %s to the new library (%s)' % (doctype, new_lib_folder))
        if not os.path.exists(old_lib_folder):
            print('Error, could not find %s in directory:\n %s' % (doctype, src_library))
            missing_doctypes.append(doctype)
            continue
        # copytree refuses to write into an existing directory, so a folder
        # that is already present in the new library is left untouched.
        if not os.path.exists(new_lib_folder):
            shutil.copytree(old_lib_folder, new_lib_folder, copy_function=copy_frts)
            print(' Success!')

    if missing_doctypes:
        missing_doctypes_file = os.path.join(new_library, "missing_doctypes.txt")
        # Context manager guarantees the log is closed even if a write fails.
        with open(missing_doctypes_file, 'w') as missing_log:
            for doctype in missing_doctypes:
                missing_log.write(doctype)
                missing_log.write('\n')
def yes_or_no(message):
    """Prompt until the user answers yes or no.

    ``y``/``yes`` (any case) returns normally; ``n``/``no`` (any case) ends
    the program via ``exit(0)``. Any other answer re-prompts with an
    invalid-input message.

    :param message: text of the first prompt.
    """
    prompt = message
    while True:
        answer = input(prompt).lower()
        if answer in ('y', 'yes'):
            return
        if answer in ('n', 'no'):
            exit(0)
        # Anything else: ask again with the corrective prompt.
        prompt = ' Invalid input, enter Y(es) or N(o): '
def main(analysis_file, old_library, new_library):
    """Read the false-positive doctypes from ``analysis_file`` and copy them.

    The names returned by ``load_false_positives`` are handed straight to
    ``create_library``, which copies the matching folders from
    ``old_library`` into ``new_library``.
    """
    create_library(load_false_positives(analysis_file), old_library, new_library)
def check_args(source_lib, new_lib, false_pos):
    """Validate the command-line paths before any work is done.

    A missing ``new_lib`` is only a warning: the user is asked to confirm
    that it may be created (answering "no" ends the program inside
    ``yes_or_no``). A missing ``source_lib`` or ``false_pos`` file is fatal:
    every such problem is reported, the parser help is printed and the
    program exits.

    :param source_lib: path of the original library; must exist.
    :param new_lib: path of the library to create; may be absent.
    :param false_pos: path of the false-positives CSV; must exist.
    """
    if not os.path.exists(new_lib):
        print('%s No folder at %s' % (yellow_warning, new_lib))
        print('One will be created.')
        yes_or_no(decision_message)

    # Check for fatal errors; collect all of them before exiting so the user
    # sees every problem in one run.
    fatal_errors = False
    if not os.path.exists(source_lib):
        # Report the path that was actually checked (the previous code used an
        # undefined name here and crashed with NameError).
        print('%s No library at %s' % (red_error, source_lib))
        fatal_errors = True
    if not os.path.exists(false_pos):
        print('%s File does not exist: %s' % (red_error, false_pos))
        fatal_errors = True

    if fatal_errors:
        parser.print_help()
        print('Exiting...')
        # NOTE(review): exit status 0 is kept for compatibility with existing
        # callers, even though this is an error path.
        exit(0)
if __name__ == '__main__':
    # Two titled argument groups: mandatory flags and optional flags.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')

    # (short flag, long flag, help text) for each mandatory option, in the
    # order they are registered on the parser.
    required_flags = (
        ('-s', '--source_lib', 'The original library to search through.'),
        ('-n', '--new_lib', 'The new library to create.'),
        ('-f', '--false_pos', 'The csv file of false positives.'),
    )
    for short_flag, long_flag, help_text in required_flags:
        required_args.add_argument(short_flag, long_flag, required=True, help=help_text)

    # The parser is built with add_help=False, so the help flag is added by hand.
    optional_args.add_argument('-h', '--help', action="help", help='Prints the help message.')

    args = parser.parse_args()
    src_lib, new_lib, false_pos_file = args.source_lib, args.new_lib, args.false_pos

    # Validate the paths first; check_args exits the program on fatal errors.
    check_args(src_lib, new_lib, false_pos_file)
    # Now we can make the call to main
    main(false_pos_file, src_lib, new_lib)