142 lines
5.3 KiB
Python
142 lines
5.3 KiB
Python
import shutil
|
|
import os
|
|
import argparse
|
|
import pandas
|
|
|
|
# Name and one-line description shown by the argument parser.
program_name = 'LibraryWalker'
program_description = 'Creates a subset of the main library with only false positives.'

# The automatic -h/--help option is disabled here; the script entry point
# registers its own help option instead.
parser = argparse.ArgumentParser(
    prog=program_name,
    description=program_description,
    add_help=False,
)

# Status tags wrapped in ANSI colour escape sequences for console output.
red_error = '\033[91mError:\033[0m'
yellow_warning = '\033[93mWARNING:\033[0m'
blue_okay = '\033[94mOK\033[0m'

# Banner: the colourised program name followed by a dashed rule.
program_header = (
    '\033[95m%s\033[0m\n'
    '-----------------------'
) % program_name

# Confirmation prompt handed to yes_or_no().
decision_message = ' Is this okay? (Y/N): '
|
|
|
|
|
|
def copy_frts(src, dst):
    """Copy ``src`` to ``dst`` only when ``src`` is a ``.frt`` file.

    Passed to ``shutil.copytree`` as its ``copy_function`` so that only
    ``.frt`` files are copied into the new tree; any other file is skipped
    silently.  The copy itself is done with ``shutil.copy2``.
    """
    if not src.endswith('.frt'):
        return
    shutil.copy2(src, dst)
|
|
|
|
|
|
def load_false_positives(file_name):
    """Collect the unique doc-type names involved in false positives.

    The CSV at ``file_name`` is read with the fixed column names below (the
    file is therefore expected to carry no header row of its own).  For every
    row whose ``Status`` is ``'FalsePos'`` the ``WalkerType`` name is recorded,
    followed by the row's ``CluxDocType`` name, each only the first time it is
    seen.  Rows whose ``WalkerType`` is ``'no results returned'`` or was
    already recorded are skipped entirely.

    Args:
        file_name: Path of the analysis CSV file.

    Returns:
        A dict mapping consecutive integers (starting at 0, in order of first
        appearance) to doc-type names as strings.  The dict is empty when
        ``file_name`` does not exist.
    """
    col_names = ['Doc Index', 'Page Index', 'CluxDocType', 'WalkerType', 'Status', 'Scored', 'WalkerPageType',
                 'Num Char', 'Raw Score', 'Self Score', 'Sum Match Scores', 'Top Doc Agree']

    # If there is no file there is nothing to collect.
    if not os.path.exists(file_name):
        return {}

    data = pandas.read_csv(file_name, names=col_names)
    print('Opened file %s\n' % file_name)

    ordered_names = []  # names in order of first appearance
    seen = set()        # same names, for O(1) duplicate checks

    def remember(name):
        if name not in seen:
            seen.add(name)
            ordered_names.append(name)

    rows = zip(data['Status'].tolist(),
               data['WalkerType'].tolist(),
               data['CluxDocType'].tolist())
    for status, walker_type, clux_type in rows:
        # Names are stored as strings, so duplicates must be detected on the
        # string form too; comparing the raw cell value would never match for
        # numeric or NaN cells and the same name would be added repeatedly.
        doc_name = str(walker_type)
        if status != 'FalsePos' or doc_name == 'no results returned' or doc_name in seen:
            continue
        remember(doc_name)
        remember(str(clux_type))

    print('Found %d false positives.' % len(ordered_names))

    return dict(enumerate(ordered_names))
|
|
|
|
|
|
def create_library(fields: dict, src_library, new_library):
    """Copy the doc-type folders named in ``fields`` into ``new_library``.

    Args:
        fields: Mapping whose values are doc-type folder names (the keys are
            ignored), as returned by ``load_false_positives``.
        src_library: Directory that holds one sub-folder per doc type.
        new_library: Destination directory; created if it does not exist.

    For each name, ``src_library/<name>`` is copied to ``new_library/<name>``
    with ``shutil.copytree`` using ``copy_frts`` as the copy function.  A
    destination folder that already exists is left untouched.  Names with no
    folder in ``src_library`` are reported and, if there are any, written one
    per line to ``missing_doctypes.txt`` inside ``new_library``.
    """
    print('Attempting to move false positives to the new library...')
    if not os.path.exists(new_library):
        print('Error directory for new library does not exist:\n '
              'Creating directory %s' % new_library)
        os.makedirs(new_library)

    missing_doctypes = []
    for doctype in fields.values():
        old_lib_folder = os.path.join(src_library, doctype)
        new_lib_folder = os.path.join(new_library, doctype)
        print('Attempting to move %s to the new library (%s)' % (doctype, new_lib_folder))

        if not os.path.exists(old_lib_folder):
            print('Error, could not find %s in directory:\n %s' % (doctype, src_library))
            missing_doctypes.append(doctype)
            continue

        # copytree refuses to write into an existing directory, so only copy
        # when the destination folder is not there yet.
        if not os.path.exists(new_lib_folder):
            shutil.copytree(old_lib_folder, new_lib_folder, copy_function=copy_frts)
        print(' Success!')

    if missing_doctypes:
        missing_doctypes_file = os.path.join(new_library, "missing_doctypes.txt")
        # The context manager closes the log even if a write fails.
        with open(missing_doctypes_file, 'w') as missing_log:
            missing_log.writelines(name + '\n' for name in missing_doctypes)
|
|
|
|
|
|
def yes_or_no(message):
    """Ask a yes/no question on the console until a valid answer is given.

    Args:
        message: Prompt shown for the first attempt.

    Returns:
        None once the user answers ``y`` or ``yes`` (case-insensitive,
        surrounding whitespace ignored).

    Raises:
        SystemExit: With status 0 when the user answers ``n`` or ``no``.
    """
    prompt = message
    # Loop instead of recursing so repeated invalid answers cannot grow the
    # call stack.
    while True:
        decision = input(prompt).strip().lower()
        if decision in ('y', 'yes'):
            return
        if decision in ('n', 'no'):
            # Raised directly: the exit() helper is injected by the site
            # module for interactive use and is not guaranteed to exist.
            raise SystemExit(0)
        prompt = ' Invalid input, enter Y(es) or N(o): '
|
|
|
|
|
|
def main(analysis_file, old_library, new_library):
    """Build the false-positive library.

    Loads the doc-type names from ``analysis_file`` with
    ``load_false_positives`` and hands them to ``create_library`` together
    with the source (``old_library``) and destination (``new_library``)
    directories.
    """
    create_library(load_false_positives(analysis_file), old_library, new_library)
|
|
|
|
|
|
def check_args(source_lib, new_lib, false_pos):
    """Validate the command-line paths before any work is done.

    Args:
        source_lib: Path of the existing library; must exist.
        new_lib: Path of the library to create; if it is missing the user is
            warned and asked to confirm through ``yes_or_no``.
        false_pos: Path of the false-positive CSV file; must exist.

    Returns:
        None when both required paths exist.

    Raises:
        SystemExit: With status 1, after printing the parser help, when
            ``source_lib`` or ``false_pos`` does not exist.  ``yes_or_no``
            may also end the program if the user declines.
    """
    if not os.path.exists(new_lib):
        print('%s No folder at %s' % (yellow_warning, new_lib))
        print('One will be created.')
        yes_or_no(decision_message)

    # Check for fatal errors; report all of them before giving up.
    fatal_errors = False
    if not os.path.exists(source_lib):
        # Reports source_lib; the previous code referenced an undefined name
        # here and crashed with NameError instead of printing the error.
        print('%s No library at %s' % (red_error, source_lib))
        fatal_errors = True
    if not os.path.exists(false_pos):
        print('%s File does not exist: %s' % (red_error, false_pos))
        fatal_errors = True

    if fatal_errors:
        parser.print_help()
        print('Exiting...')
        # Non-zero status so calling shells and scripts can see the failure.
        raise SystemExit(1)
|
|
|
|
if __name__ == '__main__':
    # Options are listed under two headings in the help output.
    required_args = parser.add_argument_group('Required')
    optional_args = parser.add_argument_group('Optional')

    # (short flag, long flag, help text) for every mandatory option.
    required_options = (
        ('-s', '--source_lib', 'The original library to search through.'),
        ('-n', '--new_lib', 'The new library to create.'),
        ('-f', '--false_pos', 'The csv file of false positives.'),
    )
    for short_flag, long_flag, help_text in required_options:
        required_args.add_argument(short_flag, long_flag, required=True, help=help_text)

    # The parser was built without the automatic help option, so add one here.
    optional_args.add_argument('-h', '--help', action="help", help='Prints the help message.')

    args = parser.parse_args()
    src_lib, new_lib, false_pos_file = args.source_lib, args.new_lib, args.false_pos

    # Validate the paths first; check_args ends the program on fatal errors.
    check_args(src_lib, new_lib, false_pos_file)

    # Now we can make the call to main
    main(false_pos_file, src_lib, new_lib)
|
|
|