import json
import string
import sys

import numpy as np


def load_ocr_data(file_name):
    """Load an OCR JSON file into parallel character and confidence strings."""
    ocr_text = ''
    ocr_conf = ''
    with open(file_name) as reader:
        json_data = json.load(reader)
    for page in json_data['pages']:
        for line in page['lines']:
            # Walk chars and confs together so that dropping punctuation keeps
            # the two strings aligned character-for-character.
            for char, conf in zip(line['chars'], line['confs']):
                if char in string.punctuation:
                    continue
                ocr_text += char.lower()
                ocr_conf += conf
    return ocr_text, ocr_conf


def is_word_confident(line_confs, min_confidence):
    # Every confidence digit must meet the minimum.
    return all(int(conf) >= min_confidence for conf in line_confs)


def find_unconfident_words(ocr_words, min_conf):
    return [vals for vals in ocr_words if not is_word_confident(vals['conf'], min_conf)]


def load_dict(dict_file):
    # Map each dictionary word (lowercased, whitespace stripped) to its line index.
    result = {}
    with open(dict_file) as reader:
        for i, line in enumerate(reader):
            word = line.translate(str.maketrans('', '', string.whitespace)).lower()
            result[word] = i
    return result


def split_into_words(ocr_text, ocr_conf):
    words = []
    previous_split_idx = 0
    for i in range(len(ocr_text)):
        if ocr_text[i] == ' ':
            word = ocr_text[previous_split_idx:i]
            conf = ocr_conf[previous_split_idx:i]
            # Set the previous index past i so we don't keep the leading space.
            previous_split_idx = i + 1
            words.append({'word': word, 'conf': conf})
    # Don't drop the final word when the text doesn't end with a space.
    if previous_split_idx < len(ocr_text):
        words.append({'word': ocr_text[previous_split_idx:],
                      'conf': ocr_conf[previous_split_idx:]})
    return words


def is_in_dict(word, dict_words):
    return word in dict_words


def correct_confs(line_confs, max_conf):
    # Replace every confidence digit with max_conf.
    return str(max_conf) * len(line_confs)


def get_remaining_bad_words(corrected, unconfident):
    corrected_words = {entry['word'] for entry in corrected}
    return [entry for entry in unconfident if entry['word'] not in corrected_words]


def lev_dist(source, target):
    # Ensure len(source) >= len(target); the distance is symmetric.
    if len(source) < len(target):
        return lev_dist(target, source)
    # So now we have len(source) >= len(target).
    if len(target) == 0:
        return len(source)
    source = np.array(tuple(source))
    target = np.array(tuple(target))
    previous_row = np.arange(target.size + 1)
    for tkn_val in source:
        # Insertion (target grows longer than source):
        current_row = previous_row + 1
        # Substitution or match: target and source items are aligned, and are
        # either different (cost of 1) or the same (cost of 0).
        # `target != tkn_val` produces an array of booleans corresponding to
        # whether target[index] differs from tkn_val, which is the cost we add.
        current_row[1:] = np.minimum(current_row[1:],
                                     np.add(previous_row[:-1], target != tkn_val))
        # Deletion (target grows shorter than source):
        current_row[1:] = np.minimum(current_row[1:], current_row[0:-1] + 1)
        # Reset rows for the next pass.
        previous_row = current_row
    return previous_row[-1]


# Finds the dictionary word with the smallest Levenshtein distance to `word`
# (keeping the first word found in case of ties) and returns it along with
# that distance. Returns (None, sys.maxsize) for an empty dictionary;
# thresholding against a maximum distance is left to the caller.
def find_closest_word(word, dictionary):
    closest_word = None
    min_distance = sys.maxsize
    for dict_word in dictionary:
        dist = lev_dist(word, dict_word)
        if dist < min_distance:
            closest_word = dict_word
            min_distance = dist
    return closest_word, min_distance
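
# A small, illustrative sanity check for lev_dist (an assumption about usage,
# not part of the original pipeline): 'kitten' -> 'sitting' is the classic
# three-edit example (two substitutions plus one insertion).
def _check_lev_dist():
    assert lev_dist('kitten', 'sitting') == 3
    assert lev_dist('', 'abc') == 3        # empty source: all insertions
    assert lev_dist('same', 'same') == 0   # identical strings cost nothing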
def find_words_in_dict(words, dictionary, max_distance):
    result_words = []
    for word_data in words:
        word = word_data['word']
        conf = word_data['conf']
        # If every character would have to change, we don't want to even try...
        if max_distance < len(word):
            replace_word, distance = find_closest_word(word, dictionary)
            # If the words were close enough...
            if distance <= max_distance:
                replace_conf = correct_confs(conf, 9)
                result_words.append({'newWord': replace_word, 'newConf': replace_conf,
                                     'oldWord': word, 'oldConf': conf})
        else:
            print('Cannot attempt replacement. Length of %s (%d) is not greater than '
                  'the max distance allowed (%d).' % (word, len(word), max_distance))
    return result_words


def split_word(word):
    # Get all the possible 'splits' of a word by adding a space at every point.
    w = word['word']
    c = word['conf']
    result = []
    for i in range(1, len(w)):
        result.append({'w1': w[:i], 'w2': w[i:], 'c1': c[:i], 'c2': c[i:]})
    return result


def get_possible_splits(word, dictionary):
    # Keep only the splits where both halves are dictionary words.
    correct_splits = []
    for possible in split_word(word):
        if is_in_dict(possible['w1'], dictionary) and is_in_dict(possible['w2'], dictionary):
            correct_splits.append(possible)
    return correct_splits


def test():
    ocr_chars, ocr_confs = load_ocr_data(r'E:\Code\GIT\scorewalker-utils\KMeans\test_data\ocr_files\1.frt')
    english_dict = load_dict(r'E:\Code\GIT\scorewalker-utils\KMeans\Data\WordLists\EnglishWords.txt')
    words = split_into_words(ocr_chars, ocr_confs)
    # Build a corrected copy rather than mutating `words` while iterating it.
    corrected_words = []
    for word_data in words:
        word = word_data['word']
        conf = word_data['conf']
        if is_word_confident(conf, 9):
            corrected_words.append(word_data)
        elif is_in_dict(word, english_dict):
            # The word is spelled correctly, so just raise its confidences.
            corrected_words.append({'word': word, 'conf': correct_confs(conf, 9)})
        else:
            splits = get_possible_splits(word_data, english_dict)
            if len(splits) == 1:
                # Exactly one unambiguous way to split into two dictionary words.
                split = splits[0]
                corrected_words.append({'word': split['w1'], 'conf': correct_confs(split['c1'], 9)})
                corrected_words.append({'word': split['w2'], 'conf': correct_confs(split['c2'], 9)})
            else:
                corrected_words.append(word_data)
    words = corrected_words


if __name__ == '__main__':
    test()
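
# --- Usage sketch (illustrative only, not part of the original pipeline) ---
# The JSON layout below is inferred from the fields load_ocr_data reads
# ('pages' -> 'lines' -> 'chars'/'confs', one confidence digit per character,
# spaces included); the real .frt files may carry additional fields.
# Call this manually to exercise the pipeline without the real data files.
def _demo_ocr_pipeline():
    import os
    import tempfile
    record = {
        'pages': [{
            'lines': [{
                'chars': 'the quick brown fox',
                'confs': '9999998999999999999',  # one digit per character
            }]
        }]
    }
    fd, path = tempfile.mkstemp(suffix='.frt')
    with os.fdopen(fd, 'w') as writer:
        json.dump(record, writer)
    try:
        text, conf = load_ocr_data(path)
        words = split_into_words(text, conf)
        # 'quick' carries an 8, so it is the only word flagged here:
        print(find_unconfident_words(words, 9))  # [{'word': 'quick', 'conf': '99899'}]
    finally:
        os.remove(path)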