"""In-memory image data set helper for the first-page classifier.

The module keeps the loaded data set in module-level globals.  Call
``load_data`` once, then pull training batches with ``next_train_batch``
and the evaluation data with ``get_test_data``.
"""
import os

import cv2
import numpy as np
import regex

# Index of the next training batch handed out by ``next_train_batch``.
cur_batch_num = 0

# Training subset (filled by ``load_data``).
x_train = []
y_train = []
train_size = 0

# Evaluation data (filled by ``load_data``); this is the full data set.
x_test = []
y_test = []
test_size = 0

# Number of training samples handed out so far by ``next_train_batch``.
total_ran = 0

# Per-channel grey-scale weights divided by 255.  Not used in this file.
# NOTE(review): these look like the usual luma coefficients -- confirm.
R_TO_GREY = (0.2989/255)
G_TO_GREY = (0.5870/255)
B_TO_GREY = (0.1140/255)
RGB_TO_GREY = [R_TO_GREY, G_TO_GREY, B_TO_GREY]

# Padding / border settings.  Not used in this file; kept for importers.
TOP_BOTTOM_PAD = 10
LEFT_PAD = 39
RIGHT_PAD = 40
BORDER_COLOR = [255, 255, 255]

filter_regex = regex.compile('[A-Za-z]')

# Side length (pixels) every input image must have.
_IMG_SIDE = 280

# A directory entry is skipped when its name contains any of these markers
# (substring match, not a strict extension test).
_EXCLUDED_MARKERS = (
    '.pro',
    '.csv',
    '.bid',
    '.bininfo',
    '.ftr',
    '.log',
    '.tmp',
    '.jpg',
)


def num_epochs():
    """Return how many passes over the training set have been handed out.

    The value is ``total_ran / train_size`` and is fractional in the middle
    of a pass.  Returns ``0.0`` while no training data is loaded instead of
    dividing by zero.
    """
    if train_size == 0:
        return 0.0
    return total_ran/train_size


def next_train_batch(batch_size):
    """Return the next ``(x, y)`` training batch and advance the cursor.

    Batches are consecutive slices of the training set.  The final batch of
    a pass may be shorter than ``batch_size``; once the end has been
    reached the cursor wraps around to the start of the training set.

    Args:
        batch_size: Maximum number of samples in the returned batch.

    Returns:
        Tuple ``(x_batch, y_batch)`` of slices of ``x_train`` / ``y_train``.
        Both are empty when no training data is loaded.
    """
    global cur_batch_num, total_ran

    start_idx = batch_size * cur_batch_num
    if start_idx >= train_size:
        # The previous batch consumed the tail of the data: start a new
        # pass.  ``>=`` (not ``>``) so that an exact fit does not produce an
        # empty batch.
        cur_batch_num = 0
        start_idx = 0
    end_idx = min(start_idx + batch_size, train_size)

    # Advance *after* a possible wrap so the first batch of the new pass is
    # not served twice.
    cur_batch_num += 1
    # Count the samples actually returned so ``num_epochs`` stays accurate
    # when the last batch of a pass is truncated.
    total_ran += end_idx - start_idx
    return x_train[start_idx:end_idx], y_train[start_idx:end_idx]


def next_train_items():
    """Return the single training sample at position ``total_ran``.

    The position wraps around the training set.  This function does not
    modify any counter itself.

    Returns:
        Tuple ``(x, y)`` for one training sample.

    Raises:
        IndexError: If no training data has been loaded.
    """
    if train_size == 0:
        raise IndexError('no training data loaded')
    # Always take the modulo: the index ``train_size`` itself is already
    # out of range.
    idx = total_ran % train_size
    return x_train[idx], y_train[idx]


def get_test_data():
    """Return the evaluation data as the tuple ``(x_test, y_test)``."""
    return x_test, y_test


def load_data(data_root, shuffle=True, remove_bad=True):
    """Load all usable images below ``data_root`` into the module globals.

    A file whose name contains ``'.001.'`` is treated as a first page and
    labelled ``[1, 0]``; every other file is labelled ``[0, 1]``.  The data
    set is stored with all first pages ahead of all other pages.

    NOTE(review): the test set is the *whole* data set and the training set
    is every 5th sample of it, so the two overlap.  That split is preserved
    here unchanged -- confirm it is intended.

    Args:
        data_root: Directory that directly contains the image files.
        shuffle: Whether the file names are shuffled before loading.
        remove_bad: Whether files that cannot be loaded as a 280x280 image
            are deleted from disk.  WARNING: with the default ``True`` this
            permanently removes such files.
    """
    global x_test, y_test, x_train, y_train, test_size, train_size

    files = _get_file_names(data_root, shuffle)

    first_pages = []
    non_first_pages = []

    # Sort the images into first and non-first pages.
    tot_imgs = len(files)
    for cur_img, file_path in enumerate(files, start=1):
        img = _get_clean_img(file_path)
        print('[DataHelper] Loaded image %d/%d' % (cur_img, tot_imgs))

        if img is None:
            print('[DataHelper] Image at "%s" is bad' % file_path)
            if remove_bad:
                _remove_bad_file(file_path)
            continue

        # Test the file name only, so a '.001.' in a directory name cannot
        # turn every file into a first page.
        if '.001.' in os.path.basename(file_path):
            first_pages.append(img)
        else:
            non_first_pages.append(img)
    del files

    all_images = first_pages + non_first_pages
    all_labels = ([np.array([1, 0]) for _ in first_pages]
                  + [np.array([0, 1]) for _ in non_first_pages])
    print('[DataHelper] Full data set size: %d' % len(all_images))

    x_test = np.array(all_images)
    y_test = np.array(all_labels)
    # Every 5th sample (indices 0, 5, 10, ...) forms the training set.
    # Built from the lists so the arrays do not share memory with the test
    # arrays.
    x_train = np.array(all_images[::5])
    y_train = np.array(all_labels[::5])

    train_size = len(x_train)
    test_size = len(x_test)

    print('[DataHelper] Loaded %d first pages and %d other files'
          % (len(first_pages), len(non_first_pages)))
    print('[DataHelper] Training set size: %d' % train_size)
    print('[DataHelper] Testing set size: %d' % test_size)


def _remove_bad_file(file_path):
    """Delete ``file_path``, reporting instead of raising on failure."""
    try:
        os.remove(file_path)
    except OSError as err:
        # E.g. the entry is a directory or is not writable; loading the
        # remaining files should still go on.
        print('[DataHelper] Could not remove "%s": %s' % (file_path, err))


def _get_file_names(test_file_root, shuffle, max_files=500):
    """Return the paths of the usable entries directly in ``test_file_root``.

    Sub-folders are not walked (an earlier variant that collected '.jpg'
    files from sub-folders has been disabled).

    Args:
        test_file_root: Directory to list.
        shuffle: Whether to shuffle the resulting paths in place.
        max_files: Cap on the number of directory entries considered.  It
            is applied *before* filtering, so fewer paths may be returned.
            ``None`` removes the cap.

    Returns:
        List of paths, each joined with ``test_file_root``.
    """
    candidates = os.listdir(test_file_root)[:max_files]
    names = [os.path.join(test_file_root, entry)
             for entry in candidates
             if _is_okay_file(entry)]

    print('[DataHelper] Found %d files to train with' % len(names))
    if shuffle:
        print('[DataHelper] Shuffling file names')
        np.random.shuffle(names)
        print('[DataHelper] Done shuffling file names')

    return names


def _is_okay_file(file_name):
    """Return ``True`` unless ``file_name`` contains an excluded marker.

    The check is a substring match against ``_EXCLUDED_MARKERS``.
    """
    return not any(marker in file_name for marker in _EXCLUDED_MARKERS)


def _get_clean_img(file_name):
    """Load ``file_name`` as a grey-scale image of shape ``(280, 280, 1)``.

    The files are expected to be 280x280 already.  An earlier, now disabled
    version of this function resized the raw image, padded it with a white
    border to 280x280, converted it to grey-scale and saved it as a PNG.

    Args:
        file_name: Path of the image file.

    Returns:
        The image as an array of shape ``(280, 280, 1)``, or ``None`` when
        the file cannot be read as an image or is not 280x280 pixels.
    """
    img = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)
    # ``cv2.imread`` signals failure by returning ``None`` rather than
    # raising, and a wrongly sized image cannot be reshaped meaningfully.
    if img is None or img.shape != (_IMG_SIDE, _IMG_SIDE):
        return None
    return img.reshape([_IMG_SIDE, _IMG_SIDE, 1])