# Sleds/TFFirstPageEngine/DataHelper.py
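"""Data loading and batching helpers for the first-page classifier.

Loads pre-processed 280x280 grayscale page images from disk, labels each one
as a first page ('.001.' in the file name) or a non-first page, keeps every
fifth image as the training set, and serves training batches and the full
evaluation set to the model.
"""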

import os
import cv2
import regex
import numpy as np
cur_batch_num = 0
x_train = []
y_train = []
train_size = 0
x_test = []
y_test = []
test_size = 0
total_ran = 0
R_TO_GREY = (0.2989/255)
G_TO_GREY = (0.5870/255)
B_TO_GREY = (0.1140/255)
RGB_TO_GREY = [R_TO_GREY,
               G_TO_GREY,
               B_TO_GREY]
TOP_BOTTOM_PAD = 10
LEFT_PAD = 39
RIGHT_PAD = 40
BORDER_COLOR = [255, 255, 255]
filter_regex = regex.compile('[A-Za-z]')

def num_epochs():
    # Fractional number of passes made over the training set so far
    return total_ran / train_size

def next_train_batch(batch_size):
    global cur_batch_num, total_ran
    start_idx = batch_size * cur_batch_num
    cur_batch_num += 1
    end_idx = start_idx + batch_size
    if train_size < end_idx:
        end_idx = train_size
    if train_size <= start_idx:
        # Ran off the end of the training set; wrap around and return the
        # first batch again (the next call continues with the second batch)
        cur_batch_num = 1
        start_idx = 0
        end_idx = batch_size
    # print('[DataHelper] Generated training batch of size %d' % batch_size)
    total_ran += batch_size
    return x_train[start_idx:end_idx], y_train[start_idx:end_idx]

def next_train_items():
    # Wrap the running index back into the training set once it passes the end
    if total_ran >= train_size:
        idx = total_ran % train_size
    else:
        idx = total_ran
    return x_train[idx], y_train[idx]

def get_test_data():
    return x_test, y_test

def load_data(data_root, shuffle=True):
    files = _get_file_names(data_root, shuffle)
    global x_test, y_test, x_train, y_train, test_size, train_size
    first_pages = []
    non_first_pages = []
    # Split the images into first pages and non-first pages
    cur_img = 1
    tot_imgs = len(files)
    for file in files:
        img = _get_clean_img(file)
        print('[DataHelper] Loaded image %d/%d' % (cur_img, tot_imgs))
        if img is not None:
            if '.001.' in file:
                first_pages.append(img)
            else:
                non_first_pages.append(img)
        else:
            print('[DataHelper] Image at "%s" is bad' % file)
            os.remove(file)
        cur_img += 1
    del files
    # non_first_pages = non_first_pages[0:len(first_pages)]
    # One-hot labels: [1, 0] = first page, [0, 1] = non-first page
    first_page_labels = [np.array([1, 0]) for _ in first_pages]
    non_first_page_labels = [np.array([0, 1]) for _ in non_first_pages]
    x_test = first_pages + non_first_pages
    y_test = first_page_labels + non_first_page_labels
    test_size = len(x_test)
    x_train = []
    y_train = []
    print('[DataHelper] Full data set size: %d' % test_size)
    # Every fifth sample is copied into the training set; the full data set
    # (training samples included) is kept as the evaluation set
    for i in range(0, test_size, 5):
        x_train.append(x_test[i])
        y_train.append(y_test[i])
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    train_size = len(x_train)
    test_size = len(x_test)
    print('[DataHelper] Loaded %d first pages and %d non-first pages' % (len(first_pages), len(non_first_pages)))
    print('[DataHelper] Training set size: %d' % train_size)
    print('[DataHelper] Testing set size: %d' % test_size)

def _get_file_names(test_file_root, shuffle):
    # Only scan the first 500 directory entries, dropping files that are not usable images
    names = [os.path.join(test_file_root, file)
             for file in os.listdir(test_file_root)[0:500]
             if _is_okay_file(file)]
    # for folder in os.listdir(test_file_root):
    #     folder = os.path.join(test_file_root, folder)
    #     if os.path.isdir(folder):
    #         for file in os.listdir(folder):
    #             file = os.path.join(folder, file)
    #             if file.endswith('.jpg'):
    #                 names.append(file)
    print('[DataHelper] Found %d files to train with' % len(names))
    if shuffle:
        print('[DataHelper] Shuffling file names')
        np.random.shuffle(names)
        print('[DataHelper] Done shuffling file names')
    return names

def _is_okay_file(file_name):
    # Reject files with extensions that should not be loaded as training images
    if '.pro' in file_name \
            or '.csv' in file_name \
            or '.bid' in file_name \
            or '.bininfo' in file_name \
            or '.ftr' in file_name \
            or '.log' in file_name \
            or '.tmp' in file_name \
            or '.jpg' in file_name:
        return False
    return True

def _get_clean_img(file_name):
    img = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None for unreadable/corrupt files; the caller
        # treats None as a bad image and deletes the file
        return None
    result = img.reshape([280, 280, 1])
    return result
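
# The commented-out block below is the earlier pre-processing pipeline
# (resize the raw scan, pad it to 280x280 with a white border, convert it to
# grayscale, and save a .png copy), kept here only for reference.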
# print('[DataHelper] Processing Image: "%s"' % file_name)
#
# raw_img = cv2.imread(file_name)
# _, file = os.path.split(file_name)
# new_file = '/home/cdiesch/Documents/TFFirstPageClassifier/GeneratedData/%s.png' % file
#
# height, width = raw_img.shape[:2]
# h_scale = 260/height
# w_scale = 201/height
# resized = cv2.resize(raw_img, None, fx=w_scale, fy=h_scale, interpolation=cv2.INTER_AREA)
# # print(' raw_img shape: %s' % str(raw_img.shape))
#
# vertical_size = resized.shape[0]
# horizontal_size = resized.shape[1]
#
# vrt_pad = 280 - vertical_size
# hor_pad = 280 - horizontal_size
#
# top_pad = int(vrt_pad/2)
# bot_pad = vrt_pad - top_pad
#
# lft_pad = int(hor_pad/2)
# rht_pad = hor_pad - lft_pad
#
# if lft_pad < 0 or rht_pad < 0 or top_pad < 0 or bot_pad < 0:
# return None
#
# # print(' Image padding')
# # print(' Top: %d' % top_pad)
# # print(' Bottom: %d' % bot_pad)
# # print(' Left: %d' % lft_pad)
# # print(' Right: %d' % rht_pad)
#
# pad_img = cv2.copyMakeBorder(resized, top_pad, bot_pad, lft_pad, rht_pad, cv2.BORDER_CONSTANT, value=BORDER_COLOR)
# # print(' pad_img shape: %s' % str(pad_img.shape))
#
# grey_img = cv2.cvtColor(pad_img, cv2.COLOR_BGR2GRAY)
# print('[DataHelper] Saving new file to "%s"' % new_file)
# cv2.imwrite(new_file, grey_img)
# # print(' grey_img shape: %s' %s str(grey_img.shape))
#
# res_img = grey_img.reshape([280, 280, 1])
# print(' res_img shape: %s' % str(res_img.shape))
#
# return res_img
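
# Minimal usage sketch (not part of the original module): how a training
# script might drive these helpers. The import name, data root, batch size,
# and epoch count below are hypothetical placeholders; the actual training
# step is elided because the model lives outside this file.
#
#     import DataHelper as dh
#
#     dh.load_data('/path/to/GeneratedData')           # hypothetical data root
#     while dh.num_epochs() < 10:                      # roughly ten passes
#         batch_x, batch_y = dh.next_train_batch(50)   # 50 images per step
#         ...                                          # feed the batch to the model here
#     eval_x, eval_y = dh.get_test_data()              # evaluate on the full set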