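"""Data-loading helpers for a first-page image classifier.

Loads preprocessed 280x280 grayscale page images, labels files whose
names contain '.001.' as first pages, and serves training batches and
test data as NumPy arrays.
"""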
import os

import cv2
import numpy as np
import regex


# Module-level state for the current train/test split.
cur_batch_num = 0

x_train = []
y_train = []
train_size = 0

x_test = []
y_test = []
test_size = 0

# Running count of training examples served so far; used to estimate epochs.
total_ran = 0

# Grayscale-conversion and padding constants. Most are currently unused:
# the commented-out pipeline in _get_clean_img uses cv2.cvtColor and
# BORDER_COLOR instead.
R_TO_GREY = 0.2989 / 255
G_TO_GREY = 0.5870 / 255
B_TO_GREY = 0.1140 / 255

RGB_TO_GREY = [R_TO_GREY, G_TO_GREY, B_TO_GREY]

TOP_BOTTOM_PAD = 10
LEFT_PAD = 39
RIGHT_PAD = 40

BORDER_COLOR = [255, 255, 255]  # white padding border

filter_regex = regex.compile('[A-Za-z]')  # currently unused


def num_epochs():
    """Return how many epochs' worth of training examples have been served."""
    if train_size == 0:
        return 0
    return total_ran / train_size


def next_train_batch(batch_size):
    """Return the next (x, y) training batch, wrapping around at the end."""
    global cur_batch_num, total_ran

    start_idx = batch_size * cur_batch_num
    cur_batch_num += 1
    end_idx = start_idx + batch_size
    if end_idx > train_size:
        end_idx = train_size

    # Wrap around once the window runs past the end of the training set.
    if start_idx >= train_size:
        cur_batch_num = 1
        start_idx = 0
        end_idx = batch_size

    # print('[DataHelper] Generated training batch of size %d' % batch_size)
    total_ran += batch_size
    return x_train[start_idx:end_idx], y_train[start_idx:end_idx]
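
# Example with hypothetical sizes: if train_size == 10 and batch_size == 4,
# successive next_train_batch calls return slices [0:4], [4:8], [8:10], and
# then wrap back around to [0:4].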


def next_train_items():
    """Return a single (x, y) training example, cycling through the set."""
    if total_ran >= train_size:
        idx = total_ran % train_size
    else:
        idx = total_ran

    return x_train[idx], y_train[idx]


def get_test_data():
    """Return the full test set as (x_test, y_test)."""
    return x_test, y_test


def load_data(data_root, shuffle=True):
    """Load images from data_root and build the module-level data arrays.

    Files whose names contain '.001.' are labelled [1, 0] (first pages);
    all other images are labelled [0, 1]. Unreadable images are deleted
    from disk. The training set is every fifth example and remains a
    subset of the test set.
    """
    files = _get_file_names(data_root, shuffle)
    global x_test, y_test, x_train, y_train, test_size, train_size
    first_pages = []
    non_first_pages = []

    # Split the images into first pages and non-first pages.
    cur_img = 1
    tot_imgs = len(files)
    for file in files:
        img = _get_clean_img(file)
        print('[DataHelper] Loaded image %d/%d' % (cur_img, tot_imgs))
        if img is not None:
            if '.001.' in file:
                first_pages.append(img)
            else:
                non_first_pages.append(img)
        else:
            # Unreadable or wrongly sized images are removed from disk.
            print('[DataHelper] Image at "%s" is bad' % file)
            os.remove(file)
        cur_img += 1
    del files

    # non_first_pages = non_first_pages[0:len(first_pages)]

    # One-hot labels: [1, 0] for first pages, [0, 1] for everything else.
    first_page_labels = [np.array([1, 0]) for _ in first_pages]
    non_first_page_labels = [np.array([0, 1]) for _ in non_first_pages]

    x_test = first_pages + non_first_pages
    y_test = first_page_labels + non_first_page_labels
    test_size = len(x_test)

    # Take every fifth example as training data. Note that these examples
    # also stay in the test set, so the two sets overlap.
    x_train = []
    y_train = []

    print('[DataHelper] Full data set size: %d' % test_size)
    for i in range(test_size):
        if i % 5 == 0:
            x_train.append(x_test[i])
            y_train.append(y_test[i])

    x_test = np.array(x_test)
    y_test = np.array(y_test)

    x_train = np.array(x_train)
    y_train = np.array(y_train)

    train_size = len(x_train)
    test_size = len(x_test)

    print('[DataHelper] Loaded %d first pages and %d other files' % (len(first_pages), len(non_first_pages)))
    print('[DataHelper] Training set size: %d' % train_size)
    print('[DataHelper] Testing set size: %d' % test_size)
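
# With N loaded images, x_test ends up with shape (N, 280, 280, 1) and
# y_test with shape (N, 2); the training arrays hold roughly N / 5 of
# those same examples.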


def _get_file_names(test_file_root, shuffle):
    """Collect up to 500 usable file paths from test_file_root."""
    names = [os.path.join(test_file_root, file)
             for file in os.listdir(test_file_root)[0:500]
             if _is_okay_file(file)]
    # Recursive variant, kept commented out:
    # for folder in os.listdir(test_file_root):
    #     folder = os.path.join(test_file_root, folder)
    #     if os.path.isdir(folder):
    #         for file in os.listdir(folder):
    #             file = os.path.join(folder, file)
    #             if file.endswith('.jpg'):
    #                 names.append(file)
    print('[DataHelper] Found %d files to train with' % len(names))
    if shuffle:
        print('[DataHelper] Shuffling file names')
        np.random.shuffle(names)
        print('[DataHelper] Done shuffling file names')
    return names


# File-name fragments that mark files we should not try to load as images.
# (The original condition checked '.csv' twice; the duplicate is dropped.)
_BAD_NAME_PARTS = ('.pro', '.csv', '.bid', '.bininfo', '.ftr', '.log', '.tmp', '.jpg')


def _is_okay_file(file_name):
    """Return True unless the name contains one of the excluded fragments."""
    return not any(part in file_name for part in _BAD_NAME_PARTS)


def _get_clean_img(file_name):
    """Load file_name as grayscale and return it shaped [280, 280, 1].

    Returns None if the file cannot be read as an image or is not
    already 280x280; load_data deletes such files.
    """
    img = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)
    if img is None or img.shape != (280, 280):
        return None
    return img.reshape([280, 280, 1])
    # Original full preprocessing pipeline (resize, pad to 280x280, convert
    # to grayscale, save as PNG), kept commented out for reference. The
    # original computed w_scale as 201/height, which looks like a typo;
    # it is written as 201/width below.
    # print('[DataHelper] Processing Image: "%s"' % file_name)
    #
    # raw_img = cv2.imread(file_name)
    # _, file = os.path.split(file_name)
    # new_file = '/home/cdiesch/Documents/TFFirstPageClassifier/GeneratedData/%s.png' % file
    #
    # height, width = raw_img.shape[:2]
    # h_scale = 260 / height
    # w_scale = 201 / width
    # resized = cv2.resize(raw_img, None, fx=w_scale, fy=h_scale, interpolation=cv2.INTER_AREA)
    # # print('    raw_img shape: %s' % str(raw_img.shape))
    #
    # vertical_size = resized.shape[0]
    # horizontal_size = resized.shape[1]
    #
    # vrt_pad = 280 - vertical_size
    # hor_pad = 280 - horizontal_size
    #
    # top_pad = int(vrt_pad / 2)
    # bot_pad = vrt_pad - top_pad
    #
    # lft_pad = int(hor_pad / 2)
    # rht_pad = hor_pad - lft_pad
    #
    # if lft_pad < 0 or rht_pad < 0 or top_pad < 0 or bot_pad < 0:
    #     return None
    #
    # # print('    Image padding')
    # # print('        Top: %d' % top_pad)
    # # print('        Bottom: %d' % bot_pad)
    # # print('        Left: %d' % lft_pad)
    # # print('        Right: %d' % rht_pad)
    #
    # pad_img = cv2.copyMakeBorder(resized, top_pad, bot_pad, lft_pad, rht_pad,
    #                              cv2.BORDER_CONSTANT, value=BORDER_COLOR)
    # # print('    pad_img shape: %s' % str(pad_img.shape))
    #
    # grey_img = cv2.cvtColor(pad_img, cv2.COLOR_BGR2GRAY)
    # print('[DataHelper] Saving new file to "%s"' % new_file)
    # cv2.imwrite(new_file, grey_img)
    # # print('    grey_img shape: %s' % str(grey_img.shape))
    #
    # res_img = grey_img.reshape([280, 280, 1])
    # print('    res_img shape: %s' % str(res_img.shape))
    #
    # return res_img
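

# A minimal usage sketch (not part of the original module); the data
# directory below is a placeholder, assumed to contain preprocessed
# 280x280 grayscale page images.
if __name__ == '__main__':
    load_data('/path/to/page/images')
    batch_x, batch_y = next_train_batch(32)
    print('[DataHelper] Batch shapes: %s, %s' % (str(batch_x.shape), str(batch_y.shape)))
    print('[DataHelper] Epochs served so far: %.2f' % num_epochs())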