# Sleds/TFFirstPageEngine/DataHelper.py
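"""Data loading and batching helpers for the first-page classifier.

Loads pre-processed 280x280 grayscale page images from disk, labels each one
as a first page ('.001.' in the file name) or a non-first page, keeps every
fifth image as the training set, and serves training batches and the full
evaluation set to the model.
"""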

import os
import cv2
import regex
import numpy as np
cur_batch_num = 0
x_train = []
y_train = []
train_size = 0
x_test = []
y_test = []
test_size = 0
total_ran = 0
R_TO_GREY = (0.2989/255)
G_TO_GREY = (0.5870/255)
B_TO_GREY = (0.1140/255)
RGB_TO_GREY = [R_TO_GREY,
               G_TO_GREY,
               B_TO_GREY]
TOP_BOTTOM_PAD = 10
LEFT_PAD = 39
RIGHT_PAD = 40
BORDER_COLOR = [255, 255, 255]
filter_regex = regex.compile('[A-Za-z]')

def num_epochs():
    # Fractional number of passes made over the training set so far
    return total_ran / train_size

def next_train_batch(batch_size):
    global cur_batch_num, total_ran
    start_idx = batch_size * cur_batch_num
    cur_batch_num += 1
    end_idx = start_idx + batch_size
    if train_size < end_idx:
        end_idx = train_size
    if train_size <= start_idx:
        # Ran off the end of the training set; wrap around and return the
        # first batch again (the next call continues with the second batch)
        cur_batch_num = 1
        start_idx = 0
        end_idx = batch_size
    # print('[DataHelper] Generated training batch of size %d' % batch_size)
    total_ran += batch_size
    return x_train[start_idx:end_idx], y_train[start_idx:end_idx]

def next_train_items():
    # Wrap the running index back into the training set once it passes the end
    if total_ran >= train_size:
        idx = total_ran % train_size
    else:
        idx = total_ran
    return x_train[idx], y_train[idx]

def get_test_data():
    return x_test, y_test

def load_data(data_root, shuffle=True):
    files = _get_file_names(data_root, shuffle)
    global x_test, y_test, x_train, y_train, test_size, train_size
    first_pages = []
    non_first_pages = []
    # Split the images into first pages and non-first pages
    cur_img = 1
    tot_imgs = len(files)
    for file in files:
        img = _get_clean_img(file)
        print('[DataHelper] Loaded image %d/%d' % (cur_img, tot_imgs))
        if img is not None:
            if '.001.' in file:
                first_pages.append(img)
            else:
                non_first_pages.append(img)
        else:
            print('[DataHelper] Image at "%s" is bad' % file)
            os.remove(file)
        cur_img += 1
    del files
    # non_first_pages = non_first_pages[0:len(first_pages)]
    # One-hot labels: [1, 0] = first page, [0, 1] = non-first page
    first_page_labels = [np.array([1, 0]) for _ in first_pages]
    non_first_page_labels = [np.array([0, 1]) for _ in non_first_pages]
    x_test = first_pages + non_first_pages
    y_test = first_page_labels + non_first_page_labels
    test_size = len(x_test)
    x_train = []
    y_train = []
    print('[DataHelper] Full data set size: %d' % test_size)
    # Every fifth sample is copied into the training set; the full data set
    # (training samples included) is kept as the evaluation set
    for i in range(0, test_size, 5):
        x_train.append(x_test[i])
        y_train.append(y_test[i])
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    train_size = len(x_train)
    test_size = len(x_test)
    print('[DataHelper] Loaded %d first pages and %d non-first pages' % (len(first_pages), len(non_first_pages)))
    print('[DataHelper] Training set size: %d' % train_size)
    print('[DataHelper] Testing set size: %d' % test_size)

def _get_file_names(test_file_root, shuffle):
    # Only scan the first 500 directory entries, dropping files that are not usable images
    names = [os.path.join(test_file_root, file)
             for file in os.listdir(test_file_root)[0:500]
             if _is_okay_file(file)]
    # for folder in os.listdir(test_file_root):
    #     folder = os.path.join(test_file_root, folder)
    #     if os.path.isdir(folder):
    #         for file in os.listdir(folder):
    #             file = os.path.join(folder, file)
    #             if file.endswith('.jpg'):
    #                 names.append(file)
    print('[DataHelper] Found %d files to train with' % len(names))
    if shuffle:
        print('[DataHelper] Shuffling file names')
        np.random.shuffle(names)
        print('[DataHelper] Done shuffling file names')
    return names

def _is_okay_file(file_name):
    # Reject files with extensions that should not be loaded as training images
    if '.pro' in file_name \
            or '.csv' in file_name \
            or '.bid' in file_name \
            or '.bininfo' in file_name \
            or '.ftr' in file_name \
            or '.log' in file_name \
            or '.tmp' in file_name \
            or '.jpg' in file_name:
        return False
    return True

def _get_clean_img(file_name):
    img = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None for unreadable/corrupt files; the caller
        # treats None as a bad image and deletes the file
        return None
    result = img.reshape([280, 280, 1])
    return result
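
# The commented-out block below is the earlier pre-processing pipeline
# (resize the raw scan, pad it to 280x280 with a white border, convert it to
# grayscale, and save a .png copy), kept here only for reference.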
# print('[DataHelper] Processing Image: "%s"' % file_name)
#
# raw_img = cv2.imread(file_name)
# _, file = os.path.split(file_name)
# new_file = '/home/cdiesch/Documents/TFFirstPageClassifier/GeneratedData/%s.png' % file
#
# height, width = raw_img.shape[:2]
# h_scale = 260/height
# w_scale = 201/height
# resized = cv2.resize(raw_img, None, fx=w_scale, fy=h_scale, interpolation=cv2.INTER_AREA)
# # print(' raw_img shape: %s' % str(raw_img.shape))
#
# vertical_size = resized.shape[0]
# horizontal_size = resized.shape[1]
#
# vrt_pad = 280 - vertical_size
# hor_pad = 280 - horizontal_size
#
# top_pad = int(vrt_pad/2)
# bot_pad = vrt_pad - top_pad
#
# lft_pad = int(hor_pad/2)
# rht_pad = hor_pad - lft_pad
#
# if lft_pad < 0 or rht_pad < 0 or top_pad < 0 or bot_pad < 0:
# return None
#
# # print(' Image padding')
# # print(' Top: %d' % top_pad)
# # print(' Bottom: %d' % bot_pad)
# # print(' Left: %d' % lft_pad)
# # print(' Right: %d' % rht_pad)
#
# pad_img = cv2.copyMakeBorder(resized, top_pad, bot_pad, lft_pad, rht_pad, cv2.BORDER_CONSTANT, value=BORDER_COLOR)
# # print(' pad_img shape: %s' % str(pad_img.shape))
#
# grey_img = cv2.cvtColor(pad_img, cv2.COLOR_BGR2GRAY)
# print('[DataHelper] Saving new file to "%s"' % new_file)
# cv2.imwrite(new_file, grey_img)
# # print(' grey_img shape: %s' %s str(grey_img.shape))
#
# res_img = grey_img.reshape([280, 280, 1])
# print(' res_img shape: %s' % str(res_img.shape))
#
# return res_img
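
# Minimal usage sketch (not part of the original module): how a training
# script might drive these helpers. The import name, data root, batch size,
# and epoch count below are hypothetical placeholders; the actual training
# step is elided because the model lives outside this file.
#
#     import DataHelper as dh
#
#     dh.load_data('/path/to/GeneratedData')           # hypothetical data root
#     while dh.num_epochs() < 10:                      # roughly ten passes
#         batch_x, batch_y = dh.next_train_batch(50)   # 50 images per step
#         ...                                          # feed the batch to the model here
#     eval_x, eval_y = dh.get_test_data()              # evaluate on the full set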