完美解決keras 讀取多個hdf5文件進行訓練的問題
用keras進行大數據訓練,為了加快訓練,需要提前制作訓練集。
由于HDF5的特性,所有數據需要一次性讀入到內存中,才能保存。
為此,我采用分批次分為2個以上HDF5進行存儲。
1、先讀取每個標簽下的圖片,并設置標簽
def load_dataset(path_name,data_path):
    """Collect resized images and integer labels, one label per sub-directory.

    This is the opening part of the function: it sets up the accumulators and
    reads the ``.jpg`` files of each label directory.  ``data_path`` and the
    train/valid accumulators are used by the later steps of the function.

    Args:
        path_name: Directory whose entries are label directories of images.
        data_path: Output root for the HDF5 shards (not used in this part).
    """
    images = []
    labels = []
    train_images = []
    valid_images = []
    train_labels = []
    valid_labels = []
    counter = 0  # integer label given to every image of the current directory
    allpath = os.listdir(path_name)
    nb_classes = len(allpath)
    print("label_num: ",nb_classes)
    for child_dir in allpath:
        child_path = os.path.join(path_name, child_dir)
        for dir_image in os.listdir(child_path):
            if not dir_image.endswith('.jpg'):
                continue
            img = cv2.imread(os.path.join(child_path, dir_image))
            if img is None:
                # cv2.imread returns None (it does not raise) for files it
                # cannot decode; skip them instead of crashing in the resize.
                continue
            # scipy.misc.imresize no longer exists in current SciPy releases,
            # so resize with OpenCV (bilinear, as before).
            image = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LINEAR)
            images.append(image)
            labels.append(counter)
2、該標簽下的數據集分割為訓練集(train images),驗證集(val images),訓練標簽(train labels),驗證標簽(val labels)
def split_dataset(images, labels):
    """Split one label's samples into a training part and a validation part.

    20% of the samples go to validation and 80% to training.  The split is
    seeded with a random integer in [0, 100], so it differs between calls.

    Args:
        images: Samples of a single label.
        labels: Labels aligned one-to-one with ``images``.

    Returns:
        ``(train_images, valid_images, train_labels, valid_labels)``.
    """
    train_images, valid_images, train_labels, valid_labels = train_test_split(
        images, labels, test_size = 0.2, random_state = random.randint(0, 100))
    return train_images, valid_images, train_labels ,valid_labels
3、分割后的數據分別添加到總的訓練集,驗證集,訓練標簽,驗證標簽。
其次,清空原有的圖片集和標簽集,目的是節省內存。假如一次性讀入多個標簽的數據集與標簽集,進行數據分割后,會占用大于單純進行上述操作兩倍以上的內存。
# Split the samples of the label that was just read and add both parts to the
# running train/valid accumulators.
images = np.array(images)
t_images, v_images, t_labels ,v_labels = split_dataset(images, labels)
train_images.extend(t_images)
train_labels.extend(t_labels)
valid_images.extend(v_images)
valid_labels.extend(v_labels)
if counter%50== 49:
    print( counter+1 , "is read to the memory!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
# Drop the per-label buffers after every label so the raw images are not kept
# in memory next to their copies in the accumulators.
images = []
labels = []
counter = counter + 1
print("train_images num: ", len(train_images), " ", "valid_images num: ",len(valid_images))
4、進行判斷,直到讀到自己分割的那個標簽。
開始進行寫入。寫入之前,為了更好地訓練模型,需要把對應的圖片集和標簽打亂順序。
# Flush the accumulated samples to a new HDF5 shard after every 4316 labels
# and once more for the final label.  In the complete listing this check runs
# before ``counter`` is incremented for the next label.
if (counter % 4316 == 4315) or (counter == nb_classes - 1):
    print("start write images and labels data...................................................................")
    # Derive the shard index from the same chunk size as the trigger above;
    # dividing by a different constant yields "h5_-1" and repeated names.
    num = counter // 4316
    dirs = data_path + "/" + "h5_" + str(num)
    if not os.path.exists(dirs):
        os.makedirs(dirs)
    # Write everything accumulated for this shard, not only the split of the
    # last label that was read.
    data2h5(dirs, train_images, valid_images, train_labels ,valid_labels)
    # Start the next shard empty so samples are not written twice.
    train_images = []
    valid_images = []
    train_labels = []
    valid_labels = []
對應打亂順序并寫入到HDF5
def _shuffle_in_unison(first, second):
    """Shuffle two equally long sequences in place with the same permutation."""
    if len(first) != len(second):
        raise ValueError(
            "cannot shuffle in unison: lengths differ (%d vs %d)"
            % (len(first), len(second)))
    # Restoring the saved RNG state makes the second shuffle replay exactly
    # the permutation of the first, so item i of both stays paired.
    state = np.random.get_state()
    np.random.shuffle(first)
    np.random.set_state(state)
    np.random.shuffle(second)


def data2h5(dirs_path, train_images, valid_images, train_labels ,valid_labels):
    """Shuffle the train and validation sets and write them to HDF5 files.

    Writes ``<dirs_path>/train.hdf5`` (datasets ``x_train``/``y_train``) and
    ``<dirs_path>/val.hdf5`` (datasets ``x_val``/``y_val``).  The four input
    sequences are shuffled in place.

    Args:
        dirs_path: Existing directory that receives the two files.
        train_images: Training samples.
        valid_images: Validation samples.
        train_labels: Labels aligned with ``train_images``.
        valid_labels: Labels aligned with ``valid_images``.

    Raises:
        ValueError: If a sample sequence and its labels differ in length.
    """
    TRAIN_HDF5 = dirs_path + '/' + "train.hdf5"
    VAL_HDF5 = dirs_path + '/' + "val.hdf5"
    _shuffle_in_unison(train_images, train_labels)
    _shuffle_in_unison(valid_images, valid_labels)
    datasets = [
        ("train",train_images,train_labels,TRAIN_HDF5),
        ("val",valid_images,valid_labels,VAL_HDF5)]
    for (dType,images,labels,outputPath) in datasets:
        # The context manager closes the file even if a write fails.
        # Passing compression="gzip", compression_opts=9 to create_dataset
        # would produce smaller files at the cost of slower writes.
        with h5py.File(outputPath, "w") as f:
            f.create_dataset("x_"+dType, data=images)
            f.create_dataset("y_"+dType, data=labels)
5、判斷文件全部讀入
def read_dataset(dirs):
    """Print the number of samples and labels in every HDF5 file in *dirs*.

    Each file ``<name>.hdf5`` is expected to hold the datasets ``x_<name>``
    and ``y_<name>``; their first-axis lengths are printed.

    Args:
        dirs: Directory that directly contains the HDF5 files.
    """
    files = os.listdir(dirs)
    print(files)
    for file in files:
        path = dirs+'/' + file
        # "train.hdf5" -> "train": the dataset names carry the file's stem.
        set_name = file.split('.')[0]
        # Open read-only and close the handle as soon as the sizes are read.
        with h5py.File(path, "r") as dataset:
            set_x_orig = dataset["x_"+set_name].shape[0]
            set_y_orig = dataset["y_"+set_name].shape[0]
        print(set_x_orig)
        print(set_y_orig)
6、訓練中,采用迭代器讀入數據
def generator(self, datagen, mode):
    """Yield batches endlessly from every HDF5 shard under ``self.data_path``.

    Each entry of ``self.data_path`` is treated as a shard directory holding
    ``train.hdf5`` and ``val.hdf5``.  Batches of ``self.BS`` samples are read
    in order, scaled to [0, 1] as float32 and optionally augmented.

    Args:
        datagen: When truthy, pass each batch through the augmenter.
        mode: ``"train"`` to read ``train.hdf5`` or ``"val"`` for ``val.hdf5``.

    Yields:
        ``({'input_1': images}, {'softmax': labels})``; labels are one-hot
        encoded with ``self.classes`` classes when ``self.binarize`` is set.

    Raises:
        ValueError: On the first ``next()`` if ``mode`` is not recognised.
    """
    if mode not in ("train", "val"):
        # Without this check an unknown mode would loop forever and never
        # yield a batch.
        raise ValueError("mode must be 'train' or 'val', got %r" % (mode,))
    passes=np.inf
    aug = ImageDataGenerator(
        featurewise_center = False,
        samplewise_center = False,
        featurewise_std_normalization = False,
        samplewise_std_normalization = False,
        zca_whitening = False,
        rotation_range = 20,
        width_shift_range = 0.2,
        height_shift_range = 0.2,
        horizontal_flip = True,
        vertical_flip = False)
    epochs = 0
    # Loops forever by default (passes is infinite).
    while epochs < passes:
        # Visit every shard directory once per pass.
        for file in os.listdir(self.data_path):
            hdf5_path = os.path.join(self.data_path, file, mode + ".hdf5")
            # Open only the file this mode needs, read-only, and close it
            # before moving on so handles do not pile up across passes.
            with h5py.File(hdf5_path, "r") as db:
                x_set = db['x_' + mode]
                y_set = db['y_' + mode]
                num_images = y_set.shape[0]
                for i in np.arange(0, num_images, self.BS):
                    images = x_set[i: i+self.BS]
                    labels = y_set[i: i+self.BS]
                    images = images.reshape(images.shape[0], IMAGE_SIZE, IMAGE_SIZE, 3)
                    if K.image_data_format() == 'channels_first':
                        # The shards written by load_dataset are channels-last,
                        # so the channel axis has to be moved; a plain reshape
                        # to (N, 3, H, W) would scramble the pixels.
                        images = images.transpose(0, 3, 1, 2)
                    images = images.astype('float32')
                    images = images/255
                    if datagen :
                        (images,labels) = next(aug.flow(images,labels,batch_size = self.BS))
                    # One-hot encode the labels.
                    if self.binarize:
                        labels = np_utils.to_categorical(labels,self.classes)
                    yield ({'input_1': images}, {'softmax': labels})
        epochs += 1
7、至此,就大功告成了
完整的代碼:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 12 20:46:12 2018
@author: william_yue
"""
import os
import numpy as np
import cv2
import random
from scipy import misc
import h5py
from sklearn.model_selection import train_test_split
from keras import backend as K
K.clear_session()
from keras.utils import np_utils
IMAGE_SIZE = 128
# Split a label's samples into training and validation parts.
def split_dataset(images, labels):
    """Split one label's samples into a training part and a validation part.

    Uses sklearn's ``train_test_split``: 20% of the samples go to validation
    and 80% to training.  The split is seeded with a random integer in
    [0, 100], so it differs between calls.

    Args:
        images: Samples of a single label.
        labels: Labels aligned one-to-one with ``images``.

    Returns:
        ``(train_images, valid_images, train_labels, valid_labels)``.
    """
    train_images, valid_images, train_labels, valid_labels = train_test_split(
        images, labels, test_size = 0.2, random_state = random.randint(0, 100))
    return train_images, valid_images, train_labels ,valid_labels
def _shuffle_in_unison(first, second):
    """Shuffle two equally long sequences in place with the same permutation."""
    if len(first) != len(second):
        raise ValueError(
            "cannot shuffle in unison: lengths differ (%d vs %d)"
            % (len(first), len(second)))
    # Restoring the saved RNG state makes the second shuffle replay exactly
    # the permutation of the first, so item i of both stays paired.
    state = np.random.get_state()
    np.random.shuffle(first)
    np.random.set_state(state)
    np.random.shuffle(second)


def data2h5(dirs_path, train_images, valid_images, train_labels ,valid_labels):
    """Shuffle the train and validation sets and write them to HDF5 files.

    Writes ``<dirs_path>/train.hdf5`` (datasets ``x_train``/``y_train``) and
    ``<dirs_path>/val.hdf5`` (datasets ``x_val``/``y_val``).  Images and
    labels are shuffled with the same order; the four input sequences are
    shuffled in place.

    Args:
        dirs_path: Existing directory that receives the two files.
        train_images: Training samples.
        valid_images: Validation samples.
        train_labels: Labels aligned with ``train_images``.
        valid_labels: Labels aligned with ``valid_images``.

    Raises:
        ValueError: If a sample sequence and its labels differ in length.
    """
    TRAIN_HDF5 = dirs_path + '/' + "train.hdf5"
    VAL_HDF5 = dirs_path + '/' + "val.hdf5"
    _shuffle_in_unison(train_images, train_labels)
    _shuffle_in_unison(valid_images, valid_labels)
    datasets = [
        ("train",train_images,train_labels,TRAIN_HDF5),
        ("val",valid_images,valid_labels,VAL_HDF5)]
    for (dType,images,labels,outputPath) in datasets:
        # The context manager closes the file even if a write fails.
        # Passing compression="gzip", compression_opts=9 to create_dataset
        # would produce smaller files at the cost of slower writes.
        with h5py.File(outputPath, "w") as f:
            f.create_dataset("x_"+dType, data=images)
            f.create_dataset("y_"+dType, data=labels)
def read_dataset(dirs):
    """Print the sample and label counts of every HDF5 file below *dirs*.

    Every entry of ``dirs`` is treated as a shard directory.  Each file
    ``<name>.hdf5`` inside it is expected to hold the datasets ``x_<name>``
    and ``y_<name>``; their first-axis lengths are printed.

    Args:
        dirs: Root directory containing the shard directories.
    """
    files = os.listdir(dirs)
    print(files)
    for file in files:
        path = dirs+'/' + file
        for hdf5_name in os.listdir(path):
            path_read = os.path.join(path, hdf5_name)
            # "train.hdf5" -> "train": the dataset names carry the file's stem.
            set_name = hdf5_name.split('.')[0]
            # Open read-only and close the handle once the sizes are read.
            with h5py.File(path_read, "r") as dataset:
                set_x_orig = dataset["x_"+set_name].shape[0]
                set_y_orig = dataset["y_"+set_name].shape[0]
            print(set_x_orig)
            print(set_y_orig)
# Read every image under each label directory and write HDF5 shards.
def _read_label_images(child_path, label):
    """Return the resized ``.jpg`` images in *child_path* and their labels."""
    images = []
    labels = []
    for dir_image in os.listdir(child_path):
        if not dir_image.endswith('.jpg'):
            continue
        img = cv2.imread(os.path.join(child_path, dir_image))
        if img is None:
            # cv2.imread returns None (it does not raise) for files it cannot
            # decode; skip them instead of crashing in the resize.
            continue
        # scipy.misc.imresize no longer exists in current SciPy releases, so
        # resize with OpenCV (bilinear, as before).
        images.append(cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LINEAR))
        labels.append(label)
    return images, labels


def load_dataset(path_name, data_path, classes_per_file=4316):
    """Convert a directory tree of images into several HDF5 shards.

    Each entry of ``path_name`` is one label; its position in the directory
    listing is used as the integer label.  Every label is split into
    train/validation parts on its own, and the accumulated parts are written
    to ``<data_path>/h5_<k>`` after every ``classes_per_file`` labels and
    once more after the last label.  Finally the shard sizes are printed.

    Args:
        path_name: Directory containing one sub-directory of ``.jpg`` files
            per label.
        data_path: Existing output root for the shard directories.
        classes_per_file: Number of labels stored in one shard.

    Raises:
        ValueError: If ``classes_per_file`` is smaller than 1.
    """
    if classes_per_file < 1:
        raise ValueError("classes_per_file must be at least 1")
    train_images = []
    valid_images = []
    train_labels = []
    valid_labels = []
    allpath = os.listdir(path_name)
    nb_classes = len(allpath)
    print("label_num: ",nb_classes)
    for counter, child_dir in enumerate(allpath):
        child_path = os.path.join(path_name, child_dir)
        # Per-label buffers are rebuilt for every label so the raw images are
        # not kept in memory next to their copies in the accumulators.
        images, labels = _read_label_images(child_path, counter)
        if images:
            t_images, v_images, t_labels ,v_labels = split_dataset(np.array(images), labels)
            train_images.extend(t_images)
            train_labels.extend(t_labels)
            valid_images.extend(v_images)
            valid_labels.extend(v_labels)
        else:
            # Nothing to split; splitting an empty set would raise.
            print("no readable .jpg images in: ", child_path)
        if counter%50== 49:
            print( counter+1 , "is read to the memory!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        if (counter % classes_per_file == classes_per_file - 1) or (counter == nb_classes - 1):
            print("train_images num: ", len(train_images), " ", "valid_images num: ",len(valid_images))
            print("start write images and labels data...................................................................")
            # Derive the shard index from the same chunk size as the trigger
            # above; dividing by a different constant yields "h5_-1" and
            # lets later shards overwrite earlier ones.
            num = counter // classes_per_file
            dirs = data_path + "/" + "h5_" + str(num)
            if not os.path.exists(dirs):
                os.makedirs(dirs)
            data2h5(dirs, train_images, valid_images, train_labels ,valid_labels)
            print("File HDF5_%d "%num, " id done!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            # Start the next shard empty.
            train_images = []
            valid_images = []
            train_labels = []
            valid_labels = []
    print("All File HDF5 done!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    read_dataset(data_path)
# List the entries of the training-data directory (one per label).
def read_name_list(path_name):
    """Return the names of all entries directly inside *path_name* as a list.

    Args:
        path_name: Directory to list.

    Returns:
        A new list of entry names, in the order ``os.listdir`` reports them.
    """
    # os.listdir already returns a fresh list; no manual copy loop is needed.
    return os.listdir(path_name)
if __name__ == '__main__':
    # Source tree: passed to load_dataset as the directory of label folders.
    path = "data"
    # Output root that receives the generated HDF5 shard directories.
    data_path = "data_hdf5_half"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    load_dataset(path,data_path)
以上這篇完美解決keras 讀取多個hdf5文件進行訓練的問題就是小編分享給大家的全部內容了,希望能給大家一個參考,也希望大家多多支持腳本之家。
相關(guān)文章
Python面向對象實現一個對象調用另一個對象操作示例
這篇文章主要介紹了Python面向對象實現一個對象調用另一個對象操作,結合實例形式分析了Python對象的定義、初始化、調用等相關操作技巧,需要的朋友可以參考下 2019-04-04
Windows下安裝python2.7及科學計算套裝
這篇文章主要向大家介紹的是在windows系統下安裝python 2.7以及numpy安裝、six安裝、dateutil安裝、pyparsing安裝、matplotlib安裝和scipy安裝的方法,分享給大家,需要的小伙伴可以參考下,相對來說,windows下的安裝還是比較簡單的。 2015-03-03
matplotlib之Font family [‘sans-serif‘] not found的問題解決
本文主要介紹了matplotlib之Font family [‘sans-serif‘] not found的問題解決,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧 2023-03-03

