tensorflow 變長序列存儲實例

更新時間：2020年01月20日 10:36:35 作者：戰(zhàn)場由我一人主宰

今天小編就為大家分享一篇tensorflow 變長序列存儲實例，具有很好的參考價值，希望對大家有所幫助。一起跟隨小編過來看看吧

問題

問題是這樣的，要把一個數(shù)組存到tfrecord中，然后讀取

a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])

圖片我都存儲了，這個不還是小意思，一頓操作

import tensorflow as tf
import numpy as np

def _int64_feature(value):
 if not isinstance(value,list):
 value = [value]
 return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# Write an array to TFrecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')

for i in range(a.shape[0]):
 feature = {'i' : _int64_feature(i), 
  'data': _int64_feature(a[i])}

 # Create an example protocol buffer
 example = tf.train.Example(features=tf.train.Features(feature=feature))

 # Serialize to string and write on the file
 writer.write(example.SerializeToString())

writer.close()


# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'i':tf.FixedLenFeature([],tf.int64),
   'data':tf.FixedLenFeature([],tf.int64)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return parsed_features['i'], parsed_features['data']

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
 print(sess.run([i, data]))
 print(sess.run([i, data]))
 print(sess.run([i, data]))

報了奇怪的錯誤，Name: <unknown>, Key: data, Index: 0. Number of int64 values != expected. Values size: 6 but output shape: [] 這意思是我數(shù)據(jù)長度為6，但是讀出來的是[]，這到底是哪里錯了，我先把讀取的代碼注釋掉，看看tfreocrd有沒有寫成功，發(fā)現(xiàn)寫成功了，這就表明是讀取的問題，我懷疑是因為每次寫入的長度是變化的原因，但是又有覺得不是，因為圖片的尺寸都是不同的，我還是可以讀取的，百思不得其解的時候我發(fā)現(xiàn)存儲圖片的時候是img.tobytes(),我把一個數(shù)組轉(zhuǎn)換成了bytes，而且用的也是bytes存儲，是不是tensorflow會把這個bytes當(dāng)成一個元素，雖然每個圖片的size不同，但是tobytes后tensorflow都會當(dāng)成一個元素，然后讀取的時候再根據(jù)(height,width,channel)來解析成圖片。

我來試試不存為int64，而是存為bytes。又是一頓厲害的操作

數(shù)據(jù)轉(zhuǎn)為bytes

# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np

def _byte_feature(value):
 return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
 if not isinstance(value,list):
 value = [value]
 return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
# Write an array to TFrecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')

for i in range(a.shape[0]): # i = 0 ~ 4
 feature = {'len' : _int64_feature(len(a[i])), # 將無意義的i改成len，為了后面還原
  'data': _byte_feature(np.array(a[i]).tobytes())} # 我也不知道為什么a[i]是list（后面就知道了），要存bytes需要numpy一下

 # Create an example protocol buffer
 example = tf.train.Example(features=tf.train.Features(feature=feature))

 # Serialize to string and write on the file
 writer.write(example.SerializeToString())

writer.close()

#
# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'len':tf.FixedLenFeature([],tf.int64),
   'data':tf.FixedLenFeature([],tf.string)} # 改成string
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return parsed_features['len'], parsed_features['data']

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
 print(sess.run([i, data]))
 print(sess.run([i, data]))
 print(sess.run([i, data]))


"""
[array([6], dtype=int64), array([b'\x00\x00\x00\x006\x00\x00\x00[\x00\x00\x00\x99\x00\x00\x00\xb1\x00\x00\x00\x01\x00\x00\x00'],
 dtype=object)]
[array([5], dtype=int64), array([b'\x00\x00\x00\x002\x00\x00\x00Y\x00\x00\x00\x93\x00\x00\x00\xc4\x00\x00\x00'],
 dtype=object)]
[array([4], dtype=int64), array([b'\x00\x00\x00\x00&\x00\x00\x00O\x00\x00\x00\x9d\x00\x00\x00'],
 dtype=object)]
"""

bytes數(shù)據(jù)解碼

如愿的輸出來了，但是這個bytes我該如何解碼呢

方法一，我們自己解析

 a,b= sess.run([i,data])
 c = np.frombuffer(b[0],dtype=np.int,count=a[0])

方法二使用tensorflow的解析函數(shù)

def _parse_function(example_proto):
 keys_to_features = {'len':tf.FixedLenFeature([],tf.int64),
   'data':tf.FixedLenFeature([],tf.string)} # 改成string
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 dat = tf.decode_raw(parsed_features['data'],tf.int64) # 用的是這個解析函數(shù)，我們使用int64的格式存儲的，解析的時候也是轉(zhuǎn)換為int64
 return parsed_features['len'], dat
"""
[array([6]), array([[ 0, 54, 91, 153, 177, 1]])]
[array([5]), array([[ 0, 50, 89, 147, 196]])]
[array([4]), array([[ 0, 38, 79, 157]])]
"""

可以看到是二維數(shù)組，這是因為我們使用的是batch輸出，雖然我們的bathc_size=1，但是還是會以二維list的格式輸出。我手賤再來修改點東西，

def _parse_function(example_proto):
 keys_to_features = {'len':tf.FixedLenFeature([1],tf.int64),
   'data':tf.FixedLenFeature([1],tf.string)} 
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 dat = tf.decode_raw(parsed_features['data'],tf.int64)
 return parsed_features['len'], dat

"""
[array([[6]]), array([[[ 0, 54, 91, 153, 177, 1]]])]
[array([[5]]), array([[[ 0, 50, 89, 147, 196]]])]
[array([[4]]), array([[[ 0, 38, 79, 157]]])]
"""

呦呵，又變成3維的了，讓他報個錯試試

def _parse_function(example_proto):
 keys_to_features = {'len':tf.FixedLenFeature([2],tf.int64), # 1 修改為 2
   'data':tf.FixedLenFeature([1],tf.string)} # 改成string
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return parsed_features['len'], parsed_features['data']

"""
InvalidArgumentError: Key: len. Can't parse serialized Example.
 [[Node: ParseSingleExample/ParseSingleExample = ParseSingleExample[Tdense=[DT_STRING, DT_INT64], dense_keys=["data", "len"], dense_shapes=[[1], [2]], num_sparse=0, sparse_keys=[], sparse_types=[]](arg0, ParseSingleExample/Const, ParseSingleExample/Const_1)]]
 [[Node: IteratorGetNext_22 = IteratorGetNext[output_shapes=[[?,2], [?,1]], output_types=[DT_INT64, DT_STRING], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator_22)]]
"""

可以看到dense_keys=["data", "len"], dense_shapes=[[1], [2]],，tf.FixedLenFeature是讀取固定長度的數(shù)據(jù)，我猜測[]的意思就是讀取全部數(shù)據(jù)，[1]就是讀取一個數(shù)據(jù)，每個數(shù)據(jù)可能包含多個數(shù)據(jù)，形如[[1，2],[3，3，4],[2]....]，哈哈這都是我瞎猜的，做我女朋友好不好。

tensorflow 變長數(shù)組存儲

反正是可以讀取了。但是如果是自己定義的變長數(shù)組，每次都要自己解析，這樣很麻煩（我瞎遍的），所以tensorflow就定義了變長數(shù)組的解析方法tf.VarLenFeature，我們就不需要把邊長數(shù)組變?yōu)閎ytes再解析了，又是一頓操作

import tensorflow as tf
import numpy as np

def _int64_feature(value):
 if not isinstance(value,list):
 value = [value]
 return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# Write an array to TFrecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')

for i in range(a.shape[0]): # i = 0 ~ 4
 feature = {'i' : _int64_feature(i), 
  'data': _int64_feature(a[i])}

 # Create an example protocol buffer
 example = tf.train.Example(features=tf.train.Features(feature=feature))

 # Serialize to string and write on the file
 writer.write(example.SerializeToString())

writer.close()


# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'i':tf.FixedLenFeature([],tf.int64),
   'data':tf.VarLenFeature(tf.int64)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return parsed_features['i'], tf.sparse_tensor_to_dense(parsed_features['data'])

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
 print(sess.run([i, data]))
 print(sess.run([i, data]))
 print(sess.run([i, data]))

"""
[array([0], dtype=int64), array([[ 0, 54, 91, 153, 177, 1]], dtype=int64)]
[array([1], dtype=int64), array([[ 0, 50, 89, 147, 196]], dtype=int64)]
[array([2], dtype=int64), array([[ 0, 38, 79, 157]], dtype=int64)]
"""

batch輸出

輸出還是數(shù)組，哈哈哈。再來一波操作

dataset = dataset.batch(2)
"""
Cannot batch tensors with different shapes in component 1. First element had shape [6] and element 1 had shape [5].
"""

這是因為一個batch中數(shù)據(jù)的shape必須是一致的，第一個元素長度為6，第二個元素長度為5，就會報錯。辦法就是補成一樣的長度，在這之前先測試點別的

a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]])


for i in range(a.shape[0]):
 print(type(a[i]))

"""
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
"""

可以發(fā)現(xiàn)長度不一的array每一個數(shù)據(jù)是list（一開始我以為是object）。然后補齊

a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196,0],
  [0, 38, 79, 157,0,0],
  [0, 49, 89, 147, 177,0],
  [0, 32, 73, 145,0,0]])


for i in range(a.shape[0]):
 print(type(a[i]))

"""
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
"""

返回的是numpy。為什么要做這件事呢？

def _int64_feature(value):
 if not isinstance(value,list):
 value = [value]
 return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

tensorflow要求我們輸入的是list或者直接是numpy.ndarry，如果是list中包含numpy.ndarry [numpy.ndarry]就會報錯。上面的那個數(shù)組時邊長的，返回的時list，沒有什么錯誤，我們補齊看看

a = np.array([[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196,0],
  [0, 38, 79, 157,0,0],
  [0, 49, 89, 147, 177,0],
  [0, 32, 73, 145,0,0]])

"""
TypeError: only size-1 arrays can be converted to Python scalars
"""

這就是因為返回的不是list，而是numpy.ndarry,而_int64_feature函數(shù)中先判斷numpy.ndarry不是list，所以轉(zhuǎn)成了[numpy.ndarry]就報錯了?？梢宰鲂┬薷?，一種方法是將numpy.ndarry轉(zhuǎn)為list

for i in range(a.shape[0]): # i = 0 ~ 4
 feature = {'i' : _int64_feature(i), 
  'data': _int64_feature(a[i].tolist())}

這樣補齊了我們就可以修改batch的值了

dataset = dataset.batch(2)

"""
[array([0, 2], dtype=int64), array([[ 0, 54, 91, 153, 177, 1],
 [ 0, 38, 79, 157, 0, 0]], dtype=int64)]
[array([1, 3], dtype=int64), array([[ 0, 50, 89, 147, 196, 0],
 [ 0, 49, 89, 147, 177, 0]], dtype=int64)]
[array([4, 0], dtype=int64), array([[ 0, 32, 73, 145, 0, 0],
 [ 0, 54, 91, 153, 177, 1]], dtype=int64)]
"""

當(dāng)然tensorflow不會讓我自己補齊，已經(jīng)提供了補齊函數(shù)padded_batch，

# -*- coding: utf-8 -*-

import tensorflow as tf

def _int64_feature(value):
 if not isinstance(value,list):
 value = [value]
 return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

a = [[0, 54, 91, 153, 177,1],
  [0, 50, 89, 147, 196],
  [0, 38, 79, 157],
  [0, 49, 89, 147, 177],
  [0, 32, 73, 145]]

writer = tf.python_io.TFRecordWriter('file')

for v in a: # i = 0 ~ 4
 feature = {'data': _int64_feature(v)}

 # Create an example protocol buffer
 example = tf.train.Example(features=tf.train.Features(feature=feature))

 # Serialize to string and write on the file
 writer.write(example.SerializeToString())

writer.close()


# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'data':tf.VarLenFeature(tf.int64)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 return tf.sparse_tensor_to_dense( parsed_features['data'])

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.padded_batch(2,padded_shapes=([None]))
iterator = dataset.make_one_shot_iterator()
data = iterator.get_next()
with tf.Session() as sess:
 print(sess.run([data]))
 print(sess.run([data]))
 print(sess.run([data]))


"""
[array([[ 0, 54, 91, 153, 177, 1],
 [ 0, 50, 89, 147, 196, 0]])]
[array([[ 0, 38, 79, 157, 0],
 [ 0, 49, 89, 147, 177]])]
[array([[ 0, 32, 73, 145, 0, 0],
 [ 0, 54, 91, 153, 177, 1]])]
"""

可以看到的確是自動補齊了。

圖片batch

直接來測試一下圖片數(shù)據(jù)

# -*- coding: utf-8 -*-

import tensorflow as tf
import matplotlib.pyplot as plt
def _byte_feature(value):
 return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

files = tf.gfile.Glob('*.jpeg')
writer = tf.python_io.TFRecordWriter('file')
for file in files:

 with tf.gfile.FastGFile(file,'rb') as f:
 img_buff = f.read()
 feature = {'img': _byte_feature(tf.compat.as_bytes(img_buff))}
 example = tf.train.Example(features=tf.train.Features(feature=feature))
 writer.write(example.SerializeToString())
writer.close()


filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {'img':tf.FixedLenFeature([], tf.string)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 image = tf.image.decode_jpeg(parsed_features['img'])
 return image

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.batch(2)
iterator = dataset.make_one_shot_iterator()
image = iterator.get_next()

with tf.Session() as sess:
 img = sess.run([image])
 print(len(img))
 print(img[0].shape)
 plt.imshow(img[0][0])

"""
Cannot batch tensors with different shapes in component 0. First element had shape [440,440,3] and element 1 had shape [415,438,3].
"""

看到了沒有，一個batch中圖片的尺寸不同，就不可以batch了，我們必須要將一個batch的圖片resize成相同的代大小。

def _parse_function(example_proto):
 keys_to_features = {'img':tf.FixedLenFeature([], tf.string)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 image = tf.image.decode_jpeg(parsed_features['img'])
 image = tf.image.convert_image_dtype(image,tf.float32)# 直接resize，會將uint8轉(zhuǎn)為float類型，但是plt.imshow只能顯示uint8或者0-1之間float類型，這個函數(shù)就是將uint8轉(zhuǎn)為0-1之間的float類型，相當(dāng)于除以255.0
 image = tf.image.resize_images(image,(224,224))
 return image

但是有時候我們希望輸入圖片尺寸是不一樣的，不需要reize，這樣只能將batch_size=1。一個batch中的圖片shape必須是一樣的，我們可以這樣折中訓(xùn)練，使用tensorflow提供的動態(tài)填充接口，將一個batch中的圖片填充為相同的shape。

dataset = dataset.padded_batch(2,padded_shapes=([None,None,3]))

如果我們想要將圖片的名稱作為標簽保存下來要怎么做呢？

# -*- coding: utf-8 -*-

import tensorflow as tf
import matplotlib.pyplot as plt
import os

out_charset="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

def _byte_feature(value):
 return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(values):
 if not isinstance(values,list):
 values = [values]
 return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

files = tf.gfile.Glob('*.jpg')
writer = tf.python_io.TFRecordWriter('file')
for file in files:
 with tf.gfile.FastGFile(file,'rb') as f:
 img_buff = f.read()
 filename = os.path.basename(file).split('.')[0]
 label = list(map(lambda x:out_charset.index(x),filename))
 feature = {'label':_int64_feature(label),
  'filename':_byte_feature(tf.compat.as_bytes(filename)),
  'img': _byte_feature(tf.compat.as_bytes(img_buff))}
 example = tf.train.Example(features=tf.train.Features(feature=feature))
 writer.write(example.SerializeToString())
writer.close()


filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
 keys_to_features = {
  'label':tf.VarLenFeature(tf.int64),
  'filename':tf.FixedLenFeature([],tf.string),
  'img':tf.FixedLenFeature([], tf.string)}
 parsed_features = tf.parse_single_example(example_proto, keys_to_features)
 label = tf.sparse_tensor_to_dense(parsed_features['label'])
 filename = parsed_features['filename']
 image = tf.image.decode_jpeg(parsed_features['img'])
 return image,label,filename

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() 
dataset = dataset.padded_batch(3,padded_shapes=([None,None,3],[None],[]))
#因為返回有三個，所以每一個都要有padded_shapes,但是解碼后的image和label都是變長的
#所以需要pad None,而filename沒有解碼，返回來是byte類型的，只有一個值，所以不需要pad
iterator = dataset.make_one_shot_iterator()
image,label,filename = iterator.get_next()

with tf.Session() as sess:
 print(label.eval())

瞎試

如果寫入的數(shù)據(jù)是一個list會是怎樣呢

a = np.arange(16).reshape(2,4,2)

"""
TypeError: [0, 1] has type list, but expected one of: int, long
"""

不過想想也是，tf.train.Feature(int64_list=tf.train.Int64List(value=value))這個函數(shù)就是存儲數(shù)據(jù)類型為int64的list的。但是如果我們要存儲詞向量該怎么辦呢？例如一句話是一個樣本s1='我愛你',假如使用one-hot編碼，我=[0,0,1],愛=[0,1,0],你=[1,0,0],s1=[[0,0,1],[0,1,0],[1,0,0]]。這一個樣本該怎么存儲呢？

以上這篇tensorflow 變長序列存儲實例就是小編分享給大家的全部內(nèi)容了，希望能給大家一個參考，也希望大家多多支持腳本之家。

您可能感興趣的文章: