要建立適合訓練的管道,需要對資料集做一些轉換
def tokenize_pairs(pt, en):
    """Tokenize both sides of a (pt, en) pair and convert them with to_tensor().

    Args:
        pt: Input handed to ``tokenizers.pt.tokenize``.
        en: Input handed to ``tokenizers.en.tokenize``.

    Returns:
        A ``(pt, en)`` tuple containing the ``to_tensor()`` result of each
        tokenized input.
    """
    # NOTE(review): presumably tokenize() yields ragged (variable-length)
    # results and to_tensor() pads them into a dense tensor -- confirm
    # against the tokenizer implementation.
    pt_dense = tokenizers.pt.tokenize(pt).to_tensor()
    en_dense = tokenizers.en.tokenize(en).to_tensor()
    return pt_dense, en_dense
`to_tensor` 會把 tokenize 後長度不一的結果(RaggedTensor)補零(padding)成固定形狀的一般 tensor,方便後續 TensorFlow 做處理
# Buffer size handed to ds.shuffle() inside make_batches.
BUFFER_SIZE = 20000
# Number of elements grouped per batch by ds.batch() inside make_batches.
BATCH_SIZE = 64
def make_batches(ds):
    """Turn *ds* into a cached, shuffled, batched and tokenized pipeline.

    The stages run in this order: cache, shuffle with ``BUFFER_SIZE``,
    batch with ``BATCH_SIZE``, map ``tokenize_pairs`` over each batch, and
    finally prefetch.

    Args:
        ds: Dataset object exposing ``cache``, ``shuffle``, ``batch``,
            ``map`` and ``prefetch``.

    Returns:
        The dataset produced by the final ``prefetch`` call.
    """
    cached = ds.cache()
    shuffled = cached.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE)
    # Tokenization is applied after batching, so tokenize_pairs receives
    # whole batches rather than single examples.
    tokenized = batched.map(
        tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE
    )
    return tokenized.prefetch(tf.data.AUTOTUNE)
# Run both example sets through the same make_batches pipeline.
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples)
這邊來講一下BUFFER_SIZE跟BATCH_SIZE
import tensorflow as tf
import numpy as np
# One dataset element per integer from 1 to 16.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(1, 17))
# batch(4) groups consecutive elements, in order, into tensors of four.
batch_dataset = dataset.batch(4)
for batch in batch_dataset:
    print(batch)
tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
tf.Tensor([5 6 7 8], shape=(4,), dtype=int32)
tf.Tensor([ 9 10 11 12], shape=(4,), dtype=int32)
tf.Tensor([13 14 15 16], shape=(4,), dtype=int32)
這邊可以看到輸出資料是按照順序被切割成每個tensor有BATCH_SIZE筆data,shuffle則是用來打亂順序的
import tensorflow as tf
import numpy as np
# Shuffle the 16-element dataset defined above with a buffer of 4 elements.
shuffle_dataset = dataset.shuffle(4)
# Elements come out one at a time, so each printed tensor is a scalar.
for item in shuffle_dataset:
    print(item)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(15, shape=(), dtype=int32)
tf.Tensor(11, shape=(), dtype=int32)
tf.Tensor(13, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
shuffle會先把前BUFFER_SIZE筆資料依序放進緩衝區,每次從緩衝區中隨機取出1個item輸出,再從資料集依序補進下一筆資料;所以上例BUFFER_SIZE為4時,第一個輸出一定是1~4其中之一,數字只會在附近的範圍內被打亂
import tensorflow as tf
import numpy as np
# One dataset element per integer from 1 to 16.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(1, 17))
# A buffer of 16 covers the whole dataset, so any element can come out first.
dataset = dataset.shuffle(16)
# Group the shuffled elements into tensors of two.
dataset = dataset.batch(2)
for pair in dataset:
    print(pair)
tf.Tensor([ 1 16], shape=(2,), dtype=int32)
tf.Tensor([13 7], shape=(2,), dtype=int32)
tf.Tensor([ 3 10], shape=(2,), dtype=int32)
tf.Tensor([14 15], shape=(2,), dtype=int32)
tf.Tensor([2 8], shape=(2,), dtype=int32)
tf.Tensor([ 4 12], shape=(2,), dtype=int32)
tf.Tensor([6 5], shape=(2,), dtype=int32)
tf.Tensor([ 9 11], shape=(2,), dtype=int32)
兩者混合使用就會得到每個tensor有BATCH_SIZE個資料,並且資料都是隨機的。若要讓資料被完全均勻地打亂,BUFFER_SIZE必須大於或等於資料集大小(BUFFER_SIZE >= Dataset_size);不過緩衝區越大也越耗記憶體,資料集很大時需要自行取捨
https://blog.csdn.net/QLBFA/article/details/108143449