I rewrote the dataset-generation part of the code with the Keras tokenizer; feel free to use it as a reference:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras

# Load the Portuguese-to-English TED talks translation dataset.
examples, info = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                           as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

# Fit one word-level tokenizer per language on the training sentences.
# filters='' keeps punctuation tokens; the raw tensors are bytes, so decode
# them to str before splitting on spaces.
en_tokenizer = keras.preprocessing.text.Tokenizer(num_words=None, filters='',
                                                  split=' ')
en_tokenizer.fit_on_texts(en.numpy().decode('utf-8').split(' ')
                          for pt, en in train_examples)
pt_tokenizer = keras.preprocessing.text.Tokenizer(num_words=None, filters='',
                                                  split=' ')
pt_tokenizer.fit_on_texts(pt.numpy().decode('utf-8').split(' ')
                          for pt, en in train_examples)
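# (My own addition, not from the original code: a quick check of the vocabulary
# sizes, which the embedding layers will need later; +1 because index 0 is
# reserved for padding.)
pt_vocab_size = len(pt_tokenizer.word_index) + 1
en_vocab_size = len(en_tokenizer.word_index) + 1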
buffer_size = 20000
batch_size = 64
max_length = 40
def encode_to_id(pt_sentence, en_sentence):
    # Split each decoded sentence into words and map every word to its id.
    # texts_to_sequences returns one [id] list per word, so flatten the result
    # into a 1-D int64 tensor.
    pt_sentence = pt_tokenizer.texts_to_sequences(pt_sentence.numpy().decode('utf-8').split(' '))
    pt_sentence = tf.convert_to_tensor(pt_sentence, dtype=tf.int64)
    pt_sentence = tf.reshape(pt_sentence, shape=(len(pt_sentence),))
    en_sentence = en_tokenizer.texts_to_sequences(en_sentence.numpy().decode('utf-8').split(' '))
    en_sentence = tf.convert_to_tensor(en_sentence, dtype=tf.int64)
    en_sentence = tf.reshape(en_sentence, shape=(len(en_sentence),))
    return pt_sentence, en_sentence
def filter_by_maxlen(pt, en):
    # Keep only pairs where both sentences have at most max_length tokens.
    return tf.logical_and(tf.size(pt) <= max_length, tf.size(en) <= max_length)

def tf_encode_to_id(pt_sentence, en_sentence):
    # Wrap the eager encode function so it can be used inside Dataset.map.
    return tf.py_function(encode_to_id, [pt_sentence, en_sentence], [tf.int64, tf.int64])
train_dataset = train_examples.map(tf_encode_to_id)
train_dataset = train_dataset.filter(filter_by_maxlen)
train_dataset = train_dataset.shuffle(buffer_size).padded_batch(batch_size, padded_shapes=([-1], [-1]))
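
To make sure the pipeline works end to end, you can pull one batch and look at its shape; this last bit is just a sanity check I added, not part of the pipeline itself:

# Take a single padded batch; both dimensions of the second axis are <= max_length.
for pt_batch, en_batch in train_dataset.take(1):
    print(pt_batch.shape, en_batch.shape)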