# Load the dataset. The context manager closes the file handle as soon as
# the text has been read instead of leaving it open for the whole run.
with open('./irish-lyrics-eof.txt') as lyrics_file:
    data = lyrics_file.read()

# Lowercase and split the text into one entry per line
corpus = data.lower().split("\n")

# Initialize the Tokenizer class
tokenizer = Tokenizer()

# Generate the word index dictionary. This must run before `word_index` is
# read and before `texts_to_sequences` is called; otherwise the vocabulary
# is empty and every line tokenizes to an empty list.
tokenizer.fit_on_texts(corpus)

# Define the total words. You add 1 for the index `0` which is just the padding token.
total_words = len(tokenizer.word_index) + 1

# Initialize the sequences list
input_sequences = []

# Loop over every line
for line in corpus:
    # Tokenize the current line
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Generate every prefix of the line that has at least two tokens, so
    # each subphrase holds one or more input tokens plus a final label token.
    for i in range(1, len(token_list)):
        # Generate the subphrase
        n_gram_sequence = token_list[:i+1]

        # Append the subphrase to the sequences list
        input_sequences.append(n_gram_sequence)

# Get the length of the longest subphrase
max_sequence_len = max(len(x) for x in input_sequences)

# Pad all sequences on the left so the label is always the last column
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create inputs and label by splitting the last token in the subphrases
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

# Convert the label into one-hot arrays
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
# Hyperparameters
embedding_dim = 100
lstm_units = 150
learning_rate = 0.01

# Build the model. The recurrent layer collapses the embedded sequence into
# a single vector per sample, so the softmax output has one row per sample
# and lines up with the one-hot labels in `ys`.
# NOTE(review): a plain LSTM is used because only `lstm_units` is defined
# here; wrap it in `tf.keras.layers.Bidirectional(...)` if that was intended.
model = Sequential([
    Embedding(total_words, embedding_dim, input_length=max_sequence_len-1),
    tf.keras.layers.LSTM(lstm_units),
    Dense(total_words, activation='softmax'),
])

# Use categorical crossentropy because this is a multi-class problem.
# NOTE(review): Adam is assumed as the optimizer for `learning_rate` -- confirm.
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=['accuracy'],
)

epochs = 100

# Train the model
history = model.fit(xs, ys, epochs=epochs)
# Define seed text
seed_text = "good morning"

# Define total words to predict
next_words = 20

# Loop until desired length is reached
for _ in range(next_words):
    # Convert the seed text to a token sequence
    token_list = tokenizer.texts_to_sequences([seed_text])[0]

    # Pad the sequence to the input length the model was built with
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Feed to the model and get the probabilities for each index
    probabilities = model.predict(token_list)

    # Get the index with the highest probability
    predicted = np.argmax(probabilities, axis=-1)[0]

    # Ignore if index is 0 because that is just the padding.
    if predicted != 0:
        # Look up the word associated with the index.
        output_word = tokenizer.index_word[predicted]

        # Combine with the seed text. This stays inside the `if` so a
        # padding prediction never appends a stale or undefined word.
        seed_text += " " + output_word

# Print the result
print(seed_text)
# Example: with the seed text "good morning", generating 20 words produces:
# good morning of the day before the last dim weeping and the song they sang love love love love me he love