Credits: Forked from deep-learning-keras-tensorflow by Valerio Maggio
<img src="imgs/RNN-rolled.png"/ width="80px" height="80px">
<img src="imgs/RNN-unrolled.png"/ width="400px" height="400px">
<img src="imgs/LSTM3-chain.png"/ width="60%">
from keras.optimizers import SGD, RMSprop
from keras.preprocessing.text import one_hot, text_to_word_sequence, base_filter
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence
import os
import pickle
import numpy as np
DATA_DIRECTORY = os.path.join(os.path.abspath(os.path.curdir), 'data')
print(DATA_DIRECTORY)
/home/valerio/deep-learning-keras-euroscipy2016/data
male_posts = []
female_posts = []

with open(os.path.join(DATA_DIRECTORY, "male_blog_list.txt"), "rb") as male_file:
    male_posts = pickle.load(male_file)

with open(os.path.join(DATA_DIRECTORY, "female_blog_list.txt"), "rb") as female_file:
    female_posts = pickle.load(female_file)
filtered_male_posts = list(filter(lambda p: len(p) > 0, male_posts))
filtered_female_posts = list(filter(lambda p: len(p) > 0, female_posts))
# text processing - one_hot builds an index of the words
male_one_hot = []
female_one_hot = []
n = 30000
for post in filtered_male_posts:
    try:
        male_one_hot.append(one_hot(post, n, split=" ", filters=base_filter(), lower=True))
    except Exception:
        continue

for post in filtered_female_posts:
    try:
        female_one_hot.append(one_hot(post, n, split=" ", filters=base_filter(), lower=True))
    except Exception:
        continue
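Note that despite its name, one_hot returns a list of integer word indices obtained by hashing (one index per word, in [1, n)), not one-hot vectors, so distinct words can occasionally collide on the same index. A quick check on a made-up sentence (the printed indices are illustrative):

# one_hot hashes each word to an integer index in [1, n); collisions are possible
example = one_hot("the quick brown fox", n, split=" ", filters=base_filter(), lower=True)
print(example)  # four indices, one per word, e.g. [27941, 11245, 302, 9881]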
# 0 for male, 1 for female
concatenate_array_rnn = np.concatenate((np.zeros(len(male_one_hot)),
                                        np.ones(len(female_one_hot))))

from sklearn.model_selection import train_test_split

# keep the order of the posts consistent with the labels above:
# male posts first (label 0), then female posts (label 1)
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    np.concatenate((male_one_hot, female_one_hot)),
    concatenate_array_rnn,
    test_size=0.2)
maxlen = 100
X_train_rnn = sequence.pad_sequences(X_train_rnn, maxlen=maxlen)
X_test_rnn = sequence.pad_sequences(X_test_rnn, maxlen=maxlen)
print('X_train_rnn shape:', X_train_rnn.shape, y_train_rnn.shape)
print('X_test_rnn shape:', X_test_rnn.shape, y_test_rnn.shape)
X_train_rnn shape: (3873, 100) (3873,)
X_test_rnn shape: (969, 100) (969,)
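By default pad_sequences pads with zeros and truncates at the front of each sequence, so only the last maxlen word indices of a long post survive. A toy illustration with hypothetical sequences:

# toy example: pad/truncate two sequences to length 4
print(sequence.pad_sequences([[1, 2], [3, 4, 5, 6, 7]], maxlen=4))
# [[0 0 1 2]
#  [4 5 6 7]]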
max_features = 30000
dimension = 128
output_dimension = 128
model = Sequential()
model.add(Embedding(max_features, dimension))  # map word indices to 128-d dense vectors
model.add(LSTM(output_dimension))              # summarise each sequence into a 128-d state
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))               # binary male/female output
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train_rnn, y_train_rnn, batch_size=32,
nb_epoch=4, validation_data=(X_test_rnn, y_test_rnn))
Train on 3873 samples, validate on 969 samples
Epoch 1/4
3873/3873 [==============================] - 3s - loss: 0.2487 - acc: 0.5378 - val_loss: 0.2506 - val_acc: 0.5191
Epoch 2/4
3873/3873 [==============================] - 3s - loss: 0.2486 - acc: 0.5401 - val_loss: 0.2508 - val_acc: 0.5191
Epoch 3/4
3873/3873 [==============================] - 3s - loss: 0.2484 - acc: 0.5417 - val_loss: 0.2496 - val_acc: 0.5191
Epoch 4/4
3873/3873 [==============================] - 3s - loss: 0.2484 - acc: 0.5399 - val_loss: 0.2502 - val_acc: 0.5191
<keras.callbacks.History at 0x7fa1e96ac4e0>
score, acc = model.evaluate(X_test_rnn, y_test_rnn, batch_size=32)
969/969 [==============================] - 0s
print(score, acc)
0.250189056399 0.519091847357
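The validation accuracy sits near 52% for every epoch, which suggests the network is barely learning; mean squared error is an unusual loss for a sigmoid classifier and yields weak gradients. A minimal sketch of the more conventional setup (same architecture, only the loss and optimizer swapped for the standard Keras options binary_crossentropy and adam):

# sketch: conventional compile settings for binary classification
model = Sequential()
model.add(Embedding(max_features, dimension))
model.add(LSTM(output_dimension))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])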
from sklearn.feature_extraction.text import TfidfVectorizer

# fit the vectorizer once on the combined corpus so that male and female
# posts share the same vocabulary (fitting twice would produce two
# incompatible feature spaces)
vectorizer = TfidfVectorizer(decode_error='ignore', norm='l2', min_df=5)
vectorizer.fit(filtered_male_posts + filtered_female_posts)
tfidf_male = vectorizer.transform(filtered_male_posts)
tfidf_female = vectorizer.transform(filtered_female_posts)

flattened_array_tfidf_male = tfidf_male.toarray()
flattened_array_tfidf_female = tfidf_female.toarray()
y_rnn = np.concatenate((np.zeros(len(flattened_array_tfidf_male)),
                        np.ones(len(flattened_array_tfidf_female))))

X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    np.concatenate((flattened_array_tfidf_male, flattened_array_tfidf_female)),
    y_rnn,
    test_size=0.2)
maxlen = 100

# pad_sequences defaults to dtype='int32', which would truncate the
# real-valued tf-idf weights to zero; keep them as floats
X_train_rnn = sequence.pad_sequences(X_train_rnn, maxlen=maxlen, dtype='float32')
X_test_rnn = sequence.pad_sequences(X_test_rnn, maxlen=maxlen, dtype='float32')
print('X_train_rnn shape:', X_train_rnn.shape, y_train_rnn.shape)
print('X_test_rnn shape:', X_test_rnn.shape, y_test_rnn.shape)
X_train_rnn shape: (4152, 100) (4152,)
X_test_rnn shape: (1038, 100) (1038,)
max_features = 30000

# same architecture as before, now fed the padded tf-idf features
model = Sequential()
model.add(Embedding(max_features, dimension))
model.add(LSTM(output_dimension))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train_rnn, y_train_rnn,
batch_size=32, nb_epoch=4,
validation_data=(X_test_rnn, y_test_rnn))
Train on 4152 samples, validate on 1038 samples
Epoch 1/4
4152/4152 [==============================] - 3s - loss: 0.2502 - acc: 0.4988 - val_loss: 0.2503 - val_acc: 0.4865
Epoch 2/4
4152/4152 [==============================] - 3s - loss: 0.2507 - acc: 0.4843 - val_loss: 0.2500 - val_acc: 0.4865
Epoch 3/4
4152/4152 [==============================] - 3s - loss: 0.2504 - acc: 0.4952 - val_loss: 0.2501 - val_acc: 0.4865
Epoch 4/4
4152/4152 [==============================] - 3s - loss: 0.2506 - acc: 0.4913 - val_loss: 0.2500 - val_acc: 0.5135
<keras.callbacks.History at 0x7fa1f466f278>
score, acc = model.evaluate(X_test_rnn, y_test_rnn, batch_size=32)
1038/1038 [==============================] - 0s
print(score, acc)
0.249981284572 0.513487476145
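Chance-level accuracy is expected here: an Embedding layer looks up rows by integer word index, so real-valued tf-idf weights are not meaningful inputs for it. A minimal sketch (our own variant, not from the original notebook) of a feed-forward network that consumes the fixed-length tf-idf vectors directly:

# sketch: dense classifier over the padded tf-idf features prepared above
mlp = Sequential()
mlp.add(Dense(64, input_dim=maxlen))
mlp.add(Activation('relu'))
mlp.add(Dropout(0.5))
mlp.add(Dense(1))
mlp.add(Activation('sigmoid'))
mlp.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])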
# read all the male text data into one string
male_post = ' '.join(filtered_male_posts)

# build the character set for the male posts
character_set_male = set(male_post)

# build two indices - character -> index and index -> character
char_indices = dict((c, i) for i, c in enumerate(character_set_male))
indices_char = dict((i, c) for i, c in enumerate(character_set_male))
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 1

sentences = []
next_chars = []
for i in range(0, len(male_post) - maxlen, step):
    sentences.append(male_post[i : i + maxlen])
    next_chars.append(male_post[i + maxlen])
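A toy run of the same windowing on a short, made-up string makes the (input window, next character) pairing explicit:

# toy example of the sliding window (maxlen=4, step=1)
text = "hello world"
pairs = [(text[i:i + 4], text[i + 4]) for i in range(0, len(text) - 4)]
print(pairs[:3])  # [('hell', 'o'), ('ello', ' '), ('llo ', 'w')]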
# vectorisation of input: one sample per window
# (len(sentences), not len(male_post), since each window yields one sample)
x_male = np.zeros((len(sentences), maxlen, len(character_set_male)), dtype=np.bool)
y_male = np.zeros((len(sentences), len(character_set_male)), dtype=np.bool)
print(x_male.shape, y_male.shape)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x_male[i, t, char_indices[char]] = 1
    y_male[i, char_indices[next_chars[i]]] = 1
print(x_male.shape, y_male.shape)
(2552456, 20, 152) (2552456, 152)
(2552456, 20, 152) (2552456, 152)
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(character_set_male))))
model.add(Dense(len(character_set_male)))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
Build model...
import random, sys

# helper function to sample an index from a probability array:
# with probability 1 - diversity it returns the argmax, otherwise it
# repeatedly draws an index and accepts it in proportion to its probability
def sample(a, diversity=0.75):
    if random.random() > diversity:
        return np.argmax(a)
    while 1:
        i = random.randint(0, len(a) - 1)
        if a[i] > random.random():
            return i
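This helper is a rejection-style sampler. The canonical Keras text-generation example instead rescales the softmax output with a temperature before sampling; a minimal sketch of that variant (the function name is ours):

# sketch: temperature-based sampling from a softmax distribution
def sample_with_temperature(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature  # epsilon guards against log(0)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)       # renormalise to a distribution
    return np.argmax(np.random.multinomial(1, preds, 1))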
# train the model, output generated text after each iteration
for iteration in range(1, 10):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(x_male, y_male, batch_size=128, nb_epoch=1)

    start_index = random.randint(0, len(male_post) - maxlen - 1)

    for diversity in [0.2, 0.4, 0.6, 0.8]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = male_post[start_index : start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')

        for i in range(400):
            try:
                x = np.zeros((1, maxlen, len(character_set_male)))
                for t, char in enumerate(sentence):
                    x[0, t, char_indices[char]] = 1.

                preds = model.predict(x, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char
            except KeyError:  # guard against characters missing from the index
                continue

        print(sentence)  # the trailing window; `generated` holds the full sample
        print()
--------------------------------------------------
Iteration 1
Epoch 1/1
2552456/2552456 [==============================] - 226s - loss: 1.8022

----- diversity: 0.2
----- Generating with seed: "p from the lack of "
sense of the search

----- diversity: 0.4
----- Generating with seed: "p from the lack of "
through that possibl

----- diversity: 0.6
----- Generating with seed: "p from the lack of "
. This is a " by p

----- diversity: 0.8
----- Generating with seed: "p from the lack of "
d he latermal ta we

--------------------------------------------------
Iteration 2
Epoch 1/1
2552456/2552456 [==============================] - 228s - loss: 1.7312

----- diversity: 0.2
----- Generating with seed: "s Last Dance" with t"
screening on the st

----- diversity: 0.4
----- Generating with seed: "s Last Dance" with t"
r song think of the

----- diversity: 0.6
----- Generating with seed: "s Last Dance" with t"
. I'm akin computer

----- diversity: 0.8
----- Generating with seed: "s Last Dance" with t"
played that comment

--------------------------------------------------
Iteration 3
Epoch 1/1
2552456/2552456 [==============================] - 229s - loss: 1.8693

----- diversity: 0.2
----- Generating with seed: ", as maybe someone w"
the ssone the so the

----- diversity: 0.4
----- Generating with seed: ", as maybe someone w"
the sasd nouts and t

----- diversity: 0.6
----- Generating with seed: ", as maybe someone w"
p hin I had at f¿ to

----- diversity: 0.8
----- Generating with seed: ", as maybe someone w"
oge rely bluy leanda

--------------------------------------------------
Iteration 4
Epoch 1/1
2552456/2552456 [==============================] - 228s - loss: 1.9135

----- diversity: 0.2
----- Generating with seed: "o the package :(. Ah"
suadedbe teacher th

----- diversity: 0.4
----- Generating with seed: "o the package :(. Ah"
e a searingly the id

----- diversity: 0.6
----- Generating with seed: "o the package :(. Ah"
propost the bure so

----- diversity: 0.8
----- Generating with seed: "o the package :(. Ah"
ing.Lever fan. By in

--------------------------------------------------
Iteration 5
Epoch 1/1
2552456/2552456 [==============================] - 229s - loss: 4.5892

----- diversity: 0.2
----- Generating with seed: "ot as long as my fri"
atde getu th> QQ.“]

----- diversity: 0.4
----- Generating with seed: "ot as long as my fri"
tQ t[we QaaefYhere Q

----- diversity: 0.6
----- Generating with seed: "ot as long as my fri"
ew[”*ing”e[ t[w that

----- diversity: 0.8
----- Generating with seed: "ot as long as my fri"
me]sQoonQ“]e” ti nw

--------------------------------------------------
Iteration 6
Epoch 1/1
2552456/2552456 [==============================] - 229s - loss: 6.7174

----- diversity: 0.2
----- Generating with seed: "use I'm pretty damn "
me g 'o a a a a

----- diversity: 0.4
----- Generating with seed: "use I'm pretty damn "
a o theT a o a

----- diversity: 0.6
----- Generating with seed: "use I'm pretty damn "
n . thot auupe to

----- diversity: 0.8
----- Generating with seed: "use I'm pretty damn "
tomalek ho tt Ion i

--------------------------------------------------
Iteration 7
Epoch 1/1
2552456/2552456 [==============================] - 227s - loss: 6.9138

----- diversity: 0.2
----- Generating with seed: "ats all got along be"
thrtg t ia thv i c

----- diversity: 0.4
----- Generating with seed: "ats all got along be"
th wtot.. t to gt?

----- diversity: 0.6
----- Generating with seed: "ats all got along be"
ed dthwnn,is a ment

----- diversity: 0.8
----- Generating with seed: "ats all got along be"
t incow . wmiyit

--------------------------------------------------
Iteration 8
Epoch 1/1
2552456/2552456 [==============================] - 228s - loss: 11.0629

----- diversity: 0.2
----- Generating with seed: "oot of my sleeping b"
m g te>t e s t anab

----- diversity: 0.4
----- Generating with seed: "oot of my sleeping b"
dttoe s s“snge es s

----- diversity: 0.6
----- Generating with seed: "oot of my sleeping b"
tut hou wen a onap

----- diversity: 0.8
----- Generating with seed: "oot of my sleeping b"
evtyr tt e io on tok

--------------------------------------------------
Iteration 9
Epoch 1/1
2552456/2552456 [==============================] - 228s - loss: 8.7874

----- diversity: 0.2
----- Generating with seed: " I’ve always looked "
ea e ton ann n ffee

----- diversity: 0.4
----- Generating with seed: " I’ve always looked "
o tire n a anV sia a

----- diversity: 0.6
----- Generating with seed: " I’ve always looked "
r i jooe Vag o en

----- diversity: 0.8
----- Generating with seed: " I’ve always looked "
ao at ge ena oro o
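The loss climbing from roughly 1.8 to over 11 after the fourth iteration is a classic sign of the optimizer diverging at this learning rate. A hedged tweak worth trying (the lr and clipnorm values are illustrative, not tuned on this data): lower the RMSprop learning rate and clip gradient norms, then recompile before continuing training.

# sketch: a more stable optimizer configuration for the same model
optimizer = RMSprop(lr=0.002, clipnorm=1.0)  # illustrative values
model.compile(loss='categorical_crossentropy', optimizer=optimizer)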