import tensorflow as tf
from keras import backend
import logging
import numpy as np
import pandas as pd
import random

pd.set_option('display.max_colwidth', -1)

# For reproducibility: set all the seeds, make sure multithreading is off and, if possible, don't use the GPU.
tf.set_random_seed(7)
np.random.seed(7)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
backend.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))

# Synthetic training and test data
def generate_data(hour, minute, sentence=''):
    special = [15, 30]
    suffix = ""
    dictionary = {1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven",
                  8: "eight", 9: "nine", 10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen",
                  14: "fourteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen",
                  20: "twenty", 30: "thirty", 40: "forty", 50: "fifty"}
    result = ""
    if minute == 15:
        result = "quarter past"
    elif minute == 30:
        result = "half past"
    elif minute == 0:
        pass
    else:
        if minute in dictionary:
            result = dictionary[minute] + " minutes past"
        else:
            minute1 = int(str(minute // 10) + "0")
            minute2 = minute % 10
            result = dictionary[minute1] + ' ' + dictionary[minute2] + " minutes past"

    if hour == 0:
        suffix = "mid night"
    elif hour >= 1 and hour <= 11:
        suffix = "morning"
    elif hour == 12:
        suffix = "noon"
    elif hour > 12 and hour <= 16:
        suffix = "after noon"
    elif hour > 16 and hour <= 19:
        suffix = "evening"
    elif hour >= 20 and hour <= 23:  # was "hour > 20", which left hour 20 with no suffix
        suffix = "night"

    save_hour = hour
    if hour > 12:
        hour = hour - 12
    if hour > 0:
        # Introduce some variation in how hours and suffixes are combined, just for randomness
        if hour % 2 == 0:
            result = result + " " + dictionary[hour] + " in the " + suffix
        else:
            result = result + " " + dictionary[hour] + " " + suffix
    else:
        result = result + " " + suffix

    if sentence != '':
        result = sentence.replace('#@#', result)
    return save_hour, minute, result

# Random sentence templates to slot the time components into, forming proper English sentences
sentence = [
    'The murder happened exactly #@#',
    '#@#, was the time on the clock when I entered the house',
    'Time flies its #@# now',
    'Really was it #@# twice in a row?'
]
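# Quick sanity check (an illustrative addition, not part of the original notebook):
# print a few randomly generated phrases so the (hour, minute, text) output format
# of generate_data is visible before the full datasets are built.
for _ in range(3):
    h, m = random.randint(0, 23), random.randint(0, 59)
    print(generate_data(h, m, sentence[random.randint(0, 3)]))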
def train():
    data = []
    i = 0
    while i < 200000:
        hour = random.randint(0, 23)
        minute = random.randint(0, 59)
        sent = random.randint(0, 3)
        hour, minute, result = generate_data(hour, minute, sentence[sent])
        inwords = result
        data.append({"inwords": inwords, "hour": hour, "minute": minute})
        i += 1
    df = pd.DataFrame(data)
    return df

def test():
    data = []
    i = 0
    while i < 20000:
        hour = random.randint(10, 15)
        minute = random.randint(0, 59)
        sent = random.randint(0, 3)
        hour, minute, result = generate_data(hour, minute, sentence[sent])
        inwords = result
        data.append({"inwords": inwords, "hour": hour, "minute": minute})
        i += 1
    df = pd.DataFrame(data)
    return df

train_data_raw = train()
test_data_raw = test()

# import os
# from google.colab import drive
# drive.mount('/content/drive')
# print(os.listdir("/content/drive/My Drive"))

train_data_raw.head()

from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

vocab_size = 5000  # based on words in the entire corpus
max_len = 25       # based on word count in phrases

train_phrases = list(train_data_raw['inwords'].values)
test_phrases = list(test_data_raw['inwords'].values)
train_target = pd.get_dummies(train_data_raw['hour'].values)

# Vocabulary indexing of the train and test phrases; make sure the "filters" parameter
# does not strip punctuation that we intend to keep.
tokenizer = Tokenizer(num_words=vocab_size, lower=True, filters=',?.\n\t')
tokenizer.fit_on_texts(train_phrases + test_phrases)
encoded_train_phrases = tokenizer.texts_to_sequences(train_phrases)
encoded_test_phrases = tokenizer.texts_to_sequences(test_phrases)

# Note the POST padding, as opposed to the default PRE padding
X_train_words = sequence.pad_sequences(encoded_train_phrases, maxlen=max_len, padding='post')
X_test_words = sequence.pad_sequences(encoded_test_phrases, maxlen=max_len, padding='post')

print(X_train_words.shape)
print(X_test_words.shape)
print(train_target.shape)
print('Done tokenizing and indexing phrases based on the vocabulary learned from the entire train and test corpus')

from keras.callbacks import EarlyStopping
from keras.layers import Dense, Input, Embedding, Dropout, CuDNNLSTM, CuDNNGRU, Flatten, TimeDistributed, RepeatVector
from keras.layers import Bidirectional
from keras.models import Model

print("Building layers")
print('starting to stitch and compile model')

# Embedding layer for the text inputs, followed by a bidirectional LSTM and a softmax over the 24 hours
input_words = Input((max_len,))
x_words = Embedding(vocab_size, 300, input_length=max_len)(input_words)
x_words = Bidirectional(CuDNNLSTM(128))(x_words)
x_words = Dropout(0.2)(x_words)
x_words = Dense(32, activation="relu")(x_words)
predictions = Dense(24, activation="softmax")(x_words)

model = Model(inputs=input_words, outputs=predictions)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3, verbose=1)

# Fit the model
nb_epoch = 10
history = model.fit(X_train_words, train_target, epochs=nb_epoch, verbose=1, batch_size=256,
                    callbacks=[early_stop], validation_split=0.2, shuffle=True)

train_loss = np.mean(history.history['loss'])
val_loss = np.mean(history.history['val_loss'])
print('Train loss: %f' % (train_loss * 100))
print('Validation loss: %f' % (val_loss * 100))

pred_test = model.predict(X_test_words, batch_size=128, verbose=0)
print(pred_test.shape)
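# Optional check (added here as an illustration, not part of the original flow):
# overall hour accuracy on the synthetic test set, comparing the argmax of the
# softmax output against the ground-truth hour column.
test_hour_acc = np.mean(np.argmax(pred_test, axis=1) == test_data_raw['hour'].values)
print('Test hour accuracy: %.4f' % test_hour_acc)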
# The argmax column index maps directly to the hour, because get_dummies sorts the 0-23 hour labels
max_pred = np.argmax(pred_test, axis=1).astype(int)

submission = pd.DataFrame({'Inwords': test_data_raw['inwords'], 'Predicted': max_pred, 'Truth': test_data_raw['hour']})
submission = submission[['Inwords', 'Truth', 'Predicted']]
submission.head()

unseen = ["Lets say, we meet three morning tommorrow ?"]
# Do not refit the tokenizer on unseen text: refitting would extend the word index learned
# from the training corpus. Reuse the already-fitted tokenizer; out-of-vocabulary words are simply dropped.
encoded_unseen_phrases = tokenizer.texts_to_sequences(unseen)
X_unseen_words = sequence.pad_sequences(encoded_unseen_phrases, maxlen=max_len, padding='post')
pred_unseen = model.predict(X_unseen_words, batch_size=128, verbose=0)
max_pred_unseen = np.argmax(pred_unseen, axis=1).astype(int)
print(max_pred_unseen)
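# Optional persistence sketch (not in the original notebook), assuming the standard
# Keras model.save and Python pickle APIs: save the trained model and the fitted
# tokenizer so unseen phrases can be scored later without retraining.
import pickle
model.save('time_model.h5')               # hypothetical file name
with open('tokenizer.pkl', 'wb') as f:    # hypothetical file name
    pickle.dump(tokenizer, f)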