#!/usr/bin/env python
# coding: utf-8

# # Kaggle: Titanic: Machine Learning from Disaster  
# ### with EarlyStopping
# https://www.kaggle.com/c/titanic

# In[1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
get_ipython().run_line_magic('matplotlib', 'inline')

# Reproducibility: seed every random source used below (Python hashing, NumPy,
# `random`, TensorFlow) and restrict TensorFlow to one thread per op pool.
import tensorflow as tf
import random as rn
import os
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
# it here presumably only affects child processes -- confirm, or export it
# before launching the kernel.
os.environ['PYTHONHASHSEED'] = '0'
random_n = 123  # single seed shared by NumPy, `random`, TensorFlow and the Keras layers
np.random.seed(random_n)
rn.seed(random_n)
# One intra-op and one inter-op thread for the TensorFlow session.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
from keras import backend as K
tf.set_random_seed(random_n)
# Give Keras a session on the default graph that uses the config above.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Load the competition CSVs from the working directory; the first column
# becomes the index (it is written out as PassengerId in the submission).
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)


# In[2]:


# Preview the first rows of the training frame.
train.head()


# ### Drop Survived and Ticket, then combine train with test

# In[3]:


# Remove the label and the ticket column, then stack train on top of test so
# the preprocessing below is applied to both sets at once.
train_tmp = train.drop(columns=['Survived', 'Ticket'])
test_tmp = test.drop(columns=['Ticket'])
df = pd.concat([train_tmp, test_tmp], axis=0)
df.info()


# ### Name --> Title --> Number

# In[4]:


# Name to Title: capture the run of letters that follows a space and is
# terminated by a period (followed by at least one more character).
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape.
# expand=False returns a Series for the single capture group, which is what
# assign() expects for one new column.
df = df.assign(Title=df.Name.str.extract(r' ([A-Za-z]+)\..', expand=False))
title_list = df.Title.unique()
print(title_list)


# In[5]:


# Title to Number: each distinct title gets the integer of its position in
# order of first appearance (0 .. number of titles - 1).
unique_titles = df.Title.unique()
title_codes = np.arange(len(unique_titles))
df.Title = df.Title.replace(unique_titles, title_codes)

# The raw Name column is no longer needed once the title is encoded.
df = df.drop(columns=['Name'])
df.head()


# ### Sex --> male:0, female:1

# In[6]:


# Encode Sex as an integer flag: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].replace(sex_codes)


# ### Cabin --> Number: nan:0, C:1, E:2, G:3, D:4, A:5, B:6, F:7, T:8

# In[7]:


# Keep only the first character of the cabin string (NaN stays NaN).
df = df.assign(Cabin=df.Cabin.str[0])
cabin_list = df.Cabin.unique()

# Map every distinct value, NaN included, to its position in cabin_list.
cabin_codes = np.arange(len(cabin_list))
df.Cabin = df.Cabin.replace(cabin_list, cabin_codes)

print(cabin_list)
print(df.Cabin.unique())


# ### Embarked --> S:0, C:1, Q:2 (NaN is kept and imputed later)

# In[8]:


# List the distinct Embarked values before encoding them (NaN is listed too
# if any value is missing).
df.Embarked.unique()


# In[9]:


# Encode the port letters as integers; values not in the mapping (NaN) are
# left untouched.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
df['Embarked'] = df['Embarked'].replace(embarked_codes)


# ## zscore or normalization:   
# * Age: including NaN
# * Fare: including NaN  
#   
# Z = (x - x.mean) / x.std  
# N = (x - x.min) / (x.max - x.min) 
#   
# sklearn.preprocessing.MinMaxScaler raises an error on null data, so the scaling is done by hand below.

# In[10]:


# Normalize Function
def normalize(df_col):
    """Min-max scale a column: (x - min) / (max - min).

    Args:
        df_col: Numeric pandas Series (or any object offering ``min``/``max``
            and element-wise arithmetic).

    Returns:
        A new object of the same shape holding the scaled values.
    """
    lowest = df_col.min()
    value_range = df_col.max() - lowest
    return (df_col - lowest) / value_range


# In[11]:


# Standardization(zscore)
def zscore(df_col):
    """Standardise a column: (x - mean) / std.

    Args:
        df_col: Numeric pandas Series (or any object offering ``mean``/``std``
            and element-wise arithmetic).

    Returns:
        A new object of the same shape holding the standardised values.
    """
    centered = df_col - df_col.mean()
    return centered / df_col.std()


# In[12]:


# Standardise the numeric features column by column. Title is the integer
# code produced earlier. normalize() is the min-max alternative; it is not
# applied here.
for column in ['Age', 'Fare', 'SibSp', 'Parch', 'Title']:
    df[column] = zscore(df[column])

df.describe()


# ## Split the Data into  Null-data and Notnull-data
# 
# Make a Copy of df: df0 = df.copy()  
# * Age
# * Embarked
# * Fare
# 

# In[13]:


# Drop Cabin if the result gets better
#df = df.drop(['Cabin'], axis=1)

# Keep a full copy that still contains the rows with missing values:
# fill_data() later writes the imputed values into df0, while df itself is
# reduced to the complete rows used for training the imputation models.
df0 = df.copy()
df0.info()


# In[14]:


# Peel off the rows with a missing Age, Embarked or Fare, one column at a
# time. Each *_null frame is taken from what is left after the previous
# filter, so df ends up holding only rows where all three are present.
age_missing = df['Age'].isnull()
Age_null = df.loc[age_missing]
df = df.loc[~age_missing]

embarked_missing = df['Embarked'].isnull()
Embarked_null = df.loc[embarked_missing]
df = df.loc[~embarked_missing]

fare_missing = df['Fare'].isnull()
Fare_null = df.loc[fare_missing]
df = df.loc[~fare_missing]


# ### Get Notnull Data: df.shape = (1043, 9)

# In[15]:


# Sanity check: df should now hold only rows without a missing Age, Embarked
# or Fare.
print(df.shape)
df.info()


# ## Model to fill NaN in Fare, Embarked, Age

# In[16]:


from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout, BatchNormalization
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Seeded weight initializer shared by every Dense layer in this notebook.
initializer = keras.initializers.glorot_uniform(seed=random_n)


# model for Fare, Embarked, Age
def fill_data(col):
    """Impute the missing values of ``col`` in ``df0`` with a small neural network.

    A one-hidden-layer network is trained on the module-level ``df`` with
    every other column as input and ``col`` as target. The weights saved by
    the ``save_best_only`` checkpoint are reloaded, and the predictions for
    the rows of ``df0`` where ``col`` is null are written into ``df0``.

    Args:
        col: Name of the column to impute. ``'Embarked'`` is handled as a
            classification problem (softmax over the distinct values found in
            ``df``); any other column, i.e. ``'Fare'`` or ``'Age'``, as a
            regression problem.

    Returns:
        None. ``df0`` is modified in place. When ``col`` has no missing
        values the function returns without training anything.
    """
    missing = df0[col].isnull()
    if not missing.any():
        # Nothing to impute (e.g. the cell is run a second time): skip the
        # training and avoid calling predict() on an empty frame.
        return

    features = df.drop([col], axis=1)
    target = df[col]

    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(features.shape[1],),
                    kernel_initializer=initializer))
    model.add(Dropout(0.5, seed=random_n))

    if col == 'Embarked':
        n_classes = len(target.unique())
        model.add(Dense(n_classes, activation='softmax', kernel_initializer=initializer))
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
    else:  # 'Fare', 'Age'
        # Both targets are z-scored, so they take negative values; a ReLU
        # output unit can never predict those. Use a linear output instead.
        model.add(Dense(1, activation='linear', kernel_initializer=initializer))
        model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    checkpoint_path = 'checkpoint_' + col + '.hdf5'
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3,
                                  min_lr=0.000001, verbose=1)
    checkpointer = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True)
    early_stopping = EarlyStopping(patience=10, verbose=1)
    hist = model.fit(features, target,
                     epochs=300,
                     batch_size=32,
                     verbose=1,
                     validation_split=0.1,
                     callbacks=[reduce_lr, early_stopping, checkpointer])

    # Predict with the checkpointed weights rather than the last epoch's.
    model.load_weights(checkpoint_path)
    null_data = df0.loc[missing].drop([col], axis=1)
    pred = model.predict(null_data)

    if col == 'Embarked':
        # Class index with the highest probability.
        pred = pred.argmax(axis=1)

        plt.plot(hist.history['acc'], 'b-', label='acc' )
        plt.plot(hist.history['loss'], 'r-', label='loss' )
        plt.xlabel('epochs')
        plt.legend()
        plt.show()

    # Predictions are in the row order of the masked frame, so a single
    # masked assignment fills every missing cell.
    df0.loc[missing, col] = pred.reshape(-1, )


# In[17]:


# Impute Embarked first. The trailing "id" notes list row ids with a missing
# value, as recorded by the author.
fill_data('Embarked') # id:62,830


# In[18]:


# Impute Fare.
fill_data('Fare') # id:1044


# In[19]:


# Impute Age last -- presumably so that the rows being predicted no longer
# contain NaN in Embarked or Fare; verify before changing the order.
fill_data('Age') # id: 6,18,20,27,29,30


# ## Split the Data back into Train and Test

# In[20]:


#df0 = df0.drop(['Title'], axis=1)

# df0 was built by stacking train on top of test, so the first len(train)
# rows are the training passengers. Deriving the boundary from train instead
# of a literal 891 keeps the split correct if the CSVs change size.
n_train = len(train)
train0 = df0.iloc[:n_train].copy()
test0 = df0.iloc[n_train:].copy()

train0.head()


# ## Model to estimate Survived on Test data for submission

# In[21]:


# One input unit per column of the imputed feature frame.
df0_cols = len(df0.columns)

# Dense(64, relu) -> Dropout(0.5) -> 2-way softmax for the Survived label.
model = Sequential([
    Dense(64, activation='relu', input_shape=(df0_cols,), kernel_initializer=initializer),
    Dropout(0.5, seed=random_n),
    Dense(2, activation='softmax', kernel_initializer=initializer),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

# Callbacks: shrink the learning rate when val_loss plateaus, stop early, and
# keep only the best weights on disk.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3,
                              min_lr=1e-6, verbose=1)
checkpointer = ModelCheckpoint(filepath='checkpoint_final.hdf5', verbose=1,
                               save_best_only=True)
early_stopping = EarlyStopping(patience=10, verbose=1)

epochs = 300
hist = model.fit(
    train0,
    train.Survived,
    epochs=epochs,
    batch_size=5,
    verbose=1,
    validation_split=0.2,
    callbacks=[reduce_lr, early_stopping, checkpointer],
)

# Predict on the test rows with the checkpointed weights.
model.load_weights('checkpoint_final.hdf5')
pred = model.predict(test0)


# In[22]:


# Training accuracy (blue) and training loss (red) per epoch.
# model.metrics_names lists the available history keys.
for metric, line_style in (('acc', 'b-'), ('loss', 'r-')):
    plt.plot(hist.history[metric], line_style, label=metric)
plt.xlabel('epochs')
plt.legend()
plt.show()


# In[23]:


# Pick the class with the highest softmax probability for each test row.
result = np.argmax(pred, axis=1)


# ## Submission file:

# In[24]:


submission_path = 'submission.csv'

# Compare to the previous result, if there is one. On a first run the file
# does not exist yet; reading it unconditionally would raise
# FileNotFoundError and the submission below would never be written.
if os.path.exists(submission_path):
    prev = pd.read_csv(submission_path, index_col=0)
    print('Diff: ', np.sum(prev.Survived.values != result))
else:
    print('Diff: ', 'n/a (no previous submission.csv)')
print('Survived: ', result.sum())

# Submission data to csv file: one row per test passenger, keyed by the
# index of the test frame.
submission = pd.DataFrame({'PassengerId': test.index, 'Survived': result})
submission.to_csv(submission_path, index=False)


# In[25]:


# Display the predicted class index (0 or 1) for every test row.
result