#!/usr/bin/env python
# coding: utf-8

# # Kaggle: Titanic: Machine Learning from Disaster
# ### with EarlyStopping
# https://www.kaggle.com/c/titanic

# In[1]:

import os
import random as rn

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # get_ipython is only defined under IPython/Jupyter; when this export is
    # run as a plain script there is no inline backend to enable.
    pass

# random seed
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here presumably does not change hashing in this process -- confirm whether
# it has to be exported before launching Python.
os.environ['PYTHONHASHSEED'] = '0'
random_n = 123
np.random.seed(random_n)
rn.seed(random_n)

# NOTE(review): tf.ConfigProto / tf.set_random_seed / tf.Session look like
# TensorFlow 1.x APIs -- confirm the installed version.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)

# Imported here (after the NumPy / random seeding), as in the original
# notebook, so the order of the seeding steps is unchanged.
from keras import backend as K

tf.set_random_seed(random_n)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)


# In[2]:

train.head()


# ### Drop Survived and Ticket, then combine train with test

# In[3]:

train_tmp = train.drop(['Survived', 'Ticket'], axis=1)
test_tmp = test.drop(['Ticket'], axis=1)
df = pd.concat([train_tmp, test_tmp])
df.info()


# ### Name --> Title --> Number

# In[4]:

# Name to Title
# Raw string so that `\.` is a regex escape rather than an invalid string
# escape; expand=False yields a Series for the single capture group.
df = df.assign(Title=df.Name.str.extract(r' ([A-Za-z]+)\..', expand=False))
title_list = df.Title.unique()
print(title_list)


# In[5]:

# Title to Number(0-17)
df.Title = df.Title.replace(title_list, np.arange(len(title_list)))

# Drop Name column
df = df.drop(['Name'], axis=1)
df.head()


# ### Sex --> male:0, female:1

# In[6]:

df.Sex = df.Sex.replace({'male': 0, 'female': 1})


# ### Cabin --> Number: nan:0, C:1, E:2, G:3, D:4, A:5, B:6, F:7, T:8

# In[7]:

# Keep only the first character of the cabin string, then number the distinct
# values (NaN included) in order of first appearance.
df = df.assign(Cabin=df.Cabin.str[0])
cabin_list = df.Cabin.unique()
df.Cabin = df.Cabin.replace(cabin_list, np.arange(len(cabin_list)))
print(cabin_list)
print(df.Cabin.unique())


# ### Embarked --> S:0, C:1, Q:2, nan

# In[8]:

df.Embarked.unique()


# In[9]:

df.Embarked = df.Embarked.replace({'S': 0, 'C': 1, 'Q': 2})


# ## zscore or normalization:
# * Age: including NaN
# * Fare: including NaN
#
# Z = (x - x.mean) / x.std
# N = (x - x.min) / (x.max - x.min)
#
# sklearn.preprocessing.MinMaxScaler causes an error with null data.

# In[10]:

# Normalize Function
def normalize(df_col):
    """Return ``df_col`` min-max scaled: (x - min) / (max - min)."""
    return (df_col - df_col.min()) / (df_col.max() - df_col.min())


# In[11]:

# Standardization(zscore)
def zscore(df_col):
    """Return ``df_col`` standardized: (x - mean) / std."""
    return (df_col - df_col.mean()) / df_col.std()


# In[12]:

df.Age = zscore(df.Age)
df.Fare = zscore(df.Fare)
df.SibSp = zscore(df.SibSp)
df.Parch = zscore(df.Parch)
df.Title = zscore(df.Title)

# df.Age = normalize(df.Age)
# df.Fare = normalize(df.Fare)

# for col in df.columns:
#     df[col] = normalize(df[col])

df.describe()


# ## Split the Data into Null-data and Notnull-data
#
# Make a Copy of df: df0 = df.copy()
# * Age
# * Embarked
# * Fare
#

# In[13]:

# Drop Cabin if the result gets better
#df = df.drop(['Cabin'], axis=1)
df0 = df.copy()
df0.info()


# In[14]:

# Successively remove rows with a missing Age, Embarked or Fare, so that `df`
# ends up holding only rows with all three present (the training data for
# fill_data); `df0` keeps every row.
Age_null = df[df.Age.isnull()]
df = df[df.Age.notnull()]

Embarked_null = df[df.Embarked.isnull()]
df = df[df.Embarked.notnull()]

Fare_null = df[df.Fare.isnull()]
df = df[df.Fare.notnull()]


# ### Get Notnull Data: df.shape = (1043, 9)

# In[15]:

print(df.shape)
df.info()


# ## Model to fill NaN in Fare, Embarked, Age

# In[16]:

from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout, BatchNormalization
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

initializer = keras.initializers.glorot_uniform(seed=random_n)


# model for Fare, Embarked, Age
def fill_data(col):
    """Fill the missing values of ``col`` in ``df0`` with model predictions.

    A small dense network is trained on the module-level ``df`` (rows where
    Age, Embarked and Fare are all present) to predict ``col`` from the other
    columns. The weights saved by ``ModelCheckpoint`` are reloaded and used to
    predict ``col`` for the rows of ``df0`` where it is null; ``df0`` is
    updated in place.

    'Embarked' is modelled as classification (softmax over the distinct
    values seen in ``df``); any other column as single-output regression.

    Args:
        col: Name of the column to fill ('Embarked', 'Fare' or 'Age').
    """
    missing = df0[col].isnull()
    if not missing.any():
        # Nothing to fill; skip training altogether.
        return

    is_categorical = col == 'Embarked'
    n_cols = len(df.columns) - 1  # every column except the target

    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(n_cols,),
                    kernel_initializer=initializer))
    model.add(Dropout(0.5, seed=random_n))

    if is_categorical:
        num = len(df[col].unique())
        model.add(Dense(num, activation='softmax',
                        kernel_initializer=initializer))
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['acc'])
    else:  # 'Fare', 'Age'
        # The targets were z-scored above and are therefore frequently
        # negative; a ReLU output could never produce them, so the output
        # layer has to be linear.
        model.add(Dense(1, activation='linear',
                        kernel_initializer=initializer))
        model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    data = df.drop([col], axis=1)
    weights_path = 'checkpoint_' + col + '.hdf5'

    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3,
                                  min_lr=0.000001, verbose=1)
    checkpointer = ModelCheckpoint(filepath=weights_path, verbose=1,
                                   save_best_only=True)
    early_stopping = EarlyStopping(patience=10, verbose=1)
    epochs = 300

    hist = model.fit(data, df[col], epochs=epochs, batch_size=32, verbose=1,
                     validation_split=0.1,
                     callbacks=[reduce_lr, early_stopping, checkpointer])

    null_data = df0[missing].drop([col], axis=1)

    # Predict with the checkpointed weights rather than the last epoch's.
    model.load_weights(weights_path)
    pred = model.predict(null_data)
    if is_categorical:
        pred = pred.argmax(axis=1)

    # 'acc' is only recorded for the classification model; the regression
    # models track 'mae' instead, so indexing it unconditionally would fail.
    if 'acc' in hist.history:
        plt.plot(hist.history['acc'], 'b-', label='acc')
    plt.plot(hist.history['loss'], 'r-', label='loss')
    plt.xlabel('epochs')
    plt.legend()
    plt.show()

    # Write all predictions back in one masked assignment (row order of
    # `pred` matches the row order of the masked selection).
    df0.loc[missing, col] = pred.reshape(-1)


# In[17]:

fill_data('Embarked')  # id:62,830


# In[18]:

fill_data('Fare')  # id:1044


# In[19]:

fill_data('Age')  # id: 6,18,20,27,29,30


# ## Split the Data back to Train and Test

# In[20]:

#df0 = df0.drop(['Title'], axis=1)
# The combined frame was built as train rows followed by test rows, so the
# first len(train) rows are the training set.
n_train = len(train)
train0 = df0.iloc[:n_train].copy()
test0 = df0.iloc[n_train:].copy()
train0.head()


# ## Model to estimate Survived on Test data for submission

# In[21]:

df0_cols = len(df0.columns)

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(df0_cols,),
                kernel_initializer=initializer))
model.add(Dropout(0.5, seed=random_n))
model.add(Dense(2, activation='softmax', kernel_initializer=initializer))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['acc'])

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3,
                              min_lr=0.000001, verbose=1)
checkpointer = ModelCheckpoint(filepath='checkpoint_final.hdf5', verbose=1,
                               save_best_only=True)
early_stopping = EarlyStopping(patience=10, verbose=1)
epochs = 300

hist = model.fit(train0, train.Survived, epochs=epochs, batch_size=5,
                 verbose=1, validation_split=0.2,
                 callbacks=[reduce_lr, early_stopping, checkpointer])

# Predict with the checkpointed weights rather than the last epoch's.
model.load_weights('checkpoint_final.hdf5')
pred = model.predict(test0)


# In[22]:

# print(model.metrics_names)
plt.plot(hist.history['acc'], 'b-', label='acc')
plt.plot(hist.history['loss'], 'r-', label='loss')
plt.xlabel('epochs')
plt.legend()
plt.show()


# In[23]:

result = pred.argmax(axis=1)


# ## Submission file:

# In[24]:

# compare to the previous result (skipped on the first run, when no previous
# submission file exists yet)
if os.path.exists('submission.csv'):
    prev = pd.read_csv('submission.csv', index_col=0)
    print('Diff: ', np.sum(prev.Survived.values != result))
print('Survived: ', result.sum())

# submission data to csv file
submission = pd.DataFrame({'PassengerId': test.index, 'Survived': result})
submission.to_csv('submission.csv', index=False)


# In[25]:

result