#!/usr/bin/env python
# coding: utf-8

# ## Kaggle Titanic Dataset - Prediction of Survival

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

# ### Read Data

# In[3]:

dfTR = pd.read_csv('./Data/train.csv')
dfTR.head()

# In[18]:

## Show dataframe info
dfTR.info()

# In[19]:

## Show dataframe description
dfTR.describe()

# ### Clean Data

# In[20]:

## Show dataframe null counts
dfTR.isnull().sum()

# In[21]:

dfTR.columns

# In[22]:

dfTR.head()

# In[25]:

## Select columns
SEL_COLS = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
dfTR = dfTR[SEL_COLS]
dfTR.head()

# In[31]:

## Drop NAs
dfTR = dfTR.dropna()
dfTR.shape

# In[39]:

## View distributions
SEL_VAR = SEL_COLS[0]  ## Change index for other variables
dfTR.groupby(SEL_VAR).count()

# ### Visualize Data

# #### Survival counts

# In[41]:

sns.countplot(data=dfTR, x='Survived')
plt.show()

# In[42]:

sns.countplot(data=dfTR, x='Survived', hue='Sex')
plt.show()

# In[43]:

sns.set_style('white')
sns.countplot(data=dfTR, x='Survived', hue='Pclass')
plt.show()

# #### Age of the passengers

# In[47]:

sns.histplot(data=dfTR, x='Age')
plt.show()

# #### Sibling and spouse counts

# In[48]:

sns.countplot(data=dfTR, x='SibSp')
plt.show()

# #### Distribution of fares

# In[50]:

sns.histplot(data=dfTR, x='Fare')
plt.show()

# ### Handle Categorical Variables

# In[54]:

## One-hot encode 'Sex'; drop_first avoids redundant dummy columns
dfSex = pd.get_dummies(dfTR['Sex'], prefix='Sex', drop_first=True)
dfSex.head()

# In[55]:

## One-hot encode 'Embarked'
dfEmb = pd.get_dummies(dfTR['Embarked'], prefix='Emb', drop_first=True)
dfEmb.head()

# In[56]:

## Replace the categorical columns with their dummy encodings
dfTR = pd.concat([dfTR.drop(['Sex', 'Embarked'], axis=1), dfSex, dfEmb], axis=1)
dfTR.head()

# ### ML Models for Prediction

# ### Scale the Data

# In[60]:

from sklearn.preprocessing import StandardScaler
scalerTR = StandardScaler()

# In[61]:

## Features: every column except 'Survived'
XTR = np.array(dfTR[dfTR.columns[1:]])
XTR.shape

# In[73]:

## Target: the 'Survived' column as a 1-D array
YTR = np.array(dfTR[dfTR.columns[0:1]]).squeeze()
YTR.shape

# In[74]:

scalerTR.fit(XTR)
XTRNorm = scalerTR.transform(XTR)

# ### Version 1: Train - Test Split

# In[75]:

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(XTRNorm, YTR, test_size=0.3, random_state=101)

# ### 1. Logistic Regression

# In[76]:

from sklearn.linear_model import LogisticRegression
mdl = LogisticRegression()

# In[77]:

mdl.fit(X_train, Y_train)

# In[78]:

pred = mdl.predict(X_test)
pred

# In[79]:

Y_test

# In[80]:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# In[81]:

print(accuracy_score(Y_test, pred))

# In[82]:

print(classification_report(Y_test, pred))

# In[88]:

sns.heatmap(confusion_matrix(Y_test, pred), annot=True)
plt.show()
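# #### Aside: avoiding scaler leakage

# Above, the scaler is fit on the full dataset before the split, so test-fold
# statistics leak into training. A minimal leak-free sketch follows; it is an
# aside only (the model cells below keep using the split already made), and the
# Xa_*/Ya_* and leakFreeScaler names are illustrative, not from the original
# notebook.

# In[ ]:

Xa_train, Xa_test, Ya_train, Ya_test = train_test_split(XTR, YTR, test_size=0.3, random_state=101)
leakFreeScaler = StandardScaler()
Xa_train = leakFreeScaler.fit_transform(Xa_train)  # fit on the training fold only
Xa_test = leakFreeScaler.transform(Xa_test)        # apply the training-fold statistics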
# ### 2. K-Nearest Neighbors

# In[89]:

from sklearn.neighbors import KNeighborsClassifier

# In[90]:

## Start with a single neighbor; n_neighbors is worth tuning
knn = KNeighborsClassifier(n_neighbors=1)

# In[91]:

knn.fit(X_train, Y_train)

# In[92]:

knnpredict = knn.predict(X_test)
knnpredict

# In[93]:

print(accuracy_score(Y_test, knnpredict))

# In[94]:

print(classification_report(Y_test, knnpredict))

# In[95]:

sns.heatmap(confusion_matrix(Y_test, knnpredict), annot=True)
plt.show()

# ### 3. Decision Tree

# In[96]:

from sklearn.tree import DecisionTreeClassifier

# In[97]:

dtree = DecisionTreeClassifier()

# In[98]:

dtree.fit(X_train, Y_train)

# In[99]:

treepredict = dtree.predict(X_test)
treepredict

# In[100]:

print(accuracy_score(Y_test, treepredict))

# In[101]:

print(classification_report(Y_test, treepredict))

# In[102]:

sns.heatmap(confusion_matrix(Y_test, treepredict), cmap='coolwarm', annot=True)
plt.show()

# ### 4. Random Forest
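# The export is cut off after this heading. Below is a minimal sketch of the
# announced Random Forest step, following the same fit/predict/evaluate pattern
# as the models above; n_estimators=100 is scikit-learn's default, chosen here
# as an assumption rather than taken from the original notebook.

# In[ ]:

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, Y_train)
rfpredict = rfc.predict(X_test)
print(accuracy_score(Y_test, rfpredict))
print(classification_report(Y_test, rfpredict))
sns.heatmap(confusion_matrix(Y_test, rfpredict), annot=True)
plt.show()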