#!/usr/bin/env python
# coding: utf-8

# In[1]:


import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

# data visualization
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB


# In[2]:


test_df = pd.read_csv("input/test.csv")
train_df = pd.read_csv("input/train.csv")


# In[3]:


train_df.info()
print('-'*50)
test_df.info()


# In[4]:


train_df.head()


# In[5]:


test_df.head()


# Put the train and test DataFrames in a list so both are transformed by the same preprocessing loops

# In[6]:


data = [train_df, test_df]


# #### Embarked

# In[7]:


train_df.groupby('Embarked')['Survived'].count()


# In[8]:


# Fill missing values with 'S' (the most common port), then encode the ports as integers
ports = {"S": 0, "C": 1, "Q": 2}

for dataset in data:
    dataset["Embarked"] = dataset["Embarked"].fillna("S")
    dataset['Embarked'] = dataset['Embarked'].map(ports)


# #### Fare

# In[9]:


# Replace missing fares with the median fare, then truncate to integers
for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].dropna().median())
    dataset['Fare'] = dataset['Fare'].astype(int)


# #### Age
# Fill missing ages with random values drawn between (mean - std) and (mean + std)

# In[10]:


for dataset in data:
    average_age_titanic = dataset['Age'].mean()
    std_age_titanic = dataset['Age'].std()
    count_nan_age_titanic = dataset['Age'].isnull().sum()

    rand_ = np.random.randint(
        average_age_titanic - std_age_titanic,
        average_age_titanic + std_age_titanic,
        size=count_nan_age_titanic
    )

    dataset.loc[dataset['Age'].isnull(), 'Age'] = rand_
    dataset['Age'] = dataset['Age'].astype(int)


# Convert the `Age` feature into `Age Group`

# In[11]:


for dataset in data:
    dataset.loc[ dataset['Age'] <= 11, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
    # Passengers over 66 are rare, so they share the last group
    dataset.loc[ dataset['Age'] > 66, 'Age'] = 6

train_df['Age'].value_counts()


# #### Name
# From `Name` I create another feature called `Title`

# In[12]:


title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in data:
    # Extract the title (the word followed by '.') from the full name
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                                 'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                                 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train_df.head()


# #### Family size

# In[13]:


for dataset in data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)


# #### IsAlone

# In[14]:


for dataset in data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train_df[['IsAlone', 'Survived']].groupby('IsAlone', as_index=False).mean()


# #### Sex

# In[15]:


genders = {"male": 0, "female": 1}

for dataset in data:
    dataset['Sex'] = dataset['Sex'].map(genders)


# #### Age times Class

# In[16]:


for dataset in data:
    dataset['Age_Class'] = dataset['Age'] * dataset['Pclass']


# #### Fare per Person

# In[17]:


for dataset in data:
    dataset['FarePerPerson'] = dataset['Fare'] / dataset['FamilySize']
    dataset['FarePerPerson'] = dataset['FarePerPerson'].astype(int)


# ### Clean up before training the data
# I decided to drop the `Cabin` column (too many missing values) and the `Ticket` column (no apparent predictive value). `SibSp` and `Parch` are dropped as well, since `FamilySize` already captures them.

# In[18]:


X_train = train_df.drop(['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'PassengerId', 'Survived'], axis=1)
Y_train = train_df['Survived']
X_test = test_df.drop(['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'PassengerId'], axis=1).copy()


# In[19]:


X_train.head()


# In[20]:


X_train.shape, Y_train.shape, X_test.shape


# #### Logistic Regression

# In[21]:


logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)


# #### Random Forest

# In[22]:


random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)


# #### Perceptron

# In[23]:


perceptron = Perceptron(max_iter=5)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)


# #### SGDClassifier

# In[24]:


sgd = SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)


# #### Decision Tree

# In[25]:


decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)


# #### Support Vector Machine

# In[26]:


linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)


# #### K Nearest Neighbor

# In[27]:


knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)


# #### Gaussian Naive Bayes

# In[28]:


gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)


# In[29]:


# Rank all models by their training accuracy
results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Perceptron',
              'Stochastic Gradient Descent', 'Decision Tree'],
    'Score': [acc_linear_svc, acc_knn, acc_log,
              acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_decision_tree]})

result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(10)


# In[30]:


plt.figure(figsize=(13, 6))
plt.title("Model Accuracy Comparison")
sns.barplot(x="Score", y="Model", data=results)


# Export the `Random Forest` model for production deployment

# In[31]:


import pickle as pkl


# In[34]:


filename = 'model.pkl'
with open(filename, 'wb') as f:
    pkl.dump(random_forest, f)
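# A minimal sketch of loading the exported model back for inference. This deployment-side cell is an assumption, not part of the original notebook: it expects a feature row with the same columns, order, and encoding as `X_train` (Pclass, Sex, Age, Fare, Embarked, Title, FamilySize, IsAlone, Age_Class, FarePerPerson).

# In[ ]:


# Assumption: reload the pickled forest and smoke-test it on one row
with open('model.pkl', 'rb') as f:
    loaded_model = pkl.load(f)

# Predict survival for the first test passenger; this should match
# the corresponding entry in Y_prediction above
sample = X_test.iloc[[0]]
print(loaded_model.predict(sample))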