import pandas as pd
# Load the training and holdout data sets from CSV files in the working directory.
train = pd.read_csv("train.csv")
holdout = pd.read_csv("test.csv")
# Notebook cell expression: displays the first rows of the training frame.
train.head()
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# %load functions.py
def process_missing(df, fare_fill=None):
    """Handle various missing values from the data set

    Missing ``Fare`` values are replaced with ``fare_fill`` and missing
    ``Embarked`` values with ``"S"``.  *df* is modified in place and also
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Fare`` and ``Embarked`` columns.
    fare_fill : float, optional
        Replacement for missing fares.  When omitted, the mean of the
        ``Fare`` column of the module-level ``train`` frame is used, so
        existing one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.

    Usage
    ------
    holdout = process_missing(holdout)
    """
    if fare_fill is None:
        # Default: impute with the training set's mean fare, whichever frame
        # is being processed.  This needs the global ``train`` to exist.
        fare_fill = train["Fare"].mean()
    df["Fare"] = df["Fare"].fillna(fare_fill)
    df["Embarked"] = df["Embarked"].fillna("S")
    return df
def process_age(df):
    """Bucket the Age column into labelled categories

    Missing ages are first replaced with -0.5 so that they fall into the
    "Missing" bucket.  The result is stored in a new ``Age_categories``
    column.  *df* is modified in place (including the filled ``Age``
    column) and returned.

    Usage
    ------
    train = process_age(train)
    """
    # (label, upper edge) pairs; intervals are right-inclusive and the first
    # one opens at -1.
    age_bands = [
        ("Missing", 0),
        ("Infant", 5),
        ("Child", 12),
        ("Teenager", 18),
        ("Young Adult", 35),
        ("Adult", 60),
        ("Senior", 100),
    ]
    edges = [-1] + [upper for _, upper in age_bands]
    names = [label for label, _ in age_bands]

    filled_age = df["Age"].fillna(-0.5)
    df["Age"] = filled_age
    df["Age_categories"] = pd.cut(filled_age, edges, labels=names)
    return df
def process_fare(df):
    """Bucket the Fare column into labelled price ranges

    Adds a ``Fare_categories`` column to *df* (in place) and returns the
    frame.  Intervals are right-inclusive; fares outside (-1, 1000] and
    missing fares become NaN.

    Usage
    ------
    train = process_fare(train)
    """
    # label -> upper edge of its interval; the first interval opens at -1.
    upper_edges = {"0-12": 12, "12-50": 50, "50-100": 100, "100+": 1000}
    edges = [-1, *upper_edges.values()]
    df["Fare_categories"] = pd.cut(df["Fare"], edges, labels=list(upper_edges))
    return df
def process_cabin(df):
    """Reduce the Cabin column to its first character

    A ``Cabin_type`` column holding the first character of ``Cabin`` (or
    "Unknown" where ``Cabin`` is missing) is added to *df* in place.  The
    returned frame is a copy without the ``Cabin`` column; the frame that
    was passed in keeps it.

    Usage
    ------
    train = process_cabin(train)
    """
    first_char = df["Cabin"].str[0]
    df["Cabin_type"] = first_char.fillna("Unknown")
    return df.drop(columns="Cabin")
def process_titles(df):
    """Extract and categorize the title from the name column

    The title is the run of ASCII letters that follows a space and is
    immediately followed by a period in ``Name`` (for example ``"Mr"`` in
    ``"Braund, Mr. Owen Harris"``).  Each title is mapped to a broader
    group and stored in a new ``Title`` column.  Titles missing from the
    mapping, and names with no match, yield NaN.  *df* is modified in
    place and returned.

    Usage
    ------
    train = process_titles(train)
    """
    titles = {
        "Mr": "Mr",
        "Mme": "Mrs",
        "Ms": "Mrs",
        "Mrs": "Mrs",
        "Master": "Master",
        "Mlle": "Miss",
        "Miss": "Miss",
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Dr": "Officer",
        "Rev": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Countess": "Royalty",
        "Dona": "Royalty",
        "Lady": "Royalty",
    }
    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    extracted_titles = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df
def create_dummies(df,column_name):
    """Create Dummy Columns (One Hot Encoding) from a single Column

    One indicator column per distinct value of ``df[column_name]`` is
    appended to a copy of *df*.  New columns are named
    ``<column_name>_<value>``.  The frame that was passed in is left
    unchanged.

    The indicators are stored as ``uint8`` explicitly.  pandas 2.0 changed
    the ``get_dummies`` default to ``bool``, and boolean columns are not
    matched by a ``select_dtypes([np.number])`` filter such as the one
    used by ``select_features``.

    Usage
    ------
    train = create_dummies(train,"Age")
    """
    dummies = pd.get_dummies(df[column_name], prefix=column_name, dtype="uint8")
    return pd.concat([df, dummies], axis=1)
def pre_process(df):
    """Run every cleaning and feature-engineering step on *df*

    The steps run in a fixed order: missing values, age buckets, fare
    buckets, titles, cabin type.  After that, the categorical columns
    listed below are one-hot encoded.  Returns the resulting frame.

    Usage
    ------
    train = pre_process(train)
    """
    cleaning_steps = (
        process_missing,
        process_age,
        process_fare,
        process_titles,
        process_cabin,
    )
    for step in cleaning_steps:
        df = step(df)

    categorical_columns = (
        "Age_categories",
        "Fare_categories",
        "Title",
        "Cabin_type",
        "Sex",
    )
    for column in categorical_columns:
        df = create_dummies(df, column)
    return df
# Run the shared preprocessing pipeline on both frames.
train = pre_process(train)
holdout = pre_process(holdout)
# Explore on a copy of the two family-count columns plus the target, so the
# training frame itself is not touched.
explore_cols = ["SibSp","Parch","Survived"]
explore = train[explore_cols].copy()
explore.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 3 columns): SibSp 891 non-null int64 Parch 891 non-null int64 Survived 891 non-null int64 dtypes: int64(3) memory usage: 21.0 KB
import matplotlib.pyplot as plt
%matplotlib inline
# Overlaid histograms of every explore column except the target.
explore.drop("Survived",axis=1).plot.hist(alpha=0.5,bins=8)
plt.show()
# Row-wise total of SibSp and Parch.
explore["familysize"] = explore[["SibSp","Parch"]].sum(axis=1)
# Same histogram again, now including the new familysize column.
explore.drop("Survived",axis=1).plot.hist(alpha=0.5,bins=10)
plt.xticks(range(11))
plt.show()
import numpy as np
# One bar chart per explored column: the pivot table aggregates the 0/1
# Survived column (pandas' default aggregation is the mean) for each distinct
# value of the column.
for col in explore.columns.drop("Survived"):
    pivot = explore.pivot_table(index=col, values="Survived")
    ax = pivot.plot.bar(ylim=(0, 1), yticks=np.arange(0, 1, .1))
    # Shade the 0.3-0.6 band on the Axes pandas returns.  Calling
    # plt.axhspan() before the plot draws on whatever figure is current,
    # and DataFrame.plot opens a new figure when no ax is given.
    ax.axhspan(.3, .6, alpha=0.2, color='red')
    plt.show()
The `SibSp` column shows the number of siblings and/or spouses each passenger had on board, while the `Parch` column shows the number of parents or children each passenger had on board. Neither column has any missing values.
The distribution of values in both columns is skewed right, with the majority of values being zero.
You can sum these two columns to explore the total number of family members each passenger had on board. The shape of the distribution of values in this case is similar; however, there are fewer values at zero, and the quantity tapers off less rapidly as the values increase.
Looking at the survival rates of the combined family members, you can see that few of the over 500 passengers with no family members survived, while greater numbers of passengers with family members survived.
def process_isalone(df):
    """Flag passengers whose SibSp and Parch counts sum to zero

    Adds ``familysize`` (row-wise sum of ``SibSp`` and ``Parch``) and
    ``isalone`` (1 when ``familysize`` is 0, otherwise 0) to *df* in place.
    The returned frame is a copy without the ``familysize`` column.

    Usage
    ------
    train = process_isalone(train)
    """
    family_total = df[["SibSp", "Parch"]].sum(axis=1)
    df["familysize"] = family_total
    df["isalone"] = (family_total == 0).astype("int64")
    return df.drop(columns="familysize")
# Add the isalone flag to both the training and the holdout frame.
train = process_isalone(train)
holdout = process_isalone(holdout)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
def select_features(df):
    """Pick feature columns by recursive feature elimination

    Only numeric columns without null values are considered.
    ``Survived`` is the target, and ``PassengerId`` is excluded from the
    candidates.  An ``RFECV`` selector wrapping a random forest
    (``random_state=1``, 10-fold CV) is fitted.  The supported column
    names are printed and returned as a list.

    Usage
    ------
    cols = select_features(train)
    """
    # Keep numeric columns only, then discard any that contain nulls.
    usable = df.select_dtypes([np.number]).dropna(axis=1)
    target = usable["Survived"]
    candidates = usable.drop(["Survived", "PassengerId"], axis=1)

    selector = RFECV(RandomForestClassifier(random_state=1), cv=10)
    selector.fit(candidates, target)

    best_columns = list(candidates.columns[selector.support_])
    report = "Best Columns \n" + "-" * 12 + "\n{}\n".format(best_columns)
    print(report)
    return best_columns
# Feature names chosen by select_features() on the training frame.
cols = select_features(train)
Best Columns ------------ ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Age_categories_Missing', 'Age_categories_Infant', 'Age_categories_Young Adult', 'Fare_categories_12-50', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Cabin_type_D', 'Cabin_type_E', 'Cabin_type_Unknown', 'Sex_female', 'Sex_male', 'isalone']
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
def select_model(df,features):
    """Grid-search several classifiers and report the best of each

    Every candidate estimator is tuned with ``GridSearchCV`` (10-fold CV)
    on ``df[features]`` against ``df["Survived"]``.  The name, best score
    and best parameters of each candidate are printed.

    Returns a list with one dict per candidate.  Its keys are ``name``,
    ``estimator``, ``hyperparameters``, ``best_params``, ``best_score``
    and ``best_model``.  The list order is LogisticRegression,
    KNeighborsClassifier, RandomForestClassifier.

    Usage
    ------
    result = select_model(train,cols)
    """
    all_X = df[features]
    all_y = df["Survived"]

    # (name, estimator, hyperparameter grid) for each candidate model.
    candidates = [
        (
            "LogisticRegression",
            LogisticRegression(),
            {
                "solver": ["newton-cg", "lbfgs", "liblinear"],
            },
        ),
        (
            "KNeighborsClassifier",
            KNeighborsClassifier(),
            {
                "n_neighbors": range(1, 20, 2),
                "weights": ["distance", "uniform"],
                "algorithm": ["ball_tree", "kd_tree", "brute"],
                "p": [1, 2],
            },
        ),
        (
            "RandomForestClassifier",
            RandomForestClassifier(random_state=1),
            {
                "n_estimators": [4, 6, 9],
                "criterion": ["entropy", "gini"],
                "max_depth": [2, 5, 10],
                "max_features": ["log2", "sqrt"],
                "min_samples_leaf": [1, 5, 8],
                "min_samples_split": [2, 3, 5],
            },
        ),
    ]
    models = [
        {"name": name, "estimator": estimator, "hyperparameters": grid_spec}
        for name, estimator, grid_spec in candidates
    ]

    for entry in models:
        title = entry["name"]
        print(title)
        print('-' * len(title))

        search = GridSearchCV(entry["estimator"],
                              param_grid=entry["hyperparameters"],
                              cv=10)
        search.fit(all_X, all_y)

        # Record the search outcome on the same dict that is returned.
        entry.update({
            "best_params": search.best_params_,
            "best_score": search.best_score_,
            "best_model": search.best_estimator_,
        })

        print("Best Score: {}".format(entry["best_score"]))
        print("Best Parameters: {}\n".format(entry["best_params"]))

    return models
# Grid-search every candidate model on the selected feature columns.
result = select_model(train,cols)
LogisticRegression ------------------ Best Score: 0.8204264870931538 Best Parameters: {'solver': 'liblinear'} KNeighborsClassifier -------------------- Best Score: 0.7755331088664422 Best Parameters: {'weights': 'uniform', 'p': 1, 'algorithm': 'brute', 'n_neighbors': 5} RandomForestClassifier ---------------------- Best Score: 0.8294051627384961 Best Parameters: {'max_features': 'log2', 'max_depth': 5, 'min_samples_split': 5, 'criterion': 'entropy', 'n_estimators': 4, 'min_samples_leaf': 1}
def save_submission_file(model,cols,filename="submission.csv"):
    """Predict on the holdout frame and write the result as a CSV file

    Reads the module-level ``holdout`` frame, predicts with *model* on
    ``holdout[cols]``, and writes a two-column CSV (``PassengerId``,
    ``Survived``) without the index to *filename*.

    Usage
    ------
    save_submission_file(best_rf_model,cols)
    """
    predictions = model.predict(holdout[cols])
    submission = pd.DataFrame({
        "PassengerId": holdout["PassengerId"],
        "Survived": predictions,
    })
    submission.to_csv(filename, index=False)
# Index 2 is the RandomForestClassifier entry: select_model() returns its
# models in the order LogisticRegression, KNeighborsClassifier,
# RandomForestClassifier.
best_rf_model = result[2]["best_model"]
# Writes the holdout predictions to the default "submission.csv".
save_submission_file(best_rf_model,cols)