Bagging trees on Titanic¶

sklearn.tree.DecisionTreeClassifier

Creer les train et test sets
comme baseline: arbre de decision simple, not pruned, quel accuracy sur le test set ?
maintenant prendre 20 arbres, en limitant la taille a 2 niveaux
pour chaque arbre, predire les probas des echantillons du test set
puis moyenner les proba et utiliser le resultat pour determiner la classe predite.
quel accuracy sur le test set ?

In [112]:

import pandas as pd
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [113]:

df = pd.read_csv('../data/classification/titanic.csv', sep = ';')
print(df.shape)
df.sample(frac = 1)
df.head()

(1309, 12)

Out[113]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	home.dest
0	1	1	Allen, Miss. Elisabeth Walton	female	29.00	0	0	24160	211.3375	B5	S	St Louis, MO
1	1	1	Allison, Master. Hudson Trevor	male	0.92	1	2	113781	151.5500	C22 C26	S	Montreal, PQ / Chesterville, ON
2	1	0	Allison, Miss. Helen Loraine	female	2.00	1	2	113781	151.5500	C22 C26	S	Montreal, PQ / Chesterville, ON
3	1	0	Allison, Mr. Hudson Joshua Creighton	male	30.00	1	2	113781	151.5500	C22 C26	S	Montreal, PQ / Chesterville, ON
4	1	0	Allison, Mrs. Hudson J C (Bessie Waldo Daniels)	female	25.00	1	2	113781	151.5500	C22 C26	S	Montreal, PQ / Chesterville, ON

Preprocessing¶

In [114]:

# Age
df.loc[df.age.isnull(), 'age'] = np.mean(df.age)

# Fare
df.loc[df.fare.isnull(), 'fare'] = np.mean(df.fare)


# Choix arbitraire
df.loc[df.embarked.isnull(), 'embarked'] = 'C'

# Choix arbitraire
df.loc[df['home.dest'].isnull(), 'home.dest'] = 'Autre'


# extract title
df['title'] = df.name.apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x).group(1))

for col in ['sex', 'embarked', 'home.dest', 'title']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    

# drop useless columns    
df.drop(columns = ['name','cabin', 'ticket'], inplace = True)

X = df.drop(columns = ['survived'], inplace = False)
y = df.survived

In [ ]:

In [117]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

Baseline¶

comme baseline: arbre de decision simple, not pruned, quel accuracy sur le test set On voit bien l'overfitting

In [118]:

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

yhat_proba_test  = clf.predict_proba(X_test)[:,1]
yhat_proba_train = clf.predict_proba(X_train)[:,1]

print("\n -- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}".format(roc_auc_score(y_train, yhat_proba_train)))

 -- ROC AUC Score test 0.7463

 -- ROC AUC Score train 0.9991

N arbres max_depth = 2¶

maintenant prendre 20 arbres, en limitant la taille a 2 niveaux

A chaque iteration

In [152]:

N = 200

yhat_proba = [0] * len(y_test)

roc_auc_train, roc_auc_test = [], []

for n in range(N):
    idx = X_train.sample(frac = 0.2, replace = True).index

    clf = DecisionTreeClassifier(min_samples_leaf = 100)

    clf.fit(X_train.loc[idx], y_train[idx])
    
    yhat_proba_test += clf.predict_proba(X_test)[:,1]
    
    roc_auc_train.append(roc_auc_score(y_train, clf.predict_proba(X_train)[:,1] ))
    roc_auc_test.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1] ))

yhat_proba_test = yhat_proba_test / N    

print("\n -- ROC AUC Score test Bagging {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}  {:.4f}  ".format(np.mean(roc_auc_train), np.std(roc_auc_train)))
print("\n -- ROC AUC Score test {:.4f}  {:.4f} ".format(np.mean(roc_auc_test), np.std(roc_auc_test)))

 -- ROC AUC Score test Bagging 0.8608

 -- ROC AUC Score train 0.5000  0.0000  

 -- ROC AUC Score test 0.5000  0.0000

In [ ]:

import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1,1)
plt.boxplot(roc_auc_train);
fig, ax = plt.subplots(1,1)
plt.boxplot(roc_auc_test);

Random Forest¶

In [174]:

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth = 3,n_estimators=100, bootstrap=True )

clf.fit(X_train, y_train)
print(clf)
yhat_proba_test = clf.predict_proba(X_test)[:,1]
yhat_proba_train = clf.predict_proba(X_train)[:,1]

print("\n -- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}".format(roc_auc_score(y_train, yhat_proba_train)))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

 -- ROC AUC Score test 0.8725

 -- ROC AUC Score train 0.8504

In [175]:

clf.feature_importances_

Out[175]:

array([ 0.13569291,  0.43813153,  0.03770966,  0.02240128,  0.01700829,
        0.10011347,  0.01994288,  0.0378396 ,  0.19116038])

In [176]:

X_train.columns

Out[176]:

Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',
       'home.dest', 'title'],
      dtype='object')

In [ ]: