sklearn.tree.DecisionTreeClassifier
import pandas as pd
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
df = pd.read_csv('../data/classification/titanic.csv', sep = ';')
print(df.shape)
df.sample(frac = 1)
df.head()
(1309, 12)
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | home.dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.00 | 0 | 0 | 24160 | 211.3375 | B5 | S | St Louis, MO |
1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.92 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON |
2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.00 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON |
3 | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.00 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON |
4 | 1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.00 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON |
# Age
df.loc[df.age.isnull(), 'age'] = np.mean(df.age)
# Fare
df.loc[df.fare.isnull(), 'fare'] = np.mean(df.fare)
# Choix arbitraire
df.loc[df.embarked.isnull(), 'embarked'] = 'C'
# Choix arbitraire
df.loc[df['home.dest'].isnull(), 'home.dest'] = 'Autre'
# extract title
df['title'] = df.name.apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x).group(1))
for col in ['sex', 'embarked', 'home.dest', 'title']:
le = LabelEncoder()
df[col] = le.fit_transform(df[col])
# drop useless columns
df.drop(columns = ['name','cabin', 'ticket'], inplace = True)
X = df.drop(columns = ['survived'], inplace = False)
y = df.survived
y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
comme baseline: arbre de decision simple, not pruned, quel accuracy sur le test set On voit bien l'overfitting
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
yhat_proba_test = clf.predict_proba(X_test)[:,1]
yhat_proba_train = clf.predict_proba(X_train)[:,1]
print("\n -- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}".format(roc_auc_score(y_train, yhat_proba_train)))
-- ROC AUC Score test 0.7463 -- ROC AUC Score train 0.9991
maintenant prendre 20 arbres, en limitant la taille a 2 niveaux
A chaque iteration
N = 200
yhat_proba = [0] * len(y_test)
roc_auc_train, roc_auc_test = [], []
for n in range(N):
idx = X_train.sample(frac = 0.2, replace = True).index
clf = DecisionTreeClassifier(min_samples_leaf = 100)
clf.fit(X_train.loc[idx], y_train[idx])
yhat_proba_test += clf.predict_proba(X_test)[:,1]
roc_auc_train.append(roc_auc_score(y_train, clf.predict_proba(X_train)[:,1] ))
roc_auc_test.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1] ))
yhat_proba_test = yhat_proba_test / N
print("\n -- ROC AUC Score test Bagging {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f} {:.4f} ".format(np.mean(roc_auc_train), np.std(roc_auc_train)))
print("\n -- ROC AUC Score test {:.4f} {:.4f} ".format(np.mean(roc_auc_test), np.std(roc_auc_test)))
-- ROC AUC Score test Bagging 0.8608 -- ROC AUC Score train 0.5000 0.0000 -- ROC AUC Score test 0.5000 0.0000
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(1,1)
plt.boxplot(roc_auc_train);
fig, ax = plt.subplots(1,1)
plt.boxplot(roc_auc_test);
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth = 3,n_estimators=100, bootstrap=True )
clf.fit(X_train, y_train)
print(clf)
yhat_proba_test = clf.predict_proba(X_test)[:,1]
yhat_proba_train = clf.predict_proba(X_train)[:,1]
print("\n -- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}".format(roc_auc_score(y_train, yhat_proba_train)))
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=3, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False) -- ROC AUC Score test 0.8725 -- ROC AUC Score train 0.8504
clf.feature_importances_
array([ 0.13569291, 0.43813153, 0.03770966, 0.02240128, 0.01700829, 0.10011347, 0.01994288, 0.0378396 , 0.19116038])
X_train.columns
Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'home.dest', 'title'], dtype='object')