#!/usr/bin/env python
# coding: utf-8

# # Bagging trees on Titanic
#
# sklearn.tree.DecisionTreeClassifier
#
# * Create the train and test sets.
# * As a baseline: a single, unpruned decision tree; what accuracy does it reach on the test set?
# * Now take 20 trees, limiting their depth to 2 levels.
# * For each tree, predict the probabilities of the test-set samples.
# * Then average the probabilities and use the result to determine the predicted class.
# * What accuracy does that reach on the test set?
#
# (The exercise says "accuracy"; the cells below score with ROC AUC instead.)

# In[112]:

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


# In[113]:

df = pd.read_csv('../data/classification/titanic.csv', sep=';')
print(df.shape)
df = df.sample(frac=1)  # shuffle the rows (the original call discarded its result)
df.head()


# # Preprocessing

# In[114]:

# Age: fill missing values with the mean
df.loc[df.age.isnull(), 'age'] = np.mean(df.age)
# Fare: fill missing values with the mean
df.loc[df.fare.isnull(), 'fare'] = np.mean(df.fare)
# Arbitrary choice
df.loc[df.embarked.isnull(), 'embarked'] = 'C'
# Arbitrary choice
df.loc[df['home.dest'].isnull(), 'home.dest'] = 'Autre'

# Extract the title from the name (raw string avoids the invalid-escape warning)
df['title'] = df.name.apply(lambda x: re.search(r' ([A-Z][a-z]+)\.', x).group(1))

for col in ['sex', 'embarked', 'home.dest', 'title']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Drop columns that are no longer useful
df.drop(columns=['name', 'cabin', 'ticket'], inplace=True)

X = df.drop(columns=['survived'], inplace=False)
y = df.survived


# In[ ]:

# Quick look at the target
y


# In[117]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)


# # Baseline
#
# As a baseline: a single, unpruned decision tree. What score does it reach on the test set?
#
# The overfitting is clearly visible: the train AUC sits far above the test AUC.

# In[118]:

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

yhat_proba_test = clf.predict_proba(X_test)[:, 1]
yhat_proba_train = clf.predict_proba(X_train)[:, 1]

print("\n -- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}".format(roc_auc_score(y_train, yhat_proba_train)))


# # N trees with max_depth = 2
#
# The exercise asks for 20 trees limited to 2 levels of depth; the cell below
# uses N = 200 trees regularized with min_samples_leaf = 100 instead.
#
# At each iteration: draw a bootstrap sample of the train set, fit a tree on it,
# and accumulate its predicted test-set probabilities; at the end, average them.

# In[152]:

N = 200
# Accumulator must start from a fresh zero array; the original reused
# yhat_proba_test left over from the baseline cell, biasing the average.
yhat_proba_test = np.zeros(len(y_test))
roc_auc_train, roc_auc_test = [], []

for n in range(N):
    # Bootstrap sample: 20% of the train set, drawn with replacement
    idx = X_train.sample(frac=0.2, replace=True).index
    clf = DecisionTreeClassifier(min_samples_leaf=100)
    clf.fit(X_train.loc[idx], y_train.loc[idx])
    yhat_proba_test += clf.predict_proba(X_test)[:, 1]
    roc_auc_train.append(roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1]))
    roc_auc_test.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

yhat_proba_test = yhat_proba_test / N

print("\n -- ROC AUC Score test Bagging {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f} {:.4f}".format(np.mean(roc_auc_train), np.std(roc_auc_train)))
print("\n -- ROC AUC Score test {:.4f} {:.4f}".format(np.mean(roc_auc_test), np.std(roc_auc_test)))


# In[ ]:

fig, ax = plt.subplots(1, 1)
ax.boxplot(roc_auc_train);

fig, ax = plt.subplots(1, 1)
ax.boxplot(roc_auc_test);


# ![](https://gph.is/2c69qZB)
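# As a cross-check (not part of the original exercise), the manual loop above can be
# reproduced with sklearn's BaggingClassifier: same base tree, same number of
# estimators, same 20% bootstrap samples. Note the first parameter is named
# `estimator` in scikit-learn >= 1.2 (`base_estimator` in older versions).

# In[ ]:

from sklearn.ensemble import BaggingClassifier

bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(min_samples_leaf=100),
    n_estimators=N,     # 200, matching the manual loop
    max_samples=0.2,    # 20% bootstrap samples, matching frac=0.2
    bootstrap=True,
    random_state=2,
)
bag.fit(X_train, y_train)
# predict_proba already averages the per-tree probabilities
print("\n -- ROC AUC Score test BaggingClassifier {:.4f}".format(
    roc_auc_score(y_test, bag.predict_proba(X_test)[:, 1])))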
# # Random Forest
#
# A random forest is bagging plus one extra trick: each split only considers a
# random subset of the features, which further decorrelates the trees.

# In[174]:

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=3, n_estimators=100, bootstrap=True)
clf.fit(X_train, y_train)
print(clf)

yhat_proba_test = clf.predict_proba(X_test)[:, 1]
yhat_proba_train = clf.predict_proba(X_train)[:, 1]

print("\n -- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}".format(roc_auc_score(y_train, yhat_proba_train)))


# In[175]:

clf.feature_importances_


# In[176]:

X_train.columns


# In[ ]:
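# The two cells above print the importances and the column names separately; a small
# sketch (not in the original notebook) pairs them so they can be read together.

# In[ ]:

# Importance of each feature, most important first
pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)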