import sys
assert sys.version_info >= (3, 6)
import numpy
assert numpy.__version__ >= "1.17.3"
import numpy as np
import pandas
assert pandas.__version__ >= "0.25.1"
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
assert seaborn.__version__ >= "0.9.0"
import seaborn as sns
import sklearn
assert sklearn.__version__ >= "0.21.3"
from sklearn import metrics
%matplotlib inline
In this data set, each observation is a tumor, described by the expression of 3,000 genes. The expression of a gene is a measure of how much of that gene is present in the biological sample. Because this affects how much of the protein this gene codes for is produced, and because proteins dictate what cells can do, gene expression gives us valuable information about the tumor. In particular, the same gene in the same individual can be expressed differently in different tissues.
The goal is to separate the endometrium tumors from the uterine ones.
# Load the data: one row per tumor, one column per gene, plus an ID and the tissue type
endometrium_data = pd.read_csv('data/endometrium_uterus_tumor.csv', sep=",")
endometrium_data.head()
# Design matrix: gene expression values only
X = endometrium_data.drop(['ID_REF', 'Tissue'], axis=1).values
# Binary labels: encode the 'Tissue' column as 0/1
y = pd.get_dummies(endometrium_data['Tissue']).values[:, 1]
from sklearn import model_selection
# Create 5 stratified cross-validation folds, reused throughout the notebook
skf = model_selection.StratifiedKFold(n_splits=5)
folds = [(tr, te) for (tr, te) in skf.split(X, y)]
Import the functions defined in the cross_validation.py file. Putting them in a module enables us to reuse code across different Jupyter notebooks.
from cross_validation import cross_validate_clf, cross_validate_clf_optimize
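The file cross_validation.py is not reproduced here. As a rough sketch, assuming cross_validate_clf returns the vector of out-of-fold positive-class probabilities and cross_validate_clf_optimize does the same for a GridSearchCV object (re-running the inner grid search on each training fold), the helpers might look like:
def cross_validate_clf(design_matrix, labels, classifier, cv_folds):
    # Out-of-fold positive-class probabilities, one entry per sample
    pred = np.zeros(labels.shape)
    for tr, te in cv_folds:
        classifier.fit(design_matrix[tr, :], labels[tr])
        pred[te] = classifier.predict_proba(design_matrix[te, :])[:, 1]
    return pred

def cross_validate_clf_optimize(design_matrix, labels, classifier, cv_folds):
    # Same, but 'classifier' is a GridSearchCV object: the inner grid search
    # is re-run on each training fold before predicting on the test fold
    pred = np.zeros(labels.shape)
    for tr, te in cv_folds:
        classifier.fit(design_matrix[tr, :], labels[tr])
        pred[te] = classifier.predict_proba(design_matrix[te, :])[:, 1]
        print(classifier.best_params_)  # inspect the selected parameters
    return pred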
A decision tree predicts the value of a target variable by learning simple decision rules inferred from the data features.
In scikit-learn, they are implemented in tree.DecisionTreeClassifier for classification and tree.DecisionTreeRegressor for regression.
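To make "decision rules" concrete, here is a small illustration on scikit-learn's built-in iris data (a toy example, unrelated to our tumor data), using tree.export_text, available from scikit-learn 0.21 on:
# Toy illustration: fit a depth-2 tree and print its decision rules
from sklearn import datasets, tree
iris = datasets.load_iris()
toy_tree = tree.DecisionTreeClassifier(max_depth=2).fit(iris.data, iris.target)
print(tree.export_text(toy_tree, feature_names=list(iris.feature_names)))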
from sklearn import tree
ypred_dt = []
for tree_index in range(5):
    clf = tree.DecisionTreeClassifier()
    pred_proba = cross_validate_clf(X, y, clf, folds)
    ypred_dt.append(pred_proba)
    print("%.3f" % metrics.accuracy_score(y, np.where(pred_proba > 0.5, 1, 0)))
fpr_dt = []  # will hold the 5 arrays of false positive rates (1 per tree)
tpr_dt = []  # will hold the 5 arrays of true positive rates (1 per tree)
auc_dt = []  # will hold the 5 areas under the ROC curve (1 per tree)
for tree_index in range(5):
    # Compute the ROC curve of the current tree
    fpr_dt_tmp, tpr_dt_tmp, thresholds = metrics.roc_curve(y, ypred_dt[tree_index], pos_label=1)
    # Compute the area under the ROC curve of the current tree
    auc_dt_tmp = metrics.auc(fpr_dt_tmp, tpr_dt_tmp)
    fpr_dt.append(fpr_dt_tmp)
    tpr_dt.append(tpr_dt_tmp)
    auc_dt.append(auc_dt_tmp)
# Plot the first 4 ROC curves
for tree_index in range(4):
    plt.plot(fpr_dt[tree_index], tpr_dt[tree_index], '-')
# Plot the last ROC curve, with a label that gives the mean/std AUC
plt.plot(fpr_dt[-1], tpr_dt[-1], '-',
         label='DT (AUC = %0.2f +/- %0.2f)' % (np.mean(auc_dt), np.std(auc_dt)))
# Label the axes and add a legend
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curves', fontsize=16)
plt.legend(loc="lower right")
Decision trees perform quite poorly on this dataset.
Question: What parameters of DecisionTreeClassifier can you play with to define trees differently than with the default parameters? Cross-validate these using a grid search with model_selection.GridSearchCV. Plot the ROC curve of the optimal decision tree on the previous plot. Did you manage to improve performance?
from sklearn import model_selection
# Define the grid of parameters to test
param_grid = {'criterion': ['gini', 'entropy'],
              'min_samples_split': [2, 4, 16, 256]}
clf = model_selection.GridSearchCV(tree.DecisionTreeClassifier(random_state=100),
                                   param_grid,
                                   scoring='roc_auc',
                                   cv=5)
# Cross-validate the GridSearchCV object
ypred_dt_opt = cross_validate_clf_optimize(X, y, clf, folds)
# Compute the ROC curve for the optimized DecisionTreeClassifier
fpr_dt_opt, tpr_dt_opt, thresholds = metrics.roc_curve(y, ypred_dt_opt, pos_label=1)
auc_dt_opt = metrics.auc(fpr_dt_opt, tpr_dt_opt)
# Plot the ROC curves of the 5 decision trees from earlier
fig = plt.figure(figsize=(5, 5))
for tree_index in range(4):
    plt.plot(fpr_dt[tree_index], tpr_dt[tree_index], '-', color='blue')
plt.plot(fpr_dt[-1], tpr_dt[-1], '-', color='blue',
         label='DT (AUC = %0.2f (+/- %0.2f))' % (np.mean(auc_dt), np.std(auc_dt)))
# Plot the ROC curve of the optimized DecisionTreeClassifier
plt.plot(fpr_dt_opt, tpr_dt_opt, color='orange', label='DT optimized (AUC=%0.2f)' % auc_dt_opt)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curves', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
There is no clearly optimal parameter setting, and the performance remains poor.
We will resort to ensemble methods to try to improve on single decision trees. Let us start with bagging trees: each tree is built on a bootstrap sample of the data, that is to say, a sample obtained by randomly drawing n points with replacement from the original data, where n is the number of points in the training set.
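To make the notion of bootstrap sample concrete, here is a quick sketch (the random seed is arbitrary):
# Draw a bootstrap sample: n indices drawn with replacement from range(n)
rng = np.random.RandomState(0)
n = X.shape[0]
bootstrap_idx = rng.choice(n, size=n, replace=True)
X_boot, y_boot = X[bootstrap_idx, :], y[bootstrap_idx]
# On average, only ~63.2% of the original points appear in a bootstrap sample
print(len(np.unique(bootstrap_idx)) / n)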
Bagging is efficient when used with low-bias, high-variance weak learners: by averaging such estimators, we lower the variance and obtain a smoother estimator, which is still centered around the true density (low bias).
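To see why decorrelation will matter below: if we average $B$ identically distributed estimators, each of variance $\sigma^2$ and with pairwise correlation $\rho$, the variance of the average is $\rho \sigma^2 + \frac{1 - \rho}{B} \sigma^2$. The second term vanishes as $B$ grows, but the first one does not: averaging reduces variance only to the extent that the estimators are decorrelated.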
Bagging decision trees hence makes sense, as fully grown decision trees have exactly this profile: low bias and high variance.
Note: Bagging trees and random forests only become really powerful when using a large number of trees (several hundred). This is computationally more intensive, especially when the number of features is large, as in this lab. For the sake of computational time, we suggest using small numbers of trees here, but you might want to repeat this lab with larger numbers of trees at home.
Question: Cross-validate a bagging ensemble of 5 decision trees on the data. Plot the resulting ROC curve, comparing it to those of the 5 decision trees you trained earlier.
from sklearn import ensemble
# Initialize a bag of 5 trees; each tree sees all of the features
clf = ensemble.BaggingClassifier(n_estimators=5, max_features=X.shape[1], random_state=100)
# Cross-validate the bagging trees on the tumor data
ypred_bt = cross_validate_clf(X, y, clf, folds)
# Compute the ROC curve of the bagging trees
fpr_bt, tpr_bt, thresholds = metrics.roc_curve(y, ypred_bt, pos_label=1)
auc_bt = metrics.auc(fpr_bt, tpr_bt)
# Plot the ROC curves of the 5 decision trees from earlier
fig = plt.figure(figsize=(5, 5))
for tree_index in range(4):
    plt.plot(fpr_dt[tree_index], tpr_dt[tree_index], '-', color='blue')
plt.plot(fpr_dt[-1], tpr_dt[-1], '-', color='blue',
         label='Decision Tree (AUC = %0.2f (+/- %0.2f))' % (np.mean(auc_dt), np.std(auc_dt)))
# Plot the ROC curve of the bagging trees
plt.plot(fpr_bt, tpr_bt, color='orange', label='Bagging Tree (AUC=%0.2f)' % auc_bt)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curves', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
Question: How do the bagging trees perform compared to individual trees?
Answer: Combining five trees enables the bagging ensemble to outperform every individual decision tree.
Question: Optimize the number of decision trees used in the bagging method, by cross-validating bagging ensembles of increasing sizes with cross_validate_clf. How many trees do you find to be an optimal choice?
# Number of trees to use
list_n_trees = [5, 10, 20, 50, 80]
# Start a ROC curve plot
fig = plt.figure(figsize=(5, 5))
for n_trees in list_n_trees:
    # Initialize a bag of trees with n_trees trees
    clf = ensemble.BaggingClassifier(n_estimators=n_trees, random_state=100)
    # Cross-validate the bagging trees on the tumor data
    ypred_bt_opt = cross_validate_clf(X, y, clf, folds)
    # Compute the ROC curve
    fpr_bt_opt, tpr_bt_opt, thresholds = metrics.roc_curve(y, ypred_bt_opt, pos_label=1)
    auc_bt_opt = metrics.auc(fpr_bt_opt, tpr_bt_opt)
    # Plot the ROC curve
    plt.plot(fpr_bt_opt, tpr_bt_opt, '-',
             label='Bagging Tree %d trees (AUC = %0.2f)' % (n_trees, auc_bt_opt))
# Plot the ROC curve of the optimal decision tree
plt.plot(fpr_dt_opt, tpr_dt_opt, label='DT optimized (AUC=%0.2f)' % auc_dt_opt)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curves', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
Increasing the number of trees improves performance, but only slightly.
In practice, simply bagging is typically not enough. In order to get a good reduction in variance, we require that the models being aggregated be uncorrelated, so that they make “different errors”. Bagging will usually get you highly correlated models that will make the same errors, and will therefore not reduce the variance of the combined predictor.
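We can check this on our data: the out-of-fold predictions of the five trees trained earlier are stored in ypred_dt, so their pairwise correlations can be inspected directly.
# Pairwise correlations between the predictions of the 5 decision trees
print(np.round(np.corrcoef(np.array(ypred_dt)), 2))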
Question: What is the difference between bagging trees and random forests? How does it intuitively fix the problem of correlation between trees?
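Answer: Both methods grow each tree on a bootstrap sample, but a random forest additionally restricts every split to a random subset of candidate features (the max_features parameter). Different trees therefore split on different variables and make less correlated errors, which is exactly what averaging needs. Schematically (the parameter values below are only illustrative):
# Bagging: every split in every tree may use all of the features
clf_bagging = ensemble.BaggingClassifier(n_estimators=5)
# Random forest: each split only considers sqrt(n_features) random candidates
clf_forest = ensemble.RandomForestClassifier(n_estimators=5, max_features='sqrt')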
# Initialize a random forest with 10 trees
clf = ensemble.RandomForestClassifier(n_estimators=10, criterion='entropy')
# Cross-validate the random forest on the tumor data
ypred_rf = cross_validate_clf(X, y, clf, folds)
# Compute the ROC curve of the random forest
fpr_rf, tpr_rf, thresholds = metrics.roc_curve(y, ypred_rf, pos_label=1)
auc_rf = metrics.auc(fpr_rf, tpr_rf)
# Plot the ROC curves of the 5 decision trees
fig = plt.figure(figsize=(5, 5))
for tree_index in range(4):
    plt.plot(fpr_dt[tree_index], tpr_dt[tree_index], '-', color='grey')
plt.plot(fpr_dt[-1], tpr_dt[-1], '-', color='grey',
         label='Decision Tree (AUC = %0.2f (+/- %0.2f))' % (np.mean(auc_dt), np.std(auc_dt)))
# Plot the ROC curve of the bagging trees (5 trees)
plt.plot(fpr_bt, tpr_bt, label='Bagging Tree (AUC=%0.2f)' % auc_bt)
# Plot the ROC curve of the random forest (10 trees)
plt.plot(fpr_rf, tpr_rf, label='Random Forest (AUC=%0.2f)' % auc_rf)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curves', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
As can be seen, with a low number of trees, the random forest performs similarly to bagging trees.
?ensemble.RandomForestClassifier
Question: What are the main parameters of a random forest that can be optimized?
Question: Use cross_validate_clf_optimize to optimize these parameters with a grid search, and compare the resulting ROC curve to those of the optimized decision tree and bagging trees.
# Define the grid of parameters to test
param_grid = {'criterion': ['gini', 'entropy'],
              'max_features': ['auto', 'log2'],
              'min_samples_leaf': [1, 2, 5],
              'n_estimators': [5, 10, 20, 50, 80]}
# Initialize a GridSearchCV object to cross-validate a random forest over these parameters
clf = model_selection.GridSearchCV(ensemble.RandomForestClassifier(),
                                   param_grid,
                                   scoring='roc_auc',
                                   cv=5)
# Cross-validate the GridSearchCV object
ypred_rf_opt = cross_validate_clf_optimize(X, y, clf, folds)
# Compute the ROC curve for the optimized random forest
fpr_rf_opt, tpr_rf_opt, thresholds = metrics.roc_curve(y, ypred_rf_opt, pos_label=1)
auc_rf_opt = metrics.auc(fpr_rf_opt, tpr_rf_opt)
fig = plt.figure(figsize=(5, 5))
# Plot the ROC curve of the optimized DecisionTreeClassifier
plt.plot(fpr_dt_opt, tpr_dt_opt, color='grey', label='DT optimized (AUC=%0.2f)' % auc_dt_opt)
# Plot the ROC curve of the bagging trees with 80 trees (the last ensemble from the loop above)
plt.plot(fpr_bt_opt, tpr_bt_opt, label='BT 80 trees (AUC=%0.2f)' % auc_bt_opt)
# Plot the ROC curve of the optimized random forest
plt.plot(fpr_rf_opt, tpr_rf_opt, label='RF optimized (AUC = %0.2f)' % auc_rf_opt)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curves', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
Question: How do your tree-based classifiers compare to regularized logistic regression models?
Plot the corresponding ROC curves.
from sklearn import linear_model
# Evaluate an optimized l1-regularized logistic regression
param_grid = {'C': np.logspace(-3, 3, 7)}
clf = model_selection.GridSearchCV(linear_model.LogisticRegression(penalty='l1', solver='liblinear'),
                                   param_grid,
                                   scoring='roc_auc',
                                   cv=3)
ypred_l1 = cross_validate_clf_optimize(X, y, clf, folds)
fpr_l1, tpr_l1, thresholds_l1 = metrics.roc_curve(y, ypred_l1, pos_label=1)
auc_l1 = metrics.auc(fpr_l1, tpr_l1)
print('nb features of best sparse model:', len(np.where(clf.best_estimator_.coef_ != 0)[0]))
# Evaluate an optimized l2-regularized logistic regression
clf = model_selection.GridSearchCV(linear_model.LogisticRegression(penalty='l2', solver='liblinear'),
                                   param_grid,
                                   scoring='roc_auc',
                                   cv=3)
ypred_l2 = cross_validate_clf_optimize(X, y, clf, folds)
fpr_l2, tpr_l2, thresholds_l2 = metrics.roc_curve(y, ypred_l2, pos_label=1)
auc_l2 = metrics.auc(fpr_l2, tpr_l2)
# Plot the ROC curves
fig = plt.figure(figsize=(5, 5))
plt.plot(fpr_rf_opt, tpr_rf_opt,
         label='RF optimized (AUC = %0.2f)' % auc_rf_opt)
plt.plot(fpr_bt_opt, tpr_bt_opt,
         label='BT optimized (AUC = %0.2f)' % auc_bt_opt)
plt.plot(fpr_l1, tpr_l1,
         label='l1 optimized (AUC = %0.2f)' % auc_l1)
plt.plot(fpr_l2, tpr_l2,
         label='l2 optimized (AUC = %0.2f)' % auc_l2)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curves', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
As can be seen, the random forest only slightly outperforms the regularized logistic regressions ($L_1$, i.e. lasso-like, and $L_2$, i.e. ridge-like). A much simpler linear model hence achieves similar performance. Moreover, the performance of the $L_1$-regularized logistic regression on the endometrium-vs-uterus dataset suggests that a small subset of the features (genes) is enough to build good predictive models of gene expression data.
It is worth noticing that tree-based methods 'naturally' compute a measure of feature importance, which can be directly used to select features. Indeed, at each split in each tree, the improvement in the split criterion is the importance measure attributed to the splitting variable. These importances are accumulated over all trees in the forest.
In scikit-learn, this feature importance is accessible via the feature_importances_ attribute of random forests, and can be exploited through the meta-transformer feature_selection.SelectFromModel.
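As a quick illustration (a sketch on the data already loaded; the number of trees is arbitrary), the most important genes according to a random forest can be listed directly:
# Fit a random forest and display the 10 genes with the largest importances
rf = ensemble.RandomForestClassifier(n_estimators=50, random_state=100)
rf.fit(X, y)
gene_names = endometrium_data.drop(['ID_REF', 'Tissue'], axis=1).columns
top10 = np.argsort(rf.feature_importances_)[::-1][:10]
print(gene_names[top10])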
from sklearn.feature_selection import SelectFromModel
from sklearn import pipeline
from sklearn import ensemble
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# Create the folds
skf = StratifiedKFold(n_splits=5)
folds = [(tr, te) for (tr, te) in skf.split(X, y)]
THRESHOLD_OPTIONS = ['mean', '1.5*mean', '2*mean', '5*mean']
C_OPTIONS = np.logspace(-3, 3, 7)
N_ESTIMATORS_OPTIONS = [10, 20, 50]
param_grid = [
    {
        'feature_selection': [SelectFromModel(ensemble.RandomForestClassifier(n_estimators=50))],
        'feature_selection__threshold': THRESHOLD_OPTIONS,
        'classification': [ensemble.RandomForestClassifier()],
        'classification__n_estimators': N_ESTIMATORS_OPTIONS,
    },
    {
        'feature_selection': [SelectFromModel(ensemble.RandomForestClassifier(n_estimators=50))],
        'feature_selection__threshold': THRESHOLD_OPTIONS,
        'classification': [linear_model.LogisticRegression(penalty='l2')],
        'classification__C': C_OPTIONS,
    },
]
pipe = pipeline.Pipeline([
    ('feature_selection', SelectFromModel(ensemble.RandomForestClassifier(n_estimators=50))),
    ('classification', ensemble.RandomForestClassifier())
])
grid = GridSearchCV(pipe, cv=folds, n_jobs=3, param_grid=param_grid, scoring='roc_auc')
grid.fit(X, y)
grid_l1_log_reg = GridSearchCV(linear_model.LogisticRegression(penalty='l1', solver='liblinear'),
                               cv=folds, n_jobs=3, param_grid={'C': C_OPTIONS}, scoring='roc_auc')
grid_l1_log_reg.fit(X, y)
Let us now use a DataFrame to display the results of our evaluation procedure.
# Collect the parameter names used across all grid points
data_frame = {name: [] for i in range(len(grid.cv_results_['params'])) for name in grid.cv_results_['params'][i]}
data_frame['score'] = []
# Sort the grid points by decreasing mean test score
sorted_index_score = np.argsort(grid.cv_results_['mean_test_score'])[::-1]
for ind in sorted_index_score:
    data_frame['score'].append(grid.cv_results_['mean_test_score'][ind])
    for name in data_frame.keys():
        if name in grid.cv_results_['params'][ind]:
            data_frame[name].append(grid.cv_results_['params'][ind][name])
        elif name != 'score':
            data_frame[name].append(None)
pd.DataFrame(data_frame).head(10)
data_frame = {name: [] for i in range(len(grid_l1_log_reg.cv_results_['params']))
              for name in grid_l1_log_reg.cv_results_['params'][i]}
data_frame['score'] = []
sorted_index_score = np.argsort(grid_l1_log_reg.cv_results_['mean_test_score'])[::-1]
for ind in sorted_index_score:
    data_frame['score'].append(grid_l1_log_reg.cv_results_['mean_test_score'][ind])
    for name in data_frame.keys():
        if name in grid_l1_log_reg.cv_results_['params'][ind]:
            data_frame[name].append(grid_l1_log_reg.cv_results_['params'][ind][name])
        elif name != 'score':
            data_frame[name].append(None)
pd.DataFrame(data_frame).head(10)
print('number of original features:', X.shape[1])
tree_based_feature_selection = SelectFromModel(estimator=ensemble.RandomForestClassifier(n_estimators=50),
                                               threshold='mean')
tree_based_feature_selection.fit(X, y)
print('number of features selected by the random forest:', len(tree_based_feature_selection.get_support(indices=True)))
l1_clf = linear_model.LogisticRegression(penalty='l1', C=100, solver='liblinear')
l1_clf.fit(X, y)
print('number of features selected by log. reg. with L1 regularization:', len(np.where(l1_clf.coef_ != 0)[0]))