#!/usr/bin/env python
# coding: utf-8

# # A short guide on how to use mixed random forests

# ## Introduction

# This document gives a short tutorial on how to use the mixed random forest (mixed RF) for feature selection
# and prediction in Python. The reader may benefit from having some background on Gaussian process
# prediction and on the standard random forest as provided by the Python scikit-learn package.
# However, knowledge of neither is required to follow the steps of this tutorial. All source files for the
# following examples can be found in the examples directory of the mixed RF module.

# ## Examples

# For each of the following examples we need to load the mixed RF module, some helper functions, the scipy
# library, and matplotlib for plotting.

# In[1]:

# Load the limix module for the mixed random forest
from limix.ensemble.lmm_forest import Forest as LMF
from limix.ensemble import lmm_forest_utils as utils
import scipy as sp
import pylab as pl
# Activate inline plotting
get_ipython().run_line_magic('matplotlib', 'inline')

# ### Example 1: Recovering a single fixed effect

# First, we need to create some data.

# In[2]:

sp.random.seed(43)
n_sample = 100
# Two feature dimensions: a regular grid on [0, 1] and uniform random samples
X = sp.empty((n_sample, 2))
X[:, 0] = sp.arange(0, 1, 1.0/n_sample)
X[:, 1] = sp.random.rand(n_sample)
# Step-function fixed effect driven by the first dimension, plus independent Gaussian noise
noise = sp.random.randn(n_sample, 1)*.05
y_fixed = (X[:, 0:1] > .5)*.5
y_fn = y_fixed + noise

# Here we consider a simulated data set of size 100. For each sample we simulate features in 2D, where the
# first dimension is sampled from a grid on [0, 1] and the second dimension contains random samples from the
# uniform distribution on the unit interval. We combine the features of all samples into the matrix $\mathbf{X}$. The
# fixed effect $\mathbf{y}_\text{fixed}$ depends only on the first feature dimension. In addition, we add some Gaussian
# noise to obtain the simulated observation $\mathbf{y}_\text{fn}$.
#
# The second part of the simulation adds structured noise to the observations $\mathbf{y}_\text{fn}$. The samples become
# connected (i.e. correlated) through the simulated covariance.

# In[3]:

# Add a small diagonal term to make sure the kernel is positive definite
kernel = utils.getQuadraticKernel(X[:, 0], d=0.0025) + sp.eye(n_sample)*1e-8
y_conf = .5*sp.random.multivariate_normal(sp.zeros(n_sample), kernel).reshape(-1, 1)
y_tot = y_fn + y_conf

# We can now visualize our simulated data (figure below).

# In[4]:

pl.plot(X[:, 0], y_fn, 'c')
pl.plot(X[:, 0], y_conf + y_fn, 'm')
pl.plot(X[:, 0], y_conf + y_fixed, 'g')
pl.legend(['fixed effect + ind. noise', 'fixed effect + confounding + ind. noise',
           'fixed effect + confounding'], bbox_to_anchor=(1.0, 1.4))

# Next, we divide our data into a training and a test sample.

# In[5]:

training_sample = sp.zeros(n_sample, dtype='bool')
training_sample[sp.random.permutation(n_sample)[:sp.int_(.66*n_sample)]] = True
test_sample = ~training_sample
X_train = X[training_sample]
y_train = y_tot[training_sample]

# We proceed by fitting the standard random forest to the training sample. We can do so using the mixed
# RF module with the identity $\mathbf{I}$ as the covariance matrix (i.e. setting kernel='iid').

# In[6]:

random_forest = LMF(kernel='iid')
random_forest.fit(X_train, y_train)
response_rf = random_forest.predict(X[test_sample])

# For fitting the mixed RF we need to pick the rows and columns of the covariance according to the training
# sample indices. For prediction we need to use the cross covariance between training and test samples.
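# As a brief aside (not part of the original tutorial), sp.ix_ builds an open mesh from the given index
# arrays, so that kernel[sp.ix_(rows, cols)] extracts the sub-matrix whose rows are selected by rows and
# whose columns are selected by cols. The following minimal sketch on a small toy matrix illustrates this;
# the variable names are hypothetical and used for illustration only.

# In[ ]:

# Illustration only: extract a 2x2 sub-block of a 4x4 toy matrix with sp.ix_
K_toy = sp.arange(16).reshape(4, 4)
rows = sp.array([True, False, True, False])   # boolean mask selecting rows 0 and 2
cols = sp.array([False, True, False, True])   # boolean mask selecting columns 1 and 3
print(K_toy[sp.ix_(rows, cols)])              # prints the sub-matrix [[1, 3], [9, 11]]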
# In[7]:

kernel_train = kernel[sp.ix_(training_sample, training_sample)]
kernel_test = kernel[sp.ix_(test_sample, training_sample)]
lm_forest = LMF(kernel=kernel_train)
lm_forest.fit(X_train, y_train)
response_lmf = lm_forest.predict(X[test_sample], k=kernel_test)

# Finally, we plot the results of our prediction.

# In[8]:

pl.plot(X[:, 0:1], y_fixed + y_conf, 'g')
pl.plot(X[test_sample, 0:1], response_lmf, '.b-.')
pl.plot(X[test_sample, 0:1], response_rf, '.k-.')
pl.ylabel('predicted y')
pl.xlabel('first dimension of X')
pl.legend(['ground truth', 'mixed RF', 'RF'], loc=2)

# ### Example 2: Recovering interactions and evaluating feature importances

# In the second toy example we predict held-out simulated data where the phenotype is caused by an
# interaction of features in addition to the random effect. Now our feature vector is a sorted array of
# integers ranging from 0 to $2^8-1=255$. We can easily convert this into an 8D feature vector using the
# binary encoding of each integer, i.e. 0 $\rightarrow$ (0, 0, 0, 0, 0, 0, 0, 0), 1 $\rightarrow$ (0, 0, 0, 0, 0, 0, 0, 1), ...
# and 255 $\rightarrow$ (1, 1, 1, 1, 1, 1, 1, 1).

# In[9]:

sp.random.seed(42)
n_samples = 2**8
x = sp.arange(n_samples).reshape(-1, 1)
X = utils.convertToBinaryPredictor(x)

# Our simulated fixed effect shall be an interaction of the first and third feature dimensions.

# In[10]:

y_fixed = X[:, 0:1] * X[:, 2:3]

# Like in the previous example, we simulate confounding by adding a sample from a multivariate Gaussian
# with a quadratic covariance function. In addition, we add a small amount of independent Gaussian noise.

# In[11]:

kernel = utils.getQuadraticKernel(x, d=200) + sp.eye(n_samples)*1e-8
y_conf = y_fixed.copy()
y_conf += sp.random.multivariate_normal(sp.zeros(n_samples), kernel).reshape(-1, 1)
y_conf += .1*sp.random.randn(n_samples, 1)

# After splitting the data, we fit both the mixed RF and the standard random forest on the training
# sample. The test sample is then used to obtain predictions.

# In[12]:

(training, test) = utils.crossValidationScheme(2, n_samples)
lm_forest = LMF(kernel=kernel[sp.ix_(training, training)])
lm_forest.fit(X[training], y_conf[training])
response_tot = lm_forest.predict(X[test], k=kernel[sp.ix_(test, training)])
# make a standard random forest prediction for comparison
random_forest = LMF(kernel='iid')
random_forest.fit(X[training], y_conf[training])
response_iid = random_forest.predict(X[test])

# So far everything is analogous to the previous example. In addition, the mixed RF allows us to predict
# the fixed effect only. All we need to do is drop the cross covariance from the parameters of the
# prediction function, i.e.

# In[13]:

response_fixed = lm_forest.predict(X[test])

# The results of all predictions and the ground truth can be seen in the figure below. To visualize the
# whole binary predictor matrix $\mathbf{X}$ in a single dimension we just need to plot the prediction against the
# original (decimal) feature vector $\mathbf{x}$.

# In[14]:

pl.plot(x, y_fixed, 'g--')
pl.plot(x, y_conf, '.7')
pl.plot(x[test], response_tot, 'r-.')
pl.plot(x[test], response_fixed, 'c-.')
pl.plot(x[test], response_iid, 'b-.')
pl.title('prediction')
pl.xlabel('genotype (in decimal encoding)')
pl.ylabel('phenotype')
pl.legend(['fixed effect', 'fixed effect + confounding', 'mixed RF', 'mixed RF (fixed effect)', 'RF'],
          bbox_to_anchor=(1.2, 1.4), ncol=2)
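# Before looking at the feature importances, a simple sanity check (not part of the original tutorial) is
# to quantify the predictions by their mean squared error on the test sample: the full mixed RF prediction
# is compared against the total simulated phenotype, while the fixed-effect-only prediction and the
# standard RF prediction are compared against the simulated fixed effect. One would expect the mixed RF's
# fixed-effect prediction to lie closer to the fixed effect than the standard RF prediction.

# In[ ]:

# Sanity check (illustrative only): mean squared errors of the predictions on the test sample
truth_total = y_conf[test].ravel()    # total simulated phenotype (fixed effect + confounding + noise)
truth_fixed = y_fixed[test].ravel()   # simulated fixed effect only
mse_tot = sp.mean((response_tot.ravel() - truth_total)**2)      # mixed RF vs. total phenotype
mse_fixed = sp.mean((response_fixed.ravel() - truth_fixed)**2)  # mixed RF (fixed effect only) vs. fixed effect
mse_iid = sp.mean((response_iid.ravel() - truth_fixed)**2)      # standard RF vs. fixed effect
print('MSE mixed RF (total): %.4f' % mse_tot)
print('MSE mixed RF (fixed effect only): %.4f' % mse_fixed)
print('MSE standard RF: %.4f' % mse_iid)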
# Finally, we evaluate the feature importances, which are automatically computed while fitting the
# forests. These scores are plotted in the figure below.

# In[15]:

feature_scores_lmf = lm_forest.log_importance
feature_scores_rf = random_forest.log_importance
n_predictors = X.shape[1]
pl.bar(sp.arange(n_predictors), feature_scores_lmf, .3, color='r')
pl.bar(sp.arange(n_predictors)+.3, feature_scores_rf, .3, color='b')
pl.title('feature importances')
pl.xlabel('feature dimension')
pl.ylabel('log feature score')
pl.xticks(sp.arange(n_predictors)+.3, sp.arange(n_predictors)+1)
pl.legend(['mixed random forest', 'random forest'])
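# As a small follow-up (not part of the original tutorial), the scores can also be inspected numerically by
# ranking the feature dimensions. Under this simulation one would expect the two interacting dimensions
# (1 and 3) to receive the highest scores from the mixed RF.

# In[ ]:

# Illustration only: rank feature dimensions by their log importance score (1-based, best first)
ranking_lmf = sp.argsort(feature_scores_lmf)[::-1] + 1
ranking_rf = sp.argsort(feature_scores_rf)[::-1] + 1
print('mixed RF ranking: %s' % str(ranking_lmf))
print('standard RF ranking: %s' % str(ranking_rf))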