#!/usr/bin/env python
# coding: utf-8

# # Bayesian Coresets applied to the Phishing Dataset
#
# *(Better displayed in [nbviewer](https://nbviewer.jupyter.org/), as red warnings in `<font>` tags may not be displayed on GitHub)*
#
# In this notebook we evaluate the use of Bayesian coresets \[1\]\[2\] using Tensorflow/Edward \[4\] on the phishing dataset \[5\] (loosely following the experiments in \[1\]). As before, we rely on the code made available by Trevor Campbell for computing coresets \[3\].
#
# This notebook builds on the previous notebook, to which we refer for a more detailed description of the code.
#
# The notebook is divided into five parts:
# 1. **Setup**: defining the data and the model;
# 2. **Baseline**: setting up an SVM baseline;
# 3. **Full-dataset Bayesian inference**: performing inference and posterior prediction on the full dataset;
# 4. **Coreset Bayesian inference**: running coreset computation and performing inference and posterior prediction on the coreset;
# 5. **Evaluation**: comparing the three approaches above in terms of accuracy and time.
#
# **ISSUES**: still to correct/review: (i) proper way to handle coreset weights instead of upsampling; (ii) bottleneck at the Tensorflow evaluation of the gradient.

# ## 1. Setup
#
# In this first section we define parameters and auxiliary functions, we load the data, and we define the statistical model.

# ### Importing libraries

# In[2]:


import numpy as np
from scipy import stats, sparse
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
import tensorflow as tf
import edward as ed
import bcoresets as bc
import time


# ### Setting a random seed

# In[2]:


np.random.seed(742)


# ### Defining the projection code
#
# See the previous notebook for an explanation of this code.

# In[3]:


class _Projection(object):
    def __init__(self, N, projection_dim, approx_posterior):
        self.dim = approx_posterior.shape[0].value
        self.x = np.zeros((N, 0))
        self.approx_posterior = approx_posterior
        self.update_dimension(projection_dim)
        return

    def update_dimension(self, projection_dim):
        if projection_dim < self.x.shape[1]:
            self.x = self.x[:, :projection_dim]

        if projection_dim > self.x.shape[1]:
            old_dim = self.x.shape[1]
            w = np.zeros((self.x.shape[0], projection_dim))
            w[:, :old_dim] = self.x
            w *= np.sqrt(old_dim)
            for j in range(projection_dim - old_dim):
                w[:, j + old_dim] = self._sample_component()
            w /= np.sqrt(projection_dim)
            self.x = w
        return

    def get(self):
        return self.x.copy()


# In[4]:


class ProjectionF(_Projection):
    def __init__(self, data, grad_log_likelihood, projection_dim, approx_posterior):
        self.data = data
        self.grad_log_likelihood = grad_log_likelihood
        _Projection.__init__(self, data.shape[0], projection_dim, approx_posterior)

    def _sample_component(self):
        sample_post = self.approx_posterior.sample().eval()
        sgll = [self.grad_log_likelihood.eval(
                    feed_dict={X: self.data[i].reshape([1, self.data.shape[1]]),
                               theta: sample_post})
                for i in range(self.data.shape[0])]
        sgll = np.array(sgll)
        return np.sqrt(self.dim) * sgll[:, np.random.randint(self.dim)]
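# As a purely illustrative aside (not part of the original notebook), the toy cell below exercises `_Projection` with a dummy component sampler and a stand-in for the approximate posterior, to show how `update_dimension` grows the projection matrix returned by `get()`. The `FakeDim`, `FakePosterior` and `RandomProjectionDemo` classes are made up for this sketch only; it assumes the cells above have been run and uses nothing beyond NumPy.

# In[ ]:


class FakeDim(object):
    """Mimics a Tensorflow dimension object exposing `.value` (illustration only)."""
    def __init__(self, value):
        self.value = value


class FakePosterior(object):
    """Stand-in exposing only the `shape[0].value` attribute used by `_Projection`."""
    def __init__(self, dim):
        self.shape = [FakeDim(dim)]


class RandomProjectionDemo(_Projection):
    """Toy subclass whose projection components are plain Gaussian noise."""
    def _sample_component(self):
        return np.random.randn(self.x.shape[0])


demo = RandomProjectionDemo(10, 4, FakePosterior(3))
print(demo.get().shape)    # (10, 4): ten data points projected on 4 random components
demo.update_dimension(6)
print(demo.get().shape)    # (10, 6): two extra components sampled, existing ones rescaled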
# ### Defining the parameters of the simulation
#
# We define the parameters of the simulation: the number of random projection dimensions and of coreset points for the coreset computation (*nrandomdims*, *ncoresamples*); the number of samples, the step size, the burn-in length and the thinning factor for the MC simulation (*n_mcsamples*, *n_mcstepsize*, *n_mcburnin*, *n_mcthinning*); and the number of samples for posterior prediction (*n_ppcsamples*).

# In[5]:


nrandomdims = 50
ncoresamples = 50
n_mcsamples = 10000
n_mcstepsize = 5.4e-2
n_mcburnin = int(n_mcsamples / 2)
n_mcthinning = 5
n_ppcsamples = 100


# ### Loading the data
#
# We load the data and randomly partition it into a training set (approximately 90%) and a test set (approximately 10%).

# In[6]:


D = datasets.load_svmlight_file('./data/phishing')
X = sparse.csr_matrix.todense(D[0])
y = D[1]

nsamples = X.shape[0]
nfeatures = X.shape[1]

trainmask = [(np.random.rand() > .1) for _ in range(nsamples)]
Xtr = X[trainmask, :]
ytr = y[trainmask]
Xte = X[np.logical_not(trainmask), :]
yte = y[np.logical_not(trainmask)]


# ### Defining the model
#
# We define our standard Bayesian logistic regression model as:
# $$
# Y \vert \theta \sim \mathrm{Bern}\left(p = \sigma(X \theta)\right)
# $$
# where $\theta$ is the vector of parameters (one component per feature) and $\sigma(x) = \frac{1}{1+e^{-x}}$ is the logistic function.
#
# We also define the log-likelihood of the model and we use Tensorflow to obtain the gradient of the log-likelihood.

# In[8]:


X = tf.placeholder(tf.float32, [None, nfeatures])
theta = ed.models.Normal(loc=tf.zeros(nfeatures), scale=tf.ones(nfeatures))
y = ed.models.Bernoulli(probs=tf.sigmoid(ed.dot(X, theta)))

log_likelihood = tf.log(tf.sigmoid(ed.dot(X, theta)))
grad_log_likelihood = tf.gradients(log_likelihood, [theta])[0]


# ## 2. Baseline
#
# We train a simple classifier on the data to set a baseline.

# ### Linear SVM
#
# We instantiate and train a linear SVM. We record its running time (*t_svm*) and its prediction output (*ypred_svm*).

# In[7]:


t0 = time.time()

svmmodel = svm.LinearSVC()
svmmodel.fit(Xtr, ytr)
ypred_svm = svmmodel.predict(Xte)

t_svm = time.time() - t0


# ## 3. Full-dataset Bayesian inference
#
# We now perform Bayesian inference on the full dataset.

# ### Hamiltonian Monte Carlo
#
# We run inference on the full dataset using Hamiltonian Monte Carlo. As before, we record the running time (*t_hmcfull*) and the prediction output (*ypred_hmcfull*).

# In[9]:


t0 = time.time()

thetachain = ed.models.Empirical(params=tf.Variable(tf.zeros([n_mcsamples, nfeatures])))
inference = ed.HMC({theta: thetachain}, {X: Xtr, y: ytr.reshape((ytr.shape[0]))})
inference.run(step_size=n_mcstepsize)

thetahat = ed.models.Empirical(params=thetachain.params.eval()[n_mcburnin:n_mcsamples:n_mcthinning])
ypost_full = ed.copy(y, {theta: thetahat})
ppc_samples = [ypost_full.eval(feed_dict={X: Xte}) for _ in range(n_ppcsamples)]
ypred_hmcfull = (np.array(ppc_samples).mean(axis=0) > .5)

t_hmcfull = time.time() - t0


# ## 4. Coreset Bayesian inference
#
# We now perform Bayesian inference on the coreset.

# ### GIGA Coreset
#
# We compute the GIGA coreset of the full dataset. This step includes computing an approximate posterior via a Laplace approximation, discretizing the log-likelihood via random projections, computing the coreset, and upsampling the selected points according to their coreset weights (see the previous notebook for an explanation of this code). We record the running time of the coreset computation (*t_giga*).

# In[10]:


t0 = time.time()

qtheta = ed.models.MultivariateNormalTriL(
    loc=tf.Variable(tf.zeros(nfeatures)),
    scale_tril=tf.Variable(tf.eye(nfeatures, nfeatures)))
inference = ed.Laplace({theta: qtheta}, {X: Xtr, y: ytr.reshape((ytr.shape[0]))})
inference.run()

randomprojection = ProjectionF(Xtr, grad_log_likelihood, nrandomdims, qtheta)
vecs = randomprojection.get()

bc_giga = bc.GIGA(vecs)
bc_giga.run(ncoresamples)
Wt = bc_giga.weights()

Xwt = Xtr[Wt > 0]
ywt = ytr[Wt > 0]
ywt = ywt.reshape((ywt.shape[0], 1))
Wt = Wt[Wt > 0]

# Upsampling: approximate each coreset weight by appending floor(Wt[i]) extra copies
# of the corresponding point to the coreset data.
for i in range(Wt.shape[0]):
    Xwt = np.vstack((Xwt, np.tile(Xwt[i, :], (np.int32(np.floor(Wt[i])), 1))))
    ywt = np.vstack((ywt, np.tile(ywt[i, :], (np.int32(np.floor(Wt[i])), 1))))

t_giga = time.time() - t0
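# Regarding issue (i) flagged at the top of the notebook: instead of upsampling, the coreset weights could in principle enter the inference directly, since the coreset approximation of \[1\]\[2\] replaces the full log-likelihood by the weighted sum $\sum_i w_i \log p(y_i \vert x_i, \theta)$. The cell below is only a plain-NumPy sketch of such a weighted log-posterior for the logistic model above; the helper `weighted_log_posterior` and the toy variables are made up for this illustration and are not wired into the Edward/HMC inference used in the rest of the notebook.

# In[ ]:


def weighted_log_posterior(theta_vec, Xw, yw, w):
    """Sketch only: standard-normal log-prior plus coreset-weighted Bernoulli log-likelihood."""
    logits = np.asarray(Xw.dot(theta_vec)).ravel()      # X theta for each coreset point
    p = 1. / (1. + np.exp(-logits))                     # sigmoid
    log_lik = yw * np.log(p) + (1. - yw) * np.log(1. - p)
    log_prior = -.5 * np.sum(theta_vec ** 2)            # log N(0, I) up to an additive constant
    return log_prior + np.sum(w * log_lik)


# Tiny check with random data (hypothetical, unrelated to the phishing dataset):
Xc = np.random.randn(5, nfeatures)
yc = np.random.randint(0, 2, size=5)
wc = np.random.rand(5) * 10.
print(weighted_log_posterior(np.zeros(nfeatures), Xc, yc, wc))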
# ### Hamiltonian Monte Carlo
#
# We run inference on the coreset using Hamiltonian Monte Carlo. We record the running time (*t_hmcgiga*) and the prediction output (*ypred_hmcgiga*).

# In[11]:


t0 = time.time()

thetachain = ed.models.Empirical(params=tf.Variable(tf.zeros([n_mcsamples, nfeatures])))
inference = ed.HMC({theta: thetachain}, {X: Xwt, y: ywt.reshape((ywt.shape[0]))})
inference.run(step_size=n_mcstepsize)

thetahat = ed.models.Empirical(params=thetachain.params.eval()[n_mcburnin:n_mcsamples:n_mcthinning])
ypost_giga = ed.copy(y, {theta: thetahat})
ppc_samples = [ypost_giga.eval(feed_dict={X: Xte}) for _ in range(n_ppcsamples)]
ypred_hmcgiga = (np.array(ppc_samples).mean(axis=0) > .5)

t_hmcgiga = time.time() - t0


# ## 5. Evaluation
#
# We now compare the results of the three models considered above (linear SVM, HMC on the full dataset, HMC on the coreset).

# ### Accuracy
#
# We print out the *accuracy* and the *confusion matrix* of the three models.

# In[13]:


print('Performance using SVM on the full dataset')
print('Accuracy: {0}'.format(metrics.accuracy_score(yte, ypred_svm)))
print('Confusion matrix: {0}\n'.format(metrics.confusion_matrix(yte, ypred_svm)))

print('Performance using HMC on the full dataset')
print('Accuracy: {0}'.format(metrics.accuracy_score(yte, ypred_hmcfull)))
print('Confusion matrix: {0}\n'.format(metrics.confusion_matrix(yte, ypred_hmcfull)))

print('Performance using HMC on the GIGA coreset')
print('Accuracy: {0}'.format(metrics.accuracy_score(yte, ypred_hmcgiga)))
print('Confusion matrix: {0}'.format(metrics.confusion_matrix(yte, ypred_hmcgiga)))


# ### Time
#
# We print out the computation *time* of the three models.

# In[14]:


print('Timings:')
print('Running SVM: {0} secs'.format(t_svm))
print('Running HMC on the full dataset: {0} secs'.format(t_hmcfull))
print('Running GIGA: {0} secs'.format(t_giga))
print('Running HMC on the GIGA coreset: {0} secs'.format(t_hmcgiga))


# ## References
#
# \[1\] Campbell, T. & Broderick, T. Automated Scalable Bayesian Inference via Hilbert Coresets. arXiv preprint arXiv:1710.05053, 2017.
#
# \[2\] Campbell, T. & Broderick, T. Bayesian Coreset Construction via Greedy Iterative Geodesic Ascent. arXiv preprint arXiv:1802.01737, 2018.
#
# \[3\] [Bayesian Coresets: Automated, Scalable Inference](https://github.com/trevorcampbell/bayesian-coresets)
#
# \[4\] Tran, D.; Kucukelbir, A.; Dieng, A. B.; Rudolph, M.; Liang, D. & Blei, D. M. Edward: A library for probabilistic modeling, inference, and criticism. arXiv preprint arXiv:1610.09787, 2016.
#
# \[5\] Chang, C.-C. & Lin, C.-J. LIBSVM: a library for support vector machines. ACM Transactions on Intelligent Systems and Technology, 2:27:1--27:27, 2011. [Phishing dataset](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#phishing)