#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import *
from sklearn.gaussian_process.kernels import ConstantKernel as C


# In[2]:


np.random.seed(1)


# # Gaussian Processes Regression

# A random variable $X$ follows a Gaussian distribution with mean $\mu$ and variance $\sigma^{2}$
# if its probability density function (PDF) is:
# $$
# P_{X}(x)=\frac{1}{\sqrt{2\pi}\sigma}\exp{\left(-\frac{(x-\mu)^{2}}{2\sigma^{2}}\right)}
# $$

# In[3]:


# General form of a Gaussian PDF with mean mu and variance sigma2
def dist(x, mu, sigma2):
    return 1/np.sqrt(2*np.pi*sigma2)*np.exp(-(x - mu)**2/(2*sigma2))


# In[4]:


xs = np.linspace(-5, 5, 100)
plt.plot(xs, dist(xs, 0, 0.2), label=r"$\mu=0$, $\sigma^{2}=0.2$")
plt.plot(xs, dist(xs, 0, 1), label=r"$\mu=0$, $\sigma^{2}=1$")
plt.plot(xs, dist(xs, 0, 5), label=r"$\mu=0$, $\sigma^{2}=5$")
plt.grid()
plt.xlabel("$x$")
plt.ylabel("PDF")
plt.xlim(-5, 5)
plt.vlines(0, -0.1, 1.1, linestyle="dashed", color="red", linewidth=0.8)
plt.title("General Gaussian Distribution")
plt.savefig("NormalDistribution")
plt.legend();


# Since a Gaussian distribution is completely characterized by the parameters $\mu$ and $\sigma^{2}$, it can be denoted as:
# $$
# P_{X}(x)=\mathcal{N}(\mu,\sigma^{2})
# $$
# For two random variables with Gaussian distributions, we have a multivariate distribution. The parameter $\mu$ becomes a vector of means and $\sigma^{2}$ becomes a covariance matrix carrying the correlation information between the variables:
# $$
# P_{X_{1},X_{2}}(x,y)=\mathcal{N}\left(\left[\begin{matrix}\mu_1\\\mu_2\end{matrix}\right],
# \left[\begin{matrix}\sigma_{X_{1}}^{2}&\sigma_{X_{1},X_{2}}^{2}\\\sigma_{X_{2},X_{1}}^{2}&\sigma_{X_{2}}^{2}\end{matrix}\right]\right)
# $$
# The above can be generalized to the case of infinitely many random variables, where the vector of means is replaced by a continuous mean function $m(x)$ and the covariance matrix by a covariance function $k(x,x')$, known as the kernel:
# $$
# f(x)\sim\mathcal{GP}(m(x),k(x,x'))
# $$
# This is known as a Gaussian process, defined as a collection of random variables any finite subset of which has a joint Gaussian distribution.
# In conjunction with Bayesian statistical tools, Gaussian processes have applications in regression and classification problems.

# ## Regression of a linear model using _scikit-learn_

# The simplest form of regression ignores the variance, or noise, of the observational data; that is, it assumes that the measurements carry no uncertainties.
#
# The file `Linear_example.txt` contains a mock data set constructed from a linear function. A random value was added to the dependent variable $y$ to generate random scatter in the data.
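# The exact construction of `Linear_example.txt` is not reproduced here; as a minimal
# sketch (the slope, intercept, noise level and sampling below are illustrative
# assumptions, and the output is written to a separate file so the original data
# are not overwritten), such a mock set could be generated as follows:

# In[ ]:


# Hypothetical mock-data generator: y = a*x + b plus uniform random scatter
a, b = 2.0, 1.0                                    # assumed slope and intercept
x_mock = np.linspace(1, 10, 25)                    # assumed sampling of x
y_mock = a*x_mock + b + np.random.uniform(-2, 2, len(x_mock))
pd.DataFrame({'x': x_mock, 'y': y_mock}).to_csv(
    'Linear_example_mock.txt', sep=' ', index=False)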
# In[5]:


# Reading data
data = pd.read_table('Linear_example.txt', sep=r'\s+')
# Independent variable in the data
x = np.atleast_2d(np.asarray(data)[:, 0]).T
# Dependent variable in the data
y = np.asarray(data)[:, 1]


# * Null variances:

# In[6]:


# MODEL ESTIMATION
# Prior: constant amplitude times a Matern covariance
k = C(1.0)*Matern()
gp = GaussianProcessRegressor(kernel=k, n_restarts_optimizer=20)
# Inference
gp.fit(x, y);
# Values where the model is evaluated to make predictions
xs = np.atleast_2d(np.linspace(0.9, 10.1, 1000)).T
# Predictions and standard deviations of each
y_pred, sigma = gp.predict(xs, return_std=True)


# In[7]:


def graph(f, sigma):
    plt.scatter(x, y, color="black", s=12, label='Data')
    plt.plot(xs, f, label='Model', color="midnightblue")
    plt.fill(np.concatenate([xs, xs[::-1]]),
             np.concatenate([f - 2*sigma, (f + 2*sigma)[::-1]]),
             color="blueviolet", alpha=.35, ec='None', label='95.5% Confidence Level')
    plt.fill(np.concatenate([xs, xs[::-1]]),
             np.concatenate([f - 3*sigma, (f + 3*sigma)[::-1]]),
             color="blue", alpha=.15, ec='None', label='99.7% Confidence Level')
    plt.xlabel('$X$')
    plt.ylabel('$Y$')
    plt.xlim(0.9, 10.1)
    plt.legend(loc='upper left')
    plt.grid();


# In[8]:


graph(y_pred, sigma)
plt.savefig("LinearGP");


# * Random variances:

# In[9]:


# Random uncertainties (standard deviations) for the data
dy = np.random.normal(5, 2*np.random.rand(len(y)))
# Estimation of the model: alpha adds the variances dy**2
# to the diagonal of the kernel matrix
k = C(1.0)*Matern()
gp = GaussianProcessRegressor(kernel=k, alpha=dy**2, n_restarts_optimizer=10)
gp.fit(x, y)
y_pred, sigma = gp.predict(xs, return_std=True)
graph(y_pred, sigma)
plt.errorbar(x.ravel(), y, dy, linestyle="None", elinewidth=1, color="black")
plt.savefig("LinearErrorGP");


# ## Estimation of the derivatives of a function using _GaPP_

# In[10]:


import gapp
from gapp import dgp
from gapp import covariance


# In[11]:


# Mock data set
X = np.linspace(0, 2*np.pi, 30)
Y = np.sin(X) + np.random.uniform(-0.15, 0.15, len(X))
# Observational uncertainties (standard deviations) of the mock measurements
Sigma = np.random.uniform(0.1, 0.3, len(X))

# Reconstruction of the function and its first three derivatives
xmin = 0.0
xmax = 2*np.pi
nstar = 200
initheta = [0.7, 1.4]
g = dgp.DGaussianProcess(X, Y, Sigma, cXstar=(xmin, xmax, nstar))
(rec, theta) = g.gp(theta=initheta, thetatrain='True')
(drec, theta) = g.dgp(theta=initheta, thetatrain='True')
(d2rec, theta) = g.d2gp(theta=initheta, thetatrain='True')
(d3rec, theta) = g.d3gp(theta=initheta, thetatrain='True')
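# As a quick sanity check (an addition, not part of the original pipeline), we can
# count how often the true function $\sin(x)$ falls inside the $2\sigma$ band of the
# reconstruction `rec`, whose columns are the evaluation points, the reconstructed
# mean and its standard deviation; for a well-calibrated reconstruction this
# fraction should be roughly 0.95:

# In[ ]:


inside = np.abs(np.sin(rec[:, 0]) - rec[:, 1]) < 2*rec[:, 2]
print("Fraction of evaluation points within 2 sigma: %.2f" % inside.mean())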
# In[12]:


def plot(X, Y, Sigma, rec, drec, d2rec, d3rec, xlim_a, xlim_b, ylim_a, ylim_b):
    fig, axs = plt.subplots(2, 2, figsize=(13, 8))
    # Reconstructions of y and its first three derivatives, with the
    # corresponding analytic curves (red) for comparison
    recs = [rec, drec, d2rec, d3rec]
    labels = ["$y(x)$", "$y'(x)$", "$y''(x)$", "$y'''(x)$"]
    true_fs = [np.sin, np.cos, lambda t: -np.sin(t), lambda t: -np.cos(t)]
    ts = np.linspace(0, 2*np.pi, 100)
    for ax, r, label, f in zip(axs.ravel(), recs, labels, true_fs):
        # 2-sigma and 3-sigma confidence bands of the reconstruction
        ax.fill_between(r[:, 0], r[:, 1] + 2*r[:, 2], r[:, 1] - 2*r[:, 2],
                        facecolor='blueviolet', alpha=.35)
        ax.fill_between(r[:, 0], r[:, 1] + 3*r[:, 2], r[:, 1] - 3*r[:, 2],
                        facecolor='blue', alpha=.15)
        ax.plot(r[:, 0], r[:, 1], color='midnightblue')  # reconstructed mean
        ax.plot(ts, f(ts), color='red')                  # true analytic curve
        ax.set_xlabel("$x$", fontsize=17)
        ax.set_ylabel(label, fontsize=17)
        ax.grid()
        ax.set_xlim([xlim_a, xlim_b])
        ax.set_ylim([ylim_a, ylim_b])
    # Only the first panel shows the noisy data with its error bars
    axs[0, 0].scatter(X, Y, s=12, color="black")
    axs[0, 0].errorbar(X, Y, Sigma, color='black', elinewidth=1, linestyle='None');


# In[13]:


plot(X, Y, Sigma, rec, drec, d2rec, d3rec, 0, 2*np.pi, -2, 2)
plt.savefig('Derivatives');
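# The reconstruction above relies on GaPP's default squared-exponential kernel.
# The `covariance` module imported earlier provides alternative covariance
# functions; as a sketch (assuming the `Matern92` class listed in the GaPP
# documentation, which its authors suggest when higher derivatives are needed),
# a Matérn ($\nu=9/2$) kernel can be selected through the `covfunction` argument:

# In[ ]:


g92 = dgp.DGaussianProcess(X, Y, Sigma, covfunction=covariance.Matern92,
                           cXstar=(xmin, xmax, nstar))
(rec92, theta92) = g92.gp(theta=initheta, thetatrain='True')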