Author(s): Paul Miles | Date Created: July 19, 2019
Included in the pymcmcstat package is the ability to estimate the error variance as part of the sampling process. Furthermore, when using multiple data sets to inform parameter values, you can estimate error variances for each data set separately.
For more details regarding how to estimate the error variance, please refer to the pymcmcstat package documentation.
# import required packages
import numpy as np
from pymcmcstat.MCMC import MCMC
import matplotlib.pyplot as plt
import pymcmcstat
print(pymcmcstat.__version__)
1.9.0
# define model function
def modelfun(xdata, theta):
    """Evaluate the linear model y = m*x + b.

    Parameters
    ----------
    xdata : ndarray
        Independent variable, 2-D column array of shape (nrow, 1).
    theta : sequence
        Model parameters [m, b] (slope, intercept).

    Returns
    -------
    ndarray
        Model response as a 1-D array of shape (nrow,).
    """
    m = theta[0]
    b = theta[1]
    nrow = xdata.shape[0]
    # NOTE: the original pre-allocated y = np.zeros([nrow, 1]) and then
    # immediately rebound y — the allocation was dead code and is removed.
    return m*xdata.reshape(nrow,) + b
# define sum-of-squares function
def ssfun(theta, data):
    """Sum-of-squares error function for the MCMC sampler.

    Computes one sum-of-squares value per data set stored in `data`,
    comparing the linear model evaluated at `theta` against the
    observations.

    Parameters
    ----------
    theta : sequence
        Model parameters [m, b].
    data : pymcmcstat data structure
        Holds parallel lists `xdata` and `ydata`, one entry per data set.

    Returns
    -------
    ndarray
        1-D array of sum-of-squares values, one element per data set.
    """
    nsets = len(data.xdata)
    ss = np.zeros([nsets])
    for idx, (xd, yd) in enumerate(zip(data.xdata, data.ydata)):
        # evaluate model and accumulate squared residuals for this set
        resid = modelfun(xd, theta).reshape(yd.shape) - yd
        ss[idx] = (resid**2).sum()
    return ss
We consider two simulated data sets. Both are linear, but each one has a different level of observation errors. The first data set has $\varepsilon_i \sim N(0, 0.1)$, whereas the second data set has $\varepsilon_i \sim N(0, 0.5)$.
# Add data
# Simulate two linear data sets y = m*x + b with different noise levels:
# data set 1 uses noise std 0.1, data set 2 uses noise std 0.5.
nds = 100  # number of observations per data set
m = 2.
b = 3.
# data set 1: x in [2, 3], additive Gaussian noise with std 0.1
x1 = np.linspace(2, 3, num=nds).reshape(nds, 1)
y1 = m*x1 + b + 0.1*np.random.standard_normal(x1.shape)
res1 = y1 - modelfun(x1, [m, b]).reshape(y1.shape)  # residuals vs. true model
# data set 2: x in [1, 5], additive Gaussian noise with std 0.5
x2 = np.linspace(1, 5, num=nds).reshape(nds, 1)
y2 = m*x2 + b + 0.5*np.random.standard_normal(x2.shape)
res2 = y2 - modelfun(x2, [m, b]).reshape(y2.shape)
# Figure for data set 1: data + true model (left), residuals + mean line (right)
plt.figure(figsize=(8, 4))
plt.subplot(1, 2, 1)
plt.plot(x1, y1, '.b');
plt.plot(x1, modelfun(x1, [m, b]), '-r', linewidth=3);
plt.xlabel('$x_1$'); plt.ylabel('$y_1$');
plt.subplot(1, 2, 2)
plt.plot(x1, res1, '.g');
mr = res1.mean()  # mean residual — expected near zero for zero-mean noise
plt.plot([x1[0], x1[-1]], [mr, mr], '-k', linewidth=3)
plt.xlabel('$x_1$')
plt.ylabel(str('Residual, ($\\mu$ = {:5.4e})'.format(mr)));
plt.suptitle('Data Set 1')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
# Figure for data set 2: data + true model (left), residuals + mean line (right)
plt.figure(figsize=(8, 4))
plt.suptitle('Data Set 2')
plt.subplot(1, 2, 1)
plt.plot(x2, y2, '.b');
plt.plot(x2, modelfun(x2, [m, b]), '-r', linewidth = 3);
plt.xlabel('$x_2$')
plt.ylabel('$y_2$');
plt.subplot(1, 2, 2)
plt.plot(x2, res2, '.g');
mr = res2.mean()  # mean residual — expected near zero for zero-mean noise
plt.plot([x2[0], x2[-1]], [mr, mr], '-k', linewidth = 3)
plt.xlabel('$x_2$')
plt.ylabel(str('Residual, ($\\mu$ = {:5.4e})'.format(mr)));
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
We use the add_data_set method twice in order to add each data set to the MCMC data structure. The updatesigma flag must be turned on in order to include the observation errors in the sampling process.
mcstat = MCMC()
# Register both data sets; a separate error variance is sampled for each
# because updatesigma=True is set below.
mcstat.data.add_data_set(x1, y1)
mcstat.data.add_data_set(x2, y2)
mcstat.simulation_options.define_simulation_options(
    nsimu=int(5.0e4),  # number of MCMC samples
    updatesigma=True,  # sample the observation-error variances as well
    method='dram')  # delayed-rejection adaptive Metropolis
mcstat.model_settings.define_model_settings(sos_function=ssfun)
# initial values deliberately offset from the true m=2, b=3
mcstat.parameters.add_model_parameter(name='m', theta0=1.0)
mcstat.parameters.add_model_parameter(name='b', theta0=2.0)
# run mcmc
mcstat.run_simulation()
# Extract results
results = mcstat.simulation_results.results
fullchain = results['chain']  # parameter chain (columns: m, b)
fulls2chain = results['s2chain']  # error-variance chain, one column per data set
# Define burnin: discard the first half of the chain
burnin = int(fullchain.shape[0]/2)
chain = fullchain[burnin:, :]
s2chain = fulls2chain[burnin:, :]
names = results['names']
# display chain statistics
mcstat.chainstats(chain, results)
Sampling these parameters: name start [ min, max] N( mu, sigma^2) m: 1.00 [ -inf, inf] N( 0.00e+00, inf) b: 2.00 [ -inf, inf] N( 0.00e+00, inf) [-----------------100%-----------------] 50000 of 50000 complete in 12.8 sec ------------------------------ name : mean std MC_err tau geweke m : 2.0131 0.0238 0.0007 34.5162 0.9996 b : 2.9918 0.0606 0.0017 34.7970 0.9989 ------------------------------ ============================== Acceptance rate information --------------- Results dictionary: Stage 1: 4.49% Stage 2: 26.70% Net : 31.19% -> 15595/50000 --------------- Chain provided: Net : 38.79% -> 9698/25000 --------------- Note, the net acceptance rate from the results dictionary may be different if you only provided a subset of the chain, e.g., removed the first part for burnin-in. ------------------------------
from pymcmcstat import mcmcplot as mcp
# Marginal posterior densities and chain traces for the model parameters
mcp.plot_density_panel(chain, names);
mcp.plot_chain_panel(chain, names);
# Posterior densities of the observation-error standard deviations
# (sqrt converts the sampled variances to standard deviations)
mcp.plot_density_panel(np.sqrt(s2chain), ['$\\sigma_1$', '$\\sigma_2$']);
print(np.mean(np.sqrt(s2chain), axis = 0))
[0.08622487 0.49118312]
We observe that the estimated observation error standard deviations have mean values around 0.09 and 0.49. As the data was generated assuming 0.10 and 0.50, this result is in good agreement.
# generate prediction intervals
from pymcmcstat import propagation as up
from pymcmcstat.MCMC import DataStructure
def predmodel(q, data):
    """Prediction wrapper for uncertainty propagation.

    Maps a sampled parameter vector `q` onto the model response for the
    single data set stored in `data` (only `data.xdata[0]` is used).
    """
    xvals = data.xdata[0]
    return modelfun(xvals, q)
pdata = []
intervals = []
# number of data sets (2) — note this rebinds the earlier `nds` (sample count)
nds = len(mcstat.data.xdata)
for ii in range(nds):
    # wrap each data set individually so intervals are computed per set
    pdata.append(DataStructure())
    pdata[ii].add_data_set(mcstat.data.xdata[ii], mcstat.data.ydata[ii])
    # propagate parameter uncertainty plus the matching observation-error
    # chain column through the model to build prediction intervals
    intervals.append(up.calculate_intervals(chain, results, pdata[ii], predmodel,
                                            waitbar=True, s2chain=s2chain[:, ii]))
[-----------------100%-----------------] 500 of 500 complete in 0.0 sec
# plot prediction intervals
# display styles for the raw data points and the model median line
data_display = dict(marker='o', color='b', mfc='w')
model_display = dict(color='r', linestyle='--')
for ii, interval in enumerate(intervals):
    fig, ax = up.plot_intervals(
        interval,
        time=pdata[ii].xdata[0],
        ydata=pdata[ii].ydata[0],
        adddata=True,  # overlay the observations on the intervals
        model_display=model_display,
        data_display=data_display)
    # label axes per data set (1-based index for display)
    ax.set_xlabel(str('$x_{}$'.format(ii + 1)), fontsize=22)
    ax.set_ylabel(str('$y_{}$'.format(ii + 1)), fontsize=22)
    ax.set_title(str('Data Set {}'.format(ii + 1)), fontsize=22)
    ax.tick_params(labelsize=22)
We observe the expected behavior as about 95% of the data points are contained within the 95% prediction interval. The prediction intervals are generated by propagating uncertainty from the parameters and the observation error chain, so this result further supports that our algorithm successfully estimates the observation errors in the data.