PhD Candidate University of York
Workshop Repository: https://github.com/vb690/introduction_bayesian_analysis
Introduction
Bayesian Approach to Inference
PyMC3
Applications
%load_ext autoreload
%autoreload 2
from tqdm import tqdm
import numpy as np
from scipy.stats import binom, beta, norm
from sklearn.datasets import make_regression
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
Number of Reviews | Number of Outcomes | Reviews Outcomes |
---|---|---|
4 | 3 | Good, Bad, Good |
HYPOTHESIS | NUMBER OF WAYS [Good, Bad, Good] CAN APPEAR |
---|---|
0.0 | 0 |
0.25 | 3 |
0.50 | 8 |
0.75 | 9 |
1.0 | 0 |
See Gelman and Loken
HYPOTHESIS | PRIOR BELIEF | NUMBER OF WAYS [Bad] CAN APPEAR | WAYS X PRIOR |
---|---|---|---|
0.0 | 0 | 4 | 0 |
0.25 | 3 | 3 | 9 |
0.50 | 8 | 2 | 16 |
0.75 | 9 | 1 | 9 |
1.0 | 0 | 0 | 0 |
HYPOTHESIS | PRIOR BELIEF | NUMBER OF WAYS [Bad] CAN APPEAR | $\frac{WAYS \times PRIOR}{Normalizing}$ |
---|---|---|---|
0.0 | 0 | 4 | 0 |
0.25 | 3 | 3 | 0.26 |
0.50 | 8 | 2 | 0.47 |
0.75 | 9 | 1 | 0.26 |
1.0 | 0 | 0 | 0 |
from modules.visuals import visualize_binomial_update
visualize_binomial_update(
n_tests=6,
parameter_name='Good Reviews',
remapper={0: 'Bad', 1: 'Good'},
parameter_space=np.linspace(0, 1, 5),
figsize=(10, 5),
hist=True,
auto=False,
color='skyblue'
)
Plausibility of observing 333 good reviews out of 656 total assuming a proportion of good reviews of 0.5?
How the plausibility of observing 333 good reviews out of 656 total varies assuming proportion of good reviews that consider all the values going from 0.0 to 1.0?
Without seeing any data I can assume that there is nearly zero plausibility that the proportion of good reviews is 0.0 or 1.0
$Beta(\alpha, \beta)$
After seeing some data I update my prior. Now it is more plausibile that the proportion of Good reviews is close to 1.0
from modules.visuals import visualize_priors_effect
TOTAL_REVIEWS = 65
GOOD_REVIEWS = 33
PARAMETER_SPACE = np.linspace(0, 1, 100)
PRIORS = {
'Not a Clue': np.array([1] * len(PARAMETER_SPACE)),
'We Expect to be Ok': beta(2, 2).pdf(PARAMETER_SPACE),
'We Expect to be Good': beta(5, 2).pdf(PARAMETER_SPACE),
'It HAS to be Good': beta(10, 2).pdf(PARAMETER_SPACE),
}
visualize_priors_effect(
parameter_space=PARAMETER_SPACE,
priors=PRIORS,
likelihood=binom.pmf(GOOD_REVIEWS, TOTAL_REVIEWS, p=PARAMETER_SPACE),
figsize=(20, 5)
)
A Western traveler asks an Oriental philosopher to describe the nature of the world:
“It is a great ball resting on the flat back of the world turtle.”
“Ah yes, but what does the world turtle stand on?”
“On the back of a still larger turtle.”
“Yes, but what does he stand on?”
“A very perceptive question. But it’s no use, mister; it’s turtles all the way down.”
TOTAL_REVIEWS = 656
GOOD_REVIEWS = 333
with pm.Model() as p_esitmation_model:
# prior distribution of the paramter to estimate
theta = pm.Beta(
alpha=2,
beta=2,
name='parameter_p')
# the outocme distribution
outcome = pm.Binomial(
p=theta,
observed=GOOD_REVIEWS,
n=TOTAL_REVIEWS,
name='outcome'
)
GOOD_REVIEWS = np.random.binomial(666, 0.52, 100)
def build_model(observed):
"""
Function for building a PyMC3 model
Arguments:
observed: value or array of values, data to fit the model on
Returns:
estimation_model: A PyMC3 model
"""
with pm.Model() as esitmation_model:
# prior distribution of the paramter to estimate
p = pm.Beta(
alpha=3,
beta=3,
name='proportion'
)
# prior distribution of the paramter to estimate
n = pm.Poisson(
mu=666,
name='total reviews'
)
# the outocme distribution
outcome = pm.Binomial(
p=p,
observed=GOOD_REVIEWS,
n=n,
name='good reviews'
)
return esitmation_model
with build_model(GOOD_REVIEWS):
graph = pm.model_graph.model_to_graphviz()
graph
with build_model(GOOD_REVIEWS):
# we sample from the prior distribution of
# the paramters
prior_checks = pm.sample_prior_predictive(
samples=1000
)
from modules.visuals import visualize_bivariate_parameter_grid
visualize_bivariate_parameter_grid(
parameter_1=prior_checks['total reviews'],
parameter_2=prior_checks['proportion'],
parameter_1_name='Total Number Reviews',
parameter_2_name='Proportion of Good Reviews',
height=8
)
with build_model(GOOD_REVIEWS):
# find the posterior distribution using MCMC
train_traces = pm.sample(
draws=2000,
chains=2,
tune=1000,
cores=1
)
Sequential sampling (2 chains in 1 job) CompoundStep >NUTS: [proportion] >Metropolis: [total reviews]
Sampling 2 chains for 1_000 tune and 2_000 draw iterations (2_000 + 4_000 draws total) took 7 seconds. The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge. The estimated number of effective samples is smaller than 200 for some parameters.
with build_model(GOOD_REVIEWS):
# sample from the posterior distribution of the parameters
posterior_checks = pm.sample_posterior_predictive(
train_traces,
var_names=['total reviews', 'proportion', 'good reviews']
)
pm.plot_trace(train_traces)
with build_model(GOOD_REVIEWS):
# visualize the posterior distribution
visualize_bivariate_parameter_grid(
parameter_1=posterior_checks['total reviews'],
parameter_2=posterior_checks['proportion'],
parameter_1_name='Total Number Reviews',
parameter_2_name='Proportion of Good Reviews',
height=7
)
print(pm.summary(train_traces)[['mean', 'sd', 'hdi_3%', 'hdi_97%', 'r_hat']])
mean sd hdi_3% hdi_97% r_hat total reviews 674.574 17.086 647.000 704.000 1.83 proportion 0.515 0.013 0.492 0.537 1.82
plt.figure(figsize=(10, 10))
sns.distplot(posterior_checks['good reviews'])
plt.xlabel('Predicted Number of Good Reviews')
plt.show()
C:\Users\valeriob\AppData\Local\Continuum\anaconda3\envs\workshop_env\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
PyMC3 - Framework | scikit-learn - Toolbox |
---|---|
Flexible | Constrained |
Modelling / Analysis | Machine-learning |
White-box | Grey-box |
Non-smooth API | Smooth API |
In theory works on GCP | Built for GCP |
More Thinking | Less Thinking |
More Tinkering | Less Tinkering |
More Thinking | Less Thinking |
More Control | Less Control |
More Insights | Less Insights |
# sklearn will do "bayesian" fitting
# but will return point estimate for the parameters
from sklearn.linear_model import BayesianRidge
ridge_model = BayesianRidge(
alpha_1=0.01, # prior Gamma
alpha_2=0.01, # prior Gamma
lambda_1=0.01, # prior Gamma
lambda_2=0.01 # prior Gamma
)
X = np.random.random(100)
y = np.random.random(100)
# PyMC3 will return distribution for the parameters
with pm.Model() as ridge_model:
# I decide that my intercpet comes from a normal distribution
intercept = pm.Normal(
'Intercept',
mu=0.,
sigma=1.
)
# I decide which type of prior I want to put on the slope
slope_sigma = pm.Gamma(
'Slope Sigma',
alpha=0.01,
beta=0.01
)
slope = pm.Normal(
'Slope',
mu=0,
sd=slope_sigma ,
)
mu = intercept + slope*X
sigma = pm.HalfCauchy(
'sigma',
beta=10.
)
# I decide that my outcome is normally distributed
outcome = pm.Normal(
'y',
mu=mu,
sd=sigma,
observed=y
)
from modules.visuals import visualize_bivariate_relationship
X, y, coef = make_regression(
n_samples=100,
n_features=1,
bias=200,
noise=50,
coef=True,
random_state=56
)
X = (X - X.mean()) / X.std()
X = X.flatten()
print(f'Real Coefficient {coef}')
visualize_bivariate_relationship(
X=X,
y=y,
X_label='Advertising Spending',
y_label='Net Revenue',
title=f'Impact of Advertising Spending on Net Revenue',
)
Real Coefficient 12.04988641563446
from modules.stats.models.linear_models import BivariateRegression
model = BivariateRegression(
intercept_prior=(0, 100),
slope_prior=(0, 100),
likelihood_sigma_prior=100,
X=X,
y=y
)
model.show_plate()
model.show_prior_summary()
new_parameters = {
'intercept_prior': (200, 10),
'slope_prior': (15, 1),
'likelihood_sigma_prior': 10
}
model.riparametrize_priors(new_parameters=new_parameters)
model.show_prior_summary()
model.fit(
MAP=False,
draws=1000,
tune=2000,
target_accept=.90,
chains=2,
cores=1,
init='adapt_diag'
)
Auto-assigning NUTS sampler... Initializing NUTS using adapt_diag... Sequential sampling (2 chains in 1 job) NUTS: [Sigma, Slope, Intercept]
Sampling 2 chains for 2_000 tune and 1_000 draw iterations (4_000 + 2_000 draws total) took 9 seconds.
model.show_posterior_summary(parameters_name=['Intercept', 'Slope'])
MCMC Estimates mean sd hdi_3% hdi_97% Intercept 190.884 3.874 183.494 197.808 Slope 15.098 0.993 13.337 16.927
from modules.utils.data_utils import generate_poisson_ar_data
from modules.visuals import visualize_time_series
process, true_parameters = generate_poisson_ar_data(
lam_int=50,
slope_a=5,
slope_b=5,
burn_factor=3,
time_steps=48
)
X = process[:-1]
y = process[1:]
X_tr = X[:-12]
y_tr = y[:-12]
X_ts = X[-12:]
y_ts = y[-12:]
visualize_time_series(
X=[i for i in range(len(process))],
y=process,
prediction_point=36,
X_label='Months',
y_label='Number of Visits',
figsize=(8, 8)
)
for parameter_name, parameter_value in true_parameters.items():
print(f'{parameter_name}: {parameter_value}')
intercept: 46 slope: 0.34198991038831933
from modules.stats.models.linear_models import PoissonAR1
ar_1_model = PoissonAR1(
X=X_tr,
y=y_tr,
intercept_prior=50,
slope_prior=(5, 5),
)
ar_1_model.show_plate()
ar_1_model.fit(
MAP=False,
draws=9000,
tune=1000,
chains=2,
cores=1,
target_accept=.90
)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Sequential sampling (2 chains in 1 job) NUTS: [Slope, Intercept]
Sampling 2 chains for 1_000 tune and 9_000 draw iterations (2_000 + 18_000 draws total) took 28 seconds.
ar_1_model.show_posterior_summary(parameters_name=['Intercept', 'Slope'])
MCMC Estimates mean sd hdi_3% hdi_97% Intercept 49.842 0.992 47.903 51.621 Slope 0.311 0.024 0.268 0.358
prediction = ar_1_model.predict(X_ts, y_ts, verbose=False)
visualize_time_series(
X=[i for i in range(len(process))],
y=process,
prediction=prediction['y'],
prediction_point=36,
X_label='Months',
y_label='Number of Visits',
figsize=(8, 8)
)
prediction = []
x = X_ts[0]
for step in tqdm(range(12)):
posterior_prediction = ar_1_model.predict(x, 1, verbose=False)
prediction.append(posterior_prediction['y'])
x = posterior_prediction['y'].mean()
prediction = np.vstack(prediction).T
33%|███████████████████████████▋ | 4/12 [02:08<04:20, 32.62s/it]
visualize_time_series(
X=[i for i in range(len(process))],
y=process,
prediction=prediction,
prediction_point=36,
X_label='Months',
y_label='Number of Visits',
figsize=(8, 8)
)
from modules.utils.data_utils import generate_game_difficulty_data
df, true_parameters = generate_game_difficulty_data(
players=10,
levels=5,
n_sessions=100
)
df.head(10)
visualize_bivariate_parameter_grid(
parameter_1=true_parameters['level_difficulty'],
parameter_2=true_parameters['player_ability'],
parameter_1_name='Level Difficulty',
parameter_2_name='Player Ability',
height=7
)
from modules.visuals import visualize_heatmap
visualize_heatmap(
df=true_parameters,
pivot_varaibles=('player_ability', 'level_difficulty', 'probability_success'),
rounding=4,
figsize=(8, 8),
)
visualize_bivariate_relationship(
X=true_parameters['delta'],
y=true_parameters['probability_success'],
X_label='$\Delta$(Difficulty, Ability)',
y_label='Probability Success',
title=f'Probability Succeding Level as Function Difficulty - Ability',
c=true_parameters['probability_success'],
cmap='rocket'
)
from modules.stats.models.graphical_models import GameDifficultyModel
PLAYERS_ID = df['player_id'].values
LEVELS_ID = df['level'].values
ATTEMPTS = df['num_attempts'].values
SUCCESSES = df['num_success'].values
game_difficulty_model = GameDifficultyModel(
players_id=PLAYERS_ID,
levels_id=LEVELS_ID,
num_attempts=ATTEMPTS,
num_success=SUCCESSES
)
game_difficulty_model.show_plate()
game_difficulty_model.fit(
MAP=False,
draws=3000,
tune=2000,
chains=2,
cores=1,
target_accept=.90
)
game_difficulty_model.show_posterior_summary(
parameters_name=[
'player_ability_mu',
'hyper_sigma',
'level_difficulty_mu',
'player_ability',
'level_difficulty'
],
compact=True
)
game_difficulty_model.show_forest_plot(
parameters_name=['player_ability']
)
game_difficulty_model.show_forest_plot(
parameters_name=['level_difficulty']
)
from modules.utils.data_utils import generate_polynomial_data
from modules.stats.models.linear_models import PolynomialRegression
df, true_parameters = generate_polynomial_data(
X=[i for i in range(100)],
degree=2,
noise_ratio=0.1
)
X = df['X'].values
y = df['y'].values
visualize_bivariate_regression(
X=X,
y=y,
X_label='Predictor',
y_label='Outcome',
title=f'Quadratic Relationship',
)
models = {
'Linear': BivariateNormalRegression(
intercept_prior=(0, 1),
slope_prior=(0, 1),
likelihood_sigma_prior=1,
X=X,
y=y
),
'Quadratic': PolynomialRegression(
X=X,
y=y,
cubic=False
),
'Cubic': PolynomialRegression(
X=X,
y=y,
cubic=True
)
}
for name, model in models.items():
print(f'Sampling from {name}')
model.fit(
MAP=False,
draws=1000,
tune=1000,
chains=2,
cores=1,
target_accept=.90,
init='adapt_diag'
)
df_results = pm.compare(
{name: model.get_traces() for name, model in models.items()},
ic='loo',
scale='log'
)
df_results
pm.plot_compare(
df_results,
figsize=(10, 5)
)