#!/usr/bin/env python # coding: utf-8 # # p-values with Python from scratch # Calculating p-values with python. # # # ## Libraries and helper functions # In[1]: from typing import Tuple import math as m # In[2]: def normal_probability_below(x: float, mu: float = 0, sigma: float = 1) -> float: return (1 + m.erf((x - mu) / m.sqrt(2) / sigma)) / 2 # In[3]: def normal_probability_above(lo: float, mu: float = 0, sigma: float = 1) -> float: return 1 - normal_probability_below(lo, mu, sigma) # In[4]: def normal_approximation_to_binomial(n: int, p: float) -> Tuple[float, float]: mu = p * n sigma = m.sqrt(p * (1 - p) * n) return mu, sigma # ## Two-sided p-values # In[5]: def two_sided_p_value(x: float, mu: float = 0, sigma: float = 1) -> float: """Return the probability of getting at least as extreme value as `x`, given that our values are from a normal distribution with `mu` mean and `sigma` std. """ # If x is greater than the mean return everything above x if x >= mu: return 2 * normal_probability_above(x, mu, sigma) # If x is less than the mean than return everything below x else: return 2 * normal_probability_below(x, mu, sigma) # E.g. if our normal distribution has a mean of 500 and standard deviation of 15, the probabilty to get 530 or above is ~5.78% # In[6]: mu_0, sigma_0 = normal_approximation_to_binomial (1000 , 0.5 ) mu_0, sigma_0 # In[7]: two_sided_p_value(529.5, mu_0, sigma_0) # As the p-value is greater than 5%, so we don't reject the null hypothesis. # However, if we look at values beyond 32 distance from the mean, however, the probability will be less than 5%. # In[8]: two_sided_p_value(531.5, mu_0, sigma_0) # ## Validating with simulation # In[9]: import random # Run 1000 simulations with 1000 binomial trials extreme_value_count = 0 for _ in range(1000): num_heads = sum(1 if random.random() < 0.5 else 0 for _ in range(1000)) # Count the 'extreme' values (i.e. beyond 30 distance from the mean) if num_heads >= 530 or num_heads <= 470: extreme_value_count += 1 extreme_value_count / 1000 # assert 610 < extreme_value_count < 630, f"{extreme_value_count}" # ## One-sided p-values # In[10]: normal_probability_above(524.5, mu_0, sigma_0) # In[11]: normal_probability_above(526.5, mu_0, sigma_0)