# default_exp resimulation
Simulating match outcomes based on the xG of individual shots
#hide
from nbdev.showdoc import *
#export
import collections
import itertools
import numpy as np
Use Poisson-Binomial distribution calculation from https://github.com/tsakim/poibin
It looks like there are plans to package the code, but for now, just copy+paste the requisite class in here (original code is provided with MIT License).
#export
class PoiBin(object):
"""Poisson Binomial distribution for random variables.
This class implements the Poisson Binomial distribution for Bernoulli
trials with different success probabilities. The distribution describes
thus a random variable that is the sum of independent and not identically
distributed single Bernoulli random variables.
The class offers methods for calculating the probability mass function, the
cumulative distribution function, and p-values for right-sided testing.
"""
def __init__(self, probabilities):
"""Initialize the class and calculate the ``pmf`` and ``cdf``.
:param probabilities: sequence of success probabilities :math:`p_i \\in
[0, 1] \\forall i \\in [0, N]` for :math:`N` independent but not
identically distributed Bernoulli random variables
:type probabilities: numpy.array
"""
self.success_probabilities = np.array(probabilities)
self.number_trials = self.success_probabilities.size
self.check_input_prob()
self.omega = 2 * np.pi / (self.number_trials + 1)
self.pmf_list = self.get_pmf_xi()
self.cdf_list = self.get_cdf(self.pmf_list)
# ------------------------------------------------------------------------------
# Methods for the Poisson Binomial Distribution
# ------------------------------------------------------------------------------
def pmf(self, number_successes):
"""Calculate the probability mass function ``pmf`` for the input values.
The ``pmf`` is defined as
.. math::
pmf(k) = Pr(X = k), k = 0, 1, ..., n.
:param number_successes: number of successful trials for which the
probability mass function is calculated
:type number_successes: int or list of integers
"""
self.check_rv_input(number_successes)
return self.pmf_list[number_successes]
def cdf(self, number_successes):
"""Calculate the cumulative distribution function for the input values.
The cumulative distribution function ``cdf`` for a number ``k`` of
successes is defined as
.. math::
cdf(k) = Pr(X \\leq k), k = 0, 1, ..., n.
:param number_successes: number of successful trials for which the
cumulative distribution function is calculated
:type number_successes: int or list of integers
"""
self.check_rv_input(number_successes)
return self.cdf_list[number_successes]
def pval(self, number_successes):
"""Return the p-values corresponding to the input numbers of successes.
The p-values for right-sided testing are defined as
.. math::
pval(k) = Pr(X \\geq k ), k = 0, 1, ..., n.
.. note::
Since :math:`cdf(k) = Pr(X <= k)`, the function returns
.. math::
1 - cdf(X < k) & = 1 - cdf(X <= k - 1)
& = 1 - cdf(X <= k) + pmf(X = k),
k = 0, 1, .., n.
:param number_successes: number of successful trials for which the
p-value is calculated
:type number_successes: int, numpy.array, or list of integers
"""
self.check_rv_input(number_successes)
i = 0
try:
isinstance(number_successes, collections.Iterable)
pvalues = np.array(number_successes, dtype='float')
# if input is iterable (list, numpy.array):
for k in number_successes:
pvalues[i] = 1. - self.cdf(k) + self.pmf(k)
i += 1
return pvalues
except TypeError:
# if input is an integer:
if number_successes == 0:
return 1
else:
return 1 - self.cdf(number_successes - 1)
# ------------------------------------------------------------------------------
# Methods to obtain pmf and cdf
# ------------------------------------------------------------------------------
def get_cdf(self, event_probabilities):
"""Return the values of the cumulative density function.
Return a list which contains all the values of the cumulative
density function for :math:`i = 0, 1, ..., n`.
:param event_probabilities: array of single event probabilities
:type event_probabilities: numpy.array
"""
cdf = np.empty(self.number_trials + 1)
cdf[0] = event_probabilities[0]
for i in range(1, self.number_trials + 1):
cdf[i] = cdf[i - 1] + event_probabilities[i]
return cdf
def get_pmf_xi(self):
"""Return the values of the variable ``xi``.
The components ``xi`` make up the probability mass function, i.e.
:math:`\\xi(k) = pmf(k) = Pr(X = k)`.
"""
chi = np.empty(self.number_trials + 1, dtype=complex)
chi[0] = 1
half_number_trials = int(
self.number_trials / 2 + self.number_trials % 2)
# set first half of chis:
chi[1:half_number_trials + 1] = self.get_chi(
np.arange(1, half_number_trials + 1))
# set second half of chis:
chi[half_number_trials + 1:self.number_trials + 1] = np.conjugate(
chi[1:self.number_trials - half_number_trials + 1] [::-1])
chi /= self.number_trials + 1
xi = np.fft.fft(chi)
if self.check_xi_are_real(xi):
xi = xi.real
else:
raise TypeError("pmf / xi values have to be real.")
xi += np.finfo(type(xi[0])).eps
return xi
def get_chi(self, idx_array):
"""Return the values of ``chi`` for the specified indices.
:param idx_array: array of indices for which the ``chi`` values should
be calculated
:type idx_array: numpy.array
"""
# get_z:
exp_value = np.exp(self.omega * idx_array * 1j)
xy = 1 - self.success_probabilities + \
self.success_probabilities * exp_value[:, np.newaxis]
# sum over the principal values of the arguments of z:
argz_sum = np.arctan2(xy.imag, xy.real).sum(axis=1)
# get d value:
exparg = np.log(np.abs(xy)).sum(axis=1)
d_value = np.exp(exparg)
# get chi values:
chi = d_value * np.exp(argz_sum * 1j)
return chi
# ------------------------------------------------------------------------------
# Auxiliary functions
# ------------------------------------------------------------------------------
def check_rv_input(self, number_successes):
"""Assert that the input values ``number_successes`` are OK.
The input values ``number_successes`` for the random variable have to be
integers, greater or equal to 0, and smaller or equal to the total
number of trials ``self.number_trials``.
:param number_successes: number of successful trials
:type number_successes: int or list of integers """
try:
for k in number_successes:
assert (type(k) == int or type(k) == np.int64), \
"Values in input list must be integers"
assert k >= 0, 'Values in input list cannot be negative.'
assert k <= self.number_trials, \
'Values in input list must be smaller or equal to the ' \
'number of input probabilities "n"'
except TypeError:
assert (type(number_successes) == int or \
type(number_successes) == np.int64), \
'Input value must be an integer.'
assert number_successes >= 0, "Input value cannot be negative."
assert number_successes <= self.number_trials, \
'Input value cannot be greater than ' + str(self.number_trials)
return True
@staticmethod
def check_xi_are_real(xi_values):
"""Check whether all the ``xi``s have imaginary part equal to 0.
The probabilities :math:`\\xi(k) = pmf(k) = Pr(X = k)` have to be
positive and must have imaginary part equal to zero.
:param xi_values: single event probabilities
:type xi_values: complex
"""
return np.all(xi_values.imag <= np.finfo(float).eps)
def check_input_prob(self):
"""Check that all the input probabilities are in the interval [0, 1]."""
if self.success_probabilities.shape != (self.number_trials,):
raise ValueError(
"Input must be an one-dimensional array or a list.")
if not np.all(self.success_probabilities >= 0):
raise ValueError("Input probabilities have to be non negative.")
if not np.all(self.success_probabilities <= 1):
raise ValueError("Input probabilities have to be smaller than 1.")
#export
def poisson_binomial_pmf(probs, xs):
return PoiBin(probs).pmf(xs)
def resimulate_match(shots, up_to=26, min_xg=0.0001, **kwargs):
"""
'Resimulate' a match based on xG. Takes a list of maps, where each map
represents a shot has and has 'is_home' (bool) and 'xg' (float) keys.
"""
# Prevent potential underflow
home_xgs = [max(s['xg'], min_xg) for s in shots if s['is_home']]
away_xgs = [max(s['xg'], min_xg) for s in shots if not s['is_home']]
home_scores = list(range(min(len(home_xgs) + 1, up_to)))
away_scores = list(range(min(len(away_xgs) + 1, up_to)))
home_probs = dict(zip(home_scores, poisson_binomial_pmf(home_xgs, home_scores)))
away_probs = dict(zip(away_scores, poisson_binomial_pmf(away_xgs, away_scores)))
scores = []
for h, a in itertools.product(range(up_to), repeat=2):
home_prob = home_probs.get(h, 0)
away_prob = away_probs.get(a, 0)
scores.append({
'home_goals': h,
'away_goals': a,
'home_probability': home_prob,
'away_probability': away_prob,
'probability': home_prob*away_prob,
**kwargs
})
# Keep everything up to 4-4; filter out P == 0 results above that
return [
s for s in scores
if s['probability'] > 0
or (s['home_goals'] < 5 and s['away_goals'] < 5)
]
def extract_prob(probs, home_goals, away_goals):
filtered = [p for p in probs if p['home_goals'] == home_goals and p['away_goals'] == away_goals]
if len(filtered) == 0:
return 0
return filtered[0]['probability']
probs = resimulate_match([
{'is_home': True, 'xg': 0.1}
])
assert np.isclose(extract_prob(probs, 1, 0), 0.1)
shots = [
{"is_home": False, "xg": 0.030929630622267723},
{"is_home": False, "xg": 0.021505167707800865},
{"is_home": False, "xg": 0.013733051717281342},
{"is_home": False, "xg": 0.06314441561698914},
]
probs = resimulate_match(shots)
assert np.isclose(
extract_prob(probs, 0, 4),
np.product([s['xg'] for s in shots])
)