#!/usr/bin/env python # coding: utf-8 # # The empiricaldist API # # Copyright 2021 Allen Downey # # BSD 3-clause license: https://opensource.org/licenses/BSD-3-Clause # In[1]: import numpy as np import pandas as pd import matplotlib.pyplot as plt # ## A Pmf is a Series # # `empiricaldist` provides `Pmf`, which is a Pandas Series that represents a probability mass function. # In[2]: from empiricaldist import Pmf # You can create a `Pmf` in any of the ways you can create a `Series`, but the most common way is to use `from_seq` to make a `Pmf` from a sequence. # # The following is a `Pmf` that represents a six-sided die. # In[3]: d6 = Pmf.from_seq([1,2,3,4,5,6]) # By default, the probabilities are normalized to add up to 1. # In[4]: d6 # But you can also make an unnormalized `Pmf` if you want to keep track of the counts. # In[5]: d6 = Pmf.from_seq([1,2,3,4,5,6], normalize=False) d6 # Or normalize later (the return value is the prior sum). # In[6]: d6.normalize() # Now the Pmf is normalized. # In[7]: d6 # ### Properties # # In a `Pmf` the index contains the quantities (`qs`) and the values contain the probabilities (`ps`). # # These attributes are available as properties that return arrays (same semantics as the Pandas `values` property) # In[8]: d6.qs # In[9]: d6.ps # ### Plotting PMFs # # `Pmf` provides two plotting functions. `bar` plots the `Pmf` as a histogram. # In[10]: def decorate_dice(title): """Labels the axes. title: string """ plt.xlabel('Outcome') plt.ylabel('PMF') plt.title(title) # In[11]: d6.bar() decorate_dice('One die') # `plot` displays the `Pmf` as a line. # In[12]: d6.plot() decorate_dice('One die') # ### Selection # # The bracket operator looks up an outcome and returns its probability. # In[13]: d6[1] # In[14]: d6[6] # Outcomes that are not in the distribution cause a `KeyError` # # ``` # d6[7] # ``` # # You can also use parentheses to look up a quantity and get the corresponding probability. # In[15]: d6(1) # With parentheses, a quantity that is not in the distribution returns `0`, not an error. # In[16]: d6(7) # ### Mutation # # `Pmf` objects are mutable, but in general the result is not normalized. # In[17]: d6[7] = 1/6 d6 # In[18]: d6.sum() # In[19]: d6.normalize() # In[20]: d6.sum() # ### Statistics # # `Pmf` overrides the statistics methods to compute `mean`, `median`, etc. # # These functions only work correctly if the `Pmf` is normalized. # In[21]: d6 = Pmf.from_seq([1,2,3,4,5,6]) # In[22]: d6.mean() # In[23]: d6.var() # In[24]: d6.std() # ### Sampling # # `choice` chooses a random values from the Pmf, following the API of `np.random.choice` # In[25]: d6.choice(size=10) # `sample` chooses a random values from the `Pmf`, with replacement. # In[26]: d6.sample(n=10) # ## CDFs # # `empiricaldist` also provides `Cdf`, which represents a cumulative distribution function. # In[27]: from empiricaldist import Cdf # You can create an empty `Cdf` and then add elements. # # Here's a `Cdf` that represents a four-sided die. # In[28]: d4 = Cdf.from_seq([1,2,3,4]) # In[29]: d4 # ### Properties # # In a `Cdf` the index contains the quantities (`qs`) and the values contain the probabilities (`ps`). # # These attributes are available as properties that return arrays (same semantics as the Pandas `values` property) # In[30]: d4.qs # In[31]: d4.ps # ### Displaying CDFs # # `Cdf` provides two plotting functions. # # `plot` displays the `Cdf` as a line. # In[32]: def decorate_dice(title): """Labels the axes. title: string """ plt.xlabel('Outcome') plt.ylabel('CDF') plt.title(title) # In[33]: d4.plot() decorate_dice('One die') # `step` plots the Cdf as a step function (which is more technically correct). # In[34]: d4.step() decorate_dice('One die') # ### Selection # # The bracket operator works as usual. # In[35]: d4[1] # In[36]: d4[4] # ### Evaluating CDFs # # `Cdf` provides `forward` and `inverse`, which evaluate the CDF and its inverse as functions. # # Evaluating a `Cdf` forward maps from a quantity to its cumulative probability. # In[37]: d6 = Cdf.from_seq([1,2,3,4,5,6]) # In[38]: d6.forward(3) # `forward` interpolates, so it works for quantities that are not in the distribution. # In[39]: d6.forward(3.5) # In[40]: d6.forward(0) # In[41]: d6.forward(7) # You can also call the `Cdf` like a function (which it is). # In[42]: d6(1.5) # `forward` can take an array of quantities, too. # In[43]: def decorate_cdf(title): """Labels the axes. title: string """ plt.xlabel('Quantity') plt.ylabel('CDF') plt.title(title) # In[44]: qs = np.linspace(0, 7) ps = d6(qs) plt.plot(qs, ps) decorate_cdf('Forward evaluation') # `Cdf` also provides `inverse`, which computes the inverse `Cdf`: # In[45]: d6.inverse(0.5) # `quantile` is a synonym for `inverse` # In[46]: d6.quantile(0.5) # `inverse` and `quantile` work with arrays # In[47]: ps = np.linspace(0, 1) qs = d6.quantile(ps) plt.plot(qs, ps) decorate_cdf('Inverse evaluation') # These functions provide a simple way to make a Q-Q plot. # # Here are two samples from the same distribution. # In[48]: cdf1 = Cdf.from_seq(np.random.normal(size=100)) cdf2 = Cdf.from_seq(np.random.normal(size=100)) cdf1.plot() cdf2.plot() decorate_cdf('Two random samples') # Here's how we compute the Q-Q plot. # In[49]: def qq_plot(cdf1, cdf2): """Compute results for a Q-Q plot. Evaluates the inverse Cdfs for a range of cumulative probabilities. :param cdf1: Cdf :param cdf2: Cdf :return: tuple of arrays """ ps = np.linspace(0, 1) q1 = cdf1.quantile(ps) q2 = cdf2.quantile(ps) return q1, q2 # The result is near the identity line, which suggests that the samples are from the same distribution. # In[50]: q1, q2 = qq_plot(cdf1, cdf2) plt.plot(q1, q2) plt.xlabel('Quantity 1') plt.ylabel('Quantity 2') plt.title('Q-Q plot'); # Here's how we compute a P-P plot # In[51]: def pp_plot(cdf1, cdf2): """Compute results for a P-P plot. Evaluates the Cdfs for all quantities in either Cdf. :param cdf1: Cdf :param cdf2: Cdf :return: tuple of arrays """ qs = cdf1.index.union(cdf2.index) p1 = cdf1(qs) p2 = cdf2(qs) return p1, p2 # And here's what it looks like. # In[52]: p1, p2 = pp_plot(cdf1, cdf2) plt.plot(p1, p2) plt.xlabel('Cdf 1') plt.ylabel('Cdf 2') plt.title('P-P plot'); # ### Mutation # # `Cdf` objects are mutable, but in general the result is not a valid Cdf. # In[53]: d4[5] = 1.25 d4 # In[54]: d4.normalize() d4 # ### Statistics # # `Cdf` overrides the statistics methods to compute `mean`, `median`, etc. # In[55]: d6.mean() # In[56]: d6.var() # In[57]: d6.std() # ### Sampling # # `choice` chooses a random values from the Cdf, following the API of `np.random.choice` # In[58]: d6.choice(size=10) # `sample` chooses a random values from the `Cdf`, with replacement. # In[59]: d6.sample(n=10) # ### Arithmetic # # `Pmf` and `Cdf` provide `add_dist`, which computes the distribution of the sum. # Here's the distribution of the sum of two dice. # In[60]: d6 = Pmf.from_seq([1,2,3,4,5,6]) twice = d6.add_dist(d6) twice # In[61]: twice.bar() decorate_dice('Two dice') twice.mean() # To add a constant to a distribution, you could construct a deterministic `Pmf` # In[62]: const = Pmf.from_seq([1]) d6.add_dist(const) # But `add_dist` also handles constants as a special case: # In[63]: d6.add_dist(1) # Other arithmetic operations are also implemented # In[64]: d4 = Pmf.from_seq([1,2,3,4]) # In[65]: d6.sub_dist(d4) # In[66]: d4.mul_dist(d4) # In[67]: d4.div_dist(d4) # ### Comparison operators # # `Pmf` implements comparison operators that return probabilities. # # You can compare a `Pmf` to a scalar: # In[68]: d6.lt_dist(3) # In[69]: d4.ge_dist(2) # Or compare `Pmf` objects: # In[70]: d4.gt_dist(d6) # In[71]: d6.le_dist(d4) # In[72]: d4.eq_dist(d6) # Interestingly, this way of comparing distributions is [nontransitive](). # In[73]: A = Pmf.from_seq([2, 2, 4, 4, 9, 9]) B = Pmf.from_seq([1, 1, 6, 6, 8, 8]) C = Pmf.from_seq([3, 3, 5, 5, 7, 7]) # In[74]: A.gt_dist(B) # In[75]: B.gt_dist(C) # In[76]: C.gt_dist(A) # ### Joint distributions # # `Pmf.make_joint` takes two `Pmf` objects and makes their joint distribution, assuming independence. # In[77]: d4 = Pmf.from_seq(range(1,5)) d4 # In[78]: d6 = Pmf.from_seq(range(1,7)) d6 # In[79]: joint = Pmf.make_joint(d4, d6) joint # The result is a `Pmf` object that uses a MultiIndex to represent the values. # In[80]: joint.index # If you ask for the `qs`, you get an array of pairs: # In[81]: joint.qs # You can select elements using tuples: # In[82]: joint[1,1] # You can get unnnormalized conditional distributions by selecting on different axes: # In[83]: Pmf(joint[1]) # In[84]: Pmf(joint.loc[:, 1]) # But `Pmf` also provides `conditional(i,val)` which returns the conditional distribution where variable `i` has the value `val`: # In[85]: joint.conditional(0, 1) # In[86]: joint.conditional(1, 1) # It also provides `marginal(i)`, which returns the marginal distribution along axis `i` # In[87]: joint.marginal(0) # In[88]: joint.marginal(1) # Here are some ways of iterating through a joint distribution. # In[89]: for q in joint.qs: print(q) # In[90]: for p in joint.ps: print(p) # In[91]: for q, p in joint.items(): print(q, p) # In[92]: for (q1, q2), p in joint.items(): print(q1, q2, p) # In[ ]: