#!/usr/bin/env python
# coding: utf-8

# # The empiricaldist API
# 
# Copyright 2021 Allen Downey
# 
# BSD 3-clause license: https://opensource.org/licenses/BSD-3-Clause

# In[1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# ## A Pmf is a Series
# 
# `empiricaldist` provides `Pmf`, which is a Pandas Series that represents a probability mass function.

# In[2]:


from empiricaldist import Pmf


# You can create a `Pmf` in any of the ways you can create a `Series`, but the most common way is to use `from_seq` to make a `Pmf` from a sequence.
# 
# The following is a `Pmf` that represents a six-sided die.

# In[3]:


d6 = Pmf.from_seq([1,2,3,4,5,6])


# By default, the probabilities are normalized to add up to 1.

# In[4]:


d6


# But you can also make an unnormalized `Pmf` if you want to keep track of the counts.

# In[5]:


d6 = Pmf.from_seq([1,2,3,4,5,6], normalize=False)
d6


# Or normalize later (the return value is the prior sum).

# In[6]:


d6.normalize()


# Now the Pmf is normalized.

# In[7]:


d6


# ### Properties
# 
# In a `Pmf` the index contains the quantities (`qs`) and the values contain the probabilities (`ps`).
# 
# These attributes are available as properties that return arrays (same semantics as the Pandas `values` property)

# In[8]:


d6.qs


# In[9]:


d6.ps


# ### Plotting PMFs
# 
# `Pmf` provides two plotting functions.  `bar` plots the `Pmf` as a histogram.

# In[10]:


def decorate_dice(title):
    """Labels the axes.
    
    title: string
    """
    plt.xlabel('Outcome')
    plt.ylabel('PMF')
    plt.title(title)


# In[11]:


d6.bar()
decorate_dice('One die')


# `plot` displays the `Pmf` as a line.

# In[12]:


d6.plot()
decorate_dice('One die')


# ### Selection
# 
# The bracket operator looks up an outcome and returns its probability.

# In[13]:


d6[1]


# In[14]:


d6[6]


# Outcomes that are not in the distribution cause a `KeyError`
# 
# ```
# d6[7]
# ```
# 
# You can also use parentheses to look up a quantity and get the corresponding probability.

# In[15]:


d6(1)


# With parentheses, a quantity that is not in the distribution returns `0`, not an error.

# In[16]:


d6(7)


# ### Mutation
# 
# `Pmf` objects are mutable, but in general the result is not normalized.

# In[17]:


d6[7] = 1/6
d6


# In[18]:


d6.sum()


# In[19]:


d6.normalize()


# In[20]:


d6.sum()


# ### Statistics
# 
# `Pmf` overrides the statistics methods to compute `mean`, `median`, etc.
# 
# These functions only work correctly if the `Pmf` is normalized.

# In[21]:


d6 = Pmf.from_seq([1,2,3,4,5,6])


# In[22]:


d6.mean()


# In[23]:


d6.var()


# In[24]:


d6.std()


# ### Sampling
# 
# `choice` chooses a random values from the Pmf, following the API of `np.random.choice`

# In[25]:


d6.choice(size=10)


# `sample` chooses a random values from the `Pmf`, with replacement.

# In[26]:


d6.sample(n=10)


# ## CDFs
# 
# `empiricaldist` also provides `Cdf`, which represents a cumulative distribution function.

# In[27]:


from empiricaldist import Cdf


# You can create an empty `Cdf` and then add elements.
# 
# Here's a `Cdf` that represents a four-sided die.

# In[28]:


d4 = Cdf.from_seq([1,2,3,4])


# In[29]:


d4


# ### Properties
# 
# In a `Cdf` the index contains the quantities (`qs`) and the values contain the probabilities (`ps`).
# 
# These attributes are available as properties that return arrays (same semantics as the Pandas `values` property)

# In[30]:


d4.qs


# In[31]:


d4.ps


# ### Displaying CDFs
# 
# `Cdf` provides two plotting functions.
# 
# `plot` displays the `Cdf` as a line.

# In[32]:


def decorate_dice(title):
    """Labels the axes.
    
    title: string
    """
    plt.xlabel('Outcome')
    plt.ylabel('CDF')
    plt.title(title)


# In[33]:


d4.plot()
decorate_dice('One die')


# `step` plots the Cdf as a step function (which is more technically correct).

# In[34]:


d4.step()
decorate_dice('One die')


# ### Selection
# 
# The bracket operator works as usual.

# In[35]:


d4[1]


# In[36]:


d4[4]


# ### Evaluating CDFs
# 
# `Cdf` provides `forward` and `inverse`, which evaluate the CDF and its inverse as functions.
# 
# Evaluating a `Cdf` forward maps from a quantity to its cumulative probability.

# In[37]:


d6 = Cdf.from_seq([1,2,3,4,5,6])


# In[38]:


d6.forward(3)


# `forward` interpolates, so it works for quantities that are not in the distribution.

# In[39]:


d6.forward(3.5)


# In[40]:


d6.forward(0)


# In[41]:


d6.forward(7)


# You can also call the `Cdf` like a function (which it is).

# In[42]:


d6(1.5)


# `forward` can take an array of quantities, too.

# In[43]:


def decorate_cdf(title):
    """Labels the axes.
    
    title: string
    """
    plt.xlabel('Quantity')
    plt.ylabel('CDF')
    plt.title(title)


# In[44]:


qs = np.linspace(0, 7)
ps = d6(qs)
plt.plot(qs, ps)
decorate_cdf('Forward evaluation')


# `Cdf` also provides `inverse`, which computes the inverse `Cdf`:

# In[45]:


d6.inverse(0.5)


# `quantile` is a synonym for `inverse`

# In[46]:


d6.quantile(0.5)


# `inverse` and `quantile` work with arrays 

# In[47]:


ps = np.linspace(0, 1)
qs = d6.quantile(ps)
plt.plot(qs, ps)
decorate_cdf('Inverse evaluation')


# These functions provide a simple way to make a Q-Q plot.
# 
# Here are two samples from the same distribution.

# In[48]:


cdf1 = Cdf.from_seq(np.random.normal(size=100))
cdf2 = Cdf.from_seq(np.random.normal(size=100))

cdf1.plot()
cdf2.plot()
decorate_cdf('Two random samples')


# Here's how we compute the Q-Q plot.

# In[49]:


def qq_plot(cdf1, cdf2):
    """Compute results for a Q-Q plot.
    
    Evaluates the inverse Cdfs for a 
    range of cumulative probabilities.
    
    :param cdf1: Cdf
    :param cdf2: Cdf
    
    :return: tuple of arrays
    """
    ps = np.linspace(0, 1)
    q1 = cdf1.quantile(ps)
    q2 = cdf2.quantile(ps)
    return q1, q2


# The result is near the identity line, which suggests that the samples are from the same distribution.

# In[50]:


q1, q2 = qq_plot(cdf1, cdf2)
plt.plot(q1, q2)
plt.xlabel('Quantity 1')
plt.ylabel('Quantity 2')
plt.title('Q-Q plot');


# Here's how we compute a P-P plot

# In[51]:


def pp_plot(cdf1, cdf2):
    """Compute results for a P-P plot.
    
    Evaluates the Cdfs for all quantities in either Cdf.
    
    :param cdf1: Cdf
    :param cdf2: Cdf
    
    :return: tuple of arrays
    """
    qs = cdf1.index.union(cdf2.index)
    p1 = cdf1(qs)
    p2 = cdf2(qs)
    return p1, p2


# And here's what it looks like.

# In[52]:


p1, p2 = pp_plot(cdf1, cdf2)
plt.plot(p1, p2)
plt.xlabel('Cdf 1')
plt.ylabel('Cdf 2')
plt.title('P-P plot');


# ### Mutation
# 
# `Cdf` objects are mutable, but in general the result is not a valid Cdf.

# In[53]:


d4[5] = 1.25
d4


# In[54]:


d4.normalize()
d4


# ### Statistics
# 
# `Cdf` overrides the statistics methods to compute `mean`, `median`, etc.

# In[55]:


d6.mean()


# In[56]:


d6.var()


# In[57]:


d6.std()


# ### Sampling
# 
# `choice` chooses a random values from the Cdf, following the API of `np.random.choice`

# In[58]:


d6.choice(size=10)


# `sample` chooses a random values from the `Cdf`, with replacement.

# In[59]:


d6.sample(n=10)


# ### Arithmetic
# 
# `Pmf` and `Cdf` provide `add_dist`, which computes the distribution of the sum.

# Here's the distribution of the sum of two dice.

# In[60]:


d6 = Pmf.from_seq([1,2,3,4,5,6])

twice = d6.add_dist(d6)
twice


# In[61]:


twice.bar()
decorate_dice('Two dice')
twice.mean()


# To add a constant to a distribution, you could construct a deterministic `Pmf`

# In[62]:


const = Pmf.from_seq([1])
d6.add_dist(const)


# But `add_dist` also handles constants as a special case:

# In[63]:


d6.add_dist(1)


# Other arithmetic operations are also implemented

# In[64]:


d4 = Pmf.from_seq([1,2,3,4])


# In[65]:


d6.sub_dist(d4)


# In[66]:


d4.mul_dist(d4)


# In[67]:


d4.div_dist(d4)


# ### Comparison operators
# 
# `Pmf` implements comparison operators that return probabilities.
# 
# You can compare a `Pmf` to a scalar:

# In[68]:


d6.lt_dist(3)


# In[69]:


d4.ge_dist(2)


# Or compare `Pmf` objects:

# In[70]:


d4.gt_dist(d6)


# In[71]:


d6.le_dist(d4)


# In[72]:


d4.eq_dist(d6)


# Interestingly, this way of comparing distributions is [nontransitive]().

# In[73]:


A = Pmf.from_seq([2, 2, 4, 4, 9, 9])
B = Pmf.from_seq([1, 1, 6, 6, 8, 8])
C = Pmf.from_seq([3, 3, 5, 5, 7, 7])


# In[74]:


A.gt_dist(B)


# In[75]:


B.gt_dist(C)


# In[76]:


C.gt_dist(A)


# ### Joint distributions
# 
# `Pmf.make_joint` takes two `Pmf` objects and makes their joint distribution, assuming independence.

# In[77]:


d4 = Pmf.from_seq(range(1,5))
d4


# In[78]:


d6 = Pmf.from_seq(range(1,7))
d6


# In[79]:


joint = Pmf.make_joint(d4, d6)
joint


# The result is a `Pmf` object that uses a MultiIndex to represent the values.

# In[80]:


joint.index


# If you ask for the `qs`, you get an array of pairs:

# In[81]:


joint.qs


# You can select elements using tuples:

# In[82]:


joint[1,1]


# You can get unnnormalized conditional distributions by selecting on different axes:

# In[83]:


Pmf(joint[1])


# In[84]:


Pmf(joint.loc[:, 1])


# But `Pmf` also provides `conditional(i,val)` which returns the conditional distribution where variable `i` has the value `val`: 

# In[85]:


joint.conditional(0, 1)


# In[86]:


joint.conditional(1, 1)


# It also provides `marginal(i)`, which returns the marginal distribution along axis `i`

# In[87]:


joint.marginal(0)


# In[88]:


joint.marginal(1)


# Here are some ways of iterating through a joint distribution.

# In[89]:


for q in joint.qs:
    print(q)


# In[90]:


for p in joint.ps:
    print(p)


# In[91]:


for q, p in joint.items():
    print(q, p)


# In[92]:


for (q1, q2), p in joint.items():
    print(q1, q2, p)


# In[ ]: