#!/usr/bin/env python
# coding: utf-8

# # Skew test
# 
# Allen Downey
# 
# [MIT License](https://en.wikipedia.org/wiki/MIT_License)

# In[ ]:


get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white')

from thinkstats2 import Pmf, Cdf

import thinkstats2
import thinkplot

decorate = thinkplot.config


# Suppose you buy a loaf of bread every day for a year, take it
# home, and weigh it.  You suspect that the distribution of weights is
# more skewed than a normal distribution with the same mean and
#   standard deviation.
# 
# To test your suspicion, write a definition for a class named
#   `SkewTest` that extends `HypothesisTest` (defined below) and provides
#   two methods:
# 
# * `TestStatistic` should compute the skew of a given sample.
# 
# * `RunModel` should simulate the null hypothesis and return
#   simulated data.

# In[ ]:


class HypothesisTest(object):
    """Represents a simulation-based hypothesis test.

    Subclasses must override `TestStatistic` and `RunModel`.  `MakeModel`
    is an optional hook that runs once during construction, before the
    observed test statistic is computed.

    Attributes:
        data: the observed data, in whatever form the subclass expects
        actual: the test statistic computed from the observed data
        test_stats: array of simulated test statistics, or None until
            `PValue` has been called
    """

    def __init__(self, data):
        """Initializes.

        data: data in whatever form is relevant
        """
        self.data = data
        # Build the null model first so TestStatistic may rely on it.
        self.MakeModel()
        self.actual = self.TestStatistic(data)
        self.test_stats = None

    def PValue(self, iters=1000):
        """Computes the distribution of the test statistic and p-value.

        The p-value is one-sided: the fraction of simulated test
        statistics that are greater than or equal to the observed one.
        The simulated statistics are stored in `self.test_stats`.

        iters: number of iterations (must be positive)

        returns: float p-value

        raises: ValueError if iters is not positive
        """
        if iters <= 0:
            raise ValueError('iters must be a positive integer, got %r' % (iters,))

        self.test_stats = np.array([self.TestStatistic(self.RunModel())
                                    for _ in range(iters)])

        # Reduce along axis 0 (the simulation axis), like the builtin sum
        # would, but without a Python-level loop over the array.
        count = np.sum(self.test_stats >= self.actual, axis=0)
        return count / iters

    def MaxTestStat(self):
        """Returns the largest test statistic seen during simulations.

        `PValue` must be called first so that `self.test_stats` is populated.
        """
        return np.max(self.test_stats)

    def PlotHist(self, label=None):
        """Draws a histogram of the simulated test statistics.

        A vertical line marks the observed test statistic.  `PValue` must
        be called first so that `self.test_stats` is populated.

        label: optional legend label for the histogram
        """
        plt.hist(self.test_stats, color='C4', alpha=0.5, label=label)
        plt.axvline(self.actual, linewidth=3, color='0.8')
        plt.xlabel('Test statistic')
        plt.ylabel('Count')
        plt.title('Distribution of the test statistic under the null hypothesis')

    def TestStatistic(self, data):
        """Computes the test statistic.

        Must be overridden by subclasses.

        data: data in whatever form is relevant

        raises: NotImplementedError unless overridden
        """
        raise NotImplementedError('subclasses must implement TestStatistic')

    def MakeModel(self):
        """Build a model of the null hypothesis.

        Optional hook; the default implementation does nothing.
        """
        pass

    def RunModel(self):
        """Run the model of the null hypothesis.

        Must be overridden by subclasses.

        returns: simulated data

        raises: NotImplementedError unless overridden
        """
        raise NotImplementedError('subclasses must implement RunModel')


# In[ ]:


# Solution goes here


# To test this class, I'll generate a sample from an actual Gaussian distribution, so the null hypothesis is true.

# In[ ]:


# Mean and standard deviation of the Gaussian the sample is drawn from.
mu, sigma = 1000, 35

# 365 draws: one simulated loaf weight per day for a year.
data = np.random.normal(loc=mu, scale=sigma, size=365)


# Now we can make a `SkewTest` and compute the observed skewness.

# In[ ]:


# NOTE(review): SkewTest is the exercise solution and is not defined in this
# listing; it is expected to subclass HypothesisTest, whose __init__ computes
# the test statistic of `data` and stores it in `actual`.
test = SkewTest(data)
# Bare expression: the notebook displays the observed skewness.
test.actual


# Here's the p-value.

# In[ ]:


# Rebuild the test on the same sample, then estimate the p-value by
# simulation.  Assuming SkewTest inherits HypothesisTest.PValue, this runs
# the default 1000 iterations and fills `test.test_stats`.
test = SkewTest(data)
test.PValue()


# And the distribution of the test statistic under the null hypothesis.

# In[ ]:


# Histogram of the simulated statistics from the PValue() call above, with a
# vertical line at the observed value (assuming the inherited PlotHist).
test.PlotHist()


# Most of the time the p-value exceeds 5%, so we would conclude that the observed skewness could plausibly be due to random sampling.
# 
# But let's see how often we get a false positive.

# In[ ]:


# Estimate the false positive rate: repeat the whole experiment `iters` times
# on fresh Gaussian samples (so the null hypothesis is true each time) and
# count how often the test rejects at the 5% level.
iters = 100
count = 0

# NOTE: each pass rebinds the notebook-level names `data`, `test` and
# `p_value`, so the earlier sample and test object are replaced.
for i in range(iters):
    data = np.random.normal(mu, sigma, size=365)
    test = SkewTest(data)
    p_value = test.PValue()
    # A p-value below the threshold is a false positive here.
    if p_value < 0.05:
        count +=1
        
# Fraction of runs that rejected the (true) null hypothesis.
print(count/iters)


# In the long run, the false positive rate is the threshold we used, 5%.