#!/usr/bin/env python
# coding: utf-8

# # Analysis of variance (ANOVA)
#
# Notebook-exported script; each "In[n]" group below was one notebook cell.

# In[1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# In[2]: notebook display configuration (inline, retina-quality figures)
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")

# LaTeX macros used by the notebook's markdown cells:
# $\def\stderr#1{\mathbf{se}_{#1}}$
# $\def\stderrhat#1{\hat{\mathbf{se}}_{#1}}$
# $\newcommand{\Mean}{\textbf{Mean}}$
# $\newcommand{\Var}{\textbf{Var}}$
# $\newcommand{\Std}{\textbf{Std}}$
# $\newcommand{\Freq}{\textbf{Freq}}$
# $\newcommand{\RelFreq}{\textbf{RelFreq}}$
# $\newcommand{\DMeans}{\textbf{DMeans}}$
# $\newcommand{\Prop}{\textbf{Prop}}$
# $\newcommand{\DProps}{\textbf{DProps}}$

# ## Definitions
# ## Formulas
# ## Example
# ## Explanations
# ## Discussion

# ## Equivalence between ANOVA and OLS
#
# via https://stats.stackexchange.com/questions/175246/why-is-anova-equivalent-to-linear-regression

# In[1]:
import numpy as np
from scipy.stats import randint, norm

np.random.seed(124)          # fix the seed for reproducibility
x = randint(1, 6).rvs(100)   # 100 random integers, uniform on {1,...,5}
y = x + norm().rvs(100)      # response = group index + N(0,1) noise

# In[2]: put the sample in a data frame and look at the per-group means
import pandas as pd
import seaborn as sns

df = pd.DataFrame({"x": x, "y": y})
sns.stripplot(data=df, x="x", y="y")
df.groupby("x")["y"].mean()

# In[3]: one-way ANOVA using scipy
from scipy.stats import f_oneway

x1 = df[x == 1]["y"]
x2 = df[x == 2]["y"]
x3 = df[x == 3]["y"]
x4 = df[x == 4]["y"]
x5 = df[x == 5]["y"]
res = f_oneway(x1, x2, x3, x4, x5)
res

# In[4]: ANOVA table with R-like output, via statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols

model = ols('y ~ C(x)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

# In[5]:
# Group means, for reference:
# 1 1.114427
# 2 1.958159
# 3 2.844082
# 4 4.198083
# 5 5.410594

# Ordinary Least Squares (OLS) model
model = ols('y ~ C(x)', data=df).fit()
model.summary()

# In[6]:
betas = model.params.values
betas

# In[7]: intercept plus treatment effects reconstructs the group means.
# NOTE(review): the name keeps the original "batas" typo because the
# following cells in this file refer to it.
scaled_batas = np.concatenate([[betas[0]], betas[0] + betas[1:]])
# ## Equivalence check and a manual one-way ANOVA implementation
#
# The function below re-implements scipy.stats.f_oneway from the textbook
# formulas; the script cells (In[8]-In[46]) continue from the OLS fit above
# and are guarded so importing this module does not run them.

# In[70]/In[45] imports, hoisted so the function is self-contained:
from scipy.stats import f as fdist
from scipy.stats.mstats import argstoarray


def f_oneway(*args):
    """Perform a one-way ANOVA over any number of groups.

    Each positional argument is one group's sample (a 1-D sequence).
    Returns ``(f, prob)``: the F statistic and its p-value under the
    F(ngroups - 1, ntot - ngroups) distribution.  From Heiman, pp. 394-7.
    """
    # Stack the groups into a masked 2-D array, one row per group
    # (masking pads ragged group sizes).
    data = argstoarray(*args)
    ngroups = len(data)
    ntot = data.count()
    # Total, between-group, and within-group sums of squares.
    sstot = (data**2).sum() - (data.sum())**2 / float(ntot)
    ssbg = (data.count(-1) * (data.mean(-1) - data.mean())**2).sum()
    sswg = sstot - ssbg
    print(ssbg, sswg, sstot)
    # Degrees of freedom and mean squares.
    dfbg = ngroups - 1
    dfwg = ntot - ngroups
    msb = ssbg / float(dfbg)
    msw = sswg / float(dfwg)
    f = msb / msw
    # BUG FIX: scipy's survival function signature is sf(x, dfn, dfd).
    # The original passed sf(dfbg, dfwg, f) -- statistic and degrees of
    # freedom in the wrong positions -- yielding a meaningless p-value.
    prob = fdist.sf(f, dfbg, dfwg)
    return f, prob


if __name__ == "__main__":
    # In[8]: check the rescaled OLS coefficients equal the per-group means
    scaled_batas
    np.isclose(scaled_batas, df.groupby("x")["y"].mean().values)

    # In[9]:
    # # Ordinary Least Squares (OLS) model (no intercept)
    # model = ols('y ~ C(x) -1', data=df).fit()
    # model.summary()

    # In[70]: stack the five group samples into one masked array
    data = argstoarray(x1.values, x2.values, x3.values, x4.values, x5.values)

    # In[48]: between-group sum of squares, computed by hand
    data.count(axis=1)
    np.sum(data.count(axis=1) * (data.mean(axis=1) - data.mean())**2)

    # In[65]: within-group sum of squares, direct computation
    gmeans = data.mean(axis=1)
    data_minus_gmeans = np.subtract(data.T, gmeans).T
    (data_minus_gmeans**2).sum()

    # In[69]: within-group sum of squares via the parallel-axis theorem
    gmeans = data.mean(axis=1)
    np.sum((data**2).sum(axis=1) - data.count(axis=1) * gmeans**2)

    # In[46]: run the manual ANOVA on the five groups
    f_oneway(x1.values, x2.values, x3.values, x4.values, x5.values)