#!/usr/bin/env python
# coding: utf-8

# # Analysis of variance (ANOVA)
#
# Notebook-exported script; each "In[n]" group below was one notebook cell.

# In[1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# In[2]: notebook display configuration (inline, retina-quality figures)
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")

# LaTeX macros used by the notebook's markdown cells:
# $\def\stderr#1{\mathbf{se}_{#1}}$
# $\def\stderrhat#1{\hat{\mathbf{se}}_{#1}}$
# $\newcommand{\Mean}{\textbf{Mean}}$
# $\newcommand{\Var}{\textbf{Var}}$
# $\newcommand{\Std}{\textbf{Std}}$
# $\newcommand{\Freq}{\textbf{Freq}}$
# $\newcommand{\RelFreq}{\textbf{RelFreq}}$
# $\newcommand{\DMeans}{\textbf{DMeans}}$
# $\newcommand{\Prop}{\textbf{Prop}}$
# $\newcommand{\DProps}{\textbf{DProps}}$

# ## Definitions
# ## Formulas
# ## Example
# ## Explanations
# ## Discussion

# ## Equivalence between ANOVA and OLS
#
# via https://stats.stackexchange.com/questions/175246/why-is-anova-equivalent-to-linear-regression

# In[1]:
import numpy as np
from scipy.stats import randint, norm

np.random.seed(124)          # fix the seed for reproducibility
x = randint(1, 6).rvs(100)   # 100 random integers, uniform on {1,...,5}
y = x + norm().rvs(100)      # response = group index + N(0,1) noise

# In[2]: put the sample in a data frame and look at the per-group means
import pandas as pd
import seaborn as sns

df = pd.DataFrame({"x": x, "y": y})
sns.stripplot(data=df, x="x", y="y")
df.groupby("x")["y"].mean()

# In[3]: one-way ANOVA using scipy
from scipy.stats import f_oneway

x1 = df[x == 1]["y"]
x2 = df[x == 2]["y"]
x3 = df[x == 3]["y"]
x4 = df[x == 4]["y"]
x5 = df[x == 5]["y"]
res = f_oneway(x1, x2, x3, x4, x5)
res

# In[4]: ANOVA table with R-like output, via statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols

model = ols('y ~ C(x)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

# In[5]:
# Group means, for reference:
# 1 1.114427
# 2 1.958159
# 3 2.844082
# 4 4.198083
# 5 5.410594

# Ordinary Least Squares (OLS) model
model = ols('y ~ C(x)', data=df).fit()
model.summary()

# In[6]:
betas = model.params.values
betas

# In[7]: intercept plus treatment effects reconstructs the group means.
# NOTE(review): the name keeps the original "batas" typo because the
# following cells in this file refer to it.
scaled_batas = np.concatenate([[betas[0]], betas[0] + betas[1:]])
# ## Equivalence check and a manual one-way ANOVA implementation
#
# The function below re-implements scipy.stats.f_oneway from the textbook
# formulas; the script cells (In[8]-In[46]) continue from the OLS fit above
# and are guarded so importing this module does not run them.

# In[70]/In[45] imports, hoisted so the function is self-contained:
from scipy.stats import f as fdist
from scipy.stats.mstats import argstoarray


def f_oneway(*args):
    """Perform a one-way ANOVA over any number of groups.

    Each positional argument is one group's sample (a 1-D sequence).
    Returns ``(f, prob)``: the F statistic and its p-value under the
    F(ngroups - 1, ntot - ngroups) distribution.  From Heiman, pp. 394-7.
    """
    # Stack the groups into a masked 2-D array, one row per group
    # (masking pads ragged group sizes).
    data = argstoarray(*args)
    ngroups = len(data)
    ntot = data.count()
    # Total, between-group, and within-group sums of squares.
    sstot = (data**2).sum() - (data.sum())**2 / float(ntot)
    ssbg = (data.count(-1) * (data.mean(-1) - data.mean())**2).sum()
    sswg = sstot - ssbg
    print(ssbg, sswg, sstot)
    # Degrees of freedom and mean squares.
    dfbg = ngroups - 1
    dfwg = ntot - ngroups
    msb = ssbg / float(dfbg)
    msw = sswg / float(dfwg)
    f = msb / msw
    # BUG FIX: scipy's survival function signature is sf(x, dfn, dfd).
    # The original passed sf(dfbg, dfwg, f) -- statistic and degrees of
    # freedom in the wrong positions -- yielding a meaningless p-value.
    prob = fdist.sf(f, dfbg, dfwg)
    return f, prob


if __name__ == "__main__":
    # In[8]: check the rescaled OLS coefficients equal the per-group means
    scaled_batas
    np.isclose(scaled_batas, df.groupby("x")["y"].mean().values)

    # In[9]:
    # # Ordinary Least Squares (OLS) model (no intercept)
    # model = ols('y ~ C(x) -1', data=df).fit()
    # model.summary()

    # In[70]: stack the five group samples into one masked array
    data = argstoarray(x1.values, x2.values, x3.values, x4.values, x5.values)

    # In[48]: between-group sum of squares, computed by hand
    data.count(axis=1)
    np.sum(data.count(axis=1) * (data.mean(axis=1) - data.mean())**2)

    # In[65]: within-group sum of squares, direct computation
    gmeans = data.mean(axis=1)
    data_minus_gmeans = np.subtract(data.T, gmeans).T
    (data_minus_gmeans**2).sum()

    # In[69]: within-group sum of squares via the parallel-axis theorem
    gmeans = data.mean(axis=1)
    np.sum((data**2).sum(axis=1) - data.count(axis=1) * gmeans**2)

    # In[46]: run the manual ANOVA on the five groups
    f_oneway(x1.values, x2.values, x3.values, x4.values, x5.values)