#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import scipy as sp
import scipy.stats
import matplotlib.pyplot as plt
import pandas as pd


# # Concepts of Hypothesis Testing

# You have probably heard of the **null hypothesis** and the **alternative
# hypothesis**: depending on the evidence, we decide whether or not to reject
# the null hypothesis. However, if we do not have enough evidence to reject the
# null hypothesis, we can't say that we accept it; rather, we say that
# _we can't reject the null hypothesis based on current information_.
#
# Sometimes you might encounter the terms **type I error** and **type II
# error**: the former characterises the probability of rejecting a true null
# hypothesis, the latter characterises the probability of failing to reject a
# false null hypothesis. It might sound counter-intuitive at first sight, but
# the plot below tells the whole story.
#
# The significance level is the probability of a type I error, so the smaller
# the significance level (i.e. the stricter the test), the lower the probability
# of a type I error, but the higher the probability of a type II error.

# In[2]:


# type12_error comes from the project-local plot_material module; presumably it
# draws the type I / type II error chart discussed below (confirm in that module).
from plot_material import type12_error

type12_error()


# If you are still bewildered, here is the guideline: the values in the blue
# shaded area are genuinely generated by the null distribution, but they are too
# distant (i.e. $2\sigma$ away) from the mean ($0$ in this example), so they are
# mistakenly rejected. This is what we call a _Type I Error_.
#
# The values in the orange shaded area are actually generated by the alternative
# distribution, but they lie close to the mean of the null distribution, so we
# fail to reject them, wrongly. This is called a _Type II Error_.
#
# As you can see from the chart, if the null and alternative distributions are
# far away from each other, the probabilities of both types of error diminish
# to trivial levels.

# # Rejection Region and p-Value

# **Rejection region** is a range of values such that if the test statistic
# falls into that range, we decide to reject the null hypothesis in favour of
# the alternative hypothesis.
#
# To put it another way, a value has to be far enough from the mean of the null
# distribution to fall into the rejection region; that distance is the evidence
# that the value might not have been produced by the null distribution, hence
# the rejection of the null hypothesis.
#
# Let's use some real data for illustration. The data is in ```.csv``` format,
# and the best tool for reading it is the ```pandas``` library.

# In[3]:


data = pd.read_csv("500_Person_Gender_Height_Weight_Index.csv")
data.head()


# Null and alternative hypothesis are
# $$
# H_0: \text{Average male height is 172}\newline
# H_1: \text{Average male height isn't 172}
# $$

# Calculate the sample mean and standard deviation of male height

# In[4]:


# Select the male heights once and reuse the Series, instead of re-filtering
# the whole DataFrame for every statistic.
male_height = data.loc[data["Gender"] == "Male", "Height"]
male_mean = male_height.mean()
male_std = male_height.std(ddof=1)  # ddof=1: sample standard deviation
male_std_error = male_std / np.sqrt(len(male_height))
male_null = 172  # hypothesised population mean under H_0


# The rejection region is simply an opposite view of expressing confidence interval
# $$
# \bar{x}>\mu + t_\alpha\frac{s}{\sqrt{n}}\\
# \bar{x}<\mu - t_\alpha\frac{s}{\sqrt{n}}
# $$
# Assume significance level $5\%$, then $+t_\alpha = t_{.025}$ and
# $-t_{\alpha} = t_{.975}$, where $t_{.025}$ and $t_{.975}$ can be calculated by
# ```sp.stats.t.ppf```.
#
# Note on notation: the subscripts in the formulas above are upper-tail
# probabilities, so $t_{.025}$ is the positive critical value. The variable
# names below follow the argument passed to ```ppf``` (a cumulative
# probability) instead, so ```t_975``` is the positive critical value and
# ```t_025``` is the negative one.

# In[5]:


df = len(male_height) - 1  # degrees of freedom: n - 1
t_975 = sp.stats.t.ppf(0.975, df=df)
t_975


# In[6]:


t_025 = sp.stats.t.ppf(0.025, df=df)
t_025


# In[7]:


print(
    "The rejection region of null hypothesis is <{} and >{}".format(
        male_null - t_975 * male_std_error, male_null + t_975 * male_std_error
    )
)
# Since the ```male_mean``` (displayed below) falls into the rejection region,
# we reject the null hypothesis in favour of the alternative hypothesis.

# In[8]:


male_mean


# Alternatively we can construct the $t$-statistic
# $$
# t=\frac{\bar{x}-\mu}{s/\sqrt{n}}
# $$
# The rejection region is where the $t$-statistic is larger or smaller than the critical values
# $$
# t>t_{\alpha} = t_{.025} \text{ and } t