#!/usr/bin/env python
# coding: utf-8
# # Hypothesis testing
# Hypothesis (null): university towns and non-university towns see the same change in their mean housing prices during recessions; the alternative is that university-town prices are less affected.
# # Data sources
# 1) List of university towns from Wikipedia. This is a text file with 567 lines, each containing either a state name or the name of a town together with the university located there.
#
#
# 2) Housing data from Zillow containing monthly house sale prices from 1996 to 2016. A snapshot of the csv file is shown here.
#
#
# 3) Quarterly GDP data from the US Department of Commerce, Bureau of Economic Analysis. A snapshot of the excel file is shown below.
#
#
# In[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from scipy.stats import norm
# In[2]:
def get_list_of_university_towns():
    '''Returns a DataFrame of towns and the states they are in from the
    university_towns.txt list.
    The following steps extract the required information in the desired format:
    1. For "State", remove the "[edit]" suffix.
    2. For "RegionName", when applicable, remove every character from " (" to the end.'''
    with open('university_towns.txt', 'r') as f:
        # Keep only non-empty lines:
        stripped = (line.rstrip() for line in f)
        lines = [line for line in stripped if line]
    # Indexes of the state header lines (they carry the "[edit]" marker):
    r_idx = [i for i, line in enumerate(lines) if '[edit]' in line]
    state = [lines[i].replace('[edit]', '') for i in r_idx]
    # Every line between two state headers is a town of the first of those states:
    rows = []
    for i in range(len(r_idx)):
        stop = r_idx[i + 1] if i != len(r_idx) - 1 else len(lines)
        for j in range(r_idx[i] + 1, stop):
            rows.append([state[i], lines[j].split(" (")[0]])
    return pd.DataFrame(rows, columns=['State', 'RegionName'])
get_list_of_university_towns()
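# A minimal sketch of the two parsing rules from the docstring above, applied to made-up
# sample lines (the strings below are illustrative assumptions, not taken from university_towns.txt):

# In[ ]:
sample_state_line = "Alabama[edit]"
sample_town_line = "Auburn (Auburn University)[1]"
sample_state_line.replace('[edit]', ''), sample_town_line.split(" (")[0]  # -> ('Alabama', 'Auburn')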
# # Get the quarter when the recession starts
# A recession start is defined as the first of two consecutive quarters of decline in GDP.
# In[3]:
def get_recession_start():
    '''Returns the year and quarter of the recession start time after the year 2000,
    e.g. '2008q3'.'''
    # Read the quarterly series: quarter labels are in column E, GDP in column G of gdplev.xls.
    gdp = pd.read_excel('gdplev.xls', usecols="E,G", skiprows=219,
                        names=['Quarter', 'GDP']).set_index('Quarter')
    gdp['GDP change'] = gdp['GDP'] - gdp['GDP'].shift(+1)
    # Recession start: the first of two consecutive quarters of decline, preceded by a quarter of growth.
    gdp['RecessionStart'] = (gdp['GDP change'] < 0) & (gdp['GDP change'].shift(-1) < 0) & (gdp['GDP change'].shift(+1) > 0)
    return gdp.index[gdp['RecessionStart']][0]
get_recession_start()
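# The shift-based mask above can be hard to read at a glance; here is a small sketch that applies
# the same recession-start condition to a synthetic GDP series (the numbers are made up for illustration):

# In[ ]:
toy = pd.Series([100, 102, 101, 99, 98, 100],
                index=['q1', 'q2', 'q3', 'q4', 'q5', 'q6'], name='GDP')
toy_change = toy - toy.shift(+1)
# True where this quarter and the next both decline while the previous quarter grew:
toy_start = (toy_change < 0) & (toy_change.shift(-1) < 0) & (toy_change.shift(+1) > 0)
toy.index[toy_start]  # -> Index(['q3'], dtype='object')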
# # Get the recession end quarter
# A recession end is defined as the second of two consecutive quarters of GDP growth following the decline.
# In[4]:
def get_recession_end():
    '''Returns the year and quarter of the recession end time.'''
    rstart = get_recession_start()
    gdp = pd.read_excel('gdplev.xls', usecols="E,G", skiprows=219,
                        names=['Quarter', 'GDP']).set_index('Quarter')
    gdp['GDP change'] = gdp['GDP'] - gdp['GDP'].shift(+1)
    # Recession end: the second of two consecutive quarters of growth that follow a decline.
    gdp['RecessionEnd'] = (gdp['GDP change'] > 0) & (gdp['GDP change'].shift(+1) > 0) & (gdp['GDP change'].shift(+2) < 0)
    # Return the first such quarter after the recession start:
    return next(q for q in gdp.index[gdp['RecessionEnd']] if q > rstart)
get_recession_end()
# # Get the recession bottom
# The recession bottom is defined as the quarter with the lowest GDP between the start and the end of the recession.
# In[5]:
def get_recession_bottom():
    '''Returns the year and quarter of the recession bottom time.'''
    rstart = get_recession_start()
    rend = get_recession_end()
    gdp = pd.read_excel('gdplev.xls', usecols="E,G", skiprows=219,
                        names=['Quarter', 'GDP']).set_index('Quarter')
    # Quarter with the lowest GDP between the recession start and end (inclusive):
    return gdp.loc[rstart:rend, 'GDP'].idxmin()
get_recession_bottom()
# # Converting the monthly housing prices to quarterly prices
# In[6]:
# Use this dictionary to map two-letter state acronyms to full state names
states = {'OH': 'Ohio', 'KY': 'Kentucky', 'AS': 'American Samoa', 'NV': 'Nevada', 'WY': 'Wyoming', 'NA': 'National', 'AL': 'Alabama', 'MD': 'Maryland', 'AK': 'Alaska', 'UT': 'Utah', 'OR': 'Oregon', 'MT': 'Montana', 'IL': 'Illinois', 'TN': 'Tennessee', 'DC': 'District of Columbia', 'VT': 'Vermont', 'ID': 'Idaho', 'AR': 'Arkansas', 'ME': 'Maine', 'WA': 'Washington', 'HI': 'Hawaii', 'WI': 'Wisconsin', 'MI': 'Michigan', 'IN': 'Indiana', 'NJ': 'New Jersey', 'AZ': 'Arizona', 'GU': 'Guam', 'MS': 'Mississippi', 'PR': 'Puerto Rico', 'NC': 'North Carolina', 'TX': 'Texas', 'SD': 'South Dakota', 'MP': 'Northern Mariana Islands', 'IA': 'Iowa', 'MO': 'Missouri', 'CT': 'Connecticut', 'WV': 'West Virginia', 'SC': 'South Carolina', 'LA': 'Louisiana', 'KS': 'Kansas', 'NY': 'New York', 'NE': 'Nebraska', 'OK': 'Oklahoma', 'FL': 'Florida', 'CA': 'California', 'CO': 'Colorado', 'PA': 'Pennsylvania', 'DE': 'Delaware', 'NM': 'New Mexico', 'RI': 'Rhode Island', 'MN': 'Minnesota', 'VI': 'Virgin Islands', 'NH': 'New Hampshire', 'MA': 'Massachusetts', 'GA': 'Georgia', 'ND': 'North Dakota', 'VA': 'Virginia'}
# In[7]:
def value2quarter(value):
    '''Maps a month string such as '2005-08' to its quarter label, e.g. '2005q3'.'''
    year, month = value.split("-")
    year = int(year)
    month = int(month)
    if month < 4:
        return "{}q{}".format(year, 1)
    if month < 7:
        return "{}q{}".format(year, 2)
    if month < 10:
        return "{}q{}".format(year, 3)
    return "{}q{}".format(year, 4)
def convert_housing_data_to_quarters():
    '''Converts the monthly Zillow housing data to quarterly mean prices for
    2000q1 through 2016q3, indexed by (State, RegionName).'''
    df = pd.read_csv("City_Zhvi_AllHomes.csv")
    # Reshape to long format: one row per (town, month) price observation.
    df_t = pd.melt(df.drop(["Metro", "CountyName", "SizeRank"], axis=1),
                   id_vars=["State", "RegionName", "RegionID"])
    df_t["yearQ"] = df_t["variable"].apply(value2quarter)
    df_t = df_t[(df_t.yearQ >= "2000q1") & (df_t.yearQ <= "2016q3")]
    # Replace state acronyms with full names so the index matches the university-towns list.
    df_t["State"] = df_t["State"].map(states)
    # Average the monthly prices within each quarter and pivot the quarters back into columns.
    answer = df_t.groupby(
        ["State", "RegionName", "RegionID", "yearQ"]
    )["value"].mean().unstack("yearQ").reset_index().drop("RegionID", axis=1).set_index(["State", "RegionName"])
    return answer
convert_housing_data_to_quarters()
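# As a sanity check of value2quarter (not part of the original pipeline), pandas' own Period type
# produces the same quarter label, up to letter case:

# In[ ]:
value2quarter("2008-09"), str(pd.Period("2008-09", freq="Q")).lower()  # -> ('2008q3', '2008q3')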
# # Exploring the effect of the recession on housing prices in university and non-university towns
# In[8]:
df_university = get_list_of_university_towns()
df_university["is_univ_town"] = True
df_university = df_university.set_index(["State", "RegionName"])
df_housing_data = convert_housing_data_to_quarters()
df_all = df_housing_data.merge(df_university, left_index=True, right_index=True, how="left")
df_all["is_univ_town"] = df_all["is_univ_town"].fillna(False)
start = "2008q3"# get_recession_start()
bottom = "2009q2"# get_recession_bottom()
df_all["ratio"] = df_all[bottom]/df_all[start]
mean_price_ratio_univ = df_all.loc[df_all.is_univ_town, "ratio"].dropna()
mean_price_ratio_no_univ = df_all.loc[~df_all.is_univ_town, "ratio"].dropna()
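# A quick look at how many towns end up in each group; the exact counts depend on the
# Zillow snapshot used, so they are not quoted here:

# In[ ]:
len(mean_price_ratio_univ), len(mean_price_ratio_no_univ)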
# In[9]:
n1, bins1, patches1 = plt.hist(mean_price_ratio_univ, 20, density=True, facecolor='blue', alpha=0.5)
(mu_unitown, sigma_unitown) = norm.fit(mean_price_ratio_univ)
mu_unitown,sigma_unitown
# In[10]:
n2, bins2, patches2 = plt.hist(mean_price_ratio_no_univ, 20, density=True, facecolor='red', alpha=0.5)
(mu_non_unitown, sigma_non_unitown) = norm.fit(mean_price_ratio_no_univ)
mu_non_unitown,sigma_non_unitown
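# To compare the two fitted distributions directly, the sketch below overlays the two normal
# densities estimated above (the x-axis range 0.7 to 1.3 is an arbitrary display choice):

# In[ ]:
x = np.linspace(0.7, 1.3, 200)
plt.plot(x, norm.pdf(x, mu_unitown, sigma_unitown), color='blue', label='university towns')
plt.plot(x, norm.pdf(x, mu_non_unitown, sigma_non_unitown), color='red', label='non-university towns')
plt.xlabel('price ratio (recession bottom / recession start)')
plt.ylabel('fitted normal density')
plt.legend()
plt.show()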
# # Run t-test (Welch's t-test)
# The variances of the two samples differ, so Welch's t-test is used instead of the pooled-variance Student's t-test (https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test).
#
#
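# For reference, Welch's t-statistic for samples with means $\bar{x}_1, \bar{x}_2$, variances $s_1^2, s_2^2$ and sizes $n_1, n_2$ is
#
# $$t = \frac{\bar{x}_1 - \bar{x}_2}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}}$$
#
# which, unlike Student's t-test, does not pool the two sample variances. `ttest_ind(..., equal_var=False)` computes this statistic together with the Welch-Satterthwaite degrees of freedom.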
# In[11]:
def run_ttest():
    '''Runs Welch's t-test on the two samples of price ratios and returns the tuple
    (different, pval, better): whether the difference is significant at the 1% level,
    the p-value, and which group kept the higher price ratio through the recession.'''
    statistic, pval = ttest_ind(
        mean_price_ratio_univ,
        mean_price_ratio_no_univ, equal_var=False)
    different = pval < 0.01
    better = "university town" if mean_price_ratio_univ.mean() > mean_price_ratio_no_univ.mean() else "non-university town"
    return (different, pval, better)
run_ttest()
# # Conclusion: University-town housing prices were less affected by the 2008 recession.
# ## A few points for further exploration:
# ### Is this true for all recession periods?
# ### Is this true for all kinds of housing (condos, apartments, houses, etc.)?
# In[ ]: