#!/usr/bin/env python
# coding: utf-8

# # Hypothesis testing
# Hypothesis: Recessions have the same effect on mean housing prices in university towns as in non-university towns.

# # Data sources
# 1) List of university towns from Wikipedia: a text file with 567 lines, each containing either a state name or the name of a town and the university in it.
# 2) Housing data from Zillow (csv file), containing house sale prices for each month from 1996 to 2016.
# 3) Quarterly GDP data from the US Department of Commerce, Bureau of Economic Analysis (excel file).

# In[1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, norm


# In[2]:


def get_list_of_university_towns():
    '''Returns a DataFrame of towns and the states they are in, parsed from
    the university_towns.txt list.
    The following steps extract the required information in the desired format:
    1. For "State", remove every character from "[" to the end of the line.
    2. For "RegionName", when applicable, remove every character from " (" to the end.'''
    # Read non-empty lines:
    with open('university_towns.txt', 'r') as f:
        lines = [line.rstrip() for line in f if line.rstrip()]

    # Indices of the state header lines (they contain "[edit]"):
    r_idx = [i for i, line in enumerate(lines) if '[edit]' in line]
    state_names = [lines[i].replace('[edit]', '') for i in r_idx]

    # The towns of each state are the lines between two consecutive state headers:
    rows = []
    boundaries = r_idx + [len(lines)]
    for i in range(len(r_idx)):
        for j in range(boundaries[i] + 1, boundaries[i + 1]):
            rows.append([state_names[i], lines[j].split(' (')[0]])

    return pd.DataFrame(rows, columns=['State', 'RegionName'])


get_list_of_university_towns()


# # Get the quarter when the recession starts
# A recession start is defined as the first of two consecutive quarters of GDP decline that follow a quarter of growth.

# In[3]:


def get_recession_start():
    '''Returns the year and quarter of the recession start time after the year 2000.'''
    gdp = pd.read_excel('gdplev.xls', usecols="E,G", skiprows=219,
                        names=['Quarter', 'GDP']).set_index('Quarter')
    # Quarter-over-quarter change in GDP:
    gdp['GDP change'] = gdp['GDP'] - gdp['GDP'].shift(+1)
    # A declining quarter, preceded by growth and followed by another decline:
    gdp['RecessionStart'] = ((gdp['GDP change'] < 0) &
                             (gdp['GDP change'].shift(-1) < 0) &
                             (gdp['GDP change'].shift(+1) > 0))
    return gdp.index[gdp['RecessionStart']][0]


get_recession_start()
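# The next cell is an illustrative sketch, not part of the original analysis:
# it applies the same "two consecutive quarters of decline after a quarter of
# growth" mask used in get_recession_start() to a small, made-up GDP series,
# so the boolean-shift logic can be checked by eye. All numbers are invented.

# In[ ]:


toy = pd.DataFrame({'GDP': [100.0, 102.0, 104.0, 103.0, 101.0, 99.0, 100.0, 102.0]},
                   index=['2007q1', '2007q2', '2007q3', '2007q4',
                          '2008q1', '2008q2', '2008q3', '2008q4'])
toy['GDP change'] = toy['GDP'].diff()
toy['RecessionStart'] = ((toy['GDP change'] < 0) &
                         (toy['GDP change'].shift(-1) < 0) &
                         (toy['GDP change'].shift(+1) > 0))
# First flagged quarter -> '2007q4' for this made-up series
toy.index[toy['RecessionStart']][0]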
# # Get the recession end quarter
# A recession ends at the second of two consecutive quarters of GDP growth that follow a quarter of decline.

# In[4]:


def get_recession_end():
    '''Returns the year and quarter of the recession end time.'''
    rstart = get_recession_start()
    gdp = pd.read_excel('gdplev.xls', usecols="E,G", skiprows=219,
                        names=['Quarter', 'GDP']).set_index('Quarter')
    gdp['GDP change'] = gdp['GDP'] - gdp['GDP'].shift(+1)
    # A growing quarter, preceded by another growing quarter and, two quarters back, by a decline:
    gdp['RecessionEnd'] = ((gdp['GDP change'] > 0) &
                           (gdp['GDP change'].shift(+1) > 0) &
                           (gdp['GDP change'].shift(+2) < 0))
    # Return the first such quarter after the recession start:
    for quarter in gdp.index[gdp['RecessionEnd']]:
        if quarter > rstart:
            return quarter


get_recession_end()


# # Get the recession bottom
# The recession bottom is the quarter with the lowest GDP between the start and the end of the recession.

# In[5]:


def get_recession_bottom():
    '''Returns the year and quarter of the recession bottom time.'''
    rstart = get_recession_start()
    rend = get_recession_end()
    gdp = pd.read_excel('gdplev.xls', usecols="E,G", skiprows=219,
                        names=['Quarter', 'GDP']).set_index('Quarter')
    # Index label of the minimum GDP within the recession:
    return gdp.loc[rstart:rend, 'GDP'].idxmin()


get_recession_bottom()


# # Converting the monthly housing prices to quarterly prices

# In[6]:


# Use this dictionary to map two-letter state abbreviations to full state names
states = {
    'OH': 'Ohio', 'KY': 'Kentucky', 'AS': 'American Samoa', 'NV': 'Nevada',
    'WY': 'Wyoming', 'NA': 'National', 'AL': 'Alabama', 'MD': 'Maryland',
    'AK': 'Alaska', 'UT': 'Utah', 'OR': 'Oregon', 'MT': 'Montana',
    'IL': 'Illinois', 'TN': 'Tennessee', 'DC': 'District of Columbia',
    'VT': 'Vermont', 'ID': 'Idaho', 'AR': 'Arkansas', 'ME': 'Maine',
    'WA': 'Washington', 'HI': 'Hawaii', 'WI': 'Wisconsin', 'MI': 'Michigan',
    'IN': 'Indiana', 'NJ': 'New Jersey', 'AZ': 'Arizona', 'GU': 'Guam',
    'MS': 'Mississippi', 'PR': 'Puerto Rico', 'NC': 'North Carolina',
    'TX': 'Texas', 'SD': 'South Dakota', 'MP': 'Northern Mariana Islands',
    'IA': 'Iowa', 'MO': 'Missouri', 'CT': 'Connecticut', 'WV': 'West Virginia',
    'SC': 'South Carolina', 'LA': 'Louisiana', 'KS': 'Kansas', 'NY': 'New York',
    'NE': 'Nebraska', 'OK': 'Oklahoma', 'FL': 'Florida', 'CA': 'California',
    'CO': 'Colorado', 'PA': 'Pennsylvania', 'DE': 'Delaware', 'NM': 'New Mexico',
    'RI': 'Rhode Island', 'MN': 'Minnesota', 'VI': 'Virgin Islands',
    'NH': 'New Hampshire', 'MA': 'Massachusetts', 'GA': 'Georgia',
    'ND': 'North Dakota', 'VA': 'Virginia'}


# In[7]:


def value2quarter(value):
    '''Converts a "YYYY-MM" column label to a "YYYYqN" quarter string.'''
    year, month = value.split("-")
    year = int(year)
    month = int(month)
    if month < 4:
        return "{}q{}".format(year, 1)
    if month < 7:
        return "{}q{}".format(year, 2)
    if month < 10:
        return "{}q{}".format(year, 3)
    return "{}q{}".format(year, 4)


def convert_housing_data_to_quarters():
    '''Converts the monthly Zillow housing data to quarterly mean prices for
    2000q1 through 2016q3, indexed by ["State", "RegionName"].'''
    df = pd.read_csv("City_Zhvi_AllHomes.csv")
    # Reshape to long format: one row per (region, month) with its price:
    df_t = pd.melt(df.drop(["Metro", "CountyName", "SizeRank"], axis=1),
                   id_vars=["State", "RegionName", "RegionID"])
    df_t["yearQ"] = df_t["variable"].apply(value2quarter)
    df_t = df_t[(df_t.yearQ >= "2000q1") & (df_t.yearQ <= "2016q3")]
    # Replace the two-letter state abbreviations with full state names:
    df_t["State"] = df_t["State"].map(states)
    answer = (df_t.groupby(["State", "RegionName", "RegionID", "yearQ"])["value"]
                  .mean()
                  .unstack("yearQ")
                  .reset_index()
                  .drop("RegionID", axis=1)
                  .set_index(["State", "RegionName"]))
    return answer


convert_housing_data_to_quarters()


# # Exploring the effect of the recession on housing prices in university and non-university towns

# In[8]:


df_university = get_list_of_university_towns()
df_university["is_univ_town"] = True
df_university = df_university.set_index(["State", "RegionName"])
df_housing_data = convert_housing_data_to_quarters()
df_all = df_housing_data.merge(df_university, left_index=True, right_index=True, how="left")
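# Added note: the left merge above keeps every (State, RegionName) from the
# housing data; towns that are not in the university-towns list come out with
# NaN in "is_univ_town", which the next line fills with False so the column
# can be used as a boolean mask.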
df_all["is_univ_town"] = df_all["is_univ_town"].fillna(False) start = "2008q3"# get_recession_start() bottom = "2009q2"# get_recession_bottom() df_all["ratio"] = df_all[bottom]/df_all[start] mean_price_ratio_univ = df_all.loc[df_all.is_univ_town, "ratio"].dropna() mean_price_ratio_no_univ = df_all.loc[~df_all.is_univ_town, "ratio"].dropna() # In[9]: n1, bins1, patches1 = plt.hist(mean_price_ratio_univ, 20, density=True, facecolor='blue', alpha=0.5) (mu_unitown, sigma_unitown) = norm.fit(mean_price_ratio_univ) mu_unitown,sigma_unitown # In[10]: n2, bins2, patches2 = plt.hist(mean_price_ratio_no_univ, 20, density=True, facecolor='red', alpha=0.5) (mu_non_unitown, sigma_non_unitown) = norm.fit(mean_price_ratio_no_univ) mu_non_unitown,sigma_non_unitown # # Run t-test (Welch's t-test) # The variance of both samples are different hence use Welch's t-test (https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) # # # In[11]: def run_ttest(): statistic, pval = ttest_ind( mean_price_ratio_univ, mean_price_ratio_no_univ, equal_var=False) different = pval < 0.01 better = "university town" if mean_price_ratio_univ.mean() > mean_price_ratio_no_univ.mean() else "non-university town" return (different, pval, better) run_ttest() # # Conclusion: The university town housing prices are less affected by recession in 2008. # ## Few points for further exploration: # ### Is it true for all recession period? # ### Is it true for all kinds of housing (condo, apartements, houses etc.)? # In[ ]: