Machine Learning - Housing Prices | Regression

Done By: Brian Tham

Check out my other projects on:

About this Dataset

This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import scipy.stats as st
# Suppress Future Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:

df = pd.read_csv("kc_house_data.csv")

In [3]:

print(df.shape)
print(df.nunique())

(21613, 21)
id               21436
date               372
price             4028
bedrooms            13
bathrooms           30
sqft_living       1038
sqft_lot          9782
floors               6
waterfront           2
view                 5
condition            5
grade               12
sqft_above         946
sqft_basement      306
yr_built           116
yr_renovated        70
zipcode             70
lat               5034
long               752
sqft_living15      777
sqft_lot15        8689
dtype: int64

In [4]:

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long           21613 non-null  float64
 19  sqft_living15  21613 non-null  int64  
 20  sqft_lot15     21613 non-null  int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB
None

In [5]:

df.head()

Out[5]:

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	...	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	...	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	...	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	...	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	...	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503

5 rows × 21 columns

In [48]:

df.corr()

Out[48]:

	price	bathrooms	sqft_living	sqft_lot	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	...	Bedrooms_3	Bedrooms_4	Bedrooms_5	Bedrooms_6	Bedrooms_7	Bedrooms_8	Bedrooms_9	Bedrooms_10	Bedrooms_11	Bedrooms_33
price	1.000000	0.518912	0.702035	0.089661	0.605567	0.323816	0.054012	0.126434	-0.053203	0.307003	...	-0.183648	0.177489	0.189925	0.087776	0.046995	0.037755	0.016064	0.008962	-0.000372	0.001851
bathrooms	0.518912	1.000000	0.701784	0.089988	0.599360	0.334909	0.377433	0.069322	-0.144088	0.049235	...	-0.166878	0.232540	0.265175	0.146150	0.091757	0.050548	0.050171	0.019878	0.008475	-0.000529
sqft_living	0.702035	0.701784	1.000000	0.172826	0.876597	0.435043	0.318049	0.055363	-0.199430	0.052529	...	-0.272404	0.353317	0.298006	0.148055	0.086562	0.045947	0.030756	0.020870	0.006815	-0.003406
sqft_lot	0.089661	0.089988	0.172826	1.000000	0.183512	0.015286	0.053080	0.007644	-0.129574	-0.085683	...	-0.015255	0.026631	0.011350	0.009485	0.005809	-0.003634	-0.003873	-0.001777	-0.001666	-0.001496
sqft_above	0.605567	0.599360	0.876597	0.183512	1.000000	-0.051943	0.423898	0.023285	-0.261190	-0.000816	...	-0.224832	0.338346	0.211493	0.096252	0.069528	0.025845	0.023074	0.009509	0.005024	-0.006148
sqft_basement	0.323816	0.334909	0.435043	0.015286	-0.051943	1.000000	-0.133124	0.071323	0.074845	0.110538	...	-0.144622	0.100140	0.222709	0.127151	0.049542	0.046992	0.020652	0.025518	0.004742	0.004434
yr_built	0.054012	0.377433	0.318049	0.053080	0.423898	-0.133124	1.000000	-0.224874	-0.346869	-0.148122	...	0.025095	0.130070	0.030812	-0.030627	-0.011663	-0.012276	-0.022318	-0.004548	-0.012275	-0.005559
yr_renovated	0.126434	0.069322	0.055363	0.007644	0.023285	0.071323	-0.224874	1.000000	0.064357	0.029398	...	-0.019092	0.004612	0.011731	0.023658	0.013192	-0.005155	0.010396	-0.002476	0.032424	-0.001429
zipcode	-0.053203	-0.144088	-0.199430	-0.129574	-0.261190	0.074845	-0.346869	0.064357	1.000000	0.267048	...	-0.024937	-0.104634	-0.040980	0.003215	0.006178	0.001791	0.010193	-0.008722	0.003567	0.003186
lat	0.307003	0.049235	0.052529	-0.085683	-0.000816	0.110538	-0.148122	0.029398	0.267048	1.000000	...	-0.056487	-0.019408	0.028817	0.016596	0.014833	0.012386	0.011150	0.003967	-0.000199	0.006271
long	0.021626	0.149157	0.240223	0.229521	0.343803	-0.144765	0.409356	-0.068372	-0.564072	-0.135512	...	0.003579	0.120164	0.025417	-0.021510	-0.007991	-0.009559	-0.011530	0.000940	-0.007202	-0.005656
sqft_living15	0.585379	0.497484	0.756420	0.144608	0.731870	0.200355	0.326229	-0.002673	-0.279033	0.048858	...	-0.201265	0.303741	0.188249	0.048366	0.021353	0.009981	0.002548	0.003555	-0.005623	-0.006516
sqft_lot15	0.082447	0.086952	0.183286	0.718557	0.194050	0.017276	0.070958	0.007854	-0.147221	-0.086419	...	0.001367	0.024002	0.003326	0.008222	0.004677	-0.004863	-0.005115	-0.001762	-0.001945	-0.002010
Floors_1.5	0.016023	-0.095399	-0.058316	0.013978	-0.053293	-0.021304	-0.380517	0.045535	0.143418	0.068018	...	-0.026902	0.031429	0.027085	0.008719	0.014171	-0.000990	-0.005188	-0.003668	-0.002118	-0.002118
Floors_2.0	0.232662	0.369959	0.420018	0.019461	0.578689	-0.211141	0.526940	0.003141	-0.184719	-0.052540	...	-0.120120	0.216322	0.059850	0.013063	0.010257	0.004053	0.009791	0.006923	0.008665	-0.005340
Floors_2.5	0.122770	0.079446	0.100509	0.004686	0.104773	0.012540	-0.028593	0.030001	0.023581	0.016589	...	-0.018567	0.007779	0.022752	0.057797	0.022052	-0.002125	0.063155	-0.001021	-0.000589	-0.000589
Floors_3.0	0.019750	0.063825	-0.054244	-0.043911	-0.014236	-0.085931	0.196846	-0.024820	0.102116	0.120427	...	0.070733	-0.080283	-0.034490	-0.006787	-0.000517	-0.004191	-0.002847	-0.002013	-0.001162	-0.001162
Floors_3.5	0.020611	0.008056	0.009179	-0.005607	0.012249	-0.003870	0.012853	-0.004043	0.010317	0.011089	...	0.001757	-0.013153	-0.005443	-0.002172	-0.000808	0.097634	-0.000321	-0.000227	-0.000131	-0.000131
watFront_1	0.266369	0.065428	0.103818	0.021604	0.072075	0.080588	-0.026161	0.092885	0.030285	-0.014274	...	-0.010837	-0.013664	0.014143	0.009349	-0.003658	-0.002139	-0.001453	-0.001027	-0.000593	-0.000593
View_1	0.092607	0.050049	0.066511	-0.008287	0.021839	0.097164	-0.034053	0.033563	0.043251	0.018019	...	-0.023353	0.012343	0.023569	-0.003976	0.021698	-0.003064	-0.002081	-0.001472	-0.000850	-0.000850
View_2	0.148418	0.099570	0.135285	0.037278	0.077861	0.135064	-0.044616	0.032590	0.052009	0.005065	...	-0.026897	0.012208	0.041669	0.019876	0.012348	-0.005298	-0.003599	0.016491	-0.001469	-0.001469
View_3	0.182880	0.119851	0.158885	0.073871	0.091663	0.158213	-0.018873	0.050668	0.040773	-0.013892	...	-0.029884	0.032458	0.027027	0.023466	-0.006524	0.008619	-0.002591	-0.001832	-0.001057	-0.001057
View_4	0.307932	0.110849	0.169460	0.019172	0.107625	0.150292	-0.020229	0.080818	0.040748	0.013969	...	-0.030822	0.003644	0.035703	0.034369	0.013180	-0.003003	-0.002040	-0.001442	-0.000833	-0.000833
Cond_2	-0.051917	-0.067614	-0.065324	0.037617	-0.058925	-0.025309	-0.067277	-0.008571	0.023615	-0.022650	...	-0.009602	-0.020979	-0.023345	0.003902	-0.003759	-0.002197	-0.001493	-0.001055	-0.000609	-0.000609
Cond_3	0.007131	0.123475	0.102413	-0.011452	0.194555	-0.151498	0.391719	0.069268	0.017798	0.042297	...	-0.013565	0.023362	-0.003094	-0.016160	0.000765	-0.001738	0.012250	-0.007798	0.005000	-0.009253
Cond_4	-0.030715	-0.114994	-0.083794	0.013157	-0.142486	0.092712	-0.257414	-0.054833	-0.060803	-0.057481	...	0.027374	-0.028501	-0.001074	0.014645	-0.002471	-0.001783	-0.009948	0.010813	-0.004061	-0.004061
Cond_5	0.057585	-0.003155	-0.018136	-0.014497	-0.088453	0.127865	-0.244353	-0.030077	0.058646	0.025827	...	-0.015589	0.014152	0.016400	0.002455	0.004140	0.006846	-0.004870	-0.003444	-0.001988	0.023273
Grade_1	-0.007376	-0.018536	-0.013257	0.000947	-0.012309	-0.004481	-0.001854	-0.001429	-0.006858	-0.001436	...	-0.006210	-0.004649	-0.001924	-0.000768	-0.000285	-0.000167	-0.000113	-0.000080	-0.000046	-0.000046
Grade_3	-0.010733	-0.026908	-0.019028	0.003370	-0.016957	-0.007761	-0.010298	-0.002476	-0.010631	-0.019177	...	-0.010756	-0.008053	-0.003333	-0.001330	-0.000494	-0.000289	-0.000196	-0.000139	-0.000080	-0.000080
Grade_4	-0.032520	-0.049693	-0.056650	0.006190	-0.050233	-0.023573	-0.039768	-0.001472	-0.005180	-0.017343	...	-0.030923	-0.025054	-0.010368	-0.004138	-0.001538	-0.000899	-0.000611	-0.000432	-0.000249	-0.000249
Grade_5	-0.084513	-0.127585	-0.127055	0.022899	-0.108432	-0.060782	-0.128795	-0.009295	0.009456	-0.046521	...	-0.042388	-0.052912	-0.021703	-0.000180	-0.004466	-0.002611	-0.001773	-0.001254	-0.000724	-0.000724
Grade_6	-0.209329	-0.322861	-0.312096	-0.019164	-0.280453	-0.122919	-0.313452	0.008192	0.148866	-0.062725	...	-0.023006	-0.141357	-0.066479	-0.026486	-0.009762	-0.007916	-0.005377	-0.003802	-0.002195	-0.002195
Grade_7	-0.315803	-0.234908	-0.358513	-0.068004	-0.386797	-0.020268	-0.211814	-0.020668	0.070263	-0.040414	...	0.157398	-0.137591	-0.058891	-0.012655	-0.010735	0.002290	0.008492	0.006004	0.008067	0.008067
Grade_8	0.004705	0.120300	0.071326	-0.024086	0.059621	0.036462	0.199846	0.004496	-0.052505	0.026337	...	0.007825	0.057869	0.002166	0.012593	0.003272	0.001470	-0.004231	-0.007361	-0.004250	-0.004250
Grade_9	0.235897	0.203087	0.318419	0.049548	0.343198	0.018640	0.219956	0.019602	-0.087643	0.042223	...	-0.101614	0.157856	0.064624	0.000115	-0.002024	-0.003315	-0.006182	0.007672	-0.002524	-0.002524
Grade_10	0.340799	0.259672	0.369058	0.074335	0.376160	0.062053	0.148728	0.002240	-0.073181	0.052306	...	-0.091455	0.113100	0.070517	0.014387	0.014890	0.002691	-0.003921	-0.002773	-0.001601	-0.001601
Grade_11	0.357412	0.255943	0.345771	0.077035	0.341966	0.077705	0.098705	-0.003183	-0.058145	0.039394	...	-0.086537	0.082598	0.070141	0.024601	0.027062	-0.003364	0.018347	-0.001616	-0.000933	-0.000933
Grade_12	0.290834	0.166158	0.238806	0.061535	0.223412	0.077555	0.047053	-0.002884	-0.039395	0.017403	...	-0.046044	0.031374	0.047553	0.012036	0.014437	0.027719	-0.001078	-0.000762	-0.000440	-0.000440
Grade_13	0.211803	0.100505	0.144329	0.007758	0.126575	0.062683	0.004686	0.022997	0.003766	0.013143	...	-0.018605	-0.004616	0.036295	0.048017	-0.001030	-0.000602	-0.000409	-0.000289	-0.000167	-0.000167
Bedrooms_1	-0.058411	-0.119700	-0.125461	0.002743	-0.112286	-0.050263	-0.093017	0.020456	0.045904	0.004951	...	-0.088000	-0.065890	-0.027266	-0.010883	-0.004046	-0.002365	-0.001606	-0.001136	-0.000656	-0.000656
Bedrooms_2	-0.144571	-0.313781	-0.350008	-0.027807	-0.314731	-0.137460	-0.203107	-0.002876	0.200333	0.078832	...	-0.349277	-0.261520	-0.108222	-0.043196	-0.016058	-0.009387	-0.006376	-0.004508	-0.002603	-0.002603
Bedrooms_3	-0.183648	-0.166878	-0.272404	-0.015255	-0.224832	-0.144622	0.025095	-0.019092	-0.024937	-0.056487	...	1.000000	-0.623946	-0.258200	-0.103058	-0.038311	-0.022395	-0.015212	-0.010756	-0.006210	-0.006210
Bedrooms_4	0.177489	0.232540	0.353317	0.026631	0.338346	0.100140	0.130070	0.004612	-0.104634	-0.019408	...	-0.623946	1.000000	-0.193327	-0.077165	-0.028685	-0.016768	-0.011390	-0.008053	-0.004649	-0.004649
Bedrooms_5	0.189925	0.265175	0.298006	0.011350	0.211493	0.222709	0.030812	0.011731	-0.040980	0.028817	...	-0.258200	-0.193327	1.000000	-0.031932	-0.011870	-0.006939	-0.004713	-0.003333	-0.001924	-0.001924
Bedrooms_6	0.087776	0.146150	0.148055	0.009485	0.096252	0.127151	-0.030627	0.023658	0.003215	0.016596	...	-0.103058	-0.077165	-0.031932	1.000000	-0.004738	-0.002770	-0.001881	-0.001330	-0.000768	-0.000768
Bedrooms_7	0.046995	0.091757	0.086562	0.005809	0.069528	0.049542	-0.011663	0.013192	0.006178	0.014833	...	-0.038311	-0.028685	-0.011870	-0.004738	1.000000	-0.001030	-0.000699	-0.000494	-0.000285	-0.000285
Bedrooms_8	0.037755	0.050548	0.045947	-0.003634	0.025845	0.046992	-0.012276	-0.005155	0.001791	0.012386	...	-0.022395	-0.016768	-0.006939	-0.002770	-0.001030	1.000000	-0.000409	-0.000289	-0.000167	-0.000167
Bedrooms_9	0.016064	0.050171	0.030756	-0.003873	0.023074	0.020652	-0.022318	0.010396	0.010193	0.011150	...	-0.015212	-0.011390	-0.004713	-0.001881	-0.000699	-0.000409	1.000000	-0.000196	-0.000113	-0.000113
Bedrooms_10	0.008962	0.019878	0.020870	-0.001777	0.009509	0.025518	-0.004548	-0.002476	-0.008722	0.003967	...	-0.010756	-0.008053	-0.003333	-0.001330	-0.000494	-0.000289	-0.000196	1.000000	-0.000080	-0.000080
Bedrooms_11	-0.000372	0.008475	0.006815	-0.001666	0.005024	0.004742	-0.012275	0.032424	0.003567	-0.000199	...	-0.006210	-0.004649	-0.001924	-0.000768	-0.000285	-0.000167	-0.000113	-0.000080	1.000000	-0.000046
Bedrooms_33	0.001851	-0.000529	-0.003406	-0.001496	-0.006148	0.004434	-0.005559	-0.001429	0.003186	0.006271	...	-0.006210	-0.004649	-0.001924	-0.000768	-0.000285	-0.000167	-0.000113	-0.000080	-0.000046	1.000000

51 rows × 51 columns

In [6]:

plt.figure(figsize = (12, 6))

plt.subplot(121)
plt.title('Price Distribuition')
sns.distplot(df['price'])

plt.subplot(122)
g1 = plt.scatter(range(df.shape[0]), np.sort(df.price.values))
g1= plt.title("Price Curve Distribuition", fontsize=15)
g1 = plt.xlabel("")
g1 = plt.ylabel("Amount(US)", fontsize=12)

plt.subplots_adjust(wspace = 0.3, hspace = 0.5,
                    top = 0.9)
plt.show()

In [7]:

print("Price Min")
print(df['price'].min())
print("Price Mean")
print(df['price'].mean())
print("Price Median")
print(df['price'].median())
print("Price Max")
print(df['price'].max())
print("Price Std")
print(df['price'].std())

Price Min
75000.0
Price Mean
540088.1417665294
Price Median
450000.0
Price Max
7700000.0
Price Std
367127.1964826997

In [8]:

plt.figure(figsize = (8, 5))
sns.jointplot(df.sqft_living, df.price, 
              alpha = 0.5)
plt.xlabel('Sqft Living')
plt.ylabel('Sale Price')
plt.show()

<Figure size 576x360 with 0 Axes>

In [9]:

condition = df['condition'].value_counts()

print("Condition counting: ")
print(condition)

fig, ax = plt.subplots(ncols=2, figsize=(14,5))
sns.countplot(x='condition', data=df, ax=ax[0])
sns.boxplot(x='condition', y= 'price',
            data=df, ax=ax[1])
plt.show()

Condition counting: 
3    14031
4     5679
5     1701
2      172
1       30
Name: condition, dtype: int64

In [10]:

plt.figure(figsize = (12,8))
g = sns.FacetGrid(data=df, hue='condition',height= 5, aspect=2)
g.map(plt.scatter, "sqft_living", "price")
plt.show()

<Figure size 864x576 with 0 Axes>

Exploring bathrooms columns by price and conditions

In [11]:

df["bathrooms"] = df['bathrooms'].round(0).astype(int)

print("Freuency bathroom description:")
print(df["bathrooms"].value_counts())

plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)

ax1 = plt.subplot(221)
ax1 = sns.countplot(x="bathrooms", data=df,
                    ax=ax1)
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90)
ax1.set_title("Bathrooms counting", fontsize=15)
ax1.set_xlabel("Bathrooms number")
ax1.set_xlabel("count")

ax2 = plt.subplot(222)
ax2 = sns.boxplot(x="bathrooms", y='price',
                  data=df, ax=ax2)
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90)
ax2.set_title("Bathrooms distribution price", fontsize=15)
ax2.set_xlabel("Bathrooms number")
ax2.set_ylabel("log Price(US)")

ax0 = plt.subplot(212)
ax0 = sns.stripplot(x="bathrooms", y="price",
                    data=df, alpha=0.5,
                    jitter=True, hue="condition")
ax0.set_title("Better view distribuition through price", fontsize=15)
ax0.set_xlabel("Bathroom number")
ax0.set_ylabel("log Price(US)")
ax0.set_xticklabels(ax0.get_xticklabels(),rotation=90)

plt.show()

Freuency bathroom description:
2    13851
1     3933
3     2527
4     1201
5       57
6       24
0       14
8        4
7        2
Name: bathrooms, dtype: int64

In [12]:

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

plt.figure(figsize = (12,6))
ax1 = plt.subplot2grid((2,2), (0,0), colspan = 2)
for val in range(1,6,1):
    indeX = df.condition == val
    ax1.scatter(df.sqft_living.loc[indeX], df.price.loc[indeX], label = val, alpha=0.5)
ax1.legend(bbox_to_anchor = [1.1, 1])
ax1.set_xlabel('sqfit living area')
ax1.set_ylabel('Price house')
ax1.set_title('Sqft Living - Price w.r.t Conditions')

ax2 = plt.subplot2grid((2,2), (1,0))
sns.boxplot(x = 'condition', y = 'price', data = df, ax = ax2)
ax2.set_title('Box Plot Condition & Price', fontsize = 12)

ax3 = plt.subplot2grid((2,2), (1,1))
cubicQual = df.groupby(['condition'])['price'].mean().round(0)
testTrain = df.loc[:, ['condition', 'price']].copy()
testTrain['sqCond'] = np.power(testTrain['condition'],2)
mdl = linear_model.LinearRegression()
mdl.fit(testTrain[['condition', 'sqCond']], testTrain['price'])
y_pred = mdl.predict(testTrain[['condition', 'sqCond']])
print("Mean squared error: %.2f" % mean_squared_error(y_pred, testTrain.price))
# Plot outputs
ax3.scatter(testTrain['condition'], testTrain['price'],  color='black')
ax3.plot(testTrain['condition'], y_pred, color='blue', linewidth=3)
ax3.set_title('LinReg, price ~ condtion + sqft_cond', fontsize = 12)
ax3.set_xlabel('Condition Rate')
plt.subplots_adjust(hspace = 0.5, top = 0.9)
plt.suptitle('Condition Effect to Sale Price', fontsize = 14)
plt.show()

Mean squared error: 134582326689.42

In [13]:

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

plt.figure(figsize = (12,6))
ax1 = plt.subplot2grid((2,2), (0,0), colspan = 2)

for val in range(0,5,1):
    indeX = df.view == val
    ax1.scatter(df.sqft_living.loc[indeX], df.price.loc[indeX], label = val, alpha=0.4)
ax1.legend(bbox_to_anchor = [1.1, 1])
ax1.set_xlabel('sqfit living area')
ax1.set_ylabel('Price house')
ax1.set_title('Sqft Living - Price w.r.t View')

ax2 = plt.subplot2grid((2,2), (1,0))
sns.boxplot(x = 'view', y = 'price', data = df, ax = ax2)
ax2.set_title('Box Plot View & Price', fontsize = 12)

ax3 = plt.subplot2grid((2,2), (1,1))
cubicV = df.groupby(['view'])['price'].mean().round(0)
testTrain = df.loc[:, ['view', 'price']].copy()
testTrain['sqview'] = np.power(testTrain['view'],2)
mdl = linear_model.LinearRegression()
mdl.fit(testTrain[['view', 'sqview']], testTrain['price'])
y_pred = mdl.predict(testTrain[['view', 'sqview']])
print("Mean squared error: %.2f" % mean_squared_error(y_pred, testTrain.price))
# Plot outputs
ax3.scatter(testTrain['view'], testTrain['price'],  color='black')
ax3.plot(testTrain['view'], y_pred, color='blue', linewidth=3)
ax3.set_title('LinReg, price ~ condtion + sqft_cond', fontsize = 12)
ax3.set_xlabel('View rate')
plt.subplots_adjust(hspace = 0.5, top = 0.9)
plt.suptitle('"VIEW" Effect To SalePrice', fontsize = 14)
plt.show()

Mean squared error: 112971203793.79

In [15]:

bedrooms = df.bedrooms.value_counts()


plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)


ax1 = plt.subplot(221)
ax1 = sns.countplot(x="bedrooms", data=df,
                    ax=ax1)
ax1.set_title("bedrooms counting", fontsize=15)
ax1.set_xlabel("Bathrooms number")
ax1.set_ylabel("count")

ax2 = plt.subplot(222)
ax2 = sns.regplot(x="bedrooms", y='price', 
                  data=df, ax=ax2, x_jitter=True)
ax2.set_title("Bedrooms distribution price", fontsize=15)
ax2.set_xlabel("Bedrooms number")
ax2.set_ylabel("log Price(US)")

ax0 = plt.subplot(212)
ax0 = sns.boxenplot(x="bedrooms", y="price",
                    data=df)
ax0.set_title("Better understaning price", fontsize=15)
ax0.set_xlabel("Bedrooms")
ax0.set_ylabel("log Price(US)")

plt.show()

In [16]:

print("Floors counting description")
print(df['floors'].value_counts())


plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)

ax1 = plt.subplot(221)
ax1 = sns.boxenplot(x="floors", y='price', 
                    data=df, ax=ax1, )
ax1.set_title("Floors counting", fontsize=15)
ax1.set_xlabel("Floors number")
ax1.set_ylabel("Count")

ax2 = plt.subplot(222)
ax2 = sns.countplot(x="floors",
                  data=df, ax=ax2)
ax2.set_title("Floor distribution by price", fontsize=15)
ax2.set_xlabel("Floor number")
ax2.set_ylabel("log Price(US)")

ax0 = plt.subplot(212)
ax0 = sns.regplot(x="floors", y="price", #I need to change floors by sqft_living and hue bye floors
                    data=df, x_jitter=True)
ax0.set_title("Better understaning price by floor", fontsize=15)
ax0.set_xlabel("Floor")
ax0.set_ylabel("log Price(US)")

plt.show()

Floors counting description
1.0    10680
2.0     8241
1.5     1910
3.0      613
2.5      161
3.5        8
Name: floors, dtype: int64

In [17]:

plt.figure(figsize = (12,8))
g=sns.lmplot(x="sqft_living", y="price", aspect=1.8,
                    data=df, hue="floors", fit_reg=False)
g.set_titles("Floors by sqft_living and price", fontsize=15)
g.set_xlabels("Sqft Living")
g.set_ylabels("Price(US)")
plt.show()

<Figure size 864x576 with 0 Axes>

In [18]:

print("Grade counting description")
print(df['grade'].value_counts())


plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)

ax1 = plt.subplot(221)
ax1 = sns.boxenplot(x="grade", y='price', 
                    data=df, ax=ax1, )
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90)
ax1.set_title("grade counting", fontsize=15)
ax1.set_xlabel("Grade number")
ax1.set_ylabel("Count")

ax2 = plt.subplot(222)
ax2 = sns.countplot(x="grade",
                  data=df, ax=ax2)
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90)
ax2.set_title("Grade distribution price", fontsize=15)
ax2.set_xlabel("Grade number")
ax2.set_ylabel("log Price(US)")

ax0 = plt.subplot(212)
ax0 = sns.regplot(x="grade", y="price",
                    data=df, x_jitter=True)
ax0.set_title("Better understaning price by grade", fontsize=15)
ax0.set_xlabel("Grade")
ax0.set_ylabel("log Price(US)")
ax0.set_xticklabels(ax0.get_xticklabels(),rotation=90)

plt.show()

Grade counting description
7     8981
8     6068
9     2615
6     2038
10    1134
11     399
5      242
12      90
4       29
13      13
3        3
1        1
Name: grade, dtype: int64

In [19]:

#Clearly view of bathrooms and bedrooms correlation

bath = ['bathrooms', 'bedrooms']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[bath[0]], df[bath[1]]).style.background_gradient(cmap = cm)

Out[19]:

bedrooms	0	1	2	3	4	5	6	7	8	9	10	11	33
bathrooms
0	7	4	2	0	1	0	0	0	0	0	0	0	0
1	2	167	1587	1800	327	43	6	1	0	0	0	0	0
2	4	28	1129	7186	4709	695	90	7	1	0	1	0	1
3	0	0	41	656	1219	506	88	7	6	2	1	1	0
4	0	0	1	182	601	321	72	17	4	3	0	0	0
5	0	0	0	0	19	22	12	2	1	0	1	0	0
6	0	0	0	0	6	13	2	2	1	0	0	0	0
7	0	0	0	0	0	1	0	1	0	0	0	0	0
8	0	0	0	0	0	0	2	1	0	1	0	0	0

In [20]:

bath_cond = ['bathrooms', 'condition']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[bath_cond[0]], df[bath_cond[1]]).style.background_gradient(cmap = cm)

Out[20]:

condition	1	2	3	4	5
bathrooms
0	1	1	9	3	0
1	20	91	2211	1300	311
2	9	75	9000	3702	1065
3	0	5	1713	553	256
4	0	0	1026	109	66
5	0	0	46	8	3
6	0	0	21	3	0
7	0	0	2	0	0
8	0	0	3	1	0

In [21]:

bed_cond = ['bedrooms', 'condition']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[bed_cond[0]], df[bed_cond[1]]).style.background_gradient(cmap = cm)

Out[21]:

condition	1	2	3	4	5
bedrooms
0	1	1	10	1	0
1	4	11	124	48	12
2	12	51	1779	718	200
3	8	69	6308	2711	728
4	4	36	4580	1682	580
5	0	1	1031	418	151
6	1	3	158	87	23
7	0	0	25	9	4
8	0	0	8	3	2
9	0	0	6	0	0
10	0	0	1	2	0
11	0	0	1	0	0
33	0	0	0	0	1

In [22]:

cond_water = ['condition', 'waterfront']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[cond_water[0]], df[cond_water[1]]).style.background_gradient(cmap = cm)

Out[22]:

waterfront	0	1
condition
1	29	1
2	171	1
3	13940	91
4	5629	50
5	1681	20

In [23]:

grade_cond = ['grade', 'condition']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[grade_cond[0]], df[grade_cond[1]]).style.background_gradient(cmap = cm)

Out[23]:

condition	1	2	3	4	5
grade
1	1	0	0	0	0
3	0	1	1	0	1
4	1	5	13	10	0
5	9	15	100	84	34
6	11	59	1035	685	248
7	6	75	5234	2833	833
8	2	13	4269	1394	390
9	0	2	2041	446	126
10	0	2	921	156	55
11	0	0	332	56	11
12	0	0	74	13	3
13	0	0	11	2	0

In [24]:

grade_bed = ['grade', 'bedrooms']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[grade_bed[0]], df[grade_bed[1]]).style.background_gradient(cmap = cm)

Out[24]:

bedrooms	0	1	2	3	4	5	6	7	8	9	10	11	33
grade
1	1	0	0	0	0	0	0	0	0	0	0	0	0
3	0	3	0	0	0	0	0	0	0	0	0	0	0
4	2	12	14	1	0	0	0	0	0	0	0	0	0
5	0	37	114	62	21	5	3	0	0	0	0	0	0
6	0	78	824	854	233	41	7	1	0	0	0	0	0
7	6	52	1205	4917	2177	501	98	11	6	4	2	1	1
8	3	14	499	2796	2194	455	90	12	4	1	0	0	0
9	0	2	78	832	1351	313	33	4	1	0	1	0	0
10	0	1	21	296	615	173	22	5	1	0	0	0	0
11	0	0	3	56	239	83	13	4	0	1	0	0	0
12	1	0	2	9	49	24	3	1	1	0	0	0	0
13	0	0	0	1	3	6	3	0	0	0	0	0	0

In [25]:

grade_bath = ['grade', 'bathrooms']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[grade_bath[0]], df[grade_bath[1]]).style.background_gradient(cmap = cm)

Out[25]:

bathrooms	0	1	2	3	4	5	6	7	8
grade
1	1	0	0	0	0	0	0	0	0
3	2	1	0	0	0	0	0	0	0
4	0	28	1	0	0	0	0	0	0
5	1	204	36	1	0	0	0	0	0
6	2	1441	574	18	3	0	0	0	0
7	4	2104	6216	575	75	6	0	0	1
8	3	145	4788	882	244	4	2	0	0
9	0	9	1671	629	303	3	0	0	0
10	0	0	480	309	333	11	1	0	0
11	0	1	77	95	195	23	7	1	0
12	1	0	7	17	45	8	10	1	1
13	0	0	1	1	3	2	4	0	2

Correlation matrix¶

In [26]:

corr = df[['bathrooms', 'bedrooms', 'sqft_living', 'sqft_lot', 'floors', 'grade', 'price']]

plt.figure(figsize=(10,8))
plt.title('Correlation of variables')
sns.heatmap(corr.astype(float).corr(),vmax=1.0,  annot=True)
plt.show()

Year built distribuition¶

In [27]:

sns.distplot(df['yr_built'])

Out[27]:

<matplotlib.axes._subplots.AxesSubplot at 0x1dbc5efdb48>

In [28]:

g = sns.catplot(x="yr_built", y = "price", data=df[df['price'] < 1000000], 
                   height= 7, aspect = 2, kind="box" )
g.set_xticklabels(rotation=90)
plt.show()

In [29]:

df = df.merge(pd.get_dummies(df.floors, drop_first=True, prefix='Floors'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.waterfront, drop_first=True, prefix='watFront'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.view, drop_first=True, prefix='View'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.condition, drop_first=True, prefix='Cond'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.grade, prefix='Grade'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.bedrooms, drop_first=True, prefix='Bedrooms'), left_index=True, right_index=True)

In [30]:

del df['floors'],
del df['waterfront']
del df['view']
del df['condition']
del df['grade']
del df['bedrooms']

In [31]:

plt.figure(figsize=(15,12))
plt.title('Correlation of variables', fontsize=20)
sns.heatmap(df.corr().astype(float).corr(),vmax=1.0)
plt.show()

In [32]:

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV # to split the data
from sklearn.metrics import explained_variance_score, median_absolute_error, r2_score, mean_squared_error, accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model

In [33]:

from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split # Model evaluation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler # Preprocessing
from sklearn.linear_model import Lasso, Ridge, ElasticNet, RANSACRegressor, SGDRegressor, HuberRegressor, BayesianRidge # Linear models
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor  # Ensemble methods
from xgboost import XGBRegressor, plot_importance # XGBoost
from sklearn.svm import SVR, SVC, LinearSVC  # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline # Streaming pipelines
from sklearn.decomposition import KernelPCA, PCA # Dimensionality reduction
from sklearn.feature_selection import SelectFromModel # Dimensionality reduction
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV # Model evaluation
from sklearn.base import clone # Clone estimator
from sklearn.metrics import mean_squared_error as MSE

In [34]:

df.drop(['id', 'date'], axis=1, inplace=True)
X = df.drop("price",axis=1).values
y = df["price"].values

In [35]:

# Spliting X and y into train and test version
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=3)

Selecting the most important features¶

In [47]:

thresh = 5 * 10**(-3)
model = XGBRegressor(objective ='reg:squarederror')
model.fit(X_train, y_train)
#select features using threshold
selection = SelectFromModel(model, threshold=thresh, prefit=True)
select_X_train = selection.transform(X_train)
# eval model
select_X_val = selection.transform(X_test)
# test 
select_X_test = selection.transform(X_test)

Creating the pipeline with the models to we do a first evaluate of their power to this problem¶

In [37]:

pipelines = []
seed = 2

pipelines.append(
                ("Scaled_Ridge", 
                 Pipeline([
                     ("Scaler", StandardScaler()), 
                     ("Ridge", Ridge(random_state=seed, tol=10 ))
                      ]))
                )
pipelines.append(
                ("Scaled_Lasso", 
                 Pipeline([
                     ("Scaler", StandardScaler()), 
                     ("Lasso", Lasso(random_state=seed, tol=1))
                      ]))
                )
pipelines.append(
                ("Scaled_Elastic", 
                 Pipeline([
                     ("Scaler", StandardScaler()), 
                     ("Lasso", ElasticNet(random_state=seed))
                      ]))
                )

pipelines.append(
                ("Scaled_SVR",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("SVR",  SVR(kernel='linear', C=1e2, degree=5))
                 ])
                )
                )

pipelines.append(
                ("Scaled_RF_reg",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("RF", RandomForestRegressor(random_state=seed))
                 ])
                )
                )

pipelines.append(
                ("Scaled_ET_reg",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("ET", ExtraTreesRegressor(random_state=seed))
                 ])
                )
                )
pipelines.append(
                ("Scaled_BR_reg",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("BR", BaggingRegressor(random_state=seed))
                 ]))) 

pipelines.append(
                ("Scaled_Hub-Reg",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("Hub-Reg", HuberRegressor())
                 ]))) 
pipelines.append(
                ("Scaled_BayRidge",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("BR", BayesianRidge())
                 ]))) 

pipelines.append(
                ("Scaled_XGB_reg",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("XGBR", XGBRegressor(seed=seed))
                 ]))) 

pipelines.append(
                ("Scaled_DT_reg",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("DT_reg", DecisionTreeRegressor())
                 ]))) 

pipelines.append(
                ("Scaled_KNN_reg",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("KNN_reg", KNeighborsRegressor())
                 ])))

pipelines.append(
                ("Scaled_Gboost-Reg",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("GBoost-Reg", GradientBoostingRegressor())
                 ])))

pipelines.append(
                ("Scaled_RFR_PCA",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("PCA", PCA(n_components=3)),
                     ("XGB", RandomForestRegressor())
                 ])))

pipelines.append(
                ("Scaled_XGBR_PCA",
                 Pipeline([
                     ("Scaler", StandardScaler()),
                     ("PCA", PCA(n_components=3)),
                     ("XGB", XGBRegressor())
                 ])))

#'neg_mean_absolute_error', 'neg_mean_squared_error','r2'
scoring = 'r2'
n_folds = 7

results, names  = [], [] 

for name, model  in pipelines:
    kfold = KFold(n_splits=n_folds, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv= kfold,
                                 scoring=scoring, n_jobs=-1)    
    names.append(name)
    results.append(cv_results)    
    msg = "%s: %f (+/- %f)" % (name, cv_results.mean(),  cv_results.std())
    print(msg)
    
# boxplot algorithm comparison
fig = plt.figure(figsize=(15,6))
fig.suptitle('Algorithm Comparison', fontsize=22)
ax = fig.add_subplot(111)
sns.boxplot(x=names, y=results)
ax.set_xticklabels(names)
ax.set_xlabel("Algorithmn Name", fontsize=20)
ax.set_ylabel("R Squared Score of Models", fontsize=18)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()

Scaled_Ridge: 0.727431 (+/- 0.009924)
Scaled_Lasso: 0.683566 (+/- 0.013035)
Scaled_Elastic: 0.700632 (+/- 0.009749)
Scaled_SVR: 0.671598 (+/- 0.009346)
Scaled_RF_reg: 0.868115 (+/- 0.010678)
Scaled_ET_reg: 0.873234 (+/- 0.011494)
Scaled_BR_reg: 0.847559 (+/- 0.013386)
Scaled_Hub-Reg: 0.701841 (+/- 0.009565)
Scaled_BayRidge: 0.727563 (+/- 0.009940)
Scaled_XGB_reg: 0.856188 (+/- 0.006032)
Scaled_DT_reg: 0.734013 (+/- 0.031810)
Scaled_KNN_reg: 0.716625 (+/- 0.020929)
Scaled_Gboost-Reg: 0.857853 (+/- 0.008397)
Scaled_RFR_PCA: 0.645886 (+/- 0.019619)
Scaled_XGBR_PCA: 0.657901 (+/- 0.017554)

Very cool results!

We can see that we got good models with a good r2 score.

All RandomForestRegression, ExtraTreesRgressor, BaggingRegressor and XGBRegressor have r2 higher than 0.80

I will set hyper parameters to the best models and try increase this score

Testing the best models¶

XGBoost Regressor¶

In [38]:

xgb = XGBRegressor(objective ='reg:squarederror',n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

In [39]:

xgb.fit(X_train, y_train)

Out[39]:

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.08, max_delta_step=0,
             max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.75, verbosity=1)

In [40]:

y_hat = xgb.predict(X_test)

In [41]:

xgb.score(X_test,y_test)

Out[41]:

0.8938452606979226

Excellent result of XGB Regressor with some arbitrary params.

Randominzed Search in RandomForest Regressor¶

In [42]:

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

Using Random Search to find the best Hyper Parameters¶

In [43]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.2min finished

Out[43]:

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                                                   n_jobs=None, oob_score=Fals...
                   iid='deprecated', n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 31, 52, 73, 94,
                                                         115, 136, 157, 178,
                                                         200]},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring=None, verbose=2)

In [44]:

#Knowing the best parameters
rf_random.best_params_

Out[44]:

{'n_estimators': 157,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 90,
 'bootstrap': True}

In [50]:

# Predicting with best params
y_hat_Search = rf_random.predict(X_test)

Random Forest Score:  0.880822431682647

Printing the difference between the 2 models scores¶

In [52]:

print("XGBoost Regressor R2-score: {}".format(round(r2_score(y_hat, y_test),4)))
print("RandomForest Regressor Prediction R2-score: {}".format(round(r2_score(y_hat_Search, y_test),4)))
print("\nMSE of XGBoost Regressor: {}".format(median_absolute_error(y_hat, y_test)))
print("MSE of RandomForest Regressor: {} ".format(median_absolute_error(y_hat_Search, y_test)))

XGBoost Regressor R2-score: 0.8724
RandomForest Regressor Prediction R2-score: 0.8545

MSE of XGBoost Regressor: 40211.765625
MSE of RandomForest Regressor: 39935.995261072996

In [53]:

print("XG Boost Score: ", xgb.score(X_test,y_test))
print("Random Forest Score: ", rf_random.score(X_test, y_test))

XG Boost Score:  0.8938452606979226
Random Forest Score:  0.880822431682647