import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import scipy.stats as st
# Suppress Future Warnings
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("kc_house_data.csv")
print(df.shape)
print(df.nunique())
(21613, 21) id 21436 date 372 price 4028 bedrooms 13 bathrooms 30 sqft_living 1038 sqft_lot 9782 floors 6 waterfront 2 view 5 condition 5 grade 12 sqft_above 946 sqft_basement 306 yr_built 116 yr_renovated 70 zipcode 70 lat 5034 long 752 sqft_living15 777 sqft_lot15 8689 dtype: int64
print(df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21613 entries, 0 to 21612 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 21613 non-null int64 1 date 21613 non-null object 2 price 21613 non-null float64 3 bedrooms 21613 non-null int64 4 bathrooms 21613 non-null float64 5 sqft_living 21613 non-null int64 6 sqft_lot 21613 non-null int64 7 floors 21613 non-null float64 8 waterfront 21613 non-null int64 9 view 21613 non-null int64 10 condition 21613 non-null int64 11 grade 21613 non-null int64 12 sqft_above 21613 non-null int64 13 sqft_basement 21613 non-null int64 14 yr_built 21613 non-null int64 15 yr_renovated 21613 non-null int64 16 zipcode 21613 non-null int64 17 lat 21613 non-null float64 18 long 21613 non-null float64 19 sqft_living15 21613 non-null int64 20 sqft_lot15 21613 non-null int64 dtypes: float64(5), int64(15), object(1) memory usage: 3.5+ MB None
df.head()
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows × 21 columns
df.corr()
price | bathrooms | sqft_living | sqft_lot | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | ... | Bedrooms_3 | Bedrooms_4 | Bedrooms_5 | Bedrooms_6 | Bedrooms_7 | Bedrooms_8 | Bedrooms_9 | Bedrooms_10 | Bedrooms_11 | Bedrooms_33 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
price | 1.000000 | 0.518912 | 0.702035 | 0.089661 | 0.605567 | 0.323816 | 0.054012 | 0.126434 | -0.053203 | 0.307003 | ... | -0.183648 | 0.177489 | 0.189925 | 0.087776 | 0.046995 | 0.037755 | 0.016064 | 0.008962 | -0.000372 | 0.001851 |
bathrooms | 0.518912 | 1.000000 | 0.701784 | 0.089988 | 0.599360 | 0.334909 | 0.377433 | 0.069322 | -0.144088 | 0.049235 | ... | -0.166878 | 0.232540 | 0.265175 | 0.146150 | 0.091757 | 0.050548 | 0.050171 | 0.019878 | 0.008475 | -0.000529 |
sqft_living | 0.702035 | 0.701784 | 1.000000 | 0.172826 | 0.876597 | 0.435043 | 0.318049 | 0.055363 | -0.199430 | 0.052529 | ... | -0.272404 | 0.353317 | 0.298006 | 0.148055 | 0.086562 | 0.045947 | 0.030756 | 0.020870 | 0.006815 | -0.003406 |
sqft_lot | 0.089661 | 0.089988 | 0.172826 | 1.000000 | 0.183512 | 0.015286 | 0.053080 | 0.007644 | -0.129574 | -0.085683 | ... | -0.015255 | 0.026631 | 0.011350 | 0.009485 | 0.005809 | -0.003634 | -0.003873 | -0.001777 | -0.001666 | -0.001496 |
sqft_above | 0.605567 | 0.599360 | 0.876597 | 0.183512 | 1.000000 | -0.051943 | 0.423898 | 0.023285 | -0.261190 | -0.000816 | ... | -0.224832 | 0.338346 | 0.211493 | 0.096252 | 0.069528 | 0.025845 | 0.023074 | 0.009509 | 0.005024 | -0.006148 |
sqft_basement | 0.323816 | 0.334909 | 0.435043 | 0.015286 | -0.051943 | 1.000000 | -0.133124 | 0.071323 | 0.074845 | 0.110538 | ... | -0.144622 | 0.100140 | 0.222709 | 0.127151 | 0.049542 | 0.046992 | 0.020652 | 0.025518 | 0.004742 | 0.004434 |
yr_built | 0.054012 | 0.377433 | 0.318049 | 0.053080 | 0.423898 | -0.133124 | 1.000000 | -0.224874 | -0.346869 | -0.148122 | ... | 0.025095 | 0.130070 | 0.030812 | -0.030627 | -0.011663 | -0.012276 | -0.022318 | -0.004548 | -0.012275 | -0.005559 |
yr_renovated | 0.126434 | 0.069322 | 0.055363 | 0.007644 | 0.023285 | 0.071323 | -0.224874 | 1.000000 | 0.064357 | 0.029398 | ... | -0.019092 | 0.004612 | 0.011731 | 0.023658 | 0.013192 | -0.005155 | 0.010396 | -0.002476 | 0.032424 | -0.001429 |
zipcode | -0.053203 | -0.144088 | -0.199430 | -0.129574 | -0.261190 | 0.074845 | -0.346869 | 0.064357 | 1.000000 | 0.267048 | ... | -0.024937 | -0.104634 | -0.040980 | 0.003215 | 0.006178 | 0.001791 | 0.010193 | -0.008722 | 0.003567 | 0.003186 |
lat | 0.307003 | 0.049235 | 0.052529 | -0.085683 | -0.000816 | 0.110538 | -0.148122 | 0.029398 | 0.267048 | 1.000000 | ... | -0.056487 | -0.019408 | 0.028817 | 0.016596 | 0.014833 | 0.012386 | 0.011150 | 0.003967 | -0.000199 | 0.006271 |
long | 0.021626 | 0.149157 | 0.240223 | 0.229521 | 0.343803 | -0.144765 | 0.409356 | -0.068372 | -0.564072 | -0.135512 | ... | 0.003579 | 0.120164 | 0.025417 | -0.021510 | -0.007991 | -0.009559 | -0.011530 | 0.000940 | -0.007202 | -0.005656 |
sqft_living15 | 0.585379 | 0.497484 | 0.756420 | 0.144608 | 0.731870 | 0.200355 | 0.326229 | -0.002673 | -0.279033 | 0.048858 | ... | -0.201265 | 0.303741 | 0.188249 | 0.048366 | 0.021353 | 0.009981 | 0.002548 | 0.003555 | -0.005623 | -0.006516 |
sqft_lot15 | 0.082447 | 0.086952 | 0.183286 | 0.718557 | 0.194050 | 0.017276 | 0.070958 | 0.007854 | -0.147221 | -0.086419 | ... | 0.001367 | 0.024002 | 0.003326 | 0.008222 | 0.004677 | -0.004863 | -0.005115 | -0.001762 | -0.001945 | -0.002010 |
Floors_1.5 | 0.016023 | -0.095399 | -0.058316 | 0.013978 | -0.053293 | -0.021304 | -0.380517 | 0.045535 | 0.143418 | 0.068018 | ... | -0.026902 | 0.031429 | 0.027085 | 0.008719 | 0.014171 | -0.000990 | -0.005188 | -0.003668 | -0.002118 | -0.002118 |
Floors_2.0 | 0.232662 | 0.369959 | 0.420018 | 0.019461 | 0.578689 | -0.211141 | 0.526940 | 0.003141 | -0.184719 | -0.052540 | ... | -0.120120 | 0.216322 | 0.059850 | 0.013063 | 0.010257 | 0.004053 | 0.009791 | 0.006923 | 0.008665 | -0.005340 |
Floors_2.5 | 0.122770 | 0.079446 | 0.100509 | 0.004686 | 0.104773 | 0.012540 | -0.028593 | 0.030001 | 0.023581 | 0.016589 | ... | -0.018567 | 0.007779 | 0.022752 | 0.057797 | 0.022052 | -0.002125 | 0.063155 | -0.001021 | -0.000589 | -0.000589 |
Floors_3.0 | 0.019750 | 0.063825 | -0.054244 | -0.043911 | -0.014236 | -0.085931 | 0.196846 | -0.024820 | 0.102116 | 0.120427 | ... | 0.070733 | -0.080283 | -0.034490 | -0.006787 | -0.000517 | -0.004191 | -0.002847 | -0.002013 | -0.001162 | -0.001162 |
Floors_3.5 | 0.020611 | 0.008056 | 0.009179 | -0.005607 | 0.012249 | -0.003870 | 0.012853 | -0.004043 | 0.010317 | 0.011089 | ... | 0.001757 | -0.013153 | -0.005443 | -0.002172 | -0.000808 | 0.097634 | -0.000321 | -0.000227 | -0.000131 | -0.000131 |
watFront_1 | 0.266369 | 0.065428 | 0.103818 | 0.021604 | 0.072075 | 0.080588 | -0.026161 | 0.092885 | 0.030285 | -0.014274 | ... | -0.010837 | -0.013664 | 0.014143 | 0.009349 | -0.003658 | -0.002139 | -0.001453 | -0.001027 | -0.000593 | -0.000593 |
View_1 | 0.092607 | 0.050049 | 0.066511 | -0.008287 | 0.021839 | 0.097164 | -0.034053 | 0.033563 | 0.043251 | 0.018019 | ... | -0.023353 | 0.012343 | 0.023569 | -0.003976 | 0.021698 | -0.003064 | -0.002081 | -0.001472 | -0.000850 | -0.000850 |
View_2 | 0.148418 | 0.099570 | 0.135285 | 0.037278 | 0.077861 | 0.135064 | -0.044616 | 0.032590 | 0.052009 | 0.005065 | ... | -0.026897 | 0.012208 | 0.041669 | 0.019876 | 0.012348 | -0.005298 | -0.003599 | 0.016491 | -0.001469 | -0.001469 |
View_3 | 0.182880 | 0.119851 | 0.158885 | 0.073871 | 0.091663 | 0.158213 | -0.018873 | 0.050668 | 0.040773 | -0.013892 | ... | -0.029884 | 0.032458 | 0.027027 | 0.023466 | -0.006524 | 0.008619 | -0.002591 | -0.001832 | -0.001057 | -0.001057 |
View_4 | 0.307932 | 0.110849 | 0.169460 | 0.019172 | 0.107625 | 0.150292 | -0.020229 | 0.080818 | 0.040748 | 0.013969 | ... | -0.030822 | 0.003644 | 0.035703 | 0.034369 | 0.013180 | -0.003003 | -0.002040 | -0.001442 | -0.000833 | -0.000833 |
Cond_2 | -0.051917 | -0.067614 | -0.065324 | 0.037617 | -0.058925 | -0.025309 | -0.067277 | -0.008571 | 0.023615 | -0.022650 | ... | -0.009602 | -0.020979 | -0.023345 | 0.003902 | -0.003759 | -0.002197 | -0.001493 | -0.001055 | -0.000609 | -0.000609 |
Cond_3 | 0.007131 | 0.123475 | 0.102413 | -0.011452 | 0.194555 | -0.151498 | 0.391719 | 0.069268 | 0.017798 | 0.042297 | ... | -0.013565 | 0.023362 | -0.003094 | -0.016160 | 0.000765 | -0.001738 | 0.012250 | -0.007798 | 0.005000 | -0.009253 |
Cond_4 | -0.030715 | -0.114994 | -0.083794 | 0.013157 | -0.142486 | 0.092712 | -0.257414 | -0.054833 | -0.060803 | -0.057481 | ... | 0.027374 | -0.028501 | -0.001074 | 0.014645 | -0.002471 | -0.001783 | -0.009948 | 0.010813 | -0.004061 | -0.004061 |
Cond_5 | 0.057585 | -0.003155 | -0.018136 | -0.014497 | -0.088453 | 0.127865 | -0.244353 | -0.030077 | 0.058646 | 0.025827 | ... | -0.015589 | 0.014152 | 0.016400 | 0.002455 | 0.004140 | 0.006846 | -0.004870 | -0.003444 | -0.001988 | 0.023273 |
Grade_1 | -0.007376 | -0.018536 | -0.013257 | 0.000947 | -0.012309 | -0.004481 | -0.001854 | -0.001429 | -0.006858 | -0.001436 | ... | -0.006210 | -0.004649 | -0.001924 | -0.000768 | -0.000285 | -0.000167 | -0.000113 | -0.000080 | -0.000046 | -0.000046 |
Grade_3 | -0.010733 | -0.026908 | -0.019028 | 0.003370 | -0.016957 | -0.007761 | -0.010298 | -0.002476 | -0.010631 | -0.019177 | ... | -0.010756 | -0.008053 | -0.003333 | -0.001330 | -0.000494 | -0.000289 | -0.000196 | -0.000139 | -0.000080 | -0.000080 |
Grade_4 | -0.032520 | -0.049693 | -0.056650 | 0.006190 | -0.050233 | -0.023573 | -0.039768 | -0.001472 | -0.005180 | -0.017343 | ... | -0.030923 | -0.025054 | -0.010368 | -0.004138 | -0.001538 | -0.000899 | -0.000611 | -0.000432 | -0.000249 | -0.000249 |
Grade_5 | -0.084513 | -0.127585 | -0.127055 | 0.022899 | -0.108432 | -0.060782 | -0.128795 | -0.009295 | 0.009456 | -0.046521 | ... | -0.042388 | -0.052912 | -0.021703 | -0.000180 | -0.004466 | -0.002611 | -0.001773 | -0.001254 | -0.000724 | -0.000724 |
Grade_6 | -0.209329 | -0.322861 | -0.312096 | -0.019164 | -0.280453 | -0.122919 | -0.313452 | 0.008192 | 0.148866 | -0.062725 | ... | -0.023006 | -0.141357 | -0.066479 | -0.026486 | -0.009762 | -0.007916 | -0.005377 | -0.003802 | -0.002195 | -0.002195 |
Grade_7 | -0.315803 | -0.234908 | -0.358513 | -0.068004 | -0.386797 | -0.020268 | -0.211814 | -0.020668 | 0.070263 | -0.040414 | ... | 0.157398 | -0.137591 | -0.058891 | -0.012655 | -0.010735 | 0.002290 | 0.008492 | 0.006004 | 0.008067 | 0.008067 |
Grade_8 | 0.004705 | 0.120300 | 0.071326 | -0.024086 | 0.059621 | 0.036462 | 0.199846 | 0.004496 | -0.052505 | 0.026337 | ... | 0.007825 | 0.057869 | 0.002166 | 0.012593 | 0.003272 | 0.001470 | -0.004231 | -0.007361 | -0.004250 | -0.004250 |
Grade_9 | 0.235897 | 0.203087 | 0.318419 | 0.049548 | 0.343198 | 0.018640 | 0.219956 | 0.019602 | -0.087643 | 0.042223 | ... | -0.101614 | 0.157856 | 0.064624 | 0.000115 | -0.002024 | -0.003315 | -0.006182 | 0.007672 | -0.002524 | -0.002524 |
Grade_10 | 0.340799 | 0.259672 | 0.369058 | 0.074335 | 0.376160 | 0.062053 | 0.148728 | 0.002240 | -0.073181 | 0.052306 | ... | -0.091455 | 0.113100 | 0.070517 | 0.014387 | 0.014890 | 0.002691 | -0.003921 | -0.002773 | -0.001601 | -0.001601 |
Grade_11 | 0.357412 | 0.255943 | 0.345771 | 0.077035 | 0.341966 | 0.077705 | 0.098705 | -0.003183 | -0.058145 | 0.039394 | ... | -0.086537 | 0.082598 | 0.070141 | 0.024601 | 0.027062 | -0.003364 | 0.018347 | -0.001616 | -0.000933 | -0.000933 |
Grade_12 | 0.290834 | 0.166158 | 0.238806 | 0.061535 | 0.223412 | 0.077555 | 0.047053 | -0.002884 | -0.039395 | 0.017403 | ... | -0.046044 | 0.031374 | 0.047553 | 0.012036 | 0.014437 | 0.027719 | -0.001078 | -0.000762 | -0.000440 | -0.000440 |
Grade_13 | 0.211803 | 0.100505 | 0.144329 | 0.007758 | 0.126575 | 0.062683 | 0.004686 | 0.022997 | 0.003766 | 0.013143 | ... | -0.018605 | -0.004616 | 0.036295 | 0.048017 | -0.001030 | -0.000602 | -0.000409 | -0.000289 | -0.000167 | -0.000167 |
Bedrooms_1 | -0.058411 | -0.119700 | -0.125461 | 0.002743 | -0.112286 | -0.050263 | -0.093017 | 0.020456 | 0.045904 | 0.004951 | ... | -0.088000 | -0.065890 | -0.027266 | -0.010883 | -0.004046 | -0.002365 | -0.001606 | -0.001136 | -0.000656 | -0.000656 |
Bedrooms_2 | -0.144571 | -0.313781 | -0.350008 | -0.027807 | -0.314731 | -0.137460 | -0.203107 | -0.002876 | 0.200333 | 0.078832 | ... | -0.349277 | -0.261520 | -0.108222 | -0.043196 | -0.016058 | -0.009387 | -0.006376 | -0.004508 | -0.002603 | -0.002603 |
Bedrooms_3 | -0.183648 | -0.166878 | -0.272404 | -0.015255 | -0.224832 | -0.144622 | 0.025095 | -0.019092 | -0.024937 | -0.056487 | ... | 1.000000 | -0.623946 | -0.258200 | -0.103058 | -0.038311 | -0.022395 | -0.015212 | -0.010756 | -0.006210 | -0.006210 |
Bedrooms_4 | 0.177489 | 0.232540 | 0.353317 | 0.026631 | 0.338346 | 0.100140 | 0.130070 | 0.004612 | -0.104634 | -0.019408 | ... | -0.623946 | 1.000000 | -0.193327 | -0.077165 | -0.028685 | -0.016768 | -0.011390 | -0.008053 | -0.004649 | -0.004649 |
Bedrooms_5 | 0.189925 | 0.265175 | 0.298006 | 0.011350 | 0.211493 | 0.222709 | 0.030812 | 0.011731 | -0.040980 | 0.028817 | ... | -0.258200 | -0.193327 | 1.000000 | -0.031932 | -0.011870 | -0.006939 | -0.004713 | -0.003333 | -0.001924 | -0.001924 |
Bedrooms_6 | 0.087776 | 0.146150 | 0.148055 | 0.009485 | 0.096252 | 0.127151 | -0.030627 | 0.023658 | 0.003215 | 0.016596 | ... | -0.103058 | -0.077165 | -0.031932 | 1.000000 | -0.004738 | -0.002770 | -0.001881 | -0.001330 | -0.000768 | -0.000768 |
Bedrooms_7 | 0.046995 | 0.091757 | 0.086562 | 0.005809 | 0.069528 | 0.049542 | -0.011663 | 0.013192 | 0.006178 | 0.014833 | ... | -0.038311 | -0.028685 | -0.011870 | -0.004738 | 1.000000 | -0.001030 | -0.000699 | -0.000494 | -0.000285 | -0.000285 |
Bedrooms_8 | 0.037755 | 0.050548 | 0.045947 | -0.003634 | 0.025845 | 0.046992 | -0.012276 | -0.005155 | 0.001791 | 0.012386 | ... | -0.022395 | -0.016768 | -0.006939 | -0.002770 | -0.001030 | 1.000000 | -0.000409 | -0.000289 | -0.000167 | -0.000167 |
Bedrooms_9 | 0.016064 | 0.050171 | 0.030756 | -0.003873 | 0.023074 | 0.020652 | -0.022318 | 0.010396 | 0.010193 | 0.011150 | ... | -0.015212 | -0.011390 | -0.004713 | -0.001881 | -0.000699 | -0.000409 | 1.000000 | -0.000196 | -0.000113 | -0.000113 |
Bedrooms_10 | 0.008962 | 0.019878 | 0.020870 | -0.001777 | 0.009509 | 0.025518 | -0.004548 | -0.002476 | -0.008722 | 0.003967 | ... | -0.010756 | -0.008053 | -0.003333 | -0.001330 | -0.000494 | -0.000289 | -0.000196 | 1.000000 | -0.000080 | -0.000080 |
Bedrooms_11 | -0.000372 | 0.008475 | 0.006815 | -0.001666 | 0.005024 | 0.004742 | -0.012275 | 0.032424 | 0.003567 | -0.000199 | ... | -0.006210 | -0.004649 | -0.001924 | -0.000768 | -0.000285 | -0.000167 | -0.000113 | -0.000080 | 1.000000 | -0.000046 |
Bedrooms_33 | 0.001851 | -0.000529 | -0.003406 | -0.001496 | -0.006148 | 0.004434 | -0.005559 | -0.001429 | 0.003186 | 0.006271 | ... | -0.006210 | -0.004649 | -0.001924 | -0.000768 | -0.000285 | -0.000167 | -0.000113 | -0.000080 | -0.000046 | 1.000000 |
51 rows × 51 columns
plt.figure(figsize = (12, 6))
plt.subplot(121)
plt.title('Price Distribuition')
sns.distplot(df['price'])
plt.subplot(122)
g1 = plt.scatter(range(df.shape[0]), np.sort(df.price.values))
g1= plt.title("Price Curve Distribuition", fontsize=15)
g1 = plt.xlabel("")
g1 = plt.ylabel("Amount(US)", fontsize=12)
plt.subplots_adjust(wspace = 0.3, hspace = 0.5,
top = 0.9)
plt.show()
print("Price Min")
print(df['price'].min())
print("Price Mean")
print(df['price'].mean())
print("Price Median")
print(df['price'].median())
print("Price Max")
print(df['price'].max())
print("Price Std")
print(df['price'].std())
Price Min 75000.0 Price Mean 540088.1417665294 Price Median 450000.0 Price Max 7700000.0 Price Std 367127.1964826997
plt.figure(figsize = (8, 5))
sns.jointplot(df.sqft_living, df.price,
alpha = 0.5)
plt.xlabel('Sqft Living')
plt.ylabel('Sale Price')
plt.show()
<Figure size 576x360 with 0 Axes>
condition = df['condition'].value_counts()
print("Condition counting: ")
print(condition)
fig, ax = plt.subplots(ncols=2, figsize=(14,5))
sns.countplot(x='condition', data=df, ax=ax[0])
sns.boxplot(x='condition', y= 'price',
data=df, ax=ax[1])
plt.show()
Condition counting: 3 14031 4 5679 5 1701 2 172 1 30 Name: condition, dtype: int64
plt.figure(figsize = (12,8))
g = sns.FacetGrid(data=df, hue='condition',height= 5, aspect=2)
g.map(plt.scatter, "sqft_living", "price")
plt.show()
<Figure size 864x576 with 0 Axes>
df["bathrooms"] = df['bathrooms'].round(0).astype(int)
print("Freuency bathroom description:")
print(df["bathrooms"].value_counts())
plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)
ax1 = plt.subplot(221)
ax1 = sns.countplot(x="bathrooms", data=df,
ax=ax1)
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90)
ax1.set_title("Bathrooms counting", fontsize=15)
ax1.set_xlabel("Bathrooms number")
ax1.set_xlabel("count")
ax2 = plt.subplot(222)
ax2 = sns.boxplot(x="bathrooms", y='price',
data=df, ax=ax2)
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90)
ax2.set_title("Bathrooms distribution price", fontsize=15)
ax2.set_xlabel("Bathrooms number")
ax2.set_ylabel("log Price(US)")
ax0 = plt.subplot(212)
ax0 = sns.stripplot(x="bathrooms", y="price",
data=df, alpha=0.5,
jitter=True, hue="condition")
ax0.set_title("Better view distribuition through price", fontsize=15)
ax0.set_xlabel("Bathroom number")
ax0.set_ylabel("log Price(US)")
ax0.set_xticklabels(ax0.get_xticklabels(),rotation=90)
plt.show()
Freuency bathroom description: 2 13851 1 3933 3 2527 4 1201 5 57 6 24 0 14 8 4 7 2 Name: bathrooms, dtype: int64
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
plt.figure(figsize = (12,6))
ax1 = plt.subplot2grid((2,2), (0,0), colspan = 2)
for val in range(1,6,1):
indeX = df.condition == val
ax1.scatter(df.sqft_living.loc[indeX], df.price.loc[indeX], label = val, alpha=0.5)
ax1.legend(bbox_to_anchor = [1.1, 1])
ax1.set_xlabel('sqfit living area')
ax1.set_ylabel('Price house')
ax1.set_title('Sqft Living - Price w.r.t Conditions')
ax2 = plt.subplot2grid((2,2), (1,0))
sns.boxplot(x = 'condition', y = 'price', data = df, ax = ax2)
ax2.set_title('Box Plot Condition & Price', fontsize = 12)
ax3 = plt.subplot2grid((2,2), (1,1))
cubicQual = df.groupby(['condition'])['price'].mean().round(0)
testTrain = df.loc[:, ['condition', 'price']].copy()
testTrain['sqCond'] = np.power(testTrain['condition'],2)
mdl = linear_model.LinearRegression()
mdl.fit(testTrain[['condition', 'sqCond']], testTrain['price'])
y_pred = mdl.predict(testTrain[['condition', 'sqCond']])
print("Mean squared error: %.2f" % mean_squared_error(y_pred, testTrain.price))
# Plot outputs
ax3.scatter(testTrain['condition'], testTrain['price'], color='black')
ax3.plot(testTrain['condition'], y_pred, color='blue', linewidth=3)
ax3.set_title('LinReg, price ~ condtion + sqft_cond', fontsize = 12)
ax3.set_xlabel('Condition Rate')
plt.subplots_adjust(hspace = 0.5, top = 0.9)
plt.suptitle('Condition Effect to Sale Price', fontsize = 14)
plt.show()
Mean squared error: 134582326689.42
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
plt.figure(figsize = (12,6))
ax1 = plt.subplot2grid((2,2), (0,0), colspan = 2)
for val in range(0,5,1):
indeX = df.view == val
ax1.scatter(df.sqft_living.loc[indeX], df.price.loc[indeX], label = val, alpha=0.4)
ax1.legend(bbox_to_anchor = [1.1, 1])
ax1.set_xlabel('sqfit living area')
ax1.set_ylabel('Price house')
ax1.set_title('Sqft Living - Price w.r.t View')
ax2 = plt.subplot2grid((2,2), (1,0))
sns.boxplot(x = 'view', y = 'price', data = df, ax = ax2)
ax2.set_title('Box Plot View & Price', fontsize = 12)
ax3 = plt.subplot2grid((2,2), (1,1))
cubicV = df.groupby(['view'])['price'].mean().round(0)
testTrain = df.loc[:, ['view', 'price']].copy()
testTrain['sqview'] = np.power(testTrain['view'],2)
mdl = linear_model.LinearRegression()
mdl.fit(testTrain[['view', 'sqview']], testTrain['price'])
y_pred = mdl.predict(testTrain[['view', 'sqview']])
print("Mean squared error: %.2f" % mean_squared_error(y_pred, testTrain.price))
# Plot outputs
ax3.scatter(testTrain['view'], testTrain['price'], color='black')
ax3.plot(testTrain['view'], y_pred, color='blue', linewidth=3)
ax3.set_title('LinReg, price ~ condtion + sqft_cond', fontsize = 12)
ax3.set_xlabel('View rate')
plt.subplots_adjust(hspace = 0.5, top = 0.9)
plt.suptitle('"VIEW" Effect To SalePrice', fontsize = 14)
plt.show()
Mean squared error: 112971203793.79
bedrooms = df.bedrooms.value_counts()
plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)
ax1 = plt.subplot(221)
ax1 = sns.countplot(x="bedrooms", data=df,
ax=ax1)
ax1.set_title("bedrooms counting", fontsize=15)
ax1.set_xlabel("Bathrooms number")
ax1.set_ylabel("count")
ax2 = plt.subplot(222)
ax2 = sns.regplot(x="bedrooms", y='price',
data=df, ax=ax2, x_jitter=True)
ax2.set_title("Bedrooms distribution price", fontsize=15)
ax2.set_xlabel("Bedrooms number")
ax2.set_ylabel("log Price(US)")
ax0 = plt.subplot(212)
ax0 = sns.boxenplot(x="bedrooms", y="price",
data=df)
ax0.set_title("Better understaning price", fontsize=15)
ax0.set_xlabel("Bedrooms")
ax0.set_ylabel("log Price(US)")
plt.show()
print("Floors counting description")
print(df['floors'].value_counts())
plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)
ax1 = plt.subplot(221)
ax1 = sns.boxenplot(x="floors", y='price',
data=df, ax=ax1, )
ax1.set_title("Floors counting", fontsize=15)
ax1.set_xlabel("Floors number")
ax1.set_ylabel("Count")
ax2 = plt.subplot(222)
ax2 = sns.countplot(x="floors",
data=df, ax=ax2)
ax2.set_title("Floor distribution by price", fontsize=15)
ax2.set_xlabel("Floor number")
ax2.set_ylabel("log Price(US)")
ax0 = plt.subplot(212)
ax0 = sns.regplot(x="floors", y="price", #I need to change floors by sqft_living and hue bye floors
data=df, x_jitter=True)
ax0.set_title("Better understaning price by floor", fontsize=15)
ax0.set_xlabel("Floor")
ax0.set_ylabel("log Price(US)")
plt.show()
Floors counting description 1.0 10680 2.0 8241 1.5 1910 3.0 613 2.5 161 3.5 8 Name: floors, dtype: int64
plt.figure(figsize = (12,8))
g=sns.lmplot(x="sqft_living", y="price", aspect=1.8,
data=df, hue="floors", fit_reg=False)
g.set_titles("Floors by sqft_living and price", fontsize=15)
g.set_xlabels("Sqft Living")
g.set_ylabels("Price(US)")
plt.show()
<Figure size 864x576 with 0 Axes>
print("Grade counting description")
print(df['grade'].value_counts())
plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)
ax1 = plt.subplot(221)
ax1 = sns.boxenplot(x="grade", y='price',
data=df, ax=ax1, )
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90)
ax1.set_title("grade counting", fontsize=15)
ax1.set_xlabel("Grade number")
ax1.set_ylabel("Count")
ax2 = plt.subplot(222)
ax2 = sns.countplot(x="grade",
data=df, ax=ax2)
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90)
ax2.set_title("Grade distribution price", fontsize=15)
ax2.set_xlabel("Grade number")
ax2.set_ylabel("log Price(US)")
ax0 = plt.subplot(212)
ax0 = sns.regplot(x="grade", y="price",
data=df, x_jitter=True)
ax0.set_title("Better understaning price by grade", fontsize=15)
ax0.set_xlabel("Grade")
ax0.set_ylabel("log Price(US)")
ax0.set_xticklabels(ax0.get_xticklabels(),rotation=90)
plt.show()
Grade counting description 7 8981 8 6068 9 2615 6 2038 10 1134 11 399 5 242 12 90 4 29 13 13 3 3 1 1 Name: grade, dtype: int64
#Clearly view of bathrooms and bedrooms correlation
bath = ['bathrooms', 'bedrooms']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[bath[0]], df[bath[1]]).style.background_gradient(cmap = cm)
bedrooms | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 33 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
bathrooms | |||||||||||||
0 | 7 | 4 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2 | 167 | 1587 | 1800 | 327 | 43 | 6 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | 4 | 28 | 1129 | 7186 | 4709 | 695 | 90 | 7 | 1 | 0 | 1 | 0 | 1 |
3 | 0 | 0 | 41 | 656 | 1219 | 506 | 88 | 7 | 6 | 2 | 1 | 1 | 0 |
4 | 0 | 0 | 1 | 182 | 601 | 321 | 72 | 17 | 4 | 3 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 19 | 22 | 12 | 2 | 1 | 0 | 1 | 0 | 0 |
6 | 0 | 0 | 0 | 0 | 6 | 13 | 2 | 2 | 1 | 0 | 0 | 0 | 0 |
7 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
8 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 1 | 0 | 0 | 0 |
bath_cond = ['bathrooms', 'condition']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[bath_cond[0]], df[bath_cond[1]]).style.background_gradient(cmap = cm)
condition | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|
bathrooms | |||||
0 | 1 | 1 | 9 | 3 | 0 |
1 | 20 | 91 | 2211 | 1300 | 311 |
2 | 9 | 75 | 9000 | 3702 | 1065 |
3 | 0 | 5 | 1713 | 553 | 256 |
4 | 0 | 0 | 1026 | 109 | 66 |
5 | 0 | 0 | 46 | 8 | 3 |
6 | 0 | 0 | 21 | 3 | 0 |
7 | 0 | 0 | 2 | 0 | 0 |
8 | 0 | 0 | 3 | 1 | 0 |
bed_cond = ['bedrooms', 'condition']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[bed_cond[0]], df[bed_cond[1]]).style.background_gradient(cmap = cm)
condition | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|
bedrooms | |||||
0 | 1 | 1 | 10 | 1 | 0 |
1 | 4 | 11 | 124 | 48 | 12 |
2 | 12 | 51 | 1779 | 718 | 200 |
3 | 8 | 69 | 6308 | 2711 | 728 |
4 | 4 | 36 | 4580 | 1682 | 580 |
5 | 0 | 1 | 1031 | 418 | 151 |
6 | 1 | 3 | 158 | 87 | 23 |
7 | 0 | 0 | 25 | 9 | 4 |
8 | 0 | 0 | 8 | 3 | 2 |
9 | 0 | 0 | 6 | 0 | 0 |
10 | 0 | 0 | 1 | 2 | 0 |
11 | 0 | 0 | 1 | 0 | 0 |
33 | 0 | 0 | 0 | 0 | 1 |
cond_water = ['condition', 'waterfront']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[cond_water[0]], df[cond_water[1]]).style.background_gradient(cmap = cm)
waterfront | 0 | 1 |
---|---|---|
condition | ||
1 | 29 | 1 |
2 | 171 | 1 |
3 | 13940 | 91 |
4 | 5629 | 50 |
5 | 1681 | 20 |
grade_cond = ['grade', 'condition']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[grade_cond[0]], df[grade_cond[1]]).style.background_gradient(cmap = cm)
condition | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|
grade | |||||
1 | 1 | 0 | 0 | 0 | 0 |
3 | 0 | 1 | 1 | 0 | 1 |
4 | 1 | 5 | 13 | 10 | 0 |
5 | 9 | 15 | 100 | 84 | 34 |
6 | 11 | 59 | 1035 | 685 | 248 |
7 | 6 | 75 | 5234 | 2833 | 833 |
8 | 2 | 13 | 4269 | 1394 | 390 |
9 | 0 | 2 | 2041 | 446 | 126 |
10 | 0 | 2 | 921 | 156 | 55 |
11 | 0 | 0 | 332 | 56 | 11 |
12 | 0 | 0 | 74 | 13 | 3 |
13 | 0 | 0 | 11 | 2 | 0 |
grade_bed = ['grade', 'bedrooms']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[grade_bed[0]], df[grade_bed[1]]).style.background_gradient(cmap = cm)
bedrooms | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 33 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
grade | |||||||||||||
1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 2 | 12 | 14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 0 | 37 | 114 | 62 | 21 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 0 | 78 | 824 | 854 | 233 | 41 | 7 | 1 | 0 | 0 | 0 | 0 | 0 |
7 | 6 | 52 | 1205 | 4917 | 2177 | 501 | 98 | 11 | 6 | 4 | 2 | 1 | 1 |
8 | 3 | 14 | 499 | 2796 | 2194 | 455 | 90 | 12 | 4 | 1 | 0 | 0 | 0 |
9 | 0 | 2 | 78 | 832 | 1351 | 313 | 33 | 4 | 1 | 0 | 1 | 0 | 0 |
10 | 0 | 1 | 21 | 296 | 615 | 173 | 22 | 5 | 1 | 0 | 0 | 0 | 0 |
11 | 0 | 0 | 3 | 56 | 239 | 83 | 13 | 4 | 0 | 1 | 0 | 0 | 0 |
12 | 1 | 0 | 2 | 9 | 49 | 24 | 3 | 1 | 1 | 0 | 0 | 0 | 0 |
13 | 0 | 0 | 0 | 1 | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
grade_bath = ['grade', 'bathrooms']
cm = sns.light_palette("green", as_cmap=True)
pd.crosstab(df[grade_bath[0]], df[grade_bath[1]]).style.background_gradient(cmap = cm)
bathrooms | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
---|---|---|---|---|---|---|---|---|---|
grade | |||||||||
1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 28 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 1 | 204 | 36 | 1 | 0 | 0 | 0 | 0 | 0 |
6 | 2 | 1441 | 574 | 18 | 3 | 0 | 0 | 0 | 0 |
7 | 4 | 2104 | 6216 | 575 | 75 | 6 | 0 | 0 | 1 |
8 | 3 | 145 | 4788 | 882 | 244 | 4 | 2 | 0 | 0 |
9 | 0 | 9 | 1671 | 629 | 303 | 3 | 0 | 0 | 0 |
10 | 0 | 0 | 480 | 309 | 333 | 11 | 1 | 0 | 0 |
11 | 0 | 1 | 77 | 95 | 195 | 23 | 7 | 1 | 0 |
12 | 1 | 0 | 7 | 17 | 45 | 8 | 10 | 1 | 1 |
13 | 0 | 0 | 1 | 1 | 3 | 2 | 4 | 0 | 2 |
corr = df[['bathrooms', 'bedrooms', 'sqft_living', 'sqft_lot', 'floors', 'grade', 'price']]
plt.figure(figsize=(10,8))
plt.title('Correlation of variables')
sns.heatmap(corr.astype(float).corr(),vmax=1.0, annot=True)
plt.show()
sns.distplot(df['yr_built'])
<matplotlib.axes._subplots.AxesSubplot at 0x1dbc5efdb48>
g = sns.catplot(x="yr_built", y = "price", data=df[df['price'] < 1000000],
height= 7, aspect = 2, kind="box" )
g.set_xticklabels(rotation=90)
plt.show()
df = df.merge(pd.get_dummies(df.floors, drop_first=True, prefix='Floors'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.waterfront, drop_first=True, prefix='watFront'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.view, drop_first=True, prefix='View'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.condition, drop_first=True, prefix='Cond'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.grade, prefix='Grade'), left_index=True, right_index=True)
df = df.merge(pd.get_dummies(df.bedrooms, drop_first=True, prefix='Bedrooms'), left_index=True, right_index=True)
del df['floors'],
del df['waterfront']
del df['view']
del df['condition']
del df['grade']
del df['bedrooms']
plt.figure(figsize=(15,12))
plt.title('Correlation of variables', fontsize=20)
sns.heatmap(df.corr().astype(float).corr(),vmax=1.0)
plt.show()
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV # to split the data
from sklearn.metrics import explained_variance_score, median_absolute_error, r2_score, mean_squared_error, accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split # Model evaluation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler # Preprocessing
from sklearn.linear_model import Lasso, Ridge, ElasticNet, RANSACRegressor, SGDRegressor, HuberRegressor, BayesianRidge # Linear models
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor # Ensemble methods
from xgboost import XGBRegressor, plot_importance # XGBoost
from sklearn.svm import SVR, SVC, LinearSVC # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline # Streaming pipelines
from sklearn.decomposition import KernelPCA, PCA # Dimensionality reduction
from sklearn.feature_selection import SelectFromModel # Dimensionality reduction
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV # Model evaluation
from sklearn.base import clone # Clone estimator
from sklearn.metrics import mean_squared_error as MSE
df.drop(['id', 'date'], axis=1, inplace=True)
X = df.drop("price",axis=1).values
y = df["price"].values
# Spliting X and y into train and test version
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=3)
thresh = 5 * 10**(-3)
model = XGBRegressor(objective ='reg:squarederror')
model.fit(X_train, y_train)
#select features using threshold
selection = SelectFromModel(model, threshold=thresh, prefit=True)
select_X_train = selection.transform(X_train)
# eval model
select_X_val = selection.transform(X_test)
# test
select_X_test = selection.transform(X_test)
pipelines = []
seed = 2
pipelines.append(
("Scaled_Ridge",
Pipeline([
("Scaler", StandardScaler()),
("Ridge", Ridge(random_state=seed, tol=10 ))
]))
)
pipelines.append(
("Scaled_Lasso",
Pipeline([
("Scaler", StandardScaler()),
("Lasso", Lasso(random_state=seed, tol=1))
]))
)
pipelines.append(
("Scaled_Elastic",
Pipeline([
("Scaler", StandardScaler()),
("Lasso", ElasticNet(random_state=seed))
]))
)
pipelines.append(
("Scaled_SVR",
Pipeline([
("Scaler", StandardScaler()),
("SVR", SVR(kernel='linear', C=1e2, degree=5))
])
)
)
pipelines.append(
("Scaled_RF_reg",
Pipeline([
("Scaler", StandardScaler()),
("RF", RandomForestRegressor(random_state=seed))
])
)
)
pipelines.append(
("Scaled_ET_reg",
Pipeline([
("Scaler", StandardScaler()),
("ET", ExtraTreesRegressor(random_state=seed))
])
)
)
pipelines.append(
("Scaled_BR_reg",
Pipeline([
("Scaler", StandardScaler()),
("BR", BaggingRegressor(random_state=seed))
])))
pipelines.append(
("Scaled_Hub-Reg",
Pipeline([
("Scaler", StandardScaler()),
("Hub-Reg", HuberRegressor())
])))
pipelines.append(
("Scaled_BayRidge",
Pipeline([
("Scaler", StandardScaler()),
("BR", BayesianRidge())
])))
pipelines.append(
("Scaled_XGB_reg",
Pipeline([
("Scaler", StandardScaler()),
("XGBR", XGBRegressor(seed=seed))
])))
pipelines.append(
("Scaled_DT_reg",
Pipeline([
("Scaler", StandardScaler()),
("DT_reg", DecisionTreeRegressor())
])))
pipelines.append(
("Scaled_KNN_reg",
Pipeline([
("Scaler", StandardScaler()),
("KNN_reg", KNeighborsRegressor())
])))
pipelines.append(
("Scaled_Gboost-Reg",
Pipeline([
("Scaler", StandardScaler()),
("GBoost-Reg", GradientBoostingRegressor())
])))
pipelines.append(
("Scaled_RFR_PCA",
Pipeline([
("Scaler", StandardScaler()),
("PCA", PCA(n_components=3)),
("XGB", RandomForestRegressor())
])))
pipelines.append(
("Scaled_XGBR_PCA",
Pipeline([
("Scaler", StandardScaler()),
("PCA", PCA(n_components=3)),
("XGB", XGBRegressor())
])))
#'neg_mean_absolute_error', 'neg_mean_squared_error','r2'
scoring = 'r2'
n_folds = 7
results, names = [], []
for name, model in pipelines:
kfold = KFold(n_splits=n_folds, random_state=seed)
cv_results = cross_val_score(model, X_train, y_train, cv= kfold,
scoring=scoring, n_jobs=-1)
names.append(name)
results.append(cv_results)
msg = "%s: %f (+/- %f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
# boxplot algorithm comparison
fig = plt.figure(figsize=(15,6))
fig.suptitle('Algorithm Comparison', fontsize=22)
ax = fig.add_subplot(111)
sns.boxplot(x=names, y=results)
ax.set_xticklabels(names)
ax.set_xlabel("Algorithmn Name", fontsize=20)
ax.set_ylabel("R Squared Score of Models", fontsize=18)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()
Scaled_Ridge: 0.727431 (+/- 0.009924) Scaled_Lasso: 0.683566 (+/- 0.013035) Scaled_Elastic: 0.700632 (+/- 0.009749) Scaled_SVR: 0.671598 (+/- 0.009346) Scaled_RF_reg: 0.868115 (+/- 0.010678) Scaled_ET_reg: 0.873234 (+/- 0.011494) Scaled_BR_reg: 0.847559 (+/- 0.013386) Scaled_Hub-Reg: 0.701841 (+/- 0.009565) Scaled_BayRidge: 0.727563 (+/- 0.009940) Scaled_XGB_reg: 0.856188 (+/- 0.006032) Scaled_DT_reg: 0.734013 (+/- 0.031810) Scaled_KNN_reg: 0.716625 (+/- 0.020929) Scaled_Gboost-Reg: 0.857853 (+/- 0.008397) Scaled_RFR_PCA: 0.645886 (+/- 0.019619) Scaled_XGBR_PCA: 0.657901 (+/- 0.017554)
Very cool results!
We can see that we got good models with a good r2 score.
All RandomForestRegression, ExtraTreesRgressor, BaggingRegressor and XGBRegressor have r2 higher than 0.80
I will set hyper parameters to the best models and try increase this score
xgb = XGBRegressor(objective ='reg:squarederror',n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
colsample_bytree=1, max_depth=7)
xgb.fit(X_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, importance_type='gain', learning_rate=0.08, max_delta_step=0, max_depth=7, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='reg:squarederror', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=None, subsample=0.75, verbosity=1)
y_hat = xgb.predict(X_test)
xgb.score(X_test,y_test)
0.8938452606979226
Excellent result of XGB Regressor with some arbitrary params.
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers. [Parallel(n_jobs=-1)]: Done 17 tasks | elapsed: 5.0s [Parallel(n_jobs=-1)]: Done 138 tasks | elapsed: 1.3min [Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 3.2min finished
RandomizedSearchCV(cv=3, error_score=nan, estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=Fals... iid='deprecated', n_iter=100, n_jobs=-1, param_distributions={'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200]}, pre_dispatch='2*n_jobs', random_state=42, refit=True, return_train_score=False, scoring=None, verbose=2)
#Knowing the best parameters
rf_random.best_params_
{'n_estimators': 157, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}
# Predicting with best params
y_hat_Search = rf_random.predict(X_test)
Random Forest Score: 0.880822431682647
print("XGBoost Regressor R2-score: {}".format(round(r2_score(y_hat, y_test),4)))
print("RandomForest Regressor Prediction R2-score: {}".format(round(r2_score(y_hat_Search, y_test),4)))
print("\nMSE of XGBoost Regressor: {}".format(median_absolute_error(y_hat, y_test)))
print("MSE of RandomForest Regressor: {} ".format(median_absolute_error(y_hat_Search, y_test)))
XGBoost Regressor R2-score: 0.8724 RandomForest Regressor Prediction R2-score: 0.8545 MSE of XGBoost Regressor: 40211.765625 MSE of RandomForest Regressor: 39935.995261072996
print("XG Boost Score: ", xgb.score(X_test,y_test))
print("Random Forest Score: ", rf_random.score(X_test, y_test))
XG Boost Score: 0.8938452606979226 Random Forest Score: 0.880822431682647