import json
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import (
make_scorer,
mean_absolute_error,
mean_squared_error,
r2_score,
)
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVR
from sklearn.utils import shuffle
from tqdm import tqdm as progress_bar
from utils import (
CONTINUOUS_VARIABLES,
DISCRETE_VARIABLES,
NOMINAL_VARIABLES,
ORDINAL_VARIABLES,
TARGET_VARIABLES,
bias_score,
encode_ordinals,
load_clean_data,
max_deviation,
)
random_state = np.random.RandomState(42)
pd.set_option("display.max_columns", 250)
The DataFrame df1 holds the cleaned data from notebook 1, with all the nominal and ordinal features automatically translated into factor variables and ordered integer values.
df1 = load_clean_data("data/data_clean.csv")
This cell basically replaces all the manual work that went into generating new features and identifying "interesting" ones in notebooks 2 and 3.
df1 = pd.concat([
df1[CONTINUOUS_VARIABLES + DISCRETE_VARIABLES + ORDINAL_VARIABLES + TARGET_VARIABLES],
pd.get_dummies(df1[NOMINAL_VARIABLES], dtype=int),
], axis=1)
# Re-order the columns for convenience.
df1 = df1[sorted(set(df1.columns) - set(TARGET_VARIABLES)) + TARGET_VARIABLES]
df1 = encode_ordinals(df1)
df1 = shuffle(df1, random_state=random_state)
df1.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2898 entries, (144, 535153070) to (867, 907253130)
Columns: 248 entries, 1st Flr SF to SalePrice
dtypes: float64(19), int64(229)
memory usage: 5.7 MB
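The encode_ordinals helper comes from the project's utils module and is not shown in this notebook. A minimal sketch of what such a helper might do (an assumption for illustration only; the actual implementation may differ) is to replace every ordered categorical column with its integer codes:
# Hypothetical sketch only -- the real utils.encode_ordinals may differ.
def encode_ordinals_sketch(df):
    """Replace ordered categorical columns with their integer codes."""
    df = df.copy()
    for column in df.select_dtypes("category").columns:
        if df[column].cat.ordered:
            df[column] = df[column].cat.codes
    return df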
df1.head()
Order | PID | 1st Flr SF | 2nd Flr SF | 3Ssn Porch | ... | Year Built | Year Remod/Add | Yr Sold | SalePrice
---|---|---|---|---|---|---|---|---|---
144 | 535153070 | 1194.0 | 0.0 | 0.0 | ... | 1959 | 1959 | 2010 | 148000.0
1574 | 916380060 | 1537.0 | 0.0 | 0.0 | ... | 2006 | 2007 | 2008 | 294000.0
490 | 528290190 | 774.0 | 656.0 | 0.0 | ... | 1999 | 2000 | 2009 | 156000.0
1730 | 528218050 | 783.0 | 701.0 | 0.0 | ... | 2005 | 2007 | 2007 | 178900.0
2276 | 921128030 | 1824.0 | 0.0 | 0.0 | ... | 2006 | 2007 | 2007 | 392000.0

(5 rows × 248 columns; preview truncated for readability.)
Obtain the raw numpy arrays:
X1 = df1.drop(columns="SalePrice").values
y1 = df1["SalePrice"].values
The DataFrame df2 holds the data as manually processed in notebooks 2 and 3. New features were generated, such as the years_since_* columns, or derived from other variables, such as has 2nd Flr (from the continuous 2nd Flr SF). Further, factor variables were created taking the patterns in the visualizations into account: for example, Bldg Type's five categories (from df1) were condensed into just three. In summary, df2 has less than half as many dimensions as df1, which guards against a potential curse of dimensionality.
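The transformations themselves are implemented in notebooks 2 and 3 and only loaded here; the snippet below merely illustrates the kind of derivations described above (column names taken from df1; the actual notebook code may differ):
# Illustrative sketch only -- the real feature engineering happens in notebooks 2 and 3.
example = df1.copy()
# Flag variable derived from a continuous column.
example["has 2nd Flr"] = (example["2nd Flr SF"] > 0).astype(int)
# Elapsed-time features derived from the year columns.
example["years_since_built"] = example["Yr Sold"] - example["Year Built"]
example["years_since_remodeled"] = example["Yr Sold"] - example["Year Remod/Add"]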
df2 = load_clean_data("data/data_clean_with_transformations_and_factors.csv")
df2 = encode_ordinals(df2)
df2 = shuffle(df2, random_state=random_state)
df2.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2893 entries, (976, 923226270) to (2650, 902128100)
Columns: 109 entries, 1st Flr SF to SalePrice (box-cox-0)
dtypes: float64(27), int64(82)
memory usage: 2.6 MB
df2.head()
Order | PID | 1st Flr SF | 1st Flr SF (box-cox-0) | 2nd Flr SF | ... | years_since_built | years_since_remodeled | SalePrice | SalePrice (box-cox-0)
---|---|---|---|---|---|---|---|---|---
976 | 923226270 | 630.0 | 6.445720 | 0.0 | ... | 39 | 7 | 86000.0 | 11.362103
1112 | 528427070 | 894.0 | 6.795706 | 1039.0 | ... | 1 | 1 | 214000.0 | 12.273731
819 | 906340100 | 1680.0 | 7.426549 | 0.0 | ... | 0 | 0 | 323262.0 | 12.686218
1651 | 527327050 | 1118.0 | 7.019297 | 912.0 | ... | 31 | 31 | 195000.0 | 12.180755
1140 | 531382090 | 754.0 | 6.625392 | 855.0 | ... | 13 | 13 | 182000.0 | 12.111762

(5 rows × 109 columns; preview truncated for readability.)
Obtain the raw numpy arrays:
X2 = df2.drop(columns=["SalePrice", "SalePrice (box-cox-0)"]).values
y2 = df2["SalePrice"].values
y2l = df2["SalePrice (box-cox-0)"].values
Also, notebook 2 collects variables that correlate either weakly ($0.33 < \vert\rho\vert < 0.66$) or strongly ($\vert\rho\vert > 0.66$) with the SalePrice (or the logarithm thereof). These variables serve as a "naive" feature pre-selection.
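The two lists are computed in notebook 2 and only read from disk below; as an illustration (not the notebook-2 code), such a split by correlation strength could be obtained along these lines:
# Illustration only: split the features by the strength of their correlation with SalePrice.
correlations = (
    df2.corr(numeric_only=True)["SalePrice"]
    .drop(["SalePrice", "SalePrice (box-cox-0)"])
    .abs()
)
weak_example = sorted(correlations[(correlations > 0.33) & (correlations < 0.66)].index)
strong_example = sorted(correlations[correlations > 0.66].index)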
with open("data/correlated_variables.json", "r") as file:
    correlated = json.load(file)
weakly_correlated = correlated["weakly_correlated"]
strongly_correlated = correlated["strongly_correlated"]
pre_selection = sorted(set(weakly_correlated + strongly_correlated) & set(df2.columns))
The df3 DataFrame is just a subset of df2, restricted to the pre-selected columns plus the target variables (32 columns in total).
df3 = df2[pre_selection + TARGET_VARIABLES]
df3.info(verbose=False)
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2893 entries, (976, 923226270) to (2650, 902128100)
Columns: 32 entries, 1st Flr SF to SalePrice (box-cox-0)
dtypes: float64(16), int64(16)
memory usage: 908.9 KB
df3.head()
Order | PID | 1st Flr SF | 1st Flr SF (box-cox-0) | Bsmt Exposure | ... | Total SF (box-cox-0.2) | Wood Deck SF | SalePrice | SalePrice (box-cox-0)
---|---|---|---|---|---|---|---|---|---
976 | 923226270 | 630.0 | 6.445720 | 3 | ... | 15.847026 | 0.0 | 86000.0 | 11.362103
1112 | 528427070 | 894.0 | 6.795706 | 2 | ... | 19.503897 | 100.0 | 214000.0 | 12.273731
819 | 906340100 | 1680.0 | 7.426549 | 4 | ... | 20.173573 | 185.0 | 323262.0 | 12.686218
1651 | 527327050 | 1118.0 | 7.019297 | 1 | ... | 19.591678 | 0.0 | 195000.0 | 12.180755
1140 | 531382090 | 754.0 | 6.625392 | 1 | ... | 18.640832 | 0.0 | 182000.0 | 12.111762

(5 rows × 32 columns; preview truncated for readability.)
Obtain the raw numpy arrays:
X3 = df3.drop(columns=["SalePrice", "SalePrice (box-cox-0)"]).values
y3 = df3["SalePrice"].values
y3l = df3["SalePrice (box-cox-0)"].values
Define a function that runs a k-fold cross-validation to obtain unbiased estimates of the following scores / errors:
def cross_validation(X, y, *, model, k=10, log=False, desc=None):
    """Perform a k-fold cross validation."""
    bias, mae, max_dev, r2, rmse = [], [], [], [], []
    # Iterate over the k folds (the data were already shuffled above).
    for train, test in progress_bar(KFold(n_splits=k).split(X), desc=desc, total=k):
        model.fit(X[train], y[train])
        y_pred = model.predict(X[test])
        # If the sales price is provided on a log scale, take the
        # exponential first so that the scores and errors are
        # comparable to their non-logged counterparts.
        if log:
            y_true, y_pred = np.exp(y[test]), np.exp(y_pred)
        else:
            y_true = y[test]
        # Collect the scores/errors for each fold.
        bias.append(bias_score(y_true, y_pred))
        mae.append(mean_absolute_error(y_true, y_pred))
        max_dev.append(max_deviation(y_true, y_pred))
        r2.append(r2_score(y_true, y_pred))
        # Collect the per-fold MSE; the square root is taken after averaging below.
        rmse.append(mean_squared_error(y_true, y_pred))
    # Average over the folds and round for convenience.
    return {
        "bias": np.round(np.mean(bias)),
        "mae": np.round(np.mean(mae)),
        "max_dev": np.round(np.mean(max_dev)),
        "r2": np.round(np.mean(r2), 3),
        "rmse": np.round(np.sqrt(np.mean(rmse))),
    }
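bias_score and max_deviation are small helpers from utils; a plausible minimal reading (an assumption, the actual definitions may differ) is the mean signed error and the largest absolute error per fold:
# Assumed definitions for illustration -- the real utils implementations may differ.
def bias_score_sketch(y_true, y_pred):
    """Mean signed prediction error (positive values indicate over-prediction)."""
    return np.mean(np.asarray(y_pred) - np.asarray(y_true))

def max_deviation_sketch(y_true, y_pred):
    """Largest absolute prediction error."""
    return np.max(np.abs(np.asarray(y_pred) - np.asarray(y_true)))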
Use one dictionary to store all the results in a systematic way. Its keys are (model, data) tuples: "o" refers to the only-cleaned data (df1), "i" to the manually improved data (df2), and "p" to the pre-selected subset (df3); a trailing "l" indicates that the model was fit on the log-transformed sales price.
results = {}
A plain OLS regression model serves as the base case for benchmarking.
lm = LinearRegression()
Given the unprocessed data, the linear model is not able to produce a good fit at all, presumably because the redundant dummy columns (no reference category was dropped) make the design matrix rank deficient.
results[("lm", "o")] = cross_validation(X1, y1, model=lm)
results[("lm", "o")]
100%|██████████| 10/10 [00:01<00:00, 9.86it/s]
{'bias': np.float64(25204734.0), 'mae': np.float64(32984598.0), 'max_dev': np.float64(9091844797.0), 'r2': np.float64(-374439996.215), 'rmse': np.float64(1541948537.0)}
results[("lm", "i")] = cross_validation(X2, y2, model=lm)
results[("lm", "i")]
100%|██████████| 10/10 [00:00<00:00, 22.99it/s]
{'bias': np.float64(-40.0), 'mae': np.float64(15377.0), 'max_dev': np.float64(121895.0), 'r2': np.float64(0.92), 'rmse': np.float64(22178.0)}
results[("lm", "il")] = cross_validation(X2, y2l, model=lm, log=True)
results[("lm", "il")]
100%|██████████| 10/10 [00:00<00:00, 19.21it/s]
{'bias': np.float64(-888.0), 'mae': np.float64(12851.0), 'max_dev': np.float64(108012.0), 'r2': np.float64(0.94), 'rmse': np.float64(19210.0)}
results[("lm", "p")] = cross_validation(X3, y3, model=lm)
results[("lm", "p")]
100%|██████████| 10/10 [00:00<00:00, 278.33it/s]
{'bias': np.float64(25.0), 'mae': np.float64(18570.0), 'max_dev': np.float64(136253.0), 'r2': np.float64(0.89), 'rmse': np.float64(25994.0)}
results[("lm", "pl")] = cross_validation(X3, y3l, model=lm, log=True)
results[("lm", "pl")]
100%|██████████| 10/10 [00:00<00:00, 271.17it/s]
{'bias': np.float64(-1430.0), 'mae': np.float64(16155.0), 'max_dev': np.float64(127999.0), 'r2': np.float64(0.911), 'rmse': np.float64(23391.0)}
Next, a Lasso model is tuned on each of the five data set variants, with the regularization strength alpha chosen by a grid search.
tol = 0.1  # a rather loose tolerance keeps the coordinate descent from running overly long
grid_search = GridSearchCV(
estimator=Lasso(tol=tol, selection="random", random_state=random_state),
param_grid={"alpha": [2 ** x for x in range(-8, 4)] + list(range(12, 65, 4))},
cv=KFold(n_splits=4),
n_jobs=-1,
)
grid_search.fit(X1, y1)
alpha = grid_search.best_params_["alpha"]
alpha
24
results[("lasso", "o")] = cross_validation(X1, y1, model=Lasso(alpha=alpha, tol=tol))
results[("lasso", "o")]
100%|██████████| 10/10 [00:00<00:00, 38.22it/s]
{'bias': np.float64(185.0), 'mae': np.float64(20586.0), 'max_dev': np.float64(268155.0), 'r2': np.float64(0.822), 'rmse': np.float64(33116.0)}
grid_search.fit(X2, y2)
alpha = grid_search.best_params_["alpha"]
alpha
28
results[("lasso", "i")] = cross_validation(X2, y2, model=Lasso(alpha=alpha, tol=tol))
results[("lasso", "i")]
100%|██████████| 10/10 [00:00<00:00, 54.59it/s]
{'bias': np.float64(-68.0), 'mae': np.float64(17523.0), 'max_dev': np.float64(129093.0), 'r2': np.float64(0.9), 'rmse': np.float64(24731.0)}
grid_search.fit(X2, y2l)
alpha = grid_search.best_params_["alpha"]
alpha
0.00390625
results[("lasso", "il")] = cross_validation(X2, y2l, model=Lasso(alpha=alpha, tol=tol), log=True)
results[("lasso", "il")]
100%|██████████| 10/10 [00:00<00:00, 46.34it/s]
{'bias': np.float64(-843.0), 'mae': np.float64(14414.0), 'max_dev': np.float64(118524.0), 'r2': np.float64(0.927), 'rmse': np.float64(21134.0)}
grid_search.fit(X3, y3)
alpha = grid_search.best_params_["alpha"]
alpha
0.00390625
results[("lasso", "p")] = cross_validation(X3, y3, model=Lasso(alpha=alpha, tol=tol))
results[("lasso", "p")]
100%|██████████| 10/10 [00:00<00:00, 328.75it/s]
{'bias': np.float64(30.0), 'mae': np.float64(22904.0), 'max_dev': np.float64(158375.0), 'r2': np.float64(0.84), 'rmse': np.float64(31248.0)}
grid_search.fit(X3, y3l)
alpha = grid_search.best_params_["alpha"]
alpha
0.00390625
results[("lasso", "pl")] = cross_validation(X3, y3l, model=Lasso(alpha=alpha, tol=tol), log=True)
results[("lasso", "pl")]
100%|██████████| 10/10 [00:00<00:00, 299.57it/s]
{'bias': np.float64(-875.0), 'mae': np.float64(16644.0), 'max_dev': np.float64(135627.0), 'r2': np.float64(0.904), 'rmse': np.float64(24239.0)}
The same grid-search procedure is repeated for a Ridge model.
grid_search = GridSearchCV(
estimator=Ridge(),
param_grid={"alpha": [2 ** x for x in range(-8, 4)] + list(range(12, 65, 4))},
cv=KFold(n_splits=4),
n_jobs=-1,
)
grid_search.fit(X1, y1)
alpha = grid_search.best_params_["alpha"]
alpha
0.125
results[("ridge", "o")] = cross_validation(X1, y1, model=Ridge(alpha=alpha))
results[("ridge", "o")]
100%|██████████| 10/10 [00:00<00:00, 41.08it/s]
{'bias': np.float64(152.0), 'mae': np.float64(17064.0), 'max_dev': np.float64(263561.0), 'r2': np.float64(0.853), 'rmse': np.float64(29970.0)}
grid_search.fit(X2, y2)
alpha = grid_search.best_params_["alpha"]
alpha
0.5
results[("ridge", "i")] = cross_validation(X2, y2, model=Ridge(alpha=alpha))
results[("ridge", "i")]
100%|██████████| 10/10 [00:00<00:00, 69.82it/s]
{'bias': np.float64(-52.0), 'mae': np.float64(15351.0), 'max_dev': np.float64(122508.0), 'r2': np.float64(0.92), 'rmse': np.float64(22106.0)}
grid_search.fit(X2, y2l)
alpha = grid_search.best_params_["alpha"]
alpha
0.5
results[("ridge", "il")] = cross_validation(X2, y2l, model=Ridge(alpha=alpha), log=True)
results[("ridge", "il")]
100%|██████████| 10/10 [00:00<00:00, 74.39it/s]
{'bias': np.float64(-916.0), 'mae': np.float64(12836.0), 'max_dev': np.float64(107968.0), 'r2': np.float64(0.94), 'rmse': np.float64(19152.0)}
grid_search.fit(X3, y3)
alpha = grid_search.best_params_["alpha"]
alpha
8
results[("ridge", "p")] = cross_validation(X3, y3, model=Ridge(alpha=alpha))
results[("ridge", "p")]
100%|██████████| 10/10 [00:00<00:00, 64.44it/s]
{'bias': np.float64(33.0), 'mae': np.float64(18534.0), 'max_dev': np.float64(136836.0), 'r2': np.float64(0.89), 'rmse': np.float64(25965.0)}
grid_search.fit(X3, y3l)
alpha = grid_search.best_params_["alpha"]
alpha
0.5
results[("ridge", "pl")] = cross_validation(X3, y3l, model=Ridge(alpha=alpha), log=True)
results[("ridge", "pl")]
100%|██████████| 10/10 [00:00<00:00, 60.47it/s]
{'bias': np.float64(-1389.0), 'mae': np.float64(16141.0), 'max_dev': np.float64(127870.0), 'r2': np.float64(0.911), 'rmse': np.float64(23366.0)}
Finally, a Random Forest regressor is evaluated on the same five variants; apart from the number of trees, no hyper-parameters are tuned.
rf = RandomForestRegressor(
    n_estimators=500,
    n_jobs=-1,
    random_state=random_state,
)
results[("rf", "o")] = cross_validation(X1, y1, model=rf)
results[("rf", "o")]
100%|██████████| 10/10 [00:19<00:00, 1.96s/it]
{'bias': np.float64(-27.0), 'mae': np.float64(15331.0), 'max_dev': np.float64(164293.0), 'r2': np.float64(0.898), 'rmse': np.float64(25371.0)}
results[("rf", "i")] = cross_validation(X2, y2, model=rf)
results[("rf", "i")]
100%|██████████| 10/10 [00:16<00:00, 1.69s/it]
{'bias': np.float64(-53.0), 'mae': np.float64(15018.0), 'max_dev': np.float64(124828.0), 'r2': np.float64(0.912), 'rmse': np.float64(23190.0)}
results[("rf", "il")] = cross_validation(X2, y2l, model=rf, log=True)
results[("rf", "il")]
100%|██████████| 10/10 [00:19<00:00, 1.97s/it]
{'bias': np.float64(-2089.0), 'mae': np.float64(15068.0), 'max_dev': np.float64(136284.0), 'r2': np.float64(0.911), 'rmse': np.float64(23306.0)}
results[("rf", "p")] = cross_validation(X3, y3, model=rf)
results[("rf", "p")]
100%|██████████| 10/10 [00:12<00:00, 1.29s/it]
{'bias': np.float64(-232.0), 'mae': np.float64(16274.0), 'max_dev': np.float64(130943.0), 'r2': np.float64(0.9), 'rmse': np.float64(24685.0)}
results[("rf", "pl")] = cross_validation(X3, y3l, model=rf, log=True)
results[("rf", "pl")]
100%|██████████| 10/10 [00:11<00:00, 1.13s/it]
{'bias': np.float64(-2390.0), 'mae': np.float64(16388.0), 'max_dev': np.float64(141335.0), 'r2': np.float64(0.898), 'rmse': np.float64(24924.0)}
This notebook did not focus on hyper-parameter optimization. Therefore, the predictions of the Lasso, Ridge, and Random Forest models could likely still be improved with a more fine-grained grid search.
In general, the manually "improved" data clearly outperform the data that were cleaned with only minimal effort. The results also suggest letting the model select its features itself: the manually pre-selected features perform well, but not as well as the full feature set.
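As an example of what a more fine-grained search could look like, the grid could be narrowed around the best alpha found so far (a sketch, not run in this notebook; the starting value is taken from the Ridge results on the improved data above):
# Sketch of a refined search (not part of the original analysis).
best_alpha = 0.5  # e.g., the Ridge alpha found for the improved data above
fine_grid_search = GridSearchCV(
    estimator=Ridge(),
    param_grid={"alpha": np.linspace(best_alpha / 4, best_alpha * 4, 25)},
    cv=KFold(n_splits=4),
    n_jobs=-1,
)
# fine_grid_search.fit(X2, y2); fine_grid_search.best_params_["alpha"]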
The helper below ranks the models by a chosen score for a given data source.
def scores_by_source(source, score="rmse", *, ascending=True):
    """Rank the models by a given score for one data source."""
    rv = [
        (model, scores[score])
        for (model, data_source), scores in results.items()
        if data_source == source
    ]
    return sorted(rv, key=lambda x: x[1], reverse=(not ascending))
scores_by_source("o", "rmse")
[('rf', np.float64(25371.0)), ('ridge', np.float64(29970.0)), ('lasso', np.float64(33116.0)), ('lm', np.float64(1541948537.0))]
scores_by_source("i", "rmse")
[('ridge', np.float64(22106.0)), ('lm', np.float64(22178.0)), ('rf', np.float64(23190.0)), ('lasso', np.float64(24731.0))]
scores_by_source("il", "rmse")
[('ridge', np.float64(19152.0)), ('lm', np.float64(19210.0)), ('lasso', np.float64(21134.0)), ('rf', np.float64(23306.0))]
scores_by_source("p", "rmse")
[('rf', np.float64(24685.0)), ('ridge', np.float64(25965.0)), ('lm', np.float64(25994.0)), ('lasso', np.float64(31248.0))]
scores_by_source("pl", "rmse")
[('ridge', np.float64(23366.0)), ('lm', np.float64(23391.0)), ('lasso', np.float64(24239.0)), ('rf', np.float64(24924.0))]
scores_by_source("o", "r2", ascending=False)
[('rf', np.float64(0.898)), ('ridge', np.float64(0.853)), ('lasso', np.float64(0.822)), ('lm', np.float64(-374439996.215))]
scores_by_source("i", "r2", ascending=False)
[('lm', np.float64(0.92)), ('ridge', np.float64(0.92)), ('rf', np.float64(0.912)), ('lasso', np.float64(0.9))]
scores_by_source("il", "r2", ascending=False)
[('lm', np.float64(0.94)), ('ridge', np.float64(0.94)), ('lasso', np.float64(0.927)), ('rf', np.float64(0.911))]
scores_by_source("p", "r2", ascending=False)
[('rf', np.float64(0.9)), ('lm', np.float64(0.89)), ('ridge', np.float64(0.89)), ('lasso', np.float64(0.84))]
scores_by_source("pl", "r2", ascending=False)
[('lm', np.float64(0.911)), ('ridge', np.float64(0.911)), ('lasso', np.float64(0.904)), ('rf', np.float64(0.898))]
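For a compact overview, the results dictionary can also be flattened into a single DataFrame (a convenience sketch, not part of the original notebook):
# Convenience sketch: one row per (model, data source) pair, one column per metric.
summary = pd.DataFrame(results).T
summary.index.names = ["model", "data"]
summary.sort_values("rmse")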