import pandas as pd
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data and assemble features plus the
# regression target into one DataFrame (target as the last column).
housing = fetch_california_housing(return_X_y=False)
df = pd.DataFrame(housing.data, columns=housing.feature_names).assign(
    target=housing.target
)
# Name of the target column, used by the train/test split below.
target = 'target'
df
MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | target | |
---|---|---|---|---|---|---|---|---|---|
0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
20635 | 1.5603 | 25.0 | 5.045455 | 1.133333 | 845.0 | 2.560606 | 39.48 | -121.09 | 0.781 |
20636 | 2.5568 | 18.0 | 6.114035 | 1.315789 | 356.0 | 3.122807 | 39.49 | -121.21 | 0.771 |
20637 | 1.7000 | 17.0 | 5.205543 | 1.120092 | 1007.0 | 2.325635 | 39.43 | -121.22 | 0.923 |
20638 | 1.8672 | 18.0 | 5.329513 | 1.171920 | 741.0 | 2.123209 | 39.43 | -121.32 | 0.847 |
20639 | 2.3886 | 16.0 | 5.254717 | 1.162264 | 1387.0 | 2.616981 | 39.37 | -121.24 | 0.894 |
20640 rows × 9 columns
import sys
import warnings
warnings.simplefilter(action='ignore')
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[df.columns[:-1]], df[target], test_size=0.25)
!pip install sweetviz
import sweetviz as sv
my_report = sv.analyze(df)
# we generate html report
# Default arguments will generate to "SWEETVIZ_REPORT.html"
my_report.show_html()
| | [ 0%] 00:00 -> (? left)
Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
# we generate inline report
my_report.show_notebook()
!pip install ydata_profiling
#!pip install matplotlib==3.1.3
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
#To generate a HTML report file, save the ProfileReport to an object and use the to_file() function:
profile.to_file("ydata_profiling.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
#The HTML report can be directly embedded in a cell in a similar fashion:
# Render the same profiling report inline as an iframe in the notebook.
profile.to_notebook_iframe()
!pip install mljar-supervised[full]
from supervised.automl import AutoML
#automl = AutoML(mode="Perform",results_path="AutoML_regression")
#automl = AutoML(mode="Explain"results_path="AutoML_regression")
automl = AutoML(results_path="AutoML_regression")
automl.fit(X_train, y_train)
Linear algorithm was disabled. AutoML directory: AutoML_regression The task is regression with evaluation metric rmse AutoML will use algorithms: ['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network'] AutoML will ensemble available models AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble'] Skip simple_algorithms because no parameters were generated. * Step default_algorithms will try to check up to 5 models 1_Default_LightGBM: trained. 2_Default_Xgboost: trained. 3_Default_CatBoost: trained. 4_Default_NeuralNetwork: trained. 5_Default_RandomForest: trained. * Step not_so_random will try to check up to 20 models 10_LightGBM: trained. 6_Xgboost: trained. 14_CatBoost: trained. 18_RandomForest: trained. 22_NeuralNetwork: trained. 11_LightGBM rmse 0.465607 trained in 24.93 seconds (1-sample predict time 0.07 seconds) 7_Xgboost rmse 0.456562 trained in 37.01 seconds (1-sample predict time 0.053 seconds) 15_CatBoost rmse 0.465409 trained in 58.63 seconds (1-sample predict time 0.0516 seconds) 19_RandomForest rmse 0.771758 trained in 25.53 seconds (1-sample predict time 0.1619 seconds) 23_NeuralNetwork rmse 0.531705 trained in 23.49 seconds (1-sample predict time 0.099 seconds) 12_LightGBM rmse 0.442015 trained in 73.22 seconds (1-sample predict time 0.079 seconds) 8_Xgboost rmse 0.453561 trained in 22.02 seconds (1-sample predict time 0.0542 seconds) 16_CatBoost rmse 0.439662 trained in 46.66 seconds (1-sample predict time 0.0556 seconds) 20_RandomForest rmse 0.629602 trained in 34.85 seconds (1-sample predict time 0.1278 seconds) 24_NeuralNetwork rmse 0.563919 trained in 22.99 seconds (1-sample predict time 0.0675 seconds) 13_LightGBM rmse 0.450272 trained in 64.29 seconds (1-sample predict time 0.05 seconds) 9_Xgboost rmse 0.459185 trained in 14.1 seconds (1-sample predict time 0.0496 seconds) 17_CatBoost rmse 
0.471491 trained in 30.68 seconds (1-sample predict time 0.0546 seconds) 21_RandomForest rmse 0.628874 trained in 25.65 seconds (1-sample predict time 0.1312 seconds) 25_NeuralNetwork rmse 0.541484 trained in 21.33 seconds (1-sample predict time 0.0687 seconds) * Step golden_features will try to check up to 3 models None 10 Add Golden Feature: AveOccup_ratio_MedInc Add Golden Feature: MedInc_ratio_AveOccup Add Golden Feature: MedInc_diff_AveOccup Add Golden Feature: MedInc_ratio_AveRooms Add Golden Feature: AveRooms_ratio_MedInc Add Golden Feature: Latitude_ratio_MedInc Add Golden Feature: MedInc_ratio_Latitude Add Golden Feature: MedInc_ratio_Longitude Add Golden Feature: Longitude_ratio_MedInc Add Golden Feature: Longitude_multiply_MedInc Created 10 Golden Features in 0.32 seconds. 14_CatBoost_GoldenFeatures rmse 0.439827 trained in 220.37 seconds (1-sample predict time 0.0833 seconds) 3_Default_CatBoost_GoldenFeatures rmse 0.441131 trained in 57.85 seconds (1-sample predict time 0.137 seconds) 16_CatBoost_GoldenFeatures rmse 0.441146 trained in 79.33 seconds (1-sample predict time 0.1532 seconds) * Step insert_random_feature will try to check up to 1 model 14_CatBoost_RandomFeature rmse 0.442379 trained in 105.28 seconds (1-sample predict time 0.0582 seconds) Drop features ['random_feature'] Skip features_selection because no parameters were generated. 
* Step hill_climbing_1 will try to check up to 17 models 26_CatBoost rmse 0.436406 trained in 194.52 seconds (1-sample predict time 0.0639 seconds) 27_CatBoost rmse 0.439561 trained in 61.5 seconds (1-sample predict time 0.109 seconds) 28_CatBoost rmse 0.437899 trained in 66.88 seconds (1-sample predict time 0.0555 seconds) 29_CatBoost rmse 0.445864 trained in 22.38 seconds (1-sample predict time 0.0553 seconds) 30_LightGBM rmse 0.44701 trained in 36.21 seconds (1-sample predict time 0.0762 seconds) 31_LightGBM rmse 0.447115 trained in 36.98 seconds (1-sample predict time 0.0517 seconds) 32_Xgboost rmse 0.453331 trained in 29.39 seconds (1-sample predict time 0.0775 seconds) 33_Xgboost rmse 0.448872 trained in 45.71 seconds (1-sample predict time 0.0493 seconds) 34_Xgboost rmse 0.454615 trained in 19.81 seconds (1-sample predict time 0.0496 seconds) 35_Xgboost rmse 0.454204 trained in 20.82 seconds (1-sample predict time 0.0512 seconds) 36_NeuralNetwork rmse 0.534436 trained in 21.89 seconds (1-sample predict time 0.1113 seconds) 37_NeuralNetwork rmse 0.546293 trained in 17.34 seconds (1-sample predict time 0.0683 seconds) 38_NeuralNetwork rmse 0.545955 trained in 26.4 seconds (1-sample predict time 0.0702 seconds) 39_RandomForest rmse 0.669056 trained in 31.72 seconds (1-sample predict time 0.1537 seconds) 40_RandomForest rmse 0.597718 trained in 35.29 seconds (1-sample predict time 0.1417 seconds) 41_RandomForest rmse 0.667502 trained in 34.09 seconds (1-sample predict time 0.1215 seconds) 42_RandomForest rmse 0.598416 trained in 39.05 seconds (1-sample predict time 0.111 seconds) * Step hill_climbing_2 will try to check up to 19 models 43_CatBoost rmse 0.436834 trained in 129.04 seconds (1-sample predict time 0.0778 seconds) 44_CatBoost rmse 0.440663 trained in 257.42 seconds (1-sample predict time 0.0568 seconds) 45_CatBoost rmse 0.438331 trained in 70.61 seconds (1-sample predict time 0.0589 seconds) 46_CatBoost rmse 0.439263 trained in 171.26 seconds 
(1-sample predict time 0.0869 seconds) 47_LightGBM rmse 0.442709 trained in 86.55 seconds (1-sample predict time 0.0507 seconds) 48_LightGBM rmse 0.443138 trained in 75.05 seconds (1-sample predict time 0.0472 seconds) 49_LightGBM rmse 0.445275 trained in 86.15 seconds (1-sample predict time 0.0475 seconds) * Step ensemble will try to check up to 1 model Ensemble rmse 0.431557 trained in 1.02 seconds (1-sample predict time 0.3651 seconds) AutoML fit time: 3202.14 seconds AutoML best model: Ensemble
AutoML(results_path='AutoML_regression')
# Predict on the held-out test set using the best model AutoML found
# (the Ensemble, per the training log above).
y_predicted = automl.predict(X_test)
y_predicted
array([1.79598549, 1.01746422, 2.09676391, ..., 2.32656244, 1.35986105, 3.72703805])
# Load the leaderboard AutoML wrote into its results directory and
# display it (note: Colab-style absolute path).
leaderboard = pd.read_csv('/content/AutoML_regression/leaderboard.csv')
leaderboard
name | model_type | metric_type | metric_value | train_time | single_prediction_time | |
---|---|---|---|---|---|---|
0 | 1_Default_LightGBM | LightGBM | rmse | 0.445463 | 105.59 | 0.0479 |
1 | 2_Default_Xgboost | Xgboost | rmse | 0.449881 | 37.06 | 0.0612 |
2 | 3_Default_CatBoost | CatBoost | rmse | 0.438880 | 41.10 | 0.0553 |
3 | 4_Default_NeuralNetwork | Neural Network | rmse | 0.546717 | 17.67 | 0.0671 |
4 | 5_Default_RandomForest | Random Forest | rmse | 0.709524 | 16.21 | 0.0841 |
5 | 10_LightGBM | LightGBM | rmse | 0.447488 | 96.86 | 0.0706 |
6 | 6_Xgboost | Xgboost | rmse | 0.455735 | 32.28 | 0.0536 |
7 | 14_CatBoost | CatBoost | rmse | 0.436748 | 98.23 | 0.0995 |
8 | 18_RandomForest | Random Forest | rmse | 0.729217 | 34.63 | 0.1799 |
9 | 22_NeuralNetwork | Neural Network | rmse | 0.547975 | 16.70 | 0.0648 |
10 | 11_LightGBM | LightGBM | rmse | 0.465607 | 26.58 | 0.0700 |
11 | 7_Xgboost | Xgboost | rmse | 0.456562 | 38.05 | 0.0530 |
12 | 15_CatBoost | CatBoost | rmse | 0.465409 | 59.71 | 0.0516 |
13 | 19_RandomForest | Random Forest | rmse | 0.771758 | 26.59 | 0.1619 |
14 | 23_NeuralNetwork | Neural Network | rmse | 0.531705 | 24.89 | 0.0990 |
15 | 12_LightGBM | LightGBM | rmse | 0.442015 | 74.82 | 0.0790 |
16 | 8_Xgboost | Xgboost | rmse | 0.453561 | 23.06 | 0.0542 |
17 | 16_CatBoost | CatBoost | rmse | 0.439662 | 48.16 | 0.0556 |
18 | 20_RandomForest | Random Forest | rmse | 0.629602 | 35.92 | 0.1278 |
19 | 24_NeuralNetwork | Neural Network | rmse | 0.563919 | 24.08 | 0.0675 |
20 | 13_LightGBM | LightGBM | rmse | 0.450272 | 65.29 | 0.0500 |
21 | 9_Xgboost | Xgboost | rmse | 0.459185 | 15.10 | 0.0496 |
22 | 17_CatBoost | CatBoost | rmse | 0.471491 | 31.70 | 0.0546 |
23 | 21_RandomForest | Random Forest | rmse | 0.628874 | 26.67 | 0.1312 |
24 | 25_NeuralNetwork | Neural Network | rmse | 0.541484 | 22.27 | 0.0687 |
25 | 14_CatBoost_GoldenFeatures | CatBoost | rmse | 0.439827 | 222.13 | 0.0833 |
26 | 3_Default_CatBoost_GoldenFeatures | CatBoost | rmse | 0.441131 | 58.98 | 0.1370 |
27 | 16_CatBoost_GoldenFeatures | CatBoost | rmse | 0.441146 | 81.11 | 0.1532 |
28 | 14_CatBoost_RandomFeature | CatBoost | rmse | 0.442379 | 106.39 | 0.0582 |
29 | 26_CatBoost | CatBoost | rmse | 0.436406 | 195.65 | 0.0639 |
30 | 27_CatBoost | CatBoost | rmse | 0.439561 | 62.91 | 0.1090 |
31 | 28_CatBoost | CatBoost | rmse | 0.437899 | 68.00 | 0.0555 |
32 | 29_CatBoost | CatBoost | rmse | 0.445864 | 23.42 | 0.0553 |
33 | 30_LightGBM | LightGBM | rmse | 0.447010 | 37.51 | 0.0762 |
34 | 31_LightGBM | LightGBM | rmse | 0.447115 | 37.99 | 0.0517 |
35 | 32_Xgboost | Xgboost | rmse | 0.453331 | 30.94 | 0.0775 |
36 | 33_Xgboost | Xgboost | rmse | 0.448872 | 46.72 | 0.0493 |
37 | 34_Xgboost | Xgboost | rmse | 0.454615 | 21.03 | 0.0496 |
38 | 35_Xgboost | Xgboost | rmse | 0.454204 | 21.84 | 0.0512 |
39 | 36_NeuralNetwork | Neural Network | rmse | 0.534436 | 23.11 | 0.1113 |
40 | 37_NeuralNetwork | Neural Network | rmse | 0.546293 | 18.25 | 0.0683 |
41 | 38_NeuralNetwork | Neural Network | rmse | 0.545955 | 27.32 | 0.0702 |
42 | 39_RandomForest | Random Forest | rmse | 0.669056 | 32.75 | 0.1537 |
43 | 40_RandomForest | Random Forest | rmse | 0.597718 | 36.32 | 0.1417 |
44 | 41_RandomForest | Random Forest | rmse | 0.667502 | 35.26 | 0.1215 |
45 | 42_RandomForest | Random Forest | rmse | 0.598416 | 40.07 | 0.1110 |
46 | 43_CatBoost | CatBoost | rmse | 0.436834 | 130.75 | 0.0778 |
47 | 44_CatBoost | CatBoost | rmse | 0.440663 | 258.50 | 0.0568 |
48 | 45_CatBoost | CatBoost | rmse | 0.438331 | 71.69 | 0.0589 |
49 | 46_CatBoost | CatBoost | rmse | 0.439263 | 172.52 | 0.0869 |
50 | 47_LightGBM | LightGBM | rmse | 0.442709 | 87.62 | 0.0507 |
51 | 48_LightGBM | LightGBM | rmse | 0.443138 | 76.06 | 0.0472 |
52 | 49_LightGBM | LightGBM | rmse | 0.445275 | 87.22 | 0.0475 |
53 | Ensemble | Ensemble | rmse | 0.431557 | 1.02 | 0.3651 |
import imageio
def show_image(file, figsize=(8, 8)):
    """Read an image from *file* and display it with matplotlib.

    Parameters
    ----------
    file : str
        Path to the image file to load.
    figsize : tuple of (float, float), optional
        Figure size in inches. Defaults to ``(8, 8)``, the value that
        was previously hard-coded.
    """
    im = imageio.imread(file)
    plt.figure(figsize=figsize)
    plt.imshow(im)
    plt.show()
# Render the diagnostic plots that mljar-supervised saved into the
# results directory, in the same order as before (Colab paths).
for plot_path in (
    '/content/AutoML_regression/features_heatmap.png',
    '/content/AutoML_regression/ldb_performance_boxplot.png',
    '/content/AutoML_regression/correlation_heatmap.png',
    '/content/AutoML_regression/Ensemble/true_vs_predicted.png',
    '/content/AutoML_regression/30_LightGBM/permutation_importance.png',
):
    show_image(plot_path)
!pip install tpot
from tpot import TPOTRegressor
tpot = TPOTRegressor(generations=2, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
Optimization Progress: 0%| | 0/150 [00:00<?, ?pipeline/s]
Generation 1 - Current best internal CV score: -0.2273782518846749 Generation 2 - Current best internal CV score: -0.2273782518846749 Best pipeline: LassoLarsCV(VarianceThreshold(XGBRegressor(input_matrix, learning_rate=0.1, max_depth=7, min_child_weight=18, n_estimators=100, n_jobs=1, objective=reg:squarederror, subsample=0.5, verbosity=0), threshold=0.01), normalize=False)
TPOTRegressor(generations=2, population_size=50, random_state=42, verbosity=2)
# Score on the held-out set (the printed value below is negative — the
# scorer is a "neg_*" regression metric, so values closer to 0 are
# better) and export the winning pipeline as a standalone script.
print(tpot.score(X_test, y_test))
tpot.export('TPOTRegressor_pipeline.py')
-0.23997506322785375