import pandas as pd
# Load the Adult census income dataset (UCI) hosted in the mljar examples repo
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,  # strip the spaces that follow each comma delimiter
)
target = 'income'
df
 | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K |
32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K |
32561 rows × 15 columns
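# Quick sanity checks before profiling (plain pandas; nothing assumed beyond
# the columns shown above):
df.info()                  # column dtypes and non-null counts
df[target].value_counts()  # class balance of the income label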
import sys
import warnings
warnings.simplefilter(action='ignore')  # silence library warnings in the notebook output
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Hold out 25% of rows for testing; every column except the last ('income') is a feature
X_train, X_test, y_train, y_test = train_test_split(df[df.columns[:-1]], df[target], test_size=0.25)
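# The split above is unseeded and unstratified; a reproducible variant that
# preserves the class ratio (not the split used for the results below) would be:
# X_train, X_test, y_train, y_test = train_test_split(
#     df[df.columns[:-1]], df[target], test_size=0.25,
#     stratify=df[target], random_state=42)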
try:
    import sweetviz as sv
except ImportError:
    !pip install sweetviz --user
    print('Restart the runtime, then re-run this cell to import sweetviz')
    import sweetviz as sv
my_report = sv.analyze(df)
# Generate a standalone HTML report;
# with default arguments it is written to "SWEETVIZ_REPORT.html"
my_report.show_html()
Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
# Render the same report inline in the notebook
my_report.show_notebook()
# Sweetviz can also compare two subsets of the same frame, e.g. males vs. females:
# my_report_intra = sv.compare_intra(df, df["sex"] == "Male", ["Male", "Female"])
# my_report_intra.show_notebook()
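# Sweetviz can likewise contrast two separate frames side by side, e.g. train
# vs. test, to spot distribution drift. A minimal sketch using the documented
# compare() API (report names are arbitrary):
# compare_report = sv.compare([X_train, "Train"], [X_test, "Test"])
# compare_report.show_notebook()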
try:
    import ydata_profiling
except ImportError:
    !pip install ydata_profiling
    import ydata_profiling
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
# To generate an HTML report file, call to_file() on the ProfileReport object:
profile.to_file("ydata_profiling.html")
# The report can also be embedded directly in a notebook cell:
profile.to_notebook_iframe()
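# On large frames the full profile is slow; ydata_profiling's documented minimal
# mode disables the expensive correlation/interaction computations (a sketch;
# the output file name is arbitrary):
# profile_min = ProfileReport(df, title="Profiling Report (minimal)", minimal=True)
# profile_min.to_file("ydata_profiling_minimal.html")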
try:
    from supervised.automl import AutoML
except ImportError:
    !pip install mljar-supervised
    from supervised.automl import AutoML
# mode="Perform" targets production use (a balance of speed and accuracy);
# "Explain" is the quick, explanation-oriented default mode
automl = AutoML(mode="Perform", results_path="AutoML_classifier")
# automl = AutoML(mode="Explain", results_path="AutoML_classifier")
# automl = AutoML(results_path="AutoML_classifier")
automl.fit(X_train, y_train)
Linear algorithm was disabled.
AutoML directory: AutoML_classifier
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 5 models
1_Default_LightGBM logloss 0.28367 trained in 51.85 seconds (1-sample predict time 0.0637 seconds)
2_Default_Xgboost logloss 0.281645 trained in 29.19 seconds (1-sample predict time 0.0687 seconds)
3_Default_CatBoost logloss 0.281494 trained in 122.53 seconds (1-sample predict time 0.0868 seconds)
There was an error during 4_Default_NeuralNetwork training. Please check AutoML_classifier/errors.md for details.
5_Default_RandomForest logloss 0.343649 trained in 52.6 seconds (1-sample predict time 0.1509 seconds)
* Step not_so_random will try to check up to 20 models
9_LightGBM logloss 0.281723 trained in 71.05 seconds (1-sample predict time 0.0586 seconds)
5_Xgboost logloss 0.285297 trained in 28.94 seconds (1-sample predict time 0.0707 seconds)
13_CatBoost logloss 0.28344 trained in 188.03 seconds (1-sample predict time 0.1018 seconds)
17_RandomForest logloss 0.343196 trained in 36.17 seconds (1-sample predict time 0.1326 seconds)
There was an error during 21_NeuralNetwork training. Please check AutoML_classifier/errors.md for details.
10_LightGBM logloss 0.285759 trained in 24.86 seconds (1-sample predict time 0.0598 seconds)
6_Xgboost logloss 0.284123 trained in 26.45 seconds (1-sample predict time 0.102 seconds)
14_CatBoost logloss 0.284349 trained in 118.36 seconds (1-sample predict time 0.0925 seconds)
18_RandomForest logloss 0.312455 trained in 79.02 seconds (1-sample predict time 0.1917 seconds)
22_NeuralNetwork logloss 0.326867 trained in 58.7 seconds (1-sample predict time 0.1055 seconds)
11_LightGBM logloss 0.284513 trained in 45.92 seconds (1-sample predict time 0.0684 seconds)
7_Xgboost logloss 0.292702 trained in 33.36 seconds (1-sample predict time 0.0671 seconds)
15_CatBoost logloss 0.282554 trained in 110.85 seconds (1-sample predict time 0.0446 seconds)
19_RandomForest logloss 0.361995 trained in 28.72 seconds (1-sample predict time 0.0987 seconds)
There was an error during 23_NeuralNetwork training. Please check AutoML_classifier/errors.md for details.
12_LightGBM logloss 0.287056 trained in 48.24 seconds (1-sample predict time 0.0636 seconds)
8_Xgboost logloss 0.301941 trained in 26.4 seconds (1-sample predict time 0.0602 seconds)
16_CatBoost logloss 0.286049 trained in 72.05 seconds (1-sample predict time 0.0918 seconds)
20_RandomForest logloss 0.318659 trained in 50.74 seconds (1-sample predict time 0.1307 seconds)
There was an error during 24_NeuralNetwork training. Please check AutoML_classifier/errors.md for details.
* Step golden_features will try to check up to 3 models
Add Golden Feature: education-num_multiply_age
Add Golden Feature: education-num_diff_capital-gain
Add Golden Feature: capital-loss_sum_capital-gain
Add Golden Feature: education-num_diff_capital-loss
Add Golden Feature: capital-gain_sum_education-num
Add Golden Feature: capital-loss_sum_education-num
Add Golden Feature: education-num_sum_age
Add Golden Feature: age_diff_capital-loss
Add Golden Feature: capital-gain_sum_age
Add Golden Feature: hours-per-week_multiply_education-num
Created 10 Golden Features in 0.6 seconds.
3_Default_CatBoost_GoldenFeatures logloss 0.282451 trained in 114.05 seconds (1-sample predict time 0.1013 seconds)
2_Default_Xgboost_GoldenFeatures logloss 0.283663 trained in 41.33 seconds (1-sample predict time 0.0981 seconds)
9_LightGBM_GoldenFeatures logloss 0.284549 trained in 101.59 seconds (1-sample predict time 0.1415 seconds)
Not enough time to perform features selection. Skip
Time needed for features selection ~ 498.0 seconds
Please increase total_time_limit to at least (5039 seconds) to have features selection
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
* Step hill_climbing_1 will try to check up to 12 models
23_CatBoost logloss 0.281315 trained in 108.87 seconds (1-sample predict time 0.0451 seconds)
24_Xgboost logloss 0.281416 trained in 30.83 seconds (1-sample predict time 0.062 seconds)
25_Xgboost logloss 0.282471 trained in 32.22 seconds (1-sample predict time 0.1089 seconds)
26_LightGBM logloss 0.281746 trained in 44.57 seconds (1-sample predict time 0.0628 seconds)
27_CatBoost_GoldenFeatures logloss 0.28263 trained in 126.27 seconds (1-sample predict time 0.081 seconds)
28_Xgboost_GoldenFeatures logloss 0.283906 trained in 44.63 seconds (1-sample predict time 0.0986 seconds)
29_Xgboost_GoldenFeatures logloss 0.285152 trained in 42.0 seconds (1-sample predict time 0.091 seconds)
30_LightGBM logloss 0.284283 trained in 30.88 seconds (1-sample predict time 0.0609 seconds)
31_RandomForest logloss 0.312178 trained in 57.51 seconds (1-sample predict time 0.1453 seconds)
32_RandomForest logloss 0.312695 trained in 80.81 seconds (1-sample predict time 0.1691 seconds)
33_RandomForest logloss 0.31889 trained in 47.44 seconds (1-sample predict time 0.1295 seconds)
There was an error during 34_NeuralNetwork training. Please check AutoML_classifier/errors.md for details.
* Step hill_climbing_2 will try to check up to 11 models
34_CatBoost logloss 0.282154 trained in 165.78 seconds (1-sample predict time 0.0717 seconds)
35_CatBoost logloss 0.283375 trained in 73.95 seconds (1-sample predict time 0.0404 seconds)
36_Xgboost logloss 0.28087 trained in 33.78 seconds (1-sample predict time 0.0626 seconds)
37_CatBoost logloss 0.281991 trained in 220.79 seconds (1-sample predict time 0.0812 seconds)
38_CatBoost logloss 0.282666 trained in 71.34 seconds (1-sample predict time 0.0888 seconds)
39_Xgboost logloss 0.281315 trained in 33.81 seconds (1-sample predict time 0.0608 seconds)
40_LightGBM logloss 0.281506 trained in 79.63 seconds (1-sample predict time 0.06 seconds)
41_LightGBM logloss 0.281698 trained in 42.47 seconds (1-sample predict time 0.07 seconds)
42_RandomForest logloss 0.312014 trained in 79.83 seconds (1-sample predict time 0.2335 seconds)
43_RandomForest logloss 0.312348 trained in 66.9 seconds (1-sample predict time 0.1622 seconds)
44_NeuralNetwork logloss 0.333915 trained in 71.8 seconds (1-sample predict time 0.1035 seconds)
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.278895 trained in 9.6 seconds (1-sample predict time 0.4094 seconds)
AutoML fit time: 3426.89 seconds
AutoML best model: Ensemble
AutoML(mode='Perform', results_path='AutoML_classifier')
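# Other documented mljar-supervised knobs, for reference (a sketch, not the
# configuration used for the run above; the results_path here is hypothetical):
# automl = AutoML(
#     mode="Compete",                 # strongest models, longest training
#     total_time_limit=3600,          # overall training budget in seconds
#     eval_metric="logloss",          # optimization metric for binary classification
#     algorithms=["Xgboost", "LightGBM", "CatBoost"],
#     results_path="AutoML_compete",
# )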
y_predicted = automl.predict(X_test)
y_predicted
array(['<=50K', '<=50K', '<=50K', ..., '<=50K', '>50K', '>50K'], dtype=object)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predicted)
0.8799901731973959
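# Accuracy alone can flatter the imbalanced income label; per-class precision
# and recall give a fuller picture (standard scikit-learn metrics):
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_predicted))
print(classification_report(y_test, y_predicted))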
# print(automl.score(X_test, y_test))
pd.read_csv('/content/AutoML_classifier/leaderboard.csv')
 | name | model_type | metric_type | metric_value | train_time | single_prediction_time |
---|---|---|---|---|---|---|
0 | 1_Default_LightGBM | LightGBM | logloss | 0.283670 | 53.91 | 0.0637 |
1 | 2_Default_Xgboost | Xgboost | logloss | 0.281645 | 31.50 | 0.0687 |
2 | 3_Default_CatBoost | CatBoost | logloss | 0.281494 | 125.60 | 0.0868 |
3 | 5_Default_RandomForest | Random Forest | logloss | 0.343649 | 54.67 | 0.1509 |
4 | 9_LightGBM | LightGBM | logloss | 0.281723 | 73.34 | 0.0586 |
5 | 5_Xgboost | Xgboost | logloss | 0.285297 | 31.02 | 0.0707 |
6 | 13_CatBoost | CatBoost | logloss | 0.283440 | 190.17 | 0.1018 |
7 | 17_RandomForest | Random Forest | logloss | 0.343196 | 38.41 | 0.1326 |
8 | 10_LightGBM | LightGBM | logloss | 0.285759 | 26.99 | 0.0598 |
9 | 6_Xgboost | Xgboost | logloss | 0.284123 | 28.54 | 0.1020 |
10 | 14_CatBoost | CatBoost | logloss | 0.284349 | 120.51 | 0.0925 |
11 | 18_RandomForest | Random Forest | logloss | 0.312455 | 81.09 | 0.1917 |
12 | 22_NeuralNetwork | Neural Network | logloss | 0.326867 | 60.76 | 0.1055 |
13 | 11_LightGBM | LightGBM | logloss | 0.284513 | 48.03 | 0.0684 |
14 | 7_Xgboost | Xgboost | logloss | 0.292702 | 36.36 | 0.0671 |
15 | 15_CatBoost | CatBoost | logloss | 0.282554 | 112.92 | 0.0446 |
16 | 19_RandomForest | Random Forest | logloss | 0.361995 | 30.80 | 0.0987 |
17 | 12_LightGBM | LightGBM | logloss | 0.287056 | 50.32 | 0.0636 |
18 | 8_Xgboost | Xgboost | logloss | 0.301941 | 28.50 | 0.0602 |
19 | 16_CatBoost | CatBoost | logloss | 0.286049 | 74.20 | 0.0918 |
20 | 20_RandomForest | Random Forest | logloss | 0.318659 | 52.84 | 0.1307 |
21 | 3_Default_CatBoost_GoldenFeatures | CatBoost | logloss | 0.282451 | 116.28 | 0.1013 |
22 | 2_Default_Xgboost_GoldenFeatures | Xgboost | logloss | 0.283663 | 43.52 | 0.0981 |
23 | 9_LightGBM_GoldenFeatures | LightGBM | logloss | 0.284549 | 103.81 | 0.1415 |
24 | 23_CatBoost | CatBoost | logloss | 0.281315 | 111.00 | 0.0451 |
25 | 24_Xgboost | Xgboost | logloss | 0.281416 | 33.35 | 0.0620 |
26 | 25_Xgboost | Xgboost | logloss | 0.282471 | 34.47 | 0.1089 |
27 | 26_LightGBM | LightGBM | logloss | 0.281746 | 46.65 | 0.0628 |
28 | 27_CatBoost_GoldenFeatures | CatBoost | logloss | 0.282630 | 128.47 | 0.0810 |
29 | 28_Xgboost_GoldenFeatures | Xgboost | logloss | 0.283906 | 46.83 | 0.0986 |
30 | 29_Xgboost_GoldenFeatures | Xgboost | logloss | 0.285152 | 44.19 | 0.0910 |
31 | 30_LightGBM | LightGBM | logloss | 0.284283 | 33.63 | 0.0609 |
32 | 31_RandomForest | Random Forest | logloss | 0.312178 | 60.33 | 0.1453 |
33 | 32_RandomForest | Random Forest | logloss | 0.312695 | 82.92 | 0.1691 |
34 | 33_RandomForest | Random Forest | logloss | 0.318890 | 49.52 | 0.1295 |
35 | 34_CatBoost | CatBoost | logloss | 0.282154 | 167.92 | 0.0717 |
36 | 35_CatBoost | CatBoost | logloss | 0.283375 | 76.11 | 0.0404 |
37 | 36_Xgboost | Xgboost | logloss | 0.280870 | 35.89 | 0.0626 |
38 | 37_CatBoost | CatBoost | logloss | 0.281991 | 222.96 | 0.0812 |
39 | 38_CatBoost | CatBoost | logloss | 0.282666 | 73.90 | 0.0888 |
40 | 39_Xgboost | Xgboost | logloss | 0.281315 | 35.91 | 0.0608 |
41 | 40_LightGBM | LightGBM | logloss | 0.281506 | 82.08 | 0.0600 |
42 | 41_LightGBM | LightGBM | logloss | 0.281698 | 44.67 | 0.0700 |
43 | 42_RandomForest | Random Forest | logloss | 0.312014 | 81.93 | 0.2335 |
44 | 43_RandomForest | Random Forest | logloss | 0.312348 | 69.02 | 0.1622 |
45 | 44_NeuralNetwork | Neural Network | logloss | 0.333915 | 73.88 | 0.1035 |
46 | Ensemble | Ensemble | logloss | 0.278895 | 9.60 | 0.4094 |
import imageio

# Helper to display the PNG artifacts AutoML saved in the results directory
def show_image(file):
    im = imageio.imread(file)
    plt.figure(figsize=(8, 8))
    plt.imshow(im)
    plt.axis('off')  # the PNGs carry their own axes and labels
    plt.show()
show_image('/content/AutoML_classifier/features_heatmap.png')
show_image('/content/AutoML_classifier/ldb_performance_boxplot.png')
show_image('/content/AutoML_classifier/correlation_heatmap.png')
show_image('/content/AutoML_classifier/Ensemble/confusion_matrix.png')
show_image('/content/AutoML_classifier/Ensemble/precision_recall_curve.png')
show_image('/content/AutoML_classifier/Ensemble/roc_curve.png')
show_image('/content/AutoML_classifier/38_CatBoost/permutation_importance.png')
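# Every model is persisted under results_path, so a later session can score
# without retraining; per the mljar-supervised docs, pointing AutoML at an
# existing results_path reloads it (a sketch):
# automl_loaded = AutoML(results_path="AutoML_classifier")
# automl_loaded.predict(X_test)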
# automl1 = AutoML(algorithms=["CatBoost"],mode="Optuna", start_random_models=3)
# automl1.fit(X_train, y_train)
try:
    from lazypredict.Supervised import LazyClassifier
except ImportError:
    !pip install lazypredict
    from lazypredict.Supervised import LazyClassifier
# Fit every classifier lazypredict supports and collect a leaderboard DataFrame
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models
100%|██████████| 29/29 [02:22<00:00, 4.91s/it]
Model | Accuracy | Balanced Accuracy | ROC AUC | F1 Score | Time Taken
---|---|---|---|---|---
(per-model rows not preserved in this export)
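# `models` is a plain DataFrame indexed by model name, so the usual pandas
# idioms apply, e.g. the five strongest models by balanced accuracy:
# models.sort_values("Balanced Accuracy", ascending=False).head()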
# !pip install tpot
# TPOT needs purely numeric inputs, so categoricals are one-hot encoded first:
# X_train1 = pd.get_dummies(X_train).values
# X_test1 = pd.get_dummies(X_test).values
# from tpot import TPOTClassifier
# tpot = TPOTClassifier(generations=2, population_size=50, verbosity=2, random_state=42)
# tpot.fit(X_train1, y_train)
# tpot.export('TPOTClassifier_pipeline.py')
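# Caveat for the sketch above: encoding train and test separately can misalign
# columns when a category appears in only one split; aligning test to the train
# columns avoids that (standard pandas; an assumption about the intended usage):
# X_train1 = pd.get_dummies(X_train)
# X_test1 = pd.get_dummies(X_test).reindex(columns=X_train1.columns, fill_value=0)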
# !pip install -U scikit-learn==0.23.2
# !pip install --use-deprecated=legacy-resolver pycaret[full]
# from pycaret.classification import *
# s = setup(df, target=target)
# best = compare_models()
# print(best)
# plot_model(best)
# evaluate_model(best)