This notebook was created using PyCaret 2.0. Last updated: 28-07-2020
# check the installed PyCaret version (this notebook targets PyCaret 2.0)
from pycaret.utils import version
version()
pycaret-nightly-0.39
# load the index of all datasets bundled with pycaret.datasets
# (returns a DataFrame listing name, task, target, size per dataset)
from pycaret.datasets import get_data
index = get_data('index')
Dataset | Data Types | Default Task | Target Variable | # Instances | # Attributes | Missing Values | |
---|---|---|---|---|---|---|---|
0 | anomaly | Multivariate | Anomaly Detection | None | 1000 | 10 | N |
1 | france | Multivariate | Association Rule Mining | InvoiceNo, Description | 8557 | 8 | N |
2 | germany | Multivariate | Association Rule Mining | InvoiceNo, Description | 9495 | 8 | N |
3 | bank | Multivariate | Classification (Binary) | deposit | 45211 | 17 | N |
4 | blood | Multivariate | Classification (Binary) | Class | 748 | 5 | N |
5 | cancer | Multivariate | Classification (Binary) | Class | 683 | 10 | N |
6 | credit | Multivariate | Classification (Binary) | default | 24000 | 24 | N |
7 | diabetes | Multivariate | Classification (Binary) | Class variable | 768 | 9 | N |
8 | electrical_grid | Multivariate | Classification (Binary) | stabf | 10000 | 14 | N |
9 | employee | Multivariate | Classification (Binary) | left | 14999 | 10 | N |
10 | heart | Multivariate | Classification (Binary) | DEATH | 200 | 16 | N |
11 | heart_disease | Multivariate | Classification (Binary) | Disease | 270 | 14 | N |
12 | hepatitis | Multivariate | Classification (Binary) | Class | 154 | 32 | Y |
13 | income | Multivariate | Classification (Binary) | income >50K | 32561 | 14 | Y |
14 | juice | Multivariate | Classification (Binary) | Purchase | 1070 | 15 | N |
15 | nba | Multivariate | Classification (Binary) | TARGET_5Yrs | 1340 | 21 | N |
16 | wine | Multivariate | Classification (Binary) | type | 6498 | 13 | N |
17 | telescope | Multivariate | Classification (Binary) | Class | 19020 | 11 | N |
18 | glass | Multivariate | Classification (Multiclass) | Type | 214 | 10 | N |
19 | iris | Multivariate | Classification (Multiclass) | species | 150 | 5 | N |
20 | poker | Multivariate | Classification (Multiclass) | CLASS | 100000 | 11 | N |
21 | questions | Multivariate | Classification (Multiclass) | Next_Question | 499 | 4 | N |
22 | satellite | Multivariate | Classification (Multiclass) | Class | 6435 | 37 | N |
23 | asia_gdp | Multivariate | Clustering | None | 40 | 11 | N |
24 | elections | Multivariate | Clustering | None | 3195 | 54 | Y |
25 | facebook | Multivariate | Clustering | None | 7050 | 12 | N |
26 | ipl | Multivariate | Clustering | None | 153 | 25 | N |
27 | jewellery | Multivariate | Clustering | None | 505 | 4 | N |
28 | mice | Multivariate | Clustering | None | 1080 | 82 | Y |
29 | migration | Multivariate | Clustering | None | 233 | 12 | N |
30 | perfume | Multivariate | Clustering | None | 20 | 29 | N |
31 | pokemon | Multivariate | Clustering | None | 800 | 13 | Y |
32 | population | Multivariate | Clustering | None | 255 | 56 | Y |
33 | public_health | Multivariate | Clustering | None | 224 | 21 | N |
34 | seeds | Multivariate | Clustering | None | 210 | 7 | N |
35 | wholesale | Multivariate | Clustering | None | 440 | 8 | N |
36 | tweets | Text | NLP | tweet | 8594 | 2 | N |
37 | amazon | Text | NLP / Classification | reviewText | 20000 | 2 | N |
38 | kiva | Text | NLP / Classification | en | 6818 | 7 | N |
39 | spx | Text | NLP / Regression | text | 874 | 4 | N |
40 | wikipedia | Text | NLP / Classification | Text | 500 | 3 | N |
41 | automobile | Multivariate | Regression | price | 202 | 26 | Y |
42 | bike | Multivariate | Regression | cnt | 17379 | 15 | N |
43 | boston | Multivariate | Regression | medv | 506 | 14 | N |
44 | concrete | Multivariate | Regression | strength | 1030 | 9 | N |
45 | diamond | Multivariate | Regression | Price | 6000 | 8 | N |
46 | energy | Multivariate | Regression | Heating Load / Cooling Load | 768 | 10 | N |
47 | forest | Multivariate | Regression | area | 517 | 13 | N |
48 | gold | Multivariate | Regression | Gold_T+22 | 2558 | 121 | N |
49 | house | Multivariate | Regression | SalePrice | 1461 | 81 | Y |
50 | insurance | Multivariate | Regression | charges | 1338 | 7 | N |
51 | parkinsons | Multivariate | Regression | PPE | 5875 | 22 | N |
52 | traffic | Multivariate | Regression | traffic_volume | 48204 | 8 | N |
# load the 'juice' binary-classification dataset (1070 rows, 15 attributes; target = 'Purchase')
data = get_data('juice')
Id | Purchase | WeekofPurchase | StoreID | PriceCH | PriceMM | DiscCH | DiscMM | SpecialCH | SpecialMM | LoyalCH | SalePriceMM | SalePriceCH | PriceDiff | Store7 | PctDiscMM | PctDiscCH | ListPriceDiff | STORE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CH | 237 | 1 | 1.75 | 1.99 | 0.00 | 0.0 | 0 | 0 | 0.500000 | 1.99 | 1.75 | 0.24 | No | 0.000000 | 0.000000 | 0.24 | 1 |
1 | 2 | CH | 239 | 1 | 1.75 | 1.99 | 0.00 | 0.3 | 0 | 1 | 0.600000 | 1.69 | 1.75 | -0.06 | No | 0.150754 | 0.000000 | 0.24 | 1 |
2 | 3 | CH | 245 | 1 | 1.86 | 2.09 | 0.17 | 0.0 | 0 | 0 | 0.680000 | 2.09 | 1.69 | 0.40 | No | 0.000000 | 0.091398 | 0.23 | 1 |
3 | 4 | MM | 227 | 1 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.400000 | 1.69 | 1.69 | 0.00 | No | 0.000000 | 0.000000 | 0.00 | 1 |
4 | 5 | CH | 228 | 7 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.956535 | 1.69 | 1.69 | 0.00 | Yes | 0.000000 | 0.000000 | 0.00 | 0 |
# initialize the classification experiment: 'Purchase' is the target column,
# session_id pins the random seed so results are reproducible.
# NOTE(review): experiment_name='bank1' looks like a leftover from a bank-dataset
# example (the data here is 'juice'); harmless because log_experiment=False.
from pycaret.classification import *
clf1 = setup(data, target = 'Purchase', session_id=123, log_experiment=False, experiment_name='bank1')
Setup Successfully Completed!
Description | Value | |
---|---|---|
0 | session_id | 123 |
1 | Target Type | Binary |
2 | Label Encoded | CH: 0, MM: 1 |
3 | Original Data | (1070, 19) |
4 | Missing Values | False |
5 | Numeric Features | 13 |
6 | Categorical Features | 5 |
7 | Ordinal Features | False |
8 | High Cardinality Features | False |
9 | High Cardinality Method | None |
10 | Sampled Data | (1070, 19) |
11 | Transformed Train Set | (748, 28) |
12 | Transformed Test Set | (322, 28) |
13 | Numeric Imputer | mean |
14 | Categorical Imputer | constant |
15 | Normalize | False |
16 | Normalize Method | None |
17 | Transformation | False |
18 | Transformation Method | None |
19 | PCA | False |
20 | PCA Method | None |
21 | PCA Components | None |
22 | Ignore Low Variance | False |
23 | Combine Rare Levels | False |
24 | Rare Level Threshold | None |
25 | Numeric Binning | False |
26 | Remove Outliers | False |
27 | Outliers Threshold | None |
28 | Remove Multicollinearity | False |
29 | Multicollinearity Threshold | None |
30 | Clustering | False |
31 | Clustering Iteration | None |
32 | Polynomial Features | False |
33 | Polynomial Degree | None |
34 | Trignometry Features | False |
35 | Polynomial Threshold | None |
36 | Group Features | False |
37 | Feature Selection | False |
38 | Features Selection Threshold | None |
39 | Feature Interaction | False |
40 | Feature Ratio | False |
41 | Interaction Threshold | None |
42 | Fix Imbalance | False |
43 | Fix Imbalance Method | SMOTE |
# train and cross-validate every available classifier; keep the 5 best models
top5 = compare_models(n_select=5)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
0 | Logistic Regression | 0.8263 | 0.8959 | 0.7262 | 0.8139 | 0.7644 | 0.6280 | 0.6338 | 0.0429 |
1 | Linear Discriminant Analysis | 0.8263 | 0.8938 | 0.7536 | 0.7938 | 0.7713 | 0.6317 | 0.6342 | 0.0089 |
2 | Ridge Classifier | 0.8236 | 0.0000 | 0.7499 | 0.7920 | 0.7680 | 0.6262 | 0.6292 | 0.0045 |
3 | Ada Boost Classifier | 0.8075 | 0.8637 | 0.7053 | 0.7837 | 0.7398 | 0.5881 | 0.5924 | 0.0780 |
4 | Gradient Boosting Classifier | 0.8062 | 0.8869 | 0.7363 | 0.7651 | 0.7479 | 0.5909 | 0.5939 | 0.1205 |
5 | CatBoost Classifier | 0.8049 | 0.8932 | 0.7326 | 0.7629 | 0.7457 | 0.5878 | 0.5899 | 3.5314 |
6 | Extreme Gradient Boosting | 0.7914 | 0.8716 | 0.7294 | 0.7367 | 0.7309 | 0.5609 | 0.5633 | 0.0675 |
7 | Light Gradient Boosting Machine | 0.7861 | 0.8806 | 0.7053 | 0.7393 | 0.7195 | 0.5471 | 0.5497 | 0.0898 |
8 | Quadratic Discriminant Analysis | 0.7621 | 0.8240 | 0.6267 | 0.7397 | 0.6678 | 0.4863 | 0.5000 | 0.0060 |
9 | Random Forest Classifier | 0.7608 | 0.8397 | 0.6674 | 0.7124 | 0.6848 | 0.4928 | 0.4974 | 0.1142 |
10 | Decision Tree Classifier | 0.7594 | 0.7519 | 0.6911 | 0.6970 | 0.6907 | 0.4943 | 0.4975 | 0.0051 |
11 | Extra Trees Classifier | 0.7433 | 0.8205 | 0.6708 | 0.6758 | 0.6698 | 0.4605 | 0.4638 | 0.1503 |
12 | K Neighbors Classifier | 0.7231 | 0.7683 | 0.6062 | 0.6600 | 0.6287 | 0.4094 | 0.4129 | 0.0054 |
13 | Naive Bayes | 0.7140 | 0.7952 | 0.7466 | 0.6100 | 0.6708 | 0.4227 | 0.4301 | 0.0032 |
14 | SVM - Linear Kernel | 0.5267 | 0.0000 | 0.4200 | 0.2409 | 0.2561 | 0.0204 | 0.0299 | 0.0074 |
# run hyperparameter tuning on each of the five best models from compare_models()
top5_tuned = [tune_model(mdl) for mdl in top5]
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.7467 | 0.8426 | 0.6897 | 0.6667 | 0.6780 | 0.4693 | 0.4695 |
1 | 0.8267 | 0.9475 | 0.7241 | 0.8077 | 0.7636 | 0.6274 | 0.6298 |
2 | 0.7600 | 0.8201 | 0.6552 | 0.7037 | 0.6786 | 0.4875 | 0.4883 |
3 | 0.8133 | 0.9070 | 0.8276 | 0.7273 | 0.7742 | 0.6162 | 0.6200 |
4 | 0.7867 | 0.8973 | 0.8621 | 0.6757 | 0.7576 | 0.5720 | 0.5856 |
5 | 0.8133 | 0.8906 | 0.7241 | 0.7778 | 0.7500 | 0.6014 | 0.6023 |
6 | 0.8133 | 0.8833 | 0.8333 | 0.7353 | 0.7812 | 0.6196 | 0.6233 |
7 | 0.8400 | 0.9222 | 0.8000 | 0.8000 | 0.8000 | 0.6667 | 0.6667 |
8 | 0.7973 | 0.8759 | 0.6897 | 0.7692 | 0.7273 | 0.5667 | 0.5689 |
9 | 0.8514 | 0.9111 | 0.7586 | 0.8462 | 0.8000 | 0.6823 | 0.6849 |
Mean | 0.8049 | 0.8898 | 0.7564 | 0.7509 | 0.7510 | 0.5909 | 0.5939 |
SD | 0.0314 | 0.0354 | 0.0673 | 0.0562 | 0.0420 | 0.0659 | 0.0662 |
# train individual models (5-fold CV for lr/rf, default folds for dt)
lr = create_model('lr', fold = 5)
dt = create_model('dt')
rf = create_model('rf', fold = 5)
# list every estimator available in the classification module
models()
# ids of the ensemble-type estimators only
models(type='ensemble').index.tolist()
# compare only the ensemble models, using 3-fold CV
ensembled_models = compare_models(whitelist = models(type='ensemble').index.tolist(), fold = 3)
# hyperparameter tuning of pre-trained models
tuned_lr = tune_model(lr)
tuned_rf = tune_model(rf)
# ensembling a single model: bagging (default) vs. boosting
bagged_dt = ensemble_model(dt)
boosted_dt = ensemble_model(dt, method = 'Boosting')
# soft-voting blend of the two ensembled decision trees
blender = blend_models(estimator_list = [boosted_dt, bagged_dt], method = 'soft')
# stacking the three models with rf as the meta-model
stacker = stack_models(estimator_list = [boosted_dt,bagged_dt,tuned_rf], meta_model=rf)
# model analysis plots: AUC (default), confusion matrix, decision boundary
plot_model(rf)
plot_model(rf, plot = 'confusion_matrix')
plot_model(rf, plot = 'boundary')
# interactive widget exposing all available plots for the model
evaluate_model(rf)
# fit catboost on the training set without CV, for SHAP-based interpretation
catboost = create_model('catboost', cross_validation=False)
# SHAP summary plot, correlation plot, and a single-observation 'reason' plot
interpret_model(catboost)
interpret_model(catboost, plot = 'correlation')
interpret_model(catboost, plot = 'reason', observation = 12)
# select the best model trained in this session by hold-out Recall
best = automl(optimize = 'Recall')
best
# score the hold-out set with the logistic-regression model
pred_holdouts = predict_model(lr)
pred_holdouts.head()
# Simulate scoring unseen data: copy the dataset and remove the target column
# so predict_model() receives feature columns only.
# BUG FIX: the original dropped 'deposit' — the target of the *bank* dataset —
# which raises a KeyError here. This notebook uses 'juice', whose target is
# 'Purchase' (as passed to setup() above).
new_data = data.copy()
new_data.drop(['Purchase'], axis=1, inplace=True)
predict_new = predict_model(lr, data=new_data)
predict_new.head()
# persist the full preprocessing pipeline + trained model to 'best-model.pkl'
save_model(lr, model_name='best-model')
# reload it from disk and inspect the pipeline
loaded_bestmodel = load_model('best-model')
print(loaded_bestmodel)
# render sklearn pipelines as an HTML diagram instead of text
from sklearn import set_config
set_config(display='diagram')
loaded_bestmodel[0]
# switch the sklearn display mode back to plain text
from sklearn import set_config
set_config(display='text')
# deploy the model to an AWS S3 bucket (requires AWS credentials configured)
deploy_model(lr, model_name = 'best-aws', authentication = {'bucket' : 'pycaret-test'})
# read internal experiment variables created by setup()
X_train = get_config('X_train')
X_train.head()
get_config('seed')
# override the stored random seed (affects subsequent training calls);
# note this set_config shadows the sklearn one imported above
from pycaret.classification import set_config
set_config('seed', 999)
get_config('seed')
# print the system logs recorded during this session
get_system_logs()
# launch the MLflow tracking UI (IPython shell escape — not plain Python)
!mlflow ui
# to generate csv file with experiment logs
get_logs()
Thank you. For more information / tutorials on PyCaret, please visit https://www.pycaret.org