from optimalflow.utilis_func import pipeline_splitting_rule, update_parameters, reset_parameters
reset_parameters()
update_parameters(mode="cls", estimator_name="mlp", hidden_layer_sizes=[10], activation=["relu"], learning_rate=["constant"], solver=["sgd"])
update_parameters(mode="cls", estimator_name="svm", C=[0.1], kernel=["linear"])
update_parameters(mode="cls", estimator_name="ada", n_estimators=[50], learning_rate=[1])
update_parameters(mode="cls", estimator_name="rf", n_estimators=[50], max_depth=[2])
update_parameters(mode="cls", estimator_name="gb", n_estimators=[50], max_depth=[2], learning_rate=[1])
update_parameters(mode="cls", estimator_name="xgb", n_estimators=[50], max_depth=[2], learning_rate=[1])
from optimalflow.autoPipe import autoPipe
import pandas as pd
from optimalflow.funcPP import PPtools
from optimalflow.autoPP import dynaPreprocessing
from optimalflow.autoFS import dynaFS_clf
from optimalflow.autoCV import evaluate_model,dynaClassifier,dynaRegressor
df = pd.read_csv('./data/preprocessing/breast_cancer.csv')
custom_parameters = {
"scaler" : ["None", "standard"],
# threshold number of category dimension
"encode_band" : [4],
# low dimension encoding
"low_encode" : ["onehot","label"],
# high dimension encoding
"high_encode" : ["frequency", "mean"],
"winsorizer" : [(0.05,0.05),(0.1,0.1)],
"sparsity" : [0.4],
"cols" : [1000]
}
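Before running the full pipeline, the autoPP step can be tried on its own to see how the permuted datasets come out of these parameters. A minimal sketch, assuming dynaPreprocessing exposes the same fit(df) interface autoPipe calls internally; `dyna_df` is a hypothetical name for whatever fit() returns:
# Sketch only: run the preprocessing step standalone (assumes dynaPreprocessing.fit(df)
# works outside autoPipe).
dyna = dynaPreprocessing(custom_parameters=custom_parameters, label_col='diagnosis', model_type="cls")
dyna_df = dyna.fit(df)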
Done with the parameters reset.
Previous Parameters are: {'hidden_layer_sizes': [10, 50, 100], 'activation': ['identity', 'relu', 'tanh', 'logistic'], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'solver': ['lbfgs', 'sgd', 'adam']}
Current Parameters are updated as: {'hidden_layer_sizes': [10], 'activation': ['relu'], 'learning_rate': ['constant'], 'solver': ['sgd']}
Done with the parameters update.
Previous Parameters are: {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [0.1, 1, 10]}
Current Parameters are updated as: {'C': [0.1], 'kernel': ['linear']}
Done with the parameters update.
Previous Parameters are: {'n_estimators': [50, 100, 150], 'learning_rate': [0.1, 1, 10, 100]}
Current Parameters are updated as: {'n_estimators': [50], 'learning_rate': [1]}
Done with the parameters update.
Previous Parameters are: {'n_estimators': [5, 50, 250], 'max_depth': [2, 4, 8, 16, 32]}
Current Parameters are updated as: {'n_estimators': [50], 'max_depth': [2]}
Done with the parameters update.
Previous Parameters are: {'n_estimators': [50, 100, 150, 200, 250, 300], 'max_depth': [1, 3, 5, 7, 9], 'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4]}
Current Parameters are updated as: {'n_estimators': [50], 'max_depth': [2], 'learning_rate': [1]}
Done with the parameters update.
Previous Parameters are: {'n_estimators': [50, 100, 150, 200, 250, 300], 'max_depth': [3, 5, 7, 9], 'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4], 'verbosity': [0]}
Current Parameters are updated as: {'n_estimators': [50], 'max_depth': [2], 'learning_rate': [1]}
Done with the parameters update.
pipe = autoPipe(
    [("autoPP", dynaPreprocessing(custom_parameters=custom_parameters, label_col='diagnosis', model_type="cls")),
     ("datasets_splitting", pipeline_splitting_rule(val_size=0.2, test_size=0.2, random_state=13)),
     ("autoFS", dynaFS_clf(fs_num=8, random_state=13, cv=5, in_pipeline=True, input_from_file=False)),
     ("autoCV", dynaClassifier(random_state=13, cv_num=5, in_pipeline=True, input_from_file=False)),
     ("model_evaluate", evaluate_model(model_type="cls"))])
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_columns',None,'display.max_rows',None)
pd.set_option('display.max_colwidth', None)
DICT_PREPROCESSING,DICT_FEATURE_SELECTION,DICT_MODELS_EVALUATION,DICT_DATA,dyna_report= pipe.fit(df)
Now in Progress - autoFS & autoCV Iteration: Estimate about 0.0 minutes left [####################] 100.0%

The top 5 Models with Best Performance Metrics:

| | Dataset | Model_Name | Best_Parameters | Accuracy | Precision | Recall | Latency |
|---|---|---|---|---|---|---|---|
| 3943 | Dataset_563 | mlp | [('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')] | 0.947 | 0.958 | 0.92 | 3.5 |
| 4312 | Dataset_616 | lgr | [('C', 100), ('random_state', 13)] | 0.947 | 0.923 | 0.96 | 3.0 |
| 3481 | Dataset_497 | mlp | [('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')] | 0.930 | 1.000 | 0.84 | 3.0 |
| 2546 | Dataset_363 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 1.0 |
| 1139 | Dataset_162 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 2.0 |
DICT_MODELS_EVALUATION['Dataset_0']
| | Model_Name | Accuracy | Precision | Recall | Latency | Best_Parameters | Dataset |
|---|---|---|---|---|---|---|---|
| 0 | lgr | 0.895 | 0.880 | 0.88 | 3.0 | [('C', 1000), ('random_state', 13)] | Dataset_0 |
| 0 | svm | 0.912 | 0.885 | 0.92 | 3.0 | [('C', 0.1), ('kernel', 'linear')] | Dataset_0 |
| 0 | mlp | 0.439 | 0.439 | 1.00 | 3.0 | [('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')] | Dataset_0 |
| 0 | rf | 0.877 | 0.821 | 0.92 | 12.0 | [('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | Dataset_0 |
| 0 | ada | 0.912 | 0.955 | 0.84 | 17.0 | [('learning_rate', 1), ('n_estimators', 50), ('random_state', 13)] | Dataset_0 |
| 0 | gb | 0.877 | 0.846 | 0.88 | 3.0 | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | Dataset_0 |
| 0 | xgb | 0.912 | 0.955 | 0.84 | 2.0 | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | Dataset_0 |
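Each entry of DICT_MODELS_EVALUATION is an ordinary pandas table (as the display above suggests), so the usual DataFrame tooling applies. A quick sketch, assuming the entry really is a DataFrame, to rank one dataset's models by accuracy:
# Sketch: top 3 models for one dataset, sorted by accuracy.
DICT_MODELS_EVALUATION['Dataset_0'].sort_values('Accuracy', ascending=False).head(3)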
dyna_report.head(15)
dyna_report.to_csv("dyna_report.csv",index=False)
| | Dataset | Model_Name | Best_Parameters | Accuracy | Precision | Recall | Latency |
|---|---|---|---|---|---|---|---|
| 3943 | Dataset_563 | mlp | [('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')] | 0.947 | 0.958 | 0.92 | 3.5 |
| 4312 | Dataset_616 | lgr | [('C', 100), ('random_state', 13)] | 0.947 | 0.923 | 0.96 | 3.0 |
| 3481 | Dataset_497 | mlp | [('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')] | 0.930 | 1.000 | 0.84 | 3.0 |
| 2546 | Dataset_363 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 1.0 |
| 1139 | Dataset_162 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 2.0 |
| 3901 | Dataset_557 | mlp | [('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')] | 0.930 | 0.957 | 0.88 | 2.0 |
| 1314 | Dataset_187 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 3.0 |
| 1412 | Dataset_201 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 3.0 |
| 1678 | Dataset_239 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 3.0 |
| 1916 | Dataset_273 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 3.0 |
| 2644 | Dataset_377 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 3.0 |
| 2840 | Dataset_405 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 3.0 |
| 2851 | Dataset_407 | mlp | [('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')] | 0.930 | 0.957 | 0.88 | 3.0 |
| 2994 | Dataset_427 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 3.0 |
| 3176 | Dataset_453 | gb | [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)] | 0.930 | 0.957 | 0.88 | 3.0 |
DICT_DATA['Dataset_0']['DICT_TEST']["X"].head(10)
| | concavity_mean | concave points_mean | perimeter_mean | radius_mean | texture_mean |
|---|---|---|---|---|---|
| 264 | 0.09061 | 0.065270 | 111.60 | 17.19 | 22.07 |
| 231 | 0.01633 | 0.006588 | 71.76 | 11.32 | 26.60 |
| 197 | 0.11030 | 0.057780 | 117.40 | 18.08 | 21.84 |
| 172 | 0.20320 | 0.109700 | 102.50 | 15.46 | 13.04 |
| 54 | 0.05253 | 0.033340 | 97.26 | 15.10 | 22.02 |
| 33 | 0.16570 | 0.075930 | 127.90 | 19.27 | 26.47 |
| 68 | 0.25080 | 0.043750 | 60.73 | 9.72 | 17.33 |
| 237 | 0.09042 | 0.060220 | 132.50 | 20.48 | 21.46 |
| 51 | 0.01857 | 0.017230 | 87.21 | 13.64 | 16.34 |
| 196 | 0.13850 | 0.065260 | 90.63 | 13.77 | 22.29 |
test = DICT_PREPROCESSING['Dataset_0']
test
"winsor_0-Scaler_None-- Encoded Features:['diagnosis', 'Size_3', 'area_mean', 'compactness_mean', 'concave points_mean', 'concavity_mean', 'fractal_dimension_mean', 'perimeter_mean', 'radius_mean', 'smoothness_mean', 'symmetry_mean', 'texture_mean', 'Frequency_Age', 'onehot_Position_1_left', 'onehot_Position_1_right', 'Frequency_Position_2', 'Frequency_Size_1', 'Frequency_Size_2', 'onehot_Treatment_no-recurrence-events', 'onehot_Treatment_recurrence-events', 'onehot_Type_1_ge40', 'onehot_Type_1_lt40', 'onehot_Type_1_premeno', 'onehot_Type_2_NaN', 'onehot_Type_2_no', 'onehot_Type_2_yes', 'onehot_Type_3_no', 'onehot_Type_3_yes']"
import pickle
def save_obj(obj, name):
    """Pickle `obj` to <name>.pkl."""
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    """Load a pickled object from <name>.pkl."""
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
save_obj(DICT_PREPROCESSING,"dict_preprocess")
save_obj(DICT_DATA,"dict_data")
save_obj(DICT_MODELS_EVALUATION,"dict_models_evaluate")
save_obj(dyna_report,"dyna_report")
DICT_PREP = load_obj("dict_preprocess")
dyna_report = load_obj("dyna_report")
DICT_PREP['Dataset_0']
"winsor_0-Scaler_None-- Encoded Features:['diagnosis', 'Size_3', 'area_mean', 'compactness_mean', 'concave points_mean', 'concavity_mean', 'fractal_dimension_mean', 'perimeter_mean', 'radius_mean', 'smoothness_mean', 'symmetry_mean', 'texture_mean', 'Frequency_Age', 'onehot_Position_1_left', 'onehot_Position_1_right', 'Frequency_Position_2', 'Frequency_Size_1', 'Frequency_Size_2', 'onehot_Treatment_no-recurrence-events', 'onehot_Treatment_recurrence-events', 'onehot_Type_1_ge40', 'onehot_Type_1_lt40', 'onehot_Type_1_premeno', 'onehot_Type_2_NaN', 'onehot_Type_2_no', 'onehot_Type_2_yes', 'onehot_Type_3_no', 'onehot_Type_3_yes']"
from optimalflow.autoViz import autoViz
viz = autoViz(preprocess_dict=DICT_PREP,report=dyna_report)
viz.clf_model_retrieval(metrics='accuracy')
import re
import pandas as pd

# Parse each preprocessing description string into its pipeline settings:
# low/high-dimension encoders, winsorization level, and scaler.
columns = ["Dataset", "Encode_low_dimension", "Encode_high_dimension", "Winsorize", "Scale"]
df_pp = pd.DataFrame(columns=columns)
for i in DICT_PREPROCESSING.keys():
    s = DICT_PREPROCESSING[i]
    ext = re.search("Encoded Features:(.*)']", s).group(1)
    # Low-dimension encoder: onehot markers take precedence over label markers.
    if "onehot_" in ext:
        low_dim = "Low Dim_Onehot"
    elif "Label_" in ext:
        low_dim = "Low Dim_Label"
    else:
        low_dim = "Low Dim_No Encoder"
    # High-dimension encoder: frequency markers take precedence over mean markers.
    if "Frequency_" in ext:
        high_dim = "High Dim_Frequency"
    elif "Mean_" in ext:
        high_dim = "High Dim_Mean"
    else:
        high_dim = "High Dim_No Encoder"
    winsor = re.search("winsor_(.*)-Scaler", s).group(1)
    scaler = re.search("-Scaler_(.*)-- ", s).group(1)
    df_pp.loc[len(df_pp)] = [i, low_dim, high_dim, winsor, scaler]
df_report_Accuracy = df_pp.merge(dyna_report[['Dataset','Accuracy']], how = 'left', on = 'Dataset')
bins = [0, 0.70, 0.90, 1]
labels = ["Low Accuracy","High Accuracy","Top Accuracy"]
df_report_Accuracy['Level'] = pd.cut(df_report_Accuracy['Accuracy'], bins=bins, labels=labels)
df_report_Accuracy['cnt'] = 1
df_report_Accuracy.loc[df_report_Accuracy['Scale'] == 'None','Scale'] = "No Scaler"
df_report_Accuracy['Scale'] = 'Scale_'+df_report_Accuracy['Scale']
df_report_Accuracy['Winsorize'] = 'Winsorize_' + df_report_Accuracy['Winsorize']
df_report_Accuracy.head(3)
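Note that pd.cut bins are right-inclusive, so a score of exactly 0.90 still lands in "High Accuracy"; the same binning applies to the Precision and Recall tables built next. A tiny self-contained check:
# Right-inclusive binning: 0.65 and 0.70 -> Low, 0.90 -> High, 0.95 -> Top.
pd.cut(pd.Series([0.65, 0.70, 0.90, 0.95]), bins=[0, 0.70, 0.90, 1],
       labels=["Low Accuracy", "High Accuracy", "Top Accuracy"])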
df_report_Precision = df_pp.merge(dyna_report[['Dataset','Precision']], how = 'left', on = 'Dataset')
bins = [0, 0.70, 0.90, 1]
labels = ["Low Precision","High Precision","Top Precision"]
df_report_Precision['Level'] = pd.cut(df_report_Precision['Precision'], bins=bins, labels=labels)
df_report_Precision['cnt'] = 1
df_report_Precision.loc[df_report_Precision['Scale'] == 'None','Scale'] = "No Scaler"
df_report_Precision['Scale'] = 'Scale_'+df_report_Precision['Scale']
df_report_Precision['Winsorize'] = 'Winsorize_' + df_report_Precision['Winsorize']
df_report_Precision.head(3)
df_report_Recall = df_pp.merge(dyna_report[['Dataset','Recall']], how = 'left', on = 'Dataset')
bins = [0, 0.70, 0.90, 1]
labels = ["Low Recall","High Recall","Top Recall"]
df_report_Recall['Level'] = pd.cut(df_report_Recall['Recall'], bins=bins, labels=labels)
df_report_Recall['cnt'] = 1
df_report_Recall.loc[df_report_Recall['Scale'] == 'None','Scale'] = "No Scaler"
df_report_Recall['Scale'] = 'Scale_'+df_report_Recall['Scale']
df_report_Recall['Winsorize'] = 'Winsorize_' + df_report_Recall['Winsorize']
df_report_Recall.head(3)
| | Dataset | Encode_low_dimension | Encode_high_dimension | Winsorize | Scale | Accuracy | Level | cnt |
|---|---|---|---|---|---|---|---|---|
| 0 | Dataset_0 | Low Dim_Onehot | High Dim_Frequency | Winsorize_0 | Scale_No Scaler | 0.912 | Top Accuracy | 1 |
| 1 | Dataset_0 | Low Dim_Onehot | High Dim_Frequency | Winsorize_0 | Scale_No Scaler | 0.912 | Top Accuracy | 1 |
| 2 | Dataset_0 | Low Dim_Onehot | High Dim_Frequency | Winsorize_0 | Scale_No Scaler | 0.912 | Top Accuracy | 1 |

| | Dataset | Encode_low_dimension | Encode_high_dimension | Winsorize | Scale | Precision | Level | cnt |
|---|---|---|---|---|---|---|---|---|
| 0 | Dataset_0 | Low Dim_Onehot | High Dim_Frequency | Winsorize_0 | Scale_No Scaler | 0.955 | Top Precision | 1 |
| 1 | Dataset_0 | Low Dim_Onehot | High Dim_Frequency | Winsorize_0 | Scale_No Scaler | 0.955 | Top Precision | 1 |
| 2 | Dataset_0 | Low Dim_Onehot | High Dim_Frequency | Winsorize_0 | Scale_No Scaler | 0.885 | High Precision | 1 |

| | Dataset | Encode_low_dimension | Encode_high_dimension | Winsorize | Scale | Recall | Level | cnt |
|---|---|---|---|---|---|---|---|---|
| 0 | Dataset_0 | Low Dim_Onehot | High Dim_Frequency | Winsorize_0 | Scale_No Scaler | 0.84 | High Recall | 1 |
| 1 | Dataset_0 | Low Dim_Onehot | High Dim_Frequency | Winsorize_0 | Scale_No Scaler | 0.84 | High Recall | 1 |
| 2 | Dataset_0 | Low Dim_Onehot | High Dim_Frequency | Winsorize_0 | Scale_No Scaler | 0.92 | Top Recall | 1 |
step1_df = df_report_Accuracy.groupby(['Encode_low_dimension','Dataset'], as_index=False)['cnt'].count().rename({"cnt":"Total","Dataset":"antecedentIndex","Encode_low_dimension":"consequentIndex"},axis = 1)[['antecedentIndex','consequentIndex','Total']]
step2_df = df_report_Accuracy.groupby(['Encode_low_dimension','Encode_high_dimension'], as_index=False)['cnt'].count().rename({"cnt":"Total","Encode_low_dimension":"antecedentIndex","Encode_high_dimension":"consequentIndex"},axis = 1)[['antecedentIndex','consequentIndex','Total']]
step3_df = df_report_Accuracy.groupby(['Encode_high_dimension','Winsorize'], as_index=False)['cnt'].count().rename({"cnt":"Total","Encode_high_dimension":"antecedentIndex","Winsorize":"consequentIndex"},axis = 1)[['antecedentIndex','consequentIndex','Total']]
step4_df = df_report_Accuracy.groupby(['Winsorize','Scale'], as_index=False)['cnt'].count().rename({"cnt":"Total","Winsorize":"antecedentIndex","Scale":"consequentIndex"},axis = 1)[['antecedentIndex','consequentIndex','Total']]
step5_df = df_report_Accuracy.groupby(['Scale','Level'], as_index=False)['cnt'].count().rename({"cnt":"Total","Scale":"antecedentIndex","Level":"consequentIndex"},axis = 1)[['antecedentIndex','consequentIndex','Total']].dropna()
integrated_df = pd.concat([step1_df,step2_df,step3_df,step4_df,step5_df],axis = 0)
integrated_df.head(10)
| | antecedentIndex | consequentIndex | Total |
|---|---|---|---|
| 0 | Dataset_0 | Low Dim_Onehot | 7.0 |
| 1 | Dataset_1 | Low Dim_Onehot | 7.0 |
| 2 | Dataset_10 | Low Dim_Onehot | 7.0 |
| 3 | Dataset_100 | Low Dim_Onehot | 7.0 |
| 4 | Dataset_101 | Low Dim_Onehot | 7.0 |
| 5 | Dataset_102 | Low Dim_Onehot | 7.0 |
| 6 | Dataset_103 | Low Dim_Onehot | 7.0 |
| 7 | Dataset_104 | Low Dim_Onehot | 7.0 |
| 8 | Dataset_105 | Low Dim_Onehot | 7.0 |
| 9 | Dataset_106 | Low Dim_Onehot | 7.0 |
label_df = pd.DataFrame({"label": pd.concat([integrated_df['antecedentIndex'],
                                             integrated_df['consequentIndex']]).drop_duplicates()})
label_df['Number'] = range(len(label_df))
label_list = list(label_df.label)
source_df = pd.DataFrame(integrated_df['antecedentIndex'])
source_df = source_df.merge(label_df, left_on=['antecedentIndex'], right_on = ['label'],how = 'left')
source_list = list(source_df['Number'])
target_df = pd.DataFrame(integrated_df['consequentIndex'])
target_df = target_df.merge(label_df, left_on=['consequentIndex'], right_on = ['label'],how = 'left')
target_list = list(target_df['Number'])
value_list = [int(i) for i in list(integrated_df.Total)]
import plotly.graph_objects as go
fig = go.Figure(data=[go.Sankey(
node = dict(
pad = 15,
thickness = 10,
line = dict(color = 'rgb(25,100,90)', width = 0.5),
label = label_list,
color = 'rgb(71,172,55)'
),
link = dict(
source = source_list,
target = target_list,
value = value_list
))])
fig.update_layout(title = 'Pipeline Cluster Traversal Experiments - autoViz Model Retrieval Diagram <a href="https://www.linkedin.com/in/lei-tony-dong/"> ©Tony Dong</a>', font_size=8)
from plotly.offline import plot
plot(fig)
'temp-plot.html'
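plot() writes temp-plot.html beside the notebook; to control the output name, plotly figures also support write_html (an alternative to the offline plot() call, assuming plotly 4 or newer):
# Optional alternative: save the Sankey diagram under a chosen file name.
fig.write_html("pipeline_sankey.html")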
a = {'learning_rate': 1, 'max_depth': 2, 'n_estimators': 50, 'random_state': 13}
list(a.items())
[('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]
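This items() round-trip is exactly the shape of the Best_Parameters entries in dyna_report, so dict() turns one back into keyword arguments for re-fitting a model outside the pipeline. A hedged sketch using scikit-learn's GradientBoostingClassifier, matching the 'gb' rows above:
# Sketch: rebuild an estimator from a report entry; dict() inverts the tuple list.
from sklearn.ensemble import GradientBoostingClassifier
best_params = [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]
gb = GradientBoostingClassifier(**dict(best_params))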
from optimalflow.autoViz import autoViz
import pandas as pd
df = pd.read_csv('dyna_report.csv')
a = autoViz(preprocess_dict = None,report = df)
a.table_report()
Comparing with a classic ML workflow
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
"""
"""
if axes is None:
_, axes = plt.subplots(1, 3, figsize=(20, 5))
axes[0].set_title(title)
if ylim is not None:
axes[0].set_ylim(*ylim)
axes[0].set_xlabel("Training examples")
axes[0].set_ylabel("Score")
train_sizes, train_scores, test_scores, fit_times, _ = \
learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
train_sizes=train_sizes,
return_times=True)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
fit_times_mean = np.mean(fit_times, axis=1)
fit_times_std = np.std(fit_times, axis=1)
# Plot learning curve
axes[0].grid()
axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1,
color="g")
axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
axes[0].legend(loc="best")
# Plot n_samples vs fit_times
axes[1].grid()
axes[1].plot(train_sizes, fit_times_mean, 'o-')
axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
fit_times_mean + fit_times_std, alpha=0.1)
axes[1].set_xlabel("Training examples")
axes[1].set_ylabel("fit_times")
axes[1].set_title("Scalability of the model")
# Plot fit_time vs score
axes[2].grid()
axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1)
axes[2].set_xlabel("fit_times")
axes[2].set_ylabel("Score")
axes[2].set_title("Performance of the model")
return plt
fig, axes = plt.subplots(3, 2, figsize=(10, 15))
X, y = load_digits(return_X_y=True)
title = "Learning Curves"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01),
cv=cv, n_jobs=4)
plt.show()
| | Dataset | Model_Name | Best_Parameters | Accuracy | Precision | Recall | Latency |
|---|---|---|---|---|---|---|---|
| 3943 | Dataset_563 | mlp | [('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')] | 0.947 | 0.958 | 0.92 | 3.5 |
dict_data = load_obj("dict_data")
dict_data["Dataset_563"]['DICT_Train'].keys()
dict_keys(['X', 'y'])
dict_data = load_obj("dict_data")
X = dict_data["Dataset_563"]['DICT_Train']['X']
y = dict_data["Dataset_563"]['DICT_Train']['y']
X.shape
type(X)
pandas.core.frame.DataFrame
y.shape
(171,)
import pandas as pd
test = pd.DataFrame(y)
test.head()
test = test.to_numpy()
test.shape
(171, 1)
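scikit-learn estimators expect a 1-D target, so a (171, 1) column vector like the one above would typically trigger a DataConversionWarning during fitting; ravel() flattens it back:
# Flatten the column vector back to the 1-D shape estimators expect.
test.ravel().shape  # (171,)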
X_digits, y_digits = load_digits(return_X_y=True)
type(X_digits)
numpy.ndarray
X.to_numpy()
array([[-0.47171866, -0.25147904, -0.67443392, ..., -0.20077146, 0.14041893, -0.16963779], [ 0.45796268, 0.22989631, 1.66854302, ..., 0.94593156, -1.15682917, -0.16963779], [-0.7996747 , -0.87043406, -0.41695619, ..., -0.28920614, 0.36982133, -0.16963779], ..., [-0.01869197, -0.34514378, 0.03820252, ..., 0.25024541, 0.05225644, -0.16963779], [-0.95971473, -0.96287041, -0.71312592, ..., -0.81391858, -0.57297756, -0.16963779], [-0.5047978 , 0.5262454 , -1.19396205, ..., -0.35995389, -0.09887926, -0.16963779]])
y.to_numpy()
array([0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0])
y_digits
array([0, 1, 2, ..., 8, 9, 8])
X_digits
array([[ 0., 0., 5., ..., 0., 0., 0.], [ 0., 0., 0., ..., 10., 0., 0.], [ 0., 0., 0., ..., 16., 9., 0.], ..., [ 0., 0., 1., ..., 6., 0., 0.], [ 0., 0., 2., ..., 12., 0., 0.], [ 0., 0., 10., ..., 12., 1., 0.]])
from sklearn.neural_network import MLPClassifier
# X_pd = X.to_numpy()
# y_pd = y.to_numpy()
X_pd = X_digits
y_pd = y_digits
fig, axes = plt.subplots(3, 1, figsize=(10, 15))
title = "Learning Curves"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
# estimator = MLPClassifier(random_state=13, activation = 'relu',hidden_layer_sizes = (10,),learning_rate = 'constant',solver='sgd')
estimator = GaussianNB()
plot_learning_curve(estimator, title, X_pd, y_pd, axes=axes[:, 0], ylim=(0.7, 1.01),cv=cv, n_jobs=4)
plt.show()
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-67-6377f75f4c5b> in <module>
     22 # estimator = MLPClassifier(random_state=13, activation = 'relu',hidden_layer_sizes = (10,),learning_rate = 'constant',solver='sgd')
     23 estimator = GaussianNB()
---> 24 plot_learning_curve(estimator, title, X_pd, y_pd, axes=axes[:, 0], ylim=(0.7, 1.01),cv=cv, n_jobs=4)
     25 
     26 plt.show()

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed
fig, axes = plt.subplots(3, 2, figsize=(10, 15))
# X, y = load_digits(return_X_y=True)
dict_data = load_obj("dict_data")
X = dict_data["Dataset_563"]['DICT_Train']['X']
y = dict_data["Dataset_563"]['DICT_Train']['y']
X_pd = X.to_numpy()
y_pd = y.to_numpy()
title = "Learning Curves (OptimalFlow)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = MLPClassifier(random_state=13, activation='relu', hidden_layer_sizes=(10,), learning_rate='constant', solver='sgd')
plot_learning_curve(estimator, title, X_pd, y_pd, axes=axes[:, 0], ylim=(0.7, 1.01),
cv=cv, n_jobs=4)
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
# SVC is more expensive so we do a lower number of CV iterations:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)
plot_learning_curve(estimator, title, X_pd, y_pd, axes=axes[:, 1], ylim=(0.2, 1.01),
cv=cv, n_jobs=4)
plt.show()
204    0
198    1
93     0
78     1
128    0
Name: diagnosis, dtype: int32
from yellowbrick.features import ParallelCoordinates
dict_data = load_obj("dict_data")
X = dict_data["Dataset_563"]['DICT_Train']['X']
y = dict_data["Dataset_563"]['DICT_Train']['y']
# Instantiate the visualizer
visualizer = ParallelCoordinates(shuffle=True)
# Fit the visualizer and display it
visualizer.fit_transform(X, y)
visualizer.show()
<matplotlib.axes._subplots.AxesSubplot at 0x2343b0a8550>
X.head()
| | temperature | relative humidity | light | CO2 | humidity |
|---|---|---|---|---|---|
| 0 | 23.18 | 27.2720 | 426.0 | 721.25 | 0.004793 |
| 1 | 23.15 | 27.2675 | 429.5 | 714.00 | 0.004783 |
| 2 | 23.15 | 27.2450 | 426.0 | 713.50 | 0.004779 |
| 3 | 23.15 | 27.2000 | 426.0 | 708.25 | 0.004772 |
| 4 | 23.10 | 27.2000 | 426.0 | 704.50 | 0.004757 |
from yellowbrick.features import ParallelCoordinates
from yellowbrick.datasets import load_occupancy
# Load the classification data set
X, y = load_occupancy()
# Specify the features of interest and the classes of the target
features = [
"temperature", "relative humidity", "light", "CO2", "humidity"
]
classes = ["unoccupied", "occupied"]
# Instantiate the visualizer
visualizer = ParallelCoordinates(
classes=classes, features=features,
normalize='standard', sample=0.05, shuffle=True,
)
# Fit the visualizer and display it
visualizer.fit_transform(X, y)
visualizer.show()
<matplotlib.axes._subplots.AxesSubplot at 0x2343cee49d0>