#!/usr/bin/env python
# coding: utf-8

# # PiML Toolbox: Uploading Custom Data in Two Ways
#
# Other than the built-in datasets (e.g. BikeSharing, CaliforniaHousing,
# TaiwanCredit) for demo purposes, PiML supports custom data uploading for
# model development and validation.
#
# This example notebook demonstrates how to upload/read custom data in two ways.
#
# 1. **Upload new data** by the `piml.Experiment.data_loader()` widget
#    (with file size limit 10MB)
# 2. **Manually read data** by `pandas.read_csv()`, then register it to
#    `piml.Experiment`.
#
# For simplicity, we employ the CASP dataset you may download from
# `https://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv`
# (~3.4MB, with details
# [here](https://archive.ics.uci.edu/ml/datasets/Physicochemical+Properties+of+Protein+Tertiary+Structure))
# and save it to your local drive.

# # Stage 0: Install PiML package on Google Colab
#
# 1. Run `!pip install piml` to install the latest version of PiML
# 2. In Colab, you'll need to restart the runtime in order to use the newly
#    installed PiML version.

# In[ ]:

# NOTE: `get_ipython()` is only defined when running inside IPython/Jupyter;
# this line comes from the notebook's `!pip install piml` shell-escape cell.
get_ipython().system('pip install piml')

# # Stage 1: Initialize an experiment, Load and Prepare data

# In[1]:

from piml import Experiment

exp = Experiment()

# In[2]:

# The first way: upload new data from your local drive by the widget
# (with file size limit 10MB). Choose "Upload new data".
exp.data_loader()

# In[3]:

# The second way: manually read data from your local drive or a URL,
# then register it to piml.Experiment.
import pandas as pd

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv")
exp.data_loader(data=data)

# In[4]:

exp.data_summary()

# In[5]:

# Select RMSD as the target variable and click "UPDATE"; this is a regression task.
exp.data_prepare()

# In[6]:

exp.feature_select()

# In[8]:

exp.eda()

# # Stage 2. Train interpretable models

# In[9]:

# Choose GLM, GAM, Tree models with default settings, click "RUN" to train;
# after training is finished, register the three trained models one by one.
exp.model_train()

# # Stage 3. Explain and Interpret

# In[10]:

# Model-agnostic post-hoc explanation by Permutation Feature Importance,
# PDP (1D and 2D) vs. ALE (1D and 2D), LIME vs. SHAP.
exp.model_explain()

# In[11]:

# Model-specific inherent interpretation including feature importance,
# main effects, and pairwise interactions.
exp.model_interpret()

# # Stage 4. Diagnose and Compare

# In[12]:

exp.model_diagnose()

# In[13]:

exp.model_compare()

# In[ ]: