#!/usr/bin/env python
# coding: utf-8

# In[1]:


import datetime

import cudf
import pandas as pd
import numpy as np
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
import whylogs
from whylogs.viz import ProfileVisualizer

import warnings
warnings.simplefilter('ignore')


# In[2]:


heart_data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data",
    header=None,  # the file ships without a header row, so don't consume the first record as one
)


# This dataset encodes missing values as `?`, so we will replace them with `np.nan`, cast the columns to numeric, and then fill the remaining gaps with each column's median.

# In[3]:


heart_data.replace("?", np.nan, inplace=True)
heart_data = heart_data.astype('float32')  # cast before taking medians so `median()` sees numeric columns
heart_data.fillna(heart_data.median(), inplace=True)


# The dataset does not come packaged with a header (hence `header=None` above), so we will add the column labels next.
#
# More information about the data (including the header labels we'll add) can be found in `heart-disease.names`:
# https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/heart-disease.names

# In[4]:


heart_data.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                      'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']


# Finally, we will convert this pandas DataFrame to an NVIDIA cuDF DataFrame that lives on the GPU.

# In[5]:


cu_heart_data = cudf.from_pandas(heart_data)
cu_heart_data


# The above is our first view of the data as a cuDF DataFrame on the GPU using NVIDIA's RAPIDS toolkit.
#
# For simplicity, we'll take the first 200 rows as training data and split the remainder into three batches of testing data.

# In[7]:


X_train = cu_heart_data.drop("target", axis=1)[:200]
y_train = cu_heart_data["target"][:200]

X_test1 = cu_heart_data.drop("target", axis=1)[200:231]
y_test1 = cu_heart_data["target"][200:231]

X_test2 = cu_heart_data.drop("target", axis=1)[231:262]
y_test2 = cu_heart_data["target"][231:262]

X_test3 = cu_heart_data.drop("target", axis=1)[262:]
y_test3 = cu_heart_data["target"][262:]


# In addition to cuDF, we will use cuML to train a random forest classifier on the `target` variable.

# In[8]:


cuml_model = RandomForestClassifier(n_estimators=40, max_depth=16,
                                    max_features=1.0, random_state=1095)
cuml_model.fit(X_train, y_train)


# At this point, we want to log the data that we've used for training and testing with WhyLogs.
#
# To do so, we first create a WhyLogs logging session using `get_or_create_session()`. For logging, we can create a new, empty profile and track our cuDF DataFrame directly from the GPU data structure.
#
# In this example, we will do all of this in memory instead of writing to a file.

# In[9]:


session = whylogs.get_or_create_session()
profile = session.new_profile(dataset_name="cudf-example")
profile.track_dataframe(cu_heart_data)


# One output from WhyLogs is a summary object that can be exported as a pandas DataFrame.

# In[10]:


summary = profile.flat_summary()["summary"]
summary


# We should also log the data on each batch of inference for our model.

# In[11]:


profiles = []

# Model inference on the first batch
output1 = X_test1.reset_index(drop=True)
output1["pred"] = cuml_model.predict(X_test1)

# Log the batch using WhyLogs, backdating the timestamp by two days for illustration
profiles.append(session.profile_dataframe(output1, dataset_timestamp=datetime.datetime.now() - datetime.timedelta(days=2)))
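# Before repeating this process for the remaining batches, it's worth a quick sanity check of the model itself. Below is a minimal sketch using cuML's `accuracy_score` (imported above but otherwise unused here); it assumes the `predict` output is aligned row-for-row with `y_test1`.

# In[ ]:


# Compare the first batch's predictions against its held-out targets
batch1_accuracy = accuracy_score(y_test1, output1["pred"])
print(f"Batch 1 accuracy: {float(batch1_accuracy):.3f}")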
# Let's repeat this process for our remaining batches.

# In[12]:


# Inference and logging on the second batch, backdated by one day
output2 = X_test2.reset_index(drop=True)
output2["pred"] = cuml_model.predict(X_test2)
profiles.append(session.profile_dataframe(output2, dataset_timestamp=datetime.datetime.now() - datetime.timedelta(days=1)))

# Inference and logging on the third batch, stamped with the current time
output3 = X_test3.reset_index(drop=True)
output3["pred"] = cuml_model.predict(X_test3)
profiles.append(session.profile_dataframe(output3, dataset_timestamp=datetime.datetime.now()))


# In addition to profiling and logging the data, WhyLogs provides a convenient visualization module that is most useful when comparing profiles across multiple batches.

# In[13]:


profiles


# In[16]:


viz = ProfileVisualizer()
viz.set_profiles(profiles)
viz.plot_distribution("chol")


# In[17]:


viz.plot_uniqueness("age")
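# Because we logged the model outputs alongside the features, the same visualizer can be pointed at the `pred` column to watch for prediction drift across the three batches.

# In[ ]:


viz.plot_distribution("pred")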