#!/usr/bin/env python
# coding: utf-8

# In[1]:


import datetime

import cudf
import pandas as pd
import numpy as np
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
import whylogs
from whylogs.viz import ProfileVisualizer

import warnings
warnings.simplefilter('ignore')


# In[2]:


heart_data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data",
    header=None,  # the file ships without a header row, so don't consume the first record as one
)


# This dataset encodes missing values as `?`, so we will replace them with `np.nan`, cast the columns to numeric, and then fill the remaining gaps with each column's median.

# In[3]:


heart_data.replace("?", np.nan, inplace=True)
heart_data = heart_data.astype('float32')  # cast before taking medians so `median()` sees numeric columns
heart_data.fillna(heart_data.median(), inplace=True)


# The dataset does not come packaged with a header (hence `header=None` above), so we will add the column labels next.
#
# More information about the data (including the header labels we'll add) can be found in `heart-disease.names`:
# https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/heart-disease.names

# In[4]:


heart_data.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                      'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']


# Finally, we will convert this pandas DataFrame to an NVIDIA cuDF DataFrame that lives on the GPU.

# In[5]:


cu_heart_data = cudf.from_pandas(heart_data)
cu_heart_data


# The above is our first view of the data as a cuDF DataFrame on the GPU using NVIDIA's RAPIDS toolkit.
#
# For simplicity, we'll take the first 200 rows as training data and split the remainder into three batches of testing data.

# In[7]:


X_train = cu_heart_data.drop("target", axis=1)[:200]
y_train = cu_heart_data["target"][:200]

X_test1 = cu_heart_data.drop("target", axis=1)[200:231]
y_test1 = cu_heart_data["target"][200:231]

X_test2 = cu_heart_data.drop("target", axis=1)[231:262]
y_test2 = cu_heart_data["target"][231:262]

X_test3 = cu_heart_data.drop("target", axis=1)[262:]
y_test3 = cu_heart_data["target"][262:]


# In addition to cuDF, we will use cuML to train a random forest classifier on the `target` variable.

# In[8]:


cuml_model = RandomForestClassifier(n_estimators=40, max_depth=16,
                                    max_features=1.0, random_state=1095)
cuml_model.fit(X_train, y_train)


# At this point, we want to log the data that we've used for training and testing with WhyLogs.
#
# To do so, we first create a WhyLogs logging session using `get_or_create_session()`. For logging, we can create a new, empty profile and track our cuDF DataFrame directly from the GPU data structure.
#
# In this example, we will do all of this in memory instead of writing to a file.

# In[9]:


session = whylogs.get_or_create_session()
profile = session.new_profile(dataset_name="cudf-example")
profile.track_dataframe(cu_heart_data)


# One output from WhyLogs is a summary object that can be exported as a pandas DataFrame.

# In[10]:


summary = profile.flat_summary()["summary"]
summary


# We should also log the data on each batch of inference for our model.

# In[11]:


profiles = []

# Model inference on the first batch
output1 = X_test1.reset_index(drop=True)
output1["pred"] = cuml_model.predict(X_test1)

# Log the batch using WhyLogs, backdating the timestamp by two days for illustration
profiles.append(session.profile_dataframe(output1, dataset_timestamp=datetime.datetime.now() - datetime.timedelta(days=2)))
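# Before repeating this process for the remaining batches, it's worth a quick sanity check of the model itself. Below is a minimal sketch using cuML's `accuracy_score` (imported above but otherwise unused here); it assumes the `predict` output is aligned row-for-row with `y_test1`.

# In[ ]:


# Compare the first batch's predictions against its held-out targets
batch1_accuracy = accuracy_score(y_test1, output1["pred"])
print(f"Batch 1 accuracy: {float(batch1_accuracy):.3f}")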
# Let's repeat this process for our remaining batches.

# In[12]:


# Inference and logging on the second batch, backdated by one day
output2 = X_test2.reset_index(drop=True)
output2["pred"] = cuml_model.predict(X_test2)
profiles.append(session.profile_dataframe(output2, dataset_timestamp=datetime.datetime.now() - datetime.timedelta(days=1)))

# Inference and logging on the third batch, stamped with the current time
output3 = X_test3.reset_index(drop=True)
output3["pred"] = cuml_model.predict(X_test3)
profiles.append(session.profile_dataframe(output3, dataset_timestamp=datetime.datetime.now()))


# In addition to profiling and logging the data, WhyLogs provides a convenient visualization module that is most useful when comparing profiles across multiple batches.

# In[13]:


profiles


# In[16]:


viz = ProfileVisualizer()
viz.set_profiles(profiles)
viz.plot_distribution("chol")


# In[17]:


viz.plot_uniqueness("age")
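# Because we logged the model outputs alongside the features, the same visualizer can be pointed at the `pred` column to watch for prediction drift across the three batches.

# In[ ]:


viz.plot_distribution("pred")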