#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import graph as gr


# # Node Classification in Graphs
# 
# In this notebook, we will use *ktrain* to perform node classification on the Cora citation graph. Each node represents a paper pertaining to one of several paper topics, and links represent citations between papers. The features assigned to each node form a multi-hot-encoded vector of the words appearing in the paper. The dataset is available [here](https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz).
# 
# The dataset is already in the form expected by *ktrain*, so let's begin.

# ### STEP 1: Load and Preprocess Data
# 
# We will hold out 10% of the nodes as a test set. Since we set `holdout_for_inductive=False`, the held-out nodes remain in the graph, but only their features (not their labels) are visible to our model; this is referred to as transductive inference. Of the remaining nodes, 10% will be used for training and the rest for validation (also transductive inference). As with the holdout nodes, the features (but not the labels) of validation nodes are available to the model during training. The return value `df_holdout` contains the features of the held-out nodes, and `G_complete` is the original graph including the holdout nodes.

# In[3]:


(train_data, val_data, preproc, df_holdout, G_complete) = gr.graph_nodes_from_csv(
    'data/cora/cora.content',  # node attributes/labels
    'data/cora/cora.cites',    # edge list
    sample_size=20,
    holdout_pct=0.1,
    holdout_for_inductive=False,
    train_pct=0.1,
    sep='\t')


# The `preproc` object includes a reference to the training graph and a dataframe showing the features and target for each node in the graph (both training and validation nodes).

# In[4]:


preproc.df.target.value_counts()


# ### STEP 2: Build a Model and Wrap in Learner Object

# In[5]:


gr.print_node_classifiers()


# In[6]:


learner = ktrain.get_learner(model=gr.graph_node_classifier('graphsage', train_data),
                             train_data=train_data,
                             val_data=val_data,
                             batch_size=64)


# ### STEP 3: Estimate LR
# 
# Given the small number of batches per epoch, a larger number of epochs is required to estimate the learning rate. We cap it at 100 here.

# In[7]:


learner.lr_find(max_epochs=100)


# In[8]:


learner.lr_plot()


# ### STEP 4: Train the Model
# 
# We will train the model using `autofit`, which employs a triangular learning rate policy. Training stops automatically when the validation loss no longer improves. We save the model's weights during training in case we want to reload the weights from a particular epoch later.

# In[9]:


learner.autofit(0.01, checkpoint_folder='/tmp/saved_weights')


# ## Evaluate
# 
# #### Validate

# In[10]:


learner.validate(class_names=preproc.get_classes())


# #### Create a Predictor Object

# In[11]:


p = ktrain.get_predictor(learner.model, preproc)


# #### Transductive Inference: Making Predictions for Validation and Test Nodes in the Original Training Graph
# 
# In transductive inference, we make predictions for unlabeled nodes whose features were visible to the model during training. Making predictions on the validation nodes in the training graph is, therefore, transductive inference.
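# As a minimal sketch of what this looks like in practice (assuming the `p` predictor created above, and that `predict_transductive` accepts a list of node IDs and returns one predicted label per ID when `return_proba=False`):

# In[ ]:


# predict topic labels for the first five validation nodes by node ID
sample_ids = val_data.ids[0:5]
for node_id, label in zip(sample_ids, p.predict_transductive(sample_ids, return_proba=False)):
    print(node_id, label)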
# Let's see how well the model predicts the first validation example.

# In[12]:


p.predict_transductive(val_data.ids[0:1], return_proba=True)


# In[13]:


# the one-hot ground-truth label for the same node
val_data[0][1][0]


# Let's make predictions for all **test** nodes in the holdout set, measure test accuracy, and visually compare some of the predictions with the ground truth.

# In[14]:


y_pred = p.predict_transductive(df_holdout.index, return_proba=False)


# In[15]:


y_true = df_holdout.target.values


# In[16]:


import pandas as pd
pd.DataFrame(list(zip(y_true, y_pred)), columns=['Ground Truth', 'Predicted']).head()


# In[17]:


import numpy as np
(y_true == np.array(y_pred)).mean()


# Our final test accuracy for transductive inference on the holdout nodes is **82.32%**.
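# Since the `value_counts` output above shows the topic classes are not evenly distributed, a per-class breakdown is more informative than overall accuracy alone. A minimal sketch using scikit-learn (assuming the `y_true` and `y_pred` arrays computed above):

# In[ ]:


from sklearn.metrics import classification_report

# per-class precision, recall, and F1 on the holdout nodes
print(classification_report(y_true, y_pred))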
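# Finally, the predictor can be persisted for later use. A minimal sketch, assuming ktrain's usual `save`/`load_predictor` round trip applies to graph predictors (the path below is illustrative):

# In[ ]:


# save the predictor to disk and reload it
p.save('/tmp/cora_node_predictor')
reloaded = ktrain.load_predictor('/tmp/cora_node_predictor')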