#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import graph as gr


# # Node Classification in Graphs
# 
# In this notebook, we will use *ktrain* to perform node classification on the PubMed Diabetes citation graph. In the PubMed graph, each node represents a paper pertaining to one of three topics: *Diabetes Mellitus - Experimental*, *Diabetes Mellitus - Type 1*, and *Diabetes Mellitus - Type 2*. Links represent citations between papers. The attributes or features assigned to each node are a vector of the words appearing in the paper along with their corresponding TF-IDF scores. The dataset is available [here](https://linqs-data.soe.ucsc.edu/public/Pubmed-Diabetes.tgz).
# 
# *ktrain* expects two files for node classification problems. The first is a comma- or tab-delimited file listing the edges in the graph, where each row contains the node IDs forming an edge. The second is a comma- or tab-delimited file listing the features or attributes associated with each node in the graph. The first column in this file is the node ID, and the last column should be a string representing the target or label of the node. All other columns should be numerical features, assumed to be appropriately standardized and non-null.
# 
# We must prepare the raw data to conform to the above before we begin.

# ### Preparing the Data
# The code below will create two files that can be processed directly by *ktrain*:
# - `/tmp/pubmed-nodes.tab`
# - `/tmp/pubmed-edges.tab`

# In[3]:


# set this to the location of the downloaded Pubmed-Diabetes data
DATADIR = 'data/pubmed/Pubmed-Diabetes/data'


# In[4]:


import os.path
import pandas as pd
import itertools

# process links
edgelist = pd.read_csv(os.path.join(DATADIR, 'Pubmed-Diabetes.DIRECTED.cites.tab'),
                       skiprows=2, header=None, delimiter='\t')
edgelist.drop(columns=[0, 2], inplace=True)
edgelist.columns = ['source', 'target']
# strip the 'paper:' prefix from node IDs
# (str.replace is used here because str.lstrip strips a *character set*, not a prefix)
edgelist['source'] = edgelist['source'].map(lambda x: x.replace('paper:', ''))
edgelist['target'] = edgelist['target'].map(lambda x: x.replace('paper:', ''))
edgelist.head()
edgelist.to_csv('/tmp/pubmed-edges.tab', sep='\t', header=None, index=False)

# process nodes and their attributes
nodes_as_dict = []
with open(os.path.join(os.path.expanduser(DATADIR), "Pubmed-Diabetes.NODE.paper.tab")) as fp:
    for line in itertools.islice(fp, 2, None):
        line_res = line.split("\t")
        pid = line_res[0]
        feat_name = ['pid'] + [l.split("=")[0] for l in line_res[1:]][:-1]  # delete summary
        feat_value = [l.split("=")[1] for l in line_res[1:]][:-1]           # delete summary
        feat_value = [pid] + [float(x) for x in feat_value]                 # change to numeric from str
        row = dict(zip(feat_name, feat_value))
        nodes_as_dict.append(row)

# ensure that 'label' is the last column
colnames = set()
for row in nodes_as_dict:
    colnames.update(list(row.keys()))
colnames = list(colnames)
colnames.sort()
colnames.remove('label')
colnames.append('label')

# map the numeric class IDs to target strings
target_dict = {1: 'Diabetes_Mellitus-Experimental',
               2: 'Diabetes_Mellitus-Type_1',
               3: 'Diabetes_Mellitus-Type_2'}

# write out the node features, filling in missing features with 0.0
with open('/tmp/pubmed-nodes.tab', 'w') as fp:
    #fp.write("\t".join(colnames)+'\n')
    for row in nodes_as_dict:
        feats = []
        for col in colnames:
            feats.append(row.get(col, 0.0))
        feats = [str(feat) for feat in feats]
        feats[-1] = round(float(feats[-1]))
        feats[-1] = target_dict[feats[-1]]
        fp.write("\t".join(feats) + '\n')
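# Before loading these files into *ktrain*, it is worth a quick sanity check. The cell below is a minimal
# sketch added for illustration (it is not part of the original preprocessing): it reloads both files with
# pandas and confirms that the node file has one row per node (19,717 for PubMed) with the string label in
# the last column, and that every edge endpoint refers to a known node ID.

# In[ ]:


nodes_check = pd.read_csv('/tmp/pubmed-nodes.tab', sep='\t', header=None)
edges_check = pd.read_csv('/tmp/pubmed-edges.tab', sep='\t', header=None)
print(nodes_check.shape)  # (num_nodes, 1 + num_features + 1): node ID, features, label
print(edges_check.shape)  # (num_edges, 2): source and target node IDs
print(nodes_check.iloc[:, -1].value_counts())  # label distribution over the three topics

# every edge endpoint should appear in the node file
node_ids = set(nodes_check[0].astype(str))
print(edges_check.astype(str).isin(node_ids).all().all())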
# ### STEP 1: Load and Preprocess Data
# 
# We will hold out 20% of the nodes as test nodes by setting `holdout_pct=0.2`. Since we specified `holdout_for_inductive=True`, these held-out nodes are removed from the graph in order to simulate making predictions on new nodes added to the graph later (*inductive inference*). If `holdout_for_inductive=False`, the features (but not the labels) of these nodes are accessible to the model during training. Of the remaining nodes, 5% will be used for training, and the rest will be used for validation (*transductive inference*). More information on transductive and inductive inference and on the return values `df_holdout` and `G_complete` is provided below.
# 
# Note that if there are any unlabeled nodes in the graph, these will automatically be used as held-out nodes for which predictions can be made once the model is trained. See the [twitter example notebook](https://github.com/amaiya/ktrain/blob/master/examples/graphs/hateful_twitter_users-GraphSAGE.ipynb) for an example of this.

# In[5]:


(train_data, val_data, preproc,
 df_holdout, G_complete) = gr.graph_nodes_from_csv('/tmp/pubmed-nodes.tab',
                                                   '/tmp/pubmed-edges.tab',
                                                   sample_size=10,
                                                   holdout_pct=0.2,
                                                   holdout_for_inductive=True,
                                                   train_pct=0.05, sep='\t')


# The `preproc` object includes a reference to the training graph and a DataFrame showing the features and target for each node in the graph (both training and validation nodes).

# In[6]:


preproc.df.target.value_counts()


# ### STEP 2: Build a Model and Wrap in Learner Object

# In[7]:


gr.print_node_classifiers()


# In[8]:


learner = ktrain.get_learner(model=gr.graph_node_classifier('graphsage', train_data),
                             train_data=train_data,
                             val_data=val_data,
                             batch_size=64)


# ### STEP 3: Estimate LR
# Given the small number of batches per epoch, a larger number of epochs is required to estimate the learning rate. We will cap it at 100 here.

# In[9]:


learner.lr_find(max_epochs=100)


# In[10]:


learner.lr_plot()


# ### STEP 4: Train the Model
# We will train the model using `autofit`, which uses a triangular learning rate policy. Since no number of epochs is specified, training will automatically stop when the validation loss no longer improves.

# In[11]:


learner.autofit(0.01)


# ## Evaluate
# 
# #### Validate

# In[12]:


learner.validate(class_names=preproc.get_classes())


# #### Create a Predictor Object

# In[13]:


p = ktrain.get_predictor(learner.model, preproc)


# #### Transductive Inference: Making Predictions for Unlabeled Nodes in the Original Training Graph
# In transductive inference, we make predictions for unlabeled nodes whose features were visible during training. Making predictions on the validation nodes in the training graph is, therefore, transductive inference.
# 
# Let's see how good our prediction is for the first validation example.

# In[14]:


p.predict_transductive(val_data.ids[0:1], return_proba=True)


# In[15]:


val_data[0][1][0]  # ground-truth (one-hot) label of the first validation node


# Let's make predictions for all validation nodes and visually compare some of them with the ground truth.

# In[16]:


y_pred = p.predict_transductive(val_data.ids, return_proba=False)


# In[17]:


y_true = preproc.df[preproc.df.index.isin(val_data.ids)]['target'].values


# In[18]:


import pandas as pd
pd.DataFrame(zip(y_true, y_pred), columns=['Ground Truth', 'Predicted']).head()
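# Since we now have ground-truth labels and predictions for every validation node, we can also compute an
# overall transductive accuracy to compare against the inductive accuracy computed below. This is a minimal
# sketch added for illustration (`sklearn.metrics.accuracy_score` would work just as well):

# In[ ]:


import numpy as np
# fraction of validation nodes whose predicted label matches the ground truth
transductive_acc = (np.array(y_true) == np.array(y_pred)).mean()
print('transductive accuracy: %.4f' % transductive_acc)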
# #### Inductive Inference: Making Predictions for New Nodes Not in the Original Training Graph
# In inductive inference, we make predictions for entirely new nodes that were not present in the training graph. The features or attributes of these nodes were **not** visible during training. Here, we consider a graph in which the held-out nodes are added back into the training graph, which yields the original graph of 19,717 nodes. This graph, `G_complete`, was returned as the last return value of `graph_nodes_from_csv`.

# In[19]:


y_pred = p.predict_inductive(df_holdout, G_complete, return_proba=False)


# In[20]:


y_true = df_holdout['target'].values


# In[21]:


import numpy as np
(y_true == np.array(y_pred)).mean()


# With an accuracy of **83.03%**, inductive performance is quite good and comparable to transductive performance.
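# Finally, the predictor can be saved to disk and reloaded later to make predictions without retraining.
# The sketch below assumes the standard ktrain predictor save/load API (`predictor.save` and
# `ktrain.load_predictor`) applies to node predictors; the path `/tmp/pubmed_predictor` is an arbitrary
# choice for illustration.

# In[ ]:


# persist the model and preprocessing pipeline
p.save('/tmp/pubmed_predictor')

# ...later, in a new session: reload and predict again without retraining
reloaded_p = ktrain.load_predictor('/tmp/pubmed_predictor')
y_pred_reloaded = reloaded_p.predict_inductive(df_holdout, G_complete, return_proba=False)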