#!/usr/bin/env python
# coding: utf-8

# # Unsupervised Graph Learning with GraphSage
# 
# 
# GraphScope provides the capability to process learning tasks. In this tutorial, we demonstrate how GraphScope trains a model with GraphSage.
# 
# The task is link prediction, which estimates the probability of links between nodes in a graph.
# 
# In this task, we use our implementation of the GraphSAGE algorithm to build a model that predicts protein-protein links in the [PPI](https://humgenomics.biomedcentral.com/articles/10.1186/1479-7364-3-3-291) dataset, in which every node represents a protein. The task can be treated as an unsupervised link prediction on a homogeneous link network.
# 
# In this task, GraphSage algorithm would compress both structural and attribute information in the graph into low-dimensional embedding vectors on each node. These embeddings can be further used to predict links between nodes.
# 
# This tutorial has the following steps:
# - Launching the learning engine and attaching to loaded graph.
# - Defining train process with builtin GraphSage model and hyper-parameters
# - Training and evaluating
# 

# In[ ]:


# Install the graphscope package if you are NOT in the Playground.
# Both commands go through the IPython shell hook, so this cell only runs
# inside a notebook / IPython session where get_ipython() is available.

get_ipython().system('pip3 install graphscope')
# Per the inline note in the command below: this works around a module
# conflict on Google Colab; remove the line when not running on Colab.
get_ipython().system('pip3 uninstall -y importlib_metadata  # Address an module conflict issue on colab.google. Remove this line if you are not on colab.')


# In[ ]:


# Import the graphscope module.

import graphscope

graphscope.set_option(show_log=False)  # logging is turned off here; pass show_log=True to enable it


# In[ ]:


# Load the PPI dataset used throughout this tutorial.

from graphscope.dataset import load_ppi

# `graph` is the loaded graph that the learning engine is launched on below.
graph = load_ppi()


# ## Launch learning engine 
# Then, we need to define a feature list for training. The training feature list should be selected from the vertex properties. In this case, we choose all the properties prefixed with "feat-" as the training features.
# 
# With the feature list, next we launch a learning engine with the [graphlearn](https://graphscope.io/docs/reference/session.html#graphscope.Session.graphlearn) method of graphscope.
# 
# In this case, we specify the GraphSage training over "protein" nodes and "link" edges.
# 
# With gen_labels, we take protein nodes as training set.
# 

# In[ ]:


# Vertex property names used as training features: "feat-0" ... "feat-49".
paper_features = ["feat-" + str(idx) for idx in range(50)]

# Launch the learning engine on the loaded graph, restricted to "protein"
# vertices (carrying the features above) and protein-link-protein edges.
lg = graphscope.graphlearn(
    graph,
    nodes=[("protein", paper_features)],
    edges=[("protein", "link", "protein")],
    # NOTE(review): presumably marks parts 0-100 out of 100 (i.e. all protein
    # vertices) as the "train" split - confirm against the graphlearn docs.
    gen_labels=[("train", "protein", 100, (0, 100))],
)


# 
# We use the builtin GraphSage model to define the training process. You can find more details about all the builtin learning models on [Graph Learning Model](https://graphscope.io/docs/learning_engine.html#data-model)
# 
# In the example, we use TensorFlow as the "NN" backend trainer.
# 

# In[ ]:


import numpy as np
import graphscope.learning
from graphscope.learning.examples import GraphSage
from graphscope.learning.graphlearn.python.model.tf.optimizer import get_tf_optimizer
from graphscope.learning.graphlearn.python.model.tf.trainer import LocalTFTrainer

# unsupervised GraphSage.


def train(config, graph):
    """Train the builtin GraphSage model and save the learned node embeddings.

    Args:
        config: Mapping of hyperparameters. Required keys are "class_num",
            "features_num", "batch_size", "categorical_attrs_desc",
            "hidden_dim", "in_drop_rate", "neighs_num", "hops_num",
            "node_type", "edge_type", "full_graph_mode", "unsupervised",
            "epoch", "learning_algo", "learning_rate", "weight_decay" and
            "emb_save_dir". An optional "agg_type" entry is forwarded to
            GraphSage when present.
        graph: Graph object handed to the GraphSage constructor (the
            learning engine instance in this tutorial).

    Returns:
        None. The node embeddings are written with ``np.save`` to the path
        stored under ``config["emb_save_dir"]``.
    """

    def model_fn():
        # Forward the aggregator choice only when the caller configured one,
        # so the "agg_type" setting is no longer silently ignored while
        # configs without that key keep the model's own default.
        optional_kwargs = {}
        if "agg_type" in config:
            optional_kwargs["agg_type"] = config["agg_type"]

        return GraphSage(
            graph,
            config["class_num"],
            config["features_num"],
            config["batch_size"],
            categorical_attrs_desc=config["categorical_attrs_desc"],
            hidden_dim=config["hidden_dim"],
            in_drop_rate=config["in_drop_rate"],
            neighs_num=config["neighs_num"],
            hops_num=config["hops_num"],
            node_type=config["node_type"],
            edge_type=config["edge_type"],
            full_graph_mode=config["full_graph_mode"],
            unsupervised=config["unsupervised"],
            **optional_kwargs
        )

    # NOTE(review): presumably clears TensorFlow graph state left over from a
    # previous run so the cell can be re-executed - confirm in graphscope docs.
    graphscope.learning.reset_default_tf_graph()
    trainer = LocalTFTrainer(
        model_fn,
        epoch=config["epoch"],
        optimizer=get_tf_optimizer(
            config["learning_algo"], config["learning_rate"], config["weight_decay"]
        ),
    )
    trainer.train()

    # Persist the embeddings produced by the trained model.
    embs = trainer.get_node_embedding()
    np.save(config["emb_save_dir"], embs)


# Hyperparameters for the training run; this dict is passed to train().
config = {
    "class_num": 128,  # output dimension
    "features_num": 50,  # matches the 50 "feat-*" properties selected as features
    "batch_size": 512,
    "categorical_attrs_desc": "",
    "hidden_dim": 128,
    "in_drop_rate": 0.5,
    "hops_num": 2,
    "neighs_num": [5, 5],  # presumably one neighbor sample size per hop - TODO confirm
    "full_graph_mode": False,
    "agg_type": "gcn",  # aggregator type; alternatives: mean, sum
    "learning_algo": "adam",
    "learning_rate": 0.01,
    "weight_decay": 0.0005,
    "unsupervised": True,
    "epoch": 1,
    "emb_save_dir": "./id_emb",  # path handed to np.save for the embeddings
    "node_type": "protein",  # same vertex label the learning engine was launched with
    "edge_type": "link",  # same edge label the learning engine was launched with
}


# ## Run training process
# 
# After defining the training process and hyperparameters,
# 
# we can now start the training process with the learning engine "lg" and the hyperparameter configuration.

# In[ ]:


# Run training on the learning engine "lg" with the hyperparameters in `config`.
train(config, lg)