#!/usr/bin/env python
# coding: utf-8

# ## 论文引用网络中的节点分类任务
# 
# 在这一教程中，我们将展示 GraphScope 如何结合图分析、图查询和图学习的能力，处理论文引用网络中的节点分类任务。
# 
# 
# 在这个例子中我们使用的是 [ogbn-mag](https://ogb.stanford.edu/docs/nodeprop/#ogbn-mag) 数据集。"ogbn" 是由微软学术关系图（Microsoft Academic Graph）的子集组成的异构图网络。该图中包含4种类型的实体（即论文、作者、机构和研究领域），以及连接两个实体的四种类型的有向关系边。
# 
# 我们需要处理的任务是，给出异构的 ogbn-mag 数据，在该图上预测每篇论文的类别。这是一个节点分类任务，该任务可以归类在各个领域、各个方向或研究小组的论文，通过对论文属性和引用图上的结构信息对论文进行分类。在该数据中，每个论文节点包含了一个从论文标题、摘要抽取的 128 维 word2vec 向量作为表征，该表征是经过预训练提前获取的；而结构信息是在计算过程中即时计算的。
# 
# 这一教程将会分为以下几个步骤：
# 
# - 通过gremlin交互式查询图；
# - 执行图算法做图分析；
# - 执行基于图数据的机器学习任务；

# In[ ]:


# Install graphscope package if you are NOT in the Playground

get_ipython().system('pip3 install graphscope')
get_ipython().system('pip3 uninstall -y importlib_metadata  # Address an module conflict issue on colab.google. Remove this line if you are not on colab.')


# In[ ]:


# Import the graphscope module

import graphscope

graphscope.set_option(show_log=False)  # enable logging


# In[ ]:


# Load the obgn_mag dataset as a graph

from graphscope.dataset import load_ogbn_mag

graph = load_ogbn_mag()


# ## Interactive query with gremlin
# 
# 在此示例中，我们启动了一个交互查询引擎，然后使用图遍历来查看两位给定作者共同撰写的论文数量。为了简化查询，我们假设作者可以分别由ID 2 和 4307 唯一标识。

# In[ ]:


# Get the entrypoint for submitting Gremlin queries on graph g.
interactive = graphscope.gremlin(graph)

# Count the number of papers two authors (with id 2 and 4307) have co-authored.
papers = interactive.execute(
    "g.V().has('author', 'id', 2).out('writes').where(__.in('writes').has('id', 4307)).count()"
).one()
print("result", papers)


# ## Graph analytics with analytical engine
# 
# 继续我们的示例，下面我们在图数据中进行图分析来生成节点结构特征。我们首先通过在特定周期内从全图中提取论文（使用Gremlin！）来导出一个子图，然后运行 k-core 分解和三角形计数以生成每个论文节点的结构特征。

# In[ ]:


# Exact a subgraph of publication within a time range.
sub_graph = interactive.subgraph("g.V().has('year', inside(2014, 2020)).outE('cites')")

# Project the subgraph to simple graph by selecting papers and their citations.
simple_g = sub_graph.project(vertices={"paper": []}, edges={"cites": []})
# compute the kcore and triangle-counting.
kc_result = graphscope.k_core(simple_g, k=5)
tc_result = graphscope.triangles(simple_g)

# Add the results as new columns to the citation graph.
sub_graph = sub_graph.add_column(kc_result, {"kcore": "r"})
sub_graph = sub_graph.add_column(tc_result, {"tc": "r"})


# ## Graph neural networks (GNNs)
# 
# 接着我们利用生成的结构特征和原有特征通过 GraphScope 的学习引擎来训练一个学习模型。
# 
# 在本例中，我们训练了 GraphSAGE 模型，将节点（论文）分类为349个类别，每个类别代表一个出处（例如预印本和会议）。

# In[ ]:


# Define the features for learning,
# we chose original 128-dimension feature and k-core, triangle count result as new features.
paper_features = []
for i in range(128):
    paper_features.append("feat_" + str(i))
paper_features.append("kcore")
paper_features.append("tc")

# Launch a learning engine. here we split the dataset, 75% as train, 10% as validation and 15% as test.
lg = graphscope.graphlearn(
    sub_graph,
    nodes=[("paper", paper_features)],
    edges=[("paper", "cites", "paper")],
    gen_labels=[
        ("train", "paper", 100, (0, 75)),
        ("val", "paper", 100, (75, 85)),
        ("test", "paper", 100, (85, 100)),
    ],
)

# Then we define the training process using the example GraphSAGE model with tensorflow.
try:
    # https://www.tensorflow.org/guide/migrate
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()
except ImportError:
    import tensorflow as tf

import graphscope.learning
from graphscope.learning.examples import EgoGraphSAGE
from graphscope.learning.examples import EgoSAGESupervisedDataLoader
from graphscope.learning.examples.tf.trainer import LocalTrainer

# supervised GraphSAGE.
def train_sage(graph, node_type, edge_type, class_num, features_num,
              hops_num=2, nbrs_num=[25, 10], epochs=2,
              hidden_dim=256, in_drop_rate=0.5, learning_rate=0.01,
):
    graphscope.learning.reset_default_tf_graph()

    dimensions = [features_num] + [hidden_dim] * (hops_num - 1) + [class_num]
    model = EgoGraphSAGE(dimensions, act_func=tf.nn.relu, dropout=in_drop_rate)

    # prepare train dataset
    train_data = EgoSAGESupervisedDataLoader(
        graph, graphscope.learning.Mask.TRAIN,
        node_type=node_type, edge_type=edge_type, nbrs_num=nbrs_num, hops_num=hops_num,
    )
    train_embedding = model.forward(train_data.src_ego)
    train_labels = train_data.src_ego.src.labels
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=train_labels, logits=train_embedding,
        )
    )
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    # prepare test dataset
    test_data = EgoSAGESupervisedDataLoader(
        graph, graphscope.learning.Mask.TEST,
        node_type=node_type, edge_type=edge_type, nbrs_num=nbrs_num, hops_num=hops_num,
    )
    test_embedding = model.forward(test_data.src_ego)
    test_labels = test_data.src_ego.src.labels
    test_indices = tf.math.argmax(test_embedding, 1, output_type=tf.int32)
    test_acc = tf.div(
        tf.reduce_sum(tf.cast(tf.math.equal(test_indices, test_labels), tf.float32)),
        tf.cast(tf.shape(test_labels)[0], tf.float32),
    )

    # train and test
    trainer = LocalTrainer()
    trainer.train(train_data.iterator, loss, optimizer, epochs=epochs)
    trainer.test(test_data.iterator, test_acc)

train_sage(lg, node_type="paper", edge_type="cites",
          class_num=349,  # output dimension
          features_num=130,  # input dimension, 128 + kcore + triangle count
)