#!/usr/bin/env python
# coding: utf-8

# ## Node classification on a paper citation network
#
# In this tutorial we show how GraphScope combines graph analytics, graph
# queries and graph learning to handle a node classification task on a paper
# citation network.
#
#
# This example uses the [ogbn-mag](https://ogb.stanford.edu/docs/nodeprop/#ogbn-mag)
# dataset. ogbn-mag is a heterogeneous network composed of a subset of the
# Microsoft Academic Graph. It contains four types of entities (papers,
# authors, institutions and fields of study) and four types of directed
# relations, each connecting two entities.
#
# Given the heterogeneous ogbn-mag data, the task is to predict the class of
# each paper on this graph. This is a node classification task: papers are
# classified using both their own attributes and the structural information
# of the citation graph. In this dataset every paper node carries a
# 128-dimensional word2vec vector extracted from its title and abstract; that
# representation is pre-trained ahead of time, whereas the structural
# information is computed on the fly.
#
# The tutorial is organised in the following steps:
#
# - querying the graph interactively with Gremlin;
# - running graph algorithms for graph analytics;
# - running a machine learning task on the graph data.

# In[ ]:


# Install graphscope package if you are NOT in the Playground

get_ipython().system('pip3 install graphscope')
get_ipython().system('pip3 uninstall -y importlib_metadata # Address an module conflict issue on colab.google. Remove this line if you are not on colab.')


# In[ ]:


# Import the graphscope module

import graphscope

# Logging stays disabled here; pass show_log=True instead to enable it.
graphscope.set_option(show_log=False)


# In[ ]:


# Load the ogbn-mag dataset as a graph

from graphscope.dataset import load_ogbn_mag

graph = load_ogbn_mag()


# ## Interactive query with gremlin
#
# In this example we launch an interactive query engine and then use a graph
# traversal to count the papers that two given authors have co-authored. To
# keep the query simple, we assume the two authors are uniquely identified by
# the IDs 2 and 4307 respectively.

# In[ ]:


# Get the entrypoint for submitting Gremlin queries on the loaded graph.
interactive = graphscope.gremlin(graph)

# Count the number of papers two authors (with id 2 and 4307) have co-authored.
papers = interactive.execute(
    "g.V().has('author', 'id', 2).out('writes').where(__.in('writes').has('id', 4307)).count()"
).one()
print("result", papers)


# ## Graph analytics with analytical engine
#
# Continuing the example, we now run graph analytics on the data to generate
# structural features for the nodes. We first derive a subgraph by extracting
# the papers published within a given period from the whole graph (using
# Gremlin!), and then run k-core decomposition and triangle counting to
# produce structural features for every paper node.

# In[ ]:


# Extract a subgraph of publications within a time range.
sub_graph = interactive.subgraph("g.V().has('year', inside(2014, 2020)).outE('cites')")

# Project the subgraph to simple graph by selecting papers and their citations.
simple_g = sub_graph.project(vertices={"paper": []}, edges={"cites": []})

# Compute the k-core and triangle-counting results on the projected graph.
kc_result = graphscope.k_core(simple_g, k=5)
tc_result = graphscope.triangles(simple_g)

# Add the results as new columns to the citation graph.
sub_graph = sub_graph.add_column(kc_result, {"kcore": "r"})
sub_graph = sub_graph.add_column(tc_result, {"tc": "r"})


# ## Graph neural networks (GNNs)
#
# Next we use the generated structural features together with the original
# features to train a model with GraphScope's learning engine.
#
# In this example we train a GCN model that classifies the nodes (papers)
# into 349 classes, each class standing for a venue (for example a preprint
# server or a conference).

# In[ ]:


# Define the features for learning,
# we chose original 128-dimension feature and k-core, triangle count result as new features.
paper_features = ["feat_" + str(i) for i in range(128)]
paper_features.append("kcore")
paper_features.append("tc")

# Launch a learning engine. here we split the dataset, 75% as train, 10% as validation and 15% as test.
lg = graphscope.graphlearn(
    sub_graph,
    nodes=[("paper", paper_features)],
    edges=[("paper", "cites", "paper")],
    gen_labels=[
        ("train", "paper", 100, (0, 75)),
        ("val", "paper", 100, (75, 85)),
        ("test", "paper", 100, (85, 100)),
    ],
)

# Then we define the training process, use internal GCN model.
from graphscope.learning.examples import GCN
from graphscope.learning.graphlearn.python.model.tf.optimizer import get_tf_optimizer
from graphscope.learning.graphlearn.python.model.tf.trainer import LocalTFTrainer


def train(config, graph):
    """Build a GCN model from ``config`` and run training plus evaluation.

    Args:
        config: Mapping of hyperparameters. The keys read here are
            ``class_num``, ``features_num``, ``batch_size``,
            ``val_batch_size``, ``test_batch_size``,
            ``categorical_attrs_desc``, ``hidden_dim``, ``in_drop_rate``,
            ``neighs_num``, ``hops_num``, ``node_type``, ``edge_type``,
            ``full_graph_mode``, ``epoch``, ``learning_algo``,
            ``learning_rate`` and ``weight_decay``. Any other key is ignored.
        graph: Object handed to ``GCN`` as its first argument; this script
            passes the value returned by ``graphscope.graphlearn``.

    Returns:
        None. The function only calls ``trainer.train_and_evaluate()``.

    Raises:
        KeyError: If ``config`` is a dict missing one of the keys above.
    """

    def model_fn():
        # The trainer receives this factory rather than a model instance.
        return GCN(
            graph,
            config["class_num"],
            config["features_num"],
            config["batch_size"],
            val_batch_size=config["val_batch_size"],
            test_batch_size=config["test_batch_size"],
            categorical_attrs_desc=config["categorical_attrs_desc"],
            hidden_dim=config["hidden_dim"],
            in_drop_rate=config["in_drop_rate"],
            neighs_num=config["neighs_num"],
            hops_num=config["hops_num"],
            node_type=config["node_type"],
            edge_type=config["edge_type"],
            full_graph_mode=config["full_graph_mode"],
        )

    trainer = LocalTFTrainer(
        model_fn,
        epoch=config["epoch"],
        optimizer=get_tf_optimizer(
            config["learning_algo"], config["learning_rate"], config["weight_decay"]
        ),
    )
    trainer.train_and_evaluate()


# hyperparameters config.
config = {
    "class_num": 349,  # output dimension
    "features_num": 130,  # 128 dimension + kcore + triangle count
    "batch_size": 500,
    "val_batch_size": 100,
    "test_batch_size": 100,
    "categorical_attrs_desc": "",
    "hidden_dim": 256,
    "in_drop_rate": 0.5,
    "hops_num": 2,
    "neighs_num": [5, 10],
    "full_graph_mode": False,
    "agg_type": "gcn",  # mean, sum; this key is not read by train()
    "learning_algo": "adam",
    "learning_rate": 0.01,
    "weight_decay": 0.0005,
    "epoch": 5,
    "node_type": "paper",
    "edge_type": "cites",
}

# Start training and evaluating
train(config, lg)


# In[ ]: