#!/usr/bin/env python
# coding: utf-8

# # Unsupervised Graph Learning with GraphSage
#
# GraphScope provides the capability to process learning tasks. In this
# tutorial, we demonstrate how GraphScope trains a model with GraphSage.
#
# The task is link prediction, which estimates the probability of links
# between nodes in a graph.
#
# In this task, we use our implementation of the GraphSAGE algorithm to build
# a model that predicts protein-protein links in the
# [PPI](https://humgenomics.biomedcentral.com/articles/10.1186/1479-7364-3-3-291)
# dataset, in which every node represents a protein. The task can be treated
# as an unsupervised link prediction on a homogeneous link network.
#
# In this task, the GraphSage algorithm compresses both structural and
# attribute information in the graph into low-dimensional embedding vectors
# on each node. These embeddings can be further used to predict links between
# nodes.
#
# This tutorial has the following steps:
# - Launching the learning engine and attaching to the loaded graph.
# - Defining the training process with the builtin GraphSage model and
#   hyper-parameters
# - Training and evaluating

# In[ ]:


# Install graphscope package if you are NOT in the Playground.
# These calls rely on get_ipython(), so they are meant to run inside an
# IPython/Jupyter session.
get_ipython().system("pip3 install graphscope")
get_ipython().system(
    "pip3 uninstall -y importlib_metadata "
    "# Address an module conflict issue on colab.google. "
    "Remove this line if you are not on colab."
)


# In[ ]:


# Import the graphscope module.
import graphscope

# NOTE(review): the flag is passed as False here; presumably show_log=True is
# the value that actually enables logging -- confirm against the GraphScope docs.
graphscope.set_option(show_log=False)


# In[ ]:


# Load ppi dataset
from graphscope.dataset import load_ppi

graph = load_ppi()


# ## Launch learning engine
# Then, we need to define a feature list for training. The training feature
# list should be selected from the vertex properties. In this case, we choose
# all the properties prefixed with "feat-" as the training features.
#
# With the feature list, next we launch a learning engine with the
# [graphlearn](https://graphscope.io/docs/reference/session.html#graphscope.Session.graphlearn)
# method of graphscope.
#
# In this case, we specify the GraphSage training over "protein" nodes and
# "link" edges.
#
# With gen_labels, we take protein nodes as training set.

# In[ ]:


# define the features for learning: "feat-0" ... "feat-49"
paper_features = ["feat-" + str(i) for i in range(50)]

# launch a learning engine.
lg = graphscope.graphlearn(
    graph,
    nodes=[("protein", paper_features)],
    edges=[("protein", "link", "protein")],
    gen_labels=[
        ("train", "protein", 100, (0, 100)),
    ],
)


# We use the builtin GraphSage model to define the training process. You can
# find more detail about all the builtin learning models on
# [Graph Learning Model](https://graphscope.io/docs/learning_engine.html#data-model)
#
# In the example, we use tensorflow as "NN" backend trainer.

# In[ ]:


import numpy as np

import graphscope.learning
from graphscope.learning.examples import GraphSage
from graphscope.learning.graphlearn.python.model.tf.optimizer import get_tf_optimizer
from graphscope.learning.graphlearn.python.model.tf.trainer import LocalTFTrainer


# unsupervised GraphSage.
def train(config, graph):
    """Train the builtin GraphSage model and save the resulting node embeddings.

    Args:
        config: Mapping of hyperparameters. The keys read here are
            "class_num", "features_num", "batch_size",
            "categorical_attrs_desc", "hidden_dim", "in_drop_rate",
            "neighs_num", "hops_num", "node_type", "edge_type",
            "full_graph_mode", "unsupervised", "epoch", "learning_algo",
            "learning_rate", "weight_decay" and "emb_save_dir". The optional
            key "agg_type" is forwarded to the model when present.
        graph: Object passed to ``GraphSage`` as its graph argument (in this
            script, the learning engine handle ``lg``).

    Side effects:
        Resets the default TF graph, runs training, and writes the node
        embeddings to ``config["emb_save_dir"]`` with ``np.save`` (NumPy
        appends ``.npy`` when the name has no such extension).
    """

    def model_fn():
        """Build the GraphSage model from ``config`` (factory for the trainer)."""
        model_kwargs = {
            "categorical_attrs_desc": config["categorical_attrs_desc"],
            "hidden_dim": config["hidden_dim"],
            "in_drop_rate": config["in_drop_rate"],
            "neighs_num": config["neighs_num"],
            "hops_num": config["hops_num"],
            "node_type": config["node_type"],
            "edge_type": config["edge_type"],
            "full_graph_mode": config["full_graph_mode"],
            "unsupervised": config["unsupervised"],
        }
        # The aggregator choice used to be declared in the config but never
        # reached the model. Forward it only when supplied so that configs
        # without the key keep the model's own default.
        if "agg_type" in config:
            model_kwargs["agg_type"] = config["agg_type"]
        return GraphSage(
            graph,
            config["class_num"],
            config["features_num"],
            config["batch_size"],
            **model_kwargs
        )

    graphscope.learning.reset_default_tf_graph()
    trainer = LocalTFTrainer(
        model_fn,
        epoch=config["epoch"],
        optimizer=get_tf_optimizer(
            config["learning_algo"],
            config["learning_rate"],
            config["weight_decay"],
        ),
    )
    trainer.train()
    embs = trainer.get_node_embedding()
    np.save(config["emb_save_dir"], embs)


# define hyperparameters
config = {
    "class_num": 128,  # output dimension
    "features_num": 50,
    "batch_size": 512,
    "categorical_attrs_desc": "",
    "hidden_dim": 128,
    "in_drop_rate": 0.5,
    "hops_num": 2,
    "neighs_num": [5, 5],
    "full_graph_mode": False,
    "agg_type": "gcn",  # mean, sum
    "learning_algo": "adam",
    "learning_rate": 0.01,
    "weight_decay": 0.0005,
    "unsupervised": True,
    "epoch": 1,
    "emb_save_dir": "./id_emb",
    "node_type": "protein",
    "edge_type": "link",
}


# ## Run training process
#
# After defining the training process and hyperparameters, we can now start
# the training process with the learning engine "lg" and the hyperparameter
# configuration.

# In[ ]:


train(config, lg)