#!/usr/bin/env python
# coding: utf-8

# Script export of a notebook: link prediction on the Cora citation graph
# using ktrain's graph module. Cell markers (`# In[n]:`) are kept from the
# original notebook.

# In[1]:


# NOTE: get_ipython() is not defined or imported in this file; it has to be
# supplied by the surrounding IPython/Jupyter session, so this cell will
# raise NameError under a plain `python` interpreter.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
# GPU selection is set before ktrain is imported (presumably so the
# deep-learning backend sees it when it loads — confirm).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# ## Link Prediction With Graph Neural Networks
#
# In this example, we will use the Cora citation network [available for download here](https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz). Nodes are papers and links represent citations among papers. The objective is to use a small sample of positive (i.e., existing) links and negative (i.e., non-existing) links to build a model that can predict whether two nodes have a citation relationship.
#
# ## STEP 1: Load and Preprocess Dataset
# Let's begin by loading and preprocessing the dataset. By default, *ktrain* will hold out *10%* (i.e., `val_pct=0.1`) of the links for validation (along with an equal number of negative links). An additional `train_pct` of links will be taken as the training set. Here, we set `train_pct=0.1`, which is also the default.

# In[2]:


import ktrain
from ktrain import graph as gr

# load data with supervision ratio of 10%
(trn, val, preproc) = gr.graph_links_from_csv(
    'data/cora/cora.content',  # node attributes/labels
    'data/cora/cora.cites',    # edge list
    train_pct=0.1,
    sep='\t')


# In[3]:


print('original graph: %s nodes and %s edges' % (preproc.G.number_of_nodes(), preproc.G.number_of_edges()))


# In[4]:


print('validation graph: nodes: %s, links:%s' % (val.graph.number_of_nodes(), val.graph.number_of_edges()))


# In[5]:


print('training graph: nodes: %s, links:%s' % (trn.graph.number_of_nodes(), trn.graph.number_of_edges()))


# ## STEP 2: Build a Graph Neural Network for Link Prediction
# Next, we build a graph neural network model. *ktrain* currently supports [GraphSAGE models](https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf) for link prediction.

# In[6]:


gr.print_link_predictors()


# In[7]:


model = gr.graph_link_predictor('graphsage', trn, preproc)


# We will wrap the model and data in a `Learner` object to facilitate training. For instance, let's set the global weight decay to 0.01.

# In[8]:


learner = ktrain.get_learner(model, train_data=trn, val_data=val)


# In[9]:


learner.set_weight_decay(wd=0.01)


# ## STEP 3: Estimate Learning Rate Using Learning-Rate-Finder

# NOTE(review): the cell counters are out of order here (In[11] precedes
# In[10]); as a script the statements simply run in file order.

# In[11]:


learner.lr_find(show_plot=True, max_epochs=10)


# ## STEP 4: Train Model With [1Cycle](https://arxiv.org/pdf/1803.09820.pdf) Learning Rate Schedule

# In[10]:


# presumably a learning rate of 0.01 for 5 epochs — see the ktrain
# `fit_onecycle` documentation to confirm the positional arguments.
learner.fit_onecycle(0.01, 5)


# ## Make Predictions
#
# We will create a `Predictor` object and make predictions. The predict method accepts a `networkx` graph with node features stored as node attributes and a list of edges (tuples of node IDs into the graph). The model will predict whether each edge should exist or not. Since we are making predictions on existing edges in the graph, we expect it to return a 1 (i.e., a string label of 'positive') for each input.

# In[12]:


predictor = ktrain.get_predictor(learner.model, preproc)


# In[18]:


# The bare expressions below were notebook cell outputs; run as a script,
# their return values are computed and then discarded.
predictor.predict(preproc.G, list(preproc.G.edges())[:5])


# In[19]:


predictor.save('/tmp/mylinkpred')


# In[20]:


reloaded_predictor = ktrain.load_predictor('/tmp/mylinkpred')


# In[21]:


reloaded_predictor.get_classes()


# In[22]:


reloaded_predictor.predict(preproc.G, list(preproc.G.edges())[:5], return_proba=True)