#!/usr/bin/env python
# coding: utf-8
# # Network Visualisation
#
# Some examples of visualising networks in Jupyter notebooks.
# ## Create Network
#
# We can create a network from a simple edgelist, which is to say, a data object with two columns representing undirected edges between values in the two columns, or directed edges going from the entry in one column to the entry in another column.
# In[ ]:
#%pip install --upgrade pandas
import pandas as pd

# Toy edge list: one row per edge, running from the 'from' node to the 'to' node.
df = pd.DataFrame(
    [('a', 'b'), ('b', 'c'), ('c', 'd'), ('c', 'a')],
    columns=['from', 'to'],
)
# Bare expression: rendered as the cell output when run in a notebook.
df
# We can write that data to a CSV file:
# In[ ]:
from itertools import islice

# Persist the edge list without the index column.
df.to_csv('dummy_graph.csv', index=False)
# Preview the first ten lines of the file (what the shell's `head` prints) in
# pure Python, so this also works outside IPython and without a Unix shell.
with open('dummy_graph.csv', encoding='utf-8') as csv_file:
    print(''.join(islice(csv_file, 10)), end='')
# And read it back in again:
# In[ ]:
# Drop the in-memory frame before reloading. read_csv below rebinds `df`
# regardless, so this presumably just makes the round trip explicit.
df = None
# Load the edge list back from the CSV file.
df = pd.read_csv('dummy_graph.csv')
# Bare expression: rendered as the cell output when run in a notebook.
df
# Create a graph object using `networkx` graphing package.
# In[202]:
#%pip install --upgrade networkx
import networkx as nx

# If we don't set the `create_using` argument, we generate an undirected graph
DG = nx.from_pandas_edgelist(
    df,
    source='from',
    target='to',
    create_using=nx.DiGraph(),
)
# There are some default plotting packages in `networkx` but the diagrams they generate often look quite scrappy. There are various third party layout packages that can generate prettier diagrams.
# In[203]:
# This force atlas layout algorithm often generates "reliable" layouts...
#https://github.com/bhargavchippada/forceatlas2
#%pip install fa2
from fa2 import ForceAtlas2
import matplotlib.pyplot as plt
# Layout engine configuration; one keyword per line, grouped by purpose.
forceatlas2 = ForceAtlas2(
    # --- Behavior alternatives ---
    outboundAttractionDistribution=True,  # Dissuade hubs
    linLogMode=False,  # NOT IMPLEMENTED
    adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
    edgeWeightInfluence=1.0,
    # --- Performance ---
    jitterTolerance=1.0,  # Tolerance
    barnesHutOptimize=True,
    barnesHutTheta=1.2,
    multiThreaded=False,  # NOT IMPLEMENTED
    # --- Tuning ---
    scalingRatio=2.0,
    strongGravityMode=False,
    gravity=5.0,
    # --- Log ---
    verbose=True,
)
# We can use the `forceatlas2` layout algorithm to generate layout co-ordinates for the nodes in the `networkx` graph:
# In[204]:
# Run 2000 layout iterations over DG with no starting positions (pos=None).
# The result is indexed later as positions[node][0] / positions[node][1], so it
# is presumably a mapping from node to (x, y) - confirm against the fa2 docs.
positions = forceatlas2.forceatlas2_networkx_layout(DG, pos=None, iterations=2000)
# We can then lay out and draw the graph using the positions from the force atlas algorithm:
# In[205]:
plt.figure(figsize=(40, 20))

# Nodes: translucent blue markers at the computed positions
nx.draw_networkx_nodes(DG, pos=positions, alpha=0.4, node_color="blue")

# Labels: node names drawn at the same positions
nx.draw_networkx_labels(DG, pos=positions)

# Edges: translucent green arcs
nx.draw_networkx_edges(
    DG,
    pos=positions,
    connectionstyle='arc3,rad=0.2',  # edge "bendiness"
    alpha=0.5,  # edge transparency
    edge_color="green",  # edge colour
);
# connectionstyle requires directed graph
# Maybe also as an alternative: https://github.com/beyondbeneath/bezier-curved-edges-networkx
# One of the easiest ways to weight node sizes is by degree. We can obtain the degree of each node directly from the graph:
# In[206]:
# Node -> degree lookup taken from the graph's degree view
d = dict(DG.degree())
d
# Scale the size by degree:
# In[207]:
# One marker size per node, in the same order as the degree lookup
sizes = [500 * degree for degree in d.values()]
# Now render the graph with the size set:
# In[208]:
# Draw nodes
nx.draw_networkx_nodes(DG, positions, node_size=sizes, node_color="blue", alpha=0.4)
# Draw labels
nx.draw_networkx_labels(DG, positions)
# Draw edges
nx.draw_networkx_edges(DG, positions, edge_color="green", alpha=0.5, connectionstyle='arc3,rad=0.2');
# We can use alternative labels based on a lookup from each node.
# In[209]:
# Lookup table pairing each node identifier with the text to display for it.
df_labels = pd.DataFrame(
    [('d', 'D'), ('b', 'B'), ('c', 'C'), ('a', 'A')],
    columns=['node', 'label'],
)
df_labels
# We need to pass this as a `dict`:
# In[210]:
# Index by node, then turn the 'label' column into a {node: label} mapping.
labels_map = df_labels.set_index('node')['label'].to_dict()
labels_map
# Plot using these labels:
# In[211]:
# Draw nodes
nx.draw_networkx_nodes(DG, positions, node_size=sizes, node_color='blue', alpha=0.4)
# Draw labels using the labels_map labels
nx.draw_networkx_labels(DG, positions, labels = labels_map)
# Draw edges
nx.draw_networkx_edges(DG, positions, edge_color="green", alpha=0.5, connectionstyle='arc3,rad=0.2');
# We can colour nodes if we pass in a list of colours in the node order:
# In[212]:
# Colour assigned to each node identifier
node_colors = {'a': 'red', 'b': 'blue', 'c': 'green', 'd': 'yellow'}
# One colour per node, listed in the graph's own node order
node_colour_map = [node_colors[n] for n in DG.nodes()]
node_colour_map
# In[213]:
# Draw nodes
nx.draw_networkx_nodes(DG, positions, node_size=sizes, node_color=node_colour_map, alpha=0.4)
# Draw labels using the labels_map labels
nx.draw_networkx_labels(DG, positions, labels = labels_map)
# Draw edges
nx.draw_networkx_edges(DG, positions, edge_color="green", alpha=0.5, connectionstyle='arc3,rad=0.2');
# The label layout is not ideal. We can offset the labels using an algorithm that will offset the label, and also try to prevent overlapping labels.
# In[214]:
# By default the node value is used as the label; pass a `labels` dict to
# plot_adjusted_labels() to show arbitrary text instead.
#https://github.com/Phlya/adjustText
# Takes a long time to run
#%pip install adjustText
from adjustText import adjust_text


def plot_adjusted_labels(DG, adjust=True, resizer=1, labels=None):
    """Draw node labels on the current matplotlib figure.

    This function reads two module-level names rather than taking them as
    arguments: ``positions`` (indexed as ``positions[node][0]`` and
    ``positions[node][1]``) and ``sizes`` (indexed by the enumeration order of
    ``positions``). Both must describe the graph being labelled.

    Parameters:
        DG: graph whose labels are drawn; only used when ``adjust`` is falsy.
        adjust: when truthy, place one text per entry of ``positions`` and let
            ``adjust_text`` reposition them; otherwise fall back to
            ``nx.draw_networkx_labels``.
        resizer: multiplier applied to each entry of ``sizes`` to obtain the
            font size of the corresponding label (adjusted mode only).
        labels: optional ``{node: text}`` mapping. In adjusted mode, nodes
            missing from the mapping fall back to the node value itself; in
            plain mode the mapping is handed to ``nx.draw_networkx_labels``
            unchanged.
    """
    if not adjust:
        nx.draw_networkx_labels(DG, positions, labels=labels)
        return

    label_lookup = {} if labels is None else labels
    texts = [
        plt.text(
            positions[node][0],
            positions[node][1],
            label_lookup.get(node, node),
            fontsize=sizes[i] * resizer,
        )
        for i, node in enumerate(positions)
    ]
    # Thin black connector lines link each moved label to its original anchor.
    adjust_text(texts, arrowprops=dict(arrowstyle="-", color='k', lw=0.5))
# *The following is not a great example!*
# In[215]:
# Draw nodes
nx.draw_networkx_nodes(DG, positions, node_size=sizes, node_color="blue", alpha=0.4)
# Draw edges
nx.draw_networkx_edges(DG, positions, edge_color="green", alpha=0.5, connectionstyle='arc3,rad=0.2');
plot_adjusted_labels(DG, resizer=0.02)
# ## Trying it with real data
# In[216]:
# TO DO
# ## Additional Tweaks
#
# In a large graph, we may want to limit the plotting of nodes to nodes above a certain degree. The following will create a filtered graph containing just nodes with large degree in the original graph:
# In[217]:
def filter_graph_by_degree(DG, mindegree=1):
    """Build a new directed graph from the edges between well-connected nodes.

    An edge of ``DG`` is kept only when both of its endpoints have a degree
    (as reported by ``DG.degree()``) of at least ``mindegree``. The result is
    built purely from those edges, so a node only appears in it if at least
    one of its edges is kept.

    Parameters:
        DG: source graph.
        mindegree: minimum degree, in ``DG``, required of both endpoints.

    Returns:
        A new ``nx.DiGraph`` containing the retained edges.
    """
    degree_of = dict(DG.degree())
    kept_edges = [
        edge
        for edge in DG.edges()
        if degree_of[edge[0]] >= mindegree and degree_of[edge[1]] >= mindegree
    ]
    filtered = nx.DiGraph()
    filtered.add_edges_from(kept_edges)
    return filtered
# # Example Influence Network from Wikipedia/DBpedia
#
# The following is an example influence network using data from DBpedia.
# First some utility functions for running queries:
# In[218]:
#%pip install --upgrade SPARQLWrapper
#from linkeddataquery import SPARQLWrapper, dfResults, runQuery
# Import the necessary packages
from SPARQLWrapper import SPARQLWrapper, JSON
# Add some helper functions
# A function that will return the results of running a SPARQL query with
# a defined set of prefixes over a specified endpoint.
# It follows the same five-step process apart from creating the query, which
# is provided as an argument to the function.
def runQuery(endpoint, prefix, q):
    """Run a SPARQL query against an endpoint and return the converted result.

    Parameters:
        endpoint: address handed to ``SPARQLWrapper``.
        prefix: string of prefix declarations, placed in front of the query.
        q: the query text itself.

    Returns:
        Whatever ``query().convert()`` yields with the return format set to
        ``JSON``.
    """
    full_query = prefix + q  # prefixes first, then the query body
    client = SPARQLWrapper(endpoint)
    client.setReturnFormat(JSON)
    client.setQuery(full_query)
    response = client.query()
    return response.convert()
# Import pandas to provide facilities for creating a DataFrame to hold results
import pandas as pd
# Function to convert query results into a DataFrame
# The results are assumed to be in JSON format and therefore the Python dictionary will have
# the results indexed by 'results' and then 'bindings'.
def dict2df(results):
    """Flatten SPARQL JSON results into a DataFrame.

    Parameters:
        results: dictionary whose ``["results"]["bindings"]`` entry is a list
            of rows, each row mapping a variable name to a dictionary that
            has a ``'value'`` key.

    Returns:
        A ``pandas.DataFrame`` with one row per binding and one column per
        variable name, holding each variable's ``'value'``.
    """
    bindings = results["results"]["bindings"]
    records = [
        {name: cell['value'] for name, cell in row.items()}
        for row in bindings
    ]
    return pd.DataFrame(records)
# Function to run a query and return results in a DataFrame
def dfResults(endpoint, prefix, q):
    """Run a SPARQL query and return its results as a DataFrame.

    Thin composition of ``runQuery`` (executes ``prefix + q`` against
    ``endpoint``) and ``dict2df`` (flattens the result into a DataFrame).
    """
    raw_results = runQuery(endpoint, prefix, q)
    return dict2df(raw_results)
# Print a limited number of results of a query
def printQuery(results, limit=''):
    """Print SPARQL query results as ``name: value`` lines.

    Parameters:
        results: dictionary whose ``["results"]["bindings"]`` entry is a list
            of rows, each row mapping a variable name to a dictionary that
            has a ``'value'`` key.
        limit: the empty string (default) prints every row; any other value
            is used as the upper bound of a slice over the rows.

    Each row is followed by a blank line.
    """
    bindings = results["results"]["bindings"]
    rows = bindings if limit == '' else bindings[:limit]
    for row in rows:
        for name, cell in row.items():
            print('{0}: {1}'.format(name, cell['value']))
        print()
# Run a query and print out a limited number of results
def printRunQuery(endpoint, prefix, q, limit=''):
    """Run a SPARQL query and print its results.

    The query (``prefix + q``) is executed against ``endpoint`` via
    ``runQuery`` and the outcome is handed, together with ``limit``, to
    ``printQuery``.
    """
    printQuery(runQuery(endpoint, prefix, q), limit)
# Define the endpoint:
# In[219]:
# Address of the SPARQL endpoint that the queries below are sent to.
endpoint="http://dbpedia.org/sparql"
# NOTE(review): runQuery() builds its own SPARQLWrapper on every call, so this
# instance does not appear to be used by the code visible in this file.
sparql = SPARQLWrapper(endpoint)
# Set up some handy prefixes:
# In[220]:
# NOTE(review): every IRI had been lost from these two strings (each PREFIX had
# no <...> target, and the class and predicate terms of the query were blank),
# which is not valid SPARQL. The foaf/dct/dbr/dbo IRIs below are the standard
# namespaces; the gephi namespace, the Philosopher class and the `influenced`
# predicate are best-guess reconstructions - confirm against the intended query.
prefix='''
prefix gephi: <http://gephi.org/>
prefix foaf: <http://xmlns.com/foaf/0.1/>
prefix dct: <http://purl.org/dc/terms/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>
'''
# Create a query over the influence network of philosophers:
# In[221]:
# Both ends of each edge must be philosophers; the names (not the resource
# IRIs) are returned so they can be used directly as node labels.
q_philosophers = '''
SELECT ?philosopherName, ?influenceName WHERE {
?philosopher a <http://dbpedia.org/ontology/Philosopher>.
?influence a <http://dbpedia.org/ontology/Philosopher>.
?philosopher <http://dbpedia.org/ontology/influenced> ?influence.
?philosopher foaf:name ?philosopherName.
?influence foaf:name ?influenceName.
} LIMIT 10000
'''
# Run the query:
# In[222]:
# Fetch the (philosopherName, influenceName) pairs as a DataFrame
df2 = dfResults(endpoint, prefix, q_philosophers)
df2
# In[223]:
# Create a graph from the dataframe
DG = nx.from_pandas_edgelist(
    df2,
    source='philosopherName',
    target='influenceName',
    create_using=nx.DiGraph(),
)
#The graph is quite large, so simplify it to just philosophers with significant degree
DF = filter_graph_by_degree(DG, mindegree=40)
# Number of edges left after filtering
DF.size()
# Render the significant influence network:
# In[227]:
# TO DO - ipywidget thing to let us interact with useful ForceAtlas parameters
forceatlas2 = ForceAtlas2(
    # Behavior alternatives
    outboundAttractionDistribution=True,  # Dissuade hubs
    linLogMode=False,  # NOT IMPLEMENTED
    adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
    edgeWeightInfluence=1.0,
    # Performance
    jitterTolerance=1.0,  # Tolerance
    barnesHutOptimize=True,
    barnesHutTheta=1.2,
    multiThreaded=False,  # NOT IMPLEMENTED
    # Tuning
    scalingRatio=2.0,
    strongGravityMode=False,
    gravity=5.0,
    # Log
    verbose=True,
)
# Node sizing: degree within the filtered graph, times a scale factor of 1
d = dict(DF.degree)
sizes = [v * 1 for v in d.values()]
# Node locations
positions = forceatlas2.forceatlas2_networkx_layout(DF, pos=None, iterations=2000)
# Give ourselves a reasonable plot size to work with
plt.figure(figsize=(20, 20))
# `with_labels` is not a draw_networkx_nodes argument (it belongs to
# nx.draw / nx.draw_networkx) and raises TypeError on NetworkX releases that
# no longer accept extra keywords here. Labels are drawn separately below.
nx.draw_networkx_nodes(DF, positions, node_size=sizes, node_color="blue", alpha=0.4)
nx.draw_networkx_edges(DF, positions, edge_color="green", alpha=0.2, connectionstyle='arc3,rad=0.2')
# Position adjusted labels - this step may take some time
# Tweak the previous filter step to reduce graph size and speed things up
# Pass the scale factor by keyword: the second positional parameter of
# plot_adjusted_labels is `adjust`, not `resizer`.
plot_adjusted_labels(DF, resizer=0.3)
# In[ ]: