#!/usr/bin/env python
# coding: utf-8

# # Network Visualisation
#
# Some examples of visualising networks in Jupyter notebooks.

# ## Create Network
#
# We can create a network from a simple edgelist, which is to say, a data
# object with two columns representing undirected edges between values in the
# two columns, or directed edges going from the entry in one column to the
# entry in another column.

# In[ ]:

#%pip install --upgrade pandas
import pandas as pd

df = pd.DataFrame({'from': ['a', 'b', 'c', 'c'],
                   'to': ['b', 'c', 'd', 'a']})
df

# We can write that data to a CSV file:

# In[ ]:

df.to_csv('dummy_graph.csv', index=False)

# Preview the first ten lines of the file (what the shell `head` command
# shows). This is done in plain Python rather than via
# `get_ipython().system(...)`, which is only defined inside an IPython
# session and would raise NameError when this file is run as a script.
from itertools import islice

with open('dummy_graph.csv', encoding='utf-8') as csv_file:
    print(''.join(islice(csv_file, 10)), end='')

# And read it back in again:

# In[ ]:

# Drop the in-memory frame first so that what follows really comes from disk.
df = None
df = pd.read_csv('dummy_graph.csv')
df

# Create a graph object using the `networkx` graphing package.

# In[202]:

#%pip install --upgrade networkx
import networkx as nx

# If we don't set the `create_using` attribute, we generate an undirected graph
DG = nx.from_pandas_edgelist(df, 'from', 'to', create_using=nx.DiGraph())

# There are some default plotting packages in `networkx` but the diagrams they
# generate often look quite scrappy. There are various third party layout
# packages that can generate prettier diagrams.

# In[203]:

# This force atlas layout algorithm often generates "reliable" layouts...
#https://github.com/bhargavchippada/forceatlas2
#%pip install fa2
from fa2 import ForceAtlas2
import matplotlib.pyplot as plt

# Layout settings, gathered in one place so they are easy to scan and tweak.
_layout_settings = dict(
    # Behavior alternatives
    outboundAttractionDistribution=True,  # Dissuade hubs
    linLogMode=False,  # NOT IMPLEMENTED
    adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
    edgeWeightInfluence=1.0,

    # Performance
    jitterTolerance=1.0,  # Tolerance
    barnesHutOptimize=True,
    barnesHutTheta=1.2,
    multiThreaded=False,  # NOT IMPLEMENTED

    # Tuning
    scalingRatio=2.0,
    strongGravityMode=False,
    gravity=5.0,

    # Log
    verbose=True,
)
forceatlas2 = ForceAtlas2(**_layout_settings)

# We can use the `forceatlas2` layout algorithm to generate layout
# co-ordinates for the nodes in the `networkx` graph:

# In[204]:

positions = forceatlas2.forceatlas2_networkx_layout(
    DG, pos=None, iterations=2000
)

# We can then layout the graph using the force atlas algorithm:

# In[205]:

plt.figure(figsize=(40, 20))

# Draw nodes
nx.draw_networkx_nodes(DG, positions, node_color="blue", alpha=0.4)

# Draw labels
nx.draw_networkx_labels(DG, positions)

# Draw edges
# connectionstyle requires directed graph
nx.draw_networkx_edges(
    DG,
    positions,
    edge_color="green",  # edge colour
    alpha=0.5,  # edge transparency
    connectionstyle='arc3,rad=0.2',  # edge "bendiness"
)

# Maybe also as an alternative:
# https://github.com/beyondbeneath/bezier-curved-edges-networkx

# One of the easiest ways to weight node sizes is by degree. We can obtain the
# degree of each node directly from the graph:

# In[206]:

d = dict(DG.degree)
d

# Scale the size by degree:

# In[207]:

sizes = [degree * 500 for degree in d.values()]

# Now render the graph with the size set:

# In[208]:

# Draw nodes
nx.draw_networkx_nodes(
    DG, positions, node_size=sizes, node_color="blue", alpha=0.4
)

# Draw labels
nx.draw_networkx_labels(DG, positions)

# Draw edges
nx.draw_networkx_edges(
    DG,
    positions,
    edge_color="green",
    alpha=0.5,
    connectionstyle='arc3,rad=0.2',
)

# We can use alternative labels based on a lookup from each node.
# In[209]:

df_labels = pd.DataFrame(
    {
        'node': ['d', 'b', 'c', 'a'],
        'label': ['D', 'B', 'C', 'A'],
    }
)
df_labels

# We need to pass this as a `dict`:

# In[210]:

# Index by node name, then turn the label column into a node -> label mapping.
labels_map = df_labels.set_index('node')['label'].to_dict()
labels_map

# Plot using these labels:

# In[211]:

# Draw nodes
nx.draw_networkx_nodes(
    DG, positions, node_size=sizes, node_color='blue', alpha=0.4
)

# Draw labels using the labels_map labels
nx.draw_networkx_labels(DG, positions, labels=labels_map)

# Draw edges
nx.draw_networkx_edges(
    DG,
    positions,
    edge_color="green",
    alpha=0.5,
    connectionstyle='arc3,rad=0.2',
)

# We can colour nodes if we pass in a list of colours in the node order:

# In[212]:

node_colors = {'a': 'red', 'b': 'blue', 'c': 'green', 'd': 'yellow'}

# One colour per node, in the order the graph iterates its nodes.
node_colour_map = [node_colors[name] for name in DG.nodes()]
node_colour_map

# In[213]:

# Draw nodes
nx.draw_networkx_nodes(
    DG, positions, node_size=sizes, node_color=node_colour_map, alpha=0.4
)

# Draw labels using the labels_map labels
nx.draw_networkx_labels(DG, positions, labels=labels_map)

# Draw edges
nx.draw_networkx_edges(
    DG,
    positions,
    edge_color="green",
    alpha=0.5,
    connectionstyle='arc3,rad=0.2',
)

# The label layout is not ideal. We can offset the labels using an algorithm
# that will offset the label, and also try to prevent overlapping labels.
# In[214]:

# By default the node value itself is used as the label text; pass a
# node -> text mapping as `labels` to override it.
#https://github.com/Phlya/adjustText
# Takes a long time to run
#%pip install adjustText
from adjustText import adjust_text


def plot_adjusted_labels(DG, adjust=True, resizer=1, labels=None, pos=None,
                         node_sizes=None):
    """Draw node labels on the current matplotlib figure.

    When ``adjust`` is true, one ``plt.text`` is created per entry of ``pos``
    with a font size of ``node size * resizer``, and ``adjust_text`` is then
    asked to move the texts apart, drawing thin black connector lines.
    Otherwise the labels are drawn with ``nx.draw_networkx_labels``.

    Args:
        DG: The graph whose labels are drawn. It is only passed on to
            networkx when ``adjust`` is false.
        adjust: Whether to use ``adjust_text`` (can be slow) instead of the
            plain networkx label placement.
        resizer: Multiplier applied to each node size to obtain a font size.
        labels: Optional mapping from node to label text. In the adjusted
            mode, nodes missing from the mapping fall back to the node value.
        pos: Mapping from node to coordinates. Defaults to the module-level
            ``positions``.
        node_sizes: Sequence of node sizes, indexed in the iteration order of
            ``pos``. Defaults to the module-level ``sizes``.
    """
    # Fall back to the notebook-level globals so existing calls keep working.
    pos = positions if pos is None else pos

    if not adjust:
        nx.draw_networkx_labels(DG, pos, labels=labels)
        return

    node_sizes = sizes if node_sizes is None else node_sizes
    label_text = {} if labels is None else labels

    texts = [
        plt.text(pos[node][0], pos[node][1], label_text.get(node, node),
                 fontsize=node_sizes[i] * resizer)
        for i, node in enumerate(pos)
    ]
    adjust_text(texts, arrowprops=dict(arrowstyle="-", color='k', lw=0.5))


# *The following is not a great example!*

# In[215]:

# Draw nodes
nx.draw_networkx_nodes(DG, positions, node_size=sizes,
                       node_color="blue", alpha=0.4)

# Draw edges
nx.draw_networkx_edges(DG, positions, edge_color="green", alpha=0.5,
                       connectionstyle='arc3,rad=0.2')

plot_adjusted_labels(DG, resizer=0.02)

# ## Trying it with real data

# In[216]:

# TO DO

# ## Additional Tweaks
#
# In a large graph, we may want to limit the plotting of nodes to nodes above
# a certain degree. The following will create a filtered graph containing just
# nodes with large degree in the original graph:

# In[217]:


def filter_graph_by_degree(DG, mindegree=1):
    """Filter a directed graph to nodes with a minimum degree.

    Args:
        DG: The graph to filter.
        mindegree: Minimum degree (as reported by ``DG.degree()``) that both
            endpoints of an edge must have for the edge to be kept.

    Returns:
        A new ``nx.DiGraph`` holding only the kept edges. Edge attributes are
        not copied, and nodes left without any kept edge are absent.
    """
    # Look every degree up once instead of building a degree view per edge.
    degree = dict(DG.degree())

    DF = nx.DiGraph()
    DF.add_edges_from(
        (u, v)
        for u, v in DG.edges()
        if degree[u] >= mindegree and degree[v] >= mindegree
    )
    # New network size
    #DF.size()
    return DF


# # Example Influence Network from Wikipedia/DBpedia
#
# The following is an example influence network using data from DBpedia.
# First some utility functions for running queries:

# In[218]:

#%pip install --upgrade SPARQLWrapper
#from linkeddataquery import SPARQLWrapper, dfResults, runQuery

# Import the necessary packages
from SPARQLWrapper import SPARQLWrapper, JSON

# Add some helper functions

# A function that will return the results of running a SPARQL query with
# a defined set of prefixes over a specified endpoint.
# It follows the same five-step process apart from creating the query, which
# is provided as an argument to the function.
def runQuery(endpoint, prefix, q):
    """Run the SPARQL query ``prefix + q`` against ``endpoint``.

    The prefix declarations and the query body are concatenated, JSON is
    requested as the return format, and the converted response is returned.
    """
    client = SPARQLWrapper(endpoint)
    full_query = prefix + q
    client.setQuery(full_query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


# Import pandas to provide facilities for creating a DataFrame to hold results
import pandas as pd

# Function to convert query results into a DataFrame
# The results are assumed to be in JSON format and therefore the Python
# dictionary will have the results indexed by 'results' and then 'bindings'.
def dict2df(results):
    """Flatten SPARQL query results into a DataFrame.

    Args:
        results: Converted JSON results, indexed by ``'results'`` and then
            ``'bindings'``; each binding maps a variable name to a dict that
            has a ``'value'`` entry.

    Returns:
        A ``pandas.DataFrame`` with one row per binding and one column per
        variable name, holding the ``'value'`` of each cell.
    """
    rows = [
        {name: cell['value'] for name, cell in binding.items()}
        for binding in results["results"]["bindings"]
    ]
    return pd.DataFrame(rows)


# Function to run a query and return results in a DataFrame
def dfResults(endpoint, prefix, q):
    """Run a SPARQL query and return its results as a DataFrame.

    The query ``prefix + q`` is run over ``endpoint`` via ``runQuery`` and the
    response is flattened with ``dict2df``.
    """
    return dict2df(runQuery(endpoint, prefix, q))


# Print a limited number of results of a query
def printQuery(results, limit=''):
    """Print the results from a SPARQL query.

    Each binding is printed as ``name: value`` lines followed by a blank line.

    Args:
        results: Converted JSON results, indexed by ``'results'`` and then
            ``'bindings'``.
        limit: Maximum number of bindings to print; the default ``''`` means
            print them all.
    """
    bindings = results["results"]["bindings"]
    if limit != '':
        bindings = bindings[:limit]

    for binding in bindings:
        for name, cell in binding.items():
            print(f'{name}: {cell["value"]}')
        print()


# Run a query and print out a limited number of results
def printRunQuery(endpoint, prefix, q, limit=''):
    """Run a SPARQL query and print its results.

    The query is run with ``runQuery`` and printed with ``printQuery``;
    ``limit`` is passed through to ``printQuery`` unchanged.
    """
    results = runQuery(endpoint, prefix, q)
    printQuery(results, limit)


# Define the endpoint:

# In[219]:

endpoint = "http://dbpedia.org/sparql"
sparql = SPARQLWrapper(endpoint)

# Set up some handy prefixes:

# In[220]:

# NOTE(review): the namespace IRIs were missing from this string (anything in
# angle brackets appears to have been stripped), which leaves the PREFIX
# declarations invalid. They are restored here with the commonly used
# namespaces - please confirm, in particular the `gephi:` one.
prefix = '''
prefix gephi: <http://gephi.org/>
prefix foaf: <http://xmlns.com/foaf/0.1/>
prefix dct: <http://purl.org/dc/terms/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>
'''

# Create a query over the influence network of philosophers:

# In[221]:

# NOTE(review): the class and predicate IRIs were also missing from the
# query. `dbo:Philosopher` and `dbo:influenced` are assumptions - confirm the
# intended class and the intended direction (`dbo:influenced` versus
# `dbo:influencedBy`) before relying on the results.
q_philosophers = '''
SELECT ?philosopherName, ?influenceName WHERE {
    ?philosopher a dbo:Philosopher .
    ?influence a dbo:Philosopher .
    ?philosopher dbo:influenced ?influence.
    ?philosopher foaf:name ?philosopherName.
    ?influence foaf:name ?influenceName.
} LIMIT 10000
'''

# Run the query:

# In[222]:

df2 = dfResults(endpoint, prefix, q_philosophers)
df2

# In[223]:

# Create a graph from the dataframe
DG = nx.from_pandas_edgelist(df2, 'philosopherName', 'influenceName',
                             create_using=nx.DiGraph())

# The graph is quite large, so simplify it to just philosophers with
# significant degree
DF = filter_graph_by_degree(DG, 40)
DF.size()

# Render the significant influence network:

# In[227]:

# TO DO - ipywidget thing to let us interact with useful ForceAtlas parameters
forceatlas2 = ForceAtlas2(
    # Behavior alternatives
    outboundAttractionDistribution=True,  # Dissuade hubs
    linLogMode=False,  # NOT IMPLEMENTED
    adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
    edgeWeightInfluence=1.0,

    # Performance
    jitterTolerance=1.0,  # Tolerance
    barnesHutOptimize=True,
    barnesHutTheta=1.2,
    multiThreaded=False,  # NOT IMPLEMENTED

    # Tuning
    scalingRatio=2.0,
    strongGravityMode=False,
    gravity=5.0,

    # Log
    verbose=True)

# Node sizing (the factor of 1 is a scale to tweak)
d = dict(DF.degree)
sizes = [v * 1 for v in d.values()]

# Node locations
positions = forceatlas2.forceatlas2_networkx_layout(DF, pos=None,
                                                    iterations=2000)

# Give ourselves a reasonable plot size to work with
plt.figure(figsize=(20, 20))

# `with_labels` is an option of `nx.draw_networkx`, not of
# `nx.draw_networkx_nodes`, so it is not passed here; the labels are drawn
# separately below.
nx.draw_networkx_nodes(DF, positions, node_size=sizes,
                       node_color="blue", alpha=0.4)
nx.draw_networkx_edges(DF, positions, edge_color="green", alpha=0.2,
                       connectionstyle='arc3,rad=0.2')

# Position adjusted labels - this step may take some time
# Tweak the previous filter step to reduce graph size and speed things up
# The scale factor must be passed by keyword: the second positional parameter
# of plot_adjusted_labels is `adjust`, not `resizer`.
plot_adjusted_labels(DF, resizer=0.3)

# In[ ]: