Notebook

In [ ]:

# netowrks
import networkx as nx
import igraph as ig

# data processing
import pandas as pd
import numpy as np

#some functions to make our lifes easier
import sys
sys.path.append("./")
from common_functions import *

# viz
import pylab as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

#Change the default options of visualization (improving the defaults)
custom_params = {"axes.spines.right": False, "axes.spines.top": False, "axes.spines.left": False, "axes.spines.bottom":
False,"lines.linewidth": 2, "grid.color": "lightgray", "legend.frameon": False, "xtick.labelcolor": "#484848", "ytick.labelcolor":
"#484848", "xtick.color": "#484848", "ytick.color": "#484848","text.color": "#484848", "axes.labelcolor": "#484848",
"axes.titlecolor":"#484848","figure.figsize": [5,3],
"axes.titlelocation":"left","xaxis.labellocation":"left","yaxis.labellocation":"bottom"}
palette = ["#3d348b","#e6af2e","#191716","#e0e2db"] #use your favourite colours
sns.set_theme(context='paper', style='white', palette=palette, font='Verdana', font_scale=1.3, color_codes=True,
rc=custom_params)

from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [ ]:

path_data = "../Data/"

Exercise 1: Reading and visualizing graphs¶

Goal: Get used to the networkx library

1.1. Basic plot¶

Read and understand the following code

In [ ]:

# Read an example data on florentine marriage families in the XV century
G = nx.florentine_families_graph()

# Show the nodes and edges
print("Nodes: ", G.nodes())
print("Edges: ", G.edges())

In [ ]:

# Visualize it
plt.figure(figsize=(5,4)) #set up dimensions

# create layout (once so we can reuse it if needed)
pos = nx.spring_layout(G, seed = 1)
nx.draw(G, pos = pos, with_labels = True, 
        edge_color = "gray", node_color = "lightgray")

1.2. Read and visualize your network¶

Network here: https://tinyurl.com/network-game (I added the direct CSV link. But you could also download it as a CSV file and move it to the folder of this notebook)

Use the nx.from_pandas_edgelist function (you need to set up the source and target parameters) Print number of nodes and edges

In [ ]:

# Read edgelist (link)
df = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSuZC86KjXYKSPr0Nw7mfRha4zwea3aMw-gTKliVcbRt_m3NRiCEyxbH_d5M8MBL0LWayg1WDmnqBET/pub?gid=0&single=true&output=csv")
display(df.head())

In [ ]:

#get help
nx.from_pandas_edgelist?

In [ ]:

# Convert to networkx
G = nx.from_pandas_edgelist(...)
print(len(G.nodes()))
print(len(G.edges()))

In [ ]:

# create figure and plot
plt.figure(figsize=(10,8))

# create layout (once so we can reuse it if needed)
pos = nx.spring_layout(G, seed = 1)
nx.draw(G, pos = pos, with_labels = True, node_size=10,
        edge_color = "lightgray", node_color = "gray")

1.3. Read and visualize the Twitter network¶

We will be using (parts of) this network in the following days.

It contains mentions, replies and quotes of user A to user B (but let's treat is as undirected)

replied_to    40255
mentioned     10407
quoted         3448

Use the nx.from_pandas_edgelist function Print number of nodes and edges

In [ ]:

# Read edgelist (the separator is a TAB)
path = f"{path_data}/ic2s2_netsci_3.tsv"

df = pd.read_csv(...)
display(df.head())

# Convert to networkx
G = nx.from_pandas_edgelist(df)
G.remove_edges_from(nx.selfloop_edges(G)) #remove self-edges
print(len(G.nodes()))
print(len(G.edges()))

In [ ]:

# create figure and plot
plt.figure(figsize=(20,16))

# create layout (once so we can reuse it if needed)
pos = nx.spring_layout(G, seed = 1)
nx.draw(G, pos = pos, with_labels = False, node_size=10,
        edge_color = "lightgray", node_color = "gray")

In [ ]:

# igraph is faster. Here is how to convert to igraph
h = ig.Graph().from_networkx(G)
# create layout
layout = h.layout_fruchterman_reingold()
# create positions in the networx format (as a dictionary)
pos = dict(zip(G.nodes(), layout.coords))

# Plot as before
plt.figure(figsize=(20, 16))
nx.draw(G, pos = pos, with_labels = False, node_size=10,
        edge_color = "lightgray", node_color = "gray")

1.4. Read and visualize a large(r) network¶

Read network in f"{path_data}/wiki-Vote.txt". Careful, it is a directed network, you need to use the create_using parameter.

(use iterations = 30 in the spring_layout)

Format of the file:

# Directed graph (each unordered pair of nodes is saved once): Wiki-Vote.txt 
# Wikipedia voting on promotion to administratorship (till January 2008). Directed edge A->B means user A voted on B becoming Wikipedia administrator.
# Nodes: 7115 Edges: 103689
# FromNodeId	ToNodeId
30	1412
30	3352

In [ ]:

# Read directed graph
path = f"{path_data}/wiki-Vote.txt"
G_wiki = nx.read_edgelist(path, create_using=...)
print(len(G_wiki.nodes()))
print(len(G_wiki.edges()))

In [ ]:

# Create layout (this will take a couple minutes). Networkx is a particularly slow library
pos = nx.spring_layout(G_wiki, seed = 1, iterations=30)

In [ ]:

# Nobody wants to see your hairball, but let's plot it anyway
plt.figure(figsize=(20, 20))

# Plot only nodes (too many lines)
nx.draw_networkx_nodes(G_wiki, pos = pos, node_size = 1, node_color = "k")

Exercise 2: Network characteristics¶

Network example: Protein interaction (PPI) in yeast. Nodes = Proteins, Edges = Recorded interactions in experiments

Exercise: Compare the PPI network with the Twitter network (ic2s2_netsci_3). What characteristics apply to both?

In [ ]:

# Read PPI network
path_network = f"{path_data}/ppi_network_prediction.graphml"
G = nx.read_graphml(path_network, node_type=int)
len(G.nodes()), len(G.edges())

In [ ]:

# igraph is faster. Here is how to convert to igraph
h = ig.Graph().from_networkx(G)
# create layout
layout = h.layout_fruchterman_reingold()
# create positions in the networx format (as a dictionary)
pos = dict(zip(G.nodes(), layout.coords))

# To use networkx (slow), uncomment the line below
#pos = nx.spring_layout(G, seed = 1)

# Plot as before
fig, ax = plt.subplots(figsize=(10, 8))
#This function is defined in common_functions.py file. It adds automatic coloring and nicer defaults
plot_network(G, a0 = ax, pos = pos, with_labels=False) 

2.1 Connectedness¶

In [ ]:

# Number of connected components
nx.components.number_connected_components(G)

In [ ]:

# Size of the connected components (first 20 sorted)
sorted([len(_) for _ in nx.connected_components(G)])[:20]

In [ ]:

# Let's keep the largest component (be careful, don't do this in your data withuot a very good reason)
largest_cc = max(nx.connected_components(G), key=len)

# Create a subgraph of G consisting only of nodes in largest_cc
G = G.subgraph(largest_cc).copy()
len(G)

2.2 Small diameters¶

In [ ]:

# Diameter
nx.diameter(G)

In [ ]:

# Average shortest path
nx.average_shortest_path_length(G)

2.3 Density¶

In [ ]:

nx.density(G)

2.4 Transitivity and average clustering¶

In [ ]:

# Average at the network level (# triangles / possible number of triangles)
print(nx.transitivity(G))

# Average at the node level = Average(# triangles_i / possible triangles_i )
print(nx.average_clustering(G))

2.5 Assortativity and homophily¶

What does it mean to have a negative/positive degree assortativity?

In [ ]:

nx.assortativity.degree_assortativity_coefficient(G)

2.6 Heavy tails¶

In [ ]:

# Find the degree of each node
degree_list = dict(G.degree()).values()

# count the number of nodes with a specific degree, sort it
from collections import Counter
C = Counter(degree_list)
deg, cnt = zip(*sorted(C.items()))

# Plot
plt.plot(deg, cnt, "o-")
plt.xlabel("Degree")
plt.ylabel("Count (degree)")

In [ ]:

# Defined in common_functions.py
plot_cdf(degree_list, compl = True, xlabel = "Degree (d)", ylabel = "p(Degree > d)")

2.7 Robustness to failures / Fragility to targeted attacks¶

In [ ]:

def attack_network(G, how="targeted", n_remove=200):
    """
    Removes nodes iteratively and keeps the (harmonic average of distance between all pairs of nodes)
    
    how = "targeted" targets the node with the highest degree
    """
    n_components = []

    G2 = G.copy()
    for i in range(n_remove):
        if how == "targeted":
            # Find the node with the highest degree
            node_to_remove = max(G2.degree(), key=lambda x: x[1])[0]

        else:
            # Random node
            node_to_remove = np.random.choice(G2.nodes(), 1)[0]

        # Remove node
        G2.remove_node(node_to_remove)
        # Number of nodes in giant component
        n_comp = max([len(_) for _ in nx.connected_components(G2)])
        n_components.append(100*n_comp/len(G2))
        
        
    return n_components

In [ ]:

# Create an ER graph (random graph)
G_r = nx.random_graphs.gnm_random_graph(1000,len(G.edges()))
# Let's keep the largest component (be careful, don't do this in your data withuot a very good reason)
largest_cc = max(nx.connected_components(G_r), key=len)

# Create a subgraph of G consisting only of nodes in largest_cc
G_r = G_r.subgraph(largest_cc).copy()

# Plot network random
plot_network(G_r, with_labels=False)
plt.show()

# Plot network PPI
plot_network(G, with_labels=False)

In [ ]:

# Remove 200 nodes
n_remove = 200
n_comp_attack_t = attack_network(G, how="targeted", n_remove = n_remove)
n_comp_attack_r = attack_network(G, how="random", n_remove = n_remove)

ref_n_comp_attack_t = attack_network(G_r, how="targeted", n_remove = n_remove)
ref_n_comp_attack_r = attack_network(G_r, how="random", n_remove = n_remove)

In [ ]:

plt.figure(figsize=(10, 5))
plt.subplot(121)
plt.plot(100*np.arange(n_remove)/len(G), n_comp_attack_t, label="Protein Interaction")
plt.plot(100*np.arange(n_remove)/len(G), ref_n_comp_attack_t, label="Random network")
plt.xlabel("Share of nodes removed")
plt.ylabel("Share of nodes in giant component")
plt.legend()
plt.title("Targeted attack")
plt.ylim(0,100)

plt.subplot(122)
plt.plot(100*np.arange(n_remove)/len(G), n_comp_attack_r, label="Protein Interaction")
plt.plot(100*np.arange(n_remove)/len(G), ref_n_comp_attack_r, label="Random network")
plt.xlabel("Share of nodes removed")
plt.ylabel("Share of nodes in giant component")
plt.legend()
plt.title("Random attack")
plt.ylim(0,100)

Exercise 3: Distributions (i.e. looking at characteristics of the nodes)¶

Degree
Number of triangles
Clustering (transitivity)
Local assortativity (homophily)
Path length

Goal: Understand how to calculate distributions in networkx, interpret differences in the degree distribution

Exercise: In 3.7 compare the Wikipedia network (default example), the PPI network and the Twitter network (IC2S2)

In [ ]:

# Read data on florentine marriage families in the XV century
G = nx.florentine_families_graph()
len(G)

In [ ]:

# Use the following function to plot the CDF of the degree distributions
def plot_cdf(values, scale = "log", ax = None, cum = True, compl = False, marker = 'o-', xlabel = "Degree (d)", ylabel = "p(Degree < d)"):
    """
    Calculates and plot CDF
    """
    from collections import Counter

    # count the number of instance per each degree, sort it
    C = Counter(values)
    deg, cnt = zip(*sorted(C.items()))
    
    # calcualte the cumulative distribution, normalize to be a probability instead of a count
    if cum:
        cs = np.cumsum(cnt)/np.sum(cnt)
    else:
        cs = cnt/np.sum(cnt)
    
    if compl:
        cs = 1 - cs
        
    if ax is None:
        ax = plt.subplot()
    # plot
    ax.plot(deg, cs, marker)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    plt.tight_layout()
    sns.despine(left=True, bottom=True)
    plt.xscale(scale)
    plt.yscale(scale)
    

3.1 Degree distribution¶

In [ ]:

def plot_network_distribution(G, values, mult = 1000, with_labels = True):
    """
    Plots network (color and node size depends on values) and distributions
    """
    import matplotlib as mpl
    norm = mpl.colors.Normalize(vmin=min(values), vmax=max(values), clip=True)
    mapper = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.coolwarm)

    f, (a0, a1, a2) = plt.subplots(1, 3, gridspec_kw={'width_ratios': [2, 1, 1]}, figsize=(12,4))
    
    node_size = mult*np.array(list(values))
    if min(node_size) < 0:
        node_size -= min(node_size)
        node_size += 1
        
    nx.draw(G, pos = nx.spring_layout(G, seed = 1), with_labels = with_labels, node_size = node_size, edge_color = "gray", 
           node_color = [mapper.to_rgba(i) for i in values], ax = a0,)


    sns.histplot(values, ax = a1)
    
    
    plot_cdf(values, ax = a2, compl = True, xlabel = "Cent c", ylabel = "p(Cent > c)")

In [ ]:

# Degree distribution
degree = G.degree() #also nx.degree(G)
degree_values = dict(degree).values()

# Plot using sns.histplot
sns.histplot(degree_values)
print(sorted(degree_values))

In [ ]:

# Plot CDF in log-log scale
plot_cdf(degree_values, scale = "log", ax = None, cum = True, compl = True)

In [ ]:

# change "mult" to change the size of the nodes 
plot_network_distribution(G, degree_values, mult = 100)

3.2 Distribution of number of triangles¶

In [ ]:

# Distribution of the number of triangles per node
n_triangs = nx.triangles(G).values() 
display(list(zip(G.nodes(), n_triangs))[:20])

plot_network_distribution(G, n_triangs, mult = 100)

3.3 Distribution of clustering¶

In [ ]:

n_clus = ...
display(list(zip(G.nodes(), n_clus))[:20])

plot_network_distribution(G, n_clus, mult = 1000)

3.4 Distribution of shortest paths¶

In [ ]:

# Calculate all shortest paths (careful, this quickly becomes unfeasible)
path_lenghts = ...

# Get results from a nested dictionary
path_lenghts = [list(_[1].values()) for _ in path_lenghts]
path_lenghts = [subitem for item in path_lenghts for subitem in item ]

# Plot using sns.histplot
sns.histplot(path_lenghts)
plt.show()

3.5 Distribution of local degree assortativity¶

In [ ]:

attribute = [k for v,k in G.degree()]
print(attribute)

# Defined in common_functions (based on Peel et al 2018)
local_assort = calculate_local_assort(G, attribute)
display(list(zip(G.nodes(), local_assort))[:20])

plot_network_distribution(G, local_assort, mult = 1000)

3.6 Degree distribution if ~ normal distribution (used in the lectures)¶

In [ ]:

# Don't change this
def plot_distributions(d, scale="log"):
    """
    Plot the PDF, CDF and CCDF.
    """
    plt.figure(figsize=(12,4))
    ax = plt.subplot(131)
    plot_cdf(d, cum = False, ax = ax,  xlabel = "Degree (d)", ylabel = "p(Degree)", marker = ".", scale=scale)
    if scale == "log":
        plt.plot([7E1, 5E2],[2.5E-3,1E-4],"--")
    plt.title("PDF")
    ax = plt.subplot(132)
    plot_cdf(d, cum = True, ax = ax,  xlabel = "Degree (d)", ylabel = "p(Degree < d)", marker="-", scale=scale)
    plt.title("CDF")
    ax = plt.subplot(133)
    plot_cdf(d, compl = True, ax = ax,  xlabel = "Degree (d)", ylabel = "p(Degree > d)", marker=".-", scale=scale)
    plt.title("CCDF")
    

In [ ]:

## Degree distribution (random normally distributed data)
## Try changing the scale to "log"
d = np.random.binomial(500, p = 30/500, size = 10000)
plot_distributions(d, scale="linear")

3.7 Degree distribution in the wiki network¶

In [ ]:

## Degree distribution (wiki network)
# Try the code using scale = "linear"
G_wiki = nx.read_edgelist(f"{path_data}/wiki-Vote.txt", create_using=nx.DiGraph())
d = [v for k,v in G_wiki.degree()]
plot_distributions(d, "log")
plt.xlim([100,1000])
plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],"--")

In [ ]:

# Twitter data
df = pd.read_csv(f"{path_data}/ic2s2_netsci_3.tsv", sep="\t")
G_twitter = nx.from_pandas_edgelist(df)
G_twitter.remove_edges_from(nx.selfloop_edges(G_twitter)) #remove self-edges

d = [v for k,v in G_twitter.degree()]
plot_distributions(d, "log")
plt.xlim([100,1000])
plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],"--")

In [ ]:

# PPI data
path_network = f"{path_data}/ppi_network_prediction.graphml"
G_ppi = nx.read_graphml(path_network, node_type=int)

d = [v for k,v in G_ppi.degree()]
plot_distributions(d, "log")

plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],"--")

Exercise 4A¶

Is the network homophilic? (are people linked to people like them?). You can use the function nx.assortativity.attribute_assortativity_coefficient

Solution: Shuffle the node labels

Goal: Understand how a permutation test works (4A) and how reference models work (4B).

In [ ]:

# Create labels
ns = ['Acciaiuoli', 'Medici', 'Castellani', 'Peruzzi', 'Strozzi', 'Barbadori', 'Ridolfi', 'Tornabuoni', 'Albizzi', 'Salviati', 'Pazzi', 'Bischeri', 'Guadagni', 'Ginori', 'Lamberteschi']
classes = ["m", "m", "o", "o", "o", "o", "o", "m", "m", "m", "o", "o", "o", "o", "o"]
nx.set_node_attributes(G, dict(zip(ns,classes)), "classes")

# Plot
nx.draw(G, pos = nx.spring_layout(G, seed = 1), with_labels = True, edge_color = "gray", 
       node_color = [palette[0] if c == "m" else palette[1] for c in classes])
    

# calculate assortativity coefficient
assort = ...
print(assort)

In [ ]:

# Permutenode labels
G2 = G.copy()

# Randomize classes and calculate assortativity
iters = 10000
values = []
for i in range(iters):
    # shuffle the classes
    nx.set_node_attributes(G2, dict(zip(ns,np.random.permutation(classes))), "classes")
    # calculate assortativity and keep in a list
    values.append(nx.assortativity.attribute_assortativity_coefficient(G2, "classes"))
values = np.array(values)

# Plot results
sns.histplot(values, binwidth=0.05)
plt.plot([assort,assort],[0,iters/10], "--", color="gray")

# p-value (probability that we would observe a value equal or more extreme to the one observed given 
# that the null hyphotesis is true---i.e. the graph is the real graph and the links 
# are not correlated with the classes
print("p-value", 1-len(values[values<assort])/len(values))

Exercise 4B¶

Is the network transitive? (are your friends also friends between them?) Is the network assortative in terms of degree?

Solution: Shuffle the edges or Grow the network

In [ ]:

# Read data on florentine marriage families in the XV century
G = nx.florentine_families_graph()
# Calculate average clustering
print(...)
# Calculate global clustering (transitivity)
print(...)
# Calculate degree assortativity
print(...)

We can compare our network to randomly generated data (this is useful to test hypothesis/get a baseline)¶

In [ ]:

# Visualize random graphs (run several times)
degree_seq = [v for k,v in G.degree()]
G_r = nx.configuration_model(degree_seq)
G_r = nx.Graph(G_r)
nx.draw(G_r)

In [ ]:

# Visualize random graphs (run several times)
n = len(G)
m = len(G.edges())
G_r = nx.random_graphs.barabasi_albert_graph(n,int(m/n)+1)
G_r = nx.Graph(G_r)
nx.draw(G_r)

In [ ]:

# Visualize random graphs (run several times)
n = len(G)
m = len(G.edges())
G_r = nx.random_graphs.gnm_random_graph(n,m)
G_r = nx.Graph(G_r)
nx.draw(G_r)

The function conf_int (defined in common_functions.py) creates 100 random graphs and calculates the desired metric¶

In [ ]:

# Average local clustering coefficient
conf_int(G, nx.average_clustering, 100)
nx.average_clustering(G)
#same as np.mean(list(nx.clustering(G).values()))

In [ ]:

# Transitivity (global clustering)
conf_int(G, nx.transitivity, 100)
nx.transitivity(G)

In [ ]:

# Assortativity
conf_int(G, nx.assortativity.degree_assortativity_coefficient, 100)
nx.assortativity.degree_assortativity_coefficient(G)

Exercise 4B.2¶

Use igraph to read the network and calculate assortativity, and compare with random model (ER)

In [ ]:

G_wiki = nx.read_edgelist(f"{path_data}/wiki-Vote.txt", create_using=nx.DiGraph())
# Read directed graph
print(len(G_wiki.nodes()))
print(len(G_wiki.edges()))

# Convert to igraph
h = ig.Graph.from_networkx(G_wiki)

# Look at how much faster it is (700 times slower in my computer)
%timeit h.assortativity_degree()
%timeit nx.assortativity.degree_assortativity_coefficient(G_wiki)

Compare with random model (ER) -- igraph vs networkx¶

In [ ]:

# networkx
def conf_dens_nx(n, m):
    """Create random graph and calculate assortativity in networkx"""
    G_r = nx.random_graphs.gnm_random_graph(n,m)
    return nx.assortativity.degree_assortativity_coefficient(G_r)
def conf_dens_ig(n,m):
    """Create random graph and calculate assortativity in igraph"""
    h_r = ig.Graph.Erdos_Renyi(n=n,m=m)
    return h_r.assortativity_degree()

In [ ]:

%%time
# Doing it in igraph 10 times
n, m = len(G_wiki), len(G_wiki.edges())
print(np.percentile([conf_dens_nx(n, m) for i in range(10)], [5,95]))

nx.assortativity.degree_assortativity_coefficient(G_wiki)

In [ ]:

%%time
# Doing it in igraph 100 times
n, m = h.vcount(), h.ecount()
print(np.percentile([conf_dens_ig(n,m) for i in range(100)], [5,95]))

h.assortativity_degree()

In [ ]: