# netowrks
import networkx as nx
import igraph as ig
# data processing
import pandas as pd
import numpy as np
import scipy.sparse as ss
#some functions to make our lifes easier
import sys
sys.path.append("./")
from common_functions import *
# viz
import pylab as plt
import seaborn as sns
import matplotlib as mpl
%matplotlib inline
#Change the default options of visualization (improving the defaults)
custom_params = {"axes.spines.right": False, "axes.spines.top": False, "axes.spines.left": False, "axes.spines.bottom":
False,"lines.linewidth": 2, "grid.color": "lightgray", "legend.frameon": False, "xtick.labelcolor": "#484848", "ytick.labelcolor":
"#484848", "xtick.color": "#484848", "ytick.color": "#484848","text.color": "#484848", "axes.labelcolor": "#484848",
"axes.titlecolor":"#484848","figure.figsize": [5,3],
"axes.titlelocation":"left","xaxis.labellocation":"left","yaxis.labellocation":"bottom"}
palette = ["#3d348b","#e6af2e","#191716","#e0e2db"] #use your favourite colours
sns.set_theme(context='paper', style='white', palette=palette, font='Verdana', font_scale=1.3, color_codes=True,
rc=custom_params)
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
path_data = "../../Data/"
def plot_network(G, a0 = None, values = None):
"""
Plots network with nice defaults
"""
if values is None:
values = nx.degree_centrality(G).values()
if a0 is None:
fig, a0 = plt.subplots()
norm = mpl.colors.Normalize(vmin=min(values), vmax=max(values), clip=True)
mapper = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.coolwarm)
mapper._A = []
cb = plt.colorbar(mapper, ax = a0, location="bottom", shrink=0.8, pad = 0.02, label = "Value")
cb.outline.set_visible(False)
# NEtwork
if nx.is_bipartite(G):
top = [_ for _ in G.nodes() if _[0] != "S"]
pos = nx.bipartite_layout(G, top)
node_color = ["#e6af2e" if node in top else "#e0e2db" for node in G]
else:
pos = nx.spring_layout(G, seed = 1)
node_color = [mapper.to_rgba(i) for i in values]
nx.draw(G, pos = pos, with_labels = True, node_size=500*np.array(list(values)), edge_color = "darkgray",
node_color = node_color, ax = a0)
def plot_network_adj(G, values = None):
"""
Plots the graph (with color/node size adjusted according to values) and the adjacency matrix
"""
f, (a0, a1, a2) = plt.subplots(1, 3, gridspec_kw={'width_ratios': [1, 1, 0.5]}, figsize=(12,4))
# Plot network
plot_network(G, a0, values = values)
# Show adjacency table
df = nx.to_pandas_adjacency(G, nodelist=list(G.nodes()), dtype=int)
plot_table(a1, df.values, df.index, df.columns)
A = nx.to_scipy_sparse_array(G, nodelist=list(G.nodes()), weight=1)
N = len(G.nodes())
# Plot table as a heatmap
a2.spy(A)
sns.despine(bottom="True", left=True)
plt.grid(True)
a2.set_xticks(range(N), G.nodes(), rotation=90)
a2.set_yticks(range(N), G.nodes())
plt.tight_layout()
def plot_table(a1, cellText, rowLabels, colLabels):
"""
Plots a table in a figure
"""
cellText = pd.DataFrame(cellText)
the_table = a1.table(cellText=cellText.values, rowLabels=rowLabels, colLabels=colLabels, loc='center', colLoc = "left", cellColours=(cellText>0).replace({False: "white", True:"#e6af2e"}).values)
a1.axis(False)
the_table.scale(0.8, 1.6)
def adj_to_net(A, d_names = {0: "Alice", 1: "Bob", 2: "John", 3:"Amy", 4:"Mike", 5:"Rose"}):
"""
Create graph from adjacency matrix, rename nodes
"""
G = nx.from_numpy_array(A, create_using=nx.DiGraph())
G = nx.relabel_nodes(G, d_names)
return G
create_using
parameter. What is it doing?# Small directed network to understand matrix multiplication
G_dir = nx.from_edgelist([
("Alice", "Bob"),
("John", "Alice"),
("Bob", "John"),
("Amy", "John"),
("Mike", "John"),
("Rose", "Alice"),
("Mike", "Rose")
], create_using=nx.DiGraph())
plot_network_adj(G_dir)
# Small undirected network
G_undir = nx.from_edgelist([
("Alice", "Bob"),
("John", "Alice"),
("Bob", "John"),
("Amy", "John"),
("Mike", "John"),
("Rose", "Alice"),
("Mike", "Rose")
], create_using=nx.Graph())
plot_network_adj(G_undir)
# Create some metadata (number of children)
df = pd.DataFrame([["Alice", 2],
["Bob", 0],
["John", 2],
["Amy", 0],
["Mike", 1],
["Rose", 5]],
columns=["person","children"])
df
person | children | |
---|---|---|
0 | Alice | 2 |
1 | Bob | 0 |
2 | John | 2 |
3 | Amy | 0 |
4 | Mike | 1 |
5 | Rose | 5 |
Goal: Compare different ways to represent matrixes and convert between them
Start from the G_dir
network
You can do this using the functions nx.to_...(G)
, such as nx.to_scipy_sparse_array
What are the advantages of each?
# Example conversion to sparse matrix:
A_sparse = nx.to_scipy_sparse_array(G_dir, nodelist=list(G_dir.nodes()), weight=1)
A_sparse
<6x6 sparse array of type '<class 'numpy.int64'>' with 7 stored elements in Compressed Sparse Row format>
plt.spy(A_sparse)
<matplotlib.lines.Line2D at 0x10404c1f0>
Now starting from an edgelist (created below, df_edg
), convert it to a graph object. You have done this before (in the morning)
# Starting from an edgelist
df_edg = nx.to_pandas_edgelist(G_dir, nodelist=list(G_dir.nodes()))
df_edg.head()
source | target | |
---|---|---|
0 | Alice | Bob |
1 | Bob | John |
2 | John | Alice |
3 | Amy | John |
4 | Mike | John |
# Convert to graph and plot
G = ...
plot_network_adj(G)
Now starting from an adjacency matrix (created below, A
), convert it to a graph.
# Convert the following adjacency matrix
A = nx.to_scipy_sparse_array(G_dir, nodelist=list(G_dir.nodes()), weight=1)
# Translation between index and names
d_names = {0: 'Alice', 1: 'Bob', 2: 'John', 3: 'Amy', 4: 'Mike', 5: 'Rose'}
A.todense()
array([[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 1], [1, 0, 0, 0, 0, 0]])
# Convert to graph
G = ...
# Add labels
G = nx.relabel_nodes(G, d_names)
# Plot
plot_network_adj(G)
What is the average number of children of your friends?
In python
@
(e.g. A @ B)*
(e.g. A*B multiplies the element i,j of A with the element i,j of B)# Original network adding the children as a color/size
plot_network_adj(G_undir, values = df["children"])
# convert to adjacency array
A = nx.to_scipy_sparse_array(G_undir)
children = df["children"]
# Calculate sum of neighbors children
sum_children = ...
# Divide by number of neighbors to get an average
avg_children = sum_children / ...
print(sum_children)
plot_network_adj(G_undir, values=avg_children)
[7 4 3 2 7 3]
3a. Interpretation as number of paths
3b. Number of triangles and clustering
# Number of paths to go from node i to node j in x steps
#x = 1
print("One step away")
plot_network_adj(G_dir)
A = nx.to_scipy_sparse_array(G_dir)
#x = 2
print("Two steps away")
G2 = adj_to_net(A @ A) #A**2
plot_network_adj(G2)
#x = 3
print("Three steps away. Triangles!")
G3 = adj_to_net(A @ A @ A) #A**3
plot_network_adj(G3)
One step away Two steps away Three steps away. Triangles!
# Number of nodes reached in 3 steps
number_people = ...
number_people = number_people>0 #Don't count people many times
number_people.setdiag(0) # Remove yourself (in the diagonal)
display(number_people.todense())
#print total (sum per row)
number_people.todense().sum(1)
/Users/garci061/miniforge3/envs/networks/lib/python3.10/site-packages/scipy/sparse/_index.py:146: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient. self._set_arrayXarray(i, j, x)
array([2, 2, 2, 3, 4, 3])
display((A @ A).todense())
display(A.todense())
array([[0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [2, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]])
array([[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 1], [1, 0, 0, 0, 0, 0]])
# Count number of triangles in the directed network (se A^3)
plot_network_adj(G_dir)
A = nx.to_scipy_sparse_array(G_dir)
path_3 = ...
# For undirected newtorks there are two directions, for directed networks one
display(list(zip(G_dir.nodes(), path_3.diagonal() )))
# Number of triangles = trace / 3 (each triangle count in 3 nodes)
print(...)
[('Alice', 1), ('Bob', 1), ('John', 1), ('Amy', 0), ('Mike', 0), ('Rose', 0)]
1.0
# Count number of triangles in the undirected network
plot_network_adj(G_undir)
A = nx.to_scipy_sparse_array(G_undir)
path_3 = ...
# For undirected newtorks there are two directions
display(list(zip(G_undir.nodes(), path_3.diagonal() / 2)))
# Number of triangles = trace / 6 (each triangle count in 3 nodes, each triangle counted in two directions)
print(...)
[('Alice', 1.0), ('Bob', 1.0), ('John', 1.0), ('Amy', 0.0), ('Mike', 0.0), ('Rose', 0.0)]
1.0
#Local clustering = number of triangles / number of potential links
print(nx.clustering(G_undir)) #standard nx function
# Number of triangles
path_3 = (A@A@A).diagonal()/2 # divided by two because there are two directions
# Number of potential links between neighbors
degree = A.sum(1)
potential_links = (degree*(degree-1)/2)
list(zip(G_undir.nodes(),
path_3,
potential_links,
path_3/potential_links
)
)
{'Alice': 0.3333333333333333, 'Bob': 1.0, 'John': 0.16666666666666666, 'Amy': 0, 'Mike': 0, 'Rose': 0}
/var/folders/hx/nz98f65j615c4ygz7xt694700000gp/T/ipykernel_81311/980830522.py:11: RuntimeWarning: invalid value encountered in true_divide list(zip(G_undir.nodes(), path_3, potential_links, path_3/potential_links))
[('Alice', 1.0, 3.0, 0.3333333333333333), ('Bob', 1.0, 1.0, 1.0), ('John', 1.0, 6.0, 0.16666666666666666), ('Amy', 0.0, 0.0, nan), ('Mike', 0.0, 1.0, 0.0), ('Rose', 0.0, 1.0, 0.0)]
Display using the function plot_network_distribution()
, so the centrality gets displayed as the node color.
G = nx.florentine_families_graph()
# Example of degree centrality
cent = nx.degree_centrality(G)
cent = cent.values()
plot_network_distribution(G, cent)
# Betweeness centrality
cent = ...
cent = cent.values()
plot_network_distribution(G, cent)
# Closeness centrality
cent = ...
cent = cent.values()
plot_network_distribution(G, cent)
# Harmonic centrality
cent = ...
cent = np.array(list(cent.values()))/15
plot_network_distribution(G, cent)
# Eigenvector centrality
cent = ...
cent = cent.values()
plot_network_distribution(G, cent)
# Pagerank centrality
cent = ...
cent = cent.values()
plot_network_distribution(G, cent)
#- Katz
#- HITS
Using the Twitter dataset
Calculate the top 10 nodes for each measure
# Read edgelist
df = pd.read_csv(f"{path_data}/ic2s2_netsci_3.tsv", sep="\t")
# Convert to networkx
Gt = nx.from_pandas_edgelist(df)
Gt.remove_edges_from(nx.selfloop_edges(Gt)) #remove self-edges
def extract_top_10(d_centrality):
return pd.DataFrame.from_dict(d_centrality, orient="index").sort_values(by=0, ascending=False).head(10)
#e.g. extract_top_10(nx.degree_centrality(Gt))
Let's calculate eigenvector and pagerank centrality using the power method
For eigenvector centrality, each node has an influence of 1, that it is distributed to the neighbors. This process is done many times, until it converges.
# Eigenvector centrality, manually
N = len(G_dir)
A = nx.to_numpy_array(G_undir)
# Start with everybody having 1 unit of influence
weight = np.ones(N)
for i in range(100):
# spread influence to neighbors
weight = weight @ A
# normalize using the geometrical mean
weight = weight / np.sqrt(np.sum(weight**2))
if i < 5:
print(G_undir.nodes())
print(weight)
plot_network(G_undir, None, values = weight)
plt.show()
# Compare with the results of networkx
print(list(zip(G_undir.nodes(), weight)))
nx.eigenvector_centrality(G_undir)
['Alice', 'Bob', 'John', 'Amy', 'Mike', 'Rose'] [0.48666426 0.32444284 0.64888568 0.16222142 0.32444284 0.32444284]
['Alice', 'Bob', 'John', 'Amy', 'Mike', 'Rose'] [0.50196464 0.43921906 0.50196464 0.25098232 0.37647348 0.3137279 ]
['Alice', 'Bob', 'John', 'Amy', 'Mike', 'Rose'] [0.48365083 0.38692067 0.60456354 0.19346033 0.31437304 0.33855558]
['Alice', 'Bob', 'John', 'Amy', 'Mike', 'Rose'] [0.51212115 0.41900822 0.53074374 0.23278234 0.36314046 0.30727269]
['Alice', 'Bob', 'John', 'Amy', 'Mike', 'Rose'] [0.483843 0.40141049 0.58777964 0.20428927 0.322562 0.33689809]
[('Alice', 0.49816568565363206), ('Bob', 0.40845488054415907), ('John', 0.5634381194270055), ('Amy', 0.21678431148030036), ('Mike', 0.3410115867353848), ('Rose', 0.3228756816201712)]
{'Alice': 0.49816562620567956, 'Bob': 0.4084543148167931, 'John': 0.563437771403002, 'Amy': 0.21678389847563684, 'Mike': 0.3410123357350908, 'Rose': 0.32287658256564244}
For pagerank centrality, we start with 1 random walker distributed over all nodes, from there, it gets distributed to the neighbors (1/k of the random walker go to each of the k neighbors). This process is done many times, until it converges.
# PAgerank centrality, manually
N = len(G_dir)
A = nx.to_numpy_array(G_dir)
## Calculate transition matrix (Row normalize matrix)
# construct diagonal inverse degree matrix
degree = A.sum(1)
D = np.diag(1./degree, 0)
A_hat = (D @ A)
# random walkers are spread out evenly
weight = np.ones(N)/N
for i in range(1000):
# calculate where random walkers go next
weight = weight @ (0.85*A_hat + 0.15/N)
# Compare results with networkx
print(list(zip(G_dir.nodes(), weight)))
nx.pagerank(G_dir)
[('Alice', 0.31535471331389314), ('Bob', 0.2930515063168089), ('John', 0.3059687803692869), ('Amy', 0.024999999999999696), ('Mike', 0.024999999999999696), ('Rose', 0.03562499999999957)]
{'Alice': 0.3153534818017285, 'Bob': 0.293052806246324, 'John': 0.3059687119519481, 'Amy': 0.025, 'Mike': 0.025, 'Rose': 0.035625000000000004}
A random walker ends up a fraction of time in each node proportional to the degree of the node
N = len(G_undir)
A = nx.to_numpy_array(G_undir)
## construct transition matrix (row normalised adjacency matrix)
# construct diagonal inverse degree matrix
degree = A.sum(1)
D = np.diag(1./degree, 0)
A_hat = (D @ A)
# it does not matter where the walker starts
weight = np.ones(N)/N
for i in range(1000):
# calculate power
weight = weight @ (A_hat)
# Normalize
weight = weight / np.sqrt(np.sum(weight**2))
# Compare to networkx
print(list(zip(G_undir.nodes(), weight*norm)))
nx.degree_centrality(G_undir)
[('Alice', 0.6000000000000004), ('Bob', 0.40000000000000024), ('John', 0.8), ('Amy', 0.20000000000000015), ('Mike', 0.4000000000000003), ('Rose', 0.4000000000000001)]
{'Alice': 0.6000000000000001, 'Bob': 0.4, 'John': 0.8, 'Amy': 0.2, 'Mike': 0.4, 'Rose': 0.4}