#!/usr/bin/env python # coding: utf-8 # # CS579: Lecture 05 # ** Community Detection ** # # *[Dr. Aron Culotta](http://cs.iit.edu/~culotta)* # *[Illinois Institute of Technology](http://iit.edu)* # # (Many figures come from [Mining of Massive Datasets](http://www.mmds.org/), Jure Leskovec, Anand Rajaraman, Jeff Ullman) # ![network](network.png) # # - **Why do we want to identify communities?** # - **What are the "communities" in this graph?** # - **Why did you choose these communities?** # #




# **A bad solution: Agglomerative clustering** # # - Let distance function $d(A,B)$ be the shortest path between nodes $A$ and $B$ # - Let $C_i$ and $C_j$ be two clusters of nodes. Then, let the distance between two clusters be the minimum distance of any two nodes: $d(C_i, C_j) = \min_{X \in C_i, Y \in C_j} \hspace{.1cm} d(X, Y)$ # - Greedy agglomerative clustering iterative merges the closest two clusters # # # **What would agglomerative clustering do on this network? ** # # ![network](network.png) # # $d(A,B) = d(A,C) = d(B, C) = d(B,D) = d(D,E) = d(D,F) = d(D,G) = d(E,F) = d(G,F) = 1$ # # $d(A,D) = d(C,D) ... = 2$ # #


# First merge: sample randomly from all nodes with distance == 1. # # #


# So, $\frac{1}{9}$ chance we merge $B$ and $D$ in first merge. # # Not desireable...any other ideas? # # What makes the edge between $B$ and $D$ special? # #



# **Betweenness:** The betweenness of an edge $(A, B)$ is the number of shortest paths between any nodes $X$ and $Y$ that include edge $(A, B)$. # # High betweenness $\rightarrow$ $A$ and $B$ belong in different communities. # ![network](network.png) # # # What is **betweenness** of $(B,D)$? # #


# # > $(B,D)$ is on all shortest paths connecting any of $\{A,B,C\}$ to any of $\{D,E,F,G\}$. # # > Thus, total number of shortest paths = number passing through $(B,D)$ = $3 * 4 = \mathbf{12}.$. So, $bt(B,D) = 12$ # # #



# What is **betweenness** of $(D,F)$? # #



# # > $(D,F)$ is on shortest paths from $\{A,B,C,D\}$ to $\{F\}$. # # > Thus, betweenness is $4 * 1 = \mathbf{4}.$ # # #
# Consider this graph: # # ![between](between3.png) # # What is **betweenness** of $(D,G)$? # #


# # > $(D,G)$ is on the shortest path from $D$ to $G$. # > $(D,G)$ is also on one of the two shortest paths from $D$ to $F$ and $E$ to $G$. # # > Since there can be several shortest paths between two nodes, an edge (a, b) is credited with the fraction of those shortest paths that include the edge (a, b). # # > Thus, betweenness is $\frac{1}{2} + \frac{1}{2} + 1 = \mathbf{2}.$ # Formally: # # $$ # bt(e) =\sum_{s,t \in V} \frac{\sigma(s, t|e)}{\sigma(s, t)} # $$ # where # # - $V$ is the set of nodes # - $\sigma(s, t)$ is the number of shortest paths between nodes $s$ and $t$ # - $\sigma(s, t|e)$ is the number of those paths passing through some edge $e$ # # If $s = t$, $\sigma(s, t) = 1$ # # So, if there are two shortest paths between $s$ and $t$, but only one goes through $e$, betweenness only increases by 0.5. # # # # In[15]: import networkx as nx get_ipython().run_line_magic('matplotlib', 'inline') def create_example_graph(): graph = nx.Graph() graph.add_edges_from([('A', 'B'), ('A', 'C'), ('B', 'C'), ('B', 'D'), ('D', 'E'), ('D', 'F'), ('D', 'G'), ('E', 'F'), ('G', 'F')]) return graph graph = create_example_graph() nx.draw(graph, with_labels=True) # In[16]: # We'll use networkx's built-in betweenness computation in this example. nx.edge_betweenness_centrality(graph, normalized=False) # nx.edge_betweenness_centrality(graph, normalized=True) # normalized between 0-1 # ** How to compute shortest path in undirected graph? ** # #


# # # [source](https://en.wikipedia.org/wiki/File:Animated_BFS.gif) # # ** Breadth first search: ** Given a sourse node $s$, compute the shortest paths to each of its neighbors. Proceed # In[4]: from collections import deque # double ended queue # stored as a doubly linked list q = deque() q.append(1) print(q) q.append(2) print(q) q.append(3) print(q) print('popleft returns: %d' % q.popleft()) print(q) print('pop returns: %d' % q.pop()) print(q) # In[5]: # compare with: a = [1,2,3] print(a.pop(0)) #print(a.pop()) a # **What is running time to remove first element of a dynamic array with $n$ elements (a list in Python)?** # #


# # $O(n)$: Need to shift all elements to the left. # #


# # **What is the running time to remove first element of a doubly linked list $n$ elements (a deque in Python)?** # #


# # $O(1)$ # # See more: # https://wiki.python.org/moin/TimeComplexity # # In[6]: # Sample implementation of doubly linked list. class Node: def __init__(self, val): self.val = val self.prev = None # previous node self.next = None # next node self.head = self def display(self): node = self.head vals = [] while node: vals.append(node.val) node = node.next print(vals) def popleft(self): """ Remove leftmost element of list. """ v = self.head.val self.next.prev = None self.head = self.next return v n1 = Node(1) n2 = Node(2) n3 = Node(3) n1.next = n2 n2.prev = n1 n2.next = n3 n3.prev = n2 mylist = n1 mylist.display() print(mylist.popleft()) print(mylist.popleft()) # In[17]: # to get the neighbors of a node: list(graph.neighbors('A')) # vs #graph.neighbors('A') # In[8]: nx.draw(graph, with_labels=True) # In[9]: def bfs(graph, start): """ Return the order in which nodes are visited in a breadth-first traversal, starting with the given node. """ q = deque() q.append(start) seen = set() # nodes we have already visited. res = [] while len(q) > 0: # while more to visit n = q.popleft() if n not in seen: res.append(n) seen.add(n) for nn in graph.neighbors(n): if nn not in seen: q.append(nn) return res bfs(graph, 'D') # To get all shortest paths from a node, perform BFS, while keeping track of the depth of the search. # In[18]: for s in graph.nodes(): paths = nx.single_source_shortest_path(graph, s) print('\nshortest paths for %s' % s) print(paths) # # Girvan-Newman Algorithm # # **Input:** Graph $G$; desired number of clusters $k$ # # **Output:** A hierarchical clustering of nodes, based on edge betweenness # # - **While** number of clusters $< k$: # - Compute the betweenness of all edges in $G$ # - Remove edge with highest betweenness # # ![between](between.png) # ![between2](between2.png) # ## Computing betweenness of all edges # # - All pairs-shortest-paths, but need to store the paths. # - How can we reduce redundant computation? # ## Computing betweenness of all edges # # ![newman1](newman1.png) # # 1.) Do breadth-first search starting at node $E$. # - Each level is length of shortest path from $E$ to that node # - Edges within the same level cannot be part of a shortest path from $E$ to some target. # # 2.) Label each node by the number of shortest paths that reach it from the root. # - Start by labeling the root ($E$). Then, each child node is the sum of its parents. # - E.g., $G = D + F$ # # ## Computing betweenness of all edges # # ![newman1](newman2.png) # # 3.) Compute fraction of shortest paths through each edge (bottom up). # - leaf nodes get credit 1 # - non-leaf nodes get credit of 1 + credits for edges to nodes at level below # - edges to level above gets credit proportional to fraction of shortest paths that go through it. # # E.g. Level 3: # - $A$ and $C$ are given credit 1 (they are leaf nodes) # # Level 2: # - $B$ gets credit $3$ ($A + C + 1$) # - All shortest paths from $\{E\}$ to $\{A, B, C\}$ go through B. # - $G$ gets credit 1 (leaf) # # ## Computing betweenness of all edges # # ![newman1](newman3.png) # # Level 1 Edges: # - $D,B$ edge gets all credit from node $B$ (3) # - $G$ has two parents, so edges $(D,G)$, $(F,G)$ share the credit from $G$ # - From step 1, $D$ and $F$ each have credit 1, so shared equally. $(\frac{1}{1+1} = .5)$ # - What if $D=5$, $F=3$? $\frac{5}{8}$, $\frac{3}{8}$ # # # Level 1 Nodes: # - $D = 1 + 3 + .5 = 4.5$ # - $F = 1 + .5 = 1.5$ # ## Computing betweenness of all edges # # ![newman1](newman3.png) # # - What if $D=5$, $F=3$? # # ## Computing betweenness of all edges # # ![newman1](newman3.png) # # - What if $D=5$, $F=3$? # $(D,G) = \frac{5}{8}$, $(F,G) = \frac{3}{8}$ # Final steps: # # - Repeat for each node as source # - Divide total by 2 (since each shortest path found twice, once in each direction) # # ![between](between.png) # **More detailed example for edge (D,E):** # # For each root node, we report the value computed for edge (D,E) for Girvan Newman: # # | root node | value for (D,E)| # |-----------|----------------| # | A | 1 | # | B | 1 | # | C | 1 | # | D | 1 | # | E | 4.5 | # | F | 0 | # | G | .5 | # | **total** | 9 | # # We then divide by 2 to get the final betweenness of (D,E), 4.5. # # # The reason the value is $1$ when the root node is one of ${A,B,C}$ is that $E$ will be a leaf node for each of these. (Shortest paths to $F$ or $G$ from $A,B,C$ will never traverse edge $(D,E)$). The value is 0.5 for $G$ as source, becasue half credit is given as there are two shortest paths of length 2 from $G$ to $E$, but only one traverses $(D,E)$. No other shortest path from root $G$ traverses $(D,E)$. # # In[11]: nx.edge_betweenness_centrality(graph, normalized=False) # In[12]: def girvan_newman(G, depth=0): """ Recursive implementation of the girvan_newman algorithm. See http://www-rohan.sdsu.edu/~gawron/python_for_ss/course_core/book_draft/Social_Networks/Networkx.html Args: G.....a networkx graph Returns: A list of all discovered communities, a list of lists of nodes. """ if G.order() == 1: return [G.nodes()] def find_best_edge(G0): eb = nx.edge_betweenness_centrality(G0) # eb is dict of (edge, score) pairs, where higher is better # Return the edge with the highest score. return sorted(eb.items(), key=lambda x: x[1], reverse=True)[0][0] # Each component is a separate community. We cluster each of these. components = [c for c in nx.connected_component_subgraphs(G)] indent = ' ' * depth # for printing while len(components) == 1: edge_to_remove = find_best_edge(G) print(indent + 'removing ' + str(edge_to_remove)) G.remove_edge(*edge_to_remove) components = [c for c in nx.connected_component_subgraphs(G)] result = [c.nodes() for c in components] print(indent + 'components=' + str(result)) for c in components: result.extend(girvan_newman(c, depth + 1)) return result # In[13]: result = girvan_newman(create_example_graph()) # In[14]: result # In[1]: from IPython.core.display import HTML HTML(open('../custom.css').read())