#!/usr/bin/env python
# coding: utf-8

# # Graph Analysis of Domains Referring to Windeln.de
#
# This notebook analyses the domains that link to Windeln.de and helps to
# understand the relationships between those websites.

# In[13]:

import os
import re
from pandas import *  # NOTE: star import kept from the notebook; supplies read_csv / DataFrame
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from mpld3 import display

get_ipython().run_line_magic('matplotlib', 'inline')


# ### Read Raw Data
# Data produced by backlinks.py

# In[17]:

# Tab-separated export; the columns used below are `source_domain`,
# `alexa_cc_rank` and `links`.
ANCHOR_URLS_CSV = 'C:/workspace/analysis_reporting/joe/milk_china/anchor_urls.csv'

df = read_csv(ANCHOR_URLS_CSV, sep='\t')


# In[15]:

# One row per distinct (source domain, Alexa rank) combination.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
df_domain = df.loc[:, ['source_domain', 'alexa_cc_rank']].drop_duplicates()


# ### Find relationship between source domains

# In[16]:

key = ['source_domain', 'links']

# Count how many rows share each (source_domain, links) combination; rows
# with a missing value in either key column are dropped before counting.
df2 = (
    df.loc[:, key]
    .assign(num_links=1)
    .dropna()
    .groupby(key, as_index=False)['num_links']
    .sum()
    .reset_index(drop=True)
)

# `links` holds '|'-separated domains: emit one
# (source, linked domain, count) triple per linked domain.
relation_pairs = [
    (source_domain, link_domain, num_links)
    for source_domain, links, num_links in zip(
        df2['source_domain'], df2['links'], df2['num_links']
    )
    for link_domain in links.split('|')
]

# Drop exact duplicate triples. dict.fromkeys keeps first-seen order, so the
# row order is reproducible between runs (a plain set() is not, because
# string hashing is randomised per process).
df_pair = DataFrame(
    list(dict.fromkeys(relation_pairs)),
    columns=['src', 'target', 'num_links'],
)


# In[20]:

# Plotting graph: undirected, one edge per (src, target) pair. If a pair
# occurs with several counts, the last one written wins as the edge weight.
G = nx.Graph()
for src, tgt, wt in df_pair.values.tolist():
    G.add_edge(src, tgt, weight=wt)

# The layout has to be computed after the edges are added; computing it on
# the empty graph yields an empty position dict.
pos = nx.spring_layout(G)

plt.figure(figsize=(10, 10))
nx.draw_networkx(G, pos=pos, with_labels=True)
display()