This notebook contains a variety of functions, primarily for accessing the MediaWiki API, to extract data on page revisions, user revisions, article hyperlinks, category membership, and pageview dynamics.
These scripts depend on several non-standard libraries:
WikiTools - https://code.google.com/p/python-wikitools/
NetworkX - http://networkx.github.io/
Pandas - http://pandas.pydata.org/
This code was primarily authored by Brian Keegan (bkeegan@gmail.com) in 2012 and 2013 with contributions from Nick Bennett (nick271828@gmail.com).
from wikitools import wiki, api
import networkx as nx
from operator import itemgetter
from collections import Counter
import re, random, datetime, urlparse, urllib2, simplejson, copy, cPickle
import pandas as pd
def is_ip(ip_string, masked=False):
'''
Input:
ip_string - A string we'd like to check if it matches the pattern of a valid IP address.
Output:
A boolean value indicating whether the input was a valid IP address.
'''
if not isinstance(ip_string, str) and not isinstance(ip_string, unicode):
return False
if masked:
ip_pattern = re.compile('((([\d]{1,3})|([Xx]{1,3}))\.){3}(([\d]{1,3})|([Xx]{1,3}))', re.UNICODE)
else:
ip_pattern = re.compile('([\d]{1,3}\.){3}([\d]{1,3})', re.UNICODE)
if ip_pattern.match(ip_string):
return True
else:
return False
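A quick sanity check of the matcher with made-up values, assuming the function above has been run:
print is_ip('192.168.1.1') # True
print is_ip('255.XXX.XXX.XXX', masked=True) # True
print is_ip('Madcoverboy') # False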
def convert_to_datetime(string):
dt = datetime.datetime.strptime(string,'%Y-%m-%dT%H:%M:%SZ')
return dt
def convert_from_datetime(dt):
string = dt.strftime('%Y%m%d%H%M%S')
return string
def convert_datetime_to_epoch(dt):
epochtime = (dt - datetime.datetime(1970,1,1)).total_seconds()
return epochtime
def wikipedia_query(query_params,lang='en'):
site = wiki.Wiki(url='http://'+lang+'.wikipedia.org/w/api.php')
request = api.APIRequest(site, query_params)
result = request.query()
return result[query_params['action']]
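For example, a minimal call might look like the following; the query parameters are illustrative, and the 'action' key is required because the return value is indexed by it.
# Request basic page info for one article; returns the contents of the 'query' element
info = wikipedia_query({'action':'query','prop':'info','titles':'Chelsea Manning'},lang='en')
print info['pages'].keys()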
def short_wikipedia_query(query_params,lang='en'):
site = wiki.Wiki(url='http://'+lang+'.wikipedia.org/w/api.php')
request = api.APIRequest(site, query_params)
# Don't do multiple requests
result = request.query(querycontinue=False)
return result[query_params['action']]
def random_string(le, letters=True, numerals=False):
def rc():
charset = []
cr = lambda x,y: range(ord(x), ord(y) + 1)
if letters:
charset += cr('a', 'z')
if numerals:
charset += cr('0', '9')
return chr(random.choice(charset))
def rcs(k):
return [rc() for i in range(k)]
return ''.join(rcs(le))
def clean_revision(rev):
# We must deal with some malformed user/userid values. Some
# revisions have the following problems:
# 1. no 'user' or 'userid' keys and the existence of the 'userhidden' key
# 2. 'userid'=='0' and 'user'=='Conversion script' and 'anon'==''
# 3. 'userid'=='0' and 'user'=='66.92.166.xxx' and 'anon'==''
# 4. 'userid'=='0' and 'user'=='204.55.21.34' and 'anon'==''
# In these cases, we must substitute a placeholder value
# for 'userid' to uniquely identify the respective kind
# of malformed revision as above.
revision = rev.copy()
if 'userhidden' in revision:
revision['user'] = random_string(15, letters=False, numerals=True)
revision['userid'] = revision['user']
elif 'anon' in revision:
if revision['user']=='Conversion script':
revision['user'] = random_string(14, letters=False, numerals=True)
revision['userid'] = revision['user']
elif is_ip(revision['user']):
# Just leaving this reflection in for consistency
revision['user'] = revision['user']
# The weird stuff about multiplying '0' by a number is to
# make sure that IP addresses end up looking like this:
# 192.168.1.1 -> 192168001001
# This serves to prevent collisions if the numbers were
# simply joined by removing the periods:
# 215.1.67.240 -> 215167240
# 21.51.67.240 -> 215167240
# This also results in the number being exactly 12 decimal digits.
revision['userid'] = ''.join(['0' * (3 - len(octet)) + octet \
for octet in revision['user'].split('.')])
elif is_ip(revision['user'], masked=True):
# Let's distinguish masked IP addresses, like
# 192.168.1.xxx or 255.XXX.XXX.XXX, by setting
# 'user'/'userid' both to a random 13 digit number
# or 13 character string.
# This will probably be unique and easily
# distinguished from an IP address (with 12 digits
# or characters).
revision['user'] = random_string(13, letters=False, numerals=True)
revision['userid'] = revision['user']
return revision
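As a sketch of the IP normalization described in the comments above (the revision dictionary is made up):
rev = {'anon': '', 'user': '215.1.67.240', 'userid': '0'}
print clean_revision(rev)['userid'] # '215001067240' -- octets zero-padded to 12 digits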
def cast_to_unicode(string):
if isinstance(string,str):
try:
string2 = string.decode('utf8')
except:
try:
string2 = string.decode('latin1')
except:
print "Some messed up encoding here"
elif isinstance(string,unicode):
string2 = string
return string2
def get_user_revisions(user,dt_end,lang):
'''
Input:
user - The name of a wikipedia user with no "User:" prefix, e.g. 'Madcoverboy'
dt_end - a datetime object indicating the maximum datetime to return for revisions
lang - a string (typically two characters) indicating the language version of Wikipedia to crawl
Output:
revisions - A list of revisions made by the given user, each given as a dictionary. This will
include all properties requested in 'ucprop' below, along with the title, pageid, and
namespace of each edited page.
'''
user = cast_to_unicode(user)
revisions = list()
dt_end_string = convert_from_datetime(dt_end)
result = wikipedia_query({'action':'query',
'list': 'usercontribs',
'ucuser': u"User:"+user,
'ucprop': 'ids|title|timestamp|sizediff',
#'ucnamespace':'0',
'uclimit': '500',
'ucend':dt_end_string},lang)
if result and 'usercontribs' in result.keys():
r = result['usercontribs']
r = sorted(r, key=lambda revision: revision['timestamp'])
for revision in r:
# Sometimes the size key is not present, so we'll set it to 0 in those cases
revision['sizediff'] = revision.get('sizediff', 0)
revision['timestamp'] = convert_to_datetime(revision['timestamp'])
revisions.append(revision)
return revisions
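A usage sketch; the user name and cutoff date are illustrative:
revs = get_user_revisions('Madcoverboy', datetime.datetime(2013,9,1), 'en')
print '{0} revisions returned'.format(len(revs))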
def get_user_properties(user,lang):
'''
Input:
user - a string with no "User:" prefix corresponding to the username, e.g. "Madcoverboy"
lang - a string (typically two characters) indicating the language version of Wikipedia to query
Output:
result - a dictionary containing attributes about the user
'''
user = cast_to_unicode(user)
result = wikipedia_query({'action':'query',
'list':'users',
'usprop':'blockinfo|groups|editcount|registration|gender',
'ususers':user},lang)
return result
def make_user_alters(revisions):
'''
Input:
revisions - a list of revisions generated by get_user_revisions
Output:
alters - a dictionary keyed by page name that returns a dictionary containing
the count of how many times the user edited the page, the timestamp of the user's
earliest edit to the page, the timestamp of the user's latest edit to the page, and
the namespace of the page itself
'''
alters = dict()
for rev in revisions:
if rev['title'] not in alters.keys():
alters[rev['title']] = dict()
alters[rev['title']]['count'] = 1
alters[rev['title']]['min_timestamp'] = rev['timestamp']
alters[rev['title']]['max_timestamp'] = rev['timestamp']
alters[rev['title']]['ns'] = rev['ns']
else:
alters[rev['title']]['count'] += 1
alters[rev['title']]['max_timestamp'] = rev['timestamp']
return alters
def rename_on_redirect(article_title,lang='en'):
'''
Input:
article_title - a string with the name of the article or page that may be redirected to another title
lang - a string (typically two characters) indicating the language version of Wikipedia to crawl
Output:
article_title - a string with the name of the article or page that the redirect resolves to
'''
result = wikipedia_query({'titles': article_title,
'prop': 'info',
'action': 'query',
'redirects': 'True'},lang)
if 'redirects' in result.keys() and 'pages' in result.keys():
article_title = result['redirects'][0]['to']
return article_title
def get_page_revisions(article_title,dt_start,dt_end,lang):
'''
Input:
article_title - A string with the name of the article or page to crawl
dt_start - A datetime object indicating the minimum datetime to return for revisions
dt_end - a datetime object indicating the maximum datetime to return for revisions
lang - a string (typically two characters) indicating the language version of Wikipedia to crawl
Output:
revisions - A list of revisions for the given article, each given as a dictionary. This will
include all properties requested in 'rvprop' below, and will also include the
title and pageid of the source article.
'''
article_title = rename_on_redirect(article_title,lang)
dt_start_string = convert_from_datetime(dt_start)
dt_end_string = convert_from_datetime(dt_end)
revisions = list()
result = wikipedia_query({'titles': article_title,
'prop': 'revisions',
'rvprop': 'ids|timestamp|user|userid|size',
'rvlimit': '5000',
'rvstart': dt_start_string,
'rvend': dt_end_string,
'rvdir': 'newer',
'action': 'query'},lang)
if result and 'pages' in result.keys():
page_number = result['pages'].keys()[0]
try:
r = result['pages'][page_number]['revisions']
for revision in r:
revision['pageid'] = page_number
revision['title'] = result['pages'][page_number]['title']
# Sometimes the size key is not present, so we'll set it to 0 in those cases
revision['size'] = revision.get('size', 0)
revision['timestamp'] = convert_to_datetime(revision['timestamp'])
revisions.append(revision)
except KeyError:
revisions = list()
return revisions
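For instance (the article title and date range are illustrative):
revs = get_page_revisions('Boston Marathon bombings', datetime.datetime(2013,4,15), datetime.datetime(2013,4,22), 'en')
print '{0} revisions in the first week'.format(len(revs))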
def make_page_alters(revisions):
'''
Input:
revisions - a list of revisions generated by get_page_revisions
Output:
alters - a dictionary keyed by user name that returns a dictionary containing
the count of how many times the user edited the page, the timestamp of the user's
earliest edit to the page, and the timestamp of the user's latest edit to the page
'''
alters = dict()
for rev in revisions:
if rev['user'] not in alters.keys():
alters[rev['user']] = dict()
alters[rev['user']]['count'] = 1
alters[rev['user']]['min_timestamp'] = rev['timestamp']
alters[rev['user']]['max_timestamp'] = rev['timestamp']
else:
alters[rev['user']]['count'] += 1
alters[rev['user']]['max_timestamp'] = rev['timestamp']
return alters
def get_page_content(page_title,lang):
'''
Input:
page_title - A string with the name of the article or page to crawl
lang - A string (typically two characters) indicating the language version of Wikipedia to crawl
Output:
revisions_dict - A dictionary of revisions for the given article keyed by revision ID, returning a
dictionary of revision attributes. These attributes include all properties requested
in 'rvprop' below, and will also include the title and pageid of the source article.
'''
page_title = rename_on_redirect(page_title,lang)
revisions_dict = dict()
result = wikipedia_query({'titles': page_title,
'prop': 'revisions',
'rvprop': 'ids|timestamp|user|userid|size|content',
'rvlimit': '5000',
'action': 'query'},lang)
if result and 'pages' in result.keys():
page_number = result['pages'].keys()[0]
revisions = result['pages'][page_number]['revisions']
for revision in revisions:
rev = dict()
rev['pageid'] = page_number
rev['title'] = result['pages'][page_number]['title']
rev['size'] = revision.get('size', 0) # Sometimes the size key is not present, so we'll set it to 0 in those cases
rev['timestamp'] = convert_to_datetime(revision['timestamp'])
rev['content'] = revision.get('*',unicode()) # Sometimes content hidden, return with empty unicode string
rev['links'] = link_finder(rev['content'])
rev['username'] = revision['user']
rev['userid'] = revision['userid']
rev['revid'] = revision['revid']
revisions_dict[revision['revid']] = rev
return revisions_dict
def get_category_members(category_name, depth, lang='en'):
'''
Input:
category_name - The name of a Wikipedia(en) category, e.g. 'Category:2001_fires'.
depth - A non-negative integer giving the number of levels of subcategories to crawl below the given category
lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl
Output:
articles - A list of the titles of the articles found within the given category or one of its
subcategories, explored recursively to the given depth.
'''
articles = []
if depth < 0:
return articles
#Begin crawling articles in category
results = wikipedia_query({'list': 'categorymembers',
'cmtitle': category_name,
'cmtype': 'page',
'cmlimit': '500',
'action': 'query'},lang)
if 'categorymembers' in results.keys() and len(results['categorymembers']) > 0:
for i, page in enumerate(results['categorymembers']):
article = page['title']
articles.append(article)
# Begin crawling subcategories
results = wikipedia_query({'list': 'categorymembers',
'cmtitle': category_name,
'cmtype': 'subcat',
'cmlimit': '500',
'action': 'query'},lang)
subcategories = []
if 'categorymembers' in results.keys() and len(results['categorymembers']) > 0:
for i, category in enumerate(results['categorymembers']):
cat_title = category['title']
subcategories.append(cat_title)
for category in subcategories:
articles += get_category_members(category,depth-1,lang)
return articles
def get_page_categories(page_title,lang='en'):
'''
Input:
page_title - A string with the name of the article or page to crawl
lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl
Output:
categories - A list of the names of the categories of which the page is a member
'''
page_title = rename_on_redirect(page_title)
results = wikipedia_query({'prop': 'categories',
'titles': page_title,
'cllimit': '500',
'clshow':'!hidden',
'action': 'query'},lang)
if 'pages' in results.keys():
page_number = results['pages'].keys()[0]
categories = results['pages'][page_number]['categories']
categories = [i['title'] for i in categories]
categories = [i for i in categories if i != u'Category:Living people']
else:
print u"{0} not found in category results".format(page_title)
categories = list()
return categories
def get_page_outlinks(page_title,lang='en'):
'''
Input:
page_title - A string with the name of the article or page to crawl
lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl
Output:
outlinks - A list of all "alter" pages that link out from the current version of the "ego" page
Notes:
This uses API calls to return all [[links]] which may be slower and result in overlinking from templates
'''
# This approach is susceptible to 'overlinking' as it includes links from templates
page_title = cast_to_unicode(page_title)
page_title = rename_on_redirect(page_title)
result = wikipedia_query({'titles': page_title,
'prop': 'links',
'pllimit': '500',
'plnamespace':'0',
'action': 'query'},lang)
if 'pages' in result.keys():
page_number = result['pages'].keys()[0]
results = result['pages'][page_number]['links']
outlinks = [l['title'] for l in results]
else:
print u"Error: No links found in {0}".format(page_title)
outlinks = list()
return outlinks
def get_page_inlinks(page_title,lang='en'):
'''
Input:
page_title - A string with the name of the article or page to crawl
lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl
Output:
inlinks - A list of all "alter" pages that link in to the current version of the "ego" page
'''
page_title = cast_to_unicode(page_title)
page_title = rename_on_redirect(page_title)
result = wikipedia_query({'bltitle': page_title,
'list': 'backlinks',
'bllimit': '500',
'blnamespace':'0',
'blfilterredir':'nonredirects',
'action': 'query'},lang)
if 'backlinks' in result.keys():
results = result['backlinks']
inlinks = [l['title'] for l in results]
else:
print u"Error: No links found in {0}".format(page_title)
inlinks = list()
return inlinks
# Links inside templates are included which results in completely-connected components
# Remove links from templates by getting a list of templates used across all pages
def get_page_templates(page_title,lang='en'):
'''
Input:
page_title - A string with the name of the article or page to crawl
lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl
Output:
templates - A list of all the templates (which contain redundant links) in the current version
'''
page_title = cast_to_unicode(page_title)
page_title = rename_on_redirect(page_title)
result = wikipedia_query({'titles': page_title,
'prop': 'templates',
'tllimit': '500',
'action': 'query'},lang)
if 'pages' in result.keys():
page_id = result['pages'].keys()[0]
templates = [i['title'] for i in result['pages'][page_id]['templates']]
else:
templates = list()
return templates
def get_page_links(page_title,lang='en'):
'''
Input:
page_title - A string with the name of the article or page to crawl that is the "ego" page
lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl
Output:
links - A dictionary keyed by ['in','out'] of all "alter" pages that link in to and out from the
current version of the "ego" page
'''
links=dict()
links['in'] = get_page_inlinks(page_title,lang)
links['out'] = get_page_outlinks(page_title,lang)
return links
# Identify links based on content of revisions
def link_finder(content_string):
'''
Input:
content_string - A string containing the raw wiki-markup for a page
Output:
links - A list of all "alter" pages that link out from the current version of the "ego" page
Notes:
This uses regular expressions to coarsely parse the content for instances of [[links]] and likely returns messy data
'''
links = list()
for i,j in re.findall(r'\[\[([^|\]]*\|)?([^\]]+)\]\]',content_string):
if len(i) == 0:
links.append(j)
elif u'#' not in i :
links.append(i[:-1])
elif u'#' in i:
new_i = i[:i.index(u'#')]
links.append(new_i)
links = [l for l in links if u'|' not in l and u'Category:' not in l and u'File:' not in l]
return links
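A small example of what the regular expression extracts (the markup string is made up):
markup = u'See [[Chelsea Manning|Manning]] and [[WikiLeaks]], but not [[File:Logo.png]].'
print link_finder(markup) # [u'Chelsea Manning', u'WikiLeaks']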
def get_page_outlinks_from_content(page_title,lang='en'):
'''
Input:
page_title - A string with the name of the article or page to crawl that is the "ego" page
lang - A string (typically two characters) corresponding to the language code for the Wikipedia to crawl
Output:
links - A list of all "alter" pages that link out from the current version of the "ego" page
Notes:
This uses regular expressions to coarsely parse the content for instances of [[links]] and may be messy
'''
page_title = cast_to_unicode(page_title)
page_title = rename_on_redirect(page_title,lang)
# Get content from most recent revision of an article
result = short_wikipedia_query({'titles': page_title,
'prop': 'revisions',
'rvlimit': '1',
'rvprop':'ids|timestamp|user|userid|content',
'action': 'query'},lang)
if 'pages' in result.keys():
page_id = result['pages'].keys()[0]
content = result['pages'][page_id]['revisions'][0]['*']
links = link_finder(content)
else:
print u'...Error in {0}'.format(page_title)
links = list()
return links
def get_user_outdiscussion(user_name,dt_end,lang='en'):
'''
Input:
user_name - The name of a "ego" wikipedia user with no "User:" prefix, e.g. 'Madcoverboy'
dt_end - a datetime object indicating the maximum datetime to return for revisions
lang - a string (typically two characters) indicating the language version of Wikipedia to crawl
Output:
users - A dictionary keyed by the names of the "alter" users whose talk pages the ego has posted to,
returning the count of posts and the timestamps of the ego's earliest and latest posts
'''
# User revision code in only user namespace
user_name = cast_to_unicode(user_name)
users = dict()
dt_end_string = convert_from_datetime(dt_end)
result = wikipedia_query({'action':'query',
'list': 'usercontribs',
'ucuser': u"User:"+user_name,
'ucprop': 'ids|title|timestamp|sizediff',
'ucnamespace':'3',
'uclimit': '500',
'ucend':dt_end_string},lang)
if result and 'usercontribs' in result.keys():
r = result['usercontribs']
for rev in r:
alter = rev['title'][10:] # Ignore "User talk:"
if alter not in users.keys():
users[alter] = dict()
users[alter]['count'] = 1
users[alter]['min_timestamp'] = rev['timestamp']
users[alter]['max_timestamp'] = rev['timestamp']
else:
users[alter]['count'] += 1
users[alter]['max_timestamp'] = rev['timestamp']
return users
def get_user_indiscussion(user_name,dt_end,lang='en'):
'''
Input:
user_name - The name of a "ego" wikipedia user with no "User:" prefix, e.g. 'Madcoverboy'
dt_end - a datetime object indicating the maximum datetime to return for revisions
lang - a string (typically two characters) indicating the language version of Wikipedia to crawl
Output:
users - A dictionary keyed by the names of the "alter" users who have posted to the ego's talk page,
returning the count of posts and the timestamps of each alter's earliest and latest posts
'''
# Article revision code in only user talk page
user_name = cast_to_unicode(user_name)
users = dict()
dt_end_string = convert_from_datetime(dt_end)
result = wikipedia_query({'titles': u'User talk:'+user_name,
'prop': 'revisions',
'rvprop': 'ids|timestamp|user|userid|size',
'rvlimit': '5000',
'rvend': dt_end_string,
'action': 'query'},lang)
if result and 'pages' in result.keys():
page_number = result['pages'].keys()[0]
try:
r = result['pages'][page_number]['revisions']
for rev in r:
if rev['user'] not in users.keys():
users[rev['user']] = dict()
users[rev['user']]['count'] = 1
users[rev['user']]['min_timestamp'] = rev['timestamp']
users[rev['user']]['max_timestamp'] = rev['timestamp']
else:
users[rev['user']]['count'] += 1
users[rev['user']]['max_timestamp'] = rev['timestamp']
except KeyError:
pass
return users
def get_user_discussion(user_name,dt_end,lang='en'):
'''
Input:
user_name - The name of a "ego" wikipedia user with no "User:" prefix, e.g. 'Madcoverboy'
dt_end - a datetime object indicating the maximum datetime to return for revisions
lang - a string (typically two characters) indicating the language version of Wikipedia to crawl
Output:
users - A dictionary keyed by the values ['in','out'] that combines both get_user_outdiscussion and
get_user_indiscussion
'''
users=dict()
users['out'] = get_user_outdiscussion(user_name,dt_end,lang)
users['in'] = get_user_indiscussion(user_name,dt_end,lang)
return users
def make_article_trajectory(revisions):
'''
Input:
revisions - A list of revisions generated by get_page_revisions
Output:
g - A NetworkX DiGraph object corresponding to the trajectory of an article moving between users
Nodes are users and links from i to j exist when user i made a revision immediately following user j
'''
g = nx.DiGraph()
# Sort revisions on ascending timestamp
sorted_revisions = sorted(revisions,key=lambda k:k['timestamp'])
# Don't use the last revision
for num,rev in enumerate(sorted_revisions[:-1]):
# Edge exists between the user and the user who made the next revision
edge = (rev['user'],sorted_revisions[num+1]['user'])
if g.has_edge(*edge):
g[edge[0]][edge[1]]['weight'] += 1
else:
g.add_edge(*edge,weight=1)
return g
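Chained with get_page_revisions, for example (article and dates are illustrative):
traj = make_article_trajectory(get_page_revisions('Hurricane Sandy', datetime.datetime(2012,10,22), datetime.datetime(2012,11,5), 'en'))
print nx.info(traj)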
def make_editor_trajectory(revisions):
'''
Input:
revisions - A list of revisions generated by get_user_revisions
Output:
g - A NetworkX DiGraph object corresponding to the trajectory of a user moving between articles
Nodes are pages and links from i to j exist when page i was edited by the user immediately following page j
'''
g = nx.DiGraph()
# Sort revisions on ascending timestamp
sorted_revisions = sorted(revisions,key=lambda k:k['timestamp'])
# Don't use the last revision
for num,rev in enumerate(sorted_revisions[:-1]):
# Edge exists between the page and the page edited in the next revision
edge = (rev['title'],sorted_revisions[num+1]['title'])
if g.has_edge(*edge):
g[edge[0]][edge[1]]['weight'] += 1
else:
g.add_edge(*edge,weight=1)
return g
def fixurl(url):
# turn string into unicode
if not isinstance(url,unicode):
url = url.decode('utf8')
# parse it
parsed = urlparse.urlsplit(url)
# divide the netloc further
userpass,at,hostport = parsed.netloc.rpartition('@')
user,colon1,pass_ = userpass.partition(':')
host,colon2,port = hostport.partition(':')
# encode each component
scheme = parsed.scheme.encode('utf8')
user = urllib2.quote(user.encode('utf8'))
colon1 = colon1.encode('utf8')
pass_ = urllib2.quote(pass_.encode('utf8'))
at = at.encode('utf8')
host = host.encode('idna')
colon2 = colon2.encode('utf8')
port = port.encode('utf8')
path = '/'.join( # could be encoded slashes!
urllib2.quote(urllib2.unquote(pce).encode('utf8'),'')
for pce in parsed.path.split('/')
)
query = urllib2.quote(urllib2.unquote(parsed.query).encode('utf8'),'=&?/')
fragment = urllib2.quote(urllib2.unquote(parsed.fragment).encode('utf8'))
# put it back together
netloc = ''.join((user,colon1,pass_,at,host,colon2,port))
return urlparse.urlunsplit((scheme,netloc,path,query,fragment))
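For example, a title with accented characters is percent-encoded and the host is passed through IDNA encoding (the URL is illustrative):
print fixurl(u'http://stats.grok.se/json/es/201207/Elecciones_federales_en_México_de_2012')
# http://stats.grok.se/json/es/201207/Elecciones_federales_en_M%C3%A9xico_de_2012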
def convert_months_to_strings(m):
if len(str(m)) > 1:
new_m = unicode(m)
else:
new_m = u'0'+unicode(m)
return new_m
def get_url(article_name,lang,month,year):
url = u"http://stats.grok.se/json/" + lang + u"/" + unicode(year) + convert_months_to_strings(month) + u"/" + article_name
fixed_url = fixurl(url)
return fixed_url
def requester(url):
opener = urllib2.build_opener()
req = urllib2.Request(url)
f = opener.open(req)
r = simplejson.load(f)
result = pd.Series(r['daily_views'])
return result
def clean_timestamps(df):
to_drop = list()
for d in df.index:
try:
datetime.date(int(d[0:4]),int(d[5:7]),int(d[8:10]))
except ValueError:
to_drop.append(d)
df2 = df.drop(to_drop,axis=0)
df2.index = pd.to_datetime(df2.index)
return df2
def get_pageviews(article,lang,min_date,max_date):
rng = pd.date_range(min_date,max_date,freq='M')
rng2 = [(i.month,i.year) for i in rng]
ts = pd.Series()
for i in rng2:
url = get_url(article,lang,i[0],i[1])
result = requester(url)
ts = pd.Series.append(result,ts)
ts = ts.sort_index()
ts = clean_timestamps(ts)
ts = ts.asfreq('D')
return ts
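A usage sketch; the article and date range are illustrative, and each month in the range costs one request to stats.grok.se:
ts = get_pageviews(u'Chelsea_Manning', 'en', datetime.datetime(2013,8,1), datetime.datetime(2013,10,31))
print ts.head()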
def make_pageview_df(article_list,lang,min_date,max_date):
df = pd.DataFrame(index=pd.date_range(start=min_date,end=max_date))
l = len(article_list)
for num,a in enumerate(article_list):
try:
print "{0} / {1} : {2}".format(num+1,l,a)
ts = get_pageviews(a,lang,min_date,max_date)
df[a] = ts
except:
print u'Something happened to {0}'.format(unicode(a))
pass
return df
def editors_other_activity(article_title,dt_start,dt_end,ignorelist,lang):
revisions = get_page_revisions(article_title,dt_start,dt_end,lang)
revision_alters = make_page_alters(revisions)
revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}
alter_contributions = dict()
for num,editor_alter in enumerate(revision_alters2.keys()):
print u"{0} / {1}: {2}".format(num+1,len(revision_alters2.keys()),editor_alter)
alter_contributions[editor_alter] = get_user_revisions(editor_alter,dt_start,lang)
#el = directed_dict_to_edgelist(alter_discussions)
return revisions,alter_contributions
def editing_primary_discussion_secondary(article_title,dt_start,dt_end,ignorelist,lang):
revisions = get_page_revisions(article_title,dt_start,dt_end,lang)
revision_alters = make_page_alters(revisions)
revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}
alter_discussions = dict()
for num,editor_alter in enumerate(revision_alters2.keys()):
print u"{0} / {1}: {2}".format(num+1,len(revision_alters2.keys()),editor_alter)
alter_discussions[editor_alter] = get_user_discussion(editor_alter,dt_end,lang)
#el = directed_dict_to_edgelist(alter_discussions)
return revisions,alter_discussions
g = nx.DiGraph()
for user,revisions in alter_contribs.iteritems():
#print user
for rev in revisions:
article = rev['title']
# If edge already exists, iterate weight
if g.has_edge(user,article):
g[user][article]['weight'] += 1
# Otherwise create editor node and properties then add new edge
else:
# If editor node is not invalid or an IP, do a bunch of stuff
if 'invalid' not in user_props[user]['users'][0].keys():
ns = rev['ns']
gen = user_props[user]['users'][0]['gender']
edits = user_props[user]['users'][0]['editcount']
# Registration returns None sometimes
start = user_props[user]['users'][0]['registration']
if start is not None:
start = convert_datetime_to_epoch(convert_to_datetime(start))
else:
start = u'unknown'
# Add node
g.add_node(user, gender = gen, startdate = start, edits = edits, nodetype = 'user', ns='user')
g.add_node(article, gender = 'page', startdate = 'page', edits = 'page', sysop = 'page', autoconfirmed = 'page', nodetype = 'page',namespace=ns)
if 'sysop' in user_props[user]['users'][0]['groups']:
g.node[user]['sysop'] = 1
else:
g.node[user]['sysop'] = 0
if 'autoconfirmed' in user_props[user]['users'][0]['groups']:
g.node[user]['autoconfirmed'] = 1
else:
g.node[user]['autoconfirmed'] = 0
g.add_edge(user,article,weight=1)
# If editor node is invalid or an IP, populate fields with placeholder values
else:
g.add_node(user,gender=u'unknown',startdate=u'unknown',edits=u'unknown',sysop=0,autoconfirmed=0,nodetype='user')
# Remove Talk:Chelsea_Manning because it's connected to everything
g.remove_node('Talk:Chelsea Manning')
editors = [title for title,attribs in g.nodes(data=True) if attribs['nodetype'] == 'user']
#pages = [title for title,attribs in g.nodes(data=True) if attribs['nodetype'] == 'page']
g2 = g.to_undirected()
g3 = nx.bipartite.weighted_projected_graph(g2,editors)
#g4 = nx.bipartite.weighted_projected_graph(g2,pages)
nx.write_graphml(g,'Manning_talk_coauthorship.graphml')
nx.write_gexf(g,'Manning_talk_coauthorship.gexf')
nx.write_graphml(g3,'Manning_talk_coediting.graphml')
nx.write_gexf(g3,'Manning_talk_coediting.gexf')
def editing_primary_hyperlink_secondary(article_title,dt_start,dt_end,ignorelist,lang):
revisions = get_page_revisions(article_title,dt_start,dt_end,lang)
revision_alters = make_page_alters(revisions)
revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}
alter_hyperlinks = dict()
for num,editor_alter in enumerate(revision_alters2.keys()):
print u"{0} / {1}: {2}".format(num+1,len(revision_alters2.keys()),editor_alter)
alter_hyperlinks[editor_alter] = get_page_outlinks(editor_alter,lang)
#el = directed_dict_to_edgelist(alter_hyperlinks)
return revisions,alter_hyperlinks
def two_step_editing(article_title,dt_start,dt_end,ignorelist,lang):
revisions = get_page_revisions(article_title,dt_start,dt_end,lang)
revision_alters = make_page_alters(revisions)
revision_alters2 = {k:v for k,v in revision_alters.iteritems() if k not in ignorelist}
alter_revisions = dict()
for num,editor_alter in enumerate(revision_alters2.keys()):
print u"{0} / {1}: {2}".format(num+1,len(revision_alters2.keys()),editor_alter)
alter_revisions[editor_alter] = get_user_revisions(editor_alter,dt_end,lang)
return revisions, alter_revisions
def two_step_outlinks(page_title):
page_alters = dict()
templates_dict = dict()
links = get_page_outlinks(page_title)
page_alters[unicode(page_title)] = links
templates = get_page_templates(page_title)
templates_dict[page_title] = templates
l = len(links)
for num,link in enumerate(links):
print u"{0} / {1} : {2}".format(num+1,l,link)
try:
page_alters[link] = get_page_outlinks(link)
templates_dict[link] = get_page_templates(link)
except:
print u"...{0} doesn't exist".format(link)
pass
return page_alters,templates_dict
def two_step_outlinks_from_content(page_title):
page_alters = dict()
links = get_page_outlinks_from_content(page_title)
unique_links = list(set(links))
page_alters[unicode(page_title)] = unique_links
l = len(unique_links)
for num,link in enumerate(unique_links):
print u"{0} / {1} : {2}".format(num+1,l,link)
try:
page_alters[link] = get_page_outlinks_from_content(link)
except:
print u"...{0} doesn't exist".format(link)
pass
return page_alters
def make_hyperlink_network(hyperlink_dict):
hyperlink_g = nx.DiGraph()
for page,links in hyperlink_dict.iteritems():
for link in links:
# Only include links to 1-step alter pages, not 2-step alters' alters
if link in hyperlink_dict.keys():
hyperlink_g.add_edge(page,link)
return hyperlink_g
def make_shared_user_editing_network(alter_revisions_dict,threshold):
# Make the graph
net = nx.DiGraph()
for editor,revisions in alter_revisions_dict.iteritems():
articles = [r['title'] for r in revisions]
for num,article in enumerate(articles[:-1]):
if net.has_edge(article,articles[num+1]):
net[article][articles[num+1]]['weight'] += 1
else:
net.add_edge(article,articles[num+1],weight=1)
# If edge is below threshold, remove it
for i,j,d in net.edges(data=True):
if d['weight'] < threshold:
net.remove_edge(i,j)
# Remove self-loops
for i,j,d in net.edges(data=True):
if i == j:
net.remove_edge(i,j)
# Remove resulting isolates
isolates = nx.isolates(net)
for isolate in isolates:
net.remove_node(isolate)
return net
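Applied to the per-editor revision lists returned by editors_other_activity, for example; the article, dates, empty ignore list, and threshold of 2 are all illustrative, and the call crawls the contribution history of every editor of the seed article:
revs, alter_contributions = editors_other_activity('Chelsea Manning', datetime.datetime(2013,8,1), datetime.datetime(2013,9,1), [], 'en')
shared_net = make_shared_user_editing_network(alter_contributions, 2)
print nx.info(shared_net)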
# Take the alter_revisions_dict keyed by user with a list of revisions
# And return an inverted alter_pages keyed by page with a dictionary of users
def invert_alter_revisions(alter_revisions_dict):
alter_pages = dict()
for user,revisions in alter_revisions_dict.iteritems():
temp_list = list()
for revision in revisions:
temp_list.append(revision['title'])
alter_pages[user] = dict(Counter(temp_list))
inverted_alter_pages = dict()
for user,counts in alter_pages.iteritems():
for article,count in counts.iteritems():
try:
inverted_alter_pages[article][user] = count
except KeyError:
inverted_alter_pages[article] = dict()
inverted_alter_pages[article][user] = count
return inverted_alter_pages
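A toy example of the inversion (the data are made up):
toy = {u'UserA': [{'title': u'Page1'}, {'title': u'Page1'}, {'title': u'Page2'}], u'UserB': [{'title': u'Page1'}]}
print invert_alter_revisions(toy)
# {u'Page1': {u'UserA': 2, u'UserB': 1}, u'Page2': {u'UserA': 1}} (key order may vary)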
def make_shared_page_editing_network(alter_revisions_dict,threshold):
inverted_alter_revisions_dict = invert_alter_revisions(alter_revisions_dict)
# Make the graph
g = nx.DiGraph()
for page,users in inverted_alter_revisions_dict.iteritems():
user_list = users.keys()
for num,user in enumerate(user_list[:-1]):
next_user = user_list[num+1]
if g.has_edge(user,next_user):
g[user][next_user]['weight'] += 1
else:
g.add_edge(user,next_user,weight=1)
# If edge is below threshold, remove it
for i,j,d in g.edges(data=True):
if d['weight'] < threshold:
g.remove_edge(i,j)
# Remove self-loops
for i,j,d in g.edges(data=True):
if i == j:
g.remove_edge(i,j)
# Remove resulting isolates
isolates = nx.isolates(g)
for isolate in isolates:
g.remove_node(isolate)
return g
def make_category_network(categories_dict):
'''Takes a dictionary keyed by page name with a list of categories as values
Returns a two-mode page-category network; using a DiGraph with edges from pages to categories enforces the two-mode structure
'''
g_categories=nx.DiGraph()
for page,categories in categories_dict.iteritems():
for category in categories:
g_categories.add_node(page,node_type='page')
g_categories.add_node(category,node_type='category')
g_categories.add_edge(page,category)
return g_categories
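A minimal sketch with made-up categories:
cats = {u'Chelsea Manning': [u'Category:WikiLeaks', u'Category:1987 births'], u'WikiLeaks': [u'Category:WikiLeaks']}
cat_g = make_category_network(cats)
print cat_g.number_of_nodes(), cat_g.number_of_edges() # 4 3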
Select articles from the 2012 Mexican elections category on the Spanish Wikipedia, based on articles having more than one gubernatorial candidate with an existing article ("blue links").
articles = ['Elecciones estatales de 2012 en Yucatán','Elecciones estatales en Tabasco de 2012','Elecciones estatales en San Luis Potosí de 2012','Elecciones estatales de Morelos de 2012','Elecciones estatales en Jalisco de 2012','Elecciones estatales en Guanajuato de 2012','Elecciones en el Distrito Federal (México) de 2012','Elecciones estatales en Chiapas de 2012']
articles = [i.decode('utf8') for i in articles]
category_members = get_category_members('Categoría:Elecciones_de_México_de_2012',1,'es')
category_members
[u'Elecciones federales en M\xe9xico de 2012', u'Elecciones estatales de Campeche de 2012', u'Elecciones estatales en Chiapas de 2012', u'Elecciones estatales de Colima de 2012', u'Elecciones en el Distrito Federal (M\xe9xico) de 2012', u'Elecciones estatales del Estado de M\xe9xico de 2012', u'Elecciones estatales en Guanajuato de 2012', u'Elecciones estatales de Guerrero de 2012', u'Elecciones estatales extraordinarias de Hidalgo de 2012', u'Elecciones estatales en Jalisco de 2012', u'Elecciones estatales extraordinarias de Michoac\xe1n de 2012', u'Elecciones estatales de Morelos de 2012', u'Elecciones estatales de Nuevo Le\xf3n de 2012', u'Elecciones estatales de Quer\xe9taro de 2012', u'Elecciones estatales en San Luis Potos\xed de 2012', u'Elecciones estatales en Tabasco de 2012', u'Elecciones estatales de 2012 en Yucat\xe1n', u'Elecciones estatales extraordinarias de Yucat\xe1n de 2012']
bots = get_category_members('Category:All Wikipedia bots',3,'en')
bots = [b[5:] for b in bots]
user_props = dict()
for i,user in enumerate(alter_contribs.keys()):
print u"{0} / {1}: {2}".format(i+1,len(alter_contribs.keys()),user)
user_props[user] = get_user_properties(user,'en')
1 / 507: Edison 2 / 507: Richard BB 3 / 507: 156.98.4.11 ... 507 / 507: 90.210.192.246
Get all of the links from each of these pages.
hyperlink_dict = dict()
for i,a in enumerate(category_members):
print u'{0} / {1} : {2}'.format(i+1,len(category_members),a)
hyperlink_dict[a] = get_page_outlinks_from_content(a,'es')
1 / 18 : Elecciones federales en México de 2012 2 / 18 : Elecciones estatales de Campeche de 2012 3 / 18 : Elecciones estatales en Chiapas de 2012 4 / 18 : Elecciones estatales de Colima de 2012 5 / 18 : Elecciones en el Distrito Federal (México) de 2012 6 / 18 : Elecciones estatales del Estado de México de 2012 7 / 18 : Elecciones estatales en Guanajuato de 2012 8 / 18 : Elecciones estatales de Guerrero de 2012 9 / 18 : Elecciones estatales extraordinarias de Hidalgo de 2012 10 / 18 : Elecciones estatales en Jalisco de 2012 11 / 18 : Elecciones estatales extraordinarias de Michoacán de 2012 12 / 18 : Elecciones estatales de Morelos de 2012 13 / 18 : Elecciones estatales de Nuevo León de 2012 14 / 18 : Elecciones estatales de Querétaro de 2012 15 / 18 : Elecciones estatales en San Luis Potosí de 2012 16 / 18 : Elecciones estatales en Tabasco de 2012 17 / 18 : Elecciones estatales de 2012 en Yucatán 18 / 18 : Elecciones estatales extraordinarias de Yucatán de 2012
Create a set of alters to crawl in turn, excluding links to categories, files, and archives.
hyperlink_alters = list()
for ego,alters in hyperlink_dict.iteritems():
alters = list(set(alters))
for alter in alters:
if u'Categor\xeda:' not in alter and u'Anexo:' not in alter and u'Archivo:' not in alter:
hyperlink_alters.append(alter)
hyperlink_alters = list(set(hyperlink_alters))
Crawl these alters and add their alters to the hyperlink dictionary. Some pages may not exist, in which case ignore them.
for i,a in enumerate(hyperlink_alters):
print u'{0} / {1} : {2}'.format(i+1,len(hyperlink_alters),a)
try:
hyperlink_dict[a] = get_page_outlinks_from_content(a,'es')
except KeyError:
print u"...{0} doesn't exist".format(a)
pass
hyperlink_graph = nx.DiGraph()
for ego,alters in hyperlink_dict.iteritems():
for alter in alters:
if alter in hyperlink_dict.keys():
hyperlink_graph.add_edge(ego,alter)
nx.write_graphml(hyperlink_graph,'hyperlinks.graphml')
net = nx.DiGraph()
for article,revisions in alter_revs.iteritems():
for revision in revisions:
if 'user' in revision.keys() and 'bot' not in revision['user']:
try:
net[revision['user']][revision['title']]['weight'] += 1
except KeyError:
net.add_node(revision['user'],node_type='user')
net.add_node(revision['title'],node_type='article')
net.add_edge(revision['user'],revision['title'],weight=1)
net_articles = [i for i,j in net.nodes(data=True) if j['node_type'] == 'article']
net_users = [i for i,j in net.nodes(data=True) if j['node_type'] == 'user']
len(net_users)
2443
result = cPickle.load(open('Boston_Marathon_bombings.p','rb'))
revisions_dict = dict()
page_number = result['pages'].keys()[0]
revisions = result['pages'][page_number]['revisions']
for revision in revisions:
rev = dict()
rev['pageid'] = page_number
rev['title'] = result['pages'][page_number]['title']
rev['size'] = revision.get('size', 0) # Sometimes the size key is not present, so we'll set it to 0 in those cases
rev['timestamp'] = convert_to_datetime(revision['timestamp'])
rev['content'] = revision.get('*',unicode()) # Sometimes content hidden, return with empty unicode string
rev['links'] = link_finder(rev['content'])
rev['username'] = revision['user']
rev['userid'] = revision['userid']
rev['revid'] = revision['revid']
revisions_dict[revision['revid']] = rev
def adjacency_calcs(revisions):
revisions = sorted(revisions,key=itemgetter('pageid','timestamp'))
revisions[0]['position'] = 0
revisions[0]['edit_lag'] = datetime.timedelta(0)
revisions[0]['bytes_added'] = revisions[0]['size']
revisions[0]['unique_users'] = [revisions[0]['username']]
revisions[0]['unique_users_count'] = 1
revisions[0]['article_age'] = datetime.timedelta(0)
for num,rev in enumerate(revisions[:-1]):
revisions[num+1]['position'] = rev['position'] + 1
revisions[num+1]['edit_lag'] = revisions[num+1]['timestamp'] - rev['timestamp']
revisions[num+1]['bytes_added'] = revisions[num+1]['size'] - rev['size']
revisions[num+1]['unique_users'] = list(rev['unique_users'])
revisions[num+1]['unique_users'].append(revisions[num+1]['username'])
revisions[num+1]['unique_users'] = list(set(revisions[num+1]['unique_users']))
revisions[num+1]['unique_users_count'] = len(revisions[num+1]['unique_users'])
revisions[num+1]['article_age'] = revisions[num+1]['timestamp'] - revisions[0]['timestamp']
return revisions
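For instance, applied to the revisions_dict built above from the pickled Boston Marathon data (a sketch):
adjacency_revs = adjacency_calcs(revisions_dict.values())
print adjacency_revs[-1]['position'], adjacency_revs[-1]['unique_users_count'], adjacency_revs[-1]['article_age']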