Around 90 researchers listed in the Brede Wiki have both a Google Scholar identifier and a Twitter identifier associated with them. These can be extracted, and Google Scholar and Twitter can be queried to get the citation count and followers count so the Kardashian index (K-index) can be computed.
See the plot and the Kardashian index at the bottom.
The list of the included researchers is here: http://neuro.compute.dtu.dk/wiki/Category:Researchers_in_Google_Scholar_and_Twitter
Read more about the Kardashian index here: http://neuro.compute.dtu.dk/wiki/Kardashian_index
New coefficients for the power law for Google Scholar/Twitter are computed. The power law is 50.5 x citations ^ 0.28.
To run the code below you need to set up a python.cfg configuration file with the four Twitter tokens, a user-agent and a 'from' email.
from __future__ import division, print_function
try:
import ConfigParser as configparser
except ImportError:
import configparser
import copy
import json
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import requests
import scipy.spatial
import time
import tweepy
from urllib import quote
# Load the local configuration file holding the HTTP request identity
# ('user_agent' and 'from' under the [requests] section) and the
# Twitter OAuth tokens used further below.
config = configparser.ConfigParser()
config.read(os.path.expanduser('~/etc/python.cfg'))
# Identifying headers sent with every HTTP request to the Brede Wiki.
headers = dict([
    ('User-Agent', config.get('requests', 'user_agent')),
    ('From', config.get('requests', 'from')),
])
# Fetch names of Brede Wiki pages in the category of researchers on
# both Google Scholar and Twitter, following MediaWiki
# 'query-continue' pagination until the category is exhausted.
url_gst = ('http://neuro.compute.dtu.dk/w/api.php?'
           'action=query&format=json&list=categorymembers&'
           'cmtitle=Category:Researchers in Google Scholar and Twitter')
response = requests.get(url_gst, headers=headers).json()
pagetitles = [page['title'] for page in response['query']['categorymembers']]
while 'query-continue' in response:
    cmcontinue = response['query-continue']['categorymembers']['cmcontinue']
    # URL-quote the continuation token (consistent with the page
    # fetching below) since it may contain characters such as '|'.
    url_continue = url_gst + '&cmcontinue=' + quote(cmcontinue.encode('utf-8'))
    # Bug fix: the continuation request previously omitted the
    # identifying headers that the first request sent.
    response = requests.get(url_continue, headers=headers).json()
    pagetitles.extend([page['title']
                       for page in response['query']['categorymembers']])
print(pagetitles)
# Get researcher data from the Brede Wiki: fetch the raw wikitext of
# each page and parse the fields of its {{Researcher ...}} template
# into a dictionary.
url_pages = "http://neuro.compute.dtu.dk/w/index.php?action=raw&title="
# Match the whole {{Researcher ...}} template. Raw string so \s is a
# regex escape rather than an (invalid) Python string escape.
pattern_researcher = re.compile(r'{{Researcher(\s*?\|.*?)}}',
                                re.DOTALL | re.IGNORECASE | re.UNICODE)
# Match '| field = value' pairs inside the template body.
pattern_fields = re.compile(r'\s*\|\s*(\w+)\s*=\s*([^\|]*\w)',
                            re.DOTALL | re.UNICODE)
researchers = []
for pagetitle in pagetitles:
    response = requests.get(url_pages + quote(pagetitle.encode('utf-8')),
                            headers=headers)
    print(pagetitle)
    researcher = pattern_researcher.findall(response.text)
    if researcher:
        researchers.append(dict(pattern_fields.findall(researcher[0])))
    else:
        print("Problem with " + pagetitle)
# Notebook-style inspection of one of the parsed researchers
researchers[14]
# Google Scholar profile URL prefix; the user identifier is appended.
url_gs = 'http://scholar.google.com/citations?user='
# Request headers (same identifying values as defined above).
headers = {
    'User-Agent': config.get('requests', 'user_agent'),
    'From': config.get('requests', 'from')
}
# The citation statistics appear on the profile page in table cells
# such as: <td class="gsc_rsb_std">2537</td>
# Raw string so \d is a regex escape, not an invalid string escape.
pattern_gscount = re.compile(r'<td class="gsc_rsb_std">(\d+)</td>')
def get_google_scholar_counts(google_scholar_id):
    """Scrape citation statistics from a Google Scholar profile page.

    Returns a dict mapping the six statistics names (total and
    five-year citations, h-index and i10-index) to integers, as
    found in the statistics table of the profile page.
    """
    page = requests.get(url_gs + google_scholar_id, headers=headers)
    keys = ['citations', 'citations5', 'h-index', 'h-index5',
            'i10-index', 'i10-index5']
    values = [int(count) for count in pattern_gscount.findall(page.text)]
    return dict(zip(keys, values))

# Sanity check with Yong-Yeol Ahn's profile
get_google_scholar_counts('US7OSNgAAAAJ')
# Get data from Google Scholar for each researcher not yet fetched,
# sleeping between requests to be polite to the server.
for researcher in researchers:
    if 'citations' not in researcher:
        print(researcher['name'])
        researcher.update(
            get_google_scholar_counts(researcher['googlescholar']))
        time.sleep(5)
# Saving just in case. 'with' ensures the file handle is closed;
# the original left the handle to the garbage collector.
with open('researchers.json', 'w') as output_file:
    json.dump(researchers, output_file)
# Twitter authentication with the OAuth tokens read from the
# [twitter] section of the configuration file.
auth = tweepy.OAuthHandler(config.get('twitter', 'consumer_key'),
                           config.get('twitter', 'consumer_secret'))
auth.set_access_token(config.get('twitter', 'access_token'),
                      config.get('twitter', 'access_secret'))
# API handle used below to download data from Twitter profiles
api = tweepy.API(auth)
def get_twitter_count(twitter_id):
    """Fetch follower, friend and status counts for a Twitter user.

    Best-effort: when the profile cannot be retrieved (or lacks the
    expected fields) the problem is printed and an empty dict is
    returned, so callers can merge the result unconditionally.
    """
    try:
        profile = api.get_user(twitter_id)
        return {
            'Followers count': profile.followers_count,
            'Friends count': profile.friends_count,
            'Statuses count': profile.statuses_count,
        }
    except Exception:
        print('Problem with ' + twitter_id)
        return {}

# Testing with Finn Aarup Nielsen (fnielsen)
get_twitter_count('fnielsen')
# Download data from Twitter for every researcher
for researcher in researchers:
    researcher.update(get_twitter_count(researcher['twitter']))
    print(researcher['name'])
# Save just in case. 'with' closes the file handles that the original
# json.dump/json.load calls left dangling.
with open('researchers.json', 'w') as output_file:
    json.dump(researchers, output_file)
with open('researchers.json') as input_file:
    researchers = json.load(input_file)
# Notebook-style inspection of the first researcher
researchers[0]
# Pandas! Convert the list of researcher dicts to a data frame.
df = pd.DataFrame(researchers)
%matplotlib inline
# isnan: Houston, we've had a problem — researchers whose citation
# count could not be fetched give NaN; mask those (and the
# zero-citation rows, which would break the log-log fit) out.
indices = (~np.isnan(df['citations'])) & (df['citations'] != 0)
# Positions within the masked subset mapped back to data-frame
# labels; used later to annotate the convex-hull points.
reverse_index = indices[indices].index.values
# Plot followers against citations on log-log axes; the marker size
# scales with the researcher's number of tweets.
matplotlib.rc('font', family='DejaVu Sans')
fig = df.plot(x='citations', y='Followers count',
              kind='scatter', figsize=(15, 10),
              marker='*', s=df['Statuses count']/10,
              linewidth=2, color=(0.8, 0.8, 0.8))
ax = plt.gca()
ax.set_xscale('log')
ax.set_yscale('log')
plt.xlabel('Google Scholar citations')
plt.ylabel('Twitter followers count')
plt.title('Kardashian index for Brede Wiki researchers on Google Scholar and Twitter')
# Power law fit: a linear fit in log-log space, so
# followers = exp(p[1]) * citations ** p[0].
# .loc replaces the deprecated (later removed) .ix indexer; the two
# are equivalent for boolean-mask indexing.
p = np.polyfit(np.log(df.loc[indices, 'citations']),
               np.log(df.loc[indices, 'Followers count']), 1)
powerlaw = np.frompyfunc(lambda x: np.exp(p[1]) * x ** p[0], 1, 1)
plt.plot([1, 200000], powerlaw([1, 200000]), linewidth=5, color=(0.5, 1, 0.5))
plt.text(10, 5000, '{:.3} x citations^{:0.2}'.format(np.exp(p[1]), p[0]),
         fontsize=20)
# Annotate the outermost points: the vertices of the convex hull of
# the (citations, followers) cloud get the researcher's name.
# .loc replaces the deprecated (later removed) .ix indexer.
hull = scipy.spatial.ConvexHull(df.loc[indices, ['citations', 'Followers count']])
for index in hull.vertices:
    x, y, name = df.loc[reverse_index[index],
                        ['citations', 'Followers count', 'name']].values
    try:
        plt.text(x, y, name, horizontalalignment='center',
                 verticalalignment='center')
    except Exception:
        # Best effort: skip names matplotlib fails to render
        # (e.g. glyphs missing from the font).
        pass
# Myself and Ryoto and et al.
family_names = ['Nielsen', 'Tomioka', 'Willighagen']
for family_name in family_names:
    x, y, name = df.loc[df['family_name'] == family_name,
                        ['citations', 'Followers count', 'name']].values.flatten()
    plt.text(x, y, name, horizontalalignment='center',
             verticalalignment='center')
dummy = plt.axis((1, 200000, 1, 20000))
plt.show()
df.describe()
# K-index: the ratio between the actual number of followers and the
# number the fitted power law predicts from the citation count.
df['K-index'] = df['Followers count'] / powerlaw(df['citations'])
# Identify the 'scientific Kardashians': highest K-index first.
# sort_values replaces the deprecated (later removed) DataFrame.sort.
high_score = df[indices].sort_values(
    by='K-index', ascending=False)[['name', 'K-index', 'Statuses count']]
high_score
# Not all is shown above
# The below code will give the full list:
# https://stackoverflow.com/questions/23388810/ipython-notebook-output-cell-is-truncating-contents-of-my-list
from IPython.display import HTML
HTML(high_score.to_html())