from __future__ import division
import pandas as pd
# Raise the pandas print limits to 100 rows and 10 columns.
# NOTE(review): `set_printoptions` is a legacy pandas entry point; presumably
# this script targets an old pandas release -- confirm before upgrading.
pd.set_printoptions(max_rows=100, max_columns=10)
from scipy import stats
import matplotlib.pyplot as plt

# Input files: the journal-metrics table (its SNIP/SJR columns are read
# further down) and the open-access journal listing.
IMPACT_CSV = 'SNIP_SJR_complete_1999_2011new_SNIP_and_SJR_v1_Oct_2012.csv'
OPEN_ACCESS_CSV = 'open_access_journals.csv'

impact = pd.read_csv(IMPACT_CSV)
open_access = pd.read_csv(OPEN_ACCESS_CSV)

def rm_issn_punc(x):
    """Strip punctuation and spaces from an ISSN-like string.

    Every ``@word`` token, every character that is not an ASCII letter,
    digit, space or tab, and every ``scheme://...`` run is replaced by a
    space; all spaces are then removed.  Tab characters are left in place.

    Args:
        x: The raw text to clean.  Must be a string; anything else makes
            ``re.sub`` raise ``TypeError``.

    Returns:
        The cleaned string, e.g. ``"1234-567X"`` becomes ``"1234567X"``.
    """
    import re

    # Raw string: the pattern contains regex escapes (\w, \/, \S) that are
    # not valid escape sequences in an ordinary string literal.
    blanked = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", x)
    return blanked.replace(" ", "")


def strip_space(x):
    """Return ``x`` as a string with leading and trailing spaces removed.

    Only the space character is stripped; tabs and newlines are kept.
    Non-string input is converted with ``str`` first.

    Args:
        x: The value to normalise.

    Returns:
        str: ``str(x)`` without leading or trailing spaces.
    """
    # No integer conversion: the result is later compared with string ISSN
    # keys, so it has to stay text.
    return str(x).strip(' ')


def membership(x):
    """Report whether ``x`` is absent from the open-access ISSN column.

    Despite the name, the result is ``True`` when ``x`` is *not* one of the
    values in ``open_access['issn_']`` and ``False`` when it is.

    Args:
        x: The normalised ISSN value to look up.

    Returns:
        bool: ``False`` if ``x`` occurs in ``open_access['issn_']``,
        otherwise ``True``.
    """
    # Read the column's own array: no per-call copy and no dependence on a
    # numpy name that this file does not import.
    return x not in open_access['issn_'].values


# Normalise the ISSN keys of both tables so they can be compared.
open_access['issn'] = open_access['ISSN'].map(rm_issn_punc)
open_access['issn_'] = open_access['issn'].map(strip_space)
impact['issn'] = impact['Print ISSN'].map(strip_space)

# A journal is flagged as closed when its ISSN is not in the open-access
# column.  A single isin() lookup replaces rescanning the whole open-access
# list once per row.
impact['closed'] = ~impact['issn'].isin(open_access['issn_'])
closed = impact[impact['closed']]

# Join the metrics table to the open-access listing on the normalised ISSN,
# then discard exact duplicate rows and rows that are entirely empty.
matches = pd.merge(impact, open_access, left_on=impact['issn'], right_on=['issn']).drop_duplicates()
matches = matches.dropna(how='all')
# Keep a single row per distinct 'Title' value.
matches = matches.drop_duplicates(cols='Title')
matches[['Source Title', 'Title']].head(20)

# Unique-ISSN counts per group.  The single-argument print(...) form runs
# unchanged under both Python 2 and Python 3.
print('Closed: {0}, Open: {1}, Full list: {2}'.format(
    len(closed.issn.unique()),
    len(open_access.issn_.unique()),
    len(impact.issn.unique())))

# Rank journals by their 2011 SNIP value (highest first), dropping rows
# with a missing title or score.
snip_columns = ['Source Title', '2011 SNIP2']
snip_ranked = impact[snip_columns].copy().sort('2011 SNIP2', ascending=False).dropna()
top_impact_all = pd.DataFrame(snip_ranked, columns=snip_columns)

# Bring in the 2011 SJR value (aligned on the row index) and the gap
# between the two metrics.
top_impact_all['2011 SJR2'] = impact['2011 SJR2']
snip_minus_sjr = top_impact_all['2011 SNIP2'] - top_impact_all['2011 SJR2']
top_impact_all['Difference'] = snip_minus_sjr

top_impact_all.head(15)

# Same table re-ranked by SJR.
top_impact_all.sort('2011 SJR2', ascending=False).dropna().head(15)

# Share (%) of the open-access listing taken by each of the ten most
# frequent 'Language' values.
open_lang = open_access.Language.value_counts().head(10)/len(open_access)*100
open_lang

# The data are open-access journals, so the title says so (matching the
# start-year histogram title further down).
open_lang.plot(kind='bar', title='Most common languages, open access journals (%)', color='green', alpha=.3);

# Share (%) for the fifteen most frequent 'Keyword' values.  The name
# open_lang is reused here, so afterwards it holds the keyword shares.
open_lang = open_access.Keyword.value_counts().head(15)/len(open_access)*100
open_lang

open_lang.plot(kind='bar', title='Most common keywords, open access journals (%)', color='green', alpha=.3);

# Histogram of the 'Start Year' column, limited to 1980-2012.
plt.figure()
start_years = open_access['Start Year']
start_years.hist(range=(1980, 2012), bins=30, color='green', alpha=.3)
plt.title('Histogram of start year, open access journals');

# Open-access listing ordered by start year.
timeline = open_access.sort('Start Year')
timeline[['Title', 'Start Year', 'End Year']].head(10)

# Share (%) of the listing for each 'Publication fee' value.
fee_counts = open_access['Publication fee'].value_counts()
fee = fee_counts/len(open_access)*100
fee

fee.plot(kind='bar', title='Histogram of fee required (%)', color='green', alpha=.3);

# Number of open-access rows that did not end up in the merged table.
len(open_access) - len(matches)

# Percentage of journals with a non-null entry in each discipline column.
# Scaled by 100 so the values agree with the "(%)" label in the chart
# title, as the other percentage charts in this script do.
discipline_cols = ['Physical sciences', 'Life sciences', 'Social sciences']
closed_field = closed[discipline_cols].count()/len(closed)*100
open_field = matches[discipline_cols].count()/len(matches)*100

# Both series are drawn as bars on the same axes.
closed_field.plot(color='green', kind='bar', alpha=.3);
open_field.plot(color='blue', kind='bar', alpha=.4, title='Comparison of discipline (%)\nGreen=closed, purple=OA');

# Journal counts per country: closed journals define the rows, and the
# open-access counts are aligned onto them by country label.
closed_by_country = closed.Country.value_counts()
countries = pd.DataFrame(closed_by_country, columns=['closed'])
countries['open'] = open_access.Country.value_counts()
# Ratio of open-access to closed journal counts for each country.
countries['proportion_oa'] = countries['open']/countries['closed']

countries_sorted = countries.sort('proportion_oa', ascending=False)
countries_sorted[countries_sorted.proportion_oa >= 0][:10]

# First ten rows of each count column, drawn on the same axes.
# NOTE(review): these are raw counts although the title says "(%)" -- confirm.
countries['closed'].head(10).plot(color='green', kind='bar', alpha=.3)
countries['open'].head(10).plot(color='blue', kind='bar', alpha=.4, title='Comparison of top journal producers (%)\nGreen=closed, purple=OA');

countries_sorted

# Ten largest positive open/closed ratios.
oa_ratio = countries_sorted['proportion_oa']
oa_ratio[oa_ratio > 0].head(10).plot(kind='bar', color='m', rot=30,
                                     title='Countries with highest proportion of OA journals (%)');

# `closed` keeps row labels from `impact`, while `matches` gets a fresh index
# from the merge.  Assigning one column onto the other by label would pair
# unrelated journals and silently drop open-access values, so both samples
# are re-numbered from 0 and placed side by side (the shorter one is padded
# with NaN).
closed_snip = closed['2011 SNIP2'].reset_index(drop=True)
open_snip = matches['2011 SNIP2'].reset_index(drop=True)
snip_dist = pd.DataFrame({'2011 closed SNIP': closed_snip, '2011 open SNIP': open_snip},
                         columns=['2011 closed SNIP', '2011 open SNIP'])

# Boxplot with closed scores of 15 or more blanked out.  Only the closed
# column is trimmed, so no open-access value is lost to a row filter.
snip_box = snip_dist.copy()
snip_box['2011 closed SNIP'] = closed_snip[closed_snip < 15]
snip_box.boxplot(sym='m+');

snip_dist.describe()

# Same construction for the 2011 SJR scores.
closed_sjr = closed['2011 SJR2'].reset_index(drop=True)
open_sjr = matches['2011 SJR2'].reset_index(drop=True)
sjr_dist = pd.DataFrame({'2011 closed SJR': closed_sjr, '2011 open SJR': open_sjr},
                        columns=['2011 closed SJR', '2011 open SJR'])

sjr_box = sjr_dist.copy()
sjr_box['2011 closed SJR'] = closed_sjr[closed_sjr < 15]
sjr_box.boxplot(sym='m+');

sjr_dist.describe()

# Yearly SNIP columns, '1999 SNIP2' through '2011 SNIP2'.
snip_year_cols = ['%d SNIP2' % year for year in range(1999, 2012)]
open_years = matches[snip_year_cols]
closed_years = closed[snip_year_cols]

# Line colours follow the legend spelled out in the title: open journals in
# blue, closed journals in green (dashed).
open_years.mean().plot(style='b');
closed_years.mean().plot(style='g--', title='Mean SNIP score\nGreen=closed journals\nBlue=open journals', rot=30);

# Per-year means side by side, plus the closed-minus-open gap.
clean = pd.DataFrame(open_years.mean(), columns=['open'])
clean['closed'] = closed_years.mean()
clean['diff'] = clean.closed - clean.open
clean

clean['diff'].plot(title='Closed mean - open mean', rot=30, style='m');

matches[['1999 SNIP2', '2011 SNIP2']].describe()

# Yearly SJR columns, '1999 SJR2' through '2011 SJR2'.
sjr_year_cols = ['%d SJR2' % year for year in range(1999, 2012)]
open_years_sjr = matches[sjr_year_cols]
closed_years_sjr = closed[sjr_year_cols]

# Line colours follow the legend spelled out in the title: open journals in
# blue, closed journals in green (dashed).
open_years_sjr.mean().plot(style='b');
closed_years_sjr.mean().plot(style='g--', title='Mean SJR score\nGreen=closed journals\nBlue=open journals', rot=30);

# Per-year gap between the closed and open mean SJR.
sjr_diff = closed_years_sjr.mean() - open_years_sjr.mean()
sjr_diff

sjr_diff.plot(title='Closed journal SJR advantage over open journal', rot=30, ylim=(.0, .75), style='m');

matches[['1999 SJR2', '2011 SJR2']].describe()