from __future__ import division import pandas as pd pd.set_printoptions(max_rows=100, max_columns=10) from scipy import stats import matplotlib.pyplot as plt impact = pd.read_csv('SNIP_SJR_complete_1999_2011new_SNIP_and_SJR_v1_Oct_2012.csv') open_access = pd.read_csv('open_access_journals.csv') def rm_issn_punc(x): import re punc = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", x) space = ''.join(punc.split(" ")) return space def strip_space(x): try: new = int(y) except: new = str(x).strip(' ') return new def membership(x): blean = True open_lst = np.array(open_access.issn_) if x in open_lst: blean = False return blean open_access['issn'] = open_access['ISSN'].map(rm_issn_punc) open_access['issn_'] = open_access['issn'].map(strip_space) impact['issn'] = impact['Print ISSN'].map(strip_space) impact['closed'] = impact['issn'].map(membership) closed = impact[impact.closed == True] matches = pd.merge(impact, open_access, left_on=impact['issn'], right_on=['issn']).drop_duplicates() matches = matches.dropna(how='all') matches = matches.drop_duplicates(cols='Title') matches[['Source Title', 'Title']].head(20) print 'Closed: {0}, Open: {1}, Full list: {2}'.format(len(closed.issn.unique()), len(open_access.issn_.unique()), len(impact.issn.unique())) top_impact_all = impact[['Source Title', '2011 SNIP2']].copy() top_impact_all = pd.DataFrame(top_impact_all.sort('2011 SNIP2', ascending=False).dropna(), columns=['Source Title', '2011 SNIP2']) top_impact_all['2011 SJR2'] = impact['2011 SJR2'] top_impact_all['Difference'] = top_impact_all['2011 SNIP2'] - top_impact_all['2011 SJR2'] top_impact_all.head(15) top_impact_all.sort('2011 SJR2', ascending=False).dropna().head(15) open_lang = open_access.Language.value_counts().head(10)/len(open_access)*100 open_lang open_lang.plot(kind='bar', title='Most common languages, open source journals (%)', color='green', alpha=.3); open_lang = open_access.Keyword.value_counts().head(15)/len(open_access)*100 open_lang 
# `open_lang` holds the keyword shares at this point (see its last assignment).
open_lang.plot(kind='bar',
               title='Most common keywords, open access journals (%)',
               color='green', alpha=.3)

# Start-year distribution of the open-access journals.
plt.figure()
open_access['Start Year'].hist(range=(1980, 2012), bins=30,
                               color='green', alpha=.3)
plt.title('Histogram of start year, open access journals')

timeline = open_access.sort('Start Year')
timeline[['Title', 'Start Year', 'End Year']].head(10)

# Share (%) of each 'Publication fee' value.
fee = open_access['Publication fee'].value_counts() / len(open_access) * 100
fee
fee.plot(kind='bar', title='Histogram of fee required (%)',
         color='green', alpha=.3)

# Open-access journals that found no counterpart in the impact table.
len(open_access) - len(matches)

# Share of journals with a non-null entry per discipline column.  Scaled by
# 100 so the values really are percentages, as the title says and as the
# other "(%)" charts in this file are computed.
discipline_cols = ['Physical sciences', 'Life sciences', 'Social sciences']
closed_field = closed[discipline_cols].count() / len(closed) * 100
open_field = matches[discipline_cols].count() / len(matches) * 100
closed_field.plot(color='green', kind='bar', alpha=.3)
open_field.plot(color='blue', kind='bar', alpha=.4,
                title='Comparison of discipline (%)\nGreen=closed, purple=OA')

# Journal counts per country.  Building the frame from a dict names the
# column explicitly instead of relying on DataFrame(series, columns=[...])
# to rename it; the value_counts ordering (largest first) is preserved.
countries = pd.DataFrame({'closed': closed.Country.value_counts()})
countries['open'] = open_access.Country.value_counts()
# NOTE(review): this is the ratio open/closed, not the OA share of all
# journals, and the two counts come from different source tables -- confirm
# that the wording of the chart title further down is what is intended.
countries['proportion_oa'] = countries['open'] / countries['closed']
countries_sorted = countries.sort('proportion_oa', ascending=False)
countries_sorted[countries_sorted.proportion_oa >= 0][:10]

# Ten countries with the most closed journals.  Counts are converted to a
# percentage of each group's total so the chart matches its "(%)" title and
# the two groups, which differ in size, are comparable.
top_producers = countries.head(10)
(top_producers['closed'] / len(closed) * 100).plot(
    color='green', kind='bar', alpha=.3)
(top_producers['open'] / len(open_access) * 100).plot(
    color='blue', kind='bar', alpha=.4,
    title='Comparison of top journal producers (%)\nGreen=closed, purple=OA')

countries_sorted
countries_sorted.proportion_oa[countries_sorted.proportion_oa > 0].head(10).plot(
    kind='bar', color='m', rot=30,
    title='Countries with highest proportion of OA journals (%)')

# 2011 SNIP distributions of closed vs. open journals.
# `closed` keeps the row labels of `impact`, while `matches` carries the
# labels produced by the merge.  Assigning one into a frame indexed by the
# other aligns on those unrelated labels and silently drops or misplaces
# values, so both series are re-indexed positionally first.
closed_snip = closed['2011 SNIP2'].reset_index(drop=True)
open_snip = matches['2011 SNIP2'].reset_index(drop=True)
snip_dist = pd.DataFrame({'2011 closed SNIP': closed_snip,
                          '2011 open SNIP': open_snip})
# Only closed-journal outliers (SNIP >= 15) are hidden from the box plot.
# Masking the closed series alone avoids discarding the open value that
# merely happens to share a row with a filtered closed value.
pd.DataFrame({'2011 closed SNIP': closed_snip[closed_snip < 15],
              '2011 open SNIP': open_snip}).boxplot(sym='m+')
snip_dist.describe()

# Same construction for 2011 SJR; the frame is plotted by the code that follows.
sjr_dist = pd.DataFrame({
    '2011 closed SJR': closed['2011 SJR2'].reset_index(drop=True),
    '2011 open SJR': matches['2011 SJR2'].reset_index(drop=True),
})
# Box plot of the 2011 SJR distributions.  Closed-journal outliers
# (SJR >= 15) are hidden by masking the closed series only; filtering whole
# rows on the closed column would also throw away the open value stored in
# the same row, although the two are unrelated.
closed_sjr_2011 = sjr_dist['2011 closed SJR']
pd.DataFrame({'2011 closed SJR': closed_sjr_2011[closed_sjr_2011 < 15],
              '2011 open SJR': sjr_dist['2011 open SJR']}).boxplot(sym='m+')
sjr_dist.describe()

# Yearly metric columns, 1999 through 2011 inclusive, generated once instead
# of being spelled out four times.
metric_years = range(1999, 2012)
snip_cols = ['{0} SNIP2'.format(year) for year in metric_years]
sjr_cols = ['{0} SJR2'.format(year) for year in metric_years]

# --- SNIP: mean score per year ------------------------------------------
open_years = matches[snip_cols]
closed_years = closed[snip_cols]
# Colours follow the legend given in the title (closed = green, open = blue),
# which is also the colouring of the bar charts earlier in the file.  The
# original drew open in green and closed in the default blue, i.e. the
# opposite of what the title states.  Closed stays dashed as before.
open_years.mean().plot(style='b')
closed_years.mean().plot(
    style='g--',
    title='Mean SNIP score\nGreen=closed journals\nBlue=open journals',
    rot=30)

# Per-year means side by side plus their difference (closed minus open).
# A dict names the column explicitly rather than relying on
# DataFrame(series, columns=[...]) to do so.
clean = pd.DataFrame({'open': open_years.mean()})
clean['closed'] = closed_years.mean()
clean['diff'] = clean['closed'] - clean['open']
clean
clean['diff'].plot(title='Closed mean - open mean', rot=30, style='m')
matches[['1999 SNIP2', '2011 SNIP2']].describe()

# --- SJR: mean score per year -------------------------------------------
open_years_sjr = matches[sjr_cols]
closed_years_sjr = closed[sjr_cols]
# Same colour convention as the SNIP chart above.
open_years_sjr.mean().plot(style='b')
closed_years_sjr.mean().plot(
    style='g--',
    title='Mean SJR score\nGreen=closed journals\nBlue=open journals',
    rot=30)

sjr_diff = closed_years_sjr.mean() - open_years_sjr.mean()
sjr_diff
sjr_diff.plot(title='Closed journal SJR advantage over open journal',
              rot=30, ylim=(.0, .75), style='m')
matches[['1999 SJR2', '2011 SJR2']].describe()