#!/usr/bin/env python
# coding: utf-8

# # Good Software - analysis notebook
# ## Reproducing all panels from Figure 1
# This notebook contains all Python code necessary to reproduce all panels
# from Figure 1 of our paper, starting from the raw dataset. If you are
# interested in reproducing our results, you can use Binder to get started
# immediately. For more information, visit our wiki page.
#
# Github:     https://github.com/smangul1/good.software/wiki
# Manuscript: https://www.biorxiv.org/content/early/2018/10/25/452532
# Twitter:    https://twitter.com/serghei_mangul/status/1055561147431043072

# ### Importing libraries

# FIX: `pylab` is officially discouraged by matplotlib; only pyplot-level
# functions are used below, so import the pyplot interface under the same
# alias — the rest of the script is unchanged.
import matplotlib.pyplot as pl
import numpy as np
import pandas as pd

# ### Some definitions

# Legend for link statuses (see the code -> status mapping below).
legendStatus = {
    -1: 'Time out',
    1: 'Accessible',
    3: 'Redirected',
    4: 'Broken',
}

# Colors for each status (RGB tuples, keyed by status code).
colors = {
    -1: (1.0, 0.6, 0.2),
    1: (0.2, 0.7, 0.2),
    3: (0.2, 0.4, 1.0),
    4: (1.0, 0., 0.),
}

# Scheme 2: background/foreground pairs for the reachable vs unreachable plots.
colors2 = {
    'bg_unreach': (1.0, 0.8, 0.6),
    'fg_unreach': (0.8, 0., 0.),
    'bg_accessb': (0.7, 1.0, 0.7),
    'fg_accessb': (0.0, 0.0, 0.8),
}

# Will only consider links published in or after this year.
initialYear = 2005

# ### Util functions

def simpleaxis(ax):
    """Strip the top/right spines from *ax* and keep ticks bottom/left."""
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

# # Importing and parsing data

# In[6]:
links = pd.read_csv('links.bulk.csv')
links.head()

# In[7]:
links.info()

# Parsing from code into human-readable status:
#  * -1        : time out
#  * 0..299    : good
#  * 300..399  : redirected
#  * >= 400    : not found
# To make it easier, let's map these codes to -1 for time out, 1 for
# accessible, 3 for redirected, and 4 for not found.
# In[8]: keep only recent links and collapse raw HTTP codes into the four
# coarse categories: -1 time out, 1 accessible, 3 redirected, 4 not found.
links = links.loc[links.year >= initialYear]
links['status'] = links['code'] // 100
links.loc[links['status'] > 4, 'status'] = 4
links.loc[links['status'].between(0, 2), 'status'] = 1

# Let's look at the histogram of statuses.

# In[9]:
status_values = np.array(links.status)
fig, ax = pl.subplots(1, 1)
ax.hist(status_values, np.arange(-1.5, 6, 1), edgecolor=(0.2, 0.2, 0.2), lw=3, color=(0.2, 0.6, 1.0))
ax.set_ylabel('Count')
ax.set_xticks([-1, 1, 3, 4])
ax.set_xticklabels(['Time out', 'Accessible', 'Redirected', 'Not found'], rotation=-45)
pl.show()

# In[10]: counts per category and each category's share of the total.
count, bin_values = np.histogram(status_values, bins=[-1.2, -0.8, 1.8, 3.8, 4.8])
print(count, bin_values)
total = float(count.sum())
for share in count / total:
    print(share)

# ### Filtering out repeated links
# To avoid using repeated links, let's create another dataframe.

# In[11]:
uniqueLinks = links[links['flag.uniqueness'] == 0]
print(uniqueLinks.shape)
uniqueLinks = uniqueLinks.drop(columns=['flag.uniqueness'])
uniqueLinks.head()

# In[12]: papers (ids) that contribute more than one link.
repeated_data = pd.concat(g for _, g in uniqueLinks.groupby("id") if len(g) > 1)
repeated_data.head(15)

# In[13]:
uniqueLinks.info()

# In[14]:
uniqueLinks.describe()

# In[15]:
print('Percent of repeated links: %.1f%%' % ((1 - len(uniqueLinks) / len(links)) * 100.))

# In[16]: status histogram restricted to the unique links.
status_values = np.array(uniqueLinks.status)
fig, ax = pl.subplots(1, 1)
ax.hist(status_values, np.arange(-1.5, 6, 1), edgecolor=(0.2, 0.2, 0.2), lw=3, color=(0.2, 0.6, 1.0))
ax.set_ylabel('Count')
ax.set_xticks([-1, 1, 3, 4])
ax.set_xticklabels(['Time out', 'Accessible', 'Redirected', 'Not found'], rotation=-45)
pl.show()

# In[17]:
np.unique(uniqueLinks.id).shape

# ## Fixing http -> https redirection
#
# Due to increased security awareness in the past few years, a number of
# links marked as redirections are in fact just changing the http protocol
# to https. We will not consider them as redirection, if the rest of the
# URL is the same.
# In[18]: results of re-checking every redirected link for a pure
# http -> https scheme change.
redirection_checking = pd.read_csv('http2https.redirected.csv', header=None,
                                   names=['pubmed_id', 'url', 'status', 'https_redirect'])
redirection_checking.head(10)

# In[19]: links whose only "redirection" is http -> https are re-labelled
# as accessible (status 1).
# NOTE(review): `https_redirect` is compared against the *string* 'True'.
# If pandas inferred that CSV column as boolean, this condition never
# fires — confirm the column's dtype.
status_col = uniqueLinks.columns.get_loc('status')  # was a hard-coded index 6
for index, row in redirection_checking.iterrows():
    if row['https_redirect'] == 'True' and uniqueLinks[ uniqueLinks.link == row['url'] ].shape[0] > 0:
        # Only the first row holding this URL is updated, as in the
        # original analysis.
        idx = np.where(uniqueLinks.link == row['url'])[0][0]
        uniqueLinks.iloc[idx, status_col] = 1

# In[20]:
uniqueLinks.head()

# In[21]: status distribution after the https fix.
statuses = uniqueLinks.status
count, bin_values = np.histogram(statuses, bins = [-1.2,-0.8,1.8,3.8,4.8])
print(count, bin_values)
total = float(count.sum())
for j in range(4):
    print(count[j]/total)

# In[22]: combined share of unreachable links (time out + broken).
(count[0]+count[-1])/total

# ## Constructing a dataframe with only abstracts

# In[23]:
uniqueLinks_abs = uniqueLinks[ uniqueLinks.type == 'abstract' ]
uniqueLinks_abs.info()

# ## Number of entries per journal

def plot_journal_counts(frame, label_x):
    """Horizontal bar chart of the number of entries per journal in *frame*.

    label_x is the x position of the 'Entries count' caption. This code was
    previously duplicated verbatim for the full table and the abstracts-only
    table; only the caption position differed.
    """
    journals = np.array( frame.journal )
    journalTitles = np.unique(journals)
    jhash = {journal: jid for jid, journal in enumerate(journalTitles)}
    listxs = np.arange(0,11,1)
    journals_num = [ jhash[journal] for journal in journals ]
    counts, bins = np.histogram( journals_num, bins=listxs )
    newIDX = np.argsort(counts)
    f, ax = pl.subplots(1, 1)
    ax.barh( bins[0:-1], counts[newIDX], 0.8, edgecolor=(0.2,0.2,0.2), lw=2, color=(0.2,0.6,1.0) )
    ax.spines['bottom'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_top()
    ax.get_yaxis().tick_left()
    ax.text( label_x, 11.3, 'Entries count', fontsize=12 )
    ax.set_yticks( listxs )
    ax.set_yticklabels( journalTitles[newIDX], rotation=0, fontsize=12 )
    pl.show()
    print('Journal name \t\t Count')
    for jid,journal in enumerate(journalTitles[newIDX]):
        print('%s\t\t %d' %(journal[:15],counts[newIDX][jid]) )

# In[24]: all unique links.
plot_journal_counts(uniqueLinks, 5000)

# ## Number of entries per abstract

# In[25]: abstract links only.
plot_journal_counts(uniqueLinks_abs, 1200)

# ## Number of links per year

# In[26]:
uLinks_year = uniqueLinks.groupby(['year', 'status']).agg('count')

# In[27]:
uLinks_year.head(12)

# Grouping by status and retrieving the percent for each status per year.

# In[28]:
tseries = {}
for j,status in enumerate([-1, 1, 3, 4]):
    # NOTE(review): assumes every year contains rows for all four statuses,
    # in sorted order, so that every 4th row of the grouped frame belongs to
    # the same status — verify against the data.
    tseries[status] = np.array( uLinks_year.iloc[j::4].link )

# Plotting

# In[29]: stacked bar chart of link counts per year.
barsize=0.75
f, ax = pl.subplots(1, 1, figsize=(4,2.2) )
b = np.zeros( tseries[status].shape[0] )
years = np.arange(2005,2018,1)
for zlabel,status in enumerate([4,-1,1,3]):
    # Each series is drawn at (own value + running total); decreasing zorder
    # pushes the later, taller bars behind the earlier ones.
    ax.bar( years, tseries[status] + b, barsize, label=legendStatus[status], color=colors[status], zorder=-1-zlabel )
    b += tseries[status]
simpleaxis(ax)
ax.set_ylabel("Number of links")
ax.set_yticks( np.arange(0,4001,1000) )
ax.set_yticklabels( np.array( np.arange(0,4001,1000), dtype=int ), fontsize=8 )
ax.set_xlabel("Year")
ax.set_xticks( np.arange( 2005,2020,3 ) )
ax.set_xticklabels( np.arange( 2005,2020,3 ), fontsize=8 )
pl.legend(bbox_to_anchor=(1,1.4), frameon=False, ncol=4, handletextpad=0.2, columnspacing=0.8, handlelength=1)
pl.tight_layout()
pl.savefig('Figure_1_panels/NumLinks per year - all.pdf')

# ## Percent of links per year
# Normalizing the number of links for each year.
# In[30]: total number of links per year (normalization for the percent plot).
tseries_norm = np.zeros( tseries[-1].shape )
for status in [-1, 1, 3, 4]:
    tseries_norm += tseries[status]

# In[31]: stacked bars of each status's share per year.
barsize=0.75
f, ax = pl.subplots(1, 1, figsize=(4,2.2) )
# `status` is the leftover value from the loop above; only the length of
# this zero vector matters.
b = np.zeros( tseries[status].shape[0] )
years = np.arange(2005,2018,1)
for zlabel,status in enumerate([4,-1,1,3]):
    # Bars are drawn as (own share + running total); decreasing zorder keeps
    # the taller later bars behind the earlier ones.
    ax.bar( years, ( tseries[status]/tseries_norm + b )*100., barsize, label=legendStatus[status], color=colors[status], zorder=-1-zlabel )
    b += tseries[status]/tseries_norm
simpleaxis(ax)
ax.set_ylabel("Percent of links (%)")
ax.set_yticks( np.arange(0,101,25) )
ax.set_yticklabels( np.array( np.arange(0,101,25), dtype=int ), fontsize=8 )
ax.set_xlabel("Year")
ax.set_xticks( np.arange( 2005,2020,3 ) )
ax.set_xticklabels( np.arange( 2005,2020,3 ), fontsize=8 )
pl.legend(bbox_to_anchor=(1,1.4), frameon=False, ncol=4, handletextpad=0.2, columnspacing=0.8, handlelength=1)
pl.tight_layout()
pl.savefig('Figure_1_panels/Percent per year - all.pdf')

# ## Number of links as function of time
# Counting total number per year, then plotting:

# In[32]: reachable vs unreachable link counts per year.
f, ax = pl.subplots( 1, 1, figsize=(3.4, 2.) )
pl.plot( years, tseries[1] + tseries[3], 'o-', markersize=7, label='Accessible + Redirected', color=colors2['fg_accessb'], markeredgecolor=colors2['fg_accessb'], markerfacecolor=colors2['bg_accessb'] )
pl.plot( years, tseries[-1] + tseries[4], 's-', markersize=6, label='Broken + Time out', color=colors2['fg_unreach'], markeredgecolor=colors2['fg_unreach'], markerfacecolor=colors2['bg_unreach'] )
simpleaxis(ax)
pl.ylabel('Number of links')
pl.ylim(0,4050)
pl.xlabel('Time (years)')
pl.xlim(2004.5,2017.5)
pl.legend(bbox_to_anchor=(0.9,1.5), frameon=False, handlelength=1.3)
pl.tight_layout()
pl.savefig('Figure_1_panels/Total per year - all.pdf')

# Ratio between accessible links in 2017 in reference to that in 2005.

# In[33]:
numAccessible = tseries[1] + tseries[3]
ratio = numAccessible[-1] / numAccessible[0]
print('Ratio: %.2f' % ratio)
avgGrowthRatio = (numAccessible[-1] - numAccessible[0]) / (2017-2005) / numAccessible[0]
print( 'Average growth ratio: %.1f%%' % (avgGrowthRatio*100) )

# Ratio between unreachable links in 2017 in reference to that in 2005.

# In[34]:
numUnreachable = tseries[-1] + tseries[4]
ratio = numUnreachable[-1] / numUnreachable[0]
print('Ratio: %.2f' % ratio)
avgGrowthRatio = (numUnreachable[-1] - numUnreachable[0]) / (2017-2005) / numUnreachable[0]
print( 'Average growth ratio: %.1f%%' % (avgGrowthRatio*100) )

# # Parsing Github and SourceForge

# In[35]:
def count_status(statuses):
    """Print the histogram over the four status bins and each bin's share."""
    count, bin_values = np.histogram(statuses, bins = [-1.2,-0.8,1.8,3.8,4.8])
    print(count, bin_values)
    total = float(count.sum())
    for j in range(4):
        print(count[j]/total)

# In[36]: split abstract links by hosting service.
find_github = lambda x: 'github.com' in x.link
find_sourceforge = lambda x: 'sourceforge.net' in x.link
find_others = lambda x: ('github.com' not in x.link) and ('sourceforge.net' not in x.link)
uniqLabs_github = uniqueLinks_abs[ uniqueLinks_abs.apply(find_github, axis=1) ]
uniqLabs_sourceforge = uniqueLinks_abs[ uniqueLinks_abs.apply(find_sourceforge, axis=1) ]
uniqLabs_others = uniqueLinks_abs[ uniqueLinks_abs.apply(find_others, axis=1) ]
count_status(uniqLabs_github.status)
count_status(uniqLabs_sourceforge.status)
count_status(uniqLabs_others.status)

# In[37]: stacked status shares for Others / SourceForge / GitHub.
# NOTE(review): these arrays are hard-coded copies of the values printed by
# the count_status() calls above — they silently go stale if the dataset
# changes.
f, ax = pl.subplots( 1, 1, figsize=(1.5, 1.4) )
# let's make sure the order is maintained:
data_broken = np.array([0.08827085852478839, 0.004672897196261682,0])
data_timeout = np.array([0.17623941958887546, 0.004672897196261682, 0.0084985835694051])
data_accessible = np.array([0.4068923821039903, 0.7009345794392523, 0.9036827195467422])
data_redirected = np.array([0.32859733978234584, 0.2897196261682243, 0.08781869688385269])
# Rebinds the module-level `colors` dict (same values as at the top of the
# script).
colors = { -1 : (1.0,0.6,0.2), 1 : (0.2,0.7,0.2), 3 : (0.2,0.4,1.0), 4 : (1.0,0.,0.) }
# Cumulative totals drawn back-to-front to fake a stacked bar chart.
pl.bar( [0,1,2], data_broken+data_timeout+data_accessible+data_redirected, 0.7, color=colors[3], lw=1.3, capsize=8 )
pl.bar( [0,1,2], data_broken+data_timeout+data_accessible, 0.7, color=colors[1], lw=1.3, capsize=8 )
pl.bar( [0,1,2], data_broken+data_timeout, 0.7, color=colors[-1], lw=1.3, capsize=8 )
pl.bar( [0,1,2], data_broken, 0.7, color=colors[4], lw=1.3, capsize=8 )
ax.set_ylim(0,1)
ax.set_ylabel('Percent of Links')
ax.set_xlabel('')
ax.set_xlim(-0.5,2.5)
ax.set_xticks([0,1,2])
ax.set_xticklabels( ['Others','Source\nForge','GitHub'], rotation=-25, fontsize=9)
simpleaxis(ax)
pl.savefig('Figure_1_panels/Percent - github sourceforge.pdf')

# In[38]: unreachable (broken + timed-out) share per service, in percent.
f, ax = pl.subplots(1, 1, figsize=(2.,1.6))
data_broken = np.array([0.08827085852478839, 0.004672897196261682,0])*100
data_timeout = np.array([0.17623941958887546, 0.004672897196261682, 0.0084985835694051])*100
data_unreachable = data_broken + data_timeout
ax.barh( [0,1,2], data_unreachable, 0.6, edgecolor=(1.0,0.0,0.0), lw=0.1, color=(1.0,0.0,0.0) )
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.text( -0.05, -1.5, 'Unreachable links (%)', fontsize=12 )
ax.set_xticks( [0, 10, 20, 30] )
ax.set_yticks( [0, 1, 2] )
ax.set_yticklabels( ['Others', 'Source\nForge', 'GitHub'], rotation=0, fontsize=11 )
pl.savefig('Figure_1_panels/Percent - github sourceforge_I.pdf')

# In[39]: percent of abstract links pointing to GitHub / SourceForge per year.
uniqLabs_github = uniqueLinks_abs[ uniqueLinks_abs.apply(find_github, axis=1) ]
tseries_norm = uniqueLinks_abs.groupby(['year']).agg('count').type
# NOTE(review): the slices below assume GitHub links start in 2010 and that
# the grouped index runs 2005..2017 — confirm against the data.
tseries_GH = uniqLabs_github.groupby(['year']).agg('count').type[:-1]/tseries_norm[5:-1]*100
years_gh = range(2010,2017)
tseries_SF = uniqLabs_sourceforge.groupby(['year']).agg('count').type/tseries_norm[:-1]*100
years_sf = range(2005,2017)
f, ax = pl.subplots( 1, 1, figsize=(2.4, 2.) )
gh_color=(0.0,0.0,0.5)
gh_incolor=(0.3,0.8,1.0)
pl.plot( years_gh, tseries_GH, 's-', zorder=2, markersize=5, label='GitHub', color=gh_color, markeredgecolor=gh_color, markerfacecolor=gh_incolor )
sf_color=(0.8,0.0,0.0)
sf_incolor=(1.0,0.8,0.8)
pl.plot( years_sf, tseries_SF, 'o-', zorder=1, markersize=5, label='SourceForge', color=sf_color, markeredgecolor=sf_color, markerfacecolor=sf_incolor )
simpleaxis(ax)
pl.ylabel('Percent of links')
pl.ylim(0,22)
ax.set_xticks([2005, 2010, 2015])
pl.xlabel('Year')
pl.xlim(2004.5,2016.5)
pl.legend(bbox_to_anchor=(0.85,1.), frameon=False, handlelength=1.3, fontsize=10)
pl.tight_layout()
pl.savefig('Figure_1_panels/Total per year - github.pdf')

# ## Number of links per year - abstract only

# In[40]:
uLinks_ya = uniqueLinks_abs.groupby(['year', 'status']).agg('count')
tseries_abs = {}
for j,status in enumerate([-1, 1, 3, 4]):
    # NOTE(review): like In[28], assumes all four statuses occur every year.
    tseries_abs[status] = np.array( uLinks_ya.iloc[j::4].link )

# In[41]: stacked counts per year, abstract links only.
barsize=0.75
f, ax = pl.subplots(1, 1, figsize=(4,2.2) )
b = np.zeros( tseries_abs[status].shape[0] )
years = np.arange(2005,2018,1)
for zlabel,status in enumerate([4,-1,1,3]):
    ax.bar( years, tseries_abs[status] + b, barsize, label=legendStatus[status], color=colors[status], zorder=-1-zlabel )
    b += tseries_abs[status]
simpleaxis(ax)
ax.set_ylabel("Number of links")
#ax.set_yticks( np.arange(0,2001,500) )
#ax.set_yticklabels( np.array( np.arange(0,2001,500), dtype=int ), fontsize=8 )
ax.set_xlabel("Year")
ax.set_xticks( np.arange( 2005,2020,3 ) )
ax.set_xticklabels( np.arange( 2005,2020,3 ), fontsize=8 )
pl.legend(bbox_to_anchor=(1,1.4), frameon=False, ncol=4, handletextpad=0.2, columnspacing=0.8, handlelength=1)
pl.tight_layout()
pl.savefig('Figure_1_panels/NumLinks per year - Abstract.pdf')

# In[42]: per-year totals for normalization (abstracts only).
tseries_norm = np.zeros( tseries_abs[-1].shape )
for status in [-1, 1, 3, 4]:
    tseries_norm += tseries_abs[status]

# In[43]: stacked percentages per year, abstract links only.
barsize=0.75
f, ax = pl.subplots(1, 1, figsize=(4,2.2) )
b = np.zeros( tseries_abs[status].shape[0] )
years = np.arange(2005,2018,1)
for zlabel,status in enumerate([4,-1,1,3]):
    ax.bar( years, 100*(tseries_abs[status] + b)/tseries_norm, barsize, label=legendStatus[status], color=colors[status], zorder=-1-zlabel )
    b += tseries_abs[status]
simpleaxis(ax)
ax.set_ylabel("Percent of links (%)")
ax.set_yticks( np.arange(0,101,25) )
ax.set_yticklabels( np.array( np.arange(0,101,25), dtype=int ), fontsize=8 )
ax.set_xlabel("Year")
ax.set_xticks( np.arange( 2005,2020,3 ) )
ax.set_xticklabels( np.arange( 2005,2020,3 ), fontsize=8 )
pl.legend(bbox_to_anchor=(1,1.4), frameon=False, ncol=4, handletextpad=0.2, columnspacing=0.8, handlelength=1)
pl.tight_layout()
pl.savefig('Figure_1_panels/Percent per year - Abstract.pdf')

# ## Status vs IF of current journal
# List of journals considered in this study:

# In[44]:
journals = uniqueLinks.journal.unique()
print( journals )

# Impact factors were retrieved from this address:
# http://admin-apps.webofknowledge.com/JCR/JCR?RQ=HOME

# In[45]:
IFdata = pd.read_csv('impact_factors_timeseries.csv')
IFdata = IFdata.fillna(0)
IFdata

# In[46]: per-journal counts of each status (10 journals).
count = {}
Z = np.zeros( (10) )
for status in [1,3,-1,4]:
    count[status] = np.zeros( (10) )

def journalID(journalName):
    # Index of *journalName* in the `journals` array, or -1 if absent.
    if journalName in journals:
        return np.argwhere(journals == journalName)[0][0]
    else:
        return -1

for index, row in uniqueLinks.iterrows():
    jid = journalID(row.journal)
    if jid != -1:
        count[row['status']][ jid ] += 1
        Z[ jid ] += 1
# Normalizing: convert per-journal counts into fractions.
for status in [1,3,-1,4]:
    count[status] /= Z

# In[47]: current impact factors (first table row), sort order by IF.
IFs = np.array( IFdata.iloc[0] )[1:]
newIDX = np.argsort( IFs )
IFs = IFs[newIDX]

# In[49]: status composition per journal, ordered by current IF.
barsize=0.7
f, ax = pl.subplots(1, 1, figsize=(3.6,2.0) )
IFs_x = np.arange(10)
# Cumulative stacking drawn back-to-front via decreasing zorder.
ax.bar( IFs_x, count[4][newIDX], barsize, label='Broken', color=colors[4], zorder=-1 )
b = np.copy( count[4][newIDX] )
ax.bar( IFs_x, b+count[-1][newIDX], barsize, label='Time out', color=colors[-1], zorder=-2 )
b += count[-1][newIDX]
ax.bar( IFs_x, b+count[1][newIDX], barsize, label='Good', color=colors[1], zorder=-3 )
b += count[1][newIDX]
ax.bar( IFs_x, b+count[3][newIDX], barsize, label='Redirect', color=colors[3], zorder=-4 )
simpleaxis(ax)
ax.set_ylabel("Percent (%)")
ax.set_yticks( np.arange(0,1.1,0.25) )
ax.set_yticklabels( np.array( np.arange(0,1.1,0.25)*100, dtype=int ), fontsize=8 )
ax.set_xlabel("Current Impact Factor")
ax.set_xticks( IFs_x )
# Tick labels are the sorted impact factors, hard-coded.
ax.set_xticklabels( ['2.3','2.5','3.7','4.5','7.1','7.3','10','12','25','41'], fontsize=8, )
pl.tight_layout()
pl.savefig('Figure_1_panels/Percent per IF.pdf')

# ## Status vs IF - abstracts only
# List of journals considered in this study:

# In[50]:
journals = uniqueLinks_abs.journal.unique()
print( journals )

# Impact factors were retrieved from this address:
# http://admin-apps.webofknowledge.com/JCR/JCR?RQ=HOME

# In[51]: same per-journal counting as In[46], abstracts only.
count = {}
Z = np.zeros( (10) )
for status in [1,3,-1,4]:
    count[status] = np.zeros( (10) )

def journalID(journalName):
    # Index of *journalName* in `journals`, or -1 if absent.
    if journalName in journals:
        return np.argwhere(journals == journalName)[0][0]
    else:
        return -1

for index, row in uniqueLinks_abs.iterrows():
    jid = journalID(row.journal)
    if jid != -1:
        count[row['status']][ jid ] += 1
        Z[ jid ] += 1

# Normalizing
for status in [1,3,-1,4]:
    count[status] /= Z

# In[52]:
IFs = np.array( IFdata.iloc[0] )[1:]
newIDX = np.argsort( IFs )
IFs = IFs[newIDX]

# In[53]: same plot as In[49], abstracts only.
barsize=0.7
f, ax = pl.subplots(1, 1, figsize=(3.5,2.2) )
IFs_x = np.arange(10)
ax.bar( IFs_x, count[4][newIDX], barsize, label='Broken', color=colors[4], zorder=-1 )
b = np.copy( count[4][newIDX] )
ax.bar( IFs_x, b+count[-1][newIDX], barsize, label='Time out', color=colors[-1], zorder=-2 )
b += count[-1][newIDX]
ax.bar( IFs_x, b+count[1][newIDX], barsize, label='Good', color=colors[1], zorder=-3 )
b += count[1][newIDX]
ax.bar( IFs_x, b+count[3][newIDX], barsize, label='Redirect', color=colors[3], zorder=-4 )
simpleaxis(ax)
ax.set_ylabel("Percent (%)")
ax.set_yticks( np.arange(0,1.1,0.25) )
ax.set_yticklabels( np.array( np.arange(0,1.1,0.25)*100, dtype=int ), fontsize=8 )
ax.set_xlabel("Current Impact Factor")
ax.set_xticks( IFs_x )
ax.set_xticklabels( ['2.3','2.5','3.7','4.5','7.1','7.3','10','12','25','41'], fontsize=8, )
pl.tight_layout()
pl.savefig('Figure_1_panels/Percent per IF - abstracts.pdf')

# ## Status vs IF at the time of publication

# In[54]: impact factor of the hosting journal in the publication year.
IFpub = {}
for status in [1,3,-1,4]:
    IFpub[status] = []
for index, row in uniqueLinks.iterrows():
    year = row.year
    if ( year >= 2003 ) and ( year <= 2017 ) :
        # there's no IF for 2017 yet
        if year == 2017:
            year = 2016
        # Assumes IFdata rows are ordered newest-first: row 0 is 2016 and
        # row 13 is 2003 — TODO confirm against the CSV.
        yearIDX = -(year - 2003) + 13
        IF_at_time = IFdata[row.journal].iloc[ yearIDX ]
        IFpub[row.status].append( IF_at_time )

# In[55]: violin plot of IF-at-publication per status.
f, ax = pl.subplots( 1, 1, figsize=(2.8, 2.0) )
# let's make sure the order is maintained:
data = [IFpub[4], IFpub[-1], IFpub[1], IFpub[3]]
parts = ax.violinplot( data, [0,1,2,3], showmeans=False, showmedians=False, showextrema=False, widths=0.7, points=200 )
# NOTE: rebinds `colors` to a list (broken/timeout/accessible/redirected);
# In[58] below rebinds it back to the status-keyed dict.
colors=[(1.0,0.2,0.1),(1.0,0.6,0.2),(0.3,0.9,0.3),(0.3,0.7,1.0)]
j = 0
for pc in parts['bodies']:
    pc.set_facecolor(colors[j])
    pc.set_edgecolor( (0.2,0.2,0.2) )
    pc.set_edgecolors( (0.2,0.2,0.2) )  # redundant alias of the call above
    pc.set_alpha(1)
    j += 1

def buttpos(x):
    # Short horizontal segment centred on x, used to draw the mean marker.
    return [x-0.05,x+0.05]

for j in range(4):
    pl.plot(buttpos(j), [np.mean(data[j])]*2, color="k", linewidth=6, solid_capstyle="butt", zorder=4)
for j in range(4):
    ax.text( j-0.4, -2, 'n='+str(len(data[j])), fontsize=7 )
ax.set_ylim(-3,14)
ax.set_ylabel('Impact factor')
ax.set_xlabel('')
ax.set_xticks([0,1,2,3])
ax.set_xticklabels( ['Broken','Timeout','Accessible','Redirected'], rotation=-25, fontsize=8 )
simpleaxis(ax)
pl.savefig('Figure_1_panels/IF vs status.pdf')

# In[56]: one-way ANOVA across the four status groups.
from scipy import stats
f_value, p_value = stats.f_oneway(data[0], data[1], data[3], data[2])
print(f_value, p_value)

# ## Evolution of IF of each category over the years

# In[57]:
IFpub = {}
Z = {}
for status in [1,3,-1,4]:
    IFpub[status] = np.zeros(12)
    Z[status] = np.zeros(12)
for index, row in uniqueLinks.iterrows():
    year = row.year
    if ( year >= 2003 ) and ( year <= 2017 ) :
        # there's no IF for 2017 yet
        if year == 2017:
            year = 2016
        # NOTE(review): yearIDX = year - 2005 is used BOTH as the accumulator
        # slot and as the row index into IFdata, whereas In[54] indexed
        # IFdata with -(year-2003)+13 (newest-first). The two conventions
        # cannot both be right — verify the row order of
        # impact_factors_timeseries.csv.
        yearIDX = year - 2005
        IF_at_time = IFdata[row.journal].iloc[ yearIDX ]
        IFpub[row.status][yearIDX] += IF_at_time
        Z[row.status][yearIDX] += 1

# In[58]: mean IF per status per year.
colors = { -1 : (1.0,0.6,0.2), 1 : (0.2,0.7,0.2), 3 : (0.2,0.4,1.0), 4 : (1.0,0.,0.) }
for s in [4,-1,1,3]:
    pl.plot( IFpub[s]/Z[s], color=colors[s] )

# ## Retrieving altmetric data

# In[59]:
def parseAltmetricData(filename):
    """Parse the Altmetric export into {pmid: {score, numReaders, cited, scopus}}.

    Each data line is expected to look like
        pmid,score,numReaders,cited,[scopus ...]
    i.e. comma-separated scalars followed by a bracketed scopus field
    (the first line of the file is a header and is skipped).
    """
    parsedData = {}
    inputfile = open( filename, 'r' )
    # skipping first row
    inputfile.readline()
    for line in inputfile:
        lineSplit = line.split('\n')[0].split('[')
        ls_comma = lineSplit[0].split(',')
        pmid = int( ls_comma[0] )
        parsedData[pmid] = {}
        score = float( ls_comma[1] )
        parsedData[pmid]['score'] = score
        numReaders = int( ls_comma[2] )
        parsedData[pmid]['numReaders'] = numReaders
        cited = int( ls_comma[3] )
        parsedData[pmid]['cited'] = cited
        scopus = lineSplit[1][:-1]
        parsedData[pmid]['scopus'] = scopus
    return parsedData

dictAltmetric = parseAltmetricData('./links.bulk.altmetric.csv')

# In[60]:
print( 'Number of Altmetric records: %d ' % (len(dictAltmetric.keys())) )

# In[61]:
import matplotlib as mpl
mpl.rcParams['hatch.linewidth'] = 3  # previous pdf hatch linewidth
from statsmodels.stats.multicomp import MultiComparison

# ## Altmetric score

# In[62]: collect Altmetric scores per link status.
scores = {}
for status in [1,3,-1,4]:
    scores[status] = []
for index, row in uniqueLinks.iterrows():
    if row.id in dictAltmetric.keys():
        # NOTE(review): `year` here is a stale leftover from the In[57]
        # loop, so numYears is the same constant for every row; it is also
        # unused in this cell. The readership/citation cells further down
        # divide by the same stale value — verify.
        numYears = (year - 2003)
        scores[ row.status ].append( dictAltmetric[row.id]['score'] )

# In[63]: violin plot of Altmetric scores per status.
f, ax = pl.subplots( 1, 1, figsize=(3.5, 1.5) )
# let's make sure the order is maintained:
data = [scores[4], scores[-1], scores[1], scores[3]]
parts = ax.violinplot( data, [0,1,2,3], showmeans=False, showmedians=False, showextrema=False, widths=0.7, points=500 )
colors=[(1.0,0.2,0.1),(1.0,0.6,0.2),(0.3,0.9,0.3),(0.3,0.7,1.0)]
j = 0
for pc in parts['bodies']:
    pc.set_facecolor(colors[j])
    pc.set_edgecolor( (0.2,0.2,0.2) )
    pc.set_edgecolors( (0.2,0.2,0.2) )  # redundant alias of the call above
    pc.set_alpha(1)
    j += 1

def buttpos(x):
    # Short horizontal segment centred on x, used to draw the mean marker.
    return [x-0.05,x+0.05]

for j in range(4):
    pl.plot(buttpos(j), [np.mean(data[j])]*2, color="k", linewidth=6, solid_capstyle="butt", zorder=4)
for j in range(4):
    ax.text( j-0.3, -2.5, 'n='+str(len(data[j])), fontsize=7 )
ax.set_ylim(-4,30)
ax.set_ylabel('Altmetric score')
ax.set_xlabel('')
ax.set_xticks([0,1,2,3])
ax.set_xticklabels( ['Broken','Timeout','Accessible','Redirected'], rotation=-25, fontsize=8)
simpleaxis(ax)
pl.savefig('Figure_1_panels/Altmetric Score vs status.pdf')

# In[64]: mean score per group (Accessible pools statuses 1 and 3).
f, ax = pl.subplots( 1, 1, figsize=(1.8, 1.4) )
# let's make sure the order is maintained:
data = [ np.mean(scores[4]), np.mean(scores[-1]), np.mean(scores[1] + scores[3]) ]
# Standard error of the mean for each group.
vardata = [ np.std(scores[4])/np.sqrt(len(scores[4])), np.std(scores[-1])/np.sqrt(len(scores[-1])), np.std(scores[1] + scores[3])/np.sqrt(len(scores[1] + scores[3])) ]
colors=[(1.0,0.2,0.1),(1.0,0.6,0.2),(0.5,1.0,0.5)]
pl.bar( [0,1,2], data, 0.7, yerr=vardata, color=colors, edgecolor=[(0.2,0.2,0.2)]*3, lw=1.3, capsize=8 )
# Redraw the pooled bar with hatching, then once more for a clean outline.
pl.bar( [2], data[2:], 0.7, yerr=vardata[2:], color=colors[2:], hatch="//", edgecolor=[(0.1,0.4,1.0)]*3, lw=1.5, capsize=8 )
pl.bar( [2], data[2:], 0.7, yerr=vardata[2:], color='none', edgecolor=[(0.0,0.0,0.0)]*3, lw=1.5, capsize=8 )
ax.set_ylabel('Altmetric score')
ax.set_ylim(0,18)
ax.set_yticks([0,5,10,15])
ax.set_xlabel('')
ax.set_xticks([0,1,2])
ax.set_xlim(-0.5,2.5)
ax.set_xticklabels( ['Broken','Timeout','Accessible'], rotation=-25, fontsize=9)
# Significance annotation and save for the In[64] bar plot above.
ax.text(1.3, 18, '*', fontsize=12)
ax.plot([2,2,0.5,0.5], [16,18,18,16], color='k', lw=1.5)
ax.plot([1,0], [16,16], color='k', lw=1.5)
simpleaxis(ax)
pl.savefig('Figure_1_panels/Altmetric Score vs status - barplot.pdf')

# In[66]: Kruskal-Wallis across the four status groups, then Tukey HSD with
# Accessible and Redirected pooled into one category (label 1).
from scipy.stats.mstats import kruskalwallis
print(kruskalwallis(scores[-1],scores[4],scores[1],scores[3]))
lists = scores[-1]+scores[4]+scores[1]+scores[3]
categories = len(scores[-1])*[-1]+len(scores[4])*[4]+len(scores[1]+scores[3])*[1]
mc = MultiComparison(lists, categories)
result = mc.tukeyhsd()
print(result)
print(mc.groupsunique)

# ## Altmetric number of readers

# In[67]: readers per year since publication, per link status.
scores = {}
for status in [1,3,-1,4]:
    scores[status] = []
for index, row in uniqueLinks.iterrows():
    if row.id in dictAltmetric.keys():
        # FIX: the original computed `numYears = (year - 2003)` where `year`
        # was a stale loop variable left over from an earlier cell, i.e. the
        # same constant for every paper. Use the row's own publication year
        # so the quantity really is "per year since publication". Rows are
        # filtered to year >= 2005, so the divisor is always >= 2.
        numYears = (row.year - 2003)
        scores[ row.status ].append( dictAltmetric[row.id]['numReaders'] / numYears )

# In[68]: violin plot of readership per status.
f, ax = pl.subplots( 1, 1, figsize=(3.5, 1.8) )
# let's make sure the order is maintained:
data = [scores[4], scores[-1], scores[1], scores[3]]
parts = ax.violinplot( data, [0,1,2,3], showmeans=False, showmedians=False, showextrema=False, widths=0.7, points=500 )
colors=[(1.0,0.2,0.1),(1.0,0.6,0.2),(0.3,0.9,0.3),(0.3,0.7,1.0)]
j = 0
for pc in parts['bodies']:
    pc.set_facecolor(colors[j])
    pc.set_edgecolor( (0.2,0.2,0.2) )  # (dropped the redundant set_edgecolors alias)
    pc.set_alpha(1)
    j += 1

def buttpos(x):
    """Short horizontal interval centred on x, used to draw the mean marker."""
    return [x-0.05,x+0.05]

for j in range(4):
    pl.plot(buttpos(j), [np.mean(data[j])]*2, color="k", linewidth=6, solid_capstyle="butt", zorder=4)
# for j in range(4):
#     ax.text( j-0.4, -2, 'n='+str(len(data[j])), fontsize=7 )
ax.set_ylim(0,20)
ax.set_ylabel('Altmetric readership')
ax.set_xlabel('')
ax.set_xticks([0,1,2,3])
ax.set_xticklabels( ['Broken','Timeout','Accessible','Redirected'], rotation=-25 )
simpleaxis(ax)
pl.savefig('Figure_1_panels/Altmetric numReaders vs status.pdf')

# In[70]: mean readership per group (Accessible pools statuses 1 and 3).
f, ax = pl.subplots( 1, 1, figsize=(1.8, 1.4) )
# let's make sure the order is maintained:
data = [ np.mean(scores[4]), np.mean(scores[-1]), np.mean(scores[1] + scores[3]) ]
# Standard error of the mean per group.
vardata = [ np.std(scores[4])/np.sqrt(len(scores[4])), np.std(scores[-1])/np.sqrt(len(scores[-1])), np.std(scores[1] + scores[3])/np.sqrt(len(scores[1] + scores[3])) ]
colors=[(1.0,0.2,0.1),(1.0,0.6,0.2),(0.5,1.0,0.5)]
pl.bar( [0,1,2], data, 0.7, yerr=vardata, color=colors, edgecolor=[(0.2,0.2,0.2)]*3, lw=1.3, capsize=8 )
# Redraw the pooled bar with hatching, then once more for a clean outline.
pl.bar( [2], data[2:], 0.7, yerr=vardata[2:], color=colors[2:], hatch="//", edgecolor=[(0.1,0.4,1.0)]*3, lw=1.5, capsize=8 )
pl.bar( [2], data[2:], 0.7, yerr=vardata[2:], color='none', edgecolor=[(0.0,0.0,0.0)]*3, lw=1.5, capsize=8 )
ax.set_ylim(0,10)
ax.set_ylabel('Altmetric \n readership')
ax.set_xlabel('')
ax.set_xlim(-0.5,2.5)
ax.set_xticks([0,1,2])
ax.set_xticklabels( ['Broken','Timeout','Accessible'], rotation=-25, fontsize=9)
simpleaxis(ax)
pl.savefig('Figure_1_panels/Altmetric numReaders vs status - barplot.pdf')

# In[71]: Tukey HSD on readership (Accessible+Redirected pooled).
lists = scores[-1]+scores[4]+scores[1]+scores[3]
categories = len(scores[-1])*[-1]+len(scores[4])*[4]+len(scores[1]+scores[3])*[1]
mc = MultiComparison(lists, categories)
result = mc.tukeyhsd()
print(result)
print(mc.groupsunique)

# ## Altmetric citation

# In[72]: social-media citations per year since publication, per status.
scores = {}
for status in [1,3,-1,4]:
    scores[status] = []
for index, row in uniqueLinks.iterrows():
    if row.id in dictAltmetric.keys():
        # FIX: same stale-`year` bug as in the readership cell above —
        # use the row's own publication year.
        numYears = (row.year - 2003)
        scores[ row.status ].append( dictAltmetric[row.id]['cited'] / numYears )

# In[73]: violin plot of citations per year, per status.
f, ax = pl.subplots( 1, 1, figsize=(3.5, 1.5) )
# let's make sure the order is maintained:
data = [scores[4], scores[-1], scores[1], scores[3]]
parts = ax.violinplot( data, [0,1,2,3], showmeans=False, showmedians=False, showextrema=False, widths=0.7, points=500 )
colors=[(1.0,0.2,0.1),(1.0,0.6,0.2),(0.3,0.9,0.3),(0.3,0.7,1.0)]
j = 0
for pc in parts['bodies']:
    pc.set_facecolor(colors[j])
    pc.set_edgecolor( (0.2,0.2,0.2) )
    pc.set_alpha(1)
    j += 1

def buttpos(x):
    """Short horizontal interval centred on x, used to draw the mean marker."""
    return [x-0.05,x+0.05]

for j in range(4):
    pl.plot(buttpos(j), [np.mean(data[j])]*2, color="k", linewidth=6, solid_capstyle="butt", zorder=4)
# for j in range(4):
#     ax.text( j-0.4, -2, 'n='+str(len(data[j])), fontsize=7 )
ax.set_ylim(0,9)
ax.set_ylabel('Social media \n citations per year')
ax.set_yticks([0,2,4,6,8])
ax.set_xlabel('')
ax.set_xticks([0,1,2,3])
ax.set_xticklabels( ['Broken','Timeout','Accessible','Redirected'], rotation=-25, fontsize=8 )
simpleaxis(ax)
pl.savefig('Figure_1_panels/Altmetric Cited vs status.pdf')

# In[74]: mean citations per year per group, with significance bracket.
f, ax = pl.subplots( 1, 1, figsize=(1.8, 1.4) )
# let's make sure the order is maintained:
data = [ np.mean(scores[4]), np.mean(scores[-1]), np.mean(scores[1] + scores[3]) ]
vardata = [ np.std(scores[4])/np.sqrt(len(scores[4])), np.std(scores[-1])/np.sqrt(len(scores[-1])), np.std(scores[1] + scores[3])/np.sqrt(len(scores[1] + scores[3])) ]
colors=[(1.0,0.2,0.1),(1.0,0.6,0.2),(0.5,1.0,0.5)]
pl.bar( [0,1,2], data, 0.7, yerr=vardata, color=colors, edgecolor=[(0.2,0.2,0.2)]*3, lw=1.3, capsize=8 )
pl.bar( [2], data[2:], 0.7, yerr=vardata[2:], color=colors[2:], hatch="//", edgecolor=[(0.1,0.4,1.0)]*3, lw=1.5, capsize=8 )
pl.bar( [2], data[2:], 0.7, yerr=vardata[2:], color='none', edgecolor=[(0.0,0.0,0.0)]*3, lw=1.5, capsize=8 )
ax.set_ylim(0,5)
ax.set_ylabel('Social media \n citations per year')
ax.set_xlabel('')
ax.set_xlim(-0.5,2.5)
ax.set_xticks([0,1,2])
ax.set_xticklabels( ['Broken','Timeout','Accessible'], rotation=-25, fontsize=9)
ax.text(1.3, 4.7, '*', fontsize=12)
ax.plot([2,2,0.5,0.5], [4,4.7,4.7,4], color='k', lw=1.5)
ax.plot([0,1], [4,4], color='k', lw=1.5)
simpleaxis(ax)
pl.savefig('Figure_1_panels/Altmetric Cited vs status - barplot.pdf')

# In[75]: Kruskal-Wallis and Tukey HSD for the citation data.
lists = scores[-1]+scores[4]+scores[1]+scores[3]
categories = len(scores[-1])*[-1]+len(scores[4])*[4]+len(scores[1]+scores[3])*[1]
from scipy.stats.mstats import kruskalwallis
print(kruskalwallis(scores[-1],scores[4],scores[1],scores[3]))
mc = MultiComparison(lists, categories)
result = mc.tukeyhsd()
print(result)
print(mc.groupsunique)