In [1]:

%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.api import Logit
import patsy

from joblib import load, dump

import model_functions as mf
import eval_measures as ems

from IPython.display import display

MF reloaded 1
EMS reloaded

In [2]:

sns.set_context("paper")
sns.set_style("ticks")

In [3]:

%%time
with pd.HDFStore('out/Training_2002_2005.h5') as cstore:
    df_first = cstore['first_author']
    df_last = cstore['last_author']
    
print df_first.shape, df_last.shape
df_first.columns

(41618369, 56) (41618369, 56)
CPU times: user 1min 12s, sys: 38.9 s, total: 1min 51s
Wall time: 1min 51s

In [4]:

df_first.columns

Out[4]:

Index([u'source_id', u'source_year', u'source_j', u'source_n_mesh',
       u'source_n_mesh_ex', u'source_is_eng', u'source_country',
       u'source_is_journal', u'source_is_review', u'source_is_case_rep',
       u'source_is_let_ed_com', u'source_T_novelty', u'source_V_novelty',
       u'source_PT_novelty', u'source_PV_novelty', u'source_ncites',
       u'source_n_authors', u'sink_id', u'sink_year', u'sink_j',
       u'sink_n_mesh', u'sink_n_mesh_ex', u'sink_is_eng', u'sink_is_journal',
       u'sink_is_review', u'sink_is_case_rep', u'sink_is_let_ed_com',
       u'sink_T_novelty', u'sink_V_novelty', u'sink_PT_novelty',
       u'sink_PV_novelty', u'sink_n_authors', u'year_span', u'journal_same',
       u'mesh_sim', u'title_sim', u'lang_sim', u'affiliation_sim',
       u'pubtype_sim', u'cite_sim', u'author_sim', u'gender_sim', u'eth_sim',
       u'n_common_authors', u'auid', u'gender', u'eth1', u'eth2', u'pos',
       u'pos_nice', u'sink_last_ncites', u'sink_prev_ncites',
       u'auth_last_npapers', u'auth_prev_papers', u'jj_sim', u'is_self_cite'],
      dtype='object')

Load author years data¶

In [5]:

%%time
df_authors = pd.read_csv("data/AuthorityFirstLastYears.txt", sep="\t").rename(
    columns={"au_id": "auid"})
df_authors.shape

CPU times: user 7.82 s, sys: 1.15 s, total: 8.97 s
Wall time: 8.97 s

In [6]:

df_authors.head()

Out[6]:

	auid	first_year	last_year
0	9731334_2	1997	2009
1	2155715_1	1990	2009
2	7867892_1	1994	2009
3	14609221_2	2003	2009
4	8101337_1	1993	2007

In [7]:

df_authors.dtypes

Out[7]:

auid          object
first_year     int64
last_year      int64
dtype: object

In [8]:

df_authors.describe().astype(int)

Out[8]:

	first_year	last_year
count	9300182	9300182
mean	1989	1994
std	16	15
min	1865	0
25%	1980	1986
50%	1994	2000
75%	2003	2007
max	9999	2099

In [9]:

df_authors[df_authors.first_year == 9999].shape, df_authors[df_authors.first_year <= 1900].shape

Out[9]:

((3, 3), (3858, 3))

Load author expertise data¶

In [10]:

%%time
# Read the tab-separated author-expertise table into memory.
df_expertise = pd.read_csv("data/AuthorExpertise.txt", sep="\t")
# Column names and shape of the loaded table (the next cell evaluates the
# same expression).
df_expertise.columns, df_expertise.shape

CPU times: user 59.8 s, sys: 5.68 s, total: 1min 5s
Wall time: 1min 5s

In [11]:

df_expertise.columns, df_expertise.shape

Out[11]:

(Index([u'PMID', u'auid', u'match_len', u'match_prop', u'overall_coverage_len',
        u'overall_coverage_prop'],
       dtype='object'), (58761322, 6))

In [12]:

df_expertise.dtypes

Out[12]:

PMID                       int64
auid                      object
match_len                  int64
match_prop               float64
overall_coverage_len       int64
overall_coverage_prop    float64
dtype: object

First author¶

In [13]:

%%time
print df_first.shape
df_first = df_first.merge(df_authors, how="left", on="auid")
print df_first.shape

(41618369, 56)
(41618369, 58)
CPU times: user 1min 6s, sys: 34.9 s, total: 1min 41s
Wall time: 1min 41s

In [14]:

# au_age = source_year - first_year for the first author.
# 9999 is the maximum first_year in the author table and cannot be a real
# year; left in place it yields ages near -8000 that distort the au_age
# mean/std. Mask it to NaN so those rows carry no age instead.
# NOTE(review): first_year values <= 1900 also exist in the author table and
# are left untouched here — confirm whether they should be masked as well.
df_first["au_age"] = df_first["source_year"] - df_first["first_year"].where(
    df_first["first_year"] != 9999)

In [15]:

df_first.columns

Out[15]:

Index([u'source_id', u'source_year', u'source_j', u'source_n_mesh',
       u'source_n_mesh_ex', u'source_is_eng', u'source_country',
       u'source_is_journal', u'source_is_review', u'source_is_case_rep',
       u'source_is_let_ed_com', u'source_T_novelty', u'source_V_novelty',
       u'source_PT_novelty', u'source_PV_novelty', u'source_ncites',
       u'source_n_authors', u'sink_id', u'sink_year', u'sink_j',
       u'sink_n_mesh', u'sink_n_mesh_ex', u'sink_is_eng', u'sink_is_journal',
       u'sink_is_review', u'sink_is_case_rep', u'sink_is_let_ed_com',
       u'sink_T_novelty', u'sink_V_novelty', u'sink_PT_novelty',
       u'sink_PV_novelty', u'sink_n_authors', u'year_span', u'journal_same',
       u'mesh_sim', u'title_sim', u'lang_sim', u'affiliation_sim',
       u'pubtype_sim', u'cite_sim', u'author_sim', u'gender_sim', u'eth_sim',
       u'n_common_authors', u'auid', u'gender', u'eth1', u'eth2', u'pos',
       u'pos_nice', u'sink_last_ncites', u'sink_prev_ncites',
       u'auth_last_npapers', u'auth_prev_papers', u'jj_sim', u'is_self_cite',
       u'first_year', u'last_year', u'au_age'],
      dtype='object')

In [16]:

%%time
print df_first.shape
df_first = df_first.merge(df_expertise, how="left",
                          left_on=["source_id","auid"],
                          right_on=["PMID","auid"],)
print df_first.shape

(41618369, 59)
(41619240, 64)
CPU times: user 1min 1s, sys: 19.4 s, total: 1min 20s
Wall time: 1min 20s

In [17]:

df_first.columns

Out[17]:

Index([u'source_id', u'source_year', u'source_j', u'source_n_mesh',
       u'source_n_mesh_ex', u'source_is_eng', u'source_country',
       u'source_is_journal', u'source_is_review', u'source_is_case_rep',
       u'source_is_let_ed_com', u'source_T_novelty', u'source_V_novelty',
       u'source_PT_novelty', u'source_PV_novelty', u'source_ncites',
       u'source_n_authors', u'sink_id', u'sink_year', u'sink_j',
       u'sink_n_mesh', u'sink_n_mesh_ex', u'sink_is_eng', u'sink_is_journal',
       u'sink_is_review', u'sink_is_case_rep', u'sink_is_let_ed_com',
       u'sink_T_novelty', u'sink_V_novelty', u'sink_PT_novelty',
       u'sink_PV_novelty', u'sink_n_authors', u'year_span', u'journal_same',
       u'mesh_sim', u'title_sim', u'lang_sim', u'affiliation_sim',
       u'pubtype_sim', u'cite_sim', u'author_sim', u'gender_sim', u'eth_sim',
       u'n_common_authors', u'auid', u'gender', u'eth1', u'eth2', u'pos',
       u'pos_nice', u'sink_last_ncites', u'sink_prev_ncites',
       u'auth_last_npapers', u'auth_prev_papers', u'jj_sim', u'is_self_cite',
       u'first_year', u'last_year', u'au_age', u'PMID', u'match_len',
       u'match_prop', u'overall_coverage_len', u'overall_coverage_prop'],
      dtype='object')

In [18]:

%%time
df_first = df_first.drop("PMID", axis=1)
print df_first.shape

(41619240, 63)
CPU times: user 31.9 s, sys: 33.1 s, total: 1min 5s
Wall time: 1min 5s

Last author¶

In [19]:

%%time
print df_last.shape
df_last = df_last.merge(df_authors, how="left", on="auid")
print df_last.shape

(41618369, 56)
(41618369, 58)
CPU times: user 1min 7s, sys: 36.8 s, total: 1min 44s
Wall time: 1min 44s

In [20]:

# au_age = source_year - first_year for the last author.
# 9999 is the maximum first_year in the author table and cannot be a real
# year; left in place it yields ages near -8000 that distort the au_age
# mean/std. Mask it to NaN so those rows carry no age instead.
# NOTE(review): first_year values <= 1900 also exist in the author table and
# are left untouched here — confirm whether they should be masked as well.
df_last["au_age"] = df_last["source_year"] - df_last["first_year"].where(
    df_last["first_year"] != 9999)

In [21]:

%%time
print df_last.shape
df_last = df_last.merge(df_expertise, how="left",
                          left_on=["source_id","auid"],
                          right_on=["PMID","auid"],)
print df_last.shape
df_last = df_last.drop("PMID", axis=1)
print df_last.shape

(41618369, 59)
(41619267, 64)
(41619267, 63)
CPU times: user 1min 33s, sys: 53.2 s, total: 2min 26s
Wall time: 2min 26s

Modelling considerations¶

In [22]:

# Category levels kept for source_country; any other value becomes "OTHER".
TOP_15_COUNTRIES = ["USA", "UNKNOWN", "UK", "JAPAN", "GERMANY", "FRANCE", "ITALY",
                    "CANADA", "CHINA", "AUSTRALIA", "SPAIN", "NETHERLANDS",
                    "SWEDEN", "INDIA", "OTHER"]
# Category levels kept for eth1/eth2; any other value becomes "OTHER".
TOP_15_ETHNICITIES = ["ENGLISH", "GERMAN", "HISPANIC", "CHINESE",
                      "JAPANESE", "SLAV", "FRENCH", "ITALIAN", "INDIAN",
                      "NORDIC", "ARAB", "DUTCH", "KOREAN", "UNKNOWN", "OTHER"]
# Category levels for gender; any other value becomes "-".
GENDERS = ["-", "F", "M"]

# Ethnicity labels that are all folded into "UNKNOWN".
_UNKNOWN_ETHNICITY_LABELS = ["UNKNOWN", "TOOSHORT", "ERROR"]

# Indicator columns that prepare_data casts to bool.
_BOOLEAN_COLUMNS = [
    u'source_is_eng', u'source_is_journal', u'source_is_review',
    u'source_is_case_rep', u'source_is_let_ed_com',
    u'sink_is_eng', u'sink_is_journal', u'sink_is_review', u'sink_is_case_rep',
    u'sink_is_let_ed_com', u'journal_same', u'affiliation_sim',
]


def _to_fixed_categories(values, categories, fallback):
    """Return `values` as an unordered categorical Series limited to `categories`.

    Entries that are missing or not listed in `categories` are replaced by
    `fallback`, which must itself be one of `categories`. The result keeps the
    index of `values`.

    `pd.Categorical(...)` is used instead of
    `astype("category", categories=...)` because the latter signature was
    removed from pandas; the constructor form works on old and new versions.
    """
    as_cat = pd.Categorical(values, categories=categories, ordered=False)
    return pd.Series(as_cat, index=values.index).fillna(fallback)


def prepare_data(df):
    """Normalise the categorical and indicator columns of `df` in place.

    Steps, in order:
      * add `eth_weight`: 1 where eth2 is "UNKNOWN", 0.5 otherwise;
      * map source_country "-" to "UNKNOWN", then restrict it to
        TOP_15_COUNTRIES (everything else becomes "OTHER");
      * fold eth1/eth2 values "TOOSHORT"/"ERROR" into "UNKNOWN", then restrict
        both to TOP_15_ETHNICITIES (everything else becomes "OTHER");
      * restrict gender to GENDERS (everything else becomes "-");
      * cast the indicator columns in _BOOLEAN_COLUMNS to bool.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain source_country, eth1, eth2, gender and every column in
        _BOOLEAN_COLUMNS. It is modified in place.

    Returns
    -------
    None
    """
    # Partial weight by default, full weight when there is no second ethnicity.
    # NOTE(review): this runs before "TOOSHORT"/"ERROR" are folded into
    # "UNKNOWN" below, so rows whose eth2 is "TOOSHORT"/"ERROR" keep weight 0.5
    # — confirm that is intended.
    df["eth_weight"] = 0.5
    df.loc[df["eth2"] == "UNKNOWN", "eth_weight"] = 1

    # .loc replaces the removed .ix indexer; the masks are boolean, so the
    # selection is identical.
    df.loc[df["source_country"] == "-", "source_country"] = "UNKNOWN"
    df["source_country"] = _to_fixed_categories(
        df["source_country"], TOP_15_COUNTRIES, "OTHER")

    for eth_col in ("eth1", "eth2"):
        df.loc[df[eth_col].isin(_UNKNOWN_ETHNICITY_LABELS), eth_col] = "UNKNOWN"
    for eth_col in ("eth1", "eth2"):
        df[eth_col] = _to_fixed_categories(
            df[eth_col], TOP_15_ETHNICITIES, "OTHER")

    df["gender"] = _to_fixed_categories(df["gender"], GENDERS, "-")

    # Assign column by column: a single-column assignment always replaces the
    # column, so the bool dtype is guaranteed to stick.
    for flag_col in _BOOLEAN_COLUMNS:
        df[flag_col] = df[flag_col].astype("bool")

In [23]:

prepare_data(df_first)
prepare_data(df_last)

Overall data statistics¶

First author¶

In [24]:

%%time
df_t_first = df_first.pivot_table(index="gender", columns="is_self_cite",
               values=["source_year", u'source_n_mesh', u'source_is_eng', 
                       u'source_is_journal', u'source_is_review',
                       u'source_is_case_rep', u'source_is_let_ed_com',
                       u'source_T_novelty', u'source_V_novelty', u'source_PT_novelty',
                       u'source_PV_novelty', u'source_ncites', u'source_n_authors',
                       u'sink_year', u'sink_n_mesh', u'sink_n_mesh_ex', u'sink_is_eng',
                       u'sink_is_journal', u'sink_is_review', u'sink_is_case_rep',
                       u'sink_is_let_ed_com', u'sink_T_novelty', u'sink_V_novelty',
                       u'sink_PT_novelty', u'sink_PV_novelty', u'sink_n_authors',
                       u'year_span', u'journal_same',
                       u'mesh_sim', u'title_sim', u'lang_sim',
                       u'affiliation_sim', u'pubtype_sim', u'cite_sim', u'author_sim',
                       u'gender_sim', u'eth_sim', u'n_common_authors',
                       u'sink_last_ncites', u'sink_prev_ncites',
                       u'auth_last_npapers', u'auth_prev_papers',
                       u'au_age',
                       u'jj_sim', 
                       u'match_len', u'match_prop',
                       u'overall_coverage_len', u'overall_coverage_prop'
                      ],
                        aggfunc=[np.mean, np.std])
df_t_first

CPU times: user 1min 3s, sys: 41.4 s, total: 1min 45s
Wall time: 1min 45s

In [25]:

df_t_first.T.unstack(level=0).to_csv("First_Author_gender_self_cites_means.txt", sep="\t")

In [26]:

with pd.option_context('display.max_rows', 100,
                       'display.max_columns', 20,
                       'display.precision', 3
                      ):
    display(df_t_first.T.unstack(level=0))

	gender	-		F		M
		mean	std	mean	std	mean	std
	is_self_cite
source_year	0	2003.663	1.111	2003.631	1.112	2003.588	1.116
source_year	1	2003.635	1.114	2003.603	1.114	2003.563	1.118
source_n_mesh	0	13.492	5.859	13.340	5.463	12.566	5.427
source_n_mesh	1	12.879	5.999	13.121	5.392	11.993	5.363
source_is_eng	0	0.984	0.124	0.987	0.114	0.979	0.144
source_is_eng	1	0.995	0.072	0.992	0.091	0.986	0.116
source_is_journal	0	0.990	0.099	0.990	0.100	0.984	0.125
source_is_journal	1	0.986	0.119	0.984	0.124	0.975	0.157
source_is_review	0	0.137	0.344	0.192	0.394	0.249	0.432
source_is_review	1	0.140	0.347	0.173	0.378	0.260	0.439
source_is_case_rep	0	0.036	0.186	0.030	0.169	0.040	0.196
source_is_case_rep	1	0.011	0.107	0.012	0.107	0.015	0.122
source_is_let_ed_com	0	0.010	0.100	0.010	0.101	0.017	0.127
source_is_let_ed_com	1	0.015	0.120	0.016	0.125	0.027	0.162
source_T_novelty	0	19.846	12.504	20.965	12.666	22.183	13.109
source_T_novelty	1	20.304	12.522	21.361	12.524	23.031	13.119
source_V_novelty	0	3254.876	10964.931	3607.520	12431.366	3947.314	8873.425
source_V_novelty	1	3298.214	11458.840	3632.428	24886.111	4038.486	9871.289
source_PT_novelty	0	2.287	4.948	2.531	5.164	2.964	5.721
source_PT_novelty	1	2.333	4.960	2.540	5.089	3.177	5.922
source_PV_novelty	0	12.492	678.392	24.137	1390.502	16.884	488.653
source_PV_novelty	1	19.853	1199.187	10.357	564.516	17.537	472.326
source_ncites	0	45.780	66.675	53.320	94.552	55.949	81.095
source_ncites	1	37.349	33.979	42.054	49.816	45.410	59.791
source_n_authors	0	5.130	3.021	5.094	3.145	5.102	3.428
source_n_authors	1	5.176	4.281	5.183	3.538	5.068	3.698
sink_year	0	1995.610	7.494	1995.628	7.401	1995.435	7.726
sink_year	1	2000.468	3.569	2000.044	3.687	1999.378	4.569
sink_n_mesh	0	12.898	5.661	12.999	5.390	12.626	5.386
sink_n_mesh	1	13.219	6.051	13.518	5.356	12.724	5.480
sink_n_mesh_ex	0	58.737	24.327	59.668	23.138	58.163	23.339
sink_n_mesh_ex	1	60.834	26.808	62.792	23.686	59.273	24.495
sink_is_eng	0	0.996	0.065	0.996	0.063	0.994	0.076
sink_is_eng	1	0.995	0.068	0.995	0.073	0.992	0.090
sink_is_journal	0	0.985	0.123	0.983	0.129	0.981	0.136
sink_is_journal	1	0.989	0.105	0.988	0.108	0.982	0.131
sink_is_review	0	0.146	0.353	0.148	0.355	0.144	0.351
sink_is_review	1	0.051	0.220	0.065	0.246	0.085	0.279
sink_is_case_rep	0	0.030	0.171	0.028	0.166	0.037	0.189
sink_is_case_rep	1	0.015	0.120	0.015	0.121	0.020	0.141
sink_is_let_ed_com	0	0.016	0.124	0.017	0.130	0.019	0.137
sink_is_let_ed_com	1	0.011	0.106	0.012	0.107	0.018	0.132
sink_T_novelty	0	16.561	11.402	17.127	11.490	17.856	11.696
sink_T_novelty	1	18.326	11.820	19.033	11.811	19.854	12.131
sink_V_novelty	0	2445.306	9045.168	2521.256	7979.934	2650.913	8870.405
sink_V_novelty	1	2514.894	6996.478	2669.978	12369.446	2852.831	7750.489
sink_PT_novelty	0	1.786	4.393	1.853	4.443	2.023	4.664
sink_PT_novelty	1	1.722	4.228	1.820	4.268	2.125	4.792
sink_PV_novelty	0	14.636	550.987	14.576	766.610	17.488	794.826
sink_PV_novelty	1	10.081	450.898	6.804	199.527	12.890	467.356
sink_n_authors	0	4.770	5.124	4.790	4.669	4.840	5.046
sink_n_authors	1	5.549	5.748	5.635	4.423	5.537	4.668
year_span	0	8.053	7.410	8.003	7.325	8.154	7.650
year_span	1	3.167	3.357	3.559	3.508	4.185	4.428
journal_same	0	0.077	0.267	0.071	0.257	0.073	0.260
journal_same	1	0.218	0.413	0.184	0.387	0.175	0.380
mesh_sim	0	0.147	0.106	0.152	0.103	0.154	0.109
mesh_sim	1	0.215	0.148	0.225	0.142	0.210	0.143
title_sim	0	0.167	0.155	0.162	0.153	0.164	0.157
title_sim	1	0.268	0.200	0.258	0.195	0.240	0.195
lang_sim	0	0.982	0.131	0.984	0.124	0.976	0.154
lang_sim	1	0.996	0.064	0.991	0.094	0.986	0.118
affiliation_sim	0	0.319	0.466	0.299	0.458	0.295	0.456
affiliation_sim	1	0.903	0.296	0.899	0.301	0.866	0.341
pubtype_sim	0	0.529	0.265	0.519	0.263	0.502	0.265
pubtype_sim	1	0.683	0.282	0.660	0.283	0.611	0.290
cite_sim	0	0.031	0.046	0.029	0.043	0.029	0.044
cite_sim	1	0.111	0.132	0.103	0.122	0.095	0.121
author_sim	0	0.018	0.067	0.019	0.067	0.015	0.060
author_sim	1	0.425	0.254	0.410	0.248	0.365	0.241
gender_sim	0	0.667	0.286	0.724	0.263	0.826	0.215
gender_sim	1	0.890	0.142	0.900	0.133	0.915	0.120
eth_sim	0	0.676	0.199	0.690	0.208	0.685	0.207
eth_sim	1	0.944	0.079	0.944	0.078	0.933	0.089
n_common_authors	0	0.140	0.566	0.150	0.578	0.123	0.510
n_common_authors	1	2.782	2.913	2.738	2.023	2.441	1.965
sink_last_ncites	0	25.266	165.534	24.339	165.747	22.232	142.185
sink_last_ncites	1	6.577	14.424	6.685	13.854	7.326	15.854
sink_prev_ncites	0	354.507	6089.993	348.425	6061.923	264.824	4931.631
sink_prev_ncites	1	13.655	62.932	15.559	53.677	20.557	69.766
auth_last_npapers	0	3.199	3.842	2.853	2.792	4.398	4.851
auth_last_npapers	1	5.112	6.052	4.311	4.535	6.748	6.964
auth_prev_papers	0	11.535	29.212	12.628	25.234	29.963	60.302
auth_prev_papers	1	35.219	73.219	31.048	45.844	68.929	101.054
au_age	0	5.899	7.413	7.503	24.831	10.707	10.580
au_age	1	10.791	9.839	12.695	9.955	17.030	11.939
jj_sim	0	26.353	88.874	27.633	99.535	29.691	93.924
jj_sim	1	65.845	232.484	63.721	246.925	63.500	190.484
match_len	0	33.007	25.003	35.544	23.991	39.156	22.873
match_len	1	49.413	21.554	50.321	20.394	49.798	20.106
match_prop	0	0.516	0.339	0.569	0.326	0.662	0.307
match_prop	1	0.795	0.204	0.814	0.187	0.865	0.164
overall_coverage_len	0	57.854	23.877	58.374	22.610	55.554	22.272
overall_coverage_len	1	58.676	23.367	59.194	22.102	55.257	21.679
overall_coverage_prop	0	0.893	0.149	0.919	0.117	0.920	0.121
overall_coverage_prop	1	0.933	0.103	0.946	0.081	0.950	0.084

Last author¶

In [27]:

# Mean and standard deviation of each listed feature, by gender (rows) and
# is_self_cite (columns), for the last-author frame.
# NOTE(review): source_n_mesh_ex is absent from this list although
# sink_n_mesh_ex is present — confirm that is intentional.
last_summary_columns = [
    "source_year", u'source_n_mesh', u'source_is_eng',
    u'source_is_journal', u'source_is_review',
    u'source_is_case_rep', u'source_is_let_ed_com',
    u'source_T_novelty', u'source_V_novelty', u'source_PT_novelty',
    u'source_PV_novelty', u'source_ncites', u'source_n_authors',
    u'sink_year', u'sink_n_mesh', u'sink_n_mesh_ex', u'sink_is_eng',
    u'sink_is_journal', u'sink_is_review', u'sink_is_case_rep',
    u'sink_is_let_ed_com', u'sink_T_novelty', u'sink_V_novelty',
    u'sink_PT_novelty', u'sink_PV_novelty', u'sink_n_authors',
    u'year_span', u'journal_same',
    u'mesh_sim', u'title_sim', u'lang_sim',
    u'affiliation_sim', u'pubtype_sim', u'cite_sim', u'author_sim',
    u'gender_sim', u'eth_sim', u'n_common_authors',
    u'sink_last_ncites', u'sink_prev_ncites',
    u'auth_last_npapers', u'auth_prev_papers',
    u'au_age',
    u'jj_sim',
    u'match_len', u'match_prop',
    u'overall_coverage_len', u'overall_coverage_prop',
]
df_t_last = pd.pivot_table(
    df_last,
    values=last_summary_columns,
    index="gender",
    columns="is_self_cite",
    aggfunc=[np.mean, np.std],
)
df_t_last

Out[27]:

	mean										...	std
	source_year		source_n_mesh		source_is_eng		source_is_journal		source_is_review		...	jj_sim		match_len		match_prop		overall_coverage_len		overall_coverage_prop
is_self_cite	0	1	0	1	0	1	0	1	0	1	...	0	1	0	1	0	1	0	1	0	1
gender
-	2003.668525	2003.669454	12.921061	13.370146	0.978522	0.995681	0.986633	0.989529	0.171399	0.120635	...	107.418105	196.201352	26.964844	22.586342	0.321955	0.136271	23.582839	23.404009	0.171760	0.097271
F	2003.634246	2003.616255	12.995463	13.705127	0.984779	0.993584	0.986530	0.988157	0.231312	0.149486	...	112.664784	169.739193	24.640737	21.337889	0.275971	0.119176	22.400058	22.330419	0.132850	0.079537
M	2003.597499	2003.589226	12.822445	13.196785	0.980800	0.992040	0.985936	0.986839	0.227771	0.158791	...	91.100172	151.407318	22.974276	21.753687	0.209302	0.104821	22.484419	22.627888	0.115065	0.075086

3 rows × 192 columns

In [28]:

df_t_last.T.unstack(level=0).to_csv("Last_Author_gender_self_cites_means.txt", sep="\t")

In [29]:

with pd.option_context('display.max_rows', 100,
                       'display.max_columns', 20,
                       'display.precision', 3
                      ):
    display(df_t_last.T.unstack(level=0))

	gender	-		F		M
		mean	std	mean	std	mean	std
	is_self_cite
source_year	0	2003.669	1.110	2003.634	1.112	2003.597	1.115
source_year	1	2003.669	1.112	2003.616	1.114	2003.589	1.116
source_n_mesh	0	12.921	5.690	12.995	5.410	12.822	5.487
source_n_mesh	1	13.370	6.102	13.705	5.576	13.197	5.682
source_is_eng	0	0.979	0.145	0.985	0.122	0.981	0.137
source_is_eng	1	0.996	0.066	0.994	0.080	0.992	0.089
source_is_journal	0	0.987	0.115	0.987	0.115	0.986	0.118
source_is_journal	1	0.990	0.102	0.988	0.108	0.987	0.114
source_is_review	0	0.171	0.377	0.231	0.422	0.228	0.419
source_is_review	1	0.121	0.326	0.149	0.357	0.159	0.365
source_is_case_rep	0	0.046	0.210	0.032	0.176	0.038	0.190
source_is_case_rep	1	0.013	0.111	0.011	0.105	0.012	0.110
source_is_let_ed_com	0	0.013	0.115	0.014	0.115	0.015	0.120
source_is_let_ed_com	1	0.011	0.104	0.012	0.110	0.014	0.118
source_T_novelty	0	21.133	12.993	21.882	12.990	21.668	12.977
source_T_novelty	1	19.388	12.091	19.907	12.071	20.400	12.267
source_V_novelty	0	3534.550	11270.630	3773.758	8360.877	3841.652	11202.508
source_V_novelty	1	3097.273	13115.467	3109.591	6873.506	3328.334	9211.716
source_PT_novelty	0	2.593	5.319	2.810	5.526	2.807	5.545
source_PT_novelty	1	2.096	4.570	2.236	4.731	2.392	4.958
source_PV_novelty	0	10.314	198.981	14.104	392.583	21.402	1042.992
source_PV_novelty	1	9.777	329.167	9.857	358.987	11.951	446.572
source_ncites	0	46.306	59.037	56.770	95.505	54.664	85.583
source_ncites	1	39.348	36.907	44.151	40.060	43.368	44.693
source_n_authors	0	5.176	3.610	4.827	3.125	5.150	3.301
source_n_authors	1	5.300	5.136	5.035	3.475	5.218	3.186
sink_year	0	1995.291	7.777	1995.531	7.548	1995.428	7.696
sink_year	1	1999.435	4.195	1998.849	4.363	1998.634	4.777
sink_n_mesh	0	12.621	5.579	12.843	5.349	12.720	5.415
sink_n_mesh	1	13.382	5.947	13.759	5.327	13.264	5.545
sink_n_mesh_ex	0	57.827	24.093	59.082	22.996	58.500	23.383
sink_n_mesh_ex	1	61.242	26.191	63.276	23.312	60.861	24.394
sink_is_eng	0	0.995	0.073	0.996	0.065	0.995	0.073
sink_is_eng	1	0.997	0.054	0.996	0.059	0.995	0.067
sink_is_journal	0	0.982	0.132	0.981	0.135	0.982	0.134
sink_is_journal	1	0.990	0.101	0.990	0.097	0.988	0.107
sink_is_review	0	0.148	0.355	0.151	0.358	0.147	0.354
sink_is_review	1	0.062	0.242	0.076	0.265	0.085	0.279
sink_is_case_rep	0	0.038	0.191	0.031	0.174	0.035	0.184
sink_is_case_rep	1	0.013	0.114	0.012	0.111	0.014	0.116
sink_is_let_ed_com	0	0.018	0.133	0.019	0.136	0.019	0.135
sink_is_let_ed_com	1	0.010	0.101	0.010	0.098	0.012	0.108
sink_T_novelty	0	17.273	11.708	17.633	11.678	17.555	11.637
sink_T_novelty	1	17.166	11.370	17.394	11.234	17.810	11.474
sink_V_novelty	0	2529.759	9435.899	2578.390	7800.220	2615.206	8625.002
sink_V_novelty	1	2404.246	5600.294	2391.179	12136.882	2542.507	9245.305
sink_PT_novelty	0	1.927	4.580	1.961	4.590	1.971	4.600
sink_PT_novelty	1	1.558	3.983	1.573	3.966	1.734	4.280
sink_PV_novelty	0	16.361	642.131	14.928	557.485	16.947	828.530
sink_PV_novelty	1	11.647	503.154	6.871	287.387	10.512	390.248
sink_n_authors	0	4.731	5.115	4.708	4.688	4.821	5.011
sink_n_authors	1	5.520	5.983	5.386	4.548	5.491	4.461
year_span	0	8.377	7.692	8.103	7.475	8.169	7.621
year_span	1	4.234	4.030	4.768	4.227	4.955	4.648
journal_same	0	0.069	0.254	0.068	0.251	0.071	0.256
journal_same	1	0.183	0.386	0.161	0.367	0.164	0.370
mesh_sim	0	0.151	0.110	0.152	0.104	0.152	0.107
mesh_sim	1	0.196	0.136	0.203	0.127	0.194	0.128
title_sim	0	0.168	0.158	0.160	0.154	0.162	0.155
title_sim	1	0.243	0.189	0.232	0.181	0.227	0.181
lang_sim	0	0.976	0.153	0.982	0.133	0.978	0.147
lang_sim	1	0.996	0.063	0.993	0.082	0.991	0.092
affiliation_sim	0	0.242	0.428	0.303	0.459	0.278	0.448
affiliation_sim	1	0.870	0.337	0.867	0.339	0.841	0.366
pubtype_sim	0	0.521	0.268	0.503	0.260	0.502	0.262
pubtype_sim	1	0.681	0.278	0.659	0.275	0.644	0.281
cite_sim	0	0.030	0.047	0.027	0.042	0.029	0.043
cite_sim	1	0.089	0.116	0.080	0.100	0.079	0.100
author_sim	0	0.009	0.047	0.010	0.050	0.008	0.044
author_sim	1	0.322	0.228	0.315	0.222	0.301	0.216
gender_sim	0	0.616	0.307	0.671	0.290	0.815	0.215
gender_sim	1	0.860	0.168	0.864	0.162	0.896	0.133
eth_sim	0	0.640	0.199	0.686	0.208	0.679	0.204
eth_sim	1	0.931	0.093	0.921	0.094	0.917	0.097
n_common_authors	0	0.068	0.525	0.079	0.391	0.070	0.366
n_common_authors	1	2.290	3.689	2.138	1.953	2.126	1.490
sink_last_ncites	0	25.413	169.723	23.482	160.319	23.656	153.000
sink_last_ncites	1	7.670	16.783	7.933	14.654	9.091	19.574
sink_prev_ncites	0	385.753	6578.110	332.230	5930.452	298.336	5330.678
sink_prev_ncites	1	25.078	98.974	28.786	72.432	33.947	104.446
auth_last_npapers	0	6.311	7.257	4.917	4.723	8.060	8.278
auth_last_npapers	1	8.625	8.087	6.275	5.232	9.771	9.401
auth_prev_papers	0	49.145	80.955	43.300	55.122	89.871	108.144
auth_prev_papers	1	84.821	106.007	67.926	66.621	121.084	127.151
au_age	0	13.199	54.156	15.811	10.592	20.591	11.295
au_age	1	18.764	24.810	20.439	9.624	23.681	10.557
jj_sim	0	27.960	107.418	29.395	112.665	28.511	91.100
jj_sim	1	53.157	196.201	48.006	169.739	47.886	151.407
match_len	0	43.598	26.965	46.601	24.641	51.027	22.974
match_len	1	56.995	22.586	57.283	21.338	57.233	21.754
match_prop	0	0.686	0.322	0.748	0.276	0.830	0.209
match_prop	1	0.886	0.136	0.893	0.119	0.916	0.105
overall_coverage_len	0	54.771	23.583	56.158	22.400	56.709	22.484
overall_coverage_len	1	60.226	23.404	60.749	22.330	59.634	22.628
overall_coverage_prop	0	0.872	0.172	0.905	0.133	0.922	0.115
overall_coverage_prop	1	0.933	0.097	0.944	0.080	0.952	0.075

Preliminary Statistics¶

In [30]:

df_first.shape

Out[30]:

(41619240, 64)

In [31]:

df_first.is_self_cite.mean()

Out[31]:

0.048513836389131565

In [32]:

df_last.shape

Out[32]:

(41619267, 64)

In [33]:

df_last.is_self_cite.mean()

Out[33]:

0.08648619400240759

In [34]:

df_last.is_self_cite.value_counts()

Out[34]:

0    38019775
1     3599492
Name: is_self_cite, dtype: int64

In [35]:

df_first.is_self_cite.value_counts()

Out[35]:

0    39600131
1     2019109
Name: is_self_cite, dtype: int64

In [36]:

df_first[["gender", "is_self_cite"]].groupby("gender").is_self_cite.mean()* 100

Out[36]:

gender
-    3.705023
F    3.779226
M    5.694023
Name: is_self_cite, dtype: float64

In [37]:

df_last[["gender", "is_self_cite"]].groupby("gender").is_self_cite.mean()* 100

Out[37]:

gender
-    6.557432
F    7.179615
M    9.276608
Name: is_self_cite, dtype: float64

Tables of gender self citation for age and prior papers¶

In [38]:

def filtered_data(df, min_ncites=10, max_ncites=60, excluded_gender="-"):
    """Return the rows of `df` kept by the base gender/citation-count filter.

    A row is kept when its gender differs from `excluded_gender` and its
    source_ncites lies in the closed interval [min_ncites, max_ncites].
    The shapes of the filtered and original frames, and the share of rows
    kept, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "gender" and "source_ncites".
    min_ncites, max_ncites : number, optional
        Inclusive bounds on source_ncites. The defaults (10 and 60) reproduce
        the original hard-coded filter.
    excluded_gender : str, optional
        Gender value to drop; the default "-" reproduces the original filter.

    Returns
    -------
    pandas.DataFrame
        The selected rows of `df`; `df` itself is not modified.
    """
    # Base version 1
    keep = (
        (df["gender"] != excluded_gender)
        & (df["source_ncites"] >= min_ncites)
        & (df["source_ncites"] <= max_ncites)
    )
    df_filtered = df[keep]

    # Single pre-formatted arguments keep the printed text identical on
    # Python 2 and Python 3.
    print("%s %s" % (df_filtered.shape, df.shape))
    n_total = df.shape[0]
    # An empty input has no meaningful share; report nan instead of raising
    # ZeroDivisionError.
    if n_total:
        percent_kept = df_filtered.shape[0] * 100. / n_total
    else:
        percent_kept = float("nan")
    print("Filter dataset is %.2f%% of the original data." % percent_kept)
    return df_filtered

def aggregate_function(x, median_col="auth_prev_papers", span=1):
    """Self-citation rate per gender for rows near the median of a column.

    The median of ``x[median_col]`` is taken over all rows of ``x`` (both
    genders together). Only rows whose ``median_col`` value lies within
    ``span`` of that median (inclusive on both sides) are kept; they are then
    grouped by ``gender`` to report the mean and the row count of
    ``is_self_cite``.

    Returns a DataFrame indexed by gender with columns ``mean``, ``len`` and
    ``median`` (the same median value repeated on every row).
    """
    center = x[median_col].median()
    near_median = x[median_col].between(center - span, center + span)
    summary = x[near_median].groupby("gender")["is_self_cite"].agg([np.mean, len])
    summary["median"] = center
    return summary

First author¶

In [39]:

%%time
df_t_first = filtered_data(df_first)
display(df_t_first.pivot_table(index=pd.cut(df_t_first.au_age,
                                    bins=[4,6,8,12,18,22,27,33]),
                      columns="gender", values="is_self_cite", aggfunc=[np.mean, len]))

(26148675, 64) (41619240, 64)
Filter dataset is 62.83% of the original data.

	mean			len
gender	-	F	M	-	F	M
au_age
(4, 6]	NaN	0.039301	0.045931	NaN	1075897.0	1850463.0
(6, 8]	NaN	0.044391	0.053354	NaN	797698.0	1502238.0
(8, 12]	NaN	0.053800	0.063481	NaN	1037551.0	2141894.0
(12, 18]	NaN	0.068604	0.080691	NaN	881774.0	2023373.0
(18, 22]	NaN	0.080206	0.096689	NaN	361556.0	927872.0
(22, 27]	NaN	0.088466	0.109285	NaN	292644.0	863976.0
(27, 33]	NaN	0.096430	0.125897	NaN	189672.0	682763.0

CPU times: user 10.9 s, sys: 6.72 s, total: 17.6 s
Wall time: 17.6 s

In [40]:

df_t_first[["gender", "is_self_cite"]].groupby("gender").is_self_cite.mean()* 100

Out[40]:

gender
-         NaN
F    3.962280
M    5.787288
Name: is_self_cite, dtype: float64

In [41]:

df_t_first.groupby(pd.cut(df_t_first.au_age,
                                    bins=[4,6,8,12,18,22,27,33])).auth_prev_papers.describe().to_frame().unstack()

Out[41]:

	auth_prev_papers
	count	mean	std	min	25%	50%	75%	max
au_age
(4, 6]	2926360.0	8.735600	7.918158	1.0	4.0	7.0	11.0	149.0
(6, 8]	2299936.0	13.070437	12.257853	1.0	6.0	10.0	16.0	169.0
(8, 12]	3179445.0	19.852068	18.575819	1.0	8.0	15.0	25.0	462.0
(12, 18]	2905147.0	33.785176	32.444887	1.0	13.0	24.0	43.0	640.0
(18, 22]	1289428.0	50.750995	48.758790	1.0	19.0	37.0	66.0	672.0
(22, 27]	1156620.0	68.638081	70.166317	1.0	26.0	49.0	88.0	1167.0
(27, 33]	872435.0	91.798813	93.371988	1.0	32.0	64.0	118.0	996.0

In [42]:

df_t_first.groupby(pd.cut(df_t_first.au_age,
                                    bins=[0,1,2,3,4,6,8,12,18,22,27,33])).apply(aggregate_function).unstack()

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/pandas/indexes/category.py:118: RuntimeWarning: Values and categories have different dtypes. Did you mean to use
'Categorical.from_codes(codes, categories)'?
  data = Categorical(data, categories=categories, ordered=ordered)
/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/pandas/indexes/category.py:118: RuntimeWarning: None of the categories were found in values. Did you mean to use
'Categorical.from_codes(codes, categories)'?
  data = Categorical(data, categories=categories, ordered=ordered)

Out[42]:

	mean			len			median
gender	-	F	M	-	F	M	-	F	M
au_age
(0, 1]	NaN	0.017071	0.018470	0	672893	883693	1.0	1.0	1.0
(1, 2]	NaN	0.022868	0.024924	0	644699	838927	2.0	2.0	2.0
(2, 3]	NaN	0.029854	0.031049	0	408996	553894	3.0	3.0	3.0
(3, 4]	NaN	0.039952	0.040922	0	216412	328285	5.0	5.0	5.0
(4, 6]	NaN	0.044158	0.044623	0	242514	397329	7.0	7.0	7.0
(6, 8]	NaN	0.047341	0.050060	0	129466	222635	10.0	10.0	10.0
(8, 12]	NaN	0.058824	0.059643	0	104073	206110	15.0	15.0	15.0
(12, 18]	NaN	0.078402	0.074128	0	51874	115112	24.0	24.0	24.0
(18, 22]	NaN	0.091831	0.094580	0	13808	33654	37.0	37.0	37.0
(22, 27]	NaN	0.102627	0.099051	0	7347	22645	49.0	49.0	49.0
(27, 33]	NaN	0.114641	0.118494	0	3777	13891	64.0	64.0	64.0

In [43]:

df_t_first.groupby(pd.cut(df_t_first.au_age,
                                    bins=range(25))).apply(aggregate_function).unstack()

Out[43]:

	mean			len			median
gender	-	F	M	-	F	M	-	F	M
au_age
(0, 1]	NaN	0.017071	0.018470	0	672893	883693	1.0	1.0	1.0
(1, 2]	NaN	0.022868	0.024924	0	644699	838927	2.0	2.0	2.0
(2, 3]	NaN	0.029854	0.031049	0	408996	553894	3.0	3.0	3.0
(3, 4]	NaN	0.039952	0.040922	0	216412	328285	5.0	5.0	5.0
(4, 5]	NaN	0.041940	0.041381	0	158153	244486	6.0	6.0	6.0
(5, 6]	NaN	0.043040	0.043589	0	113965	182980	7.0	7.0	7.0
(6, 7]	NaN	0.047336	0.049195	0	75376	131395	9.0	9.0	9.0
(7, 8]	NaN	0.045909	0.052484	0	54804	98240	11.0	11.0	11.0
(8, 9]	NaN	0.052262	0.054095	0	42383	81264	12.0	12.0	12.0
(9, 10]	NaN	0.056391	0.055294	0	28267	58216	14.0	14.0	14.0
(10, 11]	NaN	0.058894	0.058481	0	23500	44442	16.0	16.0	16.0
(11, 12]	NaN	0.068008	0.064212	0	19880	34822	18.0	18.0	18.0
(12, 13]	NaN	0.068207	0.069794	0	15541	30318	20.0	20.0	20.0
(13, 14]	NaN	0.080898	0.071207	0	10025	24478	22.0	22.0	22.0
(14, 15]	NaN	0.081695	0.076336	0	8801	22008	25.0	25.0	25.0
(15, 16]	NaN	0.078823	0.078074	0	7853	17112	26.0	26.0	26.0
(16, 17]	NaN	0.082689	0.083403	0	6470	14436	28.0	28.0	28.0
(17, 18]	NaN	0.088354	0.084625	0	4199	12585	30.0	30.0	30.0
(18, 19]	NaN	0.099937	0.078418	0	4763	10648	33.0	33.0	33.0
(19, 20]	NaN	0.091979	0.096602	0	3740	8799	36.0	36.0	36.0
(20, 21]	NaN	0.101410	0.096034	0	3688	8195	38.0	38.0	38.0
(21, 22]	NaN	0.123810	0.093370	0	3150	6983	41.0	41.0	41.0
(22, 23]	NaN	0.089327	0.094742	0	2183	5668	44.0	44.0	44.0
(23, 24]	NaN	0.090399	0.101774	0	1604	5807	46.0	46.0	46.0

In [44]:

df_t = df_t_first.groupby(["auth_prev_papers", "gender"]).is_self_cite.agg([np.mean, len]).unstack()
df_t.head()

Out[44]:

	mean		len
gender	F	M	F	M
auth_prev_papers
0	0.002630	0.002950	1508335.0	1698186.0
1	0.013839	0.014609	1102301.0	1358766.0
2	0.022141	0.022906	860697.0	1116372.0
3	0.028143	0.029529	681661.0	936528.0
4	0.032647	0.033089	551164.0	793562.0

In [45]:

df_t[df_t.index > 200].head()

Out[45]:

	mean		len
gender	F	M	F	M
auth_prev_papers
201	0.132075	0.172092	159.0	3533.0
202	0.267647	0.226891	340.0	3927.0
203	0.168317	0.163998	101.0	4122.0
204	0.142045	0.147041	176.0	3278.0
205	0.151181	0.187476	635.0	2587.0

In [46]:

display(df_t_first.pivot_table(index=pd.cut(df_t_first.match_prop,
                                    bins=np.arange(0,1.1, 0.1)),
                      columns="gender", values="is_self_cite", aggfunc=[np.mean, len]))

	mean			len
gender	-	F	M	-	F	M
match_prop
(0, 0.1]	NaN	0.004069	0.005368	NaN	53329.0	68362.0
(0.1, 0.2]	NaN	0.004880	0.006203	NaN	170889.0	240850.0
(0.2, 0.3]	NaN	0.006989	0.008101	NaN	316214.0	441031.0
(0.3, 0.4]	NaN	0.010732	0.012003	NaN	487158.0	700750.0
(0.4, 0.5]	NaN	0.016126	0.017434	NaN	678922.0	1028072.0
(0.5, 0.6]	NaN	0.022847	0.024576	NaN	856422.0	1322024.0
(0.6, 0.7]	NaN	0.032174	0.034580	NaN	1085855.0	1796901.0
(0.7, 0.8]	NaN	0.045395	0.049450	NaN	1306052.0	2342643.0
(0.8, 0.9]	NaN	0.063754	0.072298	NaN	1342261.0	2872999.0
(0.9, 1]	NaN	0.102035	0.123153	NaN	1388735.0	4082615.0

In [47]:

df_t_first[["source_id", "gender", "match_prop"]].groupby("source_id").first().groupby("gender").mean()

Out[47]:

	match_prop
gender
F	0.560099
M	0.651669

Last author¶

In [48]:

%%time
df_t_last = filtered_data(df_last)
display(df_t_last.pivot_table(index=pd.cut(df_t_last.au_age,
                                    bins=[4,6,8,12,18,22,27,33]),
                      columns="gender", values="is_self_cite", aggfunc=[np.mean, len]))

(27526723, 64) (41619267, 64)
Filter dataset is 66.14% of the original data.

	mean			len
gender	-	F	M	-	F	M
au_age
(4, 6]	NaN	0.043577	0.054559	NaN	245198.0	670691.0
(6, 8]	NaN	0.054254	0.068265	NaN	288087.0	913149.0
(8, 12]	NaN	0.068714	0.080418	NaN	713097.0	2403210.0
(12, 18]	NaN	0.084920	0.095047	NaN	1148708.0	4332862.0
(18, 22]	NaN	0.097655	0.105987	NaN	666520.0	2911318.0
(22, 27]	NaN	0.109483	0.113665	NaN	646299.0	3328608.0
(27, 33]	NaN	0.119224	0.123024	NaN	474820.0	3166102.0

CPU times: user 16.2 s, sys: 12.9 s, total: 29.1 s
Wall time: 29.1 s

In [49]:

df_t_last[["gender", "is_self_cite"]].groupby("gender").is_self_cite.mean()* 100

Out[49]:

gender
-         NaN
F    7.834364
M    9.931546
Name: is_self_cite, dtype: float64

In [50]:

df_t_last.groupby(pd.cut(df_t_last.au_age,
                                    bins=[4,6,8,12,18,22,27,33])).auth_prev_papers.describe().to_frame().unstack()

Out[50]:

	auth_prev_papers
	count	mean	std	min	25%	50%	75%	max
au_age
(4, 6]	915889.0	12.139577	10.938851	1.0	5.0	9.0	16.0	103.0
(6, 8]	1201236.0	19.737560	19.456762	1.0	8.0	15.0	25.0	371.0
(8, 12]	3116307.0	31.511303	30.970875	1.0	14.0	23.0	39.0	545.0
(12, 18]	5481570.0	52.363269	45.608768	1.0	24.0	40.0	66.0	640.0
(18, 22]	3577838.0	78.718758	64.068521	1.0	36.0	61.0	102.0	672.0
(22, 27]	3974907.0	106.866907	95.144210	1.0	48.0	82.0	134.0	1167.0
(27, 33]	3640922.0	136.860927	112.021429	1.0	63.0	107.0	176.0	1359.0

In [51]:

df_t_last.groupby(pd.cut(df_t_last.au_age,
                                    bins=[0,1,2,3,4,6,8,12,18,22,27,33])).apply(aggregate_function).unstack()

Out[51]:

	mean			len			median
gender	-	F	M	-	F	M	-	F	M
au_age
(0, 1]	NaN	0.012663	0.014134	0	92394	154800	1.0	1.0	1.0
(1, 2]	NaN	0.018248	0.019393	0	79132	142989	2.0	2.0	2.0
(2, 3]	NaN	0.031040	0.033840	0	38950	78191	4.0	4.0	4.0
(3, 4]	NaN	0.044148	0.046076	0	24463	58990	6.0	6.0	6.0
(4, 6]	NaN	0.049488	0.054344	0	41343	102200	9.0	9.0	9.0
(6, 8]	NaN	0.065949	0.070936	0	30827	88883	15.0	15.0	15.0
(8, 12]	NaN	0.074164	0.081661	0	46613	158166	23.0	23.0	23.0
(12, 18]	NaN	0.093752	0.096448	0	44479	170848	40.0	40.0	40.0
(18, 22]	NaN	0.103228	0.106392	0	18338	75316	61.0	61.0	61.0
(22, 27]	NaN	0.118308	0.113358	0	11301	60737	82.0	82.0	82.0
(27, 33]	NaN	0.159484	0.137148	0	5499	48065	107.0	107.0	107.0

In [52]:

df_t_last.groupby(pd.cut(df_t_last.au_age,
                                    bins=range(25))).apply(aggregate_function).unstack()

Out[52]:

	mean			len			median
gender	-	F	M	-	F	M	-	F	M
au_age
(0, 1]	NaN	0.012663	0.014134	0	92394	154800	1.0	1.0	1.0
(1, 2]	NaN	0.018248	0.019393	0	79132	142989	2.0	2.0	2.0
(2, 3]	NaN	0.031040	0.033840	0	38950	78191	4.0	4.0	4.0
(3, 4]	NaN	0.044148	0.046076	0	24463	58990	6.0	6.0	6.0
(4, 5]	NaN	0.048195	0.050784	0	22637	53383	8.0	8.0	8.0
(5, 6]	NaN	0.050926	0.059258	0	19224	47926	10.0	10.0	10.0
(6, 7]	NaN	0.059010	0.063816	0	16099	45866	13.0	13.0	13.0
(7, 8]	NaN	0.065459	0.072830	0	14956	44844	16.0	16.0	16.0
(8, 9]	NaN	0.075830	0.079277	0	14783	48173	19.0	19.0	19.0
(9, 10]	NaN	0.065065	0.083231	0	12449	39168	22.0	22.0	22.0
(10, 11]	NaN	0.083855	0.080426	0	11663	39129	25.0	25.0	25.0
(11, 12]	NaN	0.074760	0.093647	0	9992	36584	28.0	28.0	28.0
(12, 13]	NaN	0.086575	0.091422	0	8663	34948	31.0	31.0	31.0
(13, 14]	NaN	0.086962	0.096403	0	7992	32385	34.0	34.0	34.0
(14, 15]	NaN	0.099392	0.090820	0	9045	30654	38.0	38.0	38.0
(15, 16]	NaN	0.097702	0.093341	0	7093	27705	42.0	42.0	42.0
(16, 17]	NaN	0.102696	0.104670	0	6602	24391	46.0	46.0	46.0
(17, 18]	NaN	0.099453	0.100758	0	5480	24266	51.0	51.0	51.0
(18, 19]	NaN	0.113051	0.105487	0	4352	22818	54.0	54.0	54.0
(19, 20]	NaN	0.113076	0.104730	0	4864	18753	59.0	59.0	59.0
(20, 21]	NaN	0.102139	0.112132	0	3319	17301	64.0	64.0	64.0
(21, 22]	NaN	0.102107	0.103948	0	3369	16643	68.0	68.0	68.0
(22, 23]	NaN	0.127448	0.128132	0	3217	15086	72.0	72.0	72.0
(23, 24]	NaN	0.130902	0.117569	0	2605	12010	78.0	78.0	78.0

In [53]:

df_t = df_t_last.groupby(["auth_prev_papers", "gender"]).is_self_cite.agg([np.mean, len]).unstack()
df_t.head()

Out[53]:

	mean		len
gender	F	M	F	M
auth_prev_papers
0	0.001768	0.002221	354618.0	520850.0
1	0.009851	0.010922	174400.0	293266.0
2	0.017412	0.018043	124628.0	231498.0
3	0.021854	0.022842	104605.0	211282.0
4	0.027933	0.028851	96515.0	203803.0

In [54]:

df_t[df_t.index > 200].head()

Out[54]:

	mean		len
gender	F	M	F	M
auth_prev_papers
201	0.169872	0.139676	1248.0	24972.0
202	0.208205	0.133783	975.0	23740.0
203	0.120042	0.139193	958.0	21560.0
204	0.169231	0.142836	1885.0	19883.0
205	0.166667	0.141209	1686.0	23582.0

In [55]:

display(df_t_last.pivot_table(index=pd.cut(df_t_last.match_prop,
                                    bins=np.arange(0,1.1, 0.1)),
                      columns="gender", values="is_self_cite", aggfunc=[np.mean, len]))

	mean			len
gender	-	F	M	-	F	M
match_prop
(0, 0.1]	NaN	0.004332	0.007531	NaN	9925.0	17794.0
(0.1, 0.2]	NaN	0.005513	0.008563	NaN	28115.0	60377.0
(0.2, 0.3]	NaN	0.005923	0.009147	NaN	59427.0	130209.0
(0.3, 0.4]	NaN	0.008838	0.012541	NaN	97421.0	229974.0
(0.4, 0.5]	NaN	0.014203	0.018079	NaN	159619.0	400792.0
(0.5, 0.6]	NaN	0.023015	0.027089	NaN	242711.0	679087.0
(0.6, 0.7]	NaN	0.037999	0.039974	NaN	409853.0	1277263.0
(0.7, 0.8]	NaN	0.057539	0.059966	NaN	731955.0	2607622.0
(0.8, 0.9]	NaN	0.083113	0.087707	NaN	1201356.0	5074137.0
(0.9, 1]	NaN	0.125788	0.136504	NaN	1930062.0	10960109.0

In [56]:

df_t_last[["source_id", "gender", "match_prop"]].groupby("source_id").first().groupby("gender").mean()

Out[56]:

	match_prop
gender
F	0.732513
M	0.822449

Plot author age and self-citation¶

In [57]:

def get_lower_quantile(x):
    """Return the 0.05 quantile of the Series ``x``.

    Kept as a named function so that ``agg`` labels the resulting column
    ``get_lower_quantile``.
    """
    lower = x.quantile(q=0.05)
    return lower
    
def get_upper_quantile(x):
    """Return the 0.95 quantile of the Series ``x``.

    Kept as a named function so that ``agg`` labels the resulting column
    ``get_upper_quantile``.
    """
    upper = x.quantile(q=0.95)
    return upper


def mean_confidence_interval(data, confidence=0.95):
    """Mean of ``data`` with a two-sided Student-t confidence interval.

    Parameters
    ----------
    data : pandas.Series, numpy array or sequence of numbers
        Sample values.
    confidence : float, optional
        Coverage of the interval (0.95 by default).

    Returns
    -------
    pandas.Series
        Entries ``mean``, ``ci_l`` and ``ci_u``: the sample mean and the
        lower/upper interval bounds ``mean -/+ sem * t((1+confidence)/2, n-1)``.
    """
    from scipy import stats
    # Coerce to a float Series so plain lists are accepted as well.
    a = pd.Series(data, dtype=float)
    n = len(a)
    m, se = a.mean(), stats.sem(a.values)
    # Public ``ppf`` instead of the private ``_ppf``, which skips argument
    # validation and is not part of scipy's documented API.
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return pd.Series([m, m - h, m + h], index=["mean", "ci_l", "ci_u"])

In [58]:

mean_confidence_interval(np.random.randn(100))

Out[58]:

mean   -0.141473
ci_l   -0.335985
ci_u    0.053040
dtype: float64

In [59]:

df_t_first.head(1000).groupby(["auth_prev_papers", "gender"]
                             ).match_prop.agg([
        pd.Series.median, pd.Series.count,
        get_lower_quantile, get_upper_quantile
    ]).unstack()

Out[59]:

	median		count		get_lower_quantile		get_upper_quantile
gender	F	M	F	M	F	M	F	M
auth_prev_papers
0	0.000000	0.000000	58.0	62.0	0.000000	0.000000	0.000000	0.000000
1	0.549296	0.504951	73.0	36.0	0.296000	0.504951	0.549296	0.504951
2	0.592105	0.314286	35.0	30.0	0.592105	0.314286	0.592105	0.314286
3	NaN	0.383721	NaN	109.0	NaN	0.380282	NaN	0.636364
4	0.677419	NaN	10.0	NaN	0.677419	NaN	0.677419	NaN
5	NaN	0.660870	NaN	69.0	NaN	0.625000	NaN	0.698276
6	0.702128	NaN	17.0	NaN	0.702128	NaN	0.702128	NaN
7	0.774194	NaN	39.0	NaN	0.774194	NaN	0.774194	NaN
11	NaN	0.578512	NaN	62.0	NaN	0.578512	NaN	0.743902
12	NaN	0.605263	NaN	47.0	NaN	0.605263	NaN	0.605263
14	0.918919	NaN	75.0	NaN	0.918919	NaN	0.983871	NaN
16	0.884615	NaN	40.0	NaN	0.884615	NaN	0.884615	NaN
20	NaN	1.000000	NaN	25.0	NaN	0.647059	NaN	1.000000
26	NaN	1.000000	NaN	43.0	NaN	1.000000	NaN	1.000000
75	NaN	1.000000	NaN	20.0	NaN	1.000000	NaN	1.000000
126	NaN	1.000000	NaN	50.0	NaN	1.000000	NaN	1.000000
129	NaN	0.962963	NaN	18.0	NaN	0.962963	NaN	0.962963
146	NaN	1.000000	NaN	18.0	NaN	1.000000	NaN	1.000000
156	NaN	1.000000	NaN	32.0	NaN	1.000000	NaN	1.000000
178	1.000000	NaN	32.0	NaN	1.000000	NaN	1.000000	NaN

In [60]:

# Shared plot styling, read as module-level names by the plotting cells and
# helper functions further down.
markersize=3
linestyle="-"
linewidth=0.5
# Per-gender legend label, colour and marker, keyed by the gender code.
gender_params={
    "F": dict(
        label="Female",
        color="crimson",
        marker="o",
    ),
    "M": dict(
        label="Male",
        color="dodgerblue",
        marker="s",
    ),
    
}
# Order in which genders are drawn.
genders = ["F", "M"]

# Keyword arguments for errorbar-style scatter plots: markers only
# (linestyle "none") with thin error-bar caps.
plot_params=dict(
    markersize=markersize,
    capsize=0.5, elinewidth=0.01,
    alpha=0.6,
    linestyle="none"
)

In [61]:

# Median author expertise (match_prop) versus number of prior papers, one
# panel per author position, with a shaded band between the 0.05 and 0.95
# quantiles for each gender.
fig, ax = plt.subplots(2,1, figsize=(8,4))
for i, (df_t, title, axi) in enumerate(zip(
        [df_t_first, df_t_last],
        ["First", "Last"],
        ax
    )):

    # Rows without a match_prop value are dropped before aggregating.
    df_t = df_t[~df_t.match_prop.isnull()].groupby(["auth_prev_papers", "gender"]).match_prop.agg([
        pd.Series.median, pd.Series.count,
        get_lower_quantile, get_upper_quantile
    ]).unstack()
    for j, gender in enumerate(genders):
        axi.fill_between(df_t.index,
                 df_t["get_lower_quantile"][gender],
                 df_t["get_upper_quantile"][gender],
                 color=gender_params[gender]["color"],
                 alpha=0.1)

        axi.plot(df_t.index, df_t["median"][gender],
                label=gender_params[gender]["label"],
                color=gender_params[gender]["color"],
                linewidth=linewidth,
                linestyle=linestyle)
    axi.set_ylim([0.3, 1.0])
    axi.set_xlim([1, 100])
    axi.set_title(title)
    axi.set_xlabel("Author's prior papers")
    # The band is drawn from get_lower_quantile / get_upper_quantile, i.e. the
    # 0.05 and 0.95 quantiles, so the label states those values.
    axi.set_ylabel("Median\n($\\pm [0.05, 0.95]$ quantiles)\nauthor expertise")
    axi.legend(title="Gender (Author position)", loc="lower right")
sns.despine(offset=10)
fig.tight_layout()
# Saved under its own name: the mean/CI figure in the next cell writes
# "Author_papers_expertise_gender.pdf" and would otherwise overwrite this one.
plt.savefig("Review_Figures/Author_papers_expertise_gender_median.pdf", bbox_inches="tight")

In [62]:

# Mean author expertise (match_prop) versus number of prior papers, one panel
# per author position, with a shaded 95% confidence band for each gender.
fig, ax = plt.subplots(1,2, figsize=(8,4))
for i, (df_t, title, axi) in enumerate(zip(
        [df_t_first, df_t_last],
        ["First", "Last"],
        ax
    )):

    # mean_confidence_interval returns a (mean, ci_l, ci_u) Series per group;
    # the two unstack calls move the statistic and the gender into columns.
    df_t = df_t[~df_t.match_prop.isnull()].groupby(["auth_prev_papers", "gender"]).match_prop.apply(
        mean_confidence_interval
    ).unstack().unstack()
    display(df_t.head())
    for j, gender in enumerate(genders):
        axi.fill_between(df_t.index,
                 df_t["ci_l"][gender],
                 df_t["ci_u"][gender],
                 color=gender_params[gender]["color"],
                 alpha=0.5)

        axi.plot(df_t.index, df_t["mean"][gender],
                label=gender_params[gender]["label"],
                color=gender_params[gender]["color"],
                linewidth=linewidth,
                linestyle=linestyle)
    axi.set_ylim([0.3, 1.0])
    axi.set_xlim([1, 30])
    axi.set_title(title)
    axi.set_xlabel("Author's prior papers")
    # The percent sign is escaped: inside $...$ mathtext only "\%" is a
    # recognised symbol.
    axi.set_ylabel("Mean\n($\\pm 95\\%$ CI)\nauthor expertise")
    axi.legend(title="Gender", loc="lower right")
sns.despine(offset=10)
fig.tight_layout()
plt.savefig("Review_Figures/Author_papers_expertise_gender.pdf", bbox_inches="tight")

	mean		ci_l		ci_u
gender	F	M	F	M	F	M
auth_prev_papers
0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
1	0.410855	0.399922	0.410469	0.399567	0.411241	0.400277
2	0.534227	0.521689	0.533804	0.521308	0.534650	0.522071
3	0.597923	0.588261	0.597464	0.587861	0.598382	0.588662
4	0.642608	0.626045	0.642119	0.625621	0.643098	0.626469

	mean		ci_l		ci_u
gender	F	M	F	M	F	M
auth_prev_papers
0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
1	0.408693	0.386917	0.407659	0.386120	0.409726	0.387713
2	0.517016	0.492043	0.515838	0.491161	0.518195	0.492926
3	0.567342	0.552091	0.566088	0.551190	0.568596	0.552992
4	0.621439	0.596012	0.620200	0.595124	0.622678	0.596899

In [63]:

# Mean self-citation rate per expertise (match_prop) decile and gender, one
# panel per author position, drawn as side-by-side bars.
fig, ax = plt.subplots(2,1, figsize=(8,4))
for i, (df_t, title, axi) in enumerate(zip(
        [df_t_first, df_t_last],
        ["First", "Last"],
        ax
    )):

    df_t = df_t[~df_t.match_prop.isnull()].assign(
        expertise=lambda x: pd.cut(x["match_prop"],
                                   # 11 edges 0.0, 0.1, ..., 1.0. The previous
                                   # np.arange(0,1,0.1) stopped at 0.9, so rows
                                   # with match_prop in (0.9, 1] got no bin and
                                   # were dropped by the groupby.
                                   bins=np.linspace(0, 1, 11),
                                   include_lowest=True,
                                   right=True
                                  )
    ).groupby(["expertise", "gender"]).is_self_cite.agg([
        pd.Series.mean, pd.Series.count,
    ]).unstack()
    width=0.4
    for j, gender in enumerate(genders):
        # Offset each gender's bars by one bar width so they sit side by side.
        axi.bar(np.arange(df_t.index.shape[0]) + j*width, df_t["mean"][gender],
                label=gender_params[gender]["label"],
                color=gender_params[gender]["color"],
                width=width
               )
    # Centre each tick between the two bars of its bin.
    axi.set_xticks(np.arange(df_t.index.shape[0]) + width / 2)
    axi.set_xticklabels(df_t.index.values)
    axi.set_title(title)
    axi.set_xlabel("Author's expertise")
    axi.set_ylabel("Mean\nself-citation")
    axi.legend(title="Gender", loc="upper left", ncol=2)
sns.despine(offset=10)
fig.tight_layout()
plt.savefig("Review_Figures/Self_citation_expertise_gender.pdf", bbox_inches="tight")

In [64]:

plt.matplotlib.__version__

Out[64]:

'2.0.0'

In [65]:

def model_predictions(df, formula, df_test, verbose=False, model_params=None):
    """Fit a logistic regression on ``df`` and predict on ``df_test``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing every variable named in ``formula``.
    formula : str
        patsy formula with the outcome on the left-hand side.
    df_test : pandas.DataFrame
        Data to predict on; it needs the predictor columns used by
        ``formula``.
    verbose : bool, optional
        If true, print shapes, show the optimiser output and display the
        model summary.
    model_params : dict, optional
        Extra keyword arguments forwarded to ``Logit(...).fit``.

    Returns
    -------
    (y_test, model)
        Predictions for ``df_test`` and the fitted statsmodels results object.
    """
    if model_params is None:
        model_params = dict()
    if verbose:
        print("%s %s %s" % (df.shape, df_test.shape, formula))
    y, X = patsy.dmatrices(formula, data=df, return_type="dataframe")
    if verbose:
        print("Created dataframes")
        print("X.shape = %s, y.shape = %s" % (X.shape, y.shape))
    model = Logit(y, X).fit(disp=verbose,
                            **model_params
                            #method='lbfgs', maxiter=50
                           )
    if verbose:
        display(model.summary2())
    # Reuse the training design so the test matrix has exactly the same
    # columns and coding as X, even if df_test does not contain every level
    # seen in training. Only the right-hand side is built here.
    X_test, = patsy.build_design_matrices(
        [X.design_info], df_test, return_type="dataframe")
    if verbose:
        print("%s" % (X_test.shape,))
    y_test = model.predict(X_test)
    return y_test, model

In [66]:

df_test = pd.DataFrame({
            "auth_prev_papers": np.arange(0,150)
        }).assign(is_self_cite=1)
df_test.head()

Out[66]:

	auth_prev_papers	is_self_cite
0	0	1
1	1	1
2	2	1
3	3	1
4	4	1

In [67]:

formula = ("is_self_cite ~ "
               "I(auth_prev_papers == 0)"
               "+ I(auth_prev_papers == 1)"
               "+ np.log10(auth_prev_papers + 1) + I(np.log10(auth_prev_papers + 1)**2)"
              )
y_test, model = model_predictions(
    df_t_first[(df_t_first.gender == "M") 
               & (df_t_first.auth_prev_papers <= 100)],
    formula,
    df_test, verbose=True)

((15801900, 64), (150, 2), 'is_self_cite ~ I(auth_prev_papers == 0)+ I(auth_prev_papers == 1)+ np.log10(auth_prev_papers + 1) + I(np.log10(auth_prev_papers + 1)**2)')
Created dataframes
X.shape = (15801900, 5), y.shape = (15801900, 1)
Optimization terminated successfully.
         Current function value: 0.187466
         Iterations 10

Model:	Logit	Pseudo R-squared:	0.064
Dependent Variable:	is_self_cite	AIC:	5924654.1657
Date:	2017-08-17 11:23	BIC:	5924727.0439
No. Observations:	15801900	Log-Likelihood:	-2.9623e+06
Df Model:	4	LL-Null:	-3.1648e+06
Df Residuals:	15801895	LLR p-value:	0.0000
Converged:	1.0000	Scale:	1.0000
No. Iterations:	10.0000

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
Intercept	-4.3672	0.0105	-415.6943	0.0000	-4.3878	-4.3466
I(auth_prev_papers == 0)[T.True]	-1.4559	0.0176	-82.6117	0.0000	-1.4905	-1.4214
I(auth_prev_papers == 1)[T.True]	-0.2761	0.0094	-29.5123	0.0000	-0.2944	-0.2577
np.log10(auth_prev_papers + 1)	1.4643	0.0175	83.5181	0.0000	1.4299	1.4986
I(np.log10(auth_prev_papers + 1) ** 2)	-0.0989	0.0068	-14.4466	0.0000	-0.1123	-0.0855

(150, 5)

In [68]:

# Coefficient and standard error of the "no prior papers" indicator.
# Label-based .loc replaces the deprecated .ix indexer.
model.summary2().tables[1].loc["I(auth_prev_papers == 0)[T.True]", ["Coef.", "Std.Err."]]

Out[68]:

Coef.      -1.455942
Std.Err.    0.017624
Name: I(auth_prev_papers == 0)[T.True], dtype: float64

In [69]:

paper_filter = 100
df_t_data = df_t_first
df_t_data = df_t_data[(df_t_data.auth_prev_papers <= paper_filter)]
df_t_full = df_t_data.groupby("gender")[["auth_prev_papers", "is_self_cite"]].agg([np.mean, np.std, len])
df_t_full

Out[69]:

	auth_prev_papers			is_self_cite
	mean	std	len	mean	std	len
gender
-	NaN	NaN	0	NaN	NaN	0
F	10.205991	15.322557	9172202	0.038016	0.191234	9172202
M	16.654528	21.158397	15801900	0.050600	0.219179	15801900

In [70]:

df_t_full["auth_prev_papers"]

Out[70]:

	mean	std	len
gender
-	NaN	NaN	0
F	10.205991	15.322557	9172202
M	16.654528	21.158397	15801900

In [71]:

df_t_full[("auth_prev_papers", "error")] = df_t_full[("auth_prev_papers", "std")] / np.sqrt(df_t_full[("auth_prev_papers", "len")])
df_t_full[("is_self_cite", "error")] = np.sqrt(
    (df_t_full[("is_self_cite", "mean")] * (1-df_t_full[("is_self_cite", "mean")]))/df_t_full[("is_self_cite", "len")])
df_t_full = df_t_full.sort_index(level=0, axis=1)
df_t_full

Out[71]:

	auth_prev_papers				is_self_cite
	mean	std	len	error	mean	std	len	error
gender
-	NaN	NaN	0	NaN	NaN	NaN	0	NaN
F	10.205991	15.322557	9172202	0.005059	0.038016	0.191234	9172202	0.000063
M	16.654528	21.158397	15801900	0.005323	0.050600	0.219179	15801900	0.000055

In [72]:

# Mean prior-paper count for male authors. The row label is spelled out
# instead of relying on `gender` left over from an earlier plotting loop, and
# .loc replaces the deprecated .ix indexer.
df_t_full.loc["M", [("auth_prev_papers", "mean")]].values

Out[72]:

array([ 16.65452756])

In [73]:

def get_age_self_cite_data(df_t_data):
    """Aggregate self-citation statistics by prior-paper count and gender.

    Parameters
    ----------
    df_t_data : pandas.DataFrame
        Needs ``auth_prev_papers``, ``gender`` and ``is_self_cite`` columns.

    Returns
    -------
    df_t : pandas.DataFrame
        Indexed by ``auth_prev_papers``; columns are (``mean``/``len``,
        gender) for ``is_self_cite``.
    df_t_full : pandas.DataFrame
        One row per gender with mean, std, len and standard error of
        ``log10(auth_prev_papers + 1)`` and of ``is_self_cite``, plus
        ``l_error``/``u_error``: mean -/+ one standard error of the log-scale
        paper count, transformed back to a paper count.
    error : pandas.DataFrame
        Binomial standard error ``sqrt(p * (1 - p) / n)`` for each cell of
        ``df_t``.
    """
    df_t = df_t_data.groupby(
        ["auth_prev_papers", "gender"]).is_self_cite.agg([np.mean, len]).unstack()
    # Per-gender summary, with the paper count on a log10(x + 1) scale.
    df_t_full = df_t_data.assign(
        auth_prev_papers=np.log10(df_t_data["auth_prev_papers"] + 1)
    ).groupby("gender")[["auth_prev_papers", "is_self_cite"]].agg([np.mean, np.std, len])
    df_t_full[("auth_prev_papers", "error")] = (
        df_t_full[("auth_prev_papers", "std")]
        / np.sqrt(df_t_full[("auth_prev_papers", "len")]))
    df_t_full[("is_self_cite", "error")] = np.sqrt(
        ((df_t_full[("is_self_cite", "mean")]
          * (1-df_t_full[("is_self_cite", "mean")])
         )/df_t_full[("is_self_cite", "len")]))
    # Back-transform mean -/+ error of the *paper count* from the log scale.
    # These bounds were previously built from the is_self_cite mean and error
    # by mistake.
    df_t_full[("auth_prev_papers", "l_error")] = 10**(
        df_t_full[("auth_prev_papers", "mean")] - df_t_full[("auth_prev_papers", "error")]) - 1
    df_t_full[("auth_prev_papers", "u_error")] = 10**(
        df_t_full[("auth_prev_papers", "mean")] + df_t_full[("auth_prev_papers", "error")]) - 1
    df_t_full = df_t_full.sort_index(level=0, axis=1)

    error = np.sqrt(df_t["mean"].multiply(1 - df_t["mean"]).divide(df_t["len"]))
    return df_t, df_t_full, error


def plot_age_vs_self_cite(axi, df_t_data, df_t, df_t_full, error, gender,
                          gender_params, plot_params, model_params,
                          formula, df_test, error_plot=True,
                          mean_line=False, verbose=False):
    """Plot one gender's self-citation rate against prior-paper count.

    Draws, on ``axi``: the per-count mean rate (with error bars when
    ``error_plot`` is true), the overall mean for ``gender`` (a dashed
    horizontal line when ``mean_line`` is true, otherwise a single large
    marker with a y error bar), and the curve predicted by a logistic model
    fitted on ``df_t_data`` with ``formula`` and evaluated on ``df_test``.

    Parameters
    ----------
    axi : matplotlib axes to draw on.
    df_t_data : DataFrame the model is fitted on.
    df_t, df_t_full, error : the three values returned by
        ``get_age_self_cite_data``.
    gender : key into ``gender_params`` and into the gender columns/rows of
        ``df_t``, ``df_t_full`` and ``error``.
    gender_params : dict of per-gender ``label``/``color``/``marker``.
    plot_params : keyword arguments for the scatter/errorbar call.
    model_params : keyword arguments forwarded to the model fit.
    formula, df_test : passed to ``model_predictions``.
    error_plot, mean_line, verbose : bool switches described above.

    Returns
    -------
    (axi, model)
        ``model`` is ``None`` if fitting failed; the failure is printed with
        its traceback and not re-raised.

    Notes
    -----
    Reads the module-level ``linewidth``.

    TODO: Add beta of Male Female indicator to the plot.
    """
    color = gender_params[gender]["color"]
    marker = gender_params[gender]["marker"]
    if error_plot:
        axi.errorbar(df_t.index.values, df_t["mean"][gender].values,
                     yerr=error[gender].values,
                     color=color,
                     marker=marker,
                     **plot_params
                    )
    else:
        # plain plot() does not understand the error-bar-only keywords
        point_params = {k: v for k, v in plot_params.items()
                        if k not in ("capsize", "elinewidth")}
        axi.plot(df_t.index.values, df_t["mean"][gender].values,
                 color=color,
                 marker=marker,
                 **point_params
                )

    # .loc replaces the deprecated .ix indexer. The paper-count mean is stored
    # on a log10(x + 1) scale and is transformed back here.
    x_mean = 10**(df_t_full.loc[gender, [("auth_prev_papers", "mean")]].values)-1
    y_mean = df_t_full.loc[gender, [("is_self_cite", "mean")]].values
    y_mean_err = df_t_full.loc[gender, [("is_self_cite", "error")]].values
    if mean_line:
        axi.axhline(
            y=y_mean[0],
            color=color,
            label="{} ({:.2f}% self-citations)".format(
                gender_params[gender]["label"], y_mean[0]*100),
            linestyle="--",
            lw=linewidth+1
        )
    else:
        axi.errorbar(
            x_mean, y_mean,
            yerr=y_mean_err,
            color=color,
            marker=marker,
            label="{}(x={:.2f},y={:.2f})".format(
                gender_params[gender]["label"], x_mean[0], y_mean[0]*100),
            ms=10,
        )

    print("Fitting model for %s" % gender)
    model = None
    try:
        y_test, model = model_predictions(
                df_t_data,
                formula, df_test, verbose=verbose, model_params=model_params)
        axi.plot(df_test["auth_prev_papers"], y_test,
                 color=color,
                 linestyle="-",
                 linewidth=linewidth
                )
    except Exception:
        # Best effort: keep the figure even if the fit fails, but report why.
        # Unlike a bare except, this lets KeyboardInterrupt stop a long fit.
        print("Modeling failed.")
        print(df_t_data[df_t_data.auth_prev_papers < 2].auth_prev_papers.value_counts())
        import traceback
        traceback.print_exc()
    return axi, model

def plot_age_dist(axi_dist, df_t, gender, gender_params, cdf=False):
    """Draw one gender's share of rows at each index value of ``df_t``.

    ``df_t["len"][gender]`` holds the row counts; they are normalised to sum
    to one and, when ``cdf`` is true, accumulated into a cumulative share.
    The curve is drawn on ``axi_dist`` with the gender's colour, label and
    marker. Reads the module-level ``markersize``.

    Returns ``axi_dist``.
    """
    counts = df_t["len"][gender]
    share = counts * 1. / counts.sum()
    curve = share.cumsum() if cdf else share
    style = gender_params[gender]
    axi_dist.plot(
        df_t.index, curve,
        color=style["color"],
        label=style["label"],
        marker=style["marker"],
        linestyle="-",
        alpha=0.5,
        markersize=markersize,
    )
    return axi_dist
    
    

In [74]:

plot_params

Out[74]:

{'alpha': 0.6,
 'capsize': 0.5,
 'elinewidth': 0.01,
 'linestyle': 'none',
 'markersize': 3}

In [75]:

# Self-citation rate versus prior-paper count per gender (top row) and the
# cumulative share of citations by prior-paper count (bottom row), for first
# and last authors. A logistic model with a gender term is fitted per panel.
formula = ("is_self_cite ~ "
               "I(auth_prev_papers == 0)"
               "+ I(auth_prev_papers == 1)"
               "+ C(gender, levels=['M', 'F'])"
               "+ np.log10(auth_prev_papers + 1) + I(np.log10(auth_prev_papers + 1)**2)"
              )
# Row label of the female-vs-male coefficient produced by the C(gender, ...)
# term of the formula above.
gender_term = "C(gender, levels=['M', 'F'])[T.F]"

fig, ax = plt.subplots(
    nrows=2, ncols=2,
    sharex=True, sharey='row',
    gridspec_kw={'height_ratios': [2, 1]},
    figsize=(8,6))
# Only bounds the prediction grid below; the data themselves are not filtered.
paper_filter = 100

for i, (df_t_data, title) in enumerate(zip(
        [df_t_first, df_t_last],
        ["First", "Last"],
    )):
    axi, axi_dist = ax[:, i].flatten()

    df_t, df_t_full, error = get_age_self_cite_data(df_t_data)
    for j, gender in enumerate(genders):
        df_test = pd.DataFrame({
                    "auth_prev_papers": np.arange(0,min([paper_filter, 150]))
                }).assign(is_self_cite=1, gender=gender)
        model_params = dict()
        # NOTE: the full (both-gender) frame is passed each time, so the same
        # model is refitted for every gender; only the prediction grid differs.
        axi, model = plot_age_vs_self_cite(axi,
                                    df_t_data,
                                    df_t, df_t_full, error, gender,
                                    gender_params, {
                                        k: (1 if k == "markersize" else v)
                                        for k,v in plot_params.items()
                                    }, model_params,
                                    formula, df_test, mean_line=True, verbose=False)
        axi_dist = plot_age_dist(axi_dist, df_t, gender, gender_params, cdf=True)
        legend_title = ""
        if model:
            # .loc replaces the deprecated .ix indexer.
            model_stats = model.summary2().tables[1].loc[
                gender_term,
                ["Coef.", "Std.Err."]].values
            # "\\ " is an explicit mathtext space; the backslash is doubled so
            # the string holds no invalid Python escape sequence.
            legend_title = "$\\beta_{F-M}=%.3f\\ (%.3f)$" % (model_stats[0], model_stats[1])
    axi.set_ylim([0, 0.15])
    axi.set_xlim([0, 100])
    axi.set_title(title)
    axi.set_xlabel("Age (pub-count)")
    axi.set_ylabel("Self-citation proportion")
    axi.legend(loc='lower right', title=legend_title)

    axi_dist.legend(loc='lower right')
    axi_dist.set_xlabel("Age (pub-count)")
    axi_dist.set_ylabel("Cumulative proportion\nof citations")

sns.despine(offset=10)
fig.tight_layout()
plt.savefig("Review_Figures/Author_papers_self_cite_gender.pdf", bbox_inches="tight")

Fitting model for F
Fitting model for M
Fitting model for F
Fitting model for M

In [76]:

# Self-citation rate versus prior-paper count, split into four expertise
# (match_prop) bands (rows) and first/last author position (columns).
fig, ax = plt.subplots(4,2, sharey="row", figsize=(8,16))

# Each entry: (mask builder applied to match_prop, title template,
# largest auth_prev_papers value kept for that row).
filters = [
    (lambda x: x == 0, r"$%s = 0$", 20),
    (lambda x: (x > 0) & (x <= 0.5), r"$0 < %s \leq 0.5$", 50),
    (lambda x: (x > 0.5) & (x < 1), r"$0.5 < %s < 1$", 100),
    (lambda x: x == 1, r"$%s = 1$", 100),
]
# (gender column, legend label, colour, marker) for the two error-bar series.
series_styles = [
    ("F", "Female", "crimson", "o"),
    ("M", "Male", "dodgerblue", "^"),
]
for i, (idx_filter, filter_title, paper_filter) in enumerate(filters):
    for j, (df_t, title, axi) in enumerate(zip(
            [df_t_first, df_t_last],
            ["First", "Last"],
            ax[i]
        )):
        keep = idx_filter(df_t.match_prop) & (df_t.auth_prev_papers <= paper_filter)
        df_t = df_t[keep].groupby(
            ["auth_prev_papers", "gender"]).is_self_cite.agg([np.mean, len]).unstack()
        # binomial standard error of each mean
        error = np.sqrt(df_t["mean"].multiply(1 - df_t["mean"]).divide(df_t["len"]))
        for col, label, color, marker in series_styles:
            axi.errorbar(df_t.index, df_t["mean"][col], yerr=error[col],
                         label=label, color=color, linestyle="none",
                         marker=marker, alpha=0.7)
        axi.set_title("%s [%s]" % (title, filter_title % "exp"))
        axi.set_xlabel("Author's prior papers")
        axi.set_ylabel("Self-citation proportion")
        axi.legend(loc='best', title="Gender")
sns.despine(offset=10)
fig.tight_layout()
plt.savefig("Review_Figures/Author_papers_self_cite_gender_exp.pdf", bbox_inches="tight")

Load Journal names¶

In [77]:

%%time
# Load the PMID -> journal lookup; only these two columns are read from the
# tab-separated article dump.
df_journals = pd.read_csv("data/FullArticlesData.txt", sep="\t", usecols=["PMID", "journal"])
df_journals.head()

CPU times: user 1min 26s, sys: 6.56 s, total: 1min 32s
Wall time: 1min 32s

In [78]:

# Peek at the first rows of the PMID -> journal lookup.
df_journals.head()

Out[78]:

	PMID	journal
0	26151966	J Hum Lact
1	26151965	J Hum Lact
2	26151955	EuroIntervention
3	26151954	EuroIntervention
4	26151953	EuroIntervention

In [79]:

# The 30 journal names with the most rows in the lookup table.
df_journals.journal.value_counts().head(30)

Out[79]:

J Biol Chem                   171068
Science                       167415
PLoS One                      133591
Lancet                        129945
Proc Natl Acad Sci U S A      121705
Nature                        104418
Br Med J                       97226
Biochim Biophys Acta           96039
Biochem Biophys Res Commun     78341
Phys Rev Lett                  76322
N Engl J Med                   72020
JAMA                           66849
BMJ                            65858
Biochemistry                   62430
J Immunol                      62245
Brain Res                      56834
Am J Physiol                   54726
Biochem J                      54355
J Bacteriol                    51716
J Am Chem Soc                  50057
Cancer Res                     48966
Ann N Y Acad Sci               47684
J Urol                         47368
Phys Rev B Condens Matter      46890
FEBS Lett                      46770
Appl Opt                       43386
Blood                          43160
J Virol                        42269
Med J Aust                     41119
Ugeskr Laeger                  40687
Name: journal, dtype: int64

Journal categories¶

MEDICINE - NEJM, JAMA, LANCET
BIOLOGY - CELL, Journal of Bio Chem
Bioinformatics - PLoS Com Bio, BMC BioInfo
EPIDEMIOLOGY - MMWR. Morbidity and Mortality Weekly Report, Emerging Infectious Diseases, International Journal of Epidemiology
DENTISTRY - Journal of Endodontics, Journal of Clinical Periodontology, Journal of Dental Research
GENERIC - Proc Natl Acad Sci U S A, Nature, Science, PLoS One

In [80]:

# Journal abbreviations (as they appear in `df_journals.journal`) grouped into
# the three categories analysed below.

# General science
generic_journals = set([
    'Proc Natl Acad Sci U S A', 'Nature', 'Science',
    'Ann N Y Acad Sci',
])

# General medicine.
# Left out on purpose (commented out in the original list):
# 'Am J Physiol', 'Med J Aust', 'Ugeskr Laeger'.
medicine_journals = set([
    'JAMA', 'Lancet', 'N Engl J Med',
    'BMJ', 'Cancer Res', 'Clin Cancer Res', 'J Clin Oncol',
    'J Am Coll Cardiol', 'Gut', 'Circulation', 'Blood',
    'J Immunol', 'Brain Res',
    'J Urol',
])

# Biology
biology_journals = set([
    'J Biol Chem', 'Cell', 'Adv Exp Med Biol',
    'Mol Cell', 'Biochim Biophys Acta', 'Biochemistry',
    'Biochem J', 'FEBS Lett', 'J Bacteriol', 'J Virol',
    'Bioinformatics', 'Nucleic Acids Res',
])

JOURNAL_NAMES = dict(
    GENERIC=generic_journals,
    MEDICINE=medicine_journals,
    BIOLOGY=biology_journals,
)

JOURNAL_NAMES

Out[80]:

{'BIOLOGY': {'Adv Exp Med Biol',
  'Biochem J',
  'Biochemistry',
  'Biochim Biophys Acta',
  'Bioinformatics',
  'Cell',
  'FEBS Lett',
  'J Bacteriol',
  'J Biol Chem',
  'J Virol',
  'Mol Cell',
  'Nucleic Acids Res'},
 'GENERIC': {'Ann N Y Acad Sci',
  'Nature',
  'Proc Natl Acad Sci U S A',
  'Science'},
 'MEDICINE': {'BMJ',
  'Blood',
  'Brain Res',
  'Cancer Res',
  'Circulation',
  'Clin Cancer Res',
  'Gut',
  'J Am Coll Cardiol',
  'J Clin Oncol',
  'J Immunol',
  'J Urol',
  'JAMA',
  'Lancet',
  'N Engl J Med'}}

In [81]:

# Row counts in the lookup table for the journals assigned to the MEDICINE category.
df_journals[df_journals.journal.isin(JOURNAL_NAMES["MEDICINE"])].journal.value_counts()

Out[81]:

Lancet               129945
N Engl J Med          72020
JAMA                  66849
BMJ                   65858
J Immunol             62245
Brain Res             56834
Cancer Res            48966
J Urol                47368
Blood                 43160
Circulation           40094
J Am Coll Cardiol     21580
J Clin Oncol          21075
Gut                   15952
Clin Cancer Res       14845
Name: journal, dtype: int64

In [82]:

# Preview: tag every PMID of a selected journal with its category name.
typed_journal_frames = []
for category, names in JOURNAL_NAMES.items():
    in_category = df_journals.journal.isin(names)
    category_rows = df_journals[in_category][["PMID", "journal"]]
    typed_journal_frames.append(
        category_rows.assign(JOURNAL_TYPE=category).reset_index(drop=True))
pd.concat(typed_journal_frames).head()

Out[82]:

	PMID	journal	JOURNAL_TYPE
0	26151898	Brain Res	MEDICINE
1	26151676	J Urol	MEDICINE
2	26151285	JAMA	MEDICINE
3	26151284	JAMA	MEDICINE
4	26151282	JAMA	MEDICINE

In [83]:

# NOTE(review): `df_test` and `gender` are not defined in this cell; presumably
# they are loop variables left over from an earlier plotting cell. Confirm this
# still runs under Restart & Run All, or drop the cell.
df_test.assign(gender=gender).head()

Out[83]:

	auth_prev_papers	gender	is_self_cite
0	0	M	1
1	1	M	1
2	2	M	1
3	3	M	1
4	4	M	1

In [84]:

def get_journal_data(df_t_data, journal_name):
    """Return the rows of ``df_t_data`` whose source paper is in ``journal_name``.

    The journal's PMIDs are looked up in the module-level ``df_journals`` frame
    (columns ``PMID`` and ``journal``) and matched against ``source_id``.

    Parameters
    ----------
    df_t_data : pandas.DataFrame
        Frame with a ``source_id`` column.
    journal_name : str
        Journal name exactly as stored in ``df_journals.journal``.

    Returns
    -------
    pandas.DataFrame
        The matching subset of ``df_t_data`` (empty if nothing matches).

    Side effects
    ------------
    Prints the journal name and its number of distinct PMIDs.
    """
    # `.loc` replaces `.ix`, which is deprecated and removed in pandas >= 1.0;
    # for a boolean row mask plus a column label the two are equivalent.
    journal_ids = set(df_journals.loc[df_journals.journal == journal_name, "PMID"].values)
    print(journal_name, len(journal_ids))
    return df_t_data[df_t_data.source_id.isin(journal_ids)]
    

def plot_journal_data(formula, selected_journals, filename,
                      paper_filter = 10000, min_citations=20000, verbose=False):
    """Plot self-citation proportion vs. author age for each selected journal.

    One figure row is drawn per journal, with a "First" and a "Last" author
    panel. Each panel uses the rows of the module-level ``df_t_first`` /
    ``df_t_last`` frames whose ``source_id`` is a PMID of that journal (looked
    up in ``df_journals``) and whose ``auth_prev_papers <= paper_filter``.
    Plotting and model fitting are delegated to ``get_age_self_cite_data`` and
    ``plot_age_vs_self_cite``, which must already be defined, along with the
    globals ``genders``, ``gender_params`` and ``plot_params``.

    Parameters
    ----------
    formula : str
        Model formula forwarded to ``plot_age_vs_self_cite``.
    selected_journals : iterable of str
        Journal names as stored in ``df_journals.journal``. Must support
        ``len()``.
    filename : str
        Path the finished figure is saved to.
    paper_filter : int
        Largest ``auth_prev_papers`` value kept.
    min_citations : int
        Panels with fewer rows than this are left empty and contribute no
        statistics row.
    verbose : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    (fig, journal_group_stats)
        ``journal_group_stats`` holds one list per plotted panel:
        ``[journal_name, position, n_rows, coef, std_err, p_value]`` for the
        female-vs-male gender term (NaN where no model was returned).
    """
    nrows = len(selected_journals)
    fig = plt.figure(figsize=(8, 4 * nrows))
    grid_size = (nrows, 2)
    gender_term = "C(gender, levels=['M', 'F'])[T.F]"
    stat_columns = ["Coef.", "Std.Err.", "P>|z|"]
    journal_group_stats = []
    for k, journal_name in enumerate(selected_journals):
        # `.loc` replaces `.ix`, which is deprecated and removed in pandas >= 1.0.
        journal_ids = set(df_journals.loc[df_journals.journal == journal_name, "PMID"].values)
        print(journal_name, len(journal_ids))
        for i, (df_position, title) in enumerate(zip(
                [df_t_first, df_t_last],
                ["First", "Last"],
            )):
            # The axes are created even for skipped panels so the grid layout
            # stays the same.
            axi = plt.subplot2grid(grid_size, (k, i))
            df_t_data = df_position[
                (df_position.auth_prev_papers <= paper_filter)
                & (df_position.source_id.isin(journal_ids))
            ]
            print(title, df_t_data.shape)
            if df_t_data.shape[0] < min_citations:
                # Too few citations: skip before doing any aggregation work.
                continue
            df_t, df_t_full, error = get_age_self_cite_data(df_t_data)
            legend_title = ""
            model_stats = [np.nan, np.nan, np.nan]
            for gender in genders:
                # Prediction grid: one row per pub-count value for this gender.
                df_test = pd.DataFrame({
                    "auth_prev_papers": np.arange(0, min([paper_filter, 150]))
                }).assign(is_self_cite=1, gender=gender)
                model_params = dict(method='lbfgs', maxiter=50)
                axi, model = plot_age_vs_self_cite(axi, df_t_data, df_t, df_t_full,
                                            error, gender, gender_params, plot_params, model_params,
                                            formula, df_test, mean_line=True, error_plot=False)
                if model:
                    model_stats = model.summary2().tables[1].loc[
                        gender_term, stat_columns].values.tolist()
                    legend_title = "$\\beta_{F-M}=%.3f\\ (%.3f)$" % (model_stats[0], model_stats[1])
            journal_group_stats.append(
                [journal_name, title, df_t_data.shape[0]] + model_stats)
            axi.set_ylim([0, 0.24])
            # First-author panels use a shorter x-range than last-author panels.
            xlim_max = 150
            if title == "First":
                xlim_max = 50
            axi.set_xlim([0, min([paper_filter, xlim_max])])
            axi.set_title("{0} [n={1:,}]\n({2})".format(title, df_t_data.shape[0], journal_name))
            axi.set_xlabel("Age (pub count)")
            axi.set_ylabel("Self-citation proportion")
            legend = axi.legend(loc='upper left', title=legend_title, ncol=1)
            legend.get_frame().set_facecolor('#FFFFFF')
    sns.despine(offset=10)
    fig.tight_layout()
    plt.savefig(filename, bbox_inches="tight")
    return fig, journal_group_stats

In [85]:

def get_proportion(journal_name, paper_filter=100, verbose=False):
    """Print how much of a journal's data lies at or below a pub-count cutoff.

    For the module-level ``df_t_first`` and ``df_t_last`` frames in turn, the
    rows whose ``source_id`` is a PMID of ``journal_name`` (looked up in
    ``df_journals``) are selected. The function prints the subset's shape,
    displays its gender counts, and prints the percentage of those rows with
    ``auth_prev_papers <= paper_filter``.

    Parameters
    ----------
    journal_name : str
        Journal name as stored in ``df_journals.journal``.
    paper_filter : int
        Cutoff on ``auth_prev_papers`` (inclusive).
    verbose : bool
        If true, also print the journal name and its number of distinct PMIDs.

    Returns
    -------
    None
    """
    # `.loc` replaces `.ix`, which is deprecated and removed in pandas >= 1.0.
    journal_ids = set(df_journals.loc[df_journals.journal == journal_name, "PMID"].values)
    if verbose:
        print(journal_name, len(journal_ids))
    print("Paper cutoff: {}".format(paper_filter))
    for df_t_data, title in zip(
            [df_t_first, df_t_last],
            ["First", "Last"],
        ):
        df_t = df_t_data[
            (df_t_data.source_id.isin(journal_ids))
        ]
        print(df_t.shape)
        display(df_t.gender.value_counts())
        total_papers = df_t.shape[0]
        if total_papers == 0:
            # No rows for this journal: report it instead of dividing by zero.
            print("{}\tProportion: n/a (no rows for this journal)".format(title))
            continue
        df_t = df_t[(df_t.auth_prev_papers <= paper_filter)]
        print("{}\tProportion: {:.2f}%".format(title, df_t.shape[0] * 100. / total_papers))

In [86]:

# Share of PNAS rows whose author had at most 30 prior papers, per author position.
get_proportion('Proc Natl Acad Sci U S A', paper_filter=30)

Paper cutoff: 30
(298356, 64)

M    201791
F     96565
-         0
Name: gender, dtype: int64

First	Proportion: 89.78%
(334968, 64)

M    284467
F     50501
-         0
Name: gender, dtype: int64

Last	Proportion: 26.02%

In [87]:

# Reduced logit specification: indicators for 0 and 1 prior papers, gender
# (male as the reference level), and a quadratic in log10(prior papers + 1).
journal_formula = "is_self_cite ~ " + "+ ".join([
    "I(auth_prev_papers == 0)",
    "I(auth_prev_papers == 1)",
    "C(gender, levels=['M', 'F'])",
    "np.log10(auth_prev_papers + 1)",
    "I(np.log10(auth_prev_papers + 1)**2)",
])
# Single-journal trial run before looping over every category.
selected_journals = set(['Cell'])
filename = "Review_Figures/Author_papers_self_cite_gender_journals.pdf"
fig, journal_group_stats = plot_journal_data(
    journal_formula,
    selected_journals,
    filename,
)

('Cell', 17515)
('First', (32399, 64))
Fitting model for F
Fitting model for M
('Last', (37042, 64))
Fitting model for F
Fitting model for M

In [88]:

# One row per journal, with the gender-term statistics side by side for first
# and last authors.
# `sort_index` replaces the deprecated `DataFrame.sortlevel`. Sorting on the
# author-position level only (sort_remaining=False) groups the First/Last
# blocks and leaves the statistic columns in their original order.
pd.DataFrame(journal_group_stats, columns=[
    "Journal", "Author Position", "citations",
    "beta", "stderr", "p-value"
]).pivot(
    index="Journal", columns="Author Position"
).reorder_levels([1, 0], axis=1).sort_index(axis=1, level=0, sort_remaining=False)

Out[88]:

Author Position	First				Last
	citations	beta	stderr	p-value	citations	beta	stderr	p-value
Journal
Cell	32399	-0.200469	0.075531	0.007951	37042	-0.08922	0.04975	0.072915

Plot per journal category¶

In [89]:

# Same reduced specification as the single-journal run: 0/1 prior-paper
# indicators, gender (male reference) and a quadratic in log10(prior papers + 1).
journal_formula = (
    "is_self_cite ~ I(auth_prev_papers == 0)+ I(auth_prev_papers == 1)"
    " + C(gender, levels=['M', 'F'])"
    "+ np.log10(auth_prev_papers + 1)+ I(np.log10(auth_prev_papers + 1)**2)"
)
category_figure_path = "Review_Figures/Author_papers_self_cite_gender_journals_{}.pdf"

# One figure and one list of per-panel statistics for each journal category.
df_journal_stats = {}
for journal_cat in JOURNAL_NAMES:
    print(journal_cat)
    selected_journals = JOURNAL_NAMES[journal_cat]
    filename = category_figure_path.format(journal_cat)
    fig, journal_group_stats = plot_journal_data(journal_formula, selected_journals, filename)
    df_journal_stats[journal_cat] = journal_group_stats

MEDICINE
('Cancer Res', 48966)
('First', (131329, 64))
Fitting model for F
Fitting model for M
('Last', (149313, 64))
Fitting model for F
Fitting model for M
('Circulation', 40094)
('First', (92220, 64))
Fitting model for F
Fitting model for M
('Last', (98741, 64))
Fitting model for F
Fitting model for M
('N Engl J Med', 72020)
('First', (30970, 64))
Fitting model for F
Fitting model for M
('Last', (31971, 64))
Fitting model for F
Fitting model for M
('BMJ', 65858)
('First', (23987, 64))
Fitting model for F
Fitting model for M
('Last', (24438, 64))
Fitting model for F
Fitting model for M
('J Am Coll Cardiol', 21580)
('First', (50523, 64))
Fitting model for F
Fitting model for M
('Last', (52579, 64))
Fitting model for F
Fitting model for M
('J Clin Oncol', 21075)
('First', (61722, 64))
Fitting model for F
Fitting model for M
('Last', (62049, 64))
Fitting model for F
Fitting model for M
('J Immunol', 62245)
('First', (208354, 64))
Fitting model for F
Fitting model for M
('Last', (228129, 64))
Fitting model for F
Fitting model for M
('JAMA', 66849)
('First', (33674, 64))
Fitting model for F
Fitting model for M
('Last', (34651, 64))
Fitting model for F
Fitting model for M
('Gut', 15952)
('First', (31027, 64))
Fitting model for F
Fitting model for M
('Last', (32663, 64))
Fitting model for F
Fitting model for M
('Clin Cancer Res', 14845)
('First', (91687, 64))
Fitting model for F
Fitting model for M
('Last', (96551, 64))
Fitting model for F
Fitting model for M
('Brain Res', 56834)
('First', (100389, 64))
Fitting model for F
Fitting model for M
('Last', (108379, 64))
Fitting model for F
Fitting model for M
('J Urol', 47368)
('First', (49314, 64))
Fitting model for F
Fitting model for M
('Last', (50379, 64))
Fitting model for F
Fitting model for M
('Lancet', 129945)
('First', (26344, 64))
Fitting model for F
Fitting model for M
('Last', (26301, 64))
Fitting model for F
Fitting model for M
('Blood', 43160)
('First', (140887, 64))
Fitting model for F
Fitting model for M
('Last', (152394, 64))
Fitting model for F
Fitting model for M
GENERIC
('Science', 167415)
('First', (41328, 64))
Fitting model for F
Fitting model for M
('Last', (45043, 64))
Fitting model for F
Fitting model for M
('Proc Natl Acad Sci U S A', 121705)
('First', (298356, 64))
Fitting model for F
Fitting model for M
('Last', (334968, 64))
Fitting model for F
Fitting model for M
('Ann N Y Acad Sci', 47684)
('First', (52540, 64))
Fitting model for F
Fitting model for M
('Last', (54224, 64))
Fitting model for F
Fitting model for M
('Nature', 104418)
('First', (46138, 64))
Fitting model for F
Fitting model for M
('Last', (50625, 64))
Fitting model for F
Fitting model for M
BIOLOGY
('Mol Cell', 5804)
('First', (33538, 64))
Fitting model for F
Fitting model for M
('Last', (39524, 64))
Fitting model for F
Fitting model for M
('Biochim Biophys Acta', 96039)
('First', (104434, 64))
Fitting model for F
Fitting model for M
('Last', (110138, 64))
Fitting model for F
Fitting model for M
('Adv Exp Med Biol', 31938)
('First', (21625, 64))
Fitting model for F
Fitting model for M
('Last', (22072, 64))
Fitting model for F
Fitting model for M
('Biochem J', 54355)
('First', (91576, 64))
Fitting model for F
Fitting model for M
('Last', (97174, 64))
Fitting model for F
Fitting model for M
('Biochemistry', 62430)
('First', (182433, 64))
Fitting model for F
Fitting model for M
('Last', (204527, 64))
Fitting model for F
Fitting model for M
('Nucleic Acids Res', 39570)
('First', (98322, 64))
Fitting model for F
Fitting model for M
('Last', (104933, 64))
Fitting model for F
Fitting model for M
('FEBS Lett', 46770)
('First', (94021, 64))
Fitting model for F
Fitting model for M
('Last', (99364, 64))
Fitting model for F
Fitting model for M
('Cell', 17515)
('First', (32399, 64))
Fitting model for F
Fitting model for M
('Last', (37042, 64))
Fitting model for F
Fitting model for M
('J Bacteriol', 51716)
('First', (102012, 64))
Fitting model for F
Fitting model for M
('Last', (109737, 64))
Fitting model for F
Fitting model for M
('Bioinformatics', 10026)
('First', (20756, 64))
Fitting model for F
Fitting model for M
('Last', (23014, 64))
Fitting model for F
Fitting model for M
('J Virol', 42269)
('First', (155081, 64))
Fitting model for F
Fitting model for M
('Last', (172265, 64))
Fitting model for F
Fitting model for M
('J Biol Chem', 171068)
('First', (676859, 64))
Fitting model for F
Fitting model for M
('Last', (758553, 64))
Fitting model for F
Fitting model for M

In [90]:

journal_table_cols = [
    "Journal", "Author Position", "citations",
    "beta", "stderr", "p-value"
]
# Display names for the category keys of JOURNAL_NAMES.
JOURNAL_CAT_NAMINGS={
    "GENERIC": "Science",
    "MEDICINE": "Medicine",
    "BIOLOGY": "Biology",
}
# Build one wide table per category (journals as rows, First/Last statistic
# blocks as columns), ordered by first-author citation count, then stack the
# categories under their display names.
# `sort_index` replaces the deprecated `DataFrame.sortlevel`. Sorting on the
# author-position level only (sort_remaining=False) groups the First/Last
# blocks and leaves the statistic columns in their original order.
df_journal_stats_all = pd.concat(
    {
        JOURNAL_CAT_NAMINGS[k]: pd.DataFrame(
            category_rows, columns=journal_table_cols
        ).pivot(
            index="Journal", columns="Author Position"
        ).reorder_levels([1, 0], axis=1).sort_index(
            axis=1, level=0, sort_remaining=False
        ).sort_values(("First", "citations"), ascending=False)

        for k, category_rows in df_journal_stats.items()
    }
)
with pd.option_context("display.precision",3, 'display.float_format', lambda x: '%.3f' % x):
    display(df_journal_stats_all)

	Author Position	First				Last
		citations	beta	stderr	p-value	citations	beta	stderr	p-value
	Journal
Biology	J Biol Chem	676859	-0.095	0.013	0.000	758553	0.009	0.010	0.348
	Biochemistry	182433	-0.036	0.024	0.143	204527	0.030	0.017	0.084
	J Virol	155081	-0.063	0.025	0.013	172265	0.024	0.018	0.185
	Biochim Biophys Acta	104434	-0.028	0.029	0.344	110138	0.003	0.025	0.915
	J Bacteriol	102012	-0.020	0.032	0.535	109737	0.082	0.022	0.000
	Nucleic Acids Res	98322	-0.107	0.035	0.002	104933	-0.051	0.027	0.061
	FEBS Lett	94021	-0.086	0.031	0.005	99364	0.007	0.027	0.805
	Biochem J	91576	-0.176	0.034	0.000	97174	-0.075	0.026	0.004
	Mol Cell	33538	-0.090	0.067	0.182	39524	-0.203	0.045	0.000
	Cell	32399	-0.200	0.076	0.008	37042	-0.089	0.050	0.073
	Adv Exp Med Biol	21625	-0.081	0.053	0.124	22072	0.101	0.056	0.070
	Bioinformatics	20756	-0.080	0.103	0.437	23014	0.009	0.082	0.912
Medicine	J Immunol	208354	-0.021	0.024	0.389	228129	-0.017	0.017	0.324
	Blood	140887	-0.041	0.028	0.140	152394	0.000	0.022	0.984
	Cancer Res	131329	0.056	0.029	0.057	149313	0.051	0.022	0.018
	Brain Res	100389	-0.071	0.031	0.025	108379	0.004	0.028	0.882
	Circulation	92220	-0.020	0.036	0.575	98741	-0.051	0.035	0.143
	Clin Cancer Res	91687	0.057	0.035	0.101	96551	0.050	0.031	0.113
	J Clin Oncol	61722	0.069	0.041	0.093	62049	0.056	0.041	0.176
	J Am Coll Cardiol	50523	-0.070	0.061	0.248	52579	0.124	0.057	0.030
	J Urol	49314	0.274	0.063	0.000	50379	0.099	0.063	0.113
	JAMA	33674	0.164	0.050	0.001	34651	-0.012	0.055	0.825
	Gut	31027	0.000	0.065	1.000	32663	0.028	0.069	0.683
	N Engl J Med	30970	0.068	0.061	0.270	31971	0.030	0.065	0.637
	Lancet	26344	-0.074	0.059	0.209	26301	-0.061	0.065	0.348
	BMJ	23987	0.170	0.065	0.009	24438	0.123	0.073	0.091
Science	Proc Natl Acad Sci U S A	298356	-0.067	0.020	0.001	334968	-0.016	0.015	0.297
	Ann N Y Acad Sci	52540	-0.035	0.033	0.288	54224	0.214	0.034	0.000
	Nature	46138	-0.162	0.054	0.003	50625	0.028	0.044	0.531
	Science	41328	-0.154	0.053	0.004	45043	0.066	0.041	0.107

In [91]:

# Print the combined table as LaTeX source, with floats formatted to three decimals.
with pd.option_context("display.precision",3, 'display.float_format', lambda x: '%.3f' % x):
    print(df_journal_stats_all.to_latex())

\begin{tabular}{llrrrrrrrr}
\toprule
        &                  &     First &        &        &         &      Last &        &        &         \\
        &                  & citations &   beta & stderr & p-value & citations &   beta & stderr & p-value \\
{} & Journal &           &        &        &         &           &        &        &         \\
\midrule
Biology & J Biol Chem &    676859 & -0.095 &  0.013 &   0.000 &    758553 &  0.009 &  0.010 &   0.348 \\
        & Biochemistry &    182433 & -0.036 &  0.024 &   0.143 &    204527 &  0.030 &  0.017 &   0.084 \\
        & J Virol &    155081 & -0.063 &  0.025 &   0.013 &    172265 &  0.024 &  0.018 &   0.185 \\
        & Biochim Biophys Acta &    104434 & -0.028 &  0.029 &   0.344 &    110138 &  0.003 &  0.025 &   0.915 \\
        & J Bacteriol &    102012 & -0.020 &  0.032 &   0.535 &    109737 &  0.082 &  0.022 &   0.000 \\
        & Nucleic Acids Res &     98322 & -0.107 &  0.035 &   0.002 &    104933 & -0.051 &  0.027 &   0.061 \\
        & FEBS Lett &     94021 & -0.086 &  0.031 &   0.005 &     99364 &  0.007 &  0.027 &   0.805 \\
        & Biochem J &     91576 & -0.176 &  0.034 &   0.000 &     97174 & -0.075 &  0.026 &   0.004 \\
        & Mol Cell &     33538 & -0.090 &  0.067 &   0.182 &     39524 & -0.203 &  0.045 &   0.000 \\
        & Cell &     32399 & -0.200 &  0.076 &   0.008 &     37042 & -0.089 &  0.050 &   0.073 \\
        & Adv Exp Med Biol &     21625 & -0.081 &  0.053 &   0.124 &     22072 &  0.101 &  0.056 &   0.070 \\
        & Bioinformatics &     20756 & -0.080 &  0.103 &   0.437 &     23014 &  0.009 &  0.082 &   0.912 \\
Medicine & J Immunol &    208354 & -0.021 &  0.024 &   0.389 &    228129 & -0.017 &  0.017 &   0.324 \\
        & Blood &    140887 & -0.041 &  0.028 &   0.140 &    152394 &  0.000 &  0.022 &   0.984 \\
        & Cancer Res &    131329 &  0.056 &  0.029 &   0.057 &    149313 &  0.051 &  0.022 &   0.018 \\
        & Brain Res &    100389 & -0.071 &  0.031 &   0.025 &    108379 &  0.004 &  0.028 &   0.882 \\
        & Circulation &     92220 & -0.020 &  0.036 &   0.575 &     98741 & -0.051 &  0.035 &   0.143 \\
        & Clin Cancer Res &     91687 &  0.057 &  0.035 &   0.101 &     96551 &  0.050 &  0.031 &   0.113 \\
        & J Clin Oncol &     61722 &  0.069 &  0.041 &   0.093 &     62049 &  0.056 &  0.041 &   0.176 \\
        & J Am Coll Cardiol &     50523 & -0.070 &  0.061 &   0.248 &     52579 &  0.124 &  0.057 &   0.030 \\
        & J Urol &     49314 &  0.274 &  0.063 &   0.000 &     50379 &  0.099 &  0.063 &   0.113 \\
        & JAMA &     33674 &  0.164 &  0.050 &   0.001 &     34651 & -0.012 &  0.055 &   0.825 \\
        & Gut &     31027 &  0.000 &  0.065 &   1.000 &     32663 &  0.028 &  0.069 &   0.683 \\
        & N Engl J Med &     30970 &  0.068 &  0.061 &   0.270 &     31971 &  0.030 &  0.065 &   0.637 \\
        & Lancet &     26344 & -0.074 &  0.059 &   0.209 &     26301 & -0.061 &  0.065 &   0.348 \\
        & BMJ &     23987 &  0.170 &  0.065 &   0.009 &     24438 &  0.123 &  0.073 &   0.091 \\
Science & Proc Natl Acad Sci U S A &    298356 & -0.067 &  0.020 &   0.001 &    334968 & -0.016 &  0.015 &   0.297 \\
        & Ann N Y Acad Sci &     52540 & -0.035 &  0.033 &   0.288 &     54224 &  0.214 &  0.034 &   0.000 \\
        & Nature &     46138 & -0.162 &  0.054 &   0.003 &     50625 &  0.028 &  0.044 &   0.531 \\
        & Science &     41328 & -0.154 &  0.053 &   0.004 &     45043 &  0.066 &  0.041 &   0.107 \\
\bottomrule
\end{tabular}

In [92]:

# Shape check on the combined per-journal table (rows, columns).
df_journal_stats_all.shape

Out[92]:

(30, 8)

Plot full models for journals¶

In [93]:

# List the columns available on the first-author frame.
df_t_first.columns

Out[93]:

Index([u'source_id', u'source_year', u'source_j', u'source_n_mesh',
       u'source_n_mesh_ex', u'source_is_eng', u'source_country',
       u'source_is_journal', u'source_is_review', u'source_is_case_rep',
       u'source_is_let_ed_com', u'source_T_novelty', u'source_V_novelty',
       u'source_PT_novelty', u'source_PV_novelty', u'source_ncites',
       u'source_n_authors', u'sink_id', u'sink_year', u'sink_j',
       u'sink_n_mesh', u'sink_n_mesh_ex', u'sink_is_eng', u'sink_is_journal',
       u'sink_is_review', u'sink_is_case_rep', u'sink_is_let_ed_com',
       u'sink_T_novelty', u'sink_V_novelty', u'sink_PT_novelty',
       u'sink_PV_novelty', u'sink_n_authors', u'year_span', u'journal_same',
       u'mesh_sim', u'title_sim', u'lang_sim', u'affiliation_sim',
       u'pubtype_sim', u'cite_sim', u'author_sim', u'gender_sim', u'eth_sim',
       u'n_common_authors', u'auid', u'gender', u'eth1', u'eth2', u'pos',
       u'pos_nice', u'sink_last_ncites', u'sink_prev_ncites',
       u'auth_last_npapers', u'auth_prev_papers', u'jj_sim', u'is_self_cite',
       u'first_year', u'last_year', u'au_age', u'match_len', u'match_prop',
       u'overall_coverage_len', u'overall_coverage_prop', u'eth_weight'],
      dtype='object')

In [94]:

# Logit specification for the per-journal "full" models. The pieces are joined
# without a separator, so the spacing inside each piece is significant.
journal_model_formula = "".join([
    "is_self_cite ~ ",
    # author's prior papers: 0/1 indicators plus a quadratic in log10(n + 1)
    "I(auth_prev_papers == 0)+ I(auth_prev_papers == 1)",
    "+ np.log10(auth_prev_papers + 1) + I(np.log10(auth_prev_papers + 1)**2)",
    # gender, male as the reference level
    "+ C(gender, levels=['M', 'F'])",
    # citations of the source paper: quadratic in log10
    "+ np.log10(source_ncites)+ I(np.log10(source_ncites)**2) ",
    # year span between the papers: sign/zero indicators plus a quadratic
    "+ I(year_span < 0) + I(year_span == 0)",
    " + mf.score_log_1(year_span) + I(mf.score_log_1(year_span)**2)",
    # prior citations of the sink paper
    "+ I(sink_prev_ncites == 0) + np.log10(sink_prev_ncites + 1)",
    " + I(np.log10(sink_prev_ncites + 1)**2)",
    # journal-journal similarity and same-journal flag
    "+ I(jj_sim == 0) + np.log10(jj_sim + 1) + I(np.log10(jj_sim + 1)**2) + journal_same",
])
# Terms that are commented out of this specification (kept for reference):
#   + C(source_country, levels=TOP_15_COUNTRIES)
#   + mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)
#   + I(source_ncites == 1)
#   + I(np.log10(source_ncites)**3)
#   + I(source_n_authors > 20)
#   + np.log10(np.clip(source_n_authors, 0, 20))
#   + I(np.log10(np.clip(source_n_authors, 0, 20)) ** 2)
#   + np.log10(source_n_mesh_ex + 1) + I(source_n_mesh_ex == 0)
#   + np.log10(sink_n_mesh_ex + 1) + I(sink_n_mesh_ex == 0)
#   + source_is_eng + source_is_journal + source_is_review + source_is_case_rep + source_is_let_ed_com
#   + sink_is_eng + sink_is_journal + sink_is_review + sink_is_case_rep + sink_is_let_ed_com
#   + np.log10(np.nan_to_num(source_V_novelty) + 1)
#   + np.log10(np.nan_to_num(sink_V_novelty) + 1) + I(np.log10(np.nan_to_num(sink_V_novelty) + 1)**2)

In [95]:

# Fit the full model separately for every (journal, author position) pair and
# keep the coefficient, standard error and p-value of the female-vs-male term.
# NOTE(review): the recorded run emits a ConvergenceWarning for these fits at
# maxiter=100, so the stored coefficients come from unconverged optimisations;
# consider a larger maxiter before relying on them.
gender_term = "C(gender, levels=['M', 'F'])[T.F]"
journal_full_model_stats = dict()
for journal_cat, selected_journals in JOURNAL_NAMES.items():
    journal_group_full_model_stats = []
    for journal_name in selected_journals:
        for df_position, title in zip(
                        [df_t_first, df_t_last],
                        ["First", "Last"],
                    ):
            print(title)
            # Stays NaN if the fit below fails.
            model_stats = [np.nan, np.nan, np.nan]
            df_t_data = get_journal_data(df_position, journal_name).copy()
            #prepare_data(df_t_data)
            try:
                y_test, model = model_predictions(
                    df_t_data, journal_model_formula,
                    df_t_data.iloc[:3], verbose=False,
                    model_params=dict(method='lbfgs', maxiter=100))
                # `.loc` replaces `.ix`, which is deprecated and removed in pandas >= 1.0.
                model_stats = model.summary2().tables[1].loc[
                        gender_term,
                        ["Coef.", "Std.Err.", "P>|z|"]].values.tolist()
            except Exception as err:
                # Best effort: record NaNs for this panel but say why it failed.
                # Catching Exception (not a bare except) lets KeyboardInterrupt
                # stop the loop.
                print("Failed to fit {} [{}]: {!r}".format(journal_name, title, err))
            journal_group_full_model_stats.append(
                [journal_name, title, df_t_data.shape[0]]+model_stats
            )
    journal_full_model_stats[journal_cat] = journal_group_full_model_stats

First
('Cancer Res', 48966)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Cancer Res', 48966)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Circulation', 40094)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Circulation', 40094)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('N Engl J Med', 72020)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('N Engl J Med', 72020)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('BMJ', 65858)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('BMJ', 65858)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('J Am Coll Cardiol', 21580)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('J Am Coll Cardiol', 21580)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('J Clin Oncol', 21075)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('J Clin Oncol', 21075)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('J Immunol', 62245)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('J Immunol', 62245)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('JAMA', 66849)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('JAMA', 66849)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Gut', 15952)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Gut', 15952)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Clin Cancer Res', 14845)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Clin Cancer Res', 14845)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Brain Res', 56834)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Brain Res', 56834)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('J Urol', 47368)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('J Urol', 47368)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Lancet', 129945)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Lancet', 129945)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Blood', 43160)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Blood', 43160)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Science', 167415)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Science', 167415)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Proc Natl Acad Sci U S A', 121705)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Proc Natl Acad Sci U S A', 121705)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Ann N Y Acad Sci', 47684)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Ann N Y Acad Sci', 47684)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Nature', 104418)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Nature', 104418)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Mol Cell', 5804)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Mol Cell', 5804)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Biochim Biophys Acta', 96039)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Biochim Biophys Acta', 96039)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Adv Exp Med Biol', 31938)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Adv Exp Med Biol', 31938)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Biochem J', 54355)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Biochem J', 54355)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Biochemistry', 62430)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Biochemistry', 62430)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Nucleic Acids Res', 39570)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Nucleic Acids Res', 39570)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('FEBS Lett', 46770)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('FEBS Lett', 46770)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Cell', 17515)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Cell', 17515)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('J Bacteriol', 51716)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('J Bacteriol', 51716)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('Bioinformatics', 10026)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('Bioinformatics', 10026)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('J Virol', 42269)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Last
('J Virol', 42269)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

First
('J Biol Chem', 171068)
Last
('J Biol Chem', 171068)

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

In [96]:

# Column names for one row of per-journal model statistics.
journal_table_cols = [
    "Journal", "Author Position", "citations",
    "beta", "stderr", "p-value"
]
# Display names for the journal-category keys of journal_full_model_stats.
JOURNAL_CAT_NAMINGS = {
    "GENERIC": "Science",
    "MEDICINE": "Medicine",
    "BIOLOGY": "Biology",
}


def _pivot_journal_stats(journal_group_stats):
    """Reshape one category's stat rows into a journal x (position, stat) table.

    Parameters
    ----------
    journal_group_stats : rows accepted by ``pd.DataFrame`` whose fields line
        up with ``journal_table_cols``.

    Returns
    -------
    pandas.DataFrame
        Indexed by journal, with columns keyed by (author position, statistic)
        and rows ordered by the first-author citation count, largest first.
    """
    return (
        pd.DataFrame(journal_group_stats, columns=journal_table_cols)
        .pivot(index="Journal", columns="Author Position")
        # pivot yields (statistic, position); put the author position on top.
        .reorder_levels([1, 0], axis=1)
        # sort_index(level=...) is the supported spelling of the deprecated
        # DataFrame.sortlevel and groups the columns by author position.
        .sort_index(axis=1, level=0)
        .sort_values(("First", "citations"), ascending=False)
    )


journal_tables_by_cat = {
    JOURNAL_CAT_NAMINGS[k]: _pivot_journal_stats(journal_group_stats)
    for k, journal_group_stats in journal_full_model_stats.items()
}
# Concatenate in sorted category order explicitly, so the row-group order does
# not depend on how the installed pandas version orders dict keys.
journal_cat_order = sorted(journal_tables_by_cat)
df_journal_full_model_stats = pd.concat(
    [journal_tables_by_cat[cat] for cat in journal_cat_order],
    keys=journal_cat_order,
)
with pd.option_context("display.precision",3, 'display.float_format', lambda x: '%.3f' % x):
    display(df_journal_full_model_stats)

	Author Position	First				Last
		citations	beta	stderr	p-value	citations	beta	stderr	p-value
	Journal
Biology	J Biol Chem	676859	-0.103	0.014	0.000	758553	0.015	0.010	0.122
	Biochemistry	182433	-0.005	0.026	0.859	204527	0.024	0.018	0.186
	J Virol	155081	-0.063	0.027	0.018	172265	0.005	0.019	0.781
	Biochim Biophys Acta	104434	-0.004	0.031	0.895	110138	0.050	0.026	0.057
	J Bacteriol	102012	-0.038	0.034	0.265	109737	0.063	0.023	0.006
	Nucleic Acids Res	98322	-0.037	0.037	0.315	104933	-0.031	0.028	0.271
	FEBS Lett	94021	-0.037	0.032	0.248	99364	0.005	0.028	0.857
	Biochem J	91576	-0.153	0.036	0.000	97174	-0.080	0.027	0.003
	Mol Cell	33538	-0.105	0.068	0.124	39524	-0.176	0.045	0.000
	Cell	32399	-0.151	0.076	0.047	37042	-0.122	0.051	0.016
	Adv Exp Med Biol	21625	-0.101	0.055	0.069	22072	0.091	0.058	0.114
	Bioinformatics	20756	-0.105	0.109	0.336	23014	-0.006	0.085	0.941
Medicine	J Immunol	208354	-0.028	0.025	0.254	228129	-0.020	0.018	0.253
	Blood	140887	-0.035	0.028	0.217	152394	0.003	0.023	0.896
	Cancer Res	131329	0.050	0.030	0.100	149313	0.041	0.022	0.064
	Brain Res	100389	-0.099	0.034	0.003	108379	0.016	0.029	0.571
	Circulation	92220	0.053	0.037	0.155	98741	-0.034	0.036	0.341
	Clin Cancer Res	91687	0.066	0.036	0.066	96551	0.048	0.032	0.135
	J Clin Oncol	61722	0.078	0.042	0.064	62049	0.085	0.042	0.042
	J Am Coll Cardiol	50523	-0.068	0.062	0.271	52579	0.202	0.058	0.000
	J Urol	49314	0.364	0.066	0.000	50379	0.162	0.064	0.012
	JAMA	33674	0.204	0.051	0.000	34651	-0.059	0.056	0.290
	Gut	31027	0.035	0.067	0.605	32663	0.065	0.070	0.353
	N Engl J Med	30970	0.017	0.063	0.780	31971	0.007	0.066	0.920
	Lancet	26344	-0.075	0.059	0.207	26301	-0.011	0.065	0.872
	BMJ	23987	0.057	0.067	0.396	24438	0.126	0.075	0.092
Science	Proc Natl Acad Sci U S A	298356	-0.008	0.020	0.688	334968	0.021	0.015	0.161
	Ann N Y Acad Sci	52540	0.003	0.034	0.929	54224	0.216	0.035	0.000
	Nature	46138	-0.110	0.055	0.047	50625	0.017	0.044	0.709
	Science	41328	-0.122	0.054	0.025	45043	0.081	0.041	0.051

In [97]:

# Histogram of the per-journal "beta" column, one row of panels per journal
# category and, for each author position, a small-model panel next to a
# full-model panel.
fig, ax = plt.subplots(3, 4, sharex=True, sharey="row", figsize=(12, 9))
bins = [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3]
# Fixed row order (the one shown in the saved output). Iterating over a set()
# of the index values gives an arbitrary order that can change between runs.
journal_cats = ["Medicine", "Science", "Biology"]
# Move the journal level out of the index once, so rows can be selected by
# category label alone inside the loops.
small_model_stats = df_journal_stats_all.reset_index(level=1)
full_model_stats = df_journal_full_model_stats.reset_index(level=1)
for j, journal_cat in enumerate(journal_cats):
    print(journal_cat)
    for i, title in enumerate(["First", "Last"]):
        panels = [
            ("Small Model", small_model_stats, ax[j, 2*i]),
            ("Full Model", full_model_stats, ax[j, 2*i+1]),
        ]
        for model_label, stats, panel in panels:
            # Column selection followed by .loc replaces the deprecated .ix indexer.
            stats[(title, "beta")].loc[journal_cat].plot(
                kind="hist", bins=bins, ax=panel)
            panel.set_title("{} ({})".format(model_label, title))
            # Dashed reference line at beta == 0.
            panel.axvline(x=0, color="r", linestyle="--", lw=1)
    # Set after plotting so the category name is the label shown on the row.
    ax[j, 0].set_ylabel(journal_cat)
sns.despine(offset=10)
fig.tight_layout()

Medicine
Science
Biology

In [98]:

# Preview the two-colour palette (pure red, pure black) as RGB tuples.
sns.color_palette(palette=["#ff0000", "#000000"])

Out[98]:

[(1.0, 0.0, 0.0), (0.0, 0.0, 0.0)]

In [99]:

# Horizontal bar charts comparing the per-journal "beta" of the small and the
# full model: one row of panels per author position, one column per category.
with sns.color_palette(["#ff0000", "#000000"]):
    fig, ax = plt.subplots(2, 3, sharex=True, figsize=(15, 8))
    for i, title in enumerate(["First", "Last"]):
        beta_col = (title, "beta")
        for j, journal_name in enumerate(["Science", "Biology", "Medicine"]):
            # Column selection followed by .loc replaces the deprecated .ix indexer.
            small_betas = df_journal_stats_all[beta_col].loc[journal_name]
            big_betas = df_journal_full_model_stats[beta_col].loc[journal_name]
            # Explicit keys pin the column order (big, small) instead of leaving
            # it to how the installed pandas version orders dict keys.
            pd.concat(
                [big_betas, small_betas], axis=1, keys=["big", "small"]
            ).plot.barh(ax=ax[i, j])
            ax[i, j].set_ylabel(journal_name)
            # Thin dashed reference line at beta == 0.
            ax[i, j].axvline(x=0, linestyle="--", color="0.5", linewidth=0.5)
        # Title only the middle panel of each row with the author position.
        ax[i, 1].set_title(title)

    sns.despine(offset=10)
    fig.tight_layout()
    fig.savefig("Review_Figures/Journal_cat_betas_big_small.pdf", bbox_inches="tight")

Individual journal analysis¶

In [100]:

# Patsy formula for the per-journal self-citation logit model.
# The right-hand side is built from an explicit list of terms joined with
# " + ", so disabling a term cannot leave a dangling operator behind (the
# previous hand-concatenated string contained "+ +", which only parsed because
# patsy accepts a unary plus). Terms and their order are unchanged.
# NOTE(review): in the recorded "J Biol Chem" summaries the Intercept and
# source_is_eng[T.True] rows have identical coefficients and degenerate
# standard errors, which suggests source_is_eng is constant within a single
# journal - consider dropping it for per-journal fits (TODO confirm).
journal_model_formula = "is_self_cite ~ " + " + ".join([
    # auth_prev_papers: indicators for 0 and 1, plus log and squared log
    "I(auth_prev_papers == 0)",
    "I(auth_prev_papers == 1)",
    "np.log10(auth_prev_papers + 1)",
    "I(np.log10(auth_prev_papers + 1)**2)",
    # categorical terms: gender, source country, weighted eth1/eth2
    "C(gender, levels=['M', 'F'])",
    "C(source_country, levels=TOP_15_COUNTRIES)",
    "mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)",
    # source_ncites: log and squared log
    "np.log10(source_ncites)",
    "I(np.log10(source_ncites)**2)",
    # MeSH counts of source and sink
    "np.log10(source_n_mesh_ex + 1)",
    "np.log10(sink_n_mesh_ex + 1)",
    "I(sink_n_mesh_ex == 0)",
    # year_span: indicators for negative and zero, plus score and squared score
    "I(year_span < 0)",
    "I(year_span == 0)",
    "mf.score_log_1(year_span)",
    "I(mf.score_log_1(year_span)**2)",
    # sink_prev_ncites: zero indicator, log and squared log
    "I(sink_prev_ncites == 0)",
    "np.log10(sink_prev_ncites + 1)",
    "I(np.log10(sink_prev_ncites + 1)**2)",
    # jj_sim and the journal_same flag
    "I(jj_sim == 0)",
    "np.log10(jj_sim + 1)",
    "I(np.log10(jj_sim + 1)**2)",
    "journal_same",
    # source_is_* flags
    "source_is_eng",
    "source_is_journal",
    "source_is_review",
    "source_is_case_rep",
    "source_is_let_ed_com",
    # sink_is_* flags
    "sink_is_eng",
    "sink_is_journal",
    "sink_is_review",
    "sink_is_case_rep",
    "sink_is_let_ed_com",
    # V_novelty of source and sink (NaN mapped to 0 by np.nan_to_num before the log)
    "np.log10(np.nan_to_num(source_V_novelty) + 1)",
    "np.log10(np.nan_to_num(sink_V_novelty) + 1)",
    "I(np.log10(np.nan_to_num(sink_V_novelty) + 1)**2)",
    # match_prop: indicators for 0 and 1, plus linear and squared terms
    "I(match_prop == 0)",
    "I(match_prop == 1)",
    "match_prop",
    "I(match_prop**2)",
])

In [101]:

# Fit the journal-level formula separately on the first- and last-author data
# of each listed journal and display the model summary.
# NOTE(review): the recorded runs of this cell emit ConvergenceWarning and the
# summaries report "Converged: 0.0000"; maxiter=100 may be too low for lbfgs
# here (TODO confirm before relying on the coefficients).
for journal_name in ["J Biol Chem"]:
    for title, df_position in zip(["First", "Last"], [df_t_first, df_t_last]):
        print(title)
        # Work on a copy of the journal subset.
        df_journal = get_journal_data(df_position, journal_name).copy()
        try:
            # The first three rows are passed as the second data frame;
            # presumably model_predictions requires a test set and only the
            # fitted model is of interest here (TODO confirm).
            y_test, model = model_predictions(
                df_journal, journal_model_formula,
                df_journal.iloc[:3], verbose=False,
                model_params=dict(method='lbfgs', maxiter=100))
            display(model.summary2())
        except Exception as exc:
            # Keep going with the next data set, but report why the fit failed.
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print("Failed to fit: {!r}".format(exc))

First
('J Biol Chem', 171068)
Using class based MultiVal
using complimentary weights for 2 columns. w and 1-w
Using class based MultiVal
using complimentary weights for 2 columns. w and 1-w

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Using class based MultiVal
using complimentary weights for 2 columns. w and 1-w
Using class based MultiVal
using complimentary weights for 2 columns. w and 1-w

Model:	Logit	Pseudo R-squared:	0.234
Dependent Variable:	is_self_cite	AIC:	179540.6798
Date:	2017-08-17 12:10	BIC:	180294.7442
No. Observations:	676859	Log-Likelihood:	-89704.
Df Model:	65	LL-Null:	-1.1704e+05
Df Residuals:	676793	LLR p-value:	0.0000
Converged:	0.0000	Scale:	1.0000

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
Intercept	-1.1151	1014742.2815	-0.0000	1.0000	-1988859.4405	1988857.2103
I(auth_prev_papers == 0)[T.True]	-0.3155	0.3863	-0.8168	0.4140	-1.0726	0.4416
I(auth_prev_papers == 1)[T.True]	-0.0406	0.0391	-1.0381	0.2992	-0.1171	0.0360
C(gender, levels=['M', 'F'])[T.F]	-0.0968	0.0143	-6.7576	0.0000	-0.1249	-0.0688
C(source_country, levels=TOP_15_COUNTRIES)[T.UNKNOWN]	-0.1516	0.2209	-0.6863	0.4925	-0.5845	0.2813
C(source_country, levels=TOP_15_COUNTRIES)[T.UK]	-0.1262	0.0301	-4.1859	0.0000	-0.1853	-0.0671
C(source_country, levels=TOP_15_COUNTRIES)[T.JAPAN]	0.2444	0.0374	6.5406	0.0000	0.1712	0.3177
C(source_country, levels=TOP_15_COUNTRIES)[T.GERMANY]	0.1017	0.0331	3.0733	0.0021	0.0368	0.1665
C(source_country, levels=TOP_15_COUNTRIES)[T.FRANCE]	-0.0219	0.0387	-0.5650	0.5721	-0.0977	0.0540
C(source_country, levels=TOP_15_COUNTRIES)[T.ITALY]	0.1600	0.0554	2.8902	0.0038	0.0515	0.2685
C(source_country, levels=TOP_15_COUNTRIES)[T.CANADA]	0.0903	0.0314	2.8797	0.0040	0.0288	0.1518
C(source_country, levels=TOP_15_COUNTRIES)[T.CHINA]	-0.3567	0.2333	-1.5287	0.1263	-0.8141	0.1006
C(source_country, levels=TOP_15_COUNTRIES)[T.AUSTRALIA]	-0.0291	0.0478	-0.6078	0.5434	-0.1228	0.0647
C(source_country, levels=TOP_15_COUNTRIES)[T.SPAIN]	0.1367	0.0545	2.5102	0.0121	0.0300	0.2434
C(source_country, levels=TOP_15_COUNTRIES)[T.NETHERLANDS]	0.1273	0.0658	1.9354	0.0529	-0.0016	0.2562
C(source_country, levels=TOP_15_COUNTRIES)[T.SWEDEN]	0.0434	0.0543	0.7994	0.4241	-0.0630	0.1498
C(source_country, levels=TOP_15_COUNTRIES)[T.INDIA]	0.2383	0.1171	2.0358	0.0418	0.0089	0.4677
C(source_country, levels=TOP_15_COUNTRIES)[T.OTHER]	0.0448	0.0246	1.8203	0.0687	-0.0034	0.0930
I(sink_n_mesh_ex == 0)[T.True]	0.0896	0.1947	0.4603	0.6453	-0.2920	0.4712
I(year_span < 0)[T.True]	0.0594	1.0043	0.0591	0.9528	-1.9090	2.0278
I(year_span == 0)[T.True]	0.8778	0.0508	17.2810	0.0000	0.7782	0.9773
I(sink_prev_ncites == 0)[T.True]	0.0694	0.0356	1.9506	0.0511	-0.0003	0.1392
I(jj_sim == 0)[T.True]	-0.4495	0.7676	-0.5856	0.5582	-1.9539	1.0549
journal_same[T.True]	0.5355	0.0515	10.3910	0.0000	0.4345	0.6365
source_is_eng[T.True]	-1.1151	1014742.2815	-0.0000	1.0000	-1988859.4405	1988857.2103
source_is_journal[T.True]	-1.1548	0.5198	-2.2217	0.0263	-2.1735	-0.1360
source_is_review[T.True]	0.0277	0.0715	0.3872	0.6986	-0.1124	0.1677
source_is_case_rep[T.True]	-0.1183	0.3727	-0.3175	0.7509	-0.8489	0.6122
source_is_let_ed_com[T.True]	0.1008	0.7345	0.1372	0.8909	-1.3389	1.5404
sink_is_eng[T.True]	-0.6509	0.3638	-1.7890	0.0736	-1.3640	0.0622
sink_is_journal[T.True]	0.4820	0.1462	3.2963	0.0010	0.1954	0.7686
sink_is_review[T.True]	-1.0143	0.0296	-34.2150	0.0000	-1.0724	-0.9562
sink_is_case_rep[T.True]	-0.7960	0.1584	-5.0264	0.0000	-1.1063	-0.4856
sink_is_let_ed_com[T.True]	-0.7621	0.1487	-5.1242	0.0000	-1.0536	-0.4706
I(match_prop == 0)[T.True]	-0.2242	0.3861	-0.5806	0.5615	-0.9810	0.5326
I(match_prop == 1)[T.True]	0.0969	0.0389	2.4908	0.0127	0.0206	0.1731
np.log10(auth_prev_papers + 1)	0.4224	0.0791	5.3403	0.0000	0.2674	0.5775
I(np.log10(auth_prev_papers + 1) ** 2)	0.1131	0.0303	3.7324	0.0002	0.0537	0.1725
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[0]	-0.0599	0.0298	-2.0092	0.0445	-0.1183	-0.0015
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[1]	-0.1120	0.0350	-3.2019	0.0014	-0.1805	-0.0434
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[2]	0.0933	0.0311	3.0022	0.0027	0.0324	0.1543
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[3]	-0.2355	0.0361	-6.5291	0.0000	-0.3062	-0.1648
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[4]	-0.0574	0.0319	-1.8003	0.0718	-0.1198	0.0051
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[5]	-0.1281	0.0330	-3.8871	0.0001	-0.1928	-0.0635
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[6]	-0.1766	0.0488	-3.6194	0.0003	-0.2722	-0.0810
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[7]	-0.0676	0.0363	-1.8632	0.0624	-0.1387	0.0035
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[8]	-0.0178	0.0431	-0.4119	0.6804	-0.1023	0.0668
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[9]	-0.2396	0.0552	-4.3411	0.0000	-0.3478	-0.1314
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[10]	-0.0564	0.0588	-0.9600	0.3371	-0.1717	0.0588
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[11]	-0.1512	0.0515	-2.9346	0.0033	-0.2522	-0.0502
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[12]	-0.0210	0.1207	-0.1742	0.8617	-0.2576	0.2156
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[13]	0.0319	0.0376	0.8497	0.3955	-0.0417	0.1056
np.log10(source_ncites)	-1.0072	0.9217	-1.0927	0.2745	-2.8137	0.7994
I(np.log10(source_ncites) ** 2)	-0.2492	0.2990	-0.8333	0.4047	-0.8352	0.3369
np.log10(source_n_mesh_ex + 1)	0.7598	0.0475	15.9939	0.0000	0.6667	0.8529
np.log10(sink_n_mesh_ex + 1)	0.0052	0.0472	0.1100	0.9124	-0.0872	0.0976
mf.score_log_1(year_span)	2.9238	0.1599	18.2805	0.0000	2.6103	3.2372
I(mf.score_log_1(year_span) ** 2)	-2.7949	0.1094	-25.5502	0.0000	-3.0093	-2.5805
np.log10(sink_prev_ncites + 1)	-0.5054	0.0613	-8.2510	0.0000	-0.6255	-0.3854
I(np.log10(sink_prev_ncites + 1) ** 2)	-0.3242	0.0237	-13.6848	0.0000	-0.3707	-0.2778
np.log10(jj_sim + 1)	-0.4062	1.5699	-0.2587	0.7958	-3.4832	2.6708
I(np.log10(jj_sim + 1) ** 2)	0.0404	0.7995	0.0506	0.9597	-1.5265	1.6074
np.log10(np.nan_to_num(source_V_novelty) + 1)	-0.0181	0.0112	-1.6097	0.1075	-0.0401	0.0039
np.log10(np.nan_to_num(sink_V_novelty) + 1)	0.0351	0.0578	0.6083	0.5430	-0.0781	0.1483
I(np.log10(np.nan_to_num(sink_V_novelty) + 1) ** 2)	-0.0056	0.0106	-0.5307	0.5956	-0.0264	0.0151
match_prop	2.0059	0.2477	8.0978	0.0000	1.5204	2.4914
I(match_prop ** 2)	0.9533	0.1826	5.2216	0.0000	0.5955	1.3112

Last
('J Biol Chem', 171068)
Using class based MultiVal
using complimentary weights for 2 columns. w and 1-w
Using class based MultiVal
using complimentary weights for 2 columns. w and 1-w

/content/smishra8/SOFTWARE/anaconda2/lib/python2.7/site-packages/statsmodels/base/model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

Using class based MultiVal
using complimentary weights for 2 columns. w and 1-w
Using class based MultiVal
using complimentary weights for 2 columns. w and 1-w

Model:	Logit	Pseudo R-squared:	0.096
Dependent Variable:	is_self_cite	AIC:	554395.4400
Date:	2017-08-17 12:12	BIC:	555157.0251
No. Observations:	758553	Log-Likelihood:	-2.7713e+05
Df Model:	65	LL-Null:	-3.0642e+05
Df Residuals:	758487	LLR p-value:	0.0000
Converged:	0.0000	Scale:	1.0000

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
Intercept	-1.0288	nan	nan	nan	nan	nan
I(auth_prev_papers == 0)[T.True]	-0.2344	1.0262	-0.2284	0.8193	-2.2457	1.7768
I(auth_prev_papers == 1)[T.True]	-0.1873	0.1806	-1.0368	0.2998	-0.5413	0.1668
C(gender, levels=['M', 'F'])[T.F]	0.0079	0.0099	0.7957	0.4262	-0.0115	0.0273
C(source_country, levels=TOP_15_COUNTRIES)[T.UNKNOWN]	-0.0486	0.1286	-0.3780	0.7054	-0.3007	0.2034
C(source_country, levels=TOP_15_COUNTRIES)[T.UK]	-0.0829	0.0154	-5.3700	0.0000	-0.1132	-0.0526
C(source_country, levels=TOP_15_COUNTRIES)[T.JAPAN]	-0.2183	0.0331	-6.5893	0.0000	-0.2832	-0.1534
C(source_country, levels=TOP_15_COUNTRIES)[T.GERMANY]	-0.0776	0.0188	-4.1269	0.0000	-0.1145	-0.0408
C(source_country, levels=TOP_15_COUNTRIES)[T.FRANCE]	-0.2002	0.0229	-8.7309	0.0000	-0.2452	-0.1553
C(source_country, levels=TOP_15_COUNTRIES)[T.ITALY]	-0.2002	0.0314	-6.3652	0.0000	-0.2618	-0.1385
C(source_country, levels=TOP_15_COUNTRIES)[T.CANADA]	0.0199	0.0161	1.2323	0.2178	-0.0117	0.0514
C(source_country, levels=TOP_15_COUNTRIES)[T.CHINA]	-0.0940	0.0746	-1.2612	0.2072	-0.2402	0.0521
C(source_country, levels=TOP_15_COUNTRIES)[T.AUSTRALIA]	-0.1661	0.0269	-6.1854	0.0000	-0.2188	-0.1135
C(source_country, levels=TOP_15_COUNTRIES)[T.SPAIN]	-0.0066	0.0328	-0.2024	0.8396	-0.0709	0.0576
C(source_country, levels=TOP_15_COUNTRIES)[T.NETHERLANDS]	-0.0523	0.0391	-1.3355	0.1817	-0.1290	0.0244
C(source_country, levels=TOP_15_COUNTRIES)[T.SWEDEN]	-0.1117	0.0309	-3.6108	0.0003	-0.1724	-0.0511
C(source_country, levels=TOP_15_COUNTRIES)[T.INDIA]	-0.1204	0.0620	-1.9428	0.0520	-0.2419	0.0011
C(source_country, levels=TOP_15_COUNTRIES)[T.OTHER]	-0.0602	0.0140	-4.3050	0.0000	-0.0877	-0.0328
I(sink_n_mesh_ex == 0)[T.True]	-0.1592	0.1078	-1.4765	0.1398	-0.3704	0.0521
I(year_span < 0)[T.True]	0.0014	0.6576	0.0022	0.9983	-1.2875	1.2903
I(year_span == 0)[T.True]	0.5991	0.0289	20.7070	0.0000	0.5424	0.6558
I(sink_prev_ncites == 0)[T.True]	0.3229	0.0218	14.8031	0.0000	0.2802	0.3657
I(jj_sim == 0)[T.True]	-0.2231	0.2987	-0.7471	0.4550	-0.8086	0.3623
journal_same[T.True]	0.3037	0.0199	15.2433	0.0000	0.2647	0.3428
source_is_eng[T.True]	-1.0288	nan	nan	nan	nan	nan
source_is_journal[T.True]	-0.9845	0.2395	-4.1107	0.0000	-1.4540	-0.5151
source_is_review[T.True]	-0.1588	0.0568	-2.7943	0.0052	-0.2703	-0.0474
source_is_case_rep[T.True]	-0.0570	0.1963	-0.2905	0.7714	-0.4418	0.3278
source_is_let_ed_com[T.True]	0.0010	0.5815	0.0018	0.9986	-1.1388	1.1408
sink_is_eng[T.True]	-0.8783	0.1587	-5.5356	0.0000	-1.1892	-0.5673
sink_is_journal[T.True]	-0.2457	0.0684	-3.5913	0.0003	-0.3798	-0.1116
sink_is_review[T.True]	-0.5397	0.0136	-39.6901	0.0000	-0.5663	-0.5130
sink_is_case_rep[T.True]	-0.2957	0.0696	-4.2515	0.0000	-0.4320	-0.1594
sink_is_let_ed_com[T.True]	-0.6900	0.0731	-9.4360	0.0000	-0.8334	-0.5467
I(match_prop == 0)[T.True]	-0.2425	1.0063	-0.2409	0.8096	-2.2149	1.7299
I(match_prop == 1)[T.True]	0.1231	0.0129	9.5287	0.0000	0.0978	0.1485
np.log10(auth_prev_papers + 1)	1.0005	0.0679	14.7432	0.0000	0.8675	1.1335
I(np.log10(auth_prev_papers + 1) ** 2)	-0.1519	0.0182	-8.3469	0.0000	-0.1876	-0.1162
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[0]	0.0106	0.0148	0.7171	0.4733	-0.0184	0.0396
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[1]	-0.0934	0.0218	-4.2762	0.0000	-0.1362	-0.0506
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[2]	-0.0298	0.0243	-1.2243	0.2208	-0.0774	0.0179
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[3]	-0.0604	0.0321	-1.8825	0.0598	-0.1234	0.0025
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[4]	-0.1143	0.0264	-4.3291	0.0000	-0.1660	-0.0625
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[5]	-0.1421	0.0189	-7.5037	0.0000	-0.1792	-0.1049
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[6]	0.0717	0.0262	2.7336	0.0063	0.0203	0.1232
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[7]	-0.0863	0.0251	-3.4424	0.0006	-0.1354	-0.0372
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[8]	0.0081	0.0224	0.3626	0.7169	-0.0358	0.0520
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[9]	-0.2584	0.0404	-6.3895	0.0000	-0.3377	-0.1791
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[10]	-0.0958	0.0313	-3.0580	0.0022	-0.1572	-0.0344
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[11]	-0.1248	0.0425	-2.9376	0.0033	-0.2081	-0.0415
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[12]	-0.1557	0.0886	-1.7569	0.0789	-0.3293	0.0180
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[13]	-0.0404	0.0211	-1.9096	0.0562	-0.0818	0.0011
np.log10(source_ncites)	-0.8598	0.5269	-1.6319	0.1027	-1.8925	0.1728
I(np.log10(source_ncites) ** 2)	-0.1725	0.1699	-1.0153	0.3100	-0.5054	0.1605
np.log10(source_n_mesh_ex + 1)	0.3641	0.0248	14.6846	0.0000	0.3155	0.4127
np.log10(sink_n_mesh_ex + 1)	0.1979	0.0251	7.8960	0.0000	0.1488	0.2470
mf.score_log_1(year_span)	1.6899	0.0719	23.5173	0.0000	1.5490	1.8307
I(mf.score_log_1(year_span) ** 2)	-1.0177	0.0418	-24.3582	0.0000	-1.0995	-0.9358
np.log10(sink_prev_ncites + 1)	0.1455	0.0294	4.9526	0.0000	0.0879	0.2031
I(np.log10(sink_prev_ncites + 1) ** 2)	-0.3254	0.0094	-34.4898	0.0000	-0.3438	-0.3069
np.log10(jj_sim + 1)	-0.2896	0.5924	-0.4889	0.6249	-1.4506	0.8714
I(np.log10(jj_sim + 1) ** 2)	0.2417	0.2921	0.8274	0.4080	-0.3308	0.8142
np.log10(np.nan_to_num(source_V_novelty) + 1)	-0.0344	0.0060	-5.7730	0.0000	-0.0460	-0.0227
np.log10(np.nan_to_num(sink_V_novelty) + 1)	0.0829	0.0277	2.9870	0.0028	0.0285	0.1373
I(np.log10(np.nan_to_num(sink_V_novelty) + 1) ** 2)	-0.0073	0.0052	-1.4186	0.1560	-0.0175	0.0028
match_prop	0.7078	0.4721	1.4994	0.1338	-0.2174	1.6331
I(match_prop ** 2)	1.5720	0.2839	5.5364	0.0000	1.0155	2.1285

In [102]:

# Journals whose last-author p-value is below 0.05.
df_journal_full_model_stats.loc[
    df_journal_full_model_stats[("Last", "p-value")].lt(0.05)
]

Out[102]:

	Author Position	First				Last
		citations	beta	stderr	p-value	citations	beta	stderr	p-value
	Journal
Biology	J Bacteriol	102012	-0.037663	0.033765	2.646655e-01	109737	0.063241	0.022894	5.739561e-03
	Biochem J	91576	-0.152946	0.036076	2.239402e-05	97174	-0.079645	0.027055	3.241796e-03
	Mol Cell	33538	-0.105273	0.068364	1.235868e-01	39524	-0.176283	0.045367	1.020336e-04
	Cell	32399	-0.151300	0.076033	4.659994e-02	37042	-0.122344	0.050564	1.553732e-02
Medicine	J Clin Oncol	61722	0.077715	0.041989	6.419382e-02	62049	0.085109	0.041894	4.219990e-02
	J Am Coll Cardiol	50523	-0.068152	0.061966	2.714106e-01	52579	0.202210	0.057868	4.752258e-04
	J Urol	49314	0.363535	0.065951	3.543983e-08	50379	0.162318	0.064427	1.175524e-02
Science	Ann N Y Acad Sci	52540	0.003086	0.034389	9.285022e-01	54224	0.216270	0.035162	7.713824e-10

In [103]:

print("Journal with both first and last authorships having significant gender effect.")
# Keep only the journals where the first-author AND the last-author p-values
# are both below 0.05.
df_journal_full_model_stats.loc[
    df_journal_full_model_stats[("First", "p-value")].lt(0.05)
    & df_journal_full_model_stats[("Last", "p-value")].lt(0.05)
]

Journal with both first and last authorships having significant gender effect.

Out[103]:

	Author Position	First				Last
		citations	beta	stderr	p-value	citations	beta	stderr	p-value
	Journal
Biology	Biochem J	91576	-0.152946	0.036076	2.239402e-05	97174	-0.079645	0.027055	0.003242
Biology	Cell	32399	-0.151300	0.076033	4.659994e-02	37042	-0.122344	0.050564	0.015537
Medicine	J Urol	49314	0.363535	0.065951	3.543983e-08	50379	0.162318	0.064427	0.011755

In [ ]: