%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.api import Logit
import patsy
from scipy import stats
import pymc3 as pm
import model_functions as mf
MF reloaded
# Load the cached design matrix (X) and response vector (y) from HDF5.
# print() form works on both Python 2 and 3 (the original print statements
# are a SyntaxError under Python 3).
with pd.HDFStore("out/Model.v3.h5") as store:
    X = store["X"]
    print("Finished X")
    y = store["y"]
    print("Finished y")
Finished X Finished y
# Re-write X and y to the HDF5 cache in "table" format (on-disk queryable).
# mode="w" truncates the file, discarding any previous contents.
with pd.HDFStore("out/Model.v3.h5", mode="w") as store:
    store.put("X", X, format="table")
    print("Finished X")
    store.put("y", y, format="table")
    print("Finished y")
# Load the per-citation modelling data (one row per citation, first-author
# attributes attached) from the dated HDF5 store.
with pd.HDFStore('out/ModelData.20160529.h5') as cstore:
    df = cstore['first_author']
# Category level orderings used by the patsy formula below; with treatment
# coding the FIRST entry of each list is the reference (baseline) level.
GENDERS = ["-", "F", "M"]
TOP_15_ETHNICITIES = ["ENGLISH", "GERMAN", "HISPANIC", "CHINESE",
                      "JAPANESE", "SLAV", "FRENCH", "ITALIAN", "INDIAN",
                      "NORDIC", "ARAB", "DUTCH", "KOREAN", "UNKNOWN", "OTHER"]
TOP_15_COUNTRIES = ["USA", "UNKNOWN", "UK", "JAPAN", "GERMANY", "FRANCE", "ITALY",
                    "CANADA", "CHINA", "AUSTRALIA", "SPAIN", "NETHERLANDS",
                    "SWEDEN", "INDIA", "OTHER"]
# Global seaborn styling for every figure produced in this notebook.
sns.set_context("poster")
sns.set_style("ticks")
def logit(p):
    """Return the base-10 log-odds of probability p (numpy-vectorized)."""
    odds = p / (1 - p)
    return np.log10(odds)
def get_empirical_logit(df, X_col, y_col, conditioning_var="Gender",
                        bins=None, testing=False):
    """Bin df[X_col] and tabulate, per bin and per level of conditioning_var,
    the mean of y_col (an empirical proportion) together with the bin count.

    Parameters
    ----------
    df : DataFrame containing X_col, y_col and conditioning_var columns.
    X_col : name of the column to discretize with pd.cut.
    y_col : binary outcome column; its per-cell mean is the empirical p.
    conditioning_var : column whose levels become the table's columns.
    bins : monotonic bin edges for pd.cut. The result is re-indexed by the
        right edges (bins[1:]); this assumes every bin appears as a row of
        the pivot — an all-empty bin would make set_index fail on a length
        mismatch (TODO confirm against the data used).
    testing : if True, just echo the requested columns and return None.

    Returns
    -------
    DataFrame with an (aggfunc, level) column MultiIndex: "mean" holds the
    empirical proportions, "len" the per-cell counts.
    """
    if testing:
        # Testing mode: just echo the X and y column names.
        print("{0} {1}".format(X_col, y_col))
        return None
    # Not testing. Generate the proportions per category.
    print("{0} {1} {2}".format(X_col, conditioning_var, y_col))
    t = df.pivot_table(index=pd.cut(df[X_col], bins=bins),
                       columns=conditioning_var, values=y_col,
                       aggfunc=[np.mean, len])
    # Replace the interval index by the numeric right bin edges so callers
    # can filter/plot on the original scale.
    t = t.set_index(bins[1:])
    return t
# ---- Empirical logit plots, part 1: count/size-style covariates -----------
# One panel per covariate; per panel, one scatter series per gender level.
colors = ["r", "b", "k"]
markers = ["o", "^", "s"]
conditioning_names = ["F", "M", "-"]  # gender levels: female, male, unknown
cm = plt.cm.get_cmap('Greys')
plt.clf()
plt.close("all")
fig, ax = plt.subplots(2, 4, figsize=(20, 10))
ax = ax.flatten()
plot_id = 0
y_feature_col = "is_self_cite"
xcols = ["auth_prev_papers", "sink_prev_ncites",
         "source_n_mesh_ex", "sink_n_mesh_ex", "jj_sim",
         "year_span", "source_ncites", "source_n_authors"]
for k in xcols:
    # Per-feature binning and axis configuration; every k matches exactly
    # one branch. symlog is used where the data include zeros/negatives.
    if k in ["auth_prev_papers", "sink_prev_ncites", "jj_sim"]:
        bins = np.array(range(-1, 1000))
        xscale = "symlog"
        xlims = [-1, 100]
    if k in ["source_n_mesh_ex", "sink_n_mesh_ex"]:
        bins = np.array(range(-1, 100))
        xscale = "symlog"
        xlims = [-1, 100]
    if k in ["source_ncites"]:
        bins = np.array(range(-1, 100))
        xscale = "log"
        xlims = [0.8, 100]
    if k in ["source_n_authors"]:
        bins = np.array(range(-1, 100))
        xscale = "log"
        xlims = [1, 15]
    if k in ["year_span"]:
        # list(range(...)) so the list concatenation also works on
        # Python 3, where range is not a list.
        bins = np.array([-10, -1] + list(range(100)))
        xscale = "symlog"
        xlims = [-2, 100]
    print(k)
    t = get_empirical_logit(
        df, k, y_feature_col,
        conditioning_var="gender",
        bins=bins,
        testing=False)
    # Restrict to the displayed x-range, then clip extreme log-odds.
    t = t[(t.index >= xlims[0]) & (t.index <= xlims[1])]
    t_logit = logit(t["mean"]).clip(-4, 2)
    lgd_items = []
    for j, c in enumerate(conditioning_names):
        # Marker area encodes the bin count (scaled down by 1e4).
        scp = ax[plot_id].scatter(t_logit.index, t_logit[c],
                                  edgecolor=colors[j], s=t["len"][c] / 1e4,
                                  facecolor='none', alpha=0.5, linewidth=3,
                                  marker=markers[j], label=c)
        lgd_items.append(scp)
    ax[plot_id].set_xlabel(k)
    ax[plot_id].set_ylabel("$log_{10}(p/(1-p))$")
    ax[plot_id].set_title(k)
    ax[plot_id].set_xscale(xscale)
    ax[plot_id].set_xlim(xlims)
    plot_id += 1
plt.margins(0.1)
# Shared legend above the grid, using the handles from the last panel.
lgd = fig.legend(lgd_items, ["Female", "Male", "Unknown"],
                 loc='upper center',
                 bbox_to_anchor=(0.5, 1.1), ncol=3,
                 frameon=True, fancybox=True, prop={"size": 16})
fig.tight_layout()
plt.savefig("Empirical_1.pdf")
auth_prev_papers auth_prev_papers gender is_self_cite sink_prev_ncites sink_prev_ncites gender is_self_cite source_n_mesh_ex source_n_mesh_ex gender is_self_cite sink_n_mesh_ex sink_n_mesh_ex gender is_self_cite jj_sim jj_sim gender is_self_cite year_span year_span gender is_self_cite source_ncites source_ncites gender is_self_cite source_n_authors source_n_authors gender is_self_cite
# ---- Empirical logit plots, part 2: novelty covariates ---------------------
# Same layout as part 1: one panel per covariate, one series per gender.
colors = ["r", "b", "k"]
markers = ["o", "^", "s"]
conditioning_names = ["F", "M", "-"]  # gender levels: female, male, unknown
cm = plt.cm.get_cmap('Greys')
plt.clf()
plt.close("all")
fig, ax = plt.subplots(2, 4, figsize=(20, 10))
ax = ax.flatten()
plot_id = 0
y_feature_col = "is_self_cite"
xcols = ["source_T_novelty", "source_V_novelty",
         "source_PT_novelty", "source_PV_novelty",
         "sink_T_novelty", "sink_V_novelty",
         "sink_PT_novelty", "sink_PV_novelty"]
for k in xcols:
    # T-novelty features live on a small symlog scale; V-novelty features
    # span several orders of magnitude, hence the log axis.
    if k in ["source_T_novelty", "source_PT_novelty", "sink_T_novelty", "sink_PT_novelty"]:
        # list(range(...)) so the concatenation also works on Python 3,
        # where range is not a list.
        bins = np.array([-1] + list(range(0, 100)))
        xscale = "symlog"
        xlims = [-1, 100]
    if k in ["source_V_novelty", "source_PV_novelty", "sink_V_novelty", "sink_PV_novelty"]:
        bins = np.array(range(0, 100000))
        xscale = "log"
        xlims = [0.8, 1e5]
    print(k)
    t = get_empirical_logit(df, k, y_feature_col,
                            conditioning_var="gender",
                            bins=bins,
                            testing=False)
    # Restrict to the displayed x-range, then clip extreme log-odds.
    t = t[(t.index >= xlims[0]) & (t.index <= xlims[1])]
    t_logit = logit(t["mean"]).clip(-2, 0)
    lgd_items = []
    for j, c in enumerate(conditioning_names):
        # Marker area encodes the bin count (scaled down by 1e5).
        scp = ax[plot_id].scatter(t_logit.index, t_logit[c],
                                  edgecolor=colors[j], s=t["len"][c] / 1e5,
                                  facecolor='none', alpha=0.7, linewidth=3,
                                  marker=markers[j])
        lgd_items.append(scp)
    ax[plot_id].set_xlabel(k)
    ax[plot_id].set_ylabel("$log_{10}(p/(1-p))$")
    ax[plot_id].set_title(k)
    ax[plot_id].set_xscale(xscale)
    ax[plot_id].set_xlim(xlims)
    plot_id += 1
# Shared legend above the grid, using the handles from the last panel.
lgd = fig.legend(lgd_items, ["Female", "Male", "Unknown"],
                 loc='upper center',
                 bbox_to_anchor=(0.5, 1.1), ncol=3,
                 frameon=True, fancybox=True, prop={"size": 16})
fig.tight_layout()
plt.savefig("Empirical_2.pdf")
source_T_novelty source_T_novelty gender is_self_cite source_V_novelty source_V_novelty gender is_self_cite source_PT_novelty source_PT_novelty gender is_self_cite source_PV_novelty source_PV_novelty gender is_self_cite sink_T_novelty sink_T_novelty gender is_self_cite sink_V_novelty sink_V_novelty gender is_self_cite sink_PT_novelty sink_PT_novelty gender is_self_cite sink_PV_novelty sink_PV_novelty gender is_self_cite
# Patsy formula for the self-citation logistic regression. Indicator terms
# like I(x == 0) absorb point masses at zero so the paired log terms model
# the continuous part; squared log terms allow curvature. Categorical
# levels are pinned explicitly (first level = treatment-coding reference).
# mf.MC expands the two ethnicity columns into one weighted multi-category
# encoding (see model_functions).
formula = ("is_self_cite ~ I(auth_prev_papers == 0) + np.log10(auth_prev_papers + 1)"
           "+ C(gender, levels=GENDERS) + C(source_country, levels=TOP_15_COUNTRIES)"
           "+ mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)"
           "+ I(source_ncites == 1) + np.log10(source_ncites) + I(np.log10(source_ncites)**2)"
           "+ I(source_n_authors > 20) + np.log10(np.clip(source_n_authors, 0, 20))"
           "+ I(np.log10(np.clip(source_n_authors, 0, 20))**2)"
           "+ np.log10(source_n_mesh_ex + 1) + I(source_n_mesh_ex == 0)"
           "+ np.log10(sink_n_mesh_ex + 1) + I(sink_n_mesh_ex == 0)"
           "+ I(year_span < 0) + I(year_span == 0) + mf.score_log_1(year_span) + I(mf.score_log_1(year_span)**2)"
           "+ I(sink_prev_ncites == 0) + np.log10(sink_prev_ncites + 1) + I(np.log10(sink_prev_ncites + 1)**2)"
           "+ I(jj_sim == 0) + np.log10(jj_sim + 1) + I(np.log10(jj_sim + 1)**2) + journal_same"
           "+ source_is_eng + source_is_journal + source_is_review + source_is_case_rep + source_is_let_ed_com"
           "+ sink_is_eng + sink_is_journal + sink_is_review + sink_is_case_rep + sink_is_let_ed_com"
           "+ np.log10(np.nan_to_num(source_V_novelty) + 1)"
           "+ np.log10(np.nan_to_num(sink_V_novelty) + 1) + I(np.log10(np.nan_to_num(sink_V_novelty) + 1)**2)"
           )
# Materialize the response (y) and design matrix (X) as DataFrames.
y,X = patsy.dmatrices(formula, data=df, return_type="dataframe")
Using class based MultiVal using complimentary weights for 2 columns. w and 1-w Using class based MultiVal using complimentary weights for 2 columns. w and 1-w
X.columns, y.columns  # notebook echo: inspect the generated design-matrix / response columns
(Index([u'Intercept', u'I(auth_prev_papers == 0)[T.True]', u'C(gender, levels=GENDERS)[T.F]', u'C(gender, levels=GENDERS)[T.M]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.UNKNOWN]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.UK]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.JAPAN]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.GERMANY]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.FRANCE]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.ITALY]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.CANADA]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.CHINA]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.AUSTRALIA]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.SPAIN]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.NETHERLANDS]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.SWEDEN]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.INDIA]', u'C(source_country, levels=TOP_15_COUNTRIES)[T.OTHER]', u'I(source_ncites == 1)[T.True]', u'I(source_n_authors > 20)[T.True]', u'I(source_n_mesh_ex == 0)[T.True]', u'I(sink_n_mesh_ex == 0)[T.True]', u'I(year_span < 0)[T.True]', u'I(year_span == 0)[T.True]', u'I(sink_prev_ncites == 0)[T.True]', u'I(jj_sim == 0)[T.True]', u'journal_same[T.True]', u'source_is_eng[T.True]', u'source_is_journal[T.True]', u'source_is_review[T.True]', u'source_is_case_rep[T.True]', u'source_is_let_ed_com[T.True]', u'sink_is_eng[T.True]', u'sink_is_journal[T.True]', u'sink_is_review[T.True]', u'sink_is_case_rep[T.True]', u'sink_is_let_ed_com[T.True]', u'np.log10(auth_prev_papers + 1)', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[0]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[1]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[2]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[3]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[4]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[5]', u'mf.MC(eth1, eth2, 
weights=eth_weight, levels=TOP_15_ETHNICITIES)[6]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[7]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[8]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[9]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[10]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[11]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[12]', u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[13]', u'np.log10(source_ncites)', u'I(np.log10(source_ncites) ** 2)', u'np.log10(np.clip(source_n_authors, 0, 20))', u'I(np.log10(np.clip(source_n_authors, 0, 20)) ** 2)', u'np.log10(source_n_mesh_ex + 1)', u'np.log10(sink_n_mesh_ex + 1)', u'mf.score_log_1(year_span)', u'I(mf.score_log_1(year_span) ** 2)', u'np.log10(sink_prev_ncites + 1)', u'I(np.log10(sink_prev_ncites + 1) ** 2)', u'np.log10(jj_sim + 1)', u'I(np.log10(jj_sim + 1) ** 2)', u'np.log10(np.nan_to_num(source_V_novelty) + 1)', u'np.log10(np.nan_to_num(sink_V_novelty) + 1)', u'I(np.log10(np.nan_to_num(sink_V_novelty) + 1) ** 2)'], dtype='object'), Index([u'is_self_cite'], dtype='object'))
# Fit the logistic regression by maximum likelihood.
model = Logit(y,X)
res = model.fit()
res.summary2()  # notebook echo: coefficient table and fit statistics
Optimization terminated successfully. Current function value: 0.153249 Iterations 10
Model: | Logit | Pseudo R-squared: | 0.211 |
Dependent Variable: | is_self_cite | AIC: | 12756115.5917 |
Date: | 2016-05-31 17:30 | BIC: | 12757157.0432 |
No. Observations: | 41618369 | Log-Likelihood: | -6.3780e+06 |
Df Model: | 66 | LL-Null: | -8.0787e+06 |
Df Residuals: | 41618302 | LLR p-value: | 0.0000 |
Converged: | 1.0000 | Scale: | 1.0000 |
No. Iterations: | 10.0000 |
Coef. | Std.Err. | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -2.5006 | 0.0278 | -89.9202 | 0.0000 | -2.5551 | -2.4461 |
I(auth_prev_papers == 0)[T.True] | -1.5114 | 0.0080 | -189.0480 | 0.0000 | -1.5271 | -1.4958 |
C(gender, levels=GENDERS)[T.F] | -0.0208 | 0.0033 | -6.2308 | 0.0000 | -0.0273 | -0.0142 |
C(gender, levels=GENDERS)[T.M] | -0.0367 | 0.0031 | -11.9396 | 0.0000 | -0.0427 | -0.0307 |
C(source_country, levels=TOP_15_COUNTRIES)[T.UNKNOWN] | -0.0533 | 0.0064 | -8.2918 | 0.0000 | -0.0658 | -0.0407 |
C(source_country, levels=TOP_15_COUNTRIES)[T.UK] | -0.0066 | 0.0032 | -2.0288 | 0.0425 | -0.0129 | -0.0002 |
C(source_country, levels=TOP_15_COUNTRIES)[T.JAPAN] | 0.2510 | 0.0063 | 39.8045 | 0.0000 | 0.2387 | 0.2634 |
C(source_country, levels=TOP_15_COUNTRIES)[T.GERMANY] | 0.0211 | 0.0042 | 5.0863 | 0.0000 | 0.0130 | 0.0293 |
C(source_country, levels=TOP_15_COUNTRIES)[T.FRANCE] | -0.0478 | 0.0053 | -9.1071 | 0.0000 | -0.0581 | -0.0375 |
C(source_country, levels=TOP_15_COUNTRIES)[T.ITALY] | -0.1036 | 0.0065 | -15.8645 | 0.0000 | -0.1164 | -0.0908 |
C(source_country, levels=TOP_15_COUNTRIES)[T.CANADA] | -0.0072 | 0.0042 | -1.6913 | 0.0908 | -0.0155 | 0.0011 |
C(source_country, levels=TOP_15_COUNTRIES)[T.CHINA] | -0.4342 | 0.0099 | -43.7575 | 0.0000 | -0.4536 | -0.4148 |
C(source_country, levels=TOP_15_COUNTRIES)[T.AUSTRALIA] | 0.0881 | 0.0052 | 16.8647 | 0.0000 | 0.0778 | 0.0983 |
C(source_country, levels=TOP_15_COUNTRIES)[T.SPAIN] | -0.1234 | 0.0063 | -19.5337 | 0.0000 | -0.1358 | -0.1110 |
C(source_country, levels=TOP_15_COUNTRIES)[T.NETHERLANDS] | 0.0617 | 0.0071 | 8.6683 | 0.0000 | 0.0477 | 0.0756 |
C(source_country, levels=TOP_15_COUNTRIES)[T.SWEDEN] | 0.0952 | 0.0067 | 14.2377 | 0.0000 | 0.0821 | 0.1083 |
C(source_country, levels=TOP_15_COUNTRIES)[T.INDIA] | -0.0535 | 0.0104 | -5.1201 | 0.0000 | -0.0740 | -0.0330 |
C(source_country, levels=TOP_15_COUNTRIES)[T.OTHER] | -0.0687 | 0.0027 | -25.6879 | 0.0000 | -0.0740 | -0.0635 |
I(source_ncites == 1)[T.True] | 0.3013 | 0.0179 | 16.8312 | 0.0000 | 0.2662 | 0.3363 |
I(source_n_authors > 20)[T.True] | -0.0128 | 0.0146 | -0.8798 | 0.3790 | -0.0413 | 0.0157 |
I(source_n_mesh_ex == 0)[T.True] | -0.6410 | 0.0125 | -51.2535 | 0.0000 | -0.6655 | -0.6164 |
I(sink_n_mesh_ex == 0)[T.True] | -0.0932 | 0.0153 | -6.0842 | 0.0000 | -0.1232 | -0.0632 |
I(year_span < 0)[T.True] | -0.5816 | 0.0157 | -37.0900 | 0.0000 | -0.6123 | -0.5508 |
I(year_span == 0)[T.True] | 0.5378 | 0.0057 | 94.7343 | 0.0000 | 0.5266 | 0.5489 |
I(sink_prev_ncites == 0)[T.True] | 0.1263 | 0.0039 | 32.7241 | 0.0000 | 0.1187 | 0.1339 |
I(jj_sim == 0)[T.True] | -0.2875 | 0.0082 | -35.2631 | 0.0000 | -0.3034 | -0.2715 |
journal_same[T.True] | 0.4590 | 0.0029 | 157.1256 | 0.0000 | 0.4533 | 0.4647 |
source_is_eng[T.True] | 0.6517 | 0.0075 | 87.3818 | 0.0000 | 0.6371 | 0.6663 |
source_is_journal[T.True] | 0.2567 | 0.0110 | 23.3754 | 0.0000 | 0.2352 | 0.2783 |
source_is_review[T.True] | -0.0732 | 0.0026 | -28.2009 | 0.0000 | -0.0783 | -0.0682 |
source_is_case_rep[T.True] | -0.9432 | 0.0066 | -143.4855 | 0.0000 | -0.9560 | -0.9303 |
source_is_let_ed_com[T.True] | -0.4295 | 0.0108 | -39.8818 | 0.0000 | -0.4506 | -0.4083 |
sink_is_eng[T.True] | -0.1873 | 0.0097 | -19.3277 | 0.0000 | -0.2063 | -0.1683 |
sink_is_journal[T.True] | 0.3288 | 0.0114 | 28.7348 | 0.0000 | 0.3064 | 0.3512 |
sink_is_review[T.True] | -0.7832 | 0.0029 | -265.8463 | 0.0000 | -0.7889 | -0.7774 |
sink_is_case_rep[T.True] | -0.6397 | 0.0058 | -110.4725 | 0.0000 | -0.6510 | -0.6284 |
sink_is_let_ed_com[T.True] | -0.2845 | 0.0114 | -25.0195 | 0.0000 | -0.3068 | -0.2622 |
np.log10(auth_prev_papers + 1) | 1.3879 | 0.0015 | 908.6660 | 0.0000 | 1.3849 | 1.3909 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[0] | -0.0059 | 0.0037 | -1.5986 | 0.1099 | -0.0131 | 0.0013 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[1] | 0.0584 | 0.0042 | 13.8558 | 0.0000 | 0.0502 | 0.0667 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[2] | 0.0524 | 0.0041 | 12.8390 | 0.0000 | 0.0444 | 0.0603 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[3] | -0.0826 | 0.0062 | -13.2969 | 0.0000 | -0.0948 | -0.0704 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[4] | 0.0555 | 0.0045 | 12.2924 | 0.0000 | 0.0467 | 0.0644 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[5] | -0.0695 | 0.0044 | -15.7621 | 0.0000 | -0.0782 | -0.0609 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[6] | -0.0528 | 0.0060 | -8.7277 | 0.0000 | -0.0646 | -0.0409 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[7] | -0.0584 | 0.0052 | -11.1337 | 0.0000 | -0.0687 | -0.0481 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[8] | 0.1350 | 0.0048 | 27.8890 | 0.0000 | 0.1255 | 0.1445 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[9] | -0.0495 | 0.0059 | -8.4254 | 0.0000 | -0.0610 | -0.0380 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[10] | 0.0363 | 0.0066 | 5.5254 | 0.0000 | 0.0234 | 0.0492 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[11] | -0.1800 | 0.0072 | -24.8243 | 0.0000 | -0.1942 | -0.1658 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[12] | 0.0105 | 0.0181 | 0.5762 | 0.5645 | -0.0251 | 0.0460 |
mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[13] | -0.1745 | 0.0043 | -40.2929 | 0.0000 | -0.1830 | -0.1660 |
np.log10(source_ncites) | -0.1669 | 0.0116 | -14.4282 | 0.0000 | -0.1896 | -0.1442 |
I(np.log10(source_ncites) ** 2) | -0.2305 | 0.0038 | -60.6663 | 0.0000 | -0.2380 | -0.2231 |
np.log10(np.clip(source_n_authors, 0, 20)) | 0.0857 | 0.0177 | 4.8347 | 0.0000 | 0.0510 | 0.1204 |
I(np.log10(np.clip(source_n_authors, 0, 20)) ** 2) | -0.2557 | 0.0131 | -19.5537 | 0.0000 | -0.2813 | -0.2300 |
np.log10(source_n_mesh_ex + 1) | -0.1558 | 0.0055 | -28.4665 | 0.0000 | -0.1666 | -0.1451 |
np.log10(sink_n_mesh_ex + 1) | -0.1652 | 0.0055 | -29.8204 | 0.0000 | -0.1761 | -0.1543 |
mf.score_log_1(year_span) | 0.5424 | 0.0152 | 35.7017 | 0.0000 | 0.5126 | 0.5722 |
I(mf.score_log_1(year_span) ** 2) | -1.2861 | 0.0097 | -133.1328 | 0.0000 | -1.3051 | -1.2672 |
np.log10(sink_prev_ncites + 1) | -0.4439 | 0.0064 | -68.8330 | 0.0000 | -0.4565 | -0.4313 |
I(np.log10(sink_prev_ncites + 1) ** 2) | -0.1697 | 0.0025 | -67.9147 | 0.0000 | -0.1745 | -0.1648 |
np.log10(jj_sim + 1) | 0.0555 | 0.0100 | 5.5333 | 0.0000 | 0.0358 | 0.0751 |
I(np.log10(jj_sim + 1) ** 2) | 0.0071 | 0.0030 | 2.4154 | 0.0157 | 0.0013 | 0.0130 |
np.log10(np.nan_to_num(source_V_novelty) + 1) | -0.1075 | 0.0013 | -80.9294 | 0.0000 | -0.1101 | -0.1049 |
np.log10(np.nan_to_num(sink_V_novelty) + 1) | 0.0938 | 0.0076 | 12.2957 | 0.0000 | 0.0789 | 0.1088 |
I(np.log10(np.nan_to_num(sink_V_novelty) + 1) ** 2) | -0.0278 | 0.0013 | -21.3118 | 0.0000 | -0.0303 | -0.0252 |
# Cache the design matrix and response for later sessions; mode="w"
# truncates the file first. print() form works on both Python 2 and 3
# (the original print statements are a SyntaxError under Python 3).
with pd.HDFStore("out/Model.v3.h5", mode="w",) as store:
    store["X"] = X
    print("Finished X")
    store["y"] = y
    print("Finished y")
EMS reloaded Finished X Finished y
y_pred = res.predict()  # in-sample predicted probabilities of self-citation
y_pred[:10]  # notebook echo: sanity-check the first few predictions
array([ 0.32963518, 0.25792937, 0.03265883, 0.03094586, 0.03080083, 0.06432226, 0.02241476, 0.10864539, 0.02931361, 0.13297744])
# Grouping of design-matrix columns into named feature categories, keyed by
# category name. Column strings must match X.columns exactly (they are the
# patsy-generated term names), so X[v] selects the sub-matrix for a group.
feature_dict = {
    "Intercept": [u'Intercept',],
    "Gender": [
        u'C(gender, levels=GENDERS)[T.F]',
        u'C(gender, levels=GENDERS)[T.M]',
    ],
    "Affiliation": [
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.UNKNOWN]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.UK]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.JAPAN]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.GERMANY]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.FRANCE]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.ITALY]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.CANADA]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.CHINA]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.AUSTRALIA]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.SPAIN]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.NETHERLANDS]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.SWEDEN]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.INDIA]',
        u'C(source_country, levels=TOP_15_COUNTRIES)[T.OTHER]',
    ],
    "Ethnicity": [
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[0]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[1]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[2]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[3]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[4]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[5]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[6]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[7]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[8]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[9]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[10]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[11]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[12]',
        u'mf.MC(eth1, eth2, weights=eth_weight, levels=TOP_15_ETHNICITIES)[13]',
    ],
    "AuthorAge": [
        u'I(auth_prev_papers == 0)[T.True]',
        u'np.log10(auth_prev_papers + 1)',
    ],
    "SourceCites": [
        u'I(source_ncites == 1)[T.True]',
        u'np.log10(source_ncites)',
        u'I(np.log10(source_ncites) ** 2)',
    ],
    "SourceAuthors": [
        u'I(source_n_authors > 20)[T.True]',
        u'np.log10(np.clip(source_n_authors, 0, 20))',
        u'I(np.log10(np.clip(source_n_authors, 0, 20)) ** 2)',
    ],
    "MeshCounts": [
        u'I(source_n_mesh_ex == 0)[T.True]',
        u'I(sink_n_mesh_ex == 0)[T.True]',
        u'np.log10(source_n_mesh_ex + 1)',
        u'np.log10(sink_n_mesh_ex + 1)',
    ],
    "Journal": [
        u'journal_same[T.True]',
        u'I(jj_sim == 0)[T.True]',
        u'np.log10(jj_sim + 1)',
        u'I(np.log10(jj_sim + 1) ** 2)',
    ],
    "YearSpan": [
        u'I(year_span < 0)[T.True]',
        u'I(year_span == 0)[T.True]',
        u'mf.score_log_1(year_span)',
        u'I(mf.score_log_1(year_span) ** 2)',
    ],
    "SinkCites": [
        u'I(sink_prev_ncites == 0)[T.True]',
        u'np.log10(sink_prev_ncites + 1)',
        u'I(np.log10(sink_prev_ncites + 1) ** 2)',
    ],
    "PubType": [
        u'source_is_journal[T.True]',
        u'source_is_review[T.True]',
        u'source_is_case_rep[T.True]',
        u'source_is_let_ed_com[T.True]',
        u'sink_is_journal[T.True]',
        u'sink_is_review[T.True]',
        u'sink_is_case_rep[T.True]',
        u'sink_is_let_ed_com[T.True]',
    ],
    "Language": [
        u'source_is_eng[T.True]',
        u'sink_is_eng[T.True]',
    ],
    "VolumeNovelty": [
        u'np.log10(np.nan_to_num(source_V_novelty) + 1)',
        u'np.log10(np.nan_to_num(sink_V_novelty) + 1)',
        u'I(np.log10(np.nan_to_num(sink_V_novelty) + 1) ** 2)'
    ]
}
EMS reloaded
len(sum(feature_dict.values(), []))  # notebook echo: total columns covered by the grouping
67
X.columns.shape  # notebook echo: number of design-matrix columns, to compare with the grouped total
(67,)
# Report the sub-matrix shape for every feature group as a sanity check.
# Fixed: the original interpolated (k, len(v)) into a message that reads
# "with <count> factors in feature category <name>", swapping the two;
# also iteritems()/print statements are Python 2-only.
for k, v in feature_dict.items():
    print("Shape of factor matrix with %s factors in feature category %s: %s"
          % (len(v), k, X[v].shape))
Shape of factor matrix with SourceCites factors in feature category 3: (41618369, 3) Shape of factor matrix with Gender factors in feature category 2: (41618369, 2) Shape of factor matrix with AuthorAge factors in feature category 2: (41618369, 2) Shape of factor matrix with SourceAuthors factors in feature category 3: (41618369, 3) Shape of factor matrix with VolumeNovelty factors in feature category 3: (41618369, 3) Shape of factor matrix with Language factors in feature category 2: (41618369, 2) Shape of factor matrix with YearSpan factors in feature category 4: (41618369, 4) Shape of factor matrix with Journal factors in feature category 4: (41618369, 4) Shape of factor matrix with PubType factors in feature category 8: (41618369, 8) Shape of factor matrix with Affiliation factors in feature category 14: (41618369, 14) Shape of factor matrix with SinkCites factors in feature category 3: (41618369, 3) Shape of factor matrix with Intercept factors in feature category 1: (41618369, 1) Shape of factor matrix with MeshCounts factors in feature category 4: (41618369, 4) Shape of factor matrix with Ethnicity factors in feature category 14: (41618369, 14)
len(feature_dict)  # notebook echo: number of feature groups
14
import eval_measures as ems
def plot_prc(prc, ax, color="k", label="PRC"):
    """Draw a precision-recall curve (recall on x, precision on y) on *ax*."""
    precision_vals, recall_vals = prc
    ax.plot(recall_vals, precision_vals,
            marker="None", linestyle="-", color=color, label=label)
def get_all_eval_measures(res, endog, include_prc=False):
    """Collect classification and fit metrics for a fitted statsmodels result.

    Parameters
    ----------
    res : fitted results object; must expose predict(), llf, aic, bic,
        prsquared and df_model.
    endog : observed binary outcomes aligned with res.predict().
    include_prc : if True, also compute the (precision, recall) curve
        via ems.prc (more expensive).

    Returns
    -------
    dict of scalar measures (plus "prc" when include_prc is True).
    """
    predict = res.predict()
    measures = {}
    # Confusion-matrix based measures via the eval_measures helpers.
    pred_table = ems.cm(predict, endog)
    measures["precision"] = ems.precision(pred_table)
    measures["recall"] = ems.recall(pred_table)
    measures["accuracy"] = ems.accuracy(pred_table)
    measures["f_score"] = ems.fscore_measure(pred_table)
    measures["rmse"] = ems.rmse(predict, endog)
    measures["mae"] = ems.mae(predict, endog)
    measures["auc"] = ems.auc(predict, endog)
    # Likelihood-based fit statistics straight from the results object.
    measures["llf"] = res.llf
    measures["aic"] = res.aic
    measures["bic"] = res.bic
    measures["prsquared"] = res.prsquared
    measures["df_model"] = res.df_model
    tn, fp, fn, tp = map(float, pred_table.flatten())  # WRT to 1 as positive label
    measures["tn"] = tn
    measures["fn"] = fn
    measures["fp"] = fp
    measures["tp"] = tp
    print("In eval measures function.")
    if include_prc:
        # Include the precision recall values
        prc = ems.prc(predict, endog, float_precision=3)
        measures["prc"] = prc
    return measures
measures = get_all_eval_measures(res, model.endog, include_prc=False)  # in-sample evaluation; PRC omitted for speed
In eval measures function.
measures  # notebook echo: all computed evaluation measures
{'accuracy': 0.9507482621435741, 'aic': 12756115.591727437, 'auc': 0.85428998282162383, 'bic': 12757157.043224186, 'df_model': 66.0, 'f_score': 0.048553202570207, 'fn': 1966737.0, 'fp': 83040.0, 'llf': -6377990.7958637187, 'mae': 0.082279332242182854, 'precision': 0.38643869928550845, 'prsquared': 0.21051765454898874, 'recall': 0.02590392057999899, 'rmse': 0.20438018624692311, 'tn': 39516291.0, 'tp': 52301.0}
%load_ext autoreload
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
%aimport model_functions
%autoreload 2
EMS reloaded