In [33]:

%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import random
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_classification

Read raw impressions¶

For examples in the book, this file (above) was approx 20 million observations and required a large server to process for testing. The test file is 1 million observations. Predictions will be relatively poor, especially for kNN, because of extreme sparseness and small sample size.¶

In [34]:

df = pd.DataFrame.from_csv('data/advertising.csv')

In [35]:

df.head()

Out[35]:

	pub_domain	user_id	viewed	ctr	click
1000	1000	1410160	True	0.463089	True
1000	1000	1157972	True	0.463089	False
1000	1000	1061623	True	0.463089	True
1001	1001	1169015	True	0.303266	True
1001	1001	1111880	True	0.303266	False

Publication level summaries¶

In [36]:

df_pub = df[['viewed', 'click', 'pub_domain']].groupby('pub_domain').aggregate(sum)
df_pub['nImps'] = df.pub_domain.value_counts()
df_pub.columns = [['vImps', 'clicks', 'nImps']]
df_pub['CTR'] = df_pub.clicks / df_pub.nImps
df_pub['reach'] = df[['pub_domain', 'user_id']].groupby('pub_domain').aggregate(lambda x: len(x.unique()))
df_pub['vReach'] = df[df.viewed == True][['pub_domain', 'user_id']].groupby('pub_domain').aggregate(lambda x: int(len(x.unique())))
df_pub['vRate'] = (df_pub.vImps / df_pub.nImps) * [random.random() for x in range(len(df_pub))] * df_pub.CTR
df_pub['reachRate'] = df_pub.reach / df_pub.nImps
df_pub['vReachRate'] = df_pub.vReach / df_pub.nImps
df_pub['meanViewTime'] = [60 * random.random() * x for x in df_pub.CTR]
df_pub['exposure'] = df_pub.vImps * df_pub.meanViewTime
df_pub = df_pub.fillna(0)

Cardinalities of imps, pubs, and users¶

In [37]:

nImps = len(df)
nPubs = len(df.pub_domain.unique())
nUsers = len(df.user_id.unique())
print('nImps={}\nnPubs={}\nnUsers={}'.format(nImps, nPubs, nUsers))

nImps=1000000
nPubs=3993
nUsers=377464

In [38]:

df2 = df[df.click == True]
nImps2 = len(df2)
nPubs2 = len(df2.pub_domain.unique())
nUsers2 = len(df2.user_id.unique())
print('nImps2={}\nnPubs2={}\nnUsers2={}'.format(nImps2, nPubs2, nUsers2))

nImps2=161334
nPubs2=3920
nUsers2=131535

Clicks¶

In [39]:

nClicks = df.click.value_counts()[True]
print('nClicks={} ({}%)'.format(nClicks, round(float(nClicks) * 100 / nImps, 2)))

nClicks=161334 (16.13%)

Views¶

In [40]:

nViews = df.viewed.value_counts()[True]
print('nViews={} ({}%)'.format(nViews, round(float(nViews) * 100 / nImps, 2)))

nViews=800306 (80.03%)

Size of user / item matrix¶

In [41]:

uimsz = (nPubs * nUsers) # all visits
uimsz2 = (nPubs2 * nUsers2) # clicks
print('uim, all imps: {}, clicks only: {}'. format(uimsz, uimsz2))

uim, all imps: 1507213752, clicks only: 515617200

In [42]:

f = df.groupby('pub_domain').size()

Distribution of impressions by pub_domain¶

In [43]:

sns.distplot(np.log10(f));

In [44]:

df.head()

Out[44]:

	pub_domain	user_id	viewed	ctr	click
1000	1000	1410160	True	0.463089	True
1000	1000	1157972	True	0.463089	False
1000	1000	1061623	True	0.463089	True
1001	1001	1169015	True	0.303266	True
1001	1001	1111880	True	0.303266	False

In [45]:

_ = plt.hist(df.pub_domain.value_counts(), bins=range(2,100,1))

split data into training and test subsets¶

In [46]:

df.set_index(df.pub_domain, inplace=True)
df5, df6 = train_test_split(df, test_size = 0.2)

substitute integer indices for user and pub symbolic keys¶

In [47]:

users = df5.user_id.unique()
pubs = df5.pub_domain.unique()

user_idx, pub_idx = {}, {}
for i in range(len(users)):
    user_idx[users[i]] = i
for i in range(len(pubs)):
    pub_idx[pubs[i]] = i

pidx = {b: a for a, b in pub_idx.items()}    # for reverse lookup pub_domain = pidx[integer_key]

SVD¶

create sparse matrix¶

In [48]:

def makeUIM(df):
    dframe = df.sort_values(by='user_id', axis=0)
    nUsers = len(user_idx)
    nPubs = len(dframe.pub_domain.unique())
    V = sp.lil_matrix((nUsers, nPubs))
    def matput(imp):
        try:
            if imp.viewed:
                V[user_idx[imp.user_id], pub_idx[imp.pub_domain]] = 1
        except:
            pass
    _ = dframe[dframe.click == True].apply(matput, axis=1)
    return V

In [49]:

df5 = df5.sort_values(by='user_id', axis=0)
V = makeUIM(df5)
VT = V.T
df6 = df6.sort_values(by='user_id', axis=0)
TT = makeUIM(df6).T

compute SVD¶

In [50]:

u, s, vt = svds(V, k = 400)

plot singular values¶

In [51]:

plt.plot(s[::-1])

Out[51]:

[<matplotlib.lines.Line2D at 0x114d42850>]

KNN¶

In [52]:

ts0 = df6.pub_domain.value_counts()
tsums = pd.DataFrame.from_dict({
        'pub_domain': ts0.index,
        'nImps': ts0,
        'clicks': df6.groupby('pub_domain')['click'].aggregate(sum)
    })
tsums.reset_index(inplace=True)
tsums['CTR'] = tsums.clicks / np.float64(tsums.nImps)

ts1  = df.pub_domain.value_counts()
pubsums = pd.DataFrame.from_dict({
        'pub_domain': ts1.index,
        'nImps': ts1,
        'clicks': df.groupby('pub_domain')['click'].aggregate(sum)
    })
pubsums['CTR'] = pubsums.clicks / np.float64(pubsums.nImps)

In [53]:

tt = TT.todense()
tt = np.asarray(tt)
plt.rcParams["figure.figsize"] = [12.0, 4.0]

kMin = 1
kMax = 10
weightFunctions = {
    'f1': lambda x: [1 for i in range(len(x))],
    'f2': lambda x: 1 / x,
    'f3': lambda x: 1 / x ** 2
}
for idx, f in enumerate(weightFunctions):
    print('weight function{}'.format(f))
    rmseL = []
    wf = weightFunctions[f]
    for nNeighbors in range(kMin, kMax):
        neigh = NearestNeighbors(nNeighbors)
        neigh.fit(VT)    
        act = pd.Series()
        pred = pd.Series()
        for i in range(TT.shape[0]):
            d = neigh.kneighbors(tt[i,:].reshape(1, -1), return_distance=True)
            W = pd.Series([v for v in d[0][0]])
            y = pd.Series(pubsums.iloc[d[1][0]].CTR)
            a0 = tsums.iloc[i].CTR
            try:
                p0 = np.average(y[y.values > 0], weights = wf(W[y.values > 0]) )
            except:
                p0 = 0
            act = act.append(pd.Series(a0))
            pred = pred.append(pd.Series(p0))
            exit
        rmse = ((act.sub(pred).pow(2).mean()) ** .5 / (act.max() - act.min())) * 100 
        rmseL.append(rmse)
        print('{}. {}'.format(nNeighbors, rmse))
    plt0 = plt.subplot(130+idx+1)
    plt0.plot(range(kMin, kMax ,1), rmseL)
    plt.tight_layout(pad=2.0)
    exit

weight functionf1
1. 16.0163810193
2. 13.1859662817
3. 12.9689393099
4. 16.680206999
5. 14.6384491895
6. 14.0660686046
7. 14.0650293768
8. 13.4824095333
9. 13.942187028
weight functionf2
1. 15.2772888351
2. 12.3886556128
3. 12.1874934405
4. 16.194856847
5. 14.0479046742
6. 13.4458243001
7. 13.4445755208
8. 12.8089387628
9. 13.3148983652
weight functionf3
1. 15.2772888351
2. 12.3886674674
3. 12.1874643295
4. 16.1946693787
5. 14.0477756095
6. 13.4457115246
7. 13.444468316
8. 12.8088199565
9. 13.3148077251

In [ ]:

Random Forest Regressor¶

In [54]:

df_pub

Out[54]:

	vImps	clicks	nImps	CTR	reach	vReach	vRate	reachRate	vReachRate	meanViewTime	exposure
pub_domain
1000	3.0	2.0	3	0.666667	3	3.0	0.275262	1.000000	1.000000	33.520006	100.560017
1001	12.0	3.0	13	0.230769	13	12.0	0.122078	1.000000	0.923077	3.529025	42.348299
1002	8.0	5.0	11	0.454545	11	8.0	0.118468	1.000000	0.727273	10.604967	84.839733
1003	23.0	0.0	26	0.000000	26	23.0	0.000000	1.000000	0.884615	0.000000	0.000000
1004	15.0	7.0	22	0.318182	22	15.0	0.020841	1.000000	0.681818	5.079108	76.186626
1005	30.0	10.0	40	0.250000	40	30.0	0.071003	1.000000	0.750000	1.627720	48.831589
1006	24.0	6.0	29	0.206897	29	24.0	0.084437	1.000000	0.827586	7.496946	179.926693
1007	28.0	8.0	34	0.235294	34	28.0	0.091125	1.000000	0.823529	4.873371	136.454401
1008	32.0	6.0	36	0.166667	36	32.0	0.090447	1.000000	0.888889	9.077761	290.488350
1009	42.0	2.0	58	0.034483	58	42.0	0.022988	1.000000	0.724138	0.627032	26.335348
1010	47.0	4.0	57	0.070175	57	47.0	0.004582	1.000000	0.824561	1.474816	69.316332
1011	41.0	2.0	51	0.039216	51	41.0	0.000835	1.000000	0.803922	1.736893	71.212603
1012	39.0	12.0	47	0.255319	47	39.0	0.058998	1.000000	0.829787	7.929435	309.247953
1013	51.0	10.0	63	0.158730	63	51.0	0.022737	1.000000	0.809524	3.495873	178.289529
1014	52.0	3.0	64	0.046875	64	52.0	0.036995	1.000000	0.812500	1.926381	100.171790
1015	51.0	14.0	66	0.212121	66	51.0	0.151345	1.000000	0.772727	6.375563	325.153704
1016	61.0	0.0	73	0.000000	73	61.0	0.000000	1.000000	0.835616	0.000000	0.000000
1017	70.0	28.0	88	0.318182	87	69.0	0.060775	0.988636	0.784091	2.140038	149.802684
1018	64.0	11.0	80	0.137500	80	64.0	0.076433	1.000000	0.800000	7.370584	471.717395
1019	80.0	13.0	103	0.126214	103	80.0	0.052028	1.000000	0.776699	1.041808	83.344671
1020	78.0	4.0	100	0.040000	100	78.0	0.015776	1.000000	0.780000	2.303285	179.656253
1021	88.0	29.0	106	0.273585	106	88.0	0.126739	1.000000	0.830189	2.302842	202.650140
1022	81.0	10.0	105	0.095238	105	81.0	0.004029	1.000000	0.771429	3.049524	247.011468
1023	98.0	10.0	119	0.084034	119	98.0	0.056959	1.000000	0.823529	4.639320	454.653313
1024	98.0	23.0	124	0.185484	124	98.0	0.033021	1.000000	0.790323	1.170400	114.699215
1025	90.0	50.0	119	0.420168	119	90.0	0.239107	1.000000	0.756303	3.018353	271.651810
1026	108.0	34.0	129	0.263566	129	108.0	0.093460	1.000000	0.837209	15.454402	1669.075431
1027	116.0	73.0	141	0.517730	141	116.0	0.211257	1.000000	0.822695	14.865386	1724.384743
1028	110.0	17.0	140	0.121429	140	110.0	0.024198	1.000000	0.785714	1.399765	153.974098
1029	120.0	13.0	152	0.085526	152	120.0	0.059837	1.000000	0.789474	0.058177	6.981241
...	...	...	...	...	...	...	...	...	...	...	...
4963	3.0	0.0	5	0.000000	5	3.0	0.000000	1.000000	0.600000	0.000000	0.000000
4964	3.0	1.0	4	0.250000	4	3.0	0.168539	1.000000	0.750000	2.949026	8.847079
4965	1.0	1.0	2	0.500000	2	1.0	0.004760	1.000000	0.500000	29.457369	29.457369
4966	2.0	1.0	4	0.250000	4	2.0	0.025950	1.000000	0.500000	9.397525	18.795049
4967	1.0	0.0	2	0.000000	2	1.0	0.000000	1.000000	0.500000	0.000000	0.000000
4968	5.0	1.0	5	0.200000	5	5.0	0.114428	1.000000	1.000000	0.044619	0.223097
4969	5.0	0.0	5	0.000000	5	5.0	0.000000	1.000000	1.000000	0.000000	0.000000
4970	1.0	0.0	1	0.000000	1	1.0	0.000000	1.000000	1.000000	0.000000	0.000000
4971	1.0	0.0	1	0.000000	1	1.0	0.000000	1.000000	1.000000	0.000000	0.000000
4972	5.0	0.0	6	0.000000	6	5.0	0.000000	1.000000	0.833333	0.000000	0.000000
4973	2.0	0.0	2	0.000000	2	2.0	0.000000	1.000000	1.000000	0.000000	0.000000
4974	1.0	2.0	2	1.000000	2	1.0	0.005310	1.000000	0.500000	52.390245	52.390245
4975	2.0	0.0	2	0.000000	2	2.0	0.000000	1.000000	1.000000	0.000000	0.000000
4976	2.0	0.0	2	0.000000	2	2.0	0.000000	1.000000	1.000000	0.000000	0.000000
4977	2.0	0.0	4	0.000000	4	2.0	0.000000	1.000000	0.500000	0.000000	0.000000
4978	4.0	0.0	5	0.000000	5	4.0	0.000000	1.000000	0.800000	0.000000	0.000000
4979	1.0	1.0	4	0.250000	4	1.0	0.017988	1.000000	0.250000	5.202126	5.202126
4980	2.0	2.0	2	1.000000	2	2.0	0.936463	1.000000	1.000000	59.367776	118.735551
4982	1.0	0.0	2	0.000000	2	1.0	0.000000	1.000000	0.500000	0.000000	0.000000
4983	1.0	0.0	1	0.000000	1	1.0	0.000000	1.000000	1.000000	0.000000	0.000000
4985	2.0	0.0	2	0.000000	2	2.0	0.000000	1.000000	1.000000	0.000000	0.000000
4986	1.0	0.0	1	0.000000	1	1.0	0.000000	1.000000	1.000000	0.000000	0.000000
4987	1.0	1.0	1	1.000000	1	1.0	0.246574	1.000000	1.000000	50.474694	50.474694
4988	1.0	0.0	2	0.000000	2	1.0	0.000000	1.000000	0.500000	0.000000	0.000000
4990	2.0	0.0	3	0.000000	3	2.0	0.000000	1.000000	0.666667	0.000000	0.000000
4991	1.0	0.0	1	0.000000	1	1.0	0.000000	1.000000	1.000000	0.000000	0.000000
4992	0.0	0.0	1	0.000000	1	0.0	0.000000	1.000000	0.000000	0.000000	0.000000
4993	0.0	0.0	1	0.000000	1	0.0	0.000000	1.000000	0.000000	0.000000	0.000000
4996	1.0	0.0	1	0.000000	1	1.0	0.000000	1.000000	1.000000	0.000000	0.000000
4999	1.0	0.0	1	0.000000	1	1.0	0.000000	1.000000	1.000000	0.000000	0.000000

3993 rows × 11 columns

In [55]:

from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
features = ['exposure', 'meanViewTime', 'nImps', 'reach', 'reachRate', 
                    'vImps', 'vRate', 'vReach', 'vReachRate']

X_train, X_test, y_train, y_test =  cross_validation.train_test_split(
    df_pub[features], df_pub.CTR, test_size=0.40, random_state=0)
reg = RandomForestRegressor(n_estimators=100, n_jobs=-1)
model = reg.fit(X_train, y_train)

In [56]:

scores = cross_validation.cross_val_score(model, X_train, y_train)
print(scores, scores.mean())

(array([ 0.6366208 ,  0.62995482,  0.69740879]), 0.65466146813267001)

In [57]:

model.score(X_test, y_test)

Out[57]:

0.7195525938702716

In [58]:

plt.rcParams["figure.figsize"] = [12.0, 4.0]
plt.bar(range(len(features)), model.feature_importances_, align='center')
_ = plt.xticks(range(len(features)), features)

In [ ]: