Chapter 6 - NYC Taxi Full Example

http://chriswhong.com/open-data/foil_nyc_taxi/

http://www.andresmh.com/nyctaxitrips/

Potential models

  • Will the passenger tip the driver?
  • Will the trip take longer than average?
  • Will the fare be higher than average?
In [1]:
import pandas
%pylab inline
Populating the interactive namespace from numpy and matplotlib

Data exploration

In [2]:
N = 5e6
data = pandas.read_csv("trip_data_1.csv", nrows=N)
In [3]:
fare_data = pandas.read_csv("trip_fare_1.csv", nrows=N)
fare_cols = [u' payment_type', u' fare_amount', u' surcharge', u' mta_tax', u' tip_amount', u' tolls_amount', u' total_amount']
data = data.join(fare_data[fare_cols])
del fare_data
data[:10]
Out[3]:
medallion hack_license vendor_id rate_code store_and_fwd_flag pickup_datetime dropoff_datetime passenger_count trip_time_in_secs trip_distance ... pickup_latitude dropoff_longitude dropoff_latitude payment_type fare_amount surcharge mta_tax tip_amount tolls_amount total_amount
0 89D227B655E5C82AECF13C3F540D4CF4 BA96DE419E711691B9445D6A6307C170 CMT 1 N 2013-01-01 15:11:48 2013-01-01 15:18:10 4 382 1.0 ... 40.757977 -73.989838 40.751171 CSH 6.5 0.0 0.5 0 0.0 7.0
1 0BD7C8F5BA12B88E0B67BED28BEA73D8 9FD8F69F0804BDB5549F40E9DA1BE472 CMT 1 N 2013-01-06 00:18:35 2013-01-06 00:22:54 1 259 1.5 ... 40.731781 -73.994499 40.750660 CSH 6.0 0.5 0.5 0 0.0 7.0
2 0BD7C8F5BA12B88E0B67BED28BEA73D8 9FD8F69F0804BDB5549F40E9DA1BE472 CMT 1 N 2013-01-05 18:49:41 2013-01-05 18:54:23 1 282 1.1 ... 40.737770 -74.009834 40.726002 CSH 5.5 1.0 0.5 0 0.0 7.0
3 DFD2202EE08F7A8DC9A57B02ACB81FE2 51EE87E3205C985EF8431D850C786310 CMT 1 N 2013-01-07 23:54:15 2013-01-07 23:58:20 2 244 0.7 ... 40.759945 -73.984734 40.759388 CSH 5.0 0.5 0.5 0 0.0 6.0
4 DFD2202EE08F7A8DC9A57B02ACB81FE2 51EE87E3205C985EF8431D850C786310 CMT 1 N 2013-01-07 23:25:03 2013-01-07 23:34:24 1 560 2.1 ... 40.748528 -74.002586 40.747868 CSH 9.5 0.5 0.5 0 0.0 10.5
5 20D9ECB2CA0767CF7A01564DF2844A3E 598CCE5B9C1918568DEE71F43CF26CD2 CMT 1 N 2013-01-07 15:27:48 2013-01-07 15:38:37 1 648 1.7 ... 40.764252 -73.983322 40.743763 CSH 9.5 0.0 0.5 0 0.0 10.0
6 496644932DF3932605C22C7926FF0FE0 513189AD756FF14FE670D10B92FAF04C CMT 1 N 2013-01-08 11:01:15 2013-01-08 11:08:14 1 418 0.8 ... 40.743977 -74.007416 40.744343 CSH 6.0 0.0 0.5 0 0.0 6.5
7 0B57B9633A2FECD3D3B1944AFC7471CF CCD4367B417ED6634D986F573A552A62 CMT 1 N 2013-01-07 12:39:18 2013-01-07 13:10:56 3 1898 10.7 ... 40.756775 -73.865250 40.770630 CSH 34.0 0.0 0.5 0 4.8 39.3
8 2C0E91FF20A856C891483ED63589F982 1DA2F6543A62B8ED934771661A9D2FA0 CMT 1 N 2013-01-07 18:15:47 2013-01-07 18:20:47 1 299 0.8 ... 40.743137 -73.982712 40.735336 CSH 5.5 1.0 0.5 0 0.0 7.0
9 2D4B95E2FA7B2E85118EC5CA4570FA58 CD2F522EEE1FF5F5A8D8B679E23576B3 CMT 1 N 2013-01-07 15:33:28 2013-01-07 15:49:26 2 957 2.5 ... 40.786983 -73.952919 40.806370 CSH 13.0 0.0 0.5 0 0.0 13.5

10 rows × 21 columns

In [46]:
data.ix[:5, data.columns[:5]]
Out[46]:
medallion hack_license vendor_id rate_code store_and_fwd_flag
0 CD847FE5884F10A28217E9FBA11B275B 5FEFD00D9773268B72EE4E879852F190 CMT 1 N
1 20D9ECB2CA0767CF7A01564DF2844A3E 598CCE5B9C1918568DEE71F43CF26CD2 CMT 1 N
2 A954A71B6D44265AE756BF807E069396 D5CA7D478A14BA3BBFC20153C5C88B1A CMT 1 N
3 F6F7D02179BE915B23EF2DB57836442D 088879B44B80CC9ED43724776C539370 VTS 1 0
4 BE386D8524FCD16B3727DCF0A32D9B25 4EB96EC9F3A42794DEE233EC8A2616CE VTS 1 0
5 E9FF471F36A91031FE5B6D6228674089 72E0B04464AD6513F6A613AABB04E701 VTS 1 0
In [5]:
data.plot(x="trip_time_in_secs", y=" total_amount", kind="scatter", s=2)
xlim(0,1e4)
ylim(0,300)
Out[5]:
(0, 300)
In [44]:
ind = where(logical_and(data.trip_time_in_secs < 500, data[' total_amount'] > 30))[0]
data = data.drop(ind)
In [218]:
data[logical_and(data.dropoff_latitude > 40.6,data.dropoff_latitude < 40.9)].dropoff_latitude.hist(bins=20);
In [231]:
data[logical_and(data.dropoff_longitude > -74.05,data.dropoff_longitude < -73.9)].dropoff_longitude.hist(bins=20);
In [3]:
data.vendor_id.value_counts().plot(kind="bar");
In [4]:
data.rate_code.value_counts().plot(kind="bar", logy=True, ylim=(1,1e8));
In [5]:
data.store_and_fwd_flag.value_counts().plot(kind="bar");
In [16]:
data.passenger_count.value_counts().plot(kind="bar");
In [19]:
data.trip_time_in_secs[data.trip_time_in_secs < 4000].hist(bins=30);
In [21]:
data.trip_distance[data.trip_distance < 22].hist(bins=30);
In [6]:
data[' payment_type'].value_counts().plot(kind="bar", logy=True, ylim=(1,1e8));
In [6]:
data.plot(x="trip_time_in_secs", y="trip_distance", kind="scatter", s=2)
xlim(0,5000)
ylim(0,40)
Out[6]:
(0, 40)
In [9]:
figure(figsize=(16,8))
plot(data["pickup_latitude"], data["pickup_longitude"], 'b,')
xlim(40.6, 40.9)
ylim(-74.05, -73.9)
Out[9]:
(-74.05, -73.9)
In [6]:
data[data[' tip_amount'] < 15][' tip_amount'].hist(bins=30);
In [4]:
len(data)
data = data[data[' payment_type'] != "CSH"]
data.reset_index(inplace=True, drop=True)
len(data)
Out[4]:
3633518

Building first model

In [5]:
# Setup target
data['tipped'] = (data[' tip_amount'] > 0).astype("int")
data['tipped'].value_counts()
Out[5]:
1    3538547
0      94971
dtype: int64
In [6]:
feats1 = [u'rate_code', 'passenger_count', u'trip_time_in_secs', u'trip_distance', u'pickup_longitude', u'pickup_latitude', u'dropoff_longitude', u'dropoff_latitude', ' fare_amount', u' surcharge', u' mta_tax', ' tolls_amount']
In [7]:
M = len(data)
rand_idx = arange(M)
random.shuffle(rand_idx)
train_idx = rand_idx[int(M*0.2):]
test_idx = rand_idx[:int(M*0.2)]
In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
In [15]:
sc = StandardScaler()
data_scaled = sc.fit_transform(data[feats1])
data_scaled[train_idx.tolist(),:].shape
Out[15]:
(2906815, 12)
In [27]:
sgd = SGDClassifier(loss="modified_huber")
sgd.fit(data.ix[train_idx,feats1], data['tipped'].ix[train_idx])
Out[27]:
SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='modified_huber', n_iter=5, n_jobs=1, penalty='l2',
       power_t=0.5, random_state=None, shuffle=False, verbose=0,
       warm_start=False)
In [28]:
preds = sgd.predict_proba(data.ix[test_idx,feats1])
In [29]:
fpr, tpr, thr = roc_curve(data['tipped'].ix[test_idx], preds[:,1])
auc = roc_auc_score(data['tipped'].ix[test_idx], preds[:,1])
In [30]:
auc
Out[30]:
0.5053715706684706
In [41]:
plot(fpr,tpr)
plot(fpr,fpr)
xlabel("False positive rate")
ylabel("True positive rate")
Out[41]:
<matplotlib.text.Text at 0x7f91eb2fb150>

Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
In [10]:
data.fillna(0, inplace=True)
count_nonzero(pandas.isnull(data.ix[train_idx,feats1]))
Out[10]:
0
In [33]:
rf1 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf1.fit(data.ix[train_idx,feats1], data['tipped'].ix[train_idx])
Out[33]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0)
In [34]:
preds1 = rf1.predict_proba(data.ix[test_idx,feats1])
In [21]:
from sklearn.metrics import roc_curve, roc_auc_score
In [36]:
fpr1, tpr1, thr1 = roc_curve(data['tipped'].ix[test_idx], preds1[:,1])
auc1 = roc_auc_score(data['tipped'].ix[test_idx], preds1[:,1])
In [ ]:
print auc1
rf1.score(data.ix[test_idx,feats1], data.ix[test_idx,'tipped'])
In [42]:
plot(fpr1,tpr1)
plot(fpr1,fpr1)
xlabel("False positive rate")
ylabel("True positive rate")
Out[42]:
<matplotlib.text.Text at 0x7f91f0b76e90>
In [44]:
fi = zip(feats1, rf1.feature_importances_)
fi.sort(key=lambda x: -x[1])
pandas.DataFrame(fi, columns=["Feature","Importance"])
Out[44]:
Feature Importance
0 dropoff_latitude 0.165411
1 dropoff_longitude 0.163337
2 pickup_latitude 0.163068
3 pickup_longitude 0.160285
4 trip_time_in_secs 0.122214
5 trip_distance 0.112020
6 fare_amount 0.067795
7 passenger_count 0.017850
8 surcharge 0.014259
9 rate_code 0.006974
10 tolls_amount 0.004067
11 mta_tax 0.002720

Features 2

In [45]:
data['trip_time_in_secs'][data['trip_time_in_secs'] < 1e-3] = -1
data['speed'] = data['trip_distance'] / data['trip_time_in_secs']
-c:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [46]:
feats2 = feats1 + ['speed']
feats2.remove('trip_time_in_secs')
In [ ]:
rf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf2.fit(data.ix[train_idx,feats2], data['tipped'].ix[train_idx])
In [ ]:
preds2 = rf2.predict_proba(data.ix[test_idx,feats2])
In [ ]:
fpr2, tpr2, thr2 = roc_curve(data['tipped'].ix[test_idx], preds2[:,1])
auc2 = roc_auc_score(data['tipped'].ix[test_idx], preds2[:,1])
In [ ]:
print auc2
plot(fpr2,tpr2)
plot(fpr2,fpr2)
In [ ]:
fi2 = zip(feats2, rf2.feature_importances_)
fi2.sort(key=lambda x: x[1])
fi2

Features 3

In [11]:
feats3 = feats1
In [12]:
feats3
Out[12]:
[u'rate_code',
 'passenger_count',
 u'trip_time_in_secs',
 u'trip_distance',
 u'pickup_longitude',
 u'pickup_latitude',
 u'dropoff_longitude',
 u'dropoff_latitude',
 ' fare_amount',
 u' surcharge',
 u' mta_tax',
 ' tolls_amount']
In [13]:
from sklearn.feature_extraction import DictVectorizer
In [14]:
def cat_to_num(data):
    categories = unique(data)
    features = {}
    for cat in categories:
        binary = (data == cat)
        features["%s:%s"%(data.name, cat)] = binary.astype("int")
    return pandas.DataFrame(features)
In [15]:
payment_type_cats = cat_to_num(data[' payment_type'])
vendor_id_cats = cat_to_num(data['vendor_id'])
store_and_fwd_flag_cats = cat_to_num(data['store_and_fwd_flag'])
rate_code = cat_to_num(data['rate_code'])
In [16]:
data = data.join(payment_type_cats)
feats3 += payment_type_cats.columns
data = data.join(vendor_id_cats)
feats3 += vendor_id_cats.columns
data = data.join(store_and_fwd_flag_cats)
feats3 += store_and_fwd_flag_cats.columns
data = data.join(rate_code)
feats3 += rate_code.columns
In [17]:
feats3
Out[17]:
[u'rate_code',
 'passenger_count',
 u'trip_time_in_secs',
 u'trip_distance',
 u'pickup_longitude',
 u'pickup_latitude',
 u'dropoff_longitude',
 u'dropoff_latitude',
 ' fare_amount',
 u' surcharge',
 u' mta_tax',
 ' tolls_amount',
 ' payment_type:CRD',
 ' payment_type:DIS',
 ' payment_type:NOC',
 ' payment_type:UNK',
 'vendor_id:CMT',
 'vendor_id:VTS',
 'store_and_fwd_flag:0',
 'store_and_fwd_flag:N',
 'store_and_fwd_flag:Y',
 'rate_code:0',
 'rate_code:1',
 'rate_code:2',
 'rate_code:210',
 'rate_code:28',
 'rate_code:3',
 'rate_code:4',
 'rate_code:5',
 'rate_code:6',
 'rate_code:8']
In [18]:
rf3 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf3.fit(data.ix[train_idx,feats3], data['tipped'].ix[train_idx])
Out[18]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0)
In [ ]:
rf3.score(data.ix[test_idx,feats3], data.ix[test_idx,'tipped'])
In [23]:
fpr3, tpr3, thr3 = roc_curve(data['tipped'].ix[test_idx], preds3[:,1])
auc3 = roc_auc_score(data['tipped'].ix[test_idx], preds3[:,1])
print auc3
plot(fpr3,tpr3)
plot(fpr3,fpr3)
xlabel("False positive rate")
ylabel("True positive rate")
0.655894612736
Out[23]:
<matplotlib.text.Text at 0x7f61abdf8150>
In [25]:
fi3 = zip(feats3, rf3.feature_importances_)
fi3.sort(key=lambda x: -x[1])
pandas.DataFrame(fi3, columns=["Feature","Importance"])
Out[25]:
Feature Importance
0 dropoff_latitude 0.163023
1 pickup_latitude 0.161114
2 dropoff_longitude 0.160988
3 pickup_longitude 0.158672
4 trip_time_in_secs 0.111172
5 trip_distance 0.106693
6 fare_amount 0.067567
7 passenger_count 0.019286
8 surcharge 0.010330
9 payment_type:NOC 0.008361
10 payment_type:CRD 0.008247
11 rate_code 0.004074
12 tolls_amount 0.003869
13 rate_code:5 0.003375
14 payment_type:DIS 0.003146
15 mta_tax 0.001742
16 vendor_id:VTS 0.001604
17 store_and_fwd_flag:N 0.001587
18 vendor_id:CMT 0.001350
19 store_and_fwd_flag:0 0.001072
20 rate_code:1 0.000757
21 payment_type:UNK 0.000625
22 rate_code:2 0.000494
23 store_and_fwd_flag:Y 0.000432
24 rate_code:3 0.000184
25 rate_code:0 0.000127
26 rate_code:4 0.000108
27 rate_code:6 0.000002
28 rate_code:8 0.000001
29 rate_code:210 0.000000
30 rate_code:28 0.000000

Features 4

In [26]:
feats4 = feats3
In [27]:
# Datetime features (hour of day, day of week, week of year)
pickup = pandas.to_datetime(data['pickup_datetime'])
dropoff = pandas.to_datetime(data['dropoff_datetime'])
data['pickup_hour'] = pickup.apply(lambda e: e.hour)
data['pickup_day'] = pickup.apply(lambda e: e.dayofweek)
#data['pickup_week'] = pickup.apply(lambda e: e.week)
data['dropoff_hour'] = dropoff.apply(lambda e: e.hour)
data['dropoff_day'] = dropoff.apply(lambda e: e.dayofweek)
#data['dropoff_week'] = dropoff.apply(lambda e: e.week)
In [28]:
feats4 += ['pickup_hour', 'pickup_day', 'dropoff_hour', 'dropoff_day']
In [15]:
feats4
Out[15]:
[u'rate_code',
 'passenger_count',
 u'trip_time_in_secs',
 u'trip_distance',
 u'pickup_longitude',
 u'pickup_latitude',
 u'dropoff_longitude',
 u'dropoff_latitude',
 ' fare_amount',
 u' surcharge',
 u' mta_tax',
 ' tolls_amount',
 'pickup_hour',
 'pickup_day',
 'pickup_week',
 'dropoff_hour',
 'dropoff_day',
 'dropoff_week']
In [29]:
rf4 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf4.fit(data.ix[train_idx,feats4], data['tipped'].ix[train_idx])
Out[29]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0)
In [33]:
preds4 = rf4.predict_proba(data.ix[test_idx,feats4])
rf4.score(data.ix[test_idx,feats4], data.ix[test_idx,'tipped'])
Out[33]:
0.97507922768999167
In [31]:
fpr4, tpr4, thr4 = roc_curve(data['tipped'].ix[test_idx], preds4[:,1])
auc4 = roc_auc_score(data['tipped'].ix[test_idx], preds4[:,1])
print auc4
figure(figsize=(14,8))
plot(fpr4,tpr4, "g-", linewidth=3)
plot(fpr4,fpr4, "k-", linewidth=1)
xlabel("False positive rate")
ylabel("True positive rate")
0.662020800456
Out[31]:
<matplotlib.text.Text at 0x7f61add7f410>
In [32]:
fi4 = zip(feats4, rf4.feature_importances_)
fi4.sort(key=lambda x: -x[1])
pandas.DataFrame(fi4, columns=["Feature","Importance"])
Out[32]:
Feature Importance
0 dropoff_latitude 1.342966e-01
1 dropoff_longitude 1.332890e-01
2 pickup_latitude 1.326898e-01
3 pickup_longitude 1.314082e-01
4 trip_time_in_secs 1.000610e-01
5 trip_distance 9.924300e-02
6 fare_amount 7.099682e-02
7 dropoff_hour 3.942205e-02
8 pickup_hour 3.924598e-02
9 pickup_day 2.417833e-02
10 dropoff_day 2.413835e-02
11 passenger_count 2.300375e-02
12 payment_type:CRD 9.137798e-03
13 payment_type:NOC 8.063106e-03
14 surcharge 5.429348e-03
15 rate_code 4.786564e-03
16 tolls_amount 4.155098e-03
17 rate_code:5 2.718071e-03
18 payment_type:DIS 2.717370e-03
19 mta_tax 2.096613e-03
20 vendor_id:CMT 1.639687e-03
21 store_and_fwd_flag:N 1.561414e-03
22 store_and_fwd_flag:0 1.335843e-03
23 vendor_id:VTS 1.149871e-03
24 rate_code:1 8.534049e-04
25 payment_type:UNK 8.062925e-04
26 store_and_fwd_flag:Y 6.048130e-04
27 rate_code:2 5.845484e-04
28 rate_code:4 1.482718e-04
29 rate_code:3 1.346250e-04
30 rate_code:0 1.015749e-04
31 rate_code:6 1.928360e-06
32 rate_code:8 8.636844e-07
33 rate_code:28 3.379825e-08
34 rate_code:210 9.253761e-09
In [59]:
data.ix[data[' payment_type:CSH'] == 1,'tipped'].value_counts()
Out[59]:
0    987113
1        21
dtype: int64
In [242]:
figure(figsize=(16,8))
plot(data[data['tipped'] == True]["dropoff_latitude"], data[data['tipped'] == True]["dropoff_longitude"], 'b,')
plot(data[data['tipped'] == False]["dropoff_latitude"], data[data['tipped'] == False]["dropoff_longitude"], 'r,')
xlim(40.6, 40.9)
ylim(-74.05, -73.9)
Out[242]:
(-74.05, -73.9)