In [1]:

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import  cross_validate
from sklearn.base import BaseEstimator
from sklearn.metrics import (make_scorer, mean_absolute_error)

In [2]:

from sklearn import __version__
__version__

Out[2]:

'0.21.2'

In [3]:

from _compute_median import _read_all_data

In [4]:

data = _read_all_data()

In [5]:

data.head(4)

Out[5]:

	address_type	agency	agency_name	bbl	borough	bridge_highway_direction	bridge_highway_name	bridge_highway_segment	city	closed_date	...	resolution_description	road_ramp	status	street_name	taxi_company_borough	taxi_pick_up_location	unique_key	x_coordinate_state_plane	y_coordinate_state_plane	vehicle_type
0	ADDRESS	DOHMH	Department of Health and Mental Hygiene	5.080220e+09	STATEN ISLAND	NaN	NaN	NaN	STATEN ISLAND	NaT	...	The Department of Health and Mental Hygiene wi...	NaN	Open	WOOD AVENUE	NaN	NaN	43058507	916296.0	126389.0	NaN
1	ADDRESS	DOHMH	Department of Health and Mental Hygiene	4.097000e+09	QUEENS	NaN	NaN	NaN	Jamaica	NaT	...	The Department of Health and Mental Hygiene wi...	NaN	Open	87 AVENUE	NaN	NaN	43058506	1035684.0	196858.0	NaN
2	INTERSECTION	DOHMH	Department of Health and Mental Hygiene	NaN	BROOKLYN	NaN	NaN	NaN	BROOKLYN	NaT	...	The Department of Health and Mental Hygiene wi...	NaN	Open	NaN	NaN	NaN	43060680	1023962.0	182899.0	NaN
3	ADDRESS	DOHMH	Department of Health and Mental Hygiene	4.104670e+09	QUEENS	NaN	NaN	NaN	Hollis	2019-06-25	...	The Department of Health and Mental Hygiene wi...	NaN	Closed	196 STREET	NaN	NaN	43056246	1049383.0	200048.0	NaN

4 rows × 41 columns

In [6]:

data.columns

Out[6]:

Index(['address_type', 'agency', 'agency_name', 'bbl', 'borough',
       'bridge_highway_direction', 'bridge_highway_name',
       'bridge_highway_segment', 'city', 'closed_date', 'community_board',
       'complaint_type', 'created_date', 'cross_street_1', 'cross_street_2',
       'descriptor', 'due_date', 'facility_type', 'incident_address',
       'incident_zip', 'intersection_street_1', 'intersection_street_2',
       'landmark', 'latitude', 'location', 'location_type', 'longitude',
       'open_data_channel_type', 'park_borough', 'park_facility_name',
       'resolution_action_updated_date', 'resolution_description', 'road_ramp',
       'status', 'street_name', 'taxi_company_borough',
       'taxi_pick_up_location', 'unique_key', 'x_coordinate_state_plane',
       'y_coordinate_state_plane', 'vehicle_type'],
      dtype='object')

In [7]:

features = ['complaint_type', 'latitude','longitude', 'created_date']

In [8]:

data['time_to_action'] = (data['resolution_action_updated_date'] - data['created_date']) # / pd.np.timedelta64(1, 'M')

In [9]:

data_ = data.loc[data.complaint_type.str.contains('Noise'), features + ['time_to_action']]
data_ = data_[data_.notnull().all(1)]

In [10]:

data_['time_to_action'] = (data_['time_to_action'].dt.seconds / 3600).astype(int)

In [11]:

y = data_['time_to_action']
X = data_.drop('time_to_action', axis=1)

In [12]:

len(X)

Out[12]:

Little cleaning¶

In [13]:

X['complaint_type'].unique()

Out[13]:

array(['Noise - Commercial', 'Noise - Street/Sidewalk', 'Noise - Vehicle',
       'Noise - Residential', 'Noise', 'Noise - Park',
       'Noise - House of Worship', 'Collection Truck Noise'], dtype=object)

In [14]:

proper_names = {
    'Noise - Commercial':'commercial', 
    'Noise - Residential':'residential',
    'Noise - Street/Sidewalk':'street',
    'Noise - Vehicle':'vehicle', 
    'Noise - Park':'park',
    'Noise':'other', 
    'Noise - House of Worship':'worship', 
    'Collection Truck Noise':'truck'
}

In [15]:

X['complaint_type'] = X['complaint_type'].map(proper_names)

In [16]:

X.head(5)

Out[16]:

	complaint_type	latitude	longitude	created_date
7	commercial	40.717302	-73.949248	2019-06-23 00:00:00
10	street	40.837576	-73.889396	2019-06-23 00:00:08
11	vehicle	40.833693	-73.913846	2019-06-23 00:00:16
12	residential	40.823469	-73.924460	2019-06-23 00:00:25
13	street	40.848693	-73.903279	2019-06-23 00:00:28

Feature Generation¶

In [17]:

# class TimeTransformer(BaseEstimator):
#     cols = None
    
#     def __init__(self, cols=None):
#         self.cols = cols
    
#     def fit(self, X=None, y=None, groups=None):
        
#         if self.cols is None:
#             self.cols = X.select_dtypes(include=pd.np.datetime64).columns
#         return self
    
#     def transform(self, X, y=None, groups=None, cols=None):
        
#         for col in self.cols:
#             dates = X[col]
#             X = X.drop(col, axis=1)
#             X[f'{col}_dow'] = dates.dt.dayofweek
#             X[f'{col}_doy'] = dates.dt.dayofyear
#             X[f'{col}_tod'] = dates.dt.second

#         return X

from ml import TimeTransformer

In [18]:

t = TimeTransformer(cols=['created_date'])

In [19]:

# X.select_dtypes(include=pd.np.datetime64)

In [20]:

t.fit(X).transform(X).head(3)

Out[20]:

	complaint_type	latitude	longitude	created_date_dow	created_date_doy	created_date_tod
7	commercial	40.717302	-73.949248	6	174	0
10	street	40.837576	-73.889396	6	174	8
11	vehicle	40.833693	-73.913846	6	174	16

In [21]:

cats = X['complaint_type'].unique().tolist()

In [22]:

ct = ColumnTransformer(
        transformers=[
            ('ordinal', OrdinalEncoder(categories=[cats,]), [0]),
            ('time', TimeTransformer(cols=['created_date']), [3])
        ], remainder='passthrough')

In [23]:

model = RandomForestRegressor(n_estimators=100, random_state=2019)

In [24]:

pipe = Pipeline(steps=[('preprocessor', ct),
                        ('model', model)])

Cross-validate¶

In [25]:

cv = cross_validate(pipe, X, y, cv=5, scoring=make_scorer(mean_absolute_error),
                    verbose=1, n_jobs=3)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   27.7s finished

In [26]:

pd.DataFrame(cv)

Out[26]:

	fit_time	score_time	test_score
0	13.226047	0.170526	2.843741
1	13.476237	0.211020	3.919784
2	13.026504	0.177489	3.015327
3	12.111729	0.160368	3.072551
4	12.182463	0.100503	2.752961

In [27]:

pd.DataFrame(cv)['test_score'].mean()

Out[27]:

3.1208729127942547

Train and store Model¶

In [28]:

pipe.fit(X, y)

Out[28]:

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ordinal',
                                                  OrdinalEncoder(categories=[['commercial',
                                                                              'street',
                                                                              'vehicle',
                                                                              'residential',
                                                                              'other',
                                                                              'park',
                                                                              'worship',
                                                                              'truck']],
                                                                 dtype=<class 'numpy.float64'>),
                                                  [0]),
                                                 ('time',
                                                  TimeTransformer(cols=['create...
                                   verbose=False)),
                ('model',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=None,
                                       oob_score=False, random_state=2019,
                                       verbose=0, warm_start=False))],
         verbose=False)

In [29]:

pipe.predict(X.head(1))[0]

Out[29]:

11.37

In [32]:

# import joblib
import pickle

In [33]:

with open('./model.pkl', 'wb') as f:
    joblib.dump(pipe, f)

Testing¶

In [67]:

singleton = pd.DataFrame([{'complaint_type':'dummy', 
                           'latitude':1.1111, 
                           'longitude':1.1111,
                           'created_date':pd.to_datetime('2019-01-01')}])

In [68]:

BODY = {
    'complaint_type': 'residential',
    'lat': "40.636626",
    'lon': "-73.951694",
    "date": "2019-06-08 00:00:09"
}

In [69]:

mapping = {
    'lon': 'longitude',
    'lat': 'latitude',
    'date': 'created_date'
}

dtypes = {
    'lon': float,
    'lat': float,
    'date': pd.to_datetime
}

In [70]:

singleton.loc[0, 'complaint_type'] = BODY['complaint_type']

for k, col in mapping.items():
    singleton.loc[0, col] = dtypes[k](BODY.get(k, pd.np.nan))

In [71]:

singleton

Out[71]:

	complaint_type	created_date	latitude	longitude
0	residential	2019-06-08 00:00:09	40.636626	-73.951694

In [72]:

singleton.dtypes

Out[72]:

complaint_type            object
created_date      datetime64[ns]
latitude                 float64
longitude                float64
dtype: object

In [73]:

X.dtypes

Out[73]:

complaint_type            object
latitude                 float64
longitude                float64
created_date      datetime64[ns]
dtype: object

In [74]:

pipe.predict(singleton[['complaint_type', 'latitude', 'longitude','created_date']])[0]

Out[74]:

0.89

In [75]:

singleton[['complaint_type', 'latitude', 'longitude','created_date']].dtypes

Out[75]:

complaint_type            object
latitude                 float64
longitude                float64
created_date      datetime64[ns]
dtype: object

In [ ]: