import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.base import BaseEstimator
from sklearn.metrics import (make_scorer, mean_absolute_error)
from sklearn import __version__
__version__
'0.21.2'
# Load the full dataset through the project-local helper module.
from _compute_median import _read_all_data
data = _read_all_data()
# Show the first four rows as a quick sanity check of the loaded columns.
data.head(4)
address_type | agency | agency_name | bbl | borough | bridge_highway_direction | bridge_highway_name | bridge_highway_segment | city | closed_date | ... | resolution_description | road_ramp | status | street_name | taxi_company_borough | taxi_pick_up_location | unique_key | x_coordinate_state_plane | y_coordinate_state_plane | vehicle_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ADDRESS | DOHMH | Department of Health and Mental Hygiene | 5.080220e+09 | STATEN ISLAND | NaN | NaN | NaN | STATEN ISLAND | NaT | ... | The Department of Health and Mental Hygiene wi... | NaN | Open | WOOD AVENUE | NaN | NaN | 43058507 | 916296.0 | 126389.0 | NaN |
1 | ADDRESS | DOHMH | Department of Health and Mental Hygiene | 4.097000e+09 | QUEENS | NaN | NaN | NaN | Jamaica | NaT | ... | The Department of Health and Mental Hygiene wi... | NaN | Open | 87 AVENUE | NaN | NaN | 43058506 | 1035684.0 | 196858.0 | NaN |
2 | INTERSECTION | DOHMH | Department of Health and Mental Hygiene | NaN | BROOKLYN | NaN | NaN | NaN | BROOKLYN | NaT | ... | The Department of Health and Mental Hygiene wi... | NaN | Open | NaN | NaN | NaN | 43060680 | 1023962.0 | 182899.0 | NaN |
3 | ADDRESS | DOHMH | Department of Health and Mental Hygiene | 4.104670e+09 | QUEENS | NaN | NaN | NaN | Hollis | 2019-06-25 | ... | The Department of Health and Mental Hygiene wi... | NaN | Closed | 196 STREET | NaN | NaN | 43056246 | 1049383.0 | 200048.0 | NaN |
4 rows × 41 columns
data.columns
Index(['address_type', 'agency', 'agency_name', 'bbl', 'borough', 'bridge_highway_direction', 'bridge_highway_name', 'bridge_highway_segment', 'city', 'closed_date', 'community_board', 'complaint_type', 'created_date', 'cross_street_1', 'cross_street_2', 'descriptor', 'due_date', 'facility_type', 'incident_address', 'incident_zip', 'intersection_street_1', 'intersection_street_2', 'landmark', 'latitude', 'location', 'location_type', 'longitude', 'open_data_channel_type', 'park_borough', 'park_facility_name', 'resolution_action_updated_date', 'resolution_description', 'road_ramp', 'status', 'street_name', 'taxi_company_borough', 'taxi_pick_up_location', 'unique_key', 'x_coordinate_state_plane', 'y_coordinate_state_plane', 'vehicle_type'], dtype='object')
# Model inputs: complaint category, location, and the time the complaint
# was filed.
features = ['complaint_type', 'latitude', 'longitude', 'created_date']

# Target: elapsed time between filing and the last resolution-action update.
data['time_to_action'] = (
    data['resolution_action_updated_date'] - data['created_date']
)

# Keep only noise-related complaints. na=False treats a missing
# complaint_type as "not noise" instead of producing a NaN-containing mask
# that .loc refuses to index with.
is_noise = data['complaint_type'].str.contains('Noise', na=False)
data_ = data.loc[is_noise, features + ['time_to_action']]

# Drop rows missing any feature or the target. The explicit copy makes the
# column assignment below write to an independent frame, not a view of `data`.
data_ = data_.dropna().copy()

# Convert the timedelta to whole hours (truncated). total_seconds() is
# required: .dt.seconds is only the sub-day component (0..86399), so any
# complaint that took a day or longer to action would wrap to a small value.
# NOTE: the recorded scores/predictions further down were produced with the
# old .dt.seconds target and will differ after re-running.
elapsed_hours = data_['time_to_action'].dt.total_seconds() / 3600
data_['time_to_action'] = elapsed_hours.astype(int)

y = data_['time_to_action']
X = data_.drop('time_to_action', axis=1)
len(X)
40698
X['complaint_type'].unique()
array(['Noise - Commercial', 'Noise - Street/Sidewalk', 'Noise - Vehicle', 'Noise - Residential', 'Noise', 'Noise - Park', 'Noise - House of Worship', 'Collection Truck Noise'], dtype=object)
# Raw complaint category -> short label used as the model's category value.
_raw_to_short = [
    ('Noise - Commercial', 'commercial'),
    ('Noise - Residential', 'residential'),
    ('Noise - Street/Sidewalk', 'street'),
    ('Noise - Vehicle', 'vehicle'),
    ('Noise - Park', 'park'),
    ('Noise', 'other'),
    ('Noise - House of Worship', 'worship'),
    ('Collection Truck Noise', 'truck'),
]
proper_names = dict(_raw_to_short)

# Replace the raw category strings with their short labels.
short_labels = X['complaint_type'].map(proper_names)
X['complaint_type'] = short_labels
X.head(5)
complaint_type | latitude | longitude | created_date | |
---|---|---|---|---|
7 | commercial | 40.717302 | -73.949248 | 2019-06-23 00:00:00 |
10 | street | 40.837576 | -73.889396 | 2019-06-23 00:00:08 |
11 | vehicle | 40.833693 | -73.913846 | 2019-06-23 00:00:16 |
12 | residential | 40.823469 | -73.924460 | 2019-06-23 00:00:25 |
13 | street | 40.848693 | -73.903279 | 2019-06-23 00:00:28 |
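# Reference sketch of TimeTransformer, kept for context only; the class that
# is actually used is imported from the `ml` module.
# NOTE(review): `dates.dt.second` is the seconds field of the timestamp
# (0-59), not seconds since midnight - confirm that is what `_tod` should hold.
#
# class TimeTransformer(BaseEstimator):
#     cols = None
#
#     def __init__(self, cols=None):
#         self.cols = cols
#
#     def fit(self, X=None, y=None, groups=None):
#         if self.cols is None:
#             self.cols = X.select_dtypes(include=pd.np.datetime64).columns
#         return self
#
#     def transform(self, X, y=None, groups=None, cols=None):
#         for col in self.cols:
#             dates = X[col]
#             X = X.drop(col, axis=1)
#             X[f'{col}_dow'] = dates.dt.dayofweek
#             X[f'{col}_doy'] = dates.dt.dayofyear
#             X[f'{col}_tod'] = dates.dt.second
#         return X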
# Use the packaged transformer from the project-local `ml` module.
from ml import TimeTransformer
# Configure it for the single datetime feature column.
t = TimeTransformer(cols=['created_date'])
# X.select_dtypes(include=pd.np.datetime64)
# Preview the transformed frame on the first rows to check the derived columns.
t.fit(X).transform(X).head(3)
complaint_type | latitude | longitude | created_date_dow | created_date_doy | created_date_tod | |
---|---|---|---|---|---|---|
7 | commercial | 40.717302 | -73.949248 | 6 | 174 | 0 |
10 | street | 40.837576 | -73.889396 | 6 | 174 | 8 |
11 | vehicle | 40.833693 | -73.913846 | 6 | 174 | 16 |
# Category order (first appearance in X) fixes the ordinal code of each label.
cats = list(X['complaint_type'].unique())

# Column positions refer to X's layout: 0 is complaint_type, 3 is
# created_date; latitude/longitude pass through untouched.
complaint_encoder = OrdinalEncoder(categories=[cats])
date_features = TimeTransformer(cols=['created_date'])
ct = ColumnTransformer(
    transformers=[
        ('ordinal', complaint_encoder, [0]),
        ('time', date_features, [3]),
    ],
    remainder='passthrough',
)

model = RandomForestRegressor(n_estimators=100, random_state=2019)
pipe = Pipeline(steps=[('preprocessor', ct), ('model', model)])

# 5-fold cross-validation scored with mean absolute error.
mae_scorer = make_scorer(mean_absolute_error)
cv = cross_validate(pipe, X, y, scoring=mae_scorer, cv=5, n_jobs=3, verbose=1)
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers. [Parallel(n_jobs=3)]: Done 5 out of 5 | elapsed: 27.7s finished
pd.DataFrame(cv)
fit_time | score_time | test_score | |
---|---|---|---|
0 | 13.226047 | 0.170526 | 2.843741 |
1 | 13.476237 | 0.211020 | 3.919784 |
2 | 13.026504 | 0.177489 | 3.015327 |
3 | 12.111729 | 0.160368 | 3.072551 |
4 | 12.182463 | 0.100503 | 2.752961 |
pd.DataFrame(cv)['test_score'].mean()
3.1208729127942547
pipe.fit(X, y)
Pipeline(memory=None, steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3, transformer_weights=None, transformers=[('ordinal', OrdinalEncoder(categories=[['commercial', 'street', 'vehicle', 'residential', 'other', 'park', 'worship', 'truck']], dtype=<class 'numpy.float64'>), [0]), ('time', TimeTransformer(cols=['create... verbose=False)), ('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=False, random_state=2019, verbose=0, warm_start=False))], verbose=False)
pipe.predict(X.head(1))[0]
11.37
import pickle

# Persist the fitted pipeline to disk. The original called joblib.dump while
# the joblib import was commented out, which raises NameError; pickle is the
# module actually imported here, so serialize with it.
with open('./model.pkl', 'wb') as f:
    pickle.dump(pipe, f)
# One-row frame carrying the same four feature columns the model was trained
# on; the placeholder values are overwritten from the request body below.
singleton = pd.DataFrame([{
    'complaint_type': 'dummy',
    'latitude': 1.1111,
    'longitude': 1.1111,
    'created_date': pd.to_datetime('2019-01-01'),
}])

# Example request payload: every value arrives as a string.
BODY = {
    'complaint_type': 'residential',
    'lat': "40.636626",
    'lon': "-73.951694",
    "date": "2019-06-08 00:00:09",
}

# Request key -> feature column name.
mapping = {
    'lon': 'longitude',
    'lat': 'latitude',
    'date': 'created_date',
}

# Request key -> converter applied to the raw string value.
dtypes = {
    'lon': float,
    'lat': float,
    'date': pd.to_datetime,
}

singleton.loc[0, 'complaint_type'] = BODY['complaint_type']

# Default for keys absent from the payload. A plain float NaN replaces
# `pd.np.nan`: the `pd.np` alias is deprecated/removed in newer pandas, and
# because the default is evaluated eagerly it broke the loop even when every
# key was present. float(nan) stays NaN and pd.to_datetime(nan) yields NaT.
_missing = float('nan')
for k, col in mapping.items():
    singleton.loc[0, col] = dtypes[k](BODY.get(k, _missing))
singleton
complaint_type | created_date | latitude | longitude | |
---|---|---|---|---|
0 | residential | 2019-06-08 00:00:09 | 40.636626 | -73.951694 |
singleton.dtypes
complaint_type object created_date datetime64[ns] latitude float64 longitude float64 dtype: object
X.dtypes
complaint_type object latitude float64 longitude float64 created_date datetime64[ns] dtype: object
pipe.predict(singleton[['complaint_type', 'latitude', 'longitude','created_date']])[0]
0.89
singleton[['complaint_type', 'latitude', 'longitude','created_date']].dtypes
complaint_type object latitude float64 longitude float64 created_date datetime64[ns] dtype: object