%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from mpl_toolkits.basemap import Basemap
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import SCORERS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
from scipy import sparse
from sklearn.metrics import mean_squared_error, mean_absolute_error
import random
# Base path for the input CSV files.
PATH = './data/'
# Notebook plot theme (cosmetic only).
from jupyterthemes import jtplot
jtplot.style(theme='onedork')
# Load the Boston crime incidents and the cleaned daily-weather tables.
file_name_cr = 'crime.csv'
df_crime = pd.read_csv(PATH+file_name_cr, encoding = "ISO-8859-1")
df_crime.head()
file_name_weather = 'Boston weather_clean.csv'
df_weather = pd.read_csv(PATH+file_name_weather, encoding = "ISO-8859-1")
df_weather.tail()
# Missing values per column before cleaning.
df_crime.isnull().sum()
Let's see which districts are present in the data.
df_crime.DISTRICT.value_counts()
# Fill categorical gaps with explicit sentinel labels rather than dropping rows.
df_crime.DISTRICT = df_crime.DISTRICT.fillna('NON')
df_crime.SHOOTING = df_crime.SHOOTING.fillna('NON')
df_crime.UCR_PART.value_counts()
df_crime.UCR_PART = df_crime.UCR_PART.fillna('Other')
len(df_crime.STREET.unique())
df_crime.STREET = df_crime.STREET.fillna('Other')
# Inspect, then drop, records that have no coordinates at all.
df_crime[np.isnan(df_crime.Lat)]['Location'].unique()
df_crime = df_crime[~np.isnan(df_crime.Lat)]
df_crime.isnull().sum()
df_weather.isnull().sum()
df_crime.OFFENSE_CODE_GROUP.value_counts()
df_crime.OFFENSE_CODE.value_counts()
# Coordinate ranges — reveals placeholder outliers far from Boston.
min(df_crime.Lat), max(df_crime.Lat)
min(df_crime.Long), max(df_crime.Long)
There are outlier spikes in the coordinates (placeholder values far from Boston).
# Parse timestamps once, vectorized.
# FIX: the original used .map(pd.to_datetime), which parses row-by-row and is
# far slower; pd.to_datetime(series) is equivalent and vectorized.
df_crime.OCCURRED_ON_DATE = pd.to_datetime(df_crime.OCCURRED_ON_DATE)
df_crime['test_one'] = 1  # helper column: 1 per incident, so sum == count
# Incidents per (year, month).
# FIX: groupby with a tuple of keys is deprecated/ambiguous — use a list.
g = df_crime.groupby(['YEAR', 'MONTH'])['test_one'].sum()
fig = plt.figure(1, (12, 8))
ax1 = fig.add_subplot(211)
g.unstack(level=0).plot(kind='bar', grid=True, ax=ax1)
ax2 = fig.add_subplot(212)
g.plot(grid=True, ax=ax2)
ax2.set_xticks(range(len(g)))
# Label every point as "YYYY-MM".
ax2.set_xticklabels(["%s-%02d" % item for item in g.index.tolist()], rotation=90)
Most crimes are committed in the summer and the fewest in the winter; the average monthly rate stays roughly constant across years.
# Incidents per hour of the day.
df_crime.groupby(('HOUR'))['test_one'].sum().plot(kind='bar',figsize=(15,6), grid = True);
Most crimes are committed in the evening, the fewest late at night.
# Incidents per day of week, ordered Monday..Sunday, as percentage shares.
order = ['Monday', 'Tuesday', 'Wednesday','Thursday','Friday','Saturday','Sunday']
df_crime.groupby(('DAY_OF_WEEK'))['test_one'].sum().loc[order].plot(kind = 'pie', figsize=(7, 7), autopct='%.2f');
Toward the weekend the number of crimes decreases.
# Shooting-related incidents only: by month, over time, and by weekday.
g = df_crime[df_crime.SHOOTING != 'NON'].groupby(('YEAR','MONTH'))['test_one'].sum()
fig = plt.figure(1, (12, 12))
ax1 = fig.add_subplot(311)
g.unstack(level=0).plot(kind='bar', grid = True, ax = ax1, stacked=True);
ax2 = fig.add_subplot(312)
g.plot(grid = True, ax = ax2);
ax2.set_xticks(range(len(g)));
ax2.set_xticklabels(["%s-%02d" % item for item in g.index.tolist()], rotation=90);
g = df_crime[df_crime.SHOOTING != 'NON'].groupby(('DAY_OF_WEEK'))['test_one'].sum().loc[order]
ax3 = fig.add_subplot(313)
g.plot(kind='bar', grid = True, ax = ax3, stacked=True);
# Top 30 streets by number of incidents.
g = df_crime.groupby(('STREET'))['test_one'].sum().sort_values(ascending = False).head(30)
fig = plt.figure(1, (12, 8))
#ax1 = fig.add_subplot(211)
g.plot(kind='bar', grid = True);
# Top 20 offense groups.
g = df_crime.groupby(('OFFENSE_CODE_GROUP'))['test_one'].sum().sort_values(ascending = False).head(20)
fig = plt.figure(1, (12, 8))
#ax1 = fig.add_subplot(211)
g.plot(kind='barh', grid = True);
# Incident share per district.
g = df_crime.groupby(('DISTRICT'))['test_one'].sum().sort_values(ascending = False).head(20)
fig = plt.figure(1, (12, 8))
#ax1 = fig.add_subplot(211)
g.plot(kind='pie', grid = True);
# Keep only rows with plausible Boston coordinates (drops the placeholder
# spikes seen above). FIX: .copy() so the column assignment below does not
# raise SettingWithCopyWarning / silently fail on a view.
df_crime_pre = df_crime[(df_crime.Lat > 20) & (df_crime['Long'] < -20)].copy()
# Per offense group: counts plus median coordinates.
# FIX: the original used groupby(...)['Lat','Long','test_one'] (tuple column
# selection) and aggregate((np.sum, np.median)) — both deprecated; lists are
# the supported form and behave identically.
g = (df_crime_pre.groupby('OFFENSE_CODE_GROUP')[['Lat', 'Long', 'test_one']]
     .aggregate([np.sum, np.median])
     .sort_values(by=('test_one', 'sum'), ascending=False))
fig = plt.figure(1, (8, 8))
plt.scatter(x=g[('Lat', 'median')].values, y=g[('Long', 'median')].values, marker='*')
# All incidents coloured by district.
fig = plt.figure(1, (8, 8))
sns.scatterplot(x='Long', y='Lat', hue='DISTRICT', data=df_crime_pre)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
# Coordinate density estimates.
df_crime_pre['Lat'].plot(kind='kde')
df_crime_pre['Long'].plot(kind='kde')
# Day-of-month column, then join daily weather on (year, month, day).
df_crime_pre['DAY'] = df_crime_pre['OCCURRED_ON_DATE'].dt.day
df_total = df_crime_pre.merge(df_weather, left_on=['YEAR', 'MONTH', 'DAY'], right_on=['Year', 'Month', 'Day'])
plt.figure();
# Temperature distributions on crime days (from the merged table).
df_total['Low Temp (F)'].hist(alpha = 0.5, bins = 30);
df_total['High Temp (F)'].hist(alpha = 0.5, bins = 30);
# this is a real temperature distribution
plt.figure();
df_weather['Low Temp (F)'].hist(alpha = 0.5, bins = 30);
df_weather['High Temp (F)'].hist(alpha = 0.5, bins = 30);
df_total.groupby('Events')['test_one'].aggregate(np.sum).plot.pie();
# what precipitation events were recorded at the time of the crimes
Let's calculate the correlation between the variables.
def plot_corr(df, size=10):
    """Draw a heatmap of the pairwise correlation matrix of *df*.

    df   -- DataFrame whose numeric columns are correlated.
    size -- figure width/height in inches.
    """
    corr_matrix = df.corr()
    fig, axis = plt.subplots(figsize=(size, size))
    heat = axis.matshow(corr_matrix, interpolation='none', cmap='plasma')
    labels = corr_matrix.columns
    plt.xticks(range(len(labels)), labels, rotation=90)
    plt.yticks(range(len(labels)), labels)
    fig.colorbar(heat, ax=axis, shrink=0.7)
# One-hot encode the main categorical columns and assemble a numeric frame
# (dummies + coordinates + weather columns 23:42) for the correlation heatmap.
df_dummies_OFFENSE_CODE_GROUP = pd.get_dummies(df_total['OFFENSE_CODE_GROUP'])
df_dummies_Events = pd.get_dummies(df_total['Events'])
df_dummies_DISTRICT = pd.get_dummies(df_total['DISTRICT'])
#df_dummies_SHOOTING = pd.get_dummies(df_total['SHOOTING'])
df_dummies_UCR_PART = pd.get_dummies(df_total['UCR_PART'])
df_dummies_STREET = pd.get_dummies(df_total['STREET'])
df_new = pd.concat([df_dummies_OFFENSE_CODE_GROUP, df_dummies_Events, df_dummies_DISTRICT, df_dummies_UCR_PART, df_total[['Lat', 'Long']], df_total.iloc[:, 23:42]], axis=1)
plot_corr(df_new, 25)
plt.figure(figsize=(12, 12))
map = Basemap(llcrnrlon=-71.2, llcrnrlat=42.2, urcrnrlon=-70.95, urcrnrlat=42.46, epsg=4269)
#http://server.arcgisonline.com/arcgis/rest/services
#EPSG Number of America is 4269
#World_Street_Map
#World_Topo_Map
map.arcgisimage(service='World_Street_Map', xpixels = 1000, verbose= True)
# Plot the first 100 incidents.
# FIX: df_crime.Long[i] is label-based; rows were dropped during cleaning, so
# small integer labels may be missing and raise KeyError — use positional .iloc.
for i in range(100):
    xpt, ypt = map(df_crime.Long.iloc[i], df_crime.Lat.iloc[i])
    map.scatter(xpt, ypt, c='b')
# airport coordinates
coo_aero = (-71.01, 42.36)
xpt, ypt = map(coo_aero[0], coo_aero[1])
map.scatter(xpt, ypt, c='r')
# south station
coo_v1 = (-71.06, 42.34)
xpt, ypt = map(coo_v1[0], coo_v1[1])
map.scatter(xpt, ypt, c='r')
plt.show()
# Restrict to a single offense code (3006) and build a regression matrix for
# predicting the incident coordinates.
X = df_total.copy()
X = X[X.OFFENSE_CODE == 3006]
X.head()
X.info()
# Drop identifiers and duplicated join keys.
X.drop(['INCIDENT_NUMBER', 'Location', 'test_one', 'Day', 'Month', 'Year'], axis = 1, inplace=True)
# Treat calendar fields as categoricals so get_dummies encodes them.
X['OFFENSE_CODE'] = X.OFFENSE_CODE.astype('category')
X['YEAR'] = X.YEAR.astype('category')
X['MONTH'] = X.MONTH.astype('category')
X['HOUR'] = X.HOUR.astype('category')
X['DAY'] = X.DAY.astype('category')
#X['quarter'] = X.quarter.astype('category')
# Drop high-cardinality / location-leaking columns.
X.drop(['STREET',], axis = 1, inplace=True)
X.drop(['DISTRICT',], axis = 1, inplace=True)
X.drop(['REPORTING_AREA',], axis = 1, inplace=True)
X.drop(['OFFENSE_DESCRIPTION',], axis = 1, inplace=True)
X_hot = pd.get_dummies(X)
X_hot.drop(['OCCURRED_ON_DATE',], axis = 1, inplace=True)
# Standardize the remaining numeric columns; Lat/Long are excluded because
# they are the regression targets.
ss = StandardScaler()
col = [x for x in X_hot.columns if ((X_hot[x].dtype == np.float64 or X_hot[x].dtype == np.int64) and (x !='Lat') and (x != 'Long'))]
col
X_ss = pd.DataFrame(ss.fit_transform(X_hot[col].values), columns=col)
X_hot.drop(col, axis=1, inplace=True)
X_hot_ss = X_hot.reset_index(drop=True).join(X_ss.reset_index(drop=True))
X_hot_ss.shape
# Targets: latitude (y1) and longitude (y2).
y1 = X_hot_ss['Lat']
y2 = X_hot_ss['Long']
X_hot_ss.drop(['Lat', 'Long'], axis=1, inplace=True)
X_train, X_val, y1_train, y1_val, y2_train, y2_val = train_test_split(X_hot_ss, y1, y2, random_state = 42)
%%time
# Ridge regression with built-in CV over alphas; one model per coordinate.
model1_cv = RidgeCV(alphas= (0.01, 0.05,0.1,0.7,1,2), cv=7)
model2_cv = RidgeCV(alphas= (0.01, 0.05,0.1,0.7,1,2), cv=7)
#model = LinearRegression()
rid1_cv = model1_cv.fit(X_train, y1_train)
rid2_cv = model2_cv.fit(X_train, y2_train)
pred_Lat_cv = rid1_cv.predict(X_val)
# NOTE: despite the 'mse_' prefix these variables hold mean *absolute* errors.
mse_Lat_cv = mean_absolute_error(y1_val, pred_Lat_cv)
pred_Long_cv = rid2_cv.predict(X_val)
mse_Long_cv = mean_absolute_error(y2_val, pred_Long_cv)
mse_Lat_cv, mse_Long_cv
# Residuals ('del' = true - predicted) and their log-scale histograms.
dd_Lat_cv = pd.DataFrame({'tr':y1_val, 'pred': pred_Lat_cv, 'del': (y1_val - pred_Lat_cv)}).sort_values(by=['del'])
dd_Long_cv = pd.DataFrame({'tr':y2_val, 'pred': pred_Long_cv, 'del': (y2_val - pred_Long_cv)}).sort_values(by=['del'])
fig = plt.figure(1, (12, 12))
ax1 = fig.add_subplot(211)
dd_Lat_cv['del'].hist(bins = 100, ax = ax1, range = (-0.2,0.2), log = True)
ax2 = fig.add_subplot(212)
dd_Long_cv['del'].hist(bins = 100, ax = ax2, range = (-0.2,0.2), log= True)
plt.figure(figsize=(12, 12))
map = Basemap(llcrnrlon=-71.2, llcrnrlat=42.2, urcrnrlon=-70.95, urcrnrlat=42.46, epsg=4269)
#http://server.arcgisonline.com/arcgis/rest/services
#EPSG Number of America is 4269
map.arcgisimage(service='World_Topo_Map', xpixels=1000, verbose=True)
# 100 random validation points: true location (red) vs Ridge prediction (blue).
random.seed(3)
for rr in range(100):
    # FIX: random.randint's upper bound is *inclusive*, so len(...) could
    # index one past the end -> len-1. FIX: the true coordinates must come
    # from the validation targets (y1_val/y2_val), positionally aligned with
    # pred_*_cv; the original y1[rnd_int] indexed the full data set and was
    # misaligned with the predictions.
    rnd_int = random.randint(0, len(dd_Lat_cv) - 1)
    coordinat_true_Lat = y1_val.iloc[rnd_int]
    coordinat_true_Long = y2_val.iloc[rnd_int]
    coordinat_pred_Lat = pred_Lat_cv[rnd_int]
    coordinat_pred_Long = pred_Long_cv[rnd_int]
    xpt1, ypt1 = map(coordinat_true_Long, coordinat_true_Lat)
    map.scatter(xpt1, ypt1, c='r')
    xpt2, ypt2 = map(coordinat_pred_Long, coordinat_pred_Lat)
    map.scatter(xpt2, ypt2, c='b')
# Largest Ridge coefficient magnitudes for each coordinate model.
pd.DataFrame({'colum': X_train.columns, 'coef': np.abs(model1_cv.coef_)}).sort_values(['coef'], ascending=False).head(10)
pd.DataFrame({'colum': X_train.columns, 'coef': np.abs(model2_cv.coef_)}).sort_values(['coef'], ascending=False).head(10)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
%%time
# Random forest per coordinate; the grid search is left commented out.
#params = {'n_estimators':[160],'max_depth':[20]}
rfc1=RandomForestRegressor(random_state=42, n_jobs= 3, n_estimators = 200)
#rfc1_grid = GridSearchCV(rfc1,param_grid=params,cv=3,n_jobs= 3)
rfc1.fit(X_train, y1_train)
#rfc1=rfc1_grid.best_estimator_ # assign best model to rfc
rfc1
(pd.Series(rfc1.feature_importances_, index=X_train.columns)
.nlargest(20)
.plot(kind='barh'))
plt.title('Feature Importance') # top 10 features
%%time
rfc2=RandomForestRegressor(random_state=42, n_jobs= 3, n_estimators = 200)
#rfc2_grid = GridSearchCV(rfc2,param_grid=params,cv=5, n_jobs= 3)
rfc2.fit(X_train, y2_train)
#rfc2=rfc2_grid.best_estimator_ # assign best model to rfc
rfc2
(pd.Series(rfc2.feature_importances_, index=X_train.columns)
.nlargest(20)
.plot(kind='barh'))
plt.title('Feature Importance') # top 10 features
# Validation errors (mean absolute error despite the 'mse_' names),
# compared with the Ridge baseline.
pred_Lat_rfr = rfc1.predict(X_val)
mse_Lat_rfr = mean_absolute_error(y1_val, pred_Lat_rfr)
pred_Long_rfr = rfc2.predict(X_val)
mse_Long_rfr = mean_absolute_error(y2_val, pred_Long_rfr)
mse_Lat_rfr, mse_Long_rfr
mse_Lat_cv, mse_Long_cv
plt.figure(figsize=(12, 12))
map = Basemap(llcrnrlon=-71.2, llcrnrlat=42.2, urcrnrlon=-70.95, urcrnrlat=42.46, epsg=4269)
#http://server.arcgisonline.com/arcgis/rest/services
#EPSG Number of America is 4269
map.arcgisimage(service='World_Topo_Map', xpixels=1000, verbose=True)
# 100 random validation points: true (red) vs random-forest prediction (blue),
# joined by a yellow segment.
random.seed(3)
for rr in range(100):
    # FIX: randint's upper bound is inclusive -> len-1 avoids IndexError.
    # FIX: take true coordinates from the validation targets, positionally
    # aligned with pred_*_rfr (y1[rnd_int] indexed the full set, misaligned).
    rnd_int = random.randint(0, len(dd_Lat_cv) - 1)
    coordinat_true_Lat = y1_val.iloc[rnd_int]
    coordinat_true_Long = y2_val.iloc[rnd_int]
    coordinat_pred_Lat = pred_Lat_rfr[rnd_int]
    coordinat_pred_Long = pred_Long_rfr[rnd_int]
    xpt1, ypt1 = map(coordinat_true_Long, coordinat_true_Lat)
    map.scatter(xpt1, ypt1, c='r')
    xpt2, ypt2 = map(coordinat_pred_Long, coordinat_pred_Lat)
    map.scatter(xpt2, ypt2, c='b')
    map.plot((xpt1, xpt2), (ypt1, ypt2), c='y')
# LassoCV per coordinate, same evaluation protocol as Ridge above.
model3 = LassoCV(random_state = 42)
model4 = LassoCV(random_state = 42)
model3.fit(X_train, y1_train)
model4.fit(X_train, y2_train)
pred_Lat_3 = model3.predict(X_val)
# NOTE: mean absolute error, despite the 'mse_' prefix.
mse_Lat_3 = mean_absolute_error(y1_val, pred_Lat_3)
pred_Long_3 = model4.predict(X_val)
mse_Long_3 = mean_absolute_error(y2_val, pred_Long_3)
mse_Lat_3, mse_Long_3
# Residuals and their log-scale histograms.
dd_Lat_lasso = pd.DataFrame({'tr':y1_val, 'pred': pred_Lat_3, 'del': (y1_val - pred_Lat_3)}).sort_values(by=['del'])
dd_Long_lasso = pd.DataFrame({'tr':y2_val, 'pred': pred_Long_3, 'del': (y2_val - pred_Long_3)}).sort_values(by=['del'])
fig = plt.figure(1, (12, 12))
ax1 = fig.add_subplot(211)
dd_Lat_lasso['del'].hist(bins = 100, ax = ax1, range = (-0.2,0.2), log = True)
ax2 = fig.add_subplot(212)
dd_Long_lasso['del'].hist(bins = 100, ax = ax2, range = (-0.2,0.2), log= True)
# FIX: the original called plt.figure(figsize=(12,12)) twice in a row,
# creating a stray empty figure — one call suffices.
plt.figure(figsize=(12, 12))
map = Basemap(llcrnrlon=-71.2, llcrnrlat=42.2, urcrnrlon=-70.95, urcrnrlat=42.46, epsg=4269)
map.arcgisimage(service='World_Topo_Map', xpixels=1000, verbose=True)
# 100 random validation points: true (red) vs Lasso prediction (blue).
random.seed(3)
for rr in range(100):
    # FIX: randint's upper bound is inclusive -> len-1 avoids IndexError.
    # FIX: true coordinates from the validation targets, positionally aligned
    # with pred_*_3 (y1[rnd_int] indexed the full set, misaligned).
    rnd_int = random.randint(0, len(dd_Lat_cv) - 1)
    coordinat_true_Lat = y1_val.iloc[rnd_int]
    coordinat_true_Long = y2_val.iloc[rnd_int]
    coordinat_pred_Lat = pred_Lat_3[rnd_int]
    coordinat_pred_Long = pred_Long_3[rnd_int]
    xpt1, ypt1 = map(coordinat_true_Long, coordinat_true_Lat)
    map.scatter(xpt1, ypt1, c='r')
    xpt2, ypt2 = map(coordinat_pred_Long, coordinat_pred_Lat)
    map.scatter(xpt2, ypt2, c='b')
# Lasso intercept and largest coefficient magnitudes per coordinate model.
model3.intercept_
pd.DataFrame({'colum': X_train.columns, 'coef': np.abs(model3.coef_)}).sort_values(['coef'], ascending=False).head(10)
pd.DataFrame({'colum': X_train.columns, 'coef': np.abs(model4.coef_)}).sort_values(['coef'], ascending=False).head(10)
df_total.head()
# Cast object columns to pandas 'category' dtype for CatBoost.
for c in df_total.columns:
    col_type = df_total[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        df_total[c] = df_total[c].astype('category')
X_total = df_total.copy()
# Drop identifiers, location-leaking fields, targets and duplicated join keys.
X_total.drop(['STREET','DISTRICT','REPORTING_AREA','OFFENSE_DESCRIPTION','Lat', 'Long','Location','INCIDENT_NUMBER','OCCURRED_ON_DATE','test_one','Year', 'Month', 'Day'], axis = 1, inplace=True)
X_total = X_total[X_total['OFFENSE_CODE']==3006]
X_train, X_val, y1_train, y1_val, y2_train, y2_val = train_test_split(X_total, y1, y2, random_state = 42)
# Print positional indices of categorical columns (used as cat_features below).
for i, c in enumerate(X_total.columns):
    col_type = df_total[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        print(i)
SCORERS.keys()
# CatBoost on GPU; cat_features are the positional indices printed above.
param = {'iterations':3000, 'depth':10, 'learning_rate':0.03, 'loss_function':'RMSE', 'random_seed' : 42,
'task_type' : "GPU", 'cat_features' : [1,2,5,7,29],'verbose':0}
model_y1 = CatBoostRegressor(**param)
#catboost_pool = Pool(X_train, y1_train, cat_features = [1,2,5,7,29])
model_y1.fit(X_train, y1_train)
# Training curve (log RMSE).
fig = plt.figure(1, (12, 12))
plt.plot(np.log(model_y1.evals_result_['learn']['RMSE']))
%%time
#rsCV_y1 = RandomizedSearchCV(model_y1, param_distributions={'iterations':[100,500,1000,2000], 'depth':[i for i in range(2,15)]}, n_iter=20, n_jobs=1, random_state=42, scoring='neg_median_absolute_error')
#rsCV_y1.fit(X_train, y1_train)
model_y2 = CatBoostRegressor(**param )
model_y2.fit(X_train, y2_train)
%%time
#rsCV_y2 = RandomizedSearchCV(model_y2, param_distributions={'iterations':[100,500,1000,2000], 'depth':[i for i in range(2,15)]}, n_iter=20, n_jobs=1, random_state=42, scoring='neg_median_absolute_error')
#rsCV_y2.fit(X_train, y2_train)
#rsCV_y1.best_params_, rsCV_y1.best_params_
#y1_pred = rsCV_y1.best_estimator_.predict(X_val)
#y2_pred = rsCV_y2.best_estimator_.predict(X_val)
y1_pred = model_y1.predict(X_val)
y2_pred = model_y2.predict(X_val)
y1_pred, y2_pred
# NOTE: mean absolute error, despite the 'mse_' prefix.
mse_Lat_boost = mean_absolute_error(y1_val, y1_pred)
mse_Long_boost = mean_absolute_error(y2_val, y2_pred)
mse_Lat_boost, mse_Long_boost
plt.figure(figsize=(12, 12))
map = Basemap(llcrnrlon=-71.2, llcrnrlat=42.2, urcrnrlon=-70.95, urcrnrlat=42.46, epsg=4269)
#http://server.arcgisonline.com/arcgis/rest/services
#EPSG Number of America is 4269
map.arcgisimage(service='World_Topo_Map', xpixels=1000, verbose=True)
# 100 random validation points: true (red) vs CatBoost prediction (blue).
random.seed(3)
for rr in range(100):
    # FIX: randint's upper bound is inclusive, so len(y1_pred) could raise
    # IndexError -> len-1. FIX: true coordinates from the validation targets,
    # positionally aligned with y*_pred (y1[rnd_int] was misaligned).
    rnd_int = random.randint(0, len(y1_pred) - 1)
    coordinat_true_Lat = y1_val.iloc[rnd_int]
    coordinat_true_Long = y2_val.iloc[rnd_int]
    coordinat_pred_Lat = y1_pred[rnd_int]
    coordinat_pred_Long = y2_pred[rnd_int]
    xpt1, ypt1 = map(coordinat_true_Long, coordinat_true_Lat)
    map.scatter(xpt1, ypt1, c='r')
    xpt2, ypt2 = map(coordinat_pred_Long, coordinat_pred_Lat)
    map.scatter(xpt2, ypt2, c='b')
    # map.plot((xpt1,xpt2),(ypt1,ypt2),c = 'y')
# Top-20 CatBoost feature importances (longitude model).
feat_imp = pd.Series(model_y2.feature_importances_, index=X_train.columns)
feat_imp.nlargest(20).plot(kind='barh', figsize=(8, 10))
X.columns
df = X
# Daily incident counts for this offense code (time series per timestamp,
# counting non-null Lat as a proxy for "one incident").
aggr_df = df.groupby('OCCURRED_ON_DATE')[['Lat']].count()
aggr_df.columns = ['Lat_count']
aggr_df.head()
# Resample to calendar days (missing days sum to 0).
daily_df = aggr_df.resample('D').apply(sum)
daily_df.head(n=10)
fig = plt.figure(figsize=(30,8))
plt.plot(daily_df.index, daily_df.Lat_count)
# Weekly totals for a smoother view.
weekly_df = daily_df.resample('W').apply(sum)
fig = plt.figure(figsize=(30,8))
plt.plot(weekly_df.index, weekly_df.Lat_count)
from fbprophet import Prophet
import logging
logging.getLogger().setLevel(logging.ERROR)
# NOTE: `df` aliases daily_df, so renaming the column to 'y' mutates daily_df
# as well — prepareData() further below relies on that side effect, so the
# alias is kept deliberately.
df = daily_df
df.columns = ['y']
df.index.name = 'ds'
prediction_size = 90  # forecast horizon in days
# FIX: .copy() — the original slice was a view, and the `train_df['ds'] = ...`
# assignment below raised SettingWithCopyWarning and could silently fail.
train_df = df[:-prediction_size].copy()
train_df.tail(n=3)
train_df['ds'] = train_df.index  # Prophet requires explicit 'ds'/'y' columns
m = Prophet()
m.fit(train_df)
future = m.make_future_dataframe(periods=prediction_size)
future.tail(n=3)
forecast = m.predict(future)
forecast.tail(n=3)
# Last 200 actual days plus the forecast with its uncertainty band.
fig = plt.figure(figsize=(30, 8))
plt.plot(df.y.tail(200))
plt.plot(forecast.ds.tail(prediction_size), forecast.yhat.tail(prediction_size))
plt.plot(forecast.ds.tail(prediction_size), forecast.yhat_lower.tail(prediction_size))
plt.plot(forecast.ds.tail(prediction_size), forecast.yhat_upper.tail(prediction_size))
def make_comparison_dataframe(historical, forecast):
    """Join the history with the forecast.

    The resulting dataset will contain columns 'yhat', 'yhat_lower',
    'yhat_upper' and 'y'.
    """
    forecast_part = forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']]
    return forecast_part.join(historical)
# History and forecast side by side.
cmp_df = make_comparison_dataframe(df, forecast)
cmp_df.tail(n=10)
def calculate_forecast_errors(df, prediction_size):
    """Return {'MAE': ...} computed over the last *prediction_size* rows.

    Expects columns 'y' (actual) and 'yhat' (forecast); *df* is not mutated.
    """
    frame = df.copy()
    frame['e'] = frame['y'] - frame['yhat']
    forecast_tail = frame[-prediction_size:]
    mean_abs = np.mean(np.abs(forecast_tail['e']))
    return {'MAE': mean_abs}
# Report forecast error over the held-out horizon.
for err_name, err_value in calculate_forecast_errors(cmp_df, prediction_size).items():
    print(err_name, err_value)
data = pd.DataFrame(df.y.copy())
data.columns = ["y"]
# Adding the lag of the target variable from 6 steps back up to 44
for i in range(6, 45):
    data["lag_{}".format(i)] = data.y.shift(i)
data.tail(7)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
# for time-series cross-validation set 5 folds
tscv = TimeSeriesSplit(n_splits=5)
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error
def mean_absolute_percentage_error(y_true, y_pred):
    """MAPE in percent; divides by y_true, so y_true must be nonzero."""
    relative_errors = np.abs((y_true - y_pred) / y_true)
    return np.mean(relative_errors) * 100
def timeseries_train_test_split(X, y, test_size):
    """
    Perform train-test split with respect to time series structure:
    the newest *test_size* fraction of rows becomes the test set.
    """
    split_at = int(len(X) * (1 - test_size))
    return (X.iloc[:split_at], X.iloc[split_at:],
            y.iloc[:split_at], y.iloc[split_at:])
# Drop rows whose lag features are NaN, then split chronologically.
y_ = data.dropna().y
X_ = data.dropna().drop(['y'], axis=1)
# reserve 30% of data for testing
X_train, X_test, y_train, y_test = timeseries_train_test_split(X_, y_, test_size=0.3)
lr = LinearRegression()
lr.fit(X_train, y_train)
def plotModelResults(model, X_train=X_train, X_test=X_test, plot_intervals=False, plot_anomalies=False):
    """Plot model predictions against the test-set actuals.

    model          -- fitted regressor exposing .predict
    X_train/X_test -- feature matrices (defaults: module-level splits)
    plot_intervals -- draw mean CV error +/- 1.96*std bands around the prediction
    plot_anomalies -- mark actuals falling outside those bands (needs intervals)

    Relies on module-level `y_train`, `y_test` and `tscv`.
    """
    prediction = model.predict(X_test)
    plt.figure(figsize=(15, 7))
    plt.plot(prediction, "g", label="prediction", linewidth=2.0)
    plt.plot(y_test.values, label="actual", linewidth=2.0)
    if plot_intervals:
        # cross-validated MAE and its spread, estimated on the training part
        cv = cross_val_score(model, X_train, y_train,
                             cv=tscv,
                             scoring="neg_mean_absolute_error")
        mae = cv.mean() * (-1)
        deviation = cv.std()
        scale = 1.96  # ~95% band under a normal-error assumption
        lower = prediction - (mae + scale * deviation)
        upper = prediction + (mae + scale * deviation)
        plt.plot(lower, "r--", label="upper bond / lower bond", alpha=0.5)
        plt.plot(upper, "r--", alpha=0.5)
        if plot_anomalies:
            anomalies = np.array([np.NaN] * len(y_test))
            anomalies[y_test < lower] = y_test[y_test < lower]
            anomalies[y_test > upper] = y_test[y_test > upper]
            plt.plot(anomalies, "o", markersize=10, label="Anomalies")
    # BUG FIX: mean_absolute_percentage_error expects (y_true, y_pred); the
    # original passed (prediction, y_test), dividing by the prediction instead
    # of the actual values, so the reported percentage was wrong.
    error = mean_absolute_percentage_error(y_test, prediction)
    plt.title("Mean absolute percentage error {0:.2f}%".format(error))
    plt.legend(loc="best")
    plt.tight_layout()
    plt.grid(True)
def plotCoefficients(model):
    """
    Plots sorted coefficient values of the model
    (reads the module-level X_train for the feature names).
    """
    coef_table = pd.DataFrame(model.coef_, X_train.columns)
    coef_table.columns = ["coef"]
    coef_table["abs"] = coef_table.coef.apply(np.abs)
    coef_table = coef_table.sort_values(by="abs", ascending=False).drop(["abs"], axis=1)
    plt.figure(figsize=(15, 7))
    coef_table.coef.plot(kind='bar')
    plt.grid(True, axis='y')
    plt.hlines(y=0, xmin=0, xmax=len(coef_table), linestyles='dashed')
plotModelResults(lr, plot_intervals=True)
plotCoefficients(lr)
# Add calendar features derived from the datetime index.
data.index = pd.to_datetime(data.index)
data["day"] = data.index.day
data["weekday"] = data.index.weekday
data['is_weekend'] = data.weekday.isin([5,6])*1
data.tail()
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
y_ = data.dropna().y
X_ = data.dropna().drop(['y'], axis=1)
X_train, X_test, y_train, y_test = timeseries_train_test_split(X_, y_, test_size=0.3)
# Scale features; fit on the training part only to avoid leakage.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
plotModelResults(lr, X_train=X_train_scaled, X_test=X_test_scaled, plot_intervals=True)
plotCoefficients(lr)
def code_mean(data, cat_feature, real_feature):
    """Return {category value: mean of *real_feature*} over *data* (target encoding)."""
    grouped_means = data.groupby(cat_feature)[real_feature].mean()
    return dict(grouped_means)
def prepareData(series, lag_start, lag_end, test_size, target_encoding=False):
    """Build lag and calendar features from *series*, then split chronologically.

    series          -- datetime-indexed frame/series with target column 'y'
    lag_start/end   -- lags y(t-lag_start) .. y(t-lag_end-1) are added
    test_size       -- fraction of the newest rows held out for testing
    target_encoding -- replace day/weekday with train-set mean-target encodings

    Depends on module-level ``timeseries_train_test_split`` and ``code_mean``.
    """
    # copy of the initial dataset
    data = pd.DataFrame(series.copy())
    # lags of series
    for lag in range(lag_start, lag_end):
        data["lag_{}".format(lag)] = data.y.shift(lag)
    # datetime features
    data["day"] = data.index.day
    data["weekday"] = data.index.weekday
    data['is_weekend'] = data.weekday.isin([5, 6]) * 1
    if target_encoding:
        # calculate averages on train set only, to avoid leakage
        train_rows = int(len(data.dropna()) * (1 - test_size))
        weekday_means = code_mean(data[:train_rows], 'weekday', "y")
        day_means = code_mean(data[:train_rows], 'day', "y")
        data['weekday_average'] = list(map(weekday_means.get, data.weekday))
        data["day_average"] = list(map(day_means.get, data.day))
        # drop encoded variables
        data.drop(["day", "weekday"], axis=1, inplace=True)
    # train-test split
    clean = data.dropna()
    return timeseries_train_test_split(clean.drop(['y'], axis=1), clean.y, test_size=test_size)
# Mean target per *weekday* (the "Hour averages" title string is misleading —
# the grouping key is weekday, not hour).
average_hour = code_mean(data, 'weekday', "y")
plt.figure(figsize=(7, 5))
plt.title("Hour averages")
pd.DataFrame.from_dict(average_hour, orient='index')[0].plot()
plt.grid(True);
# Full pipeline: lags 6..24 plus calendar features, scaled linear model with
# confidence intervals and anomaly markers.
X_train, X_test, y_train, y_test = prepareData(daily_df, lag_start=6, lag_end=25, test_size=0.3, target_encoding=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
plotModelResults(lr, X_train=X_train_scaled, X_test=X_test_scaled, plot_intervals=True, plot_anomalies=True)
plotCoefficients(lr)
##