%matplotlib inline
import os
import sys
from glob import glob
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from calendar import day_abbr
import holidays
from fbprophet import Prophet
from sklearn.metrics import mean_absolute_error as MAE
import seaborn as sns
from scipy.stats import skew, skewtest
# location metadata for the Auckland cycle counters; keep only the
# counters recording cyclists (drops the other user types)
loc_counters = pd.read_csv('../data/cycling_Auckland/cycling_counters.csv')
loc_counters = loc_counters.query("user_type == 'Cyclists'")
# display the two Tamaki Drive counters (east-bound and west-bound)
loc_counters.loc[loc_counters.name.str.contains("Tamaki"),:]
name | id | Name.1 | latitude | longitude | site_code | setup_date | user_type | |
---|---|---|---|---|---|---|---|---|
44 | Tamaki Drive EB | 100000827 | Tamaki Drive EB | -36.847782 | 174.78935 | ECO08011685 | 12/11/2009 | Cyclists |
45 | Tamaki Drive WB | 100003810 | Tamaki Drive WB | -36.847942 | 174.78903 | U15G2011813 | 26/03/2012 | Cyclists |
# gather the per-year hourly count files (one CSV per year), concatenate
# them along the time axis, and keep only the two Tamaki Drive counters
lfiles = glob('../data/cycling_Auckland/cycling_counts_????.csv')
lfiles.sort()
l = []
for f in lfiles:
    d = pd.read_csv(f, index_col=0, parse_dates=True)
    l.append(d)
df = pd.concat(l, axis=0)
df = df.loc[:,['Tamaki Drive EB', 'Tamaki Drive WB']]
df.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10bf46128>
Tamaki = df.loc[:,'Tamaki Drive WB'] + df.loc[:,'Tamaki Drive EB']
Tamaki.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1a24121198>
Tamaki = Tamaki.loc['2013':'2018-06-01',]
Tamaki = Tamaki.to_frame(name='Tamaki Drive')
Tamaki.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1a25107fd0>
seas_cycl = Tamaki.rolling(window=30*24, center=True).mean().groupby(Tamaki.index.dayofyear).mean()
f, ax = plt.subplots(figsize=(8,6))
seas_cycl.plot(ax=ax, lw=2, color='k', legend=False)
ax.grid(ls=':')
ax.set_xlabel('day of the year', fontsize=15)
ax.set_ylabel('cyclists number', fontsize=15);
[l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
[l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
ax.set_title('Tamaki Drive: 30 days running average hourly cycling counts', fontsize=15)
f.savefig('../figures/paper/seasonal_cycle.png', dpi=200)
f.savefig('../figures/paper/seasonal_cycle.jpeg', dpi=200)
f.savefig('../figures/paper/seasonal_cycle.pdf')
# average count per (day of week, hour of day): pivot to a 7 x 24 table
hour_week = Tamaki.copy()
hour_week.loc[:,'day_of_week'] = hour_week.index.dayofweek
hour_week.loc[:,'hour'] = hour_week.index.hour
hour_week = hour_week.groupby(['day_of_week','hour']).mean().unstack()
# drop the redundant outer level ('Tamaki Drive') of the column MultiIndex
hour_week.columns = hour_week.columns.droplevel(0)
f, ax = plt.subplots(figsize=(12,6))
sns.heatmap(hour_week, ax = ax, cmap=plt.cm.gray_r, vmax=150)
# ax.grid(ls=':')
[ax.axhline(x, ls=':', lw=0.5, color='0.8') for x in np.arange(1, 7)]
[ax.axvline(x, ls=':', lw=0.5, color='0.8') for x in np.arange(1, 24)];
ax.set_title('number of cyclists per day of week and hour of the day', fontsize=16)
[l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
[l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
ax.set_xlabel('hour of the day', fontsize=15)
ax.set_ylabel('day of the week', fontsize=15)
ax.set_yticklabels(day_abbr[0:7]);
f.savefig('../figures/paper/cyclists_dayofweek_hourofday.png', dpi=200)
f.savefig('../figures/paper/cyclists_dayofweek_hourofday.jpeg', dpi=200)
f.savefig('../figures/paper/cyclists_dayofweek_hourofday.pdf')
summary_hour = Tamaki.groupby(Tamaki.index.hour).describe()
summary_hour.columns = summary_hour.columns.droplevel(0)
weekdays = Tamaki.loc[Tamaki.index.weekday_name.isin(['Monday','Tuesday','Wednesday','Thursday','Friday'])]
weekends = Tamaki.loc[Tamaki.index.weekday_name.isin(['Sunday','Saturday'])]
summary_hour_weekdays = weekdays.groupby(weekdays.index.hour).describe()
summary_hour_weekends = weekends.groupby(weekends.index.hour).describe()
summary_hour_weekdays.columns = summary_hour_weekdays.columns.droplevel(0)
summary_hour_weekends.columns = summary_hour_weekends.columns.droplevel(0)
f, ax = plt.subplots(figsize=(10,7))
ax.plot(summary_hour_weekends.index, summary_hour_weekends.loc[:,'mean'], color='k', label='week ends', ls='--', lw=3)
ax.fill_between(summary_hour_weekends.index, summary_hour_weekends.loc[:,'25%'], \
summary_hour_weekends.loc[:,'75%'], hatch='///', facecolor='0.8', alpha=0.1)
ax.set_xticks(range(24));
ax.grid(ls=':', color='0.8')
# ax.set_title('week-ends', fontsize=16)
ax.plot(summary_hour_weekdays.index, summary_hour_weekdays.loc[:,'mean'], color='k', label='week days', lw=3)
ax.fill_between(summary_hour_weekdays.index, summary_hour_weekdays.loc[:,'25%'], \
summary_hour_weekdays.loc[:,'75%'], hatch='\\\\\\', facecolor='0.8', alpha=0.1)
ax.legend(loc=1 , fontsize=15)
ax.set_xticks(range(24));
ax.grid(ls=':', color='0.8')
ax.set_ylim([0, 200])
ax.set_xlabel('hour of the day', fontsize=15)
ax.set_ylabel('cyclists number', fontsize=15);
[l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
[l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
ax.set_title('Tamaki drive: number of cyclists per hour of the day', fontsize=16)
f.savefig('../figures/paper/daily_cycle.png', dpi=200)
f.savefig('../figures/paper/daily_cycle.jpeg', dpi=200)
f.savefig('../figures/paper/daily_cycle.pdf')
# f, ax = plt.subplots(figsize=(10,7))
# ax.plot(summary_hour_weekends.index, summary_hour_weekends.loc[:,'mean'], color='b', label='week ends')
# ax.fill_between(summary_hour_weekends.index, summary_hour_weekends.loc[:,'25%'], \
# summary_hour_weekends.loc[:,'75%'], color='steelblue', alpha=0.3)
# ax.set_xticks(range(24));
# ax.grid(ls=':', color='0.8')
# # ax.set_title('week-ends', fontsize=16)
# ax.plot(summary_hour_weekdays.index, summary_hour_weekdays.loc[:,'mean'], color='r', label='week days')
# ax.fill_between(summary_hour_weekdays.index, summary_hour_weekdays.loc[:,'25%'], \
# summary_hour_weekdays.loc[:,'75%'], color='coral', alpha=0.3)
# ax.legend(loc=1 , fontsize=15)
# ax.set_xticks(range(24));
# ax.grid(ls=':', color='0.8')
# ax.set_ylim([0, 200])
# ax.set_xlabel('hour of the day', fontsize=15)
# ax.set_ylabel('cyclists number', fontsize=15);
# [l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
# [l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
# ax.set_title('Tamaki drive: number of cyclists per hour of the day', fontsize=16)
# f.savefig('../figures/paper/daily_cycle.png', dpi=200)
# f.savefig('../figures/paper/daily_cycle.jpeg', dpi=200)
# f.savefig('../figures/paper/daily_cycle.pdf')
def median_filter(df, varname = 'Tamaki Drive', window=24, std=3):
dfc = df.copy()
dfc = dfc.loc[:,[varname]]
dfc['median']= dfc[varname].rolling(window, center=True).median()
dfc['std'] = dfc[varname].rolling(window, center=True).std()
dfc.loc[dfc.loc[:,varname] >= dfc['median']+std*dfc['std'], varname] = np.nan
dfc.loc[dfc.loc[:,varname] <= dfc['median']-std*dfc['std'], varname] = np.nan
return dfc.loc[:, varname]
dfc = Tamaki.copy()
dfc.loc[:,'Tamaki Drive, Filtered'] = median_filter(dfc)
dfc.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1a257c90f0>
df_filtered = dfc.loc[:,['Tamaki Drive, Filtered']].resample('1D').sum()
data = df_filtered.loc['2013':,['Tamaki Drive, Filtered']]
f, ax = plt.subplots(figsize=(12,8))
data.plot(ax=ax, color='0.2')
data.rolling(window=30, center=True).mean().plot(ax=ax, ls='-', lw=3, color='0.6')
ax.grid(ls=':')
ax.legend(['daily values','30 days running average'], frameon=False, fontsize=14)
[l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
[l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
ax.set_xlabel('date', fontsize=15)
ax.set_ylabel('cyclists number', fontsize=15);
ax.axvline('2017', color='0.8', lw=8, zorder=-1)
f.savefig('../figures/paper/cycling_counts_Tamaki_drive.png', dpi=200)
f.savefig('../figures/paper/cycling_counts_Tamaki_drive.jpeg', dpi=200)
f.savefig('../figures/paper/cycling_counts_Tamaki_drive.pdf', dpi=200)
data = data.rename({'Tamaki Drive, Filtered':'y'}, axis=1)
data.head()
y | |
---|---|
datetime | |
2013-01-01 | 1163.0 |
2013-01-02 | 1112.0 |
2013-01-03 | 527.0 |
2013-01-04 | 1045.0 |
2013-01-05 | 1422.0 |
def add_regressor(data, regressor, varname=None):
    """
    Return a copy of `data` (a dataframe of targets) with the column
    `varname` taken from `regressor` added to it, aligned on the index.
    """
    augmented = data.copy()
    augmented[varname] = regressor.loc[:, varname]
    return augmented
def prepare_data(data, year=2017):
    """
    Prepare the data for ingestion by fbprophet:

    1) split into training and test sets: the training set runs up to
       (and including) `year` - 1, the test set starts at `year`
    2) move the datetime index into a column and rename it `ds`
       (the column name fbprophet expects)

    Parameters
    ----------
    data : pandas.DataFrame
        dataframe with a DatetimeIndex named 'datetime'
    year : int
        first year of the test set

    Returns
    -------
    data_train, data_test : pandas.DataFrame
        the training and test dataframes
    """
    # explicit .copy() so reset_index(inplace=True) below operates on
    # independent frames: calling it on a .loc slice can raise a
    # SettingWithCopyWarning and risks mutating a view of the caller's data
    data_train = data.loc[:str(year - 1), :].copy()
    data_test = data.loc[str(year):, :].copy()
    data_train.reset_index(inplace=True)
    data_test.reset_index(inplace=True)
    data_train = data_train.rename({'datetime':'ds'}, axis=1)
    data_test = data_test.rename({'datetime':'ds'}, axis=1)
    return data_train, data_test
def make_verif(forecast, data_train, data_test):
    """
    Put together the forecast (coming from fbprophet) and the observed
    data, and set the index to be a proper datetime index, for plotting.

    Note: the indexes of the three input dataframes are set in place.
    """
    forecast.index = pd.to_datetime(forecast.ds)
    data_train.index = pd.to_datetime(data_train.ds)
    data_test.index = pd.to_datetime(data_test.ds)
    observed = pd.concat([data_train, data_test], axis=0)
    # index-aligned assignment: the observed 'y' lands on matching dates
    forecast.loc[:, 'y'] = observed.loc[:, 'y']
    return forecast
def plot_verif(verif, year=2017):
    """
    Plot the forecasts and the observed data; the `year` parameter marks
    the division between the training and the test sets.

    Training data is drawn in black / steelblue, test data in red /
    coral, each with its fbprophet uncertainty interval shaded.
    """
    f, ax = plt.subplots(figsize=(10, 8))
    periods = [
        (verif.loc[:str(year - 1), :], 'ko', 'steelblue'),
        (verif.loc[str(year):, :], 'ro', 'coral'),
    ]
    for subset, obs_marker, model_color in periods:
        ax.plot(subset.index, subset.y, obs_marker, markersize=3)
        ax.plot(subset.index, subset.yhat, color=model_color, lw=0.5)
        ax.fill_between(subset.index, subset.yhat_lower, subset.yhat_upper,
                        color=model_color, alpha=0.3)
    ax.axvline(str(year), color='0.8', alpha=0.7)
    ax.grid(ls=':', lw=0.5)
    return f
def plot_verif_component(verif, component='rain', year=2017):
    """
    Plot a specific component of the model, expressed in percent;
    the `year` parameter marks the training / test set division.
    """
    f, ax = plt.subplots(figsize=(10, 8))
    splits = [
        (verif.loc[:str(year - 1), :], 'steelblue'),
        (verif.loc[str(year):, :], 'coral'),
    ]
    for subset, color in splits:
        # multiplicative components are fractions: x 100 to plot percent
        ax.plot(subset.index, subset.loc[:, component] * 100, color=color, lw=1)
        ax.fill_between(subset.index,
                        subset.loc[:, component + '_lower'] * 100,
                        subset.loc[:, component + '_upper'] * 100,
                        color=color, alpha=0.3)
    ax.axvline(str(year), color='0.8', alpha=0.7)
    ax.grid(ls=':', lw=0.5)
    return f
def add_regressor_to_future(future, regressors_list):
    """
    Add extra regressors to a `future` dataframe created by fbprophet.

    `regressors_list` is a list of date-indexed dataframes; they are
    concatenated column-wise and joined on the dates in `future.ds`
    (an inner merge, so dates missing a regressor value are dropped).
    """
    out = future.copy()
    out.index = pd.to_datetime(out.ds)
    combined = pd.concat(regressors_list, axis=1)
    out = out.merge(combined, left_index=True, right_index=True)
    return out.reset_index(drop=True)
data_train, data_test = prepare_data(data, 2017)
The first step in fbprophet is to instantiate the model, it is there that you can set the prior scales
for each component of your time-series, as well as the number of Fourier series to use to model the cyclic components.
A general rule is that larger prior scales and larger number of Fourier series will make the model more flexible, but at the potential cost of generalisation: i.e. the model might overfit, learning the noise (rather than the signal) in the training data, but giving poor results when applied to yet unseen data (the test data)... setting these hyperparameters can be more an art than a science ...
m = Prophet(mcmc_samples=300, changepoint_prior_scale=0.01, seasonality_mode='multiplicative', \
yearly_seasonality=10, \
weekly_seasonality=True, \
daily_seasonality=False)
m.fit(data_train)
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/pystan/misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. elif np.issubdtype(np.asarray(v).dtype, float): /Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/pystan/misc.py:456: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec() if "chain_id" in inspect.getargspec(init).args:
<fbprophet.forecaster.Prophet at 0x1a29c1e828>
The `future` dataframe
future = m.make_future_dataframe(periods=len(data_test), freq='1D')
future.head()
ds | |
---|---|
0 | 2013-01-01 |
1 | 2013-01-02 |
2 | 2013-01-03 |
3 | 2013-01-04 |
4 | 2013-01-05 |
future.tail()
ds | |
---|---|
1973 | 2018-05-28 |
1974 | 2018-05-29 |
1975 | 2018-05-30 |
1976 | 2018-05-31 |
1977 | 2018-06-01 |
forecast = m.predict(future)
Components of the forecast (trend + cyclic components [yearly seasonality, weekly seasonality] at this stage)
f = m.plot_components(forecast)
verif = make_verif(forecast, data_train, data_test)
f = plot_verif(verif)
def plot_joint_plot(verif, x='yhat', y='y', title=None, fpath = '../figures/paper', fname = None):
    """
    Seaborn joint plot of the model estimates against the observations,
    annotated with Pearson's R and the MAE.

    Parameters
    ----------
    verif : pandas.DataFrame
        dataframe holding (at least) the columns `x` and `y`
    x : str
        column holding the model estimates (plotted on the x axis)
    y : str
        column holding the observations (plotted on the y axis)
    title : str, optional
        title set on the marginal-distribution axes
    fpath : str
        directory where the figure is saved when `fname` is given
    fname : str, optional
        basename of the saved figure (png, jpeg and pdf are all written)
    """
    # bug fix: the x and y parameters were previously ignored — the
    # column names were hard-coded to 'yhat' and 'y' everywhere below
    g = sns.JointGrid(x=x, y=y, data = verif)
    g = g.plot(sns.regplot, sns.distplot)
    g.fig.set_figwidth(6)
    g.fig.set_figheight(6)
    ax = g.fig.axes[1]
    if title is not None:
        ax.set_title(title, fontsize=16)
    ax = g.fig.axes[0]
    ax.set_xlim([-5, None])
    ax.set_ylim([-5, None])
    annotation = "R = {:+4.2f}\nMAE = {:4.1f}".format(
        verif.loc[:, [y, x]].corr().iloc[0, 1],
        MAE(verif.loc[:, y].values, verif.loc[:, x].values))
    ax.text(100, 2000, annotation, fontsize=16)
    ax.set_xlabel("model's estimate", fontsize=15)
    ax.set_ylabel("observation", fontsize=15)
    ax.grid(ls=':')
    [l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
    [l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
    if fname is not None:
        for ext in ['png','jpeg','pdf']:
            g.fig.savefig(os.path.join(fpath, "{}.{}".format(fname, ext)), dpi=200)
plot_joint_plot(verif.loc[:'2017',:], title='train set', fname='train_set_joint_plot_no_climate')
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
plot_joint_plot(verif.loc['2017':,:], title='test set', fname='test_set_joint_plot_no_climate')
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
verif.loc['2017':,['y','yhat']].corr()
y | yhat | |
---|---|---|
y | 1.000000 | 0.452255 |
yhat | 0.452255 | 1.000000 |
MAE(verif.y.values, verif.yhat.values)
229.47578841652154
# build the fbprophet `holidays` dataframe: one row per (date, name) for
# the public holidays of the Auckland (AUK) province, 2013 through 2018
holidays_df = pd.DataFrame([], columns = ['ds','holiday'])
ldates = []
lnames = []
for date, name in sorted(holidays.NZ(prov='AUK', years=np.arange(2013, 2018 + 1)).items()):
    # print(date, name)
    ldates.append(date)
    lnames.append(name)
ldates = np.array(ldates)
lnames = np.array(lnames)
holidays_df.loc[:,'ds'] = ldates
holidays_df.loc[:,'holiday'] = lnames
holidays_df.tail()
ds | holiday | |
---|---|---|
67 | 2018-04-25 | Anzac Day |
68 | 2018-06-04 | Queen's Birthday |
69 | 2018-10-22 | Labour Day |
70 | 2018-12-25 | Christmas Day |
71 | 2018-12-26 | Boxing Day |
holidays_df.holiday.unique()
array(["New Year's Day", "Day after New Year's Day", 'Auckland Anniversary Day', 'Waitangi Day', 'Good Friday', 'Easter Monday', 'Anzac Day', "Queen's Birthday", 'Labour Day', 'Christmas Day', 'Boxing Day', 'Anzac Day (Observed)', 'Boxing Day (Observed)', "Day after New Year's Day (Observed)", 'Waitangi Day (Observed)', 'Christmas Day (Observed)', "New Year's Day (Observed)"], dtype=object)
def remove_obs(x):
    """Strip the ' (Observed)' marker from a holiday name, if present."""
    marker = ' (Observed)'
    return x.replace(marker, '')
holidays_df.loc[:,'holiday'] = holidays_df.loc[:,'holiday'].apply(remove_obs)
# holidays_df.loc[:,'lower_window'] = -1
# holidays_df.loc[:,'upper_window'] = 1
# holidays_df.loc[holidays_df.holiday == 'Christmas Day','lower_window'] = -3
m = Prophet(mcmc_samples=300, holidays=holidays_df, holidays_prior_scale=0.25, changepoint_prior_scale=0.01, seasonality_mode='multiplicative', \
yearly_seasonality=10, \
weekly_seasonality=True, \
daily_seasonality=False)
m.fit(data_train)
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/fbprophet/forecaster.py:253: FutureWarning: 'ds' is both an index level and a column label. Defaulting to column, but this will raise an ambiguity error in a future version df = df.sort_values('ds') /Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/pystan/misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. elif np.issubdtype(np.asarray(v).dtype, float): /Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/pystan/misc.py:456: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec() if "chain_id" in inspect.getargspec(init).args:
<fbprophet.forecaster.Prophet at 0x1a2c9275f8>
future = m.make_future_dataframe(periods=len(data_test), freq='1D')
forecast = m.predict(future)
f = m.plot_components(forecast)
verif = make_verif(forecast, data_train, data_test)
f = plot_verif(verif)
plot_joint_plot(verif.loc[:'2017',:], title='train set', fname='train_set_joint_plot_no_climate_holidays')
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
plot_joint_plot(verif.loc['2017':,:], title='test set', fname='test_set_joint_plot_no_climate_holidays')
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
(verif.loc['2017':,'yhat'] - verif.loc['2017':,'y']).mean()
7.28876082009843
f, ax = plt.subplots(figsize=(8,8))
sns.distplot((verif.loc['2017':,'yhat'] - verif.loc['2017':,'y']), ax=ax)
ax.grid(ls=':')
ax.set_xlabel('residuals', fontsize=15)
ax.set_ylabel("normalised frequency", fontsize=15)
ax.grid(ls=':')
[l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
[l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()];
ax.text(0.05, 0.9, "Skewness = {:+4.2f}\nMedian = {:+4.2f}".\
format(skew(verif.loc['2017':,'yhat'] - verif.loc['2017':,'y']), (verif.loc['2017':,'yhat'] - verif.loc['2017':,'y']).median()), \
fontsize=14, transform=ax.transAxes)
ax.axvline(0, color='0.4')
ax.set_title('Residuals distribution (test set)', fontsize=17)
f.savefig('../figures/paper/residuals_distribution_test_set_no_climate.jpeg', dpi=200)
f.savefig('../figures/paper/residuals_distribution_test_set_no_climate.pdf')
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
temp = pd.read_csv('../data/weather/hourly/commute/temp_day.csv', index_col=0, parse_dates=True)
temp = temp.loc[:,['Tmin(C)']]
temp.columns = ['temp']
temp.head()
temp | |
---|---|
2012-01-01 | 19.807143 |
2012-01-02 | 18.000000 |
2012-01-03 | 19.335714 |
2012-01-04 | 19.307143 |
2012-01-05 | 19.978571 |
rain = pd.read_csv('../data/weather/hourly/commute/rain_day.csv', index_col=0, parse_dates=True)
rain = rain.loc[:,['Amount(mm)']]
rain.columns = ['rain']
rain.head()
rain | |
---|---|
2012-01-01 | 0.000000 |
2012-01-02 | 0.000000 |
2012-01-03 | 0.028571 |
2012-01-04 | 0.185714 |
2012-01-05 | 0.014286 |
sun = pd.read_csv('../data/weather/hourly/commute/sun_day.csv', index_col=0, parse_dates=True)
sun.columns = ['sun']
sun.head()
sun | |
---|---|
2012-01-01 | 0.078571 |
2012-01-02 | 0.128571 |
2012-01-03 | 0.321429 |
2012-01-04 | 0.128571 |
2012-01-05 | 0.378571 |
wind = pd.read_csv('../data/weather/hourly/commute/wind_day.csv', index_col=0, parse_dates=True)
wind = wind.loc[:,['Speed(m/s)']]
wind.columns = ['wind']
wind.head()
wind | |
---|---|
2011-01-01 | 8.464286 |
2011-01-02 | 3.857143 |
2011-01-03 | 3.871429 |
2011-01-04 | 2.392857 |
2011-01-05 | 6.621429 |
temp = temp.loc['2013':'2018-06-01',:]
rain = rain.loc['2013':'2018-06-01',:]
sun = sun.loc['2013':'2018-06-01',:]
wind = wind.loc['2013':'2018-06-01',:]
temp = temp.interpolate(method='linear')
rain = rain.interpolate(method='linear')
sun = sun.interpolate(method='linear')
wind = wind.interpolate(method='linear')
data_with_regressors = add_regressor(data, temp, varname='temp')
data_with_regressors = add_regressor(data_with_regressors, rain, varname='rain')
data_with_regressors = add_regressor(data_with_regressors, sun, varname='sun')
data_with_regressors = add_regressor(data_with_regressors, wind, varname='wind')
data_with_regressors.head()
y | temp | rain | sun | wind | |
---|---|---|---|---|---|
datetime | |||||
2013-01-01 | 1163.0 | 20.000000 | 0.000000 | 0.950000 | 6.100000 |
2013-01-02 | 1112.0 | 20.342857 | 0.000000 | 0.535714 | 4.428571 |
2013-01-03 | 527.0 | 16.278571 | 0.228571 | 0.014286 | 4.728571 |
2013-01-04 | 1045.0 | 17.635714 | 0.000000 | 0.742857 | 8.978571 |
2013-01-05 | 1422.0 | 19.592857 | 0.000000 | 0.964286 | 6.185714 |
data_with_regressors.tail()
y | temp | rain | sun | wind | |
---|---|---|---|---|---|
datetime | |||||
2018-05-28 | 1107.0 | 8.750000 | 0.0 | 0.271429 | 3.200000 |
2018-05-29 | 1464.0 | 7.764286 | 0.0 | 0.671429 | 2.571429 |
2018-05-30 | 1298.0 | 7.614286 | 0.0 | 0.621429 | 2.378571 |
2018-05-31 | 1239.0 | 8.192857 | 0.0 | 0.678571 | 2.057143 |
2018-06-01 | 1196.0 | 9.085714 | 0.0 | 0.635714 | 2.178571 |
data_train, data_test = prepare_data(data_with_regressors, 2017)
m = Prophet(mcmc_samples=300, holidays=holidays_df, holidays_prior_scale=0.25, changepoint_prior_scale=0.01, seasonality_mode='multiplicative', \
yearly_seasonality=10, \
weekly_seasonality=True, \
daily_seasonality=False)
m.add_regressor('temp', prior_scale=0.5, mode='multiplicative')
m.add_regressor('rain', prior_scale=0.5, mode='multiplicative')
m.add_regressor('sun', prior_scale=0.5, mode='multiplicative')
m.add_regressor('wind', prior_scale=0.5, mode='multiplicative')
<fbprophet.forecaster.Prophet at 0x1a2c7538d0>
m.fit(data_train)
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/pystan/misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. elif np.issubdtype(np.asarray(v).dtype, float): /Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/pystan/misc.py:456: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec() if "chain_id" in inspect.getargspec(init).args:
<fbprophet.forecaster.Prophet at 0x1a2c7538d0>
future = m.make_future_dataframe(periods=len(data_test), freq='1D')
futures = add_regressor_to_future(future, [temp, rain, sun, wind])
futures.head()
ds | temp | rain | sun | wind | |
---|---|---|---|---|---|
0 | 2013-01-01 | 20.000000 | 0.000000 | 0.950000 | 6.100000 |
1 | 2013-01-02 | 20.342857 | 0.000000 | 0.535714 | 4.428571 |
2 | 2013-01-03 | 16.278571 | 0.228571 | 0.014286 | 4.728571 |
3 | 2013-01-04 | 17.635714 | 0.000000 | 0.742857 | 8.978571 |
4 | 2013-01-05 | 19.592857 | 0.000000 | 0.964286 | 6.185714 |
forecast = m.predict(futures)
f = m.plot(forecast)
f = m.plot_components(forecast)
verif = make_verif(forecast, data_train, data_test)
verif.head()
ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | Anzac Day | Anzac Day_lower | Anzac Day_upper | Auckland Anniversary Day | ... | wind_lower | wind_upper | yearly | yearly_lower | yearly_upper | additive_terms | additive_terms_lower | additive_terms_upper | yhat | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ds | |||||||||||||||||||||
2013-01-01 | 2013-01-01 | 1042.856628 | 854.508117 | 1372.145032 | 1024.786699 | 1059.903650 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -0.046626 | -0.041785 | -0.025330 | -0.056368 | 0.006249 | 0.0 | 0.0 | 0.0 | 1121.175234 | 1163.0 |
2013-01-02 | 2013-01-02 | 1042.853520 | 901.818118 | 1424.948717 | 1024.808524 | 1059.853658 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.023787 | 0.026542 | -0.015524 | -0.045850 | 0.016498 | 0.0 | 0.0 | 0.0 | 1161.814429 | 1112.0 |
2013-01-03 | 2013-01-03 | 1042.850412 | 612.377785 | 1077.206538 | 1024.883619 | 1059.849853 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.012017 | 0.013409 | -0.004755 | -0.034417 | 0.026177 | 0.0 | 0.0 | 0.0 | 839.107142 | 527.0 |
2013-01-04 | 2013-01-04 | 1042.847304 | 728.770108 | 1207.449677 | 1024.951953 | 1059.823964 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -0.172637 | -0.154715 | 0.006833 | -0.023277 | 0.037234 | 0.0 | 0.0 | 0.0 | 976.471694 | 1045.0 |
2013-01-05 | 2013-01-05 | 1042.844196 | 1143.669635 | 1624.216346 | 1025.016418 | 1059.764282 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -0.050378 | -0.045148 | 0.019080 | -0.011171 | 0.049472 | 0.0 | 0.0 | 0.0 | 1375.987169 | 1422.0 |
5 rows × 71 columns
# clip negative forecasts to zero: cyclist counts cannot be negative
# NOTE(review): Series.clip_lower is deprecated in newer pandas;
# switch to .clip(lower=0) when upgrading
verif.loc[:,'yhat'] = verif.yhat.clip_lower(0)
verif.loc[:,'yhat_lower'] = verif.yhat_lower.clip_lower(0)
f = plot_verif(verif)
plot_joint_plot(verif.loc[:'2017',:], title='train set', fname='train_set_joint_plot_climate')
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
plot_joint_plot(verif.loc['2017':,:], title='test set', fname='test_set_joint_plot_no_climate')
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
residuals = verif.loc['2017':,'yhat'] - verif.loc['2017':,'y']
f, ax = plt.subplots(figsize=(8,8))
sns.distplot(residuals, ax=ax)
ax.grid(ls=':')
ax.set_xlabel('residuals', fontsize=15)
ax.set_ylabel("normalised frequency", fontsize=15)
ax.grid(ls=':')
[l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
[l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()];
ax.set_title('Residuals distribution (test set)', fontsize=17)
ax.text(0.05, 0.85, "Skewness = {:+4.2f}\nMedian = {:+4.2f}\nMean = {:+4.2f}".\
format(skew(residuals), residuals.median(), residuals.mean()), \
fontsize=14, transform=ax.transAxes)
f.savefig('../figures/paper/residuals_distribution_test_set_climate.png', dpi=200)
f.savefig('../figures/paper/residuals_distribution_test_set_climate.jpeg', dpi=200)
f.savefig('../figures/paper/residuals_distribution_test_set_climate.pdf')
/Users/nicolasf/anaconda3/envs/PANGEO/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
The forecast (`yhat`, dashed black line) and the observed values (`y`, solid grey line) for the first 8 months of 2017
f, ax = plt.subplots(figsize=(14,8))
verif.loc['2017-01-01':'2017-08-31',['y']].plot(lw=5, ax=ax, color='0.7', ls='-')
verif.loc['2017-01-01':'2017-08-31',['yhat']].plot(lw=3, ax=ax, color='k', ls='--')
ax.grid(ls=':')
ax.legend(['observations','forecast'], fontsize=15)
[l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
[l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
ax.set_ylabel('cyclists number', fontsize=15)
ax.set_xlabel('date', fontsize=15)
for ext in ['png','jpeg','pdf']:
f.savefig('../figures/paper/forecasts_obs_2017-08.{}'.format(ext), dpi=200)
corr = verif.loc[:,['y','yhat']].rolling(window=90, center=True).corr().iloc[0::2,1]
corr.index = corr.index.droplevel(1)
f, ax = plt.subplots(figsize=(10, 8))
corr.plot(ax=ax, lw=2, color='k')
ax.axhline(0.8, color='0.8', zorder=-1)
ax.axhline(0.6, color='0.8', zorder=-1)
ax.axvline('2017', color='k', zorder=-1)
ax.grid(ls=':')
ax.set_ylim([0.5, 0.9])
ax.set_xlabel('date', fontsize=15)
ax.set_ylabel("Pearson's R", fontsize=15)
ax.grid(ls=':')
[l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
[l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
ax.set_yticks(np.arange(0.5, 1., 0.1));
ax.set_title('90 days running window correlation\nbetween observed and modelled / predicted values', fontsize=15)
f.savefig('../figures/paper/moving_corr.png', dpi=200)
f.savefig('../figures/paper/moving_corr.jpeg', dpi=200)
f.savefig('../figures/paper/moving_corr.pdf')
corr_season_test = verif.loc['2017':,['y','yhat']].groupby(verif.loc['2017':,:].index.month).corr()
corr_season_train = verif.loc[:'2017',['y','yhat']].groupby(verif.loc[:'2017',:].index.month).corr()
corr_season = verif.loc[:,['y','yhat']].groupby(verif.loc[:,:].index.month).corr()
f, ax = plt.subplots(figsize=(8,8))
corr_season_train.xs('y', axis=0, level=1)['yhat'].plot(ax=ax, lw=3, marker='o', markersize=12, label='train set', ls='-', color='k')
corr_season_test.xs('y', axis=0, level=1)['yhat'].plot(ax=ax, lw=3, marker='o', markersize=12, label='test set', ls='--', color='k')
# corr_season.xs('y', axis=0, level=1)['yhat'].plot(ax=ax, lw=3, marker='o', markersize=12)
ax.legend(fontsize=17, loc=3)
ax.set_xticks(range(1, 13))
ax.set_xticklabels(list('JFMAMJJASOND'))
ax.set_xlabel('month', fontsize=15)
ax.set_ylabel("Pearson's R", fontsize=15)
ax.grid(ls=':')
[l.set_fontsize(13) for l in ax.xaxis.get_ticklabels()]
[l.set_fontsize(13) for l in ax.yaxis.get_ticklabels()]
ax.set_title('correlation per month', fontsize=17)
f.savefig('../figures/paper/correlation_obs_pred_per_month.png', dpi=200)
f.savefig('../figures/paper/correlation_obs_pred_per_month.jpeg', dpi=200)
f.savefig('../figures/paper/correlation_obs_pred_per_month.pdf')
f = plot_verif_component(verif, component = 'sun')
f = plot_verif_component(verif, component = 'rain')
f = plot_verif_component(verif, component = 'wind')
f = plot_verif_component(verif, component = 'extra_regressors_multiplicative')
verif.loc['2016-12-31':,'extra_regressors_multiplicative'].abs().mean()
0.1639002173526798
f = plot_verif_component(verif.loc['2016-12-31':,:], component = 'extra_regressors_multiplicative')
f = plot_verif_component(verif.loc['2016-12-31':,:], component = 'rain')
f = plot_verif_component(verif.loc['2016-12-31':,:], component = 'wind')