%reload_ext autoreload
%autoreload 2
from fastai.tabular import *
To create the feature-engineered train_clean and test_clean from the Kaggle competition data, run rossman_data_clean.ipynb. One important step that deals with time series is this:
add_datepart(train, "Date", drop=False)
add_datepart(test, "Date", drop=False)
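add_datepart expands the Date column into the calendar features that appear in the table below: Year, Month, Week, Day, Dayofweek, Dayofyear, the Is_… boundary flags, and Elapsed (seconds since the Unix epoch), so the model can learn seasonality without parsing dates itself. A rough plain-pandas equivalent (a sketch, not fastai's implementation):

```python
import pandas as pd

# A rough plain-pandas equivalent of add_datepart (a sketch, not fastai's code);
# Week is skipped here since its accessor differs across pandas versions.
df = pd.DataFrame({'Date': pd.to_datetime(['2015-07-31', '2015-08-01'])})
for attr in ['year', 'month', 'day', 'dayofweek', 'dayofyear',
             'is_month_end', 'is_month_start', 'is_quarter_end',
             'is_quarter_start', 'is_year_end', 'is_year_start']:
    df[attr.capitalize()] = getattr(df['Date'].dt, attr)
df['Elapsed'] = df['Date'].astype('int64') // 10**9  # seconds since the Unix epoch
```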
path = Config().data_path()/'rossmann'
train_df = pd.read_pickle(path/'train_clean')
train_df.head().T
  | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
index | 0 | 1 | 2 | 3 | 4 |
Store | 1 | 2 | 3 | 4 | 5 |
DayOfWeek | 5 | 5 | 5 | 5 | 5 |
Date | 2015-07-31 | 2015-07-31 | 2015-07-31 | 2015-07-31 | 2015-07-31 |
Sales | 5263 | 6064 | 8314 | 13995 | 4822 |
Customers | 555 | 625 | 821 | 1498 | 559 |
Open | 1 | 1 | 1 | 1 | 1 |
Promo | 1 | 1 | 1 | 1 | 1 |
StateHoliday | False | False | False | False | False |
SchoolHoliday | 1 | 1 | 1 | 1 | 1 |
Year | 2015 | 2015 | 2015 | 2015 | 2015 |
Month | 7 | 7 | 7 | 7 | 7 |
Week | 31 | 31 | 31 | 31 | 31 |
Day | 31 | 31 | 31 | 31 | 31 |
Dayofweek | 4 | 4 | 4 | 4 | 4 |
Dayofyear | 212 | 212 | 212 | 212 | 212 |
Is_month_end | True | True | True | True | True |
Is_month_start | False | False | False | False | False |
Is_quarter_end | False | False | False | False | False |
Is_quarter_start | False | False | False | False | False |
Is_year_end | False | False | False | False | False |
Is_year_start | False | False | False | False | False |
Elapsed | 1438300800 | 1438300800 | 1438300800 | 1438300800 | 1438300800 |
StoreType | c | a | a | c | a |
Assortment | a | a | a | c | a |
CompetitionDistance | 1270 | 570 | 14130 | 620 | 29910 |
CompetitionOpenSinceMonth | 9 | 11 | 12 | 9 | 4 |
CompetitionOpenSinceYear | 2008 | 2007 | 2006 | 2009 | 2015 |
Promo2 | 0 | 1 | 1 | 0 | 0 |
Promo2SinceWeek | 1 | 13 | 14 | 1 | 1 |
... | ... | ... | ... | ... | ... |
Min_Sea_Level_PressurehPa | 1015 | 1017 | 1017 | 1014 | 1016 |
Max_VisibilityKm | 31 | 10 | 31 | 10 | 10 |
Mean_VisibilityKm | 15 | 10 | 14 | 10 | 10 |
Min_VisibilitykM | 10 | 10 | 10 | 10 | 10 |
Max_Wind_SpeedKm_h | 24 | 14 | 14 | 23 | 14 |
Mean_Wind_SpeedKm_h | 11 | 11 | 5 | 16 | 11 |
Max_Gust_SpeedKm_h | NaN | NaN | NaN | NaN | NaN |
Precipitationmm | 0 | 0 | 0 | 0 | 0 |
CloudCover | 1 | 4 | 2 | 6 | 4 |
Events | Fog | Fog | Fog | NaN | NaN |
WindDirDegrees | 13 | 309 | 354 | 282 | 290 |
StateName | Hessen | Thueringen | NordrheinWestfalen | Berlin | Sachsen |
CompetitionOpenSince | 2008-09-15 | 2007-11-15 | 2006-12-15 | 2009-09-15 | 2015-04-15 |
CompetitionDaysOpen | 2510 | 2815 | 3150 | 2145 | 107 |
CompetitionMonthsOpen | 24 | 24 | 24 | 24 | 3 |
Promo2Since | 1900-01-01 | 2010-03-29 | 2011-04-04 | 1900-01-01 | 1900-01-01 |
Promo2Days | 0 | 1950 | 1579 | 0 | 0 |
Promo2Weeks | 0 | 25 | 25 | 0 | 0 |
AfterSchoolHoliday | 0 | 0 | 0 | 0 | 0 |
BeforeSchoolHoliday | 0 | 0 | 0 | 0 | 0 |
AfterStateHoliday | 57 | 67 | 57 | 67 | 57 |
BeforeStateHoliday | 0 | 0 | 0 | 0 | 0 |
AfterPromo | 0 | 0 | 0 | 0 | 0 |
BeforePromo | 0 | 0 | 0 | 0 | 0 |
SchoolHoliday_bw | 5 | 5 | 5 | 5 | 5 |
StateHoliday_bw | 0 | 0 | 0 | 0 | 0 |
Promo_bw | 5 | 5 | 5 | 5 | 5 |
SchoolHoliday_fw | 7 | 1 | 5 | 1 | 1 |
StateHoliday_fw | 0 | 0 | 0 | 0 | 0 |
Promo_fw | 5 | 1 | 5 | 1 | 1 |
93 rows × 5 columns
n = len(train_df); n
844338
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']
small_train_df = small_train_df[small_cat_vars + small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars + small_cont_vars + ['Sales']]
small_train_df.head()
  | Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales |
---|---|---|---|---|---|---|
267 | 268 | 5 | NaN | 4520.0 | 67 | 7492 |
604 | 606 | 5 | NaN | 2260.0 | 61 | 7187 |
983 | 986 | 5 | Feb,May,Aug,Nov | 620.0 | 61 | 7051 |
1636 | 525 | 4 | NaN | 1870.0 | 55 | 9673 |
2348 | 123 | 3 | NaN | 16760.0 | 50 | 10007 |
small_test_df.head()
  | Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales |
---|---|---|---|---|---|---|
420510 | 829 | 3 | NaN | 110.0 | 55 | 6802 |
420654 | 973 | 3 | Jan,Apr,Jul,Oct | 330.0 | 59 | 6644 |
420990 | 194 | 2 | Feb,May,Aug,Nov | 16970.0 | 55 | 4720 |
421308 | 512 | 2 | Mar,Jun,Sept,Dec | 590.0 | 72 | 6248 |
421824 | 1029 | 2 | NaN | 1590.0 | 64 | 8004 |
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)
small_test_df.head()
  | Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales |
---|---|---|---|---|---|---|
420510 | NaN | 3 | NaN | 110.0 | 55 | 6802 |
420654 | 973.0 | 3 | Jan,Apr,Jul,Oct | 330.0 | 59 | 6644 |
420990 | NaN | 2 | Feb,May,Aug,Nov | 16970.0 | 55 | 4720 |
421308 | 512.0 | 2 | Mar,Jun,Sept,Dec | 590.0 | 72 | 6248 |
421824 | 1029.0 | 2 | NaN | 1590.0 | 64 | 8004 |
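Notice the NaN values in the Store column above: calling categorify with test=True reuses the category list learned from the training set, so stores that never appeared in the 1,000-row training sample cannot be encoded. Internally pandas stores the categories as integer codes, with -1 standing for missing, which is what the cat.codes printout below shows for rows whose PromoInterval is NaN. A self-contained pandas sketch of the test=True behavior (hypothetical data, not fastai's source):

```python
import pandas as pd

# Encode test values with the *training* categories; unseen values become NaN.
train_store = pd.Series([512, 973, 1029]).astype('category')
test_store = pd.Categorical([512, 829, 973],
                            categories=train_store.cat.categories)
print(test_store)  # [512, NaN, 973] -- 829 was never seen in training
```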
small_train_df.PromoInterval.cat.categories
Index(['Feb,May,Aug,Nov', 'Jan,Apr,Jul,Oct', 'Mar,Jun,Sept,Dec'], dtype='object')
small_train_df['PromoInterval'].cat.codes[:5]
267    -1
604    -1
983     0
1636   -1
2348   -1
dtype: int8
fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)
small_train_df[small_train_df['CompetitionDistance_na'] == True]
  | Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales | CompetitionDistance_na |
---|---|---|---|---|---|---|---|
185749 | 622 | 2 | NaN | 2300.0 | 93 | 4508 | True |
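Only one sampled row was missing CompetitionDistance. FillMissing replaced the NaN with the training-set median and recorded the fact in a new boolean CompetitionDistance_na column, which the model can treat as one more categorical signal. Applied to the raw frames, the default strategy in plain pandas is roughly (a sketch, not fastai's source):

```python
# Median-fill plus a marker column; the median comes from the training set only
# and is reused as-is on the test set.
median = small_train_df['CompetitionDistance'].median()
for frame in (small_train_df, small_test_df):
    frame['CompetitionDistance_na'] = frame['CompetitionDistance'].isna()
    frame['CompetitionDistance'] = frame['CompetitionDistance'].fillna(median)
```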
train_df = pd.read_pickle(path/'train_clean')
test_df = pd.read_pickle(path/'test_clean')
len(train_df),len(test_df)
(844338, 41088)
procs=[FillMissing, Categorify, Normalize]
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
'SchoolHoliday_fw', 'SchoolHoliday_bw']
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h',
'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']
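The procs listed above run in order: fill missing values first, then turn categorical columns into integer codes, then standardize the continuous ones. Each processor fits its statistics on the training set and reuses them unchanged on the validation and test sets. For Normalize, that amounts to roughly the following (a plain-pandas sketch of the standard behavior, not fastai's source):

```python
# Standardize each continuous column with training-set statistics only;
# the same means/stds are later applied unchanged to validation and test data.
means = train_df[cont_vars].mean()
stds = train_df[cont_vars].std()
normalized = (train_df[cont_vars] - means) / stds
```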
dep_var = 'Sales'
df = train_df[cat_vars + cont_vars + [dep_var,'Date']].copy()
test_df['Date'].min(), test_df['Date'].max()
('2015-08-01', '2015-09-17')
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()
cut
41395
valid_idx = range(cut)
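The logic of the cut above: train_clean is sorted with the most recent dates first, so the row at position len(test_df) sits one test-set-length back in time, and taking index.max() extends the cut to the end of that calendar day (hence 41,395 rather than exactly 41,088). valid_idx = range(cut) therefore selects a contiguous block of the latest training dates, mimicking the test set in both size and position in time. A quick hypothetical sanity check (not in the original notebook):

```python
# The validation block should cover the most recent dates in the training data,
# immediately preceding the test period that starts on 2015-08-01.
train_df['Date'][:cut].min(), train_df['Date'][:cut].max()
```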
df[dep_var].head()
0     5263
1     6064
2     8314
3    13995
4     4822
Name: Sales, dtype: int64
data = (TabularList.from_df(df, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs,)
.split_by_idx(valid_idx)
.label_from_df(cols=dep_var, label_cls=FloatList, log=True)
.add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars))
.databunch())
doc(FloatList)
max_log_y = np.log(np.max(train_df['Sales'])*1.2)
y_range = torch.tensor([0, max_log_y], device=defaults.device)
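Since the targets are log(Sales), y_range constrains the network's output to (0, log(1.2 × max sales)); the 1.2 factor leaves headroom above the largest value seen in training. fastai applies the range with a scaled sigmoid on the final activation, roughly like this (a sketch of the assumed mechanism):

```python
import torch

# Squash a raw final-layer output into (low, high) with a scaled sigmoid.
def apply_y_range(raw: torch.Tensor, low: float, high: float) -> torch.Tensor:
    return low + (high - low) * torch.sigmoid(raw)
```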
learn = tabular_learner(data, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04,
y_range=y_range, metrics=exp_rmspe)
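Because the labels were built with log=True, the network predicts log-sales and the usual MSE loss operates in log space. The exp_rmspe metric converts back before scoring: it exponentiates predictions and targets, then computes root mean squared percentage error, the metric Kaggle used for this competition. Paraphrased, it looks like this:

```python
import torch

# Root mean squared percentage error on the un-logged values
# (a paraphrase of fastai's exp_rmspe, not its exact source).
def exp_rmspe_sketch(pred: torch.Tensor, targ: torch.Tensor) -> torch.Tensor:
    pred, targ = torch.exp(pred), torch.exp(targ)
    pct_var = (targ - pred) / targ
    return torch.sqrt((pct_var ** 2).mean())
```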
learn.model
TabularModel(
  (embeds): ModuleList(
    (0): Embedding(1116, 81)
    (1): Embedding(8, 5)
    (2): Embedding(4, 3)
    (3): Embedding(13, 7)
    (4): Embedding(32, 11)
    (5): Embedding(3, 3)
    (6): Embedding(26, 10)
    (7): Embedding(27, 10)
    (8): Embedding(5, 4)
    (9): Embedding(4, 3)
    (10): Embedding(4, 3)
    (11): Embedding(24, 9)
    (12): Embedding(9, 5)
    (13): Embedding(13, 7)
    (14): Embedding(53, 15)
    (15): Embedding(22, 9)
    (16): Embedding(7, 5)
    (17): Embedding(7, 5)
    (18): Embedding(4, 3)
    (19): Embedding(4, 3)
    (20): Embedding(9, 5)
    (21): Embedding(9, 5)
    (22): Embedding(3, 3)
    (23): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.04)
  (bn_cont): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=233, out_features=1000, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.001)
    (4): Linear(in_features=1000, out_features=500, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.01)
    (8): Linear(in_features=500, out_features=1, bias=True)
  )
)
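The embedding widths in the printout follow fastai's default size heuristic; the sketch below (an assumption about the v1 rule, checked against the sizes shown) reproduces them:

```python
# fastai's default embedding-width rule of thumb (assumed v1 behavior).
def emb_sz(n_categories: int) -> int:
    return min(600, round(1.6 * n_categories ** 0.56))

emb_sz(1116)  # -> 81, matching Embedding(1116, 81) for Store above
```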
len(data.train_ds.cont_names)
16
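Those 16 continuous columns explain the first Linear layer's in_features=233: the 24 embeddings output 217 values in total, and the 16 batch-normed continuous inputs are concatenated onto them. A quick check against the printout above:

```python
# Widths copied from the Embedding(..., width) entries in learn.model above.
emb_widths = [81, 5, 3, 7, 11, 3, 10, 10, 4, 3, 3, 9, 5, 7, 15, 9,
              5, 5, 3, 3, 5, 5, 3, 3]
sum(emb_widths) + 16  # -> 233, the in_features of the first Linear layer
```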
learn.lr_find()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
learn.recorder.plot()
learn.fit_one_cycle(5, 1e-3, wd=0.2)
epoch | train_loss | valid_loss | exp_rmspe |
---|---|---|---|
1 | 0.023587 | 0.020941 | 0.140551 |
2 | 0.017678 | 0.023431 | 0.132211 |
3 | 0.017453 | 0.016929 | 0.120169 |
4 | 0.012608 | 0.016296 | 0.109245 |
5 | 0.010222 | 0.011238 | 0.105433 |
learn.save('1')
learn.recorder.plot_losses(last=-1)
learn.load('1');
learn.fit_one_cycle(5, 3e-4)
epoch | train_loss | valid_loss | exp_rmspe |
---|---|---|---|
1 | 0.012223 | 0.014312 | 0.116988 |
2 | 0.012001 | 0.017789 | 0.117619 |
3 | 0.011402 | 0.035596 | 0.114396 |
4 | 0.010067 | 0.015125 | 0.113652 |
5 | 0.009148 | 0.031326 | 0.116344 |
learn.fit_one_cycle(5, 3e-4)
epoch | train_loss | valid_loss | exp_rmspe |
---|---|---|---|
1 | 0.011840 | 0.013236 | 0.110483 |
2 | 0.010765 | 0.057664 | 0.129586 |
3 | 0.010101 | 0.042744 | 0.111584 |
4 | 0.008820 | 0.116893 | 0.135458 |
5 | 0.009144 | 0.017969 | 0.126323 |
(For reference, 10th place in the competition scored 0.108 on this metric.)
test_preds = learn.get_preds(DatasetType.Test)
test_df["Sales"] = np.exp(test_preds[0].data).numpy().T[0]  # undo the log transform
test_df[["Id", "Sales"]] = test_df[["Id", "Sales"]].astype("int")
test_df[["Id", "Sales"]].to_csv("rossmann_submission.csv", index=False)