%reload_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import pandas as pd
from pathlib import Path
path = Path('data')
path.mkdir(parents=True, exist_ok=True)
path
PosixPath('data')
#!wget http://files.fast.ai/part2/lesson14/rossmann.tgz -P {path}
#!tar -xvzf {path}/rossmann.tgz -C {path}
#!rm {path}/rossmann.tgz
#!rm {path}/sample_submission.csv
table_paths = sorted(list(path.glob('*.csv')))
table_paths
[PosixPath('data/googletrend.csv'), PosixPath('data/state_names.csv'), PosixPath('data/store.csv'), PosixPath('data/store_states.csv'), PosixPath('data/test.csv'), PosixPath('data/train.csv'), PosixPath('data/weather.csv')]
tables = [pd.read_csv(table_path, low_memory=False) for table_path in table_paths]
googletrend_df, state_names_df, store_df, store_states_df, test_df, train_df, weather_df = tables
train_df.shape, test_df.shape
((1017209, 9), (41088, 8))
train_df['Date'] = pd.to_datetime(train_df.Date)
train_df.Date.min(), train_df.Date.max()
(Timestamp('2013-01-01 00:00:00'), Timestamp('2015-07-31 00:00:00'))
test_df['Date'] = pd.to_datetime(test_df.Date)
test_df.Date.min(), test_df.Date.max()
(Timestamp('2015-08-01 00:00:00'), Timestamp('2015-09-17 00:00:00'))
train_df.columns.difference(test_df.columns)
Index(['Customers', 'Sales'], dtype='object')
test_df.drop('Id', axis=1, inplace=True)
test_df['Customers'] = np.nan
test_df['Sales'] = np.nan
# Merge train and test so the same preprocessing is applied to both at once.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported API
# and behaves identically with these arguments.
df = pd.concat([train_df, test_df], sort=False, ignore_index=True, verify_integrity=True)
df.Date.min(), df.Date.max()
(Timestamp('2013-01-01 00:00:00'), Timestamp('2015-09-17 00:00:00'))
# Encode the 0/1 indicator columns as booleans.
# NOTE(review): NaN != 0 evaluates True, so the test rows' missing `Open`
# values come out as "open" — presumably intentional; confirm against the
# downstream filtering.
df['Open'] = df['Open'] != 0
df['Promo'] = df['Promo'] != 0
df['StateHoliday'] = df['StateHoliday'] != '0' # object dtype: anything other than the string '0' marks a holiday
df['SchoolHoliday'] = df['SchoolHoliday'] != 0
import re
def add_datepart(df, fldname='Date'):
    """Decompose the datetime column `fldname` into categorical-friendly parts.

    Adds Year/Month/Week/Day/... columns, prefixed by whatever precedes
    'Date' in `fldname` (e.g. 'SaleDate' -> 'SaleYear'), plus an 'Elapsed'
    column of Unix seconds. Mutates `df` in place; returns None.
    """
    date_col = df[fldname]
    # Strip a trailing 'Date'/'date' so the generated columns get a sensible prefix
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end',
            'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    for n in attr:
        if n == 'Week' and not hasattr(date_col.dt, 'week'):
            # Series.dt.week was removed in pandas 2.0; ISO calendar week is
            # the documented replacement (same values, UInt32 -> cast to int64)
            df[targ_pre + n] = date_col.dt.isocalendar().week.astype(np.int64)
        else:
            df[targ_pre + n] = getattr(date_col.dt, n.lower())
    # Unix timestamp in seconds (int64 nanoseconds floor-divided to seconds)
    df[targ_pre + 'Elapsed'] = date_col.astype(np.int64) // 10 ** 9
add_datepart(df)
df.drop('DayOfWeek', axis=1, inplace=True)
df.query('Date==20141231').iloc[0]
Store 1 Date 2014-12-31 00:00:00 Sales 2605 Customers 327 Open True Promo False StateHoliday False SchoolHoliday True Year 2014 Month 12 Week 1 Day 31 Dayofweek 2 Dayofyear 365 Is_month_end True Is_month_start False Is_quarter_end True Is_quarter_start False Is_year_end True Is_year_start False Elapsed 1419984000 Name: 236380, dtype: object
base_columns = df.columns
def left_outer_join(left, right, left_on, right_on=None, suffixes=('', '_y')):
    """Left-join `right` onto `left`.

    Every row of `left` is kept; rows with no match get NaN in the
    right-hand columns. `right_on` defaults to `left_on`, and `suffixes`
    disambiguates duplicated column names (the left side keeps its
    original names by default).
    """
    right_on = left_on if right_on is None else right_on
    merged = left.merge(right, how='left',
                        left_on=left_on, right_on=right_on,
                        suffixes=suffixes)
    return merged
# Populate stores with state name
store_df = left_outer_join(store_df, store_states_df, "Store")
store_df.State.isnull().sum()
0
store_df['Promo2'] = store_df['Promo2'] != 0
# Merge sales data with store information
df = left_outer_join(df, store_df, "Store")
df.StoreType.isnull().sum()
0
df[df.columns.difference(base_columns)].iloc[0]
Assortment a CompetitionDistance 1270 CompetitionOpenSinceMonth 9 CompetitionOpenSinceYear 2008 Promo2 False Promo2SinceWeek NaN Promo2SinceYear NaN PromoInterval NaN State HE StoreType c Name: 0, dtype: object
base_columns = df.columns
store_states_df['State'].unique()
array(['HE', 'TH', 'NW', 'BE', 'SN', 'SH', 'HB,NI', 'BY', 'BW', 'RP', 'ST', 'HH'], dtype=object)
# Extract date and state for merging purposes
googletrend_df['Date'] = googletrend_df.week.str.split(' - ', expand=True)[0]
googletrend_df['State'] = googletrend_df.file.str.split('_', expand=True)[2]
# Match the rest of the data
googletrend_df.loc[googletrend_df.State=='NI', "State"] = 'HB,NI'
googletrend_df['Date'] = pd.to_datetime(googletrend_df['Date'])
add_datepart(googletrend_df)
# Merge with google trends across states
df = left_outer_join(df, googletrend_df, ["State", "Year", "Week"])
df.trend.isnull().sum()
0
# Merge with google trends across Germany
trend_de_df = googletrend_df[googletrend_df.file == 'Rossmann_DE']
df = left_outer_join(df, trend_de_df, ["Year", "Week"], suffixes=('', '_DE'))
df.trend_DE.isnull().sum()
0
# Remove redundant columns left over from the google-trend merges: the
# source-file bookkeeping columns ('file', 'week'), any '_y' duplicates,
# and the Germany-wide date parts — keeping only 'trend_DE'.
for c in df.columns:
    if c in ['file', 'week'] or c.endswith('_y') or (c.endswith('_DE') and c != 'trend_DE'):
        if c in df.columns: df.drop(c, inplace=True, axis=1)  # re-check guards duplicate column names
df[df.columns.difference(base_columns)].iloc[0]
trend 85 trend_DE 83 Name: 0, dtype: int64
base_columns = df.columns
weather_df = left_outer_join(weather_df, state_names_df, "file", "StateName")
weather_df.StateName.isnull().sum()
0
weather_df['Date'] = pd.to_datetime(weather_df.Date)
# Merge with weather data
df = left_outer_join(df, weather_df, ["State", "Date"])
df.Mean_TemperatureC.isnull().sum()
0
df.drop(['file', 'StateName'], axis=1, inplace=True)
df[df.columns.difference(base_columns)].iloc[0]
CloudCover 1 Dew_PointC 9 Events Fog Max_Gust_SpeedKm_h NaN Max_Humidity 98 Max_Sea_Level_PressurehPa 1021 Max_TemperatureC 23 Max_VisibilityKm 31 Max_Wind_SpeedKm_h 24 MeanDew_PointC 6 Mean_Humidity 54 Mean_Sea_Level_PressurehPa 1018 Mean_TemperatureC 16 Mean_VisibilityKm 15 Mean_Wind_SpeedKm_h 11 Min_DewpointC 3 Min_Humidity 18 Min_Sea_Level_PressurehPa 1015 Min_TemperatureC 8 Min_VisibilitykM 10 Precipitationmm 0 WindDirDegrees 13 Name: 0, dtype: object
base_columns = df.columns
# Fill missing Promo2 start info with sentinel values that don't otherwise
# appear in the data (year 1900 / week 1), then cast to int for isoweek.
df['Promo2SinceYear'] = df.Promo2SinceYear.fillna(1900).astype(np.int32)
df['Promo2SinceWeek'] = df.Promo2SinceWeek.fillna(1).astype(np.int32)
from isoweek import Week  # third-party: converts ISO (year, week) to calendar dates
# Monday of the ISO week in which Promo2 started, per row
df["Promo2Since"] = pd.to_datetime(df.apply(lambda x: Week(x.Promo2SinceYear, x.Promo2SinceWeek).monday(), axis=1))
df["Promo2Days"] = df.Date.subtract(df["Promo2Since"]).dt.days
df.loc[df.Promo2Days<0, "Promo2Days"] = 0  # promo hasn't started yet on this row's date
df.loc[df.Promo2SinceYear<1990, "Promo2Days"] = 0  # sentinel year -> store has no Promo2
df["Promo2Weeks"] = df["Promo2Days"] // 7
df.loc[df.Promo2Weeks<0, "Promo2Weeks"] = 0
df.loc[df.Promo2Weeks>25, "Promo2Weeks"] = 25 # limit the number of categories
# Same sentinel treatment for the competition-opening columns.
df['CompetitionOpenSinceYear'] = df.CompetitionOpenSinceYear.fillna(1900).astype(np.int32)
df['CompetitionOpenSinceMonth'] = df.CompetitionOpenSinceMonth.fillna(1).astype(np.int32)
# Populate some temporal columns related to competition; day 15 approximates
# mid-month since only year and month are known.
df["CompetitionOpenSince"] = pd.to_datetime(dict(year=df.CompetitionOpenSinceYear,
                                                 month=df.CompetitionOpenSinceMonth, day=15))
df["CompetitionDaysOpen"] = df.Date.subtract(df.CompetitionOpenSince).dt.days
df.loc[df.CompetitionOpenSinceYear<1990, "CompetitionDaysOpen"] = 0  # sentinel year -> unknown opening date
df.loc[df.CompetitionDaysOpen<0, "CompetitionDaysOpen"] = 0  # competitor not open yet
df["CompetitionMonthsOpen"] = df["CompetitionDaysOpen"] // 30
df.loc[df.CompetitionMonthsOpen<0, "CompetitionMonthsOpen"] = 0
df.loc[df.CompetitionMonthsOpen>24, "CompetitionMonthsOpen"] = 24  # cap the number of categories
df[df.columns.difference(base_columns)].iloc[0]
CompetitionDaysOpen 2510 CompetitionMonthsOpen 24 CompetitionOpenSince 2008-09-15 00:00:00 Promo2Days 0 Promo2Since 1900-01-01 00:00:00 Promo2Weeks 0 Name: 0, dtype: object
base_columns = df.columns
def set_elapsed(df, fld, pre):
    """Write, per store, the days elapsed relative to the boolean event
    column `fld` into a new column `pre + fld`.

    Rows are consumed in the frame's current order, so callers sort by
    (Store, Date) ascending for "days after the last event" and with Date
    descending for "days before the next event". Rows seen before any
    occurrence of the event for a store come out NaN (NaT arithmetic).
    """
    one_day = np.timedelta64(1, 'D')
    prev_event = np.datetime64()  # NaT until the event is first seen
    prev_store = 0
    elapsed = []
    for store, flag, date in zip(df.Store.values, df[fld].values, df.Date.values):
        if store != prev_store:
            # New store: forget the previous store's event date
            prev_event = np.datetime64()
            prev_store = store
        if flag:
            prev_event = date
        elapsed.append(abs((date - prev_event).astype('timedelta64[D]') / one_day))
    df[pre + fld] = elapsed
# Build a temporary frame with just the columns the duration features need.
on_columns = ["Date", "Store", "Open", "Promo", "StateHoliday", "SchoolHoliday"]
elapsed_df = df[on_columns].copy()
# Track durations around four boolean events (a closure is the negation of Open).
elapsed_df['Closed'] = ~elapsed_df['Open']
event_flds = ['Closed', 'SchoolHoliday', 'StateHoliday', 'Promo']
for fld in event_flds:
    # Ascending dates: days since the last occurrence ("After...")
    elapsed_df.sort_values(['Store', 'Date'], inplace=True)
    set_elapsed(elapsed_df, fld, 'After')
    # Descending dates: days until the next occurrence ("Before...")
    elapsed_df.sort_values(['Store', 'Date'], ascending=[True, False], inplace=True)
    set_elapsed(elapsed_df, fld, 'Before')
elapsed_df.set_index("Date", inplace=True)
# Rows before a store's first event came out NaN; replace with 0.
for suffix in event_flds:
    for prefix in ['Before', 'After']:
        column = prefix + suffix
        elapsed_df[column] = elapsed_df[column].fillna(0).astype(int)
# Number of event days in the previous 7 days (per-store rolling sum) ...
bwd_df = elapsed_df[['Store']+event_flds].sort_index().groupby("Store").rolling(7, min_periods=1).sum()
# ... and in the next 7 days (same rolling sum over descending dates)
fwd_df = elapsed_df[['Store']+event_flds].sort_index(ascending=False).groupby("Store").rolling(7, min_periods=1).sum()
# Flatten the (Store, Date) MultiIndex back into plain columns for merging
bwd_df.drop('Store', axis=1, inplace=True)
bwd_df.reset_index(inplace=True)
fwd_df.drop('Store', axis=1, inplace=True)
fwd_df.reset_index(inplace=True)
bwd_df.columns
Index(['Store', 'Date', 'Closed', 'SchoolHoliday', 'StateHoliday', 'Promo'], dtype='object')
fwd_df.columns
Index(['Store', 'Date', 'Closed', 'SchoolHoliday', 'StateHoliday', 'Promo'], dtype='object')
elapsed_df.columns
Index(['Store', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Closed', 'AfterClosed', 'BeforeClosed', 'AfterSchoolHoliday', 'BeforeSchoolHoliday', 'AfterStateHoliday', 'BeforeStateHoliday', 'AfterPromo', 'BeforePromo'], dtype='object')
elapsed_df = left_outer_join(elapsed_df, bwd_df, ['Date', 'Store'], suffixes=['', '_bw'])
elapsed_df = left_outer_join(elapsed_df, fwd_df, ['Date', 'Store'], suffixes=['', '_fw'])
elapsed_df.sort_values(by=['Store', 'Date']).head(10).T
989 | 988 | 987 | 986 | 985 | 984 | 983 | 982 | 981 | 980 | |
---|---|---|---|---|---|---|---|---|---|---|
Date | 2013-01-01 00:00:00 | 2013-01-02 00:00:00 | 2013-01-03 00:00:00 | 2013-01-04 00:00:00 | 2013-01-05 00:00:00 | 2013-01-06 00:00:00 | 2013-01-07 00:00:00 | 2013-01-08 00:00:00 | 2013-01-09 00:00:00 | 2013-01-10 00:00:00 |
Store | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
Open | False | True | True | True | True | False | True | True | True | True |
Promo | False | False | False | False | False | False | True | True | True | True |
StateHoliday | True | False | False | False | False | False | False | False | False | False |
SchoolHoliday | True | True | True | True | True | True | True | True | True | True |
Closed | True | False | False | False | False | True | False | False | False | False |
AfterClosed | 0 | 1 | 2 | 3 | 4 | 0 | 1 | 2 | 3 | 4 |
BeforeClosed | 0 | 4 | 3 | 2 | 1 | 0 | 6 | 5 | 4 | 3 |
AfterSchoolHoliday | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
BeforeSchoolHoliday | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
AfterStateHoliday | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
BeforeStateHoliday | 0 | 86 | 85 | 84 | 83 | 82 | 81 | 80 | 79 | 78 |
AfterPromo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
BeforePromo | 6 | 5 | 4 | 3 | 2 | 1 | 0 | 0 | 0 | 0 |
Closed_bw | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 |
SchoolHoliday_bw | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 7 | 7 | 7 |
StateHoliday_bw | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 |
Promo_bw | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 3 | 4 |
Closed_fw | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
SchoolHoliday_fw | 7 | 7 | 7 | 7 | 7 | 6 | 5 | 4 | 3 | 2 |
StateHoliday_fw | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Promo_fw | 1 | 2 | 3 | 4 | 5 | 5 | 5 | 4 | 3 | 2 |
# Drop not needed fields before join
elapsed_df.drop(['Open'] + event_flds, axis=1, inplace=True)
df = left_outer_join(df, elapsed_df, ['Store', 'Date'])
df[df.columns.difference(base_columns)].iloc[0]
AfterClosed 5.0 AfterPromo 0.0 AfterSchoolHoliday 0.0 AfterStateHoliday 57.0 BeforeClosed 2.0 BeforePromo 0.0 BeforeSchoolHoliday 0.0 BeforeStateHoliday 0.0 Closed_bw 1.0 Closed_fw 1.0 Promo_bw 5.0 Promo_fw 5.0 SchoolHoliday_bw 5.0 SchoolHoliday_fw 7.0 StateHoliday_bw 0.0 StateHoliday_fw 0.0 Name: 0, dtype: float64
(~df.Open).sum()
178801
(df['Sales'] == 0).sum()
172871
# Closed stores won't be counted into the forecasts
df = df[~((~df.Open) | (df['Sales'] == 0))]
df.drop('Open', axis=1, inplace=True)
df.sort_values(by=['Date', 'Store'], inplace=True)
df.Date.iloc[0], df.Date.iloc[-1]
(Timestamp('2013-01-01 00:00:00'), Timestamp('2015-09-17 00:00:00'))
df.reset_index(inplace=True, drop=True)
df.index
RangeIndex(start=0, stop=879442, step=1)
# Get columns with missing values
df[df.columns[df.isna().any()]].isnull().sum()
Sales 35104 Customers 35104 CompetitionDistance 2262 PromoInterval 438106 Max_VisibilityKm 22172 Mean_VisibilityKm 22172 Min_VisibilitykM 22172 Max_Gust_SpeedKm_h 677088 CloudCover 73011 Events 211902 dtype: int64
df[['SchoolHoliday_bw', 'StateHoliday_bw', 'Promo_bw', 'Closed_bw',
'SchoolHoliday_fw', 'StateHoliday_fw', 'Promo_fw', 'Closed_fw']] = \
df[['SchoolHoliday_bw', 'StateHoliday_bw', 'Promo_bw', 'Closed_bw',
'SchoolHoliday_fw', 'StateHoliday_fw', 'Promo_fw', 'Closed_fw']].astype('int64')
# Rows with known sales form the training span. Take an explicit copy so
# the later dtype casts don't write through a view of `df`
# (this is what fires the SettingWithCopyWarning below otherwise).
train_df = df.loc[~df.Sales.isnull()].copy()
train_df.shape
(844338, 76)
train_df[['Sales', 'Customers']] = train_df[['Sales', 'Customers']].astype('int64')
/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py:3391: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self[k1] = value[k2]
train_df.describe(include=[np.number]).loc[['count', 'min', 'max']].T.astype(np.int64)
count | min | max | |
---|---|---|---|
Store | 844338 | 1 | 1115 |
Sales | 844338 | 46 | 41551 |
Customers | 844338 | 8 | 7388 |
Year | 844338 | 2013 | 2015 |
Month | 844338 | 1 | 12 |
Week | 844338 | 1 | 52 |
Day | 844338 | 1 | 31 |
Dayofweek | 844338 | 0 | 6 |
Dayofyear | 844338 | 1 | 365 |
Elapsed | 844338 | 1356998400 | 1438300800 |
CompetitionDistance | 842152 | 20 | 75860 |
CompetitionOpenSinceMonth | 844338 | 1 | 12 |
CompetitionOpenSinceYear | 844338 | 1900 | 2015 |
Promo2SinceWeek | 844338 | 1 | 50 |
Promo2SinceYear | 844338 | 1900 | 2015 |
trend | 844338 | 28 | 100 |
trend_DE | 844338 | 50 | 100 |
Max_TemperatureC | 844338 | -11 | 38 |
Mean_TemperatureC | 844338 | -13 | 31 |
Min_TemperatureC | 844338 | -15 | 24 |
Dew_PointC | 844338 | -14 | 25 |
MeanDew_PointC | 844338 | -15 | 20 |
Min_DewpointC | 844338 | -73 | 18 |
Max_Humidity | 844338 | 44 | 100 |
Mean_Humidity | 844338 | 30 | 100 |
Min_Humidity | 844338 | 4 | 100 |
Max_Sea_Level_PressurehPa | 844338 | 976 | 1043 |
Mean_Sea_Level_PressurehPa | 844338 | 974 | 1040 |
Min_Sea_Level_PressurehPa | 844338 | 970 | 1038 |
Max_VisibilityKm | 823710 | 0 | 31 |
Mean_VisibilityKm | 823710 | 0 | 31 |
Min_VisibilitykM | 823710 | 0 | 31 |
Max_Wind_SpeedKm_h | 844338 | 3 | 101 |
Mean_Wind_SpeedKm_h | 844338 | 2 | 53 |
Max_Gust_SpeedKm_h | 194051 | 21 | 111 |
Precipitationmm | 844338 | 0 | 58 |
CloudCover | 776282 | 0 | 8 |
WindDirDegrees | 844338 | -1 | 360 |
Promo2Days | 844338 | 0 | 2195 |
Promo2Weeks | 844338 | 0 | 25 |
CompetitionDaysOpen | 844338 | 0 | 9177 |
CompetitionMonthsOpen | 844338 | 0 | 24 |
AfterClosed | 844338 | 0 | 926 |
BeforeClosed | 844338 | 0 | 986 |
AfterSchoolHoliday | 844338 | 0 | 107 |
BeforeSchoolHoliday | 844338 | 0 | 194 |
AfterStateHoliday | 844338 | 0 | 135 |
BeforeStateHoliday | 844338 | 0 | 195 |
AfterPromo | 844338 | 0 | 188 |
BeforePromo | 844338 | 0 | 16 |
Closed_bw | 844338 | 0 | 6 |
SchoolHoliday_bw | 844338 | 0 | 7 |
StateHoliday_bw | 844338 | 0 | 2 |
Promo_bw | 844338 | 0 | 5 |
Closed_fw | 844338 | 0 | 6 |
SchoolHoliday_fw | 844338 | 0 | 7 |
StateHoliday_fw | 844338 | 0 | 2 |
Promo_fw | 844338 | 0 | 5 |
train_df.describe(exclude=[np.number]).T
count | unique | top | freq | first | last | |
---|---|---|---|---|---|---|
Date | 844338 | 942 | 2015-06-09 00:00:00 | 1115 | 2013-01-01 00:00:00 | 2015-07-31 00:00:00 |
Promo | 844338 | 2 | False | 467463 | NaN | NaN |
StateHoliday | 844338 | 2 | False | 843428 | NaN | NaN |
SchoolHoliday | 844338 | 2 | False | 680893 | NaN | NaN |
Is_month_end | 844338 | 2 | False | 816299 | NaN | NaN |
Is_month_start | 844338 | 2 | False | 824972 | NaN | NaN |
Is_quarter_end | 844338 | 2 | False | 835723 | NaN | NaN |
Is_quarter_start | 844338 | 2 | False | 836819 | NaN | NaN |
Is_year_end | 844338 | 2 | False | 842288 | NaN | NaN |
Is_year_start | 844338 | 2 | False | 844285 | NaN | NaN |
StoreType | 844338 | 4 | a | 457042 | NaN | NaN |
Assortment | 844338 | 3 | a | 444875 | NaN | NaN |
Promo2 | 844338 | 2 | False | 423292 | NaN | NaN |
PromoInterval | 421046 | 3 | Jan,Apr,Jul,Oct | 242397 | NaN | NaN |
State | 844338 | 12 | NW | 222930 | NaN | NaN |
Events | 641281 | 21 | Rain | 324320 | NaN | NaN |
Promo2Since | 844338 | 56 | 1900-01-01 00:00:00 | 423292 | 1900-01-01 00:00:00 | 2015-06-01 00:00:00 |
CompetitionOpenSince | 844338 | 171 | 1900-01-15 00:00:00 | 269222 | 1900-01-15 00:00:00 | 2015-08-15 00:00:00 |
# Rows with NaN sales are the test span. Copy before mutating so the drops
# don't operate on a view of `df` (avoids the SettingWithCopyWarning),
# and remove the placeholder target columns in one call.
test_df = df.loc[df.Sales.isnull()].copy()
test_df.drop(['Sales', 'Customers'], axis=1, inplace=True)
test_df.shape
/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py:3940: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy errors=errors)
(35104, 74)
# Save final tables
train_df.to_pickle(path/'train_df')
test_df.to_pickle(path/'test_df')