With this script you could have achieved 13th place in the Rossmann Store Sales competition on Kaggle. No external data is used.
If you added Google Trends daily searches for 'Rossmann', state data, and weather data per state, you could reach 7th place with just one model. But because such a model would be useless for real-world usage, that data is kept out of this model.
This script uses about 8 GB of RAM and takes about an hour to run.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import pylab
import csv
import datetime
import math
import re
import time
import random
import os
from pandas.tseries.offsets import *
from operator import *
from sklearn.cross_validation import train_test_split
# IPython magic: render matplotlib figures inline (notebook only; not valid plain Python).
%matplotlib inline
# plt.style.use('ggplot') # Good looking plots
# Wide, non-truncated printing so large arrays/frames are fully visible when inspected.
np.set_printoptions(precision=4, threshold=10000, linewidth=100, edgeitems=999, suppress=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100)
pd.set_option('expand_frame_repr', False)
pd.set_option('precision', 6)
# Wall-clock start time (presumably for reporting elapsed time later — not shown in this chunk).
start_time = time.time()
# Thanks to Chenglong Chen for providing this in the forum
def ToWeight(y):
    """Per-sample RMSPE weights: 1/y**2 where y is non-zero, 0 elsewhere.

    Zero-sales entries get weight 0 so they are ignored by the error metric.
    """
    weights = np.zeros_like(y, dtype=float)
    nonzero = y != 0
    weights[nonzero] = y[nonzero] ** -2.0
    return weights
def rmspe(yhat, y):
    """Root Mean Square Percentage Error between predictions and targets (sales space).

    Entries with y == 0 are excluded via zero weights from ToWeight.
    """
    weights = ToWeight(y)
    weighted_sq_err = weights * np.square(y - yhat)
    return np.sqrt(np.mean(weighted_sq_err))
def rmspe_xg(yhat, y):
    """Custom RMSPE evaluation metric for xgboost.

    Parameters
    ----------
    yhat : array of predictions in log1p space.
    y : xgboost DMatrix whose labels are log1p-transformed sales.

    Returns the ("rmspe", value) pair xgboost expects from an eval metric.
    """
    # Invert the log1p transform; expm1 is the numerically stable exp(x) - 1.
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat) ** 2))
    return "rmspe", rmspe
### Load the train and submission (test) sets
seed = 42  # fixed RNG seed for reproducibility

# Row limit for quick experiments; None loads the full files.
nrows = None


def _parse_date(dt):
    # Both CSVs carry an ISO-formatted 'Date' column.
    return pd.to_datetime(dt, format='%Y-%m-%d')


df_train = pd.read_csv('../data/train.csv',
                       nrows=nrows,
                       parse_dates=['Date'],
                       date_parser=_parse_date)

df_submit = pd.read_csv('../data/test.csv',
                        nrows=nrows,
                        parse_dates=['Date'],
                        date_parser=_parse_date)

### Flag to easily distinguish train (1) from submit (0) rows after concatenation
df_train['Set'] = 1
df_submit['Set'] = 0

### Stack train and submit into one frame so feature engineering is applied once
frames = [df_train, df_submit]
df = pd.concat(frames)
df.info()
features_x = ['Store', 'Date', 'DayOfWeek', 'Open', 'Promo', 'SchoolHoliday', 'StateHoliday']
features_y = ['SalesLog']

### Remove rows where the store was open yet recorded zero sales.
df = df.loc[~((df['Open'] == 1) & (df['Sales'] == 0))]

# Target is log(1 + Sales); only train rows (Set == 1) have Sales populated.
df.loc[df['Set'] == 1, 'SalesLog'] = np.log1p(df.loc[df['Set'] == 1]['Sales'])

# Encode StateHoliday letters as integer category codes.
df['StateHoliday'] = df['StateHoliday'].astype('category').cat.codes

# Expand the Date column into calendar components used as model features.
var_name = 'Date'
dates = pd.Index(df[var_name])
for suffix, values in (('Day', dates.day),
                       ('Week', dates.week),
                       ('Month', dates.month),
                       ('Year', dates.year),
                       ('DayOfYear', dates.dayofyear)):
    column = var_name + suffix
    df[column] = values
    df[column] = df[column].fillna(0)
    features_x.append(column)

# The raw Date itself is replaced by the derived columns above.
features_x.remove(var_name)

# Timestamp as int64 (nanoseconds) — used for plotting on a numeric axis below.
df['DateInt'] = df['Date'].astype(np.int64)
### Load per-store metadata (one row per store)
df_store = pd.read_csv('../data/store.csv',
                       nrows=nrows)
df_store.info()
### Convert StoreType and Assortment letters to numerical category codes
df_store['StoreType'] = df_store['StoreType'].astype('category').cat.codes
df_store['Assortment'] = df_store['Assortment'].astype('category').cat.codes
### Combine competition-open year and month into a single datetime
def convertCompetitionOpen(df):
    """Combine a store row's CompetitionOpenSinceYear/Month into a Timestamp.

    Returns NaN when either field is missing — int(NaN) raises ValueError,
    which is why the narrow except below covers the missing-value case.
    """
    try:
        date = '{}-{}'.format(int(df['CompetitionOpenSinceYear']), int(df['CompetitionOpenSinceMonth']))
        return pd.to_datetime(date)
    except (ValueError, TypeError):
        return np.nan
# NOTE(review): missing dates become NaT, which astype(np.int64) turns into a
# huge negative int64 sentinel — presumably intentional as "no competitor"; confirm.
df_store['CompetitionOpenInt'] = df_store.apply(lambda df: convertCompetitionOpen(df), axis=1).astype(np.int64)
### Combine Promo2 start year and week into a datetime (comment fixed: this is Promo2, not competition)
def convertPromo2(df):
    """Combine a store row's Promo2SinceYear/Week into the Monday of that ISO-style week.

    Builds a '%Y%W%w' string with weekday 1 (Monday). Returns NaN when either
    field is missing — int(NaN) raises ValueError, caught by the narrow except.
    """
    try:
        date = '{}{}1'.format(int(df['Promo2SinceYear']), int(df['Promo2SinceWeek']))
        return pd.to_datetime(date, format='%Y%W%w')
    except (ValueError, TypeError):
        return np.nan
# Promo2 start as int64 nanoseconds (NaT -> large negative sentinel, as above).
df_store['Promo2SinceFloat'] = df_store.apply(lambda df: convertPromo2(df), axis=1).astype(np.int64)
# Split the "Jan,Apr,Jul,Oct"-style PromoInterval string into four month-name columns.
s = df_store['PromoInterval'].str.split(',').apply(pd.Series, 1)
s.columns = ['PromoInterval0', 'PromoInterval1', 'PromoInterval2', 'PromoInterval3']
df_store = df_store.join(s)
def monthToNum(date):
    """Map an abbreviated month name to its 1-based month number.

    Note: the PromoInterval data uses 'Sept' (not 'Sep').
    Raises KeyError for any name outside the twelve known abbreviations.
    """
    names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec')
    return {name: number for number, name in enumerate(names, start=1)}[date]
### Month-name columns -> month numbers; NaN rows (stores without Promo2) stay NaN.
for promo_col in ('PromoInterval0', 'PromoInterval1', 'PromoInterval2', 'PromoInterval3'):
    df_store[promo_col] = df_store[promo_col].map(lambda x: monthToNum(x) if str(x) != 'nan' else np.nan)
del df_store['PromoInterval']

store_features = ['Store', 'StoreType', 'Assortment',
                  'CompetitionDistance', 'CompetitionOpenInt',
                  'PromoInterval0']
### Features not helping
# PromoInterval1, PromoInterval2, PromoInterval3

# Deduplicate while merging store features into the model feature list.
features_x = list(set(features_x + store_features))

# Attach the per-store metadata to every daily row.
df = pd.merge(df, df_store[store_features], how='left', on=['Store'])

### Convert every NaN in the model features to -1
for feature in features_x:
    df[feature] = df[feature].fillna(-1)
### Visual sanity check: sales history for a hand-picked list of stores.
list_stores_to_check = [105,163,172,364,378,523,589,663,676,681,700,708,730,764,837,845,861,882,969,986]

# One stacked subplot per store.
plt.rcParams["figure.figsize"] = [20, len(list_stores_to_check) * 5]
for j, store_id in enumerate(list_stores_to_check, start=1):
    # Train-set rows for this store on days it was open (filter computed once,
    # replacing the duplicated X1/y1 selections; the unused Xt lookup is dropped).
    open_days = df.loc[(df['Set'] == 1) & (df['Store'] == store_id) & (df['Open'] == 1)]
    plt.subplot(len(list_stores_to_check), 1, j)
    plt.plot(open_days['DateInt'], open_days['Sales'], '-')
    plt.minorticks_on()
    plt.grid(True, which='both')
    plt.title(store_id)