from statsmodels.tsa.statespace.varmax import VARMAX
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
We have app data that for each user pairs reported app metrics and app name:
appDf = pd.read_csv("app.data", names=["user", "date", "app", "metric"])
appDf['date'] = pd.to_datetime(appDf['date'])
appDf.info()
print(appDf.user.unique(), appDf.app.unique())
appDf.head(5)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 497 entries, 0 to 496 Data columns (total 4 columns): user 497 non-null object date 497 non-null datetime64[ns] app 497 non-null object metric 497 non-null float64 dtypes: datetime64[ns](1), float64(1), object(2) memory usage: 15.6+ KB ['user_2' 'user_1' 'user_3'] [' app_2' ' app_1' ' app_3']
user | date | app | metric | |
---|---|---|---|---|
0 | user_2 | 2017-08-28 02:41:48 | app_2 | 0.00 |
1 | user_1 | 2017-08-28 11:01:01 | app_1 | 0.01 |
2 | user_3 | 2017-08-28 16:41:55 | app_1 | 0.10 |
3 | user_3 | 2017-08-29 02:43:39 | app_3 | 0.09 |
4 | user_1 | 2017-08-29 07:00:25 | app_1 | 0.02 |
We have location data that uses keywords for location and indicates a change of location:
locationDf = pd.read_csv("location.data", names=["user", "date", "location"])
locationDf['date'] = pd.to_datetime(locationDf['date'])
cross_l = pd.crosstab([locationDf.date, locationDf.user], locationDf.location)
cross_l.head(3)
location | bar | girlfriends | grocers | home | lunch | park | relatives | work | |
---|---|---|---|---|---|---|---|---|---|
date | user | ||||||||
2017-08-28 00:00:01 | user_2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
user_3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | |
2017-08-28 19:25:16 | user_3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
Location data includes seasonality at actual seasonal levels (summer, winter), as well as weekday/weekend behavior. Here this is demonstrated with cumsum:
cross = cross_l.copy()
l2 = cross.groupby(['user']).cumsum().copy()
l2.reset_index(inplace=True)
plotaxis = plt.figure(figsize=(50,20)).gca()
for key, grp in l2.groupby(['user']):
my_ts = [ts.to_julian_date() - 1721424.5 for ts in grp['date']]
plt.plot(my_ts, grp.drop('user', axis=1).drop('date', axis=1), label=key)
plotaxis.xaxis.set_major_formatter(
matplotlib.dates.DateFormatter('%d/%m/%y')
)
xlabels = plotaxis.get_xticklabels()
plt.setp(xlabels, rotation=85, fontsize=25)
ylabels = plotaxis.get_yticklabels()
plt.setp(ylabels, fontsize=25)
plt.legend(bbox_to_anchor=(.02, 0.52, 1., .102), loc=3,
ncol=2, borderaxespad=0., prop={'size': 26})
#l3 = l2[l2['user'] == 'user_1'].copy()
#l3['month']=l3['date'].dt.month
#grouped = l3.groupby(l3['month'])
#
#for m in grouped.groups.keys():
# print(l3[l3['month'] == m].tail(1))
#l2.groupby(['user']).sum()
'User/Locations'
'User/Locations'
The some metric data is set to have positive or negative correlation in terms of growth, to location data. Excluding seasonality, the metric data should trend upwards:
#appDf.groupby(['user', 'app']).plot(x="date", y="metric", subplots=True)
plotaxis = plt.figure(figsize=(50,20)).gca()
for key, grp in appDf.groupby(['user', 'app']):
my_ts = [ts.to_julian_date() - 1721424.5 for ts in grp['date']]
plt.plot(my_ts, grp['metric'], label='%s@%s' % ("metric", key))
plotaxis.xaxis.set_major_formatter(
matplotlib.dates.DateFormatter('%d/%m/%y')
)
xlabels = plotaxis.get_xticklabels()
plt.setp(xlabels, rotation=85, fontsize=25)
ylabels = plotaxis.get_yticklabels()
plt.setp(ylabels, fontsize=25)
'Users/Apps'
'Users/Apps'
In the next plot, we can see in fine detail user_3's metric trend and location data. Looking at the very beginning at increased zoom, you might notice a visual correlation between the first few gaps and the faster-rising app data. Frequency of use should also positively correlate.
cross.reset_index(inplace=True)
u1_l = cross[cross['user'] == 'user_1']
u1_a = appDf[appDf['user'] == 'user_1']
u1_a1 = u1_a[u1_a['app'] == ' app_1']
u1_a2 = u1_a[u1_a['app'] == ' app_2']
plotaxis = plt.figure(figsize=(100,10)).gca()
for key, grp in u1_a.groupby(['app']):
my_ts = [ts.to_julian_date() for ts in grp['date']]
plt.plot(grp['date'], grp['metric'], '.-', label='%s@%s' % ("metric", key))
for key, grp in u1_l.groupby(['user']):
my_ts = [ts.to_julian_date() for ts in grp['date']]
plt.plot(grp['date'], grp.drop('user', axis=1).drop('date', axis=1), label=key)
plotaxis.xaxis.set_major_formatter(
matplotlib.dates.DateFormatter('%d/%m/%y')
)
xlabels = plotaxis.get_xticklabels()
plt.setp(xlabels, rotation=85, fontsize=25)
ylabels = plotaxis.get_yticklabels()
plt.setp(ylabels, fontsize=25)
True
True
We want to produce classical machine learning/statistical modeling as a baseline to justify RNN approaches. We will follow this strategy in preparation of an ARIMA model:
locDf = locationDf.copy()
locDf.set_index('date', inplace=True)
locDfs = {}
for user, user_loc_dc in locDf.groupby('user'):
locDfs[user] = user_loc_dc.resample('15T').agg('max').bfill()
aDf = appDf.copy()
aDf.set_index('date', inplace=True)
userLocAppDfs = {}
for user, a2_df in aDf.groupby('user'):
userDf = a2_df.resample('15T').agg('max')
userDf.reset_index(inplace=True)
userDf = pd.crosstab(index=userDf['date'], columns=userDf['app'], values=userDf['metric'], aggfunc=np.mean).fillna(np.nan, downcast='infer')
userDf['user'] = user
userDf.reset_index(inplace=True)
userDf.set_index('date', inplace=True)
userLocAppDfs[user] = userDf.resample('15T').agg('max')
loDf = locDfs[user]
loDf.reset_index(inplace=True)
loDf = pd.crosstab([loDf.date, loDf.user], loDf.location)
loDf.reset_index(inplace=True)
loDf.set_index('date', inplace=True)
loDf.drop('user', axis=1, inplace=True)
userLocAppDfs[user] = userLocAppDfs[user].join(loDf, how='outer')
userLocAppDfs[user] = userLocAppDfs[user].resample('15T').agg('max')
userLocAppDfs[user]['user'].fillna(user, inplace=True)
for loc in locationDf[locationDf['user'] == user].location.unique():
userLocAppDfs[user][loc] = userLocAppDfs[user][loc].replace(np.nan, 0)
for app in a2_df['app'].unique():
#print(userLocAppDfs[user][app].head(10))
#userLocAppDfs[user][app].fillna(value=0, inplace=True)
#userLocAppDfs[user][app].replace(0.0, np.nan, inplace=True)
userLocAppDfs[user][app].interpolate(method='linear', limit_area='inside', inplace=True)
userLocAppDfs[user][app].fillna(value=0, inplace=True)
#userLocAppDfs['user_1'].tail(5)
At this point, we have our data set per user. Here's what one user's data looks like:
u1 = userLocAppDfs['user_1'].copy()
u1.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 15011 entries, 2017-08-28 11:00:00 to 2018-01-31 19:30:00 Freq: 15T Data columns (total 10 columns): app_1 15011 non-null float64 app_2 15011 non-null float64 user 15011 non-null object bar 15011 non-null float64 grocers 15011 non-null float64 home 15011 non-null float64 lunch 15011 non-null float64 park 15011 non-null float64 relatives 15011 non-null float64 work 15011 non-null float64 dtypes: float64(9), object(1) memory usage: 1.3+ MB
u1 = userLocAppDfs['user_1'].copy()
# https://stackoverflow.com/questions/11927715/how-to-give-a-pandas-matplotlib-bar-graph-custom-colors
locations = [(x/8.75, x/40.0, 0.85) for x in range(7)] # color grad
#u3[[' bar', ' grocers', ' home', ' lunch', ' work']].plot(color=locations, figsize=(15,10))
u1[[' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives']].plot(color=locations, figsize=(15,10))
# https://stackoverflow.com/questions/52467781/how-can-i-fill-the-area-in-this-plot/52467937#52467937
#fig, ax = plt.subplots(1,1)
#for location in [' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives']: # or whatever subset of columns you want
# ax.fill_between(len(u1[location]), u1[location])
u1[' app_2'].plot(color='orange')
u1[' app_1'].plot(color='r')
#u1[135:140]
<matplotlib.axes._subplots.AxesSubplot at 0x106e9ce10>
okay, let's fit a model to data through oct 3 to see if we can predict the down-trend in app_1.
# from: https://machinelearningmastery.com/make-predictions-time-series-forecasting-python/
# create a difference transform of the dataset
def difference(dataset):
diff = list()
for i in range(1, len(dataset)):
value = dataset[i] - dataset[i - 1]
diff.append(value)
return np.array(diff)
# Make a prediction give regression coefficients and lag obs
def predict(coef, history):
yhat = coef[0]
for i in range(1, len(coef)):
yhat += coef[i] * history[-i]
return yhat
series = u1[[ ' app_1', ' app_2', ' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives' ]]
X = pd.DataFrame()
for column in [ ' app_1', ' app_2', ' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives' ]:
X[column] = difference(series[column].values)
size = (4*24)*54 # hoping
train, test = X[0:size], X[size:size+(14*4*24)]
train = train.loc[:, (train != train.iloc[0]).any()] # https://stackoverflow.com/questions/20209600/panda-dataframe-remove-constant-column
#print(train.var(), X.info())
# train autoregression
model = VARMAX(train)
model_fit = model.fit(maxiter=10000, method='css', disp=False)
print(model_fit.mle_retvals)
##window = model_fit.k_ar
coef = model_fit.params
# walk forward over time steps in test
history = [train.iloc[i] for i in range(len(train))]
predictions = list()
for t in range(len(test)):
yhat = predict(coef, history)
obs = test[t]
predictions.append(yhat)
history.append(obs)
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
# plot
pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()
/anaconda3/lib/python3.6/site-packages/statsmodels/base/model.py:508: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals "Check mle_retvals", ConvergenceWarning)
{'fopt': -25.760662959319767, 'gopt': array([ 1.77019621e+04, 5.21442253e+01, 2.73098635e-02, 2.82319501e-02, 2.63508440e-02, 2.62791509e-02, 2.70576482e-02, 2.78863091e-02, -1.76058279e-05, -3.13842818e-04, 8.70308270e-06, 7.91366972e-06, 2.77005086e-06, 7.38005212e-06, -2.94413383e-06, -1.05970344e-05, 3.70249609e-05, 5.77419002e-05, 4.70237183e-06, 6.86242174e-06, 5.08642017e-06, 5.76818593e-06, 4.42419434e-06, 4.53646010e-06, 7.01163572e-06, 4.88995511e-06, 4.98872055e-06, 5.01003683e-06, 5.00328667e-06, 5.00683939e-06, 4.99795760e-06, 4.98658892e-06, 4.96740427e-06, 7.20028481e-06, 5.01003683e-06, 4.93045604e-06, 4.98339148e-06, 4.97308861e-06, 5.02247133e-06, 5.01252373e-06, 5.41540146e-06, 5.43387557e-06, 4.99440489e-06, 4.99547070e-06, 2.41477949e-06, 4.46220838e-06, -4.79261075e-07, -1.64341429e-05, 3.58646446e-06, -6.92459423e-06, 5.17417220e-06, 5.30313571e-06, 4.35491643e-06, 5.06084064e-06, 3.42907924e-06, -1.38555833e-07, 8.05187028e-06, 9.58557678e-06, 4.92583752e-06, 4.88213914e-06, 5.98134875e-06, 5.12088150e-06, 7.13562542e-06, 1.30338407e-05, 1.18948407e-05, 4.82280882e-06, 4.97664132e-06, 5.02424768e-06, 5.09317033e-06, 5.07007769e-06, 5.14148724e-06, 5.68149972e-06, 1.62250699e+04, 1.08653537e+03, 1.04614483e+02, 1.02404890e+00, 2.92769712e-02, 5.55505284e-02, 1.07368302e+00, 3.06540116e-02, 2.85733794e-02, 7.07760503e-02, 9.94834948e-01, 2.81637689e-02, 2.62461164e-02, 2.62514408e-02, 4.32589911e-02, 1.04816881e+00, 3.01713939e-02, 2.79850280e-02, 2.79884901e-02, 2.83819936e-02, 6.40070471e-02, 9.79552695e-01, 2.83376497e-02, 2.66644129e-02, 2.66704113e-02, 2.70420813e-02, 2.66794959e-02, 6.87684722e-02, 1.06537098e+00, 3.04098549e-02, 2.83284301e-02, 2.83334536e-02, 2.87238020e-02, 2.83475526e-02, 2.83776007e-02, 2.00381188e-01]), 'fcalls': 2289, 'warnflag': 2, 'converged': False, 'iterations': 0}
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) /anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3062 try: -> 3063 return self._engine.get_loc(key) 3064 except KeyError: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 0 During handling of the above exception, another exception occurred: KeyError Traceback (most recent call last) <ipython-input-32-efb279a87d37> in <module>() 42 for t in range(len(test)): 43 yhat = predict(coef, history) ---> 44 obs = test[t] 45 predictions.append(yhat) 46 history.append(obs) /anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key) 2683 return self._getitem_multilevel(key) 2684 else: -> 2685 return self._getitem_column(key) 2686 2687 def _getitem_column(self, key): /anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key) 2690 # get column 2691 if self.columns.is_unique: -> 2692 return self._get_item_cache(key) 2693 2694 # duplicate columns & possible reduce dimensionality /anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item) 2484 res = cache.get(item) 2485 if res is None: -> 2486 values = self._data.get(item) 2487 res = self._box_item_values(item, values) 2488 cache[item] = res /anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath) 4113 4114 if not isna(item): -> 4115 loc = self.items.get_loc(item) 4116 else: 4117 indexer = np.arange(len(self.items))[isna(self.items)] /anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3063 return self._engine.get_loc(key) 3064 except KeyError: -> 3065 return self._engine.get_loc(self._maybe_cast_indexer(key)) 3066 3067 indexer = self.get_indexer([key], method=method, tolerance=tolerance) pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 0