Notebook

In [1]:

from statsmodels.tsa.statespace.varmax import VARMAX

from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

We have app data that for each user pairs reported app metrics and app name:

In [2]:

appDf = pd.read_csv("app.data", names=["user", "date", "app", "metric"])
appDf['date'] = pd.to_datetime(appDf['date'])
appDf.info()
print(appDf.user.unique(), appDf.app.unique())
appDf.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 4 columns):
user      497 non-null object
date      497 non-null datetime64[ns]
app       497 non-null object
metric    497 non-null float64
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 15.6+ KB
['user_2' 'user_1' 'user_3'] [' app_2' ' app_1' ' app_3']

Out[2]:

	user	date	app	metric
0	user_2	2017-08-28 02:41:48	app_2	0.00
1	user_1	2017-08-28 11:01:01	app_1	0.01
2	user_3	2017-08-28 16:41:55	app_1	0.10
3	user_3	2017-08-29 02:43:39	app_3	0.09
4	user_1	2017-08-29 07:00:25	app_1	0.02

We have location data that uses keywords for location and indicates a change of location:

In [3]:

locationDf = pd.read_csv("location.data", names=["user", "date", "location"])
locationDf['date'] = pd.to_datetime(locationDf['date'])

cross_l = pd.crosstab([locationDf.date, locationDf.user], locationDf.location)

cross_l.head(3)

Out[3]:

	location	bar	girlfriends	grocers	home	lunch	park	relatives	work
date	user
2017-08-28 00:00:01	user_2	0	0	0	1	0	0	0	0
2017-08-28 00:00:01	user_3	0	0	0	1	0	0	0	0
2017-08-28 19:25:16	user_3	0	0	0	0	0	0	1	0

Location data includes seasonality at actual seasonal levels (summer, winter), as well as weekday/weekend behavior. Here this is demonstrated with cumsum:

In [4]:

cross = cross_l.copy()

l2 = cross.groupby(['user']).cumsum().copy()
l2.reset_index(inplace=True)

plotaxis = plt.figure(figsize=(50,20)).gca()
for key, grp in l2.groupby(['user']):
    my_ts = [ts.to_julian_date() - 1721424.5 for ts in grp['date']]
    plt.plot(my_ts, grp.drop('user', axis=1).drop('date', axis=1), label=key)

plotaxis.xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter('%d/%m/%y')
)
xlabels = plotaxis.get_xticklabels()
plt.setp(xlabels, rotation=85, fontsize=25)
ylabels = plotaxis.get_yticklabels()
plt.setp(ylabels, fontsize=25)

plt.legend(bbox_to_anchor=(.02, 0.52, 1., .102), loc=3,
           ncol=2, borderaxespad=0., prop={'size': 26})    

#l3 = l2[l2['user'] == 'user_1'].copy()
#l3['month']=l3['date'].dt.month
#grouped = l3.groupby(l3['month'])
#
#for m in grouped.groups.keys():
#    print(l3[l3['month'] == m].tail(1))
#l2.groupby(['user']).sum()

'User/Locations'

Out[4]:

'User/Locations'

The some metric data is set to have positive or negative correlation in terms of growth, to location data. Excluding seasonality, the metric data should trend upwards:

In [5]:

#appDf.groupby(['user', 'app']).plot(x="date", y="metric", subplots=True)

plotaxis = plt.figure(figsize=(50,20)).gca()
for key, grp in appDf.groupby(['user', 'app']):
    my_ts = [ts.to_julian_date() - 1721424.5 for ts in grp['date']]
    plt.plot(my_ts, grp['metric'], label='%s@%s' % ("metric", key))

plotaxis.xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter('%d/%m/%y')
)
xlabels = plotaxis.get_xticklabels()
plt.setp(xlabels, rotation=85, fontsize=25)
ylabels = plotaxis.get_yticklabels()
plt.setp(ylabels, fontsize=25)
'Users/Apps'

Out[5]:

'Users/Apps'

In the next plot, we can see in fine detail user_3's metric trend and location data. Looking at the very beginning at increased zoom, you might notice a visual correlation between the first few gaps and the faster-rising app data. Frequency of use should also positively correlate.

In [6]:

cross.reset_index(inplace=True)

u1_l = cross[cross['user'] == 'user_1']

u1_a = appDf[appDf['user'] == 'user_1']

u1_a1 = u1_a[u1_a['app'] == ' app_1']
u1_a2 = u1_a[u1_a['app'] == ' app_2']

plotaxis = plt.figure(figsize=(100,10)).gca()

for key, grp in u1_a.groupby(['app']):
    my_ts = [ts.to_julian_date() for ts in grp['date']]
    plt.plot(grp['date'], grp['metric'], '.-', label='%s@%s' % ("metric", key))    

for key, grp in u1_l.groupby(['user']):
    my_ts = [ts.to_julian_date() for ts in grp['date']]
    plt.plot(grp['date'], grp.drop('user', axis=1).drop('date', axis=1), label=key)
    
plotaxis.xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter('%d/%m/%y')
)
xlabels = plotaxis.get_xticklabels()
plt.setp(xlabels, rotation=85, fontsize=25)
ylabels = plotaxis.get_yticklabels()
plt.setp(ylabels, fontsize=25)
True

Out[6]:

True

We want to produce classical machine learning/statistical modeling as a baseline to justify RNN approaches. We will follow this strategy in preparation of an ARIMA model:

resample to 15m intervals (data set would be insufficient to form the ARIMA model as the majority are errant occluding/masking/hiding the underlying predictive signal/model/equation/weighted average necessary to provide the baseline, so we will interpolate at the end)
fill rows in both set by date, so the data sets can be combined.
convert metric data to metric values per app: first creating a column for each app.
get column categories for location
divide data into per user charts.
combine app/metric and location data.
interpolate metric data per user to get curves we can predict from.

In [7]:

locDf = locationDf.copy()
locDf.set_index('date', inplace=True)

locDfs = {}
for user, user_loc_dc in locDf.groupby('user'):
    locDfs[user] = user_loc_dc.resample('15T').agg('max').bfill()
    
aDf = appDf.copy()
aDf.set_index('date', inplace=True)

userLocAppDfs = {}
for user, a2_df in aDf.groupby('user'):
    userDf = a2_df.resample('15T').agg('max')
                
    userDf.reset_index(inplace=True)
    userDf = pd.crosstab(index=userDf['date'], columns=userDf['app'], values=userDf['metric'], aggfunc=np.mean).fillna(np.nan, downcast='infer')

    userDf['user'] = user
        
    userDf.reset_index(inplace=True)
    userDf.set_index('date', inplace=True)

    userLocAppDfs[user] = userDf.resample('15T').agg('max')
            
    loDf = locDfs[user]
    loDf.reset_index(inplace=True)
    loDf = pd.crosstab([loDf.date, loDf.user], loDf.location)
    loDf.reset_index(inplace=True)

    loDf.set_index('date', inplace=True)
    loDf.drop('user', axis=1, inplace=True)
    
    userLocAppDfs[user] = userLocAppDfs[user].join(loDf, how='outer')
    userLocAppDfs[user] = userLocAppDfs[user].resample('15T').agg('max')
    userLocAppDfs[user]['user'].fillna(user, inplace=True)
    
    for loc in locationDf[locationDf['user'] == user].location.unique():
        userLocAppDfs[user][loc] = userLocAppDfs[user][loc].replace(np.nan, 0)
            
    for app in a2_df['app'].unique():
        #print(userLocAppDfs[user][app].head(10))
        #userLocAppDfs[user][app].fillna(value=0, inplace=True)
        #userLocAppDfs[user][app].replace(0.0, np.nan, inplace=True)

        userLocAppDfs[user][app].interpolate(method='linear', limit_area='inside', inplace=True)       
        userLocAppDfs[user][app].fillna(value=0, inplace=True)
        
#userLocAppDfs['user_1'].tail(5)

At this point, we have our data set per user. Here's what one user's data looks like:

In [8]:

u1 = userLocAppDfs['user_1'].copy()
u1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15011 entries, 2017-08-28 11:00:00 to 2018-01-31 19:30:00
Freq: 15T
Data columns (total 10 columns):
 app_1        15011 non-null float64
 app_2        15011 non-null float64
user          15011 non-null object
 bar          15011 non-null float64
 grocers      15011 non-null float64
 home         15011 non-null float64
 lunch        15011 non-null float64
 park         15011 non-null float64
 relatives    15011 non-null float64
 work         15011 non-null float64
dtypes: float64(9), object(1)
memory usage: 1.3+ MB

In [9]:

u1 = userLocAppDfs['user_1'].copy()
# https://stackoverflow.com/questions/11927715/how-to-give-a-pandas-matplotlib-bar-graph-custom-colors
locations = [(x/8.75, x/40.0, 0.85) for x in range(7)] # color grad

#u3[[' bar', ' grocers', ' home', ' lunch', ' work']].plot(color=locations, figsize=(15,10))
u1[[' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives']].plot(color=locations, figsize=(15,10))

# https://stackoverflow.com/questions/52467781/how-can-i-fill-the-area-in-this-plot/52467937#52467937
#fig, ax = plt.subplots(1,1)
#for location in [' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives']:  # or whatever subset of columns you want
#    ax.fill_between(len(u1[location]), u1[location])
                    
u1[' app_2'].plot(color='orange')
u1[' app_1'].plot(color='r')

#u1[135:140]

Out[9]:

<matplotlib.axes._subplots.AxesSubplot at 0x106e9ce10>

okay, let's fit a model to data through oct 3 to see if we can predict the down-trend in app_1.

In [32]:

# from: https://machinelearningmastery.com/make-predictions-time-series-forecasting-python/
# create a difference transform of the dataset
def difference(dataset):
    diff = list()
    for i in range(1, len(dataset)):
        value = dataset[i] - dataset[i - 1]
        diff.append(value)
    return np.array(diff)

# Make a prediction give regression coefficients and lag obs
def predict(coef, history):
    yhat = coef[0]
    for i in range(1, len(coef)):
        yhat += coef[i] * history[-i]
    return yhat

series = u1[[ ' app_1', ' app_2', ' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives' ]]

X = pd.DataFrame()
for column in [ ' app_1', ' app_2', ' bar', ' grocers', ' home', ' lunch', ' work', ' park', ' relatives' ]:
    X[column] = difference(series[column].values)

size = (4*24)*54 # hoping
train, test = X[0:size], X[size:size+(14*4*24)]

train = train.loc[:, (train != train.iloc[0]).any()] # https://stackoverflow.com/questions/20209600/panda-dataframe-remove-constant-column

#print(train.var(), X.info())

# train autoregression
model = VARMAX(train)
model_fit = model.fit(maxiter=10000, method='css', disp=False)

print(model_fit.mle_retvals)

##window = model_fit.k_ar
coef = model_fit.params

# walk forward over time steps in test
history = [train.iloc[i] for i in range(len(train))]
predictions = list()
for t in range(len(test)):
    yhat = predict(coef, history)
    obs = test[t]
    predictions.append(yhat)
    history.append(obs)
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)

# plot
pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()

/anaconda3/lib/python3.6/site-packages/statsmodels/base/model.py:508: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)

{'fopt': -25.760662959319767, 'gopt': array([ 1.77019621e+04,  5.21442253e+01,  2.73098635e-02,  2.82319501e-02,
        2.63508440e-02,  2.62791509e-02,  2.70576482e-02,  2.78863091e-02,
       -1.76058279e-05, -3.13842818e-04,  8.70308270e-06,  7.91366972e-06,
        2.77005086e-06,  7.38005212e-06, -2.94413383e-06, -1.05970344e-05,
        3.70249609e-05,  5.77419002e-05,  4.70237183e-06,  6.86242174e-06,
        5.08642017e-06,  5.76818593e-06,  4.42419434e-06,  4.53646010e-06,
        7.01163572e-06,  4.88995511e-06,  4.98872055e-06,  5.01003683e-06,
        5.00328667e-06,  5.00683939e-06,  4.99795760e-06,  4.98658892e-06,
        4.96740427e-06,  7.20028481e-06,  5.01003683e-06,  4.93045604e-06,
        4.98339148e-06,  4.97308861e-06,  5.02247133e-06,  5.01252373e-06,
        5.41540146e-06,  5.43387557e-06,  4.99440489e-06,  4.99547070e-06,
        2.41477949e-06,  4.46220838e-06, -4.79261075e-07, -1.64341429e-05,
        3.58646446e-06, -6.92459423e-06,  5.17417220e-06,  5.30313571e-06,
        4.35491643e-06,  5.06084064e-06,  3.42907924e-06, -1.38555833e-07,
        8.05187028e-06,  9.58557678e-06,  4.92583752e-06,  4.88213914e-06,
        5.98134875e-06,  5.12088150e-06,  7.13562542e-06,  1.30338407e-05,
        1.18948407e-05,  4.82280882e-06,  4.97664132e-06,  5.02424768e-06,
        5.09317033e-06,  5.07007769e-06,  5.14148724e-06,  5.68149972e-06,
        1.62250699e+04,  1.08653537e+03,  1.04614483e+02,  1.02404890e+00,
        2.92769712e-02,  5.55505284e-02,  1.07368302e+00,  3.06540116e-02,
        2.85733794e-02,  7.07760503e-02,  9.94834948e-01,  2.81637689e-02,
        2.62461164e-02,  2.62514408e-02,  4.32589911e-02,  1.04816881e+00,
        3.01713939e-02,  2.79850280e-02,  2.79884901e-02,  2.83819936e-02,
        6.40070471e-02,  9.79552695e-01,  2.83376497e-02,  2.66644129e-02,
        2.66704113e-02,  2.70420813e-02,  2.66794959e-02,  6.87684722e-02,
        1.06537098e+00,  3.04098549e-02,  2.83284301e-02,  2.83334536e-02,
        2.87238020e-02,  2.83475526e-02,  2.83776007e-02,  2.00381188e-01]), 'fcalls': 2289, 'warnflag': 2, 'converged': False, 'iterations': 0}

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3062             try:
-> 3063                 return self._engine.get_loc(key)
   3064             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-32-efb279a87d37> in <module>()
     42 for t in range(len(test)):
     43     yhat = predict(coef, history)
---> 44     obs = test[t]
     45     predictions.append(yhat)
     46     history.append(obs)

/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2683             return self._getitem_multilevel(key)
   2684         else:
-> 2685             return self._getitem_column(key)
   2686 
   2687     def _getitem_column(self, key):

/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2690         # get column
   2691         if self.columns.is_unique:
-> 2692             return self._get_item_cache(key)
   2693 
   2694         # duplicate columns & possible reduce dimensionality

/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2484         res = cache.get(item)
   2485         if res is None:
-> 2486             values = self._data.get(item)
   2487             res = self._box_item_values(item, values)
   2488             cache[item] = res

/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   4113 
   4114             if not isna(item):
-> 4115                 loc = self.items.get_loc(item)
   4116             else:
   4117                 indexer = np.arange(len(self.items))[isna(self.items)]

/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3063                 return self._engine.get_loc(key)
   3064             except KeyError:
-> 3065                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3066 
   3067         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0

In [ ]: