In [1]:
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa import stattools
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

# Switch matplotlib to its 'fivethirtyeight' style sheet, a template that works
# well for visualizing time series data.
# Reference: https://www.kaggle.com/thebrownviking20/everything-you-can-do-with-a-time-series
plt.style.use('fivethirtyeight') 

We have app data that, for each user, pairs a reported app metric with the name of the app:

In [2]:
# Load the per-user app metrics. The file has no header row, so the column
# names are supplied here.
# skipinitialspace=True drops the blank that follows the delimiter; without it
# the app names are parsed with a leading space (' app_1' instead of 'app_1'),
# which then leaks into every filter value and legend label built from them.
appDf = pd.read_csv(
    "app.data",
    names=["user", "date", "app", "metric"],
    skipinitialspace=True,
)
appDf['date'] = pd.to_datetime(appDf['date'])

# Quick sanity check: dtypes / null counts, then the distinct users and apps.
appDf.info()
print(appDf.user.unique(), appDf.app.unique())
appDf.head(5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 4 columns):
user      497 non-null object
date      497 non-null datetime64[ns]
app       497 non-null object
metric    497 non-null float64
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 15.6+ KB
['user_2' 'user_1' 'user_3'] [' app_2' ' app_1' ' app_3']
Out[2]:
user date app metric
0 user_2 2017-08-28 02:41:48 app_2 0.00
1 user_1 2017-08-28 11:01:01 app_1 0.01
2 user_3 2017-08-28 16:41:55 app_1 0.10
3 user_3 2017-08-29 02:43:39 app_3 0.09
4 user_1 2017-08-29 07:00:25 app_1 0.02
In [3]:
# Autocorrelation of the app metric: one panel per user, one line per app.
app_users = appDf.user.unique()
user_colors = ['r', 'g', 'b']

# Create all panels up front and style each one explicitly. Calling
# plt.tick_params / plt.axhline before plt.subplot (as was done previously)
# targets an implicit full-figure axes rather than the panels, so the tick
# size and the zero line never reached the plots that are actually shown.
fig, axes = plt.subplots(1, len(app_users), figsize=(50, 20), squeeze=False)

for i, (user, ax) in enumerate(zip(app_users, axes[0])):
    userDf = appDf[appDf['user'] == user]

    # Wrap around the palette so more than three users does not raise IndexError.
    base_color = user_colors[i % len(user_colors)]
    line_color = np.array(matplotlib.colors.to_rgb(base_color))

    ax.tick_params(axis='both', which='major', labelsize=25)
    ax.axhline(y=0, linewidth=1, linestyle='--', color='grey')

    for app in userDf.app.unique():
        uaDf = userDf[userDf['app'] == app]

        # Approximate 95% band for the autocorrelation of white noise:
        # +/- 1.96 / sqrt(number of observations in this series).
        e95 = 1.96 / np.sqrt(len(uaDf))

        # NOTE(review): the upper bound is drawn in the inverted colour
        # (1 - c) while the lower bound uses c; kept as in the original,
        # confirm that this asymmetry is intended.
        ax.axhline(y=e95, linewidth=1, linestyle='--', color=1 - line_color * 0.66)
        ax.axhline(y=-e95, linewidth=1, linestyle='--', color=line_color * 0.66)
        ax.axhspan(-e95, e95, facecolor=line_color * 0.66, alpha=0.05)

        ax.plot(
            stattools.acf(uaDf['metric'], fft=True, nlags=140),
            label=user + " " + app,
            color=line_color,
        )

    ax.legend(prop={'size': 35})

plt.show()

Interestingly, the plots above show autocorrelation in user_2's app metric, whereas user_1 and user_3 show only a positive trend.

We have location data that uses keywords for location and indicates a change of location:

In [4]:
# Read the location log (the file has no header row, so the column names are
# given explicitly) and turn the 'date' strings into datetimes in one chain.
locationDf = (
    pd.read_csv("location.data", names=["user", "date", "location"])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Cross-tabulate: rows are indexed by (date, user), columns are the distinct
# location values, and each cell counts how often that combination occurs.
cross_l = pd.crosstab(
    index=[locationDf.date, locationDf.user],
    columns=locationDf.location,
)

cross_l.head(3)
Out[4]:
location bar girlfriends grocers home lunch park relatives work
date user
2017-08-28 00:00:01 user_2 0 0 0 1 0 0 0 0
user_3 0 0 0 1 0 0 0 0
2017-08-28 19:25:16 user_3 0 0 0 0 0 0 1 0
In [5]:
# Flat copy of the crosstab with 'date' and 'user' as ordinary columns.
# Kept as a module-level name (as before) in case later cells refer to it.
unsorted = cross_l.reset_index()

user_colors = ['r', 'g', 'b']


def plot_location_acf(nlags, figsize=(50, 20)):
    """Plot the autocorrelation of each location indicator, one panel per user.

    Reads the module-level frames ``unsorted`` (crosstab with 'date' and
    'user' as columns) and ``locationDf`` (raw location log), plus the
    ``user_colors`` palette.

    Parameters
    ----------
    nlags : int
        Number of lags passed to ``stattools.acf``.
    figsize : tuple of (float, float), optional
        Size of the whole figure in inches.
    """
    users = unsorted.user.unique()

    # Build the panels first and style each axes explicitly: pyplot calls
    # made before plt.subplot act on an implicit full-figure axes, not on the
    # panels, so the tick size and zero line would otherwise be lost.
    fig, axes = plt.subplots(1, len(users), figsize=figsize, squeeze=False)

    for i, (user, ax) in enumerate(zip(users, axes[0])):
        userDf = unsorted[unsorted['user'] == user]
        sourceDf = locationDf[locationDf['user'] == user]

        # Wrap around the palette so more than three users does not fail.
        base_color = user_colors[i % len(user_colors)]
        line_color = np.array(matplotlib.colors.to_rgb(base_color))

        ax.tick_params(axis='both', which='major', labelsize=25)
        ax.axhline(y=0, linewidth=1, linestyle='--', color='grey')

        # Approximate 95% band for the autocorrelation of white noise. Every
        # location series of a user has len(userDf) points, so the value is
        # the same for all of that user's lines and is computed once.
        e95 = 1.96 / np.sqrt(len(userDf))

        # Only the locations this user actually reported are plotted.
        for location in sourceDf.location.unique():
            # Darken the user's colour a little more for each location so
            # that the lines within one panel can be told apart.
            line_color = line_color * 0.9

            # NOTE(review): the upper bound uses the inverted colour (1 - c),
            # the lower bound uses c; kept as in the original.
            ax.axhline(y=e95, linewidth=1, linestyle='--', color=1 - line_color * 0.66)
            ax.axhline(y=-e95, linewidth=1, linestyle='--', color=line_color * 0.66)
            ax.axhspan(-e95, e95, facecolor=line_color * 0.66, alpha=0.05)

            ax.plot(
                stattools.acf(userDf[location], fft=True, nlags=nlags),
                label=user + " " + location,
                color=line_color,
            )

        ax.legend(prop={'size': 35})

    plt.show()


# Long horizon first, then a close-up of the first 50 lags.
plot_location_acf(nlags=350)
plot_location_acf(nlags=50)