In [1]:

%load_ext watermark
%watermark -a 'cs224' -u -d -v -p numpy,pandas

Author: cs224

Last updated: 2022-09-12

Python implementation: CPython
Python version       : 3.8.12
IPython version      : 8.2.0

numpy : 1.23.1
pandas: 1.4.2

In [2]:

%matplotlib inline
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
np.set_printoptions(edgeitems=10)
np.set_printoptions(linewidth=1000)
np.set_printoptions(suppress=True)
np.core.arrayprint._line_width = 180

SEED = 42
np.random.seed(SEED)

sns.set()

In [3]:

from IPython.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))

Sample Data at Regular Times¶

The function generate_interpolated_time_series implements two different methods for generating samples at regular time intervals via pd.merge and via pd.concat:

In [4]:

def generate_interpolated_time_series(df_in, freq='T', method='reindex', start=None):
    if start is None:
        start = df_in.index[0].floor(freq)
    end = df_in.index[-1]
    
    if method == 'reindex':
        idx = pd.date_range(start=start, end=end, freq=freq)
        ldf = df_in.reindex(df_in.index.union(idx)).interpolate().bfill()
        ldf = ldf[~ldf.index.isin(df_in.index.difference(idx))]
    else:    
        df_sampling = pd.DataFrame(index=pd.date_range(start, end, freq=freq))

        ldf = None
        if method=='merge':
            ldf = pd.merge(df_in, df_sampling, left_index=True, right_index=True, how='outer')
        elif method=='concat':
            ldf = pd.concat([df_in, df_sampling], axis=1)
        else:
            raise Exception('Method unknown: ' + method)

        row = ldf.iloc[0,:]
        ldf.iloc[0,np.arange(len(row))[pd.isnull(row)]] = df_in.bfill().iloc[0,np.arange(len(row))[pd.isnull(row)]]
        ldf = ldf.interpolate()

        ldf = ldf.loc[df_sampling.index,:]
        
    return ldf

Generate Sample Data¶

In [5]:

two_years_in_millis = 1000.0 * 60.0 * 60.0 * 24 * 365 * 2
two_years_in_millis

Out[5]:

63072000000.0

In [6]:

# roughly every 2 weeks; year has 52 weeks
rs = np.random.RandomState(seed=42)
schedule_millis = np.sort(rs.randint(low=0, high=two_years_in_millis, size=(52,)))
random_values = rs.randint(low=5, high=500, size=(52,)) * .10
schedule_millis

Out[6]:

array([ 2253890010,  5950318585,  5992124617,  6120997885,  7395928407,  9019323606, 10232596331, 11511833222, 12283350500, 13498510183, 13667599921, 13920098395, 16931627608, 18171550593, 19114748744, 19831932064, 21478181249, 21505186044, 21609326044, 21717122356, 22367939125, 25826776337, 27684640889, 28389719467, 29190929843, 29992748275, 31498038644, 32514809293, 32860315778, 33309023619, 34428312921, 38855133183, 40586384939, 41092960003, 44667062782, 44922131914, 46298420295, 47343692632, 49663944717, 49871670585, 51275693469, 51426888379, 52327453966, 54103059476, 55118152455, 56223726525, 57514167376, 58958184286, 59814544355, 60874137634, 62785489853, 63064136635])

In [7]:

random_values

Out[7]:

array([21.9, 25.6, 19.4, 30. , 21.7, 21.2, 24.1, 34.2, 37.1,  5.7, 28.4, 41.4, 22.1, 25.6, 19.2, 38.4, 49.7,  4.5, 16.1,  1.9, 30.5,  6.9, 34.9, 33.1,  1.3, 34.8, 13.3, 49.6, 14. , 47.6,  6.7, 14.3,  8.5, 39.6, 16.7, 42.3, 29.3, 38.3, 26.5, 49.4, 23.5,  4.5,  3.2, 13.9, 20.5, 33.2, 27.2, 42.2,  3.7,  5.2, 41.1,  6.6])

In [8]:

import time, datetime

In [9]:

curr_time = round(time.time()*1000)
curr_time

Out[9]:

1662961270015

In [10]:

now_millis = 1662879679947
now_millis

Out[10]:

1662879679947

In [11]:

schedule_millis = schedule_millis + now_millis
schedule_millis

Out[11]:

array([1665133569957, 1668829998532, 1668871804564, 1669000677832, 1670275608354, 1671899003553, 1673112276278, 1674391513169, 1675163030447, 1676378190130, 1676547279868, 1676799778342, 1679811307555, 1681051230540, 1681994428691, 1682711612011, 1684357861196, 1684384865991, 1684489005991, 1684596802303, 1685247619072, 1688706456284, 1690564320836, 1691269399414, 1692070609790, 1692872428222, 1694377718591, 1695394489240, 1695739995725, 1696188703566, 1697307992868, 1701734813130, 1703466064886, 1703972639950, 1707546742729, 1707801811861, 1709178100242, 1710223372579, 1712543624664, 1712751350532, 1714155373416, 1714306568326, 1715207133913, 1716982739423, 1717997832402, 1719103406472, 1720393847323, 1721837864233, 1722694224302, 1723753817581, 1725665169800, 1725943816582], dtype=int64)

In [12]:

lds = pd.Series(schedule_millis).apply(lambda t: pd.to_datetime(datetime.datetime.fromtimestamp(t/1000.0)))
ldf = pd.DataFrame(random_values, index=lds.values, columns=['values'])
ldf = ldf.cumsum()
ldf.head()

Out[12]:

	values
2022-10-07 11:06:09.957	21.9
2022-11-19 04:53:18.532	47.5
2022-11-19 16:30:04.564	66.9
2022-11-21 04:17:57.832	96.9
2022-12-05 22:26:48.354	118.6

In [13]:

plt.figure(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
ax = plt.subplot(1, 1, 1)
ldf.plot(ax=ax)

Out[13]:

<AxesSubplot:>

Apply to Sample Data¶

In [14]:

start = ldf.index[0].replace(second=0,microsecond=0,nanosecond=0,minute=0,hour=0)
start

Out[14]:

Timestamp('2022-10-07 00:00:00')

In [15]:

df_sampled_1 = generate_interpolated_time_series(ldf, freq='D', method='merge', start=start)
df_sampled_2 = generate_interpolated_time_series(ldf, freq='D', method='concat', start=start)
df_sampled_3 = generate_interpolated_time_series(ldf, freq='D', method='reindex')
all([df_sampled_1.equals(df_sampled_2),df_sampled_1.equals(df_sampled_3)])

Out[15]:

True

In [16]:

df_sampled_1.head()

Out[16]:

	values
2022-10-07	21.900000
2022-10-08	22.481818
2022-10-09	23.063636
2022-10-10	23.645455
2022-10-11	24.227273

In the following graph you can see that the sampled data overlays the ground truth data nicely as the orange nicely covers the blue.

In [17]:

plt.figure(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
ax = plt.subplot(1, 1, 1)
ldf.plot(ax=ax)
df_sampled_1.plot(ax=ax)

Out[17]:

<AxesSubplot:>

Rolling Windows¶

Sometimes you want to know summary data for rolling windows like "a month" (30 days) or "a year" (365 days). With the pandas.rolling function that is nicely doable:

In [18]:

plt.figure(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
ax = plt.subplot(1, 1, 1)
df_sampled_1['values'].diff().fillna(0.0).rolling(window=30).sum().to_frame().plot(ax=ax)

Out[18]:

<AxesSubplot:>

In [ ]: