%load_ext watermark
%watermark -a 'cs224' -u -d -v -p numpy,pandas
Author: cs224 Last updated: 2022-09-12 Python implementation: CPython Python version : 3.8.12 IPython version : 8.2.0 numpy : 1.23.1 pandas: 1.4.2
%matplotlib inline
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# Widen pandas' rendering limits so wide frames are shown without being
# truncated or wrapped.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Optional: uncomment to render floats with two decimals.
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Configure NumPy printing through the public API only: show 10 items at each
# edge of summarized arrays, allow very long lines, and suppress scientific
# notation for small floats.
np.set_printoptions(edgeitems=10, linewidth=1000, suppress=True)
# Apply seaborn's default plot theme (independent of the random state).
sns.set()
# Fix NumPy's global random state so the notebook is reproducible.
SEED = 42
np.random.seed(SEED)
from IPython.display import display, HTML
# Inject a CSS rule that sets the notebook container to 70% width.
_notebook_css = "<style>.container { width:70% !important; }</style>"
display(HTML(_notebook_css))
The function generate_interpolated_time_series implements three interchangeable methods for generating samples at regular time intervals: via DataFrame.reindex (the default), via pd.merge, and via pd.concat:
def generate_interpolated_time_series(df_in, freq='min', method='reindex', start=None,
                                      interpolation='linear'):
    """Resample an irregularly timed frame onto a regular grid by interpolation.

    A regular ``pd.date_range`` grid running from ``start`` to the last
    timestamp of ``df_in`` is combined with the original index, the gaps are
    interpolated, and only the rows on the regular grid are returned. Grid
    points that lie before the first valid observation are back-filled with
    that observation.

    Parameters
    ----------
    df_in : pd.DataFrame
        Input data with a datetime index. The first and last index entries are
        used as the time range, so the index is expected to be in
        chronological order.
    freq : str, default 'min'
        pandas frequency alias for the grid spacing. ``'min'`` is one minute,
        the same frequency as the alias ``'T'``, which newer pandas versions
        deprecate.
    method : {'reindex', 'merge', 'concat'}, default 'reindex'
        How the regular grid is combined with the data. All three build the
        same union of timestamps and therefore return the same result.
    start : timestamp-like, optional
        First grid point. Defaults to the first timestamp of ``df_in`` floored
        to ``freq``.
    interpolation : str, default 'linear'
        Passed as ``method`` to ``DataFrame.interpolate``. ``'linear'`` treats
        consecutive rows as equally spaced regardless of the real time gap
        between them; use ``'time'`` to weight by the actual timestamps.

    Returns
    -------
    pd.DataFrame
        The interpolated values, indexed by the regular grid. An empty input
        yields an empty copy of ``df_in``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Validate first so a typo fails fast, before any work is done.
    if method not in ('reindex', 'merge', 'concat'):
        raise ValueError(f'Method unknown: {method}')
    # Nothing to resample: there is no first/last timestamp to span a grid.
    if len(df_in.index) == 0:
        return df_in.copy()

    if start is None:
        start = df_in.index[0].floor(freq)
    end = df_in.index[-1]
    idx = pd.date_range(start=start, end=end, freq=freq)

    # Build a frame holding the original rows plus NaN rows at the grid points.
    if method == 'reindex':
        ldf = df_in.reindex(df_in.index.union(idx))
    else:
        df_sampling = pd.DataFrame(index=idx)
        if method == 'merge':
            ldf = pd.merge(df_in, df_sampling, left_index=True, right_index=True, how='outer')
        else:
            ldf = pd.concat([df_in, df_sampling], axis=1)

    # Interpolation is order dependent, so do not rely on the join having
    # returned the union of timestamps in sorted order.
    ldf = ldf.sort_index()
    # bfill covers grid points that precede the first valid observation.
    ldf = ldf.interpolate(method=interpolation).bfill()
    # Keep only the regular grid; the original irregular rows are dropped.
    return ldf.loc[idx, :]
# Two 365-day years expressed in milliseconds, built up unit by unit.
millis_per_day = 1000.0 * 60.0 * 60.0 * 24
two_years_in_millis = millis_per_day * 365 * 2
two_years_in_millis
63072000000.0
# 52 observations spread over two years: roughly one every 2 weeks
# (a year has 52 weeks).
n_samples = 52
rs = np.random.RandomState(seed=42)
# Millisecond offsets within the two-year span, sorted chronologically.
# The bound is passed as an int and the dtype is pinned to 64 bits: the span
# (~6.3e10) does not fit a 32-bit integer, which is the default integer type
# on some platforms.
schedule_millis = np.sort(
    rs.randint(low=0, high=int(two_years_in_millis), size=(n_samples,), dtype=np.int64)
)
# Random increments in [0.5, 50.0) with one decimal.
random_values = rs.randint(low=5, high=500, size=(n_samples,)) * .10
schedule_millis
array([ 2253890010, 5950318585, 5992124617, 6120997885, 7395928407, 9019323606, 10232596331, 11511833222, 12283350500, 13498510183, 13667599921, 13920098395, 16931627608, 18171550593, 19114748744, 19831932064, 21478181249, 21505186044, 21609326044, 21717122356, 22367939125, 25826776337, 27684640889, 28389719467, 29190929843, 29992748275, 31498038644, 32514809293, 32860315778, 33309023619, 34428312921, 38855133183, 40586384939, 41092960003, 44667062782, 44922131914, 46298420295, 47343692632, 49663944717, 49871670585, 51275693469, 51426888379, 52327453966, 54103059476, 55118152455, 56223726525, 57514167376, 58958184286, 59814544355, 60874137634, 62785489853, 63064136635])
random_values
array([21.9, 25.6, 19.4, 30. , 21.7, 21.2, 24.1, 34.2, 37.1, 5.7, 28.4, 41.4, 22.1, 25.6, 19.2, 38.4, 49.7, 4.5, 16.1, 1.9, 30.5, 6.9, 34.9, 33.1, 1.3, 34.8, 13.3, 49.6, 14. , 47.6, 6.7, 14.3, 8.5, 39.6, 16.7, 42.3, 29.3, 38.3, 26.5, 49.4, 23.5, 4.5, 3.2, 13.9, 20.5, 33.2, 27.2, 42.2, 3.7, 5.2, 41.1, 6.6])
import time, datetime
# Current wall-clock time as integer milliseconds since the Unix epoch.
seconds_since_epoch = time.time()
curr_time = round(seconds_since_epoch * 1000)
curr_time
1662961270015
# Fixed reference instant (epoch milliseconds) used as the origin of the
# schedule, hard-coded so that re-running the notebook gives the same dates.
now_millis = 1_662_879_679_947
now_millis
1662879679947
# Shift the relative offsets so they become absolute epoch milliseconds.
schedule_millis = np.add(schedule_millis, now_millis)
schedule_millis
array([1665133569957, 1668829998532, 1668871804564, 1669000677832, 1670275608354, 1671899003553, 1673112276278, 1674391513169, 1675163030447, 1676378190130, 1676547279868, 1676799778342, 1679811307555, 1681051230540, 1681994428691, 1682711612011, 1684357861196, 1684384865991, 1684489005991, 1684596802303, 1685247619072, 1688706456284, 1690564320836, 1691269399414, 1692070609790, 1692872428222, 1694377718591, 1695394489240, 1695739995725, 1696188703566, 1697307992868, 1701734813130, 1703466064886, 1703972639950, 1707546742729, 1707801811861, 1709178100242, 1710223372579, 1712543624664, 1712751350532, 1714155373416, 1714306568326, 1715207133913, 1716982739423, 1717997832402, 1719103406472, 1720393847323, 1721837864233, 1722694224302, 1723753817581, 1725665169800, 1725943816582], dtype=int64)
# Convert epoch milliseconds to pandas timestamps.
# NOTE: datetime.fromtimestamp() without a tz argument returns local time, so
# the resulting wall-clock values depend on the timezone of the machine.
timestamps = [
    pd.to_datetime(datetime.datetime.fromtimestamp(millis / 1000.0))
    for millis in schedule_millis
]
lds = pd.Series(timestamps)
# Index the random increments by time and accumulate them into a running total.
increments = pd.DataFrame(random_values, index=lds.values, columns=['values'])
ldf = increments.cumsum()
ldf.head()
values | |
---|---|
2022-10-07 11:06:09.957 | 21.9 |
2022-11-19 04:53:18.532 | 47.5 |
2022-11-19 16:30:04.564 | 66.9 |
2022-11-21 04:17:57.832 | 96.9 |
2022-12-05 22:26:48.354 | 118.6 |
# Wide single-axes figure showing the irregularly sampled cumulative series.
fig, ax = plt.subplots(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
ldf.plot(ax=ax)
<AxesSubplot:>
# Midnight of the day of the first observation: the first daily grid point.
first_observation = ldf.index[0]
start = first_observation.normalize()
start
Timestamp('2022-10-07 00:00:00')
# Sample the series on a daily grid with each of the three strategies; the
# 'reindex' call relies on the default start (first timestamp floored to the day).
df_sampled_1 = generate_interpolated_time_series(ldf, method='merge', freq='D', start=start)
df_sampled_2 = generate_interpolated_time_series(ldf, method='concat', freq='D', start=start)
df_sampled_3 = generate_interpolated_time_series(ldf, method='reindex', freq='D')
# Check that all strategies agree.
all(df_sampled_1.equals(other) for other in (df_sampled_2, df_sampled_3))
True
df_sampled_1.head()
values | |
---|---|
2022-10-07 | 21.900000 |
2022-10-08 | 22.481818 |
2022-10-09 | 23.063636 |
2022-10-10 | 23.645455 |
2022-10-11 | 24.227273 |
In the following graph you can see that the sampled data closely follows the ground-truth data: the orange line (sampled) almost completely covers the blue line (original).
# Overlay the daily samples on the original irregular series.
fig, ax = plt.subplots(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
ldf.plot(ax=ax)
df_sampled_1.plot(ax=ax)
<AxesSubplot:>
Sometimes you want summary statistics over rolling windows such as "a month" (30 days) or "a year" (365 days). Because the data is now sampled at regular daily intervals, this is easy to do with the pandas rolling method:
# Rolling 30-sample sum of the day-to-day increments of the daily series.
fig, ax = plt.subplots(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
# The first diff is NaN (no predecessor); count it as a zero increment.
daily_increments = df_sampled_1['values'].diff().fillna(0.0)
rolling_sum = daily_increments.rolling(window=30).sum()
rolling_sum.to_frame().plot(ax=ax)
<AxesSubplot:>