#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'cs224' -u -d -v -p numpy,pandas")


# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
np.set_printoptions(edgeitems=10)
np.set_printoptions(linewidth=1000)
np.set_printoptions(suppress=True)
# NOTE(review): this pokes a private NumPy attribute and `np.core` is
# deprecated as of NumPy 2.0; `set_printoptions(linewidth=...)` above is the
# supported way to configure the line width. Kept for backward compatibility.
np.core.arrayprint._line_width = 180

SEED = 42
np.random.seed(SEED)

sns.set()


# In[3]:


from IPython.display import display, HTML

display(HTML(""))


# # Sample Data at Regular Times

# The function `generate_interpolated_time_series` implements three different
# methods for generating samples at regular time intervals: via `reindex`,
# via `pd.merge` and via `pd.concat`.

# In[4]:


def generate_interpolated_time_series(df_in, freq='min', method='reindex', start=None):
    """Resample an irregularly timed frame onto a regular grid by interpolation.

    The observations in ``df_in`` are combined with a regular
    ``pd.date_range`` grid, the gaps are filled by time-weighted linear
    interpolation, and only the rows that lie on the grid are returned.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input data with a ``DatetimeIndex``. The index is expected to be
        sorted ascending: the first and last labels are used as the time
        span of the grid.
    freq : str, default 'min'
        Pandas offset alias giving the grid spacing. ``'min'`` is the
        non-deprecated spelling of the former default ``'T'`` (one minute).
    method : {'reindex', 'merge', 'concat'}, default 'reindex'
        How the union of observations and grid points is built. All three
        produce the same values; they only differ in the pandas primitive
        that is used.
    start : timestamp-like, optional
        First grid point. Defaults to the first index label floored to
        ``freq``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns of ``df_in``, indexed by the regular grid.
        An empty input yields an empty copy of ``df_in``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ('reindex', 'merge', 'concat'):
        raise ValueError('Method unknown: ' + str(method))

    # Nothing to interpolate; avoids an IndexError on ``index[0]`` below.
    if len(df_in.index) == 0:
        return df_in.copy()

    if start is None:
        start = df_in.index[0].floor(freq)
    end = df_in.index[-1]
    grid = pd.date_range(start=start, end=end, freq=freq)

    # Build a frame that holds both the original observations and (still
    # empty) rows for every grid point.
    if method == 'reindex':
        combined = df_in.reindex(df_in.index.union(grid))
    else:
        df_sampling = pd.DataFrame(index=grid)
        if method == 'merge':
            combined = pd.merge(df_in, df_sampling, left_index=True, right_index=True, how='outer')
        else:
            combined = pd.concat([df_in, df_sampling], axis=1)

    # Interpolation needs chronological order; do not rely on the join
    # primitives sorting the union index implicitly.
    combined = combined.sort_index()

    # ``method='time'`` weights by the actual time distance between rows. The
    # default ``'linear'`` ignores the index and treats the irregularly spaced
    # rows as equidistant, which skews every interpolated grid value.
    # ``bfill`` covers grid points that precede the first valid observation.
    combined = combined.interpolate(method='time').bfill()

    # Keep the regular grid only; off-grid observations are dropped.
    return combined.loc[grid, :]


# # Generate Sample Data

# In[5]:


two_years_in_millis = 1000.0 * 60.0 * 60.0 * 24 * 365 * 2
two_years_in_millis


# In[6]:


# roughly every 2 weeks; year has 52 weeks
rs = np.random.RandomState(seed=SEED)
schedule_millis = np.sort(rs.randint(low=0, high=two_years_in_millis, size=(52,)))
random_values = rs.randint(low=5, high=500, size=(52,)) * .10
schedule_millis


# In[7]:


random_values


# In[8]:


import time
import datetime


# In[9]:


curr_time = round(time.time() * 1000)
curr_time


# In[10]:


# Fixed reference time so that the generated schedule is reproducible.
now_millis = 1662879679947
now_millis


# In[11]:


schedule_millis = schedule_millis + now_millis
schedule_millis


# In[12]:


# NOTE(review): `datetime.fromtimestamp` without a tz converts to the local
# time zone of the machine, so the absolute timestamps depend on where the
# notebook is executed.
lds = pd.Series(schedule_millis).apply(
    lambda t: pd.to_datetime(datetime.datetime.fromtimestamp(t / 1000.0))
)
ldf = pd.DataFrame(random_values, index=lds.values, columns=['values'])
ldf = ldf.cumsum()
ldf.head()


# In[13]:


plt.figure(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
ax = plt.subplot(1, 1, 1)
ldf.plot(ax=ax)


# # Apply to Sample Data

# In[14]:


# Midnight of the day of the first observation.
start = ldf.index[0].normalize()
start


# In[15]:


df_sampled_1 = generate_interpolated_time_series(ldf, freq='D', method='merge', start=start)
df_sampled_2 = generate_interpolated_time_series(ldf, freq='D', method='concat', start=start)
df_sampled_3 = generate_interpolated_time_series(ldf, freq='D', method='reindex')
# All three methods are expected to yield identical results.
all([df_sampled_1.equals(df_sampled_2), df_sampled_1.equals(df_sampled_3)])


# In[16]:


df_sampled_1.head()


# In the following graph you can see that the sampled data overlays the ground
# truth data nicely, as the orange line covers the blue one.

# In[17]:


plt.figure(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
ax = plt.subplot(1, 1, 1)
ldf.plot(ax=ax)
df_sampled_1.plot(ax=ax)


# ## Rolling Windows

# Sometimes you want to know summary data for rolling windows like "a month"
# (30 days) or "a year" (365 days). With the pandas `rolling` method that is
# nicely doable:

# In[18]:


plt.figure(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
ax = plt.subplot(1, 1, 1)
# Daily increments summed over a 30-sample (i.e. 30-day) window.
(
    df_sampled_1['values']
    .diff()
    .fillna(0.0)
    .rolling(window=30)
    .sum()
    .to_frame()
    .plot(ax=ax)
)


# In[ ]: