#!/usr/bin/env python
# coding: utf-8

# # Mean Absolute Difference in Pandas Tutorial
# This notebook explains how to calculate the mean absolute difference of
# time series data in `pandas`.
#
# This notebook will use gold and silver price data from `rdatasets` for
# this tutorial.

# ### Packages
# The documentation for each package used in this tutorial is linked below:
# * [pandas](https://pandas.pydata.org/docs/)
# * [statsmodels](https://www.statsmodels.org/stable/index.html)
# * [statsmodels.api](https://www.statsmodels.org/stable/api.html#statsmodels-api)
# * [numpy](https://numpy.org/doc/stable/)

# In[1]:


import statsmodels.api as sm
import numpy as np
import pandas as pd


# ## Create initial dataset
# The data is from `rdatasets` imported using the Python package
# `statsmodels`.

# In[2]:


# NOTE(review): the rename assumes `reset_index()` yields a column named
# 'index'; confirm against the installed statsmodels / Rdatasets version.
df = (
    sm.datasets.get_rdataset('GoldSilver', 'AER')
    .data
    .reset_index()
    .rename(columns={'index': 'date'})
)
df.info()


# In[3]:


df['date'] = pd.to_datetime(df['date'])


# ## Time series aggregation
# The `pandas` function `rolling` can be used to create aggregations on
# windows of specific lengths. Here, an aggregate of the daily gold and
# silver price data will be created covering the prior week.
#
# First, a datetime index needs to be created from the **date** column.

# In[4]:


df = df.set_index('date')


# Pandas aggregation requires a function that can be called. The mean
# absolute difference is computed with three functions from `numpy`:
# `mean`, `abs` and `diff`. `diff` calculates the difference between the
# current value and a prior value (by default the immediate prior value).

# In[5]:


def mean_absolute_difference(series):
    """Return the mean of the absolute differences between consecutive values.

    Parameters
    ----------
    series : array-like
        One-dimensional sequence of numeric observations (for example the
        contents of a single rolling window).

    Returns
    -------
    float
        ``mean(|x[i + 1] - x[i]|)`` over the input, or ``nan`` when the
        input holds fewer than two observations and therefore has no
        consecutive pair to difference.
    """
    values = np.asarray(series)
    # A time-based rolling window can contain a single observation (for
    # example the first row). `np.diff` is then empty and `np.mean` of an
    # empty array emits a RuntimeWarning before returning NaN, so return
    # NaN explicitly to keep the result while avoiding the warning.
    if values.size < 2:
        return np.nan
    return np.mean(np.abs(np.diff(values)))


# If, instead of an offset (**'7D'** representing 7 days), a number is
# used, the window will simply cover that many prior observations.

# In[6]:


weekly_resample = df.rolling('7D')
aggregated_df = weekly_resample.agg(['mean', mean_absolute_difference])
# Flatten the (column, statistic) pairs into single names such as
# '<column>_mean_week'.
aggregated_df.columns = [
    '_'.join(col).strip() + '_week'
    for col in aggregated_df.columns.values
]


# In[7]:


aggregated_df.head(20)


# In[ ]: