#!/usr/bin/env python # coding: utf-8 # (c) 2016 - present. Enplus Advisors, Inc. # In[1]: import datetime as dt import numpy as np import pandas as pd pd.set_option('precision', 2) # In[2]: # Sample dataset w/ 5 days of data def make_dataset(n_days=4): if n_days not in (4, 5): raise ValueError(f"n_days must be 4 or 5, got {n_days:d}.") data = { 'date': ['2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31', '2016-01-04'], 'goog': [762.51, 776.60, 771.00, 758.88, 741.84], 'aapl': [106.82, 108.74, 107.32, 105.26, 105.35] } n_max = len(data['date']) slice_ = slice(n_max - n_days, n_max) sub = {k: v[slice_] for k,v in data.items()} dates = sub['date'] n = len(dates) # breakpoint() rv = pd.DataFrame({ 'ticker': ['GOOG'] * n + ['AAPL'] * n, 'date': [pd.to_datetime(x) for x in dates] * 2, 'close': sub['goog'] + sub['aapl'] }) return rv # In[3]: def make_long(): day_1 = dt.date(2015, 12, 29) day_2 = dt.date(2015, 12, 30) col_close = 'close' col_open = 'open' cols = ['date', 'ticker', 'variable', 'value'] rv = pd.DataFrame([ {'ticker': 'GOOG', 'date': day_1, 'variable': col_close, 'value': 776.60}, {'ticker': 'GOOG', 'date': day_2, 'variable': col_close, 'value': 771.00}, {'ticker': 'AAPL', 'date': day_1, 'variable': col_open, 'value': 107.01}, {'ticker': 'AAPL', 'date': day_1, 'variable': col_close, 'value': 105.26} ], columns=cols) return rv # # Programming with Data:
Advanced Python and Pandas # # # Advanced Merging & Reshaping # ## Grouped and Ordered Data # Working again with securities market data. In quant finance, this is a # common data type, daily stock prices. # ### Display the data # In[4]: _dts = ['2015-12-29', '2015-12-30', '2015-12-31', '2016-01-04'] _goog = [776.60, 771.00, 758.88, 741.84] _aapl = [108.74, 107.32, 105.26, 105.35] df = pd.DataFrame({ 'ticker': ['GOOG'] * 4 + ['AAPL'] * 4, 'date': [pd.to_datetime(x) for x in _dts] * 2, 'close': _goog + _aapl }) df # ### A single, ordered series # In[5]: tbill = pd.DataFrame({ 'date': [pd.to_datetime(x) for x in ['2015-12-30', '2016-01-04']], 'rate': [2.40, 2.56] }) tbill # ## Merge data that is grouped and ordered # # * Left panel is irregularly spaced, e.g. business days # * Right time series also irregularly spaced, e.g. a sparse subset of the first # series # ### How not to do the merge # Don't use plain `pd.merge` and fill forward across groups. # In[6]: pd.merge(df, tbill, on='date', how='left').ffill() # ### Merge Ordered V2 # In[7]: mkt = pd.merge_ordered(df, tbill, on='date', left_by='ticker', fill_method='ffill') mkt # # Reshaping & Pivoting # ## Wide and Long Formats # # * Depending on the operation or the data storage location, data stored # in a "wide" or "long" format # ### Long Format # # * Common format for data in relational databases because allows # new attributes without a schema change # * "Long" format is also called "stacked" or "record" format in the # `pandas` documentation. Also called `Entity-Attribute-Value (EAV)` # * "Sparse" by design # TODO: Include diagram of Long Format # In[8]: def make_long_aapl(): day_1 = dt.date(2015, 12, 29) day_2 = dt.date(2015, 12, 30) col_close = 'close' col_open = 'open' cols = ['date', 'ticker', 'variable', 'value'] rv = pd.DataFrame([ {'ticker': 'AAPL', 'date': day_1, 'variable': col_open, 'value': 106.96}, {'ticker': 'AAPL', 'date': day_1, 'variable': col_close, 'value': 108.74}, {'ticker': 'AAPL', 'date': day_2, 'variable': col_open, 'value': 108.58}, {'ticker': 'AAPL', 'date': day_2, 'variable': col_close, 'value': 107.32} ], columns=cols) return rv # ### Simplest Long Format # # * Multiple attributes for a single entity (AAPL) # * Row for every period (12/29 & 12/30) x (number of attributes) # In[9]: aapl_long = make_long_aapl() aapl_long # ### Wide Format # # * Identifiers stored in the index # * Each attribute has its own column # * Common format for use by machine learning algorithms # TODO: Include diagram of wide format # ### Long-to-Wide # In[10]: aapl_long # In[11]: aapl_long.pivot(index='date', columns='variable', values='value') # ### Long-to-Wide with multiple ID columns # In[12]: aapl_wide = aapl_long.set_index(['date', 'ticker', 'variable']).unstack() aapl_wide # ### Wide-to-Long # In[13]: aapl_wide.stack().reset_index() # ## Pivot Tables # In[14]: mkt # ### Simple Pivot Table # In[15]: pd.pivot_table(mkt, index='ticker', aggfunc='mean')