(c) 2016 - present. Enplus Advisors, Inc.
import datetime as dt
import numpy as np
import pandas as pd
pd.set_option('precision', 2)
# Sample dataset w/ 5 days of data
def make_dataset(n_days=4):
if n_days not in (4, 5):
raise ValueError(f"n_days must be 4 or 5, got {n_days:d}.")
data = {
'date': ['2015-12-28', '2015-12-29', '2015-12-30',
'2015-12-31', '2016-01-04'],
'goog': [762.51, 776.60, 771.00, 758.88, 741.84],
'aapl': [106.82, 108.74, 107.32, 105.26, 105.35]
}
n_max = len(data['date'])
slice_ = slice(n_max - n_days, n_max)
sub = {k: v[slice_] for k,v in data.items()}
dates = sub['date']
n = len(dates)
# breakpoint()
rv = pd.DataFrame({
'ticker': ['GOOG'] * n + ['AAPL'] * n,
'date': [pd.to_datetime(x) for x in dates] * 2,
'close': sub['goog'] + sub['aapl']
})
return rv
def make_long():
day_1 = dt.date(2015, 12, 29)
day_2 = dt.date(2015, 12, 30)
col_close = 'close'
col_open = 'open'
cols = ['date', 'ticker', 'variable', 'value']
rv = pd.DataFrame([
{'ticker': 'GOOG', 'date': day_1, 'variable': col_close, 'value': 776.60},
{'ticker': 'GOOG', 'date': day_2, 'variable': col_close, 'value': 771.00},
{'ticker': 'AAPL', 'date': day_1, 'variable': col_open, 'value': 107.01},
{'ticker': 'AAPL', 'date': day_1, 'variable': col_close, 'value': 105.26}
], columns=cols)
return rv
Working again with securities market data. In quant finance, this is a common data type, daily stock prices.
_dts = ['2015-12-29', '2015-12-30', '2015-12-31', '2016-01-04']
_goog = [776.60, 771.00, 758.88, 741.84]
_aapl = [108.74, 107.32, 105.26, 105.35]
df = pd.DataFrame({
'ticker': ['GOOG'] * 4 + ['AAPL'] * 4,
'date': [pd.to_datetime(x) for x in _dts] * 2,
'close': _goog + _aapl
})
df
ticker | date | close | |
---|---|---|---|
0 | GOOG | 2015-12-29 | 776.60 |
1 | GOOG | 2015-12-30 | 771.00 |
2 | GOOG | 2015-12-31 | 758.88 |
3 | GOOG | 2016-01-04 | 741.84 |
4 | AAPL | 2015-12-29 | 108.74 |
5 | AAPL | 2015-12-30 | 107.32 |
6 | AAPL | 2015-12-31 | 105.26 |
7 | AAPL | 2016-01-04 | 105.35 |
tbill = pd.DataFrame({
'date': [pd.to_datetime(x) for x in ['2015-12-30', '2016-01-04']],
'rate': [2.40, 2.56]
})
tbill
date | rate | |
---|---|---|
0 | 2015-12-30 | 2.40 |
1 | 2016-01-04 | 2.56 |
Don't use plain pd.merge
and fill forward across groups.
pd.merge(df, tbill, on='date', how='left').ffill()
ticker | date | close | rate | |
---|---|---|---|---|
0 | GOOG | 2015-12-29 | 776.60 | NaN |
1 | GOOG | 2015-12-30 | 771.00 | 2.40 |
2 | GOOG | 2015-12-31 | 758.88 | 2.40 |
3 | GOOG | 2016-01-04 | 741.84 | 2.56 |
4 | AAPL | 2015-12-29 | 108.74 | 2.56 |
5 | AAPL | 2015-12-30 | 107.32 | 2.40 |
6 | AAPL | 2015-12-31 | 105.26 | 2.40 |
7 | AAPL | 2016-01-04 | 105.35 | 2.56 |
mkt = pd.merge_ordered(df, tbill, on='date', left_by='ticker', fill_method='ffill')
mkt
ticker | date | close | rate | |
---|---|---|---|---|
0 | GOOG | 2015-12-29 | 776.60 | NaN |
1 | GOOG | 2015-12-30 | 771.00 | 2.40 |
2 | GOOG | 2015-12-31 | 758.88 | 2.40 |
3 | GOOG | 2016-01-04 | 741.84 | 2.56 |
4 | AAPL | 2015-12-29 | 108.74 | NaN |
5 | AAPL | 2015-12-30 | 107.32 | 2.40 |
6 | AAPL | 2015-12-31 | 105.26 | 2.40 |
7 | AAPL | 2016-01-04 | 105.35 | 2.56 |
pandas
documentation. Also called Entity-Attribute-Value (EAV)
TODO: Include diagram of Long Format
def make_long_aapl():
day_1 = dt.date(2015, 12, 29)
day_2 = dt.date(2015, 12, 30)
col_close = 'close'
col_open = 'open'
cols = ['date', 'ticker', 'variable', 'value']
rv = pd.DataFrame([
{'ticker': 'AAPL', 'date': day_1, 'variable': col_open, 'value': 106.96},
{'ticker': 'AAPL', 'date': day_1, 'variable': col_close, 'value': 108.74},
{'ticker': 'AAPL', 'date': day_2, 'variable': col_open, 'value': 108.58},
{'ticker': 'AAPL', 'date': day_2, 'variable': col_close, 'value': 107.32}
], columns=cols)
return rv
aapl_long = make_long_aapl()
aapl_long
date | ticker | variable | value | |
---|---|---|---|---|
0 | 2015-12-29 | AAPL | open | 106.96 |
1 | 2015-12-29 | AAPL | close | 108.74 |
2 | 2015-12-30 | AAPL | open | 108.58 |
3 | 2015-12-30 | AAPL | close | 107.32 |
TODO: Include diagram of wide format
aapl_long
date | ticker | variable | value | |
---|---|---|---|---|
0 | 2015-12-29 | AAPL | open | 106.96 |
1 | 2015-12-29 | AAPL | close | 108.74 |
2 | 2015-12-30 | AAPL | open | 108.58 |
3 | 2015-12-30 | AAPL | close | 107.32 |
aapl_long.pivot(index='date', columns='variable', values='value')
variable | close | open |
---|---|---|
date | ||
2015-12-29 | 108.74 | 106.96 |
2015-12-30 | 107.32 | 108.58 |
aapl_wide = aapl_long.set_index(['date', 'ticker', 'variable']).unstack()
aapl_wide
value | |||
---|---|---|---|
variable | close | open | |
date | ticker | ||
2015-12-29 | AAPL | 108.74 | 106.96 |
2015-12-30 | AAPL | 107.32 | 108.58 |
aapl_wide.stack().reset_index()
date | ticker | variable | value | |
---|---|---|---|---|
0 | 2015-12-29 | AAPL | close | 108.74 |
1 | 2015-12-29 | AAPL | open | 106.96 |
2 | 2015-12-30 | AAPL | close | 107.32 |
3 | 2015-12-30 | AAPL | open | 108.58 |
mkt
ticker | date | close | rate | |
---|---|---|---|---|
0 | GOOG | 2015-12-29 | 776.60 | NaN |
1 | GOOG | 2015-12-30 | 771.00 | 2.40 |
2 | GOOG | 2015-12-31 | 758.88 | 2.40 |
3 | GOOG | 2016-01-04 | 741.84 | 2.56 |
4 | AAPL | 2015-12-29 | 108.74 | NaN |
5 | AAPL | 2015-12-30 | 107.32 | 2.40 |
6 | AAPL | 2015-12-31 | 105.26 | 2.40 |
7 | AAPL | 2016-01-04 | 105.35 | 2.56 |
pd.pivot_table(mkt, index='ticker', aggfunc='mean')
close | rate | |
---|---|---|
ticker | ||
AAPL | 106.67 | 2.45 |
GOOG | 762.08 | 2.45 |