#!/usr/bin/env python
# coding: utf-8

# # Financial and Economic Data Applications

# In[ ]:

from __future__ import division
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
pd.options.display.max_rows = 12
np.set_printoptions(precision=4, suppress=True)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(12, 6))

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[ ]:

get_ipython().run_line_magic('pwd', '')

# ## Data munging topics

# ### Time series and cross-section alignment

# In[ ]:

close_px = pd.read_csv('ch11/stock_px.csv', parse_dates=True, index_col=0)
volume = pd.read_csv('ch11/volume.csv', parse_dates=True, index_col=0)
prices = close_px.ix['2011-09-05':'2011-09-14', ['AAPL', 'JNJ', 'SPX', 'XOM']]
volume = volume.ix['2011-09-05':'2011-09-12', ['AAPL', 'JNJ', 'XOM']]

# In[ ]:

prices

# In[ ]:

volume

# In[ ]:

prices * volume

# In[ ]:

vwap = (prices * volume).sum() / volume.sum()

# In[ ]:

vwap

# In[ ]:

vwap.dropna()

# In[ ]:

prices.align(volume, join='inner')
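
# Arithmetic between the two frames lines up on both index and columns
# automatically, which is why SPX drops out of the VWAP as NaN: it has no
# volume column. A quick cross-check, assuming the `prices` and `volume`
# frames above:

# In[ ]:

# Align one ticker by hand and recompute its volume-weighted average
# price; it should match the aligned one-liner's result for AAPL.
p, v = prices['AAPL'].align(volume['AAPL'], join='inner')
(p * v).sum() / v.sum(), vwap['AAPL']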
# In[ ]:

s1 = Series(range(3), index=['a', 'b', 'c'])
s2 = Series(range(4), index=['d', 'b', 'c', 'e'])
s3 = Series(range(3), index=['f', 'a', 'c'])
DataFrame({'one': s1, 'two': s2, 'three': s3})

# In[ ]:

DataFrame({'one': s1, 'two': s2, 'three': s3}, index=list('face'))

# ### Operations with time series of different frequencies

# In[ ]:

ts1 = Series(np.random.randn(3),
             index=pd.date_range('2012-6-13', periods=3, freq='W-WED'))
ts1

# In[ ]:

ts1.resample('B')

# In[ ]:

ts1.resample('B', fill_method='ffill')

# In[ ]:

dates = pd.DatetimeIndex(['2012-6-12', '2012-6-17', '2012-6-18',
                          '2012-6-21', '2012-6-22', '2012-6-29'])
ts2 = Series(np.random.randn(6), index=dates)
ts2

# In[ ]:

ts1.reindex(ts2.index, method='ffill')

# In[ ]:

ts2 + ts1.reindex(ts2.index, method='ffill')

# #### Using periods instead of timestamps

# In[ ]:

gdp = Series([1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46],
             index=pd.period_range('1984Q2', periods=7, freq='Q-SEP'))
infl = Series([0.025, 0.045, 0.037, 0.04],
              index=pd.period_range('1982', periods=4, freq='A-DEC'))
gdp

# In[ ]:

infl

# In[ ]:

infl_q = infl.asfreq('Q-SEP', how='end')

# In[ ]:

infl_q

# In[ ]:

infl_q.reindex(gdp.index, method='ffill')

# ### Time of day and "as of" data selection

# In[ ]:

# Make an intraday date range and time series
rng = pd.date_range('2012-06-01 09:30', '2012-06-01 15:59', freq='T')
# Make a 4-day series of 9:30-15:59 values
rng = rng.append([rng + pd.offsets.BDay(i) for i in range(1, 4)])
ts = Series(np.arange(len(rng), dtype=float), index=rng)
ts

# In[ ]:

from datetime import time
ts[time(10, 0)]

# In[ ]:

ts.at_time(time(10, 0))

# In[ ]:

ts.between_time(time(10, 0), time(10, 1))

# In[ ]:

np.random.seed(12346)

# In[ ]:

# Set most of the time series randomly to NA
indexer = np.sort(np.random.permutation(len(ts))[700:])
irr_ts = ts.copy()
irr_ts[indexer] = np.nan
irr_ts['2012-06-01 09:50':'2012-06-01 10:00']

# In[ ]:

selection = pd.date_range('2012-06-01 10:00', periods=4, freq='B')
irr_ts.asof(selection)

# ### Splicing together data sources

# In[ ]:

data1 = DataFrame(np.ones((6, 3), dtype=float), columns=['a', 'b', 'c'],
                  index=pd.date_range('6/12/2012', periods=6))
data2 = DataFrame(np.ones((6, 3), dtype=float) * 2,
                  columns=['a', 'b', 'c'],
                  index=pd.date_range('6/13/2012', periods=6))
spliced = pd.concat([data1.ix[:'2012-06-14'], data2.ix['2012-06-15':]])
spliced

# In[ ]:

data2 = DataFrame(np.ones((6, 4), dtype=float) * 2,
                  columns=['a', 'b', 'c', 'd'],
                  index=pd.date_range('6/13/2012', periods=6))
spliced = pd.concat([data1.ix[:'2012-06-14'], data2.ix['2012-06-15':]])
spliced

# In[ ]:

spliced_filled = spliced.combine_first(data2)
spliced_filled

# In[ ]:

spliced.update(data2, overwrite=False)

# In[ ]:

spliced

# In[ ]:

cp_spliced = spliced.copy()
cp_spliced[['a', 'c']] = data1[['a', 'c']]
cp_spliced

# ### Return indexes and cumulative returns

# In[ ]:

import pandas.io.data as web
price = web.get_data_yahoo('AAPL', '2011-01-01')['Adj Close']
price[-5:]

# In[ ]:

price['2011-10-03'] / price['2011-3-01'] - 1

# In[ ]:

returns = price.pct_change()
ret_index = (1 + returns).cumprod()
ret_index[0] = 1  # Set first value to 1
ret_index

# In[ ]:

m_returns = ret_index.resample('BM', how='last').pct_change()
m_returns['2012']

# In[ ]:

m_rets = (1 + returns).resample('M', how='prod', kind='period') - 1
m_rets['2012']

# If dividend dates and percentage payouts were available (hypothetical
# variables, not defined in this notebook), the returns could be
# adjusted to total returns like so:
# returns[dividend_dates] += dividend_pcts

# ## Group transforms and analysis

# In[ ]:

pd.options.display.max_rows = 100
pd.options.display.max_columns = 10
np.random.seed(12345)

# In[ ]:

import random; random.seed(0)
import string

N = 1000

def rands(n):
    choices = string.ascii_uppercase
    return ''.join([random.choice(choices) for _ in range(n)])

tickers = np.array([rands(5) for _ in range(N)])

# In[ ]:

M = 500
df = DataFrame({'Momentum': np.random.randn(M) / 200 + 0.03,
                'Value': np.random.randn(M) / 200 + 0.08,
                'ShortInterest': np.random.randn(M) / 200 - 0.02},
               index=tickers[:M])

# In[ ]:

ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N)
industries = Series(ind_names[sampler], index=tickers,
                    name='industry')

# In[ ]:

by_industry = df.groupby(industries)
by_industry.mean()

# In[ ]:

by_industry.describe()

# In[ ]:

# Within-industry standardize
def zscore(group):
    return (group - group.mean()) / group.std()

df_stand = by_industry.apply(zscore)

# In[ ]:

df_stand.groupby(industries).agg(['mean', 'std'])

# In[ ]:

# Within-industry rank descending
ind_rank = by_industry.rank(ascending=False)
ind_rank.groupby(industries).agg(['min', 'max'])

# In[ ]:

# Industry rank and standardize
by_industry.apply(lambda x: zscore(x.rank()))

# ### Group factor exposures

# In[ ]:

from numpy.random import rand
fac1, fac2, fac3 = np.random.rand(3, 1000)

ticker_subset = tickers.take(np.random.permutation(N)[:1000])

# Weighted sum of factors plus noise
port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + rand(1000),
              index=ticker_subset)
factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                    index=ticker_subset)

# In[ ]:

factors.corrwith(port)

# In[ ]:

pd.ols(y=port, x=factors).beta

# In[ ]:

def beta_exposure(chunk, factors=None):
    return pd.ols(y=chunk, x=factors).beta

# In[ ]:

by_ind = port.groupby(industries)
exposures = by_ind.apply(beta_exposure, factors=factors)
exposures.unstack()
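
# `pd.ols` was part of pandas at the time of writing and has since been
# removed. As a minimal sketch of the same per-industry exposure
# computation using numpy's least squares (`beta_exposure_np` is a
# hypothetical helper, not the book's code):

# In[ ]:

def beta_exposure_np(chunk, factors=None):
    # Pick out the factor rows for this group's tickers and append an
    # intercept column, mirroring pd.ols's default intercept
    X = factors.reindex(chunk.index)
    A = np.column_stack([X.values, np.ones(len(X))])
    coefs = np.linalg.lstsq(A, chunk.values)[0]
    return Series(coefs, index=list(X.columns) + ['intercept'])

port.groupby(industries).apply(beta_exposure_np, factors=factors).unstack()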
# ### Decile and quartile analysis

# In[ ]:

import pandas.io.data as web
data = web.get_data_yahoo('SPY', '2006-01-01')
data.info()

# In[ ]:

px = data['Adj Close']
returns = px.pct_change()

def to_index(rets):
    index = (1 + rets).cumprod()
    # Set the value just before the first valid return to 1
    first_loc = max(index.notnull().argmax() - 1, 0)
    index.values[first_loc] = 1
    return index

def trend_signal(rets, lookback, lag):
    signal = pd.rolling_sum(rets, lookback, min_periods=lookback - 5)
    return signal.shift(lag)

# In[ ]:

signal = trend_signal(returns, 100, 3)
trade_friday = signal.resample('W-FRI').resample('B', fill_method='ffill')
trade_rets = trade_friday.shift(1) * returns
trade_rets = trade_rets[:len(returns)]

# In[ ]:

to_index(trade_rets).plot()

# In[ ]:

vol = pd.rolling_std(returns, 250, min_periods=200) * np.sqrt(250)

def sharpe(rets, ann=250):
    return rets.mean() / rets.std() * np.sqrt(ann)

# In[ ]:

cats = pd.qcut(vol, 4)
print('cats: %d, trade_rets: %d, vol: %d' % (len(cats), len(trade_rets), len(vol)))

# In[ ]:

trade_rets.groupby(cats).agg(sharpe)

# ## More example applications

# ### Signal frontier analysis

# In[ ]:

names = ['AAPL', 'GOOG', 'MSFT', 'DELL', 'GS', 'MS', 'BAC', 'C']

def get_px(stock, start, end):
    return web.get_data_yahoo(stock, start, end)['Adj Close']

px = DataFrame({n: get_px(n, None, None) for n in names})

# In[ ]:

#px = pd.read_csv('ch11/stock_px.csv')

# In[ ]:

plt.close('all')

# In[ ]:

px = px.asfreq('B').fillna(method='pad')
rets = px.pct_change()
((1 + rets).cumprod() - 1).plot()

# In[ ]:

def calc_mom(price, lookback, lag):
    mom_ret = price.shift(lag).pct_change(lookback)
    ranks = mom_ret.rank(axis=1, ascending=False)
    demeaned = ranks.subtract(ranks.mean(axis=1), axis=0)
    return demeaned.divide(demeaned.std(axis=1), axis=0)

# In[ ]:

compound = lambda x: (1 + x).prod() - 1
daily_sr = lambda x: x.mean() / x.std()

def strat_sr(prices, lb, hold):
    # Compute portfolio weights
    freq = '%dB' % hold
    port = calc_mom(prices, lb, lag=1)

    daily_rets = prices.pct_change()

    # Compute portfolio returns
    port = port.shift(1).resample(freq, how='first')
    returns = daily_rets.resample(freq, how=compound)
    port_rets = (port * returns).sum(axis=1)

    return daily_sr(port_rets) * np.sqrt(252 / hold)

# In[ ]:

strat_sr(px, 70, 30)

# In[ ]:

from collections import defaultdict

lookbacks = range(20, 90, 5)
holdings = range(20, 90, 5)
dd = defaultdict(dict)
for lb in lookbacks:
    for hold in holdings:
        dd[lb][hold] = strat_sr(px, lb, hold)

ddf = DataFrame(dd)
ddf.index.name = 'Holding Period'
ddf.columns.name = 'Lookback Period'

# In[ ]:

import matplotlib.pyplot as plt

def heatmap(df, cmap=plt.cm.gray_r):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    axim = ax.imshow(df.values, cmap=cmap, interpolation='nearest')
    ax.set_xlabel(df.columns.name)
    ax.set_xticks(np.arange(len(df.columns)))
    ax.set_xticklabels(list(df.columns))
    ax.set_ylabel(df.index.name)
    ax.set_yticks(np.arange(len(df.index)))
    ax.set_yticklabels(list(df.index))
    plt.colorbar(axim)

# In[ ]:

heatmap(ddf)

# ### Future contract rolling

# In[ ]:

pd.options.display.max_rows = 10

# In[ ]:

import pandas.io.data as web
# Approximate price of S&P 500 index
px = web.get_data_yahoo('SPY')['Adj Close'] * 10
px

# In[ ]:

from datetime import datetime
expiry = {'ESU2': datetime(2012, 9, 21),
          'ESZ2': datetime(2012, 12, 21)}
expiry = Series(expiry).order()

# In[ ]:

expiry

# In[ ]:

np.random.seed(12347)
N = 200
walk = (np.random.randint(0, 200, size=N) - 100) * 0.25
perturb = (np.random.randint(0, 20, size=N) - 10) * 0.25
walk = walk.cumsum()

rng = pd.date_range(px.index[0], periods=len(px) + N, freq='B')
near = np.concatenate([px.values, px.values[-1] + walk])
far = np.concatenate([px.values, px.values[-1] + walk + perturb])
prices = DataFrame({'ESU2': near, 'ESZ2': far}, index=rng)

# In[ ]:

prices.tail()

# In[ ]:

def get_roll_weights(start, expiry, items, roll_periods=5):
    # start : first date to compute weighting DataFrame
    # expiry : Series of ticker -> expiration dates
    # items : sequence of contract names

    dates = pd.date_range(start, expiry[-1], freq='B')
    weights = DataFrame(np.zeros((len(dates), len(items))),
                        index=dates, columns=items)

    prev_date = weights.index[0]
    for i, (item, ex_date) in enumerate(expiry.iteritems()):
        if i < len(expiry) - 1:
            weights.ix[prev_date:ex_date - pd.offsets.BDay(), item] = 1
            roll_rng = pd.date_range(end=ex_date - pd.offsets.BDay(),
                                     periods=roll_periods + 1, freq='B')

            # Shift weight linearly from the expiring contract to the next
            decay_weights = np.linspace(0, 1, roll_periods + 1)
            weights.ix[roll_rng, item] = 1 - decay_weights
            weights.ix[roll_rng, expiry.index[i + 1]] = decay_weights
        else:
            weights.ix[prev_date:, item] = 1

        prev_date = ex_date

    return weights

# In[ ]:

weights = get_roll_weights('6/1/2012', expiry, prices.columns)
weights.ix['2012-09-12':'2012-09-21']
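
# During each roll window the decaying weight on the expiring contract
# and the growing weight on the next one should sum to 1, so the spliced
# series stays fully invested. A quick sanity check, assuming the
# `weights` frame above:

# In[ ]:

# Every business day's weights across contracts should total 1.0
weights.sum(1).describe()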
# In[ ]:

rolled_returns = (prices.pct_change() * weights).sum(1)

# ### Rolling correlation and linear regression

# In[ ]:

aapl = web.get_data_yahoo('AAPL', '2000-01-01')['Adj Close']
msft = web.get_data_yahoo('MSFT', '2000-01-01')['Adj Close']

aapl_rets = aapl.pct_change()
msft_rets = msft.pct_change()

# In[ ]:

plt.figure()

# In[ ]:

pd.rolling_corr(aapl_rets, msft_rets, 250).plot()

# In[ ]:

plt.figure()

# In[ ]:

model = pd.ols(y=aapl_rets, x={'MSFT': msft_rets}, window=250)
model.beta

# In[ ]:

model.beta['MSFT'].plot()
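
# `model.beta` above comes from `pd.ols` with `window=250`, a
# moving-window regression. As a cross-check (a sketch, not the book's
# code): with a single regressor the slope equals Cov(y, x) / Var(x),
# so the rolling beta of AAPL on MSFT can be computed directly from
# rolling moments:

# In[ ]:

beta = (pd.rolling_cov(aapl_rets, msft_rets, 250) /
        pd.rolling_var(msft_rets, 250))
beta.plot()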