#!/usr/bin/env python # coding: utf-8 # # Time series # In[ ]: from __future__ import division from pandas import Series, DataFrame import pandas as pd from numpy.random import randn import numpy as np pd.options.display.max_rows = 12 np.set_printoptions(precision=4, suppress=True) import matplotlib.pyplot as plt plt.rc('figure', figsize=(12, 4)) # In[ ]: get_ipython().run_line_magic('matplotlib', 'inline') # ## Date and Time Data Types and Tools # In[ ]: from datetime import datetime now = datetime.now() now # In[ ]: now.year, now.month, now.day # In[ ]: delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15) delta # In[ ]: delta.days # In[ ]: delta.seconds # In[ ]: from datetime import timedelta start = datetime(2011, 1, 7) start + timedelta(12) # In[ ]: start - 2 * timedelta(12) # ### Converting between string and datetime # In[ ]: stamp = datetime(2011, 1, 3) # In[ ]: str(stamp) # In[ ]: stamp.strftime('%Y-%m-%d') # In[ ]: value = '2011-01-03' datetime.strptime(value, '%Y-%m-%d') # In[ ]: datestrs = ['7/6/2011', '8/6/2011'] [datetime.strptime(x, '%m/%d/%Y') for x in datestrs] # In[ ]: from dateutil.parser import parse parse('2011-01-03') # In[ ]: parse('Jan 31, 1997 10:45 PM') # In[ ]: parse('6/12/2011', dayfirst=True) # In[ ]: datestrs # In[ ]: pd.to_datetime(datestrs) # note: output changed (no '00:00:00' anymore) # In[ ]: idx = pd.to_datetime(datestrs + [None]) idx # In[ ]: idx[2] # In[ ]: pd.isnull(idx) # ## Time Series Basics # In[ ]: from datetime import datetime dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7), datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)] ts = Series(np.random.randn(6), index=dates) ts # In[ ]: type(ts) # note: output changed to "pandas.core.series.Series" # In[ ]: ts.index # In[ ]: ts + ts[::2] # In[ ]: ts.index.dtype # note: output changed from dtype('datetime64[ns]') to dtype(' to Timestamp('2011-01-02 00:00:00') # ### Indexing, selection, subsetting # In[ ]: stamp = ts.index[2] 
ts[stamp]

# In[ ]:

# A time series can be indexed by any string pandas can parse as a date.
ts['1/10/2011']

# In[ ]:

ts['20110110']

# In[ ]:

longer_ts = Series(np.random.randn(1000),
                   index=pd.date_range('1/1/2000', periods=1000))
longer_ts

# In[ ]:

# Partial-string indexing: a year selects every observation in that year.
longer_ts['2001']

# In[ ]:

longer_ts['2001-05']

# In[ ]:

ts[datetime(2011, 1, 7):]

# In[ ]:

ts

# In[ ]:

# Range slicing works even with endpoints not present in the index.
ts['1/6/2011':'1/11/2011']

# In[ ]:

ts.truncate(after='1/9/2011')

# In[ ]:

dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
long_df = DataFrame(np.random.randn(100, 4),
                    index=dates,
                    columns=['Colorado', 'Texas', 'New York', 'Ohio'])
long_df.ix['5-2001']

# ### Time series with duplicate indices

# In[ ]:

dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000',
                          '1/3/2000'])
dup_ts = Series(np.arange(5), index=dates)
dup_ts

# In[ ]:

dup_ts.index.is_unique

# In[ ]:

dup_ts['1/3/2000']  # not duplicated

# In[ ]:

dup_ts['1/2/2000']  # duplicated

# In[ ]:

# Aggregate duplicate timestamps with groupby(level=0).
grouped = dup_ts.groupby(level=0)
grouped.mean()

# In[ ]:

grouped.count()

# ## Date ranges, Frequencies, and Shifting

# In[ ]:

ts

# In[ ]:

ts.resample('D')

# ### Generating date ranges

# In[ ]:

index = pd.date_range('4/1/2012', '6/1/2012')
index

# In[ ]:

pd.date_range(start='4/1/2012', periods=20)

# In[ ]:

pd.date_range(end='6/1/2012', periods=20)

# In[ ]:

# 'BM' = business month end.
pd.date_range('1/1/2000', '12/1/2000', freq='BM')

# In[ ]:

pd.date_range('5/2/2012 12:56:31', periods=5)

# In[ ]:

# normalize=True drops the time-of-day component to midnight.
pd.date_range('5/2/2012 12:56:31', periods=5, normalize=True)

# ### Frequencies and Date Offsets

# In[ ]:

from pandas.tseries.offsets import Hour, Minute
hour = Hour()
hour

# In[ ]:

four_hours = Hour(4)
four_hours

# In[ ]:

pd.date_range('1/1/2000', '1/3/2000 23:59', freq='4h')

# In[ ]:

# Offsets compose by addition.
Hour(2) + Minute(30)

# In[ ]:

pd.date_range('1/1/2000', periods=10, freq='1h30min')

# #### Week of month dates

# In[ ]:

# 'WOM-3FRI': the third Friday of each month.
rng = pd.date_range('1/1/2012', '9/1/2012', freq='WOM-3FRI')
list(rng)

# ### Shifting (leading and lagging) data

# In[ ]:

ts = Series(np.random.randn(4),
            index=pd.date_range('1/1/2000', periods=4, freq='M'))
ts

# In[ ]:

ts.shift(2)

# In[ ]:

ts.shift(-2)

# Common idiom: period-over-period percent change.
ts / ts.shift(1) - 1

# In[ ]:

# Passing freq shifts the timestamps instead of the data.
ts.shift(2, freq='M')

# In[ ]:

ts.shift(3, freq='D')

# In[ ]:

ts.shift(1, freq='3D')

# In[ ]:

ts.shift(1, freq='90T')

# #### Shifting dates with offsets

# In[ ]:

from pandas.tseries.offsets import Day, MonthEnd
now = datetime(2011, 11, 17)
now + 3 * Day()

# In[ ]:

# An anchored offset rolls the date forward to the next anchor point.
now + MonthEnd()

# In[ ]:

now + MonthEnd(2)

# In[ ]:

offset = MonthEnd()
offset.rollforward(now)

# In[ ]:

offset.rollback(now)

# In[ ]:

# Using rollforward as a groupby key bins each date to its month end.
ts = Series(np.random.randn(20),
            index=pd.date_range('1/15/2000', periods=20, freq='4d'))
ts.groupby(offset.rollforward).mean()

# In[ ]:

# Simpler equivalent of the groupby above.
ts.resample('M', how='mean')

# ## Time Zone Handling

# In[ ]:

import pytz
pytz.common_timezones[-5:]

# In[ ]:

tz = pytz.timezone('US/Eastern')
tz

# ### Localization and Conversion

# In[ ]:

rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)

# In[ ]:

# Naive by default: tz is None.
print(ts.index.tz)

# In[ ]:

pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')

# In[ ]:

# tz_localize: naive -> aware; tz_convert: aware -> another zone.
ts_utc = ts.tz_localize('UTC')
ts_utc

# In[ ]:

ts_utc.index

# In[ ]:

ts_utc.tz_convert('US/Eastern')

# In[ ]:

ts_eastern = ts.tz_localize('US/Eastern')
ts_eastern.tz_convert('UTC')

# In[ ]:

ts_eastern.tz_convert('Europe/Berlin')

# In[ ]:

ts.index.tz_localize('Asia/Shanghai')

# ### Operations with time zone-aware Timestamp objects

# In[ ]:

stamp = pd.Timestamp('2011-03-12 04:00')
stamp_utc = stamp.tz_localize('utc')
stamp_utc.tz_convert('US/Eastern')

# In[ ]:

stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')
stamp_moscow

# In[ ]:

# .value is nanoseconds since the UTC epoch; invariant under tz_convert.
stamp_utc.value

# In[ ]:

stamp_utc.tz_convert('US/Eastern').value

# In[ ]:

# 30 minutes before DST transition
from pandas.tseries.offsets import Hour
stamp = pd.Timestamp('2012-03-12 01:30', tz='US/Eastern')
stamp

# In[ ]:

stamp + Hour()

# In[ ]:

# 90 minutes before DST transition
stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')
stamp

# In[ ]:

stamp + 2 * Hour()

# ### Operations between different time zones

# In[ ]:

rng = pd.date_range('3/7/2012 9:30', periods=10, freq='B')
ts = Series(np.random.randn(len(rng)), index=rng)
ts

# In[ ]:

# Combining series in different zones yields a UTC result index.
ts1 = ts[:7].tz_localize('Europe/London')
ts2 = ts1[2:].tz_convert('Europe/Moscow')
result = ts1 + ts2
result.index

# ## Periods and Period Arithmetic

# In[ ]:

# A Period represents a span of time (here: the year 2007, Dec year-end).
p = pd.Period(2007, freq='A-DEC')
p

# In[ ]:

p + 5

# In[ ]:

p - 2

# In[ ]:

pd.Period('2014', freq='A-DEC') - p

# In[ ]:

rng = pd.period_range('1/1/2000', '6/30/2000', freq='M')
rng

# In[ ]:

Series(np.random.randn(6), index=rng)

# In[ ]:

values = ['2001Q3', '2002Q2', '2003Q1']
index = pd.PeriodIndex(values, freq='Q-DEC')
index

# ### Period Frequency Conversion

# In[ ]:

p = pd.Period('2007', freq='A-DEC')
p.asfreq('M', how='start')

# In[ ]:

p.asfreq('M', how='end')

# In[ ]:

# With a June fiscal year end, '2007' spans Jul 2006 - Jun 2007.
p = pd.Period('2007', freq='A-JUN')
p.asfreq('M', 'start')

# In[ ]:

p.asfreq('M', 'end')

# In[ ]:

p = pd.Period('Aug-2007', 'M')
p.asfreq('A-JUN')

# In[ ]:

rng = pd.period_range('2006', '2009', freq='A-DEC')
ts = Series(np.random.randn(len(rng)), index=rng)
ts

# In[ ]:

ts.asfreq('M', how='start')

# In[ ]:

ts.asfreq('B', how='end')

# ### Quarterly period frequencies

# In[ ]:

p = pd.Period('2012Q4', freq='Q-JAN')
p

# In[ ]:

p.asfreq('D', 'start')

# In[ ]:

p.asfreq('D', 'end')

# In[ ]:

# 4 PM on the second-to-last business day of the quarter.
p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
p4pm

# In[ ]:

p4pm.to_timestamp()

# In[ ]:

rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')
ts = Series(np.arange(len(rng)), index=rng)
ts

# In[ ]:

new_rng = (rng.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
ts.index = new_rng.to_timestamp()
ts

# ### Converting Timestamps to Periods (and back)

# In[ ]:

rng = pd.date_range('1/1/2000', periods=3, freq='M')
ts = Series(randn(3), index=rng)
pts = ts.to_period()
ts

# In[ ]:

pts

# In[ ]:

rng = pd.date_range('1/29/2000', periods=6, freq='D')
ts2 = Series(randn(6), index=rng)
ts2.to_period('M')

# In[ ]:

pts = ts.to_period()
pts

# In[ ]:

pts.to_timestamp(how='end')

# ### Creating a PeriodIndex from arrays

# In[ ]:

# Expects the macrodata CSV from the book's data set next to this script.
data = pd.read_csv('ch08/macrodata.csv')
data.year

# In[ ]:
data.quarter

# In[ ]:

# Build a quarterly PeriodIndex directly from year/quarter columns.
index = pd.PeriodIndex(year=data.year, quarter=data.quarter, freq='Q-DEC')
index

# In[ ]:

data.index = index
data.infl

# ## Resampling and Frequency Conversion

# In[ ]:

rng = pd.date_range('1/1/2000', periods=100, freq='D')
ts = Series(randn(len(rng)), index=rng)
ts.resample('M', how='mean')

# In[ ]:

# kind='period' yields a PeriodIndex instead of month-end timestamps.
ts.resample('M', how='mean', kind='period')

# ### Downsampling

# In[ ]:

rng = pd.date_range('1/1/2000', periods=12, freq='T')
ts = Series(np.arange(12), index=rng)
ts

# In[ ]:

ts.resample('5min', how='sum')
# note: output changed (the default changed from closed='right',
# label='right' to closed='left', label='left')

# In[ ]:

ts.resample('5min', how='sum', closed='left')

# In[ ]:

ts.resample('5min', how='sum', closed='left', label='left')

# In[ ]:

# loffset nudges the result labels (here: back one second).
ts.resample('5min', how='sum', loffset='-1s')

# #### Open-High-Low-Close (OHLC) resampling

# In[ ]:

ts.resample('5min', how='ohlc')
# note: output changed because of changed defaults

# #### Resampling with GroupBy

# In[ ]:

rng = pd.date_range('1/1/2000', periods=100, freq='D')
ts = Series(np.arange(100), index=rng)
ts.groupby(lambda x: x.month).mean()

# In[ ]:

ts.groupby(lambda x: x.weekday).mean()

# ### Upsampling and interpolation

# In[ ]:

frame = DataFrame(np.random.randn(2, 4),
                  index=pd.date_range('1/1/2000', periods=2, freq='W-WED'),
                  columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame

# In[ ]:

# Upsampling without an aggregation introduces NaN gaps.
df_daily = frame.resample('D')
df_daily

# In[ ]:

frame.resample('D', fill_method='ffill')

# In[ ]:

# limit caps how far forward-filling propagates.
frame.resample('D', fill_method='ffill', limit=2)

# In[ ]:

frame.resample('W-THU', fill_method='ffill')

# ### Resampling with periods

# In[ ]:

frame = DataFrame(np.random.randn(24, 4),
                  index=pd.period_range('1-2000', '12-2001', freq='M'),
                  columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame[:5]

# In[ ]:

annual_frame = frame.resample('A-DEC', how='mean')
annual_frame

# In[ ]:

# Q-DEC: Quarterly, year ending in December
annual_frame.resample('Q-DEC', fill_method='ffill')
# note: output changed, default value changed from convention='end' to
# convention='start' + 'start' changed to span-like
# also the following cells

# In[ ]:

annual_frame.resample('Q-DEC', fill_method='ffill', convention='start')

# In[ ]:

annual_frame.resample('Q-MAR', fill_method='ffill')

# ## Time series plotting

# In[ ]:

# Expects the stock price CSV from the book's data set next to this script.
close_px_all = pd.read_csv('ch09/stock_px.csv', parse_dates=True, index_col=0)
close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]
close_px = close_px.resample('B', fill_method='ffill')
close_px.info()

# In[ ]:

close_px['AAPL'].plot()

# In[ ]:

close_px.ix['2009'].plot()

# In[ ]:

close_px['AAPL'].ix['01-2011':'03-2011'].plot()

# In[ ]:

appl_q = close_px['AAPL'].resample('Q-DEC', fill_method='ffill')
appl_q.ix['2009':].plot()

# ## Moving window functions

# In[ ]:

close_px = close_px.asfreq('B').fillna(method='ffill')

# In[ ]:

close_px.AAPL.plot()
pd.rolling_mean(close_px.AAPL, 250).plot()

# In[ ]:

plt.figure()

# In[ ]:

appl_std250 = pd.rolling_std(close_px.AAPL, 250, min_periods=10)
appl_std250[5:12]

# In[ ]:

appl_std250.plot()

# In[ ]:

# Define expanding mean in terms of rolling_mean.
# BUG FIX: the original called bare `rolling_mean`, an undefined name —
# it must be qualified as pd.rolling_mean.
expanding_mean = lambda x: pd.rolling_mean(x, len(x), min_periods=1)

# In[ ]:

pd.rolling_mean(close_px, 60).plot(logy=True)

# In[ ]:

plt.close('all')

# ### Exponentially-weighted functions

# In[ ]:

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True,
                         figsize=(12, 7))

aapl_px = close_px.AAPL['2005':'2009']

ma60 = pd.rolling_mean(aapl_px, 60, min_periods=50)
ewma60 = pd.ewma(aapl_px, span=60)

aapl_px.plot(style='k-', ax=axes[0])
ma60.plot(style='k--', ax=axes[0])
aapl_px.plot(style='k-', ax=axes[1])
ewma60.plot(style='k--', ax=axes[1])
axes[0].set_title('Simple MA')
axes[1].set_title('Exponentially-weighted MA')

# ### Binary moving window functions

# In[ ]:

close_px
spx_px = close_px_all['SPX']

# In[ ]:

spx_rets = spx_px / spx_px.shift(1) - 1
returns = close_px.pct_change()
corr = pd.rolling_corr(returns.AAPL, spx_rets, 125, min_periods=100)
corr.plot()

# In[ ]:

# Rolling correlation of every column against the S&P returns at once.
corr = pd.rolling_corr(returns, spx_rets, 125, min_periods=100)
corr.plot()

# ### User-defined moving window functions

# In[ ]:

from scipy.stats import percentileofscore
score_at_2percent = lambda x: percentileofscore(x, 0.02)
result = pd.rolling_apply(returns.AAPL, 250, score_at_2percent)
result.plot()

# ## Performance and Memory Usage Notes

# In[ ]:

rng = pd.date_range('1/1/2000', periods=10000000, freq='10ms')
ts = Series(np.random.randn(len(rng)), index=rng)
ts

# In[ ]:

ts.resample('15min', how='ohlc').info()

# In[ ]:

# IPython-only: %timeit magics (fail outside IPython/Jupyter).
get_ipython().run_line_magic('timeit', "ts.resample('15min', how='ohlc')")

# In[ ]:

rng = pd.date_range('1/1/2000', periods=10000000, freq='1s')
ts = Series(np.random.randn(len(rng)), index=rng)
get_ipython().run_line_magic('timeit', "ts.resample('15s', how='ohlc')")