# Embed the pandas homepage in the notebook via IPython's rich HTML display.
from IPython.core.display import HTML
HTML("<iframe src=http://pandas.pydata.org width=800 height=350></iframe>")
import datetime

import numpy as np
import pandas as pd
import pandas.io.data
from pandas import Series, DataFrame
pd.__version__
'0.11.0'
import matplotlib.pyplot as plt
import matplotlib as mpl
# Default figure size (inches) for every plot in this session.
mpl.rc('figure', figsize=(8, 7))
mpl.__version__
'1.2.1'
# A small labeled Series: the integers 1-5 indexed by the letters a-e.
labels = list('abcde')
s = Series(range(1, 6), index=labels)
s
a 1 b 2 c 3 d 4 e 5 dtype: int64
'b' in s
True
s['b']
2
# Convert the Series to a plain dict mapping index labels to values.
mapping = s.to_dict()
mapping
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
Series(mapping)
a 1 b 2 c 3 d 4 e 5 dtype: int64
# Download daily AAPL OHLCV data from Yahoo! Finance.
# NOTE(review): pd.io.data was removed from pandas (0.19+); modern code uses
# the separate pandas-datareader package, and Yahoo's API has since changed.
aapl = pd.io.data.get_data_yahoo('AAPL',
start=datetime.datetime(2006, 10, 1),
end=datetime.datetime(2012, 1, 1))
aapl.head()
Open | High | Low | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2006-10-02 | 75.10 | 75.87 | 74.30 | 74.86 | 25451400 | 73.29 |
2006-10-03 | 74.45 | 74.95 | 73.19 | 74.08 | 28239600 | 72.52 |
2006-10-04 | 74.10 | 75.46 | 73.16 | 75.38 | 29610100 | 73.80 |
2006-10-05 | 74.53 | 76.16 | 74.13 | 74.83 | 24424400 | 73.26 |
2006-10-06 | 74.42 | 75.04 | 73.81 | 74.22 | 16677100 | 72.66 |
# Persist the frame to CSV (assumes a data/ directory already exists).
aapl.to_csv('data/aapl_ohlc.csv')
# IPython shell escape (not plain Python): show the first lines of the file.
!head data/aapl_ohlc.csv
Date,Open,High,Low,Close,Volume,Adj Close 2006-10-02,75.1,75.87,74.3,74.86,25451400,73.29 2006-10-03,74.45,74.95,73.19,74.08,28239600,72.52 2006-10-04,74.1,75.46,73.16,75.38,29610100,73.8 2006-10-05,74.53,76.16,74.13,74.83,24424400,73.26 2006-10-06,74.42,75.04,73.81,74.22,16677100,72.66 2006-10-09,73.8,75.08,73.53,74.63,15650800,73.06 2006-10-10,74.54,74.58,73.08,73.81,18985300,72.26 2006-10-11,73.42,73.98,72.6,73.23,20423400,71.69 2006-10-12,73.61,75.39,73.6,75.26,21173400,73.68
# Round-trip: read the CSV back, parsing 'Date' into a DatetimeIndex.
df = pd.read_csv('data/aapl_ohlc.csv', index_col='Date', parse_dates=True)
df.head()
Open | High | Low | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2006-10-02 | 75.10 | 75.87 | 74.30 | 74.86 | 25451400 | 73.29 |
2006-10-03 | 74.45 | 74.95 | 73.19 | 74.08 | 28239600 | 72.52 |
2006-10-04 | 74.10 | 75.46 | 73.16 | 75.38 | 29610100 | 73.80 |
2006-10-05 | 74.53 | 76.16 | 74.13 | 74.83 | 24424400 | 73.26 |
2006-10-06 | 74.42 | 75.04 | 73.81 | 74.22 | 16677100 | 72.66 |
df.index
<class 'pandas.tseries.index.DatetimeIndex'> [2006-10-02 00:00:00, ..., 2011-12-30 00:00:00] Length: 1323, Freq: None, Timezone: None
# Last ten closing prices; slicing out one column yields a Series.
ts = df['Close'][-10:]
ts
Date 2011-12-16 381.02 2011-12-19 382.21 2011-12-20 395.95 2011-12-21 396.45 2011-12-22 398.55 2011-12-23 403.33 2011-12-27 406.53 2011-12-28 402.64 2011-12-29 405.12 2011-12-30 405.00 Name: Close, dtype: float64
A DataFrame is a collection of Series objects. Slicing out a DataFrame column returns a Series.
type(ts)
pandas.core.series.TimeSeries
# Indexing a DatetimeIndex yields a Timestamp object.
date = ts.index[5]
date
<Timestamp: 2011-12-23 00:00:00>
ts[date]
403.32999999999998
ts[5]
403.32999999999998
We can also select multiple columns.
# Select multiple columns by passing a list of column names.
df[['Open', 'Close']].head()
Open | Close | |
---|---|---|
Date | ||
2006-10-02 | 75.10 | 74.86 |
2006-10-03 | 74.45 | 74.08 |
2006-10-04 | 74.10 | 75.38 |
2006-10-05 | 74.53 | 74.83 |
2006-10-06 | 74.42 | 74.22 |
New columns can be added on the fly.
# Derived column: open-to-close change for each day (attribute access
# works for simple column names like Open/Close).
df['diff'] = df.Open - df.Close
df.head()
Open | High | Low | Close | Volume | Adj Close | diff | |
---|---|---|---|---|---|---|---|
Date | |||||||
2006-10-02 | 75.10 | 75.87 | 74.30 | 74.86 | 25451400 | 73.29 | 0.24 |
2006-10-03 | 74.45 | 74.95 | 73.19 | 74.08 | 28239600 | 72.52 | 0.37 |
2006-10-04 | 74.10 | 75.46 | 73.16 | 75.38 | 29610100 | 73.80 | -1.28 |
2006-10-05 | 74.53 | 76.16 | 74.13 | 74.83 | 24424400 | 73.26 | -0.30 |
2006-10-06 | 74.42 | 75.04 | 73.81 | 74.22 | 16677100 | 72.66 | 0.20 |
...and deleted on the fly.
# Remove the derived column again.
del df['diff']
df.head()
Open | High | Low | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2006-10-02 | 75.10 | 75.87 | 74.30 | 74.86 | 25451400 | 73.29 |
2006-10-03 | 74.45 | 74.95 | 73.19 | 74.08 | 28239600 | 72.52 |
2006-10-04 | 74.10 | 75.46 | 73.16 | 75.38 | 29610100 | 73.80 |
2006-10-05 | 74.53 | 76.16 | 74.13 | 74.83 | 24424400 | 73.26 |
2006-10-06 | 74.42 | 75.04 | 73.81 | 74.22 | 16677100 | 72.66 |
close_px = df['Adj Close']
# 40-day moving average of the adjusted close.
# NOTE(review): pd.rolling_mean was removed in pandas 0.18+; the modern
# spelling is close_px.rolling(40).mean().
mavg = pd.rolling_mean(close_px, 40)
mavg[-10:]
Date 2011-12-16 380.53500 2011-12-19 380.27400 2011-12-20 380.03350 2011-12-21 380.00100 2011-12-22 379.95075 2011-12-23 379.91750 2011-12-27 379.95600 2011-12-28 379.90350 2011-12-29 380.11425 2011-12-30 380.30000 dtype: float64
Returns are defined as: $$r_t = \frac{p_t - p_{t-1}}{p_{t-1}} = \frac{p_t}{p_{t-1}} - 1$$
# Simple daily returns: p_t / p_{t-1} - 1.
rets = close_px / close_px.shift(1) - 1
rets.head()
Date 2006-10-02 NaN 2006-10-03 -0.010506 2006-10-04 0.017650 2006-10-05 -0.007317 2006-10-06 -0.008190 Name: Adj Close, dtype: float64
Or alternatively use `.pct_change()`.
# Built-in equivalent of the manual shift-and-divide above.
close_px.pct_change().head()
Date 2006-10-02 NaN 2006-10-03 -0.010506 2006-10-04 0.017650 2006-10-05 -0.007317 2006-10-06 -0.008190 Name: Adj Close, dtype: float64
Series and DataFrames have an associated .plot()
command (uses Matplotlib behind the scenes).
# Price and 40-day moving average on the same axes.
close_px.plot(label='AAPL')
mavg.plot(label='mavg')
plt.legend()
<matplotlib.legend.Legend at 0xa17cd8c>
# Adjusted closing prices for several tickers at once (columns = tickers).
# NOTE(review): pd.io.data was removed from pandas; use pandas-datareader.
df = pd.io.data.get_data_yahoo(['AAPL', 'GE', 'GOOG', 'IBM', 'KO', 'MSFT', 'PEP'],
start=datetime.datetime(2010, 1, 1),
end=datetime.datetime(2013, 1, 1))['Adj Close']
df.head()
AAPL | GE | GOOG | IBM | KO | MSFT | PEP | |
---|---|---|---|---|---|---|---|
Date | |||||||
2010-01-04 | 209.51 | 13.81 | 626.75 | 124.58 | 25.77 | 28.29 | 55.08 |
2010-01-05 | 209.87 | 13.88 | 623.99 | 123.07 | 25.46 | 28.30 | 55.75 |
2010-01-06 | 206.53 | 13.81 | 608.26 | 122.27 | 25.45 | 28.12 | 55.19 |
2010-01-07 | 206.15 | 14.53 | 594.10 | 121.85 | 25.39 | 27.83 | 54.84 |
2010-01-08 | 207.52 | 14.84 | 602.02 | 123.07 | 24.92 | 28.02 | 54.66 |
# Daily percentage returns for every column.
rets = df.pct_change()
# Scatter of PEP vs KO daily returns.
plt.scatter(rets.PEP, rets.KO)
plt.xlabel('Returns PEP')
plt.ylabel('Returns KO')
<matplotlib.text.Text at 0xa1b5d8c>
# Pairwise scatter plots with KDE density estimates on the diagonal.
# NOTE(review): moved to pd.plotting.scatter_matrix in pandas 0.20+.
pd.scatter_matrix(rets, diagonal='kde', figsize=(10, 10));
# Pairwise correlation matrix of the daily returns.
corr = rets.corr()
corr
AAPL | GE | GOOG | IBM | KO | MSFT | PEP | |
---|---|---|---|---|---|---|---|
AAPL | 1.000000 | 0.457186 | 0.533405 | 0.501582 | 0.388537 | 0.472015 | 0.309023 |
GE | 0.457186 | 1.000000 | 0.462540 | 0.609544 | 0.593366 | 0.600145 | 0.532517 |
GOOG | 0.533405 | 0.462540 | 1.000000 | 0.470798 | 0.404247 | 0.472152 | 0.329978 |
IBM | 0.501582 | 0.609544 | 0.470798 | 1.000000 | 0.602032 | 0.645565 | 0.490668 |
KO | 0.388537 | 0.593366 | 0.404247 | 0.602032 | 1.000000 | 0.523689 | 0.636500 |
MSFT | 0.472015 | 0.600145 | 0.472152 | 0.645565 | 0.523689 | 1.000000 | 0.476464 |
PEP | 0.309023 | 0.532517 | 0.329978 | 0.490668 | 0.636500 | 0.476464 | 1.000000 |
# Correlation matrix as a heatmap, with ticker labels on both axes.
plt.imshow(corr, cmap='hot', interpolation='none')
plt.colorbar()
plt.xticks(range(len(corr)), corr.columns)
plt.yticks(range(len(corr)), corr.columns);
One thing we are often interested in is the relationship of expected returns and the risk we are taking on. Often there is a trade-off between the two.
Here we use plt.annotate
to put labels on the scatterplot.
# Risk/return scatter: mean daily return vs. standard deviation per ticker,
# with each point labelled by its ticker symbol via plt.annotate.
# Fix: the loop body had lost its indentation (SyntaxError as written);
# restored, and kwarg spacing normalized per PEP 8.
plt.scatter(rets.mean(), rets.std())
plt.xlabel('Expected returns')
plt.ylabel('Risk')
for label, x, y in zip(rets.columns, rets.mean(), rets.std()):
    plt.annotate(
        label,
        xy=(x, y), xytext=(20, -20),
        textcoords='offset points', ha='right', va='bottom',
        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
Lets pull some more securities from Yahoo finance and build a DataFrame of the adjusted closing prices.
# Fetch each ticker's adjusted close from Yahoo! and collect named Series.
# Fix: the loop body had lost its indentation (SyntaxError as written).
series_list = []
securities = ['AAPL', 'GOOG', 'IBM', 'MSFT']
for security in securities:
    s = pd.io.data.get_data_yahoo(security, start=datetime.datetime(2011, 10, 1), end=datetime.datetime(2013, 1, 1))['Adj Close']
    s.name = security  # Rename series to match security name
    series_list.append(s)
`pd.concat` can be used to concatenate multiple `Series` into one `DataFrame`.
# Column-wise concatenation of the collected Series into one DataFrame.
df = pd.concat(series_list, axis=1)
df.head()
AAPL | GOOG | IBM | MSFT | |
---|---|---|---|---|
Date | ||||
2011-10-03 | 366.72 | 495.52 | 168.24 | 23.32 |
2011-10-04 | 364.67 | 501.90 | 169.65 | 24.09 |
2011-10-05 | 370.30 | 504.70 | 171.69 | 24.61 |
2011-10-06 | 369.44 | 514.71 | 176.39 | 25.04 |
2011-10-07 | 362.02 | 515.12 | 177.07 | 24.95 |
# Punch holes in the data to demonstrate missing-value handling.
# Fix: numpy was never imported in this file, so `np.nan` raised NameError;
# `import numpy as np` is added to the import block.
# NOTE(review): .ix is deprecated (removed in pandas 1.0); modern code would
# use .iloc for these positional row lookups.
df.ix[0, 'AAPL'] = np.nan
df.ix[1, ['GOOG', 'IBM']] = np.nan
df.ix[[1, 2, 3], 'MSFT'] = np.nan
df.head()
AAPL | GOOG | IBM | MSFT | |
---|---|---|---|---|
Date | ||||
2011-10-03 | NaN | 495.52 | 168.24 | 23.32 |
2011-10-04 | 364.67 | NaN | NaN | NaN |
2011-10-05 | 370.30 | 504.70 | 171.69 | NaN |
2011-10-06 | 369.44 | 514.71 | 176.39 | NaN |
2011-10-07 | 362.02 | 515.12 | 177.07 | 24.95 |
Pandas has great support for computing with missing values.
# Arithmetic aligns on the index; NaN in either operand propagates.
(df.AAPL + df.GOOG).head()
Date 2011-10-03 NaN 2011-10-04 NaN 2011-10-05 875.00 2011-10-06 884.15 2011-10-07 877.14 dtype: float64
One common approach to impute missing values in time series is forward filling.
# Forward-fill: propagate the last valid observation over NaNs.
df.ffill().head()
AAPL | GOOG | IBM | MSFT | |
---|---|---|---|---|
Date | ||||
2011-10-03 | NaN | 495.52 | 168.24 | 23.32 |
2011-10-04 | 364.67 | 495.52 | 168.24 | 23.32 |
2011-10-05 | 370.30 | 504.70 | 171.69 | 23.32 |
2011-10-06 | 369.44 | 514.71 | 176.39 | 23.32 |
2011-10-07 | 362.02 | 515.12 | 177.07 | 24.95 |