import addutils.toc ; addutils.toc.js(ipy_notebook=True)
With this tutorial we are going to see some of the statistical and computational tools offered by pandas
.
import datetime
import scipy.io
import numpy as np
import pandas as pd
import bokeh.plotting as bk
from IPython.display import display, HTML
from addutils import css_notebook, side_by_side2
css_notebook()
Given a pandas.Series
the method pct_change
returns a new pandas.Series
object containing percent change over a given number of periods.
s1 = pd.Series(range(10, 18) + np.random.randn(8) / 10)
pct_ch_1d = s1.pct_change() * 100
pct_ch_3d = s1.pct_change(periods=3) * 100
HTML(side_by_side2(s1, pct_ch_1d, pct_ch_3d))
0 | |
---|---|
0 | 10.187371 |
1 | 11.068102 |
2 | 11.944168 |
3 | 13.011350 |
4 | 14.080648 |
5 | 14.960530 |
6 | 15.987561 |
7 | 17.071012 |
0 | |
---|---|
0 | NaN |
1 | 8.645321 |
2 | 7.915230 |
3 | 8.934751 |
4 | 8.218195 |
5 | 6.248875 |
6 | 6.864935 |
7 | 6.776838 |
0 | |
---|---|
0 | NaN |
1 | NaN |
2 | NaN |
3 | 27.720384 |
4 | 27.218267 |
5 | 25.253847 |
6 | 22.873960 |
7 | 21.237402 |
Given two pandas.Series
the method cov
computes covariance between them, excluding missing values.
s1 = pd.util.testing.makeTimeSeries(7)
s2 = s1 + np.random.randn(len(s1)) / 10
HTML(side_by_side2(s1, s2))
0 | |
---|---|
2000-01-03 | 0.320914 |
2000-01-04 | -1.105849 |
2000-01-05 | 1.409292 |
2000-01-06 | -0.210900 |
2000-01-07 | -0.270258 |
2000-01-10 | 0.151734 |
2000-01-11 | 0.252147 |
0 | |
---|---|
2000-01-03 | 0.421457 |
2000-01-04 | -1.105964 |
2000-01-05 | 1.499170 |
2000-01-06 | -0.209599 |
2000-01-07 | -0.125346 |
2000-01-10 | 0.127648 |
2000-01-11 | 0.110899 |
s1.cov(s2)
0.59005429091399686
It is also possibile to compute pairwise covariance of a pandas.DataFrame
columns using pandas.DataFrame.cov
method. Here we use the module pandas.util.testing
in order to generate random data easily:
d1 = pd.util.testing.makeTimeDataFrame()
print d1.head()
print d1.cov()
A B C D 2000-01-03 -0.481372 -0.584465 -0.038654 0.280964 2000-01-04 -0.663815 -2.048175 -0.872593 -0.222687 2000-01-05 -0.719081 -0.553857 0.556233 0.467152 2000-01-06 -0.152237 0.568140 0.454709 0.478042 2000-01-07 -0.749170 1.024167 -1.523115 -1.723031 A B C D A 0.890847 0.066664 0.065301 0.301979 B 0.066664 1.035357 -0.237648 -0.107895 C 0.065301 -0.237648 1.004082 0.070748 D 0.301979 -0.107895 0.070748 0.798789
pandas.Series.corr
allows to compute correlation between two pandas.Series
. By the method
paramether it's possible to choose between:
s1.corr(s2, method='pearson')
0.99256798242819655
Like we just seen for covariance, it is possibile to call pandas.DataFrame.corr
to obtain pairwise correlation of columns over a pandas.DataFrame
d1.corr()
A | B | C | D | |
---|---|---|---|---|
A | 1.000000 | 0.069413 | 0.069045 | 0.357980 |
B | 0.069413 | 1.000000 | -0.233080 | -0.118642 |
C | 0.069045 | -0.233080 | 1.000000 | 0.078998 |
D | 0.357980 | -0.118642 | 0.078998 | 1.000000 |
pandas
provides also a lot of methods for calculating rolling moments.
[n for n in dir(pd) if n.startswith('rolling')]
['rolling_apply', 'rolling_corr', 'rolling_corr_pairwise', 'rolling_count', 'rolling_cov', 'rolling_kurt', 'rolling_max', 'rolling_mean', 'rolling_median', 'rolling_min', 'rolling_quantile', 'rolling_skew', 'rolling_std', 'rolling_sum', 'rolling_var', 'rolling_window']
Let's see some examples:
s3 = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
s3 = s3.cumsum()
s3_max = pd.rolling_max(s3, 60)
s3_mean = pd.rolling_mean(s3, 60)
s3_min = pd.rolling_min(s3, 60)
data = {'cumsum':s3, 'max':s3_max, 'mean':s3_mean, 'min':s3_min}
df = pd.DataFrame(data)
df.tail()
cumsum | max | mean | min | |
---|---|---|---|---|
2002-09-22 | 23.134580 | 30.045892 | 26.343671 | 22.479375 |
2002-09-23 | 22.235159 | 30.045892 | 26.319127 | 22.235159 |
2002-09-24 | 21.350533 | 30.045892 | 26.287655 | 21.350533 |
2002-09-25 | 21.160216 | 30.045892 | 26.233954 | 21.160216 |
2002-09-26 | 22.698541 | 30.045892 | 26.177502 | 21.160216 |
fig = bk.figure(x_axis_type = "datetime",
tools="pan,box_zoom,reset", title = 'Rolling Moments',
plot_width=800, plot_height=400)
fig.line(df.index, df['cumsum'], color='cadetblue', legend='Cumulative Sum')
fig.line(df.index, df['max'], color='mediumorchid', legend='Max')
fig.line(df.index, df['min'], color='mediumpurple', legend='Min')
fig.line(df.index, df['mean'], color='navy', legend='Min')
bk.show(fig)
pandas.Series.cumsum
returns a new pandas.Series
containing the cumulative sum of the given values.
s4 = s3 + np.random.randn(len(s3))
rollc = pd.rolling_corr(s3, s4, window=10)
data2 = {'cumsum':s3, 'similar':s4, 'rolling correlation':rollc}
df2 = pd.DataFrame(data2)
fig = bk.figure(x_axis_type = "datetime", title = 'Rolling Correlation',
plot_width=800, plot_height=400)
fig.line(df2.index, df2['cumsum'], color='cadetblue', legend='Cumulative Sum')
fig.line(df2.index, df2['similar'], color='mediumorchid', legend='Similar')
fig.line(df2.index, df2['rolling correlation'], color='navy', legend='Rolling Corr.')
fig.legend.orientation = "bottom_right"
bk.show(fig)
AAPL = pd.read_csv('example_data/p03_AAPL.txt', index_col='Date', parse_dates=True)
price = AAPL['Adj Close']
display(price.tail())
Date 2012-09-17 699.78 2012-09-18 701.91 2012-09-19 702.10 2012-09-20 698.70 2012-09-21 700.09 Name: Adj Close, dtype: float64
pandas.Series.tail
returns the last n rows of a given pandas.Series
.
price['2011-10-03'] / price['2011-3-01'] - 1
returns = price.pct_change()
ret_index = (1 + returns).cumprod()
ret_index[0] = 1
monthly_returns = ret_index.resample('BM', how='last').pct_change()
fig = bk.figure(x_axis_type = 'datetime', title = 'Monthly Returns', plot_width=800, plot_height=400)
fig.line(monthly_returns.index, monthly_returns)
bk.show(fig)
Visit www.add-for.com for more tutorials and updates.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.