import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Configure pandas display: turn off the HTML notebook repr and cap the
# number of columns and rows that are shown.
for _option_name, _option_value in (
    ('display.notebook_repr_html', False),
    ('display.max_columns', 20),
    ('display.max_rows', 25),
):
    pd.set_option(_option_name, _option_value)

# Scatter 100 pairs of independent normal draws as red circles ('ro').
x_draws = np.random.normal(size=100)
y_draws = np.random.normal(size=100)
plt.plot(x_draws, y_draws, 'ro')

# Two-panel figure drawn under temporary rc settings (serif, bold, 8pt font)
# that apply only inside the `with` block. The result is written to
# "normalvars.png".
with mpl.rc_context(rc={'font.family': 'serif', 'font.weight': 'bold', 'font.size': 8}):
    fig = plt.figure(figsize=(6, 3))

    # Left panel: scatter of two independent normal samples.
    ax1 = fig.add_subplot(121)
    ax1.plot(np.random.normal(size=100), np.random.normal(size=100), 'r.')
    ax1.set_xlabel('some random numbers')
    ax1.set_ylabel('more random numbers')
    ax1.set_title("Random scatterplot")

    # Right panel: 15-bin histogram of a normal sample.
    ax2 = fig.add_subplot(122)
    ax2.hist(np.random.normal(size=100), bins=15)
    ax2.set_xlabel('sample')
    # The y-axis of a histogram is a per-bin count, not a cumulative sum.
    ax2.set_ylabel('frequency')
    ax2.set_title("Normal distribution")

    fig.tight_layout()
    fig.savefig("normalvars.png", dpi=150)

# Line plot of ten normal draws, then a plot of their running total with the
# grid switched off.
normals = pd.Series(np.random.normal(size=10))
normals.plot()

running_total = normals.cumsum()
running_total.plot(grid=False)

# Build a frame of three random samples (normal, gamma with shape 1, Poisson)
# and plot the column-wise cumulative sums three ways.
n_obs = 100
variables = pd.DataFrame({
    'normal': np.random.normal(size=n_obs),
    'gamma': np.random.gamma(1, size=n_obs),
    'poisson': np.random.poisson(size=n_obs),
})
cumulative = variables.cumsum(0)

# All series on one set of axes.
cumulative.plot()

# One subplot per column.
cumulative.plot(subplots=True)

# 'normal' drawn against a secondary y-axis.
cumulative.plot(secondary_y='normal')

# One row of three panels, each showing the cumulative sum of one column of
# `variables`, titled with the column name.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
for panel, column in zip(axes, ['normal', 'gamma', 'poisson']):
    variables[column].cumsum(0).plot(ax=panel, title=column)
# Only the left-most panel carries the y-axis label.
axes[0].set_ylabel('cumulative sum')

# Load the "titanic" sheet of the passenger workbook.
titanic = pd.read_excel("data/titanic.xls", "titanic")
titanic.head()

# Sum of the `survived` column per passenger class, as vertical bars.
survived_by_class = titanic.groupby('pclass').survived.sum()
survived_by_class.plot(kind='bar')

# Same sum per (sex, class) combination, as horizontal bars.
survived_by_sex_and_class = titanic.groupby(['sex', 'pclass']).survived.sum()
survived_by_sex_and_class.plot(kind='barh')

# Cross-tabulate (class, sex) against the boolean survival flag and draw the
# counts as stacked bars.
survived_flag = titanic.survived.astype(bool)
death_counts = pd.crosstab([titanic.pclass, titanic.sex], survived_flag)
outcome_colors = ['black', 'gold']
death_counts.plot(kind='bar', stacked=True, color=outcome_colors, grid=False)

# Divide each row by its total so the stacked horizontal bars show
# proportions instead of counts.
row_totals = death_counts.sum(1).astype(float)
death_proportions = death_counts.div(row_totals, axis=0)
death_proportions.plot(kind='barh', stacked=True, color=outcome_colors)

# Histogram of fares with the default binning and no grid.
titanic['fare'].hist(grid=False)

# Same data with 30 bins.
titanic['fare'].hist(bins=30)

# `kurtosis` stays imported in case code elsewhere in the file relies on it;
# Doane's rule itself is based on skewness.
from scipy.stats import kurtosis, skew


def sturges(n):
    """Return a histogram bin count for `n` observations via Sturges' rule.

    Computed as ``log2(n) + 1`` truncated to an integer.
    """
    return int(np.log2(n) + 1)


def square_root(n):
    """Return a histogram bin count for `n` observations: ``sqrt(n)`` truncated."""
    return int(np.sqrt(n))


def doanes(data):
    """Return a histogram bin count for `data` via Doane's rule.

    Doane's rule extends Sturges' rule with a skewness correction::

        k = 1 + log2(n) + log2(1 + |g1| / sigma_g1)
        sigma_g1 = sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))

    where ``g1`` is the sample skewness and ``n = len(data)``. The result is
    truncated to an integer.

    `data` must be a sized sequence of numbers without missing values.
    Fewer than three observations yield a single bin, because ``sigma_g1``
    is zero or undefined there.
    """
    n = len(data)
    if n < 3:
        return 1
    g1 = skew(data)
    # Constant data has no defined skewness; treat it as symmetric.
    if not np.isfinite(g1):
        g1 = 0.0
    sigma_g1 = np.sqrt(6.0 * (n - 2) / ((n + 1.0) * (n + 3.0)))
    # abs() keeps the log argument >= 1, so negative skew cannot produce NaN.
    return int(1 + np.log2(n) + np.log2(1 + abs(g1) / sigma_g1))

# Compare the three bin-count estimates. The first two use the number of
# rows; Doane's rule is given the fares with missing values dropped.
n = len(titanic)
observed_fares = titanic.fare.dropna()
sturges(n), square_root(n), doanes(observed_fares)

# Fare histogram using the Doane bin count.
titanic.fare.hist(bins=doanes(observed_fares))

# Kernel density estimate of the non-missing fares, x-axis limited to 0-600.
fare_values = titanic.fare.dropna()
fare_axis_range = (0, 600)
fare_values.plot(kind='kde', xlim=fare_axis_range)

# Normalised histogram with the density estimate overlaid as a dashed red line.
# NOTE(review): newer matplotlib releases spell `normed` as `density`;
# confirm which matplotlib version this script targets before changing it.
titanic.fare.hist(bins=doanes(fare_values), normed=True, color='lightseagreen')
fare_values.plot(kind='kde', xlim=fare_axis_range, style='r--')

# Fare distribution per passenger class.
titanic.boxplot(column='fare', by='pclass', grid=False)

# Age distribution per passenger class, with the individual observations
# drawn over each box.
bp = titanic.boxplot(column='age', by='pclass', grid=False)
for i in [1, 2, 3]:
    y = titanic.age[titanic.pclass == i].dropna()
    # Add some random "jitter" to the x-axis so overlapping points spread out
    x = np.random.normal(i, 0.04, size=len(y))
    # Call through `plt` explicitly: a bare `plot` only exists in a pylab
    # namespace and is undefined in a plain script.
    plt.plot(x, y, 'r.', alpha=0.2)

# Mean fare per passenger class as bars, with the per-class standard
# deviation as error bars.
fare_by_class = titanic.groupby('pclass')['fare']
fare_by_class.mean().plot(kind='bar', yerr=fare_by_class.std())

# Two samples of very different size: 15 observations versus 2.
data1 = [150, 155, 175, 200, 245, 255, 395, 300, 305, 320, 375, 400, 420, 430, 440]
data2 = [225, 380]

# Bars of the column means with standard-deviation error bars. The transpose
# puts each sample in its own column; the shorter one is padded with NaN.
fake_data = pd.DataFrame([data1, data2]).transpose()
p = fake_data.mean().plot(kind='bar', yerr=fake_data.std(), grid=False)

# Same chart again, this time with the raw observations drawn on top.
fake_data = pd.DataFrame([data1, data2]).transpose()
p = fake_data.mean().plot(kind='bar', yerr=fake_data.std(), grid=False)
x1, x2 = p.xaxis.get_majorticklocs()
# Call through `plt` explicitly: a bare `plot` only exists in a pylab
# namespace and is undefined in a plain script.
# The first sample is jittered slightly in x; the second is drawn at its tick.
plt.plot(np.random.normal(x1, 0.01, size=len(data1)), data1, 'ro')
plt.plot([x2] * len(data2), data2, 'ro')



baseball = pd.read_csv("data/baseball.csv")
baseball.head()

# The axis limits go through `plt` explicitly: bare `xlim` / `ylim` only
# exist in a pylab namespace and are undefined in a plain script.

# Plain scatter of the `ab` column against the `h` column.
plt.scatter(baseball.ab, baseball.h)
plt.xlim(0, 700)
plt.ylim(0, 200)

# Marker area scaled by the `hr` column.
plt.scatter(baseball.ab, baseball.h, s=baseball.hr*10, alpha=0.5)
plt.xlim(0, 700)
plt.ylim(0, 200)

# Marker colour mapped from the `hr` column using the 'hot' colormap.
plt.scatter(baseball.ab, baseball.h, c=baseball.hr, s=40, cmap='hot')
plt.xlim(0, 700)
plt.ylim(0, 200)

# Pairwise scatter plots for the columns from 'r' through 'sb', with kernel
# density estimates on the diagonal.
# NOTE(review): newer pandas exposes this function as
# `pandas.plotting.scatter_matrix`; confirm the targeted pandas version.
batting_columns = baseball.loc[:, 'r':'sb']
_ = pd.scatter_matrix(batting_columns, figsize=(12, 8), diagonal='kde')

from pandas.tools.rplot import *

# Keep only passengers with both age and fare recorded.
titanic = titanic[titanic.age.notnull() & titanic.fare.notnull()]

# Density of age in a trellis grid split by passenger class and sex.
# RPlot / TrellisGrid / GeomDensity presumably come from the
# `pandas.tools.rplot` star import.
# NOTE(review): that module is absent from newer pandas releases; confirm
# the pandas version this script targets.
tp = RPlot(titanic, x='age')
tp.add(TrellisGrid(['pclass', 'sex']))
tp.add(GeomDensity())
# Render onto the current figure. `gcf` is called through `plt` because a
# bare `gcf` only exists in a pylab namespace.
_ = tp.render(plt.gcf())

cdystonia = pd.read_csv("data/cdystonia.csv", index_col=None)
cdystonia.head()

# Trellis grid of `twstrs` against `age`, split by week and treatment, with
# a scatter layer and a degree-2 polynomial fit layer in every cell.
# RPlot and the Geom* / TrellisGrid classes presumably come from the
# `pandas.tools.rplot` star import.
plt.figure(figsize=(12, 12))
bbp = RPlot(cdystonia, x='age', y='twstrs')
bbp.add(TrellisGrid(['week', 'treat']))
bbp.add(GeomScatter())
bbp.add(GeomPolyFit(degree=2))
# Render onto the figure created above. `gcf` is called through `plt`
# because a bare `gcf` only exists in a pylab namespace.
_ = bbp.render(plt.gcf())

# Cast `site` to float before it is passed to ScaleGradient below.
cdystonia['site'] = cdystonia.site.astype(float)

# Single scatter of `twstrs` against `age` with three extra variables mapped
# onto the markers: colour gradient by site, size by week, shape by treatment.
# RPlot, GeomPoint and the Scale* classes presumably come from the
# `pandas.tools.rplot` star import.
plt.figure(figsize=(6, 6))
cp = RPlot(cdystonia, x='age', y='twstrs')
cp.add(GeomPoint(colour=ScaleGradient('site', colour1=(1.0, 1.0, 0.5), colour2=(1.0, 0.0, 0.0)),
            size=ScaleSize('week', min_size=10.0, max_size=200.0),
            shape=ScaleShape('treat')))
# Render onto the figure created above. `gcf` is called through `plt`
# because a bare `gcf` only exists in a pylab namespace.
_ = cp.render(plt.gcf())