# Introduction to Data Analysis with Python

## The Path of the PyData Ninja

Dr. Thomas Wiecki, Lead Data Scientist

*Title slide image source: http://www.nutanix.com/2013/09/16/the-cup-has-been-flipped/*
## About me

* Lead Data Scientist at [Quantopian Inc](https://www.quantopian.com): building a crowd-sourced hedge fund.
* PhD from Brown University -- research on computational neuroscience and machine learning using Bayesian modeling.
* Twitter: [@twiecki](https://twitter.com/twiecki)
* GitHub: [@twiecki](https://github.com/twiecki)
* Blog: [http://twiecki.github.io](https://twiecki.github.io)
* Developer of [PyMC3](https://github.com/pymc-devs/pymc3).

## Quantopian

* We back the best investment algorithms with investor capital, trading operations, and technology.
* Do your research in our hosted IPython environment using stock price history, corporate fundamental data, and other data sets.
* Write your algorithm in your browser, then backtest it, for free, over 13 years of minute-level data.
* When you enter the contest, your algorithm will also be considered for our hedge fund.
* [We're hiring in Düsseldorf: Operations Engineer!](https://www.quantopian.com/about#op-37090-operations-engineer-israel)

## Why use Python for data analysis?

* Python is a **general-purpose language** -> no hodge-podge of Perl, Bash, Matlab, and Fortran.
* Very easy to learn.
* The quality and quantity of data-analysis libraries is very high and growing at a rapid pace.
* What are the alternatives?
  - R: "The best thing about R is that it was written by statisticians. The worst thing about R is that it was written by statisticians." -- Bo Cowgill
  - Matlab: $$$, not open.

## Jobs!

## The PyData Stack

*Source: [Jake VanderPlas: State of the Tools](https://www.youtube.com/watch?v=5GlNDD7qbP4)*
## Level 0: n00b

### How to get started

* Start by installing the [Anaconda Python distribution](http://continuum.io/downloads#py34) (use Python 3.4).
* Install the Jupyter Notebook (formerly the IPython Notebook).
* Do a basic Python tutorial to get a handle on the syntax, e.g. [Learn Python the Hard Way](http://learnpythonthehardway.org/).

### Python basics

#### Interpreted and interactive

```python
3 * 4
```

#### Lists

```python
x = [1, 2, 3]
print(x)

x.append(4)
print(x)
```

#### Dictionaries

```python
measurements = {'height': [1.70, 1.80, 1.50],
                'weight': [60, 120, 50]}
measurements

measurements['height']
```

#### Comprehensions

```python
x = [1, 2, 3, 4]
[i**2 for i in x]

def calc_bmi(weight, height):
    return weight / height**2

[calc_bmi(w, h) for w, h in zip(measurements['weight'], measurements['height'])]
```

## Level 1: "The Pandas Wrangler"

### How to become a "Pandas Wrangler"

* Learn Pandas (data wrangling): http://pandas.pydata.org/pandas-docs/stable/tutorials.html
* Learn Seaborn (data visualization): http://stanford.edu/~mwaskom/software/seaborn/

### Why not start with NumPy and Matplotlib?

* These libraries have become core infrastructure rather than day-to-day tools.
* You get useful results faster by starting with Pandas and Seaborn.
* For more motivation, see http://twiecki.github.io/blog/2014/11/18/python-for-data-science/

### Pandas

```python
import pandas as pd
import numpy as np

s = pd.Series([1, 3, 5, np.nan, 6, 8])
s
```

```python
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

df[df.A > 0]             # boolean indexing
df.mean()                # column means
df.mean(axis='columns')  # row means
```

#### Mixed types

```python
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
df2

df2.dtypes
```

#### Grouping

```python
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df

df.groupby('A').sum()
df.groupby(['A', 'B']).sum()
```

### Seaborn: Generating statistical plots

```python
%matplotlib inline
import seaborn as sns

x = np.random.normal(size=100)
sns.distplot(x);
```

#### 2D distributions

```python
mean, cov = [0, 1], [(1, .5), (.5, 1)]
data = np.random.multivariate_normal(mean, cov, 200)
df = pd.DataFrame(data, columns=["x", "y"])
df

sns.jointplot(x="x", y="y", data=df, kind="kde");
```

#### All pairwise combinations

```python
iris = sns.load_dataset("iris")
sns.pairplot(iris);
```

### Seaborn: Regressions

```python
tips = sns.load_dataset("tips")
tips.head()

sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips);
sns.lmplot(x="total_bill", y="tip", col="day", data=tips, col_wrap=2, size=3);
sns.factorplot(x="time", y="total_bill", hue="smoker", col="day",
               data=tips, kind="box", size=4, aspect=.5);
```

## Level 2: "The Kaggle top scorer"

### Lots of machine learning and stats libraries

* SciPy: comprehensive library of numerical routines such as optimizers, integrators, and FFTs.
* scikit-learn: **the** ML library out there.
* statsmodels: Frequentist statistics (a minimal example follows this list).
* SymPy: Symbolic math.
* PyMC3: Probabilistic programming in Python.
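Of these, only scikit-learn gets a worked example below. As a quick taste of statsmodels, a minimal ordinary least squares fit might look like the sketch here; the data and variable names are illustrative, not from the slides:

```python
import numpy as np
import statsmodels.api as sm

# Simulated data: y depends linearly on x plus Gaussian noise
x = np.random.normal(size=100)
y = 1 + 2 * x + np.random.normal(scale=0.5, size=100)

X = sm.add_constant(x)        # add an intercept column
results = sm.OLS(y, X).fit()  # ordinary least squares fit
print(results.summary())      # coefficients, standard errors, R^2, ...
```

The fitted coefficients should come out close to the intercept of 1 and slope of 2 used to simulate the data.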
### scikit-learn

Taken from http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

```python
from sklearn import svm

X = [[0, 0], [1, 1]]
y = [0, 1]
clf = svm.SVC()
clf.fit(X, y)

clf.predict([[0, .5]])
```

#### Advanced example: grid search with cross-validation to find hyperparameters

Taken from http://scikit-learn.org/stable/auto_examples/grid_search_digits.html and http://scikit-learn.org/stable/auto_examples/datasets/plot_digits_last_image.html

```python
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

digits = datasets.load_digits()
```

```python
import matplotlib.pyplot as plt

# Display one of the digits
plt.figure(1, figsize=(3, 3))
plt.imshow(digits.images[-1], cmap=plt.cm.gray_r, interpolation='nearest')
plt.grid(False)
```

```python
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset into two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)
```

```python
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit(X_train, y_train)

print(clf.best_params_)
```

```python
y_true, y_pred = y_test, clf.predict(X_test)
ax = sns.heatmap(confusion_matrix(y_true, y_pred))
ax.set(xlabel='predicted label', ylabel='true label');
```

## Level 3: "Lord of Speed"

### Python is slow!

* The interpreted language itself is indeed quite slow (just like Matlab and R are slow).
* Vectorizing computations (i.e. the Matlab way) helps, but often leads to unreadable code (see the NumPy sketch after the next list).

### Great tools to generate C code

* Cython: write Python-like syntax that is translated to fast C code and called from Python.
* Numba: write plain Python and auto-translate it to LLVM.
* Theano: write numerical expressions in a NumPy-like syntax to build up a compute graph that can be compiled.
* PyCUDA: GPU programming.
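As a point of reference for the vectorization bullet above, here is a rough sketch (not part of the original benchmark) of the pairwise Euclidean distance computation that the next section benchmarks, written with NumPy broadcasting alone; `pairwise_numpy` is an illustrative name and the array shape mirrors the benchmark below:

```python
import numpy as np

X = np.random.random((1000, 3))

def pairwise_numpy(X):
    # Broadcasting: differences between every pair of rows, squared,
    # summed over the coordinate axis, then square-rooted.
    diff = X[:, np.newaxis, :] - X[np.newaxis, :, :]
    return np.sqrt((diff ** 2).sum(axis=-1))

%timeit pairwise_numpy(X)
```

Broadcasting removes the Python-level loops and is typically much faster than the pure-Python version below, but it materializes a large M × M × 3 temporary array and, for more involved kernels, quickly becomes hard to read -- which is where the tools above come in.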
### Comparing Python, Cython and Numba

Taken from https://jakevdp.github.io/blog/2013/06/15/numba-vs-cython-take-2/

```python
import numpy as np

X = np.random.random((1000, 3))
```

```python
def pairwise_python(X):
    M = X.shape[0]
    N = X.shape[1]
    D = np.empty((M, M), dtype=np.float64)
    for i in range(M):
        for j in range(M):
            d = 0.0
            for k in range(N):
                tmp = X[i, k] - X[j, k]
                d += tmp * tmp
            D[i, j] = np.sqrt(d)
    return D

%timeit pairwise_python(X)
```

#### Cython

```python
%load_ext cython
```

```python
%%cython
import numpy as np
cimport cython
from libc.math cimport sqrt

@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_cython(double[:, ::1] X):
    cdef int M = X.shape[0]
    cdef int N = X.shape[1]
    cdef double tmp, d
    cdef double[:, ::1] D = np.empty((M, M), dtype=np.float64)
    for i in range(M):
        for j in range(M):
            d = 0.0
            for k in range(N):
                tmp = X[i, k] - X[j, k]
                d += tmp * tmp
            D[i, j] = sqrt(d)
    return np.asarray(D)
```

```python
%timeit pairwise_cython(X)
```

#### Numba

```python
from numba.decorators import autojit

pairwise_numba = autojit(pairwise_python)

# Run once to compile before timing
pairwise_numba(X)

%timeit pairwise_numba(X)
```

## Level 4: "High Priest of Big Data"

Lots of things happening!

### Big Data

* Dask
* Ibis
* PySpark
* bcolz

### Interactive data visualization

* Bokeh
* Plotly
* pyxley

### Work interactively on Big Data with Dask

Taken from https://jakevdp.github.io/blog/2015/08/14/out-of-core-dataframes-in-python/

```python
!ls -lahL POIWorld.csv
```

```python
from dask import dataframe as dd

columns = ["name", "amenity", "Longitude", "Latitude"]
data = dd.read_csv('POIWorld.csv', usecols=columns)
data
```

```python
with_name = data[data.name.notnull()]

is_starbucks = with_name.name.str.contains('[Ss]tarbucks')
is_dunkin = with_name.name.str.contains('[Dd]unkin')

starbucks = with_name[is_starbucks]
dunkin = with_name[is_dunkin]
```

```python
from dask.diagnostics import ProgressBar

with ProgressBar():
    starbucks_count, dunkin_count = dd.compute(starbucks.name.count(),
                                               dunkin.name.count())

starbucks_count, dunkin_count
```

```python
locs = dd.compute(starbucks.Longitude, starbucks.Latitude,
                  dunkin.Longitude, dunkin.Latitude)

# Extract arrays of values from the series
lon_s, lat_s, lon_d, lat_d = [loc.values for loc in locs]
```

```python
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

def draw_USA():
    """Initialize a basemap centered on the continental USA."""
    plt.figure(figsize=(14, 10))
    return Basemap(projection='lcc', resolution='l',
                   llcrnrlon=-119, urcrnrlon=-64,
                   llcrnrlat=22, urcrnrlat=49,
                   lat_1=33, lat_2=45, lon_0=-95,
                   area_thresh=10000)

m = draw_USA()

# Draw map background
m.fillcontinents(color='white', lake_color='#eeeeee')
m.drawstates(color='lightgray')
m.drawcoastlines(color='lightgray')
m.drawcountries(color='lightgray')
m.drawmapboundary(fill_color='#eeeeee')

# Plot the values in Starbucks green and Dunkin' Donuts orange
style = dict(s=5, marker='o', alpha=0.5, zorder=2)
m.scatter(lon_s, lat_s, latlon=True,
          label="Starbucks", color='#00592D', **style)
m.scatter(lon_d, lat_d, latlon=True,
          label="Dunkin' Donuts", color='#FC772A', **style)
plt.legend(loc='lower left', frameon=False);
```
### Interactive data visualization with Bokeh

```python
from bokeh.io import output_notebook
from bokeh.resources import CDN
from bokeh.plotting import figure, show

output_notebook(resources=CDN)
```

```python
from math import pi

from bokeh.models.glyphs import Circle, Text
from bokeh.models import (
    BasicTicker, ColumnDataSource, Grid, GridPlot,
    LinearAxis, DataRange1d, PanTool, Plot, WheelZoomTool
)
from bokeh.sampledata.iris import flowers

colormap = {'setosa': 'red', 'versicolor': 'green', 'virginica': 'blue'}
flowers['color'] = flowers['species'].map(lambda x: colormap[x])

source = ColumnDataSource(
    data=dict(
        petal_length=flowers['petal_length'],
        petal_width=flowers['petal_width'],
        sepal_length=flowers['sepal_length'],
        sepal_width=flowers['sepal_width'],
        color=flowers['color']
    )
)

text_source = ColumnDataSource(
    data=dict(xcenter=[125], ycenter=[135])
)

xdr = DataRange1d()
ydr = DataRange1d()

def make_plot(xname, yname, xax=False, yax=False, text=None):
    plot = Plot(
        x_range=xdr, y_range=ydr, background_fill="#efe8e2",
        border_fill='white', title="", min_border=2,
        h_symmetry=False, v_symmetry=False,
        plot_width=150, plot_height=150)
    circle = Circle(x=xname, y=yname, fill_color="color",
                    fill_alpha=0.2, size=4, line_color="color")
    r = plot.add_glyph(source, circle)
    xdr.renderers.append(r)
    ydr.renderers.append(r)

    xticker = BasicTicker()
    if xax:
        xaxis = LinearAxis()
        plot.add_layout(xaxis, 'below')
        xticker = xaxis.ticker
    plot.add_layout(Grid(dimension=0, ticker=xticker))

    yticker = BasicTicker()
    if yax:
        yaxis = LinearAxis()
        plot.add_layout(yaxis, 'left')
        yticker = yaxis.ticker
    plot.add_layout(Grid(dimension=1, ticker=yticker))

    plot.add_tools(PanTool(), WheelZoomTool())

    if text:
        text = " ".join(text.split('_'))
        text = Text(
            x={'field': 'xcenter', 'units': 'screen'},
            y={'field': 'ycenter', 'units': 'screen'},
            text=[text], angle=pi/4, text_font_style="bold",
            text_baseline="top", text_color="#ffaaaa", text_alpha=0.7,
            text_align="center", text_font_size="28pt")
        plot.add_glyph(text_source, text)
    return plot

xattrs = ["petal_length", "petal_width", "sepal_width", "sepal_length"]
yattrs = list(reversed(xattrs))
plots = []

for y in yattrs:
    row = []
    for x in xattrs:
        xax = (y == yattrs[-1])
        yax = (x == xattrs[0])
        text = x if (x == y) else None
        plot = make_plot(x, y, xax, yax, text)
        row.append(plot)
    plots.append(row)

grid = GridPlot(children=plots, title="iris_splom")
```

```python
show(grid)
```

## Staying up-to-date

* Get on [Twitter](https://twitter.com/twiecki)
* Frequent [HackerNews](https://news.ycombinator.com)
* Frequent [DataTau](https://datatau.com)
* Visit [PyData conferences](http://pydata.org/) and the [SciPy conference](http://conference.scipy.org/)