#!/usr/bin/env python
# coding: utf-8

# # Plotly Visualization

# The aim of this notebook is to provide guidelines on how to achieve parity with Pandas' visualization methods as explained in http://pandas.pydata.org/pandas-docs/stable/visualization.html with the use of **Plotly** and **Cufflinks**

# In[50]:


import pandas as pd
import cufflinks as cf
import numpy as np
from IPython.display import display,HTML


# In[51]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# ## Theme

# Cufflinks can set a global theme (style) to be used.
# In this case we will use Matplotlib's `ggplot` style.

# In[52]:


cf.set_config_file(theme='ggplot', sharing='public', offline=False)


# ## Basic Plotting

# The `iplot` method on Series and DataFrame is a wrapper of Plotly's `plot` method

# In[53]:


# Cufflinks can generate random data for different shapes
# Let's generate a single line with 1000 points
cf.datagen.lines(1, 1000).iplot()


# In[54]:


# Generating 4 timeseries
df = cf.datagen.lines(4, 1000)
df.iplot()


# You can plot one column versus another using the *x* and *y* keywords in `iplot`

# In[55]:


df3 = pd.DataFrame(np.random.randn(1000, 2), columns=['B', 'C']).cumsum()
df3['A'] = pd.Series(list(range(len(df3))))
df3.iplot(x='A', y='B')


# ## Bar Plots

# In[56]:


# `.ix` was removed in pandas 1.0; `.iloc[3]` selects the fourth row by position.
# NOTE(review): assumes `cf.datagen.lines` returns a non-integer (date) index, so
# the old `.ix[3]` was a positional lookup as well -- confirm.
df.iloc[3].iplot(kind='bar', bargap=.5)


# Calling a DataFrame’s `plot()` method with `kind='bar'` produces a multiple bar plot:

# In[57]:


df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df.iplot(kind='bar')


# To produce a stacked bar plot, use `barmode=stack`

# In[58]:


df.iplot(kind='bar', barmode='stack')


# To get horizontal bar plots, pass `kind='barh'`

# In[59]:


df.iplot(kind='barh', barmode='stack', bargap=.1)


# ## Histograms

# Histograms can be used with `kind='histogram'`

# In[60]:


df = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000),
                   'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c'])


# In[61]:


df.iplot(kind='histogram')


#
# Histogram can be stacked by using `barmode=stack`. Bin size can be changed by the `bins` keyword.

# In[62]:


df.iplot(kind='histogram', barmode='stack', bins=20)


# Orientation and normalization can also be set for Histograms by using `orientation='horizontal'` and `histnorm=probability`.

# In[63]:


df.iplot(kind='histogram', columns=['a'], orientation='h', histnorm='probability')


# Histograms (and any other kind of plot) can be set in a multiple layout by using `subplots=True`

# In[64]:


df_h = cf.datagen.histogram(4)
df_h.iplot(kind='histogram', subplots=True, bins=50)


# ## Box Plots

# Boxplots can be drawn calling a `Series` and `DataFrame` with `kind='box'`

# In[65]:


df = pd.DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E'])
df.iplot(kind='box')


# ### Grouping values

# In[66]:


df = pd.DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])


# Grouping values by generating a list of figures

# In[67]:


figs = [df[df['X'] == d][['Col1', 'Col2']].iplot(kind='box', asFigure=True)
        for d in pd.unique(df['X'])]


# In[68]:


cf.iplot(cf.subplots(figs))


# Grouping values and amending the keys

# In[69]:


def by(df, category):
    """Build one box-plot figure per distinct value of ``df[category]``.

    For every unique value in the ``category`` column, the matching rows are
    selected, the ``category`` column itself is dropped, and the remaining
    columns are renamed to ``'<value>_<column>'`` before plotting.

    Parameters
    ----------
    df : DataFrame holding the values plus the grouping column.
    category : label of the column whose unique values define the groups.

    Returns
    -------
    list
        One entry per unique value, each being whatever
        ``iplot(kind='box', asFigure=True)`` returns for that group, in the
        order produced by ``pd.unique``.
    """
    figures = []
    for cat in pd.unique(df[category]):
        # `drop` returns a new frame, so neither the caller's frame nor the
        # filtered selection is mutated in place.
        group = df[df[category] == cat].drop(columns=category)
        # Prefix every column with the group value so traces stay
        # distinguishable once the figures are combined.
        group = group.rename(
            columns={k: '{0}_{1}'.format(cat, k) for k in group.columns})
        figures.append(group.iplot(kind='box', asFigure=True))
    return figures


# In[70]:


cf.iplot(cf.subplots(by(df, 'X')))


# ## Area Plots

# You can create area plots with Series.plot and DataFrame.plot by passing `kind='area'`. To produce stacked area plot, each column must be either all positive or all negative values.
#
# When input data contains NaN, it will be automatically filled by 0. If you want to drop or fill by different values, use dataframe.dropna() or dataframe.fillna() before calling plot.
#
# To fill the area you can use `fill=True`

# In[71]:


df = pd.DataFrame(
    np.random.rand(10, 4),
    columns=['a', 'b', 'c', 'd'],
)


# In[72]:


df.iplot(kind='area', fill=True, opacity=1)


# For non-stacked charts you can use `kind=scatter` with `fill=True`. Alpha value is set to 0.3 unless otherwise specified:

# In[73]:


df.iplot(fill=True)


# ## Scatter Plot

# You can create scatter plots with DataFrame.plot by passing `kind='scatter'`. Scatter plot requires numeric columns for x and y axis. These can be specified by x and y keywords each, otherwise the DataFrame index will be used as `x`

# In[74]:


df = pd.DataFrame(
    np.random.rand(50, 4),
    columns=['a', 'b', 'c', 'd'],
)


# In[75]:


df.iplot(kind='scatter', x='a', y='b', mode='markers')


# Colors can be assigned as either a list or dictionary by using `colors`.
# The marker symbol can be defined by using `symbol`

# In[76]:


df.iplot(
    kind='scatter',
    mode='markers',
    symbol='dot',
    colors=['orange', 'teal', 'blue', 'yellow'],
    size=10,
)


# Bubble charts can be used with `kind=bubble` and by assigning one column as the `size`

# In[77]:


df.iplot(kind='bubble', x='a', y='b', size='c')


# ## Scatter Matrix

# You can create a scatter plot matrix using the function `scatter_matrix`

# In[78]:


df = pd.DataFrame(
    np.random.randn(1000, 4),
    columns=['a', 'b', 'c', 'd'],
)


# In[79]:


df.scatter_matrix()


# ## Subplots

# Subplots can be defined with `subplots=True`. The shape of the output can also be determined with `shape=(rows,cols)`. If omitted then the subplot shape will be automatically defined.
#
# Axes can be shared across plots with `shared_xaxes=True` as well as `shared_yaxes=True`

# In[80]:


df = cf.datagen.lines(4)


# In[81]:


df.iplot(
    subplots=True,
    shape=(4, 1),
    shared_xaxes=True,
    vertical_spacing=0.02,
    fill=True,
)


# Subplot Title can be set with `subplot_titles`. If set to `True` then the column names will be used. Otherwise a list of strings can be passed.
# In[82]:


df.iplot(subplots=True, subplot_titles=True, legend=False)


# Irregular Subplots can also be drawn using `specs`.
# For example, to get a chart that spans across 2 rows we can use `specs=[[{'rowspan':2},{}],[None,{}]]`.
# For a full set of advanced layout you can see `help(cufflinks.subplots)`

# In[83]:


df = cf.datagen.bubble(10, 50, mode='stocks')


# In[84]:


# One histogram and two marker scatters, all drawn from the same frame.
figs = cf.figures(
    df,
    [
        {'kind': 'histogram', 'keys': 'x', 'color': 'blue'},
        {'kind': 'scatter', 'mode': 'markers', 'x': 'x', 'y': 'y', 'size': 5},
        {'kind': 'scatter', 'mode': 'markers', 'x': 'x', 'y': 'y', 'size': 5,
         'color': 'teal'},
    ],
    asList=True,
)
# Fourth figure: a single generated line with a best-fit overlay.
figs.append(
    cf.datagen.lines(1).figure(
        bestfit=True,
        colors=['blue'],
        bestfit_colors=['pink'],
    )
)
base_layout = cf.tools.get_base_layout(figs)
sp = cf.subplots(
    figs,
    shape=(3, 2),
    base_layout=base_layout,
    vertical_spacing=0.15,
    horizontal_spacing=0.03,
    specs=[
        [{'rowspan': 2}, {}],
        [None, {}],
        [{'colspan': 2}, None],
    ],
    subplot_titles=['Histogram', 'Scatter 1', 'Scatter 2', 'Bestfit Line'],
)
sp['layout'].update(showlegend=False)


# In[85]:


cf.iplot(sp)


# ### Shapes

# Lines can be added with `hline` and `vline` for horizontal and vertical lines respectively.
# These can be either a list of values (relative to the axis) or a dictionary.

# In[86]:


df = cf.datagen.lines(3, columns=['a', 'b', 'c'])


# In[87]:


df.iplot(hline=[2, 4], vline=['2015-02-10'])


# More advanced parameters can be passed in the form of a dictionary, including `width` and `color` and `dash` for the line dash type.

# In[88]:


df.iplot(
    hline=[
        {'y': -1, 'color': 'blue', 'width': 3},
        {'y': 1, 'color': 'pink', 'dash': 'dash'},
    ]
)


# Shaded areas can be plotted using `hspan` and `vspan` for horizontal and vertical areas respectively.
# These can be set with a list of paired tuples (v0,v1) or a list of dictionaries with further parameters.
# In[89]:


df.iplot(hspan=[(-1, 1), (2, 5)])


# Extra parameters can be passed in the form of dictionaries, `width`, `fill`, `color`, `fillcolor`, `opacity`

# In[90]:


df.iplot(vspan={'x0': '2015-02-15', 'x1': '2015-03-15', 'color': 'teal', 'fill': True, 'opacity': .4})


# In[91]:


# Plotting resistance lines
max_vals = df.max().values.tolist()
resistance = [dict(kind='line', y=i, color=j, width=2)
              for i, j in zip(max_vals, ['red', 'blue', 'pink'])]
df.iplot(hline=resistance)


# Different shapes can also be used with `shapes` and identifying the `kind` which can be either *line*, *rect* or *circle*

# In[92]:


# Get min to max values

df_a = df['a']
max_val = df_a.max()
min_val = df_a.min()
# Date (index label) of the first occurrence of each extreme, as 'YYYY-MM-DD'.
max_date = df_a[df_a == max_val].index[0].strftime('%Y-%m-%d')
min_date = df_a[df_a == min_val].index[0].strftime('%Y-%m-%d')
shape1 = dict(kind='line', x0=max_date, y0=max_val, x1=min_date, y1=min_val, color='blue', width=2)
shape2 = dict(kind='rect', x0=max_date, x1=min_date, fill=True, color='gray', opacity=.3)


# In[93]:


df_a.iplot(shapes=[shape1, shape2])


# #### Other Shapes

# In[94]:


x0 = np.random.normal(2, 0.45, 300)
y0 = np.random.normal(2, 0.45, 300)

x1 = np.random.normal(6, 0.4, 200)
y1 = np.random.normal(6, 0.4, 200)

x2 = np.random.normal(4, 0.3, 200)
y2 = np.random.normal(4, 0.3, 200)

distributions = [(x0, y0), (x1, y1), (x2, y2)]


# In[95]:


dfs = [pd.DataFrame(dict(x=i, y=j)) for i, j in distributions]


# In[96]:


d = cf.Data()
gen = cf.colorgen(scale='ggplot')
# A dedicated loop name keeps the notebook-level `df` from being overwritten.
for frame in dfs:
    # `next(gen)` replaces the Python 2 only `gen.next()`, which raises
    # AttributeError on Python 3.
    traces = frame.figure(kind='scatter', mode='markers', x='x', y='y',
                          size=5, colors=next(gen))['data']
    for trace in traces:
        d.append(trace)


# In[97]:


# Restart the color generator so each shape draws the same sequence of colors
# as the scatter traces built above.
gen = cf.colorgen(scale='ggplot')
shapes = [cf.tools.get_shape(kind='circle', x0=min(x), x1=max(x),
                             y0=min(y), y1=max(y), color=next(gen), fill=True,
                             opacity=.3, width=.4) for x, y in distributions]


# In[98]:


fig = cf.Figure(data=d)
fig['layout'] = cf.getLayout(shapes=shapes, legend=False, title='Distribution Comparison')
cf.iplot(fig, validate=False)


# In[ ]: