#!/usr/bin/env python
# coding: utf-8

# # Data visualization with Pandas and Altair

# ## Python Data Visualization Ecosystem
#
# Unlike R, where the community has rallied around a single visualization package (ggplot2), Python users have many different packages to choose from -- all of which have their strengths and weaknesses.
#
# Here is a sampling of a few prominent options:
#
# **Matplotlib**
#
# Matplotlib is the "grandparent" of Python plotting libraries. It was written to look and act like MATLAB, so it was originally written in a fairly "non-Pythonic" way. Since it has been around for the longest time, there are a lot of Python libraries that are built around it, and there have been various efforts to streamline and overhaul the way to interface with it.
#
# Link: https://matplotlib.org/
#
# **Seaborn**
#
# Seaborn is built on top of Matplotlib to provide functions to build various specific statistical plots. But it also incorporates nice default styling, and also attempts to standardize the code.
#
# Link: https://seaborn.pydata.org/
#
# **Plotnine**
#
# Plotnine is also built on top of Matplotlib, and is an effort to be a Python port of R's ggplot plotting library. The original Data Carpentry Python visualization lesson is written to use Plotnine, so that it can stay in sync with the Data Carpentry R lesson.
#
# Link: https://plotnine.readthedocs.io/en/stable/
#
# **Plotly**
#
# Link: https://plotly.com/python/
#
# **Bokeh**
#
# Link: https://bokeh.org/
#
# **Altair**
#
# Link: https://altair-viz.github.io/
#
# We will be using Altair for most of today's lesson for its combination of adherence to the Grammar of Graphics and widespread adoption by Python users.
#
#

# ## Visualization with Altair

# ### Preparing our dataset

# NOTE: this file is a notebook export. Bare expressions such as
# `species_counts.head()` or a chart object on its own line are notebook cell
# outputs; when run as a plain script they are evaluated but display nothing.

# In[1]:

import pandas as pd

surveys = pd.read_csv('data/surveys.csv')
surveys.info()


# In[2]:

# Number of records per species. `count()` only counts non-null `record_id`
# values within each `species_id` group.
species_counts = (
    surveys.groupby('species_id')['record_id']
    .count()
    .reset_index(name='species_count')
)
species_counts.head()


# In[3]:

len(species_counts)


# In[4]:

# Species with at least 50 records; rarer species are excluded from the plots.
big_species = species_counts[
    species_counts['species_count'] >= 50
]['species_id'].to_list()
big_species


# In[5]:

# Keep only the well-sampled species, then drop every row that has a missing
# value in any column.
surveys_filtered = surveys[surveys['species_id'].isin(big_species)].dropna()
surveys_filtered.info()


# In[6]:

# Save the cleaned table; `index=False` leaves the DataFrame index out of the
# CSV.
surveys_filtered.to_csv('data/surveys_filtered.csv', index=False)


# ### Building your plots iteratively

# In[7]:

import altair as alt


# In[8]:

import vegafusion as vf

# Presumably enabled so the full filtered dataset can be charted without
# hitting Altair's default row limit -- confirm.
# NOTE(review): newer Altair/VegaFusion releases document
# `alt.data_transformers.enable("vegafusion")` instead; confirm against the
# installed versions before changing this call.
vf.enable_widget()


# In[9]:

# Quick first plot on 50 random rows. No `random_state` is given, so the
# sample differs on every run.
source = surveys.sample(50)
alt.Chart(source).mark_circle().encode(x='weight', y='hindfoot_length')


# In[9]:

# Not referenced by any later statement in this script; presumably a hosted
# copy of the filtered dataset that could be passed to `alt.Chart` -- confirm.
url = 'https://gist.githubusercontent.com/MikeTrizna/cd01f9bf3e21d6f74823423bdb45a2f3/raw/2d8c36cf78c9b6abf6938451c60defc93c5911a4/surveys_filtered.csv'


# In[10]:

# Encoding shorthand: `:Q` quantitative, `:N` nominal, `:O` ordinal.
# Low opacity makes overlapping points readable.
alt.Chart(surveys_filtered).mark_circle(opacity=0.1).encode(
    x='weight:Q',
    y='hindfoot_length:Q',
)


# In[11]:

# Same scatter plot with one fixed colour for every mark.
alt.Chart(surveys_filtered).mark_circle(opacity=0.1, color='red').encode(
    x='weight:Q',
    y='hindfoot_length:Q',
)


# In[12]:

# Colour is now an encoding, so it varies with the data (one colour per
# species).
alt.Chart(surveys_filtered).mark_circle(opacity=0.1).encode(
    x='weight:Q',
    y='hindfoot_length:Q',
    color='species_id:N',
)


# In[13]:

# Adds a species tooltip and makes the chart interactive.
alt.Chart(surveys_filtered).mark_circle(opacity=0.1).encode(
    x='weight:Q',
    y='hindfoot_length:Q',
    color='species_id:N',
    tooltip='species_id:N',
).interactive()


# ### Faceting

# In[14]:

# One scatter-plot panel per value of `sex`.
alt.Chart(surveys_filtered).mark_circle(opacity=0.1).encode(
    x='weight:Q',
    y='hindfoot_length:Q',
    facet='sex:N',
    color='species_id:N',
)


# ### Boxplot

# In[15]:

alt.Chart(surveys_filtered).mark_boxplot().encode(
    x='species_id:N',
    y='weight:Q',
)


# **Challenge**
#
# Make a boxplot of the dataset that shows the distribution of hindfoot_length values by plot_id

# ### Built-in grouping

# In[17]:

# `count()` is an Altair aggregate, so no pandas groupby is needed here.
alt.Chart(surveys_filtered).mark_bar().encode(
    x='plot_id:O',
    y='count():Q',
    color='sex:N',
)


# In[18]:

alt.Chart(surveys_filtered).mark_line().encode(
    x='year:O',
    y='count():Q',
    color='species_id:N',
)


# **Challenge**
#
# Make a bar plot showing the breakdown of sex values by species_id

# ### Crossfiltering

# In[21]:

# An interval selection ("brush") drawn on the scatter plot drives both charts.
brush = alt.selection_interval()

# Points use the species colour when the brush condition holds and light gray
# otherwise.
points = alt.Chart(surveys_filtered).mark_point(opacity=0.1).encode(
    x='weight:Q',
    y='hindfoot_length:Q',
    color=alt.condition(brush, 'species_id:N', alt.value('lightgray')),
).add_params(
    brush
)

# The bar chart counts species using only the rows inside the brush.
bars = alt.Chart(surveys_filtered).mark_bar().encode(
    y='species_id:N',
    color='species_id:N',
    x='count(species_id):Q',
).transform_filter(
    brush
)

# `&` stacks the two charts vertically.
points & bars


# In[ ]: