#!/usr/bin/env python
# coding: utf-8

# ## Pandas demo
#
# #### Henry Schreiner
#
# A few basic imports

# In[ ]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# ### MPG dataset
#
# Let's read a CSV file from the web *directly* into Pandas.

# In[ ]:


# Single definition of the dataset location; the file is loaded twice below.
MPG_URL = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv'

data = pd.read_csv(MPG_URL)
# Bare expression: a notebook cell displays it; as a plain script it is a no-op.
data


# Info about the dataframe:

# In[ ]:


data.info()


# Detailed memory usage:
#

# In[ ]:


# You can ask for statistical information per column or often per dataframe:
#
#

# In[ ]:


# Plots are easy; try `.plot`, `.scatter`, or `.hist`:
#
#

# In[ ]:


# In[ ]:


# In[ ]:


# You can select subsets, such as `mpg > 42`:
#
#

# In[ ]:


# Or you can use `groupby('cylinders')` to work on per-cylinder groups:
#
#

# In[ ]:


# If you want legends, it's no longer one line, but still simple:

# In[ ]:


# Iterating the grouped `mpg` column yields (group key, Series) pairs, so each
# histogram can be labelled with its cylinder count before the legend is drawn.
for name, grp in data.groupby('cylinders').mpg:
    grp.hist(label=name)
plt.legend()


# The category type is better for data.origin, and saves memory too!
#
#

# In[ ]:


# Result is not assigned: `data` itself is left unchanged by this line.
data.origin.astype('category')


# In[ ]:


# In[ ]:


# You can select using operators or isin:
#
#

# In[ ]:


# Now, let's convert the name into make and model:
#
#

# In[ ]:


# Reload the same file, this time reading `origin` as a categorical column.
cars = pd.read_csv(MPG_URL, dtype={"origin": "category"})
# Placeholder: the name -> make/model split is left to be filled in here.
...
# cars["make"], cars["model"] = makemodel[0].astype('category'), makemodel[1]
# del cars["name"]


# We can put make and model together again:
#
#

# In[ ]:


# Math: To convert mpg to liters per 100 kilometers:
#
# $$
# lp100km = \frac{1}{mpg} \cdot 62.1371 \frac{\mathrm{miles}}{\mathrm{100 km}} \cdot 3.78541 \frac{\mathrm{liter}}{\mathrm{gallon}}
# $$
#
#

# In[ ]:


# In[ ]:


# Also see:
#
#
# * 10 minutes to Pandas: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html
# * 100 tips: https://www.dataschool.io/python-pandas-tips-and-tricks
# * My blog: https://iscinumpy.gitlab.io

# Not covered above:
# * Fantastic date/time support, including holidays
# * Resampling, interpolation
# * Multi-indexing
# * Support for many input formats, such as HTML, the clipboard, Excel files, and more

# In[ ]: