#!/usr/bin/env python
# coding: utf-8

# (c) 2016 - present. Enplus Advisors, Inc.
#
# # Programming with Data
# Foundations of Python and Pandas
#
# # Lesson 3: Split, Apply, Combine

# In[1]:


import numpy as np
import pandas as pd

pd.set_option('display.float_format', '{:,.2f}'.format)


def image(name, **kwargs):
    from IPython.display import Image
    return Image(filename=f'assets/{name}', retina=True, **kwargs)

# import matplotlib.pyplot as plt
# %matplotlib inline
# %config InlineBackend.figure_format='retina'


# In[2]:


cols = ['year', 'month', 'day', 'hour', 'air_temp', 'dew_point',
        'pressure', 'wind_dir', 'wind_speed', 'sky_code',
        'precip1', 'precip6']

dtypes = {
    'year': 'int16',
    'month': 'int8',
    'day': 'int8',
    'hour': 'int8',
    'air_temp': 'Int32',
    'dew_point': 'Int32',
    'pressure': 'float64',
    'wind_dir': 'Int32',
    'wind_speed': 'float64',
    'sky_code': 'Int32',
    'precip1': 'Int32',
    'precip6': 'Int32'
}

usecols = ['year', 'month', 'day', 'hour', 'air_temp', 'dew_point', 'wind_speed']

dat = (pd.read_fwf(
    'data/726505-04845-2009', header=None, names=cols,
    dtype=dtypes, na_values='-9999', usecols=usecols)
    .query("month <= 6")
    .assign(
        air_temp=lambda df: df.air_temp / 10.,    # Celsius * 10
        dew_point=lambda df: df.dew_point / 10.   # Celsius * 10
        # wind_speed=lambda df: df.wind_speed / 10.  # meters/sec * 10
    ))

# dat.to_csv('data/weather-6m.csv', index=False, columns=usecols)


# In[3]:


dat.year.unique()


# In[4]:


dat.dtypes


# # Weather Data from NOAA
#
# Hourly data for a single weather station in 2009.

# In[5]:


dat.head()


# ## Visualize Split, Aggregate, Combine

# In[6]:


image('split-apply-combine.png')


# ## As implemented by `pandas`
#
# * Split only
# * Return a single, aggregate value per group/column (`agg`)
# * Return one or more rows/columns per group (`apply`)

# ## Split-Only
#
# * Iteration over the split `DataFrame`
# * You are responsible for applying functions and combining data
# * __BYOAC__ - Bring your own apply-combine

# In[7]:


grp = dat.groupby('month')


# In[8]:


for month, month_df in grp:
    print(f'Month: {month}')
    break  # stop the iteration


# In[9]:


month_df.head()


# ## Computing on the GroupBy object
#
# `grp.groups` is a dictionary mapping group names to index elements.

# In[10]:


# Get the group names
grp.groups.keys()


# In[11]:


list(grp.groups.values())[0][:5]


# ### Extract specific groups

# In[12]:


# Retrieve a specific group
grp.get_group(1)[:3]  # first few rows


# ## One aggregate per column/group
# By default, the aggregation applies to all numeric columns.

# In[13]:


grp.mean()


# ### Select a single column
# Reduce over one column and return a `Series` or `DataFrame`.

# In[14]:


# Returns a Series
grp['air_temp'].mean()


# In[15]:


# Returns a DataFrame
grp[['air_temp']].mean().T


# ### One function, multiple columns
# Apply the same function to multiple *selected* columns.

# In[16]:


grp[['air_temp', 'dew_point']].mean()


# ### Multiple functions, one or more columns
# Apply different functions to a single column.
#
# Use the `agg` method (short for `aggregate`). Result columns have the same
# names as the functions, e.g. `mean`, `sum`, `std`.

# In[17]:


grp['air_temp'].agg([np.mean, np.sum, 'std'])


# ### Multiple functions, multiple columns
#
# Again, use the `agg` method. This parameterization gives the result
# `DataFrame` custom column names.
# In[18]:


grp.agg(
    mean_air_temp=('air_temp', lambda x: x.mean()),
    mean_dew_point=('dew_point', 'mean'),
)


# #### Pandas < 0.25

# In[19]:


grp.agg({
    'air_temp': lambda x: x.mean(),
    'dew_point': 'mean'
}).rename(columns={
    'air_temp': 'mean_air_temp',
    'dew_point': 'mean_dew_point'
})


# ## Grouping by Multiple Variables
# Same idea as before, except our results now have a `MultiIndex`.

# In[20]:


grp2 = dat.groupby(['month', 'hour'])
grp2[['air_temp']].mean()


# ## One or more values per group
#
# Reduce/aggregate (`agg`) assumes we go from many observations to 1 result value.
# To return multiple rows and/or columns per group, `agg` doesn't work;
# we need a new method, `apply`.

# ### Apply/Reduce returning one row, > 1 columns

# In[21]:


grp3 = dat.groupby('month')
grp3.apply(lambda df: df.air_temp.quantile([0.25, 0.5]))


# ### Apply/Reduce returning > 1 rows, > 1 columns

# In[22]:


grp3.apply(lambda df: df.iloc[:2]).head(6)


# TODO: Add Filter, Transform
#
# SHOW: Combining index and group
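# ## Filter and Transform (sketch)
#
# The TODO above asks for `filter` and `transform`; the cell below is a minimal
# sketch of both, reusing the `dat` DataFrame and the month grouping already
# defined. The 15 degree C threshold and the `monthly_mean` / `air_temp_anomaly`
# column names are illustrative assumptions, not part of the original lesson.

# In[23]:


grp4 = dat.groupby('month')

# `filter` keeps or drops whole groups: only rows from months whose mean
# air temperature exceeds 15 C (an arbitrary cutoff) survive, and the
# original row index is preserved.
warm_months = grp4.filter(lambda df: df.air_temp.mean() > 15)

# `transform` returns a result aligned to the original index (one value per
# input row), which is one way to combine group-level statistics with the
# original data: here, each observation's deviation from its monthly mean.
dat_anom = dat.assign(
    monthly_mean=grp4['air_temp'].transform('mean'),
    air_temp_anomaly=lambda df: df.air_temp - df.monthly_mean,
)

warm_months['month'].unique(), dat_anom.head()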