#!/usr/bin/env python # coding: utf-8 # # Groupby Architecture # ## Problem # # Pandas has a well-designed groupby architecture, but when developing against it I often hit three challenges: # # * It involves 4 to 5 classes, which can be hard to keep track of. # * Its design is similar to Categoricals--but what class names `codes`, another might name `labels`. # * Correctly ordering results uses a somewhat uncommon class: Splitter. # # This document lays out... # # * The **big picture** behind the groupby architecture. # * 3 key **splitting** cases: multiple group columns, NA group keys, and regrouping. # * How to order results for **combining** aggregation and transformation. # # By the end of this document, a person should be able to construct fast, custom split-apply-combine operations that perform over numpy arrays. # ## Motivating scenario # Consider the `corr()` method. This makes it easy to correlate two pandas Series. # However, `corr()` is not available for grouped Series. # In[1]: from siuba.data import mtcars g_cyl = mtcars.groupby('cyl') # works mtcars.hp.corr(mtcars.mpg) # doesn't work # g_cyl.hp.corr(g_cyl.mpg) # The `corr()` method largely just punts the operation to numpy, so in theory it shouldn't be hard to implement over grouped data. However, no method involving two grouped Series is implemented, which is why this doc exists. # ## Big picture # # Below is a class diagrom for the classes involved. # This diagram is a sketch, to help with the sequence diagram focused on splitting in the next section, so includes most, but not all properties. Lines starting with `//` are comments. # [![](https://mermaid.ink/img/eyJjb2RlIjoiY2xhc3NEaWFncmFtXG5cdGNsYXNzIEdyb3VwZXIge1xuICAgICAgZ3JvdXBpbmdzOiBMaXN0W0dyb3VwaW5nXVxuICAgICAgLy8gY29kZXMsIF8sIG5fZ3JvdXBzXG4gICAgICBncm91cF9pbmZvOiBUdXBsZVtBcnJheSwgQXJyYXksIGludF1cbiAgICAgIGxldmVsczogTGlzdFtJbmRleF1cbiAgICAgIHJlc3VsdF9pbmRleDogSW5kZXhcblxuICAgICAgX2dldF9zcGxpdHRlcigpXG4gICAgICBnZXRfc3BsaXR0ZXIoKVxuICAgICAgXG4gICAgICBfZ2V0X2dyb3VwX2tleXMoKSBJbmRleE9yTGlzdFxuICAgIH1cblx0XG4gICAgY2xhc3MgU3BsaXR0ZXIge1xuICAgICAgZGF0YVxuICAgICAgLy8gaS5lLiBjb2Rlc1xuICAgICAgbGFiZWxzXG4gICAgICBuZ3JvdXBzXG4gICAgICAvLyBpLmUuIHNvcnRlZCAuIGNvZGVzXG4gICAgICBzbGFiZWxzXG4gICAgICBcbiAgICAgIC8vIGUuZy4gZGF0YS5pbG9jW3NvcnRfaWR4XSBzb3J0c1xuICAgICAgc29ydF9pZHhcbiAgICAgIF9faW5pdF9fKGRhdGEsIGxhYmVscywgbmdyb3VwcylcbiAgICAgIF9nZXRfc29ydGVkX2RhdGEoKSBGcmFtZU9yU2VyaWVzXG4gICAgICBsaWIuZ2VuZXJhdGVfc2xpY2VzKClcbiAgICB9XG5cbiAgICBjbGFzcyBHcm91cGluZyB7XG4gICAgICBjb2RlcyBcbiAgICAgIGdyb3Vwc1xuICAgICAgaW5kZXggICAgICBcbiAgICAgIC8vIGUuZy4gYWdnLWluZGV4XG4gICAgICBncm91cF9pbmRleFxuICAgICAgLy8gaS5lLiBncm91cF9pbmRleFtjb2Rlc11cbiAgICAgIGdyb3VwZXJcbiAgICB9XG5cbiAgICBjbGFzcyBHcm91cEJ5IHtcbiAgICAgIG9iajogRnJhbWVPclNlcmllc1xuICAgICAgZ3JvdXBlcjogR3JvdXBlclxuICAgICAgbmdyb3Vwc1xuICAgIH1cblxuICAgIEdyb3VwZXIgLS0-IFNwbGl0dGVyXG4gICAgR3JvdXBlciAtLT4gR3JvdXBpbmdcbiAgICBHcm91cEJ5IC0tPiBHcm91cGVyXG5cdFx0XHRcdFx0IiwibWVybWFpZCI6eyJ0aGVtZSI6ImRlZmF1bHQiLCJ0aGVtZVZhcmlhYmxlcyI6eyJiYWNrZ3JvdW5kIjoid2hpdGUiLCJwcmltYXJ5Q29sb3IiOiIjRUNFQ0ZGIiwic2Vjb25kYXJ5Q29sb3IiOiIjZmZmZmRlIiwidGVydGlhcnlDb2xvciI6ImhzbCg4MCwgMTAwJSwgOTYuMjc0NTA5ODAzOSUpIiwicHJpbWFyeUJvcmRlckNvbG9yIjoiaHNsKDI0MCwgNjAlLCA4Ni4yNzQ1MDk4MDM5JSkiLCJzZWNvbmRhcnlCb3JkZXJDb2xvciI6ImhzbCg2MCwgNjAlLCA4My41Mjk0MTE3NjQ3JSkiLCJ0ZXJ0aWFyeUJvcmRlckNvbG9yIjoiaHNsKDgwLCA2MCUsIDg2LjI3NDUwOTgwMzklKSIsInByaW1hcnlUZXh0Q29sb3IiOiIjMTMxMzAwIiwic2Vjb25kYXJ5VGV4dENvbG9yIjoiIzAwMDAyMSIsInRlcnRpYXJ5VGV4dENvbG9yIjoicmdiKDkuNTAwMDAwMDAwMSwgOS41MDAwMDAwMDAxLCA5LjUwMDAwMDAwMDEpIiwibGluZUNvbG9yIjoiIzMzMzMzMyIsInRleHRDb2xvciI6IiMzMzMiLCJtYWluQmtnIjoiI0VDRUNGRiIsInNlY29uZEJrZyI6IiNmZmZmZGUiLCJib3JkZXIxIjoiIzkzNzBEQiIsImJvcmRlcjIiOiIjYWFhYTMzIiwiYXJyb3doZWFkQ29sb3IiOiIjMzMzMzMzIiwiZm9udEZhbWlseSI6IlwidHJlYnVjaGV0IG1zXCIsIHZlcmRhbmEsIGFyaWFsIiwiZm9udFNpemUiOiIxNnB4IiwibGFiZWxCYWNrZ3JvdW5kIjoiI2U4ZThlOCIsIm5vZGVCa2ciOiIjRUNFQ0ZGIiwibm9kZUJvcmRlciI6IiM5MzcwREIiLCJjbHVzdGVyQmtnIjoiI2ZmZmZkZSIsImNsdXN0ZXJCb3JkZXIiOiIjYWFhYTMzIiwiZGVmYXVsdExpbmtDb2xvciI6IiMzMzMzMzMiLCJ0aXRsZUNvbG9yIjoiIzMzMyIsImVkZ2VMYWJlbEJhY2tncm91bmQiOiIjZThlOGU4IiwiYWN0b3JCb3JkZXIiOiJoc2woMjU5LjYyNjE2ODIyNDMsIDU5Ljc3NjUzNjMxMjglLCA4Ny45MDE5NjA3ODQzJSkiLCJhY3RvckJrZyI6IiNFQ0VDRkYiLCJhY3RvclRleHRDb2xvciI6ImJsYWNrIiwiYWN0b3JMaW5lQ29sb3IiOiJncmV5Iiwic2lnbmFsQ29sb3IiOiIjMzMzIiwic2lnbmFsVGV4dENvbG9yIjoiIzMzMyIsImxhYmVsQm94QmtnQ29sb3IiOiIjRUNFQ0ZGIiwibGFiZWxCb3hCb3JkZXJDb2xvciI6ImhzbCgyNTkuNjI2MTY4MjI0MywgNTkuNzc2NTM2MzEyOCUsIDg3LjkwMTk2MDc4NDMlKSIsImxhYmVsVGV4dENvbG9yIjoiYmxhY2siLCJsb29wVGV4dENvbG9yIjoiYmxhY2siLCJub3RlQm9yZGVyQ29sb3IiOiIjYWFhYTMzIiwibm90ZUJrZ0NvbG9yIjoiI2ZmZjVhZCIsIm5vdGVUZXh0Q29sb3IiOiJibGFjayIsImFjdGl2YXRpb25Cb3JkZXJDb2xvciI6IiM2NjYiLCJhY3RpdmF0aW9uQmtnQ29sb3IiOiIjZjRmNGY0Iiwic2VxdWVuY2VOdW1iZXJDb2xvciI6IndoaXRlIiwic2VjdGlvbkJrZ0NvbG9yIjoicmdiYSgxMDIsIDEwMiwgMjU1LCAwLjQ5KSIsImFsdFNlY3Rpb25Ca2dDb2xvciI6IndoaXRlIiwic2VjdGlvbkJrZ0NvbG9yMiI6IiNmZmY0MDAiLCJ0YXNrQm9yZGVyQ29sb3IiOiIjNTM0ZmJjIiwidGFza0JrZ0NvbG9yIjoiIzhhOTBkZCIsInRhc2tUZXh0TGlnaHRDb2xvciI6IndoaXRlIiwidGFza1RleHRDb2xvciI6IndoaXRlIiwidGFza1RleHREYXJrQ29sb3IiOiJibGFjayIsInRhc2tUZXh0T3V0c2lkZUNvbG9yIjoiYmxhY2siLCJ0YXNrVGV4dENsaWNrYWJsZUNvbG9yIjoiIzAwMzE2MyIsImFjdGl2ZVRhc2tCb3JkZXJDb2xvciI6IiM1MzRmYmMiLCJhY3RpdmVUYXNrQmtnQ29sb3IiOiIjYmZjN2ZmIiwiZ3JpZENvbG9yIjoibGlnaHRncmV5IiwiZG9uZVRhc2tCa2dDb2xvciI6ImxpZ2h0Z3JleSIsImRvbmVUYXNrQm9yZGVyQ29sb3IiOiJncmV5IiwiY3JpdEJvcmRlckNvbG9yIjoiI2ZmODg4OCIsImNyaXRCa2dDb2xvciI6InJlZCIsInRvZGF5TGluZUNvbG9yIjoicmVkIiwibGFiZWxDb2xvciI6ImJsYWNrIiwiZXJyb3JCa2dDb2xvciI6IiM1NTIyMjIiLCJlcnJvclRleHRDb2xvciI6IiM1NTIyMjIiLCJjbGFzc1RleHQiOiIjMTMxMzAwIiwiZmlsbFR5cGUwIjoiI0VDRUNGRiIsImZpbGxUeXBlMSI6IiNmZmZmZGUiLCJmaWxsVHlwZTIiOiJoc2woMzA0LCAxMDAlLCA5Ni4yNzQ1MDk4MDM5JSkiLCJmaWxsVHlwZTMiOiJoc2woMTI0LCAxMDAlLCA5My41Mjk0MTE3NjQ3JSkiLCJmaWxsVHlwZTQiOiJoc2woMTc2LCAxMDAlLCA5Ni4yNzQ1MDk4MDM5JSkiLCJmaWxsVHlwZTUiOiJoc2woLTQsIDEwMCUsIDkzLjUyOTQxMTc2NDclKSIsImZpbGxUeXBlNiI6ImhzbCg4LCAxMDAlLCA5Ni4yNzQ1MDk4MDM5JSkiLCJmaWxsVHlwZTciOiJoc2woMTg4LCAxMDAlLCA5My41Mjk0MTE3NjQ3JSkifX0sInVwZGF0ZUVkaXRvciI6ZmFsc2V9)](https://mermaid-js.github.io/mermaid-live-editor/#/edit/eyJjb2RlIjoiY2xhc3NEaWFncmFtXG5cdGNsYXNzIEdyb3VwZXIge1xuICAgICAgZ3JvdXBpbmdzOiBMaXN0W0dyb3VwaW5nXVxuICAgICAgLy8gY29kZXMsIF8sIG5fZ3JvdXBzXG4gICAgICBncm91cF9pbmZvOiBUdXBsZVtBcnJheSwgQXJyYXksIGludF1cbiAgICAgIGxldmVsczogTGlzdFtJbmRleF1cbiAgICAgIHJlc3VsdF9pbmRleDogSW5kZXhcblxuICAgICAgX2dldF9zcGxpdHRlcigpXG4gICAgICBnZXRfc3BsaXR0ZXIoKVxuICAgICAgXG4gICAgICBfZ2V0X2dyb3VwX2tleXMoKSBJbmRleE9yTGlzdFxuICAgIH1cblx0XG4gICAgY2xhc3MgU3BsaXR0ZXIge1xuICAgICAgZGF0YVxuICAgICAgLy8gaS5lLiBjb2Rlc1xuICAgICAgbGFiZWxzXG4gICAgICBuZ3JvdXBzXG4gICAgICAvLyBpLmUuIHNvcnRlZCAuIGNvZGVzXG4gICAgICBzbGFiZWxzXG4gICAgICBcbiAgICAgIC8vIGUuZy4gZGF0YS5pbG9jW3NvcnRfaWR4XSBzb3J0c1xuICAgICAgc29ydF9pZHhcbiAgICAgIF9faW5pdF9fKGRhdGEsIGxhYmVscywgbmdyb3VwcylcbiAgICAgIF9nZXRfc29ydGVkX2RhdGEoKSBGcmFtZU9yU2VyaWVzXG4gICAgICBsaWIuZ2VuZXJhdGVfc2xpY2VzKClcbiAgICB9XG5cbiAgICBjbGFzcyBHcm91cGluZyB7XG4gICAgICBjb2RlcyBcbiAgICAgIGdyb3Vwc1xuICAgICAgaW5kZXggICAgICBcbiAgICAgIC8vIGUuZy4gYWdnLWluZGV4XG4gICAgICBncm91cF9pbmRleFxuICAgICAgLy8gaS5lLiBncm91cF9pbmRleFtjb2Rlc11cbiAgICAgIGdyb3VwZXJcbiAgICB9XG5cbiAgICBjbGFzcyBHcm91cEJ5IHtcbiAgICAgIG9iajogRnJhbWVPclNlcmllc1xuICAgICAgZ3JvdXBlcjogR3JvdXBlclxuICAgICAgbmdyb3Vwc1xuICAgIH1cblxuICAgIEdyb3VwZXIgLS0-IFNwbGl0dGVyXG4gICAgR3JvdXBlciAtLT4gR3JvdXBpbmdcbiAgICBHcm91cEJ5IC0tPiBHcm91cGVyXG5cdFx0XHRcdFx0IiwibWVybWFpZCI6eyJ0aGVtZSI6ImRlZmF1bHQiLCJ0aGVtZVZhcmlhYmxlcyI6eyJiYWNrZ3JvdW5kIjoid2hpdGUiLCJwcmltYXJ5Q29sb3IiOiIjRUNFQ0ZGIiwic2Vjb25kYXJ5Q29sb3IiOiIjZmZmZmRlIiwidGVydGlhcnlDb2xvciI6ImhzbCg4MCwgMTAwJSwgOTYuMjc0NTA5ODAzOSUpIiwicHJpbWFyeUJvcmRlckNvbG9yIjoiaHNsKDI0MCwgNjAlLCA4Ni4yNzQ1MDk4MDM5JSkiLCJzZWNvbmRhcnlCb3JkZXJDb2xvciI6ImhzbCg2MCwgNjAlLCA4My41Mjk0MTE3NjQ3JSkiLCJ0ZXJ0aWFyeUJvcmRlckNvbG9yIjoiaHNsKDgwLCA2MCUsIDg2LjI3NDUwOTgwMzklKSIsInByaW1hcnlUZXh0Q29sb3IiOiIjMTMxMzAwIiwic2Vjb25kYXJ5VGV4dENvbG9yIjoiIzAwMDAyMSIsInRlcnRpYXJ5VGV4dENvbG9yIjoicmdiKDkuNTAwMDAwMDAwMSwgOS41MDAwMDAwMDAxLCA5LjUwMDAwMDAwMDEpIiwibGluZUNvbG9yIjoiIzMzMzMzMyIsInRleHRDb2xvciI6IiMzMzMiLCJtYWluQmtnIjoiI0VDRUNGRiIsInNlY29uZEJrZyI6IiNmZmZmZGUiLCJib3JkZXIxIjoiIzkzNzBEQiIsImJvcmRlcjIiOiIjYWFhYTMzIiwiYXJyb3doZWFkQ29sb3IiOiIjMzMzMzMzIiwiZm9udEZhbWlseSI6IlwidHJlYnVjaGV0IG1zXCIsIHZlcmRhbmEsIGFyaWFsIiwiZm9udFNpemUiOiIxNnB4IiwibGFiZWxCYWNrZ3JvdW5kIjoiI2U4ZThlOCIsIm5vZGVCa2ciOiIjRUNFQ0ZGIiwibm9kZUJvcmRlciI6IiM5MzcwREIiLCJjbHVzdGVyQmtnIjoiI2ZmZmZkZSIsImNsdXN0ZXJCb3JkZXIiOiIjYWFhYTMzIiwiZGVmYXVsdExpbmtDb2xvciI6IiMzMzMzMzMiLCJ0aXRsZUNvbG9yIjoiIzMzMyIsImVkZ2VMYWJlbEJhY2tncm91bmQiOiIjZThlOGU4IiwiYWN0b3JCb3JkZXIiOiJoc2woMjU5LjYyNjE2ODIyNDMsIDU5Ljc3NjUzNjMxMjglLCA4Ny45MDE5NjA3ODQzJSkiLCJhY3RvckJrZyI6IiNFQ0VDRkYiLCJhY3RvclRleHRDb2xvciI6ImJsYWNrIiwiYWN0b3JMaW5lQ29sb3IiOiJncmV5Iiwic2lnbmFsQ29sb3IiOiIjMzMzIiwic2lnbmFsVGV4dENvbG9yIjoiIzMzMyIsImxhYmVsQm94QmtnQ29sb3IiOiIjRUNFQ0ZGIiwibGFiZWxCb3hCb3JkZXJDb2xvciI6ImhzbCgyNTkuNjI2MTY4MjI0MywgNTkuNzc2NTM2MzEyOCUsIDg3LjkwMTk2MDc4NDMlKSIsImxhYmVsVGV4dENvbG9yIjoiYmxhY2siLCJsb29wVGV4dENvbG9yIjoiYmxhY2siLCJub3RlQm9yZGVyQ29sb3IiOiIjYWFhYTMzIiwibm90ZUJrZ0NvbG9yIjoiI2ZmZjVhZCIsIm5vdGVUZXh0Q29sb3IiOiJibGFjayIsImFjdGl2YXRpb25Cb3JkZXJDb2xvciI6IiM2NjYiLCJhY3RpdmF0aW9uQmtnQ29sb3IiOiIjZjRmNGY0Iiwic2VxdWVuY2VOdW1iZXJDb2xvciI6IndoaXRlIiwic2VjdGlvbkJrZ0NvbG9yIjoicmdiYSgxMDIsIDEwMiwgMjU1LCAwLjQ5KSIsImFsdFNlY3Rpb25Ca2dDb2xvciI6IndoaXRlIiwic2VjdGlvbkJrZ0NvbG9yMiI6IiNmZmY0MDAiLCJ0YXNrQm9yZGVyQ29sb3IiOiIjNTM0ZmJjIiwidGFza0JrZ0NvbG9yIjoiIzhhOTBkZCIsInRhc2tUZXh0TGlnaHRDb2xvciI6IndoaXRlIiwidGFza1RleHRDb2xvciI6IndoaXRlIiwidGFza1RleHREYXJrQ29sb3IiOiJibGFjayIsInRhc2tUZXh0T3V0c2lkZUNvbG9yIjoiYmxhY2siLCJ0YXNrVGV4dENsaWNrYWJsZUNvbG9yIjoiIzAwMzE2MyIsImFjdGl2ZVRhc2tCb3JkZXJDb2xvciI6IiM1MzRmYmMiLCJhY3RpdmVUYXNrQmtnQ29sb3IiOiIjYmZjN2ZmIiwiZ3JpZENvbG9yIjoibGlnaHRncmV5IiwiZG9uZVRhc2tCa2dDb2xvciI6ImxpZ2h0Z3JleSIsImRvbmVUYXNrQm9yZGVyQ29sb3IiOiJncmV5IiwiY3JpdEJvcmRlckNvbG9yIjoiI2ZmODg4OCIsImNyaXRCa2dDb2xvciI6InJlZCIsInRvZGF5TGluZUNvbG9yIjoicmVkIiwibGFiZWxDb2xvciI6ImJsYWNrIiwiZXJyb3JCa2dDb2xvciI6IiM1NTIyMjIiLCJlcnJvclRleHRDb2xvciI6IiM1NTIyMjIiLCJjbGFzc1RleHQiOiIjMTMxMzAwIiwiZmlsbFR5cGUwIjoiI0VDRUNGRiIsImZpbGxUeXBlMSI6IiNmZmZmZGUiLCJmaWxsVHlwZTIiOiJoc2woMzA0LCAxMDAlLCA5Ni4yNzQ1MDk4MDM5JSkiLCJmaWxsVHlwZTMiOiJoc2woMTI0LCAxMDAlLCA5My41Mjk0MTE3NjQ3JSkiLCJmaWxsVHlwZTQiOiJoc2woMTc2LCAxMDAlLCA5Ni4yNzQ1MDk4MDM5JSkiLCJmaWxsVHlwZTUiOiJoc2woLTQsIDEwMCUsIDkzLjUyOTQxMTc2NDclKSIsImZpbGxUeXBlNiI6ImhzbCg4LCAxMDAlLCA5Ni4yNzQ1MDk4MDM5JSkiLCJmaWxsVHlwZTciOiJoc2woMTg4LCAxMDAlLCA5My41Mjk0MTE3NjQ3JSkifX0sInVwZGF0ZUVkaXRvciI6ZmFsc2V9) # **Class Responsibility-Collaboration Cards ([wiki](https://en.wikipedia.org/wiki/Class-responsibility-collaboration_card)):** # # * Grouper # - collect all information for group splits (e.g. over multiple columns) # - serve as primary point of interaction (e.g. `get_splitter()`, `result_index`) # * Grouping # - Represent an individual grouping column / its splitting info # - Represent index for individual aggregation result index # * Splitter # - Calculate arrays for sorting and unsorting operations # - Sort data according to group keys # - Calculate group slices for sorted data # ## Splitting # # The sequence diagram below gives a flavor for how splitting occurs, based on the scenario where you loop over grouped data. # [![](https://mermaid.ink/img/eyJjb2RlIjoic2VxdWVuY2VEaWFncmFtXG5Hcm91cEJ5LT4-R3JvdXBCeTogaXRlcigpXG5Hcm91cEJ5LT4-R3JvdXBlcjogZ2V0X2l0ZXJhdG9yKGRhdGEgPSBzZWxmLm9iailcbk5vdGUgT3ZlciBHcm91cGVyOiBDcmVhdGVzIERGIG9yIFNlcmllcyBzcGVjaWZpYyBzcGxpdHRlciBjbGFzc1xuR3JvdXBlci0-Pkdyb3VwZXI6IHNwbGl0dGVyID0gX2dldF9zcGxpdHRlcihkYXRhKVxuR3JvdXBlci0-Pkdyb3VwZXI6IGtleXMgPSBfZ2V0X2dyb3VwX2tleXMoKVxuTm90ZSBPdmVyIEdyb3VwZXI6IHNwbGl0dGVyIGNob3BzIHRvIHN1YiBTZXJpZXMgb3IgREZzXG4gIEdyb3VwZXItPj5TcGxpdHRlcjogaXRlcigpXG4gIFNwbGl0dGVyLT4-U3BsaXR0ZXI6IF9nZXRfc29ydGVkX2RhdGEoKVxuICBTcGxpdHRlci0-PlNwbGl0dGVyOiBsaWIuZ2VuZXJhdGVfc2xpY2VzKGxhYmVscywgbmdyb3VwcylcblxubG9vcCBcbiAgU3BsaXR0ZXItPj5TcGxpdHRlcjogX2Nob3AoLi5lYWNoIHNsaWNlLi4pXG5lbmRcbiAgU3BsaXR0ZXItLT4-R3JvdXBlcjogeWllbGQgaWksIGdyb3VwXG5cbkdyb3VwZXItLT4-R3JvdXBCeTogeWllbGQga2V5LCBncm91cCAiLCJtZXJtYWlkIjp7InRoZW1lIjoiZGVmYXVsdCIsInRoZW1lVmFyaWFibGVzIjp7ImJhY2tncm91bmQiOiJ3aGl0ZSIsInByaW1hcnlDb2xvciI6IiNFQ0VDRkYiLCJzZWNvbmRhcnlDb2xvciI6IiNmZmZmZGUiLCJ0ZXJ0aWFyeUNvbG9yIjoiaHNsKDgwLCAxMDAlLCA5Ni4yNzQ1MDk4MDM5JSkiLCJwcmltYXJ5Qm9yZGVyQ29sb3IiOiJoc2woMjQwLCA2MCUsIDg2LjI3NDUwOTgwMzklKSIsInNlY29uZGFyeUJvcmRlckNvbG9yIjoiaHNsKDYwLCA2MCUsIDgzLjUyOTQxMTc2NDclKSIsInRlcnRpYXJ5Qm9yZGVyQ29sb3IiOiJoc2woODAsIDYwJSwgODYuMjc0NTA5ODAzOSUpIiwicHJpbWFyeVRleHRDb2xvciI6IiMxMzEzMDAiLCJzZWNvbmRhcnlUZXh0Q29sb3IiOiIjMDAwMDIxIiwidGVydGlhcnlUZXh0Q29sb3IiOiJyZ2IoOS41MDAwMDAwMDAxLCA5LjUwMDAwMDAwMDEsIDkuNTAwMDAwMDAwMSkiLCJsaW5lQ29sb3IiOiIjMzMzMzMzIiwidGV4dENvbG9yIjoiIzMzMyIsIm1haW5Ca2ciOiIjRUNFQ0ZGIiwic2Vjb25kQmtnIjoiI2ZmZmZkZSIsImJvcmRlcjEiOiIjOTM3MERCIiwiYm9yZGVyMiI6IiNhYWFhMzMiLCJhcnJvd2hlYWRDb2xvciI6IiMzMzMzMzMiLCJmb250RmFtaWx5IjoiXCJ0cmVidWNoZXQgbXNcIiwgdmVyZGFuYSwgYXJpYWwiLCJmb250U2l6ZSI6IjE2cHgiLCJsYWJlbEJhY2tncm91bmQiOiIjZThlOGU4Iiwibm9kZUJrZyI6IiNFQ0VDRkYiLCJub2RlQm9yZGVyIjoiIzkzNzBEQiIsImNsdXN0ZXJCa2ciOiIjZmZmZmRlIiwiY2x1c3RlckJvcmRlciI6IiNhYWFhMzMiLCJkZWZhdWx0TGlua0NvbG9yIjoiIzMzMzMzMyIsInRpdGxlQ29sb3IiOiIjMzMzIiwiZWRnZUxhYmVsQmFja2dyb3VuZCI6IiNlOGU4ZTgiLCJhY3RvckJvcmRlciI6ImhzbCgyNTkuNjI2MTY4MjI0MywgNTkuNzc2NTM2MzEyOCUsIDg3LjkwMTk2MDc4NDMlKSIsImFjdG9yQmtnIjoiI0VDRUNGRiIsImFjdG9yVGV4dENvbG9yIjoiYmxhY2siLCJhY3RvckxpbmVDb2xvciI6ImdyZXkiLCJzaWduYWxDb2xvciI6IiMzMzMiLCJzaWduYWxUZXh0Q29sb3IiOiIjMzMzIiwibGFiZWxCb3hCa2dDb2xvciI6IiNFQ0VDRkYiLCJsYWJlbEJveEJvcmRlckNvbG9yIjoiaHNsKDI1OS42MjYxNjgyMjQzLCA1OS43NzY1MzYzMTI4JSwgODcuOTAxOTYwNzg0MyUpIiwibGFiZWxUZXh0Q29sb3IiOiJibGFjayIsImxvb3BUZXh0Q29sb3IiOiJibGFjayIsIm5vdGVCb3JkZXJDb2xvciI6IiNhYWFhMzMiLCJub3RlQmtnQ29sb3IiOiIjZmZmNWFkIiwibm90ZVRleHRDb2xvciI6ImJsYWNrIiwiYWN0aXZhdGlvbkJvcmRlckNvbG9yIjoiIzY2NiIsImFjdGl2YXRpb25Ca2dDb2xvciI6IiNmNGY0ZjQiLCJzZXF1ZW5jZU51bWJlckNvbG9yIjoid2hpdGUiLCJzZWN0aW9uQmtnQ29sb3IiOiJyZ2JhKDEwMiwgMTAyLCAyNTUsIDAuNDkpIiwiYWx0U2VjdGlvbkJrZ0NvbG9yIjoid2hpdGUiLCJzZWN0aW9uQmtnQ29sb3IyIjoiI2ZmZjQwMCIsInRhc2tCb3JkZXJDb2xvciI6IiM1MzRmYmMiLCJ0YXNrQmtnQ29sb3IiOiIjOGE5MGRkIiwidGFza1RleHRMaWdodENvbG9yIjoid2hpdGUiLCJ0YXNrVGV4dENvbG9yIjoid2hpdGUiLCJ0YXNrVGV4dERhcmtDb2xvciI6ImJsYWNrIiwidGFza1RleHRPdXRzaWRlQ29sb3IiOiJibGFjayIsInRhc2tUZXh0Q2xpY2thYmxlQ29sb3IiOiIjMDAzMTYzIiwiYWN0aXZlVGFza0JvcmRlckNvbG9yIjoiIzUzNGZiYyIsImFjdGl2ZVRhc2tCa2dDb2xvciI6IiNiZmM3ZmYiLCJncmlkQ29sb3IiOiJsaWdodGdyZXkiLCJkb25lVGFza0JrZ0NvbG9yIjoibGlnaHRncmV5IiwiZG9uZVRhc2tCb3JkZXJDb2xvciI6ImdyZXkiLCJjcml0Qm9yZGVyQ29sb3IiOiIjZmY4ODg4IiwiY3JpdEJrZ0NvbG9yIjoicmVkIiwidG9kYXlMaW5lQ29sb3IiOiJyZWQiLCJsYWJlbENvbG9yIjoiYmxhY2siLCJlcnJvckJrZ0NvbG9yIjoiIzU1MjIyMiIsImVycm9yVGV4dENvbG9yIjoiIzU1MjIyMiIsImNsYXNzVGV4dCI6IiMxMzEzMDAiLCJmaWxsVHlwZTAiOiIjRUNFQ0ZGIiwiZmlsbFR5cGUxIjoiI2ZmZmZkZSIsImZpbGxUeXBlMiI6ImhzbCgzMDQsIDEwMCUsIDk2LjI3NDUwOTgwMzklKSIsImZpbGxUeXBlMyI6ImhzbCgxMjQsIDEwMCUsIDkzLjUyOTQxMTc2NDclKSIsImZpbGxUeXBlNCI6ImhzbCgxNzYsIDEwMCUsIDk2LjI3NDUwOTgwMzklKSIsImZpbGxUeXBlNSI6ImhzbCgtNCwgMTAwJSwgOTMuNTI5NDExNzY0NyUpIiwiZmlsbFR5cGU2IjoiaHNsKDgsIDEwMCUsIDk2LjI3NDUwOTgwMzklKSIsImZpbGxUeXBlNyI6ImhzbCgxODgsIDEwMCUsIDkzLjUyOTQxMTc2NDclKSJ9fSwidXBkYXRlRWRpdG9yIjpmYWxzZX0)](https://mermaid-js.github.io/mermaid-live-editor/#/edit/eyJjb2RlIjoic2VxdWVuY2VEaWFncmFtXG5Hcm91cEJ5LT4-R3JvdXBCeTogaXRlcigpXG5Hcm91cEJ5LT4-R3JvdXBlcjogZ2V0X2l0ZXJhdG9yKGRhdGEgPSBzZWxmLm9iailcbk5vdGUgT3ZlciBHcm91cGVyOiBDcmVhdGVzIERGIG9yIFNlcmllcyBzcGVjaWZpYyBzcGxpdHRlciBjbGFzc1xuR3JvdXBlci0-Pkdyb3VwZXI6IHNwbGl0dGVyID0gX2dldF9zcGxpdHRlcihkYXRhKVxuR3JvdXBlci0-Pkdyb3VwZXI6IGtleXMgPSBfZ2V0X2dyb3VwX2tleXMoKVxuTm90ZSBPdmVyIEdyb3VwZXI6IHNwbGl0dGVyIGNob3BzIHRvIHN1YiBTZXJpZXMgb3IgREZzXG4gIEdyb3VwZXItPj5TcGxpdHRlcjogaXRlcigpXG4gIFNwbGl0dGVyLT4-U3BsaXR0ZXI6IF9nZXRfc29ydGVkX2RhdGEoKVxuICBTcGxpdHRlci0-PlNwbGl0dGVyOiBsaWIuZ2VuZXJhdGVfc2xpY2VzKGxhYmVscywgbmdyb3VwcylcblxubG9vcCBcbiAgU3BsaXR0ZXItPj5TcGxpdHRlcjogX2Nob3AoLi5lYWNoIHNsaWNlLi4pXG5lbmRcbiAgU3BsaXR0ZXItLT4-R3JvdXBlcjogeWllbGQgaWksIGdyb3VwXG5cbkdyb3VwZXItLT4-R3JvdXBCeTogeWllbGQga2V5LCBncm91cCAiLCJtZXJtYWlkIjp7InRoZW1lIjoiZGVmYXVsdCIsInRoZW1lVmFyaWFibGVzIjp7ImJhY2tncm91bmQiOiJ3aGl0ZSIsInByaW1hcnlDb2xvciI6IiNFQ0VDRkYiLCJzZWNvbmRhcnlDb2xvciI6IiNmZmZmZGUiLCJ0ZXJ0aWFyeUNvbG9yIjoiaHNsKDgwLCAxMDAlLCA5Ni4yNzQ1MDk4MDM5JSkiLCJwcmltYXJ5Qm9yZGVyQ29sb3IiOiJoc2woMjQwLCA2MCUsIDg2LjI3NDUwOTgwMzklKSIsInNlY29uZGFyeUJvcmRlckNvbG9yIjoiaHNsKDYwLCA2MCUsIDgzLjUyOTQxMTc2NDclKSIsInRlcnRpYXJ5Qm9yZGVyQ29sb3IiOiJoc2woODAsIDYwJSwgODYuMjc0NTA5ODAzOSUpIiwicHJpbWFyeVRleHRDb2xvciI6IiMxMzEzMDAiLCJzZWNvbmRhcnlUZXh0Q29sb3IiOiIjMDAwMDIxIiwidGVydGlhcnlUZXh0Q29sb3IiOiJyZ2IoOS41MDAwMDAwMDAxLCA5LjUwMDAwMDAwMDEsIDkuNTAwMDAwMDAwMSkiLCJsaW5lQ29sb3IiOiIjMzMzMzMzIiwidGV4dENvbG9yIjoiIzMzMyIsIm1haW5Ca2ciOiIjRUNFQ0ZGIiwic2Vjb25kQmtnIjoiI2ZmZmZkZSIsImJvcmRlcjEiOiIjOTM3MERCIiwiYm9yZGVyMiI6IiNhYWFhMzMiLCJhcnJvd2hlYWRDb2xvciI6IiMzMzMzMzMiLCJmb250RmFtaWx5IjoiXCJ0cmVidWNoZXQgbXNcIiwgdmVyZGFuYSwgYXJpYWwiLCJmb250U2l6ZSI6IjE2cHgiLCJsYWJlbEJhY2tncm91bmQiOiIjZThlOGU4Iiwibm9kZUJrZyI6IiNFQ0VDRkYiLCJub2RlQm9yZGVyIjoiIzkzNzBEQiIsImNsdXN0ZXJCa2ciOiIjZmZmZmRlIiwiY2x1c3RlckJvcmRlciI6IiNhYWFhMzMiLCJkZWZhdWx0TGlua0NvbG9yIjoiIzMzMzMzMyIsInRpdGxlQ29sb3IiOiIjMzMzIiwiZWRnZUxhYmVsQmFja2dyb3VuZCI6IiNlOGU4ZTgiLCJhY3RvckJvcmRlciI6ImhzbCgyNTkuNjI2MTY4MjI0MywgNTkuNzc2NTM2MzEyOCUsIDg3LjkwMTk2MDc4NDMlKSIsImFjdG9yQmtnIjoiI0VDRUNGRiIsImFjdG9yVGV4dENvbG9yIjoiYmxhY2siLCJhY3RvckxpbmVDb2xvciI6ImdyZXkiLCJzaWduYWxDb2xvciI6IiMzMzMiLCJzaWduYWxUZXh0Q29sb3IiOiIjMzMzIiwibGFiZWxCb3hCa2dDb2xvciI6IiNFQ0VDRkYiLCJsYWJlbEJveEJvcmRlckNvbG9yIjoiaHNsKDI1OS42MjYxNjgyMjQzLCA1OS43NzY1MzYzMTI4JSwgODcuOTAxOTYwNzg0MyUpIiwibGFiZWxUZXh0Q29sb3IiOiJibGFjayIsImxvb3BUZXh0Q29sb3IiOiJibGFjayIsIm5vdGVCb3JkZXJDb2xvciI6IiNhYWFhMzMiLCJub3RlQmtnQ29sb3IiOiIjZmZmNWFkIiwibm90ZVRleHRDb2xvciI6ImJsYWNrIiwiYWN0aXZhdGlvbkJvcmRlckNvbG9yIjoiIzY2NiIsImFjdGl2YXRpb25Ca2dDb2xvciI6IiNmNGY0ZjQiLCJzZXF1ZW5jZU51bWJlckNvbG9yIjoid2hpdGUiLCJzZWN0aW9uQmtnQ29sb3IiOiJyZ2JhKDEwMiwgMTAyLCAyNTUsIDAuNDkpIiwiYWx0U2VjdGlvbkJrZ0NvbG9yIjoid2hpdGUiLCJzZWN0aW9uQmtnQ29sb3IyIjoiI2ZmZjQwMCIsInRhc2tCb3JkZXJDb2xvciI6IiM1MzRmYmMiLCJ0YXNrQmtnQ29sb3IiOiIjOGE5MGRkIiwidGFza1RleHRMaWdodENvbG9yIjoid2hpdGUiLCJ0YXNrVGV4dENvbG9yIjoid2hpdGUiLCJ0YXNrVGV4dERhcmtDb2xvciI6ImJsYWNrIiwidGFza1RleHRPdXRzaWRlQ29sb3IiOiJibGFjayIsInRhc2tUZXh0Q2xpY2thYmxlQ29sb3IiOiIjMDAzMTYzIiwiYWN0aXZlVGFza0JvcmRlckNvbG9yIjoiIzUzNGZiYyIsImFjdGl2ZVRhc2tCa2dDb2xvciI6IiNiZmM3ZmYiLCJncmlkQ29sb3IiOiJsaWdodGdyZXkiLCJkb25lVGFza0JrZ0NvbG9yIjoibGlnaHRncmV5IiwiZG9uZVRhc2tCb3JkZXJDb2xvciI6ImdyZXkiLCJjcml0Qm9yZGVyQ29sb3IiOiIjZmY4ODg4IiwiY3JpdEJrZ0NvbG9yIjoicmVkIiwidG9kYXlMaW5lQ29sb3IiOiJyZWQiLCJsYWJlbENvbG9yIjoiYmxhY2siLCJlcnJvckJrZ0NvbG9yIjoiIzU1MjIyMiIsImVycm9yVGV4dENvbG9yIjoiIzU1MjIyMiIsImNsYXNzVGV4dCI6IiMxMzEzMDAiLCJmaWxsVHlwZTAiOiIjRUNFQ0ZGIiwiZmlsbFR5cGUxIjoiI2ZmZmZkZSIsImZpbGxUeXBlMiI6ImhzbCgzMDQsIDEwMCUsIDk2LjI3NDUwOTgwMzklKSIsImZpbGxUeXBlMyI6ImhzbCgxMjQsIDEwMCUsIDkzLjUyOTQxMTc2NDclKSIsImZpbGxUeXBlNCI6ImhzbCgxNzYsIDEwMCUsIDk2LjI3NDUwOTgwMzklKSIsImZpbGxUeXBlNSI6ImhzbCgtNCwgMTAwJSwgOTMuNTI5NDExNzY0NyUpIiwiZmlsbFR5cGU2IjoiaHNsKDgsIDEwMCUsIDk2LjI3NDUwOTgwMzklKSIsImZpbGxUeXBlNyI6ImhzbCgxODgsIDEwMCUsIDkzLjUyOTQxMTc2NDclKSJ9fSwidXBkYXRlRWRpdG9yIjpmYWxzZX0) # Below is a complete example of running the split manually, but by chopping numpy arrays directly. # In[2]: import pandas as pd from pandas._libs import lib df = pd.DataFrame({ 'g': ['c', 'b', 'b', 'a'], 'x': [1,2,3,4] }, index = [10, 11, 12, 13]) gdf = df.groupby('g') # note grouped Series and DataFrame have the same Grouper splitter = gdf.x.grouper._get_splitter(gdf.x.obj) starts, ends = lib.generate_slices(splitter.slabels, splitter.ngroups) arr_x = splitter._get_sorted_data().values # iterate over splits, applying function keys = gdf.grouper._get_group_keys() results = [] for i, (start, end) in enumerate(zip(starts, ends)): print("Group:", i, keys[i]) print(arr_x[start:end]) # ### Grouping by multiple columns # # **TODO** # ### NAs in grouping columns # # Note that by default pandas groupby drops rows where a grouping column has NAs. # In[3]: import pandas as pd df = pd.DataFrame({'g': ['a', 'a', pd.NA], 'x': [1,2,3]}) # Only has result for 'a' group df.groupby('g').agg('mean') # In[4]: # Keeps groups w/ NAs df.groupby('g', dropna = False).agg('mean') # **TODO:** How does this show up in the groupby classes? # ### Regrouping # # One really useful property is that you can pass a Grouper to the groupby method. # In[5]: g_cyl2 = g_cyl.obj.groupby(g_cyl.grouper) g_cyl.grouper is g_cyl2.grouper # This is how siuba is able to regroup transformations and compose operations like below. # In[6]: from siuba.experimental.pd_groups import fast_mutate from siuba import _ ## doesn't work #(g_cyl.mpg + g_cyl.mpg) / g_cyl.mpg # fine in siuba, regroups after each operation fast_mutate(g_cyl, res = (_.mpg + _.mpg) / _.hp) # essentially runs op_res1 = (g_cyl.obj.mpg + g_cyl.obj.mpg).groupby(g_cyl.grouper) op_res2 = op_res1.obj / g_cyl.obj.hp op_res2.head() # ## Applying # # In general, pandas apply architecture is very complicated. Much of the strategy involves... # # * looking up ideal forms of an operation from a string. e.g. .agg('mean'). # * trying things the fast way first, to see if they work. # # Because there is an incredible amount to be gained by applying operations to only numpy arrays, we won't go into more details here on how pandas does applies. Keep in mind that splitting and performing operations on numpy arrays tends to be very fast. # ## Combine variants # # The following sections show how the groupby architecture can aggregate different kinds of results. # I'll use results that are numpy arrays for these examples. # # **Key points for creating new columns from grouped operations:** # # * a manual split (e.g. via a loop over GroupBy) sorts chunks by group levels. # * an agg uses the grouping columns as its index, and sorts by them. # * take Grouper.codes (or Splitter.labels) to convert aggregation to original order. # * take Splitter.sort_idx to convert full length result over groups to original order. # # In[7]: import pandas as pd df = pd.DataFrame({ "g": ['b', 'c', 'a', 'c'], "x": [10, 11, 12, 13] }, index = [100, 101, 102, 103] ) gdf = df.groupby('g') # ### Aggregate # # The pandas agg method returns a Series with 1 result per group. The results are sorted by their index, which is the grouping columns. # # In[8]: res_agg = gdf.x.agg('mean') res_agg # * result is sorted by the index (e.g. df.sort_index()) # * the grouper property `result_index` is the index for aggregations # In[9]: # result index and grouper property are the same object res_agg.index is gdf.grouper.result_index # In[10]: # recreating Series from numpy array (values) pd.Series(res_agg.values, gdf.grouper.result_index) # In[11]: # recreating same aggregation using a loop import numpy as np res_arr = np.array([g['x'].values.mean() for k, g in gdf]) res_arr # In[12]: pd.Series(res_arr, gdf.grouper.result_index) # ### Transform (agg op) # # The pandas transform method is like a dplyr mutate. Its final result is returned in the same order as the original data. # # # In[13]: res_trans1 = gdf.x.transform('mean') res_trans1 # In[14]: res_agg # In[15]: codes, _, ngroups = gdf.grouper.group_info # which category does each element belong to? codes # In[16]: # note pandas uses pandas.core.algorithms.take_1d # could also use res_agg.iloc[codes] or Splitter.labels (shown below) res_agg.take(codes) res_agg.values[codes] # Note that a **manual transformed aggregate** can be done as follows. # In[17]: res = gdf.x.mean() # grouper.group_info is a 3-tuple: (ids, _, n_groups) res.take(gdf.grouper.group_info[0]) # ### Transform (window op) # # Elementwise operations, like `x + x` aren't possible with groupby objects, so we'll consider the cumulative sum operation, `cumsum`. Unlike mean, it calculates one result per element in the series. # In[18]: res_trans2 = gdf.x.transform('cumsum') res_trans2 # In[19]: res_trans2.values # In[20]: res_arr = np.concatenate([x.cumsum().values for k, x in gdf.x]).ravel() res_arr # Notice that the result above is sorted by group, with the value for the first group ("a") first. # # In order to get back the original order, like in transform, we use `splitter.sort_idx.argsort()`. # In[21]: splitter = gdf.grouper._get_splitter(gdf.obj) res_arr.take(splitter.sort_idx.argsort()) # Essentially, `sort_idx` was the order used to sort the original result, and `argsort` reverses it! # In[22]: # taking this moves # * position 2 to position 0 splitter.sort_idx # In[23]: # taking this moves the position 0 to position 2 splitter.sort_idx.argsort() # ### Filtering (the dplyr version) # # As far as I know, there is now simple mechanism for filtering grouped data (other than running groupby again). # ## Summary # # * aggs have `grouper.result_index` set as their index. # * aggs: get transformed, original order using `grouper.group_info[0]`. # * windows: get original order using `splitter.sort_idx.argsort()`.