from siuba import *
from siuba import meta_hook
import pandas as pd
from pandas import DataFrame, Series
df = DataFrame({
"repo": ["pandas", "dplyr", "ggplot2", "plotnine"],
"owner": ["pandas-dev", "tidyverse", "tidyverse", "has2k1"],
"language": ["python", "R", "R", "python"],
"stars": [17800, 2800, 3500, 1450],
"x": [1,2,3,None]
})
from pandas.core.groupby import DataFrameGroupBy
gdf = group_by(df, 'language', "owner")
out = mutate(gdf, rel_stars1 = _.stars - _.stars.min())
ungroup(out)
df.siu_group_by("language", "owner").siu_mutate(rel_stars1 = _.stars - _.stars.min()).siu_ungroup()
repo | owner | language | stars | x | rel_stars1 | |
---|---|---|---|---|---|---|
0 | pandas | pandas-dev | python | 17800 | 1.0 | 0 |
1 | dplyr | tidyverse | R | 2800 | 2.0 | 0 |
2 | ggplot2 | tidyverse | R | 3500 | 3.0 | 700 |
3 | plotnine | has2k1 | python | 1450 | NaN | 0 |
ungroup(mutate(out, rel_stars2 = _.stars + _.stars))
repo | owner | language | stars | x | rel_stars1 | rel_stars2 | |
---|---|---|---|---|---|---|---|
0 | pandas | pandas-dev | python | 17800 | 1.0 | 0 | 35600 |
1 | dplyr | tidyverse | R | 2800 | 2.0 | 0 | 5600 |
2 | ggplot2 | tidyverse | R | 3500 | 3.0 | 700 | 7000 |
3 | plotnine | has2k1 | python | 1450 | NaN | 0 | 2900 |
df \
.siu_group_by("language", "owner") \
.siu_mutate(rel_stars1 = _.stars - _.stars.min()) \
.siu_ungroup()
repo | owner | language | stars | x | rel_stars1 | |
---|---|---|---|---|---|---|
0 | pandas | pandas-dev | python | 17800 | 1.0 | 0 |
1 | dplyr | tidyverse | R | 2800 | 2.0 | 0 |
2 | ggplot2 | tidyverse | R | 3500 | 3.0 | 700 |
3 | plotnine | has2k1 | python | 1450 | NaN | 0 |
# TODO: change name filter to query?
# regular filter
filter(df, _.stars > 3000, _.stars < 15000)
# grouped filter
gdf = group_by(df, "language")
ungroup(filter(gdf, _.stars != _.stars.min()))
repo | owner | language | stars | x | |
---|---|---|---|---|---|
0 | pandas | pandas-dev | python | 17800 | 1.0 |
1 | ggplot2 | tidyverse | R | 3500 | 3.0 |
# summarize DataFrame
summarize(df, min_stars = _.stars.min())
# summarize grouped DataFrame
gdf = group_by(df, "language")
summarize(gdf, ttl_stars = _.stars.sum(), wat = _.stars.min())
language | ttl_stars | wat | |
---|---|---|---|
0 | R | 6300 | 2800 |
1 | python | 19250 | 1450 |
transmute(df, "repo", rel_stars1 = _.stars - _.stars.min())
ungroup(transmute(gdf, "repo", rel_stars1 = _.stars - _.stars.min()))
language | repo | rel_stars1 | |
---|---|---|---|
0 | python | pandas | 16350 |
1 | R | dplyr | 0 |
2 | R | ggplot2 | 700 |
3 | python | plotnine | 0 |
# thoughts:
# + can use dynamic values, e.g. colname == .x
# + if select implements some name class, then nothing magic happening
# e.g. _.y == _.x is equivalent to lambda cols: cols.y == cols.x
# - long winded (==, _.y seems harder to read than "y")
# select(df, _.y == _.x, -_.language)
select(df, _.y == _.x, -_.language)
# considered alternative with strings. E.g...
# select(df, "y = x", "language")
# select(df, dict(y = "x"), "language")
y | repo | owner | stars | |
---|---|---|---|---|
0 | 1.0 | pandas | pandas-dev | 17800 |
1 | 2.0 | dplyr | tidyverse | 2800 |
2 | 3.0 | ggplot2 | tidyverse | 3500 |
3 | NaN | plotnine | has2k1 | 1450 |
arrange(df, -_.owner, _.repo)
arrange(df, _.owner.str.len())
repo | owner | language | stars | x | |
---|---|---|---|---|---|
3 | plotnine | has2k1 | python | 1450 | NaN |
1 | dplyr | tidyverse | R | 2800 | 2.0 |
2 | ggplot2 | tidyverse | R | 3500 | 3.0 |
0 | pandas | pandas-dev | python | 17800 | 1.0 |
if_else(df.repo == "dplyr", "yeah", "no")
array(['no', 'yeah', 'no', 'no'], dtype='<U4')
f = if_else(_.repo.str.contains("d"), _.repo, "wat")
f(df)
array(['pandas', 'dplyr', 'wat', 'wat'], dtype=object)
case_when(df, {
_.stars > 10000: "incredible!",
_.stars > 1000: "pretty good!",
_.stars > 100 : "keep going!",
True: "I don't know"
})
array(['incredible!', 'pretty good!', 'pretty good!', 'pretty good!'], dtype='<U12')
# data column is an array of DataFrames
nest(df, -_.language, key = "data")
language | data | |
---|---|---|
0 | R | repo owner stars x 1 dplyr ... |
1 | python | repo owner stars x 0 panda... |
unnest(nest(df, -_.language, key = "data"), "data")
language | repo | owner | stars | x | |
---|---|---|---|---|---|
0 | R | dplyr | tidyverse | 2800 | 2.0 |
1 | R | ggplot2 | tidyverse | 3500 | 3.0 |
2 | python | pandas | pandas-dev | 17800 | 1.0 |
3 | python | plotnine | has2k1 | 1450 | NaN |
count(df, "language", "owner")
language | owner | n | |
---|---|---|---|
0 | R | tidyverse | 2 |
1 | python | has2k1 | 1 |
2 | python | pandas-dev | 1 |
add_count(df, "language", "owner")
repo | owner | language | stars | x | n | |
---|---|---|---|---|---|---|
0 | pandas | pandas-dev | python | 17800 | 1.0 | 1 |
1 | dplyr | tidyverse | R | 2800 | 2.0 | 2 |
2 | ggplot2 | tidyverse | R | 3500 | 3.0 | 2 |
3 | plotnine | has2k1 | python | 1450 | NaN | 1 |
distinct(df, _.language, _keep_all = True)
repo | owner | language | stars | x | |
---|---|---|---|---|---|
0 | pandas | pandas-dev | python | 17800 | 1.0 |
1 | dplyr | tidyverse | R | 2800 | 2.0 |
distinct(df, "language")
language | |
---|---|
0 | python |
1 | R |
distinct(df, lang2 = _.language.str.lower())
lang2 | |
---|---|
0 | python |
1 | r |
gdf = group_by(df, "language")
ungroup(distinct(gdf, lang2 = _.language.str.upper()))
lang2 | |
---|---|
0 | R |
1 | PYTHON |
follow = pd.DataFrame({
'repo': ['pandas', 'dplyr', 'ggplot2', 'plotnine'],
'repo2': ['pandas', 'dplyr', None, None],
'follow': [True, False, True, False]
})
join(df, follow, how = "inner", on = "repo")
# join(df, df)
join(df, follow, how = "inner", on = {"repo": "repo2"})
left_join(df, follow, "repo")
repo | owner | language | stars | x | repo2 | follow | |
---|---|---|---|---|---|---|---|
0 | pandas | pandas-dev | python | 17800 | 1.0 | pandas | True |
1 | dplyr | tidyverse | R | 2800 | 2.0 | dplyr | False |
2 | ggplot2 | tidyverse | R | 3500 | 3.0 | None | True |
3 | plotnine | has2k1 | python | 1450 | NaN | None | False |
df2 = spread(df, 'language', 'stars')
df2
# this should raise an error, because duplicate id col x key combos
#spread(pd.concat([df, df]), 'language', 'stars')
repo | owner | x | R | python | |
---|---|---|---|---|---|
0 | dplyr | tidyverse | 2.0 | 2800.0 | NaN |
1 | ggplot2 | tidyverse | 3.0 | 3500.0 | NaN |
2 | pandas | pandas-dev | 1.0 | NaN | 17800.0 |
3 | plotnine | has2k1 | NaN | NaN | 1450.0 |
gather(df2, "key", "value", "R", "python", drop_na = True)
# TODO
# gather(df2, _.key, _.value, _["R":"python"])
repo | owner | x | key | value | |
---|---|---|---|---|---|
0 | dplyr | tidyverse | 2.0 | R | 2800.0 |
1 | ggplot2 | tidyverse | 3.0 | R | 3500.0 |
2 | pandas | pandas-dev | 1.0 | python | 17800.0 |
3 | plotnine | has2k1 | NaN | python | 1450.0 |
f = Pipeable(f = lambda x: x + 1) >> Pipeable(f = lambda x: "x is: {}".format(x))
f(2)
'x is: 3'
(df
>> mutate(
new_repo = _.repo + " waattt",
case = case_when(_, {_.language == "python": "aw yeah", True: 'wat'})
)
>> filter(_.stars > 5000)
)
repo | owner | language | stars | x | new_repo | case | |
---|---|---|---|---|---|---|---|
0 | pandas | pandas-dev | python | 17800 | 1.0 | pandas waattt | aw yeah |
df >> group_by(_.language) >> summarize(wat = _.stars.mean())
language | wat | |
---|---|---|
0 | R | 3150.0 |
1 | python | 9625.0 |