In [1]:

from siuba import *
from siuba import meta_hook

import pandas as pd

from pandas import DataFrame, Series

In [2]:

# Small example table of repositories used by every cell below.
# The last `x` value is missing, so NaN handling is visible in the outputs.
df = DataFrame(
    dict(
        repo=["pandas", "dplyr", "ggplot2", "plotnine"],
        owner=["pandas-dev", "tidyverse", "tidyverse", "has2k1"],
        language=["python", "R", "R", "python"],
        stars=[17800, 2800, 3500, 1450],
        x=[1, 2, 3, None],
    )
)

mutate¶

In [3]:

from pandas.core.groupby import DataFrameGroupBy

In [4]:

# Group by two keys, then add each repo's stars relative to its group's minimum.
# `out` is reused by the next cell.
gdf = group_by(df, 'language', "owner")
out = mutate(gdf, rel_stars1 = _.stars - _.stars.min())

# Not the last expression of the cell, so this result is not displayed.
ungroup(out)

# The same pipeline through the siu_* DataFrame methods; this is the value
# shown as the cell output.
(
    df
    .siu_group_by("language", "owner")
    .siu_mutate(rel_stars1 = _.stars - _.stars.min())
    .siu_ungroup()
)

Out[4]:

	repo	owner	language	stars	x	rel_stars1
0	pandas	pandas-dev	python	17800	1.0	0
1	dplyr	tidyverse	R	2800	2.0	0
2	ggplot2	tidyverse	R	3500	3.0	700
3	plotnine	has2k1	python	1450	NaN	0

In [5]:

# `out` comes from the previous cell (mutate applied to the grouped frame).
# Add a second derived column, then drop the grouping for display.
ungroup(
    mutate(out, rel_stars2 = _.stars + _.stars)
)

Out[5]:

	repo	owner	language	stars	x	rel_stars1	rel_stars2
0	pandas	pandas-dev	python	17800	1.0	0	35600
1	dplyr	tidyverse	R	2800	2.0	0	5600
2	ggplot2	tidyverse	R	3500	3.0	700	7000
3	plotnine	has2k1	python	1450	NaN	0	2900

In [6]:

# Method-chaining form of the grouped mutate, written as one parenthesized
# expression instead of backslash line continuations.
(
    df
    .siu_group_by("language", "owner")
    .siu_mutate(rel_stars1 = _.stars - _.stars.min())
    .siu_ungroup()
)

Out[6]:

	repo	owner	language	stars	x	rel_stars1
0	pandas	pandas-dev	python	17800	1.0	0
1	dplyr	tidyverse	R	2800	2.0	0
2	ggplot2	tidyverse	R	3500	3.0	700
3	plotnine	has2k1	python	1450	NaN	0

filter¶

In [7]:

# TODO: change name filter to query?

# Regular (ungrouped) filter with two conditions passed as separate arguments.
# It is not the last expression of the cell, so its result is not displayed.
filter(df, _.stars > 3000, _.stars < 15000)

# Grouped filter: `_.stars.min()` is evaluated within each language group, so
# the row holding each group's minimum is the one removed (see the output).
gdf = group_by(df, "language")
ungroup(
    filter(gdf, _.stars != _.stars.min())
)

Out[7]:

	repo	owner	language	stars	x
0	pandas	pandas-dev	python	17800	1.0
1	ggplot2	tidyverse	R	3500	3.0

summarize¶

In [8]:

# Summarize the whole DataFrame.
# Not the last expression of the cell, so this result is not displayed.
summarize(df, min_stars = _.stars.min())

# Summarize a grouped DataFrame: the output has one row per language.
# `gdf` is reused by the transmute cell that follows.
gdf = group_by(df, "language")
summarize(
    gdf,
    ttl_stars = _.stars.sum(),
    wat = _.stars.min(),
)

Out[8]:

	language	ttl_stars	wat
0	R	6300	2800
1	python	19250	1450

transmute¶

In [9]:

# Ungrouped transmute: keep "repo" plus the new column.
# Not the last expression of the cell, so this result is not displayed.
transmute(df, "repo", rel_stars1 = _.stars - _.stars.min())

# Grouped transmute. `gdf` is the language-grouped frame created in the
# summarize cell above; the output also carries the `language` column.
ungroup(
    transmute(gdf, "repo", rel_stars1 = _.stars - _.stars.min())
)

Out[9]:

	language	repo	rel_stars1
0	python	pandas	16350
1	R	dplyr	0
2	R	ggplot2	700
3	python	plotnine	0

select¶

In [10]:

# Design notes on writing renames in select as symbolic comparisons:
#  + can use dynamic values, e.g. colname == .x
#  + if select implements some name class, then nothing magic is happening,
#    e.g. _.y == _.x is equivalent to lambda cols: cols.y == cols.x
#  - long-winded (==, _.y seems harder to read than "y")

# In the output below, column x appears under the new name y and the
# language column is removed.
select(df, _.y == _.x, -_.language)
# Considered alternatives using strings, e.g.
# select(df, "y = x", "language")
# select(df, dict(y = "x"), "language")

Out[10]:

	y	repo	owner	stars
0	1.0	pandas	pandas-dev	17800
1	2.0	dplyr	tidyverse	2800
2	3.0	ggplot2	tidyverse	3500
3	NaN	plotnine	has2k1	1450

arrange¶

In [11]:

# Sort on two keys; the leading minus on `_.owner` presumably requests
# descending order for that key (TODO confirm against the siuba docs).
# Not the last expression of the cell, so this result is not displayed.
arrange(df, -_.owner, _.repo)

# Sort by a computed value: the length of the owner string (shortest first
# in the output below).
arrange(df, _.owner.str.len())

Out[11]:

	repo	owner	language	stars	x
3	plotnine	has2k1	python	1450	NaN
1	dplyr	tidyverse	R	2800	2.0
2	ggplot2	tidyverse	R	3500	3.0
0	pandas	pandas-dev	python	17800	1.0

helpers (if_else, case_when)¶

In [12]:

# Vectorized conditional on a concrete boolean Series; the output below is a
# NumPy array with one element per row.
if_else(df.repo == "dplyr", "yeah", "no")

Out[12]:

array(['no', 'yeah', 'no', 'no'], dtype='<U4')

In [13]:

# Given symbolic (`_`) arguments, if_else returns something callable rather
# than a result; it is applied to a DataFrame on the next line.
f = if_else(_.repo.str.contains("d"), _.repo, "wat")

# Evaluate the deferred expression against `df`.
f(df)

Out[13]:

array(['pandas', 'dplyr', 'wat', 'wat'], dtype=object)

In [14]:

# Map each row to a label. In the output, a row matching several conditions
# (e.g. stars = 17800) gets the label of the first one listed, so the order
# of the dict entries matters.
# `True` presumably acts as the catch-all default - no row reaches it here.
case_when(df, {
    _.stars > 10000: "incredible!",
    _.stars > 1000: "pretty good!",
    _.stars > 100 : "keep going!",
    True: "I don't know"
})

Out[14]:

array(['incredible!', 'pretty good!', 'pretty good!', 'pretty good!'],
      dtype='<U12')

nest and unnest¶

In [15]:

# Nest every column except `language` into a single column named "data".
# The data column is an array of DataFrames, one per language (see output).
nest(df, -_.language, key = "data")

Out[15]:

	language	data
0	R	repo owner stars x 1 dplyr ...
1	python	repo owner stars x 0 panda...

In [16]:

# Round trip: nest everything except `language` into "data", then expand the
# "data" column back into regular rows and columns.
unnest(
    nest(df, -_.language, key = "data"),
    "data",
)

Out[16]:

	language	repo	owner	stars	x
0	R	dplyr	tidyverse	2800	2.0
1	R	ggplot2	tidyverse	3500	3.0
2	python	pandas	pandas-dev	17800	1.0
3	python	plotnine	has2k1	1450	NaN

count¶

In [17]:

# One row per (language, owner) combination, with the number of rows in `n`.
count(df, "language", "owner")

Out[17]:

	language	owner	n
0	R	tidyverse	2
1	python	has2k1	1
2	python	pandas-dev	1

In [18]:

# Like count, but keeps every original row and appends the group size as `n`.
add_count(df, "language", "owner")

Out[18]:

	repo	owner	language	stars	x	n
0	pandas	pandas-dev	python	17800	1.0	1
1	dplyr	tidyverse	R	2800	2.0	2
2	ggplot2	tidyverse	R	3500	3.0	2
3	plotnine	has2k1	python	1450	NaN	1

Distinct¶

In [19]:

# One row per distinct language; with `_keep_all = True` the output retains
# all the other columns as well.
distinct(df, _.language, _keep_all = True)

Out[19]:

	repo	owner	language	stars	x
0	pandas	pandas-dev	python	17800	1.0
1	dplyr	tidyverse	R	2800	2.0

In [20]:

# Column given by name; without `_keep_all` only the distinct column is returned.
distinct(df, "language")

Out[20]:

	language
0	python
1	R

In [21]:

# Distinct values of a computed column, defined inline as a keyword argument.
distinct(df, lang2 = _.language.str.lower())

Out[21]:

	lang2
0	python
1	r

In [22]:

# Same computed-column distinct, applied to a grouped frame and then ungrouped.
gdf = group_by(df, "language")
ungroup(
    distinct(gdf, lang2 = _.language.str.upper())
)

Out[22]:

	lang2
0	R
1	PYTHON

Joins¶

In [23]:

# Second table to join against. `repo2` repeats `repo` for the first two rows
# and is missing for the rest.
follow = pd.DataFrame(
    dict(
        repo=['pandas', 'dplyr', 'ggplot2', 'plotnine'],
        repo2=['pandas', 'dplyr', None, None],
        follow=[True, False, True, False],
    )
)

# Inner join on a column with the same name in both tables.
# Not the last expression of the cell, so this result is not displayed.
join(df, follow, how = "inner", on = "repo")
# join(df, df)

# Inner join with `on` given as a dict pairing differently named columns.
# Also not displayed.
join(df, follow, how = "inner", on = {"repo": "repo2"})

# Left join on "repo"; this is the result shown below.
left_join(df, follow, "repo")

Out[23]:

	repo	owner	language	stars	x	repo2	follow
0	pandas	pandas-dev	python	17800	1.0	pandas	True
1	dplyr	tidyverse	R	2800	2.0	dplyr	False
2	ggplot2	tidyverse	R	3500	3.0	None	True
3	plotnine	has2k1	python	1450	NaN	None	False

Spread and Gather¶

In [24]:

# Long to wide: each distinct `language` value becomes its own column holding
# that row's `stars`; cells with no matching row are NaN in the output.
# `df2` is consumed by the gather cell that follows.
df2 = spread(df, 'language', 'stars')
df2
# this should raise an error, because duplicate id col x key combos
#spread(pd.concat([df, df]), 'language', 'stars')

Out[24]:

	repo	owner	x	R	python
0	dplyr	tidyverse	2.0	2800.0	NaN
1	ggplot2	tidyverse	3.0	3500.0	NaN
2	pandas	pandas-dev	1.0	NaN	17800.0
3	plotnine	has2k1	NaN	NaN	1450.0

In [25]:

# Wide to long: collapse the "R" and "python" columns of `df2` (built in the
# spread cell) into a "key"/"value" pair. With `drop_na = True` the output
# contains no NaN values in the value column.
gather(df2, "key", "value", "R", "python", drop_na = True)

# TODO
# gather(df2, _.key, _.value, _["R":"python"])

Out[25]:

	repo	owner	x	key	value
0	dplyr	tidyverse	2.0	R	2800.0
1	ggplot2	tidyverse	3.0	R	3500.0
2	pandas	pandas-dev	1.0	python	17800.0
3	plotnine	has2k1	NaN	python	1450.0

Piping¶

In [26]:

# Compose two single-argument steps with >>. The composed object is callable
# and applies the steps left to right (see the output for f(2)).
f = (
    Pipeable(f = lambda x: x + 1)
    >> Pipeable(f = lambda x: "x is: {}".format(x))
)

f(2)

Out[26]:

'x is: 3'

In [27]:

# Pipe the DataFrame through verbs with >>. When piping, each verb is called
# without the data argument.
(
    df
    >> mutate(
        new_repo = _.repo + " waattt",
        # `_` stands in for the piped data inside case_when.
        case = case_when(_, {_.language == "python": "aw yeah", True: 'wat'}),
    )
    >> filter(_.stars > 5000)
)

Out[27]:

	repo	owner	language	stars	x	new_repo	case
0	pandas	pandas-dev	python	17800	1.0	pandas waattt	aw yeah

In [28]:

# Grouped summary written as a pipe: mean stars per language.
(
    df
    >> group_by(_.language)
    >> summarize(wat = _.stars.mean())
)

Out[28]:

	language	wat
0	R	3150.0
1	python	9625.0