# NBVAL_IGNORE_OUTPUT
from sqlalchemy import sql
from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey
from sqlalchemy import create_engine
import os
port = os.environ.get("SB_TEST_PGPORT", "5432")
pwd = os.environ.get("SB_TEST_PGPASSWORD", "")
engine = create_engine('postgresql://postgres:%s@localhost:%s/postgres'%(pwd, port), echo=False)
metadata = MetaData()
users = Table('users', metadata,
Column('id', Integer, primary_key=True),
Column('name', String),
Column('fullname', String),
)
addresses = Table('addresses', metadata,
Column('id', Integer, primary_key=True),
Column('user_id', None, ForeignKey('users.id')),
Column('email_address', String, nullable=False)
)
metadata.drop_all(engine)
metadata.create_all(engine)
conn = engine.connect()
ins = users.insert().values(name='jack', fullname='Jack Jones')
result = conn.execute(ins)
ins = users.insert()
conn.execute(ins, id=2, name='wendy', fullname='Wendy Williams')
conn.execute(addresses.insert(), [
{'user_id': 1, 'email_address' : 'jack@yahoo.com'},
{'user_id': 1, 'email_address' : 'jack@msn.com'},
{'user_id': 2, 'email_address' : 'www@www.org'},
{'user_id': 2, 'email_address' : 'wendy@aol.com'},
])
<sqlalchemy.engine.result.ResultProxy at 0x110690470>
from siuba import *
from siuba.sql.verbs import LazyTbl, collect, show_query
from siuba.sql.dply.vector import dense_rank
import siuba.meta_hook.sqlalchemy.sql.functions as F
from sqlalchemy import sql
tbl_addresses = LazyTbl(conn, addresses)
tbl_users = LazyTbl(conn, users)
#tbl_addresses >> mutate(_, num = dense_rank(_.id)) >> show_query(_)
q = (tbl_addresses
>> group_by("user_id")
>> mutate(num = dense_rank(_.id))
>> filter(
_.id > _.id.min(),
_.email_address.str.startswith("jack")
)
>> ungroup()
>> show_query(simplify = True)
>> collect()
)
q
SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_1.num FROM (SELECT id, user_id, email_address, num, min(anon_2.id) OVER (PARTITION BY anon_2.user_id) AS win1 FROM (SELECT id, user_id, email_address, dense_rank() OVER (PARTITION BY addresses.user_id ORDER BY addresses.id) AS num FROM addresses) AS anon_2) AS anon_1 WHERE anon_1.id > anon_1.win1 AND (anon_1.email_address LIKE 'jack' || '%%')
/Users/machow/Dropbox/Repo/siuba/siuba/sql/translate.py:31: SiubaSqlRuntimeWarning: dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'. This warning only displays once per function SiubaSqlRuntimeWarning
id | user_id | email_address | num | |
---|---|---|---|---|
0 | 2 | 1 | jack@msn.com | 2 |
q = (tbl_addresses
>> mutate(rank = dense_rank(_.id) + 1)
>> show_query()
)
SELECT addresses.id, addresses.user_id, addresses.email_address, dense_rank() OVER (ORDER BY addresses.id) + 1 AS rank FROM addresses
/Users/machow/Dropbox/Repo/siuba/siuba/sql/translate.py:31: SiubaSqlRuntimeWarning: dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'. This warning only displays once per function SiubaSqlRuntimeWarning
q = (tbl_addresses
>> group_by("user_id")
>> mutate(rank = _.id > dense_rank(_.id) + 1)
>> show_query()
)
SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.id > dense_rank() OVER (PARTITION BY addresses.user_id, addresses.user_id, addresses.user_id ORDER BY addresses.id) + 1 AS rank FROM addresses
# rename and first mutate in same query,
# second mutate is outer query (since uses to prev col)
q = (tbl_addresses
>> select(_.email == _.email_address)
>> mutate(is_mikey = _.email.str.startswith("mikey"), mikey2 = _.is_mikey + 1)
>> show_query()
)
SELECT anon_1.email, anon_1.is_mikey, anon_1.is_mikey + 1 AS mikey2 FROM (SELECT anon_2.email AS email, (anon_2.email LIKE 'mikey' || '%%') AS is_mikey FROM (SELECT addresses.email_address AS email FROM addresses) AS anon_2) AS anon_1
q = (tbl_addresses
>> filter(_.id > 1)
>> show_query()
)
SELECT anon_1.id, anon_1.user_id, anon_1.email_address FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address FROM addresses) AS anon_1 WHERE anon_1.id > 1
q = (tbl_addresses
>> group_by("user_id")
>> filter(_.id > 1)
>> show_query()
)
SELECT anon_1.id, anon_1.user_id, anon_1.email_address FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address FROM addresses) AS anon_1 WHERE anon_1.id > 1
q = (tbl_addresses
>> group_by("user_id")
>> filter(dense_rank(_.id) > 1)
>> show_query()
>> collect()
)
q
SELECT anon_1.id, anon_1.user_id, anon_1.email_address FROM (SELECT anon_2.id AS id, anon_2.user_id AS user_id, anon_2.email_address AS email_address, dense_rank() OVER (PARTITION BY anon_2.user_id ORDER BY anon_2.id) AS win1 FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address FROM addresses) AS anon_2) AS anon_1 WHERE anon_1.win1 > 1
id | user_id | email_address | |
---|---|---|---|
0 | 2 | 1 | jack@msn.com |
1 | 4 | 2 | wendy@aol.com |
q = (tbl_addresses
>> group_by("user_id")
>> summarize(avg_id = _.id.mean())
>> show_query()
>> collect()
)
q
SELECT addresses.user_id, avg(addresses.id) AS avg_id FROM addresses GROUP BY addresses.user_id
user_id | avg_id | |
---|---|---|
0 | 2 | 3.5 |
1 | 1 | 1.5 |
q = (tbl_addresses >> mutate(_, id2 = _.id + 1) >> summarize(_, m_id = _.id2.mean())) >> show_query()
SELECT avg(anon_1.id2) AS m_id FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.id + 1 AS id2 FROM addresses) AS anon_1
q = (tbl_addresses
>> group_by("user_id")
>> count(_.id)
>> show_query()
>> collect()
)
q
SELECT anon_1.user_id, anon_1.id, count(*) AS n FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address FROM addresses) AS anon_1 GROUP BY anon_1.user_id, anon_1.id ORDER BY n DESC
user_id | id | n | |
---|---|---|---|
0 | 1 | 2 | 1 |
1 | 2 | 3 | 1 |
2 | 2 | 4 | 1 |
3 | 1 | 1 | 1 |
# TODO: not executable like this, how to get first SELECT out of parens?
# E.g. can use users.join, etc..
q = (tbl_addresses
>> left_join(_, tbl_users, {"user_id": "id"})
>> show_query()
>> collect()
)
q
SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_2.fullname, anon_2.name FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address FROM addresses) AS anon_1 LEFT OUTER JOIN (SELECT users.id AS id, users.name AS name, users.fullname AS fullname FROM users) AS anon_2 ON anon_1.user_id = anon_2.id
id | user_id | email_address | fullname | name | |
---|---|---|---|---|---|
0 | 1 | 1 | jack@yahoo.com | Jack Jones | jack |
1 | 2 | 1 | jack@msn.com | Jack Jones | jack |
2 | 3 | 2 | www@www.org | Wendy Williams | wendy |
3 | 4 | 2 | wendy@aol.com | Wendy Williams | wendy |
## TODO: fix sql case_when statements
q = (tbl_addresses
>> mutate(
label = case_when(_, {
_.id > 20: 0,
_.id > 1: 1,
True: _.id
})
)
>> show_query()
>> collect()
)
q
SELECT addresses.id, addresses.user_id, addresses.email_address, CASE WHEN (addresses.id > 20) THEN 0 WHEN (addresses.id > 1) THEN 1 ELSE addresses.id END AS label FROM addresses
id | user_id | email_address | label | |
---|---|---|---|---|
0 | 1 | 1 | jack@yahoo.com | 1 |
1 | 2 | 1 | jack@msn.com | 1 |
2 | 3 | 2 | www@www.org | 1 |
3 | 4 | 2 | wendy@aol.com | 1 |
## TODO: fix sql case_when statements
# works, but better to just use filter normally...
q = (tbl_addresses
>> filter(
case_when(_, {
_.id > 20: True,
_.id > 1: False,
True: True
})
)
>> show_query()
>> collect()
)
q
SELECT anon_1.id, anon_1.user_id, anon_1.email_address FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address FROM addresses) AS anon_1 WHERE CASE WHEN (anon_1.id > 20) THEN true WHEN (anon_1.id > 1) THEN false ELSE true END
id | user_id | email_address | |
---|---|---|---|
0 | 1 | 1 | jack@yahoo.com |
q = (tbl_addresses
>> group_by("user_id")
>> mutate(
label = case_when(_, {
_.id > _.id.mean(): 0,
_.id > 20: 1,
True: _.id
})
)
>> show_query()
>> collect()
)
q
SELECT addresses.id, addresses.user_id, addresses.email_address, CASE WHEN (addresses.id > avg(addresses.id) OVER (PARTITION BY addresses.user_id)) THEN 0 WHEN (addresses.id > 20) THEN 1 ELSE addresses.id END AS label FROM addresses
id | user_id | email_address | label | |
---|---|---|---|---|
0 | 1 | 1 | jack@yahoo.com | 1 |
1 | 2 | 1 | jack@msn.com | 0 |
2 | 3 | 2 | www@www.org | 3 |
3 | 4 | 2 | wendy@aol.com | 0 |
# NBVAL_IGNORE_OUTPUT
case_when(_, {_.id > 1: "yeah", True: "no"})
█─'__call__' ├─<function case_when at 0x11b2e5488> ├─_ └─█─'<lazy>' └─█─'__call__' ├─<class 'dict'> └─{_.id > 1: 'yeah', True: 'no'}
q = (tbl_addresses
>> mutate(big_id = if_else(dense_rank(_.id) > 1, "yes", "no"))
>> show_query()
)
SELECT addresses.id, addresses.user_id, addresses.email_address, CASE WHEN (dense_rank() OVER (ORDER BY addresses.id) > 1) THEN 'yes' ELSE 'no' END AS big_id FROM addresses
/Users/machow/Dropbox/Repo/siuba/siuba/sql/translate.py:31: SiubaSqlRuntimeWarning: dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'. This warning only displays once per function SiubaSqlRuntimeWarning
(tbl_addresses
>> head(3)
>> show_query()
>> collect()
)
SELECT addresses.id, addresses.user_id, addresses.email_address FROM addresses LIMIT 3
id | user_id | email_address | |
---|---|---|---|
0 | 1 | 1 | jack@yahoo.com |
1 | 2 | 1 | jack@msn.com |
2 | 3 | 2 | www@www.org |
(tbl_addresses
>> rename(id2 = "id")
>> show_query()
>> collect()
)
SELECT addresses.id AS id2, addresses.user_id, addresses.email_address FROM addresses
id2 | user_id | email_address | |
---|---|---|---|
0 | 1 | 1 | jack@yahoo.com |
1 | 2 | 1 | jack@msn.com |
2 | 3 | 2 | www@www.org |
3 | 4 | 2 | wendy@aol.com |
(tbl_addresses
>> distinct(_.user_id, user_id2 = _.user_id + 1)
>> show_query()
>> collect()
)
SELECT DISTINCT addresses.user_id, addresses.user_id + 1 AS user_id2 FROM addresses
user_id | user_id2 | |
---|---|---|
0 | 2 | 3 |
1 | 1 | 2 |
(tbl_addresses
>> group_by("user_id")
>> distinct(_.email_address, user_id2 = dense_rank(_.user_id))
>> show_query()
>> collect()
)
SELECT DISTINCT addresses.email_address, dense_rank() OVER (PARTITION BY addresses.user_id ORDER BY addresses.user_id) AS user_id2 FROM addresses
/Users/machow/Dropbox/Repo/siuba/siuba/sql/translate.py:31: SiubaSqlRuntimeWarning: dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'. This warning only displays once per function SiubaSqlRuntimeWarning
email_address | user_id2 | |
---|---|---|
0 | jack@msn.com | 1 |
1 | jack@yahoo.com | 1 |
2 | www@www.org | 1 |
3 | wendy@aol.com | 1 |
from siuba.dply.vector import n
tbl_addresses \
>> group_by("user_id") \
>> summarize(n = n(_)) \
>> collect()
user_id | n | |
---|---|---|
0 | 2 | 2 |
1 | 1 | 2 |
q = tbl_addresses >> mutate(hour = _.id.dt.hour) >> show_query()
SELECT addresses.id, addresses.user_id, addresses.email_address, EXTRACT(hour FROM addresses.id) AS hour FROM addresses
from siuba.dply.vector import desc
(tbl_addresses
>> arrange(desc(_.id))
>> mutate(cumsum = _.user_id.cumsum())
>> arrange(_.cumsum)
>> show_query()
)
SELECT anon_1.id, anon_1.user_id, anon_1.email_address, sum(anon_1.user_id) OVER (ORDER BY anon_1.id DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumsum FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address FROM addresses ORDER BY addresses.id DESC) AS anon_1 ORDER BY cumsum
# Source: lazy query # DB Conn: Engine(postgresql://postgres:***@localhost:5432/postgres) # Preview:
id | user_id | email_address | cumsum | |
---|---|---|---|---|
0 | 4 | 2 | wendy@aol.com | 2 |
1 | 3 | 2 | www@www.org | 4 |
2 | 2 | 1 | jack@msn.com | 5 |
3 | 1 | 1 | jack@yahoo.com | 6 |
# .. may have more rows
(tbl_addresses
>> mutate(id2 = _.id.round(2))
>> show_query()
>> collect()
)
SELECT addresses.id, addresses.user_id, addresses.email_address, round(CAST(addresses.id AS NUMERIC), 2) AS id2 FROM addresses
id | user_id | email_address | id2 | |
---|---|---|---|---|
0 | 1 | 1 | jack@yahoo.com | 1.0 |
1 | 2 | 1 | jack@msn.com | 2.0 |
2 | 3 | 2 | www@www.org | 3.0 |
3 | 4 | 2 | wendy@aol.com | 4.0 |
import sqlalchemy
metadata2 = MetaData()
users2 = sqlalchemy.Table('users', metadata2, autoload = True, autoload_with = engine)
tbl_users2 = LazyTbl(conn, users2)
(tbl_users
>> mutate(id2 = _.id + 1)
>> show_query()
>> collect()
)
SELECT users.id, users.name, users.fullname, users.id + 1 AS id2 FROM users
id | name | fullname | id2 | |
---|---|---|---|---|
0 | 1 | jack | Jack Jones | 2 |
1 | 2 | wendy | Wendy Williams | 3 |
import sqlalchemy
metadata3 = MetaData()
tbl_users3 = LazyTbl(conn, "users")
(tbl_users
>> mutate(id2 = _.id + 1)
>> show_query()
>> collect()
)
SELECT users.id, users.name, users.fullname, users.id + 1 AS id2 FROM users
id | name | fullname | id2 | |
---|---|---|---|---|
0 | 1 | jack | Jack Jones | 2 |
1 | 2 | wendy | Wendy Williams | 3 |
tbl_users >> mutate(id2 = _.id + 1)
# Source: lazy query # DB Conn: Engine(postgresql://postgres:***@localhost:5432/postgres) # Preview:
id | name | fullname | id2 | |
---|---|---|---|---|
0 | 1 | jack | Jack Jones | 2 |
1 | 2 | wendy | Wendy Williams | 3 |
# .. may have more rows