#!/usr/bin/env python # coding: utf-8 # # Postgres # # In[1]: # NBVAL_IGNORE_OUTPUT from sqlalchemy import sql from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey from sqlalchemy import create_engine import os port = os.environ.get("SB_TEST_PGPORT", "5432") pwd = os.environ.get("SB_TEST_PGPASSWORD", "") engine = create_engine('postgresql://postgres:%s@localhost:%s/postgres'%(pwd, port), echo=False) metadata = MetaData() users = Table('users', metadata, Column('id', Integer, primary_key=True), Column('name', String), Column('fullname', String), ) addresses = Table('addresses', metadata, Column('id', Integer, primary_key=True), Column('user_id', None, ForeignKey('users.id')), Column('email_address', String, nullable=False) ) metadata.drop_all(engine) metadata.create_all(engine) conn = engine.connect() ins = users.insert().values(name='jack', fullname='Jack Jones') result = conn.execute(ins) ins = users.insert() conn.execute(ins, id=2, name='wendy', fullname='Wendy Williams') conn.execute(addresses.insert(), [ {'user_id': 1, 'email_address' : 'jack@yahoo.com'}, {'user_id': 1, 'email_address' : 'jack@msn.com'}, {'user_id': 2, 'email_address' : 'www@www.org'}, {'user_id': 2, 'email_address' : 'wendy@aol.com'}, ]) # ## Big Example # In[2]: from siuba import * from siuba.sql.verbs import LazyTbl, collect, show_query from siuba.sql.dply.vector import dense_rank import siuba.meta_hook.sqlalchemy.sql.functions as F from sqlalchemy import sql tbl_addresses = LazyTbl(conn, addresses) tbl_users = LazyTbl(conn, users) #tbl_addresses >> mutate(_, num = dense_rank(_.id)) >> show_query(_) q = (tbl_addresses >> group_by("user_id") >> mutate(num = dense_rank(_.id)) >> filter( _.id > _.id.min(), _.email_address.str.startswith("jack") ) >> ungroup() >> show_query(simplify = True) >> collect() ) q # ## Mutate # In[3]: q = (tbl_addresses >> mutate(rank = dense_rank(_.id) + 1) >> show_query() ) # In[4]: q = (tbl_addresses >> group_by("user_id") >> mutate(rank = _.id > dense_rank(_.id) + 1) >> show_query() ) # In[5]: # rename and first mutate in same query, # second mutate is outer query (since uses to prev col) q = (tbl_addresses >> select(_.email == _.email_address) >> mutate(is_mikey = _.email.str.startswith("mikey"), mikey2 = _.is_mikey + 1) >> show_query() ) # ## Filter # In[6]: q = (tbl_addresses >> filter(_.id > 1) >> show_query() ) # In[7]: q = (tbl_addresses >> group_by("user_id") >> filter(_.id > 1) >> show_query() ) # In[8]: q = (tbl_addresses >> group_by("user_id") >> filter(dense_rank(_.id) > 1) >> show_query() >> collect() ) q # ## Summarize # In[9]: q = (tbl_addresses >> group_by("user_id") >> summarize(avg_id = _.id.mean()) >> show_query() >> collect() ) q # In[10]: q = (tbl_addresses >> mutate(_, id2 = _.id + 1) >> summarize(_, m_id = _.id2.mean())) >> show_query() # ## Count # In[11]: q = (tbl_addresses >> group_by("user_id") >> count(_.id) >> show_query() >> collect() ) q # ## Joins # In[12]: # TODO: not executable like this, how to get first SELECT out of parens? # E.g. can use users.join, etc.. q = (tbl_addresses >> left_join(_, tbl_users, {"user_id": "id"}) >> show_query() >> collect() ) q # ## case_when # In[13]: ## TODO: fix sql case_when statements q = (tbl_addresses >> mutate( label = case_when(_, { _.id > 20: 0, _.id > 1: 1, True: _.id }) ) >> show_query() >> collect() ) q # In[14]: ## TODO: fix sql case_when statements # works, but better to just use filter normally... q = (tbl_addresses >> filter( case_when(_, { _.id > 20: True, _.id > 1: False, True: True }) ) >> show_query() >> collect() ) q # In[15]: q = (tbl_addresses >> group_by("user_id") >> mutate( label = case_when(_, { _.id > _.id.mean(): 0, _.id > 20: 1, True: _.id }) ) >> show_query() >> collect() ) q # In[16]: # NBVAL_IGNORE_OUTPUT case_when(_, {_.id > 1: "yeah", True: "no"}) # ## if_else # In[17]: q = (tbl_addresses >> mutate(big_id = if_else(dense_rank(_.id) > 1, "yes", "no")) >> show_query() ) # ## Head # In[18]: (tbl_addresses >> head(3) >> show_query() >> collect() ) # ## Rename # In[19]: (tbl_addresses >> rename(id2 = "id") >> show_query() >> collect() ) # ## Distinct # In[20]: (tbl_addresses >> distinct(_.user_id, user_id2 = _.user_id + 1) >> show_query() >> collect() ) # In[21]: (tbl_addresses >> group_by("user_id") >> distinct(_.email_address, user_id2 = dense_rank(_.user_id)) >> show_query() >> collect() ) # # Technical # ## Translating symbolic function calls # In[22]: from siuba.dply.vector import n tbl_addresses \ >> group_by("user_id") \ >> summarize(n = n(_)) \ >> collect() # ## Translating str methods # ## Translating dt methods # In[23]: q = tbl_addresses >> mutate(hour = _.id.dt.hour) >> show_query() # ## SQL escapes # ## Window functions # In[24]: from siuba.dply.vector import desc (tbl_addresses >> arrange(desc(_.id)) >> mutate(cumsum = _.user_id.cumsum()) >> arrange(_.cumsum) >> show_query() ) # # Misc # ## postgres specific # In[25]: (tbl_addresses >> mutate(id2 = _.id.round(2)) >> show_query() >> collect() ) # ## autoload table w/ sqlalchemy # In[26]: import sqlalchemy metadata2 = MetaData() users2 = sqlalchemy.Table('users', metadata2, autoload = True, autoload_with = engine) tbl_users2 = LazyTbl(conn, users2) (tbl_users >> mutate(id2 = _.id + 1) >> show_query() >> collect() ) # ## auto table from string # In[27]: import sqlalchemy metadata3 = MetaData() tbl_users3 = LazyTbl(conn, "users") (tbl_users >> mutate(id2 = _.id + 1) >> show_query() >> collect() ) # ## LazyTbl repr # In[28]: tbl_users >> mutate(id2 = _.id + 1)