#!/usr/bin/env python # coding: utf-8 # # SQL translators # The purpose of this vignette is to walk through how expressions like `_.id.mean()` are converted into SQL. # # This process involves 3 parts # # 1. SQL translation functions, e.g. taking column "id" and producing the SQL "ROUND(id)". # 2. SQL translation from a symbolic call # - Converting method calls like `_.id.round(2)` to `round(_.id, 2)` # - Looking up SQL translators (e.g. for "mean" function call) # 3. Handling SQL partitions, like in OVER clauses # # ### Using sqlalchemy select statement for convenience # # Throughout this vignette, we'll use a select statement object from sqlalchemy, # so we can conveniently access its columns as needed. # In[1]: from sqlalchemy import sql col_names = ['id', 'x', 'y'] sel = sql.select([sql.column(x) for x in col_names]) print(sel) print(type(sel.columns)) print(sel.columns) # ## Translator functions # # A SQL translator function takes... # # * a first argument that is a sqlalchemy Column # * (optional) additional arguments for the translation # ### A simple translator # In[2]: f_simple_round = lambda col, n: sql.func.round(col, n) sql_expr = f_simple_round(sel.columns.x, 2) print(sql_expr) # The function above is essentially what most translator functions are. # # For example, here is the round function defined for postgresql. # One key difference is that it casts the column to a numeric beforehand. # In[3]: from siuba.sql.dialects.postgresql import funcs f_round = funcs['scalar']['round'] sql_expr = f_round(sel.columns.x, 2) print(sql_expr) # ### Handling windows with custom Over clauses # In[4]: f_win_mean = funcs['window']['mean'] sql_over_expr = f_win_mean(sel.columns.x) print(type(sql_over_expr)) print(sql_over_expr) # Notice that this window expression has an empty over clause. This clause needs to be able to include any variables we've grouped the data by. # # Siuba handles this by implementing a `set_over` method on these custom sqlalchemy Over clauses, which takes grouping and ordering variables as arguments. # In[5]: group_by_clause = sql.elements.ClauseList(sel.columns.x, sel.columns.y) print(sql_over_expr.set_over(group_by_clause)) # ## Call shaping # # The section above discusses how SQL translators are functions that take a sqlalchemy column, and return a SQL expression. However, when using siuba we often have expressions like... # # ``` # mutate(data, x = _.y.round(2)) # ``` # # In this case, before we can even use a SQL translator, we need to... # # * find the name and arguments of the method being called # * find the column it is being called on # # This is done by using the `CallTreeLocal` class to analyze the tree of operations for each expression. # In[6]: from siuba.siu import Lazy, CallTreeLocal, Call, strip_symbolic from siuba import _ _.y.round(2) # ### Example of translation with CallTreeLocal # In[7]: from siuba.sql.dialects.postgresql import funcs local_funcs = {**funcs['scalar'], **funcs['window']} call_shaper = CallTreeLocal( local_funcs, call_sub_attr = ('dt',) ) # In[8]: symbol = _.id.mean() call = strip_symbolic(symbol) print(call) # In[9]: func_call = call_shaper.enter(call) print(func_call(sel.columns)) # This is the same result as when we called the SQL translator for `mean` manually! # In that section we also showed that we can set group information, so that it takes # an average within each group. # # In this case it's easy to set group information to the Over clause. # However, an additional challenge is when it's part of a larger expression... # In[10]: call2 = strip_symbolic(_.id.mean() + 1) func_call2 = call_shaper.enter(call2) func_call2(sel.columns) # ## Handling partitions # # While the first section showed how siuba's custom Over clauses can add grouping info to a translation, it is missing one key detail: expressions that generate Over clauses, like `_.id.mean()`, can be part of larger expressions. For example `_.id.mean() + 1`. # # In this case, if we look at the call tree for that expression, the top operation is the addition... # In[11]: _.id.mean() + 1 # How can we create the appropriate expression... # # ``` # avg(some_col) OVER (PARTITION BY x, y) + 1 # ``` # # when the piece that needs grouping info is not easily accessible? The answer is by using a tree visitor, which steps down every black rectangle in the call tree shown above, from top to bottom. # # ### Full example # # Below, we copy the code from the call shaping section.. # In[12]: from siuba.sql.verbs import track_call_windows from siuba import _ from siuba.sql.dialects.postgresql import funcs local_funcs = {**funcs['scalar'], **funcs['window']} call_shaper = CallTreeLocal( local_funcs, call_sub_attr = ('dt',) ) symbol3 = _.id.mean() + 1 call3 = strip_symbolic(symbol3) func_call3 = call_shaper.enter(call3) # Finally, we pass the shaped call... # In[13]: col, windows, window_cte = track_call_windows( func_call3, sel.columns, group_by = ['x', 'y'], order_by = [], # note that this is optional, and results in window_cte being a # copy of this select that contains the window clauses window_cte = sel.select() ) print(col) print(windows) print(window_cte)