#!/usr/bin/env python
# coding: utf-8

# YAML specification for Series method support across backends
# ============================================================
#
# (drafted around 21 March 2020)
#
# Siuba aims to support as many pandas Series methods as possible across a range of backends.
# This means that users should be able to use the same method call they would on a pandas Series, and get back
# roughly the same result.
#
# Backends include:
#
# * pandas - siuba makes grouped operations more flexible and faster, while returning the same results as using the `.apply` method.
# * SQL (sqlite and postgresql) - for the exact same code used on pandas, users should be able to generate a SQL query.
# * spark (in progress) - as with SQL, users should be able to execute in spark.
#
# Note that pandas (e.g. with the `.apply` method) is the **reference implementation**. There are three big benefits to specifying the reference as data (e.g. in yaml): testing conformance, documenting, and tracking changes over releases.
#
# **In this document, I'll first review the value of a spec, go through siuba's current spec, and then walk through the script for transitioning to yaml.**
#
# ## More on importance
#
# **Testing conformance** is important because we need to trust that the different backends can be swapped in. This means that for every series method the backends support, there should be at least one test that they return the exact same result. Exceptions need to be marked explicitly somewhere:
#
# * cases where the results differ (e.g. pandas returns an int, SQL returns a float)
# * cases where a backend doesn't support a verb (e.g. some SQL aggregations can't be used in a mutate)
#
# **Documenting** is important because there are over 300 Series methods. A compact representation of support across backends will let people work quickly.
#
# **Tracking changes over releases** is important because as time goes on, we'll likely need to react to methods being deprecated in pandas.
#
# ## Why didn't I start with the YAML specification?
#
# Because there are 300+ Series methods, I wanted to prioritize a wide format, with few enclosing characters (`()`, `{}`, `[]`).
# I was concerned that a long document would require a lot of scanning, and would be hard to jump into.
#
# I was also doing a lot of research / exploration in a spreadsheet (actually, on airtable!).
#
# Now that things are much further along, I'm ready to pay down the technical debt, while preserving two valuable modes of interacting with the spec:
#
# * yaml <-> spreadsheet - so I can keep the wide representation in the spreadsheet
# * yaml -> implementation - so the spec is no longer python code (better for docs), is more explicit, and changes can be clearly diff'd in PRs.

# ## New YAML specification
#
# ### Schema
#
# * example
# * category
# * backends:
#   - postgresql: \
# * importance: (currently held in airtable)
# * action:
#   - status: {supported, todo, maydo, wontdo}
#   - kind: {elwise, agg, window}
#   - input_type: (default: null)
#   - result_type: (default: null)
#   - flags: [no_mutate, ...]
#
# ### Case: Pandas versions
#
# Siuba supports multiple pandas versions, so the spec will contain methods that exist in one version but not another. Similar considerations apply to deprecated methods.
#
# ### Case: Backend exceptions
#
# Backends may...
#
# * **be unable to implement a method**. For example, postgresql does not have nanosecond resolution, so cannot implement a nanosecond method.
# * **return a different type**. For example, postgresql date operations often return floats, while pandas' return ints.
#
# To this end, the spec allows the `backends` field to override settings configured in `action`.
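#
# For concreteness, here's a hypothetical entry following the schema above -- the method, category, and exact values are illustrative, not copied from the real spec. Note how the `backends` entry overrides the result type for postgresql, matching the date-function case above:
#
#     hour:
#       example: _.dt.hour
#       category: datetime
#       action:
#         status: supported
#         kind: elwise
#       backends:
#         postgresql:
#           result_type: float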
#
# ### Requirements
#
# The spec should be used to do the following, without pulling in other data sources...
#
# * generate a support table
# * generate series method unit tests
# * generate the fast pandas grouped methods

# ## Managing contributions
#
# Because all activity is now summarized through the spec, we should be able...
#
# * to point contributors to TODO entries
# * to ask them to audit maydo or wontdo issues (e.g. if I mistakenly thought something was not possible in postgres)
#
# Adding contributions will likely follow these steps...
#
# * open the base dialect file (`siuba.sql.transform.py`)
# * open the postgresql dialect file (`siuba.sql.dialect.postgresql.py`)
# * for a todo method, add the implementation to the first file if generic, otherwise the second
# * modify `siuba.spec.series.yml` to change "todo" to "supported"

# ## Migration script
#
# Below I read the existing spec (written using siu expressions), and wrangle it into the new yaml format. This is needed since exceptions for actions on different backends were tacked on haphazardly as I went.
#
# For example:
#
# * unlike most aggregation methods, postgres's implementation of `_.nunique()` can't be used in a mutate.
# * for date functions, pandas often returns an integer where postgres returns a float.
#
# Rather than override postgres's behavior in the second case, I'd prefer to declare it. By declaring it, we can always change how we handle it later.
#
# Below I read in the siuba spec and convert it to yaml. It's messy, but it gets the job done.

# In[1]:


import pandas as pd

from siuba.spec import series


# In[2]:


# NOTE: this is very messy--but also a one-way trip to the new YAML format (hopefully forever)

PLANS = {"Todo", "Maydo", "Wontdo"}
POSTGRESQL_STATUS = {"xfail": "todo", "not_impl": "wontdo", None: None}


def get_postgresql_status(result):
    status = POSTGRESQL_STATUS[result.get("postgresql")]
    if status is None:  # and result["type"] not in PLANS:
        if "sql_type" in result:
            return {"postgresql": {"result_type": "float"}}
        if "no_mutate" in result:
            return {"postgresql": {"flags": ["no_mutate"]}}
        return {}

    return {"postgresql": {"status": status}}


def get_pandas_status(result):
    return "supported" if result["type"] not in PLANS else result["type"].lower()


def get_type_info2(call, method, category):
    if call.func != "__rshift__":
        raise ValueError("Expected first expression to be >>")

    out = {}
    expr, result = call.args
    #accessors = ['str', 'dt', 'cat', 'sparse']
    #accessor = ([ameth for ameth in accessors if ameth in expr.op_vars()] + [None])[0]
    result_dict = result.to_dict()

    # format action ----
    action = {
        "status": get_pandas_status(result_dict),
        **result_dict
    }

    if action["type"] not in PLANS:
        action["kind"] = action["type"].lower()

    if "postgresql" in action:
        del action["postgresql"]
    if "no_mutate" in action:
        del action["no_mutate"]
    if "sql_type" in action:
        del action["sql_type"]

    del action["type"]

    if "op" in action:
        action["input_type"] = "bool"
        del action["op"]

    # backends ---
    backends = get_postgresql_status(result_dict)

    return dict(
        example = str(expr),
        category = category,
        #expr_frame = replace_meta_args(expr, _.x, _.y, _.z),
        #accessor = accessor[0],
        backends = backends,
        action = action
    )


# In[3]:


out = {}
for category, d in series.funcs_stripped.items():
    for name, call in d.items():
        out[name] = get_type_info2(call, name, category)


# ### Reading into pandas

# In[4]:


pd.json_normalize([{'method': k, **v} for k, v in out.items()])
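

# One of the requirements above is generating a support table straight from the spec. Below is a minimal sketch of how the normalized frame could drive that. The dotted column names ("action.status", "backends.postgresql.status") are what `pd.json_normalize` produces for the nested fields; the derived "postgresql" column is illustrative, not part of siuba's actual tooling.

# In[ ]:


tbl = pd.json_normalize([{"method": k, **v} for k, v in out.items()])

# a method inherits the reference (pandas) status, unless the backend overrides it
pg_status = tbl.get("backends.postgresql.status")
tbl["postgresql"] = tbl["action.status"] if pg_status is None else pg_status.fillna(tbl["action.status"])

tbl[["method", "category", "action.status", "postgresql"]].head()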
# ### Dumping to yaml
#
# Key questions:
#
# * should we sort alphabetically? **I think yes.** It won't group categories, but will be very predictable. The YAML spec is for storing, the airtable (or DataFrame) for browsing.
# * loading the yaml takes half a second--is that a reasonable delay to add to the import? No, I don't think so. But it's only loaded during testing, and for the experimental fast pandas grouped ops. Before removing their experimental status, we should use the spec to generate the fast grouped ops code (or maybe save a subset of the spec for them, which shouldn't be edited).

# In[5]:


import yaml

print(yaml.dump(out)[:344])


# In[6]:


# uncomment to dump
#yaml.dump(out, open("../../siuba/spec/series.yml", "w"))


# In[7]:


import pkg_resources

# the yaml spec will go here (on my filesystem)
pkg_resources.resource_filename("siuba.spec", "series.yml")


# In[8]:


get_ipython().run_cell_magic('time', '', 'spec = yaml.load(open("../../siuba/spec/series.yml"), Loader = yaml.SafeLoader)\n')


# ## Join with airtable
#
# As a reminder, an entry from the yaml spec so far is shown below...

# In[9]:


raw_spec = out
raw_spec["nunique"]


# This is useful, but I had tracked other information in the airtable, like...
#
# * **priority**: how important is this to implement?
# * **version deprecated**: when was this deprecated?
# * **result length**: e.g. does it return a single value, a value for each group, or something else?
#
# For now, I'll pull out priority, and will likely just keep the other info in the airtable. I would rather start with less rather than more (and wrap up faster in the process).

# In[10]:


import pandas as pd

# read the yaml spec into a dataframe, so we can join w/ the airtable
data = pd.DataFrame([{'method': k, 'data': v} for k, v in raw_spec.items()])

data.head()


# In[11]:


from airtable import Airtable
import os

# note, the airtable API key is in my environment
airtable = Airtable('appErTNqCFXn6stSH', 'methods')

air_entries = airtable.get_all()


# In[12]:


air_df = pd.json_normalize(air_entries)
air_df.columns = air_df.columns.str.replace("fields.", "")

air_df.head()


# In[13]:


# Pull out priority info
from siuba import *

prioritized = (
    data
    >> full_join(_, air_df, {"method": "method_name"})
    >> filter(~_.data.isna())
    >> select(-_.method_name, -_.createdTime, -_.id, -_.expr_frame, -_.expr_series)
)

new_yaml = (
    prioritized
    >> mutate(
        priority = _.support_category.map({
            'priority-high': 3,
            'priority-medium': 2,
            'priority-low': 1,
            'priority-zero': 0
        }),
        data = _.apply(
            lambda d: {
                **d["data"],
                **({'priority': int(d["priority"])} if not pd.isna(d["priority"]) else {})
            },
            axis = 1
        )
    )
    >> pipe(_.set_index("method").data.to_dict())
)

list(new_yaml.items())[109:112]


# In[14]:


# uncomment to save yaml spec
#yaml.dump(new_yaml, open("../../siuba/spec/series.yml", "w"))
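

# With the spec saved, the "generate series method unit tests" requirement could take roughly the shape below. This is a sketch only: `execute_example` is a hypothetical stand-in for whatever evaluates an entry's example expression against a backend, and does not exist in siuba.

# In[ ]:


import pytest

params = [
    pytest.param(
        name, entry,
        # mark methods the postgresql backend hasn't implemented yet as xfail
        marks = pytest.mark.xfail if entry["backends"].get("postgresql", {}).get("status") == "todo" else [],
    )
    for name, entry in new_yaml.items()
]


@pytest.mark.parametrize("name, entry", params)
def test_postgresql_matches_pandas(name, entry):
    # execute_example is hypothetical: run the entry's example against a backend
    result_pd = execute_example(entry["example"], backend = "pandas")
    result_sql = execute_example(entry["example"], backend = "postgresql")
    assert result_pd == result_sql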