#!/usr/bin/env python
# coding: utf-8

# # Polars vs Pandas

# ![image.png](attachment:0b7753cc-a06f-4c07-a980-89e00d54d526.png)
#
# (Didn't know this was a thing before I googled "Panda vs Polar" accidentally. Credits: [Panda & Polar Bear](https://www.pandaandpolarbear.com/))

# ## Polars
#
# * [Python Docs](https://pola-rs.github.io/polars-book/user-guide/introduction.html)
# * [Github](https://github.com/pola-rs/polars)
# * [PyPI](https://pypi.org/project/polars/)
# * Features:
#   * Lazy & eager computation
#   * Rust implementation
#   * [Arrow](https://arrow.apache.org/) memory layout
#   * Easy and transparent parallelization using multithreading
#   * PySpark-like syntax, heavily inspired by SQL
#   * Supports real NA values, in contrast to Pandas
#   * Easily deals with complex data types, e.g. lists of strings/floats
#   * Copy-on-Write (COW) semantics, in contrast to Pandas where you kind of never know

# ## Pandas
#
# * [Docs](https://pandas.pydata.org/)
# * [Github](https://github.com/pandas-dev/pandas)
# * [PyPI](https://pypi.org/project/pandas/)
# * Features:
#   * De facto standard data wrangling library for Python
#   * Multi-index for rows and columns
#   * Quite unrestricted in what's possible, e.g. anything hashable can be a column name (integers, floats, enums)
#   * Tons of functionality
#   * No parallelization
#   * Built on top of Numpy

# ## Other Contenders
#
# * [Vaex](https://github.com/vaexio/vaex): Lazy out-of-core dataframes
# * [Dask](https://docs.dask.org/): Built on top of Pandas and parallelizes it, on a single node or distributed
# * [H2O Datatable](https://github.com/h2oai/datatable): Inspired by R's data.table
# * [Modin](https://modin.readthedocs.io/en/latest/): Uses [Ray](https://docs.ray.io/) or [Dask](https://docs.dask.org/) to parallelize Pandas
# * [RAPIDS](https://rapids.ai/): Data analysis on the GPU
#
# There is also a performance [benchmark](https://h2oai.github.io/db-benchmark/) comparing various frameworks.

# ## Prelude

# In[1]:

from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.io.common import get_handle

import polars as pl
from polars import col, lit


# In[2]:

pl.__version__


# In[3]:

pd.__version__


# In[4]:

# Download a huge CSV as test data. Takes a while and is only needed once...
big_csv = Path("./big.csv")
csv_url = "http://sdm.lbl.gov/fastbit/data/star2002-full.csv.gz"
if not big_csv.exists():
    with get_handle(csv_url, compression="gzip", mode="r") as fh_in, open(big_csv, "bw") as fh_out:
        fh_out.write(fh_in.handle.buffer.read())


# ## Eager Execution

# In[5]:

edf = pl.read_csv(str(big_csv), has_headers=False)


# In[6]:

edf.filter(col("column_1") == 1).select(["column_9"]).head()


# #### Alternatively, *Pandas* style (not recommended!)

# In[7]:

edf[edf["column_1"] == 1][["column_9"]].head()


# Why shouldn't I use the Pandas style? Because ...
#
# * it's much harder to read since it's not *operator chaining*,
# * it's more verbose if you assign actual variable names to your dataframes instead of just using `df` all the time. Check out this filtering example: `agg_metric_df[agg_metric_df["metric_1"] < 0.9]`. Using `col` to refer to a column of the current dataframe is much cleaner (see the sketch right after this list),
# * it's not possible to switch later from eager to lazy execution.
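# To make the second point concrete, here is a minimal sketch with a made-up
# `agg_metric_df` (the name, columns and values are invented purely for illustration):

# In[ ]:

agg_metric_df = pl.DataFrame({"metric_1": [0.5, 0.95, 0.8], "metric_2": [1.0, 2.0, 3.0]})

# Pandas-style subscript filtering repeats the dataframe name ...
agg_metric_df[agg_metric_df["metric_1"] < 0.9]

# ... while the expression style refers to the column via `col`
agg_metric_df.filter(col("metric_1") < 0.9)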
# ## Lazy Execution

# Just switching `read_csv` to `scan_csv` is all it takes to go from eager to lazy in this example. `collect` or `fetch` is then used to trigger the execution.

# In[8]:

ldf = pl.scan_csv(str(big_csv), has_headers=False)


# In[9]:

ldf = ldf.filter(col("column_1") == 1)
ldf.select(["column_9"]).collect().head()


# The Pandas style fails in lazy mode:

# In[10]:

ldf = pl.scan_csv(str(big_csv), has_headers=False)
ldf[ldf["column_1"] == 1][["column_9"]].head()


# ## Slicing & Indexing

# Slicing and indexing in Polars works with subscript syntax similar to Numpy, i.e. `df[1]` or `df[1, 2]`. Some simple rules apply:
#
# * indexing by a single dimension
#   * returns one or several rows if indexed by an integer, e.g. `df[42]`, `df[42:]`,
#   * returns one or several columns if indexed by a string, e.g. `df["my_col"]`, `df[["col1", "col2"]]`,
# * indexing by two dimensions
#   * returns the row(s) indexed by an integer in the first dimension and the column(s) indexed by integer or string in the second dimension, e.g. `df[69, 42]` or `df[69, "col_42"]`.
#
# For integers, slices like `1:` are also possible.

# In[11]:

edf[1]  # this is a bug right now


# In[12]:

edf[[1]]


# In[13]:

edf[1, 3]  # index by (row, column)


# In[14]:

edf[1, "column_4"]  # or with the column given as a string


# In[15]:

edf[1, [2, 3]]  # index by (row, columns) but returns a dataframe


# In[16]:

edf[1:4, "column_4":]  # slice by row and column name


# Since Pandas has an explicit index, which can be of any type, and column names that can be any immutable datatype, it has to work around several ambiguities with special accessors like `iloc`, `loc`, `at`, `iat`, etc.

# In[17]:

pdf = edf.to_pandas()


# In[18]:

pdf.iloc[1, 3]


# In[19]:

# when mixing indexing by integer and by string it gets less comprehensible in Pandas
pdf["column_4"].iloc[1]


# In[20]:

pdf.iloc[1, [2, 3]]


# In[21]:

pdf[["column_4"]].iloc[1:4]


# In[22]:

# for slicing with column names and guaranteed indexing by integer we have to write:
pdf.loc[:, "column_4":].iloc[1:4]


# `pdf.loc[1:4, "column_4":]` works only as long as the index is set up accordingly.

# ## Dealing with missing values

# In[23]:

left_df = pl.DataFrame({"a": [1, 2, 3], "b": [None, "b", "c"]})
right_df = pl.DataFrame({"a": [1, 2], "c": [42, 69]})

df = left_df.join(right_df, on="a", how="left")
df


# Note that the last element of the `c` column is `null`, not `NaN` as in Pandas, and that the datatype is still int, not automatically converted to float as in Pandas.

# In[24]:

df.filter(col("c").is_null())


# Pandas does something pretty scary here.

# In[25]:

left_pdf = left_df.to_pandas()
right_pdf = right_df.to_pandas()


# Note that the `c` column has type int:

# In[26]:

right_pdf.dtypes


# In[27]:

pdf = pd.merge(left_pdf, right_pdf, on="a", how="left")


# In[28]:

pdf


# Depending on the datatype, Pandas shows `None` or `NaN`. Also note that the column `c` was converted from `int` to `float` without our consent!

# ## New columns

# In[29]:

df.with_column((lit(3) * col("c")).alias("3*c"))
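# Expressions compose here as well. A small additional sketch: a column derived from
# two existing columns of the joined `df` above, where the `null` in `c` simply
# propagates instead of silently changing the dtype.

# In[ ]:

df.with_column((col("a") + col("c")).alias("a+c"))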
# In[30]: pdf.assign(**{"3*c": 3*pdf["c"]}) # # Column Expressions # In[31]: df = pl.DataFrame( { "nrs": [1, 2, 3, None, 5], "names": ["foo", "ham", "spam", "egg", None], "random": np.random.rand(5), "groups": ["A", "A", "B", "C", "B"], } ) df # In[32]: # and in Pandas pdf = df.to_pandas() # #### Construct a new dataframe with a sorted column and some aggregations # In[33]: df.select( [ pl.sum("nrs"), # or equivalently col("nrs").sum() col("names").sort(), col("names").n_unique().alias("unique_names_1"), ] ) # In Pandas we create a new DataFrame and reference several times `pdf` # In[34]: pd.DataFrame( { "nrs": pdf["nrs"].sum(), "names": pdf["names"].sort_values(), "unique_names_1": pdf["names"].nunique(dropna=False), } ) # #### Select certain elements from a column by filtering from another # In[35]: df.select(col("names").filter(col("random") > 0.4)) # Syntax in Pandas is way less readable # In[36]: pdf.loc[pdf["random"] > 0.4][["names"]] # Another way in Pandas is to use the query style: # In[37]: pdf.query("random > 0.4")[["names"]] # The problem with Pandas' query style is that it is basically string hacking with no checks whatsoever until the code is executed. Also reuse of certain expressions is highly limited if you have just strings. # ### Complex expressions are also possible # All expressions in Polars are *embarassingly parallel* by design and thus automatically parallelized # In[38]: df.select( [ (pl.sum("nrs") * pl.when(col("random") > 0.5) .then(0) .otherwise(col("random")) ).alias("result") ] ) # SQL-like `when`/`then`/`otherwise` statements are not possible in Pandas, thus we have to use `np.where`: # In[39]: pd.Series( np.where(pdf["random"] > 0.5, 0, pdf["random"] * pdf["nrs"].sum()), name="result" ).to_frame() # In[40]: # or easier to read but much slower since not vectorized at all pdf.random.apply(lambda x: 0 if x > 0.5 else x * pdf["nrs"].sum()).to_frame() # #### Even window expressions are possible # In[41]: df.select( [ col("*"), # select all col("random").sum().over("groups").alias("sum[random]/groups"), col("random").list().over("names").alias("random/name"), ] ) # Doing the same in Pandas is a bit more complex. Also note that there is an unexpected `NaN` in the last row. This is due to the fact that when inserting `pdf.groupby(['names'], dropna=False)['random'].apply(list)` we compare `NaN` to `NaN` which is false by definition. This is just another subtle problem caused by the fact that Pandas uses `NaN` to express `NA`. # Also note that Polars needs no explicit index like Pandas to do operations like this, just like Spark has no way to set an index explicitely. 
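# Window expressions follow the same pattern for other aggregations, and they can be
# combined with further expressions. A small sketch (the division assumes that
# arithmetic on windowed expressions is supported by the Polars version used here):

# In[ ]:

df.select(
    [
        col("*"),
        col("random").mean().over("groups").alias("mean[random]/groups"),
        (col("random") / col("random").sum().over("groups")).alias("share_of_group"),
    ]
)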
# In[42]: (pdf.set_index("groups") .assign(**{"sum[random]/groups": pdf.groupby(['groups'])['random'].sum()}) .set_index("names") .assign(**{"random/name": pdf.groupby(['names'], dropna=False)['random'].apply(list)}) .reset_index() ) # In[43]: # or alternatively using `join` which is also avoiding the NaN problem pdf.join( pdf.groupby('groups').random.sum().rename("sum[random]/groups"), on="groups" ).join( pdf.groupby('names', dropna=False).random.apply(list).rename("random/name"), on="names" ) # # GroupBy # In[44]: df = pl.read_csv("https://theunitedstates.io/congress-legislators/legislators-current.csv") pdf = df.to_pandas() # In[45]: (df.lazy() # allows for working only on a subset using limit .groupby("first_name") .agg( [ col("party").count().alias("n_party"), # renaming an aggregated column is a bliss col("gender").list(), col("last_name").first(), ] ) .sort("n_party", reverse=True) .limit(5) .collect() ) # Note how easily we can deal with lists of strings by aggregating over gender using `list()`. # In Pandas the same operation feels more like string hacking and renaming happens as a separate step having unnecessary repetitions of the column names. Everything is of course eagerly evaluated. # In[46]: (pdf.groupby("first_name") .agg({"party": "count", "gender": lambda grp: grp.to_list(), "last_name": "first"}) .rename(columns={"party": "n_party", "gender": "gender_agg_list", "last_name": "last_name_first"}) .sort_values(by="n_party", ascending=False) .reset_index() .head(5)) # #### Conditionals in aggregations # In[47]: (df.lazy() .groupby("state") .agg( [ (col("party") == "Democrat").sum().alias("demo"), (col("party") == "Republican").sum().alias("rep"), ] ) .sort("demo", reverse=True) .limit(5) .collect() ) # The translation to Pandas is "somewhat" more complicated... kudos to everyone able to solve this without looking it up on StackOverflow :-) # In[48]: (pdf.groupby("state") .agg({"party": [("demo", lambda grp: np.sum(grp == "Democrat")), ("rep", lambda grp: np.sum(grp == "Republican"))]}) .droplevel(0, axis=1) .reset_index() .sort_values(by="demo", ascending=False) .head(5) ) # #### Composition and reuse of more complex operations # In[49]: def compute_age() -> pl.Expr: # Date64 is time in ms ms_to_year = 1e3 * 3600 * 24 * 365 return ( lit(datetime(2021, 1, 1)) - col("birthday") ) / (ms_to_year) def avg_age(gender: str) -> pl.Expr: return ( compute_age() .filter(col("gender") == gender) .mean() .alias(f"avg {gender} age") ) (df.lazy() .groupby(["state"]) .agg( [ avg_age("M"), avg_age("F"), ] ) .sort("state") .limit(5) .collect() ) # Translating this to Pandas is really hard since we have no way to refer to a column. Also with Pandas' `agg` we have only access to the aggregation column and cannot filter by another, thus we have to use `apply`. # In[50]: def p_compute_age(grp: pd.DataFrame): # Date64 is time in ms s_to_year = 3600 * 24 * 365 return ( (datetime(2021, 1, 1) - grp["birthday"]).dt.total_seconds() ) / (s_to_year) def p_avg_age(grp: pd.DataFrame, gender: str): age = p_compute_age(grp) mean_age = age[grp["gender"] == gender].mean() return pd.Series([mean_age], index=[f"avg {gender} age"]) (pdf.groupby("state") .apply( lambda grp: pd.concat( [p_avg_age(grp, gender="M"), p_avg_age(grp, gender="F")] ) ) .reset_index() .sort_values(by="state") .head(5) ) # The same code in Pandas just feels not as clean as in Polars, thus showing nicely the power that comes with Polars' composable expressions. 
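# Since `avg_age` returns a plain expression, it can be reused in other contexts as
# well, e.g. for the overall averages without any grouping. This is only a sketch and
# assumes that expression-level `filter` also works in a plain `select` context in the
# Polars version used here:

# In[ ]:

(df.lazy()
 .select([avg_age("M"), avg_age("F")])
 .collect()
)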
# ## User-Defined (Aggregation) Functions

# In[51]:

df = pl.DataFrame(
    {
        "foo": np.arange(10),
        "bar": np.random.rand(10),
        "cls": np.random.randint(2, size=10),
    }
)
pdf = df.to_pandas()


# In[52]:

df


# #### Vector Operations

# Use `map` for vector operations on a whole column, i.e. Series -> Series:

# In[53]:

def my_custom_func(s: pl.Series) -> pl.Series:
    return np.exp(s) / np.log(s)


# In[54]:

df.filter(col("bar").map(my_custom_func, return_dtype=pl.Float64) > -1)


# Use `apply` for operations on the level of a single cell or group that return a scalar:

# In[55]:

df.select(col("bar").apply(lambda x: 3 * x))


# In Pandas, `apply` and `applymap` work quite similarly:

# In[56]:

pdf[pdf["bar"].apply(my_custom_func) > -1]


# In[57]:

pdf[["bar"]].applymap(lambda x: 3 * x)


# #### Aggregation Operations

# In[58]:

df.groupby(["cls"]).agg([col("bar").apply(lambda grp: 3 * grp.sum())])


# Quite analogous in Pandas, but of course you need to fight the multi-index:

# In[59]:

pdf.groupby("cls", as_index=False).agg({"bar": lambda grp: 3 * grp.sum()})
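# For a simple aggregation like this, no UDF is actually needed in Polars: built-in
# expressions express the same logic and stay within the optimized, parallelized
# expression engine. A small sketch using only constructs shown above:

# In[ ]:

df.groupby(["cls"]).agg([(lit(3) * col("bar").sum()).alias("3*sum(bar)")])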