#!/usr/bin/env python
# coding: utf-8

# In[49]:


get_ipython().system('pip install --upgrade polars')
get_ipython().system('pip install tqdm pandas matplotlib')


# # 10 minutes to py-polars
# This is a short introduction to Polars to get you started with the basic concepts of data wrangling. It is very much influenced by [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html).
#
# py-polars provides the Python bindings to Polars. It currently supports only a subset of the datatypes and operations supported by Polars,
# but it should be enough to give your slow pipelines a boost.

# In[2]:


import polars as pl
import numpy as np
np.random.seed(1)


# # Object creation
#
# Creating a `Series` by passing a list or array of values.

# In[3]:


pl.Series("a", [1, 2, 3])


# A `Series` can also have nullable values.

# In[4]:


s = pl.Series("with nullable values", [1, None, 3], nullable=True)
s


# Series have a data type and can be cast.

# In[5]:


print(s.dtype)
s.cast_f32()


# A `DataFrame` can be created by passing a dictionary with column names as keys and
# lists as values.

# In[6]:


df = pl.DataFrame({
    "foo": np.random.rand(10),
    "bar": np.arange(10),
    "ham": ["h"] * 3 + ["a"] * 3 + ["m"] * 4
})
df.head(3)


# The columns of the resulting `DataFrame` have different types and names.

# In[7]:


print(df.dtypes)
print(df.columns)


# # Viewing data
#
# We can view the top and bottom rows of a `DataFrame`.

# In[8]:


df.head(3)


# In[9]:


df.tail(3)


# We can sort by column.

# In[10]:


df.sort("foo", reverse=True).head(5)


# # Selection
# We can select a single column, which returns a `Series`.

# In[11]:


df["foo"].head(3)


# Or select a column by index.

# In[12]:


df[0]


# When we select in two dimensions, we select in row, column order.
# Here we slice up to the third row of the first column.

# In[13]:


df[:3, 0]


# Or we can slice the whole `DataFrame` into a smaller sub-`DataFrame`.

# In[14]:


df[:4]


# Or we slice both rows and columns.

# In[15]:


df[3:5, 1:]


# # Boolean indexing
# Boolean indexes can be used to filter data.

# In[16]:


df[df["foo"] > 0.5]


# In[17]:


df[df["ham"] == "a"]


# # Setting
# Adding a new column to the `DataFrame` can be done with `hstack`.

# In[18]:


df["new"] = np.arange(10)
df


# We can also set a column at a given location by index.

# In[19]:


df[0] = pl.Series("new_foo", np.random.rand(10))
df.head(3)


# Or use a boolean mask to assign new values.
#
# _Note that every mutable assignment allocates new memory, so it isn't actually mutation of the underlying memory. This is a performance trade-off. Due to the immutable memory, slices, clones, and subsets of `Series`/`DataFrames` are zero copy. If you need to mutate a lot of values, it's faster to do this in numpy and allocate a new `Series`._

# In[20]:


# selection order is row, column
df[df["new_foo"] > 0.5, "new_foo"] = 1


# We can also define the mutation location by passing an array of indexes.

# In[21]:


s = df["ham"]
s[[1, 2, 3]] = "c"
s
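# _Following the note above, a minimal sketch of that numpy round-trip (illustrative only; the values here are freshly generated rather than read back out of the `DataFrame`):_

# In[ ]:


# do the bulk mutation in numpy, in place...
values = np.random.rand(10)
values[values > 0.5] = 1.0
# ...then allocate a single new Series back into the DataFrame
df[0] = pl.Series("new_foo", values)
df.head(3)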
# # Concat
# Polars provides methods to combine multiple `DataFrames` and `Series`.
# We can concatenate a `DataFrame` with `hstack`.

# In[22]:


# clones are super cheap!
df1 = df.clone()
df1.hstack(df.get_columns())
df1.head()


# Or append rows from another `DataFrame`.

# In[23]:


df1 = df.clone()
df1.vstack(df)
print(df.height, df1.height)


# # Joins
# SQL-style joins.

# In[24]:


left = pl.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pl.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left.join(right, left_on="key", right_on="key", how="inner")


# Another example:

# In[25]:


left = pl.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pl.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
left.join(right, left_on="key", right_on="key", how="inner")


# # Grouping
#
# By "group by" we are referring to a process involving one or more of the following steps:
# * **Splitting** the data into groups based on some criteria
# * **Applying** a function to each group independently
# * **Combining** the results into a data structure

# In[26]:


df = pl.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df


# In[27]:


df.groupby("A").select(["C", "D"]).sum()


# In[28]:


df.groupby(["A", "B"]).select_all().first()


# In[29]:


df.groupby("A").select("C").quantile(0.2)


# # Pivot tables
#
# Pivots create a summary table by applying a groupby and defining a pivot column and a values column to aggregate.

# In[30]:


df = pl.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})


# In[31]:


df.groupby("A").pivot(pivot_column="C", values_column="E").sum()


# In[32]:


pivoted = df.groupby(["A", "B"]).pivot(pivot_column="C", values_column="E").sum()
pivoted


# # Numpy interop
#
# Polars has zero-cost interaction with numpy's [ufunc](https://numpy.org/doc/stable/reference/ufuncs.html) functionality. This means that if a function/method isn't supported by Polars, we can use numpy's without any overhead. Numpy will write the output to Polars/Arrow memory, and the null bitmask will keep null information.

# In[33]:


s = pivoted["bar"]
np.cos(s)


# In[34]:


np.exp(s)
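# _To see the null bitmask survive a ufunc, here is a small illustrative cell reusing the nullable constructor from earlier; the null entry should stay null in the output:_

# In[ ]:


nullable = pl.Series("nullable", [1.0, None, 4.0], nullable=True)
np.sqrt(nullable)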
# # Clones
# As mentioned previously, clones are super cheap. This is due to the fact that the underlying memory backing Polars data is immutable. Slices and clones can be made with the guarantee that they will never be modified.
#
# Below we observe that cloning an array of 1e6 elements is almost 850x faster than numpy's copy. The cost of cloning a Polars Series is constant and doesn't increase with memory size; ideal for writing pure functions. The cost of cloning a DataFrame 10x the size is also very small.

# In[35]:


a = np.arange(int(1e6))


# In[36]:


get_ipython().run_cell_magic('timeit', '', 'np.copy(a)\n')


# In[37]:


s = pl.Series("a", a)


# In[38]:


get_ipython().run_cell_magic('timeit', '', 's.clone()\n')


# In[39]:


df = pl.DataFrame({f"a_{i}": s.clone() for i in range(10)})
print(df.shape)
df.head(3)


# In[40]:


get_ipython().run_cell_magic('timeit', '', 'df.clone()\n')


# # Performance
# Let's compare join and groupby performance with pandas.

# In[52]:


import pandas as pd
import tqdm
from pandas.util.testing import rands
import time
import matplotlib.pyplot as plt


# In[42]:


def create_join_dfs(N=10_000):
    left_pivot = int(0.8 * N)
    right_pivot = N - left_pivot
    indices = np.array([rands(10) for _ in range(N)], dtype="O")
    indices2 = np.array([rands(10) for _ in range(N)], dtype="O")
    key = np.tile(indices[:left_pivot], 10)
    key2 = np.tile(indices2[:left_pivot], 10)
    left = pd.DataFrame({"key": key, "key2": key2, "value": np.random.randn(len(key))})
    right = pd.DataFrame(
        {"key": indices[right_pivot:], "key2": indices2[right_pivot:], "value2": np.random.randn(left_pivot)}
    )
    return left, right

left, right = create_join_dfs()


# In[43]:


def time_lambda(f: "fn() -> ()") -> float:
    """Mean evaluation time in ms over 10 runs."""
    t0 = time.time_ns()
    for _ in range(10):
        f()
    return (time.time_ns() - t0) / 10 / 1e6

time_lambda(lambda: 1)


# In[53]:


left_pl = []
left_pd = []
inner_pd = []
inner_pl = []
outer_pd = []
outer_pl = []
par_inner = []
par_left = []
n_proxy = []
for N in tqdm.tqdm([10, 30, 40, 50, 70, 100, 500, 1000, 2000]):
    N *= 1000
    left, right = create_join_dfs(N)
    f_left_pd = lambda: left.merge(right, on="key", how="left")
    f_inner_pd = lambda: left.merge(right, on="key", how="inner")
    f_outer_pd = lambda: left.merge(right, on="key", how="outer")
    pd_left_t = time_lambda(f_left_pd)
    pd_inner_t = time_lambda(f_inner_pd)
    pd_outer_t = time_lambda(f_outer_pd)

    # create polars dfs
    left = pl.DataFrame(left.to_dict(orient="list"))
    right = pl.DataFrame(right.to_dict(orient="list"))

    f_left_pl = lambda: left.join(right, left_on="key", right_on="key", how="left")
    f_inner_pl = lambda: left.join(right, left_on="key", right_on="key", how="inner")
    f_outer_pl = lambda: left.join(right, left_on="key", right_on="key", how="outer")
    pl_left_t = time_lambda(f_left_pl)
    pl_inner_t = time_lambda(f_inner_pl)
    pl_outer_t = time_lambda(f_outer_pl)

    f_left_par = lambda: left.join(right, left_on="key", right_on="key", how="left", parallel=True)
    f_inner_par = lambda: left.join(right, left_on="key", right_on="key", how="inner", parallel=True)
    par_left_t = time_lambda(f_left_par)
    par_inner_t = time_lambda(f_inner_par)

    # pandas
    left_pd.append(pd_left_t)
    inner_pd.append(pd_inner_t)
    outer_pd.append(pd_outer_t)
    # polars
    left_pl.append(pl_left_t)
    inner_pl.append(pl_inner_t)
    outer_pl.append(pl_outer_t)
    # parallel polars
    par_left.append(par_left_t)
    par_inner.append(par_inner_t)
    n_proxy.append(N * 0.8)
    del left
    del right


# In[79]:


def make_fig(how):
    plt.figure(figsize=(18, 6))
    if how == "left":
        plt.plot(n_proxy, left_pd, label="pandas left")
        plt.plot(n_proxy, left_pl, label="polars left")
        plt.plot(n_proxy, par_left, label="polars left parallel")
    elif how == "outer":
        plt.plot(n_proxy, outer_pl, label="polars outer")
        plt.plot(n_proxy, outer_pd, label="pandas outer")
    else:
        plt.plot(n_proxy, inner_pl, label="polars inner")
        plt.plot(n_proxy, inner_pd, label="pandas inner")
        plt.plot(n_proxy, par_inner, label="polars inner parallel")


# In[80]:


for how in ["inner", "left", "outer"]:
    make_fig(how)
    plt.legend()
    plt.show()


# In[85]:


speedup_left = np.array(left_pd) / np.array(left_pl)
speedup_inner = np.array(inner_pd) / np.array(inner_pl)
speedup_outer = np.array(outer_pd) / np.array(outer_pl)
speedup_left_par = np.array(left_pd) / np.array(par_left)
speedup_inner_par = np.array(inner_pd) / np.array(par_inner)
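# _As an added sanity check before tabulating (not in the original benchmark): print the mean pandas/polars timing ratio per strategy; values above 1 mean polars was faster._

# In[ ]:


for name, ratio in [("left", speedup_left), ("inner", speedup_inner),
                    ("outer", speedup_outer), ("left par", speedup_left_par),
                    ("inner par", speedup_inner_par)]:
    print(f"{name}: {ratio.mean():.2f}x")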
# In[96]:


df = pd.DataFrame({
    "n_rows": n_proxy,
    "speedup_left": speedup_left,
    "speedup_inner": speedup_inner,
    "speedup_outer": speedup_outer,
    "speedup_left_par": speedup_left_par,
    "speedup_inner_par": speedup_inner_par,
})
df = df.set_index("n_rows")
df.plot.bar(figsize=(17, 8), ylim=(1, 3))


# In[100]:


def create_gb_df(size):
    str_groups = np.array(list("0123456789"))
    groups = np.arange(10)
    g = np.random.choice(groups, size)
    sg = np.random.choice(str_groups, size)
    v = np.random.randn(size)
    return pd.DataFrame({"groups": g, "values": v, "str": sg})


# In[115]:


gb_pd = []
gb_pl = []
x = []
for size in tqdm.tqdm((1e2, 1e3, 1e4)):
    size = int(1000 * size)
    x.append(size)
    df = create_gb_df(size)
    t = time_lambda(lambda: df.groupby("groups")["str"].count())
    gb_pd.append(t)
    df = pl.DataFrame(df.to_dict(orient="list"))
    t = time_lambda(lambda: df.groupby("groups").select("str").count())
    gb_pl.append(t)


# In[116]:


df = pd.DataFrame({
    "dataset_size": x,
    "speedup": np.array(gb_pd) / np.array(gb_pl),
})
df = df.set_index("dataset_size")
df.plot.bar(figsize=(17, 8))


# In[ ]:
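# _The benchmarks above convert pandas frames with `to_dict(orient="list")`; as a closing sketch, a small helper (hypothetical, not part of the polars API) makes that conversion reusable:_

# In[ ]:


def pandas_to_polars(pandas_df):
    # same conversion used in the benchmark cells: column name -> list of values
    return pl.DataFrame(pandas_df.to_dict(orient="list"))

pandas_to_polars(create_gb_df(1_000)).head(3)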