#!/usr/bin/env python
# coding: utf-8

# # 10 minutes to polars
# 
# This is a short introduction to Polars to get you started with the basic concepts of data wrangling. It is very much influenced by [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html).
# 
# We start by importing Polars. If you run this for the first time, get a coffee. This will take a while.

# In[2]:

use polars::prelude::*;
#[macro_use] extern crate polars;

# # Object creation
# 
# Creating a `Series` by passing a list of nullable values. Note that we use `Option<T>` to describe missing values.

# In[3]:

Series::new(
    "some_values with ones",
    &[Some(1), Some(3), Some(5), None, Some(6), Some(8)]
)

# If we don't have any missing values, we can just pass a slice of `T`.

# In[4]:

Series::new(
    "some_non_null_values",
    &[1, 3, 5, 7, 6, 8]
)

# A `Series` is actually an enum wrapping differently typed `ChunkedArray`s.
# You can think of a `ChunkedArray` as an array with a known type. Every `ChunkedArray` has a type alias that makes it more convenient to use.
# 
# Some examples are:
# 
# | Type                        | Alias            |
# |-----------------------------|------------------|
# | `ChunkedArray<Float64Type>` | `Float64Chunked` |
# | `ChunkedArray<UInt32Type>`  | `UInt32Chunked`  |
# | `ChunkedArray<BooleanType>` | `BooleanChunked` |
# | `ChunkedArray<Utf8Type>`    | `Utf8Chunked`    |
# 
# See all available data types [here](https://ritchie46.github.io/polars/polars/datatypes/index.html).
# 
# Create a `ChunkedArray` with null values:

# In[5]:

Int64Chunked::new_from_opt_slice("nullable", &[None, Some(1), Some(2)])

# Or create a `ChunkedArray` without null values:

# In[6]:

Int64Chunked::new_from_slice("non-nullable", &[1, 2, 3])

# Converting from a `Series` to a `ChunkedArray` can be done by specifying the type.

# In[7]:

let s = Series::new("values", &[1, 2, 3]);
s.i32()

# This will return an `Err` if you specify the wrong type.

# In[8]:

s.i64()

# But we can cast a `Series` to the proper type and then unpack.

# In[9]:

s.cast::<Int64Type>().unwrap().i64()

# Below we use pattern matching to check if the cast was successful. Note that cloning a `ChunkedArray` or a `Series` is very cheap, as the underlying data is wrapped by an `Arc`.

# In[10]:

let ca = match s.i64() {
    Err(_) => {
        s.cast::<Int64Type>()
            .unwrap()
            .i64()
            .map(|ca| ca.clone())
            .unwrap()
    },
    Ok(ca) => ca.clone()
};
ca

# Converting from a `ChunkedArray` to a `Series`:

# In[11]:

ca.into_series()

# A `DataFrame` is created from a `Vec` of `Series`.

# In[12]:

let dates = &[
    "2020-08-21",
    "2020-08-21",
    "2020-08-22",
    "2020-08-23",
    "2020-08-22",
];
let fmt = "%Y-%m-%d";

let s0 = Date32Chunked::parse_from_str_slice("dates", dates, fmt).into();
let s1 = Series::new("n", &[1, 2, 3, 4, 5]);
let s2 = Utf8Chunked::full("foos", "foo", 5).into();

let df = DataFrame::new(vec![s0, s1, s2]).expect("something went wrong");
df

# The columns of the resulting `DataFrame` have different data types.

# In[13]:

df.dtypes()
    .iter()
    .zip(df.columns().iter())
    .for_each(|(dtype, name)| println!("Column: '{}',\t dtype: {:?}", name, dtype))

# In[14]:

df.columns()
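# To close this section, here is a small recap sketch that only reuses the constructors shown above. All column names are made up for illustration, and the float column assumes that `Series::new` also accepts a slice of `f64` values.

# In[ ]:

// Recap sketch (illustrative names): build columns with the constructors
// shown above and assemble them into a DataFrame.
let ints = Int64Chunked::new_from_opt_slice("ints", &[Some(1), None, Some(3)]).into_series();
let floats = Series::new("floats", &[1.0, 2.5, 3.0]);
let labels = Utf8Chunked::full("labels", "a", 3).into_series();

// DataFrame::new fails if the columns do not have equal lengths.
DataFrame::new(vec![ints, floats, labels]).expect("columns should have equal length")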
# # Viewing data
# 
# Here is how to view the top and bottom rows of a `DataFrame`.

# In[15]:

df.head(Some(3))

# In[16]:

df.tail(Some(3))

# Sorting by a column:

# In[17]:

let reverse = true;
df.sort("dates", reverse).expect("column not sortable")

# # Selection
# 
# Selecting a single column, which yields a `Result`:

# In[18]:

df.column("dates")
    .expect("column does not exist")

# Selecting one or multiple columns, which also yields a `Result`:

# In[19]:

df.select("dates")
    .expect("column does not exist")

# In[20]:

df.select(&["dates", "n"])
    .expect("column does not exist")

# A `DataFrame` can also be sliced into a subset of the DataFrame.

# In[21]:

let offset = 2;
let length = 2;

df.slice(offset, length)
    .expect("slice was not within bounds")

# Select a column by index:

# In[22]:

df.select_at_idx(1)
    .expect("column was not within bounds")

# # Boolean indexing
# 
# Boolean indexes can be used to filter data. Note that this also works on `Series` and `ChunkedArray`. We also use the `as_result!` macro. This utility expects a block that returns a `Result`, which makes it convenient to use the `?` operator.

# In[23]:

as_result!({
    // select the "n" column
    let n_s = df.column("n")?;
    let mask = n_s.gt(2);
    // filter values > 2
    df.filter(&mask)
}).unwrap()

# Filter all values in the "n" column greater than 2 and smaller than 5:

# In[24]:

as_result!({
    // select the "n" column
    let n_s = df.column("n")?;
    // create the boolean mask
    let mask = (n_s.gt(2) & n_s.lt(5))?;
    // filter values > 2 and < 5
    df.filter(&mask)
}).unwrap()

# For all the comparison methods available on `Series` and `ChunkedArray`s, check the [ChunkCompare trait](https://ritchie46.github.io/polars/polars/chunked_array/ops/trait.ChunkCompare.html).

# # Setting
# 
# Setting a new column can be done with the `hstack` operation. This operation adds new columns to the existing `DataFrame`.

# In[25]:

let mut df = df;
let s = Series::new("days", &["mo", "tue", "wed", "thu", "fri"]);
df.hstack(&[s]).unwrap()

# It isn't possible to get mutable access to the columns of a `DataFrame`, because this would make it possible to invalidate the `DataFrame` (for instance by replacing a column with a `Series` of a different length).
# 
# Luckily there are other ways to mutate a `DataFrame`. We could, for instance, replace a column in the `DataFrame`:

# In[26]:

let s = Utf8Chunked::full("bars", "bar", 5);
df.replace("foos", s).unwrap()

# Or, if we want to use the column we're replacing to determine the new column's values, we can use the `apply` method and pass a closure that creates the new column.
# 
# Below we use this to compute `n + 1`:

# In[27]:

df.apply("n", |s| s + 1).unwrap()

# Both the `replace` and the `apply` methods exist for selection by index:
# * `replace_at_idx`
# * `apply_at_idx`

# In[28]:

df.apply_at_idx(1, |s| s * 2)
    .unwrap()

# Or we can combine `may_apply` with a boolean mask; below, the values that do not satisfy the condition are replaced:

# In[29]:

as_result!({
    let mask = (df.column("n")?.gt(4) & df.column("n")?.lt(10))?;

    df.may_apply("foos", |s| {
        s.utf8()?
            .set(&!mask, Some("not_within_bounds"))
    })
}).unwrap()

# # Iterators
# 
# Every `ChunkedArray` implements the [IntoIterator trait](https://doc.rust-lang.org/std/iter/trait.IntoIterator.html), which gives us all the powerful methods available for iterators.

# In[30]:

as_result!({
    let s = Series::new("a", [1, 2, 3, 4, 5]);
    let v = s.i32()?
        .into_iter()
        .sum::<Option<i32>>();
    println!("{:?}", v);
    Ok(())
})
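# Because these are ordinary Rust iterators, all the usual adaptors are available. A small sketch, assuming that numeric `ChunkedArray`s can also be collected from an iterator of `Option` values, just like the `Utf8Chunked` in the next cell:

# In[ ]:

as_result!({
    let s = Series::new("a", [1, 2, 3, 4, 5]);
    // keep only the even values and collect them into a new ChunkedArray
    let evens: Int32Chunked = s.i32()?
        .into_iter()
        .filter(|opt_v| opt_v.map(|v| v % 2 == 0).unwrap_or(false))
        .collect();
    println!("{:?}", evens);
    Ok(())
})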
# In[31]:

as_result!({
    // adds "ay" to every word
    fn to_pig_latin(opt_val: Option<&str>) -> Option<String> {
        opt_val.map(|val| format!("{}_ay", val))
    }

    // may_apply takes a closure that may fail
    df.may_apply("days", |s| {
        let ca: Utf8Chunked = s.utf8()?
            .into_iter()
            .map(to_pig_latin)
            .collect();
        Ok(ca)
    });
    Ok(df.clone())
}).unwrap()

# # Concat
# 
# Polars provides various facilities for easily combining `DataFrame`s and `Series`.
# 
# We can concatenate a `DataFrame` with `hstack`:

# In[32]:

{
    let mut df1 = df.clone();
    df1.hstack(df.get_columns());
    println!("{:?}", df1);
};

# Or append the rows of a second DataFrame:

# In[33]:

{
    let mut df1 = df.clone();
    df1.vstack(&df);
    println!("{:?}", df1);
};

# # Join
# 
# SQL-style joins.

# In[34]:

as_result!({
    let left = DataFrame::new(vec![
        Series::new("key", &["foo", "foo"]),
        Series::new("lval", &[1, 2]),
    ])?;
    let right = DataFrame::new(vec![
        Series::new("key", &["foo", "foo"]),
        Series::new("rval", &[4, 5]),
    ])?;

    println!("{:?}", left);
    println!("{:?}", right);

    left.inner_join(&right, "key", "key")
}).unwrap()

# Another example:

# In[35]:

as_result!({
    let left = DataFrame::new(vec![
        Series::new("key", &["foo", "bar"]),
        Series::new("lval", &[1, 2]),
    ])?;
    let right = DataFrame::new(vec![
        Series::new("key", &["foo", "bar"]),
        Series::new("rval", &[4, 5]),
    ])?;

    println!("{:?}", left);
    println!("{:?}", right);

    left.inner_join(&right, "key", "key")
}).unwrap()

# # Grouping
# 
# By "group by" we are referring to a process involving one or more of the following steps:
# * **Splitting** the data into groups based on some criteria
# * **Applying** a function to each group independently
# * **Combining** the results into a data structure

# In[44]:

let df = DataFrame::new(vec![
    Series::new("A", &["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]),
    Series::new("B", &["one", "one", "two", "three", "two", "two", "one", "three"]),
    Int32Chunked::full("C", 1, 8).into(),
    Series::new("D", &[1, 2, 3, 4, 5, 6, 7, 8])
]).unwrap();
df

# Grouping and then applying the `sum()` method to the resulting groups:

# In[45]:

as_result!({
    (&df).groupby("A")?.select("C").sum()
}).unwrap()

# In[46]:

as_result!({
    (&df).groupby(&["A", "B"])?.select("C").sum()
}).unwrap()

# # Pivot tables
# 
# Pivots create a summary table by applying a groupby and defining a pivot column and the values to aggregate.

# In[58]:

let s0 = Series::new("A", &["one", "one", "two", "three", "one", "one",
                            "two", "three", "one", "one", "two", "three"]);
let s1 = Series::new("B", &["A", "B", "C", "A", "B", "C",
                            "A", "B", "C", "A", "B", "C"]);
let s2 = Series::new("C", &["foo", "foo", "foo", "bar", "bar", "bar",
                            "foo", "foo", "foo", "bar", "bar", "bar"]);
let s3 = Series::new("E", &((0..12).collect::<Vec<i32>>()));

let df = DataFrame::new(vec![s0, s1, s2, s3]).unwrap();
df

# In[61]:

as_result!({
    (&df).groupby(&["A"])?.pivot("C", "E").sum()
}).unwrap()

# In[59]:

as_result!({
    (&df).groupby(&["A", "B"])?.pivot("C", "E").sum()
}).unwrap()
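# As a closing sketch, the operations above can be chained. The example below only reuses methods already shown (`column`, `gt`, `filter`, `groupby`, `select`, `sum`); the cut-off `E > 5` is arbitrary.

# In[ ]:

as_result!({
    // filter rows where "E" > 5, then group by "A" and sum "E"
    let mask = df.column("E")?.gt(5);
    let filtered = df.filter(&mask)?;
    (&filtered).groupby("A")?.select("E").sum()
}).unwrap()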