#!/usr/bin/env python # coding: utf-8 # # Using `perspective.Table` # # This notebook provides an overview of `perspective.Table`, Perspective's core component that allows for lightning-fast data loading, query, update, and transformation. # # Tables can be used alone to manage datasets, or connected for data to flow quickly between multiple tables. Outside of a Jupyter context, Perspective tables can be used to create efficient [servers](https://github.com/finos/perspective/tree/master/examples/tornado-python) which allow data to be hosted and viewed by clients in the browser using Perspective's Javascript library. # In[ ]: from perspective import Table from datetime import date, datetime import numpy as np import pandas as pd import requests # ### Supported Data Formats # # Perspective supports 6 core data types: `int`, `float`, `str`, `bool`, `datetime`, and `date`, and several data formats: # In[ ]: # Pandas DataFrames df = pd.DataFrame({ "a": np.arange(0, 2), "b": np.array(["a", "b"], dtype=object), "nullable": [1.5, np.nan], # perspective handles `None` and `np.nan` values "mixed": [None, 1] }) # Column-oriented data = { "int": [i for i in range(4)], "float": [i * 1.25 for i in range(4)], "str": ["a", "b", "c", "d"], "bool": [True, False, True, False], "date": [date.today() for i in range(4)], "datetime": [datetime.now() for i in range(4)] } # Row-oriented rows = [{"a": 1, "b": True}, {"a": 2, "b": False}] # CSV strings csv = df.to_csv() # ### Schemas # # To explicitly specify data types for columns, create a schema (a `dict` of `str` column names to data types): # In[ ]: schema = { "int": float, "float": int, "str": str, "bool": bool, "date": datetime, "datetime": datetime } # ### Creating a Table # # A Table can be created by passing in a dataset or a schema, like the ones created above: # In[ ]: # From a dataset table = Table(data) # Or a dataframe df_table = Table(df) # Or a CSV csv_table = Table(csv) # tables can be created from schema table2 = Table(schema) assert table2.size() == 0 # constructing a table with an index, which is a column name to be used as the primary key indexed = Table(data, index="str") # or a limit, which is a total cap on the number of rows in the table - updates past `limit` overwite at row 0 limited = Table(data, limit=2) # ### Using the Table # # A Table has several queryable properties: # In[ ]: # schema() returns a mapping of column names to data types display("Table schema:", table.schema()) # size() returns the number of rows in the table display("Table has {} rows".format(table.size())) # columns() returns a List of the table's column names display("Table columns:", table.columns()) # ### Updating with new data # # To update or stream new data into the Table, call `table.update(data)`: # In[ ]: # you can update all columns table.update(data) print("after update:", table.size()) # or however many you'd like table.update({ "int": [5, 6, 7], "str": ["x", "y", "z"] }) # but you cannot add new columns through updating - create a new Table instead try: table.update({ "abcd": [1] }) except: pass # updates on unindexed tables always append print("after append:", table.size()) # updates on indexed tables should include the primary key - the new data overwrites at the row specified by the primary key indexed.update([{"str": "b", "int": 100}]) print("after indexed partial update:", indexed.size()) # without a primary key, the update appends to the end of the dataset indexed.update([{"int": 101}]) print("after indexed append:", indexed.size()) # # Queries and 
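# The `limit` behavior described above can be checked directly. The cell below
# is a minimal sketch, not part of the original walkthrough: `limited` was
# created with `limit=2` from a 4-row dataset, so only 2 rows are retained,
# and further updates overwrite from row 0 rather than growing the table.

# In[ ]:


assert limited.size() == 2

# Updating past the limit overwrites existing rows instead of appending
limited.update({"int": [99], "str": ["overwritten"]})
assert limited.size() == 2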
# # Queries and transformations using `View`
#
# `table.view()` allows you to apply pivots, aggregates, sorts, filters, column selections, and expression computations to the Table, and to return the results in a variety of output formats.
#
# To create a view, call the `view()` method on an existing table:

# In[ ]:


# A view with zero transformations returns the dataset as it was passed in
view = table.view()

# View metadata
print("View has {} rows and {} columns".format(view.num_rows(), view.num_columns()))


# ### Applying transformations
#
# To apply transformations, pass the relevant `kwargs` into the constructor:

# In[ ]:


# Group and split the underlying dataset
pivoted = table.view(row_pivots=["int"], column_pivots=["str"])

# Specify aggregations for individual columns
aggregated = table.view(row_pivots=["int"], aggregates={"float": "avg"})

# Show only the columns you're interested in
subset = table.view(columns=["float"])

# Sort on a specific column, or on multiple columns
sorted_view = table.view(sort=[["str", "desc"], ["int", "asc"]])

# Filter the dataset on a specific value
filtered = table.view(filter=[["int", ">", 2]])

# Calculate arbitrary expressions over the dataset
expressions = table.view(expressions=['"int" + "float" / 100'])


# ### Serializing Data
#
# Views are used to serialize data to the user in several formats:
#
# - `to_records`: outputs a list of dictionaries, each of which is a single row
# - `to_dict`: outputs a dictionary of lists, where each key is a column name
# - `to_numpy`: outputs a dictionary of numpy arrays
# - `to_df`: outputs a `pandas.DataFrame`
# - `to_arrow`: outputs an Apache Arrow binary, which can be passed into another `perspective.Table` to create a copy of the first table (see the sketch at the end of this section)

# In[ ]:


rows = view.to_records()
columnar = view.to_dict()
np_out = view.to_numpy()
df_out = view.to_df()
arrow_out = view.to_arrow()


# Data from pivoted or otherwise transformed views reflects the state of the transformed dataset:

# In[ ]:


filtered_df = filtered.to_df()
filtered_df


# If the table is updated with new data, its views are automatically notified of the updates:

# In[ ]:


v1 = table.view()
print("v1 has {} rows and {} columns".format(v1.num_rows(), v1.num_columns()))


# In[ ]:


table.update({"int": [100, 200, 300, 400]})


# In[ ]:


print("v1 has {} rows and {} columns".format(v1.num_rows(), v1.num_columns()))
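# As noted in the serialization list above, the Arrow binary produced by
# `to_arrow` can round-trip into a new `Table`. The cell below is a minimal
# sketch (the `copy` name is illustrative, not from the original notebook):

# In[ ]:


# Serialize the untransformed view to Arrow, then construct a copy of the table
copy = Table(view.to_arrow())
assert copy.schema() == table.schema()
assert copy.size() == view.num_rows()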
# # Using callbacks to connect `Table` instances
#
# Custom callback functions can be registered on `Table` and `View` instances.
#
# The most useful is `View.on_update`, which triggers a callback after the Table has been updated:

# In[ ]:


# With `mode="row"`, the `delta` argument is an Arrow binary containing the updated rows
def callback(port_id, delta):
    new_table = Table(delta)
    display(new_table.view().to_dict())

table = Table(data)
view = table.view()

# Register the callback with `mode="row"` to receive the updated data in each call
view.on_update(callback, mode="row")

# Updating the table triggers the callback
table.update({
    "int": [1, 3],
    "str": ["abc", "def"]
})


# Because the callback is triggered with a _copy_ of the updated data, `on_update` allows you to connect together multiple tables that all share state quickly and dependably:

# In[ ]:


# Create a table and a view
t1 = Table(data)
v1 = t1.view()

# And a new table that feeds from `t1`
t2 = Table(t1.schema())

# And a callback that updates `t2` whenever `t1` updates
def cb(port_id, delta):
    t2.update(delta)

# Register the callback
v1.on_update(cb, mode="row")

# Update t1, which updates t2 automatically
t1.update(data)

# t2 now has data after t1 is updated
t2.view().to_df()
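# Finally, a minimal sketch of the server pattern linked in the introduction,
# following the tornado-python example: a `PerspectiveManager` hosts a table,
# and a `PerspectiveTornadoHandler` serves it to browser clients over a
# websocket. The names below assume the same perspective-python API generation
# as this notebook; consult the linked example for your installed version.

# In[ ]:


# Commented out so the notebook doesn't block on the Tornado event loop:
#
# import tornado.ioloop
# import tornado.web
# from perspective import PerspectiveManager, PerspectiveTornadoHandler
#
# manager = PerspectiveManager()
# manager.host_table("data_source", Table(data))  # expose the table by name
#
# app = tornado.web.Application([
#     (r"/websocket", PerspectiveTornadoHandler, {"manager": manager, "check_origin": True})
# ])
# app.listen(8080)
# tornado.ioloop.IOLoop.current().start()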