#!/usr/bin/env python # coding: utf-8 # # Using `perspective.Table` # # This notebook provides an overview of `perspective.Table`, Perspective's core component that allows for lightning-fast data loading, query, update, and transformation. # # Tables can be used alone to manage datasets, or connected for data to flow quickly between multiple tables. Outside of a Jupyter context, Perspective tables can be used to create efficient [servers](https://github.com/finos/perspective/tree/master/examples/tornado-python) which allow data to be hosted and viewed by clients in the browser using Perspective's Javascript library. # In[ ]: from perspective import Table from datetime import date, datetime import numpy as np import pandas as pd import requests # ### Supported Data Formats # # Perspective supports 6 core data types: `int`, `float`, `str`, `bool`, `datetime`, and `date`, and several data formats: # In[ ]: # Pandas DataFrames df = pd.DataFrame({ "a": np.arange(0, 2), "b": np.array(["a", "b"], dtype=object), "nullable": [1.5, np.nan], # perspective handles `None` and `np.nan` values "mixed": [None, 1] }) # Column-oriented data = { "int": [i for i in range(4)], "float": [i * 1.25 for i in range(4)], "str": ["a", "b", "c", "d"], "bool": [True, False, True, False], "date": [date.today() for i in range(4)], "datetime": [datetime.now() for i in range(4)] } # Row-oriented rows = [{"a": 1, "b": True}, {"a": 2, "b": False}] # CSV strings csv = df.to_csv() # ### Schemas # # To explicitly specify data types for columns, create a schema (a `dict` of `str` column names to data types): # In[ ]: schema = { "int": float, "float": int, "str": str, "bool": bool, "date": datetime, "datetime": datetime } # ### Creating a Table # # A Table can be created by passing in a dataset or a schema, like the ones created above: # In[ ]: # From a dataset table = Table(data) # Or a dataframe df_table = Table(df) # Or a CSV csv_table = Table(csv) # tables can be created from schema table2 = Table(schema) assert table2.size() == 0 # constructing a table with an index, which is a column name to be used as the primary key indexed = Table(data, index="str") # or a limit, which is a total cap on the number of rows in the table - updates past `limit` overwite at row 0 limited = Table(data, limit=2) # ### Using the Table # # A Table has several queryable properties: # In[ ]: # schema() returns a mapping of column names to data types display("Table schema:", table.schema()) # size() returns the number of rows in the table display("Table has {} rows".format(table.size())) # columns() returns a List of the table's column names display("Table columns:", table.columns()) # ### Updating with new data # # To update or stream new data into the Table, call `table.update(data)`: # In[ ]: # you can update all columns table.update(data) print("after update:", table.size()) # or however many you'd like table.update({ "int": [5, 6, 7], "str": ["x", "y", "z"] }) # but you cannot add new columns through updating - create a new Table instead try: table.update({ "abcd": [1] }) except: pass # updates on unindexed tables always append print("after append:", table.size()) # updates on indexed tables should include the primary key - the new data overwrites at the row specified by the primary key indexed.update([{"str": "b", "int": 100}]) print("after indexed partial update:", indexed.size()) # without a primary key, the update appends to the end of the dataset indexed.update([{"int": 101}]) print("after indexed append:", indexed.size()) # # Queries and 
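# The `limit` behavior described above can be checked directly. The cell below
# is a minimal sketch, not part of the original walkthrough: `limited` was
# created with `limit=2` from a 4-row dataset, so only 2 rows are retained,
# and further updates overwrite from row 0 rather than growing the table.

# In[ ]:


assert limited.size() == 2

# Updating past the limit overwrites existing rows instead of appending
limited.update({"int": [99], "str": ["overwritten"]})
assert limited.size() == 2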
# # Queries and transformations using `View`
#
# `table.view()` allows you to apply pivots, aggregates, sorts, filters, column selections, and expression computations to the Table, and to return the results in a variety of output formats.
#
# To create a view, call the `view()` method on an existing table:

# In[ ]:


# A view with zero transformations returns the dataset as it was passed in
view = table.view()

# View metadata
print("View has {} rows and {} columns".format(view.num_rows(), view.num_columns()))


# ### Applying transformations
#
# To apply transformations, pass the relevant `kwargs` into the constructor:

# In[ ]:


# Group and split the underlying dataset
pivoted = table.view(row_pivots=["int"], column_pivots=["str"])

# Specify aggregations for individual columns
aggregated = table.view(row_pivots=["int"], aggregates={"float": "avg"})

# Show only the columns you're interested in
subset = table.view(columns=["float"])

# Sort on a specific column, or on multiple columns
sorted_view = table.view(sort=[["str", "desc"], ["int", "asc"]])

# Filter the dataset on a specific value
filtered = table.view(filter=[["int", ">", 2]])

# Calculate arbitrary expressions over the dataset
expressions = table.view(expressions=['"int" + "float" / 100'])


# ### Serializing Data
#
# Views are used to serialize data to the user in several formats:
#
# - `to_records`: outputs a list of dictionaries, each of which is a single row
# - `to_dict`: outputs a dictionary of lists, where each key is a column name
# - `to_numpy`: outputs a dictionary of numpy arrays
# - `to_df`: outputs a `pandas.DataFrame`
# - `to_arrow`: outputs an Apache Arrow binary, which can be passed into another `perspective.Table` to create a copy of the first table (see the sketch at the end of this section)

# In[ ]:


rows = view.to_records()
columnar = view.to_dict()
np_out = view.to_numpy()
df_out = view.to_df()
arrow_out = view.to_arrow()


# Data from pivoted or otherwise transformed views reflects the state of the transformed dataset:

# In[ ]:


filtered_df = filtered.to_df()
filtered_df


# If the table is updated with new data, its views are automatically notified of the updates:

# In[ ]:


v1 = table.view()
print("v1 has {} rows and {} columns".format(v1.num_rows(), v1.num_columns()))


# In[ ]:


table.update({"int": [100, 200, 300, 400]})


# In[ ]:


print("v1 has {} rows and {} columns".format(v1.num_rows(), v1.num_columns()))
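# As noted in the serialization list above, the Arrow binary produced by
# `to_arrow` can round-trip into a new `Table`. The cell below is a minimal
# sketch (the `copy` name is illustrative, not from the original notebook):

# In[ ]:


# Serialize the untransformed view to Arrow, then construct a copy of the table
copy = Table(view.to_arrow())
assert copy.schema() == table.schema()
assert copy.size() == view.num_rows()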
# # Using callbacks to connect `Table` instances
#
# Custom callback functions can be registered on `Table` and `View` instances.
#
# The most useful is `View.on_update`, which triggers a callback after the Table has been updated:

# In[ ]:


# With `mode="row"`, the `delta` argument is an Arrow binary containing the updated rows
def callback(port_id, delta):
    new_table = Table(delta)
    display(new_table.view().to_dict())

table = Table(data)
view = table.view()

# Register the callback with `mode="row"` to receive the updated data in each call
view.on_update(callback, mode="row")

# Updating the table triggers the callback
table.update({
    "int": [1, 3],
    "str": ["abc", "def"]
})


# Because the callback is triggered with a _copy_ of the updated data, `on_update` allows you to connect together multiple tables that all share state quickly and dependably:

# In[ ]:


# Create a table and a view
t1 = Table(data)
v1 = t1.view()

# And a new table that feeds from `t1`
t2 = Table(t1.schema())

# And a callback that updates `t2` whenever `t1` updates
def cb(port_id, delta):
    t2.update(delta)

# Register the callback
v1.on_update(cb, mode="row")

# Update t1, which updates t2 automatically
t1.update(data)

# t2 now has data after t1 is updated
t2.view().to_df()
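# Finally, a minimal sketch of the server pattern linked in the introduction,
# following the tornado-python example: a `PerspectiveManager` hosts a table,
# and a `PerspectiveTornadoHandler` serves it to browser clients over a
# websocket. The names below assume the same perspective-python API generation
# as this notebook; consult the linked example for your installed version.

# In[ ]:


# Commented out so the notebook doesn't block on the Tornado event loop:
#
# import tornado.ioloop
# import tornado.web
# from perspective import PerspectiveManager, PerspectiveTornadoHandler
#
# manager = PerspectiveManager()
# manager.host_table("data_source", Table(data))  # expose the table by name
#
# app = tornado.web.Application([
#     (r"/websocket", PerspectiveTornadoHandler, {"manager": manager, "check_origin": True})
# ])
# app.listen(8080)
# tornado.ioloop.IOLoop.current().start()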