#!/usr/bin/env python # coding: utf-8 # # Example of HeavyDB UDF/UDTF: Black-Scholes Model # The aim of this notebook is to demonstrate how one can enhance HeavyDB capabilities with UDF (User-Defined Functions) and UDTF (User-Defined Table Functions). For that end, we will demonstrate how one can create, register and invoke UDFs/UDTFs using [RBC](https://github.com/xnd-project/rbc) and [Numba](https://github.com/numba/numba). # ## Black-Scholes Model # The [Black-Scholes model](https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model) describes the price of an option over time. # $C(S_t, t) = N(d_1)S_t - N(d_2)PV(K)$ # $d_1 = \frac{1}{\sigma \sqrt{T - t}} [\ln(\frac{S_t}{K}) + (r + \frac{\sigma^2}{2})(T - t)]$ # $d_2 = d_1-\sigma \sqrt{T-t}$ # $PV(K) = K e^{-r(T-t)}$ # where # * $N$ is the standard normal cummulative distribution function (CDF) # * $T-t$ is the time to maturity # * $S$ is the spot price # * $K$ is the strike price # * $r$ is the risk-free rate # * $\sigma$ is the volatity of returns of the underlying asset. # ## Connecting to the HeavyDB server # Let's establish a connection to the HeavyDB server # In[31]: import warnings; warnings.filterwarnings('ignore') # In[32]: # NBVAL_IGNORE_OUTPUT import rbc print(f'Running with rbc version {rbc.__version__}') from rbc.heavydb import RemoteHeavyDB heavydb = RemoteHeavyDB(user='admin', password='HyperInteractive', host='127.0.0.1', port=6274) print(f'Connecting to HeavyDB version {heavydb.version}') assert heavydb.version[:3] >= (5, 4, 0), heavydb.version # This notebook requires HeavyDB version 5.4 or newer and rbc version 0.4 or newer. If the above fails or one is connected to older version of HeavyDB, please resolve the issue before continuing. # ## Test Data # For this notebook, we will simulate a financial Black-Scholes model using using data from the following website: # - http://www.scientific-consultants.com/nnbd.html # Read and adjust the test data # In[33]: import pandas as pd # Data taken from # http://www.scientific-consultants.com/nnbd.html df = pd.read_csv('data_black_scholes.csv', sep=',') df['S'] = 100.0 df['r'] = 0.0 df['TIME'] = df['TIME'] / 252 df.tail() # Create a table named "black_scholes" in the database: # In[34]: # NBVAL_IGNORE_OUTPUT SQL=lambda query: list(heavydb.sql_execute(query)[1]) SQL('drop table if exists black_scholes') SQL('create table if not exists black_scholes (' ' S DOUBLE, X DOUBLE, T DOUBLE, r DOUBLE,' ' sigma DOUBLE, OPRICE DOUBLE);'); # Insert data into the database: # In[20]: S = df['S'] X = df['STRIKE'] T = df['TIME'] r = df['r'] sigma = df['VLTY'] OPRICE = df['OPRICE'] heavydb.load_table_columnar('black_scholes', S=S, X=X, T=T, r=r, sigma=sigma, OPRICE=OPRICE) # Verify that the table **black_scholes** contains the data: # In[21]: result = SQL('SELECT X, T, sigma, OPRICE FROM black_scholes;') df2 = pd.DataFrame(result, columns=['STRIKE', 'TIME', 'VLTY', 'OPRICE']) df2.tail() # In[22]: result = SQL('SELECT COUNT(*) FROM black_scholes;') print(f'Number of rows: {result[0]}') # ## Why use UDF/UDTF? # For large datasets, retrieving the data from the server might not be viable due to hardware and/or network limitation. In those cases, makes sense to do the computation in the server rather than locally. # HeavyDB server v5.4 implements support for users to define UDFs and UDTFs. The former applies a function to a table data row-by-row while the latter is applied to an entire column. # The UDFs and table functions can be defined as Python functions using the [RBC](https://github.com/xnd-project/rbc/) tool that exploits [Numba](https://github.com/numba/numba) to convert Python functions to [LLVM intermediate representation](https://en.wikipedia.org/wiki/LLVM) and these are registered in the HeavyDB server. # ## Compute the Black-Scholes model remotely using HeavyDB UDFs # The following cells will register a function to be called from inside HeavyDB # In[23]: import numba import math from rbc.stdlib import array_api as np # Define a standard normal CDF using `@numba.njit` decorator that allows calling the CDF from HeavyDB UDF/UDTFs: # In[24]: @numba.njit def cnd_numba(d): A1 = 0.31938153 A2 = -0.356563782 A3 = 1.781477937 A4 = -1.821255978 A5 = 1.330274429 RSQRT2PI = 0.39894228040143267793994605993438 K = 1.0 / (1.0 + 0.2316419 * math.fabs(d)) ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) if d > 0: ret_val = 1.0 - ret_val return ret_val # Next, we'll define a function `black_scholes_UDF` that evaluates the Black-Scholes formula. The function will be registered with HeavyDB server so that one can use it in a SQL query. Notice the `@heavydb` decorator takes the signature of the function as argument: # - `@heavydb()` or `@heavydb()` # # In the example below, `black_scholes_UDT` accepts five `double` arguments `(S, X, T, r, sigma)` and returns a `double` value: # In[25]: @heavydb('double(double, double, double, double, double)') def black_scholes_UDF(S, X, T, r, sigma): d1 = (np.log(S/X) + (r + 0.5 * sigma**2)*T) / (sigma * np.sqrt(T)) d2 = d1 - sigma * np.sqrt(T) cndd1 = cnd_numba(d1) cndd2 = cnd_numba(d2) expRT = math.exp((-1. * r) * T) callResult = (S * cndd1 - X * expRT * cndd2) return callResult # The following cell illustrates a usage of `black_scholes_UDF` in a SQL query to HeavyDB server and compares the results with expected model values: # In[26]: result = SQL('SELECT OPRICE,' ' black_scholes_UDF(S, X, T, r, sigma) FROM black_scholes;') oprice = [] for expected, output in result: oprice.append(output) if len(oprice) < 5: print(f'Expected = {expected:.2f}, UDF Model output = {output:.2f}') # In[27]: # NBVAL_IGNORE_OUTPUT from matplotlib import pyplot as plt plt.plot(df['OPRICE'], linewidth=1, color='red', label='Expected') plt.plot(oprice, linewidth=2, color='blue', linestyle=':', label='Model Output') plt.legend(loc="upper left") plt.show() # ## Compute the Black-Scholes model remotely using HeavyDB UDTFs # Notice that the UDF `black_scholes_UDF` defined above is applied to table row-by-row. We can avoid the overhead of multiple calls of `black_scholes_UDF` if we re-define it as an UDTF that takes table columns as inputs, apply the Black-Scholes formula, and store the results to output columns: # # In a UDTF function, a column argument is specified as `Column` (i.e. `Column`). The last two arguments of a table function (with types `RowMultiplier` and `OutputColumn`) define how the size of the output column is computed and the output column itself. For instance, if `RowMultiplier` value is specified as `sizer=1`, then the output column has the same number of rows as the input columns. If `sizer=2`, then, the output column is twice as big as the input columns. One must always specify at least one `Column`, at least one `OutputColumn`, and exactly one `RowMultiplier` arguments when writing a UDTF. # # In the example below, `black_scholes_UDTF` receives five double column arguments (`Column`), the sizer argument, and the output column has type `double`. The return value of a UDTF (`int32`) must be the number of rows in output columns; usually it is `sizer` value multiplied by the number of rows in input columns but, in general, it the number of output column rows can be also smaller (but not greater). # In[28]: @heavydb('int32(Column, Column, Column, Column, Column,' ' RowMultiplier, OutputColumn)') def black_scholes_udtf(S, X, T, r, sigma, m, out): input_row_count = len(X) for i in range(input_row_count): d1 = (np.log(S[i]/X[i]) + (r[i] + 0.5 * sigma[i]**2)*T[i]) / (sigma[i] * np.sqrt(T[i])) out[i] = S[i] + X[i] + T[i] + r[i] + sigma[i] d2 = d1 - sigma[i] * np.sqrt(T[i]) cndd1 = cnd_numba(d1) cndd2 = cnd_numba(d2) expRT = math.exp((-1. * r[i]) * T[i]) out[i] = (S[i] * cndd1 - X[i] * expRT * cndd2) return m * input_row_count # In[29]: result = SQL('SELECT * FROM table(black_scholes_udtf(' ' cursor(SELECT S FROM black_scholes),' ' cursor(SELECT X FROM black_scholes),' ' cursor(SELECT T FROM black_scholes),' ' cursor(SELECT r FROM black_scholes),' ' cursor(SELECT sigma FROM black_scholes),' ' 1));') for expected, (output, ) in list(zip(df['OPRICE'], result))[:5]: print(f'Expected = {expected:.2f}, UDTF Model output = {output:.2f}') # In[30]: # NBVAL_IGNORE_OUTPUT flatten = lambda x: list(map(lambda y: y[0], x)) plt.plot(df['OPRICE'], linewidth=1, color='red', label='Expected') plt.plot(flatten(result), linewidth=2, color='blue', linestyle=':', label='Model Output') plt.legend(loc="upper left") plt.show()