In this notebook we show how you can finetune TAPAS using transformers
library to then import the model into Spark NLP.
IMPORTANT: This is just an example and from JSL we don't provide support to solve errors or doubts with the transformers library.
Strong supervision is one of the approaches of Table Question Answering. During training time, an aggregation operator (COUNT, AVERAGE, SUM) is added to the answer, as well as the cells which answer to the quesiton in the pair question-answer.
For example, if you want to get the SUM of the column "revenue":
We are going to finetune "google/tapas-base"
with the following aggregation operations:
Feel free to experiment with any other operator!
Let's keep in mind a few things before we start 😊
Spark NLP 4.2.0
and above. So please make sure you have upgraded to the latest Spark NLP releaseTensorFlow
and they have to be in Table Question&Answering
category.HuggingFace
and TensorFlow
. You don't need TensorFlow
to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.2.4.1
version and Transformers on 4.6.1
. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.SentencePiece
library, so we install that as well%pip install tensorflow==2.10.0 transformers==4.22.1 datasets
! pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cu112.html
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasConfig
import pandas as pd
import tensorflow as tf
import numpy as np
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
Moving 0 files to the new cache system
0it [00:00, ?it/s]
MODEL_NAME = "google/tapas-base"
aggregation_labels = {
0: "NONE",
1: "SUM",
2: "AVERAGE",
3: "COUNT",
4: "MAX",
5: "MIN"
}
lab_dict = {y:x for x,y in aggregation_labels.items()}
lab_dict
{'NONE': 0, 'SUM': 1, 'AVERAGE': 2, 'COUNT': 3, 'MAX': 4, 'MIN': 5}
original_config = TapasConfig.from_pretrained(MODEL_NAME)
#Let's first load the HF model
config = TapasConfig(num_aggregation_labels=len(aggregation_labels),
use_answer_as_supervision = False, #in case you’re using strong supervision, you should set use_answer_as_supervision of TapasConfig to False (because the ground truth aggregation label is given during training).
cell_selection_preference = original_config.cell_selection_preference,
aggregation_labels = aggregation_labels)
tokenizer = TapasTokenizer.from_pretrained(MODEL_NAME)
model = TapasForQuestionAnswering.from_pretrained(MODEL_NAME, config=config)
Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['column_output_weights', 'output_bias', 'aggregation_classifier.weight', 'output_weights', 'aggregation_classifier.bias', 'column_output_bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
model.config.aggregation_labels
{0: 'NONE', 1: 'SUM', 2: 'AVERAGE', 3: 'COUNT', 4: 'MAX', 5: 'MIN'}
lab_dict = {y:x for x,y in model.config.aggregation_labels.items()}
lab_dict
{'NONE': 0, 'SUM': 1, 'AVERAGE': 2, 'COUNT': 3, 'MAX': 4, 'MIN': 5}
📜For the creation of the dataset, we have follow these steps:
The dataset + the questions + the answers will be our training dataset.
!wget https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/finance-nlp/data/tapas_example.pkl.bz2?raw=true
import bz2, pickle
datasource = "tapas_example_by_JSL"
ifile = bz2.BZ2File(f"tapas_example.pkl.bz2",'rb')
list_of_df = pickle.load(ifile)
print(f"Loaded {len(list_of_df)} dataframes from datasource `{datasource}`")
ifile.close()
Loaded 5150 dataframes from datasource `tapas_example_by_JSL`
MAX_DFS_PER_DATASOURCE = 2 # How many dataframes you want to use from list_of_df. This may require many resources, so leaving it to 1 for a quick demonstration.
MAX_QUESTIONS_PER_DF = None # To add all questions, leave it to None. To randomly select n questions, set the value to a number
EPOCHS = 8 # Number of times to go over each dataframe
BATCH_SIZE = 16 # Number of questions to process per batch. In Colab, more than 16 will trigger CUDA out of Memory
import torch
from transformers import AdamW
# GPU training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Adam optimizer
LR = 1e-5
optimizer = AdamW(model.parameters(), lr=LR)
/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:310: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning FutureWarning,
def generate_NONE(datasource, list_of_df, max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE, max_questions_per_df=MAX_QUESTIONS_PER_DF):
# The question dataframe contains the following columns:
# - A dymmy ID just to identify what type of question it is. It's ignored.
# - The question text
# - The aggregation string: NONE, COUNT, AVERAGE, SUM, MAX, MIN
# - If the aggreagation is not note, the FLOAT result of the operation.
# - A list of tuples, where each tuple is a cell answering to the question
# - A list of texts, where each text is the answer to the question
questions_df = []
for counter, df in enumerate(list_of_df):
if max_dfs_per_datasource is not None and counter == max_dfs_per_datasource:
break
questions = []
headers = df.columns
# CELL IN FIRST ROW
questions = [
(0,f"what is the first {x}", "NONE", np.nan, [(0,i)], [df.iloc[0,i]]) for i, x in enumerate(headers)
]
# CELL IN LAST ROW
questions.extend ( [
(1,f"what is the last {x}", "NONE", np.nan, [(len(df)-1,i)], [df.iloc[len(df)-1,i]]) for i, x in enumerate(headers)
] )
# FIRST ROW
# Warning: 0,0 in TAPAS means the first row, while in Pandas it means the column header!
# So when extracting the text (df.iloc) we start at 1
questions.append((2, "what is the first row", "NONE", np.nan, [(0, i) for i, x in enumerate(headers)], [df.iloc[0, i] for i, x in enumerate(headers)]))
questions.append((3, "what is the first entry", "NONE", np.nan, [(0, i) for i, x in enumerate(headers)], [df.iloc[0, i] for i, x in enumerate(headers)]))
# LAST ROW
# For TAPAS, last row is len(df)-1, but in Panads we access to that cell by df.iloc[len(df)]
questions.append((4, "what is the last row", "NONE", np.nan, [(len(df)-1, i) for i, x in enumerate(headers)], [df.iloc[len(df)-1, i] for i, x in enumerate(headers)]))
questions.append((5, "what is the last entry", "NONE", np.nan, [(len(df)-1, i) for i, x in enumerate(headers)], [df.iloc[len(df)-1, i] for i, x in enumerate(headers)]))
# ASKING FOR 1 CELL WITH ANOTHER CELL AS REFERENCE
for numrow, t in enumerate(df.iterrows()):
for numcol, h in enumerate(headers):
other_headers = [x for x in headers if x != h]
for i, o in enumerate(other_headers):
col = numcol
q = t[1].loc[o]
# Checking if there are many rows
rows = df[df[o]==q].index.tolist()
count = float(len(rows))
questions.append((6, f"what is the {h} when {o} is {q}", "NONE", np.nan, [(r, col) for r in rows], [df.iloc[r,col] for r in rows] ))
questions.append(
(7,f"how big is the table", "NONE", np.nan, [(len(df)-1, len(headers)-1)], [df.iloc[len(df)-1, len(headers)-1]]))
questions.append(
(8,f"what is the size of the table", "NONE", np.nan, [(len(df)-1, len(headers)-1)], [df.iloc[len(df)-1, len(headers)-1]]))
questions.append(
(9,f"what is the last cell of the table", "NONE", np.nan, [(len(df)-1, len(headers)-1)], [df.iloc[len(df)-1, len(headers)-1]]))
questions.append(
(10,f"what is the first cell of the table", "NONE", np.nan, [(0,0)], [df.iloc[0, 0]]))
max_q = len(questions)
if max_questions_per_df is not None:
max_q = min(max_questions_per_df, max_q)
question_sampling = random.sample(questions, max_q)
question_sampling_df = pd.DataFrame(question_sampling, columns=['id', 'question', 'aggr', 'float_answer', 'cells', 'cell_texts'])
question_sampling_df['table'] = pickle.dumps(df.to_dict())
# I add all the dataframes with their questions to a tuple, keeping the datasource it was taken from
questions_df.append(question_sampling_df)
return questions_df
import pickle
def generate_COUNT(datasource, list_of_df, max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE, max_questions_per_df=MAX_QUESTIONS_PER_DF):
# The question dataframe contains the following columns:
# - A dymmy ID just to identify what type of question it is. It's ignored.
# - The question text
# - The aggregation string: NONE, COUNT, AVERAGE, SUM, MAX, MIN
# - the FLOAT result of the operation, not used unless you want to train a weak supervision model
# - A list of tuples, where each tuple is a cell answering to the question
# - A list of texts, where each text is the answer to the question
questions_df = []
for counter, df in enumerate(list_of_df):
if max_dfs_per_datasource is not None and counter == max_dfs_per_datasource:
break
questions = []
headers = df.columns
# ASKING FOR 1 CELL WITH ANOTHER CELL AS REFERENCE
for numrow, t in enumerate(df.iterrows()):
for numcol, h in enumerate(headers):
other_headers = [x for x in headers if x != h]
for i, o in enumerate(other_headers):
col = numcol
q = t[1].loc[o]
# Checking if there are many rows
rows = df[df[o]==q].index.tolist()
count = float(len(rows))
questions.append((5, f"how many times {o} is {q}", "COUNT", count, [(r, col) for r in rows], [df.iloc[r,col] for r in rows] ))
questions.append((5, f"count the number of times {o} is {q}", "COUNT", count, [(r, col) for r in rows], [df.iloc[r,col] for r in rows] ))
# COUNT
questions.extend([
(0,f"how many {x} are there", "COUNT", float(len(df)), [(a, i) for a in range(0, len(df))], [df.iloc[a, i] for a in range(0, len(df))]) for i, x in enumerate(headers)])
questions.extend([
(1,f"how many {x} do we have", "COUNT", float(len(df)), [(a, i) for a in range(0, len(df))], [df.iloc[a, i] for a in range(0, len(df))]) for i, x in enumerate(headers)])
questions.extend([
(2,f"how many {x} does the table have", "COUNT", float(len(df)), [(a, i) for a in range(0, len(df))], [df.iloc[a, i] for a in range(0, len(df))]) for i, x in enumerate(headers)])
questions.extend([
(3,f"how many {x} has the table", "COUNT", float(len(df)), [(a, i) for a in range(0, len(df))], [df.iloc[a, i] for a in range(0, len(df))]) for i, x in enumerate(headers)])
questions.append(
(4,f"how many rows do we have", "COUNT", float(len(df)), [(a, 0) for a in range(0, len(df))], [df.iloc[a, 0] for a in range(0, len(df))]))
questions.append(
(5,f"how many rows are there", "COUNT", float(len(df)), [(a, 0) for a in range(0, len(df))], [df.iloc[a, 0] for a in range(0, len(df))]))
questions.append(
(6,f"how many rows does the table have", "COUNT", float(len(df)), [(a, 0) for a in range(0, len(df))], [df.iloc[a, 0] for a in range(0, len(df))]))
questions.append(
(7,f"how many rows has the table", "COUNT", float(len(df)), [(a, 0) for a in range(0, len(df))], [df.iloc[a, 0] for a in range(0, len(df))]))
max_q = len(questions)
if max_questions_per_df is not None:
max_q = min(max_questions_per_df, max_q)
question_sampling = random.sample(questions, max_q)
question_sampling_df = pd.DataFrame(question_sampling, columns=['id', 'question', 'aggr', 'float_answer', 'cells', 'cell_texts'])
question_sampling_df['table'] = pickle.dumps(df.to_dict())
# I add all the dataframes with their questions to a tuple, keeping the datasource it was taken from
questions_df.append(question_sampling_df)
return questions_df
import pickle
def generate_SUM(datasource, list_of_df, max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE, max_questions_per_df=MAX_QUESTIONS_PER_DF):
# The question dataframe contains the following columns:
# - A dymmy ID just to identify what type of question it is. It's ignored.
# - The question text
# - The aggregation string: NONE, COUNT, AVERAGE, SUM, MAX, MIN
# - the FLOAT result of the operation, not used unless you want to train a weak supervision model
# - A list of tuples, where each tuple is a cell answering to the question
# - A list of texts, where each text is the answer to the question
questions_df = []
for counter, df in enumerate(list_of_df):
if max_dfs_per_datasource is not None and counter == max_dfs_per_datasource:
break
questions = []
headers = df.columns
# SUM
quantity = list(headers).index('quantity')
qty_result_list = [df.iloc[i, quantity] for i in range(0, len(df))]
qty_sum = sum([float(x.replace(',','')) for x in qty_result_list])
questions.append((0, "what is the quantity total", "SUM", qty_sum, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((1, "what is the total of quantity", "SUM", qty_sum, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((2, "what is the total for quantity", "SUM", qty_sum, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((3, "what is the sum for quantity", "SUM", qty_sum, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
# SUM AND AVERAGE WITH CONDITION
for numrow, t in enumerate(df.iterrows()):
for i, h in enumerate(headers):
if h in ['quantity', 'percentage']:
continue
numcol = i
q = t[1].loc[h]
rows = df[df[h]==q].index.tolist()
qty_result_list = [df.iloc[r, quantity] for r in rows]
qty_sum = sum([float(x.replace(',','')) for x in qty_result_list])
qty_avg = mean([float(x.replace(',','')) for x in qty_result_list])
qty_max = max([float(x.replace(',','')) for x in qty_result_list])
qty_min= min([float(x.replace(',','')) for x in qty_result_list])
# Quantity
questions.append((4, f"what is the overall quantity when {h} is {q}", "SUM", qty_sum, [(r, quantity) for r in rows],qty_result_list ))
questions.append((5, f"what is the total of quantity when {h} is {q}", "SUM", qty_sum, [(r, quantity) for r in rows], qty_result_list ))
questions.append((6, f"what is the sum of quantity when {h} is {q}", "SUM", qty_sum, [(r, quantity) for r in rows], qty_result_list ))
max_q = len(questions)
if max_questions_per_df is not None:
max_q = min(max_questions_per_df, max_q)
question_sampling = random.sample(questions, max_q)
question_sampling_df = pd.DataFrame(question_sampling, columns=['id', 'question', 'aggr', 'float_answer', 'cells', 'cell_texts'])
question_sampling_df['table'] = pickle.dumps(df.to_dict())
# I add all the dataframes with their questions to a tuple, keeping the datasource it was taken from
questions_df.append(question_sampling_df)
return questions_df
import pickle
def generate_AVERAGE(datasource, list_of_df, max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE, max_questions_per_df=MAX_QUESTIONS_PER_DF):
# The question dataframe contains the following columns:
# - A dymmy ID just to identify what type of question it is. It's ignored.
# - The question text
# - The aggregation string: NONE, COUNT, AVERAGE, SUM, MAX, MIN
# - the FLOAT result of the operation, not used unless you want to train a weak supervision model
# - A list of tuples, where each tuple is a cell answering to the question
# - A list of texts, where each text is the answer to the question
questions_df = []
for counter, df in enumerate(list_of_df):
if max_dfs_per_datasource is not None and counter == max_dfs_per_datasource:
break
questions = []
headers = df.columns
percentage = list(headers).index('percentage')
per_result_list = [df.iloc[i, percentage] for i in range(0, len(df))]
per_avg = mean([float(x.replace('%','')) for x in per_result_list])
questions.append((0, "what is the percentage average", "AVERAGE", per_avg, [(i, percentage) for i in range(0, len(df))], per_result_list ))
questions.append((1, "what is the mean percentage", "AVERAGE", per_avg, [(i, percentage) for i in range(0, len(df))],per_result_list ))
questions.append((2, "what is the average percentage", "AVERAGE", per_avg, [(i, percentage) for i in range(0, len(df))], per_result_list ))
quantity = list(headers).index('quantity')
# SUM AND AVERAGE WITH CONDITION
for numrow, t in enumerate(df.iterrows()):
for i, h in enumerate(headers):
if h in ['quantity', 'percentage']:
continue
numcol = i
q = t[1].loc[h]
rows = df[df[h]==q].index.tolist()
qty_result_list = [df.iloc[r, quantity] for r in rows]
qty_sum = sum([float(x.replace(',','')) for x in qty_result_list])
qty_avg = mean([float(x.replace(',','')) for x in qty_result_list])
qty_max = max([float(x.replace(',','')) for x in qty_result_list])
qty_min= min([float(x.replace(',','')) for x in qty_result_list])
questions.append((3, f"what is the quantity average when {h} is {q}", "AVERAGE", qty_avg, [(r, quantity) for r in rows], qty_result_list ))
questions.append((4, f"what is the average quantity when {h} is {q}", "AVERAGE", qty_avg, [(r, quantity) for r in rows], qty_result_list ))
questions.append((5, f"what is the mean of quantity when {h} is {q}", "AVERAGE", qty_avg, [(r, quantity) for r in rows], qty_result_list ))
questions.append((6, f"what is the quantity mean when {h} is {q}", "AVERAGE", qty_avg, [(r, quantity) for r in rows], qty_result_list ))
# Percentage
per_result_list = [df.iloc[r, percentage] for r in rows]
per_avg = mean([float(x.replace('%','')) for x in per_result_list])
per_max = max([float(x.replace('%','')) for x in per_result_list])
per_min= min([float(x.replace('%','')) for x in per_result_list])
questions.append((7, f"what is the mean percentage when {h} is {q}", "AVERAGE", per_avg, [(r, percentage) for r in rows], per_result_list ))
questions.append((8, f"what is the average percentage when {h} is {q}", "AVERAGE", per_avg, [(r, percentage) for r in rows], per_result_list ))
# MEAN / AVERAGE
qty_result_list = [df.iloc[i, quantity] for i in range(0, len(df))]
qty_avg = mean([float(x.replace(',','')) for x in qty_result_list])
qty_max = max([float(x.replace(',','')) for x in qty_result_list])
qty_min = min([float(x.replace(',','')) for x in qty_result_list])
qty_result_list = [df.iloc[i, quantity] for i in range(0, len(df))]
questions.append((9, "what is the quantity average", "AVERAGE", qty_avg, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((10, "what is the mean of quantity", "AVERAGE", qty_avg, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((11, "what is the average quantity", "AVERAGE", qty_avg, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((12, "what is the quantity mean", "AVERAGE", qty_avg, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
per_result_list = [df.iloc[i, percentage] for i in range(0, len(df))]
per_max = max([float(x.replace('%','')) for x in per_result_list])
per_min = min([float(x.replace('%','')) for x in per_result_list])
max_q = len(questions)
if max_questions_per_df is not None:
max_q = min(max_questions_per_df, max_q)
question_sampling = random.sample(questions, max_q)
question_sampling_df = pd.DataFrame(question_sampling, columns=['id', 'question', 'aggr', 'float_answer', 'cells', 'cell_texts'])
question_sampling_df['table'] = pickle.dumps(df.to_dict())
# I add all the dataframes with their questions to a tuple, keeping the datasource it was taken from
questions_df.append(question_sampling_df)
return questions_df
import pickle
def generate_MAX(datasource, list_of_df, max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE, max_questions_per_df=MAX_QUESTIONS_PER_DF):
# The question dataframe contains the following columns:
# - A dymmy ID just to identify what type of question it is. It's ignored.
# - The question text
# - The aggregation string: NONE, COUNT, AVERAGE, SUM, MAX, MIN
# - the FLOAT result of the operation, not used unless you want to train a weak supervision model
# - A list of tuples, where each tuple is a cell answering to the question
# - A list of texts, where each text is the answer to the question
questions_df = []
for counter, df in enumerate(list_of_df):
if max_dfs_per_datasource is not None and counter == max_dfs_per_datasource:
break
questions = []
headers = df.columns
# MAX
quantity = list(headers).index('quantity')
qty_result_list = [df.iloc[i, quantity] for i in range(0, len(df))]
qty_max = max([float(x.replace(',','')) for x in qty_result_list])
questions.append((1, "what is the quantity max", "MAX", qty_max, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((2, "what is the max of quantity", "MAX", qty_max, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((3, "what is the max for quantity", "MAX", qty_max, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((4, "what is the maximum quantity", "MAX", qty_max, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((5, "what is the highest quantity", "MAX", qty_max, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
percentage = list(headers).index('percentage')
per_result_list = [df.iloc[i, percentage] for i in range(0, len(df))]
per_max = max([float(x.replace('%','')) for x in per_result_list])
questions.append((6, "what is the percentage maximum", "AVERAGE", per_max, [(i, percentage) for i in range(0, len(df))], per_result_list ))
questions.append((7, "what is the max percentage", "AVERAGE", per_max, [(i, percentage) for i in range(0, len(df))],per_result_list ))
questions.append((8, "what is the maximum percentage", "AVERAGE", per_max, [(i, percentage) for i in range(0, len(df))], per_result_list ))
questions.append((9, "what is the highest percentage", "AVERAGE", per_max, [(i, percentage) for i in range(0, len(df))], per_result_list ))
# MAX WITH CONDITION
for numrow, t in enumerate(df.iterrows()):
for i, h in enumerate(headers):
if h in ['quantity', 'percentage']:
continue
numcol = i
q = t[1].loc[h]
rows = df[df[h]==q].index.tolist()
qty_result_list = [df.iloc[r, quantity] for r in rows]
qty_max = max([float(x.replace(',','')) for x in qty_result_list])
# Quantity
questions.append((10, f"what is the maximum quantity when {h} is {q}", "MAX", qty_max, [(r, quantity) for r in rows],qty_result_list ))
questions.append((11, f"what is the max of quantity when {h} is {q}", "MAX", qty_max, [(r, quantity) for r in rows], qty_result_list ))
questions.append((12, f"what is the quantity max when {h} is {q}", "MAX", qty_max, [(r, quantity) for r in rows], qty_result_list ))
questions.append((13, f"what is the highest quantity when {h} is {q}", "MAX", qty_max, [(r, quantity) for r in rows], qty_result_list ))
# Percentage
per_result_list = [df.iloc[r, percentage] for r in rows]
per_max = max([float(x.replace('%','')) for x in per_result_list])
questions.append((14, f"what is the max percentage when {h} is {q}", "MAX", per_max, [(r, percentage) for r in rows], per_result_list ))
questions.append((15, f"what is the maximum percentage when {h} is {q}", "MAX", per_max, [(r, percentage) for r in rows], per_result_list ))
questions.append((16, f"what is the highest percentage when {h} is {q}", "MAX", per_max, [(r, percentage) for r in rows], per_result_list ))
max_q = len(questions)
if max_questions_per_df is not None:
max_q = min(max_questions_per_df, max_q)
question_sampling = random.sample(questions, max_q)
question_sampling_df = pd.DataFrame(question_sampling, columns=['id', 'question', 'aggr', 'float_answer', 'cells', 'cell_texts'])
question_sampling_df['table'] = pickle.dumps(df.to_dict())
# I add all the dataframes with their questions to a tuple, keeping the datasource it was taken from
questions_df.append(question_sampling_df)
return questions_df
import pickle
def generate_MIN(datasource, list_of_df, max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE, max_questions_per_df=MAX_QUESTIONS_PER_DF):
# The question dataframe contains the following columns:
# - A dymmy ID just to identify what type of question it is. It's ignored.
# - The question text
# - The aggregation string: NONE, COUNT, AVERAGE, SUM, MAX, MIN
# - the FLOAT result of the operation, not used unless you want to train a weak supervision model
# - A list of tuples, where each tuple is a cell answering to the question
# - A list of texts, where each text is the answer to the question
questions_df = []
for counter, df in enumerate(list_of_df):
if max_dfs_per_datasource is not None and counter == max_dfs_per_datasource:
break
questions = []
headers = df.columns
# MAX
quantity = list(headers).index('quantity')
qty_result_list = [df.iloc[i, quantity] for i in range(0, len(df))]
qty_min = min([float(x.replace(',','')) for x in qty_result_list])
questions.append((1, "what is the quantity min", "MIN", qty_min, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((2, "what is the min of quantity", "MIN", qty_min, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((3, "what is the min for quantity", "MIN", qty_min, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((4, "what is the minimum quantity", "MIN", qty_min, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
questions.append((5, "what is the lowest quantity", "MIN", qty_min, [(i, quantity) for i in range(0, len(df))], qty_result_list ))
percentage = list(headers).index('percentage')
per_result_list = [df.iloc[i, percentage] for i in range(0, len(df))]
per_min = min([float(x.replace('%','')) for x in per_result_list])
questions.append((6, "what is the percentage min", "MIN", per_min, [(i, percentage) for i in range(0, len(df))], per_result_list ))
questions.append((7, "what is the min percentage", "MIN", per_min, [(i, percentage) for i in range(0, len(df))],per_result_list ))
questions.append((8, "what is the minimum percentage", "MIN", per_min, [(i, percentage) for i in range(0, len(df))], per_result_list ))
questions.append((9, "what is the lowest percentage", "MIN", per_min, [(i, percentage) for i in range(0, len(df))], per_result_list ))
# MAX WITH CONDITION
for numrow, t in enumerate(df.iterrows()):
for i, h in enumerate(headers):
if h in ['quantity', 'percentage']:
continue
numcol = i
q = t[1].loc[h]
rows = df[df[h]==q].index.tolist()
qty_result_list = [df.iloc[r, quantity] for r in rows]
qty_min= min([float(x.replace(',','')) for x in qty_result_list])
# Quantity
questions.append((10, f"what is the minimum quantity when {h} is {q}", "MIN", qty_min, [(r, quantity) for r in rows],qty_result_list ))
questions.append((11, f"what is the min of quantity when {h} is {q}", "MIN", qty_min, [(r, quantity) for r in rows], qty_result_list ))
questions.append((12, f"what is the quantity min when {h} is {q}", "MIN", qty_min, [(r, quantity) for r in rows], qty_result_list ))
questions.append((13, f"what is the lowest quantity when {h} is {q}", "MIN", qty_min, [(r, quantity) for r in rows], qty_result_list ))
# Percentage
per_result_list = [df.iloc[r, percentage] for r in rows]
per_min= min([float(x.replace('%','')) for x in per_result_list])
questions.append((14, f"what is the min percentage when {h} is {q}", "MIN", per_min, [(r, percentage) for r in rows], per_result_list ))
questions.append((15, f"what is the minimum percentage when {h} is {q}", "MIN", per_min, [(r, percentage) for r in rows], per_result_list ))
questions.append((16, f"what is the lowest percentage when {h} is {q}", "MIN", per_min, [(r, percentage) for r in rows], per_result_list ))
max_q = len(questions)
if max_questions_per_df is not None:
max_q = min(max_questions_per_df, max_q)
question_sampling = random.sample(questions, max_q)
question_sampling_df = pd.DataFrame(question_sampling, columns=['id', 'question', 'aggr', 'float_answer', 'cells', 'cell_texts'])
question_sampling_df['table'] = pickle.dumps(df.to_dict())
# I add all the dataframes with their questions to a tuple, keeping the datasource it was taken from
questions_df.append(question_sampling_df)
return questions_df
import torch
class TableDataset(torch.utils.data.Dataset):
def __init__(self, df, tokenizer):
self.df = df
self.tokenizer = tokenizer
def __getitem__(self, idx):
item = self.df.iloc[idx]
table = pickle.loads(item.table)
df_table = pd.DataFrame(table)
# this means it's the first table-question pair in a sequence
encoding = self.tokenizer(table=df_table,
queries=item.question,
answer_coordinates=item.cells,
answer_text=item.cell_texts,
padding="max_length",
truncation=True,
return_tensors="pt"
)
aggr = lab_dict[item.aggr]
# remove the batch dimension which the tokenizer adds
encoding = {key: val.squeeze(0) for key, val in encoding.items()}
# Aggregation operation ID
encoding["aggr"] = torch.tensor([aggr], dtype=torch.int64)
# Float Answer is for weak aggregation. That is, when you return the final value of the aggregation.
# We are leaving this here in case you want to experiment, no keep in mind in original TAPAS model
# only SUM, AVERAGE and COUNT can be used for weak-supervision.
# encoding["float_answer"] = torch.tensor([item.float_answer], dtype=torch.float32)
return encoding
def __len__(self):
return len(self.df)
import torch
from transformers import AdamW
# GPU training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Adam optimizer
optimizer = AdamW(model.parameters(), lr=LR)
import numpy as np
from statistics import mean
import sys
import shutil
import bz2
import pickle
import random
# The question dataframe contains the following columns:
# - A dymmy ID just to identify what type of question it is. It's ignored.
# - The question text
# - The aggregation string: NONE, COUNT, AVERAGE, SUM, MAX, MIN
# - If the aggreagation is not note, the FLOAT result of the operation.
# - A list of tuples, where each tuple is a cell answering to the question
# - A list of texts, where each text is the answer to the question
print("- Generating questions...")
questions_df = []
questions_df.extend(generate_NONE(datasource,
list_of_df,
max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE,
max_questions_per_df=MAX_QUESTIONS_PER_DF))
questions_df.extend(generate_SUM(datasource,
list_of_df,
max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE,
max_questions_per_df=MAX_QUESTIONS_PER_DF))
questions_df.extend(generate_AVERAGE(datasource,
list_of_df,
max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE,
max_questions_per_df=MAX_QUESTIONS_PER_DF))
questions_df.extend(generate_COUNT(datasource,
list_of_df,
max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE,
max_questions_per_df=MAX_QUESTIONS_PER_DF))
questions_df.extend(generate_MAX(datasource,
list_of_df,
max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE,
max_questions_per_df=MAX_QUESTIONS_PER_DF))
questions_df.extend(generate_MIN(datasource,
list_of_df,
max_dfs_per_datasource=MAX_DFS_PER_DATASOURCE,
max_questions_per_df=MAX_QUESTIONS_PER_DF))
total_df = len(questions_df)
# We will aggregate all the losses for all the dataframes and show the
# AVERAGE LOSS as the EPOCH loss
epoch_losses = []
for epoch in range(EPOCHS): # loop over the dataset multiple times
# Sometimes you may get errors in pandas resolving the questions.
# I count how many errors we get during the epochs.
errors = 0
print("- Epoch:", epoch)
df_losses = []
for i, question_df in enumerate(questions_df):
print(f"\r-- Questions Dataframe Num.: {i}/{total_df}", end="")
train_dataset = TableDataset(df=question_df, tokenizer=tokenizer)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE)
losses = []
for idx, batch in enumerate(train_dataloader):
# get the inputs;
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
token_type_ids = batch["token_type_ids"].to(device)
labels = batch["labels"].to(device)
aggr_labels = batch["aggr"].to(device)
# Only use it for weak-supervision, not for strong-supervision
# float_answer = batch["float_answer"].to(device)
numeric_values = batch["numeric_values"].to(device)
numeric_values_scale = batch["numeric_values_scale"].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
labels=labels, aggregation_labels = aggr_labels, numeric_values=numeric_values,
numeric_values_scale=numeric_values_scale) #float_answer=float_answer, only provide if weak-supervision
loss = outputs.loss
losses.append(loss.item())
loss.backward()
optimizer.step()
df_losses.append(mean(losses))
epoch_mean_loss = mean(df_losses)
epoch_losses.append(epoch_mean_loss)
print(f"\n--- Epoch Loss: {epoch_mean_loss}")
print(f"--- Errors: {errors}")
# Saving model (this creates the folder too)
mod = f'{datasource}/model/'
modr = mod.replace("/","_")
model.save_pretrained(save_directory=mod,
is_main_process=True, state_dict=model.state_dict())
shutil.make_archive(modr, # File name
'zip',
'./',
mod)
- Generating questions... - Epoch: 0 -- Questions Dataframe Num.: 11/12 --- Epoch Loss: 1.5686425358767153 --- Errors: 0 - Epoch: 1 -- Questions Dataframe Num.: 11/12 --- Epoch Loss: 1.1960902499281898 --- Errors: 0 - Epoch: 2 -- Questions Dataframe Num.: 11/12 --- Epoch Loss: 0.9914937746003347 --- Errors: 0 - Epoch: 3 -- Questions Dataframe Num.: 11/12 --- Epoch Loss: 1.031176200393808 --- Errors: 0 - Epoch: 4 -- Questions Dataframe Num.: 11/12 --- Epoch Loss: 1.0282052185060915 --- Errors: 0 - Epoch: 5 -- Questions Dataframe Num.: 11/12 --- Epoch Loss: 0.8980713966506608 --- Errors: 0 - Epoch: 6 -- Questions Dataframe Num.: 11/12 --- Epoch Loss: 1.4654125168596366 --- Errors: 0 - Epoch: 7 -- Questions Dataframe Num.: 11/12 --- Epoch Loss: 1.1752010677899318 --- Errors: 0
'/content/tapas_example_by_JSL_model_.zip'
Let's test our model.
#process results using HF routine
def process_results(model, logits, logits_aggregation):
id2aggregation = model.config.aggregation_labels
if logits_aggregation is not None:
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
inputs, logits, logits_aggregation
)
print(predicted_aggregation_indices)
aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
else:
predicted_answer_coordinates = tokenizer.convert_logits_to_predictions(
inputs, logits
)[0]
aggregation_predictions_string = [id2aggregation[0] for x in range(logits.shape[0])]
return predicted_answer_coordinates, aggregation_predictions_string
#show results
def show_results(model, logits, logits_aggregation, queries):
predicted_answer_coordinates, aggregation_predictions_string = process_results(model, logits, logits_aggregation)
answers = []
for coordinates in predicted_answer_coordinates:
if len(coordinates) == 1:
# only a single cell:
answers.append(table.iat[coordinates[0]])
else:
# multiple cells
cell_values = []
for coordinate in coordinates:
cell_values.append(table.iat[coordinate])
answers.append(", ".join(cell_values))
display(table)
print("")
for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
print(query)
if predicted_agg == "NONE":
print("Predicted answer: " + answer)
else:
print("Predicted answer: " + predicted_agg + " > " + answer)
import collections
import numpy as np
def compute_prediction_sequence(model, data, device):
"""Computes predictions using model's answers to the previous questions."""
# prepare data
input_ids = data["input_ids"].to(device)
attention_mask = data["attention_mask"].to(device)
token_type_ids = data["token_type_ids"].to(device)
all_logits = []
prev_answers = None
num_batch = data["input_ids"].shape[0]
for idx in range(num_batch):
if prev_answers is not None:
coords_to_answer = prev_answers[idx]
# Next, set the label ids predicted by the model
prev_label_ids_example = token_type_ids_example[:,3] # shape (seq_len,)
model_label_ids = np.zeros_like(prev_label_ids_example.cpu().numpy()) # shape (seq_len,)
# for each token in the sequence:
token_type_ids_example = token_type_ids[idx] # shape (seq_len, 7)
for i in range(model_label_ids.shape[0]):
segment_id = token_type_ids_example[:,0].tolist()[i]
col_id = token_type_ids_example[:,1].tolist()[i] - 1
row_id = token_type_ids_example[:,2].tolist()[i] - 1
if row_id >= 0 and col_id >= 0 and segment_id == 1:
model_label_ids[i] = int(coords_to_answer[(col_id, row_id)])
# set the prev label ids of the example (shape (1, seq_len) )
token_type_ids_example[:,3] = torch.from_numpy(model_label_ids).type(torch.long).to(device)
prev_answers = {}
# get the example
input_ids_example = input_ids[idx] # shape (seq_len,)
attention_mask_example = attention_mask[idx] # shape (seq_len,)
token_type_ids_example = token_type_ids[idx] # shape (seq_len, 7)
# forward pass to obtain the logits
outputs = model(input_ids=input_ids_example.unsqueeze(0),
attention_mask=attention_mask_example.unsqueeze(0),
token_type_ids=token_type_ids_example.unsqueeze(0))
logits = outputs.logits
all_logits.append(logits)
# convert logits to probabilities (which are of shape (1, seq_len))
dist_per_token = torch.distributions.Bernoulli(logits=logits)
probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to(dist_per_token.probs.device)
# Compute average probability per cell, aggregating over tokens.
# Dictionary maps coordinates to a list of one or more probabilities
coords_to_probs = collections.defaultdict(list)
prev_answers = {}
for i, p in enumerate(probabilities.squeeze().tolist()):
segment_id = token_type_ids_example[:,0].tolist()[i]
col = token_type_ids_example[:,1].tolist()[i] - 1
row = token_type_ids_example[:,2].tolist()[i] - 1
if col >= 0 and row >= 0 and segment_id == 1:
coords_to_probs[(col, row)].append(p)
# Next, map cell coordinates to 1 or 0 (depending on whether the mean prob of all cell tokens is > 0.5)
coords_to_answer = {}
for key in coords_to_probs:
coords_to_answer[key] = np.array(coords_to_probs[key]).mean() > 0.5
prev_answers[idx+1] = coords_to_answer
logits_batch = torch.cat(tuple(all_logits), 0)
return logits_batch
This data is fake and it's just for explanatory purposes
data = {'Company': ["JACKSON, INC.", "RETRO COMP, CORP.", "DEEPAI, Inc."],
'Revenue': ["5600000", "45000000", "59000000"],
'Share Percentage': ["1%", "2%", "3%"],
'Founding Date': ["7 february 1967", "10 june 1996", "28 november 1967"]}
queries = ["Which is the Company where Founding Date is 7 february 1967",
"What is the max Revenue",
"What is the lowest Share Percentage",
"What are the Founding Dates",
"What is the min Share Percentage",
"What is the max Revenue",
"How many companies are there"]
table = pd.DataFrame.from_dict(data)
import torch
# GPU training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
TapasForQuestionAnswering( (tapas): TapasModel( (embeddings): TapasEmbeddings( (word_embeddings): Embedding(30522, 768, padding_idx=0) (position_embeddings): Embedding(1024, 768) (token_type_embeddings_0): Embedding(3, 768) (token_type_embeddings_1): Embedding(256, 768) (token_type_embeddings_2): Embedding(256, 768) (token_type_embeddings_3): Embedding(2, 768) (token_type_embeddings_4): Embedding(256, 768) (token_type_embeddings_5): Embedding(256, 768) (token_type_embeddings_6): Embedding(10, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): TapasEncoder( (layer): ModuleList( (0): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): TapasLayer( (attention): TapasAttention( (self): TapasSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): TapasSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): TapasIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): TapasOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): TapasPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) (dropout): Dropout(p=0.1, inplace=False) (aggregation_classifier): Linear(in_features=768, out_features=6, bias=True) )
inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt")
logits = compute_prediction_sequence(model, inputs, device)
device = torch.device("cpu")
model.to(device)
outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"])
predicted_answer_coordinates, = tokenizer.convert_logits_to_predictions(inputs, logits.cpu().detach())
# Some models (example, trained with SQA mechanism) don't have aggregations.
has_aggregation = "logits_aggregation" in outputs
print(f"Does it has logits for aggregations? {str(has_aggregation)}")
show_results(model, outputs["logits"].detach(), outputs["logits_aggregation"].detach() if has_aggregation else None, queries)
Does it has logits for aggregations? True [0, 4, 5, 0, 5, 4, 3]
Company | Revenue | Share Percentage | Founding Date | |
---|---|---|---|---|
0 | JACKSON, INC. | 5600000 | 1% | 7 february 1967 |
1 | RETRO COMP, CORP. | 45000000 | 2% | 10 june 1996 |
2 | DEEPAI, Inc. | 59000000 | 3% | 28 november 1967 |
Which is the Company where Founding Date is 7 february 1967 Predicted answer: JACKSON, INC. What is the max Revenue Predicted answer: MAX > 5600000, 45000000, 59000000 What is the lowest Share Percentage Predicted answer: MIN > 1%, 2%, 3% What are the Founding Dates Predicted answer: 7 february 1967, 10 june 1996, 28 november 1967 What is the min Share Percentage Predicted answer: MIN > 1%, 2%, 3% What is the max Revenue Predicted answer: MAX > 5600000, 45000000, 59000000 How many companies are there Predicted answer: COUNT > JACKSON, INC., RETRO COMP, CORP., DEEPAI, Inc.
MODEL_NAME = 'tapas_jsl'
model.save_pretrained(save_directory=MODEL_NAME,
is_main_process=True, state_dict=model.state_dict())
! rm -Rf google
from transformers import TFTapasForQuestionAnswering
#Auxiliary class for exporting TF graph from HF
class JSLTapas(TFTapasForQuestionAnswering):
@tf.function(
input_signature=[
{
"input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
"token_type_ids": tf.TensorSpec((None, None, 7), tf.int32, name="token_type_ids"),
}
]
)
def serving(self, inputs):
outputs = self.call(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
token_type_ids=inputs["token_type_ids"]
)
if not self._has_logits:
outputs.logits_aggregation = tf.zeros((tf.shape(outputs.logits)[0], 1))
return self.serving_output(outputs)
loaded_model = TFTapasForQuestionAnswering.from_pretrained(MODEL_NAME, from_pt=True)
All PyTorch model weights were used when initializing TFTapasForQuestionAnswering. All the weights of TFTapasForQuestionAnswering were initialized from the PyTorch model. If your task is similar to the task the model of the checkpoint was trained on, you can already use TFTapasForQuestionAnswering for predictions without further training.
jsl_model = JSLTapas.from_pretrained(MODEL_NAME, from_pt=True)
jsl_model._has_logits = has_aggregation
All PyTorch model weights were used when initializing JSLTapas. All the weights of JSLTapas were initialized from the PyTorch model. If your task is similar to the task the model of the checkpoint was trained on, you can already use JSLTapas for predictions without further training.
TF_TMP_LOCATION="tmp"
#Export graph
tf.saved_model.save(jsl_model, TF_TMP_LOCATION, signatures={
"serving_default": jsl_model.serving
})
#Save voculary
tokenizer.save_vocabulary(f"{TF_TMP_LOCATION}/assets")
WARNING:absl:Found untraced functions such as compute_column_logits_layer_call_fn, compute_column_logits_layer_call_and_return_conditional_losses, embeddings_layer_call_fn, embeddings_layer_call_and_return_conditional_losses, encoder_layer_call_fn while saving (showing 5 of 422). These functions will not be directly callable after loading.
('tmp/assets/vocab.txt',)
! pip install johnsnowlabs
from johnsnowlabs import nlp, finance
nlp.install(force_browser=True)
spark = nlp.start()
Spark Session already created, some configs may not take.
table
Company | Revenue | Share Percentage | Founding Date | |
---|---|---|---|---|
0 | JACKSON, INC. | 5600000 | 1% | 7 february 1967 |
1 | RETRO COMP, CORP. | 45000000 | 2% | 10 june 1996 |
2 | DEEPAI, Inc. | 59000000 | 3% | 28 november 1967 |
queries
['Which is the Company where Founding Date is 7 february 1967', 'What is the max Revenue', 'What is the lowest Share Percentage', 'What are the Founding Dates', 'What is the min Share Percentage', 'What is the max Revenue', 'How many companies are there']
json_data = """
{
"header": ["Company", "Revenue", "Share Percentage", "Founding Date"],
"rows": [
["JACKSON, INC.", "56000000", "1%", "7 february 1967"],
["RETRO COMP, CORP.", "450000000", "2%", "10 june 1996"],
["DEEPAI, Inc.", "590000000", "3%", "28 november 1967"],
]
}
"""
queries = ["Which is the Company where Founding Date is 7 february 1967",
"What is the max Revenue",
"What is the lowest Share Percentage",
"What are the Founding Dates",
"What is the min Share Percentage",
"What is the max Revenue",
"How many companies are there"]
data = spark.createDataFrame([
[json_data, " ".join(queries)]
]).toDF("table_json", "questions")
data.show()
+--------------------+--------------------+ | table_json| questions| +--------------------+--------------------+ | { "header": ["...|Which is the Comp...| +--------------------+--------------------+
SPARKNLP_MODEL_LOCATION = "tapas_jsl_spark_nlp"
MODEL_NAME = "google/tapas-base"
tokenizer = TapasTokenizer.from_pretrained(MODEL_NAME)
case_sensitive = not tokenizer.do_lower_case
nlp.TapasForQuestionAnswering\
.loadSavedModel(TF_TMP_LOCATION, spark)\
.setCaseSensitive(case_sensitive)\
.write().overwrite()\
.save(SPARKNLP_MODEL_LOCATION)
document_assembler = nlp.MultiDocumentAssembler() \
.setInputCols("table_json", "questions") \
.setOutputCols("document_table", "document_questions")
text_splitter = finance.TextSplitter() \
.setInputCols(["document_questions"]) \
.setOutputCol("questions")
table_assembler = nlp.TableAssembler()\
.setInputCols(["document_table"])\
.setOutputCol("table")
tapas = nlp.TapasForQuestionAnswering\
.load(SPARKNLP_MODEL_LOCATION)\
.setInputCols(["questions", "table"])\
.setOutputCol("answers")
pipeline = nlp.Pipeline(stages=[
document_assembler,
text_splitter,
table_assembler,
tapas
])
fit_model = pipeline.fit(data)
fit_model\
.transform(data)\
.selectExpr("explode(answers) AS answer")\
.select("answer")\
.show(truncate=False)