#!/usr/bin/env python
# coding: utf-8

# # Notebook [1]: First steps with cdQA
#
# This notebook shows how to use the `cdQA` pipeline to perform question
# answering on a custom dataset.
#
# ***Note:*** *If you are using colab, you will need to install `cdQA` by
# executing `!pip install cdqa` in a cell.*

# In[1]:

import os
import pandas as pd
from ast import literal_eval

from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline


# ### Download pre-trained reader model and example dataset

# In[2]:

from cdqa.utils.download import download_model, download_bnpp_data

# Both helpers are given a target directory; the paths used further down
# (the CSV file and the reader `.joblib`) are expected to exist there once
# these calls return -- NOTE(review): file names depend on the installed
# cdQA version, confirm if the later steps fail with "file not found".
download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
download_model(model='bert-squad_1.1', dir='./models')


# ### Visualize the dataset

# In[3]:

# The 'paragraphs' column is stored as a Python literal in the CSV, so each
# cell is parsed back into a Python object with `ast.literal_eval`.
df = pd.read_csv(
    './data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
    converters={'paragraphs': literal_eval},
)
df = filter_paragraphs(df)

# A bare `df.head()` only renders inside a notebook; print it explicitly so
# the preview is also shown when this file is run as a plain script.
print(df.head())


# ### Instantiate the cdQA pipeline from a pre-trained reader model

# In[4]:

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib')
cdqa_pipeline.fit_retriever(df=df)


# ### Execute a query

# In[5]:

query = 'Since when does the Excellence Program of BNP Paribas exist?'
prediction = cdqa_pipeline.predict(query)


# ### Explore predictions

# In[6]:

# `prediction` is indexed positionally: answer, document title, paragraph.
# Indexing (rather than tuple-unpacking) keeps this working if the pipeline
# returns additional trailing fields.
print('query: {}'.format(query))
print('answer: {}'.format(prediction[0]))
print('title: {}'.format(prediction[1]))
print('paragraph: {}'.format(prediction[2]))