from IPython.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.retrievers import EnsembleRetriever, WeaviateHybridSearchRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
import weaviate
import os, torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device found: {DEVICE}')
_ = load_dotenv()
os.environ['OPENAI_API_KEY']=os.getenv('OPENAI_API_KEY')
Device found: cpu
# PDF file name
pdf_file = 'Llama3 paper.pdf'
# Chunk size and chunk overlap for text splitter
chunk_size = 2500
chunk_overlap = 100
# Number of results (k) to return from the BM25 and vector-store retrievers
return_k = 2
# Constants for the Embedding model
DEVICE = 'cpu' # overrides the torch device detected above; accepts 'cuda' or 'cpu'
HF_MODEL = './models/all-MiniLM-L6-v2' # Path where the embedding model is stored locally
HF_MODEL_PULL = 'sentence-transformers/all-MiniLM-L6-v2' # Model to pull from the Hub if the local copy is not found
HF_MODEL_KWARGS = {'device': DEVICE} # Model kwargs for the embedding model
HF_ENCODE_KWARGS = {'normalize_embeddings': True} # Encode kwargs for the embedding
QUERY1 = "What is the extended context length of Llama-3-8B-Instruct?"
QUERY2 = "What was the Zero-shot performance on MMLU?"
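# HF_MODEL points at a locally saved copy of the embedding model, and HF_MODEL_PULL is the
# Hub fallback used later if that local path cannot be loaded. An optional, one-time sketch
# for creating the local copy (assumes the sentence-transformers package is installed):
# from sentence_transformers import SentenceTransformer
# SentenceTransformer(HF_MODEL_PULL).save(HF_MODEL)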
# Read data from the PDF
loader = PyPDFLoader(pdf_file)
docs = loader.load_and_split()
print(f'The total pages loaded as content: {len(docs)}')
The total pages loaded as content: 6
# # Split the content into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
#                                                chunk_overlap=chunk_overlap)
# docs = text_splitter.split_documents(docs)
# print(f'Number of document chunks created: {len(docs)}')
# Initialize the BM25 retriever
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = return_k
# Invoke the response
response = bm25_retriever.invoke(QUERY1)
for res in response:
    print(res.page_content[0:500], '\n\n===========================')
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
3K 6K 9K 11K 14K 16K 21K 26K 31K 36K Context Length0.00.20.40.60.81.0Accuracy Llama-3-8B-Instruct Llama-3-8B-Instruct-262k Llama-3-8B-Instruct-80K-QLoRAFigure 2: The accuracy of Topic Retrieval task. Model Single-Doc Multi-Doc Summ. Few-Shot Synthetic Code Avg Llama-3-8B-Instruct 37.33 36.04 26.83 69.56 37.75 53.24 43.20 Llama-3-8B-Instruct-262K 37.29 31.20 26.18 67.25 44.25 62.71 43.73 Llama-3-8B-Instruct-80K-QLoRA 43.57 43.07 28.93 69.15 48.50 51.95 47.19 Table 1: Evaluation results on LongBen

===========================
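# BM25Retriever is a purely lexical ranker built on the rank-bm25 package: pages are scored by
# term overlap with the query, which is why the abstract containing "context length" surfaces
# first. A rough standalone sketch of the underlying scoring (assumes rank-bm25 is installed;
# the whitespace tokenizer is a simplification of the retriever's actual preprocessing):
from rank_bm25 import BM25Okapi

tokenized_pages = [d.page_content.lower().split() for d in docs]
bm25_index = BM25Okapi(tokenized_pages)
page_scores = bm25_index.get_scores(QUERY1.lower().split())
print(sorted(page_scores, reverse=True)[:return_k])  # lexical relevance scores of the two best pages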
# Initialize HuggingFace SentenceTransformerEmbeddings with specified model and arguments
try:
    embeddings = SentenceTransformerEmbeddings(
        model_name=HF_MODEL,             # The locally saved HuggingFace model to use for embeddings
        model_kwargs=HF_MODEL_KWARGS,    # Additional keyword arguments for the model
        encode_kwargs=HF_ENCODE_KWARGS   # Keyword arguments for the encoding process
    )
except Exception:
    # Fall back to pulling the model from the HuggingFace Hub if the local copy fails to load
    embeddings = SentenceTransformerEmbeddings(
        model_name=HF_MODEL_PULL,        # HuggingFace model to pull for embeddings
        model_kwargs=HF_MODEL_KWARGS,    # Additional keyword arguments for the model
        encode_kwargs=HF_ENCODE_KWARGS   # Keyword arguments for the encoding process
    )
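# Quick sanity check of the embedding model (an illustrative addition, not required by the rest
# of the pipeline): all-MiniLM-L6-v2 maps text to 384-dimensional vectors, and embed_query is
# the standard LangChain embeddings call.
query_vector = embeddings.embed_query(QUERY1)
print(f'Embedding dimension: {len(query_vector)}')  # expected: 384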
# Initialize a Chroma vector database
chroma_vectordb = Chroma.from_documents(
    documents=docs,       # The documents to be embedded and indexed
    embedding=embeddings, # The SentenceTransformerEmbeddings model defined above
)
chroma_retriever = chroma_vectordb.as_retriever(search_kwargs={'k': return_k})
# Get the similar texts based on a query provided - to check
results_with_scores = chroma_vectordb.similarity_search_with_score(QUERY1, k=return_k)
for doc, score in results_with_scores:
    print(doc.page_content[0:500] + '\n\n===========================')
3K 6K 9K 11K 14K 16K 21K 26K 31K 36K Context Length0.00.20.40.60.81.0Accuracy Llama-3-8B-Instruct Llama-3-8B-Instruct-262k Llama-3-8B-Instruct-80K-QLoRAFigure 2: The accuracy of Topic Retrieval task. Model Single-Doc Multi-Doc Summ. Few-Shot Synthetic Code Avg Llama-3-8B-Instruct 37.33 36.04 26.83 69.56 37.75 53.24 43.20 Llama-3-8B-Instruct-262K 37.29 31.20 26.18 67.25 44.25 62.71 43.73 Llama-3-8B-Instruct-80K-QLoRA 43.57 43.07 28.93 69.15 48.50 51.95 47.19 Table 1: Evaluation results on LongBen

===========================
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
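# The Chroma store above lives in memory and is re-embedded on every run. An optional sketch for
# persisting it to disk and reloading it later (persist_directory and the './chroma_db' path are
# illustrative choices, not part of the original setup):
# chroma_vectordb = Chroma.from_documents(documents=docs, embedding=embeddings,
#                                         persist_directory='./chroma_db')
# chroma_vectordb = Chroma(persist_directory='./chroma_db', embedding_function=embeddings)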
# Initialize an ensemble retriever combining the BM25 and Chroma retrievers defined above
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, chroma_retriever], weights=[0.6, 0.4]
)
# Get the retrieval for the QUERY1
response = ensemble_retriever.invoke(QUERY1)
for res in response:
    print(res.page_content[0:500], '\n\n===========================')
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
3K 6K 9K 11K 14K 16K 21K 26K 31K 36K Context Length0.00.20.40.60.81.0Accuracy Llama-3-8B-Instruct Llama-3-8B-Instruct-262k Llama-3-8B-Instruct-80K-QLoRAFigure 2: The accuracy of Topic Retrieval task. Model Single-Doc Multi-Doc Summ. Few-Shot Synthetic Code Avg Llama-3-8B-Instruct 37.33 36.04 26.83 69.56 37.75 53.24 43.20 Llama-3-8B-Instruct-262K 37.29 31.20 26.18 67.25 44.25 62.71 43.73 Llama-3-8B-Instruct-80K-QLoRA 43.57 43.07 28.93 69.15 48.50 51.95 47.19 Table 1: Evaluation results on LongBen

===========================
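# The EnsembleRetriever fuses the two ranked lists with weighted Reciprocal Rank Fusion: each
# document scores sum(weight / (rank + c)) across the retrievers that returned it (the RRF
# constant c defaults to 60), so a page ranked highly by either BM25 or Chroma rises to the top.
# A standalone sketch of the idea -- this helper is illustrative, not LangChain's internal code:
def weighted_rrf(rankings, weights, c=60):
    scores = {}
    for ranked_ids, weight in zip(rankings, weights):
        for rank, doc_id in enumerate(ranked_ids, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + weight / (rank + c)
    return sorted(scores, key=scores.get, reverse=True)

# e.g. BM25 ranks pages [0, 2] and Chroma ranks [2, 0]; with weights [0.6, 0.4] the
# BM25-preferred page 0 edges ahead of page 2.
print(weighted_rrf([[0, 2], [2, 0]], weights=[0.6, 0.4]))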
# Get the retrieval for the QUERY2
response = ensemble_retriever.invoke(QUERY2)
for res in response:
    print(res.page_content[:500], '\n\n===========================')
3K 6K 9K 11K 14K 16K 21K 26K 31K 36K Context Length0.00.20.40.60.81.0Accuracy Llama-3-8B-Instruct Llama-3-8B-Instruct-262k Llama-3-8B-Instruct-80K-QLoRAFigure 2: The accuracy of Topic Retrieval task. Model Single-Doc Multi-Doc Summ. Few-Shot Synthetic Code Avg Llama-3-8B-Instruct 37.33 36.04 26.83 69.56 37.75 53.24 43.20 Llama-3-8B-Instruct-262K 37.29 31.20 26.18 67.25 44.25 62.71 43.73 Llama-3-8B-Instruct-80K-QLoRA 43.57 43.07 28.93 69.15 48.50 51.95 47.19 Table 1: Evaluation results on LongBen

===========================
800014315 20631 26947 33263 39578 45894 52210 58526 64842 71157 77473 83789 90105 96421102736 109052 115368 121684 128000 Context Length0 11 22 33 44 55 66 77 88 100Depth Percent1.0Needle In A HayStack 12345678910 Accuracy Score from GPT3.5Figure 1: The accuracy score of Llama-3-8B-Instruct-80K-QLoRA on Needle-In-A-HayStack task. The blue vertical line indicates the training length, i.e. 80K. the same cluster to form each heterogeneous context. Therefore, the grouped texts share some semantic si

===========================
I. Molybog, Y . Nie, A. Poulton, J. Reizenstein, R. Rungta, K. Saladi, A. Schelten, R. Silva, E. M. Smith, R. Subramanian, X. E. Tan, B. Tang, R. Taylor, A. Williams, J. X. Kuan, P. Xu, Z. Yan, I. Zarov, Y . Zhang, A. Fan, M. Kambadur, S. Narang, A. Rodriguez, R. Stojnic, S. Edunov, and T. Scialom. Llama 2: Open foundation and fine-tuned chat models, 2023. [16] P. Zhang, Z. Liu, S. Xiao, N. Shao, Q. Ye, and Z. Dou. Soaring from 4k to 400k: Extending llm’s context with activation beacon, 2024. [1

===========================
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type='stuff',
                                 retriever=ensemble_retriever)
qa.invoke(QUERY1)
{'query': 'What is the extended context length of Llama-3-8B-Instruct?', 'result': 'The extended context length of Llama-3-8B-Instruct is 80K.'}
print(llm.invoke(QUERY1).content) ## Hallucinating
The extended context length of Llama-3-8B-Instruct is 4096 tokens.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type='stuff',
                                 retriever=ensemble_retriever)
qa.invoke(QUERY2)
{'query': 'What was the Zero-shot performance on MMLU?', 'result': 'The zero-shot performance on MMLU for the Llama-3-8B-Instruct model was as follows:\n- STEM: 53.87%\n- Social: 75.66%\n- Humanities: 69.44%\n- Others: 69.75%\n- Average: 65.91%\n\nFor the Llama-3-8B-Instruct-262K model, the performance was:\n- STEM: 52.10%\n- Social: 73.26%\n- Humanities: 67.15%\n- Others: 69.80%\n- Average: 64.34%\n\nAnd for the Llama-3-8B-Instruct-80K-QLoRA model, the performance was:\n- STEM: 53.10%\n- Social: 73.24%\n- Humanities: 67.32%\n- Others: 68.79%\n- Average: 64.44%'}
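# To check answers such as the MMLU numbers above against the retrieved text, the chain can also
# return its source chunks. A small sketch using the standard return_source_documents option of
# RetrievalQA (the variable names here are illustrative):
qa_with_sources = RetrievalQA.from_chain_type(llm=llm,
                                              chain_type='stuff',
                                              retriever=ensemble_retriever,
                                              return_source_documents=True)
output = qa_with_sources.invoke(QUERY2)
print(output['result'])
for src in output['source_documents']:
    print(src.metadata.get('page'), src.page_content[:100])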
# Weaviate v4 connection - commented out because LangChain's WeaviateHybridSearchRetriever still requires a v3 client.
# https://github.com/langchain-ai/langchain/issues/18809
# client = weaviate.connect_to_wcs(
# cluster_url=os.getenv('WEAVIATE_URL'), # WCS URL
# auth_credentials=weaviate.auth.AuthApiKey(os.getenv('WEAVIATE_API_KEY')), # WCS key
# headers={'X-OpenAI-Api-key': os.getenv('OPENAI_API_KEY')} # OpenAI API key
# )
# Get the Weaviate URL
WEAVIATE_URL = os.getenv('WEAVIATE_URL')
# Create an AuthApiKey object using the Weaviate API key
auth_client_secret = weaviate.AuthApiKey(api_key=os.getenv('WEAVIATE_API_KEY'))
# Create a Weaviate client object
client = weaviate.Client(
    url=WEAVIATE_URL,                                                       # Set the Weaviate URL
    additional_headers={'X-OpenAI-Api-Key': os.getenv('OPENAI_API_KEY')},   # Additional headers for the client
    auth_client_secret=auth_client_secret                                   # Authenticate using the API key
)
D:\Pratik Sharma\Python Individual Envs\Hybrid Search\hs\lib\site-packages\weaviate\__init__.py:128: DeprecationWarning: Dep010: Importing AuthApiKey from weaviate is deprecated. Please import it from its specific module: weaviate.auth
  _Warnings.root_module_import(name, map_[name])
D:\Pratik Sharma\Python Individual Envs\Hybrid Search\hs\lib\site-packages\weaviate\warnings.py:158: DeprecationWarning: Dep016: You are creating a Weaviate v3 client using `client = weaviate.Client(...)`, which is deprecated. Consider creating a v4 (`weaviate.WeaviateClient`) client, using a `weaviate.connect_to_<method>` helper function. See here for
  - migrating from v3 to v4: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
  - general v4 usage: https://weaviate.io/developers/weaviate/client-libraries/python
  warnings.warn(
# Weaviate hybrid search retriever
weaviate_retriever = WeaviateHybridSearchRetriever(
    alpha=0.5,               # Defaults to 0.5, i.e. equal weighting between keyword and semantic search
    client=client,           # The Weaviate v3 client created above
    index_name="LangChain",  # The name of the index (Weaviate class) to use
    text_key="text",         # The property that stores the raw text
    attributes=[],           # Additional attributes to return with the results
    k=2                      # Number of results to return
)
_ = weaviate_retriever.add_documents(docs)
# Get the retrieval for the query using Weaviate
response = weaviate_retriever.invoke(QUERY1)
for res in response:
    print(res.page_content[0:500], '\n\n===========================')
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
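# alpha controls the keyword/vector balance of Weaviate's hybrid search: alpha=0 is pure keyword
# (BM25) search, alpha=1 is pure vector search, and 0.5 weights them equally. A small sketch
# comparing the two extremes against the same index; the parameters mirror the retriever above:
for alpha in (0.0, 1.0):
    retriever = WeaviateHybridSearchRetriever(
        alpha=alpha,
        client=client,
        index_name="LangChain",
        text_key="text",
        attributes=[],
        k=return_k
    )
    print(f'alpha={alpha}:', retriever.invoke(QUERY1)[0].page_content[:100], '\n')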
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type='stuff',
                                 retriever=weaviate_retriever)
qa.invoke(QUERY1)
{'query': 'What is the extended context length of Llama-3-8B-Instruct?', 'result': 'The extended context length of Llama-3-8B-Instruct is 80K.'}
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type='stuff',
                                 retriever=weaviate_retriever)
qa.invoke(QUERY2)
{'query': 'What was the Zero-shot performance on MMLU?', 'result': 'The zero-shot performance on MMLU for Llama-3-8B-Instruct was 65.91, for Llama-3-8B-Instruct-262K was 64.34, and for Llama-3-8B-Instruct-80K-QLoRA was 64.44.'}
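# Note: every add_documents call appends new objects to the "LangChain" class, which is why the
# hybrid retriever above can return the same page twice. An optional cleanup step using the v3
# client's schema API to drop the class between runs (this deletes all indexed data, so use with care):
# client.schema.delete_class('LangChain')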