from IPython.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.retrievers import EnsembleRetriever, WeaviateHybridSearchRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
import weaviate
import os, torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device found: {DEVICE}')
_ = load_dotenv()
os.environ['OPENAI_API_KEY']=os.getenv('OPENAI_API_KEY')
Device found: cpu
# PDF file name
pdf_file = 'Llama3 paper.pdf'
# Chunk size and chunk overlap for text splitter
chunk_size = 2500
chunk_overlap = 100
# Number of results (k) to return from the BM25 and vector-store retrievers
return_k = 2
# Constants for the Embedding model
DEVICE = 'cpu' # overrides the torch device detected above; accepts 'cuda' or 'cpu'
HF_MODEL = './models/all-MiniLM-L6-v2' # Path where the embedding model is stored locally
HF_MODEL_PULL = 'sentence-transformers/all-MiniLM-L6-v2' # Model to pull from the Hub if the local copy is not found
HF_MODEL_KWARGS = {'device': DEVICE} # Model kwargs for the embedding model
HF_ENCODE_KWARGS = {'normalize_embeddings': True} # Encode kwargs for the embedding
QUERY1 = "What is the extended context length of Llama-3-8B-Instruct?"
QUERY2 = "What was the Zero-shot performance on MMLU?"
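# HF_MODEL points at a locally saved copy of the embedding model, and HF_MODEL_PULL is the
# Hub fallback used later if that local path cannot be loaded. An optional, one-time sketch
# for creating the local copy (assumes the sentence-transformers package is installed):
# from sentence_transformers import SentenceTransformer
# SentenceTransformer(HF_MODEL_PULL).save(HF_MODEL)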
# Read data from the PDF
loader = PyPDFLoader(pdf_file)
docs = loader.load_and_split()
print(f'The total pages loaded as content: {len(docs)}')
The total pages loaded as content: 6
# # Split the content into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
#                                                chunk_overlap=chunk_overlap)
# docs = text_splitter.split_documents(docs)
# print(f'Number of document chunks created: {len(docs)}')
# Initialize the BM25 retriever
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = return_k
# Invoke the response
response = bm25_retriever.invoke(QUERY1)
for res in response:
    print(res.page_content[0:500], '\n\n===========================')
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
3K 6K 9K 11K 14K 16K 21K 26K 31K 36K Context Length0.00.20.40.60.81.0Accuracy Llama-3-8B-Instruct Llama-3-8B-Instruct-262k Llama-3-8B-Instruct-80K-QLoRAFigure 2: The accuracy of Topic Retrieval task. Model Single-Doc Multi-Doc Summ. Few-Shot Synthetic Code Avg Llama-3-8B-Instruct 37.33 36.04 26.83 69.56 37.75 53.24 43.20 Llama-3-8B-Instruct-262K 37.29 31.20 26.18 67.25 44.25 62.71 43.73 Llama-3-8B-Instruct-80K-QLoRA 43.57 43.07 28.93 69.15 48.50 51.95 47.19 Table 1: Evaluation results on LongBen

===========================
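# BM25Retriever is a purely lexical ranker built on the rank-bm25 package: pages are scored by
# term overlap with the query, which is why the abstract containing "context length" surfaces
# first. A rough standalone sketch of the underlying scoring (assumes rank-bm25 is installed;
# the whitespace tokenizer is a simplification of the retriever's actual preprocessing):
from rank_bm25 import BM25Okapi

tokenized_pages = [d.page_content.lower().split() for d in docs]
bm25_index = BM25Okapi(tokenized_pages)
page_scores = bm25_index.get_scores(QUERY1.lower().split())
print(sorted(page_scores, reverse=True)[:return_k])  # lexical relevance scores of the two best pages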
# Initialize HuggingFace SentenceTransformerEmbeddings with specified model and arguments
try:
    embeddings = SentenceTransformerEmbeddings(
        model_name=HF_MODEL,             # The locally saved HuggingFace model to use for embeddings
        model_kwargs=HF_MODEL_KWARGS,    # Additional keyword arguments for the model
        encode_kwargs=HF_ENCODE_KWARGS   # Keyword arguments for the encoding process
    )
except Exception:
    # Fall back to pulling the model from the HuggingFace Hub if the local copy fails to load
    embeddings = SentenceTransformerEmbeddings(
        model_name=HF_MODEL_PULL,        # HuggingFace model to pull for embeddings
        model_kwargs=HF_MODEL_KWARGS,    # Additional keyword arguments for the model
        encode_kwargs=HF_ENCODE_KWARGS   # Keyword arguments for the encoding process
    )
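# Quick sanity check of the embedding model (an illustrative addition, not required by the rest
# of the pipeline): all-MiniLM-L6-v2 maps text to 384-dimensional vectors, and embed_query is
# the standard LangChain embeddings call.
query_vector = embeddings.embed_query(QUERY1)
print(f'Embedding dimension: {len(query_vector)}')  # expected: 384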
# Initialize a Chroma vector database
chroma_vectordb = Chroma.from_documents(
    documents=docs,       # The documents to be embedded and indexed
    embedding=embeddings, # The SentenceTransformerEmbeddings model defined above
)
chroma_retriever = chroma_vectordb.as_retriever(search_kwargs={'k': return_k})
# Get the similar texts based on a query provided - to check
results_with_scores = chroma_vectordb.similarity_search_with_score(QUERY1, k=return_k)
for doc, score in results_with_scores:
    print(doc.page_content[0:500] + '\n\n===========================')
3K 6K 9K 11K 14K 16K 21K 26K 31K 36K Context Length0.00.20.40.60.81.0Accuracy Llama-3-8B-Instruct Llama-3-8B-Instruct-262k Llama-3-8B-Instruct-80K-QLoRAFigure 2: The accuracy of Topic Retrieval task. Model Single-Doc Multi-Doc Summ. Few-Shot Synthetic Code Avg Llama-3-8B-Instruct 37.33 36.04 26.83 69.56 37.75 53.24 43.20 Llama-3-8B-Instruct-262K 37.29 31.20 26.18 67.25 44.25 62.71 43.73 Llama-3-8B-Instruct-80K-QLoRA 43.57 43.07 28.93 69.15 48.50 51.95 47.19 Table 1: Evaluation results on LongBen

===========================
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
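# The Chroma store above lives in memory and is re-embedded on every run. An optional sketch for
# persisting it to disk and reloading it later (persist_directory and the './chroma_db' path are
# illustrative choices, not part of the original setup):
# chroma_vectordb = Chroma.from_documents(documents=docs, embedding=embeddings,
#                                         persist_directory='./chroma_db')
# chroma_vectordb = Chroma(persist_directory='./chroma_db', embedding_function=embeddings)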
# Initialize an ensemble retriever combining the BM25 and Chroma retrievers defined above
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, chroma_retriever], weights=[0.6, 0.4]
)
# Get the retrieval for the QUERY1
response = ensemble_retriever.invoke(QUERY1)
for res in response:
    print(res.page_content[0:500], '\n\n===========================')
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
3K 6K 9K 11K 14K 16K 21K 26K 31K 36K Context Length0.00.20.40.60.81.0Accuracy Llama-3-8B-Instruct Llama-3-8B-Instruct-262k Llama-3-8B-Instruct-80K-QLoRAFigure 2: The accuracy of Topic Retrieval task. Model Single-Doc Multi-Doc Summ. Few-Shot Synthetic Code Avg Llama-3-8B-Instruct 37.33 36.04 26.83 69.56 37.75 53.24 43.20 Llama-3-8B-Instruct-262K 37.29 31.20 26.18 67.25 44.25 62.71 43.73 Llama-3-8B-Instruct-80K-QLoRA 43.57 43.07 28.93 69.15 48.50 51.95 47.19 Table 1: Evaluation results on LongBen

===========================
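# The EnsembleRetriever fuses the two ranked lists with weighted Reciprocal Rank Fusion: each
# document scores sum(weight / (rank + c)) across the retrievers that returned it (the RRF
# constant c defaults to 60), so a page ranked highly by either BM25 or Chroma rises to the top.
# A standalone sketch of the idea -- this helper is illustrative, not LangChain's internal code:
def weighted_rrf(rankings, weights, c=60):
    scores = {}
    for ranked_ids, weight in zip(rankings, weights):
        for rank, doc_id in enumerate(ranked_ids, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + weight / (rank + c)
    return sorted(scores, key=scores.get, reverse=True)

# e.g. BM25 ranks pages [0, 2] and Chroma ranks [2, 0]; with weights [0.6, 0.4] the
# BM25-preferred page 0 edges ahead of page 2.
print(weighted_rrf([[0, 2], [2, 0]], weights=[0.6, 0.4]))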
# Get the retrieval for the QUERY2
response = ensemble_retriever.invoke(QUERY2)
for res in response:
    print(res.page_content[:500], '\n\n===========================')
3K 6K 9K 11K 14K 16K 21K 26K 31K 36K Context Length0.00.20.40.60.81.0Accuracy Llama-3-8B-Instruct Llama-3-8B-Instruct-262k Llama-3-8B-Instruct-80K-QLoRAFigure 2: The accuracy of Topic Retrieval task. Model Single-Doc Multi-Doc Summ. Few-Shot Synthetic Code Avg Llama-3-8B-Instruct 37.33 36.04 26.83 69.56 37.75 53.24 43.20 Llama-3-8B-Instruct-262K 37.29 31.20 26.18 67.25 44.25 62.71 43.73 Llama-3-8B-Instruct-80K-QLoRA 43.57 43.07 28.93 69.15 48.50 51.95 47.19 Table 1: Evaluation results on LongBen

===========================
800014315 20631 26947 33263 39578 45894 52210 58526 64842 71157 77473 83789 90105 96421102736 109052 115368 121684 128000 Context Length0 11 22 33 44 55 66 77 88 100Depth Percent1.0Needle In A HayStack 12345678910 Accuracy Score from GPT3.5Figure 1: The accuracy score of Llama-3-8B-Instruct-80K-QLoRA on Needle-In-A-HayStack task. The blue vertical line indicates the training length, i.e. 80K. the same cluster to form each heterogeneous context. Therefore, the grouped texts share some semantic si

===========================
I. Molybog, Y . Nie, A. Poulton, J. Reizenstein, R. Rungta, K. Saladi, A. Schelten, R. Silva, E. M. Smith, R. Subramanian, X. E. Tan, B. Tang, R. Taylor, A. Williams, J. X. Kuan, P. Xu, Z. Yan, I. Zarov, Y . Zhang, A. Fan, M. Kambadur, S. Narang, A. Rodriguez, R. Stojnic, S. Edunov, and T. Scialom. Llama 2: Open foundation and fine-tuned chat models, 2023. [16] P. Zhang, Z. Liu, S. Xiao, N. Shao, Q. Ye, and Z. Dou. Soaring from 4k to 400k: Extending llm’s context with activation beacon, 2024. [1

===========================
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type='stuff',
                                 retriever=ensemble_retriever)
qa.invoke(QUERY1)
{'query': 'What is the extended context length of Llama-3-8B-Instruct?', 'result': 'The extended context length of Llama-3-8B-Instruct is 80K.'}
print(llm.invoke(QUERY1).content) ## Hallucinating
The extended context length of Llama-3-8B-Instruct is 4096 tokens.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type='stuff',
                                 retriever=ensemble_retriever)
qa.invoke(QUERY2)
{'query': 'What was the Zero-shot performance on MMLU?', 'result': 'The zero-shot performance on MMLU for the Llama-3-8B-Instruct model was as follows:\n- STEM: 53.87%\n- Social: 75.66%\n- Humanities: 69.44%\n- Others: 69.75%\n- Average: 65.91%\n\nFor the Llama-3-8B-Instruct-262K model, the performance was:\n- STEM: 52.10%\n- Social: 73.26%\n- Humanities: 67.15%\n- Others: 69.80%\n- Average: 64.34%\n\nAnd for the Llama-3-8B-Instruct-80K-QLoRA model, the performance was:\n- STEM: 53.10%\n- Social: 73.24%\n- Humanities: 67.32%\n- Others: 68.79%\n- Average: 64.44%'}
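# To check answers such as the MMLU numbers above against the retrieved text, the chain can also
# return its source chunks. A small sketch using the standard return_source_documents option of
# RetrievalQA (the variable names here are illustrative):
qa_with_sources = RetrievalQA.from_chain_type(llm=llm,
                                              chain_type='stuff',
                                              retriever=ensemble_retriever,
                                              return_source_documents=True)
output = qa_with_sources.invoke(QUERY2)
print(output['result'])
for src in output['source_documents']:
    print(src.metadata.get('page'), src.page_content[:100])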
# Weaviate v4 connection - commented out because LangChain's WeaviateHybridSearchRetriever still requires a v3 client.
# https://github.com/langchain-ai/langchain/issues/18809
# client = weaviate.connect_to_wcs(
# cluster_url=os.getenv('WEAVIATE_URL'), # WCS URL
# auth_credentials=weaviate.auth.AuthApiKey(os.getenv('WEAVIATE_API_KEY')), # WCS key
# headers={'X-OpenAI-Api-key': os.getenv('OPENAI_API_KEY')} # OpenAI API key
# )
# Get the Weaviate URL
WEAVIATE_URL = os.getenv('WEAVIATE_URL')
# Create an AuthApiKey object using the Weaviate API key
auth_client_secret = weaviate.AuthApiKey(api_key=os.getenv('WEAVIATE_API_KEY'))
# Create a Weaviate client object
client = weaviate.Client(
    url=WEAVIATE_URL,                                                       # Set the Weaviate URL
    additional_headers={'X-OpenAI-Api-Key': os.getenv('OPENAI_API_KEY')},   # Additional headers for the client
    auth_client_secret=auth_client_secret                                   # Authenticate using the API key
)
D:\Pratik Sharma\Python Individual Envs\Hybrid Search\hs\lib\site-packages\weaviate\__init__.py:128: DeprecationWarning: Dep010: Importing AuthApiKey from weaviate is deprecated. Please import it from its specific module: weaviate.auth
  _Warnings.root_module_import(name, map_[name])
D:\Pratik Sharma\Python Individual Envs\Hybrid Search\hs\lib\site-packages\weaviate\warnings.py:158: DeprecationWarning: Dep016: You are creating a Weaviate v3 client using `client = weaviate.Client(...)`, which is deprecated. Consider creating a v4 (`weaviate.WeaviateClient`) client, using a `weaviate.connect_to_<method>` helper function. See here for
  - migrating from v3 to v4: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
  - general v4 usage: https://weaviate.io/developers/weaviate/client-libraries/python
  warnings.warn(
# Weaviate hybrid search retriever
weaviate_retriever = WeaviateHybridSearchRetriever(
    alpha=0.5,               # Defaults to 0.5, i.e. equal weighting between keyword and semantic search
    client=client,           # The Weaviate v3 client created above
    index_name="LangChain",  # The name of the index (Weaviate class) to use
    text_key="text",         # The property that stores the raw text
    attributes=[],           # Additional attributes to return with the results
    k=2                      # Number of results to return
)
_ = weaviate_retriever.add_documents(docs)
# Get the retrieval for the query using Weaviate
response = weaviate_retriever.invoke(QUERY1)
for res in response:
    print(res.page_content[0:500], '\n\n===========================')
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
Extending Llama-3’s Context Ten-Fold Overnight Peitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2, Qiwei Ye1, Zhicheng Dou2 1Beijing Academy of Artificial Intelligence 2Gaoling School of Artificial Intelligence, Renmin University of China namespace.pt@gmail.com zhengliu1026@gmail.com Abstract We extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA fine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one 8xA800 (80G) GPU

===========================
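# alpha controls the keyword/vector balance of Weaviate's hybrid search: alpha=0 is pure keyword
# (BM25) search, alpha=1 is pure vector search, and 0.5 weights them equally. A small sketch
# comparing the two extremes against the same index; the parameters mirror the retriever above:
for alpha in (0.0, 1.0):
    retriever = WeaviateHybridSearchRetriever(
        alpha=alpha,
        client=client,
        index_name="LangChain",
        text_key="text",
        attributes=[],
        k=return_k
    )
    print(f'alpha={alpha}:', retriever.invoke(QUERY1)[0].page_content[:100], '\n')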
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type='stuff',
                                 retriever=weaviate_retriever)
qa.invoke(QUERY1)
{'query': 'What is the extended context length of Llama-3-8B-Instruct?', 'result': 'The extended context length of Llama-3-8B-Instruct is 80K.'}
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type='stuff',
                                 retriever=weaviate_retriever)
qa.invoke(QUERY2)
{'query': 'What was the Zero-shot performance on MMLU?', 'result': 'The zero-shot performance on MMLU for Llama-3-8B-Instruct was 65.91, for Llama-3-8B-Instruct-262K was 64.34, and for Llama-3-8B-Instruct-80K-QLoRA was 64.44.'}
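# Note: every add_documents call appends new objects to the "LangChain" class, which is why the
# hybrid retriever above can return the same page twice. An optional cleanup step using the v3
# client's schema API to drop the class between runs (this deletes all indexed data, so use with care):
# client.schema.delete_class('LangChain')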