#!/usr/bin/env python
# coding: utf-8

# # Import Packages

# In[1]:


# GLOBAL
import os
import pandas as pd
import numpy as np
import tiktoken
from uuid import uuid4
# from tqdm import tqdm
from dotenv import load_dotenv
from tqdm.autonotebook import tqdm

# LANGCHAIN
import langchain
from langchain.llms import OpenAI
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import PromptTemplate

# VECTOR STORE
import pinecone
from pinecone import Pinecone, ServerlessSpec

# AGENTS
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.agents import AgentExecutor, Tool, AgentType
from langchain.agents.react.agent import create_react_agent
from langchain import hub


# In[2]:


# Load environment variables from a .env file
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


# # Load Documents

# There are several document loaders in the LangChain library, depending on the type of file to be used. The most common ones cover CSV, HTML, JSON, Markdown, file directories and Microsoft Office formats. The complete list can be found [here](https://python.langchain.com/docs/modules/data_connection/document_loaders/office_file/).
# 
# There is also a more extensive [list](https://python.langchain.com/docs/integrations/document_loaders/google_drive/) of integrations that load directly from Google Cloud, Notion, YouTube and many other services.
# 
# We will be using a CSV file, so we will use the CSVLoader. Below you can find the code to load the file. As arguments we are using:
# 
# - **file path**
# - **source column**: indicates the column in the CSV file that contains the primary data of interest, the transcript
# - **metadata columns**: list of column names that contain additional information (metadata) about each entry in the transcript
# 
# Loading the data in this way will benefit our RAG pipeline. The benefits of metadata are listed further below.

# In[3]:


# Load Documents
loader = CSVLoader(
    file_path="./tedx.csv",
    encoding='utf-8',
    source_column="transcript",
    metadata_columns=["main_speaker", "name", "speaker_occupation", "title", "url", "description"]
)

data = loader.load()
len(data)


# In[4]:


# data[0]
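# Each entry returned by the loader is a LangChain `Document`, with the row content in `page_content` and the selected columns attached as `metadata`. A minimal sketch to inspect one entry (assuming `data` was loaded as above):

# Inspect one loaded Document (sketch)
sample_doc = data[0]

print(sample_doc.page_content[:200])      # beginning of the page content
print(list(sample_doc.metadata.keys()))   # metadata columns attached by the loader
print(sample_doc.metadata["main_speaker"])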
# The **CSVLoader** allows us to load a CSV file, and some of its arguments can further enhance the pipeline, mainly the metadata.
# 
# **Benefits of Metadata:**
# 
# - *Enhanced Retrieval*: Metadata provides additional context about the primary data, which significantly improves the accuracy and relevance of the information retrieved. This is particularly important in RAG, where the goal is to retrieve data that is relevant to a user's query and provide it as context for the LLM. By incorporating metadata, the system can better understand the nuances of the query and retrieve data that is more likely to be relevant to the user's needs. For example, knowing the `main_speaker`, `title`, and `speaker_occupation` can offer insights into the content of the `transcript`.
# 
# - *Improved Response Generation*: Adding metadata to the chunks processed by the LLM allows for a more nuanced and context-aware retrieval process. This leads to higher-quality response generation, as the LLM can draw on a richer understanding of the context in which the data is retrieved.
# 
# - *Reduced Hallucinations*: By grounding the LLM's output in relevant, external knowledge, RAG mitigates the risk of responding with incorrect or fabricated information (`hallucinations`). The use of metadata helps ensure that the responses generated by the LLM are based on accurate and relevant information, rather than relying solely on static training data.
# 
# - *Domain-Specific Responses*: Metadata can be used to provide relevant responses tailored to an organization's proprietary or domain-specific data. This is particularly important in applications where the LLM needs to respond with knowledge specific to a particular domain or industry.
# 
# - *Efficiency and Cost-Effectiveness*: RAG is a simple and cost-effective approach to customizing LLMs with domain-specific data. By leveraging metadata, organizations can deploy RAG without needing to fine-tune the model, which is especially beneficial when models need to be updated frequently with new data.

# # Indexing

# The **Vector Store Index** is a tool that embeds your documents into vector representations. When you want to search through these embeddings, your query is also converted into a vector embedding. The Vector Store Index then ranks all the document embeddings based on how semantically similar they are to your query embedding.
# 
# The key steps are:
# - Embedding your documents into vectors
# - Turning your search query into a vector
# - Comparing the query vector to all the document vectors
# - Ranking the document vectors by their similarity to the query vector
# - Returning the most relevant documents based on this ranking
# 
# This allows you to search your document collection in a semantic, meaning-based way, rather than just looking for exact keyword matches.
# 
# To understand the process of vector search, we will look at the concepts of tokenization, similarity, and embedding, which are implemented by embedding models.

# ## Tokenizer
# 
# The term **token** refers to the fundamental units of **semantic meaning** obtained by breaking down a sentence or piece of text. These tokens are then encoded into vector representations, numerical representations that can be processed by large language models (LLMs). Tokens can be words or special characters such as punctuation, but they can also be `sub-words`.
# 
# Below is an example using the tiktoken library, which implements the BPE (Byte Pair Encoding) algorithm to convert text into tokens. This is the tokenizer used by models like gpt-3.5 and gpt-4. You can find a good explanation of the BPE algorithm in this link from [Hugging Face](https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt).
# 
# [Image: tiktoken tokenization example]
# 
# **Source:** https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken

# The most common and efficient encoding is `cl100k_base`, with a vocabulary of roughly 100k tokens.
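# Rather than hard-coding the encoding name, tiktoken can also look it up per model. A quick sketch (the model name is just an example):

# Resolve the encoding used by a given chat model
model_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
print(model_encoding.name)  # cl100k_base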
# In[5]:


# Tokenization

# Return the encoding and the list of token IDs for a given string
def num_tokens_from_string(question, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = encoding.encode(question)  # list of token IDs
    return encoding, num_tokens

question = "How many TEDx talks are on the list?"

encoding, num_tokens = num_tokens_from_string(question, "cl100k_base")

print(f'Number of Words: {len(question.split())}')
print(f'Number of Characters: {len(question)}')
print(f'List of Tokens: {num_tokens}')
print(f'Nr of Tokens: {len(num_tokens)}')


# In[6]:


# Decoding tokenizer
encoding.decode([4438, 1690, 84296, 87, 13739, 527, 389, 279, 1160, 30])


# [Image: tokenization of the question in the cl100k_base tokenizer]
# 
# **Source:** https://tiktokenizer.vercel.app/?model=cl100k_base
# 
# According to OpenAI, as a rule of thumb, one token corresponds to about 4 characters of common English text. This means that 100 tokens correspond to roughly 75 words.

# ## Embedding

# Embeddings are a way to represent high-dimensional, sparse data like words in a more compact, lower-dimensional form while preserving the meaningful similarities between the original data points. The key ideas are:
# 
# - **Capturing Similarities:** Similar items, like synonymous words, will have embedding vectors that are close to each other.
# 
# - **Spatial Representation:** The embedding vectors are positioned in a multi-dimensional space such that the distance between them (e.g. cosine similarity) reflects how related the original data points are.
# 
# [Image: illustration of embeddings in vector space]
# 
# **Source**: https://openai.com/index/new-embedding-models-and-api-updates

# The most common metric used for similarity search is **cosine similarity**. It is used in scenarios like semantic search and document classification because it compares the direction of vectors, effectively assessing the overall content of documents. By comparing the vector representations of the query and the documents, cosine similarity can identify the most similar and relevant documents to return in the search results.
# 
# [Image: comparison of vector similarity metrics]
# 
# **Source:** https://www.pinecone.io/learn/vector-similarity/
# 
# Cosine similarity is a measure of the similarity between two non-zero vectors. It is the cosine of the angle between the two vectors, which results in a value between 1 (identical) and -1 (opposite).
# 
# [Image: cosine similarity between two vectors]
# 
# **Source:** https://medium.com/kx-systems/how-vector-databases-search-by-similarity-a-comprehensive-primer-c4b80d13ce63

# Let's explore this using `OpenAIEmbeddings`. OpenAI uses `text-embedding-ada-002` (Ada v2) as the default model, featuring 1536 dimensions. Essentially, this means that queries and documents will be transformed into vectors with 1536 dimensions.
# 
# [Image: OpenAI embedding models and their dimensions]
# 
# **Source:** https://openai.com/index/new-embedding-models-and-api-updates

# In[7]:


# Define cosine similarity function
def cosine_similarity(query_emb, document_emb):
    # Calculate the dot product of the query and document embeddings
    dot_product = np.dot(query_emb, document_emb)

    # Calculate the L2 norms (magnitudes) of the query and document embeddings
    query_norm = np.linalg.norm(query_emb)
    document_norm = np.linalg.norm(document_emb)

    # Calculate the cosine similarity
    cosine_sim = dot_product / (query_norm * document_norm)

    return cosine_sim
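# Before applying the function to real embeddings, a quick sanity check with toy vectors (values chosen only for illustration):

# Parallel vectors give 1.0, orthogonal vectors give 0.0
print(cosine_similarity(np.array([1.0, 0.0]), np.array([2.0, 0.0])))  # 1.0
print(cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 3.0])))  # 0.0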
document = "Averting the climate crisis" embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY) query_emb = embedding.embed_query(question) document_emb = embedding.embed_query(document) cosine_sim = cosine_similarity(query_emb, document_emb) # print(f'Query Vector: {query_emb}') # print(f'Document Vector: {document_emb}') print(f'Query Dimensions: {len(query_emb)}') print(f'Document Dimensions: {len(document_emb)}') print("Cosine Similarity:", cosine_sim) # In[9]: # Using text-embedding-3-large model question = "What is the topic of the TEDx talk from Al Gore?" document = "Averting the climate crisis" embedding = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=OPENAI_API_KEY) query_emb = embedding.embed_query(question) document_emb = embedding.embed_query(document) cosine_sim = cosine_similarity(query_emb, document_emb) # print(f'Query Vector: {query_emb}') # print(f'Document Vector: {document_emb}') print(f'Query Dimensions: {len(query_emb)}') print(f'Document Dimensions: {len(document_emb)}') print("Cosine Similarity:", cosine_sim) # We see that each model performs differently as each one has been trained with different data and depending in the use case, the results will differ. Now it is time to implement our dataset and create our index to see the performance for our specific data. # ## Text Splitters # Unfortunatelly, LLM models have some limitations when it comes to the point of processig text. One of those is the **context window**. The context window represents the maximum amount of text/tokens that a model can process at one time as an input to generate a response. Therefore we need to split our documents into smaller chunks that can fit into the model's context window. A complete list of OpenAI models can be found [here](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4). It spans from 4'096 tokens for the `gpt-3.5-turbo-instruct` to the `gpt-4-turbo` with 128'000 tokens. # # # Like the data loaders, LangChain offers several text splitters. In the table below you can see the main splitting methods and when to use which one. The `Adds Metadata` does not mean that it will add (or not) the metadata from the previous loader. For example for HTML has a `HTMLHeaderTextSplitter` and it means it will splits text at the element level and adds metadata for each chunk based on header text. # # In our case we already have the metadata available and we do not need to add them using and splitter. # Screenshot 2024-04-30 132645 # # The `RecursiveCharacterTextSplitter` is the recommended tool for splitting general text. It segments the text based on a defined chunk size, using a list of characters as separators. # # According to LangChain, the default separators include ["\n\n", "\n", " ", ""]. This means it aims to keep paragraphs together first, followed by sentences and words, as they typically exhibit the strongest semantic connections in text. # # To leverage this feature, we can utilize the `RecursiveCharacterTextSplitter` along with the tiktoken library to ensure that splits do not exceed the maximum token chunk size allowed by the language model. Each split will be recursively divided if its size exceeds the limit. 
# The final design of our text splitter will be as follows:
# 
# - Model: `gpt-3.5-turbo-0125`, with a context window of 16,385 tokens
# 
# - Chunk Size: number of tokens in one chunk
# 
# - Chunk Overlap: number of tokens that overlap between two consecutive chunks
# 
# - Separators: the order of separators

# In[10]:


# splits[0].metadata


# In[11]:


# # The resulting splits keep the metadata attached
# for split in splits:
#     print(split.metadata)


# In[12]:


# Splitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-3.5-turbo-0125",
    chunk_size=512,
    chunk_overlap=20,
    separators=["\n\n", "\n", " ", ""])


# In[13]:


# # Make splits
# splits = text_splitter.split_documents(data[:5])

# # Print the number of resulting text chunks
# print(f'Number of Chunks: {len(splits)}')

# # Print the number of original pages/documents
# print(f'Number of Pages: {len(data)}')


# ## Vector Store

# A vector store is a specialized database designed to store and manage high-dimensional vector data. It stores data in the form of vector embeddings, which can be retrieved for the LLM so that it can understand the context and meaning of the data and give better responses.

# ### Indexing

# Pinecone is a serverless vector store that offers very good performance for fast vector search and retrieval.
# 
# The first step when using Pinecone is to create an index where our embeddings will be stored. There are several parameters to consider:
# 
# - Index name
# - Dimension: must equal the dimensions of the embedding model
# - Metric: should match the metric used to train the embedding model for best results
# - Serverless specifications

# In[14]:


# Pinecone Initialization
index_name = "langchain-pinecone-test"
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)


# In[15]:


# Create Index
pc.create_index(
    name=index_name,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"))

index = pc.Index(index_name)


# In[16]:


# # Delete Index
# pc.delete_index(index_name)


# In[17]:


# List Indexes
pc.list_indexes()


# In[18]:


# Describe Index
index = pc.Index(index_name)
index.describe_index_stats()


# ### Namespaces

# Pinecone allows you to partition the data into namespaces within an index. This lets you send queries to a specific namespace. You could, for example, split your data by content, language or any other criterion suitable for your use case.
# 
# For this example we will first upload 100 records of our data to one namespace and then create two splits of 50 records each. In total we will have three namespaces.

# In[19]:


# Create Main Namespace
splits = text_splitter.split_documents(data[:100])
embed = OpenAIEmbeddings(model="text-embedding-ada-002")
db = PineconeVectorStore.from_documents(documents=splits,
                                        embedding=embed,
                                        index_name=index_name,
                                        namespace="main"
                                        )


# In[20]:


# Create Vectorstore of Main index
vectorstore = PineconeVectorStore(index_name=index_name,
                                  namespace="main",
                                  embedding=embed)


# In[21]:


# Search for similarity
query = "Who is Al Gore"
similarity = vectorstore.similarity_search(query, k=4)

for i in range(len(similarity)):
    print(f"-------Result Nr. {i}-------")
    print(f"Main Speaker: {similarity[i].metadata['main_speaker']}")
    print(" ")
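# Because the loader attached metadata to every chunk, a similarity search can also be narrowed with a metadata filter. A short sketch, assuming the "main" namespace has been populated as above and using Pinecone's equality filter syntax:

# Restrict the search to chunks whose metadata matches a given speaker
filtered = vectorstore.similarity_search("climate change",
                                         k=2,
                                         filter={"main_speaker": "Al Gore"})

for doc in filtered:
    print(doc.metadata['title'], "-", doc.metadata['main_speaker'])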
{i}-------") print(f"Title: {similarity_with_score[i][0].metadata['title']}") print(f"Main Speaker: {similarity_with_score[i][0].metadata['main_speaker']}") print(f"Score: {similarity_with_score[i][1]}") print(f" ") # Now we will create 2 more namespaces each with 50 records. For that we will use the `upsert` function and the `metadata` to add data into our index but in a separate namespace. # First we will create the chunks # In[23]: # Create Chunked Metadata def chunked_metadata_embeddings(documents, embed): chunked_metadata = [] chunked_text = text_splitter.split_documents(documents) for index, text in enumerate(tqdm(chunked_text)): payload = { "metadata": { "source": text.metadata['source'], "row": text.metadata['row'], "chunk_num": index, "main_speaker": text.metadata['main_speaker'], "name": text.metadata['name'], "speaker_occupation": text.metadata['speaker_occupation'], "title": text.metadata['title'], "url": text.metadata['url'], "description": text.metadata['description'], }, "id": str(uuid4()), "values": embed.embed_documents([text.page_content])[0] # Assuming `embed` is defined elsewhere } chunked_metadata.append(payload) return chunked_metadata # In[24]: # Create first split split_one = chunked_metadata_embeddings(data[:50], embed) len(split_one) # In[25]: # Create second split split_two = chunked_metadata_embeddings(data[50:100], embed) len(split_two) # In[26]: # Upsert the document def batch_upsert(split, index , namespace, batch_size): print(f"Split Length: {len(split)}") for i in range(0, len(split), batch_size): batch = split[i:i + batch_size] index.upsert(vectors=batch, namespace=namespace) # In[27]: batch_upsert(split_one, index, "first_split", 10) # The following function will allow to find an specific chunk base on the main speaker. It returns the title and the chunk ID, which can be used to find it in the pinecone cloud. # In[28]: # Function to find item with main_speaker def find_item_with_row(metadata_list, main_speaker): for item in metadata_list: if item['metadata']['main_speaker'] == main_speaker: return item # Call the function to find item with main_speaker = Al Gore result_item = find_item_with_row(split_one, "Al Gore") # Print the result print(f'Chunk Nr: {result_item["metadata"]["chunk_num"]}') print(f'Chunk ID: {result_item["id"]}') print(f'Chunk Title: {result_item["metadata"]["title"]}') # # Screenshot 2024-05-10 140026 # In[43]: # index.delete(namespace="last_split", delete_all=True) # Now we can see that our index has two namespaces with the below function # In[44]: index.describe_index_stats() # We can now create the namespace of the second split and check that everything has been properly created. # In[45]: batch_upsert(split_two, index, "last_split", 20) # In[46]: index.describe_index_stats() # Screenshot 2024-05-10 135858 # # Now we will test our namespaces by creating two users, each of one will send the query to a different namespace. # In[47]: # Define Users query_one = "Who is Al Gore?" query_two = "Who is Rick Warren?" 
# Now we will test our namespaces by creating two users, each of whom will send a query to a different namespace.

# In[47]:


# Define Users
query_one = "Who is Al Gore?"
query_two = "Who is Rick Warren?"

# Users dictionary
users = [{
    'name': 'John',
    'namespace': 'first_split',
    'query': query_one
    },
    {
    "name": "Jane",
    "namespace": 'last_split',
    "query": query_two
    }]

def vectorize_query(embed, query):
    return embed.embed_query(query)


# In[48]:


# Create the vectors for each of our queries
query_vector_one = vectorize_query(embed, query_one)
query_vector_two = vectorize_query(embed, query_two)


# In[49]:


len(query_vector_one), len(query_vector_two)


# In[50]:


# Define a list of new key-value pairs
new_key_value_pairs = [
    {'vector_query': query_vector_one},
    {'vector_query': query_vector_two},
]

# Loop through the list of users and the list of new key-value pairs
for user, new_pair in zip(users, new_key_value_pairs):
    user.update(new_pair)


# In[51]:


users[0]["name"], users[1]["name"]


# In[52]:


users[0].keys()


# In[53]:


print(f"Name: {users[0]['name']}")
print(f"Namespace: {users[0]['namespace']}")
print(f"Query: {users[0]['query']}")
print(f"Vector Query: {users[0]['vector_query'][:3]}")


# If we send the query to its namespace, we will get the `top_k` vectors related to that query.

# In[54]:


# Query the namespace
john = [t for t in users if t.get('name') == 'John'][0]
john_query_vector = john['vector_query']
john_namespace = john['namespace']

index.query(vector=john_query_vector, top_k=2, include_metadata=True, namespace=john_namespace)


# # RAG

# Now that we have set up our namespaces, we can prepare our RAG pipeline. We will do so using agents.

# ## Retrieval

# In[55]:


# Create vectorstore
embed = OpenAIEmbeddings(model="text-embedding-ada-002")
vectorstore = PineconeVectorStore(index_name=index_name,
                                  namespace="main",
                                  embedding=embed)


# In this retrieval step you can choose between OpenAI and Groq. For Groq, create a `GROQ_API_KEY`, which allows you to use models like Llama or Mistral for free. We will also add some memory, which keeps track of the conversation for the QA chain.

# In[56]:


# Retrieval

# # Chat completion llm with Groq
# GROQ_API_KEY = os.getenv('GROQ_API_KEY')
# llm = ChatGroq(groq_api_key=GROQ_API_KEY,
#                model_name="llama3-8b-8192",
#                temperature=0.0,
#                max_tokens=512)

llm = ChatOpenAI(temperature=0.0,
                 model="gpt-3.5-turbo",
                 max_tokens=512)

# Conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True)

# Retrieval qa chain
qa_db = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever())
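# Before wiring the chain into an agent, the retrieval chain can be tested on its own. A quick sketch against the "main" namespace:

# Ask the RetrievalQA chain a question directly
print(qa_db.run("Who is Al Gore?"))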
# ## Augmented

# We are going to use a slightly modified prompt template. First we download the ReAct template, a common template for agents that use tools, and then we add an instruction about which tool to look in first. A collection of templates can be found in the [LangChain hub](https://smith.langchain.com/hub).

# In[57]:


prompt = hub.pull("hwchase17/react")
print(prompt.template)


# Now we will replace this line:
# 
# `Action: the action to take, should be one of [{tool_names}]`
# 
# with this line:
# 
# `Action: the action to take, should be one of [{tool_names}]. Always look first in Pinecone Document Store`

# In[58]:


# Set prompt template
template = '''
Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]. Always look first in Pinecone Document Store
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat 2 times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}
'''

prompt = PromptTemplate.from_template(template)


# ## Generation with Agent

# We are going to set up two tools for our agent:
# 
# - Tavily Search API: Tavily searches several sources, such as Bing or Google, and returns the most relevant content. It offers 1,000 free API calls per month.
# 
# - Vectorstore: our vector store, which will be used to look for the information first.

# In[65]:


# Set up tools and agent
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
tavily = TavilySearchResults(max_results=10, tavily_api_key=TAVILY_API_KEY)

tools = [
    Tool(
        name="Pinecone Document Store",
        func=qa_db.run,
        description="Use it to look up information from the Pinecone Document Store"
    ),

    Tool(
        name="Tavily",
        func=tavily.run,
        description="Use this to look up information from Tavily",
    )
]

agent = create_react_agent(llm,
                           tools,
                           prompt)

agent_executor = AgentExecutor(tools=tools,
                               agent=agent,
                               handle_parsing_errors=True,
                               verbose=True,
                               memory=conversational_memory)


# Once everything is set up, we can start making queries and check how the agent behaves in terms of tool prioritization, search quality and answers.

# In[68]:


agent_executor.invoke({"input": "Can you give me one title of a TED talk of Al Gore as main speaker? "
                                "Please look in the pinecone document store metadata as it has the title "
                                "based on the transcripts"})


# In[69]:


agent_executor.invoke({"input": "Did you find the previous title 'The Case for Optimism on Climate Change' in the Pinecone Document Store?"})


# In[70]:


agent_executor.invoke({"input": "Can you look for a title within the Pinecone Document Store?"})


# In[71]:


agent_executor.invoke({"input": "Is Dan Gilbert a main speaker of TEDx talks? If yes, give me the source of your answer"})


# In[72]:


agent_executor.invoke({"input": "What is the main topic of Dan Gilbert TEDx talks?"})


# In[73]:


conversational_memory.load_memory_variables({})


# In[74]:


agent_executor.memory.clear()


# In[ ]: