# 安装所需的软件包
!python3 -m pip install --upgrade langchain 'deeplake[enterprise]' openai tiktoken
import getpass
import os
# 导入所需的模块
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import DeepLake
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_text_splitters import (
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
)
# Collect the OpenAI API key interactively and expose it through the
# environment variable that langchain_openai reads.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# Activeloop credentials: the token authenticates API calls, the org name
# determines where the Deep Lake dataset will live.
activeloop_token = getpass.getpass("Activeloop Token:")
os.environ["ACTIVELOOP_TOKEN"] = activeloop_token
os.environ["ACTIVELOOP_ORG"] = getpass.getpass("Activeloop Org:")
org_id = os.environ["ACTIVELOOP_ORG"]

# Embedding model used to vectorize documents for the store.
embeddings = OpenAIEmbeddings()

# Deep Lake dataset path under the user's Activeloop organization.
dataset_path = f"hub://{org_id}/data"
以上代码主要是导入所需的模块,并获取用户输入的 OpenAI API Key、Activeloop Token 和 Activeloop Org。然后创建了一个 OpenAIEmbeddings 实例,并设置了数据集路径。
你可以使用ChatGPT生成一个样本群聊对话,使用以下提示:
生成一个包含三个朋友谈论他们一天的群聊对话,引用真实地点和虚构的名字。让对话有趣并尽可能详细。
我已经在 messages.txt 中生成了这样的对话,我们可以直接使用它作为示例。
我们加载文本文件中的消息,对其进行分块并上传到ActiveLoop向量存储中。
# 打开名为 "messages.txt" 的文件
with open("messages.txt") as f:
# 读取文件内容并存储在 state_of_the_union 变量中
state_of_the_union = f.read()
# 使用 CharacterTextSplitter 类将文本分割成长度为 1000 的片段,不重叠
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# 将 state_of_the_union 分割成多个片段,并存储在 pages 变量中
pages = text_splitter.split_text(state_of_the_union)
# 使用 RecursiveCharacterTextSplitter 类将文本分割成长度为 1000 的片段,重叠为 100
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# 根据分割后的 pages 创建文档,并存储在 texts 变量中
texts = text_splitter.create_documents(pages)
# 打印 texts 变量的内容
print(texts)
# 设置数据集路径为 "hub://<org_id>/data"
dataset_path = "hub://" + org_id + "/data"
# 创建 OpenAIEmbeddings 实例并存储在 embeddings 变量中
embeddings = OpenAIEmbeddings()
# 使用 DeepLake 类从文档中提取信息,并存储在 db 变量中
db = DeepLake.from_documents(
texts, embeddings, dataset_path=dataset_path, overwrite=True
)
[Document(page_content='Participants:\n\nJerry: Loves movies and is a bit of a klutz.\nSamantha: Enthusiastic about food and always trying new restaurants.\nBarry: A nature lover, but always manages to get lost.\nJerry: Hey, guys! You won\'t believe what happened to me at the Times Square AMC theater. I tripped over my own feet and spilled popcorn everywhere! 🍿💥\n\nSamantha: LOL, that\'s so you, Jerry! Was the floor buttery enough for you to ice skate on after that? 😂\n\nBarry: Sounds like a regular Tuesday for you, Jerry. Meanwhile, I tried to find that new hiking trail in Central Park. You know, the one that\'s supposed to be impossible to get lost on? Well, guess what...\n\nJerry: You found a hidden treasure?\n\nBarry: No, I got lost. AGAIN. 🧭🙄\n\nSamantha: Barry, you\'d get lost in your own backyard! But speaking of treasures, I found this new sushi place in Little Tokyo. "Samantha\'s Sushi Symphony" it\'s called. Coincidence? I think not!\n\nJerry: Maybe they named it after your ability to eat your body weight in sushi. 🍣', metadata={}), Document(page_content='Barry: How do you even FIND all these places, Samantha?\n\nSamantha: Simple, I don\'t rely on Barry\'s navigation skills. 😉 But seriously, the wasabi there was hotter than Jerry\'s love for Marvel movies!\n\nJerry: Hey, nothing wrong with a little superhero action. By the way, did you guys see the new "Captain Crunch: Breakfast Avenger" trailer?\n\nSamantha: Captain Crunch? Are you sure you didn\'t get that from one of your Saturday morning cereal binges?\n\nBarry: Yeah, and did he defeat his arch-enemy, General Mills? 😆\n\nJerry: Ha-ha, very funny. Anyway, that sushi place sounds awesome, Samantha. Next time, let\'s go together, and maybe Barry can guide us... if we want a city-wide tour first.\n\nBarry: As long as we\'re not hiking, I\'ll get us there... eventually. 😅\n\nSamantha: It\'s a date! But Jerry, you\'re banned from carrying any food items.\n\nJerry: Deal! Just promise me no wasabi challenges. 
I don\'t want to end up like the time I tried Sriracha ice cream.', metadata={}), Document(page_content="Barry: Wait, what happened with Sriracha ice cream?\n\nJerry: Let's just say it was a hot situation. Literally. 🔥\n\nSamantha: 🤣 I still have the video!\n\nJerry: Samantha, if you value our friendship, that video will never see the light of day.\n\nSamantha: No promises, Jerry. No promises. 🤐😈\n\nBarry: I foresee a fun weekend ahead! 🎉", metadata={})] Your Deep Lake dataset has been successfully created!
\
Dataset(path='hub://adilkhan/data', tensors=['embedding', 'id', 'metadata', 'text']) tensor htype shape dtype compression ------- ------- ------- ------- ------- embedding embedding (3, 1536) float32 None id text (3, 1) str None metadata json (3, 1) str None text text (3, 1) str None
可选项
: 您还可以使用Deep Lake的托管张量数据库作为托管服务,并在那里运行查询。为了这样做,需要在创建向量存储时将运行时参数指定为 {'tensor_db': True}。此配置使得可以在托管张量数据库上执行查询,而不是在客户端上执行。需要注意的是,此功能不适用于本地或内存中存储的数据集。如果已经在托管张量数据库之外创建了向量存储,则可以按照规定的步骤将其转移到托管张量数据库中。
# 打开名为"messages.txt"的文件
with open("messages.txt") as f:
# 读取文件内容并存储在state_of_the_union变量中
state_of_the_union = f.read()
# 创建一个CharacterTextSplitter对象,设置每个块的大小为1000,块之间不重叠
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# 使用text_splitter将state_of_the_union文本分割成多个页面
pages = text_splitter.split_text(state_of_the_union)
# 创建一个RecursiveCharacterTextSplitter对象,设置每个块的大小为1000,块之间重叠100个字符
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# 使用text_splitter创建文档集合
texts = text_splitter.create_documents(pages)
# 打印文档集合
print(texts)
# 设置数据集路径为"hub://" + org + "/data"
dataset_path = "hub://" + org + "/data"
# 创建一个OpenAIEmbeddings对象
embeddings = OpenAIEmbeddings()
# 使用DeepLake从文档集合中构建数据库,设置数据集路径为dataset_path,覆盖已存在的数据集,设置运行时参数为{"tensor_db": True}
db = DeepLake.from_documents(
texts, embeddings, dataset_path=dataset_path, overwrite=True, runtime={"tensor_db": True}
)
现在我们可以提出一个问题,并通过语义搜索获得答案:
# Reopen the dataset read-only and wrap it as a retriever for semantic search.
db = DeepLake(dataset_path=dataset_path, read_only=True, embedding=embeddings)
retriever = db.as_retriever()
retriever.search_kwargs["distance_metric"] = "cos"  # rank by cosine similarity
retriever.search_kwargs["k"] = 4  # return the top-4 matching chunks

# Build a RetrievalQA chain: the "stuff" chain type stuffs all retrieved
# chunks into a single prompt for the LLM; source documents are not returned.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(), chain_type="stuff", retriever=retriever, return_source_documents=False
)

# Ask a question and answer it via semantic search over the stored chat.
query = input("Enter query:")
# FIX: call the chain with .invoke(...) — invoking a chain directly via
# __call__ (qa({...})) is deprecated in modern LangChain; .invoke takes and
# returns the same dict shape.
ans = qa.invoke({"query": query})
print(ans)