#!/usr/bin/env python # coding: utf-8 # 我们将使用GPT-4的推理能力来推断元数据过滤器。根据您的使用情况,`"gpt-3.5-turbo"` 也可以使用。 # # # 从Weaviate向量数据库进行自动检索 # # 本指南演示了如何在[LlamaIndex](https://weaviate.io/)中使用Weaviate执行**自动检索**。 # # Weaviate向量数据库支持一组[元数据过滤器](https://weaviate.io/developers/weaviate/search/filters),除了用于语义搜索的查询字符串。给定自然语言查询,我们首先使用大型语言模型(LLM)推断一组元数据过滤器,以及要传递给向量数据库的正确查询字符串(也可以为空)。然后针对向量数据库执行整个查询包。 # # 这允许进行比top-k语义搜索更动态、更具表现力的检索形式。对于给定查询的相关上下文可能只需要在元数据标签上进行过滤,或者需要在过滤集合内进行过滤+语义搜索的联合组合,或者只需要原始语义搜索。 # # ## 运行一些示例数据 # # 我们尝试运行一些示例数据。请注意元数据过滤器是如何被推断出来的 - 这有助于更精确地检索! # # 如果您在Colab上打开此笔记本,您可能需要安装LlamaIndex 🦙。 # # In[ ]: get_ipython().run_line_magic('pip', 'install llama-index-vector-stores-weaviate') # In[ ]: get_ipython().system('pip install llama-index weaviate-client') # In[ ]: # 从Weaviate向量数据库进行自动检索 本指南演示了如何在[LlamaIndex](https://weaviate.io/)中使用Weaviate执行**自动检索**。 Weaviate向量数据库支持一组[元数据过滤器](https://weaviate.io/developers/weaviate/search/filters),除了用于语义搜索的查询字符串。给定自然语言查询,我们首先使用大型语言模型(LLM)推断一组元数据过滤器,以及要传递给向量数据库的正确查询字符串(也可以为空)。然后针对向量数据库执行整个查询包。 这允许进行比top-k语义搜索更动态、更具表现力的检索形式。对于给定查询的相关上下文可能只需要在元数据标签上进行过滤,或者需要在过滤集合内进行过滤+语义搜索的联合组合,或者只需要原始语义搜索。 # 我们将使用GPT-4的推理能力来推断元数据过滤器。根据您的使用情况,`"gpt-3.5-turbo"` 也可以使用。 # # In[ ]: # 设置OpenAIimport osimport getpassimport openaios.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")openai.api_key = os.environ["OPENAI_API_KEY"] # In[ ]: from llama_index.embeddings.openai import OpenAIEmbedding from llama_index.llms.openai import OpenAI from llama_index.core.settings import Settings Settings.llm = OpenAI(model="gpt-4") Settings.embed_model = OpenAIEmbedding() # 这个笔记本使用[嵌入模式](https://weaviate.io/developers/weaviate/installation/embedded)中的Weaviate,该模式支持Linux和macOS。 # # 如果你更喜欢尝试Weaviate的完全托管服务,[Weaviate Cloud Services (WCS)](https://weaviate.io/developers/weaviate/installation/weaviate-cloud-services),你可以取消注释中的代码。 # # In[ ]: import weaviatefrom weaviate.embedded import EmbeddedOptions# 连接到嵌入模式的Weaviate客户端client = weaviate.connect_to_embedded()# 如果要使用Weaviate云服务而不是嵌入模式,请启用此代码。"""import weaviate# 云端cluster_url = ""api_key = ""client = weaviate.connect_to_wcs(cluster_url=cluster_url, auth_credentials=weaviate.auth.AuthApiKey(api_key), )# 本地# client = weaviate.connect_to_local()""" # ## 定义一些示例数据 # # 我们向向量数据库中插入一些包含文本块的示例节点。请注意,每个 `TextNode` 不仅包含文本,还包含元数据,例如 `category` 和 `country`。这些元数据字段将被转换/存储在底层的向量数据库中。 # # In[ ]: from llama_index.core.schema import TextNode nodes = [ TextNode( text=( "Michael Jordan is a retired professional basketball player," " widely regarded as one of the greatest basketball players of all" " time." ), metadata={ "category": "Sports", "country": "United States", }, ), TextNode( text=( "Angelina Jolie is an American actress, filmmaker, and" " humanitarian. She has received numerous awards for her acting" " and is known for her philanthropic work." ), metadata={ "category": "Entertainment", "country": "United States", }, ), TextNode( text=( "Elon Musk is a business magnate, industrial designer, and" " engineer. He is the founder, CEO, and lead designer of SpaceX," " Tesla, Inc., Neuralink, and The Boring Company." ), metadata={ "category": "Business", "country": "United States", }, ), TextNode( text=( "Rihanna is a Barbadian singer, actress, and businesswoman. She" " has achieved significant success in the music industry and is" " known for her versatile musical style." ), metadata={ "category": "Music", "country": "Barbados", }, ), TextNode( text=( "Cristiano Ronaldo is a Portuguese professional footballer who is" " considered one of the greatest football players of all time. He" " has won numerous awards and set multiple records during his" " career." ), metadata={ "category": "Sports", "country": "Portugal", }, ), ] # ## 使用Weaviate向量存储构建向量索引 # # 在这里,我们将数据加载到向量存储中。如上所述,每个节点的文本和元数据都将转换为Weaviate中的相应表示。我们现在可以从Weaviate对这些数据运行语义查询,还可以进行元数据过滤。 # # In[ ]: from llama_index.core import VectorStoreIndex, StorageContext from llama_index.vector_stores.weaviate import WeaviateVectorStore vector_store = WeaviateVectorStore( weaviate_client=client, index_name="LlamaIndex_filter" ) storage_context = StorageContext.from_defaults(vector_store=vector_store) # In[ ]: index = VectorStoreIndex(nodes, storage_context=storage_context) # ## 定义 `VectorIndexAutoRetriever` # # 我们定义了核心的 `VectorIndexAutoRetriever` 模块。该模块接收 `VectorStoreInfo`,其中包含向量存储集合的结构化描述以及其支持的元数据过滤器。然后,这些信息将在自动检索提示中使用,LLM 将推断元数据过滤器。 # # In[ ]: from llama_index.core.retrievers import VectorIndexAutoRetriever from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo vector_store_info = VectorStoreInfo( content_info="brief biography of celebrities", metadata_info=[ MetadataInfo( name="category", type="str", description=( "Category of the celebrity, one of [Sports, Entertainment," " Business, Music]" ), ), MetadataInfo( name="country", type="str", description=( "Country of the celebrity, one of [United States, Barbados," " Portugal]" ), ), ], ) retriever = VectorIndexAutoRetriever( index, vector_store_info=vector_store_info ) # ## 运行一些示例数据 # # 我们尝试运行一些示例数据。请注意元数据过滤器是如何被推断出来的 - 这有助于更精确地检索! # # In[ ]: response = retriever.retrieve("Tell me about celebrities from United States") # In[ ]: print(response[0]) # In[ ]: response = retriever.retrieve( "Tell me about Sports celebrities from United States" ) # In[ ]: print(response[0])