#!/usr/bin/env python
# coding: utf-8

# 我们将使用GPT-4的推理能力来推断元数据过滤器。根据您的使用情况，`"gpt-3.5-turbo"` 也可以使用。
# 

# # 从Weaviate向量数据库进行自动检索
# 
# 本指南演示了如何在[LlamaIndex](https://weaviate.io/)中使用Weaviate执行**自动检索**。
# 
# Weaviate向量数据库支持一组[元数据过滤器](https://weaviate.io/developers/weaviate/search/filters)，除了用于语义搜索的查询字符串。给定自然语言查询，我们首先使用大型语言模型（LLM）推断一组元数据过滤器，以及要传递给向量数据库的正确查询字符串（也可以为空）。然后针对向量数据库执行整个查询包。
# 
# 这允许进行比top-k语义搜索更动态、更具表现力的检索形式。对于给定查询的相关上下文可能只需要在元数据标签上进行过滤，或者需要在过滤集合内进行过滤+语义搜索的联合组合，或者只需要原始语义搜索。
# 

# ## 运行一些示例数据
# 
# 我们尝试运行一些示例数据。请注意元数据过滤器是如何被推断出来的 - 这有助于更精确地检索！
# 

# 如果您在Colab上打开此笔记本，您可能需要安装LlamaIndex 🦙。
# 

# In[ ]:


get_ipython().run_line_magic('pip', 'install llama-index-vector-stores-weaviate')


# In[ ]:


get_ipython().system('pip install llama-index weaviate-client')


# In[ ]:


# 从Weaviate向量数据库进行自动检索

本指南演示了如何在[LlamaIndex](https://weaviate.io/)中使用Weaviate执行**自动检索**。

Weaviate向量数据库支持一组[元数据过滤器](https://weaviate.io/developers/weaviate/search/filters)，除了用于语义搜索的查询字符串。给定自然语言查询，我们首先使用大型语言模型（LLM）推断一组元数据过滤器，以及要传递给向量数据库的正确查询字符串（也可以为空）。然后针对向量数据库执行整个查询包。

这允许进行比top-k语义搜索更动态、更具表现力的检索形式。对于给定查询的相关上下文可能只需要在元数据标签上进行过滤，或者需要在过滤集合内进行过滤+语义搜索的联合组合，或者只需要原始语义搜索。


# 我们将使用GPT-4的推理能力来推断元数据过滤器。根据您的使用情况，`"gpt-3.5-turbo"` 也可以使用。
# 

# In[ ]:


# 设置OpenAIimport osimport getpassimport openaios.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")openai.api_key = os.environ["OPENAI_API_KEY"]


# In[ ]:


from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.settings import Settings

Settings.llm = OpenAI(model="gpt-4")
Settings.embed_model = OpenAIEmbedding()


# 这个笔记本使用[嵌入模式](https://weaviate.io/developers/weaviate/installation/embedded)中的Weaviate，该模式支持Linux和macOS。
# 
# 如果你更喜欢尝试Weaviate的完全托管服务，[Weaviate Cloud Services (WCS)](https://weaviate.io/developers/weaviate/installation/weaviate-cloud-services)，你可以取消注释中的代码。
# 

# In[ ]:


import weaviatefrom weaviate.embedded import EmbeddedOptions# 连接到嵌入模式的Weaviate客户端client = weaviate.connect_to_embedded()# 如果要使用Weaviate云服务而不是嵌入模式，请启用此代码。"""import weaviate# 云端cluster_url = ""api_key = ""client = weaviate.connect_to_wcs(cluster_url=cluster_url,    auth_credentials=weaviate.auth.AuthApiKey(api_key), )# 本地# client = weaviate.connect_to_local()"""


# ## 定义一些示例数据
# 
# 我们向向量数据库中插入一些包含文本块的示例节点。请注意，每个 `TextNode` 不仅包含文本，还包含元数据，例如 `category` 和 `country`。这些元数据字段将被转换/存储在底层的向量数据库中。
# 

# In[ ]:


from llama_index.core.schema import TextNode

nodes = [
    TextNode(
        text=(
            "Michael Jordan is a retired professional basketball player,"
            " widely regarded as one of the greatest basketball players of all"
            " time."
        ),
        metadata={
            "category": "Sports",
            "country": "United States",
        },
    ),
    TextNode(
        text=(
            "Angelina Jolie is an American actress, filmmaker, and"
            " humanitarian. She has received numerous awards for her acting"
            " and is known for her philanthropic work."
        ),
        metadata={
            "category": "Entertainment",
            "country": "United States",
        },
    ),
    TextNode(
        text=(
            "Elon Musk is a business magnate, industrial designer, and"
            " engineer. He is the founder, CEO, and lead designer of SpaceX,"
            " Tesla, Inc., Neuralink, and The Boring Company."
        ),
        metadata={
            "category": "Business",
            "country": "United States",
        },
    ),
    TextNode(
        text=(
            "Rihanna is a Barbadian singer, actress, and businesswoman. She"
            " has achieved significant success in the music industry and is"
            " known for her versatile musical style."
        ),
        metadata={
            "category": "Music",
            "country": "Barbados",
        },
    ),
    TextNode(
        text=(
            "Cristiano Ronaldo is a Portuguese professional footballer who is"
            " considered one of the greatest football players of all time. He"
            " has won numerous awards and set multiple records during his"
            " career."
        ),
        metadata={
            "category": "Sports",
            "country": "Portugal",
        },
    ),
]


# ## 使用Weaviate向量存储构建向量索引
# 
# 在这里，我们将数据加载到向量存储中。如上所述，每个节点的文本和元数据都将转换为Weaviate中的相应表示。我们现在可以从Weaviate对这些数据运行语义查询，还可以进行元数据过滤。
# 

# In[ ]:


from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name="LlamaIndex_filter"
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)


# In[ ]:


index = VectorStoreIndex(nodes, storage_context=storage_context)


# ## 定义 `VectorIndexAutoRetriever`
# 
# 我们定义了核心的 `VectorIndexAutoRetriever` 模块。该模块接收 `VectorStoreInfo`，其中包含向量存储集合的结构化描述以及其支持的元数据过滤器。然后，这些信息将在自动检索提示中使用，LLM 将推断元数据过滤器。
# 

# In[ ]:


from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo


vector_store_info = VectorStoreInfo(
    content_info="brief biography of celebrities",
    metadata_info=[
        MetadataInfo(
            name="category",
            type="str",
            description=(
                "Category of the celebrity, one of [Sports, Entertainment,"
                " Business, Music]"
            ),
        ),
        MetadataInfo(
            name="country",
            type="str",
            description=(
                "Country of the celebrity, one of [United States, Barbados,"
                " Portugal]"
            ),
        ),
    ],
)

retriever = VectorIndexAutoRetriever(
    index, vector_store_info=vector_store_info
)


# ## 运行一些示例数据
# 
# 我们尝试运行一些示例数据。请注意元数据过滤器是如何被推断出来的 - 这有助于更精确地检索！
# 

# In[ ]:


response = retriever.retrieve("Tell me about celebrities from United States")


# In[ ]:


print(response[0])


# In[ ]:


response = retriever.retrieve(
    "Tell me about Sports celebrities from United States"
)


# In[ ]:


print(response[0])