#!/usr/bin/env python
# coding: utf-8

# # Weaviate Vector Store

# If you're opening this notebook on Colab, you will probably need to install LlamaIndex 🦙.

# In[ ]:

get_ipython().run_line_magic('pip', 'install llama-index-vector-stores-weaviate')

# In[ ]:

get_ipython().system('pip install llama-index')

# #### Creating a Weaviate Client

# In[ ]:

import os
import openai

os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.environ["OPENAI_API_KEY"]

# In[ ]:

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# In[ ]:

import weaviate

# In[ ]:

# Cloud
cluster_url = ""
api_key = ""

client = weaviate.connect_to_wcs(
    cluster_url=cluster_url,
    auth_credentials=weaviate.auth.AuthApiKey(api_key),
)

# Local
# client = weaviate.connect_to_local()

# #### Load documents, build the VectorStoreIndex

# In[ ]:

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from IPython.display import Markdown, display

# Download Data

# In[ ]:

get_ipython().system("mkdir -p 'data/paul_graham/'")
get_ipython().system("wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'")

# In[ ]:

# load documents
documents = SimpleDirectoryReader("./data/paul_graham").load_data()

# In[ ]:

from llama_index.core import StorageContext

# If you want to load the index later, be sure to give it a name!
vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name="LlamaIndex"
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

# NOTE: you may also choose to define an index name manually.
# index_name = "test_prefix"
# vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name)

# #### Query Index

# In[ ]:

# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

# In[ ]:

display(Markdown(f"{response}"))

# ## Loading the index
#
# Here, we use the same index name as when the index was first created. This stops it from being auto-generated and allows us to easily reconnect to it.

# In[ ]:

cluster_url = ""
api_key = ""

client = weaviate.connect_to_wcs(
    cluster_url=cluster_url,
    auth_credentials=weaviate.auth.AuthApiKey(api_key),
)

# Local
# client = weaviate.connect_to_local()

# In[ ]:

vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name="LlamaIndex"
)

loaded_index = VectorStoreIndex.from_vector_store(vector_store)

# In[ ]:

# set Logging to DEBUG for more detailed outputs
query_engine = loaded_index.as_query_engine()
response = query_engine.query("What happened at interleaf?")
display(Markdown(f"{response}"))

# ## Metadata Filtering
#
# Let's insert a dummy document, and try to filter so that only that document is returned.

# In[ ]:

from llama_index.core import Document

doc = Document.example()
print(doc.metadata)
print("-----")
print(doc.text[:100])

# In[ ]:

loaded_index.insert(doc)

# In[ ]:

from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

filters = MetadataFilters(
    filters=[ExactMatchFilter(key="filename", value="README.md")]
)
query_engine = loaded_index.as_query_engine(filters=filters)
response = query_engine.query("What is the name of the file?")
display(Markdown(f"{response}"))

# # Deleting the index completely
#
# You can delete the index created by the vector store using the `delete_index` function.

# In[ ]:

vector_store.delete_index()

# In[ ]:

vector_store.delete_index()  # calling the function again does nothing

# # Connection Termination
#
# You must ensure your client connections are closed:

# In[ ]:

client.close()
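
# As an alternative to calling `client.close()` manually, the weaviate-client v4
# connection helpers can also be used as context managers, which close the
# connection automatically when the block exits. Below is a minimal sketch,
# assuming a locally running Weaviate instance (the index name `"LlamaIndex"`
# matches the one used above).

# In[ ]:

# Sketch: the client is closed automatically when the `with` block exits.
with weaviate.connect_to_local() as client:
    vector_store = WeaviateVectorStore(
        weaviate_client=client, index_name="LlamaIndex"
    )
    loaded_index = VectorStoreIndex.from_vector_store(vector_store)
    response = loaded_index.as_query_engine().query(
        "What did the author do growing up?"
    )
    display(Markdown(f"{response}"))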
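
# #### Aside: deleting a single document
#
# `delete_index` above drops the whole index. If you only wanted to remove the
# dummy document inserted in the metadata-filtering section, LlamaIndex indexes
# also expose `delete_ref_doc`. A sketch (commented out, since it assumes the
# client and `loaded_index` from that section are still open):

# In[ ]:

# Remove just the dummy document by its ref doc id, keeping the rest of the index.
# loaded_index.delete_ref_doc(doc.doc_id, delete_from_docstore=True)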