#!/usr/bin/env python
# coding: utf-8

# # [Beta] Multi-Modal ReAct Agent
#
# In this tutorial we show how to build a multi-modal ReAct agent: an agent
# that can take both text and images as its input task definition, and tries
# to solve the task through chain-of-thought reasoning and tool use.
#
# This is implemented through our lower-level agent API, which lets us step
# through the ReAct loop explicitly to show what happens at each step.
#
# Two use cases are demonstrated:
# 1. **RAG agent**: given text/images, can query a RAG pipeline for answers
#    (given screenshots from OpenAI Dev Day 2023).
# 2. **Web agent**: given text/images, can query a web tool to look up
#    relevant information from the web (given a picture of shoes).
#
# **NOTE**: this is explicitly a beta feature — the abstractions may change
# over time!
#
# **NOTE**: currently only works with GPT-4V.

# ## Augmenting image analysis with a RAG pipeline
#
# In this section we create a multi-modal agent equipped with a RAG tool.

# ### Set up the data

# In[ ]:

get_ipython().run_line_magic('pip', 'install llama-index-llms-openai')
get_ipython().run_line_magic('pip', 'install llama-index-readers-web')
get_ipython().run_line_magic('pip', 'install llama-index-multi-modal-llms-openai')
get_ipython().run_line_magic('pip', 'install llama-index-tools-metaphor')


# In[ ]:

# Download the images we will run queries against later.
get_ipython().system('wget "https://images.openai.com/blob/a2e49de2-ba5b-4869-9c2d-db3b4b5dcc19/new-models-and-developer-products-announced-at-devday.jpg?width=2000" -O other_images/openai/dev_day.png')
get_ipython().system('wget "https://drive.google.com/uc\\?id\\=1B4f5ZSIKN0zTTPPRlZ915Ceb3_uF9Zlq\\&export\\=download" -O other_images/adidas.png')


# In[ ]:

from llama_index.readers.web import SimpleWebPageReader

url = "https://openai.com/blog/new-models-and-developer-products-announced-at-devday"
reader = SimpleWebPageReader(html_to_text=True)
documents = reader.load_data(urls=[url])


# ### Set up the tools

# In[ ]:

from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata


# In[ ]:

from llama_index.core import Settings

Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")


# In[ ]:

vector_index = VectorStoreIndex.from_documents(
    documents,
)


# In[ ]:

query_tool = QueryEngineTool(
    query_engine=vector_index.as_query_engine(),
    metadata=ToolMetadata(
        # Plain string — the original needlessly used an f-string with no
        # placeholders here (ruff F541).
        name="vector_tool",
        description=(
            "用于查找OpenAI宣布的新功能"
        ),
    ),
)


# ### Set up the agent

# In[ ]:

from llama_index.core.agent.react_multimodal.step import (
    MultimodalReActAgentWorker,
)
from llama_index.core.multi_modal_llms import MultiModalLLM
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core.agent import Task

mm_llm = OpenAIMultiModal(model="gpt-4-vision-preview", max_new_tokens=1000)

# Option 2: initialize using OpenAIAgentWorker.
react_step_engine = MultimodalReActAgentWorker.from_tools(
    [query_tool],
    multi_modal_llm=mm_llm,
    verbose=True,
)
agent = react_step_engine.as_agent()


# In[ ]:

query_str = (
    "照片显示了OpenAI发布的一些新功能。"
    "你能在照片中找出这些功能并使用相关工具提供更多细节吗?"
)

from llama_index.core.schema import ImageDocument

# The image document that accompanies the text query.
image_document = ImageDocument(image_path="other_images/openai/dev_day.png")

task = agent.create_task(
    query_str,
    extra_state={"image_docs": [image_document]},
)


# In[ ]:

from llama_index.core.agent import AgentRunner


def execute_step(agent: AgentRunner, task: Task):
    """Run one step of the agent's ReAct loop for ``task``.

    Returns the finalized response on the last step, otherwise ``None``.
    """
    step_output = agent.run_step(task.task_id)
    # Only finalize once the worker reports the task is complete; until then
    # the caller is expected to keep stepping.
    if step_output.is_last:
        response = agent.finalize_response(task.task_id)
        print(f"> Agent finished: {str(response)}")
        return response
    else:
        return None


def execute_steps(agent: AgentRunner, task: Task):
    """Step the agent until completion and return the final response."""
    response = execute_step(agent, task)
    while response is None:
        response = execute_step(agent, task)
    return response


# In[ ]:

# Run this instead of the cells below if you just want to run everything at
# once.
# response = execute_steps(agent, task)


# In[ ]:

response = execute_step(agent, task)


# In[ ]:

response = execute_step(agent, task)


# In[ ]:

print(str(response))


# ## Augmenting image analysis with web search
#
# In this example we show how to set up a GPT-4V-powered agent that looks up
# information on the web to help better explain a given image.

# In[ ]:

from llama_index.tools.metaphor import MetaphorToolSpec
from llama_index.core.agent.react_multimodal.step import (
    MultimodalReActAgentWorker,
)
from llama_index.core.agent import AgentRunner
from llama_index.core.multi_modal_llms import MultiModalLLM
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core.agent import Task

# NOTE: fill in your Metaphor API key here before running.
metaphor_tool_spec = MetaphorToolSpec(
    api_key="",
)
metaphor_tools = metaphor_tool_spec.to_tool_list()


# In[ ]:

mm_llm = OpenAIMultiModal(model="gpt-4-vision-preview", max_new_tokens=1000)

# Option 2: initialize using OpenAIAgentWorker.
react_step_engine = MultimodalReActAgentWorker.from_tools(
    metaphor_tools,
    multi_modal_llm=mm_llm,
    verbose=True,
)
agent = react_step_engine.as_agent()


# In[ ]:

from llama_index.core.schema import ImageDocument

query_str = "Look up some reviews regarding these shoes."
image_document = ImageDocument(image_path="other_images/adidas.png")

task = agent.create_task(
    query_str, extra_state={"image_docs": [image_document]}
)


# In[ ]:

response = execute_step(agent, task)


# In[ ]:

response = execute_step(agent, task)


# In[ ]:

# Step again and capture the agent's response.
response = execute_step(agent, task)


# In[ ]:

response = execute_step(agent, task)


# In[ ]:

print(str(response))