#!/usr/bin/env python
# coding: utf-8
# # [Beta] 多模态ReAct Agent
#
#
#
# 在本教程中,我们将向您展示如何构建一个多模态ReAct代理。
#
# 这是一个可以接受文本和图像作为输入任务定义的代理,通过思维链和工具使用来尝试解决任务。
#
# 这是通过我们的低级代理API实现的,允许我们明确地步进ReAct循环,以向您展示每个步骤中发生的事情。
#
# 我们展示了两个用例:
# 1. **RAG代理**:给定文本/图像,可以查询RAG管道以查找答案。(给定OpenAI Dev Day 2023的屏幕截图)
# 2. **Web代理**:给定文本/图像,可以查询网络工具以从网络中查找相关信息(给定鞋子的图片)。
#
# **注意**:这明确是一个beta功能,抽象化可能会随时间改变!
#
# **注意**:目前仅适用于GPT-4V。
#
# ## 使用RAG管道增强图像分析
#
# 在本节中,我们将创建一个配备了RAG工具的多模态代理。
#
# ### 设置数据
#
# In[ ]:
# Install the LlamaIndex integration packages used in this notebook.
get_ipython().run_line_magic('pip', 'install llama-index-llms-openai')
get_ipython().run_line_magic('pip', 'install llama-index-readers-web')
get_ipython().run_line_magic('pip', 'install llama-index-multi-modal-llms-openai')
get_ipython().run_line_magic('pip', 'install llama-index-tools-metaphor')
# In[ ]:
# Download the images we will use to run queries later.
# Create the target directories first: `wget -O <path>` fails when the
# directory portion of <path> does not exist.
import os
os.makedirs("other_images/openai", exist_ok=True)
get_ipython().system('wget "https://images.openai.com/blob/a2e49de2-ba5b-4869-9c2d-db3b4b5dcc19/new-models-and-developer-products-announced-at-devday.jpg?width=2000" -O other_images/openai/dev_day.png')
get_ipython().system('wget "https://drive.google.com/uc\\?id\\=1B4f5ZSIKN0zTTPPRlZ915Ceb3_uF9Zlq\\&export\\=download" -O other_images/adidas.png')
# In[ ]:
from llama_index.readers.web import SimpleWebPageReader

# Fetch the OpenAI Dev Day announcement blog post and convert its HTML
# to plain text documents for indexing.
url = (
    "https://openai.com/blog/"
    "new-models-and-developer-products-announced-at-devday"
)
reader = SimpleWebPageReader(html_to_text=True)
documents = reader.load_data(urls=[url])
# ### 设置工具
#
# In[ ]:
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata
# In[ ]:
from llama_index.core import Settings

# Use a deterministic (temperature=0) GPT-3.5 model as the default LLM for
# all indexing and querying done by LlamaIndex components.
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
# In[ ]:
# Build an in-memory vector index over the Dev Day blog post.
vector_index = VectorStoreIndex.from_documents(
    documents,
)
# In[ ]:
# Wrap the index's query engine as a tool the agent can invoke by name.
query_tool = QueryEngineTool(
    query_engine=vector_index.as_query_engine(),
    metadata=ToolMetadata(
        # Plain string: the previous `f"vector_tool"` had no placeholders,
        # so the `f` prefix was redundant (ruff F541).
        name="vector_tool",
        description=(
            "用于查找OpenAI宣布的新功能"
            # "Useful for looking up any information about the image"
        ),
    ),
)
# ### 设置代理
#
# In[ ]:
from llama_index.core.agent.react_multimodal.step import (
    MultimodalReActAgentWorker,
)
from llama_index.core.multi_modal_llms import MultiModalLLM
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core.agent import Task

# GPT-4V is the multi-modal LLM that drives the ReAct loop.
mm_llm = OpenAIMultiModal(model="gpt-4-vision-preview", max_new_tokens=1000)

# Build a step-wise ReAct worker around the RAG query tool, then wrap it as
# an agent so we can drive the reasoning loop one step at a time.
react_step_engine = MultimodalReActAgentWorker.from_tools(
    [query_tool],
    multi_modal_llm=mm_llm,
    verbose=True,
)
agent = react_step_engine.as_agent()
# In[ ]:
from llama_index.core.schema import ImageDocument

# The task pairs a text instruction with the Dev Day screenshot; the image
# travels alongside the query via the task's extra state.
query_str = (
    "照片显示了OpenAI发布的一些新功能。"
    "你能在照片中找出这些功能并使用相关工具提供更多细节吗?"
)
image_document = ImageDocument(image_path="other_images/openai/dev_day.png")
task = agent.create_task(
    query_str,
    extra_state={"image_docs": [image_document]},
)
# In[ ]:
from llama_index.core.agent import AgentRunner
def execute_step(agent: AgentRunner, task: Task):
    """Advance *task* by one agent step.

    Returns the finalized response once the agent emits its last step,
    otherwise ``None`` so callers know to keep stepping.
    """
    step_output = agent.run_step(task.task_id)
    if not step_output.is_last:
        return None
    response = agent.finalize_response(task.task_id)
    print(f"> Agent finished: {str(response)}")
    return response
def execute_steps(agent: AgentRunner, task: Task):
    """Step the agent repeatedly until it produces a final response."""
    response = None
    while response is None:
        response = execute_step(agent, task)
    return response
# In[ ]:
# Run this cell instead of the step-by-step cells below if you want to
# execute the whole ReAct loop in one go.
# response = execute_steps(agent, task)
# In[ ]:
# Each call advances the agent by one ReAct step; `response` stays None
# until the final step, when it holds the finalized answer.
response = execute_step(agent, task)
# In[ ]:
response = execute_step(agent, task)
# In[ ]:
print(str(response))
# ## 通过网络搜索增强图像分析
#
# 在这个示例中,我们将向您展示如何设置一个由GPT-4V驱动的代理,以在网络上查找信息,帮助更好地解释给定的图像。
#
# In[ ]:
from llama_index.tools.metaphor import MetaphorToolSpec
from llama_index.core.agent.react_multimodal.step import (
    MultimodalReActAgentWorker,
)
from llama_index.core.agent import AgentRunner
from llama_index.core.multi_modal_llms import MultiModalLLM
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core.agent import Task
import os

# Read the Metaphor API key from the environment instead of hard-coding it
# in the notebook; falls back to the previous empty-string default when the
# variable is unset, so existing behavior is unchanged.
metaphor_tool_spec = MetaphorToolSpec(
    api_key=os.environ.get("METAPHOR_API_KEY", ""),
)
metaphor_tools = metaphor_tool_spec.to_tool_list()
# In[ ]:
# GPT-4V again drives the ReAct loop; this time the agent is armed with the
# Metaphor web-search tools instead of the RAG query tool.
mm_llm = OpenAIMultiModal(model="gpt-4-vision-preview", max_new_tokens=1000)
react_step_engine = MultimodalReActAgentWorker.from_tools(
    metaphor_tools,
    multi_modal_llm=mm_llm,
    verbose=True,
)
agent = react_step_engine.as_agent()
# In[ ]:
from llama_index.core.schema import ImageDocument

# Ask for reviews of the shoes shown in the image; the picture rides along
# in the task's extra state just like in the RAG example.
query_str = "Look up some reviews regarding these shoes."
image_document = ImageDocument(image_path="other_images/adidas.png")
task = agent.create_task(
    query_str,
    extra_state={"image_docs": [image_document]},
)
# In[ ]:
response = execute_step(agent, task)
# In[ ]:
response = execute_step(agent, task)
# In[ ]:
# Execute a step and get the agent's response.
# NOTE(review): the number of manual step cells needed depends on the
# agent's trajectory; keep stepping until execute_step prints that the
# agent has finished.
response = execute_step(agent, task)
# In[ ]:
response = execute_step(agent, task)
# In[ ]:
print(str(response))