#!/usr/bin/env python
# coding: utf-8

# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aurelio-labs/semantic-router/blob/main/docs/07-multi-modal.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/aurelio-labs/semantic-router/blob/main/docs/07-multi-modal.ipynb)

# # Multi-Modal Routes
#
# The Semantic Router library can also be used for detection of specific images
# or videos, for example the detection of **N**ot **S**hrek **F**or **W**ork
# (NSFW) and **S**hrek **F**or **W**ork (SFW) images as we will demonstrate in
# this walkthrough.

# ## Getting Started
#
# We start by installing the library:

# In[ ]:

# NOTE(review): the duplicate unpinned `pip install datasets` that followed this
# line was removed — it was redundant (datasets is already installed here) and,
# being unpinned, risked drifting away from the 2.17.0 pin on a re-run.
get_ipython().system('pip install -qU "semantic-router[vision]" datasets==2.17.0')

# We start by downloading a multi-modal dataset, we'll be using the
# `aurelio-ai/shrek-detection` dataset from Hugging Face.

# In[ ]:

from datasets import load_dataset

data = load_dataset("aurelio-ai/shrek-detection", split="train", trust_remote_code=True)
# Bare expression: in a notebook this displays the PIL image; it is a no-op
# when run as a plain script.
data[3]["image"]

# We will grab the images that are labeled with `is_shrek`:

# In[ ]:

shrek_pics = [d["image"] for d in data if d["is_shrek"]]
not_shrek_pics = [d["image"] for d in data if not d["is_shrek"]]

print(f"We have {len(shrek_pics)} shrek pics, and {len(not_shrek_pics)} not shrek pics")

# We start by defining a dictionary mapping routes to example phrases that
# should trigger those routes.

# In[ ]:

from semantic_router import Route

# Routes are defined with images as "utterances" — the multi-modal encoder
# embeds both text and images into the same vector space.
shrek = Route(
    name="shrek",
    utterances=shrek_pics,
)

# Let's define another for good measure:

# In[ ]:

not_shrek = Route(
    name="not_shrek",
    utterances=not_shrek_pics,
)

routes = [shrek, not_shrek]

# Now we initialize our embedding model:

# In[ ]:

from semantic_router.encoders.clip import CLIPEncoder

encoder = CLIPEncoder()

# Now we define the `RouteLayer`.
# When called, the route layer will consume text (a query) and output the
# category (`Route`) it belongs to — to initialize a `RouteLayer` we need our
# `encoder` model and a list of `routes`.

# In[ ]:

from semantic_router.layer import RouteLayer

rl = RouteLayer(encoder=encoder, routes=routes)

# Now we can test it with _text_ to see if we hit the routes that we defined
# with images:

# In[ ]:

rl("don't you love politics?")

# In[ ]:

rl("shrek")

# In[ ]:

rl("dwayne the rock johnson")

# Everything is being classified accurately, let's pull in some images that we
# haven't seen before and see if we can classify them as NSFW or SFW.

# In[ ]:

test_data = load_dataset(
    "aurelio-ai/shrek-detection", split="test", trust_remote_code=True
)
test_data

# In[ ]:

# For each of the first three test images: display the image (no-op outside a
# notebook), then route it and read the matched route's name.
for sample_idx in range(3):
    test_data[sample_idx]["image"]
    rl(test_data[sample_idx]["image"]).name

# These all look great, we've managed to accurately classify these images using
# our relatively small route space.
#
# Taking this further, we can easily use
# [route optimization](https://github.com/aurelio-labs/semantic-router/blob/main/docs/06-threshold-optimization.ipynb),
# the addition of more utterances, or more routes to improve performance
# further when and where required.

# ---