#!/usr/bin/env python
# coding: utf-8

# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aurelio-labs/semantic-router/blob/main/docs/examples/video-splitter.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/aurelio-labs/semantic-router/blob/main/docs/examples/video-splitter.ipynb)

# # Splitting Videos using Semantic Router's ViT Encoder
#
# Similar to prose, a video is a sequence of frames with a temporal component.
#
# By measuring the similarity between these frames, we can split videos based on the semantic content of their constituent frames.
#
# Let's start by loading a test video and splitting it into frames.

# In[ ]:


get_ipython().system('pip install -qU "semantic-router[vision]==0.0.25" opencv-python matplotlib')


# In[1]:


import cv2

vidcap = cv2.VideoCapture("https://www.w3schools.com/html/mov_bbb.mp4")

frames = []
success, image = vidcap.read()
while success:
    # note: OpenCV returns frames in BGR channel order
    frames.append(image)
    success, image = vidcap.read()

len(frames)


# In[2]:


from PIL import Image

image_frames = list(map(Image.fromarray, frames))
len(image_frames)


# Now that we have the frames loaded, we can use the `Splitter` functionality to create splits based on frame similarity.
#
# First, let's initialise our ViT encoder.

# In[3]:


from semantic_router.encoders import VitEncoder

# "mps" targets Apple silicon; use "cuda" or "cpu" as appropriate for your hardware
encoder = VitEncoder(device="mps")


# Now let's initialise our Splitter.
#
# > Note: currently, only `semantic_router.splitters.ConsecutiveSimSplitter` can be used for image content.

# In[5]:


from semantic_router.splitters.consecutive_sim import ConsecutiveSimSplitter

splitter = ConsecutiveSimSplitter(encoder=encoder, score_threshold=0.5)
splits = splitter(docs=image_frames)
len(splits)


# In[6]:


import matplotlib.pyplot as plt

# show the first, middle, and last frame of each split (one row per split)
f, axarr = plt.subplots(len(splits), 3, figsize=(20, 5))
for i, split in enumerate(splits):
    axarr[i, 0].imshow(split.docs[0])
    num_docs = len(split.docs)
    mid = num_docs // 2
    axarr[i, 1].imshow(split.docs[mid])
    axarr[i, 2].imshow(split.docs[num_docs - 1])


# The video has two main camera angles, which are represented here by two semantic splits (each row represents one split; the columns show sample frames from within that split).
#
# Split #1 - scene 1, a high-angle shot of Big Buck Bunny looking up at a butterfly.
#
# Split #2 - scene 2, a straight-up angle shot of Big Buck Bunny, with a distinct yellow background.
#
# Using the ViT features of the frames, we were able to distinguish these two scenes.

# ## What about non-animated footage?
#
# Depending on the complexity of the footage you're trying to split semantically, you might need to adjust the splitter's `score_threshold` parameter.
#
# Let's use a public domain video from the automotive domain to demonstrate.

# In[7]:


vidcap = cv2.VideoCapture(
    "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoingOnBullrun.mp4"
)

frames = []
success, image = vidcap.read()
while success:
    frames.append(image)
    success, image = vidcap.read()

image_frames = list(map(Image.fromarray, frames))
len(image_frames)


# ### How to pick the right `score_threshold`?
#
# It's an art as much as it is a science.
#
# A lower threshold makes the splitter more lenient about grouping frames into the same split; at a threshold of 0, all frames end up in a single split.
#
# Conversely, a higher threshold makes the splitter stricter; at a threshold of 1, every frame (apart from 100% identical ones) ends up in its own split.
#
# A quick way to get a feel for a particular video is to sweep a few candidate values and compare how many splits each produces, as sketched below.
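# The cell below is a minimal sketch of such a sweep, reusing the `encoder` and `image_frames` defined above. The candidate values are arbitrary examples, and each pass re-embeds every frame, so this can be slow for long videos.

# In[ ]:


# minimal sketch: try a few candidate thresholds (arbitrary example values)
# and see how many splits each one produces
candidate_thresholds = [0.3, 0.5, 0.65, 0.8]

for threshold in candidate_thresholds:
    sweep_splitter = ConsecutiveSimSplitter(encoder=encoder, score_threshold=threshold)
    sweep_splits = sweep_splitter(docs=image_frames)
    print(f"score_threshold={threshold}: {len(sweep_splits)} splits")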
# For this video, we empirically found a value of `0.65` to work the best.

# In[8]:


splitter = ConsecutiveSimSplitter(encoder=encoder, score_threshold=0.65)
splits = splitter(docs=image_frames)


# In[9]:


import matplotlib.pyplot as plt

f, axarr = plt.subplots(len(splits), 3, figsize=(20, 60))
for i, split in enumerate(splits):
    axarr[i, 0].imshow(split.docs[0])
    num_docs = len(split.docs)
    mid = num_docs // 2
    axarr[i, 1].imshow(split.docs[mid])
    axarr[i, 2].imshow(split.docs[num_docs - 1])
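# Since each split is a contiguous run of frames, we can also map the splits back to approximate time ranges in the source video. The cell below is a minimal sketch of that idea; it assumes the frames above were read sequentially at the video's native frame rate, and falls back to 25 fps if OpenCV cannot report one.

# In[ ]:


# minimal sketch: convert each split's frame range into an approximate time range,
# assuming the frames above were read sequentially at the native frame rate
fps = vidcap.get(cv2.CAP_PROP_FPS) or 25.0  # fall back to 25 fps if the rate is unknown

start_frame = 0
for i, split in enumerate(splits):
    end_frame = start_frame + len(split.docs)  # exclusive end index
    print(
        f"Split {i + 1}: frames {start_frame}-{end_frame - 1} "
        f"(~{start_frame / fps:.1f}s to ~{end_frame / fps:.1f}s)"
    )
    start_frame = end_frame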