#!/usr/bin/env python
# coding: utf-8

# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aurelio-labs/semantic-router/blob/main/docs/examples/video-splitter.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/aurelio-labs/semantic-router/blob/main/docs/examples/video-splitter.ipynb)

# # Splitting Videos using Semantic Router's ViT Encoder
#
# Similar to prose, a video is a sequence of frames with a temporal component.
#
# By measuring the similarity between these frames, we can split videos based on the semantic content of their constituent frames.
#
# Let's start by loading a test video and splitting it into frames.

# In[ ]:


get_ipython().system('pip install -qU "semantic-router[vision]==0.0.25" opencv-python matplotlib')


# In[1]:


import cv2

vidcap = cv2.VideoCapture("https://www.w3schools.com/html/mov_bbb.mp4")

frames = []
success, image = vidcap.read()
while success:
    # note: OpenCV returns frames in BGR channel order
    frames.append(image)
    success, image = vidcap.read()

len(frames)


# In[2]:


from PIL import Image

image_frames = list(map(Image.fromarray, frames))
len(image_frames)


# Now that we have the frames loaded, we can use the `Splitter` functionality to create splits based on frame similarity.
#
# First, let's initialise our ViT encoder.

# In[3]:


from semantic_router.encoders import VitEncoder

# "mps" targets Apple silicon; use "cuda" or "cpu" as appropriate for your hardware
encoder = VitEncoder(device="mps")


# Now let's initialise our Splitter.
#
# > Note: currently, only `semantic_router.splitters.ConsecutiveSimSplitter` can be used for image content.

# In[5]:


from semantic_router.splitters.consecutive_sim import ConsecutiveSimSplitter

splitter = ConsecutiveSimSplitter(encoder=encoder, score_threshold=0.5)
splits = splitter(docs=image_frames)
len(splits)


# In[6]:


import matplotlib.pyplot as plt

# show the first, middle, and last frame of each split (one row per split)
f, axarr = plt.subplots(len(splits), 3, figsize=(20, 5))
for i, split in enumerate(splits):
    axarr[i, 0].imshow(split.docs[0])
    num_docs = len(split.docs)
    mid = num_docs // 2
    axarr[i, 1].imshow(split.docs[mid])
    axarr[i, 2].imshow(split.docs[num_docs - 1])


# The video has two main camera angles, which are represented here by two semantic splits (each row represents one split; the columns show sample frames from within that split).
#
# Split #1 - scene 1, a high-angle shot of Big Buck Bunny looking up at a butterfly.
#
# Split #2 - scene 2, a straight-up angle shot of Big Buck Bunny, with a distinct yellow background.
#
# Using the ViT features of the frames, we were able to distinguish these two scenes.

# ## What about non-animated footage?
#
# Depending on the complexity of the footage you're trying to split semantically, you might need to adjust the splitter's `score_threshold` parameter.
#
# Let's use a public domain video from the automotive domain to demonstrate.

# In[7]:


vidcap = cv2.VideoCapture(
    "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoingOnBullrun.mp4"
)

frames = []
success, image = vidcap.read()
while success:
    frames.append(image)
    success, image = vidcap.read()

image_frames = list(map(Image.fromarray, frames))
len(image_frames)


# ### How to pick the right `score_threshold`?
#
# It's an art as much as it is a science.
#
# A lower threshold makes the splitter more lenient about grouping frames into the same split; at a threshold of 0, all frames end up in a single split.
#
# Conversely, a higher threshold makes the splitter stricter; at a threshold of 1, every frame (apart from 100% identical ones) ends up in its own split.
#
# A quick way to get a feel for a particular video is to sweep a few candidate values and compare how many splits each produces, as sketched below.
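# The cell below is a minimal sketch of such a sweep, reusing the `encoder` and `image_frames` defined above. The candidate values are arbitrary examples, and each pass re-embeds every frame, so this can be slow for long videos.

# In[ ]:


# minimal sketch: try a few candidate thresholds (arbitrary example values)
# and see how many splits each one produces
candidate_thresholds = [0.3, 0.5, 0.65, 0.8]

for threshold in candidate_thresholds:
    sweep_splitter = ConsecutiveSimSplitter(encoder=encoder, score_threshold=threshold)
    sweep_splits = sweep_splitter(docs=image_frames)
    print(f"score_threshold={threshold}: {len(sweep_splits)} splits")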
# For this video, we empirically found a value of `0.65` to work the best.

# In[8]:


splitter = ConsecutiveSimSplitter(encoder=encoder, score_threshold=0.65)
splits = splitter(docs=image_frames)


# In[9]:


import matplotlib.pyplot as plt

f, axarr = plt.subplots(len(splits), 3, figsize=(20, 60))
for i, split in enumerate(splits):
    axarr[i, 0].imshow(split.docs[0])
    num_docs = len(split.docs)
    mid = num_docs // 2
    axarr[i, 1].imshow(split.docs[mid])
    axarr[i, 2].imshow(split.docs[num_docs - 1])
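# Since each split is a contiguous run of frames, we can also map the splits back to approximate time ranges in the source video. The cell below is a minimal sketch of that idea; it assumes the frames above were read sequentially at the video's native frame rate, and falls back to 25 fps if OpenCV cannot report one.

# In[ ]:


# minimal sketch: convert each split's frame range into an approximate time range,
# assuming the frames above were read sequentially at the native frame rate
fps = vidcap.get(cv2.CAP_PROP_FPS) or 25.0  # fall back to 25 fps if the rate is unknown

start_frame = 0
for i, split in enumerate(splits):
    end_frame = start_frame + len(split.docs)  # exclusive end index
    print(
        f"Split {i + 1}: frames {start_frame}-{end_frame - 1} "
        f"(~{start_frame / fps:.1f}s to ~{end_frame / fps:.1f}s)"
    )
    start_frame = end_frame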