#!/usr/bin/env python
# coding: utf-8
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# Licensed under the MIT License.
# # Quickstart: Web Cam Object Detection
#
#
# Object detection is the canonical computer vision task of determining where a specific object is in an image.
#
# This notebook shows a simple example of loading a pretrained Faster R-CNN model for object detection from a webcam stream using the `torchvision` package.
#
# To understand the basics of Object Detection, please visit our [FAQ](FAQ.md). For more details about the underlying technology of object detection tasks\, including finetuning, please see our [training introduction notebook](01_training_introduction.ipynb).
# ## Prerequisite for Webcam example
# This notebook assumes you have **a webcam** connected to your machine. We use the `ipywebrtc` module to show the webcam widget in the notebook. Currently, the widget works on **Chrome** and **Firefox**. For more details about the widget, please visit `ipywebrtc` [github](https://github.com/maartenbreddels/ipywebrtc) or [documentation](https://ipywebrtc.readthedocs.io/en/latest/).
# ## Initialization
# In[1]:
# Regular Python libraries
import io
import os
import sys
import time
import urllib.request
import matplotlib.pyplot as plt
# IPython
import scrapbook as sb
from ipywebrtc import CameraStream, ImageRecorder
from ipywidgets import HBox, Layout, widgets, Widget
# Image
from PIL import Image
# TorchVision
import torchvision
from torchvision import transforms as T
# utils_cv
sys.path.append("../../")
from utils_cv.common.data import data_path
from utils_cv.common.gpu import which_processor, is_windows
from utils_cv.detection.data import coco_labels
from utils_cv.detection.model import DetectionLearner
from utils_cv.detection.plot import PlotSettings, plot_boxes
# Change matplotlib backend so that plots are shown for windows
if is_windows():
plt.switch_backend('TkAgg')
print(f"TorchVision: {torchvision.__version__}")
which_processor()
# This shows your machine's GPUs (if it has any) and the computing device `torch/torchvision` is using.
# In[2]:
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
# # Load Pretrained Model
#
# We will start with a pretrained Faster R-CNN ResNet-50 FPN model which is a relatively small and fast CNN architecture. The [reported box AP](https://pytorch.org/docs/stable/torchvision/models.html#object-detection-instance-segmentation-and-person-keypoint-detection) is 37.0 trained on the instances set of COCO train2017 and evaluated on COCO val2017. [COCO](http://cocodataset.org) is a popular dataset in the research community and consists of 330K images with more than 200K annotated objects of [91 distinct classes](https://pytorch.org/docs/stable/torchvision/models.html#object-detection-instance-segmentation-and-person-keypoint-detection) such as person, car, boat, or cup.
#
# To _speed up_ model inference we restrict processing to use _at most_ an image resolution of 200 pixels, and _only_ use up to 5 candidate object proposals per image.
# In[3]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
pretrained=True,
rpn_pre_nms_top_n_test = 5,
rpn_post_nms_top_n_test = 5,
max_size=200,
)
# Next, we just need to pass this model into our `DetectionLearner` object, and add the pre-defined coco-labels.
# In[4]:
detector = DetectionLearner(
model=model,
labels=coco_labels()[1:], # we use [1:] because the first element of the array is '__background__'
)
# # Object Detection
#
# ## From Image File
# First, we prepare a coffee mug image to show an example of how to score a single image by using the model.
# In[5]:
# Download an example image
IM_URL = "https://cvbp.blob.core.windows.net/public/images/cvbp_cup.jpg"
im_path = os.path.join(data_path(), "example.jpg")
urllib.request.urlretrieve(IM_URL, im_path)
im = Image.open(im_path)
im
# Using the `predict()` method, we ask the model to detect how many objects and what they are on this image. In this case, the only object is "cup".
# In[6]:
detections = detector.predict(im)
# In[7]:
plot_boxes(im, detections["det_bboxes"], plot_settings=PlotSettings(rect_color=(0, 255, 0)))
# In[8]:
detections
# ## From WebCam Stream
#
# Now, we use a WebCam stream for object detection. We use `ipywebrtc` to start a webcam and get the video stream which is sent to the notebook's widget. Note that Jupyter widgets are quite unstable - if the widget below does not show then see the "Troubleshooting" section in this [FAQ](../classification/FAQ.md) for possible fixes.
# In[9]:
# Webcam
w_cam = CameraStream(
constraints={
'facing_mode': 'user',
'audio': False,
'video': { 'width': 200, 'height': 200 }
},
layout=Layout(width='200px')
)
# Image recorder for taking a snapshot
w_imrecorder = ImageRecorder(stream=w_cam, layout=Layout(padding='0 0 0 50px'))
# Label widget to show our object detection results
w_im = widgets.Image(layout=Layout(width='200px'))
def detect_frame(_):
""" Detect objects on an image snapshot by using a pretrained model
"""
# Once capturing started, remove the capture widget since we don't need it anymore
if w_imrecorder.layout.display != 'none':
w_imrecorder.layout.display = 'none'
try:
# Get the image and convert to RGB
im = Image.open(io.BytesIO(w_imrecorder.image.value)).convert('RGB')
# Process the captured image
detections = detector.predict(im)
plot_boxes(im, detections["det_bboxes"], plot_settings=PlotSettings(rect_color=(0, 255, 0)))
# Convert the processed image back into the image widget for display
f = io.BytesIO()
im.save(f, format='png')
w_im.value = f.getvalue()
except OSError:
# If im_recorder doesn't have valid image data, skip it.
pass
# Taking the next snapshot programmatically
w_imrecorder.recording = True
# Register detect_frame as a callback. Will be called whenever image.value changes.
w_imrecorder.image.observe(detect_frame, 'value')
# In[10]:
# Show widgets
HBox([w_cam, w_imrecorder, w_im])
# Now, click the **capture button** in the widget to start object detection. Labels and bounding boxes are displayed to show the most probable objects predicted by the model for an image snapshot.
#
# ![Example Webcam Image](media/00_webcam_snapshot.png)
# # Conclusion
# In this notebook, we used a simple example to demonstrate how to use a pretrained model to detect objects on images. The model is limited to only predict object labels that are part of the COCO training samples. In the [training introduction notebook](01_training_introduction.ipynb), we will learn how to fine-tune a model on our own data.
# In[11]:
# Stop the model and webcam
Widget.close_all()
# In[12]:
# Preserve some of the notebook outputs
detections = [
(x.label_idx, x.label_name, [(x.left, x.top), (x.right, x.bottom)])
for x in detections["det_bboxes"]
]
sb.glue("detection_bounding_box", detections)