# Identify emojis in the real world with your phone’s camera - Experiment / YouTube Presentation
# Import every third-party dependency used below and print its version right
# after the import, so the environment that produced a run can be identified.
import torch
print("Torch version:", torch.__version__)
import torchvision
print("Torchvision version:", torchvision.__version__)
import numpy as np
print("Numpy version:", np.__version__)
import matplotlib
print("Matplotlib version:", matplotlib.__version__)
import PIL
print("PIL version:", PIL.__version__)
import IPython
print("IPython version:", IPython.__version__)
import cv2
print('OpenCV version:', cv2.__version__)
# Setup Matplotlib
# NOTE: the next line is an IPython magic, not plain Python: this file only
# runs inside Jupyter/IPython, where it renders figures in the cell output.
%matplotlib inline
#%config InlineBackend.figure_format = 'retina' # If you have a retina screen
import matplotlib.pyplot as plt
# Load a ResNet-18 with pretrained weights (used below as the image classifier)
model = torchvision.models.resnet18(pretrained=True)
# Set model in "evaluation" mode (inference behaviour for layers such as
# dropout/batch-norm). `eval()` returns the module itself; assigning it to `_`
# keeps a notebook cell from echoing the model's repr.
_ = model.eval()
from torchvision import transforms

# Per-channel statistics handed to Normalize (predefined values)
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Preprocessing steps, applied in this order to every webcam frame
preprocessing_steps = [
    transforms.ToPILImage(),  # webcam array -> PIL image
    transforms.ToTensor(),    # PIL image -> PyTorch tensor
    transforms.Normalize(mean=channel_mean, std=channel_std),
]

# The input pipeline used by the classification loop
pipeline = transforms.Compose(preprocessing_steps)
import json

# Load classes. The file is read as a JSON object whose keys are class ids
# (as strings) and whose values are labels -- that is how `to_id` inverts it.
# A context manager closes the file handle instead of leaking it.
with open('imagenet-classes.json', encoding='utf-8') as classes_file:
    imagenet_classes = json.load(classes_file)

# label -> integer class id
to_id = {label: int(i) for i, label in imagenet_classes.items()}


def to_ids(labels):
    """Return the integer class ids for an iterable of labels.

    Args:
        labels: iterable of label strings present in ``to_id``.

    Returns:
        A list of ids, in the same order as ``labels``.

    Raises:
        KeyError: if a label is not a key of ``to_id``.
    """
    return [to_id[label] for label in labels]
# Objects to look for: display name -> labels that count as that object
items = {
    'cup': [
        'cup', 'goblet', 'coffee_mug', 'espresso',
        'eggnog', 'red_wine', 'beer_glass',
    ],
    'clock': [
        'digital_clock', 'digital_watch', 'analog_clock',
        'wall_clock', 'stopwatch',
    ],
    'bottle': [
        'water_bottle', 'pop_bottle', 'wine_bottle', 'beer_bottle',
    ],
}

# Display names, in the dictionary's insertion order
items_names = [name for name in items]

# Progress tracker: the goal is to find every item, none is found at start
was_found = dict.fromkeys(items, False)
from IPython import display
import time

# Connect to webcam, unless a handle from a previous run is still around
if 'webcam' not in locals() or webcam is None:
    webcam = cv2.VideoCapture(0)

try:
    # Try to read from the webcam
    webcam_found, _ = webcam.read()

    if webcam_found:
        # Create figure: probabilities on the left, webcam frame on the right
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(6, 2))

        # The label -> id lookup is the same for every frame: do it once.
        # Same order as `items`, hence as `items_names` and `was_found`.
        item_ids = [to_ids(labels) for labels in items.values()]

        stream_lost = False
        for _ in range(1000):
            # Take a picture with the webcam
            frame_ok, image = webcam.read()
            if not frame_ok:
                # No frame was returned (e.g. camera unplugged): stop here
                # rather than hand an invalid image to cv2.resize.
                stream_lost = True
                break

            # Process it
            image = cv2.resize(image, (224, 224))  # Resize to fit the model
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # To RGB

            # Classify image (np.newaxis adds the leading batch dimension)
            image_pytorch = pipeline(image)
            output = model(torch.autograd.Variable(image_pytorch[np.newaxis, :]))
            all_probs = torch.nn.functional.softmax(output, 1).view(-1).data.numpy()

            # Probability of an item = highest probability among its labels
            probs = [all_probs[ids].max() for ids in item_ids]

            # Did we find an object? Only the most likely item is marked,
            # and only when it is above the 0.2 threshold.
            if max(probs) > 0.2:
                found_class = items_names[np.argmax(probs)]
                was_found[found_class] = True

            # Plot the probabilities and the image
            ax1.cla()
            ax1.barh(
                np.arange(len(items)),
                probs,
                height=0.5,
                tick_label=[
                    '{} [{}]'.format(c, '✓' if done else '✗')
                    for c, done in was_found.items()
                ],
            )
            ax1.set_xlim(0, 1)
            ax2.cla()
            ax2.imshow(image, aspect='auto')
            ax2.set_title('webcam')

            # Set title
            if np.all(list(was_found.values())):
                ax1.set_title(r'Bravo \°$\smile$°/ !')
            else:
                ax1.set_title('Find a ..')

            # Jupyter trick: replace the previous output with the new figure
            display.clear_output(wait=True)
            display.display(fig)

            # Rest a bit for CPU
            time.sleep(0.2)

        # Clear output
        display.clear_output()
        if stream_lost:
            print('Lost the webcam stream, stopping.')
    else:
        print('Cannot read from webcam, do you have one connected?')

except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop: just clear output
    display.clear_output()

finally:
    # Disconnect webcam: release the device explicitly instead of relying on
    # garbage collection, then drop the name so that the next run reconnects.
    webcam.release()
    del webcam