#!/usr/bin/env python
# coding: utf-8

# ## Scavenger hunt
#
# "Identify emojis in the real world with your phone’s camera" - [Experiment][scavenger-experiment] / [YouTube Presentation][scavenger-youtube]
#
# [scavenger-youtube]: https://youtu.be/jr3q_9pJBr8
# [scavenger-experiment]: https://emojiscavengerhunt.withgoogle.com/

# ## Load libraries

# In[ ]:


import torch
print("Torch version:", torch.__version__)

import torchvision
print("Torchvision version:", torchvision.__version__)

import numpy as np
print("Numpy version:", np.__version__)

import matplotlib
print("Matplotlib version:", matplotlib.__version__)

import PIL
print("PIL version:", PIL.__version__)

import IPython
print("IPython version:", IPython.__version__)

import cv2
print('OpenCV version:', cv2.__version__)


# In[ ]:


# Setup Matplotlib
get_ipython().run_line_magic('matplotlib', 'inline')
#%config InlineBackend.figure_format = 'retina'  # If you have a retina screen
import matplotlib.pyplot as plt


# ## Pretrained models

# In[ ]:


# Load a pretrained model
model = torchvision.models.resnet18(pretrained=True)

# Set the model to "evaluation" mode
_ = model.eval()


# In[ ]:


from torchvision import transforms

# Define the input pipeline
pipeline = transforms.Compose([
    transforms.ToPILImage(),  # Convert webcam images to PIL format
    transforms.ToTensor(),    # Convert to PyTorch Tensor
    transforms.Normalize(     # Normalize using predefined values
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


# In[ ]:


import json

# Load classes
imagenet_classes = json.load(open('imagenet-classes.json'))

# Helpers to transform label(s) to id(s)
to_id = {label: int(i) for i, label in imagenet_classes.items()}  # label -> id
to_ids = lambda labels: [to_id[label] for label in labels]        # labels -> ids

# Define items
items = {
    'cup': ['cup', 'goblet', 'coffee_mug', 'espresso', 'eggnog', 'red_wine', 'beer_glass'],
    'clock': ['digital_clock', 'digital_watch', 'analog_clock', 'wall_clock', 'stopwatch'],
    'bottle': ['water_bottle', 'pop_bottle', 'wine_bottle', 'beer_bottle']
}
items_names = list(items.keys())

# The goal is to find all items
was_found = {c: False for c in items.keys()}


# ## Test with webcam feed

# In[ ]:


from IPython import display
import time

# Connect to webcam
if 'webcam' not in locals() or webcam is None:
    webcam = cv2.VideoCapture(0)

try:
    # Try to read from the webcam
    webcam_found, _ = webcam.read()
    if webcam_found:
        # Create figure
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(6, 2))

        for i in range(1000):
            # Take a picture with the webcam
            _, image = webcam.read()

            # Process it
            image = cv2.resize(image, (224, 224))           # Resize to fit the model
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # To RGB

            # Classify image
            image_pytorch = pipeline(image)
            output = model(torch.autograd.Variable(image_pytorch[np.newaxis, :]))
            all_probs = torch.nn.functional.softmax(output, 1).view(-1).data.numpy()

            # Get probabilities for our items
            probs = [all_probs[to_ids(labels)].max() for c, labels in items.items()]

            # Did we find an object?
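            # Note: the 0.2 cut-off below is a heuristic confidence threshold,
            # not a value dictated by the model; raise it to reduce false
            # positives, or lower it if items are hard to detect on your webcam.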
            if max(probs) > 0.2:
                found_class = items_names[np.argmax(probs)]
                was_found[found_class] = True

            # Plot the image
            ax1.cla()
            ax1.barh(np.arange(len(items)), probs, height=0.5, tick_label=[
                '{} [{}]'.format(c, '✓' if done else '✗') for c, done in was_found.items()])
            ax1.set_xlim(0, 1)
            ax2.cla()
            ax2.imshow(image, aspect='auto')
            ax2.set_title('webcam')

            # Set title
            if np.all(list(was_found.values())):
                ax1.set_title(r'Bravo \°$\smile$°/ !')
            else:
                ax1.set_title('Find a ..')

            # Jupyter trick
            display.clear_output(wait=True)
            display.display(fig)

            # Rest a bit for CPU
            time.sleep(0.2)

        # Clear output
        display.clear_output()
    else:
        print('Cannot read from webcam, do you have one connected?')

except KeyboardInterrupt:
    # Clear output
    display.clear_output()

finally:
    # Disconnect webcam (release the device before dropping the handle)
    webcam.release()
    del webcam
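
# ## Sanity check on a single image
#
# A minimal sketch (not part of the original hunt) to verify the model and
# pipeline without a webcam: it pushes one image file through the same
# preprocessing and prints the top-5 ImageNet labels. The filename
# 'test.jpg' is a placeholder; the cell reuses `model`, `pipeline` and
# `imagenet_classes` defined above and assumes PyTorch >= 0.4 for
# `torch.no_grad()`.

# In[ ]:


from PIL import Image

# Load an image as an RGB numpy array (placeholder path)
image = np.array(Image.open('test.jpg').convert('RGB'))
image = cv2.resize(image, (224, 224))  # Same input size as the webcam loop

# Classify it without tracking gradients
with torch.no_grad():
    output = model(pipeline(image)[np.newaxis, :])
    all_probs = torch.nn.functional.softmax(output, 1).view(-1)

# Show the five most likely ImageNet classes
top_probs, top_ids = torch.topk(all_probs, 5)
for p, i in zip(top_probs.tolist(), top_ids.tolist()):
    print('{:25s} {:.1%}'.format(imagenet_classes[str(i)], p))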