Create a deep model for your own task (webcam/other set of images)
"Teach a machine using your camera" - Experiment / YouTube Presentation
import torch
print("Torch version:", torch.__version__)
import torchvision
print("Torchvision version:", torchvision.__version__)
import numpy as np
print("Numpy version:", np.__version__)
import matplotlib
print("Matplotlib version:", matplotlib.__version__)
import PIL
print("PIL version:", PIL.__version__)
import IPython
print("IPython version:", IPython.__version__)
import cv2
print('OpenCV version:', cv2.__version__)
# Setup Matplotlib
%matplotlib inline
#%config InlineBackend.figure_format = 'retina' # If you have a retina screen
import matplotlib.pyplot as plt
from IPython import display
import os, time
# Path to write images; `prefix` distinguishes capture sessions.
img_path = os.path.join('images/normal')
prefix = 'session1'
# Make sure the target directory exists -- cv2.imwrite fails silently otherwise.
os.makedirs(img_path, exist_ok=True)
# Connect to webcam (reuse an existing handle if one is already open)
if 'webcam' not in locals() or webcam is None:
    webcam = cv2.VideoCapture(0)
try:
    # Try to read from the webcam
    webcam_found, _ = webcam.read()
    if webcam_found:
        # How many photos to save
        n_images = int(input("Number of photos: "))
        # Create figure to display webcam
        fig = plt.figure()
        axis = fig.gca()
        # Collect images after a short on-screen countdown
        live_in = 3
        image_taken = 0
        while image_taken < n_images:
            # Take a picture with the webcam
            _, image = webcam.read()
            # Process it
            image = cv2.resize(image, (250, 250))  # Reduce size
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV is BGR, matplotlib wants RGB
            # Plot it
            axis.cla()
            axis.imshow(image_rgb)
            if live_in == 0:
                # We are live!
                image_taken += 1
                axis.set_title('Click ! ({}/{})'.format(image_taken, n_images))
                # Save the image (the BGR frame -- cv2.imwrite expects BGR)
                path = os.path.join(img_path, '{}-{}.png'.format(prefix, image_taken))
                cv2.imwrite(path, image)
                # Time before taking the next picture
                sleep_time = 0.2
            else:
                # Still counting down before the first capture
                axis.set_title("We're live in .. {}".format(live_in))
                sleep_time = 1
                live_in -= 1
            display.clear_output(wait=True)
            display.display(fig)
            # Sleep
            time.sleep(sleep_time)
        # Clear output
        display.clear_output()
    else:
        print('Cannot read from webcam, do you have one connected?')
except KeyboardInterrupt:
    # Clear output
    display.clear_output()
finally:
    # Release the camera device before dropping the handle, otherwise the
    # OS keeps it busy until the kernel restarts (`del` alone is not enough).
    webcam.release()
    del webcam
from torchvision import transforms
# ImageNet channel statistics -- required input scaling for the
# pretrained PyTorch models used below.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225]
)
# Training pipeline: a random 224x224 crop doubles as light augmentation.
train_transform = transforms.Compose([
    transforms.RandomCrop(224),
    transforms.ToTensor(),
    normalize,
])
# Validation pipeline: deterministic resize, no augmentation.
valid_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize,
])
# Two views of the same image folder, one per pipeline.
trainset = torchvision.datasets.ImageFolder('images', train_transform)
validset = torchvision.datasets.ImageFolder('images', valid_transform)
classes = trainset.classes
n_classes = len(classes)
print('Classes:', classes)
from torch.utils.data.sampler import SubsetRandomSampler
# Split the dataset indices into a training part and a held-out
# validation part of `valid_size` images.
n_images = len(trainset)
shuffled_idx = np.random.permutation(n_images)  # shuffled 0 .. n_images-1
valid_size = 100
train_sampler = SubsetRandomSampler(shuffled_idx[:-valid_size])
valid_sampler = SubsetRandomSampler(shuffled_idx[-valid_size:])
print('Train set:', len(train_sampler))
print('Validation set:', len(valid_sampler))
# Loaders draw mini-batches of 4 through their respective samplers.
train_loader = torch.utils.data.DataLoader(trainset, batch_size=4, sampler=train_sampler)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=4, sampler=valid_sampler)
# Sanity-check the pipeline: show one mini-batch with its labels.
sample_images, sample_labels = next(iter(train_loader))
print('Classes:', ', '.join(classes[label] for label in sample_labels))
sample_grid = torchvision.utils.make_grid(sample_images, normalize=True)
# make_grid returns CxHxW; matplotlib wants HxWxC.
plt.imshow(sample_grid.numpy().transpose((1, 2, 0)))
plt.show()
Can we reuse what has been learned on other tasks? (Source: CS231n)
In practice, very few people train an entire Convolutional Network from scratch (with random initialization), because it is relatively rare to have a dataset of sufficient size. Instead, it is common to pretrain a ConvNet on a very large dataset (e.g. ImageNet, which contains 1.2 million images with 1000 categories), and then use the ConvNet either as an initialization or a fixed feature extractor for the task of interest.
Transfer Learning Scenarios
def resnet_freezed():
    """ResNet-18 used as a fixed feature extractor.

    Every pretrained weight is frozen; only the freshly created
    classification head (one output per class) stays trainable.
    """
    net = torchvision.models.resnet18(pretrained=True)
    for weight in net.parameters():
        weight.requires_grad = False  # freeze the backbone
    # The new head is created *after* freezing, so its parameters
    # keep requires_grad=True.
    net.fc = torch.nn.Linear(net.fc.in_features, len(classes))
    return net
resnet_freezed()
from collections import defaultdict
# Instantiate the frozen feature extractor with a fresh head.
model = resnet_freezed()
# Only the classification layer is optimized -- everything else is frozen.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.fc.parameters(), lr=0.01)
# Backprop step
def compute_loss(output, target):
    """Cross-entropy loss between network `output` and integer labels.

    `torch.autograd.Variable` is a deprecated no-op in modern PyTorch;
    `torch.as_tensor` accepts tensors as well as plain label sequences
    without copying when no conversion is needed.
    """
    target = torch.as_tensor(target, dtype=torch.long)
    return criterion(output, target)
def backpropagation(output, target):
    """Run one optimization step and return the resulting loss.

    Returns the loss detached from the autograd graph; the old
    `.data` attribute is deprecated and can bypass autograd checks.
    """
    optimizer.zero_grad()                # Clear accumulated gradients
    loss = compute_loss(output, target)  # Compute loss
    loss.backward()                      # Backpropagation
    optimizer.step()                     # Let the optimizer adjust our model
    return loss.detach()
# Helper function
def get_accuracy(output, y):
    """Fraction of samples where the max-activation class equals `y`.

    Stays entirely in torch (the original mixed `np.equal` with tensor
    methods, which is fragile) and returns a plain Python float.
    """
    predictions = torch.argmax(output, dim=1)  # Max activation per sample
    targets = torch.as_tensor(y)
    return (predictions == targets).float().mean().item()
# Figure with one panel for loss and one for accuracy.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(8, 3))
def plot_learning():
    """Redraw the live loss/accuracy curves for the current training state."""
    fig.suptitle('Epoch {}, batch {:,}/{:,}'.format(epoch, batch, len(train_loader)))
    ax1.cla()
    ax2.cla()
    # Panel titles carry a running validation average once validation has run.
    if stats['val_t']:
        ax1.set_title('Loss, val: {:.3f}'.format(np.mean(stats['val_loss'][-10:])))
        ax2.set_title('Accuracy, val: {:.3f}'.format(np.mean(stats['val_acc'][-10:])))
    else:
        ax1.set_title('Loss')
        ax2.set_title('Accuracy')
    # Same train/valid pair of curves on each panel.
    for axis, metric in ((ax1, 'loss'), (ax2, 'acc')):
        axis.plot(stats['train_t'], stats['train_' + metric], label='train')
        axis.plot(stats['val_t'], stats['val_' + metric], label='valid')
        axis.legend()
    ax2.set_ylim(0, 1)  # accuracy is a fraction
    # Jupyter trick: replace the previous frame instead of stacking figures.
    IPython.display.clear_output(wait=True)
    IPython.display.display(fig)
# Collect loss / accuracy values
stats = defaultdict(list)
t = 0            # Number of training samples seen so far
print_step = 10  # Refresh the plot every `print_step` samples
# Train Network (interactive: asks after each epoch whether to continue)
epoch = 1
do_training = True
while do_training:
    # Set model in "training" mode
    model.train()
    # Train by small batches of data
    for batch, (batch_X, batch_y) in enumerate(train_loader, 1):
        # Forward pass & backpropagation
        output = model(batch_X)
        loss = backpropagation(output, batch_y)
        # Log "train" stats
        stats['train_loss'].append(loss)
        stats['train_acc'].append(get_accuracy(output, batch_y))
        stats['train_t'].append(t)
        if t % print_step == 0:
            # Plot learning
            plot_learning()
        # Update the number of samples seen
        t += train_loader.batch_size
    # Set model in "evaluation" mode and score the validation set.
    model.eval()
    loss_vals, acc_vals = [], []
    # no_grad() skips autograd graph construction during validation:
    # faster and much lower memory than the original bare loop.
    with torch.no_grad():
        for X, y in valid_loader:
            output = model(X)
            loss_vals.append(compute_loss(output, y).item())
            acc_vals.append(get_accuracy(output, y))
    stats['val_loss'].append(np.mean(loss_vals))
    stats['val_acc'].append(np.mean(acc_vals))
    stats['val_t'].append(t)
    # Plot learning
    plot_learning()
    # Should we continue?
    do_training = int(input('Continue training? 1 (yes) or 0 (no): '))
    epoch += 1
# Clear output
IPython.display.clear_output(wait=True)
def resnet():
    """ResNet-18 with every layer trainable (full fine-tuning variant)."""
    net = torchvision.models.resnet18(pretrained=True)
    # Swap the 1000-way ImageNet head for one output per local class.
    net.fc = torch.nn.Linear(net.fc.in_features, len(classes))
    return net
# Save the model and class names so the inference part can label predictions.
os.makedirs('data', exist_ok=True)  # torch.save fails if the directory is missing
state = {
    'model': model,
    'classes': classes
}
torch.save(state, os.path.join('data', 'webcam-model.p'))
# Load libraries
import torch
print("Torch version:", torch.__version__)
import torchvision
print("Torchvision version:", torchvision.__version__)
import matplotlib
print("Matplotlib version:", matplotlib.__version__)
import numpy as np
print("Numpy version:", np.__version__)
import cv2
print('OpenCV version:', cv2.__version__)
# Setup Matplotlib
%matplotlib inline
#%config InlineBackend.figure_format = 'retina' # If you have a retina screen
import matplotlib.pyplot as plt
import os
# Load the checkpoint written by the training part above.
checkpoint = torch.load(os.path.join('data', 'webcam-model.p'))
model = checkpoint['model']
classes = checkpoint['classes']
print('Classes:', classes)
from torchvision import transforms
# Same preprocessing the validation set used during training, plus a PIL
# conversion because the webcam frames arrive as numpy arrays.
image_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # ImageNet statistics expected by the pretrained backbone
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
# We will need some tools from PyTorch
from torch.autograd import Variable  # no longer used; kept so the import set is unchanged
import torch.nn as nn
# Tools to display webcam feed
from IPython import display
import time
# Connect to webcam (reuse an existing handle if one is already open)
if 'webcam' not in locals() or webcam is None:
    webcam = cv2.VideoCapture(0)
try:
    # Try to read from the webcam
    webcam_found, _ = webcam.read()
    if webcam_found:
        # Create figure: probabilities on the left, live frame on the right
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(6, 2))
        # Set network in "evaluation" mode (fixes BatchNorm/Dropout behavior)
        model.eval()
        for i in range(100):
            # Take a picture with the webcam
            _, image = webcam.read()
            # Process it
            image = cv2.resize(image, (250, 250))  # Reduce size
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # To RGB
            image_pytorch = image_transform(image_rgb)
            # Classify image. no_grad avoids building an autograd graph;
            # the deprecated Variable wrapper is a no-op in modern PyTorch.
            with torch.no_grad():
                output = model(image_pytorch[None, :])
                probs = nn.functional.softmax(output, 1).numpy()[0]
            # Plot the class probabilities and the frame
            ax1.cla()
            ax1.barh(np.arange(len(classes)), probs, height=0.5, tick_label=classes)
            ax1.set_xlim(0, 1)
            ax2.cla()
            ax2.imshow(image_rgb, aspect='auto')
            # Jupyter trick: replace the previous frame
            display.clear_output(wait=True)
            display.display(fig)
            # Rest a bit for CPU
            time.sleep(0.2)
        # Clear output
        display.clear_output()
    else:
        print('Cannot read from webcam, do you have one connected?')
except KeyboardInterrupt:
    # Clear output
    display.clear_output()
finally:
    # Release the camera device before dropping the handle, otherwise the
    # OS keeps it busy until the kernel restarts (`del` alone is not enough).
    webcam.release()
    del webcam