import random
import math
import os
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn, optim
import torch.nn.functional as F
from IPython import display
%run plot_conf.py
plt_style()
seed=12345
random.seed(seed)
torch.manual_seed(seed)
N = 1000 # num_samples_per_class
D = 2 # dimensions
C = 3 # num_classes
H = 100 # num_hidden_units
X = torch.zeros(N * C, D)
y = torch.zeros(N * C)
for i in range(C):
    index = 0
    r = torch.linspace(0, 1, N)
    t = torch.linspace(
        i * 2 * math.pi / C,
        (i + 2) * 2 * math.pi / C,
        N
    ) + torch.randn(N) * 0.1
    for ix in range(N * i, N * (i + 1)):
        X[ix] = r[index] * torch.FloatTensor((
            math.sin(t[index]), math.cos(t[index])
        ))
        y[ix] = i
        index += 1
print("SHAPES:")
print("-------------------")
print("X:", tuple(X.size()))
print("y:", tuple(y.size()))
def plot_data(X, y, d=.0, auto=False):
    """
    Plot the data.
    """
    plt.clf()
    plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap=plt.cm.Spectral)
    plt.axis('square')
    plt.axis((-1.1, 1.1, -1.1, 1.1))
    if auto is True:
        plt.axis('equal')
    # plt.savefig('spiral{:.2f}.png'.format(d))
# Visualise the data
plot_data(X.numpy(), y.numpy())
def plot_model(X, y, model, e=.0, auto=False):
    """
    Plot the decision boundary of the model from its torch weights.
    """
    X = X.numpy()
    y = y.numpy()
    w1 = torch.transpose(model.fc1.weight.data, 0, 1).cpu().numpy()
    b1 = model.fc1.bias.data.cpu().numpy()
    w2 = torch.transpose(model.fc2.weight.data, 0, 1).cpu().numpy()
    b2 = model.fc2.bias.data.cpu().numpy()
    h = 0.01
    x_min, x_max = (-1.1, 1.1)
    y_min, y_max = (-1.1, 1.1)
    if auto is True:
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = np.dot(np.maximum(0, np.dot(np.c_[xx.ravel(), yy.ravel()], w1) + b1), w2) + b2
    Z = np.argmax(Z, axis=1)
    Z = Z.reshape(xx.shape)
    fig = plt.figure()
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
    plt.axis((-1.1, 1.1, -1.1, 1.1))
    plt.axis('square')
    if auto is True:
        plt.axis((xx.min(), xx.max(), yy.min(), yy.max()))
    # plt.savefig('train{:03.2f}.png'.format(e))
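# Note: plot_model above hard-codes the two-layer ReLU forward pass in NumPy
# (np.maximum(0, X @ w1 + b1) @ w2 + b2), so it is exact for the ReLU network
# trained later and only approximate for the purely linear model. A minimal
# sketch of an alternative that queries the model directly, so it works for
# any nn.Module (the name plot_model_generic is illustrative, not from the
# original notebook):
def plot_model_generic(X, y, model, h=0.01):
    xx, yy = np.meshgrid(np.arange(-1.1, 1.1, h), np.arange(-1.1, 1.1, h))
    grid = torch.from_numpy(np.c_[xx.ravel(), yy.ravel()]).float()
    with torch.no_grad():
        Z = model(grid.to(next(model.parameters()).device)).argmax(dim=1)
    Z = Z.cpu().numpy().reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
    plt.axis('square')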
learning_rate = 1e-3
lambda_l2 = 1e-5
# Linear model
class linear_model(nn.Module):
    """
    Linear model.
    """
    def __init__(self, D_in, H, D_out):
        """
        Initialize weights.
        """
        super(linear_model, self).__init__()
        self.fc1 = nn.Linear(D_in, H)
        self.fc2 = nn.Linear(H, D_out)

    def forward(self, x):
        """
        Forward pass.
        """
        z = self.fc1(x)
        z = self.fc2(z)
        return z
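# Why this counts as a *linear* model despite having two layers: composing two
# affine maps with no non-linearity in between collapses into a single affine
# map, W2 (W1 x + b1) + b2 = (W2 W1) x + (W2 b1 + b2). A minimal numerical
# check of this (the underscored names are illustrative only):
_m = linear_model(D, H, C)
_x = torch.randn(5, D)
_W1, _b1 = _m.fc1.weight, _m.fc1.bias
_W2, _b2 = _m.fc2.weight, _m.fc2.bias
_collapsed = _x @ (_W2 @ _W1).t() + (_W2 @ _b1 + _b2)
print(torch.allclose(_m(_x), _collapsed, atol=1e-5))  # expected: True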
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# nn package to create our linear model
# each Linear module has a weight and bias
model = linear_model(D, H, C)
model.to(device)  # move the model to the GPU if one is available
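# Each nn.Linear module created above holds a weight matrix and a bias vector;
# an optional quick look at the parameter shapes:
for _name, _p in model.named_parameters():
    print(_name, tuple(_p.shape))  # fc1.weight: (H, D), fc1.bias: (H,), fc2.weight: (C, H), fc2.bias: (C,)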
# nn package also has different loss functions.
# we use cross entropy loss for our classification task
criterion = torch.nn.CrossEntropyLoss()
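# CrossEntropyLoss takes raw (unnormalised) logits and integer class indices,
# and is equivalent to log-softmax followed by negative log-likelihood.
# A minimal sketch of that equivalence (underscored names are illustrative):
_logits = torch.randn(4, C)
_targets = torch.randint(0, C, (4,))
print(torch.allclose(criterion(_logits, _targets),
                     F.nll_loss(F.log_softmax(_logits, dim=1), _targets)))  # expected: True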
# we use the optim package to apply
# stochastic gradient descent for our parameter updates
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=lambda_l2) # built-in L2
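# The weight_decay argument is the "built-in L2" mentioned above: for SGD it
# adds weight_decay * w to each parameter's gradient before the update.
# A minimal sketch of that behaviour (underscored names are illustrative):
_w = torch.tensor([1.0, -2.0], requires_grad=True)
_opt = torch.optim.SGD([_w], lr=0.1, weight_decay=0.01)
(_w ** 2).sum().backward()   # gradient of w^2 is 2w
_opt.step()                  # w <- w - 0.1 * (2w + 0.01 * w)
print(_w.data)               # expected: tensor([ 0.7990, -1.5980])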
# The inputs and targets are plain tensors on the same device as the model;
# they do not require gradients, since we only learn the model parameters.
input_X = X.to(device)
y_true = y.to(device=device, dtype=torch.long)
# Training
for t in range(1000):
    # Feed forward to get the logits
    y_pred = model(input_X)

    # Compute the loss and accuracy
    loss = criterion(y_pred, y_true)
    score, predicted = torch.max(y_pred, 1)
    acc = (y_true == predicted).sum().float() / len(y_true)
    print("[EPOCH]: %i, [LOSS]: %.6f, [ACCURACY]: %.3f" % (t, loss.item(), acc))
    display.clear_output(wait=True)

    # Zero the gradients before running the backward pass
    optimizer.zero_grad()

    # Backward pass to compute the gradient
    # of the loss w.r.t. our learnable params
    loss.backward()

    # Update params
    optimizer.step()
# Plot trained model
print(model)
plot_model(X, y, model)
learning_rate = 1e-3
lambda_l2 = 1e-5
# NN model
class two_layer_network(nn.Module):
    """
    NN model.
    """
    def __init__(self, D_in, H, D_out):
        """
        Initialize weights.
        """
        super(two_layer_network, self).__init__()
        self.fc1 = nn.Linear(D_in, H)
        self.fc2 = nn.Linear(H, D_out)

    def forward(self, x):
        """
        Forward pass.
        """
        z = F.relu(self.fc1(x))
        z = self.fc2(z)
        return z
# nn package to create our two-layer network
# each Linear module has a weight and bias
model = two_layer_network(D, H, C)
model.to(device)
# nn package also has different loss functions.
# we use cross entropy loss for our classification task
criterion = torch.nn.CrossEntropyLoss()
# we use the optim package to apply
# Adam for our parameter updates
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=lambda_l2) # built-in L2
# The inputs and targets are again plain tensors on the same device as the
# model; they do not require gradients, since only the parameters are learned.
input_X = X.to(device)
y_true = y.to(device=device, dtype=torch.long)
# e = 1. # plotting purpose
# Training
for t in range(1000):
    # Feed forward to get the logits
    y_pred = model(input_X)

    # Compute the loss and accuracy
    loss = criterion(y_pred, y_true)
    score, predicted = torch.max(y_pred, 1)
    acc = (y_true == predicted).sum().float() / len(y_true)
    print("[EPOCH]: %i, [LOSS]: %.6f, [ACCURACY]: %.3f" % (t, loss.item(), acc))
    display.clear_output(wait=True)

    # Zero the gradients before running the backward pass
    optimizer.zero_grad()

    # Backward pass to compute the gradient
    # of the loss w.r.t. our learnable params
    loss.backward()

    # Update params
    optimizer.step()

    # # Plot some progress
    # if t % math.ceil(e) == 0:
    #     plot_model(X, y, model, e)
    #     e *= 1.5
#! convert -delay 20 -crop 500x475+330+50 +repage $(gls -1v train*) train.gif
# Plot trained model
print(model)
plot_model(X, y, model)
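# A small follow-up sketch (not in the original notebook): once training is
# done, predictions for new points can be made with gradients disabled.
model.eval()
with torch.no_grad():
    new_points = torch.tensor([[0.0, 0.5], [0.5, 0.0]], device=device)
    probs = F.softmax(model(new_points), dim=1)
    print("predicted classes:", probs.argmax(dim=1).tolist())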