#hide
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

[[chapter_foundations]]

import torch
from torch import tensor

def matmul(a,b):
    ar,ac = a.shape # n_rows * n_cols
    br,bc = b.shape
    assert ac==br
    c = torch.zeros(ar, bc)
    for i in range(ar):
        for j in range(bc):
            for k in range(ac): c[i,j] += a[i,k] * b[k,j]
    return c

m1 = torch.randn(5,28*28)
m2 = torch.randn(784,10)

%time t1=matmul(m1, m2)

%timeit -n 20 t2=m1@m2

a = tensor([10., 6, -4])
b = tensor([2., 8, 7])
a + b

a < b

(a < b).all(), (a==b).all()

(a + b).mean().item()

m = tensor([[1., 2, 3], [4,5,6], [7,8,9]])
m*m

n = tensor([[1., 2, 3], [4,5,6]])
m*n  # raises RuntimeError: elementwise ops need matching (or broadcastable) shapes

def matmul(a,b):
    ar,ac = a.shape
    br,bc = b.shape
    assert ac==br
    c = torch.zeros(ar, bc)
    for i in range(ar):
        for j in range(bc): c[i,j] = (a[i] * b[:,j]).sum()
    return c

%timeit -n 20 t3 = matmul(m1,m2)

a = tensor([10., 6, -4])
a > 0

m = tensor([[1., 2, 3], [4,5,6], [7,8,9]])
(m - 5) / 2.73

c = tensor([10.,20,30])
m = tensor([[1., 2, 3], [4,5,6], [7,8,9]])
m.shape,c.shape

m + c

c.expand_as(m)

t = c.expand_as(m)
t.storage()

t.stride(), t.shape

c + m

c = tensor([10.,20,30])
m = tensor([[1., 2, 3], [4,5,6]])
c+m

c = tensor([10.,20])
m = tensor([[1., 2, 3], [4,5,6]])
c+m  # raises RuntimeError: a vector of length 2 can't broadcast over rows of length 3

c = tensor([10.,20,30])
m = tensor([[1., 2, 3], [4,5,6], [7,8,9]])
c = c.unsqueeze(1)
m.shape,c.shape

c+m

t = c.expand_as(m)
t.storage()

t.stride(), t.shape

c = tensor([10.,20,30])
c.shape, c.unsqueeze(0).shape,c.unsqueeze(1).shape

c.shape, c[None,:].shape,c[:,None].shape

c[None].shape,c[...,None].shape

def matmul(a,b):
    ar,ac = a.shape
    br,bc = b.shape
    assert ac==br
    c = torch.zeros(ar, bc)
    for i in range(ar):
#       c[i,j] = (a[i,:]          * b[:,j]).sum() # previous
        c[i]   = (a[i  ].unsqueeze(-1) * b).sum(dim=0)
    return c

%timeit -n 20 t4 = matmul(m1,m2)

def matmul(a,b): return torch.einsum('ik,kj->ij', a, b)

%timeit -n 20 t5 = matmul(m1,m2)

def lin(x, w, b): return x @ w + b

x = torch.randn(200, 100)
y = torch.randn(200)

w1 = torch.randn(100,50)
b1 = torch.zeros(50)
w2 = torch.randn(50,1)
b2 = torch.zeros(1)

l1 = lin(x, w1, b1)
l1.shape

l1.mean(), l1.std()

x = torch.randn(200, 100)
for i in range(50): x = x @ torch.randn(100,100)
x[0:5,0:5]  # activations have exploded to nan

x = torch.randn(200, 100)
for i in range(50): x = x @ (torch.randn(100,100) * 0.01)
x[0:5,0:5]  # activations have vanished to zero

x = torch.randn(200, 100)
for i in range(50): x = x @ (torch.randn(100,100) * 0.1)
x[0:5,0:5]

x.std()

x = torch.randn(200, 100)
y = torch.randn(200)

from math import sqrt
w1 = torch.randn(100,50) / sqrt(100)
b1 = torch.zeros(50)
w2 = torch.randn(50,1) / sqrt(50)
b2 = torch.zeros(1)

l1 = lin(x, w1, b1)
l1.mean(),l1.std()

def relu(x): return x.clamp_min(0.)

l2 = relu(l1)
l2.mean(),l2.std()

x = torch.randn(200, 100)
for i in range(50): x = relu(x @ (torch.randn(100,100) * 0.1))
x[0:5,0:5]

x = torch.randn(200, 100)
for i in range(50): x = relu(x @ (torch.randn(100,100) * sqrt(2/100)))
x[0:5,0:5]

x = torch.randn(200, 100)
y = torch.randn(200)

w1 = torch.randn(100,50) * sqrt(2 / 100)
b1 = torch.zeros(50)
w2 = torch.randn(50,1) * sqrt(2 / 50)
b2 = torch.zeros(1)

l1 = lin(x, w1, b1)
l2 = relu(l1)
l2.mean(), l2.std()

def model(x):
    l1 = lin(x, w1, b1)
    l2 = relu(l1)
    l3 = lin(l2, w2, b2)
    return l3

out = model(x)
out.shape

def mse(output, targ): return (output.squeeze(-1) - targ).pow(2).mean()

loss = mse(out, y)
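As a quick sanity check (my addition, not in the original notebook), the handwritten forward pass should agree with an `nn.Sequential` built from the same weights. Note that `nn.Linear` stores its weight matrix transposed relative to our `w1` and `w2`, hence the `.t()` calls:

import torch.nn as nn
with torch.no_grad():
    seq = nn.Sequential(nn.Linear(100,50), nn.ReLU(), nn.Linear(50,1))
    seq[0].weight.copy_(w1.t()); seq[0].bias.copy_(b1)
    seq[2].weight.copy_(w2.t()); seq[2].bias.copy_(b2)
    assert torch.allclose(seq(x), model(x), atol=1e-4)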
def mse_grad(inp, targ):
    # grad of loss with respect to output of previous layer
    inp.g = 2. * (inp.squeeze() - targ).unsqueeze(-1) / inp.shape[0]

def relu_grad(inp, out):
    # grad of relu with respect to input activations
    inp.g = (inp>0).float() * out.g

def lin_grad(inp, out, w, b):
    # grad of matmul with respect to input
    inp.g = out.g @ w.t()
    w.g = inp.t() @ out.g
    b.g = out.g.sum(0)

from sympy import symbols,diff
sx,sy = symbols('sx sy')
diff(sx**2, sx)

def forward_and_backward(inp, targ):
    # forward pass:
    l1 = inp @ w1 + b1
    l2 = relu(l1)
    out = l2 @ w2 + b2
    # we don't actually need the loss in backward!
    loss = mse(out, targ)

    # backward pass:
    mse_grad(out, targ)
    lin_grad(l2, out, w2, b2)
    relu_grad(l1, l2)
    lin_grad(inp, l1, w1, b1)

class Relu():
    def __call__(self, inp):
        self.inp = inp
        self.out = inp.clamp_min(0.)
        return self.out
    def backward(self): self.inp.g = (self.inp>0).float() * self.out.g

class Lin():
    def __init__(self, w, b): self.w,self.b = w,b
    def __call__(self, inp):
        self.inp = inp
        self.out = inp@self.w + self.b
        return self.out
    def backward(self):
        self.inp.g = self.out.g @ self.w.t()
        self.w.g = self.inp.t() @ self.out.g
        self.b.g = self.out.g.sum(0)

class Mse():
    def __call__(self, inp, targ):
        self.inp = inp
        self.targ = targ
        self.out = (inp.squeeze() - targ).pow(2).mean()
        return self.out
    def backward(self):
        x = (self.inp.squeeze()-self.targ).unsqueeze(-1)
        self.inp.g = 2.*x/self.targ.shape[0]

class Model():
    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
        self.loss = Mse()
    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x, targ)
    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers): l.backward()

model = Model(w1, b1, w2, b2)
loss = model(x, y)
model.backward()

class LayerFunction():
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out
    def forward(self):  raise Exception('not implemented')
    def bwd(self):      raise Exception('not implemented')
    def backward(self): self.bwd(self.out, *self.args)

class Relu(LayerFunction):
    def forward(self, inp): return inp.clamp_min(0.)
    def bwd(self, out, inp): inp.g = (inp>0).float() * out.g

class Lin(LayerFunction):
    def __init__(self, w, b): self.w,self.b = w,b
    def forward(self, inp): return inp@self.w + self.b
    def bwd(self, out, inp):
        inp.g = out.g @ self.w.t()
        self.w.g = inp.t() @ out.g
        self.b.g = out.g.sum(0)

class Mse(LayerFunction):
    def forward(self, inp, targ): return (inp.squeeze() - targ).pow(2).mean()
    def bwd(self, out, inp, targ):
        inp.g = 2*(inp.squeeze()-targ).unsqueeze(-1) / targ.shape[0]
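To make sure the handwritten backward pass is right, here is a minimal gradient check (my addition, not from the book): the `.g` attributes filled in by `model.backward()` above should match what PyTorch autograd computes for the same weights:

w1a = w1.clone().requires_grad_(True); b1a = b1.clone().requires_grad_(True)
w2a = w2.clone().requires_grad_(True); b2a = b2.clone().requires_grad_(True)
loss_a = mse(relu(x @ w1a + b1a) @ w2a + b2a, y)
loss_a.backward()
for auto,manual in [(w1a,w1),(b1a,b1),(w2a,w2),(b2a,b2)]:
    assert torch.allclose(auto.grad, manual.g, atol=1e-4)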
from torch.autograd import Function

class MyRelu(Function):
    @staticmethod
    def forward(ctx, i):
        result = i.clamp_min(0.)
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i, = ctx.saved_tensors
        return grad_output * (i>0).float()

import torch.nn as nn

class LinearLayer(nn.Module):
    def __init__(self, n_in, n_out):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(n_out, n_in) * sqrt(2/n_in))
        self.bias = nn.Parameter(torch.zeros(n_out))

    def forward(self, x): return x @ self.weight.t() + self.bias

lin = LinearLayer(10,2)
p1,p2 = lin.parameters()
p1.shape,p2.shape

class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out))
        self.loss = mse

    def forward(self, x, targ): return self.loss(self.layers(x).squeeze(), targ)

from fastai.torch_core import Module  # fastai's Module: same as nn.Module, but subclasses don't need to call super().__init__()

class Model(Module):
    def __init__(self, n_in, nh, n_out):
        self.layers = nn.Sequential(
            nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out))
        self.loss = mse

    def forward(self, x, targ): return self.loss(self.layers(x).squeeze(), targ)
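Finally, a minimal usage sketch (my addition, not from the book): a few steps of plain SGD with this `Model` on the random `x` and `y` from earlier. The learning rate is a hypothetical value chosen just for illustration:

model = Model(100, 50, 1)
lr = 0.01  # hypothetical learning rate, for illustration only
for step in range(3):
    loss = model(x, y)
    loss.backward()
    with torch.no_grad():
        for p in model.parameters(): p -= lr * p.grad
    model.zero_grad()
    print(loss.item())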