#!/usr/bin/env python
# coding: utf-8

# # Notebook 4: Activation [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mattsankner/micrograd/blob/main/mg4_activation.ipynb) [![View in nbviewer](https://img.shields.io/badge/view-nbviewer-orange)](https://nbviewer.jupyter.org/github/mattsankner/micrograd/blob/main/mg4_activation.ipynb)
#
# Welcome to the fourth lecture! Here we will get more granular control over our ```tanh()``` function.

# ## Now that we have automated the forward and backward pass for a single neuron, did you find the error in the previous code? Run the code below until the next text block to find out what it was and how to fix it.

# In[1]:


get_ipython().system('pip install graphviz')
get_ipython().system('export PATH="/usr/local/opt/graphviz/bin:$PATH"')


# In[13]:


import matplotlib.pyplot as plt
import numpy as np
import math


# In[10]:


from graphviz import Digraph
#Graphviz is open-source visualization software. We are building out this graph through its Python API.

def trace(root): #helper function that enumerates the nodes and edges in the graph
    # builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root): #creates op nodes (not Value objects) alongside the value nodes
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right

    nodes, edges = trace(root) #call trace
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name = uid, label = "{ %s | data %.4f | grad % .4f}" % (n.label, n.data, n.grad), shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it. Not a Value object
            dot.node(name = uid + n._op, label = n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)

    return dot


# In[23]:


class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._backward = lambda : None
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad = 1.0 * out.grad
            other.grad = 1.0 * out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad = other.data * out.grad
            other.grad = self.data * out.grad
        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            self.grad = (1-t**2) * out.grad
        out._backward = _backward
        return out

    #define backward() without the underscore
    def backward(self):
        #Build the topological graph starting at self
        topo = []       #topological list where we populate the order
        visited = set() #maintain a set of visited nodes

        def build_topo(v): #start at the root node
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0 #initialize root.grad
        for node in reversed(topo):
            node._backward() #call _backward() on every node to backpropagate through all of the children


# In[15]:


#FINISH IT!
#(`o` is the tanh neuron output built at the end of the previous notebook; rebuild it first if you're running this notebook fresh)
o.backward()
draw_dot(o)


# ### Did you find the bug?
#
# Try running the following code:

# In[22]:


a = Value(3.0, label='a')
b = a + a; b.label = 'b'
b.backward()
draw_dot(b)


# ```a``` feeds two arrows into ```b``` in its forward pass, but what do you notice about the gradient?
#
# The derivative of ```b``` with respect to ```a``` should be $2$, or $1 + 1$, by the chain rule. Instead, ```a```'s gradient is $1.0$.
#
# We are going to run into this issue every time we use a variable more than once in an expression. This is because in the ```Value``` class, the ```_backward``` closure overwrites the gradients of ```self``` and ```other``` instead of accumulating them. Since ```self``` and ```other``` both point to the same object ```a```, the second assignment overwrites the first.
#
# ``` python
# def __add__(self, other):
#     out = Value(self.data + other.data, (self, other), '+')
#
#     def _backward():
#         self.grad = 1.0 * out.grad   #a.grad = 1.0 * 1.0
#         other.grad = 1.0 * out.grad  #a.grad = 1.0 * 1.0 (resetting a.grad)
#     out._backward = _backward
#     return out
# ```
# We can see this in a more complicated example. In the example below, one branch overwrites the other: for us, ```d```'s contributions to the gradients survive while ```e```'s are lost. For you it might be the other way around, depending on traversal order, but either way the totals are wrong.

# In[25]:


a = Value(-2.0, label='a')
b = Value(3.0, label='b')
d = a * b; d.label = 'd'
e = a + b; e.label = 'e'
f = d * e; f.label = 'f'
f.backward()
draw_dot(f)


# If we look at the multivariable case of the chain rule on Wikipedia, the solution is simple: we accumulate the gradients. Instead of setting them with ```=```, we use ```+=```. Each branch deposits its gradient, even if it is the same variable, so the contributions add on top of one another.
#
# We reinitialize the ```Value``` class, accumulating with ```+=``` inside ```__add__``` (and applying the same fix in ```__mul__``` and ```tanh()```, which have the same problem):

# In[26]:


class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._backward = lambda : None
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad  #doesn't reset anymore
            other.grad += 1.0 * out.grad #doesn't reset anymore
        out._backward = _backward
        return out

    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            self.grad += (1-t**2) * out.grad
        out._backward = _backward
        return out

    #define backward() without the underscore
    def backward(self):
        #Build the topological graph starting at self
        topo = []       #topological list where we populate the order
        visited = set() #maintain a set of visited nodes

        def build_topo(v): #start at the root node
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0 #initialize root.grad
        for node in reversed(topo):
            node._backward() #call _backward() on every node to backpropagate through all of the children
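# A side note on accumulation (my addition, not from the lecture): because ```_backward()``` now adds into ```.grad```, calling ```backward()``` a second time on the same graph without resetting the gradients will deposit the contributions again. A minimal sketch of that behavior with the class above:

# In[ ]:


a = Value(3.0, label='a')
b = a + a; b.label = 'b'
b.backward()
print(a.grad)  # 2.0, as expected
b.backward()   # backward again on the same graph, without resetting a.grad
print(a.grad)  # 4.0 -- the new contributions were added on top of the old ones
# if you re-run a backward pass, set each node's .grad back to 0.0 first (or rebuild the expression)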
# In[28]:


#try again
a = Value(3.0, label='a')
b = a + a; b.label = 'b'
b.backward()
draw_dot(b)
#a.grad should be 2


# In[31]:


#try again
a = Value(-2.0, label='a')
b = Value(3.0, label='b')
d = a * b; d.label = 'd'
e = a + b; e.label = 'e'
f = d * e; f.label = 'f'
f.backward()
draw_dot(f)
#a.grad should be 3 + -6 = -3; b.grad should be -2 + -6 = -8
#this way each separate branch can deposit its gradient on top of the others


# ## Now, we'll go through a few more exercises.
#
# Remember how we said earlier that we could have implemented ```tanh()``` as a series of explicit expressions using ```exp()```? We built it as a single function because we knew its derivative and could backpropagate through it, but here we will implement it as a composition of exponentials, subtraction, and division. Recall the original ```tanh()``` function:
# $$\tanh(x)={\frac {\sinh x}{\cosh x}}={\frac {e^{x}-e^{-x}}{e^{x}+e^{-x}}}={\frac {e^{2x}-1}{e^{2x}+1}}$$
#
# Let's test to see where we can improve functionality. What happens when we add a plain number to a ```Value``` object?

# In[ ]:


a = Value(2.0)
#a + 1 and a * 1 raise an AttributeError, since we are combining an int (non-Value) with a Value object
#and the int has no .data attribute


# We solve this by reinitializing the ```Value``` class with the following line, which checks whether ```other``` is an instance of ```Value```; if not, we assume ```other``` is an ```int``` or ```float``` and wrap it in a ```Value``` object. We do this in both ```__add__``` and ```__mul__```:
#
# ``` python
# other = other if isinstance(other, Value) else Value(other)
# ```
#
# Now, after updating this, we still have a problem. ```a + 2``` and ```a * 2``` work, but would ```2 + a``` or ```2 * a```?
#
# They would not. Python knows how to evaluate ```a.__mul__(2)```, but ```(2).__mul__(a)``` fails because ```int``` doesn't know what a ```Value``` is. It knows how to do ```self * other```, but not ```other * self```.
#
# Fortunately, we can define ```__rmul__```, which Python falls back to when the left operand can't handle the operation. You can do the same with ```__radd__```. They swap the order of the operands so the operation can still be computed.
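# To see that fallback mechanism in isolation, here is a minimal sketch (a toy ```Box``` class of my own, independent of ```Value```): when the left operand's ```__mul__``` returns ```NotImplemented```, Python tries the right operand's ```__rmul__``` with the operands swapped.

# In[ ]:


class Box:
    def __init__(self, v):
        self.v = v

    def __mul__(self, other):
        #only knows how to multiply by another Box
        if not isinstance(other, Box):
            return NotImplemented
        return Box(self.v * other.v)

    def __rmul__(self, other):
        #called for `other * self` when other's __mul__ gives up
        return Box(other * self.v)

print((2 * Box(3.0)).v)  #int.__mul__(2, Box(3.0)) returns NotImplemented, so Box.__rmul__ runs -> 6.0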
#
# ``` python
# def __radd__(self, other):
#     return self + other
#
# def __rmul__(self, other):
#     return self * other
# ```
# Now, we update the ```Value``` class and test:

# In[56]:


class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._backward = lambda : None
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other) # new for a + 1
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other) #new for a * 1
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            self.grad += (1-t**2) * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other): #new for 1 + a
        return self + other

    def __rmul__(self, other): #new for 1 * a
        return self * other

    def backward(self):
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()


# In[57]:


#these should all work now:
a = Value(2.0)
a + 1
a * 3
3 + a
2 * a


# Now that we can add and multiply in either order, let's add an ```exp()``` method to the ```Value``` class. It will mirror ```tanh()``` in that it takes in, transforms, and outputs a single scalar value.
#
# In ```exp()```, we are computing $e^{x}$, where $x$ is ```self.data```.
#
# The question, after we calculate $e^{x}$ in the forward pass, is how we're going to backpropagate through it.
#
# We need the local derivative of $e^{x}$, and $\frac{d}{dx}\,e^{x}=e^{x}$. So, by the chain rule, we multiply $e^{x}$ (which we already have as the output) by ```out.grad``` to backpropagate.
#
# ```e``` is the base of the natural logarithm (approximately 2.718282) and ```x``` is the number passed in, i.e. ```self.data```.
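# As a quick numerical sanity check of that claim (a sketch of mine using plain ```math```, not the ```Value``` class), compare a central finite difference of $e^{x}$ against $e^{x}$ itself:

# In[ ]:


x, h = 1.3, 1e-6  #arbitrary point and a small step
numeric = (math.exp(x + h) - math.exp(x - h)) / (2 * h)  #finite-difference estimate of d/dx e^x
analytic = math.exp(x)                                   #the claimed derivative, e^x
print(abs(numeric - analytic) < 1e-4)  #expect True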
# In[59]:


class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._backward = lambda : None
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            self.grad += (1-t**2) * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other):
        return self + other

    def __rmul__(self, other):
        return self * other

    def exp(self): #trying to get e^self.data
        x = self.data #hold the value
        out = Value(math.exp(x), (self, ), 'exp') #set out to e^x

        def _backward():
            #since the derivative of e^x is e^x, we can just use out.data, which already holds e^x
            self.grad += out.data * out.grad #then we multiply out.data by out.grad (chain rule)
        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()


# In[60]:


#let's test it:
a = Value(4.0)
a.exp()


# Now, we want to divide. To do this, we can redefine division as a special case of a power function,
# because:
#
# ```a/b = a * (1/b) = a * b**-1```
#
# We write the power function:
#
# ```python
# def __pow__(self, other): #self to the power of other
#     #self.data is a plain int or float, not a Value object; we just access the .data property
#     assert isinstance(other, (int, float)), "only supporting int/float powers for now"
#     out = Value(self.data**other, (self,), f'**{other}')
#
#     #what's the chain rule for backprop through the power function, when the exponent is a constant?
#     def _backward():
#         #local derivative * out.grad
#         self.grad += other * self.data ** (other - 1) * out.grad
#     out._backward = _backward
#     return out
# ```
#
# Why is this the local derivative?
# If you look up derivative rules, you may come across the power rule:
# $$\frac{d}{dx}\left(x^{n}\right)=n\,x^{n-1}$$
#
# To use this, we'll add the following self-explanatory method:
# ```python
# def __truediv__(self, other): #self/other; calls self.__mul__(other.__pow__(-1))
#     return self * other**-1
# ```
# We add both to the ```Value``` class and test:

# In[72]:


class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._backward = lambda : None
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            self.grad += (1-t**2) * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other):
        return self + other

    def __rmul__(self, other):
        return self * other

    def exp(self): #trying to get e^self.data
        x = self.data #hold the value
        out = Value(math.exp(x), (self, ), 'exp') #set out to e^x

        def _backward():
            #since the derivative of e^x is e^x, we can just use out.data, which already holds e^x
            self.grad += out.data * out.grad #then we multiply out.data by out.grad (chain rule)
        out._backward = _backward
        return out

    def __pow__(self, other): #self to the power of other
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += other * self.data ** (other - 1) * out.grad #power rule * out.grad
        out._backward = _backward
        return out

    def __truediv__(self, other): #self/other
        return self * other**-1

    def backward(self):
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()


# In[84]:


e = Value(-2.0)
f = Value(4.0)
e / f
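# A quick check that the power-rule gradient comes out as expected (a small sketch of mine, using the class defined in the cell above):

# In[ ]:


c = Value(3.0)
d = c ** 2
d.backward()
print(c.grad)  #power rule: 2 * 3.0**(2-1) = 6.0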
# Now, we'll give it the subtract functionality. I'm sure you get the idea at this point. Instead of ```self - other```, we implement ```self + (-other)``` (addition of the negation):

# In[136]:


class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __neg__(self): #new: negate self
        return self * -1

    def __sub__(self, other): #new: self - other
        return self + (-other)

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += other * self.data ** (other - 1) * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other): #kept from before, for 1 + a
        return self + other

    def __rmul__(self, other):
        return self * other

    def __truediv__(self, other):
        return self * other**-1

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self, ), 'exp')

        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()


# In[137]:


#test it
g = Value(-3.4)
z = Value(4)
z - g


# Now, let's revisit the variables from before, so we can break ```tanh()``` up into the operations we've built above and compute this directly:
# $$\tanh(x)={\frac {e^{2x}-1}{e^{2x}+1}}$$

# In[138]:


#redefine the variables and run, noting the gradients after a backward pass through the original tanh():
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
b = Value(6.8813735870195432, label='b')
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'
o.backward()
draw_dot(o)


# In[140]:


x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
b = Value(6.8813735870195432, label='b')
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
#now build the hyperbolic tangent by hand: (e^(2n) - 1) / (e^(2n) + 1)
e = (2*n).exp() #e^(2n)
o = (e - 1) / (e + 1)
o.label = 'o'
o.backward() #test backprop to see if the gradients are the same as before
#expect to see a much longer graph because we broke up tanh()
draw_dot(o)


# Since the input gradients and the output data come out the same in both versions of the forward and backward pass, we can be confident we made the right conversion from our original ```tanh()``` to the broken-down version.
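# To make that check explicit rather than eyeballing the two graphs, here is a small sketch of mine (the helper ```neuron_grads``` is not from the lecture) that runs both versions and compares the input gradients programmatically:

# In[ ]:


def neuron_grads(use_composite_tanh):
    #rebuild the neuron from scratch so each version gets fresh gradients
    x1 = Value(2.0); x2 = Value(0.0)
    w1 = Value(-3.0); w2 = Value(1.0)
    b = Value(6.8813735870195432)
    n = x1*w1 + x2*w2 + b
    if use_composite_tanh:
        o = n.tanh()        #single composite operation
    else:
        e = (2*n).exp()     #decomposed version
        o = (e - 1) / (e + 1)
    o.backward()
    return [x1.grad, x2.grad, w1.grad, w2.grad]

composite = neuron_grads(True)
decomposed = neuron_grads(False)
print(all(abs(c - d) < 1e-9 for c, d in zip(composite, decomposed)))  #expect True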
#
# Andrej had us do these exercises to:
# - practice writing more operations for the backward pass
# - illustrate that the level of abstraction is up to the coder. You can write atomic operations or more composite functions like ```tanh()``` and achieve the same result; the only thing you lose with a composite function is the granularity of seeing the backward pass. As long as you can do the forward and backward pass, it doesn't matter what the operation is or how composite it is. If you can compute the local gradient and chain it with the chain rule, you can backpropagate through any type of operation.

# ## Great job! You should now have a firm grasp of how backpropagation through a neuron looks and how it is done. In the next lecture, we will abstract this process further with PyTorch and continue building out our deep neural network, micrograd.