Micrograd from scratch

import math
import numpy as np
import matplotlib.pyplot as plt
from rich import print
from rich import pretty

pretty.install()
Single Variable: x
Create a function
def f(x): return 3*x**2 - 4*x + 5
f(3.0)
20.0
xs = np.arange(-5, 5, 0.25)
xs
array([-5. , -4.75, -4.5 , -4.25, -4. , -3.75, -3.5 , -3.25, -3. , -2.75, -2.5 , -2.25, -2. , -1.75, -1.5 , -1.25, -1. , -0.75, -0.5 , -0.25, 0. , 0.25, 0.5 , 0.75, 1. , 1.25, 1.5 , 1.75, 2. , 2.25, 2.5 , 2.75, 3. , 3.25, 3.5 , 3.75, 4. , 4.25, 4.5 , 4.75])
ys = f(xs)
ys
array([100. , 91.6875, 83.75 , 76.1875, 69. , 62.1875, 55.75 , 49.6875, 44. , 38.6875, 33.75 , 29.1875, 25. , 21.1875, 17.75 , 14.6875, 12. , 9.6875, 7.75 , 6.1875, 5. , 4.1875, 3.75 , 3.6875, 4. , 4.6875, 5.75 , 7.1875, 9. , 11.1875, 13.75 , 16.6875, 20. , 23.6875, 27.75 , 32.1875, 37. , 42.1875, 47.75 , 53.6875])
Plot the function
plt.plot(xs, ys)
[<matplotlib.lines.Line2D object at 0x7fb25cd152d0>]
Derivative on increasing side of the curve
h = 0.0000000001
x = 3.0
(f(x + h) - f(x))/h
14.000001158365194
Derivative on the decreasing side of the curve
h = 0.0000000001
x = -3.0
(f(x + h) - f(x))/h
-21.999966293151374
Derivative on the bottom of the curve
h = 0.0000000001
x = 2/3
(f(x + h) - f(x))/h
0.0
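Sanity check (a small sketch, not in the original): the analytic derivative of f(x) = 3*x**2 - 4*x + 5 is f'(x) = 6*x - 4, which matches the three numerical estimates above.

def df(x): return 6*x - 4  # analytic derivative of f

for x in [3.0, -3.0, 2/3]:
    print(x, df(x))  # 14.0, -22.0, and roughly 0 for x = 2/3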
Multivariable: a, b, c

a = 2.0
b = -3.0
c = 10.0

def d(a, b, c): return a*b + c
print(d(a, b, c))
4.0
Derivative with respect to a
h = 0.0001

a = 2.0
b = -3.0
c = 10.0

d1 = d(a, b, c)
a += h
d2 = d(a, b, c)
print('d1', d1)
print('d2', d2)
print('slope', (d2 - d1)/h)
d1 4.0
d2 3.999699999999999
slope -3.000000000010772
Derivative with respect to b
h = 0.0001

a = 2.0
b = -3.0
c = 10.0

d1 = d(a, b, c)
b += h
d2 = d(a, b, c)
print('d1', d1)
print('d2', d2)
print('slope', (d2 - d1)/h)
d1 4.0
d2 4.0002
slope 2.0000000000042206
Derivative with respect to c
h = 0.0001

a = 2.0
b = -3.0
c = 10.0

d1 = d(a, b, c)
c += h
d2 = d(a, b, c)
print('d1', d1)
print('d2', d2)
print('slope', (d2 - d1)/h)
d1 4.0
d2 4.0001
slope 0.9999999999976694
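These slopes match the analytic partials of d(a, b, c) = a*b + c: ∂d/∂a = b = -3, ∂d/∂b = a = 2, ∂d/∂c = 1. A tiny check (a sketch, not in the original):

a, b, c = 2.0, -3.0, 10.0
print({'a': b, 'b': a, 'c': 1.0})  # analytic partials: {'a': -3.0, 'b': 2.0, 'c': 1.0}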
Create the Value object
(mentioned in the README of micrograd)
Define the initial template of the Value class

class Value:
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return f"Value(data={self.data})"

a = Value(2.0)
b = Value(-3.0)
a, b
(Value(data=2.0), Value(data=-3.0))
Add the __add__ function

class Value:
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other): # ⭠ for adding Value objects
        return Value(self.data + other.data)

a = Value(2.0)
b = Value(-3.0)
a, b
(Value(data=2.0), Value(data=-3.0))
a + b # a.__add__(b)
Value(data=-1.0)
Add the __mul__ function

class Value:
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        return Value(self.data + other.data)

    def __mul__(self, other): # ⭠ for multiplying Value objects
        return Value(self.data * other.data)

a = Value(2.0)
b = Value(-3.0)
a, b
(Value(data=2.0), Value(data=-3.0))
a * b # a.__mul__(b)
Value(data=-6.0)
c = Value(10.0)
d = a * b + c; d
Value(data=4.0)
Add the functionality to know what values created a value with _children
class Value:
    def __init__(self, data, _children=()): # ⭠ Add _children
        self.data = data
        self._prev = set(_children) # ⭠ Add _children

    def __repr__(self):
        return f"Value(data ={self.data})"

    def __add__(self, other):
        return Value(self.data + other.data, (self, other))

    def __mul__(self, other):
        return Value(self.data * other.data, (self, other))

a = Value(2.0)
b = Value(-3.0)
c = Value(10.0)
d = a*b + c
d
Value(data =4.0)
d._prev # children are -6.0 (a*b) and 10.0 (c)
{Value(data =-6.0), Value(data =10.0)}
Add the functionality to know what operations created a value with _op
class Value:
    def __init__(self, data, _children=(), _op=''): # ⭠ Add _op
        self.data = data
        self._prev = set(_children)
        self._op = _op # ⭠ Add _op

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        return Value(self.data * other.data, (self, other), '*')

a = Value(2.0)
b = Value(-3.0)
c = Value(10.0)
d = a*b + c
d
Value(data=4.0)
d._prev
{Value(data=10.0), Value(data=-6.0)}
d._op
'+'
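The _prev and _op attributes are already enough to walk the whole expression by hand. A minimal sketch (the show helper is not part of the original; children may print in either order since _prev is a set):

def show(v, indent=0):
    # print a node, then recurse into the values that produced it
    print(' ' * indent + f"Value(data={v.data}) op='{v._op}'")
    for child in v._prev:
        show(child, indent + 2)

show(d)
# Value(data=4.0) op='+'
#   Value(data=10.0) op=''
#   Value(data=-6.0) op='*'
#     Value(data=2.0) op=''
#     Value(data=-3.0) op=''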
Visualize the expression graph with operators and operands
from graphviz import Digraph
def trace(root):
    # build a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root, label):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name=uid, label=label(n), shape='record') # ⭠ label function getting called
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(name=uid + n._op, label=n._op)
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)

    return dot
def label(node): return "{data %.4f}" % (node.data)
draw_dot(d, label)
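The returned Digraph renders inline in a notebook; outside one (an assumption, not something the original does), the same object can be written to disk instead:

dot = draw_dot(d, label)
dot.render('graph')  # writes the DOT source as 'graph' plus the rendered 'graph.svg'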
Add a label to each node so that we know which variable each value corresponds to
class Value:
    def __init__(self, data, _children=(), _op='', label=''): # ⭠ Add label
        self.data = data
        self._prev = set(_children)
        self._op = _op
        self.label = label # ⭠ Add label

    def __repr__(self):
        return f"Value(label={self.label} data={self.data})"

    def __add__(self, other):
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        return Value(self.data * other.data, (self, other), '*')

a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10, label='c')
e = a*b; e.label = 'e'
d = e + c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f; L.label = 'L'
L
Value(label=L data=-8.0)
Change the label function to render the label
def label(node): return "{%s | {data %.4f}}" % (node.label, node.data)
draw_dot(L, label)
Add grad to the Value class
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0 # ⭠ Add grad
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(label={self.label} data={self.data})"

    def __add__(self, other):
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        return Value(self.data * other.data, (self, other), '*')

a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10, label='c')
e = a*b; e.label = 'e'
d = e + c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f; L.label = 'L'
L.grad
0.0
def label(node): return "{%s | {data %.4f} | grad %.4f}" % (node.label, node.data, node.grad)
draw_dot(L, label)
Create a function lol for numerical checks
Derivative of L with respect to a
def lol():
    h = 0.0001

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L1 = L.data

    a = Value(2.0 + h, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L2 = L.data

    print((L2 - L1) / h)
lol()
6.000000000021544
Derivative of L with respect to L (trivially 1)
def lol():
    h = 0.0001

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L1 = L.data

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L2 = L.data + h

    print((L2 - L1) / h)
lol()
0.9999999999976694
L.grad = 1
draw_dot(L, label)
Derivative of L with respect to f
\[ L = f \cdot d \]
\[ \frac{\partial L}{\partial f} = \frac{\partial (f \cdot d)}{\partial f} = d = 4.0 \]
def lol():
    h = 0.001

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L1 = L.data

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0 + h, label='f')
    L = d * f; L.label = 'L'
    L2 = L.data

    print((L2 - L1) / h)
lol()
3.9999999999995595
f.grad = 4
draw_dot(L, label)
Derivative of L with respect to d
\[ \frac{\partial L}{\partial d} = \frac{\partial (f \cdot d)}{\partial d} = f = -2.0 \]
def lol():
    h = 0.001

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L1 = L.data

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    d.data += h
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L2 = L.data

    print((L2 - L1) / h)
lol()
-2.000000000000668
d.grad = -2
draw_dot(L, label)
Derivative of L with respect to c
\[ \frac{\partial d}{\partial c} = \frac{\partial (c + e)}{\partial c} = 1.0 \]
\[ \frac{\partial L}{\partial c} = \frac{\partial L}{\partial d}\cdot\frac{\partial d}{\partial c} = f = -2.0 \]
def lol():
    h = 0.001

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L1 = L.data

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10 + h, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L2 = L.data

    print((L2 - L1) / h)
lol()
-1.9999999999988916
c.grad = -2
draw_dot(L, label)
Derivative of L with respect to e
\[ \frac{\partial d}{\partial e} = \frac{\partial (c + e)}{\partial e} = 1.0 \]
\[ \frac{\partial L}{\partial e} = \frac{\partial L}{\partial d} \cdot \frac{\partial d}{\partial e} = f = -2.0 \]
def lol():
    h = 0.001

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L1 = L.data

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    e.data += h
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L2 = L.data

    print((L2 - L1) / h)
lol()
-2.000000000000668
e.grad = -2
draw_dot(L, label)
Derivative of L with respect to a
\[ \frac{\partial e}{\partial a} = \frac{\partial ({a}\cdot{b})}{\partial a} = b \]
\[ \frac{\partial L}{\partial a} = \frac{\partial L}{\partial e} \cdot \frac{\partial e}{\partial a} = -2 \cdot b = -2 \cdot (-3) = 6 \]
def lol():
    h = 0.001

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L1 = L.data

    a = Value(2.0 + h, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L2 = L.data

    print((L2 - L1) / h)
lol()
6.000000000000227
a.grad = 6
draw_dot(L, label)
Derivative of L with respect to b
\[ \frac{\partial e}{\partial b} = \frac{\partial ({a}\cdot{b})}{\partial b} = a \]
\[ \frac{\partial L}{\partial b} = \frac{\partial L}{\partial e} \cdot \frac{\partial e}{\partial b} = -2 \cdot a = -2 \cdot 2 = -4 \]
def lol():
    h = 0.001

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L1 = L.data

    a = Value(2.0, label='a')
    b = Value(-3.0 + h, label='b')
    c = Value(10, label='c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L2 = L.data

    print((L2 - L1) / h)
lol()
-3.9999999999995595
b.grad = -4
draw_dot(L, label)
a.data += 0.01 * a.grad
b.data += 0.01 * b.grad
c.data += 0.01 * c.grad
f.data += 0.01 * f.grad

e = a * b
d = e + c
L = d * f

print(L.data)
-7.286496
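To first order, nudging every leaf by 0.01 times its gradient should raise L by about 0.01 * (6**2 + (-4)**2 + (-2)**2 + 4**2) = 0.72, and L indeed moved from -8.0 to about -7.2865, a gain of roughly 0.71. A quick check of that estimate (a sketch, not in the original):

step = 0.01
print(step * sum(g**2 for g in [6, -4, -2, 4]))  # first-order prediction: 0.72
print(-7.286496 - (-8.0))                        # observed gain: 0.713504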
Neural Network
Tanh
plt.plot(np.arange(-5, 5, 0.2), np.tanh(np.arange(-5, 5, 0.2))); plt.grid()
Add tanh
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(label={self.label} data={self.data})"

    def __add__(self, other):
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        return Value(self.data * other.data, (self, other), '*')

    def tanh(self): # ⭠ Add tanh
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        return out
Inputs: x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
Weights: w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
Bias
b = Value(6.8813735870195432, label='b')
x1w1 + x2w2 + b
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'
draw_dot(o, label)
Computing gradient of each node manually
\[ \frac{\partial o}{\partial o} = 1 \]
o.grad = 1.0
draw_dot(o, label)
\[ o = \tanh(n) \] \[ \frac{\partial o}{\partial n} = \frac{\partial{\tanh(n)}}{\partial n} = 1 - \tanh(n)^2 = 1 - o^2 \]
1 - (o.data ** 2)
0.4999999999999999
n.grad = 0.5
draw_dot(o, label)
With a plus node, as we saw, the local derivative is 1, so the gradient simply passes through: both inputs receive the same gradient as the output.

x1w1x2w2.grad = 0.5
b.grad = 0.5
draw_dot(o, label)
x1w1.grad = 0.5
x2w2.grad = 0.5
draw_dot(o, label)
x2.grad = w2.data * x2w2.grad
w2.grad = x2.data * x2w2.grad
draw_dot(o, label)
x1.grad = w1.data * x1w1.grad
w1.grad = x1.data * x1w1.grad
draw_dot(o, label)
Computing gradient of each node with _backward()
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None # ⭠ Add _backward
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(label={self.label} data={self.data})"

    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad = 1.0 * out.grad
            other.grad = 1.0 * out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad = other.data * out.grad
            other.grad = self.data * out.grad
        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            self.grad = (1 - t**2) * out.grad
        out._backward = _backward
        return out
Let's take the NN code from above

x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

b = Value(6.8813735870195432, label='b')

x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'
draw_dot(o, label)
Backward on o
o.grad = 1.0 # setting this to 1 because Value's grad variable starts at 0
o._backward(); n.grad
0.4999999999999999
Backward on n
n._backward()
b.grad, x1w1x2w2.grad
(0.4999999999999999, 0.4999999999999999)
Backward on b
b._backward()
Backward on x1w1x2w2
x1w1x2w2._backward()
x1w1.grad, x2w2.grad
(0.4999999999999999, 0.4999999999999999)
Backward on x2w2
x2w2._backward()
x2.grad, w2.grad
(0.4999999999999999, 0.0)
Backward on x1w1
x1w1._backward()
x1.grad, w1.grad
(-1.4999999999999996, 0.9999999999999998)
Draw the computation graph
draw_dot(o, label)
Backward propagation with one call
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

b = Value(6.8813735870195432, label='b')

x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'
draw_dot(o, label)
Topological sort
topo = []
visited = set()
def build_topo(v):
    if v not in visited:
        visited.add(v)
        for child in v._prev:
            build_topo(child)
        topo.append(v)
build_topo(o)
topo
[ Value(label=x1 data=2.0), Value(label=w1 data=-3.0), Value(label=x1*w1 data=-6.0), Value(label=w2 data=1.0), Value(label=x2 data=0.0), Value(label=x2*w2 data=0.0), Value(label=x1*w1 + x2*w2 data=-6.0), Value(label=b data=6.881373587019543), Value(label=n data=0.8813735870195432), Value(label=o data=0.7071067811865476) ]
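A quick property check (a sketch, not in the original): in a valid topological order every node appears after all of the nodes that feed into it, which is exactly what lets us run _backward from the output downwards.

# every child must come before its parent in topo
assert all(topo.index(child) < topo.index(v) for v in topo for child in v._prev)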
Apply _backward in reverse topological order of the computation graph, so each node's gradient is fully accumulated before it is propagated to its children
for node in reversed(topo):
node._backward()
draw_dot(o, label)
Add backward to the Value class
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(label={self.label} data={self.data})"

    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad = 1.0 * out.grad
            other.grad = 1.0 * out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad = other.data * out.grad
            other.grad = self.data * out.grad
        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            self.grad = (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def backward(self): # ⭠ Add backward
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

b = Value(6.8813735870195432, label='b')

x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'
o.backward()
draw_dot(o, label)
Fixing a backprop bug
When a Value is used more than once (here a feeds the addition twice), each use should contribute to its gradient. With plain assignment in _backward, the second write overwrites the first, so a.grad ends up as 1 instead of 2.

a = Value(3.0, label='a')
b = a + a; b.label = 'b'

b.backward()
draw_dot(b, label)
a = Value(-2.0, label='a')
b = Value(3.0, label='b')
d = a*b; d.label = 'd'
e = a+b; e.label = 'e'
f = d*e; f.label = 'f'

f.backward()
draw_dot(f, label)
Accumulate the gradient
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(label={self.label} data={self.data})"

    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad += 1.0 * out.grad # <- Accumulate the gradient
            other.grad += 1.0 * out.grad # <- Accumulate the gradient
        out._backward = _backward
        return out

    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad += other.data * out.grad # <- Accumulate the gradient
            other.grad += self.data * out.grad # <- Accumulate the gradient
        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            self.grad += (1 - t**2) * out.grad # <- Accumulate the gradient
        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
a = Value(3.0, label='a')
b = a + a; b.label = 'b'

b.backward()
draw_dot(b, label)
a = Value(-2.0, label='a')
b = Value(3.0, label='b')
d = a*b; d.label = 'd'
e = a+b; e.label = 'e'
f = d*e; f.label = 'f'

f.backward()
draw_dot(f, label)
Add and multiply a Value object with a constant
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(label={self.label} data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other) # ⭠ wrap constants in a Value
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other): # other + self
        return self + other

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other) # ⭠ wrap constants in a Value
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other): # other * self
        return self * other

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
a = Value(2.0); a + 1
Value(label= data=3.0)
a = Value(2.0); a * 1
Value(label= data=2.0)
2 * a
Value(label= data=4.0)
2 + a
Value(label= data=4.0)
Implement tanh from more primitive operations (exp, pow, division, subtraction)
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(label={self.label} data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other): # other + self
        return self + other

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other): # other * self
        return self * other

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data ** other, (self,), f'**{other}')
        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward
        return out

    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        return out

    def __truediv__(self, other): # self / other
        return self * other**-1

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def __neg__(self): # -self
        return self * -1

    def __sub__(self, other): # self - other
        return self + (-other)

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
a = Value(2.0)
a.exp()
Value(label= data=7.38905609893065)
b = Value(3.0)
a/b
Value(label= data=0.6666666666666666)
a**4
Value(label= data=16.0)
a - 1
Value(label= data=1.0)
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

b = Value(6.8813735870195432, label='b')

x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
# -----
e = (2*n).exp()
o = (e - 1)/(e + 1)
# -----
o.label = 'o'

o.backward()
draw_dot(o, label)
x1w1 + x2w2 + b with PyTorch
import torch
x1 = torch.Tensor([2.0]).double(); x1.requires_grad = True
x2 = torch.Tensor([0.0]).double(); x2.requires_grad = True
w1 = torch.Tensor([-3.0]).double(); w1.requires_grad = True
w2 = torch.Tensor([1.0]).double(); w2.requires_grad = True
b = torch.Tensor([6.8813735870195432]); b.requires_grad = True # note: b stays float32 (no .double()), which explains the tiny numerical differences below
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)
print(o.data.item())
o.backward()
print('-----')
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())
0.7071066904050358
-----
x2 0.5000001283844369
w2 0.0
x1 -1.5000003851533106
w1 1.0000002567688737
torch.Tensor([[1, 2, 3], [4, 5, 6]])
tensor([[1., 2., 3.], [4., 5., 6.]])
Neural Network
import random
class Neuron:
    def __init__(self, nin):
        self.w = [Value(random.uniform(-1, 1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        act = sum((wi*xi for wi, xi in zip(self.w, x)), self.b)
        return act.tanh()

    def parameters(self):
        return self.w + [self.b]
x = [2.0, 3.0]
n = Neuron(2)
n(x)
Value(label= data=-0.71630401051218)
class Layer:
    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs

    def parameters(self):
        return [p for neuron in self.neurons for p in neuron.parameters()]

x = [2.0, 3.0]
n = Layer(2, 3)
n(x)
[ Value(label= data=0.9323923071860208), Value(label= data=-0.6957480842688355), Value(label= data=0.9949508713128399) ]
class MLP:
    def __init__(self, nin, nouts):
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
o = n(x)
o.grad = 1
o.backward()

n.parameters(), len(n.parameters())
( [ Value(label= data=-0.8773320545613115), Value(label= data=0.21854271535158198), Value(label= data=0.13730892829595565), Value(label= data=-0.5436703421639371), Value(label= data=-0.5007041170945776), Value(label= data=0.9789830631658898), Value(label= data=0.8050974151663517), Value(label= data=-0.11016135996456167), Value(label= data=0.22124094253907778), Value(label= data=-0.8692488975746844), Value(label= data=-0.51512083826767), Value(label= data=-0.15884614255298235), Value(label= data=-0.9216734804692623), Value(label= data=0.6165197184222242), Value(label= data=0.33389808347375305), Value(label= data=0.6716163019747723), Value(label= data=0.7479127489965471), Value(label= data=0.6913996844396202), Value(label= data=-0.3719520946883914), Value(label= data=0.0381466491267759), Value(label= data=-0.8036261340897828), Value(label= data=0.14331062776761772), Value(label= data=-0.9904951973594573), Value(label= data=0.23265417282124412), Value(label= data=-0.5441204768729622), Value(label= data=0.09037344168895323), Value(label= data=-0.6263186959547287), Value(label= data=-0.7687145874568115), Value(label= data=0.8067183837857432), Value(label= data=-0.6695236110998573), Value(label= data=-0.4936725683149976), Value(label= data=-0.948783805686829), Value(label= data=-0.362064305878842), Value(label= data=0.71706547232376), Value(label= data=-0.38398098767491895), Value(label= data=-0.854407056637168), Value(label= data=-0.43771644655834585), Value(label= data=-0.8122254391243122), Value(label= data=-0.7849921896499341), Value(label= data=0.7867428242639574), Value(label= data=0.9508849142793219) ], 41 )
draw_dot(o, label)
Tiny Dataset with loss function
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]
ys = [1.0, -1.0, -1.0, 1.0]

ypred = [n(x) for x in xs]
loss = sum([(yout - ygt)**2 for ygt, yout in zip(ys, ypred)])
loss
Value(label= data=6.145450264034548)
ypred
[ Value(label= data=0.9613622740038076), Value(label= data=0.5091506010451603), Value(label= data=0.9497390552324829), Value(label= data=0.7451677610062515) ]
Repeat
loss.backward()
n.layers[0].neurons[0].w[0].grad
-0.04473465638916681
n.layers[0].neurons[0].w[0].data
-0.8773320545613115
for p in n.parameters():
    p.data += -0.01 * p.grad

n.layers[0].neurons[0].w[0].data
-0.8768847079974198
ypred = [n(x) for x in xs]
loss = sum([(yout - ygt)**2 for ygt, yout in zip(ys, ypred)])
loss
Value(label= data=5.9448254035134465)
ypred
[ Value(label= data=0.9617265322081271), Value(label= data=0.43565539138090165), Value(label= data=0.9510677256835314), Value(label= data=0.725065694807515) ]
n.parameters()
[ Value(label= data=-0.8768847079974198), Value(label= data=0.21949508647796803), Value(label= data=0.1385938942733024), Value(label= data=-0.5427509639370535), Value(label= data=-0.4989987951971535), Value(label= data=0.9810637753982486), Value(label= data=0.8032841357361882), Value(label= data=-0.10822461638644491), Value(label= data=0.2191073445250642), Value(label= data=-0.8693341479297328), Value(label= data=-0.5150553447581354), Value(label= data=-0.1599712708867693), Value(label= data=-0.9204522149327646), Value(label= data=0.6176978033162941), Value(label= data=0.33248584468999454), Value(label= data=0.6728960760418363), Value(label= data=0.7467247012385713), Value(label= data=0.6928044491564865), Value(label= data=-0.3733392595509354), Value(label= data=0.03928302778692913), Value(label= data=-0.8017417247095591), Value(label= data=0.14178985429307983), Value(label= data=-0.9916730215251183), Value(label= data=0.23454886497577554), Value(label= data=-0.5461943160052498), Value(label= data=0.09193046790217876), Value(label= data=-0.6258347045651111), Value(label= data=-0.768590750645844), Value(label= data=0.8076512515956322), Value(label= data=-0.6702462673395139), Value(label= data=-0.49429632745656293), Value(label= data=-0.9460952730831453), Value(label= data=-0.3596651251112354), Value(label= data=0.7141165005167276), Value(label= data=-0.3806245013086102), Value(label= data=-0.8570897419717581), Value(label= data=-0.41643954968132757), Value(label= data=-0.8282591500009012), Value(label= data=-0.8021962015254895), Value(label= data=0.7707916705043153), Value(label= data=0.9262723943108128) ]
Make the above Repeat section into a training loop
def train(repeats, model, xs, ys, lr=0.01):
    for k in range(repeats):
        # forward pass
        ypred = [model(x) for x in xs]
        loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
        print(k, loss.data)
        # backward propagation
        for p in model.parameters(): p.grad = 0.0 # zero_grad()
        loss.backward()
        # update: gradient descent
        for p in model.parameters(): p.data += -lr * p.grad
    return ypred
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]
ys = [1.0, -1.0, -1.0, 1.0]

model = MLP(3, [4, 4, 1])
train(10, model, xs, ys, 0.05)
0 7.220472885146723
1 6.8361918435934355
2 5.589046560434221
3 4.147937671605216
4 4.875203300452224
5 4.323610795464504
6 3.546174609092321
7 1.7412962395817813
8 0.5351626826196177
9 0.21335050879201323
[ Value(label= data=0.5781623601547444), Value(label= data=-0.8870687314798208), Value(label= data=-0.8905626911927673), Value(label= data=0.896687278453952) ]
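As a quick follow-up (a sketch, not part of the original run), the returned predictions can be captured and compared against the targets directly; they keep moving toward the +-1 targets as training continues:

ypred = train(10, model, xs, ys, 0.05)  # a few more steps, capturing the predictions
for ygt, yout in zip(ys, ypred):
    print(f"target {ygt:+.1f}  predicted {yout.data:+.4f}")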
Done!