from collections import Counter
import numpy as np
import torch
from rich import print
from rich import pretty
from matplotlib import pyplot as plt
Building makemore
g = torch.Generator().manual_seed(2147483647)
pretty.install()
Counting
Read in the data
def get_words(filename):
    with open(filename) as f:
        return list(map(lambda x: x.strip(), f.readlines()))
words = get_words('../data/names.txt')
words[:10]
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
len(words)
32033
Minimum Length
min(len(w) for w in words)
2
Maximum Length
max(len(w) for w in words)
15
Create pairings of the nth and (n+1)th position characters
for w in words[:1]:
    for ch1, ch2 in zip(w, w[1:]):
        print(ch1, ch2)
e m
m m
m a
Add start (<S>) and end (<E>) tokens to each word
This way the model knows where a word starts and where it ends
def generate_pairings(words, start_token='<S>', end_token='<E>'):
    for w in words:
        chs = [start_token] + list(w) + [end_token]
        for ch1, ch2 in zip(chs, chs[1:]):
            yield ch1, ch2
for ch1, ch2 in generate_pairings(words[:1]):
    print(ch1, ch2)
<S> e
e m
m m
m a
a <E>
sum(1 for ch1, ch2 in generate_pairings(words))
228146
Let's see the pairings for 3 words
for ch1, ch2 in generate_pairings(words[:3]):
    print(ch1, ch2)
<S> e
e m
m m
m a
a <E>
<S> o
o l
l i
i v
v i
i a
a <E>
<S> a
a v
v a
a <E>
Count of bigrams
Bigram counts for 3 words
def create_bigram_counter(words):
    b = Counter()
    for ch1, ch2 in generate_pairings(words):
        bigram = (ch1, ch2)
        b[bigram] += 1
    return b
create_bigram_counter(words[:3])
Counter({ ('<S>', 'e'): 1, ('e', 'm'): 1, ('m', 'm'): 1, ('m', 'a'): 1, ('a', '<E>'): 3, ('<S>', 'o'): 1, ('o', 'l'): 1, ('l', 'i'): 1, ('i', 'v'): 1, ('v', 'i'): 1, ('i', 'a'): 1, ('<S>', 'a'): 1, ('a', 'v'): 1, ('v', 'a'): 1 })
Bigram counts for all words
b = create_bigram_counter(words)
b.most_common(10)
[ (('n', '<E>'), 6763), (('a', '<E>'), 6640), (('a', 'n'), 5438), (('<S>', 'a'), 4410), (('e', '<E>'), 3983), (('a', 'r'), 3264), (('e', 'l'), 3248), (('r', 'i'), 3033), (('n', 'a'), 2977), (('<S>', 'k'), 2963) ]
Create a 2D array of the bigram counts
A little warmup with tensors
a = torch.zeros((3, 5), dtype=torch.int32)
a
tensor([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], dtype=torch.int32)
a.dtype
torch.int32
a[1, 3] = 1
a
tensor([[0, 0, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 0]], dtype=torch.int32)
a[1, 3] += 1
a
tensor([[0, 0, 0, 0, 0], [0, 0, 0, 2, 0], [0, 0, 0, 0, 0]], dtype=torch.int32)
2D matrix of the alphabet
def get_stoi(words, start_token, end_token, tokens_at_start=True):
    chars = []
    if tokens_at_start:
        chars.append(start_token)
        if start_token != end_token:
            chars.append(end_token)
    chars.extend(sorted(list(set(''.join(words)))))
    if not tokens_at_start:
        chars.append(start_token)
        if start_token != end_token:
            chars.append(end_token)
    stoi = {s:i for i,s in enumerate(chars)}
    return stoi
stoi = get_stoi(words, '<S>', '<E>', tokens_at_start=False)
stoi
{ 'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, '<S>': 26, '<E>': 27 }
def create_bigram_matrix(words, start_token, end_token, tokens_at_start=True):
    stoi = get_stoi(words, start_token, end_token, tokens_at_start)
    alphabet_size = len(stoi)
    N = torch.zeros((alphabet_size, alphabet_size), dtype=torch.int32)
    for ch1, ch2 in generate_pairings(words, start_token, end_token):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1
    return N
N = create_bigram_matrix(words, '<S>', '<E>', False)
N[:10, :10]
tensor([[ 556, 541, 470, 1042, 692, 134, 168, 2332, 1650, 175], [ 321, 38, 1, 65, 655, 0, 0, 41, 217, 1], [ 815, 0, 42, 1, 551, 0, 2, 664, 271, 3], [1303, 1, 3, 149, 1283, 5, 25, 118, 674, 9], [ 679, 121, 153, 384, 1271, 82, 125, 152, 818, 55], [ 242, 0, 0, 0, 123, 44, 1, 1, 160, 0], [ 330, 3, 0, 19, 334, 1, 25, 360, 190, 3], [2244, 8, 2, 24, 674, 2, 2, 1, 729, 9], [2445, 110, 509, 440, 1653, 101, 428, 95, 82, 76], [1473, 1, 4, 4, 440, 0, 0, 45, 119, 2]], dtype=torch.int32)
The type of a cell in the above N is a tensor
type(N[1, 1])
<class 'torch.Tensor'>
Therefore we have to call .item() on it to get the underlying value
type(N[1, 1].item())
<class 'int'>
plt.imshow(N)
itos = dict(map(reversed, stoi.items()))
itos
{ 0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z', 26: '<S>', 27: '<E>' }
def plot_matrix(N, itos):
    plt.figure(figsize=(16, 16))
    plt.imshow(N, cmap='Blues')
    for i in range(N.shape[0]):
        for j in range(N.shape[1]):
            chstr = itos[i] + itos[j]
            plt.text(j, i, chstr, ha="center", va="bottom", color="gray")
            plt.text(j, i, N[i, j].item(), ha="center", va="top", color="gray")
    plt.axis("off")
plot_matrix(N, itos)
Remove <E> and <S> in favor of a single . token
This also gets rid of the column and row that contain only zero values (nothing ever precedes <S> or follows <E>)
stoi = get_stoi(words, '.', '.')
stoi
{ '.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26 }
itos = dict(map(reversed, stoi.items()))
N = create_bigram_matrix(words, '.', '.')
N[0, 0]
tensor(0, dtype=torch.int32)
plot_matrix(N, itos)
N[0]
tensor([ 0, 4410, 1306, 1542, 1690, 1531, 417, 669, 874, 591, 2422, 2963, 1572, 2538, 1146, 394, 515, 92, 1639, 2055, 1308, 78, 376, 307, 134, 535, 929], dtype=torch.int32)
Sampling
Warm up with a probability tensor
p = torch.rand(3, generator=g)
p
tensor([0.7081, 0.3542, 0.1054])
p.sum()
tensor(1.1678)
p = p/p.sum()
p
tensor([0.6064, 0.3033, 0.0903])
Drawing 20 samples
p_dist = torch.multinomial(p, num_samples=20, replacement=True, generator=g)
p_dist
tensor([1, 1, 2, 0, 0, 2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1])
len(p_dist[p_dist == 0])/len(p_dist)
0.45
len(p_dist[p_dist == 1])/len(p_dist)
0.45
len(p_dist[p_dist == 2])/len(p_dist)
0.1
Drawing 50 samples
p_dist = torch.multinomial(p, num_samples=50, replacement=True, generator=g)
p_dist
tensor([0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])
len(p_dist[p_dist == 0])/len(p_dist)
0.64
len(p_dist[p_dist == 1])/len(p_dist)
0.32
len(p_dist[p_dist == 2])/len(p_dist)
0.04
Drawing a character according to its probability of occurrence
p = N[0].float()
p = p / p.sum()
p
tensor([0.0000, 0.1377, 0.0408, 0.0481, 0.0528, 0.0478, 0.0130, 0.0209, 0.0273, 0.0184, 0.0756, 0.0925, 0.0491, 0.0792, 0.0358, 0.0123, 0.0161, 0.0029, 0.0512, 0.0642, 0.0408, 0.0024, 0.0117, 0.0096, 0.0042, 0.0167, 0.0290])
ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
ix
19
itos[ix]
's'
def generate_names(count, pdist_func, g):
    for i in range(count):
        out = []
        ix = 0
        while True:
            p = pdist_func(ix)
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:
                break
        yield ''.join(out)
p_occurance = lambda ix: N[ix].float()/N[ix].sum()
for name in generate_names(10, p_occurance, g): print(name)
blon.
ke.
a.
ry.
l.
balycaweriginnn.
data.
bh.
matt.
jeeve.
Drawing a character from a uniform probability distribution
p_uniform = lambda ix: torch.ones(len(N[ix]))/len(N[ix])
for name in generate_names(10, p_uniform, g): print(name)
wwjieqrrlvhtwogbqtwrmcjpnvrkifgnsgfvp.
kynsszpvqzmmwpogyzdhpfapyhlqdxcvczntn.
.
.
rxnsmepegjipknhbzrrz.
kgkznqqzsdaacfanvedfjga.
ycgfsirvvmcrvssnqjbjuqfzanulmxxkseuktjmbhn.
x.
wsuzuxkneqmel.
qrbcskqqopeqbkuidxrnmyyfvysdxvfwix.
Vectorized normalization of rows and columns
Warm up with normalization
P = N.float()
P.shape
torch.Size([27, 27])
P.sum(0, keepdim=True).shape
torch.Size([1, 27])
P.sum(1, keepdim=True).shape
torch.Size([27, 1])
P.sum(0, keepdim=False).shape
torch.Size([27])
P.sum(1, keepdim=False).shape
torch.Size([27])
Broadcasting
Two tensors are “broadcastable” if the following rules hold:
- Each tensor has at least one dimension.
- When iterating over the dimension sizes, starting at the trailing dimension, the dimension sizes must either be equal, one of them is 1, or one of them does not exist.
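To see these rules in action before applying them to P, here is a minimal sketch with small, made-up tensors (the names x, v_col and v_row are illustrative only):
x = torch.ones(3, 4)
v_col = torch.ones(3, 1) * 2   # shape (3, 1): stretched across the 4 columns
v_row = torch.ones(4) * 2      # shape (4,) is treated as (1, 4): stretched down the 3 rows
(x / v_col).shape, (x / v_row).shape   # both give (3, 4), but the division runs in different directions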
P.shape
torch.Size([27, 27])
P_sum_col = P.sum(1, keepdim=True)
P_sum_col.shape
torch.Size([27, 1])
As you can see above, the shapes of the two variables P and P_sum_col are
27 by 27
27 by 1
Broadcasting stretches the unit dimension of P_sum_col across the 27 columns and then does element-wise division, so each row of P is divided by its own row sum.
So P_norm will be
P_norm = P/P_sum_col
P_norm.shape
torch.Size([27, 27])
normalized_P = lambda ix: P_norm[ix]
for name in generate_names(10, normalized_P, g): print(name)
ele.
zelensskan.
a.
ilelena.
arah.
lizanolbraris.
sil.
kyliketo.
asonnngaeyja.
an.
P_sum_col without keepdim
P_sum_col_wo_keepdims = P.sum(1)
P_sum_col_wo_keepdims.shape
torch.Size([27])
What if we use the variable P_sum_col_wo_keepdims to divide P? How will the broadcasting rules be applied?
The shapes of the two variables P and P_sum_col_wo_keepdims are
27 by 27
27
Aligning the trailing dimension of P_sum_col_wo_keepdims with the shape of P, it is treated as
27 by 27
1 by 27
Broadcasting then stretches the unit dimension of P_sum_col_wo_keepdims down across the 27 rows, so every element in column j gets divided by the sum of row j.
The result will be
P_norm_wo_keepdims = P/P_sum_col_wo_keepdims
torch.equal(P_norm_wo_keepdims, P_norm)
False
So without keepdim we end up normalizing in the wrong direction: each element of column j is divided by the sum of row j, instead of each row being divided by its own sum.
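A quick way to see the difference (a small check, reusing the tensors defined above):
torch.allclose(P_norm[3], P[3] / P[3].sum())                    # True: row 3 is divided by its own sum
torch.allclose(P_norm_wo_keepdims[:, 3], P[:, 3] / P[3].sum())  # True: all of column 3 is divided by the sum of row 3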
wrongly_normalized_P = lambda ix: P_norm_wo_keepdims[ix]
for name in generate_names(10, wrongly_normalized_P, g): print(name)
cishwambitzuruvefaum.
ajorun.
xilinnophajorovebrglmivoublicckyle.
joyquwasooxxentomprtyuquviequzaq.
juxtrcoxluckyjayspttycelllwyddstotyphaxxxwecquxzikoququzynikoposylixxuffruedrkowh.
ju.
ixxxisrielyavrhmidexytzrohauxiexxxxxxzurefffaigtzuzzantallyojoxxxt.
oprghah.
stzldouwinolyselppp.
j.
Loss function
Probability of each pairing
for ch1, ch2 in generate_pairings(words[:3], '.', '.'): print(f'{ch1}{ch2}')
.e
em
mm
ma
a.
.o
ol
li
iv
vi
ia
a.
.a
av
va
a.
def generate_pairing_probs(words):
    for ch1, ch2 in generate_pairings(words, '.', '.'):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P_norm[ix1, ix2]
        yield ch1, ch2, prob
for ch1, ch2, prob in generate_pairing_probs(words[:3]): print(f'{ch1}{ch2}: {prob: .4f}')
.e: 0.0478
em: 0.0377
mm: 0.0253
ma: 0.3899
a.: 0.1960
.o: 0.0123
ol: 0.0780
li: 0.1777
iv: 0.0152
vi: 0.3541
ia: 0.1381
a.: 0.1960
.a: 0.1377
av: 0.0246
va: 0.2495
a.: 0.1960
The probability a uniform model would assign to each character is
1/27
0.037037037037037035
which is ~4%.
If the probability assigned by the bigram model to a pairing were 1, the model would be completely sure about what comes next.
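As a rough check (a small sketch reusing generate_pairing_probs from above), we can compare the average pairing probability the bigram model assigns on these 3 words against the uniform baseline of 1/27:
probs3 = [prob.item() for _, _, prob in generate_pairing_probs(words[:3])]
sum(probs3) / len(probs3), 1/27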
Negative Log Likelihood
The product of all the pairing probabilities tells us how well the model is performing. Since this product becomes vanishingly small, we work with the log likelihood instead.
Likelihood \[ L = a \times b \times c \]
Log Likelihood \[ \log {(a \times b \times c)} = \log {a} + \log {b} + \log {c} \]
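A throwaway numeric check of the identity above, with made-up probabilities:
a, b, c = torch.tensor([0.2, 0.5, 0.1])
torch.log(a * b * c), torch.log(a) + torch.log(b) + torch.log(c)   # both are log(0.01) ≈ -4.6052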
def print_prob_logprob(words):
    for ch1, ch2, prob in generate_pairing_probs(words):
        logprob = torch.log(prob)
        print(f'{ch1}{ch2}: {prob: .4f} {logprob: .4f}')
print_prob_logprob(words[:3])
.e: 0.0478 -3.0408
em: 0.0377 -3.2793
mm: 0.0253 -3.6772
ma: 0.3899 -0.9418
a.: 0.1960 -1.6299
.o: 0.0123 -4.3982
ol: 0.0780 -2.5508
li: 0.1777 -1.7278
iv: 0.0152 -4.1867
vi: 0.3541 -1.0383
ia: 0.1381 -1.9796
a.: 0.1960 -1.6299
.a: 0.1377 -1.9829
av: 0.0246 -3.7045
va: 0.2495 -1.3882
a.: 0.1960 -1.6299
Let's sum up all the log probabilities
def log_likelihood(words):
    log_likelihood = 0
    for ch1, ch2, prob in generate_pairing_probs(words):
        log_likelihood += torch.log(prob)
    return log_likelihood
log_likelihood(words[:3])
tensor(-38.7856)
The log likelihood will be 0 if all the probabilities are 1, and negative if one or more of the probabilities is less than 1; its maximum possible value is therefore 0. We want something we can define as a loss, such that the more inaccurate the predictions, the higher the loss.
So if we take the negative of the log likelihood, we get a number that grows with inaccuracy.
def negative_log_likelihood(words):
return -log_likelihood(words)
negative_log_likelihood(words[:3])
tensor(38.7856)
Sometimes we want to normalize the log likelihood by the count of pairs. Let's do that.
def log_likelihood_normalized(words):
    count = 0
    log_likelihood = 0
    for ch1, ch2, prob in generate_pairing_probs(words):
        log_likelihood += torch.log(prob)
        count += 1
    return log_likelihood/count
log_likelihood_normalized(words)
tensor(-2.4541)
def negative_log_likelihood_normalized(words):
return -log_likelihood_normalized(words)
negative_log_likelihood_normalized(words)
tensor(2.4541)
So the training loss (the average negative log likelihood over all the words) is 2.4541
Test it on some test data
"anubhav"]) negative_log_likelihood_normalized([
tensor(3.1186)
"anubhavm"]) negative_log_likelihood_normalized([
tensor(inf)
The loss is infinite, which means the model will never predict anubhavm
Let's see which pairing gives zero probability
"anubhavm"]) print_prob_logprob([
.a: 0.1377 -1.9829
an: 0.1605 -1.8296
nu: 0.0052 -5.2518
ub: 0.0329 -3.4157
bh: 0.0155 -4.1669
ha: 0.2946 -1.2220
av: 0.0246 -3.7045
vm: 0.0000 -inf
m.: 0.0777 -2.5551
We see that the pairing vm has 0 probability of occurrence, which leads to infinite loss.
In the plot below as well, m follows v 0 times
plot_matrix(N, itos)
Model Smoothing
Add a very small number (fake counts) to each pairing count so that no probability is 0, and therefore the negative log likelihood is never infinite
P = (N + 1).float()
The more fake counts you add to N, the more uniform the model becomes (probabilities approach the uniform distribution). The fewer you add, the more peaked the model stays (probabilities follow the observed counts).
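To illustrate, here is a sketch with the hypothetical names P1 and P2 (so as not to clobber the P used below):
P1 = (N + 1).float();      P1 = P1 / P1.sum(1, keepdim=True)        # 1 fake count per pairing
P2 = (N + 100000).float(); P2 = P2 / P2.sum(1, keepdim=True)        # an absurd number of fake counts
P1[0, :5], P2[0, :5]   # P2's probabilities are almost exactly uniform (~1/27 everywhere)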
P_sum_col = P.sum(1, keepdim=True)
P_norm = P/P_sum_col
"anubhavm"]) print_prob_logprob([
.a: 0.1376 -1.9835
an: 0.1604 -1.8302
nu: 0.0053 -5.2429
ub: 0.0329 -3.4146
bh: 0.0157 -4.1529
ha: 0.2937 -1.2251
av: 0.0246 -3.7041
vm: 0.0004 -7.8633
m.: 0.0775 -2.5572
"anubhavm"]) negative_log_likelihood_normalized([
tensor(3.5526)
Neural Network
Create the training set of bigrams
def generate_training_set(words, start_token='.', end_token='.'):
    xs, ys = [], []
    for ch1, ch2 in generate_pairings(words, start_token, end_token):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
    return xs, ys
xs, ys = generate_training_set(words[:1])
xs = torch.tensor(xs); xs
tensor([ 0, 5, 13, 13, 1])
ys = torch.tensor(ys); ys
tensor([ 5, 13, 13, 1, 0])
for ch1, ch2 in generate_pairings(words[:1], '.', '.'):
    print(ch1, ch2)
. e
e m
m m
m a
a .
Difference between torch.tensor and torch.Tensor
torch.tensor infers the dtype automatically, while torch.Tensor returns a torch.FloatTensor. I would recommend to stick to torch.tensor, which also has arguments like dtype, if you would like to change the type.
https://stackoverflow.com/a/63116398
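A quick illustration of that difference:
torch.tensor([1, 2]).dtype, torch.Tensor([1, 2]).dtype   # (torch.int64, torch.float32)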
xs.dtype, ys.dtype
(torch.int64, torch.int64)
xs, ys = generate_training_set(words)
xs = torch.Tensor(xs)
ys = torch.Tensor(ys)
xs.dtype, ys.dtype
(torch.float32, torch.float32)
One Hot Encoding of the training dataset
import torch.nn.functional as F
xs, ys = generate_training_set(words[:1])
xs = torch.tensor(xs)
ys = torch.tensor(ys)
xenc = F.one_hot(xs, num_classes=27)
xenc
tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
xenc.shape
torch.Size([5, 27])
plt.imshow(xenc)
xenc.dtype
torch.int64
When we feed numbers into a neural network we don't want them to be int but float, since the network operates on continuous values
xenc = F.one_hot(xs, num_classes=27).float()
xenc.dtype
torch.float32
Initialize the weights
W = torch.randn((27, 1))
W
tensor([[-1.0414], [-0.4622], [ 0.4704], [ 0.2034], [ 0.4376], [ 0.8326], [-1.1531], [-0.5384], [-1.5000], [-0.3734], [-0.9722], [ 0.7093], [ 1.6148], [ 0.6154], [ 0.6585], [-1.2100], [-0.4480], [ 2.4709], [ 1.5362], [-0.8239], [-1.8200], [-2.4810], [-1.1249], [ 1.2613], [-0.7899], [-0.3423], [-0.8073]])
W.shape
torch.Size([27, 1])
xenc @ W
tensor([[-1.0414], [ 0.8326], [ 0.6154], [ 0.6154], [-0.4622]])
Initialize random weights of 27 by 27
W = torch.randn((27, 27))
xenc @ W
tensor([[-1.3844e+00, 1.5959e-02, 3.7060e-01, 1.1356e+00, 5.2515e-01, 7.3794e-01, -1.0737e+00, -9.0978e-01, 1.2984e+00, 1.0683e+00, 1.2605e+00, -1.7498e+00, 4.6805e-01, -3.4442e-01, 1.0569e+00, 1.8138e-01, 8.4084e-01, 1.3287e+00, -7.5910e-01, 7.8683e-01, 9.5301e-01, -1.0442e+00, -2.4167e-02, 6.2387e-01, -6.6787e-02, -7.1907e-01, 1.2762e+00], [-9.1542e-01, -8.4699e-02, 8.1029e-01, 5.2382e-01, -1.4164e+00, 9.8146e-01, 2.2023e+00, 5.3777e-01, 2.7927e-01, -5.9158e-03, 1.1951e-01, -1.0505e+00, 2.1483e-01, 4.4787e-01, 1.7172e+00, 1.6195e+00, -1.2666e+00, -4.3973e-01, 7.8754e-02, 2.4022e-01, 5.2765e-01, 3.4238e-01, -1.5155e+00, -3.3794e-02, 1.3747e+00, 1.8808e+00, 3.2315e-01], [ 1.0474e+00, -1.1022e+00, 1.1412e+00, -1.0475e+00, 1.2827e+00, -1.1662e-01, -1.0313e+00, -5.0630e-01, -5.8584e-01, 3.7119e-01, -6.2447e-01, -6.1076e-01, 7.0085e-01, 2.1230e-01, 1.8492e+00, -1.5117e-01, 2.2283e+00, -1.1119e+00, -9.5698e-01, -2.8551e-02, 1.0193e+00, -8.8697e-01, -7.4386e-02, 1.3281e+00, 2.0499e-01, 8.1934e-01, 2.3981e-01], [ 1.0474e+00, -1.1022e+00, 1.1412e+00, -1.0475e+00, 1.2827e+00, -1.1662e-01, -1.0313e+00, -5.0630e-01, -5.8584e-01, 3.7119e-01, -6.2447e-01, -6.1076e-01, 7.0085e-01, 2.1230e-01, 1.8492e+00, -1.5117e-01, 2.2283e+00, -1.1119e+00, -9.5698e-01, -2.8551e-02, 1.0193e+00, -8.8697e-01, -7.4386e-02, 1.3281e+00, 2.0499e-01, 8.1934e-01, 2.3981e-01], [ 1.0060e+00, -1.6259e-02, -1.9179e+00, 1.6954e-02, 1.0129e+00, -8.4792e-01, 1.4553e+00, -8.6143e-01, 3.8685e-01, 7.8658e-01, 1.7895e+00, -3.5575e-01, 4.3668e-01, 4.7369e-01, -1.1651e+00, 5.3522e-02, -2.1702e+00, 1.2975e+00, 1.1129e+00, 8.5445e-01, 2.0814e-01, 2.7412e-01, -2.4321e-04, 1.3574e+00, -4.5190e-01, 1.5984e-01, -1.2650e-01]])
(xenc @ W).shape
torch.Size([5, 27])
(xenc @ W)[3, 13], (xenc[3] * W[:, 13]).sum()
(tensor(0.2123), tensor(0.2123))
Exponential
logits = (xenc @ W) # log counts
counts = logits.exp() # counts
counts
tensor([[0.2505, 1.0161, 1.4486, 3.1130, 1.6907, 2.0916, 0.3418, 0.4026, 3.6636, 2.9104, 3.5272, 0.1738, 1.5969, 0.7086, 2.8773, 1.1989, 2.3183, 3.7761, 0.4681, 2.1964, 2.5935, 0.3520, 0.9761, 1.8661, 0.9354, 0.4872, 3.5830], [0.4003, 0.9188, 2.2486, 1.6885, 0.2426, 2.6683, 9.0457, 1.7122, 1.3222, 0.9941, 1.1269, 0.3498, 1.2396, 1.5650, 5.5687, 5.0507, 0.2818, 0.6442, 1.0819, 1.2715, 1.6949, 1.4083, 0.2197, 0.9668, 3.9539, 6.5587, 1.3815], [2.8502, 0.3321, 3.1304, 0.3508, 3.6062, 0.8899, 0.3565, 0.6027, 0.5566, 1.4495, 0.5355, 0.5429, 2.0155, 1.2365, 6.3550, 0.8597, 9.2838, 0.3289, 0.3841, 0.9719, 2.7713, 0.4119, 0.9283, 3.7739, 1.2275, 2.2690, 1.2710], [2.8502, 0.3321, 3.1304, 0.3508, 3.6062, 0.8899, 0.3565, 0.6027, 0.5566, 1.4495, 0.5355, 0.5429, 2.0155, 1.2365, 6.3550, 0.8597, 9.2838, 0.3289, 0.3841, 0.9719, 2.7713, 0.4119, 0.9283, 3.7739, 1.2275, 2.2690, 1.2710], [2.7347, 0.9839, 0.1469, 1.0171, 2.7535, 0.4283, 4.2858, 0.4226, 1.4723, 2.1959, 5.9862, 0.7006, 1.5476, 1.6059, 0.3119, 1.0550, 0.1142, 3.6601, 3.0433, 2.3501, 1.2314, 1.3154, 0.9998, 3.8861, 0.6364, 1.1733, 0.8812]])
(xenc @ W)[3, 13]
tensor(0.2123)
xenc[3]
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
W[:, 13]
tensor([-0.3444, 0.4737, 0.0557, -0.1620, -0.6734, 0.4479, -0.7111, 1.3282, 0.2026, 0.0208, 0.2722, 0.3473, -0.6560, 0.2123, 1.7973, 1.2086, -1.2879, -0.0824, -1.3538, -0.3161, -0.9458, -1.2972, 0.5641, -0.4949, 1.0295, 0.0753, -0.1173])
(xenc[3] * W[:, 13]).sum() # is equal to (xenc @ W)[3, 13]
tensor(0.2123)
logits = xenc @ W # log-counts
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
probs.shape
torch.Size([5, 27])
probs[0].sum()
tensor(1.)
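This exp-then-normalize pipeline is exactly a softmax over the logits; as a sanity check (a small sketch using F from torch.nn.functional, imported above):
torch.allclose(probs, F.softmax(logits, dim=1))   # True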
Summary
xs
tensor([ 0, 5, 13, 13, 1])
ys
tensor([ 5, 13, 13, 1, 0])
W = torch.randn((27, 27), generator=g)
xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
counts = logits.exp()
probs = counts/counts.sum(1, keepdims=True)
probs.shape
torch.Size([5, 27])
nlls = torch.zeros(5)
for i in range(5):
    x = xs[i].item()
    y = ys[i].item()
    print('-------------------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x}, {y})')
    print('input to the neural network: ', x)
    print('output probabilities from the neural net:', probs[i])
    print('label (actual next character):', y)
    p = probs[i, y]
    print('probability assigned by the net to the correct character:', p.item())
    logp = torch.log(p)
    print('log likelihood:', logp.item())
    nll = -logp
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

print('========')
print('average negative log likelihood, i.e. loss = ', nlls.mean().item())
-------------------
bigram example 1: .e (indexes 0, 5)
input to the neural network: 0
output probabilities from the neural net: tensor([0.0204, 0.0134, 0.0078, 0.0670, 0.0130, 0.0115, 0.0175, 0.0121, 0.0186, 0.0311, 0.0275, 0.1659, 0.0087, 0.0143, 0.0518, 0.0317, 0.0831, 0.0230, 0.0396, 0.0086, 0.0483, 0.0447, 0.0556, 0.0112, 0.0724, 0.0844, 0.0168])
label (actual next character): 5
probability assigned by the net to the correct character: 0.011521384119987488
log likelihood: -4.463550567626953
negative log likelihood: 4.463550567626953
-------------------
bigram example 2: em (indexes 5, 13)
input to the neural network: 5
output probabilities from the neural net: tensor([0.0081, 0.0690, 0.0499, 0.1331, 0.0985, 0.0740, 0.0093, 0.0052, 0.0234, 0.0321, 0.0267, 0.0309, 0.0093, 0.0228, 0.0269, 0.0085, 0.0049, 0.0363, 0.0139, 0.0326, 0.0531, 0.0262, 0.1151, 0.0097, 0.0136, 0.0420, 0.0248])
label (actual next character): 13
probability assigned by the net to the correct character: 0.0227525494992733
log likelihood: -3.7830779552459717
negative log likelihood: 3.7830779552459717
-------------------
bigram example 3: mm (indexes 13, 13)
input to the neural network: 13
output probabilities from the neural net: tensor([0.0230, 0.0133, 0.0162, 0.0483, 0.0080, 0.0372, 0.0084, 0.0216, 0.0159, 0.0524, 0.0227, 0.0227, 0.0092, 0.0415, 0.1000, 0.0354, 0.0172, 0.0423, 0.0553, 0.0036, 0.0085, 0.0553, 0.0140, 0.0077, 0.0252, 0.2709, 0.0243])
label (actual next character): 13
probability assigned by the net to the correct character: 0.04153481870889664
log likelihood: -3.181223154067993
negative log likelihood: 3.181223154067993
-------------------
bigram example 4: ma (indexes 13, 1)
input to the neural network: 13
output probabilities from the neural net: tensor([0.0230, 0.0133, 0.0162, 0.0483, 0.0080, 0.0372, 0.0084, 0.0216, 0.0159, 0.0524, 0.0227, 0.0227, 0.0092, 0.0415, 0.1000, 0.0354, 0.0172, 0.0423, 0.0553, 0.0036, 0.0085, 0.0553, 0.0140, 0.0077, 0.0252, 0.2709, 0.0243])
label (actual next character): 1
probability assigned by the net to the correct character: 0.013294448144733906
log likelihood: -4.320408821105957
negative log likelihood: 4.320408821105957
-------------------
bigram example 5: a. (indexes 1, 0)
input to the neural network: 1
output probabilities from the neural net: tensor([0.0538, 0.0021, 0.3426, 0.0492, 0.0995, 0.0047, 0.0090, 0.0162, 0.0012, 0.0138, 0.0374, 0.0028, 0.0075, 0.0097, 0.0124, 0.0284, 0.0163, 0.0218, 0.0011, 0.0579, 0.0165, 0.0460, 0.0432, 0.0132, 0.0680, 0.0072, 0.0184])
label (actual next character): 0
probability assigned by the net to the correct character: 0.05381616950035095
log likelihood: -2.9221813678741455
negative log likelihood: 2.9221813678741455
========
average negative log likelihood, i.e. loss = 3.734088182449341
Let's wrap the above into a function and try it with different weight initializations
def train():
    xenc = F.one_hot(xs, num_classes=27).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts/counts.sum(1, keepdims=True)
    nlls = torch.zeros(5)
    for i in range(5):
        x = xs[i].item()
        y = ys[i].item()
        p = probs[i, y]
        logp = torch.log(p)
        nll = -logp
        nlls[i] = nll
    return nlls.mean().item()
W = torch.randn((27, 27))
train()
3.5860557556152344
W = torch.randn((27, 27))
train()
3.2332470417022705
Forward Pass
xs, ys
(tensor([ 0, 5, 13, 13, 1]), tensor([ 5, 13, 13, 1, 0]))
probs[0, 5], probs[1, 13], probs[2, 13], probs[3, 1], probs[4, 0]
(tensor(0.0115), tensor(0.0228), tensor(0.0415), tensor(0.0133), tensor(0.0538))
torch.arange(5)
tensor([0, 1, 2, 3, 4])
probs[torch.arange(5), ys]
tensor([0.0115, 0.0228, 0.0415, 0.0133, 0.0538])
probs[torch.arange(5), ys].log()
tensor([-4.4636, -3.7831, -3.1812, -4.3204, -2.9222])
probs[torch.arange(5), ys].log().mean()
tensor(-3.7341)
loss = - probs[torch.arange(5), ys].log().mean()
loss
tensor(3.7341)
def train():
    xenc = F.one_hot(xs, num_classes=27).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts/counts.sum(1, keepdims=True)
    loss = - probs[torch.arange(5), ys].log().mean()
    return loss
W = torch.randn((27, 27))
train()
tensor(3.2426)
Backward Pass
1st pass
W = torch.randn((27, 27), requires_grad=True)
W.grad = None # way to set to zero the gradient
loss = train()
loss.backward()
loss
tensor(4.3984, grad_fn=<NegBackward0>)
W.shape, W.grad.shape
(torch.Size([27, 27]), torch.Size([27, 27]))
W.grad[:1]
tensor([[ 0.0044, 0.0015, 0.0060, 0.0069, 0.0096, -0.1978, 0.0005, 0.0116, 0.0018, 0.0012, 0.0054, 0.0056, 0.0202, 0.0023, 0.0066, 0.0012, 0.0004, 0.0484, 0.0040, 0.0016, 0.0035, 0.0061, 0.0292, 0.0040, 0.0042, 0.0047, 0.0065]])
2nd pass
W.data += -0.1 * W.grad
W.grad = None
loss = train()
loss.backward()
loss
tensor(4.3766, grad_fn=<NegBackward0>)
3rd pass
W.data += -0.1 * W.grad
W.grad = None
loss = train()
loss.backward()
loss
tensor(4.3549, grad_fn=<NegBackward0>)
Training loop
xs, ys = generate_training_set(words)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print("Number of examples ", num)
xenc = F.one_hot(xs, num_classes=27).float()
Number of examples 228146
def train(xenc, ys, epochs, lr = 0.1):
    W = torch.randn((27, 27), requires_grad=True)
    for epoch in range(epochs):
        # forward pass
        logits = xenc @ W
        counts = logits.exp()
        probs = counts/counts.sum(1, keepdims=True)
        loss = - probs[torch.arange(ys.shape[0]), ys].log().mean()
        print('Epoch: ', epoch, 'Loss: ', loss)
        # backward pass
        W.grad = None
        loss.backward()
        W.data += - lr * W.grad
    return W
model = train(xenc, ys, 10, 1)
Epoch: 0 Loss: tensor(3.7543, grad_fn=<NegBackward0>)
Epoch: 1 Loss: tensor(3.7461, grad_fn=<NegBackward0>)
Epoch: 2 Loss: tensor(3.7380, grad_fn=<NegBackward0>)
Epoch: 3 Loss: tensor(3.7300, grad_fn=<NegBackward0>)
Epoch: 4 Loss: tensor(3.7221, grad_fn=<NegBackward0>)
Epoch: 5 Loss: tensor(3.7143, grad_fn=<NegBackward0>)
Epoch: 6 Loss: tensor(3.7066, grad_fn=<NegBackward0>)
Epoch: 7 Loss: tensor(3.6990, grad_fn=<NegBackward0>)
Epoch: 8 Loss: tensor(3.6914, grad_fn=<NegBackward0>)
Epoch: 9 Loss: tensor(3.6840, grad_fn=<NegBackward0>)
model = train(xenc, ys, 10, 10)
Epoch: 0 Loss: tensor(3.7679, grad_fn=<NegBackward0>)
Epoch: 1 Loss: tensor(3.6911, grad_fn=<NegBackward0>)
Epoch: 2 Loss: tensor(3.6209, grad_fn=<NegBackward0>)
Epoch: 3 Loss: tensor(3.5565, grad_fn=<NegBackward0>)
Epoch: 4 Loss: tensor(3.4974, grad_fn=<NegBackward0>)
Epoch: 5 Loss: tensor(3.4433, grad_fn=<NegBackward0>)
Epoch: 6 Loss: tensor(3.3937, grad_fn=<NegBackward0>)
Epoch: 7 Loss: tensor(3.3482, grad_fn=<NegBackward0>)
Epoch: 8 Loss: tensor(3.3064, grad_fn=<NegBackward0>)
Epoch: 9 Loss: tensor(3.2681, grad_fn=<NegBackward0>)
model = train(xenc, ys, 10, 100)
Epoch: 0 Loss: tensor(3.8536, grad_fn=<NegBackward0>)
Epoch: 1 Loss: tensor(3.1448, grad_fn=<NegBackward0>)
Epoch: 2 Loss: tensor(2.9057, grad_fn=<NegBackward0>)
Epoch: 3 Loss: tensor(2.7856, grad_fn=<NegBackward0>)
Epoch: 4 Loss: tensor(2.7163, grad_fn=<NegBackward0>)
Epoch: 5 Loss: tensor(2.6870, grad_fn=<NegBackward0>)
Epoch: 6 Loss: tensor(2.6442, grad_fn=<NegBackward0>)
Epoch: 7 Loss: tensor(2.6310, grad_fn=<NegBackward0>)
Epoch: 8 Loss: tensor(2.6032, grad_fn=<NegBackward0>)
Epoch: 9 Loss: tensor(2.6044, grad_fn=<NegBackward0>)
model = train(xenc, ys, 100, 10)
Epoch: 0 Loss: tensor(3.9659, grad_fn=<NegBackward0>)
Epoch: 1 Loss: tensor(3.8651, grad_fn=<NegBackward0>)
Epoch: 2 Loss: tensor(3.7738, grad_fn=<NegBackward0>)
Epoch: 3 Loss: tensor(3.6906, grad_fn=<NegBackward0>)
Epoch: 4 Loss: tensor(3.6145, grad_fn=<NegBackward0>)
Epoch: 5 Loss: tensor(3.5448, grad_fn=<NegBackward0>)
Epoch: 6 Loss: tensor(3.4810, grad_fn=<NegBackward0>)
Epoch: 7 Loss: tensor(3.4227, grad_fn=<NegBackward0>)
Epoch: 8 Loss: tensor(3.3695, grad_fn=<NegBackward0>)
Epoch: 9 Loss: tensor(3.3209, grad_fn=<NegBackward0>)
Epoch: 10 Loss: tensor(3.2766, grad_fn=<NegBackward0>)
Epoch: 11 Loss: tensor(3.2362, grad_fn=<NegBackward0>)
Epoch: 12 Loss: tensor(3.1992, grad_fn=<NegBackward0>)
Epoch: 13 Loss: tensor(3.1654, grad_fn=<NegBackward0>)
Epoch: 14 Loss: tensor(3.1343, grad_fn=<NegBackward0>)
Epoch: 15 Loss: tensor(3.1055, grad_fn=<NegBackward0>)
Epoch: 16 Loss: tensor(3.0788, grad_fn=<NegBackward0>)
Epoch: 17 Loss: tensor(3.0540, grad_fn=<NegBackward0>)
Epoch: 18 Loss: tensor(3.0307, grad_fn=<NegBackward0>)
Epoch: 19 Loss: tensor(3.0089, grad_fn=<NegBackward0>)
Epoch: 20 Loss: tensor(2.9884, grad_fn=<NegBackward0>)
Epoch: 21 Loss: tensor(2.9690, grad_fn=<NegBackward0>)
Epoch: 22 Loss: tensor(2.9507, grad_fn=<NegBackward0>)
Epoch: 23 Loss: tensor(2.9334, grad_fn=<NegBackward0>)
Epoch: 24 Loss: tensor(2.9170, grad_fn=<NegBackward0>)
Epoch: 25 Loss: tensor(2.9015, grad_fn=<NegBackward0>)
Epoch: 26 Loss: tensor(2.8867, grad_fn=<NegBackward0>)
Epoch: 27 Loss: tensor(2.8727, grad_fn=<NegBackward0>)
Epoch: 28 Loss: tensor(2.8594, grad_fn=<NegBackward0>)
Epoch: 29 Loss: tensor(2.8467, grad_fn=<NegBackward0>)
Epoch: 30 Loss: tensor(2.8347, grad_fn=<NegBackward0>)
Epoch: 31 Loss: tensor(2.8232, grad_fn=<NegBackward0>)
Epoch: 32 Loss: tensor(2.8123, grad_fn=<NegBackward0>)
Epoch: 33 Loss: tensor(2.8019, grad_fn=<NegBackward0>)
Epoch: 34 Loss: tensor(2.7920, grad_fn=<NegBackward0>)
Epoch: 35 Loss: tensor(2.7825, grad_fn=<NegBackward0>)
Epoch: 36 Loss: tensor(2.7735, grad_fn=<NegBackward0>)
Epoch: 37 Loss: tensor(2.7649, grad_fn=<NegBackward0>)
Epoch: 38 Loss: tensor(2.7567, grad_fn=<NegBackward0>)
Epoch: 39 Loss: tensor(2.7489, grad_fn=<NegBackward0>)
Epoch: 40 Loss: tensor(2.7414, grad_fn=<NegBackward0>)
Epoch: 41 Loss: tensor(2.7343, grad_fn=<NegBackward0>)
Epoch: 42 Loss: tensor(2.7274, grad_fn=<NegBackward0>)
Epoch: 43 Loss: tensor(2.7209, grad_fn=<NegBackward0>)
Epoch: 44 Loss: tensor(2.7147, grad_fn=<NegBackward0>)
Epoch: 45 Loss: tensor(2.7087, grad_fn=<NegBackward0>)
Epoch: 46 Loss: tensor(2.7030, grad_fn=<NegBackward0>)
Epoch: 47 Loss: tensor(2.6975, grad_fn=<NegBackward0>)
Epoch: 48 Loss: tensor(2.6923, grad_fn=<NegBackward0>)
Epoch: 49 Loss: tensor(2.6873, grad_fn=<NegBackward0>)
Epoch: 50 Loss: tensor(2.6824, grad_fn=<NegBackward0>)
Epoch: 51 Loss: tensor(2.6778, grad_fn=<NegBackward0>)
Epoch: 52 Loss: tensor(2.6734, grad_fn=<NegBackward0>)
Epoch: 53 Loss: tensor(2.6691, grad_fn=<NegBackward0>)
Epoch: 54 Loss: tensor(2.6650, grad_fn=<NegBackward0>)
Epoch: 55 Loss: tensor(2.6611, grad_fn=<NegBackward0>)
Epoch: 56 Loss: tensor(2.6573, grad_fn=<NegBackward0>)
Epoch: 57 Loss: tensor(2.6536, grad_fn=<NegBackward0>)
Epoch: 58 Loss: tensor(2.6501, grad_fn=<NegBackward0>)
Epoch: 59 Loss: tensor(2.6467, grad_fn=<NegBackward0>)
Epoch: 60 Loss: tensor(2.6434, grad_fn=<NegBackward0>)
Epoch: 61 Loss: tensor(2.6403, grad_fn=<NegBackward0>)
Epoch: 62 Loss: tensor(2.6372, grad_fn=<NegBackward0>)
Epoch: 63 Loss: tensor(2.6343, grad_fn=<NegBackward0>)
Epoch: 64 Loss: tensor(2.6314, grad_fn=<NegBackward0>)
Epoch: 65 Loss: tensor(2.6287, grad_fn=<NegBackward0>)
Epoch: 66 Loss: tensor(2.6260, grad_fn=<NegBackward0>)
Epoch: 67 Loss: tensor(2.6235, grad_fn=<NegBackward0>)
Epoch: 68 Loss: tensor(2.6210, grad_fn=<NegBackward0>)
Epoch: 69 Loss: tensor(2.6185, grad_fn=<NegBackward0>)
Epoch: 70 Loss: tensor(2.6162, grad_fn=<NegBackward0>)
Epoch: 71 Loss: tensor(2.6139, grad_fn=<NegBackward0>)
Epoch: 72 Loss: tensor(2.6117, grad_fn=<NegBackward0>)
Epoch: 73 Loss: tensor(2.6096, grad_fn=<NegBackward0>)
Epoch: 74 Loss: tensor(2.6075, grad_fn=<NegBackward0>)
Epoch: 75 Loss: tensor(2.6055, grad_fn=<NegBackward0>)
Epoch: 76 Loss: tensor(2.6035, grad_fn=<NegBackward0>)
Epoch: 77 Loss: tensor(2.6016, grad_fn=<NegBackward0>)
Epoch: 78 Loss: tensor(2.5998, grad_fn=<NegBackward0>)
Epoch: 79 Loss: tensor(2.5980, grad_fn=<NegBackward0>)
Epoch: 80 Loss: tensor(2.5962, grad_fn=<NegBackward0>)
Epoch: 81 Loss: tensor(2.5945, grad_fn=<NegBackward0>)
Epoch: 82 Loss: tensor(2.5928, grad_fn=<NegBackward0>)
Epoch: 83 Loss: tensor(2.5912, grad_fn=<NegBackward0>)
Epoch: 84 Loss: tensor(2.5896, grad_fn=<NegBackward0>)
Epoch: 85 Loss: tensor(2.5881, grad_fn=<NegBackward0>)
Epoch: 86 Loss: tensor(2.5866, grad_fn=<NegBackward0>)
Epoch: 87 Loss: tensor(2.5851, grad_fn=<NegBackward0>)
Epoch: 88 Loss: tensor(2.5837, grad_fn=<NegBackward0>)
Epoch: 89 Loss: tensor(2.5823, grad_fn=<NegBackward0>)
Epoch: 90 Loss: tensor(2.5809, grad_fn=<NegBackward0>)
Epoch: 91 Loss: tensor(2.5796, grad_fn=<NegBackward0>)
Epoch: 92 Loss: tensor(2.5783, grad_fn=<NegBackward0>)
Epoch: 93 Loss: tensor(2.5770, grad_fn=<NegBackward0>)
Epoch: 94 Loss: tensor(2.5757, grad_fn=<NegBackward0>)
Epoch: 95 Loss: tensor(2.5745, grad_fn=<NegBackward0>)
Epoch: 96 Loss: tensor(2.5733, grad_fn=<NegBackward0>)
Epoch: 97 Loss: tensor(2.5721, grad_fn=<NegBackward0>)
Epoch: 98 Loss: tensor(2.5710, grad_fn=<NegBackward0>)
Epoch: 99 Loss: tensor(2.5698, grad_fn=<NegBackward0>)
Prediction
def generate_names(count):
    for i in range(count):
        out = []
        ix = 0
        while True:
            xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
            logits = xenc @ model # predict log-counts
            counts = logits.exp()
            p = counts/counts.sum(1, keepdims=True)
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:
                break
        print(''.join(out))
generate_names(5)
zriwreisona.
ady.
myonaxrolin.
arravispgoikeen.
arolouliymairekorqgbwyuere.
Evaluate on Valid and Test set
from torch.utils.data import random_split
x_num = xenc.shape[0]
xenc.shape
torch.Size([228146, 27])
test_range, valid_range, train_range = random_split(range(x_num),
                                                    [0.1, 0.1, 0.8],
                                                    generator=g)
test_idx = torch.tensor(test_range)
valid_idx = torch.tensor(valid_range)
train_idx = torch.tensor(train_range)
len(train_idx), len(valid_idx), len(test_idx)
(182516, 22815, 22815)
x_train, y_train = xenc[train_idx], ys[train_idx]
x_valid, y_valid = xenc[valid_idx], ys[valid_idx]
x_test, y_test = xenc[test_idx], ys[test_idx]
x_train.shape, x_valid.shape, x_test.shape
(torch.Size([182516, 27]), torch.Size([22815, 27]), torch.Size([22815, 27]))
y_train.shape, y_valid.shape, y_test.shape
(torch.Size([182516]), torch.Size([22815]), torch.Size([22815]))
model = train(x_train, y_train, 100, 10)
Epoch: 0 Loss: tensor(3.7710, grad_fn=<NegBackward0>)
Epoch: 1 Loss: tensor(3.6776, grad_fn=<NegBackward0>)
Epoch: 2 Loss: tensor(3.5960, grad_fn=<NegBackward0>)
Epoch: 3 Loss: tensor(3.5230, grad_fn=<NegBackward0>)
Epoch: 4 Loss: tensor(3.4572, grad_fn=<NegBackward0>)
Epoch: 5 Loss: tensor(3.3980, grad_fn=<NegBackward0>)
Epoch: 6 Loss: tensor(3.3445, grad_fn=<NegBackward0>)
Epoch: 7 Loss: tensor(3.2964, grad_fn=<NegBackward0>)
Epoch: 8 Loss: tensor(3.2528, grad_fn=<NegBackward0>)
Epoch: 9 Loss: tensor(3.2134, grad_fn=<NegBackward0>)
Epoch: 10 Loss: tensor(3.1774, grad_fn=<NegBackward0>)
Epoch: 11 Loss: tensor(3.1445, grad_fn=<NegBackward0>)
Epoch: 12 Loss: tensor(3.1142, grad_fn=<NegBackward0>)
Epoch: 13 Loss: tensor(3.0862, grad_fn=<NegBackward0>)
Epoch: 14 Loss: tensor(3.0601, grad_fn=<NegBackward0>)
Epoch: 15 Loss: tensor(3.0357, grad_fn=<NegBackward0>)
Epoch: 16 Loss: tensor(3.0128, grad_fn=<NegBackward0>)
Epoch: 17 Loss: tensor(2.9913, grad_fn=<NegBackward0>)
Epoch: 18 Loss: tensor(2.9711, grad_fn=<NegBackward0>)
Epoch: 19 Loss: tensor(2.9520, grad_fn=<NegBackward0>)
Epoch: 20 Loss: tensor(2.9340, grad_fn=<NegBackward0>)
Epoch: 21 Loss: tensor(2.9170, grad_fn=<NegBackward0>)
Epoch: 22 Loss: tensor(2.9009, grad_fn=<NegBackward0>)
Epoch: 23 Loss: tensor(2.8856, grad_fn=<NegBackward0>)
Epoch: 24 Loss: tensor(2.8712, grad_fn=<NegBackward0>)
Epoch: 25 Loss: tensor(2.8575, grad_fn=<NegBackward0>)
Epoch: 26 Loss: tensor(2.8446, grad_fn=<NegBackward0>)
Epoch: 27 Loss: tensor(2.8323, grad_fn=<NegBackward0>)
Epoch: 28 Loss: tensor(2.8206, grad_fn=<NegBackward0>)
Epoch: 29 Loss: tensor(2.8096, grad_fn=<NegBackward0>)
Epoch: 30 Loss: tensor(2.7991, grad_fn=<NegBackward0>)
Epoch: 31 Loss: tensor(2.7892, grad_fn=<NegBackward0>)
Epoch: 32 Loss: tensor(2.7798, grad_fn=<NegBackward0>)
Epoch: 33 Loss: tensor(2.7708, grad_fn=<NegBackward0>)
Epoch: 34 Loss: tensor(2.7623, grad_fn=<NegBackward0>)
Epoch: 35 Loss: tensor(2.7542, grad_fn=<NegBackward0>)
Epoch: 36 Loss: tensor(2.7466, grad_fn=<NegBackward0>)
Epoch: 37 Loss: tensor(2.7392, grad_fn=<NegBackward0>)
Epoch: 38 Loss: tensor(2.7323, grad_fn=<NegBackward0>)
Epoch: 39 Loss: tensor(2.7256, grad_fn=<NegBackward0>)
Epoch: 40 Loss: tensor(2.7193, grad_fn=<NegBackward0>)
Epoch: 41 Loss: tensor(2.7132, grad_fn=<NegBackward0>)
Epoch: 42 Loss: tensor(2.7074, grad_fn=<NegBackward0>)
Epoch: 43 Loss: tensor(2.7019, grad_fn=<NegBackward0>)
Epoch: 44 Loss: tensor(2.6966, grad_fn=<NegBackward0>)
Epoch: 45 Loss: tensor(2.6915, grad_fn=<NegBackward0>)
Epoch: 46 Loss: tensor(2.6866, grad_fn=<NegBackward0>)
Epoch: 47 Loss: tensor(2.6819, grad_fn=<NegBackward0>)
Epoch: 48 Loss: tensor(2.6774, grad_fn=<NegBackward0>)
Epoch: 49 Loss: tensor(2.6731, grad_fn=<NegBackward0>)
Epoch: 50 Loss: tensor(2.6689, grad_fn=<NegBackward0>)
Epoch: 51 Loss: tensor(2.6649, grad_fn=<NegBackward0>)
Epoch: 52 Loss: tensor(2.6610, grad_fn=<NegBackward0>)
Epoch: 53 Loss: tensor(2.6572, grad_fn=<NegBackward0>)
Epoch: 54 Loss: tensor(2.6536, grad_fn=<NegBackward0>)
Epoch: 55 Loss: tensor(2.6501, grad_fn=<NegBackward0>)
Epoch: 56 Loss: tensor(2.6467, grad_fn=<NegBackward0>)
Epoch: 57 Loss: tensor(2.6434, grad_fn=<NegBackward0>)
Epoch: 58 Loss: tensor(2.6402, grad_fn=<NegBackward0>)
Epoch: 59 Loss: tensor(2.6372, grad_fn=<NegBackward0>)
Epoch: 60 Loss: tensor(2.6342, grad_fn=<NegBackward0>)
Epoch: 61 Loss: tensor(2.6313, grad_fn=<NegBackward0>)
Epoch: 62 Loss: tensor(2.6285, grad_fn=<NegBackward0>)
Epoch: 63 Loss: tensor(2.6258, grad_fn=<NegBackward0>)
Epoch: 64 Loss: tensor(2.6231, grad_fn=<NegBackward0>)
Epoch: 65 Loss: tensor(2.6206, grad_fn=<NegBackward0>)
Epoch: 66 Loss: tensor(2.6181, grad_fn=<NegBackward0>)
Epoch: 67 Loss: tensor(2.6156, grad_fn=<NegBackward0>)
Epoch: 68 Loss: tensor(2.6133, grad_fn=<NegBackward0>)
Epoch: 69 Loss: tensor(2.6110, grad_fn=<NegBackward0>)
Epoch: 70 Loss: tensor(2.6087, grad_fn=<NegBackward0>)
Epoch: 71 Loss: tensor(2.6066, grad_fn=<NegBackward0>)
Epoch: 72 Loss: tensor(2.6044, grad_fn=<NegBackward0>)
Epoch: 73 Loss: tensor(2.6024, grad_fn=<NegBackward0>)
Epoch: 74 Loss: tensor(2.6004, grad_fn=<NegBackward0>)
Epoch: 75 Loss: tensor(2.5984, grad_fn=<NegBackward0>)
Epoch: 76 Loss: tensor(2.5965, grad_fn=<NegBackward0>)
Epoch: 77 Loss: tensor(2.5946, grad_fn=<NegBackward0>)
Epoch: 78 Loss: tensor(2.5928, grad_fn=<NegBackward0>)
Epoch: 79 Loss: tensor(2.5910, grad_fn=<NegBackward0>)
Epoch: 80 Loss: tensor(2.5893, grad_fn=<NegBackward0>)
Epoch: 81 Loss: tensor(2.5876, grad_fn=<NegBackward0>)
Epoch: 82 Loss: tensor(2.5860, grad_fn=<NegBackward0>)
Epoch: 83 Loss: tensor(2.5844, grad_fn=<NegBackward0>)
Epoch: 84 Loss: tensor(2.5828, grad_fn=<NegBackward0>)
Epoch: 85 Loss: tensor(2.5812, grad_fn=<NegBackward0>)
Epoch: 86 Loss: tensor(2.5797, grad_fn=<NegBackward0>)
Epoch: 87 Loss: tensor(2.5783, grad_fn=<NegBackward0>)
Epoch: 88 Loss: tensor(2.5768, grad_fn=<NegBackward0>)
Epoch: 89 Loss: tensor(2.5754, grad_fn=<NegBackward0>)
Epoch: 90 Loss: tensor(2.5741, grad_fn=<NegBackward0>)
Epoch: 91 Loss: tensor(2.5727, grad_fn=<NegBackward0>)
Epoch: 92 Loss: tensor(2.5714, grad_fn=<NegBackward0>)
Epoch: 93 Loss: tensor(2.5701, grad_fn=<NegBackward0>)
Epoch: 94 Loss: tensor(2.5689, grad_fn=<NegBackward0>)
Epoch: 95 Loss: tensor(2.5676, grad_fn=<NegBackward0>)
Epoch: 96 Loss: tensor(2.5664, grad_fn=<NegBackward0>)
Epoch: 97 Loss: tensor(2.5652, grad_fn=<NegBackward0>)
Epoch: 98 Loss: tensor(2.5641, grad_fn=<NegBackward0>)
Epoch: 99 Loss: tensor(2.5629, grad_fn=<NegBackward0>)
Evaluate on Valid set
logits_valid = x_valid @ model
counts_valid = logits_valid.exp()
pred_valid = counts_valid/counts_valid.sum(1, keepdims=True)
- pred_valid[torch.arange(x_valid.shape[0]), y_valid].log().mean()
tensor(2.5745, grad_fn=<NegBackward0>)
Evaluate on Test set
logits_test = x_test @ model
counts_test = logits_test.exp()
pred_test = counts_test/counts_test.sum(1, keepdims=True)
- pred_test[torch.arange(x_test.shape[0]), y_test].log().mean()
tensor(2.5639, grad_fn=<NegBackward0>)
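The two evaluation cells above repeat the same forward pass; a small helper could wrap it (a sketch, with the hypothetical name nll_loss, assuming the same one-hot layout used throughout):
def nll_loss(x, y, W):
    logits = x @ W
    probs = logits.exp() / logits.exp().sum(1, keepdims=True)
    return -probs[torch.arange(x.shape[0]), y].log().mean()

nll_loss(x_valid, y_valid, model), nll_loss(x_test, y_test, model)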
Regularization (Smoothing)
Augment the loss function with a small component (the regularization loss) that pushes towards a smoother distribution of W; driving all elements of W towards 0 corresponds to a uniform probability distribution
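A small sketch of why pushing W towards 0 gives a uniform distribution: zero logits exponentiate to all ones, and normalizing all ones yields 1/27 everywhere.
zero_logits = torch.zeros(27)
zero_logits.exp() / zero_logits.exp().sum()   # every entry is 1/27 ≈ 0.0370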
(W ** 2).mean()
tensor(0.9617, grad_fn=<MeanBackward0>)
def train(xenc, ys, epochs, lr = 0.1, regularization_parameter = 0.01, print_every_epoch=False):
    W = torch.randn((27, 27), requires_grad=True)
    for epoch in range(epochs):
        # forward pass
        logits = xenc @ W
        counts = logits.exp()
        probs = counts/counts.sum(1, keepdims=True)
        loss = - probs[torch.arange(ys.shape[0]), ys].log().mean()
        regularization_loss = regularization_parameter * (W ** 2).mean()
        loss += regularization_loss
        if print_every_epoch:
            print('Epoch: ', epoch, 'Loss: ', loss)
        # backward pass
        W.grad = None
        loss.backward()
        W.data += - lr * W.grad
    print('Loss: ', loss)
    return W
model = train(x_train, y_train, 100, 10, 0.1)
Loss: tensor(2.6531, grad_fn=<AddBackward0>)
model = train(x_train, y_train, 100, 10, 1)
Loss: tensor(2.8925, grad_fn=<AddBackward0>)
model = train(x_train, y_train, 100, 10, 0.001)
Loss: tensor(2.5767, grad_fn=<AddBackward0>)
model = train(x_train, y_train, 100, 10, 0.0001)
Loss: tensor(2.5635, grad_fn=<AddBackward0>)