Wavenet Hyperparameter Tuning
# !pip install ray[tune]
# !pip install optuna
import numpy as np
import torch
import random
import os
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
= "cpu"
device if torch.cuda.is_available():
= "cuda:0" device
device
'cuda:0'
torch.manual_seed(42);
random.seed(42)
!ls
names.txt sample_data
Setup Data Loader
words = open('names.txt', 'r').read().splitlines()
random.shuffle(words)
def build_dataset(words, block_size=8):
    X, Y = [], []

    random.seed(42)
    random.shuffle(words)
    chars = sorted(list(set(''.join(words))))
    stoi = {s: i + 1 for i, s in enumerate(chars)}
    stoi['.'] = 0
    itos = {i: s for s, i in stoi.items()}
    vocab_size = len(itos)

    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # slide the context window forward by one character

    X = torch.tensor(X).to(device)
    Y = torch.tensor(Y).to(device)
    return X, Y
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])
Xtr.shape
torch.Size([182625, 8])
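As a quick sanity check on how build_dataset rolls the context window, here is a minimal standalone sketch for a single hypothetical word ('emma') with a toy block size of 3. The stoi_toy mapping below is a hand-written stand-in (it matches the '.'=0, a=1..z=26 convention for these letters, but is not the full table built from names.txt):

# minimal sketch: how one word becomes (context, target) pairs (toy stoi, block_size=3)
stoi_toy = {'.': 0, 'a': 1, 'e': 5, 'm': 13}
context = [0] * 3
for ch in 'emma' + '.':
    ix = stoi_toy[ch]
    print(context, '-->', ix)        # e.g. [0, 0, 0] --> 5, then [0, 0, 5] --> 13, ...
    context = context[1:] + [ix]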
Create Model
# --- Flatten Consecutive ---
class FlattenConsecutive(nn.Module):
    def __init__(self, n):
        super().__init__()
        self.n = n
    def forward(self, x):
        B, T, C = x.shape
        x = x.reshape(B, T // self.n, C * self.n)  # merge n consecutive time steps into the channel dim
        if x.shape[1] == 1:
            x = x.squeeze(1)                       # drop the time dimension once it collapses to 1
        self.out = x
        return self.out

# --- SwapDim ---
class SwapDim(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return torch.transpose(x, 1, 2)

# --- SwapDimBack ---
class SwapDimBack(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return torch.transpose(x, 1, 2)
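SwapDim and SwapDimBack are the same transpose (transpose(1, 2) is its own inverse): activations flow through the network as (B, T, C), but nn.BatchNorm1d expects channels in dimension 1 for 3-D input, so the channels are moved to dim 1 before the norm and moved back afterwards. A quick shape trace, assuming a batch of 4 and the n_embd=24, n_hidden=128 sizes used below:

# shape trace for one forward pass (assumes B=4, block_size=8, n_embd=24, n_hidden=128)
# Embedding:             (4, 8, 24)
# FlattenConsecutive(2): (4, 4, 48)  -> Linear -> (4, 4, 128); SwapDim/SwapDimBack wrap BatchNorm1d
# FlattenConsecutive(2): (4, 2, 256) -> Linear -> (4, 2, 128)
# FlattenConsecutive(2): (4, 1, 256) -> squeezed to (4, 256) -> Linear -> (4, 128); 2-D, so no swap needed
x = torch.randn(4, 8, 24)
print(FlattenConsecutive(2)(x).shape)   # torch.Size([4, 4, 48])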
vocab_size = 27
n_embd = 24
n_hidden = 128

model = nn.Sequential(
    nn.Embedding(vocab_size, n_embd),
    FlattenConsecutive(2), nn.Linear(n_embd*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
    FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
    FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
    # nn.Linear(n_hidden, vocab_size),
).to(device)
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb, logits.shape
torch.Size([4, 8])
(tensor([[ 0, 0, 0, 0, 0, 0, 0, 1],
[ 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 16, 1, 24],
[ 0, 0, 0, 0, 0, 1, 4, 18]], device='cuda:0'),
torch.Size([4, 128]))
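The "logits" here are really the 128-dimensional tanh activations of the last hidden block, because the nn.Linear(n_hidden, vocab_size) head is still commented out in this first version; once build_model below attaches the head, the output shape becomes (4, 27).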
def build_model(n_embd,                 # the dimensionality of the character embedding vectors
                n_hidden,               # the number of neurons in the hidden layers of the MLP
                last_layer_factor=0.1,  # the factor by which to reduce the weights of the last layer
               ):
    vocab_size = 27
    model = nn.Sequential(
        nn.Embedding(vocab_size, n_embd),
        FlattenConsecutive(2), nn.Linear(n_embd*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
        FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
        FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
        nn.Linear(n_hidden, vocab_size)
    ).to(device)

    # parameter init: scale down the last layer so the initial predictions are less confident
    with torch.no_grad():
        model[-1].weight *= last_layer_factor

    parameters = list(model.parameters())  # materialize the generator so it can be iterated twice
    print("No of parameters ", sum(p.nelement() for p in parameters))
    for p in parameters:
        p.requires_grad = True

    return model

model = build_model(24, 128)
No of parameters 76579
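The printed count can be verified by hand from the layer sizes (n_embd=24, n_hidden=128, vocab_size=27); BatchNorm1d contributes only its weight and bias, since the running statistics are buffers:

# Embedding             27 * 24            =    648
# Linear(48  -> 128)    48 * 128           =  6,144   + BatchNorm1d(128)  2 * 128 = 256
# Linear(256 -> 128)    256 * 128          = 32,768   + BatchNorm1d(128)  2 * 128 = 256
# Linear(256 -> 128)    256 * 128          = 32,768   + BatchNorm1d(128)  2 * 128 = 256
# Linear(128 -> 27)     128 * 27 + 27 bias =  3,483
# total                                    = 76,579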
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb
torch.Size([4, 8])
tensor([[ 0, 0, 0, 0, 3, 15, 18, 20],
[ 0, 0, 0, 0, 0, 0, 7, 9],
[ 0, 0, 0, 0, 0, 12, 1, 3],
[ 0, 0, 0, 17, 21, 9, 14, 3]], device='cuda:0')
logits.shape
torch.Size([4, 27])
def train(config, checkpoint_dir=None):
    n_embd = config['n_embd']
    n_hidden = config['n_hidden']
    last_layer_factor = config['last_layer_factor']
    max_steps = config['max_steps']
    lr = config['lr']
    batch_size = config['batch_size']

    model = build_model(n_embd, n_hidden, last_layer_factor)

    train_loss = F.cross_entropy(model(Xtr), Ytr)
    print('Initial loss ', train_loss)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    lossi = []

    for i in range(max_steps):
        # minibatch construct
        ix = torch.randint(0, Xtr.shape[0], (batch_size,))
        Xb, Yb = Xtr[ix], Ytr[ix]

        logits = model(Xb)
        loss = F.cross_entropy(logits, Yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # track stats
        if i % 10_000 == 0:
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        lossi.append(loss.log10().item())

    return model
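The (config, checkpoint_dir) signature matches Ray Tune's classic function-based trainable, which is presumably why ray[tune] is installed at the top. A minimal sketch of how this could be wired up, assuming that classic API; the wrapper train_tune, the search-space values, and num_samples are illustrative choices, not part of the notebook:

# minimal sketch, assuming Ray Tune's classic function API
from ray import tune

def train_tune(config, checkpoint_dir=None):
    model = train(config)
    with torch.no_grad():
        val_loss = F.cross_entropy(model(Xdev), Ydev).item()
    tune.report(val_loss=val_loss)              # report the metric Tune should optimize

analysis = tune.run(
    train_tune,
    config={"n_embd": tune.choice([16, 24, 32]),
            "n_hidden": tune.choice([64, 128, 256]),
            "lr": tune.loguniform(1e-4, 1e-2),
            "last_layer_factor": 0.1,
            "batch_size": 32,
            "max_steps": 10_000},               # fewer steps per trial to keep the search cheap
    metric="val_loss", mode="min", num_samples=10)
print(analysis.best_config)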
config = {
    "n_embd": 24,
    "n_hidden": 128,
    "lr": 0.001,
    "last_layer_factor": 0.1,
    "batch_size": 32,
    "max_steps": 200_000
}
m = train(config)
No of parameters 76579
Initial loss tensor(3.2798, device='cuda:0', grad_fn=<NllLossBackward0>)
0/ 200000: 3.2850
10000/ 200000: 1.9604
20000/ 200000: 1.9096
30000/ 200000: 2.1808
40000/ 200000: 1.9603
50000/ 200000: 2.0830
60000/ 200000: 1.9285
70000/ 200000: 1.8355
80000/ 200000: 2.1152
90000/ 200000: 1.7333
100000/ 200000: 2.5383
110000/ 200000: 2.5408
120000/ 200000: 1.7806
130000/ 200000: 1.5074
140000/ 200000: 2.2836
150000/ 200000: 2.1666
160000/ 200000: 2.0499
170000/ 200000: 2.4158
180000/ 200000: 1.8051
190000/ 200000: 1.6264
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = m(Xb)
logits
tensor([[-0.3831, 5.9946, -0.4334, -3.1496, -1.0150, 3.1747, -3.4749, -3.2213,
0.9639, 2.5345, -2.5072, -4.6121, -2.3915, -1.1776, -1.0199, 4.6581,
-2.2405, -5.6197, 1.3689, -1.8678, -2.7871, 1.0448, -3.2367, -4.8778,
-4.8898, 1.1356, -0.6535],
[-1.8927, -1.4403, -2.6883, 2.1526, 0.8893, -1.0265, -1.4148, 0.8529,
1.6819, -2.4304, 1.2136, 3.3855, 1.9102, 1.1939, 1.9362, -3.9292,
-0.9157, -0.2079, 1.1477, 1.5550, 0.8729, -2.7374, 0.1523, 0.2212,
-0.3517, -0.8675, -0.6843],
[-1.8927, -1.4403, -2.6883, 2.1526, 0.8893, -1.0265, -1.4148, 0.8529,
1.6819, -2.4304, 1.2136, 3.3855, 1.9102, 1.1939, 1.9362, -3.9292,
-0.9157, -0.2079, 1.1477, 1.5550, 0.8729, -2.7374, 0.1523, 0.2212,
-0.3517, -0.8675, -0.6843],
[ 4.3652, 1.3847, 0.9043, -2.2867, 0.5967, 2.3756, -3.3935, -2.3518,
-3.2263, 2.7976, -4.1615, -2.9255, 0.9531, -1.9894, -1.8476, 0.9243,
-3.5244, -4.1242, -1.3296, 1.9078, 1.0783, -0.3561, -1.0896, -1.9243,
-2.5875, 0.5664, -3.5556]], device='cuda:0',
grad_fn=<AddmmBackward0>)
with torch.no_grad():
    train_loss = F.cross_entropy(m(Xtr), Ytr).item()
    val_loss = F.cross_entropy(m(Xdev), Ydev).item()
print(train_loss, val_loss)
1.86216139793396 2.0197317600250244
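The notebook also installs optuna at the top; a corresponding sketch using Optuna's study/objective API, reusing the same train() function. The suggested ranges, the trial count, and the shortened max_steps are illustrative assumptions:

# minimal sketch, assuming Optuna's create_study / optimize API
import optuna

def objective(trial):
    config = {
        "n_embd": trial.suggest_categorical("n_embd", [16, 24, 32]),
        "n_hidden": trial.suggest_categorical("n_hidden", [64, 128, 256]),
        "lr": trial.suggest_float("lr", 1e-4, 1e-2, log=True),
        "last_layer_factor": trial.suggest_float("last_layer_factor", 0.01, 1.0, log=True),
        "batch_size": 32,
        "max_steps": 10_000,                     # fewer steps per trial to keep the search cheap
    }
    model = train(config)
    with torch.no_grad():
        return F.cross_entropy(model(Xdev), Ydev).item()   # validation loss to minimize

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
print(study.best_params, study.best_value)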