Wavenet Hyperparameter Tuning
# !pip install ray[tune]
# !pip install optuna
import numpy as np
import torch
import random
import os
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
device
'cuda:0'
torch.manual_seed(42)
random.seed(42)
!ls
names.txt  sample_data
Setup Data Loader
words = open('names.txt', 'r').read().splitlines()
random.shuffle(words)
def build_dataset(words, block_size=8):
    X, Y = [], []
    random.seed(42)
    random.shuffle(words)
    chars = sorted(list(set(''.join(words))))
    stoi = {s: i + 1 for i, s in enumerate(chars)}
    stoi['.'] = 0
    itos = {i: s for s, i in stoi.items()}
    vocab_size = len(itos)
    for w in words:
        context = [0] * block_size  # start each word with '.' (index 0) padding
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # rolling window: shift left, append the new character
    X = torch.tensor(X).to(device)
    Y = torch.tensor(Y).to(device)
    return X, Y
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])
Xtr.shape
torch.Size([182625, 8])
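To sanity-check the sliding-context construction, the first few training rows can be decoded back to characters. This is a minimal sketch, not part of the original notebook; it rebuilds the char/index maps at the top level because they are local to build_dataset.
# decode the first few (context, target) pairs; '.' (index 0) is both left padding and end-of-word
chars = sorted(list(set(''.join(words))))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
for x, y in zip(Xtr[:5].tolist(), Ytr[:5].tolist()):
    print(''.join(itos[i] for i in x), '-->', itos[y])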
Create Model
# --- Flatten Consecutive ---
class FlattenConsecutive(nn.Module):
    def __init__(self, n):
        super().__init__()
        self.n = n
    def forward(self, x):
        B, T, C = x.shape
        x = x.reshape(B, T//self.n, C*self.n)  # (B, T, C) -> (B, T/n, C*n): fuse n consecutive timesteps
        if x.shape[1] == 1:
            x = x.squeeze(1)  # drop the time dimension once only one position remains
        self.out = x
        return self.out
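A quick illustration of what FlattenConsecutive(2) does: it fuses every two consecutive timesteps into the channel dimension, halving the time axis. A small sketch, not from the original notebook:
x = torch.randn(4, 8, 24)                 # (batch, time, channels)
print(FlattenConsecutive(2)(x).shape)     # torch.Size([4, 4, 48])
x = torch.randn(4, 2, 256)
print(FlattenConsecutive(2)(x).shape)     # torch.Size([4, 512]) -- the singleton time dim is squeezed away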
# -- SwapDim ---
class SwapDim(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        # (B, T, C) -> (B, C, T) so BatchNorm1d sees channels in dimension 1
        return torch.transpose(x, 1, 2)
# -- SwapDimBack --
class SwapDimBack(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        # (B, C, T) -> (B, T, C); transposing dims 1 and 2 is its own inverse
        return torch.transpose(x, 1, 2)
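nn.BatchNorm1d normalizes over dimension 1, so for 3D activations it expects (batch, channels, time), while the hidden activations here are (batch, time, channels). SwapDim moves channels into position 1 before the norm and SwapDimBack restores the original layout afterwards. A minimal sketch of the round trip, not part of the original notebook:
h = torch.randn(4, 4, 128)          # (batch, time, hidden) as produced by the Linear layers
h = SwapDim()(h)                    # (4, 128, 4): channels first for BatchNorm1d
h = nn.BatchNorm1d(128)(h)          # normalizes each of the 128 channels over batch and time
h = SwapDimBack()(h)                # back to (4, 4, 128)
print(h.shape)                      # torch.Size([4, 4, 128])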
vocab_size = 27
n_embd = 24
n_hidden = 128
model = nn.Sequential(
    nn.Embedding(vocab_size, n_embd),
    FlattenConsecutive(2), nn.Linear(n_embd*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
    FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
    FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
    # nn.Linear(n_hidden, vocab_size),
).to(device)
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb, logits.shape
torch.Size([4, 8])
(tensor([[ 0, 0, 0, 0, 0, 0, 0, 1],
[ 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 16, 1, 24],
[ 0, 0, 0, 0, 0, 1, 4, 18]], device='cuda:0'),
torch.Size([4, 128]))
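The (4, 128) output shape comes from the hierarchical fusion: 8 characters become 4 fused pairs, then 2, then a single vector per example. A small diagnostic loop (not in the original notebook) makes the per-layer shapes visible:
out = Xb
for layer in model:
    out = layer(out)
    print(layer.__class__.__name__, tuple(out.shape))
# Embedding gives (4, 8, 24); after each FlattenConsecutive/Linear/BatchNorm/Tanh group
# the time axis halves: (4, 4, 128) -> (4, 2, 128) -> (4, 128)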
def build_model(n_embd,                  # the dimensionality of the character embedding vectors
                n_hidden,                # the number of neurons in the hidden layer of the MLP
                last_layer_factor=0.1    # the factor by which to reduce the weights of the last layer
                ):
    vocab_size = 27
    model = nn.Sequential(
        nn.Embedding(vocab_size, n_embd),
        FlattenConsecutive(2), nn.Linear(n_embd*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
        FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
        FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
        nn.Linear(n_hidden, vocab_size)
    ).to(device)
    # parameter init: scale down the last layer so the initial logits are small and the initial loss is near uniform
    with torch.no_grad(): model[-1].weight *= last_layer_factor
    parameters = list(model.parameters())   # materialize the generator so sum() below doesn't exhaust it
    print("No of parameters ", sum(p.nelement() for p in parameters))
    for p in parameters: p.requires_grad = True
    return model
model = build_model(24, 128)
No of parameters  76579
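The reported parameter count can be checked by hand against the layer sizes (a quick arithmetic check, not part of the original notebook):
emb  = 27 * 24                      # embedding table                =    648
lins = 48*128 + 256*128 + 256*128   # three hidden Linears (no bias) = 71,680
bns  = 3 * 2 * 128                  # gamma and beta per BatchNorm   =    768
out  = 128 * 27 + 27                # output Linear weight + bias    =  3,483
print(emb + lins + bns + out)       # 76579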
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb
torch.Size([4, 8])
tensor([[ 0, 0, 0, 0, 3, 15, 18, 20],
[ 0, 0, 0, 0, 0, 0, 7, 9],
[ 0, 0, 0, 0, 0, 12, 1, 3],
[ 0, 0, 0, 17, 21, 9, 14, 3]], device='cuda:0')
logits.shape
torch.Size([4, 27])
def train(config, checkpoint_dir=None):
    n_embd = config['n_embd']
    n_hidden = config['n_hidden']
    last_layer_factor = config['last_layer_factor']
    max_steps = config['max_steps']
    lr = config['lr']
    batch_size = config['batch_size']
    model = build_model(n_embd, n_hidden, last_layer_factor)
    train_loss = F.cross_entropy(model(Xtr), Ytr)
    print('Initial loss ', train_loss)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    lossi = []
    for i in range(max_steps):
        running_loss = 0.0
        epoch_steps = 0
        # minibatch construct
        ix = torch.randint(0, Xtr.shape[0], (batch_size,))
        Xb, Yb = Xtr[ix], Ytr[ix]
        logits = model(Xb)
        loss = F.cross_entropy(logits, Yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        # track stats
        if i % 10_000 == 0:
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        lossi.append(loss.log10().item())
    return model
config = {
    "n_embd": 24,
    "n_hidden": 128,
    "lr": 0.001,
    "last_layer_factor": 0.1,
    "batch_size": 32,
    "max_steps": 200_000
}
m = train(config)
No of parameters  76579
Initial loss tensor(3.2798, device='cuda:0', grad_fn=<NllLossBackward0>)
0/ 200000: 3.2850
10000/ 200000: 1.9604
20000/ 200000: 1.9096
30000/ 200000: 2.1808
40000/ 200000: 1.9603
50000/ 200000: 2.0830
60000/ 200000: 1.9285
70000/ 200000: 1.8355
80000/ 200000: 2.1152
90000/ 200000: 1.7333
100000/ 200000: 2.5383
110000/ 200000: 2.5408
120000/ 200000: 1.7806
130000/ 200000: 1.5074
140000/ 200000: 2.2836
150000/ 200000: 2.1666
160000/ 200000: 2.0499
170000/ 200000: 2.4158
180000/ 200000: 1.8051
190000/ 200000: 1.6264
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = m(Xb)
logits
tensor([[-0.3831, 5.9946, -0.4334, -3.1496, -1.0150, 3.1747, -3.4749, -3.2213,
0.9639, 2.5345, -2.5072, -4.6121, -2.3915, -1.1776, -1.0199, 4.6581,
-2.2405, -5.6197, 1.3689, -1.8678, -2.7871, 1.0448, -3.2367, -4.8778,
-4.8898, 1.1356, -0.6535],
[-1.8927, -1.4403, -2.6883, 2.1526, 0.8893, -1.0265, -1.4148, 0.8529,
1.6819, -2.4304, 1.2136, 3.3855, 1.9102, 1.1939, 1.9362, -3.9292,
-0.9157, -0.2079, 1.1477, 1.5550, 0.8729, -2.7374, 0.1523, 0.2212,
-0.3517, -0.8675, -0.6843],
[-1.8927, -1.4403, -2.6883, 2.1526, 0.8893, -1.0265, -1.4148, 0.8529,
1.6819, -2.4304, 1.2136, 3.3855, 1.9102, 1.1939, 1.9362, -3.9292,
-0.9157, -0.2079, 1.1477, 1.5550, 0.8729, -2.7374, 0.1523, 0.2212,
-0.3517, -0.8675, -0.6843],
[ 4.3652, 1.3847, 0.9043, -2.2867, 0.5967, 2.3756, -3.3935, -2.3518,
-3.2263, 2.7976, -4.1615, -2.9255, 0.9531, -1.9894, -1.8476, 0.9243,
-3.5244, -4.1242, -1.3296, 1.9078, 1.0783, -0.3561, -1.0896, -1.9243,
-2.5875, 0.5664, -3.5556]], device='cuda:0',
grad_fn=<AddmmBackward0>)
with torch.no_grad():
    train_loss = F.cross_entropy(m(Xtr), Ytr).item()
    val_loss = F.cross_entropy(m(Xdev), Ydev).item()
print(train_loss, val_loss)
1.86216139793396 2.0197317600250244
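Both ray[tune] and optuna are installed at the top but never actually used in the cells above, even though train() already takes a config dict. Below is a minimal, hypothetical Optuna sketch of how a search could be put around it; the search ranges and n_trials are illustrative assumptions, and max_steps is reduced so individual trials stay cheap:
import optuna

def objective(trial):
    config = {
        "n_embd": trial.suggest_categorical("n_embd", [16, 24, 32]),
        "n_hidden": trial.suggest_categorical("n_hidden", [64, 128, 256]),
        "lr": trial.suggest_float("lr", 1e-4, 1e-2, log=True),
        "last_layer_factor": trial.suggest_float("last_layer_factor", 0.01, 1.0, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64]),
        "max_steps": 20_000,                 # shorter runs for the search
    }
    model = train(config)                    # reuse the train() defined above
    with torch.no_grad():
        return F.cross_entropy(model(Xdev), Ydev).item()   # minimize validation loss

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
print(study.best_params, study.best_value)
The same train() could also be handed to Ray Tune in recent Ray versions (its config-dict signature already matches Tune's function-trainable convention), with Optuna plugged in as the search algorithm.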