WaveNet Hyperparameter Tuning

# !pip install ray[tune]
# !pip install optuna
import numpy as np
import torch

import random
import os
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
device
'cuda:0'
torch.manual_seed(42);
random.seed(42)
!ls
names.txt  sample_data

Set Up the Dataset

words = open('names.txt', 'r').read().splitlines()
random.shuffle(words)
def build_dataset(words, block_size=8):
    # Build (context, target) pairs: each character is predicted from the
    # block_size characters that precede it.
    X, Y = [], []

    random.seed(42)
    random.shuffle(words)  # reproducible shuffle within this split

    # Vocabulary is derived from the split that was passed in; index 0 ('.') marks
    # word boundaries. Each split here is large enough to contain all 26 letters,
    # so the mapping stays consistent across splits.
    chars = sorted(list(set(''.join(words))))
    stoi = {s: i + 1 for i, s in enumerate(chars)}
    stoi['.'] = 0
    itos = {i: s for s, i in stoi.items()}
    vocab_size = len(itos)

    for w in words:
        context = [0] * block_size  # start every word with an all-'.' context
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # slide the context window by one character

    X = torch.tensor(X).to(device)
    Y = torch.tensor(Y).to(device)
    return X, Y

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))  # 80/10/10 train/dev/test split

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])
Xtr.shape
torch.Size([182625, 8])
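
As a quick illustrative check (not part of the original run), one training row can be decoded back into characters to show the rolling context window; stoi/itos are rebuilt here because build_dataset keeps them local.

# Illustrative: decode one (context, target) pair back into characters.
# The vocabulary is rebuilt from the full word list, which matches the split-level
# mapping in practice since every letter occurs in each split.
chars = sorted(list(set(''.join(words))))
itos = {i + 1: s for i, s in enumerate(chars)}
itos[0] = '.'
print(''.join(itos[i.item()] for i in Xtr[5]), '-->', itos[Ytr[5].item()])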

Create Model

# --- FlattenConsecutive ---
class FlattenConsecutive(nn.Module):
    """Merge every n consecutive time steps into the channel dimension:
    (B, T, C) -> (B, T//n, C*n), squeezing the time dimension once it reaches 1."""
    def __init__(self, n):
        super().__init__()
        self.n = n

    def forward(self, x):
        B, T, C = x.shape
        x = x.reshape(B, T // self.n, C * self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out

# --- SwapDim ---
class SwapDim(nn.Module):
    """Transpose (B, T, C) -> (B, C, T) so BatchNorm1d normalizes over the channels."""
    def forward(self, x):
        return torch.transpose(x, 1, 2)

# --- SwapDimBack ---
class SwapDimBack(nn.Module):
    """Transpose (B, C, T) back to (B, T, C) after BatchNorm1d."""
    def forward(self, x):
        return torch.transpose(x, 1, 2)
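
SwapDim and SwapDimBack both transpose dimensions 1 and 2; they exist because BatchNorm1d normalizes over dimension 1, so a (B, T, C) activation has to be moved to (B, C, T) and back. A small illustrative check (not in the original notebook):

# Illustrative: (B, T, C) -> (B, C, T) for BatchNorm1d, then back to (B, T, C).
x = torch.randn(4, 4, 16, device=device)
y = SwapDimBack()(nn.BatchNorm1d(16).to(device)(SwapDim()(x)))
print(y.shape)  # torch.Size([4, 4, 16])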
vocab_size = 27
n_embd = 24
n_hidden = 128
# Headless model (output layer commented out), used only to inspect the hidden representation.
model = nn.Sequential(
    nn.Embedding(vocab_size, n_embd),
    FlattenConsecutive(2), nn.Linear(n_embd*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
    FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
    FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
    # nn.Linear(n_hidden, vocab_size),  # output head is added in build_model below
).to(device)
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb, logits.shape
torch.Size([4, 8])
(tensor([[ 0,  0,  0,  0,  0,  0,  0,  1],
         [ 0,  0,  0,  0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0, 16,  1, 24],
         [ 0,  0,  0,  0,  0,  1,  4, 18]], device='cuda:0'),
 torch.Size([4, 128]))
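
An illustrative trace (not part of the original run) of how the batch's shape changes through each layer of this headless model:

# Illustrative: run the batch through layer by layer and print the resulting shapes.
x = Xb
for layer in model:
    x = layer(x)
    print(layer.__class__.__name__, tuple(x.shape))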
def build_model(n_embd,  # the dimensionality of the character embedding vectors
                n_hidden,  # the number of neurons in each hidden layer
                last_layer_factor=0.1  # factor by which to scale down the last layer's weights
               ):
    vocab_size = 27
    model = nn.Sequential(
        nn.Embedding(vocab_size, n_embd),
        FlattenConsecutive(2), nn.Linear(n_embd*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
        FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), SwapDim(), nn.BatchNorm1d(n_hidden), SwapDimBack(), nn.Tanh(),
        FlattenConsecutive(2), nn.Linear(n_hidden*2, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
        nn.Linear(n_hidden, vocab_size)
    ).to(device)

    # parameter init: scale down the output layer so initial predictions are close to uniform
    with torch.no_grad():
        model[-1].weight *= last_layer_factor

    parameters = list(model.parameters())  # materialize so the generator isn't exhausted by sum()
    print("No of parameters ", sum(p.nelement() for p in parameters))
    for p in parameters:
        p.requires_grad = True
    return model
model = build_model(24, 128)
No of parameters  76579
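
The printed total can be reproduced by hand (an illustrative breakdown, grounded in the layer sizes above):

# Illustrative parameter-count breakdown for n_embd=24, n_hidden=128:
# embedding 27*24, linear1 48*128, linear2/3 256*128 each, 2*128 per BatchNorm, head 128*27 + 27
print(27*24 + 48*128 + 2*128 + 256*128 + 2*128 + 256*128 + 2*128 + (128*27 + 27))  # 76579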
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb
torch.Size([4, 8])
tensor([[ 0,  0,  0,  0,  3, 15, 18, 20],
        [ 0,  0,  0,  0,  0,  0,  7,  9],
        [ 0,  0,  0,  0,  0, 12,  1,  3],
        [ 0,  0,  0, 17, 21,  9, 14,  3]], device='cuda:0')
logits.shape
torch.Size([4, 27])
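
Because build_model scales the output layer's weights by last_layer_factor, the untrained logits are close to uniform, so the initial cross-entropy should sit near -ln(1/27) ≈ 3.30 (an illustrative check, not in the original run):

# Illustrative: untrained loss on this minibatch should be close to -ln(1/27) ≈ 3.2958.
F.cross_entropy(logits, Yb).item()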
def train(config, checkpoint_dir=None):  # checkpoint_dir follows Ray Tune's function-trainable signature; unused here

    n_embd = config['n_embd']
    n_hidden = config['n_hidden']
    last_layer_factor = config['last_layer_factor']
    max_steps = config['max_steps']
    lr = config['lr']
    batch_size = config['batch_size']

    model = build_model(n_embd, n_hidden, last_layer_factor)

    train_loss = F.cross_entropy(model(Xtr), Ytr)
    print('Initial loss ', train_loss)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    lossi = []  # per-step log10 losses, kept for later inspection

    for i in range(max_steps):
        # minibatch construct
        ix = torch.randint(0, Xtr.shape[0], (batch_size,))
        Xb, Yb = Xtr[ix], Ytr[ix]

        # forward / backward / update
        logits = model(Xb)
        loss = F.cross_entropy(logits, Yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # track stats
        if i % 10_000 == 0:
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        lossi.append(loss.log10().item())

    return model
config = {
    "n_embd": 24,
    "n_hidden": 128,
    "lr": 0.001,
    "last_layer_factor": 0.1,
    "batch_size": 32,
    "max_steps": 200_000,
}
m = train(config)
No of parameters  76579
Initial loss  tensor(3.2798, device='cuda:0', grad_fn=<NllLossBackward0>)
      0/ 200000: 3.2850
  10000/ 200000: 1.9604
  20000/ 200000: 1.9096
  30000/ 200000: 2.1808
  40000/ 200000: 1.9603
  50000/ 200000: 2.0830
  60000/ 200000: 1.9285
  70000/ 200000: 1.8355
  80000/ 200000: 2.1152
  90000/ 200000: 1.7333
 100000/ 200000: 2.5383
 110000/ 200000: 2.5408
 120000/ 200000: 1.7806
 130000/ 200000: 1.5074
 140000/ 200000: 2.2836
 150000/ 200000: 2.1666
 160000/ 200000: 2.0499
 170000/ 200000: 2.4158
 180000/ 200000: 1.8051
 190000/ 200000: 1.6264
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = m(Xb)
logits
tensor([[-0.3831,  5.9946, -0.4334, -3.1496, -1.0150,  3.1747, -3.4749, -3.2213,
          0.9639,  2.5345, -2.5072, -4.6121, -2.3915, -1.1776, -1.0199,  4.6581,
         -2.2405, -5.6197,  1.3689, -1.8678, -2.7871,  1.0448, -3.2367, -4.8778,
         -4.8898,  1.1356, -0.6535],
        [-1.8927, -1.4403, -2.6883,  2.1526,  0.8893, -1.0265, -1.4148,  0.8529,
          1.6819, -2.4304,  1.2136,  3.3855,  1.9102,  1.1939,  1.9362, -3.9292,
         -0.9157, -0.2079,  1.1477,  1.5550,  0.8729, -2.7374,  0.1523,  0.2212,
         -0.3517, -0.8675, -0.6843],
        [-1.8927, -1.4403, -2.6883,  2.1526,  0.8893, -1.0265, -1.4148,  0.8529,
          1.6819, -2.4304,  1.2136,  3.3855,  1.9102,  1.1939,  1.9362, -3.9292,
         -0.9157, -0.2079,  1.1477,  1.5550,  0.8729, -2.7374,  0.1523,  0.2212,
         -0.3517, -0.8675, -0.6843],
        [ 4.3652,  1.3847,  0.9043, -2.2867,  0.5967,  2.3756, -3.3935, -2.3518,
         -3.2263,  2.7976, -4.1615, -2.9255,  0.9531, -1.9894, -1.8476,  0.9243,
         -3.5244, -4.1242, -1.3296,  1.9078,  1.0783, -0.3561, -1.0896, -1.9243,
         -2.5875,  0.5664, -3.5556]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
with torch.no_grad():
    train_loss = F.cross_entropy(m(Xtr), Ytr).item() 
    val_loss = F.cross_entropy(m(Xdev), Ydev).item()
    print(train_loss, val_loss)
1.86216139793396 2.0197317600250244
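
The losses above are computed with the model still in training mode, so BatchNorm uses batch statistics. An illustrative eval-mode check (values will differ slightly from those printed above):

# Illustrative: evaluate with BatchNorm's running statistics instead of batch statistics.
m.eval()
with torch.no_grad():
    print(F.cross_entropy(m(Xtr), Ytr).item(), F.cross_entropy(m(Xdev), Ydev).item())
m.train()

Neither Ray Tune nor Optuna is wired in yet, even though both are installed at the top. As a minimal sketch of where this is heading (assuming Optuna's standard study/trial API; the search ranges and trial count below are illustrative assumptions, not tuned values), the train function could be driven once per trial like this:

# Minimal Optuna sketch (illustrative; ranges and trial count are assumptions).
import optuna

def objective(trial):
    config = {
        "n_embd": trial.suggest_categorical("n_embd", [16, 24, 32]),
        "n_hidden": trial.suggest_categorical("n_hidden", [64, 128, 256]),
        "lr": trial.suggest_float("lr", 1e-4, 1e-2, log=True),
        "last_layer_factor": trial.suggest_float("last_layer_factor", 0.01, 1.0, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
        "max_steps": 20_000,  # shorter runs while searching
    }
    model = train(config)
    model.eval()
    with torch.no_grad():
        return F.cross_entropy(model(Xdev), Ydev).item()

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
print(study.best_params, study.best_value)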