Wavenet Hyperparameter Tuning
# !pip install ray[tune]
# !pip install optuna
import numpy as np
import torch
import random
import os
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import ray
from ray import tune
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.schedulers.pb2 import PB2
from ray.air import session
from ray.tune.search.optuna import OptunaSearch
from ray.air import Result
= "cpu"
device if torch.cuda.is_available():
= "cuda:0" device
device
'cuda:0'
torch.manual_seed(42);
random.seed(42)
Setup Data Loader
words = open('names.txt', 'r').read().splitlines()
random.shuffle(words)
def build_dataset(words, block_size=8):
    X, Y = [], []

    random.seed(42)
    random.shuffle(words)
    chars = sorted(list(set(''.join(words))))
    stoi = {s: i + 1 for i, s in enumerate(chars)}
    stoi['.'] = 0
    itos = {i: s for s, i in stoi.items()}
    vocab_size = len(itos)

    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]

    X = torch.tensor(X).to(device)
    Y = torch.tensor(Y).to(device)
    return X, Y
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])
Xtr.shape
torch.Size([182625, 8])
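As a quick sanity check (a sketch; the stoi/itos mapping is local to build_dataset above, so it is rebuilt here), the first few training examples can be decoded back to characters:

chars = sorted(list(set(''.join(words))))
itos = {i + 1: s for i, s in enumerate(chars)}  # rebuild the index-to-character map
itos[0] = '.'
for x, y in zip(Xtr[:5].tolist(), Ytr[:5].tolist()):
    print(''.join(itos[i] for i in x), '-->', itos[y])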
Create Model
# --- Tanh layer ---
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x).to(device)
        return self.out
    def parameters(self):
        return []
# --- Linear layer ---
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = (torch.randn((fan_in, fan_out)) / fan_in ** 0.5).to(device)  # note: kaiming init
        self.bias = torch.zeros(fan_out).to(device) if bias else None
    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])
# --- Embedding layer ---
class Embedding:
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim)).to(device)
    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out
    def parameters(self):
        return [self.weight]
# --- Sequential ---
class Sequential:
    def __init__(self, layers):
        self.layers = layers
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
    def parameters(self):
        # get parameters of all layers and stretch them out into one list
        return [p for layer in self.layers for p in layer.parameters()]
# --- BatchNorm1d ---
class BatchNorm1d:
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim).to(device)
        self.beta = torch.zeros(dim).to(device)
        # buffers (trained with a running `momentum update`)
        self.running_mean = torch.zeros(dim).to(device)
        self.running_var = torch.ones(dim).to(device)
    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0, 1)
            xmean = x.mean(dim, keepdim=True)
            xvar = x.var(dim, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out
    def parameters(self):
        return [self.gamma, self.beta]
# --- FlattenConsecutive ---
class FlattenConsecutive:
    def __init__(self, n):
        self.n = n
    def __call__(self, x):
        B, T, C = x.shape
        x = x.view(B, T // self.n, C * self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out
    def parameters(self):
        return []
def build_model(n_embd,    # the dimensionality of the character embedding vectors
                n_hidden,  # the number of neurons in the hidden layer of the MLP
                last_layer_factor=0.1  # the factor by which to reduce the weights of the last layer
                ):
    vocab_size = 27
    model = Sequential([
        Embedding(vocab_size, n_embd),
        FlattenConsecutive(2), Linear(n_embd*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
        FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
        FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
        Linear(n_hidden, vocab_size),
    ])

    # parameter init
    with torch.no_grad():
        model.layers[-1].weight *= last_layer_factor

    parameters = model.parameters()
    print("No of parameters ", sum(p.nelement() for p in parameters))
    for p in parameters:
        p.requires_grad = True
    return model
model = build_model(24, 128)
No of parameters  76579
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb
torch.Size([4, 8])
tensor([[ 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 6],
[ 0, 0, 0, 0, 0, 0, 0, 12],
[ 0, 0, 0, 2, 18, 9, 5, 12]])
logits
tensor([[ 0.0273, -0.0539, 0.0103, 0.0468, 0.0906, -0.0791, 0.0134, -0.0599,
-0.0407, -0.0617, 0.0134, -0.0230, -0.0095, 0.0769, 0.0231, 0.0755,
-0.0331, -0.0292, 0.0256, -0.1219, 0.0147, -0.0233, 0.0642, -0.0846,
-0.1325, 0.0057, 0.1157],
[ 0.0004, 0.0353, -0.0831, 0.0284, -0.0333, 0.0517, -0.0613, -0.0264,
0.0924, -0.0422, -0.0844, -0.0041, 0.0009, 0.0050, 0.0096, 0.0046,
0.0754, -0.0220, -0.0776, 0.0863, -0.0258, 0.0097, 0.1024, 0.0146,
0.1084, -0.0321, -0.0900],
[-0.0469, 0.0274, 0.0240, 0.0247, -0.0269, -0.0069, 0.1197, -0.0066,
-0.1216, 0.0030, -0.0689, -0.0530, 0.0465, -0.0710, -0.0207, -0.0524,
-0.0485, 0.0697, 0.0468, 0.0067, -0.0114, 0.0317, -0.0701, 0.0484,
0.1649, 0.0206, -0.0751],
[ 0.0238, -0.0383, -0.0129, -0.1187, -0.0069, -0.0030, -0.0763, 0.0728,
0.0896, 0.1063, 0.1267, 0.0320, -0.0092, -0.0483, -0.0477, -0.0461,
0.0270, 0.0346, 0.0168, 0.0283, 0.0219, -0.0034, -0.1050, 0.0165,
-0.1496, -0.0124, 0.0358]], device='cuda:0', grad_fn=<AddBackward0>)
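As a quick follow-up (a sketch that relies on the .out attribute each layer above caches during its forward pass), printing every layer's output shape shows how each FlattenConsecutive step fuses pairs of consecutive positions:

for layer in model.layers:
    # each layer stores its last output in .out inside __call__
    print(layer.__class__.__name__, ':', tuple(layer.out.shape))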
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

for i, ix in enumerate(batch(range(10))):
    print(i, ix)
0 range(0, 1)
1 range(1, 2)
2 range(2, 3)
3 range(3, 4)
4 range(4, 5)
5 range(5, 6)
6 range(6, 7)
7 range(7, 8)
8 range(8, 9)
9 range(9, 10)
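The batch helper is not used again below; one hypothetical use (a sketch under that assumption, with split_loss_chunked being an illustrative name) is evaluating a whole split in chunks so the forward pass never has to fit the full dataset in GPU memory at once:

def split_loss_chunked(model, X, Y, chunk=8192):
    # assumes the BatchNorm layers have already been put into eval mode
    total = 0.0
    with torch.no_grad():
        for ixs in batch(range(X.shape[0]), chunk):
            idx = torch.tensor(list(ixs), device=X.device)
            total += F.cross_entropy(model(X[idx]), Y[idx], reduction='sum').item()
    return total / X.shape[0]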
# same optimization as last time
def train_no_tune(config, checkpoint_dir=None):
    n_embd = config['n_embd']
    n_hidden = config['n_hidden']
    last_layer_factor = config['last_layer_factor']
    max_steps = config['max_steps']
    lr = config['lr']
    batch_size = config['batch_size']

    model = build_model(n_embd, n_hidden, last_layer_factor)

    train_loss = F.cross_entropy(model(Xtr), Ytr)
    print('Initial loss ', train_loss)
    lossi = []

    for i in range(max_steps):
        running_loss = 0.0
        epoch_steps = 0
        # minibatch construct
        ix = torch.randint(0, Xtr.shape[0], (batch_size,))
        Xb, Yb = Xtr[ix], Ytr[ix]

        # forward pass
        logits = model(Xb)
        loss = F.cross_entropy(logits, Yb)

        # backward pass
        for p in model.parameters():
            p.grad = None
        loss.backward()

        # update: simple SGD
        # lr = 0.1 if i < 150_000 else 0.01  # step learning rate decay
        for p in model.parameters():
            p.data += -lr * p.grad

        # track stats
        if i % 10_000 == 0:
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        lossi.append(loss.log10().item())

    # print(model)
    # print(train_loss.item(), val_loss.item())
    return model
config = {
    "n_embd": 24,
    "n_hidden": 128,
    "lr": 0.1,
    "last_layer_factor": 0.1,
    "batch_size": 32,
    "max_steps": 150_000
}
m = train_no_tune(config)
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = m(Xb)
logits
tensor([[ 7.3465, 3.0281, -0.8438, -0.6315, 0.1254, 2.3173, -5.4998, -3.1930,
1.5278, 1.3920, -0.8801, 0.2029, 0.6476, 1.1801, 3.3868, -0.4111,
-3.7620, -2.9908, 2.5343, 2.5573, -1.7074, 1.2317, -2.6614, -0.9130,
-1.6869, -1.5149, -0.9145],
[ 1.1573, 6.1267, -1.6542, -0.7126, -0.0082, 7.1435, -3.1067, -2.7086,
-0.5152, 5.9013, -1.5665, -2.7343, -0.8459, -2.8573, 0.9811, 5.7301,
-2.6570, -4.4699, 2.2010, -1.5982, -1.9610, 2.9759, 0.5002, -2.5367,
-3.4369, 5.2291, -4.5067],
[ 7.1585, 2.8464, 0.2289, -0.9455, 1.5724, 0.8947, -1.6819, 8.0458,
-3.1600, -1.8369, -0.4268, 0.1184, -0.3447, -0.5721, 2.6477, 0.2432,
-2.7675, -2.0082, 1.5645, 1.6701, -0.3294, -2.4148, -3.6991, -3.9939,
-2.7473, 0.7778, -1.0990],
[-0.6448, 5.7704, -0.8307, 0.5714, -1.6713, 5.6905, -2.8324, -4.6171,
1.4003, 7.0470, -1.9388, -1.7307, -1.1796, -0.9947, -1.5239, 2.1725,
-1.7857, -2.5253, 1.9881, 1.3517, -0.3367, -1.3917, -1.4497, -2.4642,
-1.0517, 4.1390, -1.2129]], device='cuda:0', grad_fn=<AddBackward0>)
for layer in m.layers: layer.training = False
train_loss = F.cross_entropy(m(Xtr), Ytr).item()
val_loss = F.cross_entropy(m(Xdev), Ydev).item()
train_loss, val_loss
(1.858979344367981, 2.039809226989746)
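With the hand-picked configuration the model reaches a validation loss of about 2.04; this is the baseline the Ray Tune search below tries to improve.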
Tuning
def train(config):
    n_embd = config['n_embd']
    n_hidden = config['n_hidden']
    last_layer_factor = config['last_layer_factor']
    max_steps = config['max_steps']
    lr = config['lr']
    batch_size = config['batch_size']

    model = build_model(n_embd, n_hidden, last_layer_factor)

    train_loss = F.cross_entropy(model(Xtr), Ytr)
    print('Initial loss ', train_loss)
    lossi = []

    # max_steps = 100_00_000
    for i in range(max_steps):
        running_loss = 0.0
        epoch_steps = 0
        # minibatch construct
        ix = torch.randint(0, Xtr.shape[0], (batch_size,))
        Xb, Yb = Xtr[ix], Ytr[ix]

        # forward pass
        logits = model(Xb)
        loss = F.cross_entropy(logits, Yb)

        # backward pass
        for p in model.parameters():
            p.grad = None
        loss.backward()

        # update: simple SGD
        for p in model.parameters():
            p.data += -lr * p.grad

        # track stats
        # if i % 10_000 == 0:  # print every once in a while
        #     print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        for layer in model.layers: layer.training = False
        train_loss = F.cross_entropy(model(Xtr), Ytr)
        val_loss = F.cross_entropy(model(Xdev), Ydev)
        session.report({'training_iteration': i, 'train_loss': train_loss.item(), 'val_loss': val_loss.item()})
        for layer in model.layers: layer.training = True
        lossi.append(loss.log10().item())

    print('Train Loss ', F.cross_entropy(model(Xtr), Ytr).item())
    print('Val Loss ', F.cross_entropy(model(Xdev), Ydev).item())
0/ 300000: 3.2818
10000/ 300000: 2.3348
20000/ 300000: 2.0034
30000/ 300000: 1.9822
40000/ 300000: 1.5388
50000/ 300000: 1.9057
60000/ 300000: 1.6040
70000/ 300000: 2.0101
80000/ 300000: 2.2422
90000/ 300000: 1.6573
100000/ 300000: 1.7261
110000/ 300000: 1.8587
120000/ 300000: 1.5202
130000/ 300000: 2.0675
140000/ 300000: 1.9777
150000/ 300000: 2.0351
160000/ 300000: 1.6458
170000/ 300000: 1.8868
180000/ 300000: 1.7516
190000/ 300000: 1.6009
200000/ 300000: 1.3999
210000/ 300000: 1.7495
220000/ 300000: 1.6129
230000/ 300000: 1.7232
240000/ 300000: 2.2450
250000/ 300000: 2.2155
260000/ 300000: 1.5074
270000/ 300000: 1.6431
280000/ 300000: 1.7525
290000/ 300000: 1.8047
def tuning():
    search_space = {
        "n_embd": tune.randint(4, 65),
        "n_hidden": tune.randint(100, 500),
        "lr": tune.loguniform(1e-4, 1e-1),
        "last_layer_factor": tune.uniform(0.1, 1.01),
        "batch_size": tune.choice([16, 32, 64]),
        "max_steps": tune.randint(100_000, 1_000_000)
    }
    initial_params = [{
        "n_embd": 24,
        "n_hidden": 128,
        "lr": 0.1,
        "last_layer_factor": 0.1,
        "batch_size": 32,
        "max_steps": 150_000
    }]
    searcher = OptunaSearch(points_to_evaluate=initial_params)
    trainable_with_cpu_gpu = tune.with_resources(train, {"cpu": 30, "gpu": 1})

    tuner = tune.Tuner(
        trainable_with_cpu_gpu,
        tune_config=tune.TuneConfig(
            metric="val_loss",
            mode="min",
            search_alg=searcher,
            num_samples=7,
        ),
        param_space=search_space,
    )
    results = tuner.fit()

    print("Best hyperparameters found were: ", results.get_best_result().config)
ray.shutdown(); ray.init()
2023-03-19 17:02:02,578 INFO worker.py:1553 -- Started a local Ray instance.
Ray
Python version: 3.8.10
Ray version: 2.3.0
tuning()
/home/ubuntu/.local/lib/python3.8/site-packages/ray/tune/search/optuna/optuna_search.py:683: FutureWarning: IntUniformDistribution has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :class:`~optuna.distributions.IntDistribution` instead.
return ot.distributions.IntUniformDistribution(
/home/ubuntu/.local/lib/python3.8/site-packages/ray/tune/search/optuna/optuna_search.py:662: FutureWarning: LogUniformDistribution has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :class:`~optuna.distributions.FloatDistribution` instead.
return ot.distributions.LogUniformDistribution(
/home/ubuntu/.local/lib/python3.8/site-packages/ray/tune/search/optuna/optuna_search.py:671: FutureWarning: UniformDistribution has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :class:`~optuna.distributions.FloatDistribution` instead.
return ot.distributions.UniformDistribution(
[I 2023-03-19 17:02:06,605] A new study created in memory with name: optuna
/home/ubuntu/.local/lib/python3.8/site-packages/optuna/distributions.py:799: FutureWarning: IntUniformDistribution(high=64, low=4, step=1) is deprecated and internally converted to IntDistribution(high=64, log=False, low=4, step=1). See https://github.com/optuna/optuna/issues/2941.
warnings.warn(message, FutureWarning)
/home/ubuntu/.local/lib/python3.8/site-packages/optuna/distributions.py:799: FutureWarning: IntUniformDistribution(high=499, low=100, step=1) is deprecated and internally converted to IntDistribution(high=499, log=False, low=100, step=1). See https://github.com/optuna/optuna/issues/2941.
warnings.warn(message, FutureWarning)
/home/ubuntu/.local/lib/python3.8/site-packages/optuna/distributions.py:799: FutureWarning: LogUniformDistribution(high=0.1, low=0.0001) is deprecated and internally converted to FloatDistribution(high=0.1, log=True, low=0.0001, step=None). See https://github.com/optuna/optuna/issues/2941.
warnings.warn(message, FutureWarning)
/home/ubuntu/.local/lib/python3.8/site-packages/optuna/distributions.py:799: FutureWarning: UniformDistribution(high=1.01, low=0.1) is deprecated and internally converted to FloatDistribution(high=1.01, log=False, low=0.1, step=None). See https://github.com/optuna/optuna/issues/2941.
warnings.warn(message, FutureWarning)
/home/ubuntu/.local/lib/python3.8/site-packages/optuna/distributions.py:799: FutureWarning: IntUniformDistribution(high=999999, low=100000, step=1) is deprecated and internally converted to IntDistribution(high=999999, log=False, low=100000, step=1). See https://github.com/optuna/optuna/issues/2941.
warnings.warn(message, FutureWarning)
2023-03-19 17:02:06,901 WARNING worker.py:1866 -- Warning: The actor ImplicitFunc is very large (14 MiB). Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use ray.put() to put large objects in the Ray object store.
(pid=88428) /home/ubuntu/.local/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).
(pid=88428) from pandas.core.computation.check import NUMEXPR_INSTALLED
Tune Status
Current time: 2023-03-19 23:14:45
Running for: 06:12:38.49
Memory: 9.4/196.6 GiB
System Info
Using FIFO scheduling algorithm. Resources requested: 30.0/30 CPUs, 1.0/1 GPUs, 0.0/125.77 GiB heap, 0.0/57.89 GiB objects (0.0/1.0 accelerator_type:A100)
Trial Status
Trial name | status | loc | batch_size | last_layer_factor | lr | max_steps | n_embd | n_hidden | iter | total time (s) | train_loss | val_loss
---|---|---|---|---|---|---|---|---|---|---|---|---
train_f6250653 | RUNNING | 10.19.31.32:88428 | 64 | 0.99565 | 0.000439031 | 540060 | 24 | 456 | 302615 | 19429.5 | 1.85993 | 2.00036
train_abc822a6 | PENDING | | 16 | 0.127438 | 0.000668113 | 762048 | 30 | 168 | | | |
train_e8e76444 | TERMINATED | 10.19.31.32:88428 | 32 | 0.1 | 0.1 | 150000 | 24 | 128 | 149999 | 2925.94 | 1.88007 | 2.06663
(train pid=88428) No of parameters 76579
(train pid=88428) Initial loss tensor(3.2965, device='cuda:0', grad_fn=<NllLossBackward0>)
(train pid=88428) Train Loss 1.873051643371582
(train pid=88428) Val Loss 2.0602710247039795
(train pid=88428) No of parameters 869355
(train pid=88428) Initial loss
(train pid=88428) tensor(3.4301, device='cuda:0', grad_fn=<NllLossBackward0>)
Trial Progress
Trial name | date | done | episodes_total | experiment_id | hostname | iterations_since_restore | node_ip | pid | time_since_restore | time_this_iter_s | time_total_s | timestamp | timesteps_since_restore | timesteps_total | train_loss | training_iteration | trial_id | val_loss | warmup_time
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
train_e8e76444 | 2023-03-19_17-50-55 | True | | 352e72ef24b0485790725bb268154829 | 129-146-49-170 | 150000 | 10.19.31.32 | 88428 | 2925.94 | 0.0204446 | 2925.94 | 1679248255 | 0 | | 1.88007 | 149999 | e8e76444 | 2.06663 | 0.00304294
train_f6250653 | 2023-03-19_23-14-47 | False | | 352e72ef24b0485790725bb268154829 | 129-146-49-170 | 302648 | 10.19.31.32 | 88428 | 19431.6 | 0.0662513 | 19431.6 | 1679267687 | 0 | | 1.85784 | 302647 | f6250653 | 1.99851 | 0.00304294
Analyze the Ray Tune trials
Not all of the trials' information showed up above, so the analysis is done in the following cells.
= f"ray_results/train_2023-03-19_17-02-06"
experiment_path print(f"Loading results from {experiment_path}...")
Loading results from ray_results/train_2023-03-19_17-02-06...
= tune.Tuner.restore(experiment_path)
restored_tuner = restored_tuner.get_results() result_grid
/home/ubuntu/.local/lib/python3.8/site-packages/ray/tune/tuner.py:230: UserWarning: Passing in the experiment's `trainable` will be a required argument to `Tuner.restore` starting from version 2.5. Please specify the trainable to avoid this warning.
warnings.warn(warning_message)
2023-03-21 14:13:11,149 INFO experiment_analysis.py:789 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
results_df = result_grid.get_dataframe()
results_df[["training_iteration", "val_loss"]]
 | training_iteration | val_loss
---|---|---
0 | 870783 | 2.044881 |
1 | 423444 | 2.018260 |
2 | 762047 | 2.015049 |
3 | 149999 | 2.066634 |
4 | 199144 | 2.051399 |
5 | 289118 | 2.046853 |
6 | 540059 | 1.989679 |
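As a small follow-up (a sketch assuming the results_df built above), the trials can be ranked by their final validation loss:

results_df.sort_values("val_loss")[["training_iteration", "val_loss"]]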
best_result: Result = result_grid.get_best_result()
best_result.config
{'n_embd': 24,
'n_hidden': 456,
'lr': 0.0004390312584437158,
'last_layer_factor': 0.9956496873025195,
'batch_size': 64,
'max_steps': 540060}
best_result.checkpoint
train 1.7052044868469238
val 2.2605273723602295
best_result.metrics
{'training_iteration': 540059,
'train_loss': 1.7585725784301758,
'val_loss': 1.9896790981292725,
'time_this_iter_s': 0.06922125816345215,
'done': True,
'timesteps_total': None,
'episodes_total': None,
'trial_id': 'f6250653',
'experiment_id': '352e72ef24b0485790725bb268154829',
'date': '2023-03-20_03-45-17',
'timestamp': 1679283917,
'time_total_s': 35661.531294584274,
'pid': 88428,
'hostname': '129-146-49-170',
'node_ip': '10.19.31.32',
'config': {'n_embd': 24,
'n_hidden': 456,
'lr': 0.0004390312584437158,
'last_layer_factor': 0.9956496873025195,
'batch_size': 64,
'max_steps': 540060},
'time_since_restore': 35661.531294584274,
'timesteps_since_restore': 0,
'iterations_since_restore': 540060,
'warmup_time': 0.003042936325073242,
'experiment_tag': '2_batch_size=64,last_layer_factor=0.9956,lr=0.0004,max_steps=540060,n_embd=24,n_hidden=456'}
"training_iteration", "val_loss") best_result.metrics_dataframe.plot(
<AxesSubplot:xlabel='training_iteration'>
ax = None
for result in result_grid:
    label = f"lr={result.config['lr']:.2e}"
    if ax is None:
        ax = result.metrics_dataframe.plot("training_iteration", "val_loss", label=label)
    else:
        result.metrics_dataframe.plot("training_iteration", "val_loss", ax=ax, label=label)
ax.set_title("Validation Loss vs. Training Iteration for All Trials")
ax.set_ylabel("Validation Loss")
Text(0, 0.5, 'Validation Loss')
Train and Test with Best Config
best_result.metrics
{'training_iteration': 540059,
'train_loss': 1.7585725784301758,
'val_loss': 1.9896790981292725,
'time_this_iter_s': 0.06922125816345215,
'done': True,
'timesteps_total': None,
'episodes_total': None,
'trial_id': 'f6250653',
'experiment_id': '352e72ef24b0485790725bb268154829',
'date': '2023-03-20_03-45-17',
'timestamp': 1679283917,
'time_total_s': 35661.531294584274,
'pid': 88428,
'hostname': '129-146-49-170',
'node_ip': '10.19.31.32',
'config': {'n_embd': 24,
'n_hidden': 456,
'lr': 0.0004390312584437158,
'last_layer_factor': 0.9956496873025195,
'batch_size': 64,
'max_steps': 540060},
'time_since_restore': 35661.531294584274,
'timesteps_since_restore': 0,
'iterations_since_restore': 540060,
'warmup_time': 0.003042936325073242,
'experiment_tag': '2_batch_size=64,last_layer_factor=0.9956,lr=0.0004,max_steps=540060,n_embd=24,n_hidden=456'}
config = {'n_embd': 24,
 'n_hidden': 456,
'lr': 0.0004390312584437158,
'last_layer_factor': 0.9956496873025195,
'batch_size': 64,
'max_steps': 540060}
model = train_no_tune(config)
No of parameters 869355
Initial loss tensor(3.4630, device='cuda:0', grad_fn=<NllLossBackward>)
0/ 540060: 3.4067
10000/ 540060: 2.4061
20000/ 540060: 2.1215
30000/ 540060: 2.2546
40000/ 540060: 2.2071
50000/ 540060: 2.1789
60000/ 540060: 2.5076
70000/ 540060: 2.0653
80000/ 540060: 1.9867
90000/ 540060: 1.9995
100000/ 540060: 2.2169
110000/ 540060: 2.2312
120000/ 540060: 2.0663
130000/ 540060: 1.8910
140000/ 540060: 1.9463
150000/ 540060: 2.0409
160000/ 540060: 1.8872
170000/ 540060: 2.1610
180000/ 540060: 1.8664
190000/ 540060: 2.0235
200000/ 540060: 1.8428
210000/ 540060: 1.8524
220000/ 540060: 1.9312
230000/ 540060: 1.8659
240000/ 540060: 1.7783
250000/ 540060: 1.8855
260000/ 540060: 2.0965
270000/ 540060: 1.8449
280000/ 540060: 2.2505
290000/ 540060: 2.0819
300000/ 540060: 1.9650
310000/ 540060: 1.8367
320000/ 540060: 1.9641
330000/ 540060: 2.1167
340000/ 540060: 1.8066
350000/ 540060: 1.8251
360000/ 540060: 1.5952
370000/ 540060: 2.0327
380000/ 540060: 1.8862
390000/ 540060: 1.8184
400000/ 540060: 1.8827
410000/ 540060: 1.8935
420000/ 540060: 2.1733
430000/ 540060: 1.8394
440000/ 540060: 2.0016
450000/ 540060: 2.0970
460000/ 540060: 1.6659
470000/ 540060: 1.5410
480000/ 540060: 1.9260
490000/ 540060: 1.6596
500000/ 540060: 1.7006
510000/ 540060: 1.7158
520000/ 540060: 1.8447
530000/ 540060: 1.9106
540000/ 540060: 1.7005
for layer in model.layers: layer.training = False
with torch.no_grad():
    train_loss = F.cross_entropy(model(Xtr), Ytr)
    val_loss = F.cross_entropy(model(Xdev), Ydev)
    test_loss = F.cross_entropy(model(Xte), Yte)
test_loss print('Train loss ', train_loss)
print('Val loss ', val_loss)
print('Test loss ', test_loss)
Train loss tensor(1.7622, device='cuda:0')
Val loss tensor(1.9855, device='cuda:0')
Test loss tensor(1.9815, device='cuda:0')
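With the best configuration from the search, the validation loss improves from about 2.04 for the hand-picked baseline to about 1.99, and the held-out test loss comes in at about 1.98.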