Hi there, I’m new to wandb. When I use wandb.sweep to tune hyperparameters, the process is much slower than it was when I was debugging the model.
When running similar code under wandb, the GPU is almost idle, whereas it reaches up to 100% utilization when I am not using a sweep. I wonder how this could happen. What am I missing?
Specifically, I’m using the text8 data to train a CBOW model. I followed the tutorial to organize the functions:
def train(config=None):
    """Run one training job inside a W&B run (entry point for a sweep agent).

    Opens a run with ``wandb.init``, reads the hyperparameters back from
    ``wandb.config``, builds the dataset, dataloader, negative sampler, model
    and optimizer from them, then trains for ``config.num_epochs`` epochs and
    logs the value returned by ``train_epoch`` after every epoch.

    Args:
        config: Optional default configuration forwarded to ``wandb.init``.
            The values actually used are the ones exposed by ``wandb.config``
            once the run has started.
    """
    with wandb.init(config=config):
        # Rebind to the run's config so every hyperparameter below comes from
        # the active run rather than from the argument.
        config = wandb.config

        dataset = Text8Dataset(corpus, word_to_id, context_size=config.context_size)
        dataloader = DataLoader(dataset, config.batch_size, shuffle=True)
        negative_sampler = NegativeSampler(corpus, config.alpha)
        model = CBOW(vocab_size, config.embedding_dim)
        optimizer = build_optimizer(model, config.optimizer, config.lr)
        num_negative_samples = config.num_negative_samples

        for epoch in range(config.num_epochs):
            loss = train_epoch(model, dataloader, optimizer, negative_sampler, num_negative_samples)
            # Use config.num_epochs here: a bare `num_epochs` is not defined in
            # this function and would raise NameError (or pick up an unrelated
            # module-level value that does not match the sweep).
            print(f"Epoch {epoch+1}/{config.num_epochs}, Loss: {loss:.4f}")
            wandb.log({"loss": loss, "epoch": epoch})
def train_epoch(model, dataloader, optimizer, negative_sampler, num_negative_samples):
    """Train ``model`` for one pass over ``dataloader``.

    For every ``(context, target)`` batch the context entries are stacked,
    the model is run on them, the negative-sampling loss is computed and one
    optimizer step is taken.

    Args:
        model: Callable model applied to the stacked context batch.
        dataloader: Iterable of ``(context, target)`` batches; must support
            ``len()``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        negative_sampler: Sampler passed through to ``negative_sampling_loss``.
        num_negative_samples: Forwarded as ``num_neg_samples`` to the loss.

    Returns:
        The sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    running_loss = 0
    for context, target in dataloader:
        optimizer.zero_grad()

        # Stack the entries of `context` as rows, then swap the first two axes.
        # NOTE(review): presumably this yields (batch, context_positions) --
        # confirm against what Text8Dataset returns.
        stacked_context = torch.row_stack(list(context))
        input_context = stacked_context.transpose(0, 1)

        output = model(input_context)
        batch_loss = negative_sampling_loss(
            output,
            target,
            negative_sampler,
            num_neg_samples=num_negative_samples,
        )
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    num_batches = len(dataloader)
    return running_loss / num_batches
When not using wandb, I simply run this snippet, which finishes an epoch within 2 minutes:
# Standalone (non-sweep) training run with fixed hyperparameters.
vocab_size = len(vocab)
embedding_dim = 256
model = CBOW(vocab_size, embedding_dim)
context_size = 5
batch_size = 512
lr = 0.005
num_epochs = 10
num_negative_samples = 10
alpha = 0.75

# data loading
text8_dataset = Text8Dataset(corpus, word_to_id, context_size=context_size)
dataloader = DataLoader(text8_dataset, batch_size=batch_size, shuffle=True)

# training
#criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
#scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
negative_sampler = NegativeSampler(corpus, alpha)

model.train()
for epoch in range(num_epochs):
    loss = train_epoch(model, dataloader, optimizer, negative_sampler, num_negative_samples)
    # train_epoch already returns the loss divided by len(dataloader), so it
    # is printed as-is; dividing again would under-report the loss.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}")