Hello!
Based on your code, it looks like the sweeps are erroring out because a brand-new sweep is created every time you run the script, which causes conflicts between the sweep and the wandb library.
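As a side note, if you want to keep reusing one sweep across multiple script runs, you can pass an existing sweep ID to the agent instead of calling wandb.sweep again. A minimal sketch (the entity, project, and sweep ID below are placeholders; copy the real path from the sweep page in the W&B UI):

# Reuse an existing sweep instead of creating a new one on every run.
# "your-entity/your-project/abc123" is a placeholder sweep path.
wandb.agent("your-entity/your-project/abc123", function=train, count=10)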
I would recommend structuring your code as below: define a function that takes the hyperparameters as arguments, builds the TrainingArguments, and spins up a Trainer instance. The sweep agent then calls that function with a different hyperparameter combination on each run. Also, setting the agent's count to 10, as in wandb.agent(sweep_id, train, count=10), will increase the number of runs to 10.
import wandb
from transformers import Trainer, TrainingArguments

sweep_config = {
    "method": "random",
    "name": "disaster-sweep",
    "metric": {
        "goal": "minimize",
        "name": "train/loss"
    },
    "parameters": {
        "epochs": {
            "values": [5, 10]
        },
        "batch_size": {
            "values": [8, 16, 32, 64]
        },
        "learning_rate": {
            "values": [0.005, 0.0001, 0.00005]
        },
        "weight_decay": {
            "values": [0.0001, 0.1]
        }
    }
}
# Training function with args
def train(config=None):
    with wandb.init(config=config):
        # set sweep configuration
        config = wandb.config

        # set training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            report_to='wandb',  # Turn on Weights & Biases logging
            num_train_epochs=config.epochs,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            save_strategy='epoch',
            evaluation_strategy='epoch',
            logging_strategy='epoch',
            load_best_model_at_end=True,
            remove_unused_columns=False,
        )

        # define training loop
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=valid_dataset,
            compute_metrics=compute_metrics
        )

        # start training loop
        trainer.train()
sweep_id = wandb.sweep(sweep_config, project='fun-sweep')
wandb.agent(sweep_id, train, count=10)
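For completeness, here is a minimal sketch of the pieces the snippet above assumes are already defined (model, train_dataset, valid_dataset, compute_metrics). The checkpoint, dataset, and metric here are placeholders, so swap in whatever you are actually training:

import numpy as np
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Placeholder checkpoint; use the model you are actually fine-tuning.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Placeholder dataset; tokenize to fixed length so the default collator can batch it.
raw = load_dataset("imdb")
tokenized = raw.map(
    lambda batch: tokenizer(batch["text"], truncation=True, padding="max_length"),
    batched=True,
)
# Drop the raw text column so only model inputs remain
# (needed because the Trainer above sets remove_unused_columns=False).
tokenized = tokenized.remove_columns(["text"])
train_dataset = tokenized["train"]
valid_dataset = tokenized["test"]

# Simple accuracy metric computed from the eval predictions.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

One more thing to keep in mind: because the Trainer above receives the same model instance on every run, each sweep run continues training from the previous run's weights. If you want every run to start from fresh weights, pass a model_init callable to the Trainer instead of model.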