BrokenPipeError when doing sweeps

Hello!

Based on your code, it seems the sweeps are erroring out because a single sweep is created every time you run your code, which causes conflicts between the sweep and the wandb library.

I would recommend structuring your code as shown below: create a training function that takes the configuration as an argument and spins up the Trainer instance inside it. The sweep then feeds different hyperparameters into that function. Also, setting the agent’s count to 10, as in `wandb.agent(sweep_id, train, count=10)`, will increase the number of runs to 10.

# Sweep definition handed to wandb.sweep(): a random search over the
# discrete values listed under "parameters", with "train/loss" declared
# as the metric to minimize.
sweep_config = {
    "method": "random",
    "name": "disaster-sweep",
    "metric": {"goal": "minimize", "name": "train/loss"},
    # Each entry lists the candidate values for one hyperparameter; the
    # keys are read back as attributes of wandb.config inside train().
    "parameters": {
        "epochs": {"values": [5, 10]},
        "batch_size": {"values": [8, 16, 32, 64]},
        "learning_rate": {"values": [0.005, 0.0001, 0.00005]},
        "weight_decay": {"values": [0.0001, 0.1]},
    },
}

# Training function with args
def train(config=None):
    """Run a single training job inside its own W&B run.

    Written to be used as the function passed to ``wandb.agent``. The
    hyperparameters are read from ``wandb.config`` after ``wandb.init``
    has been called, then used to build the ``TrainingArguments`` and the
    ``Trainer``.

    ``model``, ``train_dataset``, ``valid_dataset`` and ``compute_metrics``
    are not defined in this function; they must already exist in the
    enclosing scope.

    Args:
        config: Optional configuration forwarded unchanged to
            ``wandb.init``. Defaults to ``None``.
    """
    with wandb.init(config=config):
        # Hyperparameters are taken from wandb.config, not from the
        # ``config`` argument directly.
        hyperparams = wandb.config

        # Batch size is shared between training and evaluation; saving,
        # evaluation and logging all happen once per epoch.
        training_args = TrainingArguments(
            output_dir='./results',
            report_to='wandb',  # Turn on Weights & Biases logging
            num_train_epochs=hyperparams.epochs,
            learning_rate=hyperparams.learning_rate,
            weight_decay=hyperparams.weight_decay,
            per_device_train_batch_size=hyperparams.batch_size,
            per_device_eval_batch_size=hyperparams.batch_size,
            save_strategy='epoch',
            evaluation_strategy='epoch',
            logging_strategy='epoch',
            load_best_model_at_end=True,
            remove_unused_columns=False,
        )

        # NOTE(review): the same ``model`` object is passed on every call.
        # If one agent process runs several trials, later trials presumably
        # continue from the weights left by earlier ones -- confirm, and
        # consider the Trainer ``model_init`` argument if a fresh model per
        # trial is wanted.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=valid_dataset,
            compute_metrics=compute_metrics,
        )

        trainer.train()

# Register the sweep definition under the 'fun-sweep' project and keep the
# returned id so the agent can be attached to it.
sweep_id = wandb.sweep(sweep=sweep_config, project='fun-sweep')

# Start an agent that calls train() for this sweep; count=10 limits it to
# ten runs.
wandb.agent(sweep_id, function=train, count=10)