Hello, I’m running my jupyter notebook using sweeps and wandb.agent but most of my sweep runs are failing with the following error message:
wandb: Adding directory to artifact (/home/phdomingues/masters/results/ViT/UNIFESP/masked/SN-UNIFESP/checkpoint-77)... Done. 2.4s
wandb: Adding directory to artifact (/home/phdomingues/masters/results/ViT/UNIFESP/masked/SN-UNIFESP/checkpoint-154)... Done. 3.2s
Traceback (most recent call last):
File "/tmp/ipykernel_4543/1672163409.py", line 32, in train
trainer.train()
File "/home/phdomingues/.miniconda3/envs/wandb/lib/python3.8/site-packages/transformers/trainer.py", line 1885, in train
return inner_training_loop(
File "/home/phdomingues/.miniconda3/envs/wandb/lib/python3.8/site-packages/transformers/trainer.py", line 2311, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
File "/home/phdomingues/.miniconda3/envs/wandb/lib/python3.8/site-packages/transformers/trainer.py", line 2733, in _maybe_log_save_evaluate
self.control = self.callback_handler.on_save(self.args, self.state, self.control)
File "/home/phdomingues/.miniconda3/envs/wandb/lib/python3.8/site-packages/transformers/trainer_callback.py", line 487, in on_save
return self.call_event("on_save", args, state, control)
File "/home/phdomingues/.miniconda3/envs/wandb/lib/python3.8/site-packages/transformers/trainer_callback.py", line 498, in call_event
result = getattr(callback, event)(
File "/home/phdomingues/.miniconda3/envs/wandb/lib/python3.8/site-packages/transformers/integrations/integration_utils.py", line 847, in on_save
artifact.add_dir(artifact_path)
File "/home/phdomingues/.miniconda3/envs/wandb/lib/python3.8/site-packages/wandb/sdk/artifacts/artifact.py", line 1226, in add_dir
raise ValueError("Path is not a directory: {}".format(local_path))
ValueError: Path is not a directory: /home/phdomingues/masters/results/ViT/UNIFESP/masked/SN-UNIFESP/checkpoint-231
A snippet of my code:
def train(config=None):
with wandb.init(config=config):
config = wandb.config
training_args = TrainingArguments(
output_dir=output_dir,
report_to='wandb',
save_strategy='epoch',
evaluation_strategy='epoch',
logging_strategy='epoch',
learning_rate=config.learning_rate,
weight_decay=config.weight_decay,
num_train_epochs=config.epochs,
per_device_train_batch_size=config.batch_size,
per_device_eval_batch_size=2,
save_total_limit=2,
remove_unused_columns=False,
push_to_hub=False,
fp16=True,
load_best_model_at_end=True,
)
trainer = Trainer(
model=model,
args=training_args,
data_collator=collate_fn,
compute_metrics=partial(compute_metrics, metrics=[load_metric(m, trust_remote_code=True) for m in METRICS]),
train_dataset=ds['train'],
eval_dataset=ds['test'],
tokenizer=processor,
)
trainer.train()
wandb.agent(sweep_id, train, count=20)
wandb.finish()
It seems to me that the library fails to create a checkpoint directory and breaks when trying to access it, or maybe it removes it too soon…
I would appreciate it if someone could help me figure out what is happening and how to solve it.