Hello! I’m new to wandb so I’m not really sure how to resolve this error. Nothing I’ve found helps either.
I got the error below after changing the compute_metrics function in my Trainer object (from transformers) so that it returns None instead of a metrics dictionary. I did that only because I was testing something and didn’t want the output to be too long. However, this seems to have caused an issue that I can’t fix at all. The error message keeps appearing regardless of what I do, even after uninstalling and reinstalling wandb.
---------------------------------------------------------------------------
BrokenPipeError Traceback (most recent call last)
Cell In[37], line 44
31 callbacks = [early_stopping_callback, wandb_callback]
33 trainer = Trainer(
34 model=model,
35 args=training_args,
(...)
41 callbacks=callbacks
42 )
---> 44 trainer.train()
45 trainer.push_to_hub()
46 trainer.save_model()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1771, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1768 try:
1769 # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
1770 hf_hub_utils.disable_progress_bars()
-> 1771 return inner_training_loop(
1772 args=args,
1773 resume_from_checkpoint=resume_from_checkpoint,
1774 trial=trial,
1775 ignore_keys_for_eval=ignore_keys_for_eval,
1776 )
1777 finally:
1778 hf_hub_utils.enable_progress_bars()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2193, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2190 self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch
2191 self.control = self.callback_handler.on_step_end(args, self.state, self.control)
-> 2193 self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
2194 else:
2195 self.control = self.callback_handler.on_substep_end(args, self.state, self.control)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2577, in Trainer._maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
2575 metrics = None
2576 if self.control.should_evaluate:
-> 2577 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
2578 self._report_to_hp_search(trial, self.state.global_step, metrics)
2580 # Run delayed LR scheduler now that metrics are populated
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3387, in Trainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
3377 start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
3378 output.metrics.update(
3379 speed_metrics(
3380 metric_key_prefix,
(...)
3384 )
3385 )
-> 3387 self.log(output.metrics)
3389 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
3390 # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
3391 xm.master_print(met.metrics_report())
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2956, in Trainer.log(self, logs)
2954 output = {**logs, **{"step": self.state.global_step}}
2955 self.state.log_history.append(output)
-> 2956 self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer_callback.py:407, in CallbackHandler.on_log(self, args, state, control, logs)
405 def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs):
406 control.should_log = False
--> 407 return self.call_event("on_log", args, state, control, logs=logs)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer_callback.py:414, in CallbackHandler.call_event(self, event, args, state, control, **kwargs)
412 def call_event(self, event, args, state, control, **kwargs):
413 for callback in self.callbacks:
--> 414 result = getattr(callback, event)(
415 args,
416 state,
417 control,
418 model=self.model,
419 tokenizer=self.tokenizer,
420 optimizer=self.optimizer,
421 lr_scheduler=self.lr_scheduler,
422 train_dataloader=self.train_dataloader,
423 eval_dataloader=self.eval_dataloader,
424 **kwargs,
425 )
426 # A Callback can skip the return of `control` if it doesn't change it.
427 if result is not None:
File /opt/conda/lib/python3.10/site-packages/transformers/integrations/integration_utils.py:823, in WandbCallback.on_log(self, args, state, control, model, logs, **kwargs)
821 non_scalar_logs = {k: v for k, v in logs.items() if k not in single_value_scalars}
822 non_scalar_logs = rewrite_logs(non_scalar_logs)
--> 823 self._wandb.log({**non_scalar_logs, "train/global_step": state.global_step})
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:420, in _run_decorator._noop.<locals>.wrapper(self, *args, **kwargs)
417 wandb.termwarn(message, repeat=False)
418 return cls.Dummy()
--> 420 return func(self, *args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:371, in _run_decorator._noop_on_finish.<locals>.decorator_fn.<locals>.wrapper_fn(self, *args, **kwargs)
368 @functools.wraps(func)
369 def wrapper_fn(self: Type["Run"], *args: Any, **kwargs: Any) -> Any:
370 if not getattr(self, "_is_finished", False):
--> 371 return func(self, *args, **kwargs)
373 default_message = (
374 f"Run ({self.id}) is finished. The call to `{func.__name__}` will be ignored. "
375 f"Please make sure that you are using an active run."
376 )
377 resolved_message = message or default_message
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:361, in _run_decorator._attach.<locals>.wrapper(self, *args, **kwargs)
359 raise e
360 cls._is_attaching = ""
--> 361 return func(self, *args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:1838, in Run.log(self, data, step, commit, sync)
1831 if self._settings._shared and step is not None:
1832 wandb.termwarn(
1833 "In shared mode, the use of `wandb.log` with the step argument is not supported "
1834 f"and will be ignored. Please refer to {wburls.get('wandb_define_metric')} "
1835 "on how to customize your x-axis.",
1836 repeat=False,
1837 )
-> 1838 self._log(data=data, step=step, commit=commit)
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:1602, in Run._log(self, data, step, commit)
1599 if any(not isinstance(key, str) for key in data.keys()):
1600 raise ValueError("Key values passed to `wandb.log` must be strings.")
-> 1602 self._partial_history_callback(data, step, commit)
1604 if step is not None:
1605 if os.getpid() != self._init_pid or self._is_attached:
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:1474, in Run._partial_history_callback(self, row, step, commit)
1471 if self._backend and self._backend.interface:
1472 not_using_tensorboard = len(wandb.patched["tensorboard"]) == 0
-> 1474 self._backend.interface.publish_partial_history(
1475 row,
1476 user_step=self._step,
1477 step=step,
1478 flush=commit,
1479 publish_step=not_using_tensorboard,
1480 )
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/interface/interface.py:602, in InterfaceBase.publish_partial_history(self, data, user_step, step, flush, publish_step, run)
600 if flush is not None:
601 partial_history.action.flush = flush
--> 602 self._publish_partial_history(partial_history)
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/interface/interface_shared.py:89, in InterfaceShared._publish_partial_history(self, partial_history)
85 def _publish_partial_history(
86 self, partial_history: pb.PartialHistoryRequest
87 ) -> None:
88 rec = self._make_request(partial_history=partial_history)
---> 89 self._publish(rec)
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/interface/interface_sock.py:51, in InterfaceSock._publish(self, record, local)
49 def _publish(self, record: "pb.Record", local: Optional[bool] = None) -> None:
50 self._assign(record)
---> 51 self._sock_client.send_record_publish(record)
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/sock_client.py:221, in SockClient.send_record_publish(self, record)
219 server_req = spb.ServerRequest()
220 server_req.record_publish.CopyFrom(record)
--> 221 self.send_server_request(server_req)
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/sock_client.py:155, in SockClient.send_server_request(self, msg)
154 def send_server_request(self, msg: Any) -> None:
--> 155 self._send_message(msg)
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/sock_client.py:152, in SockClient._send_message(self, msg)
150 header = struct.pack("<BI", ord("W"), raw_size)
151 with self._lock:
--> 152 self._sendall_with_error_handle(header + data)
File /opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/sock_client.py:130, in SockClient._sendall_with_error_handle(self, data)
128 start_time = time.monotonic()
129 try:
--> 130 sent = self._sock.send(data)
131 # sent equal to 0 indicates a closed socket
132 if sent == 0:
BrokenPipeError: [Errno 32] Broken pipe
Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x786b8c31fa00>> (for post_run_cell), with arguments args (<ExecutionResult object at 786b8d1a56c0, execution_count=37 error_before_exec=None error_in_exec=[Errno 32] Broken pipe info=<ExecutionInfo object at 786b8d1a7f40, raw_cell="model = AutoModelForTokenClassification.from_pretr.." store_history=True silent=False shell_futures=True cell_id=51c3c680-dd89-4c04-878d-11c5485b97fe> result=None>,),kwargs {}: