Wandb Help: Broken Pipe Error

Hello! I’m new to wandb, so I’m not sure how to resolve this error, and nothing I’ve found so far has helped.

I got the error below after changing the compute_metrics function passed to my Trainer (from transformers) so that it returns None at some point. I only did that while testing, because I didn’t want the output to be too long, but it seems to have caused an issue that I can’t fix. The error keeps appearing no matter what I do, even after uninstalling and reinstalling wandb.
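Roughly what I changed is sketched below (the metric name and the early return are placeholders, not my exact code):

```python
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    # While testing, I return early here so the eval output stays short;
    # this means the function returns None instead of a dict of metrics.
    return None
    # Previously it returned a dict of named scalars, roughly:
    # return {"accuracy": float((predictions == labels).mean())}
```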

---------------------------------------------------------------------------
BrokenPipeError                           Traceback (most recent call last)
Cell In[37], line 44
     31 callbacks = [early_stopping_callback, wandb_callback]
     33 trainer = Trainer(
     34     model=model,
     35     args=training_args,
   (...)
     41     callbacks=callbacks
     42 )
---> 44 trainer.train()
     45 trainer.push_to_hub()
     46 trainer.save_model()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1771, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1768 try:
   1769     # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
   1770     hf_hub_utils.disable_progress_bars()
-> 1771     return inner_training_loop(
   1772         args=args,
   1773         resume_from_checkpoint=resume_from_checkpoint,
   1774         trial=trial,
   1775         ignore_keys_for_eval=ignore_keys_for_eval,
   1776     )
   1777 finally:
   1778     hf_hub_utils.enable_progress_bars()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2193, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2190     self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch
   2191     self.control = self.callback_handler.on_step_end(args, self.state, self.control)
-> 2193     self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
   2194 else:
   2195     self.control = self.callback_handler.on_substep_end(args, self.state, self.control)

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2577, in Trainer._maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
   2575 metrics = None
   2576 if self.control.should_evaluate:
-> 2577     metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
   2578     self._report_to_hp_search(trial, self.state.global_step, metrics)
   2580     # Run delayed LR scheduler now that metrics are populated

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3387, in Trainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
   3377     start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
   3378 output.metrics.update(
   3379     speed_metrics(
   3380         metric_key_prefix,
   (...)
   3384     )
   3385 )
-> 3387 self.log(output.metrics)
   3389 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
   3390     # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
   3391     xm.master_print(met.metrics_report())

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2956, in Trainer.log(self, logs)
   2954 output = {**logs, **{"step": self.state.global_step}}
   2955 self.state.log_history.append(output)
-> 2956 self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)

File /opt/conda/lib/python3.10/site-packages/transformers/trainer_callback.py:407, in CallbackHandler.on_log(self, args, state, control, logs)
    405 def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs):
    406     control.should_log = False
--> 407     return self.call_event("on_log", args, state, control, logs=logs)

File /opt/conda/lib/python3.10/site-packages/transformers/trainer_callback.py:414, in CallbackHandler.call_event(self, event, args, state, control, **kwargs)
    412 def call_event(self, event, args, state, control, **kwargs):
    413     for callback in self.callbacks:
--> 414         result = getattr(callback, event)(
    415             args,
    416             state,
    417             control,
    418             model=self.model,
    419             tokenizer=self.tokenizer,
    420             optimizer=self.optimizer,
    421             lr_scheduler=self.lr_scheduler,
    422             train_dataloader=self.train_dataloader,
    423             eval_dataloader=self.eval_dataloader,
    424             **kwargs,
    425         )
    426         # A Callback can skip the return of `control` if it doesn't change it.
    427         if result is not None:

File /opt/conda/lib/python3.10/site-packages/transformers/integrations/integration_utils.py:823, in WandbCallback.on_log(self, args, state, control, model, logs, **kwargs)
    821 non_scalar_logs = {k: v for k, v in logs.items() if k not in single_value_scalars}
    822 non_scalar_logs = rewrite_logs(non_scalar_logs)
--> 823 self._wandb.log({**non_scalar_logs, "train/global_step": state.global_step})

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:420, in _run_decorator._noop.<locals>.wrapper(self, *args, **kwargs)
    417         wandb.termwarn(message, repeat=False)
    418         return cls.Dummy()
--> 420 return func(self, *args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:371, in _run_decorator._noop_on_finish.<locals>.decorator_fn.<locals>.wrapper_fn(self, *args, **kwargs)
    368 @functools.wraps(func)
    369 def wrapper_fn(self: Type["Run"], *args: Any, **kwargs: Any) -> Any:
    370     if not getattr(self, "_is_finished", False):
--> 371         return func(self, *args, **kwargs)
    373     default_message = (
    374         f"Run ({self.id}) is finished. The call to `{func.__name__}` will be ignored. "
    375         f"Please make sure that you are using an active run."
    376     )
    377     resolved_message = message or default_message

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:361, in _run_decorator._attach.<locals>.wrapper(self, *args, **kwargs)
    359         raise e
    360     cls._is_attaching = ""
--> 361 return func(self, *args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:1838, in Run.log(self, data, step, commit, sync)
   1831 if self._settings._shared and step is not None:
   1832     wandb.termwarn(
   1833         "In shared mode, the use of `wandb.log` with the step argument is not supported "
   1834         f"and will be ignored. Please refer to {wburls.get('wandb_define_metric')} "
   1835         "on how to customize your x-axis.",
   1836         repeat=False,
   1837     )
-> 1838 self._log(data=data, step=step, commit=commit)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:1602, in Run._log(self, data, step, commit)
   1599 if any(not isinstance(key, str) for key in data.keys()):
   1600     raise ValueError("Key values passed to `wandb.log` must be strings.")
-> 1602 self._partial_history_callback(data, step, commit)
   1604 if step is not None:
   1605     if os.getpid() != self._init_pid or self._is_attached:

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_run.py:1474, in Run._partial_history_callback(self, row, step, commit)
   1471 if self._backend and self._backend.interface:
   1472     not_using_tensorboard = len(wandb.patched["tensorboard"]) == 0
-> 1474     self._backend.interface.publish_partial_history(
   1475         row,
   1476         user_step=self._step,
   1477         step=step,
   1478         flush=commit,
   1479         publish_step=not_using_tensorboard,
   1480     )

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/interface/interface.py:602, in InterfaceBase.publish_partial_history(self, data, user_step, step, flush, publish_step, run)
    600 if flush is not None:
    601     partial_history.action.flush = flush
--> 602 self._publish_partial_history(partial_history)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/interface/interface_shared.py:89, in InterfaceShared._publish_partial_history(self, partial_history)
     85 def _publish_partial_history(
     86     self, partial_history: pb.PartialHistoryRequest
     87 ) -> None:
     88     rec = self._make_request(partial_history=partial_history)
---> 89     self._publish(rec)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/interface/interface_sock.py:51, in InterfaceSock._publish(self, record, local)
     49 def _publish(self, record: "pb.Record", local: Optional[bool] = None) -> None:
     50     self._assign(record)
---> 51     self._sock_client.send_record_publish(record)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/sock_client.py:221, in SockClient.send_record_publish(self, record)
    219 server_req = spb.ServerRequest()
    220 server_req.record_publish.CopyFrom(record)
--> 221 self.send_server_request(server_req)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/sock_client.py:155, in SockClient.send_server_request(self, msg)
    154 def send_server_request(self, msg: Any) -> None:
--> 155     self._send_message(msg)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/sock_client.py:152, in SockClient._send_message(self, msg)
    150 header = struct.pack("<BI", ord("W"), raw_size)
    151 with self._lock:
--> 152     self._sendall_with_error_handle(header + data)

File /opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/sock_client.py:130, in SockClient._sendall_with_error_handle(self, data)
    128 start_time = time.monotonic()
    129 try:
--> 130     sent = self._sock.send(data)
    131     # sent equal to 0 indicates a closed socket
    132     if sent == 0:

BrokenPipeError: [Errno 32] Broken pipe
Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x786b8c31fa00>> (for post_run_cell), with arguments args (<ExecutionResult object at 786b8d1a56c0, execution_count=37 error_before_exec=None error_in_exec=[Errno 32] Broken pipe info=<ExecutionInfo object at 786b8d1a7f40, raw_cell="model = AutoModelForTokenClassification.from_pretr.." store_history=True silent=False shell_futures=True cell_id=51c3c680-dd89-4c04-878d-11c5485b97fe> result=None>,),kwargs {}:

Hey @lcoder7000, thanks for writing in! It seems the error is coming from the callback in the transformers library:

File /opt/conda/lib/python3.10/site-packages/transformers/integrations/integration_utils.py:823, in WandbCallback

Could you try reinstalling transformers to check whether the issue comes from the change you made to the callback?
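If reinstalling alone doesn’t help, a quick sanity check would be to restore compute_metrics so it always returns a dict of scalars, and to close any stale run before retraining. Something along these lines (the metric name is a placeholder):

```python
# In a fresh kernel/session, after reinstalling the packages, e.g.
#   pip install --force-reinstall transformers wandb
import numpy as np
import wandb

# Close any stale run left over from the crashed session before starting a new one.
if wandb.run is not None:
    wandb.finish()

def compute_metrics(eval_pred):
    # Trainer's compute_metrics is documented to return a dict of named metric
    # values, so always return one rather than None.
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}
```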