Hello, thanks for the answer. Let me try to gather the files you requested.
The run is happening in a SageMaker training job, so I need to figure out how to export the internal logs from the run. I will check this at the beginning of next week.
In the meantime, here’s the full traceback from CloudWatch:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal_util.py", line 49, in run
self._run()
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal_util.py", line 100, in _run
self._process(record)
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal.py", line 279, in _process
self._hm.handle(record)
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/handler.py", line 138, in handle
handler(record)
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/handler.py", line 148, in handle_request
handler(record)
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/handler.py", line 683, in handle_request_run_start
self._system_monitor.probe(publish=True)
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/system/system_monitor.py", line 228, in probe
self.system_info.publish(system_info)
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/system/system_info.py", line 255, in publish
self._save_patches()
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/system/system_info.py", line 146, in _save_patches
upstream_commit = self.git.get_upstream_fork_point()
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/gitlib.py", line 200, in get_upstream_fork_point
possible_relatives.append(tracking_branch.commit)
File "/opt/conda/lib/python3.10/site-packages/git/refs/symbolic.py", line 274, in _get_commit
obj = self._get_object()
File "/opt/conda/lib/python3.10/site-packages/git/refs/symbolic.py", line 267, in _get_object
return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path)))
File "/opt/conda/lib/python3.10/site-packages/git/objects/base.py", line 94, in new_from_sha
oinfo = repo.odb.info(sha1)
File "/opt/conda/lib/python3.10/site-packages/git/db.py", line 40, in info
hexsha, typename, size = self._git.get_object_header(bin_to_hex(binsha))
File "/opt/conda/lib/python3.10/site-packages/git/cmd.py", line 1384, in get_object_header
return self.__get_object_header(cmd, ref)
File "/opt/conda/lib/python3.10/site-packages/git/cmd.py", line 1370, in __get_object_header
cmd.stdin.flush()
Traceback (most recent call last): File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal_util.py", line 49, in run self._run() File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal_util.py", line 100, in _run self._process(record) File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal.py", line 279, in _process self._hm.handle(record) File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/handler.py", line 138, in handle handler(record) File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/handler.py", line 148, in handle_request handler(record) File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/handler.py", line 683, in handle_request_run_start self._system_monitor.probe(publish=True) File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/system/system_monitor.py", line 228, in probe self.system_info.publish(system_info) File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/system/system_info.py", line 255, in publish self._save_patches() File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/system/system_info.py", line 146, in _save_patches upstream_commit = self.git.get_upstream_fork_point() File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/gitlib.py", line 200, in get_upstream_fork_point possible_relatives.append(tracking_branch.commit) File "/opt/conda/lib/python3.10/site-packages/git/refs/symbolic.py", line 274, in _get_commit obj = self._get_object() File "/opt/conda/lib/python3.10/site-packages/git/refs/symbolic.py", line 267, in _get_object return Object.new_from_sha(self.repo, hex_to_bin(self.dereference_recursive(self.repo, self.path))) File "/opt/conda/lib/python3.10/site-packages/git/objects/base.py", line 94, in new_from_sha oinfo = repo.odb.info(sha1) File "/opt/conda/lib/python3.10/site-packages/git/db.py", line 40, in info hexsha, typename, size = self._git.get_object_header(bin_to_hex(binsha)) File 
"/opt/conda/lib/python3.10/site-packages/git/cmd.py", line 1384, in get_object_header return self.__get_object_header(cmd, ref) File "/opt/conda/lib/python3.10/site-packages/git/cmd.py", line 1370, in __get_object_header cmd.stdin.flush()
2024-01-05T17:32:46.077+01:00 BrokenPipeError: [Errno 32] Broken pipe
2024-01-05T17:32:46.077+01:00 wandb: ERROR Internal wandb error: file data was not synced
2024-01-05T17:32:55.079+01:00 Problem at: /opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py 399 experiment
2024-01-05T17:32:55.079+01:00 wandb: ERROR transport failed
2024-01-05T17:32:55.079+01:00 Traceback (most recent call last):
2024-01-05T17:32:55.080+01:00
File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/opt/ml/code/train.py", line 92, in <module>
mnistTrainer.fit(model)
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit
call._call_and_handle_interrupt(
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 950, in _run
call._call_setup_hook(self) # allow user to setup lightning_module in accelerator environment
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 86, in _call_setup_hook
if hasattr(logger, "experiment"):
File "/opt/conda/lib/python3.10/site-packages/lightning_fabric/loggers/logger.py", line 118, in experiment
return fn(self)
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py", line 399, in experiment
self._experiment = wandb.init(**self._wandb_init)
File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/opt/ml/code/train.py", line 92, in <module> mnistTrainer.fit(model) File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit call._call_and_handle_interrupt( File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt return trainer_fn(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl self._run(model, ckpt_path=ckpt_path) File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 950, in _run call._call_setup_hook(self) # allow user to setup lightning_module in accelerator environment File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 86, in _call_setup_hook if hasattr(logger, "experiment"): File "/opt/conda/lib/python3.10/site-packages/lightning_fabric/loggers/logger.py", line 118, in experiment return fn(self) File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py", line 399, in experiment self._experiment = wandb.init(**self._wandb_init)
2024-01-05T17:32:55.080+01:00 File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1189, in init raise e File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1170, in init run = wi.init() File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 811, in init run_start_result = run_start_handle.wait(timeout=30) File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/mailbox.py", line 281, in wait raise MailboxError("transport failed")
2024-01-05T17:32:55.080+01:00
wandb.sdk.lib.mailbox.MailboxError: transport failed
wandb.sdk.lib.mailbox.MailboxError: transport failed
2024-01-05T17:32:55.081+01:00
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/opt/ml/code/train.py", line 92, in <module>
mnistTrainer.fit(model)
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit
call._call_and_handle_interrupt(
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 950, in _run
call._call_setup_hook(self) # allow user to setup lightning_module in accelerator environment
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 86, in _call_setup_hook
if hasattr(logger, "experiment"):
File "/opt/conda/lib/python3.10/site-packages/lightning_fabric/loggers/logger.py", line 118, in experiment
return fn(self)
File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py", line 399, in experiment
self._experiment = wandb.init(**self._wandb_init)
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1189, in init
raise e
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1170, in init
run = wi.init()
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 811, in init
run_start_result = run_start_handle.wait(timeout=30)
File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/mailbox.py", line 281, in wait
raise MailboxError("transport failed")
Traceback (most recent call last): File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/opt/ml/code/train.py", line 92, in <module> mnistTrainer.fit(model) File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit call._call_and_handle_interrupt( File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt return trainer_fn(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl self._run(model, ckpt_path=ckpt_path) File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 950, in _run call._call_setup_hook(self) # allow user to setup lightning_module in accelerator environment File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 86, in _call_setup_hook if hasattr(logger, "experiment"): File "/opt/conda/lib/python3.10/site-packages/lightning_fabric/loggers/logger.py", line 118, in experiment return fn(self) File "/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py", line 399, in experiment self._experiment = wandb.init(**self._wandb_init) File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1189, in init raise e File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1170, in init run = wi.init() File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 811, in init run_start_result = run_start_handle.wait(timeout=30) File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/lib/mailbox.py", line 281, in wait raise MailboxError("transport failed")
2024-01-05T17:32:55.081+01:00
wandb.sdk.lib.mailbox.MailboxError: transport failed
wandb.sdk.lib.mailbox.MailboxError: transport failed