Sometimes I get errors that are not logged to the logs area in wandb. Why does this happen, and how do I debug it? Current bug output:
Traceback (most recent call last):
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py", line 286, in hf_raise_for_status
response.raise_for_status()
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/requests/models.py", line 1021, in raise_for_status
raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 500 Server Error: Internal Server Error for url: https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/en/c4-validation.00005-of-00008.json.gz
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/lfs/ampere9/0/brando9/beyond-scale-language-data-diversity/src/training/train.py", line 336, in <module>
main()
File "/lfs/ampere9/0/brando9/beyond-scale-language-data-diversity/src/training/train.py", line 326, in main
train()
File "/lfs/ampere9/0/brando9/beyond-scale-language-data-diversity/src/training/train.py", line 314, in train
metrics = eval_hf_with_subsample('c4', 'en', 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=None)
File "/afs/cs.stanford.edu/u/brando9/beyond-scale-language-data-diversity/src/training/utils.py", line 393, in eval_hf_with_subsample
metrics = eval_hf(trainer, path, name, split,)
File "/afs/cs.stanford.edu/u/brando9/beyond-scale-language-data-diversity/src/training/utils.py", line 366, in eval_hf
metrics = trainer.evaluate()
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/transformers/trainer.py", line 3095, in evaluate
output = eval_loop(
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/transformers/trainer.py", line 3274, in evaluation_loop
for step, inputs in enumerate(dataloader):
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/accelerate/data_loader.py", line 687, in __iter__
next_batch, next_batch_info = self._fetch_batches(main_iterator)
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/accelerate/data_loader.py", line 608, in _fetch_batches
batches.append(next(iterator))
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
data = self._next_data()
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 674, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 32, in fetch
data.append(next(self.dataset_iter))
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/datasets/iterable_dataset.py", line 1384, in __iter__
for key, example in ex_iterable:
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/datasets/iterable_dataset.py", line 679, in __iter__
yield from self._iter()
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/datasets/iterable_dataset.py", line 694, in _iter
for key, example in iterator:
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/datasets/iterable_dataset.py", line 679, in __iter__
yield from self._iter()
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/datasets/iterable_dataset.py", line 701, in _iter
key_examples_list = [(key, example)] + list(iterator_batch)
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/datasets/iterable_dataset.py", line 1115, in __iter__
for key, example in self.ex_iterable:
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/datasets/iterable_dataset.py", line 234, in __iter__
yield from self.generate_examples_fn(**self.kwargs)
File "/lfs/ampere9/0/brando9/.cache/huggingface/modules/datasets_modules/datasets/c4/df532b158939272d032cc63ef19cd5b83e9b4d00c922b833e4cb18b2e9869b01/c4.py", line 89, in _generate_examples
for line in f:
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/gzip.py", line 314, in read1
return self._buffer.read1(size)
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/_compression.py", line 68, in readinto
data = self.read(len(byte_view))
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/gzip.py", line 494, in read
buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/gzip.py", line 97, in read
self.file.read(size-self._length+read)
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py", line 341, in read_with_retries
out = read(*args, **kwargs)
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/fsspec/spec.py", line 1856, in read
out = self.cache._fetch(self.loc, self.loc + length)
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/fsspec/caching.py", line 189, in _fetch
self.cache = self.fetcher(start, end) # new block replaces old
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/huggingface_hub/hf_file_system.py", line 626, in _fetch_range
hf_raise_for_status(r)
File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py", line 333, in hf_raise_for_status
raise HfHubHTTPError(str(e), response=response) from e
huggingface_hub.utils._errors.HfHubHTTPError: 500 Server Error: Internal Server Error for url: https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/en/c4-validation.00005-of-00008.json.gz (Request ID: Root=1-65c03190-35f1fa9d7f5d30b40b16db01;957ab364-e4b5-4e1c-8be5-01c349ffce28)
Internal Error - We're working hard to fix this as soon as possible!
Thanks in advance!