Hi @ramit_goolry!
After a bit more tweaking, I wrote a secondary script that intercepts the configurations from the API using a local controller. I then saved each configuration to a YAML file, which I loaded using argparse.
Nevertheless, the parallelization issue still persists, although irregularly. The job sometimes crashes even when using the exact same configuration file, without any changes to my code:
Traceback (most recent call last):
File "/software/software/Python/3.8.6-GCCcore-10.2.0/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/software/software/Python/3.8.6-GCCcore-10.2.0/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/user/project/src/sweep_mpi.py", line 1, in <module>
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 971, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 914, in _find_spec
File "<frozen importlib._bootstrap_external>", line 1342, in find_spec
File "<frozen importlib._bootstrap_external>", line 1314, in _get_spec
File "<frozen importlib._bootstrap_external>", line 1443, in find_spec
File "<frozen importlib._bootstrap_external>", line 1483, in _fill_cache
BrokenPipeError: [Errno 108] Cannot send after transport endpoint shutdown: '/home/user/project'
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[19849,1],3]
Exit code: 1
--------------------------------------------------------------------------
This seems to result from a read operation timeout, as indicated by the debug log:
2022-05-19 10:23:11,048 ERROR SenderThread:7114 [retry.py:__call__():126] Retry attempt failed:
Traceback (most recent call last):
File "/data/user/.envs/osim/lib/python3.8/site-packages/urllib3/connectionpool.py", line 449, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/data/user/.envs/osim/lib/python3.8/site-packages/urllib3/connectionpool.py", line 444, in _make_request
httplib_response = conn.getresponse()
File "/software/software/Python/3.8.6-GCCcore-10.2.0/lib/python3.8/http/client.py", line 1347, in getresponse
response.begin()
File "/software/software/Python/3.8.6-GCCcore-10.2.0/lib/python3.8/http/client.py", line 307, in begin
version, status, reason = self._read_status()
File "/software/software/Python/3.8.6-GCCcore-10.2.0/lib/python3.8/http/client.py", line 268, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/software/software/Python/3.8.6-GCCcore-10.2.0/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
File "/software/software/Python/3.8.6-GCCcore-10.2.0/lib/python3.8/ssl.py", line 1241, in recv_into
return self.read(nbytes, buffer)
File "/software/software/Python/3.8.6-GCCcore-10.2.0/lib/python3.8/ssl.py", line 1099, in read
return self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/data/user/.envs/osim/lib/python3.8/site-packages/requests/adapters.py", line 440, in send
resp = conn.urlopen(
File "/data/user/.envs/osim/lib/python3.8/site-packages/urllib3/connectionpool.py", line 785, in urlopen
retries = retries.increment(
File "/data/user/.envs/osim/lib/python3.8/site-packages/urllib3/util/retry.py", line 550, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/data/user/.envs/osim/lib/python3.8/site-packages/urllib3/packages/six.py", line 770, in reraise
raise value
File "/data/user/.envs/osim/lib/python3.8/site-packages/urllib3/connectionpool.py", line 703, in urlopen
httplib_response = self._make_request(
File "/data/user/.envs/osim/lib/python3.8/site-packages/urllib3/connectionpool.py", line 451, in _make_request
self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
File "/data/user/.envs/osim/lib/python3.8/site-packages/urllib3/connectionpool.py", line 340, in _raise_timeout
raise ReadTimeoutError(
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='api.wandb.ai', port=443): Read timed out. (read timeout=10)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/data/user/.envs/osim/lib/python3.8/site-packages/wandb/sdk/lib/retry.py", line 102, in __call__
result = self._call_fn(*args, **kwargs)
File "/data/user/.envs/osim/lib/python3.8/site-packages/wandb/sdk/internal/internal_api.py", line 140, in execute
return self.client.execute(*args, **kwargs)
File "/data/user/.envs/osim/lib/python3.8/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 52, in execute
result = self._get_result(document, *args, **kwargs)
File "/data/user/.envs/osim/lib/python3.8/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 60, in _get_result
return self.transport.execute(document, *args, **kwargs)
File "/data/user/.envs/osim/lib/python3.8/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/transport/requests.py", line 38, in execute
request = requests.post(self.url, **post_args)
File "/data/user/.envs/osim/lib/python3.8/site-packages/requests/api.py", line 117, in post
return request('post', url, data=data, json=json, **kwargs)
File "/data/user/.envs/osim/lib/python3.8/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/data/user/.envs/osim/lib/python3.8/site-packages/requests/sessions.py", line 529, in request
resp = self.send(prep, **send_kwargs)
File "/data/user/.envs/osim/lib/python3.8/site-packages/requests/sessions.py", line 645, in send
r = adapter.send(request, **kwargs)
File "/data/user/.envs/osim/lib/python3.8/site-packages/requests/adapters.py", line 532, in send
raise ReadTimeout(e, request=request)
requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='api.wandb.ai', port=443): Read timed out. (read timeout=10)