import multiprocessing
import wandb
import multiprocessing as mp
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import fetch_openml
def train(X, y, config):
    """Cross-validate an SGD classifier and summarize its predictions.

    Args:
        X: Feature matrix handed to ``cross_val_predict``.
        y: Target labels; used both for fitting and as ground truth for the
            metrics. (The precision/recall/F1 calls use scikit-learn's
            defaults, so this presumably needs to be a binary target --
            verify against the caller.)
        config: Mapping of ``SGDClassifier`` keyword arguments; it is
            unpacked with ``**`` next to a fixed ``random_state=42``.

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``
        and ``'f1-score'``, each computed from the 3-fold out-of-fold
        predictions.
    """
    # Metric name -> scoring function, in the order the results are reported.
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1-score': f1_score,
    }
    classifier = SGDClassifier(random_state=42, **config)
    predictions = cross_val_predict(classifier, X, y, cv=3)
    return {name: scorer(y, predictions) for name, scorer in scorers.items()}
def main(X, y):
    """Run a single trial: start a W&B run, train, and log the metrics.

    Args:
        X: Feature matrix forwarded to ``train``.
        y: Target labels forwarded to ``train``.

    Returns:
        None. The metrics dict produced by ``train`` is logged to the run.
    """
    # Hold the run explicitly and use it as a context manager so it is always
    # closed when this function returns or raises, instead of relying on the
    # caller or interpreter shutdown to finish it.
    with wandb.init() as run:
        metrics = train(X, y, run.config)
        run.log(metrics)
def worker(sweep_id, X, y):
    """Pool task: run a sweep agent that executes up to two trials.

    Args:
        sweep_id: Identifier of the sweep the agent should pull trials from.
        X: Feature matrix passed through to ``main`` for every trial.
        y: Target labels passed through to ``main`` for every trial.
    """
    def _run_trial():
        # Zero-argument callable for the agent; binds this worker's data.
        return main(X, y)

    print('SweepId', sweep_id)
    wandb.agent(sweep_id, function=_run_trial, count=2)
if __name__ == "__main__":
    # NOTE(review): avoid committing a real API key to source control;
    # consider supplying it through the environment instead -- confirm
    # against the wandb.login documentation.
    wandb.login(key='<myapikey>')

    # Load MNIST and build a binary "is this digit a 5?" target.
    mnist = fetch_openml('mnist_784', as_frame=False, parser='auto')
    X, y = mnist.data, mnist.target
    X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
    y_train_5 = y_train == '5'
    y_test_5 = y_test == '5'

    # Random search over SGDClassifier hyperparameters, maximizing accuracy.
    sweep_configuration = {
        "method": "random",
        "metric": {"goal": "maximize", "name": "accuracy"},
        "parameters": {
            # 'loss': {'values': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']},
            # 'penalty': {'values': ['l1', 'l2', 'elasticnet']},
            'alpha': {'values': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
            'max_iter': {'values': [1000, 1500, 2000, 2500]},
        },
    }

    wandb.setup()
    sweep_id = wandb.sweep(sweep=sweep_configuration, project="visualize-mnist")

    CPU_COUNT = mp.cpu_count()
    print('CPU_COUNT', CPU_COUNT)

    # Leave two cores free, but never request fewer than one worker:
    # Pool raises ValueError for processes < 1 (machines with <= 2 cores).
    n_workers = max(1, CPU_COUNT - 2)

    # The context manager tears the pool down even if a worker raises, so
    # no child processes are leaked. starmap blocks until every task has
    # finished, so nothing is cut short on the normal path.
    with mp.Pool(processes=n_workers) as pool:
        pool.starmap(
            worker,
            [(sweep_id, X_train, y_train_5) for _ in range(n_workers)],
        )
I am trying to parallelize the sweep agent using the code above. When I execute it, after some time I get the error log below. Only a few runs execute successfully.
wandb: ERROR Connection to wandb service failed: [WinError 10061] No connection could be made because the target machine actively refused it.
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_manager.py", line 115, in _service_connect
svc_iface._svc_connect(port=port)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\service\service_sock.py", line 30, in _svc_connect
self._sock_client.connect(port=port)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\lib\sock_client.py", line 102, in connect
s.connect(("localhost", port))
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\pradh\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "C:\Users\pradh\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 51, in starmapstar
return list(itertools.starmap(args[0], args[1]))
File "C:\Users\pradh\DataspellProjects\CS7641-ML\sweep.py", line 32, in worker
wandb.agent(sweep_id, function=lambda: main(X, y), count=2)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\wandb_agent.py", line 579, in agent
wandb_sdk.wandb_login._login(_silent=True)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_login.py", line 277, in _login
wlogin.setup(kwargs)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_login.py", line 111, in setup
_logger = wandb.setup()._get_logger()
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 327, in setup
ret = _setup(settings=settings)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 320, in _setup
wl = _WandbSetup(settings=settings)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 303, in __init__
_WandbSetup._instance = _WandbSetup__WandbSetup(settings=settings, pid=pid)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 114, in __init__
self._setup()
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 250, in _setup
self._setup_manager()
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 277, in _setup_manager
self._manager = wandb_manager._Manager(settings=self._settings)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_manager.py", line 152, in __init__
wandb._sentry.reraise(e)
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\analytics\sentry.py", line 154, in reraise
raise exc.with_traceback(sys.exc_info()[2])
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_manager.py", line 150, in __init__
self._service_connect()
File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_manager.py", line 124, in _service_connect
raise ManagerConnectionRefusedError(message)
wandb.sdk.wandb_manager.ManagerConnectionRefusedError: Connection to wandb service failed: [WinError 10061] No connection could be made because the target machine actively refused it.
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\pradh\DataspellProjects\CS7641-ML\sweep.py", line 59, in <module>
pool.starmap(worker, [(sweep_id, X_train, y_train_5) for i in range(CPU_COUNT - 2)])
File "C:\Users\pradh\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 372, in starmap
return self._map_async(func, iterable, starmapstar, chunksize).get()
File "C:\Users\pradh\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 771, in get
raise self._value
wandb.sdk.wandb_manager.ManagerConnectionRefusedError: Connection to wandb service failed: [WinError 10061] No connection could be made because the target machine actively refused it.
Process finished with exit code 1
Is this an issue with wandb or my code?