Running sweep agent using multiprocessing.pool results in Connection Error

import multiprocessing
import wandb
import multiprocessing as mp
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import fetch_openml


def train(X, y, config):
    """Cross-validate an SGD classifier and summarise its predictions.

    Args:
        X: Feature matrix handed to ``cross_val_predict``.
        y: Target labels; also the reference the predictions are scored against.
        config: Mapping of hyperparameters unpacked into ``SGDClassifier``.

    Returns:
        A dict with the keys 'accuracy', 'precision', 'recall' and
        'f1-score', each computed from ``y`` and the 3-fold
        cross-validated predictions.
    """
    # Fixed seed so repeated trials with the same config are comparable.
    classifier = SGDClassifier(random_state=42, **config)
    predictions = cross_val_predict(classifier, X, y, cv=3)

    # Metric name -> scoring function; insertion order is the report order.
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1-score': f1_score,
    }
    return {name: score(y, predictions) for name, score in scorers.items()}


def main(X, y):
    """Run one trial: initialise W&B, train with its config, log the metrics.

    Args:
        X: Feature matrix forwarded to ``train``.
        y: Target labels forwarded to ``train``.
    """
    wandb.init()
    # wandb.config supplies the hyperparameters that train() unpacks.
    wandb.log(train(X, y, wandb.config))


def worker(sweep_id, X, y):
    """Pool task: start a sweep agent that runs two trials on the given data.

    Args:
        sweep_id: Identifier of the sweep the agent should attach to.
        X: Feature matrix used for every trial.
        y: Target labels used for every trial.
    """
    print('SweepId', sweep_id)

    def run_trial():
        # The agent is given a zero-argument callable, so bind the data here.
        return main(X, y)

    wandb.agent(sweep_id, function=run_trial, count=2)


if __name__ == "__main__":
    # Script entry point: build a binary "is it a 5?" MNIST task, register a
    # random-search sweep, and fan sweep agents out over a process pool.
    wandb.login(key='<myapikey>')
    mnist = fetch_openml('mnist_784', as_frame=False, parser='auto')
    X, y = mnist.data, mnist.target
    # First 60000 rows are used for training, the remainder for testing.
    X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
    # Targets are compared against the string '5' to get boolean labels.
    y_train_5 = y_train == '5'
    y_test_5 = y_test == '5'

    sweep_configuration = {
        "method": "random",
        "metric": {"goal": "maximize", "name": "accuracy"},
        "parameters": {
            # 'loss': {'values': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']},
            # 'penalty': {'values': ['l1', 'l2', 'elasticnet']},
            'alpha': {'values': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
            'max_iter': {'values': [1000, 1500, 2000, 2500]}
        }
    }

    wandb.setup()
    sweep_id = wandb.sweep(sweep=sweep_configuration, project="visualize-mnist")
    CPU_COUNT = multiprocessing.cpu_count()
    print('CPU_COUNT', CPU_COUNT)

    # Leave two cores free, but never request fewer than one worker:
    # Pool raises ValueError when processes < 1 (machines with <= 2 cores).
    n_workers = max(1, CPU_COUNT - 2)

    # The context manager guarantees the pool is torn down even if a worker
    # raises; close()/join() first lets workers exit normally on success.
    with mp.Pool(processes=n_workers) as pool:
        pool.starmap(worker, [(sweep_id, X_train, y_train_5) for _ in range(n_workers)])
        pool.close()
        pool.join()

I am trying to parallelize the sweep agent using the above code. When I execute it, after some time I get the error log below. Only a few runs execute successfully.

wandb: ERROR Connection to wandb service failed: [WinError 10061] No connection could be made because the target machine actively refused it. 
multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_manager.py", line 115, in _service_connect
    svc_iface._svc_connect(port=port)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\service\service_sock.py", line 30, in _svc_connect
    self._sock_client.connect(port=port)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\lib\sock_client.py", line 102, in connect
    s.connect(("localhost", port))
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\pradh\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\pradh\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\sweep.py", line 32, in worker
    wandb.agent(sweep_id, function=lambda: main(X, y), count=2)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\wandb_agent.py", line 579, in agent
    wandb_sdk.wandb_login._login(_silent=True)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_login.py", line 277, in _login
    wlogin.setup(kwargs)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_login.py", line 111, in setup
    _logger = wandb.setup()._get_logger()
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 327, in setup
    ret = _setup(settings=settings)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 320, in _setup
    wl = _WandbSetup(settings=settings)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 303, in __init__
    _WandbSetup._instance = _WandbSetup__WandbSetup(settings=settings, pid=pid)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 114, in __init__
    self._setup()
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 250, in _setup
    self._setup_manager()
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_setup.py", line 277, in _setup_manager
    self._manager = wandb_manager._Manager(settings=self._settings)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_manager.py", line 152, in __init__
    wandb._sentry.reraise(e)
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\analytics\sentry.py", line 154, in reraise
    raise exc.with_traceback(sys.exc_info()[2])
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_manager.py", line 150, in __init__
    self._service_connect()
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\venv\lib\site-packages\wandb\sdk\wandb_manager.py", line 124, in _service_connect
    raise ManagerConnectionRefusedError(message)
wandb.sdk.wandb_manager.ManagerConnectionRefusedError: Connection to wandb service failed: [WinError 10061] No connection could be made because the target machine actively refused it. 
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\pradh\DataspellProjects\CS7641-ML\sweep.py", line 59, in <module>
    pool.starmap(worker, [(sweep_id, X_train, y_train_5) for i in range(CPU_COUNT - 2)])
  File "C:\Users\pradh\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 372, in starmap
    return self._map_async(func, iterable, starmapstar, chunksize).get()
  File "C:\Users\pradh\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 771, in get
    raise self._value
wandb.sdk.wandb_manager.ManagerConnectionRefusedError: Connection to wandb service failed: [WinError 10061] No connection could be made because the target machine actively refused it. 

Process finished with exit code 1

Is this an issue with wandb or my code?

Hello @vikrant17 !

Based on your code, it looks like each process will be starting up a wandb.agent(), which is not recommended. An agent is our process for launching new sweeps, rather than sweeps launching new agents. What I would advise is to parallelize the agents, which would create multiple agents to run your sweep (which is what it looks like you are attempting to do).

Hi Vikrant, since we have not heard back from you we are going to close this request. If you would like to re-open the conversation, please let us know!

Thanks for the info.

I was able to parallelize the agents using joblib package.

This topic was automatically closed 60 days after the last reply. New replies are no longer allowed.