Hi,
I am using a wandb sweep inside my script, and the script also calls
dist.init_process_group(
    backend=backend,
    world_size=world_size,
    rank=rank,
)
inside it. I get this error when the first sweep run finishes and the second one is about to start:
File "/arc/project/conda_envs/venv1/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 544, in init_process_group
raise RuntimeError("trying to initialize the default process group " "twice!")
RuntimeError: trying to initialize the default process group twice!
wandb: Waiting for W&B process to finish… (failed 1). Press Control-C to abort syncing.
wandb: View run swept-sweep-2 at: Weights & Biases
wandb: Synced 7 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)
wandb: Find logs at: /s/wandbdir/wandb/run-20240206_143802-c9vw3i1o/logs
Run c9vw3i1o errored: RuntimeError('trying to initialize the default process group twice!',)
wandb: ERROR Run c9vw3i1o errored: RuntimeError('trying to initialize the default process group twice!',)
if __name__ == "__main__":
    # Script entry point: parse the CLI arguments, load the sweep definition
    # from the YAML file given by --config, register the sweep with W&B and
    # let the agent execute up to 10 trials of main().
    import torch.distributed as dist

    args = parser.parse_args()
    seed = args.seed
    # NOTE(review): this value is overwritten by wandb.sweep() below, so a
    # sweep id passed on the command line is currently never reused.
    sweep_id = args.sweep_id

    # Read the config inside a context manager so the file handle is closed.
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; keep
    # it only for trusted config files (otherwise prefer yaml.SafeLoader).
    with open(args.config, "r") as config_file:
        sweep_config = yaml.load(config_file, Loader=yaml.Loader)

    def _run_trial():
        """Run one sweep trial, then tear down the default process group."""
        try:
            main(sweep_config)
        finally:
            # The reported failure ("trying to initialize the default process
            # group twice!") shows that the second trial runs in a process
            # where the group created by the first trial still exists.
            # Destroying it after every trial lets the next trial call
            # init_process_group() again.
            if dist.is_available() and dist.is_initialized():
                dist.destroy_process_group()

    # Define your sweep
    sweep_id = wandb.sweep(sweep_config, project="Swin_unet1", entity="zahragh995")
    wandb.agent(sweep_id, function=_run_trial, count=10)
And these are the first lines of my main function:
def main(config=None):
global args, cfg
# cfg = yaml.load(open(args.config, "r"), Loader=yaml.Loader)
with wandb.init(project="Swin_unet1", config=config, entity='zahragh995'):
cfg=wandb.config
print(cfg)
cfg["exp_path"] = os.path.dirname(args.config)
cfg["save_path"] = os.path.join(cfg["exp_path"], cfg["saver"]["snapshot_dir"])
cudnn.enabled = True
cudnn.benchmark = True
rank, word_size = setup_distributed(port=args.port)
sweep_config = yaml.load(open(args.config, "r"), Loader=yaml.Loader)
if rank == 0:
logger.info("{}".format(pprint.pformat(cfg)))
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
# print(osp.join(cfg["exp_path"], '/'.join(cfg["save_path"].rsplit('/', 2)[:-1]) +"/events_seg/" + current_time))
tb_logger = SummaryWriter(
osp.join(cfg["exp_path"], '/'.join(cfg["save_path"].rsplit('/', 2)[:-1]) +"/events_seg/" + current_time)
)
else:
tb_logger = None