Hi everyone,
I’m using W&B Sweeps for hyperparameter tuning with PyTorch Lightning on two GPUs. The first run completes fine, but the sweep hangs at the start of the second run and never proceeds.
(When I run the same code on a single GPU, it works without any issues.)
Problem Description: after the first run completes, the next run stops right after the agent prints the run configuration and never starts training. Below is the log output where it gets stuck.
wandb: Agent Starting Run: 74qhtrgy with config:
wandb: activation_function: sigmoid
wandb: batch_size: 32
wandb: d_ff: 119
wandb: d_model: 56
wandb: dropout: 0.4621265312832923
wandb: epochs: 10
wandb: learning_rate: 0.045521224681459006
wandb: nhead: 7
wandb: num_layers: 7
wandb: weight_decay: 0.0067762018632429675
wandb: Tracking run with wandb version 0.17.6
wandb: Run data is saved locally in /home/bourzakismail/My Projects/wandb/run-20240813_145424-b1rmeete/files/wandb/run-20240813_145506-74qhtrgy
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run vibrant-sweep-2
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Code:
import os

import wandb
import lightning as L
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import EarlyStopping

# iTransformer, seq_len, and d_output are defined elsewhere in my script (omitted here).


def create_sweep_config():
    return {
        'method': 'bayes',
        'name': 'EXP1_iTransformer_SPEED_10_YEARS',
        'metric': {
            'name': 'val_loss',
            'goal': 'minimize'
        },
        'parameters': {
            'num_layers': {'distribution': 'int_uniform', 'min': 1, 'max': 10},
            'nhead': {'distribution': 'int_uniform', 'min': 2, 'max': 10},
            'd_model': {'distribution': 'int_uniform', 'min': 50, 'max': 200},
            'd_ff': {'distribution': 'int_uniform', 'min': 50, 'max': 200},
            'activation_function': {'values': ['relu', 'sigmoid', 'tanh']},
            'dropout': {'distribution': 'uniform', 'min': 0, 'max': 0.5},
            'learning_rate': {'distribution': 'uniform', 'min': 1e-5, 'max': 1e-1},
            'weight_decay': {'distribution': 'uniform', 'min': 1e-5, 'max': 1e-1},
            'batch_size': {'value': 32},
            'epochs': {'value': 10}
        }
    }
def train(config=None):
    wandb.init(config=config)
    config = wandb.config

    # Only the main process (no LOCAL_RANK set yet) redirects WANDB_DIR to the current run directory.
    if os.environ.get("LOCAL_RANK", None) is None:
        os.environ["WANDB_DIR"] = wandb.run.dir

    model = iTransformer(
        seq_len=seq_len,
        # nhead is rounded up to an even value, and d_model is scaled to be a multiple of it.
        d_model=(config.d_model if config.d_model % 2 == 0 else config.d_model + 1) * (config.nhead if config.nhead % 2 == 0 else config.nhead + 1),
        d_output=d_output,
        nhead=(config.nhead if config.nhead % 2 == 0 else config.nhead + 1),
        d_ff=config.d_ff,
        activation_function=config.activation_function,
        num_layers=config.num_layers,
        dropout=config.dropout,
        lr=config.learning_rate,
        weight_decay=config.weight_decay,
        batch_size=config.batch_size
    )

    wandb_logger = WandbLogger(log_model="all")
    trainer = L.Trainer(
        accelerator="gpu",
        devices=2,
        strategy='ddp',
        max_epochs=config.epochs,
        logger=wandb_logger,
        callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=10)]
    )
    trainer.fit(model)
    trainer.test(model)

    # Only rank 0 finishes the W&B run.
    if int(os.environ.get('LOCAL_RANK', 0)) == 0:
        wandb.finish()
def main():
    sweep_config = create_sweep_config()
    sweep_id = wandb.sweep(sweep_config, project="SpeedPL_iTransformer_Model_10_YEARS")
    wandb.agent(sweep_id, function=train, count=10)


if __name__ == "__main__":
    # Rank 0 launches the sweep agent; DDP worker processes (LOCAL_RANK set) call train() directly.
    if int(os.environ.get('LOCAL_RANK', 0)) == 0:
        main()
    else:
        train()
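In case it helps with diagnosing this, below is a minimal debugging sketch (not part of the script above) that I can call at the start and end of train() to see which process each sweep run is executing in. It only relies on LOCAL_RANK and wandb.run, which the script already uses; the helper name log_process_state is just a placeholder.

import os
import wandb

def log_process_state(tag):
    # Debug helper: report which process is running and whether a W&B run
    # is currently active at this point in the script.
    local_rank = os.environ.get("LOCAL_RANK")
    run_id = wandb.run.id if wandb.run is not None else None
    print(f"[{tag}] pid={os.getpid()} LOCAL_RANK={local_rank} wandb_run={run_id}", flush=True)

Calling log_process_state("train start") right after wandb.init() and log_process_state("train end") after trainer.test() should at least show whether the second run stalls on rank 0 or in the spawned DDP worker process.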
Environment:
Lightning version: 2.3.3
W&B version: 0.17.6
PyTorch version: 2.3.1+cu121
Python version: 3.10.12