Hi, I'm having the same issue, but I believe for a different reason. Here's the relevant part of my code:
```python
sweep_config = {
'method': 'random',
'metric': {'goal': 'maximize', 'name': 'dev_acc'},
'parameters':
{
'hidden_dropout_prob': {'min': 0, 'max': 1, 'distribution': 'uniform'},
'lr': {'min': 0, 'max': 0.1, 'distribution': 'uniform'}
}
}
def train_multitask(sweep_config, args):
wandb.init(project='hpt-multitask', config=sweep_config)
device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
# Load data
# Create the data and its corresponding datasets and dataloader
sst_train_data, num_labels, para_train_data, sts_train_data = load_multitask_data(args.sst_train, args.para_train, args.sts_train, split='train')
sst_dev_data, num_labels, para_dev_data, sts_dev_data = load_multitask_data(args.sst_dev, args.para_dev, args.sts_dev, split='train')
sst_train_data = SentenceClassificationDataset(sst_train_data, args)
sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)
sst_train_dataloader = DataLoader(sst_train_data, shuffle=True, batch_size=args.batch_size,
collate_fn=sst_train_data.collate_fn)
sst_dev_dataloader = DataLoader(sst_dev_data, shuffle=False, batch_size=args.batch_size,
collate_fn=sst_dev_data.collate_fn)
# Init model
config = {'hidden_dropout_prob': sweep_config.hidden_dropout_prob,
'num_labels': num_labels,
'hidden_size': 768,
'data_dir': '.',
'option': args.option}
config = SimpleNamespace(**config)
model = MultitaskBERT(config)
model = model.to(device)
lr = sweep_config.lr
optimizer = AdamW(model.parameters(), lr=lr)
best_dev_acc = 0
## new code by Riya
para_train_data = SentencePairDataset(para_train_data, args)
para_dev_data = SentencePairDataset(para_dev_data, args)
    para_train_dataloader = DataLoader(para_train_data, shuffle=True, batch_size=args.batch_size,
                                       collate_fn=para_train_data.collate_fn)
para_dev_dataloader = DataLoader(para_dev_data, shuffle=False, batch_size=args.batch_size,
collate_fn=para_dev_data.collate_fn)
sts_train_data = SentencePairDataset(sts_train_data, args, isRegression=True)
sts_dev_data = SentencePairDataset(sts_dev_data, args, isRegression=True)
    sts_train_dataloader = DataLoader(sts_train_data, shuffle=True, batch_size=args.batch_size,
                                      collate_fn=sts_train_data.collate_fn)
sts_dev_dataloader = DataLoader(sts_dev_data, shuffle=False, batch_size=args.batch_size,
collate_fn=sts_dev_data.collate_fn)
for epoch in range(args.epochs):
model.train()
train_loss_sst = 0
train_loss_para = 0
train_loss_sts = 0
num_batches_sst = 0
num_batches_para = 0
num_batches_sts = 0
for batch in tqdm(sts_train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE):
b_ids1, b_mask1, b_ids2, b_mask2, b_labels, b_sent_ids = (batch['token_ids_1'], batch['attention_mask_1'],
batch['token_ids_2'], batch['attention_mask_2'],
batch['labels'], batch['sent_ids'])
b_ids1 = b_ids1.to(device)
b_mask1 = b_mask1.to(device)
b_ids2 = b_ids2.to(device)
b_mask2 = b_mask2.to(device)
b_labels = b_labels.to(device)
optimizer.zero_grad()
logit = model.predict_similarity(b_ids1, b_mask1, b_ids2, b_mask2)
# ASK ON ED: DO WE CHANGE IF JUST ONE LOGIT
loss = F.mse_loss(logit.view(-1).float(), b_labels.view(-1).float(), reduction='sum') / args.batch_size
loss.backward()
optimizer.step()
train_loss_sts += loss.item()
num_batches_sts += 1
train_loss_sts = train_loss_sts / (num_batches_sts)
for batch in tqdm(sst_train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE):
b_ids, b_mask, b_labels = (batch['token_ids'],
batch['attention_mask'], batch['labels'])
b_ids = b_ids.to(device)
b_mask = b_mask.to(device)
b_labels = b_labels.to(device)
optimizer.zero_grad()
logits = model.predict_sentiment(b_ids, b_mask)
loss = F.cross_entropy(logits, b_labels.view(-1), reduction='sum') / args.batch_size
loss.backward()
optimizer.step()
train_loss_sst += loss.item()
num_batches_sst += 1
train_loss_sst = train_loss_sst / (num_batches_sst)
for batch in tqdm(para_train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE):
b_ids1, b_mask1, b_ids2, b_mask2, b_labels, b_sent_ids = (batch['token_ids_1'], batch['attention_mask_1'],
batch['token_ids_2'], batch['attention_mask_2'],
batch['labels'], batch['sent_ids'])
b_ids1 = b_ids1.to(device)
b_mask1 = b_mask1.to(device)
b_ids2 = b_ids2.to(device)
b_mask2 = b_mask2.to(device)
b_labels = b_labels.to(device)
optimizer.zero_grad()
logit = model.predict_paraphrase(b_ids1, b_mask1, b_ids2, b_mask2)
# ASK ON ED: DO WE CHANGE IF JUST ONE LOGIT
loss = F.binary_cross_entropy_with_logits(logit.view(-1).float(), b_labels.view(-1).float(), reduction='sum') / args.batch_size
loss.backward()
optimizer.step()
train_loss_para += loss.item()
num_batches_para += 1
train_loss_para = train_loss_para / (num_batches_para)
para_train_accuracy, para_y_pred, para_sent_ids, sst_train_accuracy, sst_y_pred, sst_sent_ids, sts_train_corr, \
sts_y_pred, sts_sent_ids = model_eval_multitask(sst_train_dataloader, para_train_dataloader, sts_train_dataloader, model, device)
para_dev_accuracy, para_y_pred, para_sent_ids, sst_dev_accuracy, sst_y_pred, sst_sent_ids, sts_dev_corr, \
sts_y_pred, sts_sent_ids = model_eval_multitask(sst_dev_dataloader, para_dev_dataloader,
sts_dev_dataloader, model, device)
        dev_acc = para_dev_accuracy + sst_dev_accuracy + (1 + sts_dev_corr) / 2  # transform correlation so it is on a 0-to-1 scale
if dev_acc > best_dev_acc:
best_dev_acc = dev_acc
save_model(model, optimizer, args, config, args.filepath)
print(
f"Epoch {epoch}: train loss sst:: {train_loss_sst :.3f}, train acc :: {sst_train_accuracy :.3f}, dev acc :: {sst_dev_accuracy :.3f}")
print(
f"Epoch {epoch}: train loss sts:: {train_loss_sts :.3f}, train corr :: {sts_train_corr :.3f}, dev acc :: {sts_dev_corr :.3f}")
print(
f"Epoch {epoch}: train loss para:: {train_loss_para :.3f}, train acc :: {para_train_accuracy :.3f}, dev acc :: {para_dev_accuracy :.3f}")
wandb.log({'dev_acc': dev_acc})
if __name__ == "__main__":
args = get_args()
args.filepath = f'{args.option}-{args.epochs}-{sweep_config.lr}-multitask.pt' # save path
seed_everything(args.seed) # fix the seed for reproducibility
train_multitask(sweep_config, args)
test_model(args)
sweep_id = wandb.sweep(sweep=sweep_config, project='hpt-multitask')
    wandb.agent(sweep_id, function=train_multitask)
```
I get `AttributeError: 'dict' object has no attribute 'lr'`, and the same for `hidden_dropout_prob`.
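If it helps, here is a minimal reproduction of what I think is happening: `sweep_config` is a plain `dict`, so attribute access like `sweep_config.lr` raises, and even indexing it would only return the `{'min': ..., 'max': ...}` range spec rather than a sampled value:

```python
# Minimal repro: sweep_config is a plain dict, so attribute access fails,
# and a dict lookup only returns the range spec, not a sampled number.
sweep_config = {'parameters': {'lr': {'min': 0, 'max': 0.1, 'distribution': 'uniform'}}}

sweep_config.lr                   # AttributeError: 'dict' object has no attribute 'lr'
sweep_config['parameters']['lr']  # {'min': 0, 'max': 0.1, 'distribution': 'uniform'}
```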
Any help would be much appreciated!
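Edit: after rereading the wandb sweep docs, here is a sketch of the pattern I think is intended: the sampled values arrive on `wandb.config` after `wandb.init()` inside the run, and `wandb.agent` calls the training function with no arguments, so I bind `args` with `functools.partial`. Does this direction look right?

```python
import functools
import wandb

def train_multitask(args):
    # Under a sweep, wandb.init() receives the sampled hyperparameters;
    # they appear on wandb.config, not on the sweep_config dict itself.
    wandb.init(project='hpt-multitask')
    lr = wandb.config.lr
    hidden_dropout_prob = wandb.config.hidden_dropout_prob
    ...  # rest of the training code from above

if __name__ == "__main__":
    args = get_args()
    seed_everything(args.seed)  # fix the seed for reproducibility
    args.filepath = f'{args.option}-{args.epochs}-multitask.pt'  # save path (lr isn't known until the run starts)
    sweep_id = wandb.sweep(sweep=sweep_config, project='hpt-multitask')
    # The agent invokes the function with no arguments, so bind args here.
    wandb.agent(sweep_id, function=functools.partial(train_multitask, args))
```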