I'm running a hyperparameter sweep over LLaMA-8B fine-tuning with varying LoRA ranks. However, I consistently hit a CUDA out-of-memory error starting from the 4th trial.
Total Trials: 20
GPU: NVIDIA A100 80GB PCIe
Batch size: 2
Gradient accumulation: 2 (also tried 4)
I also enable: os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
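For context, the variable is set near the top of the script, before any CUDA allocation happens, roughly like this (minimal sketch):

import os

# Allocator settings are only read when the CUDA caching allocator is
# first initialized, so this has to run before the first CUDA allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # imported only after the env var is set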
Training function (simplified):
import gc
import time

import torch
import wandb
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)
from peft import LoraConfig, get_peft_model


def train(config=None):
    # Clear memory left over from any previous trial
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        for i in range(torch.cuda.device_count()):
            torch.cuda.reset_peak_memory_stats(i)

    if config is None:
        config = wandb.config
    if wandb.run is None:
        wandb.init(
            project=" ",
            config=config,
            name=f"trial-{trial_number}",
        )
    wandb.define_metric("train/*", step_metric="epoch")
    wandb.define_metric("eval/*", step_metric="epoch")

    cfg = wandb.config
    wandb.config.update({
        "lora_rank": cfg.rank,
        "learning_rate": cfg.learning_rate,
        # ...
    })

    # Load the base model in bf16 on GPU 0
    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model_name,
        num_labels=2,
        device_map={"": 0},
        torch_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, add_prefix_space=True)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    # Wrap the base model with LoRA adapters
    peft_cfg = LoraConfig(
        r=cfg.rank,
        # ...
    )
    model = get_peft_model(model, peft_cfg)

    global tokenized_dataset

    training_args = TrainingArguments(
        output_dir=f"./results/{wandb.run.id}",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=2,
        gradient_checkpointing=True,
        num_train_epochs=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=cfg.learning_rate,
        weight_decay=0.01,
        bf16=True,
        label_names=["labels"],
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True,
        report_to="wandb",
        optim="adamw_torch",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[LogPerEpochCallback(cfg)],
    )

    start_time = time.time()
    trainer.train()
    eval_results = trainer.evaluate()
    end_time = time.time()

    wandb.log({
        # ...
    })

    # NOTE: this call does not print anything (separate issue)
    print_prediction_samples(model, tokenizer, tokenized_dataset["validation"], n=3)

    # Free all GPU memory before the next trial
    del model, trainer, peft_cfg, training_args
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        for i in range(torch.cuda.device_count()):
            torch.cuda.reset_peak_memory_stats(i)

    return eval_results["eval_f1"]
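For completeness, the sweep is launched with a single W&B agent in one process; the config below is only a sketch (the search method, rank values, and model id are placeholders, not my exact settings):

sweep_config = {
    "method": "random",  # placeholder: actual search method may differ
    "metric": {"name": "eval_f1", "goal": "maximize"},
    "parameters": {
        "rank": {"values": [8, 16, 32, 64]},              # example ranks
        "learning_rate": {"values": [1e-5, 5e-5, 2e-4]},  # example values
        "model_name": {"value": "meta-llama/Llama-3.1-8B"},  # placeholder model id
    },
}

sweep_id = wandb.sweep(sweep_config, project=" ")
wandb.agent(sweep_id, function=train, count=20)  # all 20 trials run in this one process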
Error:
wandb: ERROR torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 79.14 GiB of which 36.75 MiB is free. Process 2203064 has 79.09 GiB memory in use. Of the allocated memory 78.58 GiB is allocated by PyTorch, and 12.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
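To narrow this down, I can log allocator stats between trials with a small helper like this (a sketch; log_gpu_memory is not part of my current code):

def log_gpu_memory(tag):
    # Compare what PyTorch has actually allocated vs. merely reserved on GPU 0
    allocated = torch.cuda.memory_allocated(0) / 1024**3
    reserved = torch.cuda.memory_reserved(0) / 1024**3
    print(f"[{tag}] allocated: {allocated:.2f} GiB | reserved: {reserved:.2f} GiB")

Calling it before and after the cleanup block in train() should show whether memory actually drops back between trials.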
I'd appreciate any insights or recommendations on this issue!