CUDA Out of Memory During LoRA Sweep with LLaMA

I’m running hyperparameter optimization with LLaMA-8B, varying the LoRA rank across sweep trials. However, I consistently hit a CUDA out-of-memory error starting from the 4th trial.
Total trials: 20
GPU: NVIDIA A100 80GB PCIe
Batch size: 2
Gradient accumulation: 2 (also tried 4)
Allocator setting: os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
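
For context, the trials are driven in-process by wandb.agent, so all 20 runs share one Python process. The launch looks roughly like this (simplified sketch; the method, project name, and parameter ranges are placeholders, not my exact sweep config):

# Allocator setting goes at the very top, before torch initializes CUDA
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import wandb

sweep_config = {
    "method": "bayes",  # placeholder
    "metric": {"name": "eval_f1", "goal": "maximize"},
    "parameters": {
        "rank": {"values": [8, 16, 32]},              # placeholder ranks
        "learning_rate": {"min": 1e-5, "max": 1e-4},  # placeholder range
        # ... other hyperparameters
    },
}
sweep_id = wandb.sweep(sweep_config, project="my-project")  # placeholder project name
wandb.agent(sweep_id, function=train, count=20)  # all 20 trials run in this one process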

Training function (simplified):

def train(config=None):

    # Clear memory from any previous run
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        for i in range(torch.cuda.device_count()):
            torch.cuda.reset_peak_memory_stats(i)

    if config is None:
        config = wandb.config

    if wandb.run is None:
        wandb.init(
            project=" ",
            config=config,
            name=f"trial-{trial_number}",
        )
        wandb.define_metric("train/*", step_metric="epoch")
        wandb.define_metric("eval/*",  step_metric="epoch")


    cfg = wandb.config
    wandb.config.update({
        "lora_rank": cfg.rank,
        "learning_rate": cfg.learning_rate,
        # ... remaining sweep hyperparameters
    })

    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model_name,
        num_labels=2,
        device_map={"": 0},
        torch_dtype=torch.bfloat16
    )
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, add_prefix_space=True)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    peft_cfg = LoraConfig(
        r=cfg.rank,
        # ... remaining LoRA settings
    )
    model = get_peft_model(model, peft_cfg)
    
    global tokenized_dataset

    training_args = TrainingArguments(
        output_dir=f"./results/{wandb.run.id}",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=2,
        gradient_checkpointing=True,
        num_train_epochs=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=cfg.learning_rate,
        weight_decay=0.01,
        bf16=True,
        label_names=["labels"],
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1",
        greater_is_better=True,
        report_to="wandb",
        optim="adamw_torch"
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[LogPerEpochCallback(cfg)],
    )


    start_time = time.time()
    trainer.train()

    eval_results = trainer.evaluate()
    end_time = time.time()

    wandb.log({
        # ...
    })

    # Note: this does not print anything (possibly a separate issue)
    print_prediction_samples(model, tokenizer, tokenized_dataset["validation"], n=3)

    # Free all memory before the next trial
    del model, trainer, peft_cfg, training_args
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        for i in range(torch.cuda.device_count()):
            torch.cuda.reset_peak_memory_stats(i)

    return eval_results["eval_f1"]

Error:

wandb: ERROR torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 79.14 GiB of which 36.75 MiB is free. Process 2203064 has 79.09 GiB memory in use. Of the allocated memory 78.58 GiB is allocated by PyTorch, and 12.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
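
For what it’s worth, this is the kind of per-trial check I can add to confirm whether memory from the previous trial is actually released (hypothetical helper, not in the script above):

import torch

def log_gpu_memory(tag):
    # Memory actually handed out to tensors vs. held by the caching allocator
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"[{tag}] allocated={allocated:.2f} GiB, reserved={reserved:.2f} GiB")

If the allocated figure keeps climbing at the start of each new trial, something from the previous trial (model, optimizer state, or tensors held by the callback or the wandb run) is evidently still referenced despite the del / empty_cache() calls above.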

I’d appreciate any insights or recommendations on this issue!