SFTTrainer creates an empty adapter_model.safetensors file on save when training with LoRA and DeepSpeed ZeRO-3 #4416

@maximus-21

Description

When training with SFTTrainer using peft and a DeepSpeed ZeRO-3 configuration, the saved adapter_model.safetensors file is only 40 bytes, i.e. empty. With DeepSpeed ZeRO-2, the same setup saves adapter_model.safetensors correctly at about 20 MB. When lora is set to false in the config (full fine-tuning), DeepSpeed also saves the sharded weights correctly under ZeRO-3, as expected. I have already verified that the number of trainable parameters is not zero and that training itself (loss and validation generation) behaves as expected with ZeRO-3. The issue is only with saving the adapter weights.
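
For completeness, one way to confirm that the 40-byte file really contains no tensors (and is not just truncated) is to list its keys with safetensors; the checkpoint path here is a placeholder:

from safetensors import safe_open

adapter_path = "output_dir/checkpoint-XXXX/adapter_model.safetensors"  # placeholder path
with safe_open(adapter_path, framework="pt") as f:
    keys = list(f.keys())
print(f"{len(keys)} tensors stored in {adapter_path}")
# Expected: 0 tensors for the ZeRO-3 run (header-only file); the ZeRO-2 run should list the lora_A/lora_B weights.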

Reproduction

"""
LoRA SFT Trainer with periodic validation generation logging
"""
import os
import torch
import yaml
import json
import random
import ast
import argparse
from datasets import load_dataset
from utils import *
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TrainerCallback
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, PeftModel
from huggingface_hub import login
from deepspeed.runtime.zero.config import ZeroStageEnum
from deepspeed.runtime.fp16.loss_scaler import LossScaler

os.environ["WANDB_DISABLED"] = "true"
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "true"

local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)


# DeepSpeed safe load
torch.serialization.add_safe_globals([ZeroStageEnum, LossScaler])

# ----------------------------
# Parse arguments
# ----------------------------
parser = argparse.ArgumentParser(description="LoRA SFT Trainer with YAML config")
parser.add_argument("--config", type=str, required=True, help="Path to train_config.yaml")
args = parser.parse_args()

# ----------------------------
# Load YAML config
# ----------------------------
with open(args.config, "r") as f:
    cfg = yaml.safe_load(f)

model_cfg = cfg["model"]
data_cfg = cfg["datasets"]
run_cfg = cfg["run"]

# ----------------------------
# Logging directory
# ----------------------------
logging_dir = os.path.join(run_cfg["output_dir"], run_cfg["exp_name"])

if local_rank == 0:
    os.makedirs(logging_dir, exist_ok=True)
    print(f"[INFO] Logging directory created at: {logging_dir}")

CACHE_DIR = run_cfg.get("cache_dir", None)

# ----------------------------
# Model + tokenizer
# ----------------------------
base_model_path = model_cfg["llm_path"]
resume_ckpt_path = model_cfg.get("ckpt", None) if run_cfg.get("resume_from_checkpoint", False) else None

print(f"[INFO] Loading base model: {base_model_path}")
model_config = AutoConfig.from_pretrained(base_model_path, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    cache_dir=CACHE_DIR,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16 if run_cfg.get("amp", True) else torch.float32,
    config=model_config
)
model.gradient_checkpointing_enable()
tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=True)

# ----------------------------
# Training dataset
# ----------------------------

dataset = load_dataset(
    "json",
    data_files=data_cfg["train_ann_path"],
    split="train",
    cache_dir=CACHE_DIR,
).shuffle(seed=run_cfg["seed"])

print("-" * 100)
print("Length of training dataset: ", len(dataset))
print("-" * 100)

# ----------------------------
# Validation dataset
# ----------------------------
val_dataset = None
if "valid_ann_path" in data_cfg:
    val_dataset = load_dataset(
        "json",
        data_files=data_cfg["valid_ann_path"],
        split="train",
        cache_dir=CACHE_DIR
    ).shuffle(seed=run_cfg["seed"])

    print("Length of validation dataset: ", len(val_dataset))
    
# ----------------------------
# LoRA / PEFT setup
# ----------------------------
peft_config = None
if model_cfg.get("lora", False):
    peft_config = LoraConfig(
        r=model_cfg["lora_rank"],
        lora_alpha=model_cfg["lora_alpha"],
        target_modules=model_cfg["target_modules"],
        lora_dropout=model_cfg["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    if resume_ckpt_path:
        try:
            print(f"[INFO] Resuming LoRA from checkpoint: {resume_ckpt_path}")
            model = PeftModel.from_pretrained(
                model,
                resume_ckpt_path,
                is_trainable=True,
                torch_dtype=torch.bfloat16
            )
            if not any(p.requires_grad for p in model.parameters()):
                raise ValueError("No trainable parameters found in PEFT model!")
        except Exception as e:
            print(f"[WARN] Failed to load PEFT checkpoint: {e}. Training LoRA from scratch using peft_config.")
else:
    if resume_ckpt_path:
        print(f"[INFO] Will resume full-model training from checkpoint: {resume_ckpt_path}")


# ----------------------------
# Training Arguments
# ----------------------------
optim_cfg = run_cfg["optims"]
training_arguments = SFTConfig(
    output_dir=logging_dir,
    bf16=run_cfg.get("amp", True),
    deepspeed=run_cfg.get("ds_config", "") if run_cfg.get("use_distributed", True) else None,
    optim=optim_cfg["optim"],
    per_device_train_batch_size=run_cfg["batch_size_train"],
    gradient_accumulation_steps=run_cfg["accum_grad_iters"],
    per_device_eval_batch_size=run_cfg["batch_size_eval"],
    log_level="debug",
    save_strategy="steps",
    save_steps=run_cfg["save_ckpt_freq_steps"],
    logging_steps=run_cfg["log_freq"],
    learning_rate=optim_cfg["init_lr"],
    weight_decay=optim_cfg["weight_decay"],
    num_train_epochs=optim_cfg["max_epoch"],
    lr_scheduler_type=optim_cfg["lr_scheduler"],
    warmup_steps=optim_cfg["warmup_steps"],
    #dataset_text_field="text",
    seed=run_cfg["seed"],
    max_length=model_cfg["max_seq_len"],
    dataset_kwargs={
        "add_special_tokens": False, # We template with special tokens
        "append_concat_token": True, # Add EOS token as separator token between examples
    }
)


# ----------------------------
# Trainer
# ----------------------------
trainer = SFTTrainer(
    model,
    args=training_arguments,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer
)

# ----------------------------
# Train
# ----------------------------
trainer.train()

DeepSpeed config:

{
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "none"
    },
    "offload_param": {
      "device": "none"
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto"
  },
  "bf16": {
    "enabled": true
  },
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "steps_per_print": 10,
  "wall_clock_breakdown": false,
  "gradient_clipping": 1.0
}
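
One possible mitigation to try (untested): DeepSpeed's stage3_gather_16bit_weights_on_model_save option, which is not set in the config above, consolidates ZeRO-3 sharded weights when the model is saved. Since the deepspeed argument of SFTConfig/TrainingArguments also accepts a dict, the flag could be injected before building the training arguments; whether this fixes the empty adapter file with peft is unverified:

import json

# Load the ZeRO-3 config and enable 16-bit weight consolidation on save.
with open(run_cfg["ds_config"]) as f:
    ds_config = json.load(f)
ds_config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True

training_arguments = SFTConfig(
    output_dir=logging_dir,
    deepspeed=ds_config,  # dict form, instead of the JSON file path
    # ... remaining arguments as in the reproduction script ...
)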

Training output:

***** Running training *****
  Num examples = 152,655
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9,542
  Number of trainable parameters = 10,186,752
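
Another workaround sketch (also untested): gather the ZeRO-3 sharded LoRA parameters explicitly after trainer.train(), so that rank 0 sees full tensors when calling save_pretrained; the adapter_manual_save directory name is arbitrary:

import deepspeed

unwrapped = trainer.accelerator.unwrap_model(trainer.model_wrapped)
lora_params = [p for n, p in unwrapped.named_parameters() if "lora_" in n]

# Inside this context the ZeRO-3 parameter shards are temporarily materialized as full tensors.
with deepspeed.zero.GatheredParameters(lora_params, modifier_rank=0):
    if trainer.accelerator.is_main_process:
        unwrapped.save_pretrained(os.path.join(logging_dir, "adapter_manual_save"))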
 

System Info

Name: trl
Version: 0.24.0

Name: peft
Version: 0.17.1

Name: transformers
Version: 4.57.1

Checklist

  • I have checked that my issue isn't already filed (see open issues)
  • I have included my system information
  • Any code provided is minimal, complete, and reproducible (more on MREs)
  • Any code provided is properly formatted in code blocks (no screenshots, more on code blocks)
  • Any traceback provided is complete
