GSPO Reinforcement Learning
Train with GSPO (Group Sequence Policy Optimization) RL in Unsloth.
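GSPO, introduced by the Qwen team, replaces GRPO's per-token importance ratios with a single, length-normalized ratio computed over the whole sampled sequence, which the paper argues stabilizes clipping for long completions and mixture-of-experts models. As a sketch of the core idea (notation follows the GSPO paper, not this page), the ratio for a completion y_i sampled for prompt x is:

s_i(\theta) = \left( \frac{\pi_\theta(y_i \mid x)}{\pi_{\theta_\mathrm{old}}(y_i \mid x)} \right)^{1/|y_i|}

This ratio is then clipped and weighted by the group-normalized advantage, just as in GRPO, but at the sequence level. In TRL's GRPOConfig, switching to GSPO is a one-line change: importance_sampling_level = "sequence", as in the config below.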
from trl import GRPOConfig

training_args = GRPOConfig(
    output_dir = "vlm-grpo-unsloth",
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 4,
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    # beta = 0.00, # KL penalty coefficient; set to 0 to disable the KL term
    epsilon = 3e-4,      # lower clipping bound for the importance ratio
    epsilon_high = 4e-4, # upper clipping bound
    num_generations = 8, # completions sampled per prompt (the "group")
    max_prompt_length = 1024,
    max_completion_length = 1024,
    log_completions = False,
    max_grad_norm = 0.1,
    temperature = 0.9,
    num_train_epochs = 2, # For a quick test run; increase for full training
    report_to = "none",   # Set to "wandb" to log to Weights & Biases

    # GSPO: compute the importance ratio at the sequence level
    importance_sampling_level = "sequence",
    # Loss formulation: Dr. GRPO (variants like DAPO etc. are also supported)
    loss_type = "dr_grpo",
)
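Note that the clipping range here (epsilon = 3e-4, epsilon_high = 4e-4) is far narrower than GRPO's typical 0.2, matching the sequence-level ranges reported in the GSPO paper. Also, TRL requires the global train batch size to be divisible by num_generations; with per_device_train_batch_size = 8 and num_generations = 8, a single-GPU run satisfies this.

For context, below is a minimal, hypothetical sketch of how such a config is passed to TRL's GRPOTrainer together with an Unsloth model. The model name, toy dataset, and simple_reward function are illustrative assumptions, not part of this page; replace them with your own (for a vision model, as the output_dir name suggests, Unsloth's FastVisionModel is the analogous entry point).

# Hypothetical usage sketch; model, dataset, and reward are placeholders.
from unsloth import FastLanguageModel
from datasets import Dataset
from trl import GRPOTrainer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-7B-Instruct",  # placeholder model
    max_seq_length = 2048,
    load_in_4bit = True,
)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
)

# Toy dataset: GRPOTrainer expects a "prompt" column.
dataset = Dataset.from_dict({
    "prompt": ["Summarize GSPO in one sentence."] * 64,
})

def simple_reward(completions, **kwargs):
    # Placeholder reward: mildly prefer shorter completions.
    return [-len(c) / 1000.0 for c in completions]

trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [simple_reward],
    args = training_args,  # the GSPO config defined above
    train_dataset = dataset,
)
trainer.train()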