I'm using the latest versions of vLLM and Unsloth with a custom LLaMA 3.3 70B model fine-tuned using Unsloth. I've attached my GRPO config and trainer setup below. I tried changing parameters and setting a custom optimizer, but it seems like Unsloth overwrites it anyway. Can anyone tell me what the issue could be here?
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from vllm import SamplingParams

# Load the 4-bit model with vLLM fast inference enabled
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=args.model_repo,
    max_seq_length=args.max_seq_length,
    load_in_4bit=True,
    dtype=None,
    fast_inference=True,
    max_lora_rank=64,
)
model.gradient_checkpointing_enable()
tokenizer.pad_token = tokenizer.eos_token

# Build the GRPO dataset from the robotics data
robotics_data = load_robotics_dataset(args.data_path)
grpo_dataset = prepare_grpo_dataset(robotics_data, tokenizer)

vllm_sampling_params = SamplingParams(
    min_p=0.1,
    top_p=1.0,
    top_k=-1,
    seed=3407,
    stop=[tokenizer.eos_token],
    include_stop_str_in_output=True,
)

training_args = GRPOConfig(
    vllm_sampling_params=vllm_sampling_params,
    temperature=1.0,
    learning_rate=5e-7,  # conservative for 70B
    weight_decay=0.01,
    warmup_steps=20,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    logging_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_generations=2,
    max_prompt_length=512,
    max_completion_length=1024,
    max_steps=args.max_steps,
    save_steps=100,
    report_to="none",
    output_dir="grpo_outputs",
    eval_strategy="steps",
    eval_steps=1,
    fp16_full_eval=True,
    eval_accumulation_steps=1,
)

grpo_dataset = grpo_dataset.train_test_split(test_size=0.1)

# Initialize GRPO trainer
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[hybrid_reward_function_with_llm_judge],
    args=training_args,
    train_dataset=grpo_dataset["train"],
    eval_dataset=grpo_dataset["test"],
)