Commit 256f8fe

Merge branch 'main' into nightly
2 parents: 8dbd008 + dee6de9

File tree

6 files changed: +337 -22 lines changed

tests/utils/test_qat.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
from unsloth import FastLanguageModel

from typing import Dict

import pytest
import torch
from torchao.quantization.qat import FakeQuantizedLinear
from torchao.quantization.qat.fake_quantizer import (
    FakeQuantizerBase,
    Float8FakeQuantizer,
    Int4WeightPreshuffledFakeQuantizer,
)


class _CountingFakeQuantizer(torch.nn.Module):
    """
    Dummy fake quantizer that counts the number of times it has been called.
    """
    def __init__(self):
        super().__init__()
        self.count = 0

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        self.count += 1
        return x


def _get_model(qat_scheme: str, full_finetuning: bool):
    """
    Return a 2-tuple of (model, tokenizer), where the model has been configured
    to use QAT. If `full_finetuning` is False, return the PEFT (LoRA) model.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Qwen3-1.7B",
        load_in_4bit = False,
        full_finetuning = full_finetuning,
        qat_scheme = qat_scheme if full_finetuning else None,
    )
    if not full_finetuning:
        model = FastLanguageModel.get_peft_model(
            model,
            qat_scheme = qat_scheme,
        )
    return model, tokenizer


def _test_linear_is_fake_quantized(linear: torch.nn.Linear, qat_scheme: str):
    """
    Verify that the given linear contains fake quantizers according to the `qat_scheme`.
    """
    if qat_scheme == "fp8-int4":
        act_fq_class = Float8FakeQuantizer
        weight_fq_class = Int4WeightPreshuffledFakeQuantizer
        min_in_features = 128
    elif qat_scheme == "fp8-fp8":
        act_fq_class = Float8FakeQuantizer
        weight_fq_class = Float8FakeQuantizer
        min_in_features = -1
    else:
        raise ValueError(f"Unknown qat_scheme: {qat_scheme}")

    # Check base layer activations and weights
    base_layer = getattr(linear, "base_layer", linear)
    if base_layer.in_features >= min_in_features:
        assert isinstance(base_layer, FakeQuantizedLinear)
        assert isinstance(base_layer.activation_fake_quantizer, act_fq_class)
        assert isinstance(base_layer.weight_fake_quantizer, weight_fq_class)

    # Check lora A and B (only for full_finetuning=False)
    if hasattr(linear, "lora_A") and hasattr(linear, "lora_B"):
        lora_A = linear.lora_A.default
        lora_B = linear.lora_B.default
        if lora_A.in_features >= min_in_features:
            assert isinstance(lora_A, FakeQuantizedLinear)
            assert isinstance(lora_A.activation_fake_quantizer, act_fq_class)
            assert isinstance(lora_A.weight_fake_quantizer, weight_fq_class)
        if lora_B.in_features >= min_in_features:
            assert isinstance(lora_B, FakeQuantizedLinear)
            assert isinstance(lora_B.activation_fake_quantizer, act_fq_class)
            assert isinstance(lora_B.weight_fake_quantizer, weight_fq_class)


def _test_fake_quantizers_are_called(
    model: torch.nn.Module,
    example_inputs: Dict,
    full_finetuning: bool,
):
    """
    Verify that the fake quantizers are actually called when the model is called.
    """
    def _swap_fake_quantizers(model: torch.nn.Module):
        for name, child in model.named_children():
            if isinstance(child, FakeQuantizerBase):
                setattr(model, name, _CountingFakeQuantizer())

    def _assert_fake_quantizers_are_called(model: torch.nn.Module):
        for name, child in model.named_children():
            if full_finetuning:
                if isinstance(child, FakeQuantizedLinear):
                    assert child.activation_fake_quantizer.count == 1
                    assert child.weight_fake_quantizer.count == 1
            else:
                # For LoRA, we only fake quantize the input activations once per block:
                # For self_attn, we only fake quantize the q_proj's input activations
                # For mlp, we only fake quantize the gate_proj's input activations
                if name == "self_attn":
                    base_layer = child.q_proj.base_layer
                    assert hasattr(base_layer, "activation_fake_quantizer")
                    assert base_layer.activation_fake_quantizer.count == 1
                elif name == "mlp":
                    base_layer = child.gate_proj.base_layer
                    assert hasattr(base_layer, "activation_fake_quantizer")
                    assert base_layer.activation_fake_quantizer.count == 1
                elif isinstance(child, FakeQuantizedLinear):
                    # Weight fake quantizers should always be called
                    assert child.weight_fake_quantizer.count == 1

    for k, v in example_inputs.items():
        example_inputs[k] = v.cuda()
    model.apply(_swap_fake_quantizers)
    model(**example_inputs)
    model.apply(_assert_fake_quantizers_are_called)


def _test_model_fake_quantize(qat_scheme: str, full_finetuning: bool):
    """
    Test that all linear layers in the model are fake quantized according to the `qat_scheme`.
    """
    model, tokenizer = _get_model(qat_scheme, full_finetuning)
    if full_finetuning:
        model = model.model
    else:
        model = model.base_model.model.model
    for layer in model.layers:
        _test_linear_is_fake_quantized(layer.self_attn.q_proj, qat_scheme)
        _test_linear_is_fake_quantized(layer.self_attn.k_proj, qat_scheme)
        _test_linear_is_fake_quantized(layer.self_attn.v_proj, qat_scheme)
        _test_linear_is_fake_quantized(layer.mlp.gate_proj, qat_scheme)
        _test_linear_is_fake_quantized(layer.mlp.up_proj, qat_scheme)
        _test_linear_is_fake_quantized(layer.mlp.down_proj, qat_scheme)
    inputs = tokenizer("How are you?", return_tensors="pt")
    _test_fake_quantizers_are_called(model, inputs, full_finetuning)


# TODO: there are bad interactions across tests right now, need to figure out
# how to disable model caching before re-enabling this test
@pytest.mark.parametrize("qat_scheme", ["fp8-int4", "fp8-fp8"])
def _test_full_model_fake_quantize(qat_scheme: str):
    _test_model_fake_quantize(qat_scheme, full_finetuning=True)


@pytest.mark.parametrize("qat_scheme", ["fp8-int4", "fp8-fp8"])
def test_lora_model_fake_quantize(qat_scheme: str):
    _test_model_fake_quantize(qat_scheme, full_finetuning=False)
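
For reference, a minimal usage sketch of the flow these tests exercise (not part of the commit): load the model with FastLanguageModel.from_pretrained, then pass qat_scheme to get_peft_model for the LoRA path, exactly as _get_model does above. The model name, scheme names, and prompt come from the test itself; everything else here is illustrative.

import torch
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B",   # same model the tests use
    load_in_4bit = False,
    full_finetuning = False,             # LoRA path, as in test_lora_model_fake_quantize
)
model = FastLanguageModel.get_peft_model(
    model,
    qat_scheme = "fp8-int4",             # or "fp8-fp8"
)

# Quick smoke test mirroring _test_fake_quantizers_are_called:
inputs = tokenizer("How are you?", return_tensors = "pt").to("cuda")
with torch.no_grad():
    model(**inputs)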

unsloth/dataprep/synthetic.py

Lines changed: 125 additions & 17 deletions
@@ -16,13 +16,16 @@
     "SyntheticDataKit",
 ]
 import subprocess
+import threading
+from collections import deque
 import time
 import os
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 import requests
 import torch
 import gc
 import time
+import re
 from unsloth_zoo.vllm_utils import (
     load_vllm,
     patch_vllm,
@@ -35,6 +38,100 @@
     synthetic_qa_config,
 )
 
+def terminate_tree(proc: subprocess.Popen, timeout=15):
+    if proc is None or proc.poll() is not None:
+        return
+
+    try:
+        import psutil
+        parent = psutil.Process(proc.pid)
+        for child in parent.children(recursive=True):
+            child.terminate()
+        parent.terminate()
+        parent.wait(timeout=timeout/2)
+        return
+    except:
+        pass
+
+    if os.name == 'nt':
+        try:
+            subprocess.run(
+                ['taskkill', '/T', '/F', '/PID', str(proc.pid)],
+                capture_output=True,
+                timeout=5
+            )
+            proc.wait(timeout=1)
+            return
+        except:
+            pass
+
+    proc.kill()
+    try:
+        proc.wait(timeout=5)
+    except:
+        pass
+
+class PipeCapture:
+    """Non-blocking pipe capture"""
+    def __init__(self, pipe, keep_lines=2000, echo=False, name="", text=True, encoding='utf-8', errors='replace', ready_regex=None):
+        self.pipe = pipe
+        self.buf = deque(maxlen=keep_lines)
+        self.lock = threading.Lock()
+        self.echo = echo
+        self.name = name
+        self.text = text
+        self.encoding = encoding
+        self.errors = errors
+
+        self.ready_event = threading.Event()
+        self.closed_event = threading.Event()
+
+        self.ready_regex = None
+        if ready_regex is not None:
+            if not hasattr(ready_regex, "search"):
+                ready_regex = re.compile(ready_regex)
+            self.ready_regex = ready_regex
+
+        self.t = threading.Thread(target=self._reader, daemon=True)
+        self.t.start()
+
+    def _reader(self):
+        try:
+            sentinel = '' if self.text else b''
+            for raw_line in iter(self.pipe.readline, sentinel):
+                if not self.text:
+                    line = raw_line.decode(self.encoding, self.errors)
+                else:
+                    line = raw_line
+                line = line.rstrip('\r\n')
+                if self.echo:
+                    if "platform is" not in line:
+                        print(f"{self.name}: {line}")
+
+                with self.lock:
+                    self.buf.append(line)
+
+                if self.ready_regex is not None and self.ready_regex.search(line):
+                    self.ready_event.set()
+
+        finally:
+            try: self.pipe.close()
+            except Exception: pass
+            self.closed_event.set()
+
+    def wait_for_ready(self, timeout=None):
+        return self.ready_event.wait(timeout)
+
+    def has_closed(self):
+        return self.closed_event.is_set()
+
+    def wait_until_closed(self, timeout=None):
+        return self.closed_event.wait(timeout)
+
+    def tail(self, n=200):
+        with self.lock:
+            return '\n'.join(list(self.buf)[-n:])
+
 class SyntheticDataKit:
     def __init__(
         self,
@@ -44,6 +141,7 @@ def __init__(
         float8_kv_cache = False,
         conservativeness = 1.0,
         token = None,
+        timeout = 1200,  # may not be enough for large models that still need to download
         **kwargs,
     ):
         assert(type(model_name) is str)
@@ -128,30 +226,40 @@ def __init__(
             stderr = subprocess.PIPE,
             start_new_session = True,
        )
+        ready_re = re.compile(r"Starting vLLM API server(?:\s+\d+)?\s+on\b")
         self.vllm_process = vllm_process
+        self.stdout_capture = PipeCapture(vllm_process.stdout, keep_lines = 1000,
+                                          echo = True, name = "vLLM STDOUT",
+                                          ready_regex = ready_re, text = False)
+        self.stderr_capture = PipeCapture(vllm_process.stderr, keep_lines = 2000,
+                                          echo = False, name = "vLLM STDERR",
+                                          ready_regex = None, text = False)
+        # we don't print stderr to the console, but self.stderr_capture.tail(200) returns the last 200 lines
 
-        ready_message_part = b"Starting vLLM API server on"
-        ready = False
-        while vllm_process.poll() is None:
-            output = vllm_process.stdout.readline()
-            if not output:
+        ready = self.stdout_capture.wait_for_ready(timeout = timeout)
+        if not ready:
+            if self.stdout_capture.has_closed() or self.vllm_process.poll() is not None:
                 print("Stdout stream ended before readiness message detected.")
-                break
-            output_str = output.decode('utf-8', errors='ignore').strip()
-            if "platform is" not in output_str:
-                print(f"vLLM STDOUT: {output_str}")
-            if ready_message_part in output:
-                print(f"\n--- vLLM Server Ready (Detected: '{ready_message_part.decode()}') ---")
-                ready = True
-                break
-            pass
+                print("\n--- stdout tail ---\n", self.stdout_capture.tail(50))
+                print("\n--- stderr tail ---\n", self.stderr_capture.tail(50))
+            else:
+                print(f"Unsloth: vllm_process failed to load! (timeout={timeout})")
+                print("\n--- stdout tail ---\n", self.stdout_capture.tail(50))
+                print("\n--- stderr tail ---\n", self.stderr_capture.tail(50))
+            terminate_tree(self.vllm_process)
+            return
+        else:
+            print("vLLM Server Ready Detected")
         pass
-        if vllm_process is None:
-            raise RuntimeError("Unsloth: vllm_process failed to load!")
+
         trial = 0
         while not self.check_vllm_status():
             if trial >= 100:
-                raise RuntimeError("Unsloth: vllm_process failed to load!")
+                print("Unsloth: vllm_process failed to load!")
+                print("\n--- stdout tail ---\n", self.stdout_capture.tail(50))
+                print("\n--- stderr tail ---\n", self.stderr_capture.tail(50))
+                terminate_tree(self.vllm_process)
+                return
            trial += 1
            time.sleep(1)
        return
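
Context for the change above: PipeCapture reads the child process's pipe on a daemon thread into a bounded deque, sets ready_event when the readiness regex matches a line, and sets closed_event when the pipe ends, so __init__ can wait with a timeout instead of blocking on readline. A rough usage sketch under assumed inputs (not part of the commit; the command and regex below are made up, while the real code watches vLLM's "Starting vLLM API server ... on" message):

import re
import subprocess

proc = subprocess.Popen(
    ["python", "-u", "-c", "print('server ready'); import time; time.sleep(60)"],
    stdout = subprocess.PIPE,
    stderr = subprocess.PIPE,
)
capture = PipeCapture(proc.stdout, keep_lines = 1000, echo = True,
                      name = "CHILD STDOUT",
                      ready_regex = re.compile(r"server ready"), text = False)

if capture.wait_for_ready(timeout = 30):
    print("Child process is ready")
else:
    print(capture.tail(50))    # last captured lines, for debugging
    terminate_tree(proc)       # kill the whole process tree, as __init__ now does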

unsloth/kernels/fast_lora.py

Lines changed: 6 additions & 0 deletions
@@ -14,6 +14,7 @@
 
 import torch
 from .utils import (
+    _maybe_fake_quantize_activations,
     fast_dequantize,
     QUANT_STATE,
     get_lora_parameters,
@@ -175,6 +176,7 @@ def backward(ctx, dY : torch.Tensor):
 
 from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel
 def apply_lora_mlp_swiglu(self, X, inplace = True):
+    X = _maybe_fake_quantize_activations(X, self.gate_proj)
     gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
     upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
     downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
@@ -190,6 +192,7 @@ def apply_lora_mlp_swiglu(self, X, inplace = True):
 
 from .geglu import geglu_exact_forward_kernel, geglu_exact_backward_kernel
 def apply_lora_mlp_geglu_exact(self, X, inplace = True):
+    X = _maybe_fake_quantize_activations(X, self.gate_proj)
     gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
     upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
     downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
@@ -205,6 +208,7 @@ def apply_lora_mlp_geglu_exact(self, X, inplace = True):
 
 from .geglu import geglu_approx_forward_kernel, geglu_approx_backward_kernel
 def apply_lora_mlp_geglu_approx(self, X):
+    X = _maybe_fake_quantize_activations(X, self.gate_proj)
     gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
     upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
     downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
@@ -360,6 +364,7 @@ def backward(ctx, dQ, dK, dV):
 
 
 def apply_lora_qkv(self, X, inplace = True):
+    X = _maybe_fake_quantize_activations(X, self.q_proj)
     QW, QW_quant, QA, QB, QS = get_lora_parameters(self.q_proj)
     KW, KW_quant, KA, KB, KS = get_lora_parameters(self.k_proj)
     VW, VW_quant, VA, VB, VS = get_lora_parameters(self.v_proj)
@@ -453,6 +458,7 @@ def backward(ctx, dY : torch.Tensor):
 
 
 def apply_lora_o(self, X):
+    X = _maybe_fake_quantize_activations(X, self.o_proj)
     OW, OW_quant, OA, OB, OS = get_lora_parameters(self.o_proj)
     O = LoRA_W.apply(X, OW, OW_quant, OA, OB, OS)
     return O
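
Each fused LoRA MLP/QKV/O path now routes its input through _maybe_fake_quantize_activations before the LoRA parameters are unpacked, so QAT activation fake-quantization also applies on Unsloth's fast kernels, matching what the new tests assert (one activation fake-quantize per attention/MLP block, on q_proj and gate_proj respectively). The helper itself is defined in unsloth/kernels/utils.py and is not shown in this diff; the following is a purely hypothetical sketch of its shape, not the actual implementation:

def _maybe_fake_quantize_activations(X, proj):
    # Hypothetical sketch: if QAT wrapped the projection's base layer in a
    # torchao FakeQuantizedLinear, run the input through its activation fake
    # quantizer; otherwise return X unchanged.
    base_layer = getattr(proj, "base_layer", proj)
    fake_quantizer = getattr(base_layer, "activation_fake_quantizer", None)
    if fake_quantizer is not None:
        X = fake_quantizer(X)
    return X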
