
Commit 2f11d8d

Enable FP8 + RL training for bf16 models
**Summary:** Enable FP8 + RL training using TorchAO for 1.33x faster training and 42% less model memory usage:

- We quantize the frozen base weights to fp8 and keep the LoRA adapters in bf16
- We leverage TorchAO's `Float8Tensor`, which calls into fbgemm's fp8 x fp8 rowwise matmul kernel
- For now, we need to do an offline quantization first, because vllm doesn't support on-the-fly quantization for torchao yet (in progress: vllm-project/vllm#26327)

**Example usage:**

```
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-8B-Base",
    max_seq_length = 2048,
    load_in_4bit = False,
    fast_inference = True,
    max_lora_rank = 32,
    load_in_fp8 = True, # set this to True
)

# the rest is the same as before
model = FastLanguageModel.get_peft_model(...)
```

**Initial results:**

```
# fp8
{'train_runtime': 1725.4337, 'train_samples_per_second': 0.232, 'train_steps_per_second': 0.058, 'train_loss': 0.00015715716748673002, 'epoch': 0.01}

# bf16
{'train_runtime': 2297.8145, 'train_samples_per_second': 0.174, 'train_steps_per_second': 0.044, 'train_loss': 0.00016081033063528594, 'epoch': 0.01}
```

<img width="1199" height="448" alt="Screenshot 2025-11-11 at 4 10 50 PM" src="https://github.com/user-attachments/assets/b6304afd-89e9-42b1-8064-775807e17b23" />

Test script: https://gist.github.com/andrewor14/5b85119fae46845d07b608d420907423

**Requires:**

- pytorch/ao#3158 (torchao nightly or 0.15.0+)
- unslothai/unsloth-zoo#351
1 parent: b9d9600
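As a rough illustration of the scheme described in the summary (not the actual Unsloth code path), the sketch below quantizes a frozen base `nn.Linear` to fp8 with TorchAO's rowwise `Float8DynamicActivationFloat8WeightConfig` and adds a bf16 LoRA delta on top. The layer sizes, `rank`, and the `lora_A`/`lora_B`/`lora_forward` names are illustrative assumptions.

```python
# Minimal sketch (not the Unsloth implementation): fp8 rowwise base weight + bf16 LoRA delta.
# Assumes torchao 0.15.0+ and an fp8-capable CUDA GPU; sizes and names are illustrative.
import torch
import torch.nn as nn
from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig, PerRow

base = nn.Linear(4096, 4096, bias = False).to(torch.bfloat16).cuda()
quantize_(base, Float8DynamicActivationFloat8WeightConfig(granularity = PerRow()))
# base.weight is now held as a torchao fp8 tensor subclass; its matmuls should
# dispatch to fp8 x fp8 rowwise kernels on supported hardware.

# Frozen fp8 base + trainable bf16 LoRA adapters (rank 32 here is arbitrary)
rank = 32
lora_A = nn.Parameter(torch.randn(rank, 4096, dtype = torch.bfloat16, device = "cuda") * 0.01)
lora_B = nn.Parameter(torch.zeros(4096, rank, dtype = torch.bfloat16, device = "cuda"))

def lora_forward(x):
    # base matmul runs in fp8, LoRA delta stays in bf16
    return base(x) + (x @ lora_A.t()) @ lora_B.t()

x = torch.randn(2, 4096, dtype = torch.bfloat16, device = "cuda")
out = lora_forward(x)
```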

File tree

6 files changed (+234, -10 lines)


unsloth/kernels/utils.py

Lines changed: 30 additions & 2 deletions
```diff
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
 import triton
 import ctypes
 
@@ -211,6 +212,10 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
 torch_bfloat16 = torch.bfloat16
 
 
+# Whether torchao can be imported
+_HAS_TORCHAO = importlib.util.find_spec("torchao") is not None
+
+
 def QUANT_STATE(W):
     return getattr(W, "quant_state", None)
 
@@ -329,12 +334,33 @@ def _maybe_fake_quantize_activations(
     return X
 
 
+def _maybe_dequantize_torchao_float8_tensor(w: torch.Tensor) -> torch.Tensor:
+    """
+    Dequantize `w` if it is a `torchao.quantization.Float8Tensor` and only
+    during the backward pass, when the tensor is no longer rowwise scaled
+    because it's been transposed.
+    """
+    if not _HAS_TORCHAO:
+        return w
+    from torchao.quantization import Float8Tensor
+    if not isinstance(w, Float8Tensor):
+        return w
+    # In the backward pass, rowwise scaled becomes colwise scaled after we
+    # transpose the weight tensor. Use this case to detect backward
+    assert w.ndim == 2
+    if w.block_size[0] == w.shape[0] and w.block_size[1] == 1:
+        return w.dequantize()
+    else:
+        return w
+
+
 # INTEL GPU Specific Logic
 if DEVICE_TYPE == "xpu" and HAS_XPU_STREAM:
 
     @torch.inference_mode
     def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False):
         # TODO: After adding XPU BNB support, check this function
+        W = _maybe_dequantize_torchao_float8_tensor(W)
         if quant_state is None:
             return W
         if W.dtype == torch.float8_e4m3fn:
 
@@ -441,6 +467,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False
 
     @torch.inference_mode
     def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False):
+        W = _maybe_dequantize_torchao_float8_tensor(W)
         if quant_state is None:
             return W
         if W.dtype == torch.float8_e4m3fn:
 
@@ -551,6 +578,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False
 
     @torch.inference_mode
     def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False):
+        W = _maybe_dequantize_torchao_float8_tensor(W)
         if quant_state is None:
             return W
         if W.dtype == torch.float8_e4m3fn:
 
@@ -987,8 +1015,8 @@ def matmul_lora(X, W, W_quant, A, B, s, out = None):
     if W.dtype == torch.float8_e4m3fn:
         out = fp8_linear(X, W, W_quant)
     else:
-        W = fast_dequantize(W.t(), W_quant, use_global_buffer = True)
-        out = torch_matmul(X, W, out = out)
+        W = fast_dequantize(W, W_quant, use_global_buffer = True)
+        out = torch_matmul(X, W.t(), out = out)
     if W_quant is not None:
         del W
```

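For context on the `block_size` check in `_maybe_dequantize_torchao_float8_tensor`: with rowwise scaling, a weight of shape `(N, K)` carries one scale per row, so its blocks span whole rows; after `.t()` the shape is `(K, N)` and each block spans a whole column, which is what the helper treats as the backward-pass case. Below is a small, hedged sketch of that shape logic using plain tuples, no torchao; the exact `block_size` convention is an assumption inferred from the code above.

```python
# Hedged sketch of the block_size check in _maybe_dequantize_torchao_float8_tensor.
# Assumes rowwise scaling stores block_size == (1, K) for a (N, K) weight, as implied above.
def looks_colwise_scaled(shape, block_size):
    """True when a 2D tensor's scaling blocks span whole columns (one scale per column)."""
    assert len(shape) == 2 and len(block_size) == 2
    return block_size[0] == shape[0] and block_size[1] == 1

N, K = 4096, 11008
print(looks_colwise_scaled((N, K), (1, K)))   # False: rowwise-scaled forward weight
print(looks_colwise_scaled((K, N), (K, 1)))   # True: the same weight after .t(), seen in backward
```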
unsloth/models/_utils.py

Lines changed: 20 additions & 1 deletion
```diff
@@ -2012,7 +2012,7 @@ def error_out_no_vllm(*args, **kwargs):
 
 @dataclass
 class TorchAOConfig:
-    qat_scheme: str = "int4"
+    qat_scheme: Optional[str] = "int4"
 
     # Each (config, filter_fn) pair defines a quantization rule
     base_config_and_filter_fns: List[
 
@@ -2262,3 +2262,22 @@ def verify_fp8_support_if_applicable(model_config):
         raise ValueError(
             f"Unsloth: FP8 quantization is only supported on L4 and higher GPUs with compute capability 8.9 or higher. You are using {torch.cuda.get_device_name()}. Refer to https://developer.nvidia.com/cuda-gpus for more details."
         )
+
+
+def _get_inference_mode_context_manager(model: torch.nn.Module):
+    """
+    If the state dict was quantized using torchao, calling ops like `aten.t()`
+    in inference mode fails with the following error. This is a bug in PyTorch
+    that affects all tensor subclasses:
+
+        Cannot set version_counter for inference tensor
+
+    For now, we work around this issue by using `torch.no_grad()` in this case.
+    See https://github.com/pytorch/pytorch/issues/164872 for more details.
+    Otherwise, just return `torch.inference_mode()`.
+    """
+    torchao_config = getattr(model, "torchao_config", None)
+    if torchao_config is not None and torchao_config.qat_scheme is None:
+        return torch.no_grad()
+    else:
+        return torch.inference_mode()
```

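A short, hedged usage sketch of the new helper, mirroring how `unsloth_fast_generate` uses it in `llama.py` below; `generate_safely` is a hypothetical wrapper, and `model` is any module that may carry the `torchao_config` attribute set by the loader.

```python
# Hedged usage sketch: pick no_grad() vs inference_mode() based on the model's torchao_config.
from unsloth.models._utils import _get_inference_mode_context_manager

def generate_safely(model, *args, **kwargs):
    # For fp8 (torchao) checkpoints, torchao_config.qat_scheme is None, so this returns
    # torch.no_grad() and avoids the inference-tensor version_counter bug; otherwise it
    # returns torch.inference_mode() as before.
    with _get_inference_mode_context_manager(model):
        return model.generate(*args, **kwargs)
```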
unsloth/models/llama.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -21,7 +21,10 @@
 from ._utils import patch_unsloth_smart_gradient_checkpointing
 from ._utils import __version__, importlib_version
 from ._utils import move_to_device
-from ._utils import _prepare_model_for_qat
+from ._utils import (
+    _get_inference_mode_context_manager,
+    _prepare_model_for_qat,
+)
 from torch.nn.functional import scaled_dot_product_attention
 from transformers import __version__ as transformers_version
 from unsloth_zoo.utils import Version, _get_dtype
 
@@ -2030,7 +2033,7 @@ def unsloth_fast_generate(
 
     # Mixed precision autocast
     with (
-        torch.inference_mode(),
+        _get_inference_mode_context_manager(self),
        torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = dtype),
     ):
         output = self._old_generate(*args, **kwargs)
```

unsloth/models/loader.py

Lines changed: 47 additions & 3 deletions
```diff
@@ -31,7 +31,12 @@
 from transformers import AutoConfig
 from transformers import __version__ as transformers_version
 from peft import PeftConfig, PeftModel
-from .loader_utils import get_model_name
+from .loader_utils import (
+    _check_load_in_fp8_settings,
+    _offline_quantize_to_fp8,
+    _tag_model_with_fp8_torchao_config,
+    get_model_name,
+)
 import os, contextlib, sys
 
 try:
 
@@ -140,6 +145,7 @@ def from_pretrained(
     max_lora_rank = 64,
     disable_log_stats = True,
     qat_scheme = None,
+    load_in_fp8 = False, # fp8 LoRA
     *args,
     **kwargs,
 ):
 
@@ -183,6 +189,7 @@ def from_pretrained(
             max_lora_rank = max_lora_rank,
             disable_log_stats = disable_log_stats,
             qat_scheme = qat_scheme,
+            load_in_fp8 = load_in_fp8,
             *args,
             **kwargs,
         )
 
@@ -212,9 +219,23 @@ def from_pretrained(
         )
         load_in_4bit = False
 
+    if load_in_fp8:
+        _check_load_in_fp8_settings(
+            fast_inference,
+            full_finetuning,
+            load_in_4bit,
+            load_in_8bit,
+            load_in_16bit,
+            use_exact_model_name,
+        )
+
     old_model_name = model_name
     if not use_exact_model_name:
-        model_name = get_model_name(model_name, load_in_4bit)
+        if load_in_fp8:
+            model_name = _offline_quantize_to_fp8(model_name)
+        else:
+            model_name = get_model_name(model_name, load_in_4bit)
+
     # Check if pre-quantized models are allowed
     # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64
     if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(
 
@@ -476,6 +497,8 @@ def from_pretrained(
             random_state = random_state,
             max_lora_rank = max_lora_rank,
             disable_log_stats = disable_log_stats,
+            qat_scheme = qat_scheme,
+            load_in_fp8 = load_in_fp8,
             *args,
             **kwargs,
         )
 
@@ -554,6 +577,9 @@ def from_pretrained(
         }
         model.config.update({"quantization_config": quantization_config})
 
+    if load_in_fp8:
+        _tag_model_with_fp8_torchao_config(model)
+
     if is_peft:
         # From https://github.com/huggingface/peft/issues/184
         # Now add PEFT adapters
 
@@ -634,6 +660,7 @@ def from_pretrained(
     max_lora_rank = 64,
     disable_log_stats = True,
     qat_scheme = None,
+    load_in_fp8 = False, # fp8 LoRA
     *args,
     **kwargs,
 ):
 
@@ -694,9 +721,23 @@ def from_pretrained(
         )
         load_in_4bit = False
 
+    if load_in_fp8:
+        _check_load_in_fp8_settings(
+            fast_inference,
+            full_finetuning,
+            load_in_4bit,
+            load_in_8bit,
+            load_in_16bit,
+            use_exact_model_name,
+        )
+
     old_model_name = model_name
     if not use_exact_model_name:
-        model_name = get_model_name(model_name, load_in_4bit)
+        if load_in_fp8:
+            model_name = _offline_quantize_to_fp8(model_name)
+        else:
+            model_name = get_model_name(model_name, load_in_4bit)
+
     # Check if pre-quantized models are allowed
     # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64
     if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(
 
@@ -1130,6 +1171,9 @@ def from_pretrained(
         }
         model.config.update({"quantization_config": quantization_config})
 
+    if load_in_fp8:
+        _tag_model_with_fp8_torchao_config(model)
+
     if is_peft:
         # From https://github.com/huggingface/peft/issues/184
         # Now add PEFT adapters
```

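Before reaching for `load_in_fp8 = True`, the hardware and version requirements enforced by `_check_load_in_fp8_settings` (defined in `loader_utils.py` below) can be checked up front. A hedged, standalone preflight sketch; `fp8_lora_preflight` is a hypothetical helper that mirrors only the environment checks, not the flag checks.

```python
# Hedged preflight sketch mirroring the environment checks in _check_load_in_fp8_settings
# (not the Unsloth code itself, and not exhaustive).
import importlib.util
import torch
from packaging.version import Version

def fp8_lora_preflight() -> bool:
    ok = True
    # Hopper (compute capability 9.0) or newer
    if not (torch.cuda.is_available() and torch.version.cuda and torch.cuda.get_device_capability() >= (9, 0)):
        print("Needs an H100-class GPU (compute capability 9.0+)"); ok = False
    # torch 2.9.0+
    if Version(torch.__version__) < Version("2.9.0"):
        print("Needs torch 2.9.0+"); ok = False
    # torchao 0.15.0+ (or nightly), for https://github.com/pytorch/ao/pull/3158
    if importlib.util.find_spec("torchao") is None:
        print("Needs torchao 0.15.0+ (or nightly)"); ok = False
    else:
        import torchao
        if Version(torchao.__version__) < Version("0.15.0"):
            print("Needs torchao 0.15.0+ (or nightly)"); ok = False
    return ok

fp8_lora_preflight()
```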
unsloth/models/loader_utils.py

Lines changed: 129 additions & 1 deletion
```diff
@@ -12,11 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
+import os
+import re
+import tempfile
 from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER, MAP_TO_UNSLOTH_16bit
 
 # https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading!
 from packaging.version import Version
-from transformers import __version__ as transformers_version
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TorchAoConfig,
+    __version__ as transformers_version,
+)
+from unsloth.models._utils import TorchAOConfig
+from unsloth_zoo.utils import Version
+import torch
 
 transformers_version = Version(transformers_version)
 SUPPORTS_FOURBIT = transformers_version >= Version("4.37")
 
@@ -144,3 +156,119 @@ def get_model_name(model_name, load_in_4bit = True):
            'pip install --upgrade --no-cache-dir "git+https://github.com/unslothai/unsloth-zoo.git"\n'
        )
     return new_model_name if new_model_name is not None else model_name
+
+
+def _get_torchao_fp8_config():
+    """
+    Return a `torchao.quantization.Float8DynamicActivationFloat8WeightConfig`
+    to be used for `load_in_fp8=True`.
+    """
+    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
+
+    return Float8DynamicActivationFloat8WeightConfig(
+        granularity=PerRow(),
+        activation_value_lb=1e-12,
+    )
+
+
+def _offline_quantize_to_fp8(model_name: str) -> str:
+    """
+    Quantize the model to fp8 using torchao and save the quantized model to a
+    temporary location. Return the path to the quantized model.
+
+    Note: Once on-the-fly quantization is added in vllm in
+    https://github.com/vllm-project/vllm/pull/26327, we should
+    dynamically quantize the model there instead:
+
+        llm = LLM(
+            ...
+            hf_overrides={"quantization_config_file": "torchao_config.json"},
+        )
+    """
+    temp_dir = tempfile.gettempdir()
+    new_model_name = model_name.split("/")[-1] + "-fp8"
+    new_model_name = os.path.join(temp_dir, new_model_name)
+    print(f"Quantizing '{model_name}' to fp8, using model_name='{new_model_name}' instead")
+    if not os.path.isdir(new_model_name):
+        qconfig = _get_torchao_fp8_config()
+        qconfig = TorchAoConfig(qconfig)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype="auto",
+            device_map="auto",
+            quantization_config=qconfig,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model.save_pretrained(new_model_name, safe_serialization=False)
+        tokenizer.save_pretrained(new_model_name)
+    return new_model_name
+
+
+def _tag_model_with_fp8_torchao_config(model: torch.nn.Module):
+    """
+    Tag a model with a `TorchAOConfig` so downstream callers will know what to do with it.
+    """
+    base_config = _get_torchao_fp8_config()
+    model.torchao_config = TorchAOConfig(
+        qat_scheme=None,
+        base_config_and_filter_fns=[(base_config, None)],
+    )
+
+
+def _check_load_in_fp8_settings(
+    fast_inference: bool,
+    full_finetuning: bool,
+    load_in_4bit: bool,
+    load_in_8bit: bool,
+    load_in_16bit: bool,
+    use_exact_model_name: bool,
+):
+    """
+    Assuming `load_in_fp8=True`, raise appropriate errors on incompatible settings
+    and environment. Currently this feature requires:
+      1. H100 GPUs or newer
+      2. torchao 0.15.0+ (or nightly)
+      3. torch 2.9.0+
+      4. If fbgemm_gpu_genai is installed, require 1.4.1+
+    """
+    if not fast_inference:
+        raise ValueError("Unsloth: `load_in_fp8` is only supported for `fast_inference` for now")
+    if full_finetuning:
+        raise ValueError("Unsloth: `load_in_fp8` is not compatible with full finetuning")
+    if load_in_4bit or load_in_8bit or load_in_16bit:
+        raise ValueError(
+            "Unsloth: `load_in_fp8` is not compatible with `load_in_4bit`, `load_in_8bit` or `load_in_16bit`",
+        )
+    if use_exact_model_name:
+        raise ValueError("Unsloth: `load_in_fp8` requires `use_exact_model_name=False`")
+
+    # Check if this is Hopper or above
+    if not (
+        torch.cuda.is_available()
+        and torch.version.cuda
+        and torch.cuda.get_device_capability() >= (9, 0)
+    ):
+        raise ValueError("Unsloth: `load_in_fp8` requires H100 GPUs or newer")
+
+    # Check if torch >= 2.9.0
+    if Version(torch.__version__) < Version("2.9.0"):
+        raise ValueError("Unsloth: `load_in_fp8` requires torch 2.9.0+")
+
+    # Check if torchao has this PR: https://github.com/pytorch/ao/pull/3158,
+    # which will be released in 0.15.0.
+    error_message = "Unsloth: `load_in_fp8` requires torchao 0.15.0+ (or nightly)"
+    if importlib.util.find_spec("torchao") is None:
+        raise ValueError(error_message)
+    import torchao
+
+    if Version(torchao.__version__) < Version("0.15.0"):
+        raise ValueError(error_message)
+
+    # Check if fbgemm_gpu_genai is installed, if so, require >= 1.4.1
+    if (
+        importlib.util.find_spec("fbgemm_gpu") is not None and
+        importlib.util.find_spec("fbgemm_gpu.experimental") is not None
+    ):
+        import fbgemm_gpu.experimental.gen_ai
+
+        if Version(fbgemm_gpu.__version__) < Version("1.4.1"):
+            raise ValueError("Unsloth: `load_in_fp8` is only compatible with fbgemm_gpu_genai 1.4.1+")
```

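For reference, `_offline_quantize_to_fp8` writes the quantized checkpoint under the system temp directory and reuses it on later runs. Below is a hedged sketch of the path derivation and of reloading a cached checkpoint; the reload step assumes transformers' TorchAo integration restores the quantized weights from a non-safetensors save, matching how the checkpoint was written above.

```python
# Hedged sketch: where the offline fp8 checkpoint lands, and reusing it if already present.
import os
import tempfile
from transformers import AutoModelForCausalLM

model_name = "unsloth/Qwen3-8B-Base"
fp8_path = os.path.join(tempfile.gettempdir(), model_name.split("/")[-1] + "-fp8")
# e.g. /tmp/Qwen3-8B-Base-fp8 on Linux

if os.path.isdir(fp8_path):
    # Reload the already-quantized checkpoint instead of quantizing again
    model = AutoModelForCausalLM.from_pretrained(fp8_path, torch_dtype = "auto", device_map = "auto")
```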