Commit c4f8c67
Enable FP8 + RL training for bf16 models
**Summary:** Enable FP8 + RL training using TorchAO for 1.33x faster training and 40% lower memory usage:

- We quantize the frozen base model weights to fp8 and keep the LoRA adapters in bf16
- We leverage TorchAO's `Float8Tensor`, which calls into fbgemm's fp8 x fp8 rowwise matmul kernel
- For now, we need to do an offline quantization first, because vLLM doesn't support on-the-fly quantization for torchao yet (this is in progress: vllm-project/vllm#26327)

**Example usage:**
```
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-8B-Base",
    max_seq_length = 2048,
    load_in_4bit = False,
    fast_inference = True,
    max_lora_rank = 32,
    load_in_fp8 = True, # set this to True
)

# the rest is the same as before
model = FastLanguageModel.get_peft_model(...)
```

**Initial results:**
```
# fp8
{'train_runtime': 1725.4337, 'train_samples_per_second': 0.232, 'train_steps_per_second': 0.058, 'train_loss': 0.00015715716748673002, 'epoch': 0.01}

# bf16
{'train_runtime': 2297.8145, 'train_samples_per_second': 0.174, 'train_steps_per_second': 0.044, 'train_loss': 0.00016081033063528594, 'epoch': 0.01}
```

<img width="1199" height="448" alt="Screenshot 2025-11-11 at 4 10 50 PM" src="https://github.com/user-attachments/assets/b6304afd-89e9-42b1-8064-775807e17b23" />

Test script: https://gist.github.com/andrewor14/5b85119fae46845d07b608d420907423

**Requires:**
- pytorch/ao#3158 (torchao nightly)
- unslothai/unsloth-zoo#351
1 parent aa7cfa1 commit c4f8c67
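For context, the offline quantization step mentioned in the summary is plain transformers + torchao; Unsloth runs it automatically via `_offline_quantize_to_fp8` (see `unsloth/models/loader_utils.py` below) when `load_in_fp8 = True`. A minimal sketch of that step, assuming a torchao nightly and a recent transformers; the output directory name is illustrative:

```python
# Hedged sketch of the offline fp8 quantization the commit performs on first load.
# The model name comes from the summary above; the save path is illustrative.
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

model_name = "unsloth/Qwen3-8B-Base"
qconfig = TorchAoConfig(Float8DynamicActivationFloat8WeightConfig(granularity = PerRow()))

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype = "auto",            # activations and LoRA adapters stay bf16
    device_map = "auto",
    quantization_config = qconfig,   # frozen base weights become torchao Float8Tensors
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Saved without safetensors so the tensor-subclass weights round-trip, matching the diff below
model.save_pretrained("Qwen3-8B-Base-fp8", safe_serialization = False)
tokenizer.save_pretrained("Qwen3-8B-Base-fp8")
```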

File tree: 6 files changed (+148, −6 lines)


unsloth/kernels/fast_lora.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -14,6 +14,7 @@
 
 import torch
 from .utils import (
+    _maybe_dequantize_torchao_float8_tensor,
     _maybe_fake_quantize_activations,
     fast_dequantize,
     QUANT_STATE,
@@ -128,6 +129,10 @@ def backward(ctx, dY: torch.Tensor):
         ) = ctx.custom_saved_tensors
         gateA, gateB, upA, upB, downA, downB, X, e, g = ctx.saved_tensors
 
+        gateW = _maybe_dequantize_torchao_float8_tensor(gateW)
+        upW = _maybe_dequantize_torchao_float8_tensor(upW)
+        downW = _maybe_dequantize_torchao_float8_tensor(downW)
+
         batch, seq_len, hd = X.shape
         dY = dY.view(-1, dY.shape[-1])
         X = X.view(-1, X.shape[-1])
@@ -420,6 +425,10 @@ def backward(ctx, dQ, dK, dV):
             VB,
         ) = ctx.saved_tensors
 
+        QW = _maybe_dequantize_torchao_float8_tensor(QW)
+        KW = _maybe_dequantize_torchao_float8_tensor(KW)
+        VW = _maybe_dequantize_torchao_float8_tensor(VW)
+
         batch, seq_len, hd = X.shape
         dQ = dQ.view(-1, dQ.shape[-1])
         dK = dK.reshape(-1, dK.shape[-1]) # view doesn't work on K.T
@@ -593,6 +602,8 @@ def backward(ctx, dY: torch.Tensor):
         W, W_quant, S = ctx.custom_saved_tensors
         A, B, X = ctx.saved_tensors
 
+        W = _maybe_dequantize_torchao_float8_tensor(W)
+
         batch, seq_len, hd = X.shape
         dY = dY.reshape(-1, dY.shape[-1]) # Must be reshape
         X = X.reshape(-1, X.shape[-1]) # Must be reshape
```
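Why the backward pass dequantizes: the fp8 rowwise kernel is used for the forward matmul, while the LoRA backward here falls back to plain bf16 matmuls against the frozen base weight, so the `Float8Tensor` is expanded first. A toy sketch of the gradient-w.r.t.-input step, under illustrative shape conventions (this is not Unsloth's exact kernel code):

```python
# Toy illustration of the pattern added above: dequantize the frozen fp8
# base weight before the backward matmuls.
# Illustrative shapes: W [out, in], A [r, in], B [out, r], dY [n, out].
import torch

def lora_backward_dX(dY: torch.Tensor, W: torch.Tensor, A: torch.Tensor, B: torch.Tensor, s: float):
    if hasattr(W, "dequantize"):   # e.g. a torchao Float8Tensor
        W = W.dequantize()         # back to a plain bf16 tensor
    # d(input) for y = x @ W.T + s * (x @ A.T) @ B.T
    return dY @ W + s * (dY @ B) @ A
```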

unsloth/kernels/utils.py

Lines changed: 19 additions & 0 deletions
```diff
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
 import triton
 import ctypes
 
@@ -211,6 +212,10 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
 torch_bfloat16 = torch.bfloat16
 
 
+# Whether torchao can be imported
+_HAS_TORCHAO = importlib.util.find_spec("torchao") is not None
+
+
 def QUANT_STATE(W):
     return getattr(W, "quant_state", None)
 
@@ -329,6 +334,20 @@ def _maybe_fake_quantize_activations(
     return X
 
 
+def _maybe_dequantize_torchao_float8_tensor(x: torch.Tensor) -> torch.Tensor:
+    """
+    If `x` is a `torchao.quantization.Float8Tensor`, dequantize it.
+    This is used in the backward pass of LoRA autograd functions.
+    """
+    if not _HAS_TORCHAO:
+        return x
+    from torchao.quantization import Float8Tensor
+    if isinstance(x, Float8Tensor):
+        return x.dequantize()
+    else:
+        return x
+
+
 # INTEL GPU Specific Logic
 if DEVICE_TYPE == "xpu" and HAS_XPU_STREAM:
```
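A quick roundtrip showing what the helper does in practice. This is a sketch assuming a torchao nightly that exports `Float8Tensor` from `torchao.quantization` (the same import the helper relies on) and an fp8-capable CUDA device:

```python
# Sketch: quantize a bf16 linear with torchao, then dequantize its weight via the new helper.
import torch
from torchao.quantization import (
    quantize_,
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
)
from unsloth.kernels.utils import _maybe_dequantize_torchao_float8_tensor

linear = torch.nn.Linear(256, 256, bias = False, dtype = torch.bfloat16, device = "cuda")
quantize_(linear, Float8DynamicActivationFloat8WeightConfig(granularity = PerRow()))

W = linear.weight                                    # now backed by a torchao Float8Tensor
W_bf16 = _maybe_dequantize_torchao_float8_tensor(W)  # plain bf16 tensor for backward matmuls
print(type(W).__name__, W_bf16.dtype)                # non-Float8 tensors pass through unchanged
```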

unsloth/models/_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -2012,7 +2012,7 @@ def error_out_no_vllm(*args, **kwargs):
 
 @dataclass
 class TorchAOConfig:
-    qat_scheme: str = "int4"
+    qat_scheme: Optional[str] = "int4"
     base_config: AOBaseConfig = field(
         default_factory = lambda: Int4WeightOnlyConfig(group_size = 128)
     )
```
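The `Optional[str]` change is what lets the fp8 path reuse this dataclass: `_tag_model_with_fp8_torchao_config` (in `loader_utils.py` below) sets `qat_scheme = None` to mean "torchao-quantized but not QAT", and the RL patch further down keys off that. A sketch of the two ways the config is now populated, with names mirroring the diffs in this commit:

```python
# Sketch: the fp8 variant (qat_scheme=None) is what gets attached to the model for fp8 LoRA;
# the int4 QAT variant keeps the previous default.
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
from unsloth.models._utils import TorchAOConfig

# fp8 LoRA: quantized base weights, no QAT scheme
fp8_tag = TorchAOConfig(
    qat_scheme = None,
    base_config = Float8DynamicActivationFloat8WeightConfig(granularity = PerRow()),
)

# previous default: int4 QAT
int4_tag = TorchAOConfig()  # qat_scheme="int4", Int4WeightOnlyConfig(group_size=128)

assert fp8_tag.qat_scheme is None and int4_tag.qat_scheme == "int4"
```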

unsloth/models/loader.py

Lines changed: 48 additions & 3 deletions
```diff
@@ -31,7 +31,11 @@
 from transformers import AutoConfig
 from transformers import __version__ as transformers_version
 from peft import PeftConfig, PeftModel
-from .loader_utils import get_model_name
+from .loader_utils import (
+    _offline_quantize_to_fp8,
+    _tag_model_with_fp8_torchao_config,
+    get_model_name,
+)
 import os, contextlib, sys
 
 try:
@@ -139,6 +143,7 @@ def from_pretrained(
         max_lora_rank = 64,
         disable_log_stats = True,
         qat_scheme = None,
+        load_in_fp8 = False, # fp8 LoRA
         *args,
         **kwargs,
     ):
@@ -182,6 +187,7 @@ def from_pretrained(
                 max_lora_rank = max_lora_rank,
                 disable_log_stats = disable_log_stats,
                 qat_scheme = qat_scheme,
+                load_in_fp8 = load_in_fp8,
                 *args,
                 **kwargs,
             )
@@ -211,9 +217,24 @@ def from_pretrained(
             )
             load_in_4bit = False
 
+        if load_in_fp8 and not fast_inference:
+            raise ValueError("Unsloth: `load_in_fp8` is only supported for `fast_inference` for now")
+        if load_in_fp8 and full_finetuning:
+            raise ValueError("Unsloth: `load_in_fp8` is not compatible with full finetuning")
+        if load_in_fp8 and (load_in_4bit or load_in_8bit or load_in_16bit):
+            raise ValueError(
+                "Unsloth: `load_in_fp8` is not compatible with `load_in_4bit`, `load_in_8bit` or `load_in_16bit`",
+            )
+        if load_in_fp8 and use_exact_model_name:
+            raise ValueError("Unsloth: `load_in_fp8` requires `use_exact_model_name=False`")
+
         old_model_name = model_name
         if not use_exact_model_name:
-            model_name = get_model_name(model_name, load_in_4bit)
+            if load_in_fp8:
+                model_name = _offline_quantize_to_fp8(model_name)
+            else:
+                model_name = get_model_name(model_name, load_in_4bit)
+
         # Check if pre-quantized models are allowed
         # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64
         if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(
@@ -475,6 +496,8 @@ def from_pretrained(
                 random_state = random_state,
                 max_lora_rank = max_lora_rank,
                 disable_log_stats = disable_log_stats,
+                qat_scheme = qat_scheme,
+                load_in_fp8 = load_in_fp8,
                 *args,
                 **kwargs,
             )
@@ -553,6 +576,9 @@ def from_pretrained(
         }
         model.config.update({"quantization_config": quantization_config})
 
+        if load_in_fp8:
+            _tag_model_with_fp8_torchao_config(model)
+
         if is_peft:
             # From https://github.com/huggingface/peft/issues/184
             # Now add PEFT adapters
@@ -621,6 +647,7 @@ def from_pretrained(
         max_lora_rank = 64,
         disable_log_stats = True,
         qat_scheme = None,
+        load_in_fp8 = False, # fp8 LoRA
         *args,
         **kwargs,
     ):
@@ -681,9 +708,24 @@ def from_pretrained(
             )
             load_in_4bit = False
 
+        if load_in_fp8 and not fast_inference:
+            raise ValueError("Unsloth: `load_in_fp8` is only supported for `fast_inference` for now")
+        if load_in_fp8 and full_finetuning:
+            raise ValueError("Unsloth: `load_in_fp8` is not compatible with full finetuning")
+        if load_in_fp8 and (load_in_4bit or load_in_8bit or load_in_16bit):
+            raise ValueError(
+                "Unsloth: `load_in_fp8` is not compatible with `load_in_4bit`, `load_in_8bit` or `load_in_16bit`",
+            )
+        if load_in_fp8 and use_exact_model_name:
+            raise ValueError("Unsloth: `load_in_fp8` requires `use_exact_model_name=False`")
+
         old_model_name = model_name
         if not use_exact_model_name:
-            model_name = get_model_name(model_name, load_in_4bit)
+            if load_in_fp8:
+                model_name = _offline_quantize_to_fp8(model_name)
+            else:
+                model_name = get_model_name(model_name, load_in_4bit)
+
         # Check if pre-quantized models are allowed
         # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64
         if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(
@@ -1117,6 +1159,9 @@ def from_pretrained(
         }
         model.config.update({"quantization_config": quantization_config})
 
+        if load_in_fp8:
+            _tag_model_with_fp8_torchao_config(model)
+
         if is_peft:
             # From https://github.com/huggingface/peft/issues/184
             # Now add PEFT adapters
```
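Putting the new flag and its guards together: `load_in_fp8` requires vLLM fast inference, excludes full finetuning and the other `load_in_*` precisions, and needs `use_exact_model_name = False` so the loader can swap in the fp8 copy. A valid combination, adapted from the commit summary (adapter hyperparameters are illustrative):

```python
# Flag combination that passes the checks added above; mirrors the commit summary.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/Qwen3-8B-Base",
    max_seq_length = 2048,
    load_in_4bit   = False,  # must be off: fp8 excludes 4/8/16-bit loading
    fast_inference = True,   # required: fp8 currently only works with the vLLM path
    max_lora_rank  = 32,
    load_in_fp8    = True,   # first load quantizes the base weights offline to a temp dir
)

# LoRA adapters stay in bf16; the rest of the RL setup is unchanged
model = FastLanguageModel.get_peft_model(model, r = 32, lora_alpha = 64)
```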

unsloth/models/loader_utils.py

Lines changed: 55 additions & 1 deletion
```diff
@@ -12,11 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import tempfile
 from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER, MAP_TO_UNSLOTH_16bit
 
 # https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading!
 from packaging.version import Version
-from transformers import __version__ as transformers_version
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TorchAoConfig,
+    __version__ as transformers_version,
+)
+from unsloth.models._utils import TorchAOConfig
+import torch
 
 transformers_version = Version(transformers_version)
 SUPPORTS_FOURBIT = transformers_version >= Version("4.37")
@@ -144,3 +153,48 @@ def get_model_name(model_name, load_in_4bit = True):
             'pip install --upgrade --no-cache-dir "git+https://github.com/unslothai/unsloth-zoo.git"\n'
         )
     return new_model_name if new_model_name is not None else model_name
+
+
+def _offline_quantize_to_fp8(model_name: str) -> str:
+    """
+    Quantize the model to fp8 using torchao and save the quantized model to a
+    temporary location. Return the path to the quantized model.
+
+    Note: Once on-the-fly quantization is added in vllm in
+    https://github.com/vllm-project/vllm/pull/26327, we should
+    dynamically quantize the model there instead:
+
+        llm = LLM(
+            ...
+            hf_overrides={"quantization_config_file": "torchao_config.json"},
+        )
+    """
+    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
+
+    temp_dir = tempfile.gettempdir()
+    new_model_name = model_name.split("/")[-1] + "-fp8"
+    new_model_name = os.path.join(temp_dir, new_model_name)
+    print(f"Quantizing '{model_name}' to fp8, using model_name='{new_model_name}' instead")
+    if not os.path.isdir(new_model_name):
+        qconfig = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+        qconfig = TorchAoConfig(qconfig)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype="auto",
+            device_map="auto",
+            quantization_config=qconfig,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model.save_pretrained(new_model_name, safe_serialization=False)
+        tokenizer.save_pretrained(new_model_name)
+    return new_model_name
+
+
+def _tag_model_with_fp8_torchao_config(model: torch.nn.Module):
+    """
+    Tag a model with a `TorchAOConfig` so downstream callers will know what to do with it.
+    """
+    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
+
+    base_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+    model.torchao_config = TorchAOConfig(qat_scheme=None, base_config=base_config)
```
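What `_offline_quantize_to_fp8` produces on disk, as a sketch: a cached fp8 copy under the system temp dir whose name is derived from the model name, reused on subsequent loads instead of requantizing. The model name below is the one from the summary; the exact temp path is platform-dependent:

```python
# Sketch of the helper's caching behaviour; heavy on the first call, a cheap path lookup after.
import os, tempfile
from unsloth.models.loader_utils import _offline_quantize_to_fp8

path = _offline_quantize_to_fp8("unsloth/Qwen3-8B-Base")  # e.g. /tmp/Qwen3-8B-Base-fp8
assert path == os.path.join(tempfile.gettempdir(), "Qwen3-8B-Base-fp8")
assert os.path.isdir(path)  # the cached fp8 copy is reused on the next load
```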

unsloth/models/rl_replacements.py

Lines changed: 14 additions & 1 deletion
```diff
@@ -536,7 +536,20 @@ def _get_per_token_logps_and_entropies(
         )
 
         with torch.amp.autocast(device_type = "cuda", dtype = self._autocast_dtype):
-            with torch.inference_mode():
+            # If the state dict was quantized using torchao, we will run into
+            # the following error when calling ops like aten.t() in inference mode.
+            # This is a bug in PyTorch that affects all tensor subclasses.
+            #
+            # Cannot set version_counter for inference tensor
+            #
+            # For now, we work around this issue by using torch.no_grad in this case.
+            # See https://github.com/pytorch/pytorch/issues/164872 for more details
+            torchao_config = getattr(model, "torchao_config", None)
+            if torchao_config is not None and torchao_config.qat_scheme is None:
+                ctx_manager = torch.no_grad()
+            else:
+                ctx_manager = torch.inference_mode()
+            with ctx_manager:
                 if pixel_values is None:
                     attention_mask = input_ids != self.processing_class.pad_token_id
                     attention_mask = attention_mask.to(attention_mask.dtype)
```
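The workaround in isolation, as a sketch: models tagged with a torchao config and no QAT scheme (the fp8 LoRA case) take the `torch.no_grad()` branch, because `torch.inference_mode()` currently breaks on tensor subclasses (pytorch/pytorch#164872); everything else keeps `inference_mode()`:

```python
# Standalone sketch of the context-manager selection added above.
import torch

def logps_context(model: torch.nn.Module):
    torchao_config = getattr(model, "torchao_config", None)
    if torchao_config is not None and torchao_config.qat_scheme is None:
        # fp8-quantized (tensor subclass) weights: inference_mode() raises
        # "Cannot set version_counter for inference tensor", so use no_grad()
        return torch.no_grad()
    return torch.inference_mode()

# Untagged models keep the old behaviour
plain = torch.nn.Linear(4, 4)
with logps_context(plain):
    _ = plain(torch.randn(2, 4))
```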
