
Commit 75c8675

Authored by mmathew23, with gemini-code-assist[bot] and pre-commit-ci[bot]
Patch in tiled mlp (#3584)
* Patch in tiled mlp

* Update unsloth/models/llama.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 0779d69 · commit 75c8675

File tree: 2 files changed (+23, −3)


unsloth/models/llama.py

Lines changed: 9 additions & 3 deletions
@@ -3214,9 +3214,15 @@ def patch_peft_model(
             )
         ):
             # https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module
-            mlp_module.forward = types.MethodType(
-                _apply_lora_mlp, mlp_module
-            )
+            if hasattr(mlp_module, "_unsloth_forward"):
+                # then we've patched the mlp to use TiledMLP
+                mlp_module._unsloth_forward = types.MethodType(
+                    _apply_lora_mlp, mlp_module
+                )
+            else:
+                mlp_module.forward = types.MethodType(
+                    _apply_lora_mlp, mlp_module
+                )
             n_mlp += 1
         else:
             logger.warning_once(
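Why the branch matters: assigning through types.MethodType rebinds forward on that single module instance only. When tiled MLP has already been applied, a wrapper owns forward and delegates to _unsloth_forward, so the LoRA fast path must be installed on the inner hook or the tiling wrapper would be clobbered. A minimal, self-contained sketch of the pattern, where ToyMLP and fast_forward are hypothetical stand-ins for the real MLP modules and _apply_lora_mlp:

# Sketch of the instance-level method patching above; ToyMLP and
# fast_forward are hypothetical stand-ins, not Unsloth code.
import types

class ToyMLP:
    def forward(self, x):
        return x + 1

def fast_forward(self, x):
    # Replacement implementation; self is the specific patched instance.
    return x + 2

mlp_module = ToyMLP()

if hasattr(mlp_module, "_unsloth_forward"):
    # A tiled-MLP wrapper already owns .forward and delegates to
    # ._unsloth_forward, so swap the inner hook instead of .forward.
    mlp_module._unsloth_forward = types.MethodType(fast_forward, mlp_module)
else:
    # Unwrapped module: rebind .forward directly, on this instance only.
    mlp_module.forward = types.MethodType(fast_forward, mlp_module)

print(mlp_module.forward(1))  # prints 3 (patched), not 2 (original)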

unsloth/models/loader.py

Lines changed: 14 additions & 0 deletions
@@ -57,6 +57,7 @@
 # https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading!
 from unsloth_zoo.utils import Version, _get_dtype
 from unsloth_zoo.hf_utils import dtype_from_config
+from unsloth_zoo.tiled_mlp import patch_tiled_mlp
 
 transformers_version = Version(transformers_version)
 SUPPORTS_FOURBIT = transformers_version >= Version("4.37")
@@ -566,6 +567,13 @@ def from_pretrained(
         )
         # Patch it as well!
         model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing)
+
+        # Patch Tiled MLP
+        # to turn on, set UNSLOTH_TILED_MLP to "arctic", "target", or "target:{GB}"
+        patch_tiled_mlp_choice = os.environ.get("UNSLOTH_TILED_MLP", "0")
+        if patch_tiled_mlp_choice != "0":
+            patch_tiled_mlp(model, patch_options_str = patch_tiled_mlp_choice)
+
         return model, tokenizer
 
 
@@ -1138,6 +1146,12 @@ def from_pretrained(
             print("Unsloth: Applying QAT to mitigate quantization degradation")
             model = _prepare_model_for_qat(model, qat_scheme)
 
+        # Patch Tiled MLP
+        # to turn on, set UNSLOTH_TILED_MLP to "arctic", "target", or "target:{GB}"
+        patch_tiled_mlp_choice = os.environ.get("UNSLOTH_TILED_MLP", "0")
+        if patch_tiled_mlp_choice != "0":
+            patch_tiled_mlp(model, patch_options_str = patch_tiled_mlp_choice)
+
         return model, tokenizer
 
 
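With this commit, tiled MLP stays off unless UNSLOTH_TILED_MLP is set to something other than "0" before the model is loaded. A hedged usage sketch, assuming the option strings noted in the comments ("arctic", "target", "target:{GB}"); "target:8" reads as an 8 GB target under the target:{GB} form, which is an assumption, the checkpoint name is illustrative, and the actual option parsing lives in unsloth_zoo.tiled_mlp.patch_tiled_mlp:

import os

# Opt in before from_pretrained runs; the default "0" leaves MLPs unpatched.
# Assumed option strings per the commit comments: "arctic", "target", "target:{GB}".
os.environ["UNSLOTH_TILED_MLP"] = "target:8"

from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",  # illustrative checkpoint
    load_in_4bit = True,
)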
