Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
728 commits
Select commit Hold shift + click to select a range
88b680a
Update gpt_oss.py
danielhanchen Sep 23, 2025
912e836
Update gpt_oss.py
danielhanchen Sep 23, 2025
69ae337
Update gpt_oss.py
danielhanchen Sep 23, 2025
17b78be
Update gpt_oss.py
danielhanchen Sep 23, 2025
b54f658
Update gpt_oss.py
danielhanchen Sep 23, 2025
36769ed
Update gpt_oss.py
danielhanchen Sep 23, 2025
5c5909f
Update gpt_oss.py
danielhanchen Sep 23, 2025
2f9fee2
Update gpt_oss.py
danielhanchen Sep 23, 2025
fb3ce53
Update gpt_oss.py
danielhanchen Sep 23, 2025
9425aa2
Update gpt_oss.py
danielhanchen Sep 23, 2025
8895518
Update __init__.py
danielhanchen Sep 23, 2025
14372e9
Update gpt_oss.py
danielhanchen Sep 23, 2025
fbc394b
Update gpt_oss.py
danielhanchen Sep 23, 2025
0dce904
Update gpt_oss.py
danielhanchen Sep 23, 2025
731b413
Update compiler.py
danielhanchen Sep 23, 2025
c492b70
Update gpt_oss.py
danielhanchen Sep 23, 2025
de35ca0
Update gpt_oss.py
danielhanchen Sep 23, 2025
0c3a5b1
Update gpt_oss.py
danielhanchen Sep 23, 2025
0b67494
Update gpt_oss.py
danielhanchen Sep 23, 2025
c3e6724
Dannightly (#304)
danielhanchen Sep 23, 2025
b2e590b
Update gpt_oss.py
danielhanchen Sep 24, 2025
6bffc3e
Update gpt_oss.py
danielhanchen Sep 24, 2025
a147bc8
Update gpt_oss.py
danielhanchen Sep 24, 2025
b0c88f8
Update gpt_oss.py
danielhanchen Sep 24, 2025
4ab2cfe
Update gpt_oss.py
danielhanchen Sep 24, 2025
4205b56
Update gpt_oss.py
danielhanchen Sep 24, 2025
98806b7
Update gpt_oss.py
danielhanchen Sep 24, 2025
15a9f88
Update gpt_oss.py
danielhanchen Sep 24, 2025
1ce4f9b
Update gpt_oss.py
danielhanchen Sep 24, 2025
976fcb4
Update gpt_oss.py
danielhanchen Sep 24, 2025
5b05f0d
Update gpt_oss.py
danielhanchen Sep 24, 2025
4bc3d22
Fix Flex Attention autotuning
danielhanchen Sep 24, 2025
0aaa499
Update patching_utils.py
danielhanchen Sep 24, 2025
5b27681
Update patching_utils.py
danielhanchen Sep 24, 2025
ebeb9f8
Update patching_utils.py
danielhanchen Sep 24, 2025
c2c473b
Update mxfp4.py
danielhanchen Sep 24, 2025
ed65c01
Update mxfp4.py
danielhanchen Sep 24, 2025
272a5ea
Update gpt_oss.py
danielhanchen Sep 24, 2025
95772ec
Update attention_sink.py
danielhanchen Sep 24, 2025
bacfc4d
Update patching_utils.py
danielhanchen Sep 24, 2025
8a89348
Update attention_sink.py
danielhanchen Sep 24, 2025
6d9b66b
Update gpt_oss.py
danielhanchen Sep 24, 2025
226c866
prefer_nd_tiling
danielhanchen Sep 24, 2025
2cebcc9
Update patching_utils.py
danielhanchen Sep 24, 2025
f3c5e1f
flex_attention_with_sink
danielhanchen Sep 24, 2025
e485dfc
Compile Flex Attention
danielhanchen Sep 24, 2025
6bd3f70
Update mxfp4.py
danielhanchen Sep 24, 2025
2393dfb
Update mxfp4.py
danielhanchen Sep 24, 2025
44356be
Update mxfp4.py
danielhanchen Sep 24, 2025
d5eebbc
Update mxfp4.py
danielhanchen Sep 24, 2025
93b5b88
Update gpt_oss.py
danielhanchen Sep 24, 2025
5e43a2a
bitsandbytes patch
danielhanchen Sep 24, 2025
2f0acb1
Update bitsandbytes.py
danielhanchen Sep 24, 2025
ebaf9b3
Update gpt_oss.py
danielhanchen Sep 24, 2025
2db0323
Inplace ops
danielhanchen Sep 24, 2025
031d21a
Update gpt_oss.py
danielhanchen Sep 24, 2025
61bb5aa
has_static_cache
danielhanchen Sep 24, 2025
267ab06
Update gpt_oss.py
danielhanchen Sep 24, 2025
bf825b1
Update gpt_oss.py
danielhanchen Sep 24, 2025
bf65ea5
Update gpt_oss.py
danielhanchen Sep 24, 2025
5c22d92
Update gpt_oss.py
danielhanchen Sep 24, 2025
274f7be
Update attention_sink.py
danielhanchen Sep 24, 2025
b2ec9f6
Update gpt_oss.py
danielhanchen Sep 24, 2025
ed572b1
Update gpt_oss.py
danielhanchen Sep 24, 2025
0bdaf45
Update gpt_oss.py
danielhanchen Sep 24, 2025
9fdf256
Update gpt_oss.py
danielhanchen Sep 24, 2025
56f7a73
Update gpt_oss.py
danielhanchen Sep 24, 2025
0c5437e
Update attention_sink.py
danielhanchen Sep 24, 2025
619c462
Update attention_sink.py
danielhanchen Sep 24, 2025
5d87949
Update rl_replacements.py
danielhanchen Sep 24, 2025
7ba642a
Update rl_replacements.py
danielhanchen Sep 24, 2025
040c6f2
Update rl_replacements.py
danielhanchen Sep 24, 2025
96798d8
Update gpt_oss.py
danielhanchen Sep 24, 2025
138f9f7
Update gpt_oss.py
danielhanchen Sep 24, 2025
eb19db9
Update gpt_oss.py
danielhanchen Sep 24, 2025
1f4f0c7
torch compile
danielhanchen Sep 25, 2025
b4afc0a
Update attention_sink.py
danielhanchen Sep 25, 2025
3d2083b
Update common.py
danielhanchen Sep 25, 2025
475a1fa
Update common.py
danielhanchen Sep 25, 2025
a1577f3
Patches
danielhanchen Sep 25, 2025
dc8308b
Compiled mask creation
danielhanchen Sep 25, 2025
15ae568
Update attention_sink.py
danielhanchen Sep 25, 2025
c849066
Update gpt_oss.py
danielhanchen Sep 25, 2025
eb68b54
Update gpt_oss.py
danielhanchen Sep 25, 2025
b4433b0
Revert
danielhanchen Sep 25, 2025
5f0fa7e
Update gpt_oss.py
danielhanchen Sep 25, 2025
274c830
Update gpt_oss.py
danielhanchen Sep 25, 2025
0c52d58
Fix up
danielhanchen Sep 25, 2025
3d9f498
Update attention_sink.py
danielhanchen Sep 25, 2025
dfe12c5
Update attention_sink.py
danielhanchen Sep 25, 2025
02ec222
Update utils.py
danielhanchen Sep 25, 2025
4e57162
Update attention_sink.py
danielhanchen Sep 25, 2025
17a6427
Update attention_sink.py
danielhanchen Sep 25, 2025
2002c9c
Retry
danielhanchen Sep 25, 2025
1ee8d5e
Update gpt_oss.py
danielhanchen Sep 25, 2025
3994e3c
Update gpt_oss.py
danielhanchen Sep 25, 2025
ef81921
Fix Flex
danielhanchen Sep 25, 2025
8cc0e77
Update gpt_oss.py
danielhanchen Sep 25, 2025
31f1624
Update gpt_oss.py
danielhanchen Sep 25, 2025
27fc0a9
Update gpt_oss.py
danielhanchen Sep 25, 2025
e86c541
Update gpt_oss.py
danielhanchen Sep 25, 2025
b4596cc
Update gpt_oss.py
danielhanchen Sep 25, 2025
858b962
Update gpt_oss.py
danielhanchen Sep 25, 2025
b676650
Update gpt_oss.py
danielhanchen Sep 25, 2025
dc1bd58
Update gpt_oss.py
danielhanchen Sep 25, 2025
1fe5a69
Update gpt_oss.py
danielhanchen Sep 25, 2025
524ac7f
Update gpt_oss.py
danielhanchen Sep 25, 2025
bd34939
Update gpt_oss.py
danielhanchen Sep 25, 2025
935ea71
Update gpt_oss.py
danielhanchen Sep 25, 2025
3ea5482
Update gpt_oss.py
danielhanchen Sep 25, 2025
1885f31
Update gpt_oss.py
danielhanchen Sep 25, 2025
ecd9b53
Update gpt_oss.py
danielhanchen Sep 25, 2025
d3b65af
Update gpt_oss.py
danielhanchen Sep 25, 2025
3b75bc9
Update gpt_oss.py
danielhanchen Sep 25, 2025
b43c1b5
Update gpt_oss.py
danielhanchen Sep 25, 2025
db12a8a
Update gpt_oss.py
danielhanchen Sep 25, 2025
889b4fb
Update gpt_oss.py
danielhanchen Sep 25, 2025
f481e2f
Update gpt_oss.py
danielhanchen Sep 25, 2025
c3e3a90
Update gpt_oss.py
danielhanchen Sep 25, 2025
b721c77
Update gpt_oss.py
danielhanchen Sep 25, 2025
7d81867
Update gpt_oss.py
danielhanchen Sep 25, 2025
577a2a0
Update gpt_oss.py
danielhanchen Sep 25, 2025
c0e421b
Update gpt_oss.py
danielhanchen Sep 25, 2025
2605ecb
Update gpt_oss.py
danielhanchen Sep 25, 2025
e850c7d
Update gpt_oss.py
danielhanchen Sep 25, 2025
9af4313
Update gpt_oss.py
danielhanchen Sep 25, 2025
d8a4e50
Update gpt_oss.py
danielhanchen Sep 25, 2025
1b732ba
Update gpt_oss.py
danielhanchen Sep 25, 2025
666f121
Update gpt_oss.py
danielhanchen Sep 25, 2025
b8cfebf
Update gpt_oss.py
danielhanchen Sep 25, 2025
5e88a87
Update gpt_oss.py
danielhanchen Sep 25, 2025
70dfc00
Update gpt_oss.py
danielhanchen Sep 25, 2025
9128339
Update gpt_oss.py
danielhanchen Sep 25, 2025
082cfb7
Update gpt_oss.py
danielhanchen Sep 25, 2025
0f47e5e
Update gpt_oss.py
danielhanchen Sep 25, 2025
d92e62d
Update gpt_oss.py
danielhanchen Sep 25, 2025
5646157
Update gpt_oss.py
danielhanchen Sep 25, 2025
272689b
Update gpt_oss.py
danielhanchen Sep 25, 2025
d10fc7a
Bug fixes
danielhanchen Sep 26, 2025
4396a93
Update patching_utils.py
danielhanchen Sep 26, 2025
ee50724
Update patching_utils.py
danielhanchen Sep 26, 2025
abe89f0
Update patching_utils.py
danielhanchen Sep 26, 2025
edc85ca
Update rl_replacements.py
danielhanchen Sep 26, 2025
efb18b5
Update patching_utils.py
danielhanchen Sep 26, 2025
f16a5a8
Update patching_utils.py
danielhanchen Sep 26, 2025
0dae9dd
Update patching_utils.py
danielhanchen Sep 26, 2025
435de2d
flash attn
danielhanchen Sep 26, 2025
9cd630c
Update gpt_oss.py
danielhanchen Sep 26, 2025
c510029
Update __init__.py
danielhanchen Sep 26, 2025
98080fc
Update attention_sink.py
danielhanchen Sep 26, 2025
5625cfb
Update gpt_oss.py
danielhanchen Sep 26, 2025
62756a8
Update gpt_oss.py
danielhanchen Sep 26, 2025
3f9a9a9
Update gpt_oss.py
danielhanchen Sep 26, 2025
c32eb2e
Update gpt_oss.py
danielhanchen Sep 26, 2025
63a771c
Update gpt_oss.py
danielhanchen Sep 26, 2025
194ff92
Update gpt_oss.py
danielhanchen Sep 26, 2025
be54940
Update gpt_oss.py
danielhanchen Sep 26, 2025
9ebf49f
Update gpt_oss.py
danielhanchen Sep 26, 2025
2b45d36
dropout_p
danielhanchen Sep 26, 2025
7a6941a
Update gpt_oss.py
danielhanchen Sep 26, 2025
588c4f0
Update gpt_oss.py
danielhanchen Sep 26, 2025
aded049
Update attention_sink.py
danielhanchen Sep 26, 2025
33ba6b3
Update gpt_oss.py
danielhanchen Sep 26, 2025
b08753b
Update gpt_oss.py
danielhanchen Sep 26, 2025
9fe8ec0
fix
danielhanchen Sep 26, 2025
5be9e57
Update attention_sink.py
danielhanchen Sep 26, 2025
a218bfc
Update gpt_oss.py
danielhanchen Sep 26, 2025
9fc2694
Update gpt_oss.py
danielhanchen Sep 26, 2025
769301d
Update gpt_oss.py
danielhanchen Sep 26, 2025
d59f62b
Update gpt_oss.py
danielhanchen Sep 26, 2025
92d16d4
Update gpt_oss.py
danielhanchen Sep 26, 2025
0608531
Update gpt_oss.py
danielhanchen Sep 26, 2025
24bb593
Update gpt_oss.py
danielhanchen Sep 26, 2025
c481eb8
Update gpt_oss.py
danielhanchen Sep 26, 2025
68fed93
Update gpt_oss.py
danielhanchen Sep 26, 2025
9ff936f
Update gpt_oss.py
danielhanchen Sep 26, 2025
77343fa
Update gpt_oss.py
danielhanchen Sep 26, 2025
f3e7f8c
Update gpt_oss.py
danielhanchen Sep 26, 2025
5e7e7d3
Update gpt_oss.py
danielhanchen Sep 26, 2025
a508006
Update loss_utils.py
danielhanchen Sep 26, 2025
44e1de7
Update gpt_oss.py
danielhanchen Sep 26, 2025
1079a21
Update gpt_oss.py
danielhanchen Sep 26, 2025
58e5f24
Update gpt_oss.py
danielhanchen Sep 26, 2025
3c61724
Update gpt_oss.py
danielhanchen Sep 26, 2025
bd50ca4
Update gpt_oss.py
danielhanchen Sep 26, 2025
5f8b77c
Update gpt_oss.py
danielhanchen Sep 26, 2025
f2fe3db
Update gpt_oss.py
danielhanchen Sep 26, 2025
04bbc07
Update loss_utils.py
danielhanchen Sep 26, 2025
cb16066
Update gpt_oss.py
danielhanchen Sep 26, 2025
75d7829
Update gpt_oss.py
danielhanchen Sep 26, 2025
679e882
Update gpt_oss.py
danielhanchen Sep 26, 2025
4b61795
Merge branch 'main' into nightly
danielhanchen Sep 26, 2025
c37dff1
Merge branch 'main' into nightly
danielhanchen Sep 26, 2025
b61346a
Merge branch 'main' into nightly
danielhanchen Sep 26, 2025
a8d6aa8
Merge branch 'main' into nightly
danielhanchen Sep 28, 2025
5225692
Update gpt_oss.py
danielhanchen Sep 28, 2025
02326ab
Update gpt_oss.py
danielhanchen Sep 28, 2025
2210555
Update gpt_oss.py
danielhanchen Sep 30, 2025
f7406a4
Update gpt_oss.py
danielhanchen Sep 30, 2025
7020561
Update gpt_oss.py
danielhanchen Sep 30, 2025
e316226
Update gpt_oss.py
danielhanchen Sep 30, 2025
55a0f94
Update gpt_oss.py
danielhanchen Sep 30, 2025
d241d8d
Versioning
danielhanchen Sep 30, 2025
8d752f6
Merge branch 'main' into nightly
danielhanchen Oct 1, 2025
7c40a85
Update saving_utils.py
danielhanchen Oct 5, 2025
114feed
Update saving_utils.py
danielhanchen Oct 5, 2025
5bdbffe
Update saving_utils.py
danielhanchen Oct 5, 2025
79115db
Update saving_utils.py
danielhanchen Oct 5, 2025
51e3889
Update saving_utils.py
danielhanchen Oct 5, 2025
3284083
Update saving_utils.py
danielhanchen Oct 5, 2025
289abf2
Update saving_utils.py
danielhanchen Oct 5, 2025
efe6d76
Update saving_utils.py
danielhanchen Oct 5, 2025
2f5e342
Fix Gemma 3
danielhanchen Oct 5, 2025
3237c4b
Update misc.py
danielhanchen Oct 5, 2025
dc3e28e
Merge branch 'main' into nightly
danielhanchen Oct 5, 2025
22b3cb6
Merge branch 'main' into nightly
danielhanchen Oct 14, 2025
5beb515
Merge branch 'main' into nightly
danielhanchen Oct 16, 2025
bd43a5b
Update rl_environments.py
danielhanchen Oct 17, 2025
9571b67
Update pyproject.toml
danielhanchen Oct 17, 2025
f789e3b
Update rl_environments.py
danielhanchen Oct 17, 2025
c146ca2
Update __init__.py
danielhanchen Oct 17, 2025
5012df2
Merge branch 'main' into nightly
danielhanchen Oct 17, 2025
80f4b15
Merge branch 'main' into nightly
danielhanchen Oct 17, 2025
6857125
Update empty_model.py
danielhanchen Oct 17, 2025
49f3cd0
Update empty_model.py
danielhanchen Oct 17, 2025
7642fbc
Update empty_model.py
danielhanchen Oct 17, 2025
a6a9a53
Merge branch 'main' into nightly
danielhanchen Oct 17, 2025
565d37f
Merge branch 'main' into nightly
danielhanchen Oct 17, 2025
068142c
Merge branch 'main' into nightly
danielhanchen Oct 19, 2025
9b06516
Merge branch 'main' into nightly
danielhanchen Oct 19, 2025
33a55fc
Merge branch 'main' into nightly
danielhanchen Oct 20, 2025
9f9fad5
Update empty_model.py
danielhanchen Oct 20, 2025
c62f0db
Device type
danielhanchen Oct 20, 2025
44539dc
Update vllm_utils.py
danielhanchen Oct 20, 2025
c7f1a85
Update compiler.py
danielhanchen Oct 20, 2025
d98b8dd
Update empty_model.py
danielhanchen Oct 20, 2025
7dccb4f
Update vllm_utils.py
danielhanchen Oct 20, 2025
96b12f6
Update empty_model.py
danielhanchen Oct 20, 2025
b900605
Fixes
danielhanchen Oct 20, 2025
be24a86
Update empty_model.py
danielhanchen Oct 20, 2025
09a56e1
Update empty_model.py
danielhanchen Oct 20, 2025
dd3f5a9
Update __init__.py
danielhanchen Oct 20, 2025
5e914a5
Update vllm_utils.py
danielhanchen Oct 20, 2025
d45333a
Update vllm_utils.py
danielhanchen Oct 20, 2025
aef0696
Update rl_environments.py
danielhanchen Oct 20, 2025
4bbede7
Update cross_entropy_loss.py
danielhanchen Oct 20, 2025
03adb63
Update vllm_utils.py
danielhanchen Oct 20, 2025
4e0786b
Update vllm_utils.py
danielhanchen Oct 20, 2025
21a4404
Update rl_environments.py
danielhanchen Oct 20, 2025
e63cd7b
Update vllm_utils.py
danielhanchen Oct 20, 2025
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion unsloth_zoo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

__version__ = "2025.10.7"
__version__ = "2025.10.8"

import os
import warnings
# Hugging Face Hub faster downloads
if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
Expand Down Expand Up @@ -101,3 +102,14 @@
execute_with_time_limit,
Benchmarker,
)

# Stop some pydantic warnings
try:
    # pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True
    # was provided to the `Field()` function, which has no effect in the context it was used.
    # 'frozen' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment.
    # This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.
    from pydantic.warnings import UnsupportedFieldAttributeWarning
    warnings.filterwarnings(action = "ignore", category = UnsupportedFieldAttributeWarning)
except Exception:
    # Best effort only: pydantic may be absent or too old to define this warning class.
    # Narrowed from a bare `except:` so SystemExit / KeyboardInterrupt still propagate.
    pass
13 changes: 12 additions & 1 deletion unsloth_zoo/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2035,7 +2035,14 @@ def unsloth_compile_transformers(
except ModuleNotFoundError:
return
modeling_file = eval(model_location)
if hasattr(modeling_file, "__UNSLOTH_PATCHED__"): return
if hasattr(modeling_file, "__UNSLOTH_PATCHED__"):
# Get __UNSLOTH_SUPPORTS_SDPA__
if hasattr(modeling_file, "__UNSLOTH_SUPPORTS_SDPA__"):
if supports_sdpa is not None:
assert(type(supports_sdpa) is list and len(supports_sdpa) == 1)
supports_sdpa[0] = modeling_file.__UNSLOTH_SUPPORTS_SDPA__
return
pass

# Use transformers model_type logger to suppress message: Remove `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`
exec("model_logger.addFilter(HideLoggingMessage('`use_cache`'))", globals(), locals())
Expand Down Expand Up @@ -2189,6 +2196,7 @@ def replaced_tqdm(*args, **kwargs):
torch_modules = [x for x in torch_modules if x not in removal]

# Check SDPA to load as eager or SDPA (Pixtral / Mistral 3 for eg doesn't have SDPA)
final_supports_sdpa = True
if supports_sdpa is not None:
assert(type(supports_sdpa) is list and len(supports_sdpa) == 1)
if ("_supports_sdpa = True" in full_source) and ("_supports_sdpa = False" not in full_source):
Expand All @@ -2197,7 +2205,10 @@ def replaced_tqdm(*args, **kwargs):
if supports_sdpa[0] != False: supports_sdpa[0] = True
else:
supports_sdpa[0] = False
final_supports_sdpa = False
pass
# Save supports_sdpa to solve secondary imports
modeling_file.__UNSLOTH_SUPPORTS_SDPA__ = final_supports_sdpa

# Get functions which are called
called_functions = []
Expand Down
48 changes: 30 additions & 18 deletions unsloth_zoo/empty_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,33 +223,45 @@ def copy_attributes(original_model, new_model):
if dict_skipped_count > 0:
print(f"📋 Skipped {dict_skipped_count} non-config dictionaries")
if skipped_count > 0:
print(f"⏭️ Skipped {skipped_count} total attributes (tensors, modules, non-config dicts, etc.)")
print(f"⏭️ Skipped {skipped_count} total attributes (tensors, modules, non-config dicts, etc.)")
if skipped_count <= 10:
print(f" Skipped: {skipped_attrs}")
print(f" Skipped: {skipped_attrs}")
else:
print(f" Sample: {skipped_attrs[:5]}... and {skipped_count-5} more")
print(f" Sample: {skipped_attrs[:5]}... and {skipped_count-5} more")
pass


@torch.inference_mode()
def create_empty_causal_lm(config, dtype = torch.float16):
# All Unsloth Zoo code licensed under LGPLv3
from transformers import AutoModelForCausalLM
try:
from accelerate import init_empty_weights
# Suppress warning on uninited weights
old_warn = os.environ.get("UNSLOTH_WARN_UNINITIALIZED", "1")
os.environ["UNSLOTH_WARN_UNINITIALIZED"] = "0"
with init_empty_weights():
model_name = getattr(config, 'model_name')
kwargs = {"torch_dtype" if HAS_TORCH_DTYPE else "dtype" : dtype_from_config(config)}
if model_name is not None:
# This would persist quantization information.
from accelerate import init_empty_weights
# Suppress warning on uninited weights
old_warn = os.environ.get("UNSLOTH_WARN_UNINITIALIZED", "1")
os.environ["UNSLOTH_WARN_UNINITIALIZED"] = "0"
model_name = getattr(config, 'model_name')
kwargs = {"torch_dtype" if HAS_TORCH_DTYPE else "dtype" : dtype_from_config(config)}
original_meta_model = None
error = None
with init_empty_weights(include_buffers = True):
if model_name is not None:
try:
# This would persist quantization information for FP8 weights
original_meta_model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
else:
except Exception as e:
error = str(e)
original_meta_model = None
if original_meta_model is None:
try:
# We must do this for 4.57.0 and above
original_meta_model = AutoModelForCausalLM.from_config(config)
# Suppress warning on uninited weights
os.environ["UNSLOTH_WARN_UNINITIALIZED"] = old_warn
except Exception as e:
except Exception as e:
error = str(e)
original_meta_model = None
pass
# Suppress warning on uninited weights
os.environ["UNSLOTH_WARN_UNINITIALIZED"] = old_warn
if error is not None and original_meta_model is None:
print(f"Failed to create original_meta_model for AutoModelForCausalLM. Error {e}")
original_meta_model = None

Expand Down Expand Up @@ -302,7 +314,7 @@ def _init_weights(self, module):
try:
# Use accelerate's init_empty_weights, not transformers.modeling_utils
from accelerate import init_empty_weights
with init_empty_weights():
with init_empty_weights(include_buffers = True):
original_meta_model = model_cls(config)
except Exception as e:
print(f"Failed to create original_meta_model for {model_cls.__name__}. Error {e}")
Expand Down
4 changes: 3 additions & 1 deletion unsloth_zoo/fused_losses/cross_entropy_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import functools
import math
from ..temporary_patches.common import UNSLOTH_ENABLE_LOGGING, torch_compile_options, logger
from unsloth import DEVICE_TYPE
from ..device_type import DEVICE_TYPE

@functools.cache
def _get_mapping(autograd):
Expand Down Expand Up @@ -198,6 +198,8 @@ def forward(
n_chunks = extra_kwargs.pop("n_chunks")
else:
n_chunks = get_chunk_size(bsz, qlen, vocab_size, target_gb = target_gb)
if UNSLOTH_ENABLE_LOGGING:
logger.info(f"Fused CE Loss [bsz={bsz}][qlen={qlen}][vocab_size={vocab_size}][n_chunks={n_chunks}]")
__shift_labels = torch.chunk(labels, n_chunks, dim = 0)
__shift_states = torch.chunk(hidden_states.view(-1, hd), n_chunks, dim = 0)
__grad_inputs = torch.chunk(grad_inputs.view(-1, hd), n_chunks, dim = 0)
Expand Down
86 changes: 61 additions & 25 deletions unsloth_zoo/rl_environments.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from contextlib import contextmanager
from functools import wraps
import threading
import errno
import time
from typing import Callable, TypeVar, Any, Tuple
T = TypeVar("T")
Expand Down Expand Up @@ -299,16 +300,26 @@ def create_locked_down_function(function):
pass


def _retry_eintr(func, *args):
while True:
try:
return func(*args)
except OSError as e:
if getattr(e, "errno", None) == errno.EINTR:
continue
raise
pass

@contextmanager
def time_limit(seconds: float):
def time_limit(seconds: float, *, strict: bool = True, leeway: float = 0.05):
"""
Enforce a wall-clock time limit using SIGALRM/ITIMER_REAL.

Key points:
- Nest-safe: earliest deadline wins.
- Restores any prior timer with remaining time corrected.
- **Interrupts** blocking syscalls so TimeoutError can be raised promptly.
- Unix-like OS, main thread only. Process-wide timer: not composable with other SIGALRM users.
- Earliest deadline wins (respects any currently armed ITIMER_REAL).
- EINTR-safe setup/teardown; resists Ctrl+C during cleanup.
- strict=True: 'fail-closed' — if body returns after the deadline and the
SIGALRM handler didn't get to run, raise TimeoutError on exit anyway.
- Unix-like OS, main thread only. Process-wide SIGALRM: not composable with other users.
"""
if seconds <= 0:
raise ValueError("seconds must be > 0")
Expand All @@ -317,39 +328,64 @@ def time_limit(seconds: float):
if threading.current_thread() is not threading.main_thread():
raise RuntimeError("time_limit must be used from the main thread")

start = time.monotonic()
deadline_at = start + seconds

old_handler = signal.getsignal(signal.SIGALRM)
prev_remaining, prev_interval = signal.getitimer(signal.ITIMER_REAL)

# Always respect any already-armed timer: take the earlier deadline.
deadline = seconds if prev_remaining <= 0.0 else min(seconds, prev_remaining)

fired = False # set by our handler

def _handler(signum, frame):
raise TimeoutError(f"Timed out after {seconds:g}s")
setattr(_handler, "__time_limit_handler__", True)
nonlocal fired
fired = True
# include the intended arming deadline for debugging
raise TimeoutError(f"Timed out after {deadline:g}s")

nested_ours = getattr(old_handler, "__time_limit_handler__", False)
start = time.monotonic()
delay_now = min(seconds, prev_remaining) if (nested_ours and prev_remaining > 0.0) else seconds
setattr(_handler, "__time_limit_handler__", True)

_retry_eintr(signal.signal, signal.SIGALRM, _handler)
try:
signal.signal(signal.SIGALRM, _handler)

# IMPORTANT: ensure blocking syscalls are INTERRUPTED (no SA_RESTART),
# so control returns to Python and we can raise TimeoutError.
# Ensure blocking syscalls are interrupted (avoid SA_RESTART)
try:
signal.siginterrupt(signal.SIGALRM, True)
except (AttributeError, OSError):
pass

signal.setitimer(signal.ITIMER_REAL, delay_now)
_retry_eintr(signal.setitimer, signal.ITIMER_REAL, deadline)
yield
finally:
# Cancel our timer and restore the previous handler.
signal.setitimer(signal.ITIMER_REAL, 0.0)
signal.signal(signal.SIGALRM, old_handler)

# Restore prior timer with corrected remaining time.
if prev_remaining != 0.0 or prev_interval != 0.0:
elapsed = max(time.monotonic() - start, 0.0)
remaining = max(prev_remaining - elapsed, 0.0)
signal.setitimer(signal.ITIMER_REAL, remaining, prev_interval)
# Make teardown atomic wrt SIGINT and robust to EINTR
old_sigint = signal.getsignal(signal.SIGINT)
try:
_retry_eintr(signal.signal, signal.SIGINT, signal.SIG_IGN)
try:
_retry_eintr(signal.setitimer, signal.ITIMER_REAL, 0.0) # cancel ours
finally:
_retry_eintr(signal.signal, signal.SIGALRM, old_handler)

# Restore prior timer with corrected remaining time.
if prev_remaining != 0.0 or prev_interval != 0.0:
elapsed = max(time.monotonic() - start, 0.0)
remaining = max(prev_remaining - elapsed, 0.0)
_retry_eintr(signal.setitimer, signal.ITIMER_REAL, remaining, prev_interval)
finally:
_retry_eintr(signal.signal, signal.SIGINT, old_sigint)

# ---- Fail-closed check (only if no TimeoutError was raised inside) ----
if strict and not fired:
now = time.monotonic()
if now > deadline_at + leeway:
# We exceeded wall time but the handler didn't get a chance to run.
# This typically means the body spent a long time in non-cooperative C code.
raise TimeoutError(
f"Exceeded time limit ({seconds:g}s) without interrupt; "
f"elapsed ≈ {now - start:.3f}s. "
"The protected code likely blocked in a C extension or another SIGALRM user clobbered the timer."
)
pass

def execute_with_time_limit(seconds: float) -> Callable[[Callable[..., T]], Callable[..., T]]:
Expand Down
2 changes: 1 addition & 1 deletion unsloth_zoo/rl_replacements.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import os
import numpy as np
from typing import Union, Callable, Optional, List, Dict
from unsloth import DEVICE_TYPE
from .device_type import DEVICE_TYPE
from .temporary_patches.common import torch_compile_options
RL_REPLACEMENTS = dict()

Expand Down
2 changes: 1 addition & 1 deletion unsloth_zoo/temporary_patches/gpt_oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,7 @@ def forward(self, hidden_states):


# Combo kernels uses too much VRAM for low memory GPUs
from unsloth import DEVICE_TYPE
from ..device_type import DEVICE_TYPE
if DEVICE_TYPE == "xpu":
device_memory = torch.xpu.memory.mem_get_info(0)[-1]
else:
Expand Down
56 changes: 50 additions & 6 deletions unsloth_zoo/vllm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
import math
import gc
import os
import ast
import sys
import torch
import json
import psutil
Expand All @@ -59,8 +61,7 @@
UNSLOTH_ENABLE_LOGGING,
)
from .log import logger
from unsloth import DEVICE_TYPE
from unsloth.models.vision import VLLM_SUPPORTED_VLM
from .device_type import DEVICE_TYPE
global LORA_REQUEST_ID

# Ignore logging messages
Expand Down Expand Up @@ -1219,7 +1220,7 @@ def _override_to(self, *args, **kwargs):
layer.quant_method = "fbgemm_fp8"
elif f"{layer_name}.weight_scale_inv" in quant_state_dict:
# This denotes that the model if FP8 dynamic quantized.
layer = FP8Linear(in_features = 0, out_features = 0, bias = has_bias, dtype=dtype, block_size = kwargs['block_size'], device = get_target_device(), activation_scheme = kwargs['activation_scheme'])
layer = FP8Linear(in_features = 0, out_features = 0, bias = has_bias, dtype = dtype, block_size = kwargs['block_size'], device = get_target_device(), activation_scheme = kwargs['activation_scheme'])
layer.in_features = weight.shape[1]
layer.out_features = weight.shape[0]
layer.weight = torch.nn.Parameter(weight, requires_grad = False)
Expand Down Expand Up @@ -1462,8 +1463,9 @@ def load_vllm(
assert(conservativeness >= 0.0 and conservativeness <= 1.0)

unsloth_vllm_standby = unsloth_vllm_standby or (os.getenv("UNSLOTH_VLLM_STANDBY", "0") != "0")
if unsloth_vllm_standby and gpu_memory_utilization < 0.9:
gpu_memory_utilization = 0.9
if unsloth_vllm_standby and gpu_memory_utilization < 0.8:
## [TODO] Used to allow 0.9, but now 0.85 works only
gpu_memory_utilization = 0.8
logger.info("Unsloth: Standby mode is enabled. Increasing `gpu_memory_utilization` to 0.9.")

if DEVICE_TYPE == "cuda":
Expand Down Expand Up @@ -2360,9 +2362,50 @@ def _test_is_same_vlm(model, new_model, processor, test_backward=False):
mismatches.append(layer_name)
print(f"Backward gradient statistics match for {len(matches)} layers: {matches}")
print(f"Backward gradient statistics mismatch for {len(mismatches)} layers: {mismatches}")
pass


pass
def _read_unsloth_vision_source() -> str:
_VISION_TAIL = ("unsloth", "models", "vision.py")
from importlib.metadata import files, PackageNotFoundError, PackagePath
from pathlib import Path
# 1) Via installed distribution metadata (no import of the package)
try:
for entry in files("unsloth") or ():
if isinstance(entry, PackagePath):
parts = entry.parts
if len(parts) >= 3 and tuple(parts[-3:]) == _VISION_TAIL:
return entry.read_text(encoding = "utf-8")
except PackageNotFoundError:
pass

# 2) Fallback: scan sys.path for a plain file
for base in map(Path, sys.path):
candidate = base.joinpath(*_VISION_TAIL)
if candidate.is_file():
return candidate.read_text(encoding = "utf-8")
raise FileNotFoundError("Could not locate unsloth/models/vision.py without importing it")
pass


def get_vllm_supported_vlm(_VAR_NAME = "VLLM_SUPPORTED_VLM"):
    """
    Parse VLLM_SUPPORTED_VLM from unsloth/models/vision.py as a literal.

    Handles both plain assignments (``NAME = [...]``) and annotated ones
    (``NAME: list[str] = [...]``) at module top level. Raises ``ValueError``
    if the variable is not found as a literal.
    """
    tree = ast.parse(_read_unsloth_vision_source())

    for node in tree.body:
        # Collect the assignment targets for both statement forms.
        if isinstance(node, ast.Assign):
            targets = node.targets
        elif isinstance(node, ast.AnnAssign):
            targets = (node.target,)
        else:
            continue
        if any(getattr(target, "id", None) == _VAR_NAME for target in targets):
            return ast.literal_eval(node.value)
    raise ValueError(f"{_VAR_NAME} not found as a literal in unsloth/models/vision.py")
pass


@torch.inference_mode
def _test_get_vllm_state_dict(
Expand Down Expand Up @@ -2419,6 +2462,7 @@ def _test_get_vllm_state_dict(
if not is_vision_model:
model_class = AutoModelForCausalLM
else:
VLLM_SUPPORTED_VLM = get_vllm_supported_vlm()
if model_type in VLLM_SUPPORTED_VLM:
import transformers
model_class = getattr(transformers, config.architectures[0])
Expand Down