Skip to content

Commit 2202589

Browse files
authored
Merge pull request llmware-ai#474 from llmware-ai/windows-cuda-support
update windows cuda config options
2 parents 483e600 + fbda27f commit 2202589

File tree

2 files changed

+12
-3
lines changed

2 files changed

+12
-3
lines changed

llmware/gguf_configs.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -238,9 +238,9 @@ def add_ctypes_declarations (_lib):
238238
llama_new_context_with_model.argtypes = [llama_model_p_ctypes, llama_context_params]
239239
llama_new_context_with_model.restype = llama_context_p_ctypes
240240

241-
llama_numa_init = _lib.llama_numa_init
242-
llama_numa_init.argtypes = [ctypes.c_int]
243-
llama_numa_init.restype = None
241+
# llama_numa_init = _lib.llama_numa_init
242+
# llama_numa_init.argtypes = [ctypes.c_int]
243+
# llama_numa_init.restype = None
244244

245245
llama_free = _lib.llama_free
246246
llama_free.argtypes = [llama_context_p_ctypes]
@@ -810,6 +810,9 @@ class GGUFConfigs:
810810
# --Linux / Windows - checks for cuda availability
811811
"use_gpu": True,
812812

813+
# note this will be used on Windows and Linux, but not Mac
814+
"n_gpu_layers": 50,
815+
813816
"backend_initialized": False,
814817

815818
"max_output_tokens": 256,

llmware/models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4683,6 +4683,12 @@ def load_model_for_inference(self, model_repo_path, model_card = None):
46834683
self.model_params.use_mmap = True
46844684
self.model_params.use_mlock = False
46854685

4686+
if self.use_gpu:
4687+
# on darwin, keep at 0 - on win32 and linux - set to 50 by default (e.g., shift all model layers to GPU)
4688+
if sys.platform.lower() == "win32" or sys.platform.lower().startswith("linux"):
4689+
4690+
self.model_params.n_gpu_layers = GGUFConfigs().get_config("n_gpu_layers")
4691+
46864692
# update context parameters
46874693
self.context_params = self._lib.llama_context_default_params()
46884694
self.context_params.n_ctx = 2048

0 commit comments

Comments (0)