benchmark: compare backend graph computation times

Track the execution time of individual tensor operations (views, copies, reshapes, etc.) during LLM forward passes using cgo bindings to the native graph runtime. This helps identify performance bottlenecks in the computation graph and optimize memory operations that can significantly impact inference latency.
Merge pull request #9203 from ollama/mxyng/sapphirerapids
2025-02-19 15:22:53 -08:00 · 2025-02-19 21:42:00 +00:00 · 2025-02-19 13:24:27 -08:00 · 2025-02-19 13:22:48 -08:00 · 2025-02-19 13:20:09 -08:00 · 2025-02-18 22:46:17 -05:00
20 changed files with 791 additions and 206 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -329,7 +329,9 @@ jobs:
          done
        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
      - run: |
-          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
+            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
+          done
      - uses: actions/upload-artifact@v4
        with:
          name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,7 @@ set(GGML_LLAMAFILE ON)
 set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
 set(GGML_CUDA_GRAPHS ON)

-if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
    OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
    set(GGML_CPU_ALL_VARIANTS ON)
 endif()
--- a/README.md
+++ b/README.md
@@ -381,6 +381,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
 - [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
+- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
+- [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
+- [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)

 ### Cloud

@@ -548,6 +551,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 - [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
+- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (Telegram bot, primarily for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration, etc.)

 ### Supported backends

--- a/benchmark/ggml_backend_benchmark_test.go
+++ b/benchmark/ggml_backend_benchmark_test.go
@@ -0,0 +1,86 @@
+package backend
+
+import (
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/server"
+
+	_ "github.com/ollama/ollama/model/models/llama"
+)
+
+var modelName = flag.String("m", "", "Name of the model to benchmark")
+
+// suppressOutput silences process-wide output: it sets os.Stdout and
+// os.Stderr to nil and points the standard logger at io.Discard. The returned
+// cleanup function puts the saved streams back and redirects the standard
+// logger to the restored os.Stderr.
+func suppressOutput() (cleanup func()) {
+	savedOut := os.Stdout
+	savedErr := os.Stderr
+
+	os.Stdout = nil
+	os.Stderr = nil
+	log.SetOutput(io.Discard)
+
+	cleanup = func() {
+		os.Stdout = savedOut
+		os.Stderr = savedErr
+		log.SetOutput(os.Stderr)
+	}
+	return cleanup
+}
+
+// setupModel resolves the model named by the -m flag through
+// server.GetModel, loads it from the resolved ModelPath with model.New, and
+// initializes its cache with ml.DTypeF32 and 2048 as the final Init argument.
+// A missing flag or any load error aborts the benchmark via b.Fatal.
+func setupModel(b *testing.B) model.Model {
+	name := *modelName
+	if name == "" {
+		b.Fatal("Error: -m flag is required for benchmark tests")
+	}
+
+	registered, err := server.GetModel(name)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	loaded, err := model.New(registered.ModelPath)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	loaded.Config().Cache.Init(loaded.Backend(), ml.DTypeF32, 2048)
+	return loaded
+}
+
+// BenchmarkGGMLOperations runs one forward pass of the model selected with
+// -m per benchmark iteration and reports the duration of every timed graph
+// operation as a custom metric named "<OpType>_ms".
+//
+// It sets OLLAMA_BENCHMARK=1 and OLLAMA_BACKEND=ggml for the duration of the
+// benchmark and suppresses stdout/stderr/log output while it runs.
+func BenchmarkGGMLOperations(b *testing.B) {
+	// loading the GGML back-end logs to standard out and makes the bench output messy
+	cleanup := suppressOutput()
+	defer cleanup()
+
+	b.Setenv("OLLAMA_BENCHMARK", "1")
+	b.Setenv("OLLAMA_BACKEND", "ggml")
+
+	m := setupModel(b)
+
+	// Sample input data
+	inputIDs := []int32{1, 2, 3, 4, 5}
+	options := model.Options{
+		Inputs:    inputIDs,
+		Positions: []int32{1, 2, 3, 4, 5},
+		Sequences: []int{1, 1, 1, 1, 1},
+		Outputs:   []int32{int32(len(inputIDs) - 1)},
+	}
+
+	// forwardOnce owns exactly one backend context. Keeping the defer inside
+	// this closure closes the context at the end of each iteration instead of
+	// accumulating b.N open contexts until the benchmark function returns.
+	forwardOnce := func() {
+		ctx := m.Backend().NewContext()
+		defer ctx.Close()
+
+		modelOutput, err := model.Forward(ctx, m, options)
+		if err != nil {
+			b.Fatalf("forward pass failed: %v", err)
+		}
+
+		ctx.Compute(modelOutput)
+
+		// NOTE(review): several operations can share one OpType, and
+		// ReportMetric is documented to override earlier values for the same
+		// unit, so only the last operation of each type is reported — confirm
+		// this is intended.
+		for _, op := range ctx.Timing() {
+			b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
+		}
+	}
+
+	b.ResetTimer()
+
+	for range b.N {
+		forwardOnce()
+	}
+}
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -10,6 +10,7 @@ import (
 	"os"
 	"strings"
 	"testing"
+	"time"

 	"github.com/google/go-cmp/cmp"
 	"github.com/spf13/cobra"
@@ -490,6 +491,96 @@ func TestPushHandler(t *testing.T) {
 	}
 }

+// TestListHandler exercises ListHandler against a mock /api/tags endpoint.
+// Each case starts an httptest server, points OLLAMA_HOST at it, captures
+// what the handler prints to os.Stdout through a pipe, and compares either
+// the printed table or the returned error with the expectation.
+func TestListHandler(t *testing.T) {
+	tests := []struct {
+		name           string
+		args           []string
+		serverResponse []api.ListModelResponse
+		expectedError  string
+		expectedOutput string
+	}{
+		{
+			name: "list all models",
+			args: []string{},
+			serverResponse: []api.ListModelResponse{
+				{Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
+				{Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-48 * time.Hour)},
+			},
+			expectedOutput: "NAME      ID              SIZE      MODIFIED     \n" +
+				"model1    sha256:abc12    1.0 KB    24 hours ago    \n" +
+				"model2    sha256:def45    2.0 KB    2 days ago      \n",
+		},
+		{
+			name: "filter models by prefix",
+			args: []string{"model1"},
+			serverResponse: []api.ListModelResponse{
+				{Name: "model1", Digest: "sha256:abc123", Size: 1024, ModifiedAt: time.Now().Add(-24 * time.Hour)},
+				{Name: "model2", Digest: "sha256:def456", Size: 2048, ModifiedAt: time.Now().Add(-24 * time.Hour)},
+			},
+			expectedOutput: "NAME      ID              SIZE      MODIFIED     \n" +
+				"model1    sha256:abc12    1.0 KB    24 hours ago    \n",
+		},
+		{
+			name:          "server error",
+			args:          []string{},
+			expectedError: "server error",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if r.URL.Path != "/api/tags" || r.Method != http.MethodGet {
+					t.Errorf("unexpected request to %s %s", r.Method, r.URL.Path)
+					http.Error(w, "not found", http.StatusNotFound)
+					return
+				}
+
+				if tt.expectedError != "" {
+					http.Error(w, tt.expectedError, http.StatusInternalServerError)
+					return
+				}
+
+				response := api.ListResponse{Models: tt.serverResponse}
+				if err := json.NewEncoder(w).Encode(response); err != nil {
+					// This handler runs on the server's goroutine, not the
+					// test goroutine, so FailNow-style calls (t.Fatal) are
+					// not allowed here; record the failure instead.
+					t.Errorf("encoding response: %v", err)
+				}
+			}))
+			defer mockServer.Close()
+
+			t.Setenv("OLLAMA_HOST", mockServer.URL)
+
+			cmd := &cobra.Command{}
+			cmd.SetContext(context.TODO())
+
+			// Capture stdout
+			oldStdout := os.Stdout
+			r, w, pipeErr := os.Pipe()
+			if pipeErr != nil {
+				t.Fatalf("creating stdout pipe: %v", pipeErr)
+			}
+			defer r.Close()
+			os.Stdout = w
+
+			err := ListHandler(cmd, tt.args)
+
+			// Restore stdout and get output
+			w.Close()
+			os.Stdout = oldStdout
+			output, readErr := io.ReadAll(r)
+			if readErr != nil {
+				t.Fatalf("reading captured stdout: %v", readErr)
+			}
+
+			if tt.expectedError == "" {
+				if err != nil {
+					t.Errorf("expected no error, got %v", err)
+				}
+				if got := string(output); got != tt.expectedOutput {
+					t.Errorf("expected output:\n%s\ngot:\n%s", tt.expectedOutput, got)
+				}
+			} else {
+				if err == nil || !strings.Contains(err.Error(), tt.expectedError) {
+					t.Errorf("expected error containing %q, got %v", tt.expectedError, err)
+				}
+			}
+		})
+	}
+}
+
 func TestCreateHandler(t *testing.T) {
 	tests := []struct {
 		name           string
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -55,7 +55,7 @@ Here's a quick example showing API access from `powershell`
 ## Troubleshooting

 Ollama on Windows stores files in a few different locations.  You can view them in
-the explorer window by hitting `<cmd>+R` and type in:
+the explorer window by hitting `<Win>+R` and typing in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
    - *app.log* contains the most recent logs from the GUI application
    - *server.log* contains the most recent server logs
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -167,6 +167,8 @@ var (
 	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
+	// Ollama is running in a benchmark context, additional timing data will be collected.
+	Benchmark = Bool("OLLAMA_BENCHMARK")
 )

 func String(s string) func() string {
--- a/format/format_test.go
+++ b/format/format_test.go
@@ -12,6 +12,9 @@ func TestHumanNumber(t *testing.T) {

 	testCases := []testCase{
 		{0, "0"},
+		{999, "999"},
+		{1000, "1K"},
+		{1001, "1K"},
 		{1000000, "1M"},
 		{125000000, "125M"},
 		{500500000, "500.50M"},
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -305,6 +305,10 @@ func (b *testBackend) NewContext() ml.Context {
 	return &testContext{}
 }

+// SystemInfo returns the fixed placeholder string "not implemented"; the test
+// backend reports no real system information.
+func (b *testBackend) SystemInfo() string {
+	return "not implemented"
+}
+
 type testContext struct{}

 func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
@@ -348,6 +352,10 @@ func (c *testContext) MaxTensors() int {
 	return 10
 }

+// Timing returns an empty, non-nil slice; the test context records no
+// operation timings.
+func (c *testContext) Timing() []ml.OpTiming {
+	return []ml.OpTiming{}
+}
+
 func (c *testContext) Close() {}

 type testTensor struct {
@@ -430,7 +438,7 @@ func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0
 	panic("not implemented")
 }

-func (t *testTensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {
+func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim uint32, base, scale float32) ml.Tensor {
 	panic("not implemented")
 }

--- a/llama/patches/0018-remove-amx.patch
+++ b/llama/patches/0018-remove-amx.patch
@@ -0,0 +1,24 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Tue, 18 Feb 2025 14:47:21 -0800
+Subject: [PATCH] remove amx
+
+---
+ ggml/src/CMakeLists.txt | 4 ----
+ 1 file changed, 4 deletions(-)
+
+diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
+index 72b488dd..50828717 100644
+--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
+@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
+     ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)
+     ggml_add_cpu_backend_variant(icelake        AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+     ggml_add_cpu_backend_variant(alderlake      AVX F16C AVX2 FMA AVX_VNNI)
+-    if (NOT MSVC)
+-        # MSVC doesn't support AMX
+-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+-    endif()
+ else ()
+     ggml_add_cpu_backend_variant_impl("")
+ endif()
--- a/llama/patches/0018-use-std-filesystem-path-instead-of-wstring.patch
+++ b/llama/patches/0018-use-std-filesystem-path-instead-of-wstring.patch
@@ -0,0 +1,285 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Sun, 16 Feb 2025 20:00:22 -0500
+Subject: [PATCH] use std::filesystem::path instead of wstring
+
+---
+ ggml/src/ggml-backend-reg.cpp | 116 ++++++++++++----------------------
+ 1 file changed, 40 insertions(+), 76 deletions(-)
+
+diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
+index 84b21dd8..de78feae 100644
+--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
+@@ -72,16 +72,6 @@
+ #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+ #endif
+ 
+-static std::wstring utf8_to_utf16(const std::string & str) {
+-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+-    return converter.from_bytes(str);
+-}
+-
+-static std::string utf16_to_utf8(const std::wstring & str) {
+-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+-    return converter.to_bytes(str);
+-}
+-
+ #if defined(__clang__)
+ #    pragma clang diagnostic pop
+ #endif
+@@ -96,12 +86,12 @@ struct dl_handle_deleter {
+     }
+ };
+ 
+-static dl_handle * dl_load_library(const std::wstring & path) {
+static dl_handle * dl_load_library(const std::filesystem::path & path) {
+     // suppress error dialogs for missing DLLs
+     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+ 
+-    HMODULE handle = LoadLibraryW(path.c_str());
+    HMODULE handle = LoadLibraryW(path.wstring().c_str());
+ 
+     SetErrorMode(old_mode);
+ 
+@@ -129,8 +119,8 @@ struct dl_handle_deleter {
+     }
+ };
+ 
+-static void * dl_load_library(const std::wstring & path) {
+-    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::filesystem::path & path) {
+    dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
+ 
+     return handle;
+ }
+@@ -222,11 +212,11 @@ struct ggml_backend_registry {
+         );
+     }
+ 
+-    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
+    ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
+         dl_handle_ptr handle { dl_load_library(path) };
+         if (!handle) {
+             if (!silent) {
+-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path.string().c_str());
+             }
+             return nullptr;
+         }
+@@ -234,7 +224,7 @@ struct ggml_backend_registry {
+         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+         if (score_fn && score_fn() == 0) {
+             if (!silent) {
+-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path.string().c_str());
+             }
+             return nullptr;
+         }
+@@ -242,7 +232,7 @@ struct ggml_backend_registry {
+         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
+         if (!backend_init_fn) {
+             if (!silent) {
+-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path.string().c_str());
+             }
+             return nullptr;
+         }
+@@ -251,16 +241,16 @@ struct ggml_backend_registry {
+         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
+             if (!silent) {
+                 if (!reg) {
+-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path.string().c_str());
+                 } else {
+                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
+-                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, path.string().c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+                 }
+             }
+             return nullptr;
+         }
+ 
+-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path.string().c_str());
+ 
+         register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
+ 
+@@ -396,14 +386,14 @@ ggml_backend_t ggml_backend_init_best(void) {
+ 
+ // Dynamic loading
+ ggml_backend_reg_t ggml_backend_load(const char * path) {
+-    return get_reg().load_backend(utf8_to_utf16(path), false);
+    return get_reg().load_backend(path, false);
+ }
+ 
+ void ggml_backend_unload(ggml_backend_reg_t reg) {
+     get_reg().unload_backend(reg, true);
+ }
+ 
+-static std::wstring get_executable_path() {
+static std::filesystem::path get_executable_path() {
+ #if defined(__APPLE__)
+     // get executable path
+     std::vector<char> path;
+@@ -415,15 +405,9 @@ static std::wstring get_executable_path() {
+         }
+         path.resize(size);
+     }
+-    std::string base_path(path.data(), size);
+-    // remove executable name
+-    auto last_slash = base_path.find_last_of('/');
+-    if (last_slash != std::string::npos) {
+-        base_path = base_path.substr(0, last_slash);
+-    }
+-    return utf8_to_utf16(base_path + "/");
+
+    return std::filesystem::path(path.data()).parent_path();
+ #elif defined(__linux__) || defined(__FreeBSD__)
+-    std::string base_path = ".";
+     std::vector<char> path(1024);
+     while (true) {
+         // get executable path
+@@ -436,76 +420,56 @@ static std::wstring get_executable_path() {
+             break;
+         }
+         if (len < (ssize_t) path.size()) {
+-            base_path = std::string(path.data(), len);
+-            // remove executable name
+-            auto last_slash = base_path.find_last_of('/');
+-            if (last_slash != std::string::npos) {
+-                base_path = base_path.substr(0, last_slash);
+-            }
+-            break;
+            return std::filesystem::path(path.data()).parent_path();
+         }
+         path.resize(path.size() * 2);
+     }
+-
+-    return utf8_to_utf16(base_path + "/");
+ #elif defined(_WIN32)
+     std::vector<wchar_t> path(MAX_PATH);
+     DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
+     if (len == 0) {
+         return {};
+     }
+-    std::wstring base_path(path.data(), len);
+-    // remove executable name
+-    auto last_slash = base_path.find_last_of('\\');
+-    if (last_slash != std::string::npos) {
+-        base_path = base_path.substr(0, last_slash);
+-    }
+-    return base_path + L"\\";
+-#else
+-    return {};
+-#endif
+-}
+ 
+-static std::wstring backend_filename_prefix() {
+-#ifdef _WIN32
+-    return L"ggml-";
+    return std::filesystem::path(path.data()).parent_path();
+ #else
+-    return L"libggml-";
+    return {};
+ #endif
+ }
+ 
+-static std::wstring backend_filename_suffix() {
+static std::string backend_filename_prefix() {
+ #ifdef _WIN32
+-    return L".dll";
+    return "ggml-";
+ #else
+-    return L".so";
+    return "libggml-";
+ #endif
+ }
+ 
+-static std::wstring path_separator() {
+static std::string backend_filename_suffix() {
+ #ifdef _WIN32
+-    return L"\\";
+    return ".dll";
+ #else
+-    return L"/";
+    return ".so";
+ #endif
+ }
+ 
+ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
+     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
+      // TODO: search system paths
+-    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+-    std::vector<std::wstring> search_paths;
+    namespace fs = std::filesystem;
+    std::string file_prefix = backend_filename_prefix() + name + "-";
+    std::vector<fs::path> search_paths;
+
+     if (user_search_path == nullptr) {
+-        search_paths.push_back(L"." + path_separator());
+        search_paths.push_back(fs::current_path());
+         search_paths.push_back(get_executable_path());
+     } else {
+-        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
+        search_paths.push_back(fs::u8path(user_search_path));
+     }
+ 
+     int best_score = 0;
+-    std::wstring best_path;
+    fs::path best_path;
+ 
+-    namespace fs = std::filesystem;
+     for (const auto & search_path : search_paths) {
+         if (!fs::exists(search_path)) {
+             continue;
+@@ -514,31 +478,31 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
+         for (const auto & entry : dir_it) {
+             try {
+                 if (entry.is_regular_file()) {
+-                    std::wstring filename = entry.path().filename().wstring();
+-                    std::wstring ext = entry.path().extension().wstring();
+                    std::string filename = entry.path().filename().string();
+                    std::string ext = entry.path().extension().string();
+                     if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
+-                        dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
+                        dl_handle_ptr handle { dl_load_library(entry.path()) };
+                         if (!handle) {
+-                            GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+                            GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                             continue;
+                         }
+ 
+                         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+                         if (!score_fn) {
+-                            GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+                            GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                             continue;
+                         }
+ 
+                         int s = score_fn();
+-                        GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
+                        GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                         if (s > best_score) {
+                             best_score = s;
+-                            best_path = entry.path().wstring();
+                            best_path = entry.path();
+                         }
+                     }
+                 }
+             } catch (const std::exception & e) {
+-                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), e.what());
+                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, entry.path().string().c_str(), e.what());
+             }
+         }
+     }
+@@ -546,7 +510,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
+     if (best_score == 0) {
+         // try to load the base backend
+         for (const auto & search_path : search_paths) {
+-            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
+            fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
+             if (fs::exists(path)) {
+                 return get_reg().load_backend(path, silent);
+             }
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -2,6 +2,7 @@ package ml

 import (
 	"bytes"
+	"cmp"
 	"encoding/binary"
 	"fmt"
 	"os"
@@ -23,6 +24,7 @@ type Backend interface {
 	Config() Config
 	Get(name string) Tensor
 	NewContext() Context
+	SystemInfo() string
 }

 var backends = make(map[string]func(*os.File) (Backend, error))
@@ -36,49 +38,13 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
 }

 func NewBackend(f *os.File) (Backend, error) {
-	if backend, ok := backends["ggml"]; ok {
+	if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
 		return backend(f)
 	}

 	return nil, fmt.Errorf("unsupported backend")
 }

-// RopeType specifies the type of RoPE (Rotary Position Embedding) to use, these types are implemented in the backend
-type RopeType int
-
-const (
-	RopeTypeStandard RopeType = iota
-	_                         // not yet used
-	RopeTypeNeoX
-)
-
-// RopeConfig contains all configuration for the RoPE (Rotary Position Embedding) operation
-type RopeConfig struct {
-	// PositionIDs contains the position indices for each token in the sequence
-	// These indices are used to calculate the rotary embeddings
-	PositionIDs Tensor
-
-	// RopeFactors is an optional tensor containing pre-computed rotation factors
-	RopeFactors Tensor
-
-	// RopeDim specifies the dimension size for the rotary embeddings
-	RopeDim uint32
-
-	// RopeType indicates which RoPE variant to use (e.g. normal or neox)
-	RopeType RopeType
-
-	// OrigCtxLen stores the original context length the model was trained with
-	OrigCtxLen int
-
-	// RopeBase is the base value used in the frequency calculation
-	RopeBase float32
-
-	// RopeScale is a scaling factor applied to position indices
-	RopeScale float32
-
-	// YaRN parameters can be added here if they need to be configurable
-}
-
 type Context interface {
 	Zeros(dtype DType, shape ...int) Tensor
 	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
@@ -88,6 +54,30 @@ type Context interface {
 	Compute(...Tensor)
 	MaxTensors() int
 	Close()
+
+	Timing() []OpTiming
+}
+
+// OpType is the type of operation performed during a forward pass.
+// The value doubles as a display label: the benchmark formats it into metric
+// names.
+type OpType string
+
+// Operation categories reported in OpTiming.Type.
+const (
+	View       OpType = "View"
+	Copy       OpType = "Copy"
+	Reshape    OpType = "Reshape"
+	Permute    OpType = "Permute"
+	Contiguous OpType = "Contiguous"
+	Input      OpType = "Input"
+	ComputeOp  OpType = "Compute"
+	Transpose  OpType = "Transpose"
+)
+
+// OpTiming stores the timing information for a single operation.
+type OpTiming struct {
+	// Type is the category the operation was classified into.
+	Type OpType
+	// Operation is a human-readable description of the operation.
+	Operation string
+	// Duration is the measured time of the operation; the ggml backend fills
+	// this from a millisecond measurement.
+	Duration float64
+	// Order is the position of the operation in the recorded sequence; the
+	// ggml backend sets it to the zero-based entry index.
+	Order int
 }

 type Tensor interface {
@@ -111,7 +101,7 @@ type Tensor interface {
 	Scale(ctx Context, s float64) Tensor

 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
-	RoPE(ctx Context, rc RopeConfig) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor

 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -1,11 +1,77 @@
 package ggml

-// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
-// #include <stdlib.h>
-// #include <stdint.h>
-// #include "ggml.h"
-// #include "ggml-cpu.h"
-// #include "ggml-backend.h"
+/*
+#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
+#include <stdlib.h>
+#include <stdint.h>
+#include <time.h>
+#include <string.h>
+#include "ggml.h"
+#include "ggml-cpu.h"
+#include "ggml-backend.h"
+// getBackendFeatures calls fp, cast to ggml_backend_get_features_t, with reg and returns its result.
+static struct ggml_backend_feature * getBackendFeatures(void *fp, ggml_backend_reg_t reg) {return ((ggml_backend_get_features_t)(fp))(reg);}
+// getNextBackendFeatures returns a pointer to the array element immediately following feature.
+static struct ggml_backend_feature * getNextBackendFeatures(struct ggml_backend_feature * feature) { return &feature[1];}
+
+typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
+
+// get_compiler reports which compiler built this preamble, based on the
+// predefined __clang__ / __GNUC__ macros. __clang__ is tested first because
+// clang also defines __GNUC__. Any other compiler yields COMP_UNKNOWN.
+//
+// The function is static inline so that every translation unit including this
+// preamble carries its own definition; a plain C99 inline provides no external
+// definition and can fail to link when the call is not inlined.
+static inline COMPILER get_compiler(void) {
+#if defined(__clang__)
+	return COMP_CLANG;
+#elif defined(__GNUC__)
+	return COMP_GCC;
+#else
+	return COMP_UNKNOWN;
+#endif
+}
+
+// Define a fixed-size struct to store timing data
+// MAX_TENSOR_NAME is the byte size of each stored name buffer and MAX_TIMINGS
+// is the number of entry slots in the table.
+#define MAX_TENSOR_NAME 256
+#define MAX_TIMINGS 1000
+
+// timing_entry holds one recorded tensor name and its measured duration in
+// milliseconds.
+typedef struct {
+    char tensor_name[MAX_TENSOR_NAME];
+    double duration_ms;
+} timing_entry;
+
+// timing_data is a fixed-capacity table of entries; count is the number of
+// slots currently in use.
+typedef struct {
+    timing_entry entries[MAX_TIMINGS];
+    int count;
+} timing_data;
+
+// Global timing data structure
+// A single table, zero-initialized, that the functions below read and write.
+timing_data g_timings = {0};
+
+// get_time_ms returns the current CLOCK_MONOTONIC reading converted to
+// milliseconds (seconds * 1000 plus nanoseconds / 1e6) as a double.
+// The return value of clock_gettime is not checked.
+double get_time_ms() {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
+}
+
+// debug_callback records how long passes between an "ask" call and the
+// following non-"ask" call, and stores the result in g_timings.
+//
+// When ask is true it remembers the current time and a truncated copy of
+// t->name. When ask is false it computes the elapsed milliseconds since the
+// most recent ask call and appends an entry under the remembered name. Once
+// MAX_TIMINGS entries are stored, further measurements are dropped silently.
+// user_data is unused. The function always returns true.
+//
+// The start time and name live in function-local statics, so a non-ask call
+// is always paired with whichever ask call ran last.
+bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
+    static double start_time;
+    static char current_tensor[MAX_TENSOR_NAME];
+
+    if (ask) {
+        start_time = get_time_ms();
+        // copy at most MAX_TENSOR_NAME - 1 bytes, then terminate explicitly
+        strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
+        current_tensor[MAX_TENSOR_NAME - 1] = '\0';
+    } else {
+        double end_time = get_time_ms();
+        double duration = end_time - start_time;
+
+        if (g_timings.count < MAX_TIMINGS) {
+            // the last byte of the destination buffer is never written here
+            strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
+            g_timings.entries[g_timings.count].duration_ms = duration;
+            g_timings.count++;
+        }
+    }
+    return true;
+}
+
+// clear_timings resets the entry count of g_timings to zero. Stored entries
+// are not erased; they are overwritten as new measurements are appended.
+void clear_timings() {
+    g_timings.count = 0;
+}
+
+*/
 import "C"

 import (
@@ -13,9 +79,11 @@ import (
 	"io"
 	"log/slog"
 	"os"
+	"strings"
 	"sync"
 	"unsafe"

+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	fs "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/ml"
@@ -240,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
 	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
 }

+// Timing converts the entries currently stored in the C-side g_timings table
+// into ml.OpTiming values, one per entry, in recorded order. Each entry is
+// classified by markers found in its tensor name; names that match no marker
+// are reported with type "Unknown" and the raw name as the description.
+func (c *Context) Timing() []ml.OpTiming {
+	total := int(C.g_timings.count)
+	timings := make([]ml.OpTiming, total)
+
+	for idx := 0; idx < total; idx++ {
+		recorded := C.g_timings.entries[idx]
+		name := C.GoString(&recorded.tensor_name[0])
+
+		// Start from the fallback and refine it with the first matching
+		// marker; the checks run in the same order of precedence as before.
+		kind, desc := ml.OpType("Unknown"), name
+
+		if strings.Contains(name, "(view)") {
+			kind, desc = ml.View, "Memory view"
+		} else if strings.Contains(name, "(copy)") || strings.Contains(name, "(copy of") {
+			kind, desc = ml.Copy, "Memory copy"
+		} else if strings.Contains(name, "(reshaped)") {
+			kind, desc = ml.Reshape, "Reshape"
+		} else if strings.Contains(name, "(permuted)") {
+			kind, desc = ml.Permute, "Permute dimensions"
+		} else if strings.Contains(name, "(cont)") {
+			kind, desc = ml.Contiguous, "Make contiguous"
+		} else if strings.Contains(name, "(transposed)") {
+			kind, desc = ml.Transpose, "Transpose"
+		} else if strings.HasPrefix(name, "leaf_") {
+			kind, desc = ml.Input, fmt.Sprintf("Input tensor %s", name)
+		} else if strings.HasPrefix(name, "node_") {
+			kind, desc = ml.ComputeOp, fmt.Sprintf("Computation %s", name)
+		}
+
+		timings[idx] = ml.OpTiming{
+			Type:      kind,
+			Operation: desc,
+			Duration:  float64(recorded.duration_ms),
+			Order:     idx,
+		}
+	}
+
+	return timings
+}
+
 func (c *Context) Compute(tensors ...ml.Tensor) {
+	if envconfig.Benchmark() {
+		// Clear previous timings before new computation
+		C.clear_timings()
+
+		C.ggml_backend_sched_set_eval_callback(
+			c.sched,
+			C.ggml_backend_eval_callback(C.debug_callback),
+			nil,
+		)
+	}
+
 	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)

 	needSync := true
@@ -579,9 +702,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	}
 }

-func (t *Tensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {
-	if rc.RopeFactors == nil {
-		rc.RopeFactors = &Tensor{}
+const (
+	ropeTypeNorm C.int = iota
+)
+
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
+	if ropeFactors == nil {
+		ropeFactors = &Tensor{}
 	}

 	dequant := t.t
@@ -591,15 +718,12 @@ func (t *Tensor) RoPE(ctx ml.Context, rc ml.RopeConfig) ml.Tensor {

 	return &Tensor{
 		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx,
-			dequant,
-			rc.PositionIDs.(*Tensor).t,
-			rc.RopeFactors.(*Tensor).t,
-			C.int(rc.RopeDim),
-			C.int(rc.RopeType),
-			C.int(rc.OrigCtxLen),
-			C.float(rc.RopeBase),
-			C.float(rc.RopeScale),
+			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			C.int(ropeDim),
+			131072,       // YaRN n_ctx_train
+			ropeTypeNorm, // ROPE_TYPE_NORM
+			C.float(ropeBase),
+			C.float(ropeScale),
 			0.,  // YaRN ext_factor
 			1.,  // YaRN attn_factor
 			32., // YaRN beta_fast
@@ -625,3 +749,34 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
 	}
 }
+
+// SystemInfo returns a "name : feature = value | ..." segment for every registered backend
+// that exposes ggml_backend_get_features, followed by the cgo compiler tag.
+func (b *Backend) SystemInfo() string {
+	var compiler string
+	switch C.get_compiler() {
+	case C.COMP_UNKNOWN:
+		compiler = "cgo(unknown_compiler)"
+	case C.COMP_GCC:
+		compiler = "cgo(gcc)"
+	case C.COMP_CLANG:
+		compiler = "cgo(clang)"
+	}
+
+	// Allocate the symbol name once: it is the same for every backend, so one C string and one free suffice.
+	fName := C.CString("ggml_backend_get_features")
+	defer C.free(unsafe.Pointer(fName))
+	var sb strings.Builder
+	for i := range C.ggml_backend_reg_count() {
+		reg := C.ggml_backend_reg_get(i)
+		getFeatures := C.ggml_backend_reg_get_proc_address(reg, fName)
+		if getFeatures == nil {
+			continue // lookup of the feature-list entry point returned nothing for this backend
+		}
+		sb.WriteString(C.GoString(C.ggml_backend_reg_name(reg)) + " : ")
+		for f := C.getBackendFeatures(getFeatures, reg); f.name != nil; f = C.getNextBackendFeatures(f) {
+			sb.WriteString(C.GoString(f.name) + " = " + C.GoString(f.value) + " | ")
+		}
+	}
+	return sb.String() + compiler
+}
--- a/ml/backend/ggml/ggml/src/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
    ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)
    ggml_add_cpu_backend_variant(icelake        AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
    ggml_add_cpu_backend_variant(alderlake      AVX F16C AVX2 FMA AVX_VNNI)
-    if (NOT MSVC)
-        # MSVC doesn't support AMX
-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
-    endif()
 else ()
    ggml_add_cpu_backend_variant_impl("")
 endif()
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -72,16 +72,6 @@
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
 #endif

-static std::wstring utf8_to_utf16(const std::string & str) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return converter.from_bytes(str);
-}
-
-static std::string utf16_to_utf8(const std::wstring & str) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return converter.to_bytes(str);
-}
-
 #if defined(__clang__)
 #    pragma clang diagnostic pop
 #endif
@@ -96,12 +86,12 @@ struct dl_handle_deleter {
    }
 };

-static dl_handle * dl_load_library(const std::wstring & path) {
+static dl_handle * dl_load_library(const std::filesystem::path & path) {
    // suppress error dialogs for missing DLLs
    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);

-    HMODULE handle = LoadLibraryW(path.c_str());
+    HMODULE handle = LoadLibraryW(path.wstring().c_str());

    SetErrorMode(old_mode);

@@ -129,8 +119,8 @@ struct dl_handle_deleter {
    }
 };

-static void * dl_load_library(const std::wstring & path) {
-    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::filesystem::path & path) {
+    dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);

    return handle;
 }
@@ -222,11 +212,11 @@ struct ggml_backend_registry {
        );
    }

-    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
+    ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
        dl_handle_ptr handle { dl_load_library(path) };
        if (!handle) {
            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path.string().c_str());
            }
            return nullptr;
        }
@@ -234,7 +224,7 @@ struct ggml_backend_registry {
        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
        if (score_fn && score_fn() == 0) {
            if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path.string().c_str());
            }
            return nullptr;
        }
@@ -242,7 +232,7 @@ struct ggml_backend_registry {
        auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
        if (!backend_init_fn) {
            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path.string().c_str());
            }
            return nullptr;
        }
@@ -251,16 +241,16 @@ struct ggml_backend_registry {
        if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
            if (!silent) {
                if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path.string().c_str());
                } else {
                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, path.string().c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
                }
            }
            return nullptr;
        }

-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path.string().c_str());

        register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));

@@ -396,14 +386,14 @@ ggml_backend_t ggml_backend_init_best(void) {

 // Dynamic loading
 ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(utf8_to_utf16(path), false);
+    return get_reg().load_backend(path, false);
 }

 void ggml_backend_unload(ggml_backend_reg_t reg) {
    get_reg().unload_backend(reg, true);
 }

-static std::wstring get_executable_path() {
+static std::filesystem::path get_executable_path() {
 #if defined(__APPLE__)
    // get executable path
    std::vector<char> path;
@@ -415,15 +405,9 @@ static std::wstring get_executable_path() {
        }
        path.resize(size);
    }
-    std::string base_path(path.data(), size);
-    // remove executable name
-    auto last_slash = base_path.find_last_of('/');
-    if (last_slash != std::string::npos) {
-        base_path = base_path.substr(0, last_slash);
-    }
-    return utf8_to_utf16(base_path + "/");
+
+    return std::filesystem::path(path.data()).parent_path();
 #elif defined(__linux__) || defined(__FreeBSD__)
-    std::string base_path = ".";
    std::vector<char> path(1024);
    while (true) {
        // get executable path
@@ -436,76 +420,56 @@ static std::wstring get_executable_path() {
            break;
        }
        if (len < (ssize_t) path.size()) {
-            base_path = std::string(path.data(), len);
-            // remove executable name
-            auto last_slash = base_path.find_last_of('/');
-            if (last_slash != std::string::npos) {
-                base_path = base_path.substr(0, last_slash);
-            }
-            break;
+            return std::filesystem::path(path.data()).parent_path();
        }
        path.resize(path.size() * 2);
    }
-
-    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
    std::vector<wchar_t> path(MAX_PATH);
    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
    if (len == 0) {
        return {};
    }
-    std::wstring base_path(path.data(), len);
-    // remove executable name
-    auto last_slash = base_path.find_last_of('\\');
-    if (last_slash != std::string::npos) {
-        base_path = base_path.substr(0, last_slash);
-    }
-    return base_path + L"\\";
+
+    return std::filesystem::path(path.data()).parent_path();
 #else
    return {};
 #endif
 }

-static std::wstring backend_filename_prefix() {
+static std::string backend_filename_prefix() {
 #ifdef _WIN32
-    return L"ggml-";
+    return "ggml-";
 #else
-    return L"libggml-";
+    return "libggml-";
 #endif
 }

-static std::wstring backend_filename_suffix() {
+static std::string backend_filename_suffix() {
 #ifdef _WIN32
-    return L".dll";
+    return ".dll";
 #else
-    return L".so";
-#endif
-}
-
-static std::wstring path_separator() {
-#ifdef _WIN32
-    return L"\\";
-#else
-    return L"/";
+    return ".so";
 #endif
 }

 static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
    // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
     // TODO: search system paths
-    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
-    std::vector<std::wstring> search_paths;
+    namespace fs = std::filesystem;
+    std::string file_prefix = backend_filename_prefix() + name + "-";
+    std::vector<fs::path> search_paths;
+
    if (user_search_path == nullptr) {
-        search_paths.push_back(L"." + path_separator());
+        search_paths.push_back(fs::current_path());
        search_paths.push_back(get_executable_path());
    } else {
-        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
+        search_paths.push_back(fs::u8path(user_search_path));
    }

    int best_score = 0;
-    std::wstring best_path;
+    fs::path best_path;

-    namespace fs = std::filesystem;
    for (const auto & search_path : search_paths) {
        if (!fs::exists(search_path)) {
            continue;
@@ -514,31 +478,31 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
        for (const auto & entry : dir_it) {
            try {
                if (entry.is_regular_file()) {
-                    std::wstring filename = entry.path().filename().wstring();
-                    std::wstring ext = entry.path().extension().wstring();
+                    std::string filename = entry.path().filename().string();
+                    std::string ext = entry.path().extension().string();
                    if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                        dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
+                        dl_handle_ptr handle { dl_load_library(entry.path()) };
                        if (!handle) {
-                            GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+                            GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
                            continue;
                        }

                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
                        if (!score_fn) {
-                            GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+                            GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
                            continue;
                        }

                        int s = score_fn();
-                        GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
+                        GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
                        if (s > best_score) {
                            best_score = s;
-                            best_path = entry.path().wstring();
+                            best_path = entry.path();
                        }
                    }
                }
            } catch (const std::exception & e) {
-                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), e.what());
+                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, entry.path().string().c_str(), e.what());
            }
        }
    }
@@ -546,7 +510,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
    if (best_score == 0) {
        // try to load the base backend
        for (const auto & search_path : search_paths) {
-            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
+            fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
            if (fs::exists(path)) {
                return get_reg().load_backend(path, silent);
            }
--- a/model/model.go
+++ b/model/model.go
@@ -21,6 +21,7 @@ import (
 	_ "github.com/ollama/ollama/ml/backend"
 )

+// Options contains the inputs for a model forward pass
 type Options struct {
 	Inputs    []int32
 	Positions []int32
@@ -34,11 +35,13 @@ type config struct {
 	Cache kvcache.Cache
 }

+// Base implements the common fields and methods for all models
 type Base struct {
 	b ml.Backend
 	config
 }

+// Backend returns the underlying backend that will run the model
 func (m *Base) Backend() ml.Backend {
 	return m.b
 }
@@ -47,6 +50,7 @@ func (m *Base) Config() config {
 	return m.config
 }

+// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
 type Model interface {
 	Forward(ml.Context, Options) (ml.Tensor, error)

@@ -56,6 +60,7 @@ type Model interface {

 var models = make(map[string]func(ml.Config) (Model, error))

+// Register registers a model constructor for the given architecture
 func Register(name string, f func(ml.Config) (Model, error)) {
 	if _, ok := models[name]; ok {
 		panic("model: model already registered")
@@ -64,8 +69,9 @@ func Register(name string, f func(ml.Config) (Model, error)) {
 	models[name] = f
 }

-func New(s string) (Model, error) {
-	r, err := os.Open(s)
+// New initializes a new model instance with the provided configuration based on the metadata in the model file
+func New(modelPath string) (Model, error) {
+	r, err := os.Open(modelPath)
 	if err != nil {
 		return nil, err
 	}
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -10,10 +10,10 @@ import (
 )

 type Options struct {
-	RopeFactors                                  ml.Tensor `gguf:"rope_freqs.weight"`
-	origCtxLen, hiddenSize, numHeads, numKVHeads int
-	eps, ropeBase, ropeScale                     float32
-	ropeDim                                      uint32
+	RopeFactors                      ml.Tensor `gguf:"rope_freqs.weight"`
+	hiddenSize, numHeads, numKVHeads int
+	eps, ropeBase, ropeScale         float32
+	ropeDim                          uint32
 }

 type Model struct {
@@ -46,7 +46,6 @@ func New(c ml.Config) (model.Model, error) {
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
-			origCtxLen: int(c.Uint("context_length")),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
 			ropeDim:    c.Uint("rope.dimension_count"),
@@ -68,23 +67,14 @@ type SelfAttention struct {
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
-	rc := ml.RopeConfig{
-		PositionIDs: positionIDs,
-		RopeFactors: opts.RopeFactors,
-		RopeDim:     opts.ropeDim,
-		RopeType:    ml.RopeTypeStandard,
-		OrigCtxLen:  opts.origCtxLen,
-		RopeBase:    opts.ropeBase,
-		RopeScale:   opts.ropeScale,
-	}

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, rc)
+	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, rc)
+	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -109,18 +99,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(
-		ctx,
-		ml.RopeConfig{
-			PositionIDs: shift,
-			RopeFactors: m.Options.RopeFactors,
-			RopeDim:     m.Options.ropeDim,
-			RopeType:    ml.RopeTypeStandard,
-			OrigCtxLen:  m.Options.origCtxLen,
-			RopeBase:    m.Options.ropeBase,
-			RopeScale:   m.Options.ropeScale,
-		},
-	), nil
+	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, m.Options.ropeBase, m.Options.ropeScale), nil
 }

 type MLP struct {
--- a/model/models/mllama/model_text.go
+++ b/model/models/mllama/model_text.go
@@ -19,23 +19,14 @@ type TextSelfAttention struct {
 func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
-	rc := ml.RopeConfig{
-		PositionIDs: positions,
-		RopeFactors: opts.RopeFactors,
-		RopeDim:     opts.ropeDim,
-		RopeType:    ml.RopeTypeStandard,
-		OrigCtxLen:  opts.ctxLen,
-		RopeBase:    opts.ropeBase,
-		RopeScale:   opts.ropeScale,
-	}

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = query.RoPE(ctx, rc)
+	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = key.RoPE(ctx, rc)
+	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -61,18 +52,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
-	return key.RoPE(
-		ctx,
-		ml.RopeConfig{
-			PositionIDs: shift,
-			RopeFactors: m.RopeFactors,
-			RopeDim:     m.ropeDim,
-			RopeType:    ml.RopeTypeStandard,
-			OrigCtxLen:  m.ctxLen,
-			RopeBase:    m.ropeBase,
-			RopeScale:   m.ropeScale,
-		},
-	), nil
+	return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
 }

 type TextMLP struct {
@@ -209,9 +189,9 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, cr
 type TextModelOptions struct {
 	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`

-	ctxLen, hiddenSize, numHeads, numKVHeads int
-	eps, ropeBase, ropeScale                 float32
-	ropeDim                                  uint32
+	hiddenSize, numHeads, numKVHeads int
+	eps, ropeBase, ropeScale         float32
+	ropeDim                          uint32

 	crossAttentionLayers []uint32
 }
--- a/progress/progress.go
+++ b/progress/progress.go
@@ -1,6 +1,7 @@
 package progress

 import (
+	"bufio"
 	"fmt"
 	"io"
 	"sync"
@@ -13,7 +14,8 @@ type State interface {

 type Progress struct {
 	mu sync.Mutex
-	w  io.Writer
+	// buffer output to minimize flickering on all terminals
+	w *bufio.Writer

 	pos int

@@ -22,7 +24,7 @@ type Progress struct {
 }

 func NewProgress(w io.Writer) *Progress {
-	p := &Progress{w: w}
+	p := &Progress{w: bufio.NewWriter(w)}
 	go p.start()
 	return p
 }
@@ -48,11 +50,14 @@ func (p *Progress) Stop() bool {
 	stopped := p.stop()
 	if stopped {
 		fmt.Fprint(p.w, "\n")
+		p.w.Flush()
 	}
 	return stopped
 }

 func (p *Progress) StopAndClear() bool {
+	defer p.w.Flush()
+
 	fmt.Fprint(p.w, "\033[?25l")
 	defer fmt.Fprint(p.w, "\033[?25h")

@@ -81,20 +86,24 @@ func (p *Progress) render() {
 	p.mu.Lock()
 	defer p.mu.Unlock()

+	defer p.w.Flush()
+
+	// eliminate flickering on terminals that support synchronized output
+	fmt.Fprint(p.w, "\033[?2026h")
+	defer fmt.Fprint(p.w, "\033[?2026l")
+
 	fmt.Fprint(p.w, "\033[?25l")
 	defer fmt.Fprint(p.w, "\033[?25h")

-	// clear already rendered progress lines
-	for i := range p.pos {
-		if i > 0 {
-			fmt.Fprint(p.w, "\033[A")
-		}
-		fmt.Fprint(p.w, "\033[2K\033[1G")
+	// move the cursor back to the beginning
+	for range p.pos - 1 {
+		fmt.Fprint(p.w, "\033[A")
 	}
+	fmt.Fprint(p.w, "\033[1G")

 	// render progress lines
 	for i, state := range p.states {
-		fmt.Fprint(p.w, state.String())
+		fmt.Fprint(p.w, state.String(), "\033[K")
 		if i < len(p.states)-1 {
 			fmt.Fprint(p.w, "\n")
 		}
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -813,6 +813,8 @@ func (s *Server) loadModel(
 		panic(err)
 	}

+	slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
+
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
 		panic("loras are not yet implemented")
@@ -881,7 +883,6 @@ func Execute(args []string) error {
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.Info("starting ollama engine")
-	// TODO(jessegross): Some system info would be useful

 	server := &Server{
 		batchSize: *batchSize,
Author	SHA1	Message	Date
Bruce MacDonald	057cc54b66	benchmark: compare backend graph computation times Track execution time of individual tensor operations (views, copies, reshapes etc) during LLM forward passes using CGo bindings to the native graph runtime. This helps identify performance bottlenecks in the computation graph and optimize memory operations that can significantly impact inference latency.	2025-02-19 15:22:53 -08:00
Michael Yang	1e438b237c	Merge pull request #9203 from ollama/mxyng/sapphirerapids build: remove backend build for sapphirerapids	2025-02-19 21:42:00 +00:00
yuiseki	d721a02e7d	test: add test cases for ListHandler (#9146 )	2025-02-19 13:24:27 -08:00
zyxucp	778603a818	docs: Add AntSK to Community Integrations (#9214 )	2025-02-19 13:22:48 -08:00
maninhill	3c874df46e	docs: Add MaxKB to Community Integrations (#9212 )	2025-02-19 13:20:09 -08:00
Jeffrey Morgan	d2eb226c91	llama: add patch to fix ggml backend reg on Linux with utf-8 characters in the path (#9159 )	2025-02-18 22:46:17 -05:00
Michael Yang	e13e7c8d94	Merge pull request #9079 from jeremyschlatter/main cmd: fix flickering in progress bar	2025-02-18 22:59:29 +00:00
Jeremy Schlatter	78f403ff45	address code review comments	2025-02-18 14:50:09 -08:00
Michael Yang	5f8c03189e	build: remove backend build for sapphirerapids sapphire rapids has amx support but it ends up having a negative performance impact. emerald rapids also has amx support with a positive performance impact however there's no reasonable way in ggml to differentiate between the two. the impact is small (~6%) so disable amx entirely for simplicity	2025-02-18 14:47:58 -08:00
Michael Yang	08a299e1d0	cmake: avoid building intel backends on linux	2025-02-18 22:17:00 +00:00
Michael Yang	7b5d916a9a	ci: set owner/group in tarball set owner and group when building the linux tarball so extracted files are consistent. this is the behaviour of release tarballs in version 0.5.7 and lower	2025-02-18 20:11:09 +00:00
benhaotang	33ad61b112	Add OpenDeepResearcher-via-searxng to Community Integrations (#9138 )	2025-02-18 11:39:11 -08:00
L. Jiang	716e365615	test: add test cases for HumanNumber (#9108 )	2025-02-18 11:35:26 -08:00
innightwolfsleep	3b4424ff98	readme: add LLM Telegram Bot to community integrations (#9150 )	2025-02-18 10:04:30 -05:00
Jeremy Schlatter	f9c7ead160	cmd: eliminate flickering with synchronized output	2025-02-17 20:01:03 -08:00
Jeremy Schlatter	5930aaeb1a	cmd: fix cursor flickering in progress bar The previous commit fixed flickering in the progress bar itself. Cursor flickering is harder to address. Cursor flickering could be fixed by hiding the cursor altogether while the progress bar is displayed. The downside of this is that if the program is killed in such a way that it can't clean up its state, it would leave the cursor invisible. Instead, this commit introduces an output buffer. All of the escape codes and content for a single progress update are written to a buffer, which is then flushed to the terminal all at once. This significantly decreases the time during which the terminal has seen the cursor-hiding code but has not yet seen the cursor-showing code, thus minimizing (but not 100% eliminating) cursor flickering. For more context, see: https://gitlab.gnome.org/GNOME/vte/-/issues/2837#note_2269501	2025-02-17 14:56:57 -08:00
Jeremy Schlatter	faf67db089	cmd: fix progress bar flickering Previous code cleared the display before writing new content, creating a window where the terminal could (and in some cases did) render empty lines. Instead, we now write new content over the old content, only clearing the trailing end of lines for cases where the new line is shorter. Fixes #1664	2025-02-17 13:39:02 -08:00
James-William-Kincaid-III	0667baddc6	docs: fix incorrect shortcut key in windows.md (#9098 )	2025-02-15 15:38:24 -05:00
Bruce MacDonald	d006e1e09b	model: document high-level model interface (#9122 )	2025-02-14 16:01:00 -08:00
Daniel Hiltgen	df2680b4b9	Wire up system info log for new engine (#9123 )	2025-02-14 15:55:33 -08:00