readme: add ChibiChat to community integrations (#8883 )

build(rocm): add numa, elf (#8900 )
readme: add Ollama Chat WebUI for Docker to community integrations (#8084 )
2025-02-06 16:08:46 -08:00 · 2025-02-06 15:46:30 -08:00 · 2025-02-06 15:41:02 -08:00 · 2025-02-06 15:08:12 -08:00 · 2025-02-06 14:54:58 -08:00 · 2025-02-06 13:12:16 -08:00
20 changed files with 218 additions and 195 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -15,6 +15,10 @@ ml/backend/**/*.cu linguist-vendored
 ml/backend/**/*.cuh linguist-vendored
 ml/backend/**/*.m linguist-vendored
 ml/backend/**/*.metal linguist-vendored
+ml/backend/**/CMakeLists.txt linguist-vendored
+
+llama/build-info.cpp linguist-generated
+ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated

 * text=auto
 *.go text eol=lf
--- a/.github/ISSUE_TEMPLATE/10_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/10_bug_report.yml
@@ -9,6 +9,14 @@ body:
      description: What happened? What did you expect to happen?
    validations:
      required: true
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md#how-to-troubleshoot-issues) for details.
+      render: shell
+    validations:
+      required: false
  - type: dropdown
    id: os
    attributes:
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -303,32 +303,38 @@ jobs:
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
+      - uses: docker/build-push-action@v6
+        with:
+          context: .
+          platforms: ${{ matrix.os }}/${{ matrix.arch }}
+          target: ${{ matrix.target }}
+          build-args: |
+            GOFLAGS=${{ env.GOFLAGS }}
+            CGO_CFLAGS=${{ env.CGO_CFLAGS }}
+            CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }}
+          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
+          cache-from: type=registry,ref=ollama/ollama:latest
+          cache-to: type=inline
      - run: |
-          sudo apt-get update && sudo apt-get install pigz
-          docker buildx build --platform $PLATFORM --target ${{ matrix.target }} --build-arg GOFLAGS --build-arg CGO_CFLAGS --build-arg CGO_CXXFLAGS --output type=local,dest=dist/$PLATFORM .
-
-          for COMPONENTS in dist/$PLATFORM/* dist/$PLATFORM/lib/ollama/*; do
-              if [ -d "$COMPONENTS" ]; then
-                  case "$COMPONENTS" in
-                      */bin) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}.tar.in ;;
-                      */lib/ollama) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}.tar.in;;
-                      */lib/ollama/cuda_v11) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}.tar.in;;
-                      */lib/ollama/cuda_v12) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}.tar.in;;
-                      */lib/ollama/cuda_jetpack5) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}-jetpack5.tar.in ;;
-                      */lib/ollama/cuda_jetpack6) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}-jetpack6.tar.in ;;
-                      */lib/ollama/rocm) echo $COMPONENTS >>dist/ollama-${PLATFORM//\//-}-rocm.tar.in ;;
-                  esac
-              fi
+          for COMPONENT in bin/* lib/ollama/*; do
+            case "$COMPONENT" in
+              bin/ollama)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/*.so)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_v11)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_v12)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
+              lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
+              lib/ollama/rocm)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
+            esac
          done
-
-          for ARCHIVE in dist/*.tar.in; do tar c -T $ARCHIVE --strip-components 3 | pigz -9cv >${ARCHIVE//.*/}.tgz; done
-        env:
-          PLATFORM: ${{ matrix.os }}/${{ matrix.arch }}
+        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
+      - run: |
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
      - uses: actions/upload-artifact@v4
        with:
          name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
          path: |
-            dist/*.tgz
+            *.tgz

  # Build each Docker variant (OS, arch, and flavor) separately. Using QEMU is unreliable and slower.
  docker-build-push:
@@ -362,7 +368,7 @@ jobs:
      GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
    steps:
      - uses: actions/checkout@v4
-      - uses: docker/setup-buildx-action@v2
+      - uses: docker/setup-buildx-action@v3
      - uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_USER }}
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -163,5 +163,5 @@ jobs:
      - uses: actions/checkout@v4
      - name: Verify patches apply cleanly and do not change files
        run: |
-          make -f Makefile.sync clean checkout sync
+          make -f Makefile.sync clean sync
          git diff --compact-summary --exit-code
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,11 +96,12 @@ if(CMAKE_HIP_COMPILER)

    if(AMDGPU_TARGETS)
        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
+
        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
        install(TARGETS ggml-hip
            RUNTIME_DEPENDENCIES
                DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}
-                PRE_INCLUDE_REGEXES amdhip64 hipblas rocblas amd_comgr hsa_runtime64 rocprofiler-register drm_amdgpu drm numa
+                PRE_INCLUDE_REGEXES hipblas rocblas amdhip64 rocsolver amd_comgr hsa-runtime64 rocsparse tinfo rocprofiler-register drm drm_amdgpu numa elf
                PRE_EXCLUDE_REGEXES ".*"
                POST_EXCLUDE_REGEXES "system32"
            RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -15,7 +15,11 @@ help:
 	@echo "    make -f $(lastword $(MAKEFILE_LIST)) clean sync"

 .PHONY: sync
-sync: llama/llama.cpp ml/backend/ggml/ggml apply-patches
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml apply-patches
+
+.PHONY: llama/build-info.cpp
+llama/build-info.cpp: llama/build-info.cpp.in
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@

 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/ apply-patches
--- a/README.md
+++ b/README.md
@@ -369,9 +369,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
 - [aidful-ollama-model-delete](https://github.com/AidfulAI/aidful-ollama-model-delete) (User interface for simplified model cleanup)
 - [Perplexica](https://github.com/ItzCrazyKns/Perplexica) (An AI-powered search engine & an open-source alternative to Perplexity AI)
+- [Ollama Chat WebUI for Docker ](https://github.com/oslook/ollama-webui) (Support for local docker deployment, lightweight ollama webui)
 - [AI Toolkit for Visual Studio Code](https://aka.ms/ai-tooklit/ollama-docs) (Microsoft-official VSCode extension to chat, test, evaluate models with Ollama support, and use them in your AI applications.)
 - [MinimalNextOllamaChat](https://github.com/anilkay/MinimalNextOllamaChat) (Minimal Web UI for Chat and Model Control)
 - [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
+- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)

 ### Cloud

@@ -535,6 +537,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 - [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
+- [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)

 ### Supported backends

@@ -545,3 +548,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
 - [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
 - [Langfuse](https://langfuse.com/docs/integrations/ollama) is an open source LLM observability platform that enables teams to collaboratively monitor, evaluate and debug AI applications.
+- [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.
--- a/docs/development.md
+++ b/docs/development.md
@@ -118,3 +118,14 @@ To run tests, use `go test`:
 ```
 go test ./...
 ```
+
+## Library detection
+
+Ollama looks for acceleration libraries in the following paths relative to the `ollama` executable:
+
+* `./lib/ollama` (Windows)
+* `../lib/ollama` (Linux)
+* `.` (macOS)
+* `build/lib/ollama` (for development)
+
+If the libraries are not found, Ollama will not run with any acceleration libraries.
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -186,3 +186,9 @@ sudo rm -r /usr/share/ollama
 sudo userdel ollama
 sudo groupdel ollama
 ```
+
+Remove installed libraries:
+
+```shell
+sudo rm -rf /usr/local/lib/ollama
+```
--- a/format/bytes.go
+++ b/format/bytes.go
@@ -40,8 +40,6 @@ func HumanBytes(b int64) string {
 	}

 	switch {
-	case value >= 100:
-		return fmt.Sprintf("%d %s", int(value), unit)
 	case value >= 10:
 		return fmt.Sprintf("%d %s", int(value), unit)
 	case value != math.Trunc(value):
--- a/format/bytes_test.go
+++ b/format/bytes_test.go
@@ -0,0 +1,91 @@
+package format
+
+import (
+	"testing"
+)
+
+func TestHumanBytes(t *testing.T) {
+	type testCase struct {
+		input    int64
+		expected string
+	}
+
+	tests := []testCase{
+		// Test bytes (B)
+		{0, "0 B"},
+		{1, "1 B"},
+		{999, "999 B"},
+
+		// Test kilobytes (KB)
+		{1000, "1 KB"},
+		{1500, "1.5 KB"},
+		{999999, "999 KB"},
+
+		// Test megabytes (MB)
+		{1000000, "1 MB"},
+		{1500000, "1.5 MB"},
+		{999999999, "999 MB"},
+
+		// Test gigabytes (GB)
+		{1000000000, "1 GB"},
+		{1500000000, "1.5 GB"},
+		{999999999999, "999 GB"},
+
+		// Test terabytes (TB)
+		{1000000000000, "1 TB"},
+		{1500000000000, "1.5 TB"},
+		{1999999999999, "2.0 TB"},
+
+		// Test fractional values
+		{1234, "1.2 KB"},
+		{1234567, "1.2 MB"},
+		{1234567890, "1.2 GB"},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.expected, func(t *testing.T) {
+			result := HumanBytes(tc.input)
+			if result != tc.expected {
+				t.Errorf("Expected %s, got %s", tc.expected, result)
+			}
+		})
+	}
+}
+
+func TestHumanBytes2(t *testing.T) {
+	type testCase struct {
+		input    uint64
+		expected string
+	}
+
+	tests := []testCase{
+		// Test bytes (B)
+		{0, "0 B"},
+		{1, "1 B"},
+		{1023, "1023 B"},
+
+		// Test kibibytes (KiB)
+		{1024, "1.0 KiB"},
+		{1536, "1.5 KiB"},
+		{1048575, "1024.0 KiB"},
+
+		// Test mebibytes (MiB)
+		{1048576, "1.0 MiB"},
+		{1572864, "1.5 MiB"},
+		{1073741823, "1024.0 MiB"},
+
+		// Test gibibytes (GiB)
+		{1073741824, "1.0 GiB"},
+		{1610612736, "1.5 GiB"},
+		{2147483648, "2.0 GiB"},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.expected, func(t *testing.T) {
+			result := HumanBytes2(tc.input)
+			if result != tc.expected {
+				t.Errorf("Expected %s, got %s", tc.expected, result)
+			}
+		})
+	}
+}
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "ba1cb19cdd0d92e012e0f6e009e0620f854b6afd";
+char const *LLAMA_COMMIT = "46e3556e01b824e52395fb050b29804b6cff2a7c";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/build-info.cpp.in
+++ b/llama/build-info.cpp.in
@@ -0,0 +1,4 @@
+int LLAMA_BUILD_NUMBER = 0;
+char const *LLAMA_COMMIT = "@FETCH_HEAD@";
+char const *LLAMA_COMPILER = "";
+char const *LLAMA_BUILD_TARGET = "";
--- a/llama/llama.cpp/examples/llava/clip.cpp
+++ b/llama/llama.cpp/examples/llava/clip.cpp
@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }
    }

-#ifdef GGML_USE_CUDA
-   new_clip->backend = ggml_backend_cuda_init(0);
-   LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_METAL
-   new_clip->backend = ggml_backend_metal_init();
-   LOG_INF("%s: CLIP using Metal backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_CANN
-   new_clip->backend = ggml_backend_cann_init(0);
-   LOG_INF("%s: CLIP using CANN backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_VULKAN
-   new_clip->backend = ggml_backend_vk_init(0);
-   LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_SYCL
-   new_clip->backend = ggml_backend_sycl_init(0);
-   LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-#endif
-
-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
+    ggml_backend_t backend = ggml_backend_init_best();
+    if (backend == nullptr) {
+        LOG_ERR("%s: failed to initialize backend\n", __func__);
+        clip_free(new_clip);
+        gguf_free(ctx);
+        return nullptr;
    }
+    LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend));
+    new_clip->backend = backend;

    // model size and capabilities
    {
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -199,21 +199,25 @@ func (c *Context) KvCacheDefrag() {

 // Get the embeddings for a sequence id
 func (c *Context) GetEmbeddingsSeq(seqId int) []float32 {
-	embeddings := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId)))
-	if embeddings == nil {
+	e := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId)))
+	if e == nil {
 		return nil
 	}

-	return unsafe.Slice((*float32)(embeddings), c.Model().NEmbd())
+	embeddings := make([]float32, c.Model().NEmbd())
+	_ = copy(embeddings, unsafe.Slice((*float32)(e), c.Model().NEmbd()))
+	return embeddings
 }

 func (c *Context) GetEmbeddingsIth(i int) []float32 {
-	embeddings := unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))
-	if embeddings == nil {
+	e := unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))
+	if e == nil {
 		return nil
 	}

-	return unsafe.Slice((*float32)(embeddings), c.Model().NEmbd())
+	embeddings := make([]float32, c.Model().NEmbd())
+	_ = copy(embeddings, unsafe.Slice((*float32)(e), c.Model().NEmbd()))
+	return embeddings
 }

 type ModelParams struct {
--- a/llama/mllama.cpp
+++ b/llama/mllama.cpp
@@ -558,30 +558,15 @@ struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1)

    mllama_ctx *new_mllama = new mllama_ctx{};

-#ifdef GGML_USE_CUDA
-    new_mllama->backend = ggml_backend_cuda_init(0);
-    LOG("vision using CUDA backend");
-#endif
-
-#ifdef GGML_USE_METAL
-    new_mllama->backend = ggml_backend_metal_init();
-    LOG("vision using Metal backend");
-#endif
-
-#ifdef GGML_USE_CANN
-    new_mllama->backend = ggml_backend_cann_init(0);
-    LOG("vision using CANN backend");
-#endif
-
-#ifdef GGML_USE_VULKAN
-    new_mllama->backend = ggml_backend_vk_init(0);
-    LOG("vision using Vulkan backend");
-#endif
-
-    if (!new_mllama->backend) {
-        new_mllama->backend = ggml_backend_cpu_init();
-        LOG("vision using CPU backend");
+    ggml_backend_t backend = ggml_backend_init_best();
+    if (backend == nullptr) {
+        LOG("%s: failed to initialize backend\n", __func__);
+        mllama_free(new_mllama);
+        gguf_free(ctx);
+        return nullptr;
    }
+    LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend));
+    new_mllama->backend = backend;

    // load tensors
    {
--- a/llama/patches/0013-use-dynamic-backend-loading-for-clip.patch
+++ b/llama/patches/0013-use-dynamic-backend-loading-for-clip.patch
@@ -1,14 +1,14 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: jmorganca <jmorganca@gmail.com>
 Date: Sat, 4 Jan 2025 22:52:48 -0800
-Subject: [PATCH] re-enable gpu for clip
+Subject: [PATCH] use dynamic backend loading for clip

 ---
- examples/llava/clip.cpp | 86 ++++++++++++++++++++---------------------
- 1 file changed, 43 insertions(+), 43 deletions(-)
+ examples/llava/clip.cpp | 74 +++++++++++++++--------------------------
+ 1 file changed, 27 insertions(+), 47 deletions(-)

 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index b3c1829f..718052e1 100644
+index b3c1829f..86b91d5c 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
 @@ -8,25 +8,25 @@
@@ -56,7 +56,7 @@ index b3c1829f..718052e1 100644
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
-@@ -1235,30 +1235,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
@@ -84,30 +84,19 @@ index b3c1829f..718052e1 100644
 -//    new_clip->backend = ggml_backend_sycl_init(0);
 -//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
 -//#endif
-+#ifdef GGML_USE_CUDA
-+   new_clip->backend = ggml_backend_cuda_init(0);
-+   LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-+#endif
-+
-+#ifdef GGML_USE_METAL
-+   new_clip->backend = ggml_backend_metal_init();
-+   LOG_INF("%s: CLIP using Metal backend\n", __func__);
-+#endif
-+
-+#ifdef GGML_USE_CANN
-+   new_clip->backend = ggml_backend_cann_init(0);
-+   LOG_INF("%s: CLIP using CANN backend\n", __func__);
-+#endif
-+
-+#ifdef GGML_USE_VULKAN
-+   new_clip->backend = ggml_backend_vk_init(0);
-+   LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-+#endif
-+
-+#ifdef GGML_USE_SYCL
-+   new_clip->backend = ggml_backend_sycl_init(0);
-+   LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-+#endif
+-
+-    if (!new_clip->backend) {
+-        new_clip->backend = ggml_backend_cpu_init();
+-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
++    ggml_backend_t backend = ggml_backend_init_best();
++    if (backend == nullptr) {
++        LOG_ERR("%s: failed to initialize backend\n", __func__);
++        clip_free(new_clip);
++        gguf_free(ctx);
++        return nullptr;
+     }
++    LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend));
++    new_clip->backend = backend;
 
-     if (!new_clip->backend) {
-         new_clip->backend = ggml_backend_cpu_init();
+     // model size and capabilities
+     {
--- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-
-from glob import glob
-import os
-
-TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"]
-
-SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../fattn-vec-f{vkq_size}.cuh"
-
-DECL_FATTN_VEC_F{vkq_size}_CASE({head_size}, {type_k}, {type_v});
-"""
-
-SOURCE_FATTN_WMMA_START = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../fattn-wmma-f16.cuh"
-
-"""
-
-SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}, {kq_acc_t});\n"
-
-TYPES_MMQ = [
-    "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
-    "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
-    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
-]
-
-SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../mmq.cuh"
-
-DECL_MMQ_CASE({type});
-"""
-
-
-def get_short_name(long_quant_name):
-    return long_quant_name.replace("GGML_TYPE_", "").lower()
-
-
-def get_head_sizes(type_k, type_v):
-    if type_k == "GGML_TYPE_F16" and type_v == "GGML_TYPE_F16":
-        return [64, 128, 256]
-    if type_k == "GGML_TYPE_F16":
-        return [64, 128]
-    return [128]
-
-
-for filename in glob("*.cu"):
-    os.remove(filename)
-
-for vkq_size in [16, 32]:
-    for type_k in TYPES_KV:
-        for type_v in TYPES_KV:
-            for head_size in get_head_sizes(type_k, type_v):
-                with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
-                    f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
-
-for kq_acc_t in ["half", "float"]:
-    for cols_per_block in [8, 16, 32]:
-        if kq_acc_t == "float" and cols_per_block == 8:
-            continue
-
-        with open(f"fattn-wmma-f16-instance-kq{kq_acc_t}-cpb{cols_per_block}.cu", "w") as f:
-            f.write(SOURCE_FATTN_WMMA_START)
-
-            for head_size in [64, 80, 96, 112, 128, 256]:
-                if cols_per_block == 8 and head_size % 32 != 0: # wmma fragment is 8x32
-                    continue
-                if kq_acc_t == "float" and cols_per_block == 32 and head_size == 256: # register spilling, bad performance
-                    continue
-                f.write(SOURCE_FATTN_WMMA_CASE.format(kq_acc_t=kq_acc_t, cols_per_block=cols_per_block, head_size=head_size))
-
-for type in TYPES_MMQ:
-    with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
-        f.write(SOURCE_MMQ.format(type=type))
--- a/server/download.go
+++ b/server/download.go
@@ -172,7 +172,10 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 		}
 	}

-	slog.Info(fmt.Sprintf("downloading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
+	if len(b.Parts) > 0 {
+		slog.Info(fmt.Sprintf("downloading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
+	}
+
 	return nil
 }

@@ -365,7 +368,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
 				lastUpdated := part.lastUpdated
 				part.lastUpdatedMu.Unlock()

-				if !lastUpdated.IsZero() && time.Since(lastUpdated) > 5*time.Second {
+				if !lastUpdated.IsZero() && time.Since(lastUpdated) > 30*time.Second {
 					const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection."
 					slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N))
 					// reset last updated
--- a/server/upload.go
+++ b/server/upload.go
@@ -108,7 +108,9 @@ func (b *blobUpload) Prepare(ctx context.Context, requestURL *url.URL, opts *reg
 		offset += size
 	}

-	slog.Info(fmt.Sprintf("uploading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
+	if len(b.Parts) > 0 {
+		slog.Info(fmt.Sprintf("uploading %s in %d %s part(s)", b.Digest[7:19], len(b.Parts), format.HumanBytes(b.Parts[0].Size)))
+	}

 	requestURL, err = url.Parse(location)
 	if err != nil {
Author	SHA1	Message	Date
CosmicEventHorizon	e8d4eb3e68	readme: add ChibiChat to community integrations (#8883 )	2025-02-06 16:08:46 -08:00
Michael Yang	ae7e368f75	build(rocm): add numa, elf (#8900 )	2025-02-06 15:46:30 -08:00
oslook	31acd1ebf9	readme: add Ollama Chat WebUI for Docker to community integrations (#8084 )	2025-02-06 15:41:02 -08:00
Michael Yang	9a4757ae66	build(rocm): add tinfo (#8899 )	2025-02-06 15:08:12 -08:00
Abhinav Pant	7814019708	docs: add step for removing libraries in linux.md (#8897 )	2025-02-06 14:54:58 -08:00
Michael Yang	b698f9a0d8	build: add missing dependencies (#8896 )	2025-02-06 13:12:16 -08:00
Azis Alvriyanto	32285a6d19	format: rename test file from byte_test.go to bytes_test.go (#8865 )	2025-02-06 13:06:15 -08:00
Michael Yang	1c198977ec	ci: fix linux archive (#8862 ) the find returns intermediate directories which pulls the parent directories. it also omits files under lib/ollama. switch back to globbing	2025-02-05 19:45:58 -08:00
zyphixor	330b6c50b0	readme: add simple-discord-ai to community integrations (#8659 )	2025-02-05 18:35:04 -08:00
Diego Pereira	928911bc68	runner: avoid buffer overwrite when generating multiple embeddings (#8714 ) Shield the code processing the embedding result from subsequent calls that may overwrite the same buffer to process a second input when retrieving model embeddings.	2025-02-05 16:53:33 -08:00
Michael Yang	5b446cc815	chore: update gitattributes (#8860 ) * chore: update gitattributes * chore: add build info source	2025-02-05 16:37:18 -08:00
Daniel Lok	451c1596af	readme: add MLflow Tracing as an observability integration (#8811 )	2025-02-05 16:04:24 -08:00
Michael Yang	932bded12f	chore: add optional field for server logs	2025-02-05 15:55:32 -08:00
Michael Yang	070ad913ac	ci: fix linux archive	2025-02-05 15:08:02 -08:00
Azis Alvriyanto	8d8b9f83ae	format: byte formatting test coverage (#8692 ) Removed redundant checks and streamlined the switch-case structure. Added test cases for both HumanBytes and HumanBytes2 to cover a wide range of scenarios.	2025-02-05 12:23:07 -08:00
Jeffrey Morgan	f00d359a67	docs: add section in development.md on library detection (#8855 )	2025-02-05 11:16:27 -08:00
Yashwanth A	291def6adb	server: increase timeout in stall detection from 5s to 30s (#8831 ) In some cases, downloads slow due to disk i/o or other factors, causing the download to restart a part. This causes the download to "reverse" in percent completion. By increasing the timeout to 30s, this should happen less frequently.	2025-02-05 10:00:26 -08:00
Jeffrey Morgan	cd3fbf1c49	llama: use dynamic backend loading for mllama and clip (#8835 )	2025-02-05 09:46:56 -08:00
Jeffrey Morgan	c852b8e021	server: always print upload/download part info (#8832 )	2025-02-04 19:30:49 -08:00
William	d8932c55e7	server: fix out of bounds exception on model download (#8746 )	2025-02-04 18:52:47 -08:00