env.sh: cleanup unused RELEASE_IMAGE_REPO (#6855 )

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
readme: add terminal tool ParLlama to community integrations (#5623 )
2024-11-21 08:28:04 -08:00 · 2024-11-21 02:55:35 -08:00 · 2024-11-21 02:51:45 -08:00 · 2024-11-21 02:28:57 -08:00 · 2024-11-21 02:09:36 -08:00 · 2024-11-21 02:07:17 -08:00
48 changed files with 778 additions and 311 deletions
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -281,7 +281,7 @@ jobs:
        shell: bash
      - uses: golangci/golangci-lint-action@v6
        with:
-          args: --timeout 8m0s -v
+          args: --timeout 10m0s -v
  test:
    strategy:
      matrix:
--- a/66
+++ b/66
@@ -5,6 +5,8 @@ ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
+ARG JETPACK_6=r36.2.0
+ARG JETPACK_5=r35.4.1

 ### To create a local image for building linux binaries on mac or windows with efficient incremental builds
 #
@@ -13,7 +15,7 @@ ARG ROCM_VERSION=6.1.2
 #
 ### Then incremental builds will be much faster in this container
 #
-# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
+# make -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
 #
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
 ARG CMAKE_VERSION
@@ -76,9 +78,9 @@ ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -C llama -j $(expr $(nproc) / 2 ) ; \
+        make -j $(expr $(nproc) / 2 ) ; \
    else \
-        make -C llama -j 5 ; \
+        make -j 5 ; \
    fi

 FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
@@ -90,7 +92,46 @@ ARG CUDA_V11_ARCHITECTURES
 ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    make -C llama -j 8
+    make -j 5
+
+# Jetsons need to be built in discrete stages
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
+ARG GOLANG_VERSION
+RUN apt-get update && apt-get install -y git curl ccache && \
+    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
+    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
+    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+WORKDIR /go/src/github.com/ollama/ollama/
+COPY . .
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    make -j 5 cuda_v11 \
+        CUDA_ARCHITECTURES="72;87" \
+        GPU_RUNNER_VARIANT=_jetpack5 \
+        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
+        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
+        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
+
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64
+ARG GOLANG_VERSION
+RUN apt-get update && apt-get install -y git curl ccache && \
+    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \
+    ln -s /usr/local/go/bin/go /usr/local/bin/go && \
+    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+WORKDIR /go/src/github.com/ollama/ollama/
+COPY . .
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+RUN --mount=type=cache,target=/root/.ccache \
+    make -j 5 cuda_v12 \
+        CUDA_ARCHITECTURES="87" \
+        GPU_RUNNER_VARIANT=_jetpack6 \
+        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
+        DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
+        DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6


 # Intermediate stages used for ./scripts/build_linux.sh
@@ -134,12 +175,20 @@ FROM --platform=linux/arm64 builder-arm64 AS build-arm64
 COPY . .
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-arm64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN cd dist/linux-$GOARCH-jetpack5 && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
+RUN cd dist/linux-$GOARCH-jetpack6 && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz

 FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
@@ -180,16 +229,19 @@ RUN rm -rf \
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
+COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
+

 # ROCm libraries larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
@@ -198,7 +250,7 @@ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

--- a/README.md
+++ b/README.md
@@ -47,26 +47,28 @@ Ollama supports a list of models available on [ollama.com/library](https://ollam

 Here are some example models that can be downloaded:

-| Model              | Parameters | Size  | Download                       |
-| ------------------ | ---------- | ----- | ------------------------------ |
-| Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`          |
-| Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`       |
-| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`          |
-| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`      |
-| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`     |
-| Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`              |
-| Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`       |
-| Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`         |
-| Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`            |
-| Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`        |
-| Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
-| Moondream 2        | 1.4B       | 829MB | `ollama run moondream`         |
-| Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`       |
-| Starling           | 7B         | 4.1GB | `ollama run starling-lm`       |
-| Code Llama         | 7B         | 3.8GB | `ollama run codellama`         |
-| Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored` |
-| LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
-| Solar              | 10.7B      | 6.1GB | `ollama run solar`             |
+| Model              | Parameters | Size  | Download                         |
+| ------------------ | ---------- | ----- | -------------------------------- |
+| Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
+| Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
+| Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
+| Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
+| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
+| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`        |
+| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
+| Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`                |
+| Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`         |
+| Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`           |
+| Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`              |
+| Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`          |
+| Mistral            | 7B         | 4.1GB | `ollama run mistral`             |
+| Moondream 2        | 1.4B       | 829MB | `ollama run moondream`           |
+| Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`         |
+| Starling           | 7B         | 4.1GB | `ollama run starling-lm`         |
+| Code Llama         | 7B         | 3.8GB | `ollama run codellama`           |
+| Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored`   |
+| LLaVA              | 7B         | 4.5GB | `ollama run llava`               |
+| Solar              | 10.7B      | 6.1GB | `ollama run solar`               |

 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -306,11 +308,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
 - [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
 - [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
+- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
 - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
+- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
 - [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
+- [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
+- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education) 
+- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application) 
 - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
@@ -318,6 +325,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
+- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
+- [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
 - [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
 - [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
 - [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
@@ -327,12 +336,22 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
 - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
+- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
 - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
 - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
+- [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
+- [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
+- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
+- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
 - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation)
+- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
+- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt)
+- [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama)
+- [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama)
+- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux) 
+- [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support)

 ### Terminal

@@ -356,9 +375,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [tlm](https://github.com/yusufcanb/tlm)
 - [podman-ollama](https://github.com/ericcurtin/podman-ollama)
 - [gollama](https://github.com/sammcj/gollama)
+- [ParLlama](https://github.com/paulrobello/parllama)
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
 - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
 - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
+- [bb7](https://github.com/drunkwcodes/bb7)
+- [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage)
+- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.
+- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.

 ### Apple Vision Pro
 - [Enchanted](https://github.com/AugustDev/enchanted)
@@ -382,9 +406,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [crewAI](https://github.com/crewAIInc/crewAI)
+- [Spring AI](https://github.com/spring-projects/spring-ai) with [reference](https://docs.spring.io/spring-ai/reference/api/chat/ollama-chat.html) and [example](https://github.com/tzolov/ollama-tools)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
+- [LLPhant](https://github.com/theodo-group/LLPhant?tab=readme-ov-file#ollama)
 - [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/ollama/) and [LlamaIndexTS](https://ts.llamaindex.ai/modules/llms/available_llms/ollama)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
@@ -409,12 +435,20 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
 - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
 - [LlamaScript](https://github.com/Project-Llama/llamascript)
+- [llm-axe](https://github.com/emirsahin1/llm-axe) (Python Toolkit for Building LLM Powered Apps)
 - [Gollm](https://docs.gollm.co/examples/ollama-example)
+- [Gollama for Golang](https://github.com/jonathanhecl/gollama)
 - [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
 - [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
 - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
 - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
+- [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
+- [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
+- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
+- [GoLamify](https://github.com/prasad89/golamify)
+- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
+- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)

 ### Mobile

@@ -428,6 +462,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Raycast extension](https://github.com/MassimilianoPasquini97/raycast_ollama)
 - [Discollama](https://github.com/mxyng/discollama) (Discord bot inside the Ollama discord channel)
 - [Continue](https://github.com/continuedev/continue)
+- [Vibe](https://github.com/thewh1teagle/vibe) (Transcribe and analyze meetings with Ollama)
 - [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
 - [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
 - [NotesOllama](https://github.com/andersrex/notesollama) (Apple Notes Ollama plugin)
@@ -452,13 +487,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
+- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
+- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
+- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
 - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
 - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
 - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
 - [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
+- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)

 ### Supported backends

 - [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
-
--- a/api/client.go
+++ b/api/client.go
@@ -55,7 +55,7 @@ func checkError(resp *http.Response, body []byte) error {

 // ClientFromEnvironment creates a new [Client] using configuration from the
 // environment variable OLLAMA_HOST, which points to the network host and
-// port on which the ollama service is listenting. The format of this variable
+// port on which the ollama service is listening. The format of this variable
 // is:
 //
 //	<scheme>://<host>:<port>
--- a/api/types.go
+++ b/api/types.go
@@ -12,7 +12,7 @@ import (
 	"time"
 )

-// StatusError is an error with and HTTP status code.
+// StatusError is an error with an HTTP status code and message.
 type StatusError struct {
 	StatusCode   int
 	Status       string
@@ -57,7 +57,7 @@ type GenerateRequest struct {
 	Template string `json:"template"`

 	// Context is the context parameter returned from a previous call to
-	// Generate call. It can be used to keep a short conversational memory.
+	// [Client.Generate]. It can be used to keep a short conversational memory.
 	Context []int `json:"context,omitempty"`

 	// Stream specifies whether the response is streaming; it is true by default.
@@ -90,14 +90,14 @@ type ChatRequest struct {
 	// Messages is the messages of the chat - can be used to keep a chat memory.
 	Messages []Message `json:"messages"`

-	// Stream enable streaming of returned response; true by default.
+	// Stream enables streaming of returned responses; true by default.
 	Stream *bool `json:"stream,omitempty"`

 	// Format is the format to return the response in (e.g. "json").
 	Format string `json:"format"`

 	// KeepAlive controls how long the model will stay loaded into memory
-	// followin the request.
+	// following the request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

 	// Tools is an optional list of tools the model has access to.
@@ -203,8 +203,8 @@ type Metrics struct {
 	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
 }

-// Options specified in [GenerateRequest], if you add a new option here add it
-// to the API docs also.
+// Options specified in [GenerateRequest].  If you add a new option here, also
+// add it to the API docs.
 type Options struct {
 	Runner

@@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
@@ -613,7 +613,6 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
-			F16KV:     true,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -136,7 +136,7 @@ Type: filesandordirs; Name: "{%TEMP}\ollama*"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"

 [Messages]
-WizardReady=Ollama Windows Preview
+WizardReady=Ollama
 ReadyLabel1=%nLet's get you up and running with your own large language models.
 SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or finish the other installer, then click OK to continue with this install, or Cancel to exit.

--- a/app/tray/wintray/menus.go
+++ b/app/tray/wintray/menus.go
@@ -39,7 +39,7 @@ func (t *winTray) UpdateAvailable(ver string) error {
 		if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
-		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
+		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenuTitle, false); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
 		if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
--- a/app/tray/wintray/messages.go
+++ b/app/tray/wintray/messages.go
@@ -10,6 +10,6 @@ const (

 	quitMenuTitle            = "Quit Ollama"
 	updateAvailableMenuTitle = "An update is available"
-	updateMenutTitle         = "Restart to update"
+	updateMenuTitle          = "Restart to update"
 	diagLogsMenuTitle        = "View logs"
 )
--- a/app/tray/wintray/tray.go
+++ b/app/tray/wintray/tray.go
@@ -361,7 +361,7 @@ func (t *winTray) showMenu() error {

 	boolRet, _, err = pTrackPopupMenu.Call(
 		uintptr(t.menus[0]),
-		TPM_BOTTOMALIGN|TPM_LEFTALIGN,
+		TPM_BOTTOMALIGN|TPM_LEFTALIGN|TPM_RIGHTBUTTON,
 		uintptr(p.X),
 		uintptr(p.Y),
 		0,
--- a/app/tray/wintray/w32api.go
+++ b/app/tray/wintray/w32api.go
@@ -67,6 +67,7 @@ const (
 	SW_HIDE             = 0
 	TPM_BOTTOMALIGN     = 0x0020
 	TPM_LEFTALIGN       = 0x0000
+	TPM_RIGHTBUTTON     = 0x0002
 	WM_CLOSE            = 0x0010
 	WM_USER             = 0x0400
 	WS_CAPTION          = 0x00C00000
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		case "parameters":
 			fmt.Println(resp.Parameters)
 		case "system":
-			fmt.Println(resp.System)
+			fmt.Print(resp.System)
 		case "template":
-			fmt.Println(resp.Template)
+			fmt.Print(resp.Template)
 		}

 		return nil
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -350,7 +350,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				return nil, err
 			}
 		}
-		gpuInfo.DependencyPath = libDir
+		gpuInfo.DependencyPath = []string{libDir}

 		if gfxOverride == "" {
 			// Only load supported list once
--- a/discover/amd_windows.go
+++ b/discover/amd_windows.go
@@ -111,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				UnreliableFreeMemory: true,

 				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-				DependencyPath: libDir,
+				DependencyPath: []string{libDir},
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,
 				Compute:        gfx,
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -240,7 +240,7 @@ func GetGPUInfo() GpuInfoList {
 					Library:        "cpu",
 					Variant:        cpuCapability.String(),
 					ID:             "0",
-					DependencyPath: depPath,
+					DependencyPath: []string{depPath},
 				},
 				CPUs: details,
 			},
@@ -293,11 +293,11 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.DriverMinor = driverMinor
 				variant := cudaVariant(gpuInfo)
 				if depPath != "" {
-					gpuInfo.DependencyPath = depPath
+					gpuInfo.DependencyPath = []string{depPath}
 					// Check for variant specific directory
 					if variant != "" {
 						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
-							gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
+							gpuInfo.DependencyPath = []string{filepath.Join(depPath, "cuda_"+variant), depPath}
 						}
 					}
 				}
@@ -370,7 +370,7 @@ func GetGPUInfo() GpuInfoList {
 						gpuInfo.FreeMemory = uint64(memInfo.free)
 						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-						gpuInfo.DependencyPath = depPath
+						gpuInfo.DependencyPath = []string{depPath}
 						oneapiGPUs = append(oneapiGPUs, gpuInfo)
 					}
 				}
--- a/discover/gpu_info_nvcuda.c
+++ b/discover/gpu_info_nvcuda.c
@@ -4,6 +4,7 @@
 #include "gpu_info_nvcuda.h"

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
  CUresult ret;
  resp->err = NULL;
  resp->num_devices = 0;
@@ -57,8 +58,10 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
      resp->cudaErr = -1;
      return;
    }
+    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
  }

+  LOG(resp->ch.verbose, "calling cuInit\n");
  ret = (*resp->ch.cuInit)(0);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
@@ -75,15 +78,18 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  resp->ch.driver_minor = 0;

  // Report driver version if we're in verbose mode, ignore errors
+  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
  ret = (*resp->ch.cuDriverGetVersion)(&version);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  } else {
+    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
    resp->ch.driver_major = version / 1000;
    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  }

+  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
@@ -94,6 +100,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
    resp->cudaErr = ret;
    return;
  }
+  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
 }

 const int buflen = 256;
--- a/discover/types.go
+++ b/discover/types.go
@@ -25,7 +25,7 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	MinimumMemory uint64 `json:"-"`

 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-	DependencyPath string `json:"lib_path,omitempty"`
+	DependencyPath []string `json:"lib_path,omitempty"`

 	// Extra environment variables specific to the GPU as list of [key,value]
 	EnvWorkarounds [][2]string `json:"envs,omitempty"`
--- a/docs/api.md
+++ b/docs/api.md
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
    "num_gpu": 1,
    "main_gpu": 0,
    "low_vram": false,
-    "f16_kv": true,
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
@@ -831,10 +830,30 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m

 ### Parameters

- `name`: name of the model to create
+- `model`: name of the model to create
 - `modelfile` (optional): contents of the Modelfile
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 - `path` (optional): path to the Modelfile
+- `quantize` (optional): quantize a non-quantized (e.g. float16) model
+
+#### Quantization types
+
+| Type | Recommended |
+| --- | :-: |
+| q2_K | |
+| q3_K_L | |
+| q3_K_M | |
+| q3_K_S | |
+| q4_0 | |
+| q4_1 | |
+| q4_K_M | * |
+| q4_K_S | |
+| q5_0 | |
+| q5_1 | |
+| q5_K_M | |
+| q5_K_S | |
+| q6_K | |
+| q8_0 | * |

 ### Examples

@@ -846,14 +865,14 @@ Create a new model from a `Modelfile`.

 ```shell
 curl http://localhost:11434/api/create -d '{
-  "name": "mario",
+  "model": "mario",
  "modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
 }'
 ```

 ##### Response

-A stream of JSON objects. Notice that the final JSON object shows a `"status": "success"`.
+A stream of JSON objects is returned:

 ```json
 {"status":"reading model metadata"}
@@ -869,13 +888,43 @@ A stream of JSON objects. Notice that the final JSON object shows a `"status": "
 {"status":"success"}
 ```

+#### Quantize a model
+
+Quantize a non-quantized model.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/create -d '{
+  "model": "llama3.1:quantized",
+  "modelfile": "FROM llama3.1:8b-instruct-fp16",
+  "quantize": "q4_K_M"
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```
+{"status":"quantizing F16 model to Q4_K_M"}
+{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
+{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
+{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
+{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
+{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
+{"status":"writing manifest"}
+{"status":"success"}
+```
+
+
 ### Check if a Blob Exists

 ```shell
 HEAD /api/blobs/:digest
 ```

-Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not Ollama.ai.
+Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not ollama.com.

 #### Query Parameters

@@ -980,7 +1029,7 @@ Show information about a model including details, modelfile, template, parameter

 ### Parameters

- `name`: name of the model to show
+- `model`: name of the model to show
 - `verbose`: (optional) if set to `true`, returns full data for verbose response fields

 ### Examples
@@ -989,7 +1038,7 @@ Show information about a model including details, modelfile, template, parameter

 ```shell
 curl http://localhost:11434/api/show -d '{
-  "name": "llama3.2"
+  "model": "llama3.2"
 }'
 ```

@@ -1069,7 +1118,7 @@ Delete a model and its data.

 ### Parameters

- `name`: model name to delete
+- `model`: model name to delete

 ### Examples

@@ -1077,7 +1126,7 @@ Delete a model and its data.

 ```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
-  "name": "llama3:13b"
+  "model": "llama3:13b"
 }'
 ```

@@ -1095,7 +1144,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where

 ### Parameters

- `name`: name of the model to pull
+- `model`: name of the model to pull
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects

@@ -1105,7 +1154,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where

 ```shell
 curl http://localhost:11434/api/pull -d '{
-  "name": "llama3.2"
+  "model": "llama3.2"
 }'
 ```

@@ -1167,7 +1216,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding

 ### Parameters

- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
+- `model`: name of the model to push in the form of `<namespace>/<model>:<tag>`
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects

@@ -1177,7 +1226,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding

 ```shell
 curl http://localhost:11434/api/push -d '{
-  "name": "mattw/pygmalion:latest"
+  "model": "mattw/pygmalion:latest"
 }'
 ```

--- a/docs/development.md
+++ b/docs/development.md
@@ -108,7 +108,7 @@ Custom CPU settings are not currently supported in the new Go server build but w

 #### Containerized Linux Build

-If you have Docker available, you can build linux binaries with `OLLAMA_NEW_RUNNERS=1 ./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`

 ### Windows

--- a/docs/docker.md
+++ b/docs/docker.md
@@ -50,6 +50,9 @@ sudo systemctl restart docker
 docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
 ```

+> [!NOTE]  
+> If you're running on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version. Pass the environment variable JETSON_JETPACK=5 or JETSON_JETPACK=6 to the container to select version 5 or 6.
+
 ### AMD GPU

 To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
--- a/docs/import.md
+++ b/docs/import.md
@@ -32,7 +32,7 @@ ollama run my-model

 Ollama supports importing adapters based on several different model architectures including:

-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
  * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
  * Gemma (including Gemma 1 and Gemma 2)

@@ -67,14 +67,12 @@ ollama run my-model

 Ollama supports importing models for several different architectures including:

-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
  * Mistral (including Mistral 1, Mistral 2, and Mixtral);
  * Gemma (including Gemma 1 and Gemma 2); and
  * Phi3

-This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model.
-
-
+This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.
 ## Importing a GGUF based model or adapter

 If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
@@ -83,7 +81,7 @@ If you have a GGUF based model or adapter it is possible to import it into Ollam
  * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
  * downloading a model or adapter from a place such as HuggingFace

-To import a GGUF model, create a `Modelfile` containg:
+To import a GGUF model, create a `Modelfile` containing:

 ```dockerfile
 FROM /path/to/file.gguf
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -112,6 +112,21 @@ sudo systemctl status ollama
 > https://www.amd.com/en/support/linux-drivers for best support of your Radeon
 > GPU.

+## Customizing
+
+To customize the installation of Ollama, you can edit the systemd service file or the environment variables by running:
+
+```
+sudo systemctl edit ollama
+```
+
+Alternatively, create an override file manually in `/etc/systemd/system/ollama.service.d/override.conf`:
+
+```ini
+[Service]
+Environment="OLLAMA_DEBUG=1"
+```
+
 ## Updating

 Update Ollama by running the install script again:
@@ -129,7 +144,7 @@ sudo tar -C /usr -xzf ollama-linux-amd64.tgz

 ## Installing specific versions

-Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases). 
+Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases).

 For example:

--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -120,7 +120,7 @@ FROM <model directory>
 The model directory should contain the Safetensors weights for a supported architecture.

 Currently supported model architectures:
-  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2)
  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
  * Gemma (including Gemma 1 and Gemma 2)
  * Phi3
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -95,13 +95,21 @@ If none of those resolve the problem, gather additional information and file an

 On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.

-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.   For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` 
+
+If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker.  Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.

 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems
 - `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
 - Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`

+## Multiple AMD GPUs
+
+If you experience gibberish responses when models load across multiple AMD GPUs on Linux, see the following guide.
+
+- https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/mgpu.html#mgpu-known-issues-and-limitations
+
 ## Windows Terminal Errors

 Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly.  This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`  To resolve this problem, please update to Win 10 22H1 or newer.
--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@@ -10,7 +10,7 @@ This sounds like a typical censored response, but even llama2-uncensored gives a

 So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python.

-Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:
+Let's start by asking a simple question that we can get an answer to from the **Llama3** model using **Ollama**. First, we need to install the **LangChain** package:

 `pip install langchain_community`

--- a/go.mod
+++ b/go.mod
@@ -12,7 +12,7 @@ require (
 	github.com/spf13/cobra v1.7.0
 	github.com/stretchr/testify v1.9.0
 	github.com/x448/float16 v0.8.4
-	golang.org/x/sync v0.3.0
+	golang.org/x/sync v0.9.0
 )

 require (
@@ -22,7 +22,7 @@ require (
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
-	golang.org/x/image v0.14.0
+	golang.org/x/image v0.22.0
 )

 require (
@@ -73,7 +73,7 @@ require (
 	golang.org/x/net v0.25.0 // indirect
 	golang.org/x/sys v0.20.0
 	golang.org/x/term v0.20.0
-	golang.org/x/text v0.15.0
+	golang.org/x/text v0.20.0
 	google.golang.org/protobuf v1.34.1
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -232,6 +232,8 @@ golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+o
 golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
 golang.org/x/image v0.14.0 h1:tNgSxAFe3jC4uYqvZdTr84SZoM1KfwdC9SKIFrLjFn4=
 golang.org/x/image v0.14.0/go.mod h1:HUYqC05R2ZcZ3ejNQsIHQDQiwWM4JBqmm6MKANTp4LE=
+golang.org/x/image v0.22.0 h1:UtK5yLUzilVrkjMAZAZ34DXGpASN8i8pj8g+O+yd10g=
+golang.org/x/image v0.22.0/go.mod h1:9hPFhljd4zZ1GNSIZJ49sqbp45GKK9t6w+iXvGqZUz4=
 golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
 golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
 golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
@@ -267,6 +269,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
 golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
+golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=
+golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -293,6 +297,8 @@ golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
 golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
+golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -10,7 +10,38 @@ import (
 	"github.com/ollama/ollama/api"
 )

+func TestLongInputContext(t *testing.T) {
+	// Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
+	// we asked for and there is nothing extra that we could spill over into
+	t.Setenv("OLLAMA_NUM_PARALLEL", "1")
+
+	// Longer needed for small footprint GPUs
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+	// Set up the test data
+	req := api.GenerateRequest{
+		Model:  "llama2",
+		Prompt: "Oh, don’t speak to me of Austria. Perhaps I don’t understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexander’s loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I don’t believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
+		Stream: &stream,
+		Options: map[string]interface{}{
+			"temperature": 0,
+			"seed":        123,
+			"num_ctx":     128,
+		},
+	}
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatalf("PullIfMissing failed: %v", err)
+	}
+	DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia"}, 120*time.Second, 10*time.Second)
+}
+
 func TestContextExhaustion(t *testing.T) {
+	// Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
+	// we asked for and there is nothing extra that we could spill over into
+	t.Setenv("OLLAMA_NUM_PARALLEL", "1")
+
 	// Longer needed for small footprint GPUs
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()
--- a/llama/README.md
+++ b/llama/README.md
@@ -55,7 +55,7 @@ go build -tags avx,cuda .

 ### ROCm

-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive):
+Install [ROCm](https://rocm.docs.amd.com/en/latest/).

 ```shell
 make ggml_hipblas.so
@@ -77,7 +77,7 @@ go build -tags avx,cuda .

 ### ROCm

-Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/).
+Install [ROCm](https://rocm.docs.amd.com/en/latest/).

 ```shell
 make ggml_hipblas.dll
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -21,6 +21,8 @@ package llama
 #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
+#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
+#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
 #cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64
 #cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64
 #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
@@ -36,8 +38,8 @@ package llama
 #cgo linux CXXFLAGS: -D_GNU_SOURCE
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
-#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
-#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
+#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
+#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
 #cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
 #cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve
@@ -155,9 +157,7 @@ type Context struct {
 	numThreads int
 }

-func (c *Context) KvCacheClear() {
-	C.llama_kv_cache_clear(c.c)
-}
+var ErrKvCacheFull = errors.New("could not find a kv cache slot")

 func (c *Context) Decode(batch *Batch) error {
 	// Positive return values does not mean a fatal error, but rather a warning.
@@ -171,7 +171,7 @@ func (c *Context) Decode(batch *Batch) error {
 	}

 	if code > 0 {
-		return fmt.Errorf("could not find a KV slot for the batch - try reducing the size of the batch or increase the context. code: %d", code)
+		return ErrKvCacheFull
 	}

 	return nil
@@ -193,6 +193,14 @@ func (c *Context) KvCacheSeqCp(srcSeqId int, dstSeqId int, p0 int, p1 int) {
 	C.llama_kv_cache_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1))
 }

+func (c *Context) KvCacheClear() {
+	C.llama_kv_cache_clear(c.c)
+}
+
+func (c *Context) KvCacheDefrag() {
+	C.llama_kv_cache_defrag(c.c)
+}
+
 // Get the embeddings for a sequence id
 func (c *Context) GetEmbeddingsSeq(seqId int) []float32 {
 	embeddings := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId)))
@@ -382,6 +390,8 @@ func (b *Batch) Add(token int, embed []float32, pos int, logits bool, seqIds ...

 	if logits {
 		unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 1
+	} else {
+		unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 0
 	}

 	b.c.n_tokens += 1
@@ -598,6 +608,10 @@ func (c *Context) SetCrossAttention(state bool) {
 	C.llama_set_cross_attention(c.c, C.bool(state))
 }

+func (c *Context) Synchronize() {
+	C.llama_synchronize(c.c)
+}
+
 // sampling
 // TODO: this is a temporary wrapper to allow calling C++ code from CGo
 type SamplingContext struct {
--- a/llama/make/Makefile.rocm
+++ b/llama/make/Makefile.rocm
@@ -58,6 +58,8 @@ endif
 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
 	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	-mf16c \
+	-mfma \
 	-parallel-jobs=2 \
 	-c \
 	-O3 \
@@ -77,6 +79,9 @@ GPU_COMPILER_CUFLAGS = \
 	-D_CRT_SECURE_NO_WARNINGS \
 	-D_GNU_SOURCE \
 	-D_XOPEN_SOURCE=600 \
+	-DUSE_PROF_API=1 \
+	-std=gnu++14 \
+	-x hip \
 	-mllvm=-amdgpu-early-inline-all=true \
 	-mllvm=-amdgpu-function-calls=false \
 	-Wno-expansion-to-defined \
@@ -87,6 +92,12 @@ GPU_COMPILER_CUFLAGS = \
 	-Wno-unused-result \
 	-I.

+# Workaround buggy P2P copy on some windows multi-GPU setups
+# This workaround breaks linux systems with small system RAM, so only enable on windows
+ifeq ($(OS),windows)
+	GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
+endif
+
 include make/gpu.make

 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
--- a/llama/make/cuda.make
+++ b/llama/make/cuda.make
@@ -20,7 +20,7 @@ GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_LIB_DIR)/,$(notdir $(GPU_LIBS))))
+GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))

 ifeq ($(OS),linux)
 	CUDA_PATH?=/usr/local/cuda
--- a/llama/make/gpu.make
+++ b/llama/make/gpu.make
@@ -85,7 +85,7 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS
 	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
+	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@

 # Distribution targets
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
--- a/llama/runner/cache.go
+++ b/llama/runner/cache.go
@@ -2,6 +2,7 @@ package main

 import (
 	"errors"
+	"fmt"
 	"log/slog"
 	"reflect"
 	"time"
@@ -22,7 +23,11 @@ type InputCache struct {
 	lc *llama.Context
 }

-func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) *InputCache {
+func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) {
+	if kvSize/numSlots < 1 {
+		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
+	}
+
 	slots := make([]InputCacheSlot, numSlots)

 	for i := range slots {
@@ -37,7 +42,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
 		slots:          slots,
 		multiUserCache: multiUserCache,
 		lc:             lc,
-	}
+	}, nil
 }

 // Locking: Operations on InputCacheSlot (including finding one
@@ -58,7 +63,7 @@ type InputCacheSlot struct {
 	lastUsed time.Time
 }

-func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, int, error) {
+func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
 	var slot *InputCacheSlot
 	var numPast int
 	var err error
@@ -75,7 +80,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 		slot, numPast, err = c.findBestCacheSlot(prompt)
 	}
 	if err != nil {
-		return nil, nil, 0, err
+		return nil, nil, err
 	}

 	if !cachePrompt {
@@ -102,7 +107,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	prompt = prompt[numPast:]
 	slot.Inputs = slot.Inputs[:numPast]

-	return slot, prompt, numPast, nil
+	return slot, prompt, nil
 }

 func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
@@ -194,14 +199,38 @@ func countCommonPrefix(a []input, b []input) int {
 	return count
 }

-func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscard int, numPast int) {
-	// TODO (jessegross): KV cache removal can fail for certain types of models
-	// server.cpp doesn't handle this, though we can be more graceful
-	c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+numDiscard)
-	c.lc.KvCacheSeqAdd(slot.Id, numKeep+numDiscard, numPast, -numDiscard)
-
-	for i := numKeep + numDiscard; i < len(slot.Inputs); i++ {
-		slot.Inputs[i-numDiscard] = slot.Inputs[i]
+// Frees up space in the KV cache by deleting the oldest half of history and shifting
+// the newest half into that space (saving numKeep inputs at the beginning).
+//
+// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
+func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) error {
+	if numKeep >= c.numCtx {
+		return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
 	}
-	slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
+
+	targetFree := (c.numCtx - numKeep) / 2
+	targetFree = max(targetFree, 1)
+
+	currentFree := c.numCtx - len(slot.Inputs)
+	discard := targetFree - currentFree
+
+	if discard <= 0 {
+		return nil
+	}
+
+	slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
+		"keep", numKeep, "discard", discard)
+
+	// TODO (jessegross): KV cache removal can fail for certain types of models
+	if !c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard) {
+		return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v)", slot.Id, numKeep, discard)
+	}
+	c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard)
+
+	for i := numKeep + discard; i < len(slot.Inputs); i++ {
+		slot.Inputs[i-discard] = slot.Inputs[i]
+	}
+	slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard]
+
+	return nil
 }
--- a/llama/runner/image.go
+++ b/llama/runner/image.go
@@ -68,6 +68,10 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 		return nil, nil
 	}

+	if len(data) <= 0 {
+		return nil, errors.New("received zero length image")
+	}
+
 	hash := c.hashImage(data)

 	c.mu.Lock()
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -20,6 +20,8 @@ import (
 	"time"
 	"unicode/utf8"

+	"golang.org/x/sync/semaphore"
+
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llama"
 )
@@ -34,9 +36,6 @@ type input struct {
 }

 type Sequence struct {
-	// number of inputs evaluated
-	numPast int
-
 	// batch index
 	iBatch int

@@ -46,6 +45,9 @@ type Sequence struct {
 	// prompt inputs left to evaluate
 	inputs []input

+	// inputs that have been added to a batch but not yet submitted to Decode
+	pendingInputs []input
+
 	// tokens that have been generated but not returned yet (e.g. for stop sequences)
 	pendingResponses []string

@@ -112,16 +114,13 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 		params.numKeep = len(inputs)
 	}

-	if !params.embedding {
-		// Subtracting 4 ensures that at least 1 input can be discarded during shift
-		params.numKeep = min(params.numKeep, s.cache.numCtx-4)
-		params.numKeep += s.bosToken
-	} else {
-		// Embeddings are 1 shot - just truncate to the context window, without ever shifting
-		params.numKeep = min(params.numKeep, s.cache.numCtx)
+	if s.model.AddBOSToken() {
+		params.numKeep += 1
 	}

-	// truncate to fit in context window
+	// Ensure that at least 1 input can be discarded during shift
+	params.numKeep = min(params.numKeep, s.cache.numCtx-1)
+
 	if len(inputs) > s.cache.numCtx {
 		slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
 		newInputs := inputs[:params.numKeep]
@@ -170,15 +169,13 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {

 	for i, part := range parts {
 		// text - tokenize
-		if strings.TrimSpace(part) != "" {
-			tokens, err := s.lc.Model().Tokenize(part, i == 0, true)
-			if err != nil {
-				return nil, err
-			}
+		tokens, err := s.lc.Model().Tokenize(part, i == 0, true)
+		if err != nil {
+			return nil, err
+		}

-			for _, t := range tokens {
-				inputs = append(inputs, input{token: t})
-			}
+		for _, t := range tokens {
+			inputs = append(inputs, input{token: t})
 		}

 		// image - generate image embedding
@@ -212,41 +209,51 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 }

 type Server struct {
-	model *llama.Model
-	lc    *llama.Context
+	// is the server ready to process requests?
+	// protects access to model and image
+	ready sync.WaitGroup

-	// required for image embeddings
+	// loaded model
+	model *llama.Model
+
+	// image model context for multi-modal models
 	image *ImageContext

+	// status for external health reporting - loading, ready to serve, etc.
+	status ServerStatus
+
+	// current progress on loading the model
+	progress float32
+
+	// number of simultaneous requests to handle
+	parallel int
+
+	// maximum number of elements in a batch (per sequence)
 	// TODO (jmorganca): make this n_batch
 	batchSize int

-	// parallel is the number of parallel requests to handle
-	parallel int
+	// protects access to everything below this line
+	// this is context state needed for decoding
+	mu sync.Mutex

-	// seqs is the list of parallel sequences being evaluated
-	// TODO (jmorganca): this can probably be moved into run()
+	// indicates that data is ready for processing
+	cond *sync.Cond
+
+	// decoding state
+	lc *llama.Context
+
+	// the list of simultaneous sequences being evaluated
 	seqs []*Sequence

+	// seqs can have a maximum of parallel entries, which
+	// is enfoced by seqSem
+	seqsSem *semaphore.Weighted
+
 	// KV cache
 	cache *InputCache

-	// does this model require a beginning of sequence token?
-	bosToken int
-
 	// next sequence for prompt processing to avoid starvation
 	nextSeq int
-
-	// is the server ready to process requests?
-	ready sync.WaitGroup
-
-	mu sync.Mutex
-
-	cond *sync.Cond
-
-	progress float32
-
-	status ServerStatus
 }

 func (s *Server) allNil() bool {
@@ -258,18 +265,6 @@ func (s *Server) allNil() bool {
 	return true
 }

-func (s *Server) shiftContext(seq *Sequence) {
-	numLeft := seq.numPast - seq.numKeep
-	numDiscard := numLeft / 2
-
-	slog.Debug("context limit hit - shifting", "limit", s.cache.numCtx, "numPast", seq.numPast,
-		"numKeep", seq.numKeep, "numLeft", numLeft, "numDiscard", numDiscard)
-
-	s.cache.ShiftCacheSlot(seq.cache, seq.numKeep, numDiscard, seq.numPast)
-
-	seq.numPast -= numDiscard
-}
-
 func flushPending(seq *Sequence) bool {
 	joined := strings.Join(seq.pendingResponses, "")
 	seq.pendingResponses = []string{}
@@ -335,7 +330,11 @@ func (s *Server) run(ctx context.Context) {
 		case <-ctx.Done():
 			return
 		default:
-			s.processBatch(tokenBatch, embedBatch)
+			err := s.processBatch(tokenBatch, embedBatch)
+			if err != nil {
+				panic(err)
+			}
+
 			tokenBatch.Clear()
 			embedBatch.Clear()
 		}
@@ -349,7 +348,7 @@ func (s *Server) run(ctx context.Context) {
 // these should instead be handled by the handlers
 // it should only be responsible for accepting tokens or embeddings and
 // processing batches as fast as possible
-func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) {
+func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) error {
 	s.mu.Lock()
 	for s.allNil() {
 		s.cond.Wait() // Wait until an item is added
@@ -369,17 +368,23 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		}

 		// if past the num predict limit
-		if seq.numPredict > 0 && seq.numPredicted > seq.numPredict {
+		if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
 			s.removeSequence(seqIdx, "limit")
 			continue
 		}

-		if seq.numPast+len(seq.inputs) > s.cache.numCtx {
-			s.shiftContext(seq)
-		}
-
-		var numInputsProcessed int
 		for i, input := range seq.inputs {
+			if len(seq.cache.Inputs)+len(seq.pendingInputs)+1 > s.cache.numCtx {
+				if len(seq.pendingInputs) == 0 {
+					err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
+					if err != nil {
+						return err
+					}
+				} else {
+					break
+				}
+			}
+
 			embedding := input.embed != nil

 			// If we don't currently have a batch, use one of the correct type and
@@ -403,28 +408,37 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			}

 			crossAttention = seq.crossAttention
-			batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id)
-			seq.numPast++
-			numInputsProcessed++
-		}
-
-		if numInputsProcessed > 0 {
-			seq.cache.Inputs = append(seq.cache.Inputs, seq.inputs[:numInputsProcessed]...)
-			seq.inputs = seq.inputs[numInputsProcessed:]
+			batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
+			seq.pendingInputs = append(seq.pendingInputs, input)
 			seq.iBatch = batch.NumTokens() - 1
 		}
+
+		seq.inputs = seq.inputs[len(seq.pendingInputs):]
 	}

 	if batch == nil || batch.NumTokens() == 0 {
-		return
+		return nil
 	}

 	s.lc.SetCrossAttention(crossAttention)

 	err := s.lc.Decode(batch)
 	if err != nil {
-		slog.Error("failed to decode batch", "error", err)
-		return
+		if errors.Is(err, llama.ErrKvCacheFull) {
+			slog.Debug("defragmenting kv cache")
+			s.cache.lc.KvCacheDefrag()
+			err = s.lc.Decode(batch)
+		}
+		if err != nil {
+			return fmt.Errorf("failed to decode batch: %w", err)
+		}
+	}
+
+	if crossAttention {
+		// synchronize state to ensure the cross attention batch is complete.
+		// needed specifically for multi-GPU systems otherwise an inflight
+		// task may be incorrectly invalidated causing a crash
+		s.lc.Synchronize()
 	}

 	for i, seq := range s.seqs {
@@ -432,6 +446,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}

+		// After calling Decode, pending inputs are now in the cache
+		if len(seq.pendingInputs) > 0 {
+			seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
+			seq.pendingInputs = []input{}
+		}
+
 		// don't sample prompt processing
 		if len(seq.inputs) != 0 {
 			continue
@@ -444,7 +464,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)

 		// if done processing the prompt, generate an embedding and return
 		if seq.embeddingOnly {
-			embed := s.lc.GetEmbeddingsSeq(i)
+			embed := s.lc.GetEmbeddingsSeq(seq.cache.Id)
 			if embed == nil {
 				embed = s.lc.GetEmbeddingsIth(seq.iBatch)
 			}
@@ -514,6 +534,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			s.removeSequence(i, "connection")
 		}
 	}
+
+	return nil
 }

 // TODO (jmorganca): use structs from the api package to avoid duplication
@@ -627,12 +649,17 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	// TODO (jmorganca): add to sequence queue instead of
-	// failing if a slot isn't available
+	// Ensure that a place to put the sequence is available
+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
+		slog.Error("Failed to acquire semaphore", "error", err)
+		return
+	}
+	defer s.seqsSem.Release(1)
+
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
-			seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
 			if err != nil {
 				s.mu.Unlock()
 				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
@@ -711,11 +738,17 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	// TODO (jessegross): Wait for a free slot instead of failing and blocking forever
+	// Ensure that a place to put the sequence is available
+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
+		slog.Error("Failed to acquire semaphore", "error", err)
+		return
+	}
+	defer s.seqsSem.Release(1)
+
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
-			seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
 			if err != nil {
 				s.mu.Unlock()
 				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
@@ -802,10 +835,6 @@ func (s *Server) loadModel(
 		}
 	}

-	if s.model.AddBOSToken() {
-		s.bosToken = 1
-	}
-
 	if ppath != "" {
 		var err error
 		s.image, err = NewImageContext(s.lc, ppath)
@@ -814,7 +843,10 @@ func (s *Server) loadModel(
 		}
 	}

-	s.cache = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
+	s.cache, err = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
+	if err != nil {
+		panic(err)
+	}

 	s.status = ServerStatusReady
 	s.ready.Done()
@@ -837,14 +869,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	// Expose requirements as a JSON output to stdout
 	requirements := flag.Bool("requirements", false, "print json requirement information")

-	// These are either ignored by llama.cpp or have no significance to us
-	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	_ = flag.Bool("log-disable", false, "disables logging to a file")
-	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
-
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
@@ -873,6 +899,7 @@ func main() {
 		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
+		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
 		status:    ServerStatusLoadingModel,
 	}

--- a/llm/server.go
+++ b/llm/server.go
@@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
-		"--embedding",
 	}

 	if opts.NumGPU >= 0 {
@@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}

-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
 	flashAttnEnabled := envconfig.FlashAttention()

 	for _, g := range gpus {
@@ -311,9 +306,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter

 		// Note: we always put the dependency path first
 		// since this was the exact version we compiled/linked against
-		if gpus[0].DependencyPath != "" {
+		if gpus[0].DependencyPath != nil {
 			// assume gpus from the same library have the same dependency path
-			libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
+			libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
 		}

 		server := filepath.Join(dir, "ollama_llama_server")
@@ -843,13 +838,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	}

 	if err := scanner.Err(); err != nil {
-		if strings.Contains(err.Error(), "unexpected EOF") {
+		if strings.Contains(err.Error(), "unexpected EOF") || strings.Contains(err.Error(), "forcibly closed") {
 			s.Close()
-			msg := ""
+			var msg string
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
+			} else {
+				msg = err.Error()
 			}
-			return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
+			return fmt.Errorf("an error was encountered while running the model: %s", msg)
 		}

 		return fmt.Errorf("error reading llm response: %v", err)
@@ -1097,7 +1094,9 @@ func (s *llmServer) EstimatedTotal() uint64 {
 func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
 	for i, gpu := range s.gpus {
 		if gpu.ID == gpuID {
-			return s.estimate.GPUSizes[i]
+			if i < len(s.estimate.GPUSizes) {
+				return s.estimate.GPUSizes[i]
+			}
 		}
 	}
 	return 0
--- a/llm/status.go
+++ b/llm/status.go
@@ -27,6 +27,7 @@ var errorPrefixes = []string{
 	"\"ERR\"",
 	"error loading model",
 	"GGML_ASSERT",
+	"Deepseek2 does not support K-shift",
 }

 func (w *StatusWriter) Write(b []byte) (int, error) {
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -65,9 +65,22 @@ var (
 	errInvalidCommand     = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
 )

+type ParserError struct {
+	LineNumber int
+	Msg        string
+}
+
+func (e *ParserError) Error() string {
+	if e.LineNumber > 0 {
+		return fmt.Sprintf("(line %d): %s", e.LineNumber, e.Msg)
+	}
+	return e.Msg
+}
+
 func ParseFile(r io.Reader) (*File, error) {
 	var cmd Command
 	var curr state
+	var currLine int = 1
 	var b bytes.Buffer
 	var role string

@@ -84,11 +97,18 @@ func ParseFile(r io.Reader) (*File, error) {
 			return nil, err
 		}

+		if isNewline(r) {
+			currLine++
+		}
+
 		next, r, err := parseRuneForState(r, curr)
 		if errors.Is(err, io.ErrUnexpectedEOF) {
 			return nil, fmt.Errorf("%w: %s", err, b.String())
 		} else if err != nil {
-			return nil, err
+			return nil, &ParserError{
+				LineNumber: currLine,
+				Msg:        err.Error(),
+			}
 		}

 		// process the state transition, some transitions need to be intercepted and redirected
@@ -96,7 +116,10 @@ func ParseFile(r io.Reader) (*File, error) {
 			switch curr {
 			case stateName:
 				if !isValidCommand(b.String()) {
-					return nil, errInvalidCommand
+					return nil, &ParserError{
+						LineNumber: currLine,
+						Msg:        errInvalidCommand.Error(),
+					}
 				}

 				// next state sometimes depends on the current buffer value
@@ -117,7 +140,10 @@ func ParseFile(r io.Reader) (*File, error) {
 				cmd.Name = b.String()
 			case stateMessage:
 				if !isValidMessageRole(b.String()) {
-					return nil, errInvalidMessageRole
+					return nil, &ParserError{
+						LineNumber: currLine,
+						Msg:        errInvalidMessageRole.Error(),
+					}
 				}

 				role = b.String()
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -3,6 +3,7 @@ package parser
 import (
 	"bytes"
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"io"
 	"strings"
@@ -180,8 +181,15 @@ func TestParseFileBadCommand(t *testing.T) {
 FROM foo
 BADCOMMAND param1 value1
 `
+	parserError := &ParserError{
+		LineNumber: 3,
+		Msg:        errInvalidCommand.Error(),
+	}
+
 	_, err := ParseFile(strings.NewReader(input))
-	require.ErrorIs(t, err, errInvalidCommand)
+	if !errors.As(err, &parserError) {
+		t.Errorf("unexpected error: expected: %s, actual: %s", parserError.Error(), err.Error())
+	}
 }

 func TestParseFileMessages(t *testing.T) {
@@ -245,7 +253,10 @@ FROM foo
 MESSAGE badguy I'm a bad guy!
 `,
 			nil,
-			errInvalidMessageRole,
+			&ParserError{
+				LineNumber: 3,
+				Msg:        errInvalidMessageRole.Error(),
+			},
 		},
 		{
 			`
@@ -264,13 +275,35 @@ MESSAGE system`,
 		},
 	}

-	for _, c := range cases {
+	for _, tt := range cases {
 		t.Run("", func(t *testing.T) {
-			modelfile, err := ParseFile(strings.NewReader(c.input))
-			require.ErrorIs(t, err, c.err)
+			modelfile, err := ParseFile(strings.NewReader(tt.input))
+
 			if modelfile != nil {
-				assert.Equal(t, c.expected, modelfile.Commands)
+				assert.Equal(t, tt.expected, modelfile.Commands)
 			}
+
+			if tt.err == nil {
+				if err != nil {
+					t.Fatalf("expected no error, but got %v", err)
+				}
+				return
+			}
+
+			switch tt.err.(type) {
+			case *ParserError:
+				var pErr *ParserError
+				if errors.As(err, &pErr) {
+					// got the correct type of error
+					return
+				}
+			}
+
+			if errors.Is(err, tt.err) {
+				return
+			}
+
+			t.Fatalf("unexpected error: expected: %v, actual: %v", tt.err, err)
 		})
 	}
 }
@@ -440,7 +473,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gpu 1":                    {"num_gpu", "1"},
 		"main_gpu 1":                   {"main_gpu", "1"},
 		"low_vram true":                {"low_vram", "true"},
-		"f16_kv true":                  {"f16_kv", "true"},
 		"logits_all true":              {"logits_all", "true"},
 		"vocab_only true":              {"vocab_only", "true"},
 		"use_mmap true":                {"use_mmap", "true"},
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -6,17 +6,18 @@ set -e

 mkdir -p dist

+# These require Xcode v13 or older to target MacOS v11
+# If installed to an alternate location use the following to enable
+# export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+# export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
+export CGO_CFLAGS=-mmacosx-version-min=11.3
+export CGO_CXXFLAGS=-mmacosx-version-min=11.3
+export CGO_LDFLAGS=-mmacosx-version-min=11.3
+
 for TARGETARCH in arm64 amd64; do
    echo "Building Go runner darwin $TARGETARCH"
    rm -rf llama/build
    GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
-    # These require Xcode v13 or older to target MacOS v11
-    # If installed to an alternate location use the following to enable
-    # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-    # export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
-    export CGO_CFLAGS=-mmacosx-version-min=11.3
-    export CGO_CXXFLAGS=-mmacosx-version-min=11.3
-    export CGO_LDFLAGS=-mmacosx-version-min=11.3
    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
 done
--- a/scripts/env.sh
+++ b/scripts/env.sh
@@ -5,7 +5,6 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$V
 # TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
 PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
 DOCKER_ORG=${DOCKER_ORG:-"ollama"}
-RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
 FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
 OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
    --build-arg=GOFLAGS \
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -4,9 +4,12 @@

 set -eu

+red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
+plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
+
 status() { echo ">>> $*" >&2; }
-error() { echo "ERROR $*"; exit 1; }
-warning() { echo "WARNING: $*"; }
+error() { echo "${red}ERROR:${plain} $*"; exit 1; }
+warning() { echo "${red}WARNING:${plain} $*"; }

 TEMP_DIR=$(mktemp -d)
 cleanup() { rm -rf $TEMP_DIR; }
@@ -93,6 +96,22 @@ else
    fi
 fi

+# Check for NVIDIA JetPack systems with additional downloads
+if [ -f /etc/nv_tegra_release ] ; then
+    if grep R36 /etc/nv_tegra_release > /dev/null ; then
+        status "Downloading JetPack 6 components"
+        curl --fail --show-error --location --progress-bar \
+            "https://ollama.com/download/ollama-linux-${ARCH}-jetpack6.tgz${VER_PARAM}" | \
+            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+    elif grep R35 /etc/nv_tegra_release > /dev/null ; then
+        status "Downloading JetPack 5 components"
+        curl --fail --show-error --location --progress-bar \
+            "https://ollama.com/download/ollama-linux-${ARCH}-jetpack5.tgz${VER_PARAM}" | \
+            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+    else
+        warning "Unsupported JetPack version detected.  GPU may not be supported"
+    fi
+fi

 install_success() {
    status 'The Ollama API is now available at 127.0.0.1:11434.'
@@ -146,6 +165,12 @@ EOF
            start_service() { $SUDO systemctl restart ollama; }
            trap start_service EXIT
            ;;
+        *)
+            warning "systemd is not running"
+            if [ "$IS_WSL2" = true ]; then
+                warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it"
+            fi
+            ;;
    esac
 }

@@ -163,6 +188,13 @@ if [ "$IS_WSL2" = true ]; then
    exit 0
 fi

+# Don't attempt to install drivers on Jetson systems
+if [ -f /etc/nv_tegra_release ] ; then
+    status "NVIDIA JetPack ready."
+    install_success
+    exit 0
+fi
+
 # Install GPU dependencies on Linux
 if ! available lspci && ! available lshw; then
    warning "Unable to detect NVIDIA/AMD GPU. Install lspci or lshw to automatically detect and install GPU dependencies."
--- a/server/images.go
+++ b/server/images.go
@@ -13,6 +13,7 @@ import (
 	"io"
 	"log"
 	"log/slog"
+	"net"
 	"net/http"
 	"net/url"
 	"os"
@@ -1071,6 +1072,21 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
 	return nil, errUnauthorized
 }

+// testMakeRequestDialContext specifies the dial function for the http client in
+// makeRequest. It can be used to resolve hosts in model names to local
+// addresses for testing. For example, the model name ("example.com/my/model")
+// can be directed to push/pull from "127.0.0.1:1234".
+//
+// This is not safe to set across goroutines. It should be set in
+// the main test goroutine, and not by tests marked to run in parallel with
+// t.Parallel().
+//
+// It should be cleared after use, otherwise it will affect other tests.
+//
+// Ideally we would have some set this up the stack, but the code is not
+// structured in a way that makes this easy, so this will have to do for now.
+var testMakeRequestDialContext func(ctx context.Context, network, addr string) (net.Conn, error)
+
 func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *registryOptions) (*http.Response, error) {
 	if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure {
 		requestURL.Scheme = "http"
@@ -1105,6 +1121,9 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
 	}

 	resp, err := (&http.Client{
+		Transport: &http.Transport{
+			DialContext: testMakeRequestDialContext,
+		},
 		CheckRedirect: regOpts.CheckRedirect,
 	}).Do(req)
 	if err != nil {
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -32,7 +32,7 @@ func TestChatPrompt(t *testing.T) {
 	mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}

 	createImg := func(width, height int) ([]byte, error) {
-		img := image.NewRGBA(image.Rect(0, 0, 5, 5))
+		img := image.NewRGBA(image.Rect(0, 0, width, height))
 		var buf bytes.Buffer

 		if err := png.Encode(&buf, img); err != nil {
--- a/server/routes.go
+++ b/server/routes.go
@@ -507,7 +507,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
-		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
+		c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Errorf("failed to generate embedding: %v", err)})
 		return
 	}

@@ -540,7 +540,8 @@ func (s *Server) PullHandler(c *gin.Context) {
 		return
 	}

-	if err := checkNameExists(name); err != nil {
+	name, err = getExistingName(name)
+	if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
@@ -621,19 +622,20 @@ func (s *Server) PushHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func checkNameExists(name model.Name) error {
-	names, err := Manifests(true)
+// getExistingName returns the original, on disk name if the input name is a
+// case-insensitive match, otherwise it returns the input name.
+func getExistingName(n model.Name) (model.Name, error) {
+	var zero model.Name
+	existing, err := Manifests(true)
 	if err != nil {
-		return err
+		return zero, err
 	}
-
-	for n := range names {
-		if strings.EqualFold(n.Filepath(), name.Filepath()) && n != name {
-			return errors.New("a model with that name already exists")
+	for e := range existing {
+		if n.EqualFold(e) {
+			return e, nil
 		}
 	}
-
-	return nil
+	return n, nil
 }

 func (s *Server) CreateHandler(c *gin.Context) {
@@ -652,7 +654,8 @@ func (s *Server) CreateHandler(c *gin.Context) {
 		return
 	}

-	if err := checkNameExists(name); err != nil {
+	name, err := getExistingName(name)
+	if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
@@ -958,14 +961,19 @@ func (s *Server) CopyHandler(c *gin.Context) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("source %q is invalid", r.Source)})
 		return
 	}
+	src, err := getExistingName(src)
+	if err != nil {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}

 	dst := model.ParseName(r.Destination)
 	if !dst.IsValid() {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Destination)})
 		return
 	}
-
-	if err := checkNameExists(dst); err != nil {
+	dst, err = getExistingName(dst)
+	if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -7,13 +7,18 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
+	"io/fs"
 	"math"
+	"math/rand/v2"
+	"net"
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"path/filepath"
 	"sort"
 	"strings"
 	"testing"
+	"unicode"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
@@ -473,83 +478,129 @@ func Test_Routes(t *testing.T) {
 	}
 }

-func TestCase(t *testing.T) {
+func casingShuffle(s string) string {
+	rr := []rune(s)
+	for i := range rr {
+		if rand.N(2) == 0 {
+			rr[i] = unicode.ToUpper(rr[i])
+		} else {
+			rr[i] = unicode.ToLower(rr[i])
+		}
+	}
+	return string(rr)
+}
+
+func TestManifestCaseSensitivity(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", t.TempDir())

-	cases := []string{
-		"mistral",
-		"llama3:latest",
-		"library/phi3:q4_0",
-		"registry.ollama.ai/library/gemma:q5_K_M",
-		// TODO: host:port currently fails on windows (#4107)
-		// "localhost:5000/alice/bob:latest",
+	r := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		io.WriteString(w, `{}`) //nolint:errcheck
+	}))
+	defer r.Close()
+
+	nameUsed := make(map[string]bool)
+	name := func() string {
+		const fqmn = "example/namespace/model:tag"
+		for {
+			v := casingShuffle(fqmn)
+			if nameUsed[v] {
+				continue
+			}
+			nameUsed[v] = true
+			return v
+		}
+	}
+
+	wantStableName := name()
+
+	// checkManifestList tests that there is strictly one manifest in the
+	// models directory, and that the manifest is for the model under test.
+	checkManifestList := func() {
+		t.Helper()
+
+		mandir := filepath.Join(os.Getenv("OLLAMA_MODELS"), "manifests/")
+		var entries []string
+		t.Logf("dir entries:")
+		fsys := os.DirFS(mandir)
+		err := fs.WalkDir(fsys, ".", func(path string, info fs.DirEntry, err error) error {
+			if err != nil {
+				return err
+			}
+			t.Logf("    %s", fs.FormatDirEntry(info))
+			if info.IsDir() {
+				return nil
+			}
+			path = strings.TrimPrefix(path, mandir)
+			entries = append(entries, path)
+			return nil
+		})
+		if err != nil {
+			t.Fatalf("failed to walk directory: %v", err)
+		}
+
+		if len(entries) != 1 {
+			t.Errorf("len(got) = %d, want 1", len(entries))
+			return // do not use Fatal so following steps run
+		}
+
+		g := entries[0] // raw path
+		g = filepath.ToSlash(g)
+		w := model.ParseName(wantStableName).Filepath()
+		w = filepath.ToSlash(w)
+		if g != w {
+			t.Errorf("\ngot:  %s\nwant: %s", g, w)
+		}
+	}
+
+	checkOK := func(w *httptest.ResponseRecorder) {
+		t.Helper()
+		if w.Code != http.StatusOK {
+			t.Errorf("code = %d, want 200", w.Code)
+			t.Logf("body: %s", w.Body.String())
+		}
 	}

 	var s Server
-	for _, tt := range cases {
-		t.Run(tt, func(t *testing.T) {
-			w := createRequest(t, s.CreateHandler, api.CreateRequest{
-				Name:      tt,
-				Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
-				Stream:    &stream,
-			})
-
-			if w.Code != http.StatusOK {
-				t.Fatalf("expected status 200 got %d", w.Code)
-			}
-
-			expect, err := json.Marshal(map[string]string{"error": "a model with that name already exists"})
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			t.Run("create", func(t *testing.T) {
-				w = createRequest(t, s.CreateHandler, api.CreateRequest{
-					Name:      strings.ToUpper(tt),
-					Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
-					Stream:    &stream,
-				})
-
-				if w.Code != http.StatusBadRequest {
-					t.Fatalf("expected status 500 got %d", w.Code)
-				}
-
-				if !bytes.Equal(w.Body.Bytes(), expect) {
-					t.Fatalf("expected error %s got %s", expect, w.Body.String())
-				}
-			})
-
-			t.Run("pull", func(t *testing.T) {
-				w := createRequest(t, s.PullHandler, api.PullRequest{
-					Name:   strings.ToUpper(tt),
-					Stream: &stream,
-				})
-
-				if w.Code != http.StatusBadRequest {
-					t.Fatalf("expected status 500 got %d", w.Code)
-				}
-
-				if !bytes.Equal(w.Body.Bytes(), expect) {
-					t.Fatalf("expected error %s got %s", expect, w.Body.String())
-				}
-			})
-
-			t.Run("copy", func(t *testing.T) {
-				w := createRequest(t, s.CopyHandler, api.CopyRequest{
-					Source:      tt,
-					Destination: strings.ToUpper(tt),
-				})
-
-				if w.Code != http.StatusBadRequest {
-					t.Fatalf("expected status 500 got %d", w.Code)
-				}
-
-				if !bytes.Equal(w.Body.Bytes(), expect) {
-					t.Fatalf("expected error %s got %s", expect, w.Body.String())
-				}
-			})
-		})
+	testMakeRequestDialContext = func(ctx context.Context, _, _ string) (net.Conn, error) {
+		var d net.Dialer
+		return d.DialContext(ctx, "tcp", r.Listener.Addr().String())
 	}
+	t.Cleanup(func() { testMakeRequestDialContext = nil })
+
+	t.Logf("creating")
+	checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{
+		// Start with the stable name, and later use a case-shuffled
+		// version.
+		Name: wantStableName,
+
+		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
+		Stream:    &stream,
+	}))
+	checkManifestList()
+
+	t.Logf("creating (again)")
+	checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{
+		Name:      name(),
+		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
+		Stream:    &stream,
+	}))
+	checkManifestList()
+
+	t.Logf("pulling")
+	checkOK(createRequest(t, s.PullHandler, api.PullRequest{
+		Name:     name(),
+		Stream:   &stream,
+		Insecure: true,
+	}))
+	checkManifestList()
+
+	t.Logf("copying")
+	checkOK(createRequest(t, s.CopyHandler, api.CopyRequest{
+		Source:      name(),
+		Destination: name(),
+	}))
+	checkManifestList()
 }

 func TestShow(t *testing.T) {
--- a/server/sched.go
+++ b/server/sched.go
@@ -130,11 +130,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				continue
 			}
 			numParallel := int(envconfig.NumParallel())
-			// TODO (jmorganca): multimodal models don't support parallel yet
+			// TODO (jmorganca): mllama doesn't support parallel yet
 			// see https://github.com/ollama/ollama/issues/4165
-			if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
+			if checkMllamaModelFamily(pending.model) && numParallel != 1 {
 				numParallel = 1
-				slog.Warn("multimodal models don't support parallel requests yet")
+				slog.Warn("mllama doesn't support parallel requests yet")
 			}

 			for {
--- a/types/model/name.go
+++ b/types/model/name.go
@@ -298,6 +298,13 @@ func (n Name) LogValue() slog.Value {
 	return slog.StringValue(n.String())
 }

+func (n Name) EqualFold(o Name) bool {
+	return strings.EqualFold(n.Host, o.Host) &&
+		strings.EqualFold(n.Namespace, o.Namespace) &&
+		strings.EqualFold(n.Model, o.Model) &&
+		strings.EqualFold(n.Tag, o.Tag)
+}
+
 func isValidLen(kind partKind, s string) bool {
 	switch kind {
 	case kindHost:
Author	SHA1	Message	Date
R0CKSTAR	b7bddeebc1	env.sh: cleanup unused RELEASE_IMAGE_REPO (#6855 ) Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>	2024-11-21 08:28:04 -08:00
Paul Robello	6a0c2ec50f	readme: add terminal tool ParLlama to community integrations (#5623 )	2024-11-21 02:55:35 -08:00
毛巳煜	baa41be2aa	readme: add a community made ollama web management tool (#7126 )	2024-11-21 02:51:45 -08:00
xuyangbocn	2157b1232e	readme: add Terraform AWS Ollama & Open WebUI community example (#5633 )	2024-11-21 02:28:57 -08:00
emrgnt-cmplxty	37711578a2	readme: add R2R to community integrations (#5587 )	2024-11-21 02:09:36 -08:00
Cyril Blaecke	fb2c9594e0	readme: Add Nosia to Community Integrations (#5381 )	2024-11-21 02:07:17 -08:00
Christian Tzolov	7fbcd55da3	readme: Add Spring AI library reference (#5981 )	2024-11-21 02:02:14 -08:00
Philippe Charrière	b4348bdd25	readme: add Parakeet to community integrations Parakeet is a GoLang SDK for Ollama --------- Co-authored-by: Parth Sareen <parth.sareen@ollama.com>	2024-11-21 02:00:32 -08:00
Marcin Szczygliński	155734e09a	readme: add community integration py-gpt (#6503 )	2024-11-21 01:54:39 -08:00
Michael	883d80e097	readme: add Promptery to community integrations (#7093 )	2024-11-21 01:46:20 -08:00
Jakub Burkiewicz	e4c9f75b23	readme: add node-red-contrib-ollama to community integrations (#4648 )	2024-11-21 01:09:37 -08:00
Dezoito	f5ec7cc872	readme: add ollama grid search, a community project (#4301 )	2024-11-21 01:02:46 -08:00
Franco Lombardo	811bafba82	readme: Add LLPhant to community integrations (#5679 )	2024-11-21 00:54:26 -08:00
Aarushi	431075fcbb	readme: add autogpt integration to list of community integrations (#6459 )	2024-11-21 00:51:38 -08:00
Kevin Brake	c4f27225ac	readme: add community contribution to readme ollama-kis (#5575 )	2024-11-21 00:31:27 -08:00
chyok	b7aa5ee06c	readme: Add tkinter-based client to community based integrations (#5412 )	2024-11-21 00:19:24 -08:00
Nico	3f87f71755	readme: add Shinkai Desktop to community integrations (#4877 )	2024-11-21 00:16:18 -08:00
Laurent Eschenauer	20623cec13	readme: add OpenGPA to community integrations (#5497 )	2024-11-21 00:13:54 -08:00
Andy Gill	0e5f31a86d	readme: add Haverscript to community integrations (#6945 ) Haverscript uses classical functional programming techniques to provide a composable interface for interacting with ollama-hosted LLMs.	2024-11-21 00:11:39 -08:00
drunkwcodes	7e92091751	readme: Terminal app bb7 to community integrations (#7064 )	2024-11-21 00:03:11 -08:00
boessu	1a742f54c9	readme: update AMD ROCm links (#7213 )	2024-11-20 23:48:55 -08:00
奶茶叔叔	6a89dcf848	readme: flutter-based chat app to community integrations (#7221 )	2024-11-20 23:30:10 -08:00
Alexander F. Rødseth	c5e238e8e5	readme: orbiton to community integrations (#7770 )	2024-11-20 23:24:05 -08:00
Nikita Ganzikov	fce30f407a	app: typo in wintray messages const (#7705 )	2024-11-20 22:01:58 -08:00
Daniel Hiltgen	d863298210	docs: Link to AMD guide on multi-GPU guidance (#7744 )	2024-11-20 16:00:46 -08:00
Jesse Gross	c4b34f2a2a	runner.go: Truncate inputs that exceed context rather than shifting Previous versions of the runner would truncate inputs to the context window before beginning processing. The main processing loop relied on this behavior if the context needed to be shifted later (due to token generation). If truncation did not occur then invariants would be broken, causing crashes or infinite loops. Later versions attempted to fix these bugs and make the logic less subtle so that all inputs could be handled. Truncation was removed to make things consistent. However, truncation is much faster than processing and shifting, so removing it caused performance problems when the input vastly exceeded the context size. This restores the input truncation as a performance optimization while keeping the more robust processing logic. Fixes #7762	2024-11-20 12:49:24 -08:00
Jesse Gross	c3ff916431	runner.go: Don't add inputs to cache view until actually processed We need to track which tokens are in the cache ourselves. We currently add tokens to the cache tracker when we add them to batch but they are not actually in the cache until we call Decode. This can cause confusion when we are shifting the cache. Avoids "could not find a KV slot for the batch" issues. Bug #7545	2024-11-20 12:49:24 -08:00
Jesse Gross	3fc1dc0e6f	runner.go: Hard fail on errors rather than potentially infinite looping We try to recover from errors by dropping the tokens that caused the problem and re-trying. However, dropping the tokens is not correct and continuing often leads to infinite loops. To avoid, this we end the sequence if such a condition is detected, which is also surprising. At this point, it is better to just report the error. This will make it easier to find problems and the alternatives are perhaps even more surprising to users. This is not a very satisfactory solution either - we should isolate the error and return it to the user without killing the whole process. However, this is an incremental step and consistent with most other failures (which either manifest as abort() or panic).	2024-11-20 12:49:24 -08:00
Jesse Gross	7121dfa309	runner.go: Retry decoding after defragmentation if needed Fragmentation of the KV cache can occur due to cache shifting or different sequences getting processed. Decode uses a heuristic to decide if it should defrag. However, this heuristic isn't 100% accurate, so decoding can sometimes fail by surprise. For these cases, if decode indicates that there is no KV cache space, we should defrag and then try again.	2024-11-20 12:49:24 -08:00
Jesse Gross	5f68fcab12	runner.go: Use correct index when retrieving embedding results This doesn't have any impact currently because NUM_PARALLEL is forced to 1 for embeddings, so both indicies will always be 0.	2024-11-20 12:49:24 -08:00
Emir Sahin	ecf41eed05	readme: add llm-axe to community integrations (#5931 )	2024-11-20 10:53:14 -08:00
Marcus Ziadé	b8c66d3307	readme: add a swift community integration (#7383 )	2024-11-20 10:49:15 -08:00
thewh1teagle	303f4bc79e	readme: add vibe app to community integrations (#7607 )	2024-11-20 10:45:10 -08:00
Adarsh Mishra	d2a25206b1	readme: add opentalkgpt to community integrations (#7707 )	2024-11-20 10:42:55 -08:00
rohitanshu	2f0a8c8778	docs: fix minor typo in import.md (#7764 ) change 'containg' to 'containing'	2024-11-20 09:57:32 -08:00
Gordon Kamer	bfd30f4286	readme: add Abbey to community integrations (#7746 )	2024-11-19 21:37:15 -08:00
Jonathan Hecl	0ef17ede89	readme: add Gollama to community integrations (#7756 )	2024-11-19 21:31:43 -08:00
Daniel Hiltgen	909a88c5c0	Improve crash reporting (#7728 ) Many model crashes are masked behind "An existing connection was forcibly closed by the remote host" This captures that common error message and wires in any detected errors from the log. This also adds the deepseek context shift error to the known errors we capture.	2024-11-19 16:26:57 -08:00
Daniel Hiltgen	f602ab4de4	expose underlying error on embedding failure (#7743 ) Avoid a round-trip asking users for logs to see what went wrong.	2024-11-19 16:26:05 -08:00
Gabe Goodhart	807ace5b1f	fix(runner): Set logits to 0 if false on Batch.Add https://github.com/ollama/ollama/issues/7656 Branch: Granite3StoppingBug-7656 Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>	2024-11-19 15:45:37 -08:00
Blake Mizerany	4b8a2e341a	server: allow mixed-case model names on push, pull, cp, and create (#7676 ) This change allows for mixed-case model names to be pushed, pulled, copied, and created, which was previously disallowed because the Ollama registry was backed by a Docker registry that enforced a naming convention that disallowed mixed-case names, which is no longer the case. This does not break existing, intended, behaviors. Also, make TestCase test a story of creating, updating, pulling, and copying a model with case variations, ensuring the model's manifest is updated correctly, and not duplicated across different files with different case variations.	2024-11-19 15:05:57 -08:00
frob	e66c29261a	Better error suppresion when getting terminal colours (#7739 ) Co-authored-by: Richard Lyons <frob@cloudstaff.com>	2024-11-19 08:33:52 -08:00
Patrick Devine	712d63c3f0	update the docs (#7731 )	2024-11-18 21:17:38 -08:00
Patrick Sy	6cdf27d154	readme: add Alfred Ollama to community integrations (#7724 )	2024-11-18 19:33:23 -08:00
frob	5c18e66384	Notify the user if systemd is not running (#6693 ) Co-authored-by: Richard Lyons <frob@cloudstaff.com>	2024-11-18 15:02:41 -08:00
Daniel Hiltgen	35096a7eff	win: add right click menu support (#7727 ) Enable both left and right click on the pop-up menu	2024-11-18 14:39:52 -08:00
Daniel Hiltgen	81d55d3e4d	fix index out of range on zero layer metal load (#7696 ) If the model doesn't fit any layers on metal, and we load zero layers we would panic trying to look up the GPU size during scheduling ops	2024-11-18 11:48:13 -08:00
Vinh Nguyen	a14f76491d	readme: improve Community Integrations section (#7718 )	2024-11-17 19:30:22 -08:00
Nicolas Bonamy	760cfa27e5	readme: add Witsy and multi-llm-ts to community integrations (#7713 )	2024-11-17 16:33:10 -08:00
Darius Kocar	c9a5aca3da	readme: add Perfect Memory AI to community integrations (#7431 )	2024-11-17 15:19:26 -08:00
Tushar Adhatrao	d5da2ab7e8	readme: add ollama-haskell library to community integrations (#7451 )	2024-11-17 15:18:04 -08:00
Vinh Nguyen	1c04117114	readme: add the VT app to the community integrations section (#7706 )	2024-11-17 14:35:41 -08:00
Jeffrey Morgan	8b4b243f5f	server: fix warnings in prompt_test.go (#7710 )	2024-11-17 13:01:04 -08:00
Jeffrey Morgan	b42a596425	docs: add customization section in linux.md (#7709 )	2024-11-17 11:48:12 -08:00
Daniel Hiltgen	4759d879f2	Install support for jetpacks (#7632 ) Follow up to #7217 - merge after release	2024-11-15 16:47:54 -08:00
Jesse Gross	d875e99e46	runner.go: Propagate panics back to the user. This is a partial revert of `8a35bb92` "runner.go: Increase survivability of main processing loop", removing the panic handler. Although we want to avoid errors taking down the runner, we also should make the user aware of problems when they happen. In the future, we can restructure things so both parts are true.	2024-11-15 11:52:25 -08:00
Jesse Gross	8a35bb926e	runner.go: Increase survivability of main processing loop Currently, if an error occurs during the prep stages (such as tokenizing) of a single request, it will only affect that request. However, if an error happens during decoding, it can take down the entire runner. Instead, it's better to drop the tokens that triggered the error and try to keep going. However, we also need to stop when we run out of tokens, otherwise, this just causes an infinite loop. This is likely the cause of at least some of the hanging issues that have been reported. Bug #7573	2024-11-14 17:18:41 -08:00
Daniel Hiltgen	a0ea067b63	build: fix arm container image (#7674 ) Fix a rebase glitch from the old C++ runner build model	2024-11-14 16:02:01 -08:00
Patrick Devine	4efb98cb4f	add line numbers for parser errors (#7326 )	2024-11-14 13:59:44 -08:00
Bruce MacDonald	0679d491fe	chore(deps): bump golang.org/x dependencies (#7655 ) - golang.org/x/sync v0.3.0 -> v0.9.0 - golang.org/x/image v0.14.0 -> v0.22.0 - golang.org/x/text v0.15.0 -> v0.20.0	2024-11-14 13:58:25 -08:00
Jesse Gross	c25ffde91d	runner.go: Don't trim whitespace from inputs It's possible to get prompts that consist entirely of whitespace - this is most likely to happen when generating embeddings. Currently, we will trim this away, leaving an empty prompt, which will then generate an error. Generating embeddings from whitespace should not trigger an error, as this may break pipelines. It's better to just leave the whitespace in place and process what we are given. This is consistent with past versions of Ollama. Bug #7578	2024-11-14 11:23:06 -08:00
Jesse Gross	17b386a891	runner.go: Enforce NUM_PARALLEL directly in the runner NUM_PARALEL is currently enforced by the Ollama server process - it will only issue requests to the runner if the maximum number of concurrent requests has not been exceeded. Although this should be sufficient, it is good for the runner to protect its own data structures. Currently, if too many requests get through to the runner, they will just get stuck and never return. This may help with reports of Ollama hanging, though it is unclear how it would actually occur. Bug #7573	2024-11-14 11:21:59 -08:00
Michael Yang	549c2bdfcf	Merge pull request #7657 from ollama/mxyng/sync fix(mllama): sync backend between batches	2024-11-14 09:40:04 -08:00
Blake Mizerany	67691e410d	cmd: preserve exact bytes when displaying template/system layers (#7586 )	2024-11-13 23:53:30 -08:00
Michael Yang	5b3393b6a2	fix(mllama): sync backend between batches	2024-11-13 16:37:21 -08:00
Jesse Gross	d7eb05b936	runner.go: Fix off-by-one for num predicted	2024-11-12 11:35:57 -08:00
Daniel Hiltgen	636a743c2b	CI: give windows lint more time (#7635 ) It looks like 8 minutes isn't quite enough and we're seeing sporadic timeouts	2024-11-12 11:22:39 -08:00
Daniel Hiltgen	df011054fa	Jetpack support for Go server (#7217 ) This adds support for the Jetson JetPack variants into the Go runner	2024-11-12 10:31:52 -08:00
Daniel Hiltgen	ac07160c8d	doc: capture numeric group requirement (#6941 ) Docker uses the container filesystem for name resolution, so we can't guide users to use the name of the host group. Instead they must specify the numeric ID.	2024-11-12 09:13:23 -08:00
Daniel Hiltgen	6606e4243c	docs: Capture docker cgroup workaround (#7519 ) GPU support can break on some systems after a while. This captures a known workaround to solve the problem.	2024-11-12 09:12:50 -08:00
Jesse Gross	65973ceb64	runner.go: Make KV entry accounting more robust The structure of the accounting for KV cache shifting was carried over from the old runner but it now doesn't feel natural with the new runner. There are a number of invariants that should hold true but are difficult to reason about. There is at least one bug report that would imply that the invariants are not holding. This reduces the number of implicit assumptions and is more forgiving of unexpected situations. It also improves behavior around which input tokens are kept when truncation occurs. Bug #7545	2024-11-11 20:23:03 -08:00
Joey Zheng	bebef1e50d	readme: add aichat terminal app to community integrations (#7418 )	2024-11-11 16:44:46 -08:00
Evan	d48c1c5a44	api: fix typos in Go Doc comments (#7620 )	2024-11-11 16:21:58 -08:00
Prasad Bhalerao	36a8372b28	readme: add GoLamify to community integrations (#7521 )	2024-11-10 22:38:18 -08:00
Ivo Stoykov	4e94227b5d	readme: add browser extension that enables using Ollama for interacting with web pages (#5827 )	2024-11-10 22:14:22 -08:00
frances720	479d551766	docs: add mentions of Llama 3.2 (#7517 )	2024-11-10 19:04:23 -08:00
Evan	76b2b723b2	api: fix typo in python ClientFromEnvironment docs (#7604 )	2024-11-10 17:30:27 -08:00
Arhan Busam	b8d77cdeab	readme: add llama3.2-vision to model list (#7580 )	2024-11-10 13:36:25 -08:00
Jesse Gross	c2e8cbaa14	runner.go: Check for zero length images If we get a request with a zero length image, it will result in an out-of-bounds error when we pass the data to the image encoder.	2024-11-08 09:39:32 -08:00
Edward J. Schwartz	771fab1dd8	docs: update langchainpy.md with proper model name (#7527 )	2024-11-08 09:36:17 -08:00
Daniel Hiltgen	3a5239e6bf	Set macos min version for all architectures (#7579 )	2024-11-08 09:27:04 -08:00
Daniel Hiltgen	3d25e7bf8c	win: remove preview title from installer (#7529 ) This should have been in #7347 but was overlooked.	2024-11-07 14:26:47 -08:00
Daniel Hiltgen	1618700c5a	Workaround buggy P2P ROCm copy on windows (#7466 ) This enables the workaround code only for windows which should help windows users with muliple AMD GPUs	2024-11-07 14:26:31 -08:00
Daniel Hiltgen	b111aa5a91	Debug logging for nvcuda init (#7532 ) Some users are reporting crashes during nvcuda.dll initialization on windows. This should help narrow down where things are going bad.	2024-11-07 14:25:53 -08:00
Daniel Hiltgen	9e83e550e1	Align rocm compiler flags (#7467 ) Bring consistency with the old generate script behavior	2024-11-07 10:20:50 -08:00
Daniel Hiltgen	fc2a0715df	Be explicit for gpu library link dir (#7560 ) On linux nvcc isn't automatically linking to the same cuda version.	2024-11-07 09:20:40 -08:00
Jesse Gross	3020d2dc58	docs: OLLAMA_NEW_RUNNERS no longer exists	2024-11-06 14:39:02 -08:00
Jesse Gross	a909417602	runner.go: Remove unused arguments Now that server.cpp is gone, we don't need to keep passing arguments that were only ignored and only kept for compatibility.	2024-11-06 13:32:18 -08:00
Jesse Gross	6cd566872b	sched: Lift parallel restriction for multimodal models except mllama The Go runner does not have a problem with supporting parallel requests for most multimodal models. Now that we won't be potentially falling back to server.cpp, this restriction can be lifted. However, the new mllama model can't support parallel requests, so we will need to keep a restriction for that.	2024-11-06 13:32:18 -08:00