CUDA: verify CC is supported by target library (#13298 )

model: ministral w/ llama4 scaling (#13292 )
This change:
* fixes rope scaling in the mistral converter
* updates ministral to include llama4 scaling
* includes a new ministral parser for parsing reasoning and tool calling

---------

Co-authored-by: jmorganca <jmorganca@gmail.com>
2025-12-02 09:28:41 -08:00 · 2025-12-01 23:20:14 -08:00 · 2025-12-01 15:36:47 -08:00 · 2025-12-01 15:10:16 -08:00 · 2025-12-01 12:48:16 -08:00 · 2025-11-29 23:46:10 -05:00
17 changed files with 485 additions and 69 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -19,6 +19,8 @@ ml/backend/**/*.comp linguist-vendored
 ml/backend/**/*.glsl linguist-vendored
 ml/backend/**/CMakeLists.txt linguist-vendored

+app/webview linguist-vendored
+
 llama/build-info.cpp linguist-generated
 ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated

--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -11,7 +11,6 @@ linters:
    - errorlint
    - exptostd
    - gocheckcompilerdirectives
-    - gocritic
    - govet
    - ineffassign
    - intrange
--- a/api/client.go
+++ b/api/client.go
@@ -226,7 +226,14 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f

 		bts := scanner.Bytes()
 		if err := json.Unmarshal(bts, &errorResponse); err != nil {
-			return fmt.Errorf("unmarshal: %w", err)
+			if response.StatusCode >= http.StatusBadRequest {
+				return StatusError{
+					StatusCode:   response.StatusCode,
+					Status:       response.Status,
+					ErrorMessage: string(bts),
+				}
+			}
+			return errors.New(string(bts))
 		}

 		if response.StatusCode == http.StatusUnauthorized {
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -55,6 +55,7 @@ func TestClientFromEnvironment(t *testing.T) {
 type testError struct {
 	message    string
 	statusCode int
+	raw        bool // if true, write message as-is instead of JSON encoding
 }

 func (e testError) Error() string {
@@ -111,6 +112,20 @@ func TestClientStream(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "plain text error response",
+			responses: []any{
+				"internal server error",
+			},
+			wantErr: "internal server error",
+		},
+		{
+			name: "HTML error page",
+			responses: []any{
+				"<html><body>404 Not Found</body></html>",
+			},
+			wantErr: "404 Not Found",
+		},
 	}

 	for _, tc := range testCases {
@@ -135,6 +150,12 @@ func TestClientStream(t *testing.T) {
 						return
 					}

+					if str, ok := resp.(string); ok {
+						fmt.Fprintln(w, str)
+						flusher.Flush()
+						continue
+					}
+
 					if err := json.NewEncoder(w).Encode(resp); err != nil {
 						t.Fatalf("failed to encode response: %v", err)
 					}
@@ -173,9 +194,10 @@ func TestClientStream(t *testing.T) {

 func TestClientDo(t *testing.T) {
 	testCases := []struct {
-		name     string
-		response any
-		wantErr  string
+		name           string
+		response       any
+		wantErr        string
+		wantStatusCode int
 	}{
 		{
 			name: "immediate error response",
@@ -183,7 +205,8 @@ func TestClientDo(t *testing.T) {
 				message:    "test error message",
 				statusCode: http.StatusBadRequest,
 			},
-			wantErr: "test error message",
+			wantErr:        "test error message",
+			wantStatusCode: http.StatusBadRequest,
 		},
 		{
 			name: "server error response",
@@ -191,7 +214,8 @@ func TestClientDo(t *testing.T) {
 				message:    "internal error",
 				statusCode: http.StatusInternalServerError,
 			},
-			wantErr: "internal error",
+			wantErr:        "internal error",
+			wantStatusCode: http.StatusInternalServerError,
 		},
 		{
 			name: "successful response",
@@ -203,6 +227,26 @@ func TestClientDo(t *testing.T) {
 				Success: true,
 			},
 		},
+		{
+			name: "plain text error response",
+			response: testError{
+				message:    "internal server error",
+				statusCode: http.StatusInternalServerError,
+				raw:        true,
+			},
+			wantErr:        "internal server error",
+			wantStatusCode: http.StatusInternalServerError,
+		},
+		{
+			name: "HTML error page",
+			response: testError{
+				message:    "<html><body>404 Not Found</body></html>",
+				statusCode: http.StatusNotFound,
+				raw:        true,
+			},
+			wantErr:        "<html><body>404 Not Found</body></html>",
+			wantStatusCode: http.StatusNotFound,
+		},
 	}

 	for _, tc := range testCases {
@@ -210,11 +254,16 @@ func TestClientDo(t *testing.T) {
 			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				if errResp, ok := tc.response.(testError); ok {
 					w.WriteHeader(errResp.statusCode)
-					err := json.NewEncoder(w).Encode(map[string]string{
-						"error": errResp.message,
-					})
-					if err != nil {
-						t.Fatal("failed to encode error response:", err)
+					if !errResp.raw {
+						err := json.NewEncoder(w).Encode(map[string]string{
+							"error": errResp.message,
+						})
+						if err != nil {
+							t.Fatal("failed to encode error response:", err)
+						}
+					} else {
+						// Write raw message (simulates non-JSON error responses)
+						fmt.Fprint(w, errResp.message)
 					}
 					return
 				}
@@ -241,6 +290,15 @@ func TestClientDo(t *testing.T) {
 				if err.Error() != tc.wantErr {
 					t.Errorf("error message mismatch: got %q, want %q", err.Error(), tc.wantErr)
 				}
+				if tc.wantStatusCode != 0 {
+					if statusErr, ok := err.(StatusError); ok {
+						if statusErr.StatusCode != tc.wantStatusCode {
+							t.Errorf("status code mismatch: got %d, want %d", statusErr.StatusCode, tc.wantStatusCode)
+						}
+					} else {
+						t.Errorf("expected StatusError, got %T", err)
+					}
+				}
 				return
 			}

--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1430,7 +1430,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		latest.Summary()
 	}

-	return &api.Message{Role: role, Content: fullResponse.String()}, nil
+	return &api.Message{Role: role, Thinking: thinkingContent.String(), Content: fullResponse.String()}, nil
 }

 func generate(cmd *cobra.Command, opts runOptions) error {
--- a/convert/convert_mistral.go
+++ b/convert/convert_mistral.go
@@ -29,6 +29,15 @@ type mistral3Model struct {
 		SlidingWindow         *uint32 `json:"sliding_window"`
 		HiddenAct             string  `json:"hidden_act"`
 		VocabSize             uint32  `json:"vocab_size"`
+		RopeParameters        struct {
+			BetaFast                  float32 `json:"beta_fast"`
+			BetaSlow                  float32 `json:"beta_slow"`
+			Factor                    float32 `json:"factor"`
+			ScalingBeta               float32 `json:"llama_4_scaling_beta"`
+			OrigMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
+			RopeType                  string  `json:"rope_type"`
+			RopeTheta                 float32 `json:"rope_theta"`
+		} `json:"rope_parameters"`
 	} `json:"text_config"`
 	VisionModel struct {
 		NumAttentionHeads uint32  `json:"num_attention_heads"`
@@ -61,8 +70,13 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv["mistral3.attention.layer_norm_rms_epsilon"] = p.TextModel.RMSNormEPS
 	kv["mistral3.attention.key_length"] = p.TextModel.HeadDim
 	kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
-	kv["mistral3.rope.dimension_count"] = p.TextModel.HiddenSize / p.TextModel.NumHiddenLayers
-	kv["mistral3.rope.freq_base"] = p.TextModel.RopeTheta
+	kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
+	kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
+
+	if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
+		kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
+		kv["mistral3.rope.scaling_beta"] = p.TextModel.RopeParameters.ScalingBeta
+	}

 	// Vision configuration
 	kv["mistral3.vision.block_count"] = p.VisionModel.NumHiddenLayers
--- a/discover/runner.go
+++ b/discover/runner.go
@@ -65,6 +65,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 		}

 		slog.Info("discovering available GPUs...")
+		detectIncompatibleLibraries()

 		// Warn if any user-overrides are set which could lead to incorrect GPU discovery
 		overrideWarnings()
@@ -98,6 +99,9 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 					continue
 				} else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack {
 					continue
+				} else if jetpack == "" && strings.Contains(filepath.Base(dir), "cuda_jetpack") {
+					slog.Debug("jetpack not detected (set JETSON_JETPACK or OLLAMA_LLM_LIBRARY to override), skipping", "libDir", dir)
+					continue
 				} else if !envconfig.EnableVulkan() && strings.Contains(filepath.Base(dir), "vulkan") {
 					slog.Info("experimental Vulkan support disabled.  To enable, set OLLAMA_VULKAN=1")
 					continue
@@ -484,3 +488,16 @@ func overrideWarnings() {
 		slog.Warn("if GPUs are not correctly discovered, unset and try again")
 	}
 }
+
+func detectIncompatibleLibraries() {
+	if runtime.GOOS != "windows" {
+		return
+	}
+	basePath, err := exec.LookPath("ggml-base.dll")
+	if err != nil || basePath == "" {
+		return
+	}
+	if !strings.HasPrefix(basePath, ml.LibOllamaPath) {
+		slog.Warn("potentially incompatible library detected in PATH", "location", basePath)
+	}
+}
--- a/docs/faq.mdx
+++ b/docs/faq.mdx
@@ -57,8 +57,13 @@ ollama ps
 ```

 <Info>
-  **Output**: ``` NAME ID SIZE PROCESSOR UNTIL llama3:70b bcfb190ca3a7 42 GB
-  100% GPU 4 minutes from now ```
+
+**Output**:
+
+```
+NAME        ID            SIZE    PROCESSOR   UNTIL
+llama3:70b  bcfb190ca3a7  42 GB   100% GPU    4 minutes from now
+```
 </Info>

 The `Processor` column will show which memory the model was loaded in to:
@@ -385,4 +390,4 @@ Ollama for Windows and macOS register as a login item during installation.  You
 - In `Task Manager` go to the `Startup apps` tab, search for `ollama` then click `Disable`

 **MacOS**
- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
+- Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background`, then click the slider to disable.
--- a/docs/modelfile.mdx
+++ b/docs/modelfile.mdx
@@ -149,9 +149,6 @@ PARAMETER <parameter> <parametervalue>

 | Parameter      | Description                                                                                                                                                                                                                                                                                                                                                                     | Value Type | Example Usage        |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- |
-| mirostat       | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)                                                                                                                                                                                                                                                                 | int        | mirostat 0           |
-| mirostat_eta   | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1)                                                                                                                                                | float      | mirostat_eta 0.1     |
-| mirostat_tau   | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0)                                                                                                                                                                                                                                 | float      | mirostat_tau 5.0     |
 | num_ctx        | Sets the size of the context window used to generate the next token. (Default: 2048)                                                                                                                                                                                                                                                                                            | int        | num_ctx 4096         |
 | repeat_last_n  | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)                                                                                                                                                                                                                                                                   | int        | repeat_last_n 64     |
 | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)                                                                                                                                                                                             | float      | repeat_penalty 1.1   |
--- a/llm/server.go
+++ b/llm/server.go
@@ -170,11 +170,6 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st

 	opts.NumBatch = min(opts.NumBatch, opts.NumCtx)

-	if f.KV().Architecture() == "nomic-bert" {
-		opts.NumBatch = opts.NumCtx
-		slog.Debug("nomic-bert model detected, setting batch size equal to context length", "num_batch", opts.NumBatch, "num_ctx", opts.NumCtx)
-	}
-
 	loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}

 	defaultThreads := systemInfo.ThreadCount
--- a/ml/device.go
+++ b/ml/device.go
@@ -509,11 +509,9 @@ func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string {
 // to crash at inference time and requires deeper validation before we include
 // it in the supported devices list.
 func (d DeviceInfo) NeedsInitValidation() bool {
-	// At this time the only library we know needs a 2nd pass is ROCm since
-	// rocblas will crash on unsupported devices.  We want to find those crashes
-	// during bootstrap discovery so we can eliminate those GPUs before the user
-	// tries to run inference on them
-	return d.Library == "ROCm"
+	// ROCm: rocblas will crash on unsupported devices.
+	// CUDA: verify CC is supported by the version of the library
+	return d.Library == "ROCm" || d.Library == "CUDA"
 }

 // Set the init validation environment variable
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -159,8 +159,9 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
+	positionsScale := m.getScale(ctx, batch.Positions)

-	return m.TextModel.Forward(ctx, batch.Inputs, positions, batch.Outputs, batch, m.Cache), nil
+	return m.TextModel.Forward(ctx, batch.Inputs, positions, positionsScale, batch.Outputs, batch, m.Cache), nil
 }

 func init() {
--- a/model/models/mistral3/model_text.go
+++ b/model/models/mistral3/model_text.go
@@ -16,6 +16,8 @@ type TextOptions struct {
 	hiddenSize, numHeads, numKVHeads int
 	headDim, ropeDim                 int
 	eps, ropeBase, ropeScale         float32
+	ropeOrigPosEmbeddings            int
+	ropeScalingBeta                  float32
 }

 type TextModel struct {
@@ -34,7 +36,7 @@ type SelfAttention struct {
 	Output *nn.Linear `gguf:"attn_output"`
 }

-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs, positionsScale ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)

@@ -49,6 +51,10 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

+	if opts.ropeOrigPosEmbeddings > 0 {
+		q = q.Mul(ctx, positionsScale)
+	}
+
 	kqv := nn.Attention(ctx, q, k, v, 1.0/math.Sqrt(float64(headDim)), cache)
 	kqv = kqv.Reshape(ctx, headDim*opts.numHeads, batchSize)
 	return sa.Output.Forward(ctx, kqv)
@@ -76,11 +82,11 @@ type Layer struct {
 	MLP           *MLP
 }

-func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, positionsScale, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, positionsScale, cache, opts)

 	// In the final layer (outputs != nil), optimize by pruning to just the token positions
 	// we need logits for.
@@ -97,7 +103,7 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 	return hiddenState.Add(ctx, residual)
 }

-func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
+func (m *TextModel) Forward(ctx ml.Context, inputs, positions, positionsScale, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)

 	// image embeddings
@@ -114,25 +120,36 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 			lastLayerOutputs = outputs
 		}

-		hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, cache, m.TextOptions)
+		hiddenState = layer.Forward(ctx, hiddenState, positions, positionsScale, lastLayerOutputs, cache, m.TextOptions)
 	}

 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
 	return m.Output.Forward(ctx, hiddenState)
 }

+func (m *TextModel) getScale(ctx ml.Context, positions []int32) ml.Tensor {
+	posScale := make([]float32, len(positions))
+	for n, pos := range positions {
+		interval := math.Floor(float64(pos) / float64(m.ropeOrigPosEmbeddings))
+		posScale[n] = float32(1.0 + float64(m.ropeScalingBeta)*math.Log(1.0+interval))
+	}
+	return ctx.Input().FromFloats(posScale, 1, 1, len(posScale))
+}
+
 func newTextModel(c fs.Config) *TextModel {
 	return &TextModel{
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
-			hiddenSize: int(c.Uint("embedding_length")),
-			numHeads:   int(c.Uint("attention.head_count")),
-			numKVHeads: int(c.Uint("attention.head_count_kv")),
-			headDim:    int(c.Uint("attention.key_length")),
-			ropeDim:    int(c.Uint("rope.dimension_count")),
-			eps:        c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:   c.Float("rope.freq_base"),
-			ropeScale:  c.Float("rope.scaling.factor", 1),
+			hiddenSize:            int(c.Uint("embedding_length")),
+			numHeads:              int(c.Uint("attention.head_count")),
+			numKVHeads:            int(c.Uint("attention.head_count_kv")),
+			headDim:               int(c.Uint("attention.key_length")),
+			ropeDim:               int(c.Uint("rope.dimension_count")),
+			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:              c.Float("rope.freq_base"),
+			ropeScale:             c.Float("rope.scaling.factor", 1),
+			ropeOrigPosEmbeddings: int(c.Uint("rope.scaling.original_context_length")),
+			ropeScalingBeta:       c.Float("rope.scaling_beta"),
 		},
 	}
 }
--- a/model/parsers/ministral.go
+++ b/model/parsers/ministral.go
@@ -0,0 +1,136 @@
+package parsers
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/ollama/ollama/api"
+)
+
+type ministralParserState int
+
+const (
+	ministralCollectingContent = iota
+	ministralCollectingThinkingContent
+	ministralCollectingToolName
+	ministralCollectingToolArgs
+)
+
+type MinistralParser struct {
+	state              ministralParserState
+	buffer             strings.Builder
+	tools              []api.Tool
+	hasThinkingSupport bool
+	currentTool        *api.Tool
+}
+
+func (p *MinistralParser) HasToolSupport() bool {
+	return true
+}
+
+func (p *MinistralParser) HasThinkingSupport() bool {
+	return p.hasThinkingSupport
+}
+
+func (p *MinistralParser) setInitialState(lastMessage *api.Message) {
+	prefill := lastMessage != nil && lastMessage.Role == "assistant"
+	if !p.HasThinkingSupport() {
+		p.state = ministralCollectingContent
+		return
+	}
+
+	if prefill && lastMessage.Content != "" {
+		p.state = ministralCollectingContent
+		return
+	}
+
+	p.state = ministralCollectingThinkingContent
+}
+
+func (p *MinistralParser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
+	p.tools = tools
+	p.setInitialState(lastMessage)
+	return tools
+}
+
+func toolByName(tools []api.Tool, n string) (*api.Tool, error) {
+	for i := range tools {
+		if tools[i].Function.Name == n {
+			return &tools[i], nil
+		}
+	}
+	return nil, fmt.Errorf("tool '%s' not found", n)
+}
+
+func (p *MinistralParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
+	p.buffer.WriteString(s)
+
+	switch p.state {
+	case ministralCollectingContent:
+		if strings.Contains(p.buffer.String(), "[TOOL_CALLS]") {
+			before, _ := splitAtTag(&p.buffer, "[TOOL_CALLS]", false)
+			if before != "" {
+				return before, "", calls, nil
+			}
+			p.state = ministralCollectingToolName
+		} else if strings.Contains(p.buffer.String(), "[THINK]") {
+			p.state = ministralCollectingThinkingContent
+			return "", "", calls, nil
+		} else {
+			p.buffer.Reset()
+			return s, "", calls, nil
+		}
+	case ministralCollectingThinkingContent:
+		if strings.Contains(p.buffer.String(), "[/THINK]") {
+			thinkingContent, after := splitAtTag(&p.buffer, "[/THINK]", true)
+			p.state = ministralCollectingContent
+			if after != "" {
+				p.buffer.Reset()
+				return after, thinkingContent, calls, nil
+			}
+			return "", thinkingContent, calls, nil
+		} else {
+			p.buffer.Reset()
+			return "", s, calls, nil
+		}
+	case ministralCollectingToolName:
+		if strings.Contains(p.buffer.String(), "[ARGS]") {
+			name, _ := splitAtTag(&p.buffer, "[ARGS]", false)
+
+			t, err := toolByName(p.tools, name)
+			if err != nil {
+				return "", "", calls, err
+			}
+			p.currentTool = t
+			p.state = ministralCollectingToolArgs
+			return "", "", calls, nil
+		}
+		return "", "", calls, nil
+	case ministralCollectingToolArgs:
+		if strings.Contains(p.buffer.String(), "}") {
+			before, _ := splitAtTag(&p.buffer, "}", false)
+			before += "}"
+
+			var data map[string]any
+			if err := json.Unmarshal([]byte(before), &data); err != nil {
+				// todo - throw a better error
+				return "", "", calls, err
+			}
+
+			p.state = ministralCollectingContent
+
+			call := api.ToolCall{
+				Function: api.ToolCallFunction{
+					Name:      p.currentTool.Function.Name,
+					Arguments: api.ToolCallFunctionArguments(data),
+				},
+			}
+			calls = append(calls, call)
+			return "", "", calls, nil
+		}
+		return "", "", calls, nil
+	}
+
+	return p.buffer.String(), thinking, calls, nil
+}
--- a/model/parsers/parsers.go
+++ b/model/parsers/parsers.go
@@ -1,6 +1,9 @@
 package parsers

 import (
+	"strings"
+	"unicode"
+
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/harmony"
 )
@@ -38,16 +41,17 @@ func ParserForName(name string) Parser {
 	if parser, ok := registry.constructors[name]; ok {
 		return parser()
 	}
+	var p Parser
+
 	switch name {
 	case "qwen3-coder":
-		parser := &Qwen3CoderParser{}
-		return parser
+		p = &Qwen3CoderParser{}
 	case "qwen3-vl-instruct":
-		parser := &Qwen3VLParser{hasThinkingSupport: false}
-		return parser
+		p = &Qwen3VLParser{hasThinkingSupport: false}
 	case "qwen3-vl-thinking":
-		parser := &Qwen3VLParser{hasThinkingSupport: true}
-		return parser
+		p = &Qwen3VLParser{hasThinkingSupport: true}
+	case "ministral":
+		p = &MinistralParser{hasThinkingSupport: false}
 	case "passthrough":
 		return &PassthroughParser{}
 	case "harmony":
@@ -57,6 +61,7 @@ func ParserForName(name string) Parser {
 	default:
 		return nil
 	}
+	return p
 }

 type PassthroughParser struct{}
@@ -76,3 +81,20 @@ func (p *PassthroughParser) HasToolSupport() bool {
 func (p *PassthroughParser) HasThinkingSupport() bool {
 	return false
 }
+
+func splitAtTag(sb *strings.Builder, tag string, trimAfter bool) (string, string) {
+	split := strings.SplitN(sb.String(), tag, 2)
+	if len(split) == 1 {
+		sb.Reset()
+		return split[0], ""
+	}
+	before := split[0]
+	before = strings.TrimRightFunc(before, unicode.IsSpace)
+	after := split[1]
+	if trimAfter {
+		after = strings.TrimLeftFunc(after, unicode.IsSpace)
+	}
+	sb.Reset()
+	sb.WriteString(after)
+	return before, after // return events
+}
--- a/model/parsers/parsers_test.go
+++ b/model/parsers/parsers_test.go
@@ -1,6 +1,7 @@
 package parsers

 import (
+	"strings"
 	"testing"

 	"github.com/ollama/ollama/api"
@@ -95,3 +96,164 @@ func TestUnknownParserReturnsNil(t *testing.T) {
 		t.Error("expected nil for unknown parser")
 	}
 }
+
+func TestSplitAtTag(t *testing.T) {
+	tests := []struct {
+		name       string
+		input      string
+		tag        string
+		trimAfter  bool
+		wantBefore string
+		wantAfter  string
+		wantSB     string // expected content of strings.Builder after operation
+	}{
+		{
+			name:       "basic split with trimAfter true",
+			input:      "hello <!-- split --> world",
+			tag:        "<!-- split -->",
+			trimAfter:  true,
+			wantBefore: "hello",
+			wantAfter:  "world",
+			wantSB:     "world",
+		},
+		{
+			name:       "basic split with trimAfter false",
+			input:      "hello <!-- split -->   world",
+			tag:        "<!-- split -->",
+			trimAfter:  false,
+			wantBefore: "hello",
+			wantAfter:  "   world",
+			wantSB:     "   world",
+		},
+		{
+			name:       "tag at beginning with trimAfter true",
+			input:      "<!-- split -->world",
+			tag:        "<!-- split -->",
+			trimAfter:  true,
+			wantBefore: "",
+			wantAfter:  "world",
+			wantSB:     "world",
+		},
+		{
+			name:       "tag at beginning with trimAfter false",
+			input:      "<!-- split -->   world",
+			tag:        "<!-- split -->",
+			trimAfter:  false,
+			wantBefore: "",
+			wantAfter:  "   world",
+			wantSB:     "   world",
+		},
+		{
+			name:       "tag at end with trimAfter true",
+			input:      "hello <!-- split -->",
+			tag:        "<!-- split -->",
+			trimAfter:  true,
+			wantBefore: "hello",
+			wantAfter:  "",
+			wantSB:     "",
+		},
+		{
+			name:       "tag at end with trimAfter false",
+			input:      "hello <!-- split -->",
+			tag:        "<!-- split -->",
+			trimAfter:  false,
+			wantBefore: "hello",
+			wantAfter:  "",
+			wantSB:     "",
+		},
+		{
+			name:       "multiple tags splits at first occurrence",
+			input:      "hello <!-- split --> world <!-- split --> end",
+			tag:        "<!-- split -->",
+			trimAfter:  true,
+			wantBefore: "hello",
+			wantAfter:  "world <!-- split --> end",
+			wantSB:     "world <!-- split --> end",
+		},
+		{
+			name:       "tag not present",
+			input:      "hello world",
+			tag:        "<!-- split -->",
+			trimAfter:  true,
+			wantBefore: "hello world",
+			wantAfter:  "",
+			wantSB:     "",
+		},
+		{
+			name:       "empty input",
+			input:      "",
+			tag:        "<!-- split -->",
+			trimAfter:  true,
+			wantBefore: "",
+			wantAfter:  "",
+			wantSB:     "",
+		},
+		{
+			name:       "only whitespace before tag",
+			input:      "   \t\n<!-- split -->world",
+			tag:        "<!-- split -->",
+			trimAfter:  true,
+			wantBefore: "",
+			wantAfter:  "world",
+			wantSB:     "world",
+		},
+		{
+			name:       "only whitespace after tag with trimAfter true",
+			input:      "hello<!-- split -->   \t\n",
+			tag:        "<!-- split -->",
+			trimAfter:  true,
+			wantBefore: "hello",
+			wantAfter:  "",
+			wantSB:     "",
+		},
+		{
+			name:       "only whitespace after tag with trimAfter false",
+			input:      "hello<!-- split -->   \t\n",
+			tag:        "<!-- split -->",
+			trimAfter:  false,
+			wantBefore: "hello",
+			wantAfter:  "   \t\n",
+			wantSB:     "   \t\n",
+		},
+		{
+			name:       "complex whitespace trimming",
+			input:      "  hello \t\n <!-- split --> \n\t world  ",
+			tag:        "<!-- split -->",
+			trimAfter:  true,
+			wantBefore: "  hello",
+			wantAfter:  "world  ",
+			wantSB:     "world  ",
+		},
+		{
+			name:       "tag with special characters",
+			input:      "text <tag attr=\"value\"> more text",
+			tag:        "<tag attr=\"value\">",
+			trimAfter:  true,
+			wantBefore: "text",
+			wantAfter:  "more text",
+			wantSB:     "more text",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			sb := &strings.Builder{}
+			sb.WriteString(tt.input)
+
+			before, after := splitAtTag(sb, tt.tag, tt.trimAfter)
+
+			// Check return values
+			if before != tt.wantBefore {
+				t.Errorf("splitAtTag() before = %q, want %q", before, tt.wantBefore)
+			}
+			if after != tt.wantAfter {
+				t.Errorf("splitAtTag() after = %q, want %q", after, tt.wantAfter)
+			}
+
+			// Check strings.Builder state
+			if sb.String() != tt.wantSB {
+				t.Errorf("strings.Builder after split = %q, want %q", sb.String(), tt.wantSB)
+			}
+		})
+	}
+}
--- a/model/parsers/qwen3vl.go
+++ b/model/parsers/qwen3vl.go
@@ -70,7 +70,6 @@ func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking strin
 	p.buffer.WriteString(s)
 	events := p.parseEvents()

-	var toolCalls []api.ToolCall
 	var contentSb strings.Builder
 	var thinkingSb strings.Builder
 	for _, event := range events {
@@ -81,7 +80,7 @@ func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking strin
 				slog.Warn("qwen tool call parsing failed", "error", err)
 				return "", "", nil, err
 			}
-			toolCalls = append(toolCalls, toolCall)
+			calls = append(calls, toolCall)
 		case qwenEventThinkingContent:
 			thinkingSb.WriteString(event.content)
 		case qwenEventContent:
@@ -91,7 +90,7 @@ func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking strin
 		}
 	}

-	return contentSb.String(), thinkingSb.String(), toolCalls, nil
+	return contentSb.String(), thinkingSb.String(), calls, nil
 }

 func (p *Qwen3VLParser) parseEvents() []qwenEvent {
@@ -113,19 +112,6 @@ func (p *Qwen3VLParser) parseEvents() []qwenEvent {
 	return all
 }

-func splitAtTag(p *Qwen3VLParser, tag string, trimAfter bool) (string, string) {
-	split := strings.SplitN(p.buffer.String(), tag, 2)
-	before := split[0]
-	before = strings.TrimRightFunc(before, unicode.IsSpace)
-	after := split[1]
-	if trimAfter {
-		after = strings.TrimLeftFunc(after, unicode.IsSpace)
-	}
-	p.buffer.Reset()
-	p.buffer.WriteString(after)
-	return before, after // return events
-}
-
 func (p *Qwen3VLParser) eatLeadingWhitespaceAndTransitionTo(nextState qwenParserState) ([]qwenEvent, bool) {
 	trimmed := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace)
 	p.buffer.Reset()
@@ -144,7 +130,7 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 	case CollectingContent:
 		if strings.Contains(p.buffer.String(), toolOpenTag) {
 			// events = emitContentBeforeTag(p, events, toolOpenTag)
-			before, _ := splitAtTag(p, toolOpenTag, false)
+			before, _ := splitAtTag(&p.buffer, toolOpenTag, false)
 			if len(before) > 0 {
 				events = append(events, qwenEventContent{content: before})
 			}
@@ -195,7 +181,7 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 		}
 	case CollectingThinkingContent:
 		if strings.Contains(p.buffer.String(), thinkingCloseTag) {
-			thinking, remaining := splitAtTag(p, thinkingCloseTag, true)
+			thinking, remaining := splitAtTag(&p.buffer, thinkingCloseTag, true)
 			if len(thinking) > 0 {
 				events = append(events, qwenEventThinkingContent{content: thinking})
 			}
Author	SHA1	Message	Date
Daniel Hiltgen	f8f1071818	CUDA: verify CC is supported by target library (#13298 )	2025-12-02 09:28:41 -08:00
Patrick Devine	d3e0a0dee4	model: ministral w/ llama4 scaling (#13292 ) This change: * fixes rope scaling in the mistral converter * updates ministral to include llama4 scaling * includes a new ministral parser for parsing reasoning and tool calling --------- Co-authored-by: jmorganca <jmorganca@gmail.com>	2025-12-01 23:20:14 -08:00
Daniel Hiltgen	554172759c	win: warn if ggml-base detected in PATH (#13289 ) If the user has somehow installed another GGML-based app which places a ggml-base lib somewhere in their PATH, we can experience runtime problems due to incompatibilities. This change adds a warning message if we detect a ggml-base outside of our install location to aid in troubleshooting.	2025-12-01 15:36:47 -08:00
Bruce MacDonald	5b6a8e6001	api/client: handle non-json streaming errors (#13007 ) While processing the response stream during a chat or generation, if an error occurs it is parsed and returned to the user. The issue with the existing code is that it assumed the response would be valid JSON, which is not a safe assumption and caused cryptic error messages to be displayed due to parsing failures: `invalid character 'i' looking for beginning of value` This change updates the stream function to return the raw error string if it can't be parsed as JSON. This should help with debugging issues by making sure the actual error reaches the user.	2025-12-01 15:10:16 -08:00
Daniel Hiltgen	467bbc0dd5	jetpack: require exact match or skip cuda_jetpack* (#13288 ) The cuda_jetpack libs will enumerate discrete GPUs on SBSA systems which leads to runtime failures of missing kernels. This fix requires an exact match to enable jetpacks instead of relying on enumeration to filter out supported libraries.	2025-12-01 12:48:16 -08:00
Jeffrey Morgan	6d9f9323c5	.gitattributes: add app/webview to linguist-vendored (#13274 )	2025-11-29 23:46:10 -05:00
Ondrej Kokes	0c2489605d	docs: fix output formatting in faq.mdx (#13231 ) There were a few Markdown typos in one FAQ answer. It now renders as a proper ASCII table.	2025-11-28 19:19:21 -05:00
EntropyYue	8b1b89a984	docs: remove deprecated parameters (#13237 )	2025-11-26 11:03:09 +09:00