CI: replace clang compiler for windows (#12495)

llm: Support KV cache quantization with gpt-oss
With the new version of GGML in #12245, KV cache quantization no longer causes a fallback to CPU.
2025-10-04 09:18:42 -07:00 · 2025-10-03 16:31:58 -07:00 · 2025-10-03 14:20:06 -07:00 · 2025-10-03 12:17:21 -07:00 · 2025-10-03 12:05:34 -07:00 · 2025-10-03 07:28:40 -07:00
5 changed files with 147 additions and 39 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -94,7 +94,7 @@ jobs:
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
            rocm-version: '6.2'
            flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
-            runner_dir: ''
+            runner_dir: 'rocm'
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    env:
@@ -163,7 +163,7 @@ jobs:
          cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}"
          cmake --build --parallel --preset "${{ matrix.preset }}"
          cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
-          rm -force dist\lib\ollama\rocm\rocblas\library\*gfx906*
+          Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
        env:
          CMAKE_GENERATOR: Ninja
      - uses: actions/upload-artifact@v4
@@ -176,19 +176,19 @@ jobs:
      matrix:
        os: [windows]
        arch: [amd64, arm64]
+        include:
+        - os: windows
+          arch: amd64
+          llvmarch: x86_64
+        - os: windows
+          arch: arm64
+          llvmarch: aarch64
    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
    environment: release
    needs: [setup-environment]
    env:
      GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
    steps:
-      - name: Install AMD64 system dependencies
-        if: matrix.arch == 'amd64'
-        run: |
-          $ErrorActionPreference = "Stop"
-          Start-Process "C:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
-          echo "C:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install ARM64 system dependencies
        if: matrix.arch == 'arm64'
        run: |
@@ -200,15 +200,25 @@ jobs:

          choco install -y --no-progress git gzip
          echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-
-          Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip"
-          Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip -DestinationPath "C:\Program Files\"
-          $installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt-aarch64").path
-          echo $installPath\bin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install clang and gcc-compat
+        run: |
+          $ErrorActionPreference = "Stop"
+          Set-ExecutionPolicy Bypass -Scope Process -Force
+          Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-${{ matrix.llvmarch }}.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt.zip"
+          Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt.zip -DestinationPath "C:\Program Files\"
+          $installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt*").path
+          echo "$installPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
+      - name: Verify gcc is actually clang
+        run: |
+          gcc -v 
+          if (((& gcc -v 2>&1) -join "`n") -notmatch 'clang') {
+            echo "ERROR: GCC must be clang for proper utf16 handling"
+            exit 1
+          }
      - run: |
          go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ .
      - uses: actions/upload-artifact@v4
--- a/discover/runner.go
+++ b/discover/runner.go
@@ -330,6 +330,9 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
 		}
 	}

+	// Apply any iGPU workarounds
+	iGPUWorkarounds(devices)
+
 	return devices
 }

@@ -540,3 +543,32 @@ func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceIn
 		}
 	}
 }
+
+func iGPUWorkarounds(devices []ml.DeviceInfo) {
+	// short circuit if we have no iGPUs
+	anyiGPU := false
+	for i := range devices {
+		if devices[i].Integrated {
+			anyiGPU = true
+			break
+		}
+	}
+	if !anyiGPU {
+		return
+	}
+
+	memInfo, err := GetCPUMem()
+	if err != nil {
+		slog.Debug("failed to fetch system memory information for iGPU", "error", err)
+		return
+	}
+	for i := range devices {
+		if !devices[i].Integrated {
+			continue
+		}
+		// NVIDIA iGPUs return useless free VRAM data which ignores system buff/cache
+		if devices[i].Library == "CUDA" {
+			devices[i].FreeMemory = memInfo.FreeMemory
+		}
+	}
+}
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -870,11 +870,6 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
 		return true
 	}

-	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
-		// gpt-oss uses attention with sinks which does not support quantized cache types
-		slog.Warn("model only supports non-quantized cache types", "model", arch)
-		return false
-	}
 	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
 }

--- a/model/models/deepseek2/model.go
+++ b/model/models/deepseek2/model.go
@@ -150,7 +150,9 @@ func (moe *sparse) Moe(ctx ml.Context, hiddenStates, topKIndices, topKWeights ml
 }

 func (moe *sparse) topKIndices(ctx ml.Context, scores ml.Tensor, opts *Options) ml.Tensor {
-	scores = scores.Add(ctx, moe.ExpProbsBias)
+	if moe.ExpProbsBias != nil {
+		scores = scores.Add(ctx, moe.ExpProbsBias)
+	}
 	topKIndices := scores.TopK(ctx, opts.numExpertsUsed)
 	return topKIndices
 }
--- a/template/template_test.go
+++ b/template/template_test.go
@@ -154,24 +154,55 @@ func TestTemplate(t *testing.T) {
 }

 func TestParse(t *testing.T) {
-	cases := []struct {
+	validCases := []struct {
+		name     string
 		template string
 		vars     []string
 	}{
-		{"{{ .Prompt }}", []string{"prompt", "response"}},
-		{"{{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system"}},
-		{"{{ .System }} {{ .Prompt }} {{ .Response }}", []string{"prompt", "response", "system"}},
-		{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}},
-		{"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}},
-		{"{{ range .Messages }}{{ if eq .Role \"tool\" }}Tool Result: {{ .ToolName }} {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role", "toolname"}},
-		{`{{- range .Messages }}
+		{
+			name:     "PromptOnly",
+			template: "{{ .Prompt }}",
+			vars:     []string{"prompt", "response"},
+		},
+		{
+			name:     "SystemAndPrompt",
+			template: "{{ .System }} {{ .Prompt }}",
+			vars:     []string{"prompt", "response", "system"},
+		},
+		{
+			name:     "PromptResponseSystem",
+			template: "{{ .System }} {{ .Prompt }} {{ .Response }}",
+			vars:     []string{"prompt", "response", "system"},
+		},
+		{
+			name:     "ToolsBlock",
+			template: "{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}",
+			vars:     []string{"prompt", "response", "system", "tools"},
+		},
+		{
+			name:     "MessagesRange",
+			template: "{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}",
+			vars:     []string{"content", "messages", "role"},
+		},
+		{
+			name:     "ToolResultConditional",
+			template: "{{ range .Messages }}{{ if eq .Role \"tool\" }}Tool Result: {{ .ToolName }} {{ .Content }}{{ end }}{{ end }}",
+			vars:     []string{"content", "messages", "role", "toolname"},
+		},
+		{
+			name: "MultilineSystemUserAssistant",
+			template: `{{- range .Messages }}
 {{- if eq .Role "system" }}SYSTEM:
 {{- else if eq .Role "user" }}USER:
 {{- else if eq .Role "assistant" }}ASSISTANT:
-{{- else if eq .Role "tool" }}TOOL: 
+{{- else if eq .Role "tool" }}TOOL:
 {{- end }} {{ .Content }}
-{{- end }}`, []string{"content", "messages", "role"}},
-		{`{{- if .Messages }}
+{{- end }}`,
+			vars: []string{"content", "messages", "role"},
+		},
+		{
+			name: "ChatMLLike",
+			template: `{{- if .Messages }}
 {{- range .Messages }}<|im_start|>{{ .Role }}
 {{ .Content }}<|im_end|>
 {{ end }}<|im_start|>assistant
@@ -182,22 +213,60 @@ func TestParse(t *testing.T) {
 {{ .Prompt }}<|im_end|>
 {{ end }}<|im_start|>assistant
 {{ .Response }}<|im_end|>
-{{- end -}}`, []string{"content", "messages", "prompt", "response", "role", "system"}},
+{{- end -}}`,
+			vars: []string{"content", "messages", "prompt", "response", "role", "system"},
+		},
 	}

-	for _, tt := range cases {
-		t.Run("", func(t *testing.T) {
+	for _, tt := range validCases {
+		tt := tt
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
 			tmpl, err := Parse(tt.template)
 			if err != nil {
-				t.Fatal(err)
+				t.Fatalf("Parse returned unexpected error: %v", err)
 			}

-			v, err := tmpl.Vars()
+			gotVars, err := tmpl.Vars()
 			if err != nil {
-				t.Fatal(err)
+				t.Fatalf("Vars returned unexpected error: %v", err)
 			}
-			if diff := cmp.Diff(v, tt.vars); diff != "" {
-				t.Errorf("mismatch (-got +want):\n%s", diff)
+
+			if diff := cmp.Diff(gotVars, tt.vars); diff != "" {
+				t.Errorf("Vars mismatch (-got +want):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestParseError(t *testing.T) {
+	invalidCases := []struct {
+		name     string
+		template string
+		errorStr string
+	}{
+		{
+			"TemplateNotClosed",
+			"{{ .Prompt ",
+			"unclosed action",
+		},
+		{
+			"Template",
+			`{{define "x"}}{{template "x"}}{{end}}{{template "x"}}`,
+			"undefined template specified",
+		},
+	}
+
+	for _, tt := range invalidCases {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := Parse(tt.template)
+			if err == nil {
+				t.Fatalf("expected Parse to return an error for an invalid template, got nil")
+			}
+
+			if !strings.Contains(strings.ToLower(err.Error()), strings.ToLower(tt.errorStr)) {
+				t.Errorf("unexpected error message.\n got: %q\n want substring (case‑insensitive): %q", err.Error(), tt.errorStr)
 			}
 		})
 	}
Author	SHA1	Message	Date
Daniel Hiltgen	ae5e0f0889	CI: replace clang compiler for windows (#12495)	2025-10-04 09:18:42 -07:00
Jesse Gross	19e6796eac	llm: Support KV cache quantization with gpt-oss With the new version of GGML in #12245, KV cache quantization no longer causes a fallback to CPU.	2025-10-03 16:31:58 -07:00
Grace	33801c1597	Fixed Deepseek2 adding nil tensor error	2025-10-03 14:20:06 -07:00
Daniel Hiltgen	e4340667e3	Workaround broken NVIDIA iGPU free VRAM data (#12490) The CUDA APIs for reporting free VRAM are useless on NVIDIA iGPU systems as they only return the kernel's actual free memory and ignore buff/cache allocations, which on a typical system will quickly fill up most of the free system memory. As a result, we incorrectly think there's very little available for GPU allocations, which is wrong.	2025-10-03 12:17:21 -07:00
Patrick Devine	2fa1e92a99	test: add template error test (#12489)	2025-10-03 12:05:34 -07:00
Daniel Hiltgen	07e36761c3	ci: place rocm windows in correct runner dir (#12487)	2025-10-03 07:28:40 -07:00
Daniel Hiltgen	c29fb007c0	CI: temporarily disable clang install (#12486) This will likely yield builds that have problems with Unicode characters, but at least we can start testing the release while we try to find an alternate clang compiler for windows, or mingw ships a fixed version.	2025-10-02 20:31:18 -07:00
Daniel Hiltgen	730ed6e9e1	ci: fix windows build (#12485)	2025-10-02 19:16:01 -07:00