Compare commits

..

8 Commits

Author SHA1 Message Date
Daniel Hiltgen
ae5e0f0889 CI: replace clang compiler for windows (#12495) 2025-10-04 09:18:42 -07:00
Jesse Gross
19e6796eac llm: Support KV cache quantization with gpt-oss
With the new version of GGML in #12245, KV cache quantization
no longer causes a fallback to CPU.
2025-10-03 16:31:58 -07:00
Grace
33801c1597 Fixed Deepseek2 adding nil tensor error 2025-10-03 14:20:06 -07:00
Daniel Hiltgen
e4340667e3 Workaround broken NVIDIA iGPU free VRAM data (#12490)
The CUDA APIs for reporting free VRAM are useless on NVIDIA iGPU
systems as they only return the kernel's actual free memory and ignore
buff/cache allocations, which on a typical system will quickly fill up
most of the free system memory.  As a result, we incorrectly think
there's very little memory available for GPU allocations.
2025-10-03 12:17:21 -07:00
Patrick Devine
2fa1e92a99 test: add template error test (#12489) 2025-10-03 12:05:34 -07:00
Daniel Hiltgen
07e36761c3 ci: place rocm windows in correct runner dir (#12487) 2025-10-03 07:28:40 -07:00
Daniel Hiltgen
c29fb007c0 CI: temporarily disable clang install (#12486)
This will likely yield builds that have problems with unicode characters
but at least we can start testing the release while we try to find an
alternate clang compiler for windows, or mingw ships a fixed version.
2025-10-02 20:31:18 -07:00
Daniel Hiltgen
730ed6e9e1 ci: fix windows build (#12485) 2025-10-02 19:16:01 -07:00
5 changed files with 147 additions and 39 deletions

View File

@@ -94,7 +94,7 @@ jobs:
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
rocm-version: '6.2'
flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"'
runner_dir: ''
runner_dir: 'rocm'
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
env:
@@ -163,7 +163,7 @@ jobs:
cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}"
cmake --build --parallel --preset "${{ matrix.preset }}"
cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8
rm -force dist\lib\ollama\rocm\rocblas\library\*gfx906*
Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
env:
CMAKE_GENERATOR: Ninja
- uses: actions/upload-artifact@v4
@@ -176,19 +176,19 @@ jobs:
matrix:
os: [windows]
arch: [amd64, arm64]
include:
- os: windows
arch: amd64
llvmarch: x86_64
- os: windows
arch: arm64
llvmarch: aarch64
runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
environment: release
needs: [setup-environment]
env:
GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
steps:
- name: Install AMD64 system dependencies
if: matrix.arch == 'amd64'
run: |
$ErrorActionPreference = "Stop"
Start-Process "C:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
echo "C:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install ARM64 system dependencies
if: matrix.arch == 'arm64'
run: |
@@ -200,15 +200,25 @@ jobs:
choco install -y --no-progress git gzip
echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip"
Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip -DestinationPath "C:\Program Files\"
$installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt-aarch64").path
echo $installPath\bin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Install clang and gcc-compat
run: |
$ErrorActionPreference = "Stop"
Set-ExecutionPolicy Bypass -Scope Process -Force
Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-${{ matrix.llvmarch }}.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt.zip"
Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt.zip -DestinationPath "C:\Program Files\"
$installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt*").path
echo "$installPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
- name: Verify gcc is actually clang
run: |
gcc -v
if (((& gcc -v 2>&1) -join "`n") -notmatch 'clang') {
echo "ERROR: GCC must be clang for proper utf16 handling"
exit 1
}
- run: |
go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ .
- uses: actions/upload-artifact@v4

View File

@@ -330,6 +330,9 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
}
}
// Apply any iGPU workarounds
iGPUWorkarounds(devices)
return devices
}
@@ -540,3 +543,32 @@ func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceIn
}
}
}
// iGPUWorkarounds adjusts, in place, the reported memory figures of
// integrated GPUs in devices. It is a no-op when no device is marked
// Integrated, and it leaves devices untouched if system memory information
// cannot be fetched.
func iGPUWorkarounds(devices []ml.DeviceInfo) {
	// Locate the first integrated device; everything before it needs no work.
	firstIntegrated := -1
	for idx := range devices {
		if devices[idx].Integrated {
			firstIntegrated = idx
			break
		}
	}
	if firstIntegrated < 0 {
		return
	}

	sysMem, err := GetCPUMem()
	if err != nil {
		slog.Debug("failed to fetch system memory information for iGPU", "error", err)
		return
	}

	for idx := firstIntegrated; idx < len(devices); idx++ {
		dev := &devices[idx]
		// NVIDIA iGPUs return useless free VRAM data which ignores system
		// buff/cache, so substitute the system's free memory figure instead.
		if dev.Integrated && dev.Library == "CUDA" {
			dev.FreeMemory = sysMem.FreeMemory
		}
	}
}

View File

@@ -870,11 +870,6 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
return true
}
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
// gpt-oss uses attention with sinks which does not support quantized cache types
slog.Warn("model only supports non-quantized cache types", "model", arch)
return false
}
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

View File

@@ -150,7 +150,9 @@ func (moe *sparse) Moe(ctx ml.Context, hiddenStates, topKIndices, topKWeights ml
}
// topKIndices returns the result of TopK over scores using
// opts.numExpertsUsed. When moe.ExpProbsBias is set, it is added to scores
// exactly once before the selection.
func (moe *sparse) topKIndices(ctx ml.Context, scores ml.Tensor, opts *Options) ml.Tensor {
	// The bias tensor is optional: adding it unconditionally would hand a nil
	// tensor to Add when it is absent, and applying it both unconditionally
	// and under the guard would count it twice when it is present.
	if moe.ExpProbsBias != nil {
		scores = scores.Add(ctx, moe.ExpProbsBias)
	}
	return scores.TopK(ctx, opts.numExpertsUsed)
}

View File

@@ -154,24 +154,55 @@ func TestTemplate(t *testing.T) {
}
func TestParse(t *testing.T) {
cases := []struct {
validCases := []struct {
name string
template string
vars []string
}{
{"{{ .Prompt }}", []string{"prompt", "response"}},
{"{{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system"}},
{"{{ .System }} {{ .Prompt }} {{ .Response }}", []string{"prompt", "response", "system"}},
{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}},
{"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}},
{"{{ range .Messages }}{{ if eq .Role \"tool\" }}Tool Result: {{ .ToolName }} {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role", "toolname"}},
{`{{- range .Messages }}
{
name: "PromptOnly",
template: "{{ .Prompt }}",
vars: []string{"prompt", "response"},
},
{
name: "SystemAndPrompt",
template: "{{ .System }} {{ .Prompt }}",
vars: []string{"prompt", "response", "system"},
},
{
name: "PromptResponseSystem",
template: "{{ .System }} {{ .Prompt }} {{ .Response }}",
vars: []string{"prompt", "response", "system"},
},
{
name: "ToolsBlock",
template: "{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}",
vars: []string{"prompt", "response", "system", "tools"},
},
{
name: "MessagesRange",
template: "{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}",
vars: []string{"content", "messages", "role"},
},
{
name: "ToolResultConditional",
template: "{{ range .Messages }}{{ if eq .Role \"tool\" }}Tool Result: {{ .ToolName }} {{ .Content }}{{ end }}{{ end }}",
vars: []string{"content", "messages", "role", "toolname"},
},
{
name: "MultilineSystemUserAssistant",
template: `{{- range .Messages }}
{{- if eq .Role "system" }}SYSTEM:
{{- else if eq .Role "user" }}USER:
{{- else if eq .Role "assistant" }}ASSISTANT:
{{- else if eq .Role "tool" }}TOOL:
{{- else if eq .Role "tool" }}TOOL:
{{- end }} {{ .Content }}
{{- end }}`, []string{"content", "messages", "role"}},
{`{{- if .Messages }}
{{- end }}`,
vars: []string{"content", "messages", "role"},
},
{
name: "ChatMLLike",
template: `{{- if .Messages }}
{{- range .Messages }}<|im_start|>{{ .Role }}
{{ .Content }}<|im_end|>
{{ end }}<|im_start|>assistant
@@ -182,22 +213,60 @@ func TestParse(t *testing.T) {
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ .Response }}<|im_end|>
{{- end -}}`, []string{"content", "messages", "prompt", "response", "role", "system"}},
{{- end -}}`,
vars: []string{"content", "messages", "prompt", "response", "role", "system"},
},
}
for _, tt := range cases {
t.Run("", func(t *testing.T) {
for _, tt := range validCases {
tt := tt
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
tmpl, err := Parse(tt.template)
if err != nil {
t.Fatal(err)
t.Fatalf("Parse returned unexpected error: %v", err)
}
v, err := tmpl.Vars()
gotVars, err := tmpl.Vars()
if err != nil {
t.Fatal(err)
t.Fatalf("Vars returned unexpected error: %v", err)
}
if diff := cmp.Diff(v, tt.vars); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
if diff := cmp.Diff(gotVars, tt.vars); diff != "" {
t.Errorf("Vars mismatch (-got +want):\n%s", diff)
}
})
}
}
func TestParseError(t *testing.T) {
invalidCases := []struct {
name string
template string
errorStr string
}{
{
"TemplateNotClosed",
"{{ .Prompt ",
"unclosed action",
},
{
"Template",
`{{define "x"}}{{template "x"}}{{end}}{{template "x"}}`,
"undefined template specified",
},
}
for _, tt := range invalidCases {
t.Run(tt.name, func(t *testing.T) {
_, err := Parse(tt.template)
if err == nil {
t.Fatalf("expected Parse to return an error for an invalid template, got nil")
}
if !strings.Contains(strings.ToLower(err.Error()), strings.ToLower(tt.errorStr)) {
t.Errorf("unexpected error message.\n got: %q\n want substring (caseinsensitive): %q", err.Error(), tt.errorStr)
}
})
}