Compare commits

1 commit: progress-f ... jmorganca/

| Author | SHA1 | Date |
|---|---|---|
|  | fba7f04ca0 |  |

.github/workflows/release.yaml (vendored, 4 changes)
```diff
@@ -329,9 +329,7 @@ jobs:
           done
         working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
       - run: |
-          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
-            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
-          done
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
       - uses: actions/upload-artifact@v4
         with:
           name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
```
CMakeLists.txt

```diff
@@ -24,7 +24,7 @@ set(GGML_LLAMAFILE ON)
 set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
 set(GGML_CUDA_GRAPHS ON)

-if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
     OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
     set(GGML_CPU_ALL_VARIANTS ON)
 endif()
```
README.md

```diff
@@ -381,7 +381,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
 - [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
-- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)

 ### Cloud

@@ -549,7 +548,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 - [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
-- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)

 ### Supported backends

```
api/client.go

```diff
@@ -126,8 +126,7 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 			return err
 		}
 	}
-
-	return ctx.Err()
+	return nil
 }

 const maxBufferSize = 512 * format.KiloByte
@@ -190,7 +189,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 		}
 	}

-	return ctx.Err()
+	return nil
 }

 // GenerateResponseFunc is a function that [Client.Generate] invokes every time
```
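The two hunks above toggle what the read loops in `do` and `stream` return once the scanner stops: `ctx.Err()` reports a cancelled context to the caller, while `nil` treats the end of the stream as success even when the context was cancelled mid-stream. A minimal sketch of the difference, not ollama's actual client, assuming a line-oriented stream:

```go
package main

import (
	"bufio"
	"context"
	"fmt"
	"strings"
	"time"
)

// stream reads lines until the input ends or the context is cancelled.
func stream(ctx context.Context, s *bufio.Scanner, fn func(string) error) error {
	for s.Scan() {
		select {
		case <-ctx.Done():
			return ctx.Err() // caller sees context.Canceled
		default:
		}
		if err := fn(s.Text()); err != nil {
			return err
		}
	}
	// The diff flips this final return between ctx.Err() and nil:
	// ctx.Err() also surfaces a cancellation that raced the last read.
	return ctx.Err()
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	go func() { time.Sleep(10 * time.Millisecond); cancel() }()

	s := bufio.NewScanner(strings.NewReader("a\nb\nc\n"))
	err := stream(ctx, s, func(line string) error {
		time.Sleep(20 * time.Millisecond) // slow consumer, so cancel wins
		fmt.Println(line)
		return nil
	})
	fmt.Println("err:", err) // prints "context canceled"
}
```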
cmd/cmd.go (38 changes)
```diff
@@ -15,11 +15,13 @@ import (
 	"net"
 	"net/http"
 	"os"
+	"os/signal"
 	"path/filepath"
 	"runtime"
 	"strconv"
 	"strings"
 	"sync/atomic"
+	"syscall"
 	"time"

 	"github.com/containerd/console"
@@ -328,7 +330,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		if err := PullHandler(cmd, []string{name}); err != nil {
 			return nil, err
 		}
-
 		return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
 	}
 	return info, err
@@ -857,6 +858,17 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	spinner := progress.NewSpinner("")
 	p.Add("", spinner)

+	cancelCtx, cancel := context.WithCancel(cmd.Context())
+	defer cancel()
+
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT)
+
+	go func() {
+		<-sigChan
+		cancel()
+	}()
+
 	var state *displayResponseState = &displayResponseState{}
 	var latest api.ChatResponse
 	var fullResponse strings.Builder
@@ -891,7 +903,10 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		req.KeepAlive = opts.KeepAlive
 	}

-	if err := client.Chat(cmd.Context(), req, fn); err != nil {
+	if err := client.Chat(cancelCtx, req, fn); err != nil {
+		if errors.Is(err, context.Canceled) {
+			return nil, nil
+		}
 		return nil, err
 	}

@@ -931,6 +946,17 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		generateContext = []int{}
 	}

+	ctx, cancel := context.WithCancel(cmd.Context())
+	defer cancel()
+
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT)
+
+	go func() {
+		<-sigChan
+		cancel()
+	}()
+
 	var state *displayResponseState = &displayResponseState{}

 	fn := func(response api.GenerateResponse) error {
@@ -966,7 +992,10 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		KeepAlive: opts.KeepAlive,
 	}

-	if err := client.Generate(cmd.Context(), &request, fn); err != nil {
+	if err := client.Generate(ctx, &request, fn); err != nil {
+		if errors.Is(err, context.Canceled) {
+			return nil
+		}
 		return err
 	}

@@ -988,7 +1017,8 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		latest.Summary()
 	}

-	cmd.SetContext(context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context))
+	ctx = context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context)
+	cmd.SetContext(ctx)

 	return nil
 }
```
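Together with the main.go hunk further down, these changes move Ctrl+C handling out of a single process-wide handler and into the individual chat and generate calls: a goroutine converts the first SIGINT into a context cancellation, and the caller translates `context.Canceled` into a quiet return rather than an error. A self-contained sketch of the same pattern; `longRunning` stands in for the `client.Chat`/`client.Generate` calls:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

// longRunning blocks until it finishes or its context is cancelled.
func longRunning(ctx context.Context) error {
	select {
	case <-time.After(10 * time.Second):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Turn the first SIGINT into a cancellation instead of killing the process.
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT)
	go func() {
		<-sigChan
		cancel()
	}()

	if err := longRunning(ctx); err != nil {
		if errors.Is(err, context.Canceled) {
			fmt.Println("interrupted; returning cleanly")
			return
		}
		fmt.Fprintln(os.Stderr, err)
	}
}
```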
```diff
@@ -19,10 +19,6 @@ var LibOllamaPath string = func() string {
 		return ""
 	}

-	if eval, err := filepath.EvalSymlinks(exe); err == nil {
-		exe = eval
-	}
-
 	var libPath string
 	switch runtime.GOOS {
 	case "windows":
```
docs/windows.md

```diff
@@ -55,7 +55,7 @@ Here's a quick example showing API access from `powershell`
 ## Troubleshooting

 Ollama on Windows stores files in a few different locations. You can view them in
-the explorer window by hitting `<Ctrl>+R` and type in:
+the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
     - *app.log* contains most resent logs from the GUI application
     - *server.log* contains the most recent server logs
```
```diff
@@ -12,9 +12,6 @@ func TestHumanNumber(t *testing.T) {

 	testCases := []testCase{
 		{0, "0"},
-		{999, "999"},
-		{1000, "1K"},
-		{1001, "1K"},
 		{1000000, "1M"},
 		{125000000, "125M"},
 		{500500000, "500.50M"},
```
```diff
@@ -305,10 +305,6 @@ func (b *testBackend) NewContext() ml.Context {
 	return &testContext{}
 }

-func (b *testBackend) SystemInfo() string {
-	return "not implemented"
-}
-
 type testContext struct{}

 func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
```
llm/server.go

```diff
@@ -320,10 +320,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		return nil, fmt.Errorf("unable to lookup executable path: %w", err)
 	}

-	if eval, err := filepath.EvalSymlinks(exe); err == nil {
-		exe = eval
-	}
-
 	// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
 	s := &llmServer{
 		port: port,
```
main.go (14 changes)
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"os"
|
|
||||||
"os/signal"
|
|
||||||
|
|
||||||
"github.com/spf13/cobra"
|
"github.com/spf13/cobra"
|
||||||
|
|
||||||
@@ -11,15 +9,5 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
cobra.CheckErr(cmd.NewCLI().ExecuteContext(context.Background()))
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
sigChan := make(chan os.Signal, 1)
|
|
||||||
signal.Notify(sigChan, os.Interrupt)
|
|
||||||
go func() {
|
|
||||||
<-sigChan
|
|
||||||
cancel()
|
|
||||||
}()
|
|
||||||
|
|
||||||
cobra.CheckErr(cmd.NewCLI().ExecuteContext(ctx))
|
|
||||||
}
|
}
|
||||||
|
|||||||
ml/backend.go

```diff
@@ -23,7 +23,6 @@ type Backend interface {
 	Config() Config
 	Get(name string) Tensor
 	NewContext() Context
-	SystemInfo() string
 }

 var backends = make(map[string]func(*os.File) (Backend, error))
```
ml/backend/ggml/ggml.go

```diff
@@ -1,27 +1,11 @@
 package ggml

-/*
-#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
-#include <stdlib.h>
-#include <stdint.h>
-#include "ggml.h"
-#include "ggml-cpu.h"
-#include "ggml-backend.h"
-
-static struct ggml_backend_feature * getBackendFeatures(void *fp, ggml_backend_reg_t reg) {return ((ggml_backend_get_features_t)(fp))(reg);}
-static struct ggml_backend_feature * getNextBackendFeatures(struct ggml_backend_feature * feature) { return &feature[1];}
-
-typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
-COMPILER inline get_compiler() {
-#if defined(__clang__)
-	return COMP_CLANG;
-#elif defined(__GNUC__)
-	return COMP_GCC;
-#else
-	return UNKNOWN_COMPILER;
-#endif
-}
-
-*/
+// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
+// #include <stdlib.h>
+// #include <stdint.h>
+// #include "ggml.h"
+// #include "ggml-cpu.h"
+// #include "ggml-backend.h"
 import "C"

 import (
@@ -642,34 +626,3 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
 	}
 }
-
-func (b *Backend) SystemInfo() string {
-	var compiler string
-	switch C.get_compiler() {
-	case C.COMP_UNKNOWN:
-		compiler = "cgo(unknown_compiler)"
-	case C.COMP_GCC:
-		compiler = "cgo(gcc)"
-	case C.COMP_CLANG:
-		compiler = "cgo(clang)"
-	}
-
-	var s string
-	for i := range C.ggml_backend_reg_count() {
-		reg := C.ggml_backend_reg_get(i)
-		fName := C.CString("ggml_backend_get_features")
-		defer C.free(unsafe.Pointer(fName))
-		get_features_fn := C.ggml_backend_reg_get_proc_address(reg, fName)
-		if get_features_fn != nil {
-			s += C.GoString(C.ggml_backend_reg_name(reg))
-			s += " : "
-			for features := C.getBackendFeatures(get_features_fn, reg); features.name != nil; features = C.getNextBackendFeatures(features) {
-				s += C.GoString(features.name)
-				s += " = "
-				s += C.GoString(features.value)
-				s += " | "
-			}
-		}
-	}
-	return s + compiler
-}
```
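The removed code leans on cgo's preamble rule: the comment block immediately above `import "C"` is compiled as C, so helpers defined there, such as `get_compiler()` and `getBackendFeatures()`, become callable from Go as `C.get_compiler()` and friends, while the surviving `//`-style lines keep only the `#cgo` flags and includes. A minimal, self-contained illustration of that mechanism, not ollama code, requiring CGO_ENABLED=1:

```go
package main

/*
#include <stdio.h>

// A tiny helper in the same spirit as get_compiler() above: the
// preprocessor identifies which compiler built this translation unit.
static const char* compiler_name() {
#if defined(__clang__)
	return "clang";
#elif defined(__GNUC__)
	return "gcc";
#else
	return "unknown";
#endif
}
*/
import "C"

import "fmt"

func main() {
	// C.compiler_name() calls the static helper defined in the preamble.
	fmt.Println("built with:", C.GoString(C.compiler_name()))
}
```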
```diff
@@ -47,6 +47,10 @@ var OnceLoad = sync.OnceFunc(func() {
 		exe = "."
 	}

+	if eval, err := filepath.EvalSymlinks(exe); err == nil {
+		exe = eval
+	}
+
 	// PATH, LD_LIBRARY_PATH, and DYLD_LIBRARY_PATH are often
 	// set by the parent process, however, use a default value
 	// if the environment variable is not set.
```
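Across these hunks the symlink resolution moves rather than disappears: it is deleted from `LibOllamaPath` and `NewLlamaServer` above and added once inside `OnceLoad`. The pattern itself is plain stdlib: resolve the running binary through any symlink so paths derived from it point at the real install location. A small runnable sketch; the `lib` join is illustrative, not ollama's actual layout:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	exe, err := os.Executable()
	if err != nil {
		exe = "."
	}
	// Best-effort: if the binary was launched via a symlink, follow it;
	// on failure, keep the original path.
	if eval, err := filepath.EvalSymlinks(exe); err == nil {
		exe = eval
	}
	// Paths computed from exe now refer to the real install directory.
	fmt.Println(filepath.Join(filepath.Dir(exe), "lib"))
}
```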
model/model.go

```diff
@@ -21,7 +21,6 @@ import (
 	_ "github.com/ollama/ollama/ml/backend"
 )

-// Options contains the inputs for a model forward pass
 type Options struct {
 	Inputs    []int32
 	Positions []int32
@@ -35,13 +34,11 @@ type config struct {
 	Cache kvcache.Cache
 }

-// Base implements the common fields and methods for all models
 type Base struct {
 	b ml.Backend
 	config
 }

-// Backend returns the underlying backend that will run the model
 func (m *Base) Backend() ml.Backend {
 	return m.b
 }
@@ -50,7 +47,6 @@ func (m *Base) Config() config {
 	return m.config
 }

-// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
 type Model interface {
 	Forward(ml.Context, Options) (ml.Tensor, error)

@@ -60,7 +56,6 @@ type Model interface {

 var models = make(map[string]func(ml.Config) (Model, error))

-// Register registers a model constructor for the given architecture
 func Register(name string, f func(ml.Config) (Model, error)) {
 	if _, ok := models[name]; ok {
 		panic("model: model already registered")
@@ -69,9 +64,8 @@ func Register(name string, f func(ml.Config) (Model, error)) {
 	models[name] = f
 }

-// New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(modelPath string) (Model, error) {
-	r, err := os.Open(modelPath)
+func New(s string) (Model, error) {
+	r, err := os.Open(s)
 	if err != nil {
 		return nil, err
 	}
```
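`Register` and `New` above follow Go's common constructor-registry idiom, the same shape as `database/sql` drivers: architectures register a factory at init time, and `New` dispatches by name. A simplified, self-contained sketch of that pattern; the `Config` type and the `llama` entry below are illustrative stand-ins, not ollama's actual API:

```go
package main

import "fmt"

type Config map[string]string

type Model interface {
	Name() string
}

var models = make(map[string]func(Config) (Model, error))

// Register panics on duplicates, matching the behavior in the diff.
func Register(name string, f func(Config) (Model, error)) {
	if _, ok := models[name]; ok {
		panic("model: model already registered")
	}
	models[name] = f
}

// New looks up a constructor by architecture name and invokes it.
func New(arch string, c Config) (Model, error) {
	f, ok := models[arch]
	if !ok {
		return nil, fmt.Errorf("unsupported architecture %q", arch)
	}
	return f(c)
}

type llama struct{}

func (llama) Name() string { return "llama" }

func main() {
	Register("llama", func(Config) (Model, error) { return llama{}, nil })
	m, err := New("llama", Config{})
	fmt.Println(m.Name(), err)
}
```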
progress/progress.go

```diff
@@ -1,7 +1,6 @@
 package progress

 import (
-	"bufio"
 	"fmt"
 	"io"
 	"sync"
@@ -14,8 +13,7 @@ type State interface {

 type Progress struct {
 	mu sync.Mutex
-	// buffer output to minimize flickering on all terminals
-	w *bufio.Writer
+	w io.Writer

 	pos int
@@ -24,7 +22,7 @@ type Progress struct {
 }

 func NewProgress(w io.Writer) *Progress {
-	p := &Progress{w: bufio.NewWriter(w)}
+	p := &Progress{w: w}
 	go p.start()
 	return p
 }
@@ -49,29 +47,26 @@ func (p *Progress) stop() bool {
 func (p *Progress) Stop() bool {
 	stopped := p.stop()
 	if stopped {
-		fmt.Fprintln(p.w)
+		fmt.Fprint(p.w, "\n")
 	}

-	// show cursor
-	fmt.Fprint(p.w, "\033[?25h")
-	p.w.Flush()
 	return stopped
 }

 func (p *Progress) StopAndClear() bool {
+	fmt.Fprint(p.w, "\033[?25l")
+	defer fmt.Fprint(p.w, "\033[?25h")
+
 	stopped := p.stop()
 	if stopped {
 		// clear all progress lines
-		for range p.pos - 1 {
-			fmt.Fprint(p.w, "\033[A")
+		for i := range p.pos {
+			if i > 0 {
+				fmt.Fprint(p.w, "\033[A")
+			}
+			fmt.Fprint(p.w, "\033[2K\033[1G")
 		}
-
-		fmt.Fprint(p.w, "\033[2K", "\033[1G")
 	}

-	// show cursor
-	fmt.Fprint(p.w, "\033[?25h")
-	p.w.Flush()
 	return stopped
 }
@@ -86,31 +81,30 @@ func (p *Progress) render() {
 	p.mu.Lock()
 	defer p.mu.Unlock()

-	fmt.Fprint(p.w, "\033[?2026h")
-	defer fmt.Fprint(p.w, "\033[?2026l")
+	fmt.Fprint(p.w, "\033[?25l")
+	defer fmt.Fprint(p.w, "\033[?25h")

-	for range p.pos - 1 {
-		fmt.Fprint(p.w, "\033[A")
+	// clear already rendered progress lines
+	for i := range p.pos {
+		if i > 0 {
+			fmt.Fprint(p.w, "\033[A")
+		}
+		fmt.Fprint(p.w, "\033[2K\033[1G")
 	}

-	fmt.Fprint(p.w, "\033[1G")
-
 	// render progress lines
 	for i, state := range p.states {
-		fmt.Fprint(p.w, state.String(), "\033[K")
+		fmt.Fprint(p.w, state.String())
 		if i < len(p.states)-1 {
 			fmt.Fprint(p.w, "\n")
 		}
 	}

 	p.pos = len(p.states)
-	p.w.Flush()
 }

 func (p *Progress) start() {
 	p.ticker = time.NewTicker(100 * time.Millisecond)
-	// hide cursor
-	fmt.Fprint(p.w, "\033[?25l")
 	for range p.ticker.C {
 		p.render()
 	}
```
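These hunks are easier to follow with the escape sequences decoded: `\033[A` moves the cursor up one line, `\033[2K` erases the current line, `\033[1G` returns to column 1, `\033[K` erases to end of line, `\033[?25l`/`\033[?25h` hide and show the cursor, and `\033[?2026h`/`\033[?2026l` bracket a "synchronized update" on terminals that support it. The left side combines the synchronized-update bracket with a `bufio.Writer` flushed once per frame so each redraw hits the terminal as a single write; the right side hides the cursor per frame and erases lines individually. A tiny self-contained demo of the single-line redraw loop, illustrative rather than ollama's renderer:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	fmt.Print("\033[?25l")       // hide cursor while redrawing
	defer fmt.Print("\033[?25h") // always restore it

	for i := 0; i <= 3; i++ {
		if i > 0 {
			fmt.Print("\033[A") // move up over the previous frame
		}
		fmt.Print("\033[2K\033[1G") // erase the line, return to column 1
		fmt.Printf("progress: %d/3\n", i)
		time.Sleep(200 * time.Millisecond)
	}
}
```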
```diff
@@ -845,6 +845,8 @@ func (s *Server) loadModel(
 	threads int,
 	multiUserCache bool,
 ) {
+	llama.BackendInit()
+
 	var err error
 	s.model, err = llama.LoadModelFromFile(mpath, params)
 	if err != nil {
@@ -930,8 +932,6 @@ func Execute(args []string) error {
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.Info("starting go runner")
-
-	llama.BackendInit()
 	slog.Info("system", "info", llama.PrintSystemInfo(), "threads", *threads)

 	server := &Server{
```
```diff
@@ -813,8 +813,6 @@ func (s *Server) loadModel(
 		panic(err)
 	}

-	slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
-
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
 		panic("loras are not yet implemented")
@@ -883,6 +881,7 @@ func Execute(args []string) error {
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.Info("starting ollama engine")
+	// TODO(jessegross): Some system info would be useful

 	server := &Server{
 		batchSize: *batchSize,
```