Compare commits


6 Commits

Author SHA1 Message Date
Jeffrey Morgan
16a9006306 add back f16c instructions on intel mac 2023-11-26 15:59:49 -05:00
Jeffrey Morgan
e9216ea459 fix readline history on linux 2023-11-26 15:59:04 -05:00
Jeffrey Morgan
9e4a316405 update submodule commit 2023-11-26 14:52:00 -05:00
Jeffrey Morgan
9fb5e8399c Fix issues with inputting and formatting multi line strings in ollama run
Co-authored-by: Wen Sun <iwendellsun@gmail.com>
2023-11-26 12:54:29 -05:00
Jing Zhang
82b9b329ff windows CUDA support (#1262)
* Support cuda build in Windows
* Enable dynamic NumGPU allocation for Windows
2023-11-24 17:16:36 -05:00
Jongwook Choi
12e8c12d2b Disable CUDA peer access as a workaround for multi-gpu inference bug (#1261)
When CUDA peer access is enabled, multi-gpu inference will produce
garbage output. This is a known bug of llama.cpp (or nvidia). Until the
upstream bug is fixed, we can disable CUDA peer access temporarily
to ensure correct output.

See #961.
2023-11-24 14:05:57 -05:00
14 changed files with 59 additions and 49 deletions

.gitignore

@@ -7,3 +7,4 @@ dist
 ollama
 ggml-metal.metal
 .cache
+*.exe


@@ -262,7 +262,7 @@ func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgre
 func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	var lr ListResponse
-	if err := c.do(ctx, http.MethodGet, "/api/list", nil, &lr); err != nil {
+	if err := c.do(ctx, http.MethodGet, "/api/tags", nil, &lr); err != nil {
 		return nil, err
 	}
 	return &lr, nil

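Note for API consumers: after this change, local models are listed at `/api/tags` rather than `/api/list`. A minimal standalone sketch of calling the endpoint from outside the client package; only the `models` key is visible in this compare (in the Python client below), so the `name` field is an assumption based on the API docs of this period:

```go
// listmodels.go: a sketch of listing local models via the renamed endpoint.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

type listResponse struct {
	Models []struct {
		Name string `json:"name"` // assumed field name, not shown in this diff
	} `json:"models"`
}

func main() {
	// GET /api/tags replaces the short-lived GET /api/list.
	resp, err := http.Get("http://localhost:11434/api/tags")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var lr listResponse
	if err := json.NewDecoder(resp.Body).Decode(&lr); err != nil {
		panic(err)
	}
	for _, m := range lr.Models {
		fmt.Println(m.Name)
	}
}
```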

@@ -157,7 +157,7 @@ def push(model_name, insecure=False, callback=None):
 # List models that are available locally.
 def list():
   try:
-    response = requests.get(f"{BASE_URL}/api/list")
+    response = requests.get(f"{BASE_URL}/api/tags")
     response.raise_for_status()
     data = response.json()
     models = data.get('models', [])


@@ -602,14 +602,12 @@ func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format
 		fmt.Fprintln(os.Stderr, "")
 	}
 
-	prompt := readline.Prompt{
+	scanner, err := readline.New(readline.Prompt{
 		Prompt:         ">>> ",
 		AltPrompt:      "... ",
 		Placeholder:    "Send a message (/? for help)",
 		AltPlaceholder: `Use """ to end multi-line input`,
-	}
-
-	scanner, err := readline.New(prompt)
+	})
 	if err != nil {
 		return err
 	}
@@ -617,7 +615,7 @@ func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format
 	fmt.Print(readline.StartBracketedPaste)
 	defer fmt.Printf(readline.EndBracketedPaste)
 
-	var multiLineBuffer string
+	var prompt string
 
 	for {
 		line, err := scanner.Readline()
@@ -630,27 +628,33 @@ func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format
 				fmt.Println("\nUse Ctrl-D or /bye to exit.")
 			}
 
 			scanner.Prompt.UseAlt = false
+			prompt = ""
 			continue
 		case err != nil:
 			return err
 		}
 
+		line = strings.TrimSpace(line)
+
 		switch {
-		case scanner.Prompt.UseAlt:
-			if strings.HasSuffix(line, `"""`) {
-				scanner.Prompt.UseAlt = false
-				multiLineBuffer += strings.TrimSuffix(line, `"""`)
-				line = multiLineBuffer
-				multiLineBuffer = ""
-			} else {
-				multiLineBuffer += line + " "
+		case strings.HasPrefix(prompt, `"""`):
+			// if the prompt so far starts with """ then we're in multiline mode
+			// and we need to keep reading until we find a line that ends with """
+			cut, found := strings.CutSuffix(line, `"""`)
+			prompt += cut + "\n"
+			if !found {
 				continue
 			}
-		case strings.HasPrefix(line, `"""`):
+			prompt = strings.TrimPrefix(prompt, `"""`)
+			scanner.Prompt.UseAlt = false
+		case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
 			scanner.Prompt.UseAlt = true
-			multiLineBuffer = strings.TrimPrefix(line, `"""`) + " "
+			prompt += line + "\n"
 			continue
+		case scanner.Pasting:
+			prompt += line + "\n"
+			continue
 		case strings.HasPrefix(line, "/list"):
 			args := strings.Fields(line)
@@ -757,12 +761,17 @@ func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format
 		case strings.HasPrefix(line, "/"):
 			args := strings.Fields(line)
 			fmt.Printf("Unknown command '%s'. Type /? for help\n", args[0])
+			continue
+		default:
+			prompt += line
 		}
 
-		if len(line) > 0 && line[0] != '/' {
-			if err := generate(cmd, model, line, wordWrap, format); err != nil {
+		if len(prompt) > 0 && prompt[0] != '/' {
+			if err := generate(cmd, model, prompt, wordWrap, format); err != nil {
 				return err
 			}
+
+			prompt = ""
 		}
 	}
 }

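The rewrite above drops the separate `multiLineBuffer`: the accumulated `prompt` string itself signals multiline mode while it still starts with `"""`, and `strings.CutSuffix` (Go 1.20+) detects the closing delimiter. A self-contained sketch of just that accumulation logic, with the readline loop and slash commands stripped out:

```go
// A simplified sketch of the new `"""` handling: the accumulated prompt
// doubles as the mode flag. Input comes from a fixed slice here instead
// of a readline instance.
package main

import (
	"fmt"
	"strings"
)

func main() {
	lines := []string{`"""first`, "second", `third"""`, "single line"}

	var prompt string
	for _, line := range lines {
		line = strings.TrimSpace(line)
		switch {
		case strings.HasPrefix(prompt, `"""`):
			// Already in multiline mode: keep reading until a line ends with """.
			cut, found := strings.CutSuffix(line, `"""`) // needs Go 1.20+
			prompt += cut + "\n"
			if !found {
				continue
			}
			prompt = strings.TrimPrefix(prompt, `"""`)
		case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
			// An opening """ switches into multiline mode.
			prompt += line + "\n"
			continue
		default:
			prompt += line
		}

		fmt.Printf("submit: %q\n", prompt) // stands in for generate()
		prompt = ""
	}
}
```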

@@ -372,10 +372,10 @@ curl -T model.bin -X POST http://localhost:11434/api/blobs/sha256:29fdb92e57cf08
 Return 201 Created if the blob was successfully created.
 
-## List Models
+## List Local Models
 
 ```shell
-GET /api/list
+GET /api/tags
 ```
 
 List models that are available locally.
@@ -385,7 +385,7 @@ List models that are available locally.
 #### Request
 
 ```shell
-curl http://localhost:11434/api/list
+curl http://localhost:11434/api/tags
 ```
 
 #### Response


@@ -13,6 +13,6 @@ package llm
 //go:generate git submodule update --force gguf
 //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
 //go:generate cmake --build gguf/build/cpu --target server --config Release
 //go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner


@@ -21,6 +21,6 @@ package llm
 //go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cuda --target server --config Release
 //go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
-//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
+//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
 //go:generate cmake --build gguf/build/cuda --target server --config Release
 //go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner


@@ -14,3 +14,11 @@ package llm
 //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
 //go:generate cmake --build gguf/build/cpu --target server --config Release
 //go:generate cmd /c move gguf\build\cpu\bin\Release\server.exe gguf\build\cpu\bin\Release\ollama-runner.exe
+//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cuda --target server --config Release
+//go:generate cmd /c move ggml\build\cuda\bin\Release\server.exe ggml\build\cuda\bin\Release\ollama-runner.exe
+//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
+//go:generate cmake --build gguf/build/cuda --target server --config Release
+//go:generate cmd /c move gguf\build\cuda\bin\Release\server.exe gguf\build\cuda\bin\Release\ollama-runner.exe


@@ -84,6 +84,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	case "windows":
 		// TODO: select windows GPU runner here when available
 		runners = []ModelRunner{
+			{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
 			{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
 		}
 	default:
@@ -269,7 +270,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	if opts.NumGPU != -1 {
 		return opts.NumGPU
 	}
-	if runtime.GOOS == "linux" {
+	if runtime.GOOS == "linux" || runtime.GOOS == "windows" {
 		freeBytes, err := CheckVRAM()
 		if err != nil {
 			if !errors.Is(err, errNvidiaSMI) {

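The second hunk lets Windows take the same dynamic layer-offload path as Linux: `NumGPU` asks `CheckVRAM` (backed by nvidia-smi, per the `errNvidiaSMI` check) for free memory and estimates how many layers fit. A simplified sketch of that kind of heuristic; the safety margin and per-layer cost model here are illustrative assumptions, not the exact formula in this file:

```go
// A sketch of a VRAM-based layer-offload estimate. Layers are assumed to
// be roughly equal slices of the model file, and a fraction of free VRAM
// is held back for the KV cache and scratch buffers.
package main

import "fmt"

func estimateGPULayers(numLayer, fileSizeBytes, freeBytes int64) int {
	bytesPerLayer := fileSizeBytes / numLayer
	// 3/4 is an arbitrary headroom factor for this sketch.
	layers := int(freeBytes / bytesPerLayer * 3 / 4)
	if int64(layers) > numLayer {
		return int(numLayer) // everything fits: offload all layers
	}
	return layers
}

func main() {
	// e.g. a ~3.8 GB model with 32 layers on a card reporting 6 GB free
	fmt.Println(estimateGPULayers(32, 3_800_000_000, 6_000_000_000)) // 32
}
```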

@@ -43,9 +43,12 @@ func (h *History) Init() error {
 	}
 
 	path := filepath.Join(home, ".ollama", "history")
+	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+		return err
+	}
+
 	h.Filename = path
 
-	//todo check if the file exists
 	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0600)
 	if err != nil {
 		if errors.Is(err, os.ErrNotExist) {

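The history fix addresses a fresh-install failure: `os.O_CREATE` creates the file but not missing parent directories, so opening `~/.ollama/history` fails before anything else has created `~/.ollama`. A small self-contained reproduction of the failure and the fix, using a temp directory instead of the home directory:

```go
// Demonstrates the failure mode the MkdirAll call above fixes.
package main

import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	dir, _ := os.MkdirTemp("", "demo")
	defer os.RemoveAll(dir)

	path := filepath.Join(dir, "missing", "history")

	// O_CREATE does not create parent directories: this fails.
	_, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0o600)
	fmt.Println(errors.Is(err, os.ErrNotExist)) // true

	// The fix: create the parent directory first.
	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		panic(err)
	}
	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0o600)
	if err != nil {
		panic(err)
	}
	f.Close()
	fmt.Println("opened", path)
}
```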

@@ -24,6 +24,7 @@ type Instance struct {
 	Prompt   *Prompt
 	Terminal *Terminal
 	History  *History
+	Pasting  bool
 }
 
 func New(prompt Prompt) (*Instance, error) {
@@ -46,7 +47,7 @@ func New(prompt Prompt) (*Instance, error) {
 func (i *Instance) Readline() (string, error) {
 	prompt := i.Prompt.Prompt
-	if i.Prompt.UseAlt {
+	if i.Prompt.UseAlt || i.Pasting {
 		prompt = i.Prompt.AltPrompt
 	}
 	fmt.Print(prompt)
@@ -63,12 +64,13 @@ func (i *Instance) Readline() (string, error) {
 	var esc bool
 	var escex bool
 	var metaDel bool
-	var pasteMode PasteMode
 
 	var currentLineBuf []rune
 
 	for {
-		if buf.IsEmpty() {
+		// don't show placeholder when pasting unless we're in multiline mode
+		showPlaceholder := !i.Pasting || i.Prompt.UseAlt
+		if buf.IsEmpty() && showPlaceholder {
 			ph := i.Prompt.Placeholder
 			if i.Prompt.UseAlt {
 				ph = i.Prompt.AltPlaceholder
@@ -119,9 +121,9 @@ func (i *Instance) Readline() (string, error) {
 				code += string(r)
 			}
 
 			if code == CharBracketedPasteStart {
-				pasteMode = PasteModeStart
+				i.Pasting = true
 			} else if code == CharBracketedPasteEnd {
-				pasteMode = PasteModeEnd
+				i.Pasting = false
 			}
 		case KeyDel:
 			if buf.Size() > 0 {
@@ -196,12 +198,7 @@ func (i *Instance) Readline() (string, error) {
 			}
 
 			buf.MoveToEnd()
 			fmt.Println()
-			switch pasteMode {
-			case PasteModeStart:
-				output = `"""` + output
-			case PasteModeEnd:
-				output = output + `"""`
-			}
+
 			return output, nil
 		default:
			if metaDel {

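Instead of re-wrapping pasted text in `"""` inside the readline layer, the instance now just exposes a `Pasting` flag and lets the caller accumulate lines. The flag is driven by xterm-style bracketed paste: `StartBracketedPaste` (printed in cmd.go above) asks the terminal to wrap pastes in `ESC [ 200~` and `ESC [ 201~`, and the package's `"00~"`/`"01~"` constants are the tails of those sequences after the CSI prefix has been consumed. A self-contained sketch of the handshake, assuming a terminal with bracketed-paste support; without raw mode the markers simply arrive inline with the pasted bytes, which is enough to demonstrate detection:

```go
// Enables bracketed paste, then reads stdin and reports when the
// terminal's paste markers toggle the pasting flag.
package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

const (
	startBracketedPaste = "\x1b[?2004h" // ask the terminal to bracket pastes
	endBracketedPaste   = "\x1b[?2004l"
	pasteStartMarker    = "\x1b[200~" // prepended to pasted text
	pasteEndMarker      = "\x1b[201~" // appended to pasted text
)

func main() {
	fmt.Print(startBracketedPaste)
	defer fmt.Print(endBracketedPaste)

	scanner := bufio.NewScanner(os.Stdin)
	pasting := false
	for scanner.Scan() {
		line := scanner.Text()
		if strings.Contains(line, pasteStartMarker) {
			pasting = true
			line = strings.ReplaceAll(line, pasteStartMarker, "")
		}
		if strings.Contains(line, pasteEndMarker) {
			pasting = false
			line = strings.ReplaceAll(line, pasteEndMarker, "")
		}
		fmt.Printf("pasting=%v line=%q\n", pasting, line)
	}
}
```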

@@ -76,11 +76,3 @@ const (
 	CharBracketedPasteStart = "00~"
 	CharBracketedPasteEnd   = "01~"
 )
-
-type PasteMode int
-
-const (
-	PastModeOff = iota
-	PasteModeStart
-	PasteModeEnd
-)


@@ -771,7 +771,6 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 		c.String(http.StatusOK, "Ollama is running")
 	})
 
-	r.Handle(method, "/api/list", ListModelsHandler)
 	r.Handle(method, "/api/tags", ListModelsHandler)
 }