cli: Send all images in conversation history

Currently the CLI only sends images from the most recent image-containing message. This prevents doing things like sending one message with an image and then a follow-up message with a second image and asking for a comparison based on additional information not present in any text that was output. It's possible that some models have a problem with this but the CLI is not the right place to do this since any adjustments are model-specific and should affect all clients. Both llava:34b and minicpm-v do reasonable things with multiple images in the history.
runner.go: Handle truncation of tokens for stop sequences
2024-10-10 11:21:51 -07:00 · 2024-10-09 20:39:04 -07:00 · 2024-10-09 20:39:04 -07:00 · 2024-10-09 15:22:36 -07:00 · 2024-10-09 14:21:02 -07:00
6 changed files with 71 additions and 42 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,5 +1,12 @@
 llm/ext_server/* linguist-vendored
-llama/** linguist-vendored
+llama/**/*.cpp linguist-vendored
+llama/**/*.hpp linguist-vendored
+llama/**/*.h linguist-vendored
+llama/**/*.c linguist-vendored
+llama/**/*.cu linguist-vendored
+llama/**/*.cuh linguist-vendored
+llama/**/*.m linguist-vendored
+llama/**/*.metal linguist-vendored

 * text=auto
 *.go text eol=lf
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -442,13 +442,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					return err
 				}

-				// clear all previous images for better responses
-				if len(images) > 0 {
-					for i := range opts.Messages {
-						opts.Messages[i].Images = nil
-					}
-				}
-
 				newMessage.Content = msg
 				newMessage.Images = images
 			}
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -451,14 +451,27 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		sequence := strings.Join(seq.pendingResponses, "")

 		if ok, stop := findStop(sequence, seq.stop); ok {
-			slog.Debug("hit stop token", "stop", seq.stop)
+			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)

-			trimCacheLen := len(seq.pendingResponses) - 1
-			seq.pendingResponses = truncateStop(seq.pendingResponses, stop)
-			trimCacheLen -= len(seq.pendingResponses)
+			var tokenTruncated bool
+			origLen := len(seq.pendingResponses)
+			seq.pendingResponses, tokenTruncated = truncateStop(seq.pendingResponses, stop)
+			newLen := len(seq.pendingResponses)
+
+			// Update the cache based on the tokens that will be returned:
+			// - We have 1 token more than is currently in the cache because
+			// the last one generated wasn't submitted to Decode
+			// - Remove any stop sequences that we stripped out
+			// - If truncateStop removed a portion of a token, drop that
+			// - As defense-in-depth, if truncateStop didn't find a stop token
+			// remove the extra one that we added to the cache len
+			tokenLen := len(seq.cache.Inputs) + 1
+			tokenLen -= origLen - newLen
+			if tokenTruncated || origLen == newLen {
+				tokenLen--
+			}
+			seq.cache.Inputs = seq.cache.Inputs[:tokenLen]

-			// remove any tokens from the cache that we don't actually return
-			seq.cache.Inputs = seq.cache.Inputs[:len(seq.cache.Inputs)-trimCacheLen]
 			s.removeSequence(i, "stop")
 			continue
 		}
--- a/llama/runner/stop.go
+++ b/llama/runner/stop.go
@@ -28,13 +28,13 @@ func containsStopSuffix(sequence string, stops []string) bool {

 // truncateStop removes the provided stop string from pieces,
 // returning the partial pieces with stop removed, including truncating
-// the last piece if required
-func truncateStop(pieces []string, stop string) []string {
+// the last piece if required (and signalling if this was the case)
+func truncateStop(pieces []string, stop string) ([]string, bool) {
 	joined := strings.Join(pieces, "")

 	index := strings.Index(joined, stop)
 	if index == -1 {
-		return pieces
+		return pieces, false
 	}

 	joined = joined[:index]
@@ -46,6 +46,7 @@ func truncateStop(pieces []string, stop string) []string {
 	}

 	var result []string
+	tokenTruncated := false
 	start := 0
 	for _, length := range lengths {
 		if start >= len(joined) {
@@ -55,12 +56,13 @@ func truncateStop(pieces []string, stop string) []string {
 		end := start + length
 		if end > len(joined) {
 			end = len(joined)
+			tokenTruncated = true
 		}
 		result = append(result, joined[start:end])
 		start = end
 	}

-	return result
+	return result, tokenTruncated
 }

 func incompleteUnicode(token string) bool {
--- a/llama/runner/stop_test.go
+++ b/llama/runner/stop_test.go
@@ -7,42 +7,54 @@ import (

 func TestTruncateStop(t *testing.T) {
 	tests := []struct {
-		name     string
-		pieces   []string
-		stop     string
-		expected []string
+		name          string
+		pieces        []string
+		stop          string
+		expected      []string
+		expectedTrunc bool
 	}{
 		{
-			name:     "Single word",
-			pieces:   []string{"hello", "world"},
-			stop:     "world",
-			expected: []string{"hello"},
+			name:          "Single word",
+			pieces:        []string{"hello", "world"},
+			stop:          "world",
+			expected:      []string{"hello"},
+			expectedTrunc: false,
 		},
 		{
-			name:     "Partial",
-			pieces:   []string{"hello", "wor"},
-			stop:     "or",
-			expected: []string{"hello", "w"},
+			name:          "Partial",
+			pieces:        []string{"hello", "wor"},
+			stop:          "or",
+			expected:      []string{"hello", "w"},
+			expectedTrunc: true,
 		},
 		{
-			name:     "Suffix",
-			pieces:   []string{"Hello", " there", "!"},
-			stop:     "!",
-			expected: []string{"Hello", " there"},
+			name:          "Suffix",
+			pieces:        []string{"Hello", " there", "!"},
+			stop:          "!",
+			expected:      []string{"Hello", " there"},
+			expectedTrunc: false,
 		},
 		{
-			name:     "Middle",
-			pieces:   []string{"hello", " wor"},
-			stop:     "llo w",
-			expected: []string{"he"},
+			name:          "Suffix partial",
+			pieces:        []string{"Hello", " the", "re!"},
+			stop:          "there!",
+			expected:      []string{"Hello", " "},
+			expectedTrunc: true,
+		},
+		{
+			name:          "Middle",
+			pieces:        []string{"hello", " wor"},
+			stop:          "llo w",
+			expected:      []string{"he"},
+			expectedTrunc: true,
 		},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result := truncateStop(tt.pieces, tt.stop)
-			if !reflect.DeepEqual(result, tt.expected) {
-				t.Errorf("truncateStop(%v, %s): have %v; want %v", tt.pieces, tt.stop, result, tt.expected)
+			result, resultTrunc := truncateStop(tt.pieces, tt.stop)
+			if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
+				t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
 			}
 		})
 	}
--- a/llm/server.go
+++ b/llm/server.go
@@ -1086,10 +1086,13 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
 }

 func (s *llmServer) Close() error {
+	s.modelLock.Lock()
 	if s.model != nil {
 		llama.FreeModel(s.model)
 		s.model = nil
 	}
+	s.modelLock.Unlock()
+
 	if s.cmd != nil {
 		slog.Debug("stopping llama server")
 		if err := s.cmd.Process.Kill(); err != nil {
@@ -1100,7 +1103,6 @@ func (s *llmServer) Close() error {
 			slog.Debug("waiting for llama server to exit")
 			<-s.done
 		}
-		s.cmd = nil

 		slog.Debug("llama server stopped")
 	}
Author	SHA1	Message	Date
Jesse Gross	7fe3902552	cli: Send all images in conversation history Currently the CLI only sends images from the most recent image-containing message. This prevents doing things like sending one message with an image and then a follow-up message with a second image and asking for a comparison based on additional information not present in any text that was output. It's possible that some models have a problem with this but the CLI is not the right place to do this since any adjustments are model-specific and should affect all clients. Both llava:34b and minicpm-v do reasonable things with multiple images in the history.	2024-10-10 11:21:51 -07:00
Jesse Gross	0077e22d52	runner.go: Handle truncation of tokens for stop sequences When a single token contains both text to be returned and a stop sequence, this causes an out of bounds error when we update the cache to match our text. This is because we currently assume that removing the stop sequence will consume at least one token. This also inverts the logic to deal with positive numbers, rather than a value to be subtracted, which is easier to reason about. Fixes #7153	2024-10-09 20:39:04 -07:00
Jesse Gross	03408f3437	server: Don't clear cmd when closing a server Close can be called on an LLM server if the runner subprocess dies. However, the Ollama scheduler code may not know about this yet and still try to access it. In this case, it is important that 'cmd' is still available as it is used to check on the status of the subprocess. If this happens, Kill may be called twice on the subprocess - that is fine. In addition, model unloading may race with new accesses, so we should hold a lock around this. This may result in the model being reloaded after the first close call - this is also fine as close will be called again later.	2024-10-09 20:39:04 -07:00
Daniel Hiltgen	cd7e01e8b9	fix vendoring attribute for metal (#7156) Add missing metal files to vendoring list	2024-10-09 15:22:36 -07:00
Daniel Hiltgen	7a962bd802	fix vendoring attribute (#7155) Expand out the file extensions for vendored code so git reports the status correctly	2024-10-09 14:21:02 -07:00