readme: add GPTranslate to community integrations (#11071 )

2025-06-14 08:54:03 -07:00
7 changed files with 231 additions and 378 deletions
--- a/README.md
+++ b/README.md
@@ -407,6 +407,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
 - [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
 - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 
+- [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)

 ### Cloud

--- a/llm/server.go
+++ b/llm/server.go
@@ -22,7 +22,6 @@ import (
 	"strings"
 	"sync"
 	"time"
-	"unicode/utf8"

 	"golang.org/x/sync/semaphore"

@@ -726,68 +725,10 @@ type CompletionResponse struct {
 	EvalDuration       time.Duration `json:"eval_duration"`
 }

-// unicodeBufferHandler wraps a completion response callback to handle partial UTF-8 sequences.
-// This function creates a stateful closure that is NOT safe for concurrent use.
-// Each completion request should create its own handler instance.
-func unicodeBufferHandler(fn func(CompletionResponse)) func(CompletionResponse) {
-	var pendingUTF8 string
-
-	return func(resp CompletionResponse) {
-		if resp.Content == "" && !resp.Done {
-			// No content to process, just pass through
-			fn(resp)
-			return
-		}
-
-		// Combine any pending UTF-8 with current content
-		combinedContent := pendingUTF8 + resp.Content
-		pendingUTF8 = ""
-
-		// Check if combined content is valid UTF-8
-		if utf8.ValidString(combinedContent) {
-			// Valid UTF-8, send it
-			resp.Content = combinedContent
-			fn(resp)
-		} else {
-			// Invalid UTF-8
-			if resp.Done {
-				// This is the final response, trim incomplete UTF-8
-				trimmedContent := combinedContent
-				for !utf8.ValidString(trimmedContent) && len(trimmedContent) > 0 {
-					trimmedContent = trimmedContent[:len(trimmedContent)-1]
-				}
-				resp.Content = trimmedContent
-				fn(resp)
-			} else {
-				// Not final response, split valid and invalid parts
-				validPrefix := combinedContent
-				for !utf8.ValidString(validPrefix) && len(validPrefix) > 0 {
-					validPrefix = validPrefix[:len(validPrefix)-1]
-				}
-
-				if len(validPrefix) > 0 {
-					// Send valid prefix
-					resp.Content = validPrefix
-					fn(resp)
-					// Buffer the remainder
-					pendingUTF8 = combinedContent[len(validPrefix):]
-				} else {
-					// No valid prefix, buffer everything
-					pendingUTF8 = combinedContent
-					// Don't send this response
-				}
-			}
-		}
-	}
-}
-
 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 	slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
 	slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)

-	// Wrap the callback with unicode buffer handling
-	unicodeFn := unicodeBufferHandler(fn)
-
 	if len(req.Format) > 0 {
 		switch string(req.Format) {
 		case `null`, `""`:
@@ -913,13 +854,13 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 			}

 			if c.Content != "" {
-				unicodeFn(CompletionResponse{
+				fn(CompletionResponse{
 					Content: c.Content,
 				})
 			}

 			if c.Done {
-				unicodeFn(c)
+				fn(c)
 				return nil
 			}
 		}
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -70,152 +70,3 @@ func TestLLMServerCompletionFormat(t *testing.T) {
 	}, nil)
 	checkValid(err)
 }
-
-func TestUnicodeBufferHandler(t *testing.T) {
-	tests := []struct {
-		name              string
-		inputResponses    []CompletionResponse
-		expectedResponses []CompletionResponse
-		description       string
-	}{
-		{
-			name: "complete_unicode",
-			inputResponses: []CompletionResponse{
-				{Content: "Hello", Done: false},
-				{Content: " world", Done: false},
-				{Content: "!", Done: true},
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Hello", Done: false},
-				{Content: " world", Done: false},
-				{Content: "!", Done: true},
-			},
-			description: "All responses with valid unicode should pass through unchanged",
-		},
-		{
-			name: "incomplete_unicode_at_end_with_done",
-			inputResponses: []CompletionResponse{
-				{Content: "Hello", Done: false},
-				{Content: string([]byte{0xF0, 0x9F}), Done: true}, // Incomplete emoji with Done=true
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Hello", Done: false},
-				{Content: "", Done: true}, // Content is trimmed but response is still sent with Done=true
-			},
-			description: "When Done=true, incomplete Unicode at the end should be trimmed",
-		},
-		{
-			name: "split_unicode_across_responses",
-			inputResponses: []CompletionResponse{
-				{Content: "Hello " + string([]byte{0xF0, 0x9F}), Done: false}, // First part of 😀
-				{Content: string([]byte{0x98, 0x80}) + " world!", Done: true}, // Second part of 😀 and more text
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Hello ", Done: false},  // Incomplete Unicode trimmed
-				{Content: "😀 world!", Done: true}, // Complete emoji in second response
-			},
-			description: "Unicode split across responses should be handled correctly",
-		},
-		{
-			name: "incomplete_unicode_buffered",
-			inputResponses: []CompletionResponse{
-				{Content: "Test " + string([]byte{0xF0, 0x9F}), Done: false}, // Incomplete emoji
-				{Content: string([]byte{0x98, 0x80}), Done: false},           // Complete the emoji
-				{Content: " done", Done: true},
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Test ", Done: false}, // First part without incomplete unicode
-				{Content: "😀", Done: false},     // Complete emoji
-				{Content: " done", Done: true},
-			},
-			description: "Incomplete unicode should be buffered and combined with next response",
-		},
-		{
-			name: "empty_response_with_done",
-			inputResponses: []CompletionResponse{
-				{Content: "Complete response", Done: false},
-				{Content: "", Done: true}, // Empty response with Done=true
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Complete response", Done: false},
-				{Content: "", Done: true}, // Should still be sent because Done=true
-			},
-			description: "Empty final response with Done=true should still be sent",
-		},
-		{
-			name: "done_reason_preserved",
-			inputResponses: []CompletionResponse{
-				{Content: "Response", Done: false},
-				{Content: " complete", Done: true, DoneReason: DoneReasonStop},
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Response", Done: false},
-				{Content: " complete", Done: true, DoneReason: DoneReasonStop},
-			},
-			description: "DoneReason should be preserved in the final response",
-		},
-		{
-			name: "only_incomplete_unicode_not_done",
-			inputResponses: []CompletionResponse{
-				{Content: string([]byte{0xF0, 0x9F}), Done: false}, // Only incomplete unicode
-			},
-			expectedResponses: []CompletionResponse{
-				// No response expected - should be buffered
-			},
-			description: "Response with only incomplete unicode should be buffered if not done",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			var actualResponses []CompletionResponse
-
-			// Create a callback that collects responses
-			callback := func(resp CompletionResponse) {
-				actualResponses = append(actualResponses, resp)
-			}
-
-			// Create the unicode buffer handler
-			handler := unicodeBufferHandler(callback)
-
-			// Send all input responses through the handler
-			for _, resp := range tt.inputResponses {
-				handler(resp)
-			}
-
-			// Verify the number of responses
-			if len(actualResponses) != len(tt.expectedResponses) {
-				t.Fatalf("%s: got %d responses, want %d responses",
-					tt.description, len(actualResponses), len(tt.expectedResponses))
-			}
-
-			// Verify each response matches the expected one
-			for i, expected := range tt.expectedResponses {
-				if i >= len(actualResponses) {
-					t.Fatalf("%s: missing response at index %d", tt.description, i)
-					continue
-				}
-
-				actual := actualResponses[i]
-
-				// Verify content
-				if actual.Content != expected.Content {
-					t.Errorf("%s: response[%d].Content = %q, want %q",
-						tt.description, i, actual.Content, expected.Content)
-				}
-
-				// Verify Done flag
-				if actual.Done != expected.Done {
-					t.Errorf("%s: response[%d].Done = %v, want %v",
-						tt.description, i, actual.Done, expected.Done)
-				}
-
-				// Verify DoneReason if specified
-				if actual.DoneReason != expected.DoneReason {
-					t.Errorf("%s: response[%d].DoneReason = %v, want %v",
-						tt.description, i, actual.DoneReason, expected.DoneReason)
-				}
-			}
-		})
-	}
-}
--- a/runner/common/stop.go
+++ b/runner/common/stop.go
@@ -2,8 +2,6 @@ package common

 import (
 	"strings"
-
-	"github.com/ollama/ollama/llm"
 )

 func FindStop(sequence string, stops []string) (bool, string) {
@@ -31,41 +29,68 @@ func ContainsStopSuffix(sequence string, stops []string) bool {
 // truncateStop removes the provided stop string from pieces,
 // returning the partial pieces with stop removed, including truncating
 // the last piece if required (and signalling if this was the case)
-func TruncateStop(resps []llm.CompletionResponse, stop string) ([]llm.CompletionResponse, bool) {
-	var sequence string
-	for _, resp := range resps {
-		sequence += resp.Content
+func TruncateStop(pieces []string, stop string) ([]string, bool) {
+	joined := strings.Join(pieces, "")
+
+	index := strings.Index(joined, stop)
+	if index == -1 {
+		return pieces, false
 	}

-	idx := strings.Index(sequence, stop)
-	if idx < 0 {
-		return resps, false
+	joined = joined[:index]
+
+	// Split truncated string back into pieces of original lengths
+	lengths := make([]int, len(pieces))
+	for i, piece := range pieces {
+		lengths[i] = len(piece)
 	}

-	truncated := sequence[:idx]
-	if len(truncated) == 0 {
-		return nil, true
-	}
-
-	result := make([]llm.CompletionResponse, 0, len(resps))
-
-	// Track position in truncated sequence
-	pos := 0
-	truncationHappened := false
-	for _, resp := range resps {
-		if pos >= len(truncated) {
+	var result []string
+	tokenTruncated := false
+	start := 0
+	for _, length := range lengths {
+		if start >= len(joined) {
 			break
 		}

-		chunk := truncated[pos:min(pos+len(resp.Content), len(truncated))]
-		if len(chunk) < len(resp.Content) {
-			truncationHappened = true
+		end := start + length
+		if end > len(joined) {
+			end = len(joined)
+			tokenTruncated = true
 		}
-		if len(chunk) > 0 {
-			result = append(result, llm.CompletionResponse{Content: chunk})
-		}
-		pos += len(resp.Content)
+		result = append(result, joined[start:end])
+		start = end
 	}

-	return result, truncationHappened
+	return result, tokenTruncated
+}
+
+func IncompleteUnicode(token string) bool {
+	incomplete := false
+
+	// check if there is incomplete UTF-8 character at the end
+	for i := 1; i < 5 && i <= len(token); i++ {
+		c := token[len(token)-i]
+
+		if (c & 0xc0) == 0x80 {
+			// continuation byte: 10xxxxxx
+			continue
+		}
+
+		if (c & 0xe0) == 0xc0 {
+			// 2-byte character: 110xxxxx ...
+			incomplete = i < 2
+		} else if (c & 0xf0) == 0xe0 {
+			// 3-byte character: 1110xxxx ...
+			incomplete = i < 3
+		} else if (c & 0xf8) == 0xf0 {
+			// 4-byte character: 11110xxx ...
+			incomplete = i < 4
+		}
+
+		// else 1-byte character or invalid byte
+		break
+	}
+
+	return incomplete
 }
--- a/runner/common/stop_test.go
+++ b/runner/common/stop_test.go
@@ -1,84 +1,51 @@
 package common

 import (
-	"fmt"
 	"reflect"
 	"testing"
-
-	"github.com/ollama/ollama/llm"
 )

 func TestTruncateStop(t *testing.T) {
 	tests := []struct {
 		name          string
-		pieces        []llm.CompletionResponse
+		pieces        []string
 		stop          string
-		expected      []llm.CompletionResponse
+		expected      []string
 		expectedTrunc bool
 	}{
 		{
-			name: "Single word",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: "world"},
-			},
-			stop: "world",
-			expected: []llm.CompletionResponse{
-				{Content: "Hello"},
-			},
+			name:          "Single word",
+			pieces:        []string{"hello", "world"},
+			stop:          "world",
+			expected:      []string{"hello"},
 			expectedTrunc: false,
 		},
 		{
-			name: "Partial",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " wor"},
-			},
-			stop: "or",
-			expected: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " w"},
-			},
+			name:          "Partial",
+			pieces:        []string{"hello", "wor"},
+			stop:          "or",
+			expected:      []string{"hello", "w"},
 			expectedTrunc: true,
 		},
 		{
-			name: "Suffix",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " there"},
-				{Content: "!"},
-			},
-			stop: "!",
-			expected: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " there"},
-			},
+			name:          "Suffix",
+			pieces:        []string{"Hello", " there", "!"},
+			stop:          "!",
+			expected:      []string{"Hello", " there"},
 			expectedTrunc: false,
 		},
 		{
-			name: "Suffix partial",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " the"},
-				{Content: "re!"},
-			},
-			stop: "there!",
-			expected: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " "},
-			},
+			name:          "Suffix partial",
+			pieces:        []string{"Hello", " the", "re!"},
+			stop:          "there!",
+			expected:      []string{"Hello", " "},
 			expectedTrunc: true,
 		},
 		{
-			name: "Middle",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " wo"},
-			},
-			stop: "llo w",
-			expected: []llm.CompletionResponse{
-				{Content: "He"},
-			},
+			name:          "Middle",
+			pieces:        []string{"hello", " wor"},
+			stop:          "llo w",
+			expected:      []string{"he"},
 			expectedTrunc: true,
 		},
 	}
@@ -87,23 +54,76 @@ func TestTruncateStop(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			result, resultTrunc := TruncateStop(tt.pieces, tt.stop)
 			if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
-				t.Errorf("truncateStop(%v, %v):\n%shave truncated %v\nwant truncated %v",
-					tt.pieces, tt.stop, formatContentDiff(result, tt.expected), resultTrunc, tt.expectedTrunc)
+				t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
 			}
 		})
 	}
 }

-func formatContentDiff(result, expected []llm.CompletionResponse) string {
-	var s string
-	for i := 0; i < len(result) || i < len(expected); i++ {
-		if i < len(result) && i < len(expected) && result[i].Content != expected[i].Content {
-			s += fmt.Sprintf("[%d] %q vs %q\n", i, result[i].Content, expected[i].Content)
-		} else if i < len(result) && i >= len(expected) {
-			s += fmt.Sprintf("[%d] extra %q\n", i, result[i].Content)
-		} else if i >= len(result) && i < len(expected) {
-			s += fmt.Sprintf("[%d] missing %q\n", i, expected[i].Content)
-		}
+func TestIncompleteUnicode(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected bool
+	}{
+		{
+			name:     "Basic",
+			input:    "hi",
+			expected: false,
+		},
+		{
+			name:     "Two byte",
+			input:    "hi" + string([]byte{0xc2, 0xa3}),
+			expected: false,
+		},
+		{
+			name:     "Two byte - missing last",
+			input:    "hi" + string([]byte{0xc2}),
+			expected: true,
+		},
+		{
+			name:     "Three byte",
+			input:    "hi" + string([]byte{0xe0, 0xA0, 0x80}),
+			expected: false,
+		},
+		{
+			name:     "Three byte - missing last",
+			input:    "hi" + string([]byte{0xe0, 0xA0}),
+			expected: true,
+		},
+		{
+			name:     "Three byte - missing last 2",
+			input:    "hi" + string([]byte{0xe0}),
+			expected: true,
+		},
+		{
+			name:     "Four byte",
+			input:    "hi" + string([]byte{0xf0, 0x92, 0x8a, 0xb7}),
+			expected: false,
+		},
+		{
+			name:     "Four byte - missing last",
+			input:    "hi" + string([]byte{0xf0, 0x92, 0x8a}),
+			expected: true,
+		},
+		{
+			name:     "Four byte - missing last 2",
+			input:    "hi" + string([]byte{0xf0, 0x92}),
+			expected: true,
+		},
+		{
+			name:     "Four byte - missing last 3",
+			input:    "hi" + string([]byte{0xf0}),
+			expected: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := IncompleteUnicode(tt.input)
+			if result != tt.expected {
+				t.Errorf("incompleteUnicode(%s): have %v; want %v", tt.input, result, tt.expected)
+			}
+		})
 	}
-	return s
 }
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -17,6 +17,7 @@ import (
 	"strings"
 	"sync"
 	"time"
+	"unicode/utf8"

 	"golang.org/x/sync/semaphore"

@@ -51,13 +52,13 @@ type Sequence struct {
 	pendingInputs []input

 	// tokens that have been generated but not returned yet (e.g. for stop sequences)
-	pendingResponses []llm.CompletionResponse
+	pendingResponses []string

 	// input cache being used by this sequence
 	cache *InputCacheSlot

 	// channel to send responses over
-	responses chan llm.CompletionResponse
+	responses chan string

 	// channel to stop decoding (such as if the remote connection is closed)
 	quit chan bool
@@ -88,19 +89,6 @@ type Sequence struct {
 	numPromptInputs     int
 }

-func (seq *Sequence) send(resp llm.CompletionResponse) bool {
-	if len(resp.Content) > 0 || resp.Done {
-		select {
-		case seq.responses <- resp:
-			// Successfully sent
-			return true
-		case <-seq.quit:
-			return false
-		}
-	}
-	return true
-}
-
 type NewSequenceParams struct {
 	numPredict     int
 	stop           []string
@@ -159,8 +147,8 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 		numPromptInputs:     len(inputs),
 		startProcessingTime: startTime,
 		numPredict:          params.numPredict,
-		pendingResponses:    make([]llm.CompletionResponse, 0),
-		responses:           make(chan llm.CompletionResponse, 100),
+		pendingResponses:    make([]string, 0),
+		responses:           make(chan string, 100),
 		quit:                make(chan bool, 1),
 		embedding:           make(chan []float32, 1),
 		samplingCtx:         sc,
@@ -284,15 +272,36 @@ func (s *Server) allNil() bool {
 	return true
 }

+func flushPending(seq *Sequence) bool {
+	joined := strings.Join(seq.pendingResponses, "")
+	seq.pendingResponses = []string{}
+
+	// Check if there are any partial UTF-8 characters remaining.
+	// We already check and queue as we are generating but some may
+	// still make it here:
+	// - Sequence is ending, e.g. generation limit has been hit
+	// - Invalid characters in the middle of a string
+	// This is a stricter check to ensure we never output invalid Unicode.
+	for !utf8.ValidString(joined) {
+		joined = joined[:len(joined)-1]
+	}
+
+	if len(joined) == 0 {
+		return true
+	}
+
+	select {
+	case seq.responses <- joined:
+		return true
+	case <-seq.quit:
+		return false
+	}
+}
+
 func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
 	seq := s.seqs[seqIndex]

-	// Send any remaining pending responses
-	for _, resp := range seq.pendingResponses {
-		seq.send(resp)
-	}
-	seq.pendingResponses = []llm.CompletionResponse{}
-
+	flushPending(seq)
 	seq.doneReason = reason
 	close(seq.responses)
 	close(seq.embedding)
@@ -481,11 +490,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)

 		seq.inputs = []input{{token: token}}

-		seq.pendingResponses = append(seq.pendingResponses, llm.CompletionResponse{Content: piece})
-		sequence := ""
-		for _, r := range seq.pendingResponses {
-			sequence += r.Content
-		}
+		seq.pendingResponses = append(seq.pendingResponses, piece)
+		sequence := strings.Join(seq.pendingResponses, "")

 		if ok, stop := common.FindStop(sequence, seq.stop); ok {
 			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
@@ -517,13 +523,13 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}

-		for _, resp := range seq.pendingResponses {
-			if !seq.send(resp) {
-				s.removeSequence(i, llm.DoneReasonConnectionClosed)
-				break
-			}
+		if common.IncompleteUnicode(sequence) {
+			continue
+		}
+
+		if !flushPending(seq) {
+			s.removeSequence(i, llm.DoneReasonConnectionClosed)
 		}
-		seq.pendingResponses = []llm.CompletionResponse{}
 	}

 	return nil
@@ -621,7 +627,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		case content, ok := <-seq.responses:
 			if ok {
-				if err := json.NewEncoder(w).Encode(&content); err != nil {
+				if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
+					Content: content,
+				}); err != nil {
 					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 					close(seq.quit)
 					return
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -20,6 +20,7 @@ import (
 	"strings"
 	"sync"
 	"time"
+	"unicode/utf8"

 	"golang.org/x/image/bmp"
 	"golang.org/x/sync/semaphore"
@@ -55,13 +56,13 @@ type Sequence struct {
 	pendingInputs []input.Input

 	// tokens that have been generated but not returned yet (e.g. for stop sequences)
-	pendingResponses []llm.CompletionResponse
+	pendingResponses []string

 	// input cache being used by this sequence
 	cache *InputCacheSlot

 	// channel to send responses over
-	responses chan llm.CompletionResponse
+	responses chan string

 	// channel to stop decoding (such as if the remote connection is closed)
 	quit chan bool
@@ -93,19 +94,6 @@ type Sequence struct {
 	numPromptInputs     int
 }

-func (seq *Sequence) send(resp llm.CompletionResponse) bool {
-	if len(resp.Content) > 0 || resp.Done {
-		select {
-		case seq.responses <- resp:
-			// Successfully sent
-			return true
-		case <-seq.quit:
-			return false
-		}
-	}
-	return true
-}
-
 type NewSequenceParams struct {
 	numPredict int
 	stop       []string
@@ -179,8 +167,8 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 		numPromptInputs:     len(inputs),
 		startProcessingTime: startTime,
 		numPredict:          params.numPredict,
-		pendingResponses:    make([]llm.CompletionResponse, 0),
-		responses:           make(chan llm.CompletionResponse, 100),
+		pendingResponses:    make([]string, 0),
+		responses:           make(chan string, 100),
 		quit:                make(chan bool, 1),
 		embedding:           make(chan []float32, 1),
 		sampler:             params.sampler,
@@ -325,15 +313,36 @@ func (s *Server) allNil() bool {
 	return true
 }

+func flushPending(seq *Sequence) bool {
+	joined := strings.Join(seq.pendingResponses, "")
+	seq.pendingResponses = []string{}
+
+	// Check if there are any partial UTF-8 characters remaining.
+	// We already check and queue as we are generating but some may
+	// still make it here:
+	// - Sequence is ending, e.g. generation limit has been hit
+	// - Invalid characters in the middle of a string
+	// This is a stricter check to ensure we never output invalid Unicode.
+	for !utf8.ValidString(joined) {
+		joined = joined[:len(joined)-1]
+	}
+
+	if len(joined) == 0 {
+		return true
+	}
+
+	select {
+	case seq.responses <- joined:
+		return true
+	case <-seq.quit:
+		return false
+	}
+}
+
 func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
 	seq := s.seqs[seqIndex]

-	// Send any remaining pending responses
-	for _, resp := range seq.pendingResponses {
-		seq.send(resp)
-	}
-	seq.pendingResponses = []llm.CompletionResponse{}
-
+	flushPending(seq)
 	seq.doneReason = reason
 	close(seq.responses)
 	close(seq.embedding)
@@ -532,11 +541,8 @@ func (s *Server) processBatch() error {

 		seq.inputs = []input.Input{{Token: token}}

-		seq.pendingResponses = append(seq.pendingResponses, llm.CompletionResponse{Content: piece})
-		sequence := ""
-		for _, r := range seq.pendingResponses {
-			sequence += r.Content
-		}
+		seq.pendingResponses = append(seq.pendingResponses, piece)
+		sequence := strings.Join(seq.pendingResponses, "")

 		if ok, stop := common.FindStop(sequence, seq.stop); ok {
 			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
@@ -568,14 +574,13 @@ func (s *Server) processBatch() error {
 			continue
 		}

-		// Send all pending responses directly without unicode checking
-		for _, resp := range seq.pendingResponses {
-			if !seq.send(resp) {
-				s.removeSequence(i, llm.DoneReasonConnectionClosed)
-				break
-			}
+		if common.IncompleteUnicode(sequence) {
+			continue
+		}
+
+		if !flushPending(seq) {
+			s.removeSequence(i, llm.DoneReasonConnectionClosed)
 		}
-		seq.pendingResponses = []llm.CompletionResponse{}
 	}

 	return nil
@@ -678,7 +683,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		case content, ok := <-seq.responses:
 			if ok {
-				if err := json.NewEncoder(w).Encode(&content); err != nil {
+				if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
+					Content: content,
+				}); err != nil {
 					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 					close(seq.quit)
 					return