From f82d00af74e8874861397ac6a05f35988cc45a52 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Thu, 18 Dec 2025 14:07:31 -0800 Subject: [PATCH] openai: do not separate images into their own messages - Change the case []any: branch to accumulate text parts and images into slices first, then create a single api.Message at the end - Multiple text parts are joined with a blank line ("\n\n") - All images are collected into a single Images slice - Tool calls, tool name, tool call ID, and reasoning are set directly on that single message instead of being patched onto the last appended message --- openai/openai.go | 29 ++++++++++++++++------------- openai/openai_test.go | 36 ++++++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/openai/openai.go b/openai/openai.go index 9dcba3000..5b4662a7e 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -463,6 +463,8 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { } messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls, ToolName: toolName, ToolCallID: msg.ToolCallID}) case []any: + var texts []string + var images []api.ImageData for _, c := range content { data, ok := c.(map[string]any) if !ok { @@ -474,7 +476,7 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { if !ok { return nil, errors.New("invalid message format") } - messages = append(messages, api.Message{Role: msg.Role, Content: text}) + texts = append(texts, text) case "image_url": var url string if urlMap, ok := data["image_url"].(map[string]any); ok { @@ -492,23 +494,24 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { return nil, err } - messages = append(messages, api.Message{Role: msg.Role, Images: []api.ImageData{img}}) + images = append(images, img) default: return nil, errors.New("invalid message format") } } - // since we might have added multiple messages above, if we have tools - // calls we'll add them to the last message - if len(messages) > 0 && len(msg.ToolCalls) > 0 { - toolCalls, err := FromCompletionToolCall(msg.ToolCalls) - if 
err != nil { - return nil, err - } - messages[len(messages)-1].ToolCalls = toolCalls - messages[len(messages)-1].ToolName = toolName - messages[len(messages)-1].ToolCallID = msg.ToolCallID - messages[len(messages)-1].Thinking = msg.Reasoning + toolCalls, err := FromCompletionToolCall(msg.ToolCalls) + if err != nil { + return nil, err } + messages = append(messages, api.Message{ + Role: msg.Role, + Content: strings.Join(texts, "\n\n"), + Images: images, + Thinking: msg.Reasoning, + ToolCalls: toolCalls, + ToolName: toolName, + ToolCallID: msg.ToolCallID, + }) default: // content is only optional if tool calls are present if msg.ToolCalls == nil { diff --git a/openai/openai_test.go b/openai/openai_test.go index 51e243dec..683c2e447 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -41,7 +41,7 @@ func TestFromChatRequest_Basic(t *testing.T) { } } -func TestFromChatRequest_WithImage(t *testing.T) { +func TestFromChatRequest_MultiPartContent(t *testing.T) { imgData, _ := base64.StdEncoding.DecodeString(image) req := ChatCompletionRequest{ @@ -50,7 +50,12 @@ func TestFromChatRequest_WithImage(t *testing.T) { { Role: "user", Content: []any{ - map[string]any{"type": "text", "text": "Hello"}, + map[string]any{"type": "text", "text": "First part."}, + map[string]any{"type": "text", "text": "Second part."}, + map[string]any{ + "type": "image_url", + "image_url": map[string]any{"url": prefix + image}, + }, map[string]any{ "type": "image_url", "image_url": map[string]any{"url": prefix + image}, @@ -65,20 +70,31 @@ func TestFromChatRequest_WithImage(t *testing.T) { t.Fatalf("unexpected error: %v", err) } - if len(result.Messages) != 2 { - t.Fatalf("expected 2 messages, got %d", len(result.Messages)) + // Multi-part content array should produce a single message per OpenAI spec + if len(result.Messages) != 1 { + t.Fatalf("expected 1 message, got %d", len(result.Messages)) } - if result.Messages[0].Content != "Hello" { - t.Errorf("expected first message content 
'Hello', got %q", result.Messages[0].Content) + msg := result.Messages[0] + if msg.Role != "user" { + t.Errorf("expected role 'user', got %q", msg.Role) } - if len(result.Messages[1].Images) != 1 { - t.Fatalf("expected 1 image, got %d", len(result.Messages[1].Images)) + // Multiple text parts should be joined + expectedContent := "First part.\n\nSecond part." + if msg.Content != expectedContent { + t.Errorf("expected content %q, got %q", expectedContent, msg.Content) } - if string(result.Messages[1].Images[0]) != string(imgData) { - t.Error("image data mismatch") + // Multiple images should be in the same message + if len(msg.Images) != 2 { + t.Fatalf("expected 2 images, got %d", len(msg.Images)) + } + + for i, img := range msg.Images { + if string(img) != string(imgData) { + t.Errorf("image %d data mismatch", i) + } } }