From 00c151b4630e8e5bb1bd6f353c184c325577e74c Mon Sep 17 00:00:00 2001 From: shaw Date: Sun, 8 Mar 2026 23:16:58 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20gpt->claude=E6=A0=BC=E5=BC=8F=E8=BD=AC?= =?UTF-8?q?=E6=8D=A2=E6=94=AF=E6=8C=81=E5=9B=BE=E7=89=87=E8=AF=86=E5=88=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pkg/apicompat/anthropic_responses_test.go | 185 ++++++++++++++++++ .../pkg/apicompat/anthropic_to_responses.go | 100 +++++++--- backend/internal/pkg/apicompat/types.go | 15 +- 3 files changed, 276 insertions(+), 24 deletions(-) diff --git a/backend/internal/pkg/apicompat/anthropic_responses_test.go b/backend/internal/pkg/apicompat/anthropic_responses_test.go index 60c54539..c4677aba 100644 --- a/backend/internal/pkg/apicompat/anthropic_responses_test.go +++ b/backend/internal/pkg/apicompat/anthropic_responses_test.go @@ -733,3 +733,188 @@ func TestAnthropicToResponses_ToolChoiceSpecific(t *testing.T) { require.True(t, ok) assert.Equal(t, "get_weather", fn["name"]) } + +// --------------------------------------------------------------------------- +// Image content block conversion tests +// --------------------------------------------------------------------------- + +func TestAnthropicToResponses_UserImageBlock(t *testing.T) { + req := &AnthropicRequest{ + Model: "gpt-5.2", + MaxTokens: 1024, + Messages: []AnthropicMessage{ + {Role: "user", Content: json.RawMessage(`[ + {"type":"text","text":"What is in this image?"}, + {"type":"image","source":{"type":"base64","media_type":"image/png","data":"iVBOR"}} + ]`)}, + }, + } + + resp, err := AnthropicToResponses(req) + require.NoError(t, err) + + var items []ResponsesInputItem + require.NoError(t, json.Unmarshal(resp.Input, &items)) + require.Len(t, items, 1) + assert.Equal(t, "user", items[0].Role) + + var parts []ResponsesContentPart + require.NoError(t, json.Unmarshal(items[0].Content, &parts)) + require.Len(t, parts, 2) + assert.Equal(t, "input_text", parts[0].Type) + assert.Equal(t, "What is in this image?", parts[0].Text) + assert.Equal(t, "input_image", parts[1].Type) + assert.Equal(t, "data:image/png;base64,iVBOR", parts[1].ImageURL) +} + +func TestAnthropicToResponses_ImageOnlyUserMessage(t *testing.T) { + req := &AnthropicRequest{ + Model: "gpt-5.2", + MaxTokens: 1024, + Messages: []AnthropicMessage{ + {Role: "user", Content: json.RawMessage(`[ + {"type":"image","source":{"type":"base64","media_type":"image/jpeg","data":"/9j/4AAQ"}} + ]`)}, + }, + } + + resp, err := AnthropicToResponses(req) + require.NoError(t, err) + + var items []ResponsesInputItem + require.NoError(t, json.Unmarshal(resp.Input, &items)) + require.Len(t, items, 1) + + var parts []ResponsesContentPart + require.NoError(t, json.Unmarshal(items[0].Content, &parts)) + require.Len(t, parts, 1) + assert.Equal(t, "input_image", parts[0].Type) + assert.Equal(t, "data:image/jpeg;base64,/9j/4AAQ", parts[0].ImageURL) +} + +func TestAnthropicToResponses_ToolResultWithImage(t *testing.T) { + req := &AnthropicRequest{ + Model: "gpt-5.2", + MaxTokens: 1024, + Messages: []AnthropicMessage{ + {Role: "user", Content: json.RawMessage(`"Read the screenshot"`)}, + {Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"toolu_1","name":"Read","input":{"file_path":"/tmp/screen.png"}}]`)}, + {Role: "user", Content: json.RawMessage(`[ + {"type":"tool_result","tool_use_id":"toolu_1","content":[ + {"type":"image","source":{"type":"base64","media_type":"image/png","data":"iVBOR"}} + ]} + ]`)}, + }, + } + + resp, err := AnthropicToResponses(req) + require.NoError(t, err) + + var items []ResponsesInputItem + require.NoError(t, json.Unmarshal(resp.Input, &items)) + // user + function_call + function_call_output + user(image) = 4 + require.Len(t, items, 4) + + // function_call_output should have text-only output (no image). + assert.Equal(t, "function_call_output", items[2].Type) + assert.Equal(t, "fc_toolu_1", items[2].CallID) + assert.Equal(t, "(empty)", items[2].Output) + + // Image should be in a separate user message. + assert.Equal(t, "user", items[3].Role) + var parts []ResponsesContentPart + require.NoError(t, json.Unmarshal(items[3].Content, &parts)) + require.Len(t, parts, 1) + assert.Equal(t, "input_image", parts[0].Type) + assert.Equal(t, "data:image/png;base64,iVBOR", parts[0].ImageURL) +} + +func TestAnthropicToResponses_ToolResultMixed(t *testing.T) { + req := &AnthropicRequest{ + Model: "gpt-5.2", + MaxTokens: 1024, + Messages: []AnthropicMessage{ + {Role: "user", Content: json.RawMessage(`"Describe the file"`)}, + {Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"toolu_2","name":"Read","input":{"file_path":"/tmp/photo.png"}}]`)}, + {Role: "user", Content: json.RawMessage(`[ + {"type":"tool_result","tool_use_id":"toolu_2","content":[ + {"type":"text","text":"File metadata: 800x600 PNG"}, + {"type":"image","source":{"type":"base64","media_type":"image/png","data":"AAAA"}} + ]} + ]`)}, + }, + } + + resp, err := AnthropicToResponses(req) + require.NoError(t, err) + + var items []ResponsesInputItem + require.NoError(t, json.Unmarshal(resp.Input, &items)) + // user + function_call + function_call_output + user(image) = 4 + require.Len(t, items, 4) + + // function_call_output should have text-only output. + assert.Equal(t, "function_call_output", items[2].Type) + assert.Equal(t, "File metadata: 800x600 PNG", items[2].Output) + + // Image should be in a separate user message. + assert.Equal(t, "user", items[3].Role) + var parts []ResponsesContentPart + require.NoError(t, json.Unmarshal(items[3].Content, &parts)) + require.Len(t, parts, 1) + assert.Equal(t, "input_image", parts[0].Type) + assert.Equal(t, "data:image/png;base64,AAAA", parts[0].ImageURL) +} + +func TestAnthropicToResponses_TextOnlyToolResultBackwardCompat(t *testing.T) { + req := &AnthropicRequest{ + Model: "gpt-5.2", + MaxTokens: 1024, + Messages: []AnthropicMessage{ + {Role: "user", Content: json.RawMessage(`"Check weather"`)}, + {Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"call_1","name":"get_weather","input":{"city":"NYC"}}]`)}, + {Role: "user", Content: json.RawMessage(`[ + {"type":"tool_result","tool_use_id":"call_1","content":[ + {"type":"text","text":"Sunny, 72°F"} + ]} + ]`)}, + }, + } + + resp, err := AnthropicToResponses(req) + require.NoError(t, err) + + var items []ResponsesInputItem + require.NoError(t, json.Unmarshal(resp.Input, &items)) + // user + function_call + function_call_output = 3 + require.Len(t, items, 3) + + // Text-only tool_result should produce a plain string. + assert.Equal(t, "Sunny, 72°F", items[2].Output) +} + +func TestAnthropicToResponses_ImageEmptyMediaType(t *testing.T) { + req := &AnthropicRequest{ + Model: "gpt-5.2", + MaxTokens: 1024, + Messages: []AnthropicMessage{ + {Role: "user", Content: json.RawMessage(`[ + {"type":"image","source":{"type":"base64","media_type":"","data":"iVBOR"}} + ]`)}, + }, + } + + resp, err := AnthropicToResponses(req) + require.NoError(t, err) + + var items []ResponsesInputItem + require.NoError(t, json.Unmarshal(resp.Input, &items)) + require.Len(t, items, 1) + + var parts []ResponsesContentPart + require.NoError(t, json.Unmarshal(items[0].Content, &parts)) + require.Len(t, parts, 1) + assert.Equal(t, "input_image", parts[0].Type) + // Should default to image/png when media_type is empty. + assert.Equal(t, "data:image/png;base64,iVBOR", parts[0].ImageURL) +} diff --git a/backend/internal/pkg/apicompat/anthropic_to_responses.go b/backend/internal/pkg/apicompat/anthropic_to_responses.go index f0af2936..09a6f227 100644 --- a/backend/internal/pkg/apicompat/anthropic_to_responses.go +++ b/backend/internal/pkg/apicompat/anthropic_to_responses.go @@ -169,7 +169,7 @@ func anthropicMsgToResponsesItems(m AnthropicMessage) ([]ResponsesInputItem, err // anthropicUserToResponses handles an Anthropic user message. Content can be a // plain string or an array of blocks. tool_result blocks are extracted into -// function_call_output items. +// function_call_output items. Image blocks are converted to input_image parts. func anthropicUserToResponses(raw json.RawMessage) ([]ResponsesInputItem, error) { // Try plain string. var s string @@ -184,28 +184,46 @@ func anthropicUserToResponses(raw json.RawMessage) ([]ResponsesInputItem, error) } var out []ResponsesInputItem + var toolResultImageParts []ResponsesContentPart // Extract tool_result blocks → function_call_output items. + // Images inside tool_results are extracted separately because the + // Responses API function_call_output.output only accepts strings. for _, b := range blocks { if b.Type != "tool_result" { continue } - text := extractAnthropicToolResultText(b) - if text == "" { - // OpenAI Responses API requires "output" field; use placeholder for empty results. - text = "(empty)" - } + outputText, imageParts := convertToolResultOutput(b) out = append(out, ResponsesInputItem{ Type: "function_call_output", CallID: toResponsesCallID(b.ToolUseID), - Output: text, + Output: outputText, }) + toolResultImageParts = append(toolResultImageParts, imageParts...) } - // Remaining text blocks → user message. - text := extractAnthropicTextFromBlocks(blocks) - if text != "" { - content, _ := json.Marshal(text) + // Remaining text + image blocks → user message with content parts. + // Also include images extracted from tool_results so the model can see them. + var parts []ResponsesContentPart + for _, b := range blocks { + switch b.Type { + case "text": + if b.Text != "" { + parts = append(parts, ResponsesContentPart{Type: "input_text", Text: b.Text}) + } + case "image": + if uri := anthropicImageToDataURI(b.Source); uri != "" { + parts = append(parts, ResponsesContentPart{Type: "input_image", ImageURL: uri}) + } + } + } + parts = append(parts, toolResultImageParts...) + + if len(parts) > 0 { + content, err := json.Marshal(parts) + if err != nil { + return nil, err + } out = append(out, ResponsesInputItem{Role: "user", Content: content}) } @@ -290,26 +308,64 @@ func fromResponsesCallID(id string) string { return id } -// extractAnthropicToolResultText gets the text content from a tool_result block. -func extractAnthropicToolResultText(b AnthropicContentBlock) string { - if len(b.Content) == 0 { +// anthropicImageToDataURI converts an AnthropicImageSource to a data URI string. +// Returns "" if the source is nil or has no data. +func anthropicImageToDataURI(src *AnthropicImageSource) string { + if src == nil || src.Data == "" { return "" } + mediaType := src.MediaType + if mediaType == "" { + mediaType = "image/png" + } + return "data:" + mediaType + ";base64," + src.Data +} + +// convertToolResultOutput extracts text and image content from a tool_result +// block. Returns the text as a string for the function_call_output Output +// field, plus any image parts that must be sent in a separate user message +// (the Responses API output field only accepts strings). +func convertToolResultOutput(b AnthropicContentBlock) (string, []ResponsesContentPart) { + if len(b.Content) == 0 { + return "(empty)", nil + } + + // Try plain string content. var s string if err := json.Unmarshal(b.Content, &s); err == nil { - return s + if s == "" { + s = "(empty)" + } + return s, nil } + + // Array of content blocks — may contain text and/or images. var inner []AnthropicContentBlock - if err := json.Unmarshal(b.Content, &inner); err == nil { - var parts []string - for _, ib := range inner { - if ib.Type == "text" && ib.Text != "" { - parts = append(parts, ib.Text) + if err := json.Unmarshal(b.Content, &inner); err != nil { + return "(empty)", nil + } + + // Separate text (for function_call_output) from images (for user message). + var textParts []string + var imageParts []ResponsesContentPart + for _, ib := range inner { + switch ib.Type { + case "text": + if ib.Text != "" { + textParts = append(textParts, ib.Text) + } + case "image": + if uri := anthropicImageToDataURI(ib.Source); uri != "" { + imageParts = append(imageParts, ResponsesContentPart{Type: "input_image", ImageURL: uri}) } } - return strings.Join(parts, "\n\n") } - return "" + + text := strings.Join(textParts, "\n\n") + if text == "" { + text = "(empty)" + } + return text, imageParts } // extractAnthropicTextFromBlocks joins all text blocks, ignoring thinking/ diff --git a/backend/internal/pkg/apicompat/types.go b/backend/internal/pkg/apicompat/types.go index 92c87540..bb9432ac 100644 --- a/backend/internal/pkg/apicompat/types.go +++ b/backend/internal/pkg/apicompat/types.go @@ -47,6 +47,9 @@ type AnthropicContentBlock struct { // type=thinking Thinking string `json:"thinking,omitempty"` + // type=image + Source *AnthropicImageSource `json:"source,omitempty"` + // type=tool_use ID string `json:"id,omitempty"` Name string `json:"name,omitempty"` @@ -58,6 +61,13 @@ type AnthropicContentBlock struct { IsError bool `json:"is_error,omitempty"` } +// AnthropicImageSource describes the source data for an image content block. +type AnthropicImageSource struct { + Type string `json:"type"` // "base64" + MediaType string `json:"media_type"` + Data string `json:"data"` +} + // AnthropicTool describes a tool available to the model. type AnthropicTool struct { Type string `json:"type,omitempty"` // e.g. "web_search_20250305" for server tools @@ -176,8 +186,9 @@ type ResponsesInputItem struct { // ResponsesContentPart is a typed content part in a Responses message. type ResponsesContentPart struct { - Type string `json:"type"` // "input_text" | "output_text" | "input_image" - Text string `json:"text,omitempty"` + Type string `json:"type"` // "input_text" | "output_text" | "input_image" + Text string `json:"text,omitempty"` + ImageURL string `json:"image_url,omitempty"` // data URI for input_image } // ResponsesTool describes a tool in the Responses API.