feat: gpt->claude格式转换支持图片识别

2026-04-18 22:04:45 +08:00 · 2026-03-08 23:16:58 +08:00
parent a2ae9f1f27
commit 00c151b463
3 changed files with 276 additions and 24 deletions
--- a/backend/internal/pkg/apicompat/anthropic_responses_test.go
+++ b/backend/internal/pkg/apicompat/anthropic_responses_test.go
@@ -733,3 +733,188 @@ func TestAnthropicToResponses_ToolChoiceSpecific(t *testing.T) {
 	require.True(t, ok)
 	assert.Equal(t, "get_weather", fn["name"])
 }
 // ---------------------------------------------------------------------------
 // Image content block conversion tests
 // ---------------------------------------------------------------------------
 // TestAnthropicToResponses_UserImageBlock checks that a user message holding a
 // text block followed by a base64 image block converts to a single user item
 // whose content is an input_text part and an input_image part, in that order.
 func TestAnthropicToResponses_UserImageBlock(t *testing.T) {
 	req := &AnthropicRequest{
 		Model:     "gpt-5.2",
 		MaxTokens: 1024,
 		Messages: []AnthropicMessage{
 			{Role: "user", Content: json.RawMessage(`[
 				{"type":"text","text":"What is in this image?"},
 				{"type":"image","source":{"type":"base64","media_type":"image/png","data":"iVBOR"}}
 			]`)},
 		},
 	}
 	resp, err := AnthropicToResponses(req)
 	require.NoError(t, err)
 	// Both blocks belong to one message, so exactly one input item is expected.
 	var items []ResponsesInputItem
 	require.NoError(t, json.Unmarshal(resp.Input, &items))
 	require.Len(t, items, 1)
 	assert.Equal(t, "user", items[0].Role)
 	// The item's content is decoded as an array of typed parts.
 	var parts []ResponsesContentPart
 	require.NoError(t, json.Unmarshal(items[0].Content, &parts))
 	require.Len(t, parts, 2)
 	assert.Equal(t, "input_text", parts[0].Type)
 	assert.Equal(t, "What is in this image?", parts[0].Text)
 	// The image is expected as a data URI built from media_type and data.
 	assert.Equal(t, "input_image", parts[1].Type)
 	assert.Equal(t, "data:image/png;base64,iVBOR", parts[1].ImageURL)
 }
 // TestAnthropicToResponses_ImageOnlyUserMessage checks that a user message
 // containing only an image block (no text) still produces one user item with a
 // single input_image part.
 func TestAnthropicToResponses_ImageOnlyUserMessage(t *testing.T) {
 	req := &AnthropicRequest{
 		Model:     "gpt-5.2",
 		MaxTokens: 1024,
 		Messages: []AnthropicMessage{
 			{Role: "user", Content: json.RawMessage(`[
 				{"type":"image","source":{"type":"base64","media_type":"image/jpeg","data":"/9j/4AAQ"}}
 			]`)},
 		},
 	}
 	resp, err := AnthropicToResponses(req)
 	require.NoError(t, err)
 	var items []ResponsesInputItem
 	require.NoError(t, json.Unmarshal(resp.Input, &items))
 	require.Len(t, items, 1)
 	var parts []ResponsesContentPart
 	require.NoError(t, json.Unmarshal(items[0].Content, &parts))
 	require.Len(t, parts, 1)
 	assert.Equal(t, "input_image", parts[0].Type)
 	// The JPEG media type from the source must appear in the data URI.
 	assert.Equal(t, "data:image/jpeg;base64,/9j/4AAQ", parts[0].ImageURL)
 }
 // TestAnthropicToResponses_ToolResultWithImage checks a tool_result whose
 // content is a single image: the function_call_output carries the "(empty)"
 // placeholder and the image is emitted in a following user item.
 func TestAnthropicToResponses_ToolResultWithImage(t *testing.T) {
 	req := &AnthropicRequest{
 		Model:     "gpt-5.2",
 		MaxTokens: 1024,
 		Messages: []AnthropicMessage{
 			{Role: "user", Content: json.RawMessage(`"Read the screenshot"`)},
 			{Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"toolu_1","name":"Read","input":{"file_path":"/tmp/screen.png"}}]`)},
 			{Role: "user", Content: json.RawMessage(`[
 				{"type":"tool_result","tool_use_id":"toolu_1","content":[
 					{"type":"image","source":{"type":"base64","media_type":"image/png","data":"iVBOR"}}
 				]}
 			]`)},
 		},
 	}
 	resp, err := AnthropicToResponses(req)
 	require.NoError(t, err)
 	var items []ResponsesInputItem
 	require.NoError(t, json.Unmarshal(resp.Input, &items))
 	// user + function_call + function_call_output + user(image) = 4
 	require.Len(t, items, 4)
 	// function_call_output should have text-only output (no image).
 	assert.Equal(t, "function_call_output", items[2].Type)
 	// The expected call ID is the tool_use ID "toolu_1" with an "fc_" prefix.
 	assert.Equal(t, "fc_toolu_1", items[2].CallID)
 	assert.Equal(t, "(empty)", items[2].Output)
 	// Image should be in a separate user message.
 	assert.Equal(t, "user", items[3].Role)
 	var parts []ResponsesContentPart
 	require.NoError(t, json.Unmarshal(items[3].Content, &parts))
 	require.Len(t, parts, 1)
 	assert.Equal(t, "input_image", parts[0].Type)
 	assert.Equal(t, "data:image/png;base64,iVBOR", parts[0].ImageURL)
 }
 // TestAnthropicToResponses_ToolResultMixed checks a tool_result containing
 // both text and an image: the text becomes the function_call_output output
 // and the image is emitted in a following user item.
 func TestAnthropicToResponses_ToolResultMixed(t *testing.T) {
 	req := &AnthropicRequest{
 		Model:     "gpt-5.2",
 		MaxTokens: 1024,
 		Messages: []AnthropicMessage{
 			{Role: "user", Content: json.RawMessage(`"Describe the file"`)},
 			{Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"toolu_2","name":"Read","input":{"file_path":"/tmp/photo.png"}}]`)},
 			{Role: "user", Content: json.RawMessage(`[
 				{"type":"tool_result","tool_use_id":"toolu_2","content":[
 					{"type":"text","text":"File metadata: 800x600 PNG"},
 					{"type":"image","source":{"type":"base64","media_type":"image/png","data":"AAAA"}}
 				]}
 			]`)},
 		},
 	}
 	resp, err := AnthropicToResponses(req)
 	require.NoError(t, err)
 	var items []ResponsesInputItem
 	require.NoError(t, json.Unmarshal(resp.Input, &items))
 	// user + function_call + function_call_output + user(image) = 4
 	require.Len(t, items, 4)
 	// function_call_output should have text-only output.
 	assert.Equal(t, "function_call_output", items[2].Type)
 	assert.Equal(t, "File metadata: 800x600 PNG", items[2].Output)
 	// Image should be in a separate user message.
 	assert.Equal(t, "user", items[3].Role)
 	var parts []ResponsesContentPart
 	require.NoError(t, json.Unmarshal(items[3].Content, &parts))
 	// Only the image is expected in the user item; the text stays in the output.
 	require.Len(t, parts, 1)
 	assert.Equal(t, "input_image", parts[0].Type)
 	assert.Equal(t, "data:image/png;base64,AAAA", parts[0].ImageURL)
 }
 // TestAnthropicToResponses_TextOnlyToolResultBackwardCompat checks that a
 // tool_result with only text content yields a plain-string output and no
 // trailing user item.
 func TestAnthropicToResponses_TextOnlyToolResultBackwardCompat(t *testing.T) {
 	req := &AnthropicRequest{
 		Model:     "gpt-5.2",
 		MaxTokens: 1024,
 		Messages: []AnthropicMessage{
 			{Role: "user", Content: json.RawMessage(`"Check weather"`)},
 			{Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"call_1","name":"get_weather","input":{"city":"NYC"}}]`)},
 			{Role: "user", Content: json.RawMessage(`[
 				{"type":"tool_result","tool_use_id":"call_1","content":[
 					{"type":"text","text":"Sunny, 72°F"}
 				]}
 			]`)},
 		},
 	}
 	resp, err := AnthropicToResponses(req)
 	require.NoError(t, err)
 	var items []ResponsesInputItem
 	require.NoError(t, json.Unmarshal(resp.Input, &items))
 	// user + function_call + function_call_output = 3
 	require.Len(t, items, 3)
 	// Text-only tool_result should produce a plain string.
 	assert.Equal(t, "Sunny, 72°F", items[2].Output)
 }
 // TestAnthropicToResponses_ImageEmptyMediaType checks that an image block with
 // an empty media_type is still converted, with image/png used in the data URI.
 func TestAnthropicToResponses_ImageEmptyMediaType(t *testing.T) {
 	req := &AnthropicRequest{
 		Model:     "gpt-5.2",
 		MaxTokens: 1024,
 		Messages: []AnthropicMessage{
 			{Role: "user", Content: json.RawMessage(`[
 				{"type":"image","source":{"type":"base64","media_type":"","data":"iVBOR"}}
 			]`)},
 		},
 	}
 	resp, err := AnthropicToResponses(req)
 	require.NoError(t, err)
 	var items []ResponsesInputItem
 	require.NoError(t, json.Unmarshal(resp.Input, &items))
 	require.Len(t, items, 1)
 	var parts []ResponsesContentPart
 	require.NoError(t, json.Unmarshal(items[0].Content, &parts))
 	require.Len(t, parts, 1)
 	assert.Equal(t, "input_image", parts[0].Type)
 	// Should default to image/png when media_type is empty.
 	assert.Equal(t, "data:image/png;base64,iVBOR", parts[0].ImageURL)
 }
--- a/backend/internal/pkg/apicompat/anthropic_to_responses.go
+++ b/backend/internal/pkg/apicompat/anthropic_to_responses.go
@@ -169,7 +169,7 @@ func anthropicMsgToResponsesItems(m AnthropicMessage) ([]ResponsesInputItem, err
 // anthropicUserToResponses handles an Anthropic user message. Content can be a
 // plain string or an array of blocks. tool_result blocks are extracted into
-// function_call_output items.
+// function_call_output items. Image blocks are converted to input_image parts.
 func anthropicUserToResponses(raw json.RawMessage) ([]ResponsesInputItem, error) {
 	// Try plain string.
 	var s string
@@ -184,28 +184,46 @@ func anthropicUserToResponses(raw json.RawMessage) ([]ResponsesInputItem, error)
 	}
 	var out []ResponsesInputItem
 	var toolResultImageParts []ResponsesContentPart
 	// Extract tool_result blocks → function_call_output items.
 	// Images inside tool_results are extracted separately because the
 	// Responses API function_call_output.output only accepts strings.
 	for _, b := range blocks {
 		if b.Type != "tool_result" {
 			continue
 		}
-		text := extractAnthropicToolResultText(b)
+		outputText, imageParts := convertToolResultOutput(b)
-		if text == "" {
-			// OpenAI Responses API requires "output" field; use placeholder for empty results.
-			text = "(empty)"
-		}
 		out = append(out, ResponsesInputItem{
 			Type:   "function_call_output",
 			CallID: toResponsesCallID(b.ToolUseID),
-			Output: text,
+			Output: outputText,
 		})
 		toolResultImageParts = append(toolResultImageParts, imageParts...)
 	}
-	// Remaining text blocks → user message.
+	// Remaining text + image blocks → user message with content parts.
-	text := extractAnthropicTextFromBlocks(blocks)
+	// Also include images extracted from tool_results so the model can see them.
-	if text != "" {
+	var parts []ResponsesContentPart
-		content, _ := json.Marshal(text)
+	for _, b := range blocks {
 		switch b.Type {
 		case "text":
 			if b.Text != "" {
 				parts = append(parts, ResponsesContentPart{Type: "input_text", Text: b.Text})
 			}
 		case "image":
 			if uri := anthropicImageToDataURI(b.Source); uri != "" {
 				parts = append(parts, ResponsesContentPart{Type: "input_image", ImageURL: uri})
 			}
 		}
 	}
 	parts = append(parts, toolResultImageParts...)
 	if len(parts) > 0 {
 		content, err := json.Marshal(parts)
 		if err != nil {
 			return nil, err
 		}
 		out = append(out, ResponsesInputItem{Role: "user", Content: content})
 	}
@@ -290,26 +308,64 @@ func fromResponsesCallID(id string) string {
 	return id
 }
-// extractAnthropicToolResultText gets the text content from a tool_result block.
+// anthropicImageToDataURI converts an AnthropicImageSource to a data URI string.
-func extractAnthropicToolResultText(b AnthropicContentBlock) string {
+// Returns "" if the source is nil or has no data.
-	if len(b.Content) == 0 {
+func anthropicImageToDataURI(src *AnthropicImageSource) string {
 	// Without a source or payload there is nothing to encode; callers in this
 	// file treat "" as "skip this block".
 	if src == nil || src.Data == "" {
 		return ""
 	}
 	mediaType := src.MediaType
 	if mediaType == "" {
 		// Fall back to PNG when the source omits media_type.
 		mediaType = "image/png"
 	}
 	// Data is appended verbatim after ";base64,"; src.Type is not inspected here.
 	return "data:" + mediaType + ";base64," + src.Data
 }
 // convertToolResultOutput extracts text and image content from a tool_result
 // block. Returns the text as a string for the function_call_output Output
 // field, plus any image parts that must be sent in a separate user message
 // (the Responses API output field only accepts strings).
 //
 // The returned text is never empty: missing content, an empty string, content
 // that is neither a string nor an array of blocks, and an array without any
 // non-empty text block all yield the placeholder "(empty)".
 func convertToolResultOutput(b AnthropicContentBlock) (string, []ResponsesContentPart) {
 	// No content at all: placeholder text, no images.
 	if len(b.Content) == 0 {
 		return "(empty)", nil
 	}
 	// Try plain string content.
 	var s string
 	if err := json.Unmarshal(b.Content, &s); err == nil {
-		return s
+		if s == "" {
 			s = "(empty)"
 		}
 		return s, nil
 	}
 	// Array of content blocks — may contain text and/or images.
 	var inner []AnthropicContentBlock
-	if err := json.Unmarshal(b.Content, &inner); err == nil {
+	if err := json.Unmarshal(b.Content, &inner); err != nil {
-		var parts []string
+		return "(empty)", nil
-		for _, ib := range inner {
+	}
-			if ib.Type == "text" && ib.Text != "" {
+
-				parts = append(parts, ib.Text)
+	// Separate text (for function_call_output) from images (for user message).
 	var textParts []string
 	var imageParts []ResponsesContentPart
 	for _, ib := range inner {
 		switch ib.Type {
 		case "text":
 			if ib.Text != "" {
 				textParts = append(textParts, ib.Text)
 			}
 		case "image":
 			// Images whose source is nil or has no data produce "" and are dropped.
 			if uri := anthropicImageToDataURI(ib.Source); uri != "" {
 				imageParts = append(imageParts, ResponsesContentPart{Type: "input_image", ImageURL: uri})
 			}
 		}
 		// NOTE(review): the next line references `parts`, which is declared only
 		// on a removed ("-") line above — it looks like a deleted line whose "-"
 		// marker was lost in this rendering; confirm against the commit.
 		return strings.Join(parts, "\n\n")
 	}
-	return ""
+
 	// Text blocks are joined with a blank line between them.
 	text := strings.Join(textParts, "\n\n")
 	if text == "" {
 		text = "(empty)"
 	}
 	return text, imageParts
 }
 // extractAnthropicTextFromBlocks joins all text blocks, ignoring thinking/
--- a/backend/internal/pkg/apicompat/types.go
+++ b/backend/internal/pkg/apicompat/types.go
@@ -47,6 +47,9 @@ type AnthropicContentBlock struct {
 	// type=thinking
 	Thinking string `json:"thinking,omitempty"`
 	// type=image
 	Source *AnthropicImageSource `json:"source,omitempty"`
 	// type=tool_use
 	ID    string          `json:"id,omitempty"`
 	Name  string          `json:"name,omitempty"`
@@ -58,6 +61,13 @@ type AnthropicContentBlock struct {
 	IsError   bool            `json:"is_error,omitempty"`
 }
 // AnthropicImageSource describes the source data for an image content block.
 // It is decoded from the "source" field of AnthropicContentBlock (type=image).
 type AnthropicImageSource struct {
 	Type      string `json:"type"` // "base64"
 	// MediaType is the MIME type placed in the generated data URI, e.g.
 	// "image/png"; anthropicImageToDataURI substitutes "image/png" when empty.
 	MediaType string `json:"media_type"`
 	// Data is the image payload, copied verbatim after ";base64," in the data
 	// URI; an empty value makes anthropicImageToDataURI return "".
 	Data      string `json:"data"`
 }
 // AnthropicTool describes a tool available to the model.
 type AnthropicTool struct {
 	Type        string          `json:"type,omitempty"` // e.g. "web_search_20250305" for server tools
@@ -176,8 +186,9 @@ type ResponsesInputItem struct {
 // ResponsesContentPart is a typed content part in a Responses message.
 // Text and ImageURL both carry omitempty tags, so whichever one is unset is
 // left out of the marshaled JSON.
 type ResponsesContentPart struct {
-	Type string `json:"type"` // "input_text" | "output_text" | "input_image"
+	Type     string `json:"type"` // "input_text" | "output_text" | "input_image"
-	Text string `json:"text,omitempty"`
+	Text     string `json:"text,omitempty"`
 	ImageURL string `json:"image_url,omitempty"` // data URI for input_image
 }
 // ResponsesTool describes a tool in the Responses API.