mirror of
https://gitee.com/wanwujie/sub2api
synced 2026-04-18 22:04:45 +08:00
feat: gpt->claude格式转换支持图片识别
This commit is contained in:
@@ -733,3 +733,188 @@ func TestAnthropicToResponses_ToolChoiceSpecific(t *testing.T) {
|
|||||||
require.True(t, ok)
|
require.True(t, ok)
|
||||||
assert.Equal(t, "get_weather", fn["name"])
|
assert.Equal(t, "get_weather", fn["name"])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Image content block conversion tests
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
func TestAnthropicToResponses_UserImageBlock(t *testing.T) {
|
||||||
|
req := &AnthropicRequest{
|
||||||
|
Model: "gpt-5.2",
|
||||||
|
MaxTokens: 1024,
|
||||||
|
Messages: []AnthropicMessage{
|
||||||
|
{Role: "user", Content: json.RawMessage(`[
|
||||||
|
{"type":"text","text":"What is in this image?"},
|
||||||
|
{"type":"image","source":{"type":"base64","media_type":"image/png","data":"iVBOR"}}
|
||||||
|
]`)},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := AnthropicToResponses(req)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
var items []ResponsesInputItem
|
||||||
|
require.NoError(t, json.Unmarshal(resp.Input, &items))
|
||||||
|
require.Len(t, items, 1)
|
||||||
|
assert.Equal(t, "user", items[0].Role)
|
||||||
|
|
||||||
|
var parts []ResponsesContentPart
|
||||||
|
require.NoError(t, json.Unmarshal(items[0].Content, &parts))
|
||||||
|
require.Len(t, parts, 2)
|
||||||
|
assert.Equal(t, "input_text", parts[0].Type)
|
||||||
|
assert.Equal(t, "What is in this image?", parts[0].Text)
|
||||||
|
assert.Equal(t, "input_image", parts[1].Type)
|
||||||
|
assert.Equal(t, "data:image/png;base64,iVBOR", parts[1].ImageURL)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAnthropicToResponses_ImageOnlyUserMessage(t *testing.T) {
|
||||||
|
req := &AnthropicRequest{
|
||||||
|
Model: "gpt-5.2",
|
||||||
|
MaxTokens: 1024,
|
||||||
|
Messages: []AnthropicMessage{
|
||||||
|
{Role: "user", Content: json.RawMessage(`[
|
||||||
|
{"type":"image","source":{"type":"base64","media_type":"image/jpeg","data":"/9j/4AAQ"}}
|
||||||
|
]`)},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := AnthropicToResponses(req)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
var items []ResponsesInputItem
|
||||||
|
require.NoError(t, json.Unmarshal(resp.Input, &items))
|
||||||
|
require.Len(t, items, 1)
|
||||||
|
|
||||||
|
var parts []ResponsesContentPart
|
||||||
|
require.NoError(t, json.Unmarshal(items[0].Content, &parts))
|
||||||
|
require.Len(t, parts, 1)
|
||||||
|
assert.Equal(t, "input_image", parts[0].Type)
|
||||||
|
assert.Equal(t, "data:image/jpeg;base64,/9j/4AAQ", parts[0].ImageURL)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAnthropicToResponses_ToolResultWithImage(t *testing.T) {
|
||||||
|
req := &AnthropicRequest{
|
||||||
|
Model: "gpt-5.2",
|
||||||
|
MaxTokens: 1024,
|
||||||
|
Messages: []AnthropicMessage{
|
||||||
|
{Role: "user", Content: json.RawMessage(`"Read the screenshot"`)},
|
||||||
|
{Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"toolu_1","name":"Read","input":{"file_path":"/tmp/screen.png"}}]`)},
|
||||||
|
{Role: "user", Content: json.RawMessage(`[
|
||||||
|
{"type":"tool_result","tool_use_id":"toolu_1","content":[
|
||||||
|
{"type":"image","source":{"type":"base64","media_type":"image/png","data":"iVBOR"}}
|
||||||
|
]}
|
||||||
|
]`)},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := AnthropicToResponses(req)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
var items []ResponsesInputItem
|
||||||
|
require.NoError(t, json.Unmarshal(resp.Input, &items))
|
||||||
|
// user + function_call + function_call_output + user(image) = 4
|
||||||
|
require.Len(t, items, 4)
|
||||||
|
|
||||||
|
// function_call_output should have text-only output (no image).
|
||||||
|
assert.Equal(t, "function_call_output", items[2].Type)
|
||||||
|
assert.Equal(t, "fc_toolu_1", items[2].CallID)
|
||||||
|
assert.Equal(t, "(empty)", items[2].Output)
|
||||||
|
|
||||||
|
// Image should be in a separate user message.
|
||||||
|
assert.Equal(t, "user", items[3].Role)
|
||||||
|
var parts []ResponsesContentPart
|
||||||
|
require.NoError(t, json.Unmarshal(items[3].Content, &parts))
|
||||||
|
require.Len(t, parts, 1)
|
||||||
|
assert.Equal(t, "input_image", parts[0].Type)
|
||||||
|
assert.Equal(t, "data:image/png;base64,iVBOR", parts[0].ImageURL)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAnthropicToResponses_ToolResultMixed(t *testing.T) {
|
||||||
|
req := &AnthropicRequest{
|
||||||
|
Model: "gpt-5.2",
|
||||||
|
MaxTokens: 1024,
|
||||||
|
Messages: []AnthropicMessage{
|
||||||
|
{Role: "user", Content: json.RawMessage(`"Describe the file"`)},
|
||||||
|
{Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"toolu_2","name":"Read","input":{"file_path":"/tmp/photo.png"}}]`)},
|
||||||
|
{Role: "user", Content: json.RawMessage(`[
|
||||||
|
{"type":"tool_result","tool_use_id":"toolu_2","content":[
|
||||||
|
{"type":"text","text":"File metadata: 800x600 PNG"},
|
||||||
|
{"type":"image","source":{"type":"base64","media_type":"image/png","data":"AAAA"}}
|
||||||
|
]}
|
||||||
|
]`)},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := AnthropicToResponses(req)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
var items []ResponsesInputItem
|
||||||
|
require.NoError(t, json.Unmarshal(resp.Input, &items))
|
||||||
|
// user + function_call + function_call_output + user(image) = 4
|
||||||
|
require.Len(t, items, 4)
|
||||||
|
|
||||||
|
// function_call_output should have text-only output.
|
||||||
|
assert.Equal(t, "function_call_output", items[2].Type)
|
||||||
|
assert.Equal(t, "File metadata: 800x600 PNG", items[2].Output)
|
||||||
|
|
||||||
|
// Image should be in a separate user message.
|
||||||
|
assert.Equal(t, "user", items[3].Role)
|
||||||
|
var parts []ResponsesContentPart
|
||||||
|
require.NoError(t, json.Unmarshal(items[3].Content, &parts))
|
||||||
|
require.Len(t, parts, 1)
|
||||||
|
assert.Equal(t, "input_image", parts[0].Type)
|
||||||
|
assert.Equal(t, "data:image/png;base64,AAAA", parts[0].ImageURL)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAnthropicToResponses_TextOnlyToolResultBackwardCompat(t *testing.T) {
|
||||||
|
req := &AnthropicRequest{
|
||||||
|
Model: "gpt-5.2",
|
||||||
|
MaxTokens: 1024,
|
||||||
|
Messages: []AnthropicMessage{
|
||||||
|
{Role: "user", Content: json.RawMessage(`"Check weather"`)},
|
||||||
|
{Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"call_1","name":"get_weather","input":{"city":"NYC"}}]`)},
|
||||||
|
{Role: "user", Content: json.RawMessage(`[
|
||||||
|
{"type":"tool_result","tool_use_id":"call_1","content":[
|
||||||
|
{"type":"text","text":"Sunny, 72°F"}
|
||||||
|
]}
|
||||||
|
]`)},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := AnthropicToResponses(req)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
var items []ResponsesInputItem
|
||||||
|
require.NoError(t, json.Unmarshal(resp.Input, &items))
|
||||||
|
// user + function_call + function_call_output = 3
|
||||||
|
require.Len(t, items, 3)
|
||||||
|
|
||||||
|
// Text-only tool_result should produce a plain string.
|
||||||
|
assert.Equal(t, "Sunny, 72°F", items[2].Output)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAnthropicToResponses_ImageEmptyMediaType(t *testing.T) {
|
||||||
|
req := &AnthropicRequest{
|
||||||
|
Model: "gpt-5.2",
|
||||||
|
MaxTokens: 1024,
|
||||||
|
Messages: []AnthropicMessage{
|
||||||
|
{Role: "user", Content: json.RawMessage(`[
|
||||||
|
{"type":"image","source":{"type":"base64","media_type":"","data":"iVBOR"}}
|
||||||
|
]`)},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := AnthropicToResponses(req)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
var items []ResponsesInputItem
|
||||||
|
require.NoError(t, json.Unmarshal(resp.Input, &items))
|
||||||
|
require.Len(t, items, 1)
|
||||||
|
|
||||||
|
var parts []ResponsesContentPart
|
||||||
|
require.NoError(t, json.Unmarshal(items[0].Content, &parts))
|
||||||
|
require.Len(t, parts, 1)
|
||||||
|
assert.Equal(t, "input_image", parts[0].Type)
|
||||||
|
// Should default to image/png when media_type is empty.
|
||||||
|
assert.Equal(t, "data:image/png;base64,iVBOR", parts[0].ImageURL)
|
||||||
|
}
|
||||||
|
|||||||
@@ -169,7 +169,7 @@ func anthropicMsgToResponsesItems(m AnthropicMessage) ([]ResponsesInputItem, err
|
|||||||
|
|
||||||
// anthropicUserToResponses handles an Anthropic user message. Content can be a
|
// anthropicUserToResponses handles an Anthropic user message. Content can be a
|
||||||
// plain string or an array of blocks. tool_result blocks are extracted into
|
// plain string or an array of blocks. tool_result blocks are extracted into
|
||||||
// function_call_output items.
|
// function_call_output items. Image blocks are converted to input_image parts.
|
||||||
func anthropicUserToResponses(raw json.RawMessage) ([]ResponsesInputItem, error) {
|
func anthropicUserToResponses(raw json.RawMessage) ([]ResponsesInputItem, error) {
|
||||||
// Try plain string.
|
// Try plain string.
|
||||||
var s string
|
var s string
|
||||||
@@ -184,28 +184,46 @@ func anthropicUserToResponses(raw json.RawMessage) ([]ResponsesInputItem, error)
|
|||||||
}
|
}
|
||||||
|
|
||||||
var out []ResponsesInputItem
|
var out []ResponsesInputItem
|
||||||
|
var toolResultImageParts []ResponsesContentPart
|
||||||
|
|
||||||
// Extract tool_result blocks → function_call_output items.
|
// Extract tool_result blocks → function_call_output items.
|
||||||
|
// Images inside tool_results are extracted separately because the
|
||||||
|
// Responses API function_call_output.output only accepts strings.
|
||||||
for _, b := range blocks {
|
for _, b := range blocks {
|
||||||
if b.Type != "tool_result" {
|
if b.Type != "tool_result" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
text := extractAnthropicToolResultText(b)
|
outputText, imageParts := convertToolResultOutput(b)
|
||||||
if text == "" {
|
|
||||||
// OpenAI Responses API requires "output" field; use placeholder for empty results.
|
|
||||||
text = "(empty)"
|
|
||||||
}
|
|
||||||
out = append(out, ResponsesInputItem{
|
out = append(out, ResponsesInputItem{
|
||||||
Type: "function_call_output",
|
Type: "function_call_output",
|
||||||
CallID: toResponsesCallID(b.ToolUseID),
|
CallID: toResponsesCallID(b.ToolUseID),
|
||||||
Output: text,
|
Output: outputText,
|
||||||
})
|
})
|
||||||
|
toolResultImageParts = append(toolResultImageParts, imageParts...)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remaining text blocks → user message.
|
// Remaining text + image blocks → user message with content parts.
|
||||||
text := extractAnthropicTextFromBlocks(blocks)
|
// Also include images extracted from tool_results so the model can see them.
|
||||||
if text != "" {
|
var parts []ResponsesContentPart
|
||||||
content, _ := json.Marshal(text)
|
for _, b := range blocks {
|
||||||
|
switch b.Type {
|
||||||
|
case "text":
|
||||||
|
if b.Text != "" {
|
||||||
|
parts = append(parts, ResponsesContentPart{Type: "input_text", Text: b.Text})
|
||||||
|
}
|
||||||
|
case "image":
|
||||||
|
if uri := anthropicImageToDataURI(b.Source); uri != "" {
|
||||||
|
parts = append(parts, ResponsesContentPart{Type: "input_image", ImageURL: uri})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parts = append(parts, toolResultImageParts...)
|
||||||
|
|
||||||
|
if len(parts) > 0 {
|
||||||
|
content, err := json.Marshal(parts)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
out = append(out, ResponsesInputItem{Role: "user", Content: content})
|
out = append(out, ResponsesInputItem{Role: "user", Content: content})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -290,26 +308,64 @@ func fromResponsesCallID(id string) string {
|
|||||||
return id
|
return id
|
||||||
}
|
}
|
||||||
|
|
||||||
// extractAnthropicToolResultText gets the text content from a tool_result block.
|
// anthropicImageToDataURI converts an AnthropicImageSource to a data URI string.
|
||||||
func extractAnthropicToolResultText(b AnthropicContentBlock) string {
|
// Returns "" if the source is nil or has no data.
|
||||||
if len(b.Content) == 0 {
|
func anthropicImageToDataURI(src *AnthropicImageSource) string {
|
||||||
|
if src == nil || src.Data == "" {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
mediaType := src.MediaType
|
||||||
|
if mediaType == "" {
|
||||||
|
mediaType = "image/png"
|
||||||
|
}
|
||||||
|
return "data:" + mediaType + ";base64," + src.Data
|
||||||
|
}
|
||||||
|
|
||||||
|
// convertToolResultOutput extracts text and image content from a tool_result
|
||||||
|
// block. Returns the text as a string for the function_call_output Output
|
||||||
|
// field, plus any image parts that must be sent in a separate user message
|
||||||
|
// (the Responses API output field only accepts strings).
|
||||||
|
func convertToolResultOutput(b AnthropicContentBlock) (string, []ResponsesContentPart) {
|
||||||
|
if len(b.Content) == 0 {
|
||||||
|
return "(empty)", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try plain string content.
|
||||||
var s string
|
var s string
|
||||||
if err := json.Unmarshal(b.Content, &s); err == nil {
|
if err := json.Unmarshal(b.Content, &s); err == nil {
|
||||||
return s
|
if s == "" {
|
||||||
|
s = "(empty)"
|
||||||
|
}
|
||||||
|
return s, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Array of content blocks — may contain text and/or images.
|
||||||
var inner []AnthropicContentBlock
|
var inner []AnthropicContentBlock
|
||||||
if err := json.Unmarshal(b.Content, &inner); err == nil {
|
if err := json.Unmarshal(b.Content, &inner); err != nil {
|
||||||
var parts []string
|
return "(empty)", nil
|
||||||
for _, ib := range inner {
|
}
|
||||||
if ib.Type == "text" && ib.Text != "" {
|
|
||||||
parts = append(parts, ib.Text)
|
// Separate text (for function_call_output) from images (for user message).
|
||||||
|
var textParts []string
|
||||||
|
var imageParts []ResponsesContentPart
|
||||||
|
for _, ib := range inner {
|
||||||
|
switch ib.Type {
|
||||||
|
case "text":
|
||||||
|
if ib.Text != "" {
|
||||||
|
textParts = append(textParts, ib.Text)
|
||||||
|
}
|
||||||
|
case "image":
|
||||||
|
if uri := anthropicImageToDataURI(ib.Source); uri != "" {
|
||||||
|
imageParts = append(imageParts, ResponsesContentPart{Type: "input_image", ImageURL: uri})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return strings.Join(parts, "\n\n")
|
|
||||||
}
|
}
|
||||||
return ""
|
|
||||||
|
text := strings.Join(textParts, "\n\n")
|
||||||
|
if text == "" {
|
||||||
|
text = "(empty)"
|
||||||
|
}
|
||||||
|
return text, imageParts
|
||||||
}
|
}
|
||||||
|
|
||||||
// extractAnthropicTextFromBlocks joins all text blocks, ignoring thinking/
|
// extractAnthropicTextFromBlocks joins all text blocks, ignoring thinking/
|
||||||
|
|||||||
@@ -47,6 +47,9 @@ type AnthropicContentBlock struct {
|
|||||||
// type=thinking
|
// type=thinking
|
||||||
Thinking string `json:"thinking,omitempty"`
|
Thinking string `json:"thinking,omitempty"`
|
||||||
|
|
||||||
|
// type=image
|
||||||
|
Source *AnthropicImageSource `json:"source,omitempty"`
|
||||||
|
|
||||||
// type=tool_use
|
// type=tool_use
|
||||||
ID string `json:"id,omitempty"`
|
ID string `json:"id,omitempty"`
|
||||||
Name string `json:"name,omitempty"`
|
Name string `json:"name,omitempty"`
|
||||||
@@ -58,6 +61,13 @@ type AnthropicContentBlock struct {
|
|||||||
IsError bool `json:"is_error,omitempty"`
|
IsError bool `json:"is_error,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AnthropicImageSource is the "source" object of an Anthropic image content
// block: it names how the image is encoded, its media type, and the payload.
type AnthropicImageSource struct {
	// Type is the source encoding, e.g. "base64".
	Type string `json:"type"`
	// MediaType is the image MIME type, e.g. "image/png" or "image/jpeg".
	MediaType string `json:"media_type"`
	// Data is the encoded image payload.
	Data string `json:"data"`
}
|
||||||
|
|
||||||
// AnthropicTool describes a tool available to the model.
|
// AnthropicTool describes a tool available to the model.
|
||||||
type AnthropicTool struct {
|
type AnthropicTool struct {
|
||||||
Type string `json:"type,omitempty"` // e.g. "web_search_20250305" for server tools
|
Type string `json:"type,omitempty"` // e.g. "web_search_20250305" for server tools
|
||||||
@@ -176,8 +186,9 @@ type ResponsesInputItem struct {
|
|||||||
|
|
||||||
// ResponsesContentPart is one typed entry in the content array of a Responses
// message. Which of the optional fields is populated depends on Type.
type ResponsesContentPart struct {
	// Type is the part kind: "input_text" | "output_text" | "input_image".
	Type string `json:"type"`
	// Text is the textual payload; omitted from JSON when empty.
	Text string `json:"text,omitempty"`
	// ImageURL is the data URI for input_image parts; omitted from JSON when empty.
	ImageURL string `json:"image_url,omitempty"`
}
|
||||||
|
|
||||||
// ResponsesTool describes a tool in the Responses API.
|
// ResponsesTool describes a tool in the Responses API.
|
||||||
|
|||||||
Reference in New Issue
Block a user