mirror of
https://gitee.com/wanwujie/sub2api
synced 2026-05-05 05:30:44 +08:00
fix(gateway): emit Anthropic-standard SSE error events and failover body
Two follow-ups to PR #2066's failover-wrap fix:

1. Failover ResponseBody (`UpstreamFailoverError.ResponseBody`) was encoded as `{"error": "<msg>"}` (string field). `ExtractUpstreamErrorMessage` probes for `error.message`, `detail`, or top-level `message` only — so `handleFailoverExhausted` and downstream passthrough rules saw an empty message, losing the EOF root cause in ops logs. Re-encode as the Anthropic standard shape `{"type":"error","error":{"type":"upstream_disconnected","message":"..."}}`. (Addresses the inline review comment from copilot-pull-request-reviewer on Wei-Shaw/sub2api#2066.)

2. The streaming `event: error` SSE frame for `response_too_large`, `stream_read_error`, and `stream_timeout` was non-standard (`{"error":"<reason>"}`). Anthropic SDKs (and Claude Code) expect `{"type":"error","error":{"type":"...","message":"..."}}` and parse `error.type`/`error.message` accordingly. Refactor `sendErrorEvent` to take both reason and message, and emit the standard frame so client SDKs surface a real diagnostic message instead of a generic stream error.

This does not by itself prevent task interruption on long-stream EOF (SSE has no resume; client-side retry remains the only complete fix), but it gives both server-side ops logs and client-side error UIs a meaningful upstream message so users know the next step is to retry.

Tests are updated to assert the new body shape on both branches, plus a new assertion that `ExtractUpstreamErrorMessage` returns a non-empty string.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -6871,14 +6871,31 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
|
|||||||
}
|
}
|
||||||
lastDataAt := time.Now()
|
lastDataAt := time.Now()
|
||||||
|
|
||||||
// 仅发送一次错误事件,避免多次写入导致协议混乱(写失败时尽力通知客户端)
|
// 仅发送一次错误事件,避免多次写入导致协议混乱(写失败时尽力通知客户端)。
|
||||||
|
// 事件格式遵循 Anthropic SSE 标准:{"type":"error","error":{"type":<reason>,"message":<message>}}
|
||||||
|
// 这样 Anthropic SDK / Claude Code 等客户端能按标准 error 类型解析,UI 能显示具体错误文案,
|
||||||
|
// 服务端 ExtractUpstreamErrorMessage 也能从透传的 body 中提取 message。
|
||||||
errorEventSent := false
|
errorEventSent := false
|
||||||
sendErrorEvent := func(reason string) {
|
sendErrorEvent := func(reason, message string) {
|
||||||
if errorEventSent {
|
if errorEventSent {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
errorEventSent = true
|
errorEventSent = true
|
||||||
_, _ = fmt.Fprintf(w, "event: error\ndata: {\"error\":\"%s\"}\n\n", reason)
|
if message == "" {
|
||||||
|
message = reason
|
||||||
|
}
|
||||||
|
body, err := json.Marshal(map[string]any{
|
||||||
|
"type": "error",
|
||||||
|
"error": map[string]string{
|
||||||
|
"type": reason,
|
||||||
|
"message": message,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
// json.Marshal 不可能在已知 string-only 输入上失败,保守 fallback
|
||||||
|
body = []byte(fmt.Sprintf(`{"type":"error","error":{"type":%q,"message":%q}}`, reason, message))
|
||||||
|
}
|
||||||
|
_, _ = fmt.Fprintf(w, "event: error\ndata: %s\n\n", body)
|
||||||
flusher.Flush()
|
flusher.Flush()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -7038,16 +7055,21 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
|
|||||||
// 客户端未断开,正常的错误处理
|
// 客户端未断开,正常的错误处理
|
||||||
if errors.Is(ev.err, bufio.ErrTooLong) {
|
if errors.Is(ev.err, bufio.ErrTooLong) {
|
||||||
logger.LegacyPrintf("service.gateway", "SSE line too long: account=%d max_size=%d error=%v", account.ID, maxLineSize, ev.err)
|
logger.LegacyPrintf("service.gateway", "SSE line too long: account=%d max_size=%d error=%v", account.ID, maxLineSize, ev.err)
|
||||||
sendErrorEvent("response_too_large")
|
sendErrorEvent("response_too_large", fmt.Sprintf("upstream SSE line exceeded %d bytes", maxLineSize))
|
||||||
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, ev.err
|
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, ev.err
|
||||||
}
|
}
|
||||||
// 上游中途读错误(unexpected EOF / connection reset 等,常见于 HTTP/2 GOAWAY):
|
// 上游中途读错误(unexpected EOF / connection reset 等,常见于 HTTP/2 GOAWAY):
|
||||||
// 若尚未向客户端写过任何字节,包成 UpstreamFailoverError 让 handler 层走 failover/重试。
|
// 若尚未向客户端写过任何字节,包成 UpstreamFailoverError 让 handler 层走 failover/重试。
|
||||||
// 已经开始写流时 SSE 协议无 resume,只能透传错误事件给客户端。
|
// 已经开始写流时 SSE 协议无 resume,只能透传错误事件给客户端。
|
||||||
|
disconnectMsg := fmt.Sprintf("upstream stream disconnected: %s", ev.err)
|
||||||
if !c.Writer.Written() {
|
if !c.Writer.Written() {
|
||||||
logger.LegacyPrintf("service.gateway", "Upstream stream read error before any client output (account=%d), failing over: %v", account.ID, ev.err)
|
logger.LegacyPrintf("service.gateway", "Upstream stream read error before any client output (account=%d), failing over: %v", account.ID, ev.err)
|
||||||
body, _ := json.Marshal(map[string]string{
|
body, _ := json.Marshal(map[string]any{
|
||||||
"error": fmt.Sprintf("upstream stream disconnected: %s", ev.err),
|
"type": "error",
|
||||||
|
"error": map[string]string{
|
||||||
|
"type": "upstream_disconnected",
|
||||||
|
"message": disconnectMsg,
|
||||||
|
},
|
||||||
})
|
})
|
||||||
return nil, &UpstreamFailoverError{
|
return nil, &UpstreamFailoverError{
|
||||||
StatusCode: http.StatusBadGateway,
|
StatusCode: http.StatusBadGateway,
|
||||||
@@ -7055,7 +7077,7 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
|
|||||||
RetryableOnSameAccount: true,
|
RetryableOnSameAccount: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sendErrorEvent("stream_read_error")
|
sendErrorEvent("stream_read_error", disconnectMsg)
|
||||||
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, fmt.Errorf("stream read error: %w", ev.err)
|
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, fmt.Errorf("stream read error: %w", ev.err)
|
||||||
}
|
}
|
||||||
line := ev.line
|
line := ev.line
|
||||||
@@ -7114,7 +7136,7 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
|
|||||||
if s.rateLimitService != nil {
|
if s.rateLimitService != nil {
|
||||||
s.rateLimitService.HandleStreamTimeout(ctx, account, originalModel)
|
s.rateLimitService.HandleStreamTimeout(ctx, account, originalModel)
|
||||||
}
|
}
|
||||||
sendErrorEvent("stream_timeout")
|
sendErrorEvent("stream_timeout", fmt.Sprintf("upstream stream idle for %s", streamInterval))
|
||||||
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, fmt.Errorf("stream data interval timeout")
|
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, fmt.Errorf("stream data interval timeout")
|
||||||
|
|
||||||
case <-keepaliveCh:
|
case <-keepaliveCh:
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"strings"
|
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -246,7 +245,15 @@ func TestHandleStreamingResponse_StreamReadErrorBeforeOutput_TriggersFailover(t
|
|||||||
require.True(t, errors.As(err, &failoverErr), "未输出过字节时 stream read error 必须包成 UpstreamFailoverError,期望: %v", err)
|
require.True(t, errors.As(err, &failoverErr), "未输出过字节时 stream read error 必须包成 UpstreamFailoverError,期望: %v", err)
|
||||||
require.Equal(t, http.StatusBadGateway, failoverErr.StatusCode)
|
require.Equal(t, http.StatusBadGateway, failoverErr.StatusCode)
|
||||||
require.True(t, failoverErr.RetryableOnSameAccount, "GOAWAY 类错误应允许同账号重试")
|
require.True(t, failoverErr.RetryableOnSameAccount, "GOAWAY 类错误应允许同账号重试")
|
||||||
require.Contains(t, string(failoverErr.ResponseBody), "upstream stream disconnected")
|
|
||||||
|
// ResponseBody 必须是 Anthropic 标准 error 格式:
|
||||||
|
// 1) ExtractUpstreamErrorMessage 能正确从 error.message 提取消息(被 handleFailoverExhausted / ops 日志依赖)
|
||||||
|
// 2) error.type 标记为 upstream_disconnected
|
||||||
|
extractedMsg := ExtractUpstreamErrorMessage(failoverErr.ResponseBody)
|
||||||
|
require.NotEmpty(t, extractedMsg, "ExtractUpstreamErrorMessage 必须从 ResponseBody 取到非空 message,否则 ops 日志会丢失诊断信息")
|
||||||
|
require.Contains(t, extractedMsg, "upstream stream disconnected")
|
||||||
|
require.Contains(t, string(failoverErr.ResponseBody), `"type":"error"`)
|
||||||
|
require.Contains(t, string(failoverErr.ResponseBody), `"upstream_disconnected"`)
|
||||||
|
|
||||||
// 客户端应收不到任何 stream_read_error 事件,由 handler 层根据 failover 结果再决定
|
// 客户端应收不到任何 stream_read_error 事件,由 handler 层根据 failover 结果再决定
|
||||||
require.NotContains(t, rec.Body.String(), "stream_read_error")
|
require.NotContains(t, rec.Body.String(), "stream_read_error")
|
||||||
@@ -282,9 +289,11 @@ func TestHandleStreamingResponse_StreamReadErrorAfterOutput_PassesThrough(t *tes
|
|||||||
var failoverErr *UpstreamFailoverError
|
var failoverErr *UpstreamFailoverError
|
||||||
require.False(t, errors.As(err, &failoverErr), "已经向客户端写过字节时不能再 failover")
|
require.False(t, errors.As(err, &failoverErr), "已经向客户端写过字节时不能再 failover")
|
||||||
|
|
||||||
// 客户端必须收到 stream_read_error 事件
|
// 客户端必须收到 Anthropic 标准格式的 SSE error 事件,error.type=stream_read_error,
|
||||||
|
// error.message 含具体根因(让 SDK 能解析、UI 能显示具体错误)
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
require.True(t,
|
require.Contains(t, body, "event: error\n", "必须按 Anthropic SSE 标准发送 error 事件帧")
|
||||||
strings.Contains(body, "stream_read_error"),
|
require.Contains(t, body, `"type":"error"`, "data 必须含 type:error 顶层字段(Anthropic 标准)")
|
||||||
"已开始流后必须发送 stream_read_error 事件给客户端,实际响应: %q", body)
|
require.Contains(t, body, `"stream_read_error"`, "error.type 必须为 stream_read_error")
|
||||||
|
require.Contains(t, body, "upstream stream disconnected", "error.message 必须包含具体根因,Claude Code 等客户端才能显示有效错误文案")
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user