fix(gateway): emit Anthropic-standard SSE error events and failover body

Two follow-ups to PR #2066's failover-wrap fix:

1. Failover ResponseBody (`UpstreamFailoverError.ResponseBody`) was encoded
   as `{"error": "<msg>"}` (string field). `ExtractUpstreamErrorMessage`
   probes for `error.message`, `detail`, or top-level `message` only — so
   `handleFailoverExhausted` and downstream passthrough rules saw an empty
   message, losing the EOF root cause in ops logs. Re-encode as the
   Anthropic standard shape `{"type":"error","error":{"type":"upstream_disconnected","message":"..."}}`.
   (Addresses the inline review comment from copilot-pull-request-reviewer
   on Wei-Shaw/sub2api#2066.)

2. The streaming `event: error` SSE frame for `response_too_large`,
   `stream_read_error`, and `stream_timeout` was non-standard
   (`{"error":"<reason>"}`). Anthropic SDKs (and Claude Code) expect
   `{"type":"error","error":{"type":"...","message":"..."}}` and parse
   `error.type`/`error.message` accordingly. Refactor `sendErrorEvent` to
   take both reason and message, and emit the standard frame so client
   SDKs surface a real diagnostic message instead of a generic stream error.

This does not by itself prevent task interruption on long-stream EOF
(SSE has no resume; client-side retry remains the only complete fix), but
it gives both server-side ops logs and client-side error UIs a meaningful
upstream message so users know the next step is to retry.

Tests updated to assert the new body shape on both branches plus a new
assertion that `ExtractUpstreamErrorMessage` returns a non-empty string.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
alfadb
2026-04-28 20:24:17 +08:00
parent 6327573534
commit 4c474616b9
2 changed files with 45 additions and 14 deletions

View File

@@ -6871,14 +6871,31 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
}
lastDataAt := time.Now()
// 仅发送一次错误事件,避免多次写入导致协议混乱(写失败时尽力通知客户端)
// 仅发送一次错误事件,避免多次写入导致协议混乱(写失败时尽力通知客户端)
// 事件格式遵循 Anthropic SSE 标准:{"type":"error","error":{"type":<reason>,"message":<message>}}
// 这样 Anthropic SDK / Claude Code 等客户端能按标准 error 类型解析UI 能显示具体错误文案,
// 服务端 ExtractUpstreamErrorMessage 也能从透传的 body 中提取 message。
errorEventSent := false
sendErrorEvent := func(reason string) {
sendErrorEvent := func(reason, message string) {
if errorEventSent {
return
}
errorEventSent = true
_, _ = fmt.Fprintf(w, "event: error\ndata: {\"error\":\"%s\"}\n\n", reason)
if message == "" {
message = reason
}
body, err := json.Marshal(map[string]any{
"type": "error",
"error": map[string]string{
"type": reason,
"message": message,
},
})
if err != nil {
// json.Marshal 不可能在已知 string-only 输入上失败,保守 fallback
body = []byte(fmt.Sprintf(`{"type":"error","error":{"type":%q,"message":%q}}`, reason, message))
}
_, _ = fmt.Fprintf(w, "event: error\ndata: %s\n\n", body)
flusher.Flush()
}
@@ -7038,16 +7055,21 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
// 客户端未断开,正常的错误处理
if errors.Is(ev.err, bufio.ErrTooLong) {
logger.LegacyPrintf("service.gateway", "SSE line too long: account=%d max_size=%d error=%v", account.ID, maxLineSize, ev.err)
sendErrorEvent("response_too_large")
sendErrorEvent("response_too_large", fmt.Sprintf("upstream SSE line exceeded %d bytes", maxLineSize))
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, ev.err
}
// 上游中途读错误unexpected EOF / connection reset 等,常见于 HTTP/2 GOAWAY
// 若尚未向客户端写过任何字节,包成 UpstreamFailoverError 让 handler 层走 failover/重试。
// 已经开始写流时 SSE 协议无 resume只能透传错误事件给客户端。
disconnectMsg := fmt.Sprintf("upstream stream disconnected: %s", ev.err)
if !c.Writer.Written() {
logger.LegacyPrintf("service.gateway", "Upstream stream read error before any client output (account=%d), failing over: %v", account.ID, ev.err)
body, _ := json.Marshal(map[string]string{
"error": fmt.Sprintf("upstream stream disconnected: %s", ev.err),
body, _ := json.Marshal(map[string]any{
"type": "error",
"error": map[string]string{
"type": "upstream_disconnected",
"message": disconnectMsg,
},
})
return nil, &UpstreamFailoverError{
StatusCode: http.StatusBadGateway,
@@ -7055,7 +7077,7 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
RetryableOnSameAccount: true,
}
}
sendErrorEvent("stream_read_error")
sendErrorEvent("stream_read_error", disconnectMsg)
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, fmt.Errorf("stream read error: %w", ev.err)
}
line := ev.line
@@ -7114,7 +7136,7 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
if s.rateLimitService != nil {
s.rateLimitService.HandleStreamTimeout(ctx, account, originalModel)
}
sendErrorEvent("stream_timeout")
sendErrorEvent("stream_timeout", fmt.Sprintf("upstream stream idle for %s", streamInterval))
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, fmt.Errorf("stream data interval timeout")
case <-keepaliveCh: