mirror of
https://gitee.com/wanwujie/sub2api
synced 2026-05-04 21:20:51 +08:00
Merge pull request #2066 from alfadb/fix/anthropic-stream-eof-failover
fix(gateway): Anthropic 流式 EOF 失败移交 + SSE error 帧标准化
This commit is contained in:
@@ -11,6 +11,7 @@ import (
|
||||
"io"
|
||||
"log/slog"
|
||||
mathrand "math/rand"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
@@ -20,6 +21,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/Wei-Shaw/sub2api/internal/config"
|
||||
@@ -6520,6 +6522,49 @@ func (s *GatewayService) shouldFailoverOn400(respBody []byte) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// sanitizeStreamError 返回不含网络地址的客户端可见错误描述。
|
||||
// 默认 (*net.OpError).Error() 会拼接 Source/Addr 字段,泄露内部 IP/端口与上游
|
||||
// 服务器地址(例如 "read tcp 10.0.0.1:54321->52.1.2.3:443: read: connection
|
||||
// reset by peer")。该函数只保留可识别的错误类别,原始 err 仍在调用点写入日志。
|
||||
func sanitizeStreamError(err error) string {
|
||||
if err == nil {
|
||||
return ""
|
||||
}
|
||||
switch {
|
||||
case errors.Is(err, io.ErrUnexpectedEOF):
|
||||
return "unexpected EOF"
|
||||
case errors.Is(err, io.EOF):
|
||||
return "EOF"
|
||||
case errors.Is(err, context.Canceled):
|
||||
return "canceled"
|
||||
case errors.Is(err, context.DeadlineExceeded):
|
||||
return "deadline exceeded"
|
||||
case errors.Is(err, syscall.ECONNRESET):
|
||||
return "connection reset by peer"
|
||||
case errors.Is(err, syscall.ECONNABORTED):
|
||||
return "connection aborted"
|
||||
case errors.Is(err, syscall.ETIMEDOUT):
|
||||
return "connection timed out"
|
||||
case errors.Is(err, syscall.EPIPE):
|
||||
return "broken pipe"
|
||||
case errors.Is(err, syscall.ECONNREFUSED):
|
||||
return "connection refused"
|
||||
}
|
||||
var netErr *net.OpError
|
||||
if errors.As(err, &netErr) {
|
||||
if netErr.Timeout() {
|
||||
if netErr.Op != "" {
|
||||
return netErr.Op + " timeout"
|
||||
}
|
||||
return "i/o timeout"
|
||||
}
|
||||
if netErr.Op != "" {
|
||||
return netErr.Op + " network error"
|
||||
}
|
||||
}
|
||||
return "upstream connection error"
|
||||
}
|
||||
|
||||
// ExtractUpstreamErrorMessage 从上游响应体中提取错误消息
|
||||
// 支持 Claude 风格的错误格式:{"type":"error","error":{"type":"...","message":"..."}}
|
||||
func ExtractUpstreamErrorMessage(body []byte) string {
|
||||
@@ -6957,14 +7002,31 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
|
||||
}
|
||||
lastDataAt := time.Now()
|
||||
|
||||
// 仅发送一次错误事件,避免多次写入导致协议混乱(写失败时尽力通知客户端)
|
||||
// 仅发送一次错误事件,避免多次写入导致协议混乱(写失败时尽力通知客户端)。
|
||||
// 事件格式遵循 Anthropic SSE 标准:{"type":"error","error":{"type":<reason>,"message":<message>}}
|
||||
// 这样 Anthropic SDK / Claude Code 等客户端能按标准 error 类型解析,UI 能显示具体错误文案,
|
||||
// 服务端 ExtractUpstreamErrorMessage 也能从透传的 body 中提取 message。
|
||||
errorEventSent := false
|
||||
sendErrorEvent := func(reason string) {
|
||||
sendErrorEvent := func(reason, message string) {
|
||||
if errorEventSent {
|
||||
return
|
||||
}
|
||||
errorEventSent = true
|
||||
_, _ = fmt.Fprintf(w, "event: error\ndata: {\"error\":\"%s\"}\n\n", reason)
|
||||
if message == "" {
|
||||
message = reason
|
||||
}
|
||||
body, err := json.Marshal(map[string]any{
|
||||
"type": "error",
|
||||
"error": map[string]string{
|
||||
"type": reason,
|
||||
"message": message,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
// json.Marshal 不可能在已知 string-only 输入上失败,保守 fallback
|
||||
body = []byte(fmt.Sprintf(`{"type":"error","error":{"type":%q,"message":%q}}`, reason, message))
|
||||
}
|
||||
_, _ = fmt.Fprintf(w, "event: error\ndata: %s\n\n", body)
|
||||
flusher.Flush()
|
||||
}
|
||||
|
||||
@@ -7124,10 +7186,32 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
|
||||
// 客户端未断开,正常的错误处理
|
||||
if errors.Is(ev.err, bufio.ErrTooLong) {
|
||||
logger.LegacyPrintf("service.gateway", "SSE line too long: account=%d max_size=%d error=%v", account.ID, maxLineSize, ev.err)
|
||||
sendErrorEvent("response_too_large")
|
||||
sendErrorEvent("response_too_large", fmt.Sprintf("upstream SSE line exceeded %d bytes", maxLineSize))
|
||||
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, ev.err
|
||||
}
|
||||
sendErrorEvent("stream_read_error")
|
||||
// 上游中途读错误(unexpected EOF / connection reset 等,常见于 HTTP/2 GOAWAY):
|
||||
// 若尚未向客户端写过任何字节,包成 UpstreamFailoverError 让 handler 层走 failover/重试。
|
||||
// 已经开始写流时 SSE 协议无 resume,只能透传错误事件给客户端。
|
||||
// 注意:面向客户端的 disconnectMsg 必须用 sanitizeStreamError 剥离地址,
|
||||
// 默认 *net.OpError 的 Error() 会泄露内部 IP/端口和上游地址。完整 ev.err
|
||||
// 仅在下方 LegacyPrintf 内部日志中保留供运维诊断。
|
||||
disconnectMsg := "upstream stream disconnected: " + sanitizeStreamError(ev.err)
|
||||
if !c.Writer.Written() {
|
||||
logger.LegacyPrintf("service.gateway", "Upstream stream read error before any client output (account=%d), failing over: %v", account.ID, ev.err)
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"type": "error",
|
||||
"error": map[string]string{
|
||||
"type": "upstream_disconnected",
|
||||
"message": disconnectMsg,
|
||||
},
|
||||
})
|
||||
return nil, &UpstreamFailoverError{
|
||||
StatusCode: http.StatusBadGateway,
|
||||
ResponseBody: body,
|
||||
RetryableOnSameAccount: true,
|
||||
}
|
||||
}
|
||||
sendErrorEvent("stream_read_error", disconnectMsg)
|
||||
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, fmt.Errorf("stream read error: %w", ev.err)
|
||||
}
|
||||
line := ev.line
|
||||
@@ -7186,7 +7270,7 @@ func (s *GatewayService) handleStreamingResponse(ctx context.Context, resp *http
|
||||
if s.rateLimitService != nil {
|
||||
s.rateLimitService.HandleStreamTimeout(ctx, account, originalModel)
|
||||
}
|
||||
sendErrorEvent("stream_timeout")
|
||||
sendErrorEvent("stream_timeout", fmt.Sprintf("upstream stream idle for %s", streamInterval))
|
||||
return &streamingResult{usage: usage, firstTokenMs: firstTokenMs}, fmt.Errorf("stream data interval timeout")
|
||||
|
||||
case <-keepaliveCh:
|
||||
|
||||
Reference in New Issue
Block a user