Merge tag 'v0.1.79' into develop

增强错误处理与重试机制,新增 MODEL_CAPACITY_EXHAUSTED 同账号固定间隔重试、瞬态错误同账号重试优先于故障转移,并大幅优化错误匹配性能。

- MODEL_CAPACITY_EXHAUSTED (503) 使用固定 1s 间隔重试最多 60 次,不切换账号
- 瞬态错误(Google 400、空流响应)同账号重试 2 次后再触发故障转移
- 空流响应触发 failover 自动换号重试,不再直接返回 502
- Google "Invalid project resource name" 400 错误触发 failover 并临时封禁账号 1 小时
- 错误透传规则新增 skip_monitoring 选项,匹配的错误不记录到运维监控日志
- Antigravity 转发支持 daily/prod 单 URL 切换

- 错误匹配性能优化:延迟/限制 body ToLower,预计算规则关键词和平台集合
- MODEL_CAPACITY_EXHAUSTED 全局去重,避免并发请求重复重试
- 503 重试 body 读取限制从 2MB 降至 8KB
- time.After 替换为 time.NewTimer,防止 context 取消时 timer 泄漏
- 临时封禁冷却时间从 30 分钟缩短至 1 分钟(同账号重试耗尽后)

- 修复错误透传规则 skip_monitoring 未生效的问题
- 修复 CI 检查失败(gofmt、errcheck、staticcheck)

# Conflicts:
#	backend/internal/service/error_passthrough_runtime_test.go
This commit is contained in:
liuxiongfeng
2026-02-10 23:25:51 +08:00
10 changed files with 329 additions and 99 deletions

View File

@@ -896,6 +896,10 @@ func (h *GatewayHandler) handleFailoverExhausted(c *gin.Context, failoverErr *se
msg = *rule.CustomMessage msg = *rule.CustomMessage
} }
if rule.SkipMonitoring {
c.Set(service.OpsSkipPassthroughKey, true)
}
h.handleStreamingAwareError(c, respCode, "upstream_error", msg, streamStarted) h.handleStreamingAwareError(c, respCode, "upstream_error", msg, streamStarted)
return return
} }

View File

@@ -542,6 +542,10 @@ func (h *GatewayHandler) handleGeminiFailoverExhausted(c *gin.Context, failoverE
msg = *rule.CustomMessage msg = *rule.CustomMessage
} }
if rule.SkipMonitoring {
c.Set(service.OpsSkipPassthroughKey, true)
}
googleError(c, respCode, msg) googleError(c, respCode, msg)
return return
} }

View File

@@ -354,6 +354,10 @@ func (h *OpenAIGatewayHandler) handleFailoverExhausted(c *gin.Context, failoverE
msg = *rule.CustomMessage msg = *rule.CustomMessage
} }
if rule.SkipMonitoring {
c.Set(service.OpsSkipPassthroughKey, true)
}
h.handleStreamingAwareError(c, respCode, "upstream_error", msg, streamStarted) h.handleStreamingAwareError(c, respCode, "upstream_error", msg, streamStarted)
return return
} }

View File

@@ -537,6 +537,13 @@ func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
// Store request headers/body only when an upstream error occurred to keep overhead minimal. // Store request headers/body only when an upstream error occurred to keep overhead minimal.
entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c) entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
// Skip logging if a passthrough rule with skip_monitoring=true matched.
if v, ok := c.Get(service.OpsSkipPassthroughKey); ok {
if skip, _ := v.(bool); skip {
return
}
}
enqueueOpsErrorLog(ops, entry, requestBody) enqueueOpsErrorLog(ops, entry, requestBody)
return return
} }

View File

@@ -16,6 +16,7 @@ import (
"os" "os"
"strconv" "strconv"
"strings" "strings"
"sync"
"sync/atomic" "sync/atomic"
"time" "time"
@@ -66,6 +67,9 @@ const (
// 单账号 503 退避重试:原地重试的总累计等待时间上限 // 单账号 503 退避重试:原地重试的总累计等待时间上限
// 超过此上限将不再重试,直接返回 503 // 超过此上限将不再重试,直接返回 503
antigravitySingleAccountSmartRetryTotalMaxWait = 30 * time.Second antigravitySingleAccountSmartRetryTotalMaxWait = 30 * time.Second
// MODEL_CAPACITY_EXHAUSTED 全局去重:重试全部失败后的 cooldown 时间
antigravityModelCapacityCooldown = 10 * time.Second
) )
// antigravityPassthroughErrorMessages 透传给客户端的错误消息白名单(小写) // antigravityPassthroughErrorMessages 透传给客户端的错误消息白名单(小写)
@@ -74,8 +78,15 @@ var antigravityPassthroughErrorMessages = []string{
"prompt is too long", "prompt is too long",
} }
// MODEL_CAPACITY_EXHAUSTED 全局去重:避免多个并发请求同时对同一模型进行容量耗尽重试
var (
modelCapacityExhaustedMu sync.RWMutex
modelCapacityExhaustedUntil = make(map[string]time.Time) // modelName -> cooldown until
)
const ( const (
antigravityBillingModelEnv = "GATEWAY_ANTIGRAVITY_BILL_WITH_MAPPED_MODEL" antigravityBillingModelEnv = "GATEWAY_ANTIGRAVITY_BILL_WITH_MAPPED_MODEL"
antigravityForwardBaseURLEnv = "GATEWAY_ANTIGRAVITY_FORWARD_BASE_URL"
antigravityFallbackSecondsEnv = "GATEWAY_ANTIGRAVITY_FALLBACK_COOLDOWN_SECONDS" antigravityFallbackSecondsEnv = "GATEWAY_ANTIGRAVITY_FALLBACK_COOLDOWN_SECONDS"
) )
@@ -137,6 +148,20 @@ type antigravityRetryLoopResult struct {
resp *http.Response resp *http.Response
} }
// resolveAntigravityForwardBaseURL 解析转发用 base URL。
// 默认使用 dailyForwardBaseURLs 的首个地址);当环境变量为 prod 时使用第二个地址。
func resolveAntigravityForwardBaseURL() string {
baseURLs := antigravity.ForwardBaseURLs()
if len(baseURLs) == 0 {
return ""
}
mode := strings.ToLower(strings.TrimSpace(os.Getenv(antigravityForwardBaseURLEnv)))
if mode == "prod" && len(baseURLs) > 1 {
return baseURLs[1]
}
return baseURLs[0]
}
// smartRetryAction 智能重试的处理结果 // smartRetryAction 智能重试的处理结果
type smartRetryAction int type smartRetryAction int
@@ -211,17 +236,38 @@ func (s *AntigravityGatewayService) handleSmartRetry(p antigravityRetryLoopParam
if isModelCapacityExhausted { if isModelCapacityExhausted {
maxAttempts = antigravityModelCapacityRetryMaxAttempts maxAttempts = antigravityModelCapacityRetryMaxAttempts
waitDuration = antigravityModelCapacityRetryWait waitDuration = antigravityModelCapacityRetryWait
// 全局去重:如果其他 goroutine 已在重试同一模型且尚在 cooldown 中,直接返回 503
if modelName != "" {
modelCapacityExhaustedMu.RLock()
cooldownUntil, exists := modelCapacityExhaustedUntil[modelName]
modelCapacityExhaustedMu.RUnlock()
if exists && time.Now().Before(cooldownUntil) {
log.Printf("%s status=%d model_capacity_exhausted_dedup model=%s account=%d cooldown_until=%v (skip retry)",
p.prefix, resp.StatusCode, modelName, p.account.ID, cooldownUntil.Format("15:04:05"))
return &smartRetryResult{
action: smartRetryActionBreakWithResp,
resp: &http.Response{
StatusCode: resp.StatusCode,
Header: resp.Header.Clone(),
Body: io.NopCloser(bytes.NewReader(respBody)),
},
}
}
}
} }
for attempt := 1; attempt <= maxAttempts; attempt++ { for attempt := 1; attempt <= maxAttempts; attempt++ {
log.Printf("%s status=%d oauth_smart_retry attempt=%d/%d delay=%v model=%s account=%d", log.Printf("%s status=%d oauth_smart_retry attempt=%d/%d delay=%v model=%s account=%d",
p.prefix, resp.StatusCode, attempt, maxAttempts, waitDuration, modelName, p.account.ID) p.prefix, resp.StatusCode, attempt, maxAttempts, waitDuration, modelName, p.account.ID)
timer := time.NewTimer(waitDuration)
select { select {
case <-p.ctx.Done(): case <-p.ctx.Done():
timer.Stop()
log.Printf("%s status=context_canceled_during_smart_retry", p.prefix) log.Printf("%s status=context_canceled_during_smart_retry", p.prefix)
return &smartRetryResult{action: smartRetryActionBreakWithResp, err: p.ctx.Err()} return &smartRetryResult{action: smartRetryActionBreakWithResp, err: p.ctx.Err()}
case <-time.After(waitDuration): case <-timer.C:
} }
// 智能重试:创建新请求 // 智能重试:创建新请求
@@ -242,6 +288,12 @@ func (s *AntigravityGatewayService) handleSmartRetry(p antigravityRetryLoopParam
retryResp, retryErr := p.httpUpstream.Do(retryReq, p.proxyURL, p.account.ID, p.account.Concurrency) retryResp, retryErr := p.httpUpstream.Do(retryReq, p.proxyURL, p.account.ID, p.account.Concurrency)
if retryErr == nil && retryResp != nil && retryResp.StatusCode != http.StatusTooManyRequests && retryResp.StatusCode != http.StatusServiceUnavailable { if retryErr == nil && retryResp != nil && retryResp.StatusCode != http.StatusTooManyRequests && retryResp.StatusCode != http.StatusServiceUnavailable {
log.Printf("%s status=%d smart_retry_success attempt=%d/%d", p.prefix, retryResp.StatusCode, attempt, maxAttempts) log.Printf("%s status=%d smart_retry_success attempt=%d/%d", p.prefix, retryResp.StatusCode, attempt, maxAttempts)
// 重试成功,清除 MODEL_CAPACITY_EXHAUSTED cooldown
if isModelCapacityExhausted && modelName != "" {
modelCapacityExhaustedMu.Lock()
delete(modelCapacityExhaustedUntil, modelName)
modelCapacityExhaustedMu.Unlock()
}
return &smartRetryResult{action: smartRetryActionBreakWithResp, resp: retryResp} return &smartRetryResult{action: smartRetryActionBreakWithResp, resp: retryResp}
} }
@@ -257,7 +309,7 @@ func (s *AntigravityGatewayService) handleSmartRetry(p antigravityRetryLoopParam
} }
lastRetryResp = retryResp lastRetryResp = retryResp
if retryResp != nil { if retryResp != nil {
lastRetryBody, _ = io.ReadAll(io.LimitReader(retryResp.Body, 2<<20)) lastRetryBody, _ = io.ReadAll(io.LimitReader(retryResp.Body, 8<<10))
_ = retryResp.Body.Close() _ = retryResp.Body.Close()
} }
@@ -283,6 +335,12 @@ func (s *AntigravityGatewayService) handleSmartRetry(p antigravityRetryLoopParam
// MODEL_CAPACITY_EXHAUSTED模型容量不足切换账号无意义 // MODEL_CAPACITY_EXHAUSTED模型容量不足切换账号无意义
// 直接返回上游错误响应,不设置模型限流,不切换账号 // 直接返回上游错误响应,不设置模型限流,不切换账号
if isModelCapacityExhausted { if isModelCapacityExhausted {
// 设置 cooldown让后续请求快速失败避免重复重试
if modelName != "" {
modelCapacityExhaustedMu.Lock()
modelCapacityExhaustedUntil[modelName] = time.Now().Add(antigravityModelCapacityCooldown)
modelCapacityExhaustedMu.Unlock()
}
log.Printf("%s status=%d smart_retry_exhausted_model_capacity attempts=%d model=%s account=%d body=%s (model capacity exhausted, not switching account)", log.Printf("%s status=%d smart_retry_exhausted_model_capacity attempts=%d model=%s account=%d body=%s (model capacity exhausted, not switching account)",
p.prefix, resp.StatusCode, maxAttempts, modelName, p.account.ID, truncateForLog(retryBody, 200)) p.prefix, resp.StatusCode, maxAttempts, modelName, p.account.ID, truncateForLog(retryBody, 200))
return &smartRetryResult{ return &smartRetryResult{
@@ -395,11 +453,13 @@ func (s *AntigravityGatewayService) handleSingleAccountRetryInPlace(
log.Printf("%s status=%d single_account_503_retry attempt=%d/%d delay=%v total_waited=%v model=%s account=%d", log.Printf("%s status=%d single_account_503_retry attempt=%d/%d delay=%v total_waited=%v model=%s account=%d",
p.prefix, resp.StatusCode, attempt, antigravitySingleAccountSmartRetryMaxAttempts, waitDuration, totalWaited, modelName, p.account.ID) p.prefix, resp.StatusCode, attempt, antigravitySingleAccountSmartRetryMaxAttempts, waitDuration, totalWaited, modelName, p.account.ID)
timer := time.NewTimer(waitDuration)
select { select {
case <-p.ctx.Done(): case <-p.ctx.Done():
timer.Stop()
log.Printf("%s status=context_canceled_during_single_account_retry", p.prefix) log.Printf("%s status=context_canceled_during_single_account_retry", p.prefix)
return &smartRetryResult{action: smartRetryActionBreakWithResp, err: p.ctx.Err()} return &smartRetryResult{action: smartRetryActionBreakWithResp, err: p.ctx.Err()}
case <-time.After(waitDuration): case <-timer.C:
} }
totalWaited += waitDuration totalWaited += waitDuration
@@ -433,7 +493,7 @@ func (s *AntigravityGatewayService) handleSingleAccountRetryInPlace(
_ = lastRetryResp.Body.Close() _ = lastRetryResp.Body.Close()
} }
lastRetryResp = retryResp lastRetryResp = retryResp
lastRetryBody, _ = io.ReadAll(io.LimitReader(retryResp.Body, 2<<20)) lastRetryBody, _ = io.ReadAll(io.LimitReader(retryResp.Body, 8<<10))
_ = retryResp.Body.Close() _ = retryResp.Body.Close()
// 解析新的重试信息,更新下次等待时间 // 解析新的重试信息,更新下次等待时间
@@ -494,10 +554,11 @@ func (s *AntigravityGatewayService) antigravityRetryLoop(p antigravityRetryLoopP
} }
} }
availableURLs := antigravity.DefaultURLAvailability.GetAvailableURLs() baseURL := resolveAntigravityForwardBaseURL()
if len(availableURLs) == 0 { if baseURL == "" {
availableURLs = antigravity.BaseURLs return nil, errors.New("no antigravity forward base url configured")
} }
availableURLs := []string{baseURL}
var resp *http.Response var resp *http.Response
var usedBaseURL string var usedBaseURL string
@@ -935,11 +996,11 @@ func (s *AntigravityGatewayService) TestConnection(ctx context.Context, account
proxyURL = account.Proxy.URL() proxyURL = account.Proxy.URL()
} }
// URL fallback 循环 baseURL := resolveAntigravityForwardBaseURL()
availableURLs := antigravity.DefaultURLAvailability.GetAvailableURLs() if baseURL == "" {
if len(availableURLs) == 0 { return nil, errors.New("no antigravity forward base url configured")
availableURLs = antigravity.BaseURLs // 所有 URL 都不可用时,重试所有
} }
availableURLs := []string{baseURL}
var lastErr error var lastErr error
for urlIdx, baseURL := range availableURLs { for urlIdx, baseURL := range availableURLs {
@@ -1404,7 +1465,7 @@ func (s *AntigravityGatewayService) Forward(ctx context.Context, c *gin.Context,
break break
} }
retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 2<<20)) retryBody, _ := io.ReadAll(io.LimitReader(retryResp.Body, 8<<10))
_ = retryResp.Body.Close() _ = retryResp.Body.Close()
if retryResp.StatusCode == http.StatusTooManyRequests { if retryResp.StatusCode == http.StatusTooManyRequests {
retryBaseURL := "" retryBaseURL := ""
@@ -2211,10 +2272,12 @@ func sleepAntigravityBackoffWithContext(ctx context.Context, attempt int) bool {
sleepFor = 0 sleepFor = 0
} }
timer := time.NewTimer(sleepFor)
select { select {
case <-ctx.Done(): case <-ctx.Done():
timer.Stop()
return false return false
case <-time.After(sleepFor): case <-timer.C:
return true return true
} }
} }
@@ -3251,6 +3314,21 @@ func (s *AntigravityGatewayService) writeMappedClaudeError(c *gin.Context, accou
log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, truncateForLog(body, maxBytes)) log.Printf("[antigravity-Forward] upstream_error status=%d body=%s", upstreamStatus, truncateForLog(body, maxBytes))
} }
// 检查错误透传规则
if ptStatus, ptErrType, ptErrMsg, matched := applyErrorPassthroughRule(
c, account.Platform, upstreamStatus, body,
0, "", "",
); matched {
c.JSON(ptStatus, gin.H{
"type": "error",
"error": gin.H{"type": ptErrType, "message": ptErrMsg},
})
if upstreamMsg == "" {
return fmt.Errorf("upstream error: %d", upstreamStatus)
}
return fmt.Errorf("upstream error: %d message=%s", upstreamStatus, upstreamMsg)
}
var statusCode int var statusCode int
var errType, errMsg string var errType, errMsg string

View File

@@ -86,7 +86,9 @@ func (s *stubAntigravityAccountRepo) SetModelRateLimit(ctx context.Context, id i
return nil return nil
} }
func TestAntigravityRetryLoop_URLFallback_UsesLatestSuccess(t *testing.T) { func TestAntigravityRetryLoop_NoURLFallback_UsesConfiguredBaseURL(t *testing.T) {
t.Setenv(antigravityForwardBaseURLEnv, "")
oldBaseURLs := append([]string(nil), antigravity.BaseURLs...) oldBaseURLs := append([]string(nil), antigravity.BaseURLs...)
oldAvailability := antigravity.DefaultURLAvailability oldAvailability := antigravity.DefaultURLAvailability
defer func() { defer func() {
@@ -131,15 +133,16 @@ func TestAntigravityRetryLoop_URLFallback_UsesLatestSuccess(t *testing.T) {
require.NotNil(t, result) require.NotNil(t, result)
require.NotNil(t, result.resp) require.NotNil(t, result.resp)
defer func() { _ = result.resp.Body.Close() }() defer func() { _ = result.resp.Body.Close() }()
require.Equal(t, http.StatusOK, result.resp.StatusCode) require.Equal(t, http.StatusTooManyRequests, result.resp.StatusCode)
require.False(t, handleErrorCalled) require.True(t, handleErrorCalled)
require.Len(t, upstream.calls, 2) require.Len(t, upstream.calls, antigravityMaxRetries)
require.True(t, strings.HasPrefix(upstream.calls[0], base1)) for _, callURL := range upstream.calls {
require.True(t, strings.HasPrefix(upstream.calls[1], base2)) require.True(t, strings.HasPrefix(callURL, base1))
}
available := antigravity.DefaultURLAvailability.GetAvailableURLs() available := antigravity.DefaultURLAvailability.GetAvailableURLs()
require.NotEmpty(t, available) require.NotEmpty(t, available)
require.Equal(t, base2, available[0]) require.Equal(t, base1, available[0])
} }
// TestHandleUpstreamError_429_ModelRateLimit 测试 429 模型限流场景 // TestHandleUpstreamError_429_ModelRateLimit 测试 429 模型限流场景
@@ -927,6 +930,22 @@ func TestIsAntigravityAccountSwitchError(t *testing.T) {
} }
} }
func TestResolveAntigravityForwardBaseURL_DefaultDaily(t *testing.T) {
t.Setenv(antigravityForwardBaseURLEnv, "")
oldBaseURLs := append([]string(nil), antigravity.BaseURLs...)
defer func() {
antigravity.BaseURLs = oldBaseURLs
}()
prodURL := "https://prod.test"
dailyURL := "https://daily.test"
antigravity.BaseURLs = []string{dailyURL, prodURL}
resolved := resolveAntigravityForwardBaseURL()
require.Equal(t, dailyURL, resolved)
}
func TestAntigravityAccountSwitchError_Error(t *testing.T) { func TestAntigravityAccountSwitchError_Error(t *testing.T) {
err := &AntigravityAccountSwitchError{ err := &AntigravityAccountSwitchError{
OriginalAccountID: 789, OriginalAccountID: 789,

View File

@@ -153,13 +153,14 @@ func TestHandleSmartRetry_503_LongDelay_NoSingleAccountRetry_StillSwitches(t *te
Platform: PlatformAntigravity, Platform: PlatformAntigravity,
} }
// 503 + 39s >= 7s 阈值 // 503 + 39s >= 7s 阈值(使用 RATE_LIMIT_EXCEEDED 而非 MODEL_CAPACITY_EXHAUSTED
// 因为 MODEL_CAPACITY_EXHAUSTED 走独立的重试路径,不触发 shouldRateLimitModel
respBody := []byte(`{ respBody := []byte(`{
"error": { "error": {
"code": 503, "code": 503,
"status": "UNAVAILABLE", "status": "RESOURCE_EXHAUSTED",
"details": [ "details": [
{"@type": "type.googleapis.com/google.rpc.ErrorInfo", "metadata": {"model": "gemini-3-pro-high"}, "reason": "MODEL_CAPACITY_EXHAUSTED"}, {"@type": "type.googleapis.com/google.rpc.ErrorInfo", "metadata": {"model": "gemini-3-pro-high"}, "reason": "RATE_LIMIT_EXCEEDED"},
{"@type": "type.googleapis.com/google.rpc.RetryInfo", "retryDelay": "39s"} {"@type": "type.googleapis.com/google.rpc.RetryInfo", "retryDelay": "39s"}
] ]
} }
@@ -339,13 +340,14 @@ func TestHandleSmartRetry_503_ShortDelay_SingleAccountRetry_NoRateLimit(t *testi
// TestHandleSmartRetry_503_ShortDelay_NoSingleAccountRetry_SetsRateLimit // TestHandleSmartRetry_503_ShortDelay_NoSingleAccountRetry_SetsRateLimit
// 对照组503 + retryDelay < 7s + 无 SingleAccountRetry → 智能重试耗尽后照常设限流 // 对照组503 + retryDelay < 7s + 无 SingleAccountRetry → 智能重试耗尽后照常设限流
// 使用 RATE_LIMIT_EXCEEDED 而非 MODEL_CAPACITY_EXHAUSTED因为后者走独立的 60 次重试路径
func TestHandleSmartRetry_503_ShortDelay_NoSingleAccountRetry_SetsRateLimit(t *testing.T) { func TestHandleSmartRetry_503_ShortDelay_NoSingleAccountRetry_SetsRateLimit(t *testing.T) {
failRespBody := `{ failRespBody := `{
"error": { "error": {
"code": 503, "code": 503,
"status": "UNAVAILABLE", "status": "RESOURCE_EXHAUSTED",
"details": [ "details": [
{"@type": "type.googleapis.com/google.rpc.ErrorInfo", "metadata": {"model": "gemini-3-flash"}, "reason": "MODEL_CAPACITY_EXHAUSTED"}, {"@type": "type.googleapis.com/google.rpc.ErrorInfo", "metadata": {"model": "gemini-3-flash"}, "reason": "RATE_LIMIT_EXCEEDED"},
{"@type": "type.googleapis.com/google.rpc.RetryInfo", "retryDelay": "0.1s"} {"@type": "type.googleapis.com/google.rpc.RetryInfo", "retryDelay": "0.1s"}
] ]
} }
@@ -371,9 +373,9 @@ func TestHandleSmartRetry_503_ShortDelay_NoSingleAccountRetry_SetsRateLimit(t *t
respBody := []byte(`{ respBody := []byte(`{
"error": { "error": {
"code": 503, "code": 503,
"status": "UNAVAILABLE", "status": "RESOURCE_EXHAUSTED",
"details": [ "details": [
{"@type": "type.googleapis.com/google.rpc.ErrorInfo", "metadata": {"model": "gemini-3-flash"}, "reason": "MODEL_CAPACITY_EXHAUSTED"}, {"@type": "type.googleapis.com/google.rpc.ErrorInfo", "metadata": {"model": "gemini-3-flash"}, "reason": "RATE_LIMIT_EXCEEDED"},
{"@type": "type.googleapis.com/google.rpc.RetryInfo", "retryDelay": "0.1s"} {"@type": "type.googleapis.com/google.rpc.RetryInfo", "retryDelay": "0.1s"}
] ]
} }

View File

@@ -45,10 +45,20 @@ type ErrorPassthroughService struct {
cache ErrorPassthroughCache cache ErrorPassthroughCache
// 本地内存缓存,用于快速匹配 // 本地内存缓存,用于快速匹配
localCache []*model.ErrorPassthroughRule localCache []*cachedPassthroughRule
localCacheMu sync.RWMutex localCacheMu sync.RWMutex
} }
// cachedPassthroughRule 预计算的规则缓存,避免运行时重复 ToLower
type cachedPassthroughRule struct {
*model.ErrorPassthroughRule
lowerKeywords []string // 预计算的小写关键词
lowerPlatforms []string // 预计算的小写平台
errorCodeSet map[int]struct{} // 预计算的 error code set
}
const maxBodyMatchLen = 8 << 10 // 8KB错误信息不会在 8KB 之后才出现
// NewErrorPassthroughService 创建错误透传规则服务 // NewErrorPassthroughService 创建错误透传规则服务
func NewErrorPassthroughService( func NewErrorPassthroughService(
repo ErrorPassthroughRepository, repo ErrorPassthroughRepository,
@@ -150,17 +160,19 @@ func (s *ErrorPassthroughService) MatchRule(platform string, statusCode int, bod
return nil return nil
} }
bodyStr := strings.ToLower(string(body)) lowerPlatform := strings.ToLower(platform)
var bodyLower string // 延迟初始化,只在需要关键词匹配时计算
var bodyLowerDone bool
for _, rule := range rules { for _, rule := range rules {
if !rule.Enabled { if !rule.Enabled {
continue continue
} }
if !s.platformMatches(rule, platform) { if !s.platformMatchesCached(rule, lowerPlatform) {
continue continue
} }
if s.ruleMatches(rule, statusCode, bodyStr) { if s.ruleMatchesOptimized(rule, statusCode, body, &bodyLower, &bodyLowerDone) {
return rule return rule.ErrorPassthroughRule
} }
} }
@@ -168,7 +180,7 @@ func (s *ErrorPassthroughService) MatchRule(platform string, statusCode int, bod
} }
// getCachedRules 获取缓存的规则列表(按优先级排序) // getCachedRules 获取缓存的规则列表(按优先级排序)
func (s *ErrorPassthroughService) getCachedRules() []*model.ErrorPassthroughRule { func (s *ErrorPassthroughService) getCachedRules() []*cachedPassthroughRule {
s.localCacheMu.RLock() s.localCacheMu.RLock()
rules := s.localCache rules := s.localCache
s.localCacheMu.RUnlock() s.localCacheMu.RUnlock()
@@ -223,17 +235,39 @@ func (s *ErrorPassthroughService) reloadRulesFromDB(ctx context.Context) error {
return nil return nil
} }
// setLocalCache 设置本地缓存 // setLocalCache 设置本地缓存,预计算小写值和 set 以避免运行时重复计算
func (s *ErrorPassthroughService) setLocalCache(rules []*model.ErrorPassthroughRule) { func (s *ErrorPassthroughService) setLocalCache(rules []*model.ErrorPassthroughRule) {
cached := make([]*cachedPassthroughRule, len(rules))
for i, r := range rules {
cr := &cachedPassthroughRule{ErrorPassthroughRule: r}
if len(r.Keywords) > 0 {
cr.lowerKeywords = make([]string, len(r.Keywords))
for j, kw := range r.Keywords {
cr.lowerKeywords[j] = strings.ToLower(kw)
}
}
if len(r.Platforms) > 0 {
cr.lowerPlatforms = make([]string, len(r.Platforms))
for j, p := range r.Platforms {
cr.lowerPlatforms[j] = strings.ToLower(p)
}
}
if len(r.ErrorCodes) > 0 {
cr.errorCodeSet = make(map[int]struct{}, len(r.ErrorCodes))
for _, code := range r.ErrorCodes {
cr.errorCodeSet[code] = struct{}{}
}
}
cached[i] = cr
}
// 按优先级排序 // 按优先级排序
sorted := make([]*model.ErrorPassthroughRule, len(rules)) sort.Slice(cached, func(i, j int) bool {
copy(sorted, rules) return cached[i].Priority < cached[j].Priority
sort.Slice(sorted, func(i, j int) bool {
return sorted[i].Priority < sorted[j].Priority
}) })
s.localCacheMu.Lock() s.localCacheMu.Lock()
s.localCache = sorted s.localCache = cached
s.localCacheMu.Unlock() s.localCacheMu.Unlock()
} }
@@ -273,62 +307,79 @@ func (s *ErrorPassthroughService) invalidateAndNotify(ctx context.Context) {
} }
} }
// platformMatches 检查平台是否匹配 // ensureBodyLower 延迟初始化 body 的小写版本,只做一次转换,限制 8KB
func (s *ErrorPassthroughService) platformMatches(rule *model.ErrorPassthroughRule, platform string) bool { func ensureBodyLower(body []byte, bodyLower *string, done *bool) string {
// 如果没有配置平台限制,则匹配所有平台 if *done {
if len(rule.Platforms) == 0 { return *bodyLower
return true }
b := body
if len(b) > maxBodyMatchLen {
b = b[:maxBodyMatchLen]
}
*bodyLower = strings.ToLower(string(b))
*done = true
return *bodyLower
} }
platform = strings.ToLower(platform) // platformMatchesCached 使用预计算的小写平台检查是否匹配
for _, p := range rule.Platforms { func (s *ErrorPassthroughService) platformMatchesCached(rule *cachedPassthroughRule, lowerPlatform string) bool {
if strings.ToLower(p) == platform { if len(rule.lowerPlatforms) == 0 {
return true
}
for _, p := range rule.lowerPlatforms {
if p == lowerPlatform {
return true return true
} }
} }
return false return false
} }
// ruleMatches 检查规则是否匹配 // ruleMatchesOptimized 优化的规则匹配,支持短路和延迟 body 转换
func (s *ErrorPassthroughService) ruleMatches(rule *model.ErrorPassthroughRule, statusCode int, bodyLower string) bool { func (s *ErrorPassthroughService) ruleMatchesOptimized(rule *cachedPassthroughRule, statusCode int, body []byte, bodyLower *string, bodyLowerDone *bool) bool {
hasErrorCodes := len(rule.ErrorCodes) > 0 hasErrorCodes := len(rule.errorCodeSet) > 0
hasKeywords := len(rule.Keywords) > 0 hasKeywords := len(rule.lowerKeywords) > 0
// 如果没有配置任何条件,不匹配
if !hasErrorCodes && !hasKeywords { if !hasErrorCodes && !hasKeywords {
return false return false
} }
codeMatch := !hasErrorCodes || s.containsInt(rule.ErrorCodes, statusCode) codeMatch := !hasErrorCodes || s.containsIntSet(rule.errorCodeSet, statusCode)
keywordMatch := !hasKeywords || s.containsAnyKeyword(bodyLower, rule.Keywords)
if rule.MatchMode == model.MatchModeAll { if rule.MatchMode == model.MatchModeAll {
// "all" 模式:所有配置的条件都必须满足 // "all" 模式:所有配置的条件都必须满足,短路
return codeMatch && keywordMatch if hasErrorCodes && !codeMatch {
return false
}
if hasKeywords {
return s.containsAnyKeywordCached(ensureBodyLower(body, bodyLower, bodyLowerDone), rule.lowerKeywords)
}
return codeMatch
} }
// "any" 模式:任一条件满足即可 // "any" 模式:任一条件满足即可,短路
if hasErrorCodes && hasKeywords { if hasErrorCodes && hasKeywords {
return codeMatch || keywordMatch if codeMatch {
return true
} }
return codeMatch && keywordMatch return s.containsAnyKeywordCached(ensureBodyLower(body, bodyLower, bodyLowerDone), rule.lowerKeywords)
}
// 只配置了一种条件
if hasKeywords {
return s.containsAnyKeywordCached(ensureBodyLower(body, bodyLower, bodyLowerDone), rule.lowerKeywords)
}
return codeMatch
} }
// containsInt 检查切片是否包含指定整数 // containsIntSet 使用 map 查找替代线性扫描
func (s *ErrorPassthroughService) containsInt(slice []int, val int) bool { func (s *ErrorPassthroughService) containsIntSet(set map[int]struct{}, val int) bool {
for _, v := range slice { _, ok := set[val]
if v == val { return ok
return true }
}
} // containsAnyKeywordCached 使用预计算的小写关键词检查匹配
return false func (s *ErrorPassthroughService) containsAnyKeywordCached(bodyLower string, lowerKeywords []string) bool {
} for _, kw := range lowerKeywords {
if strings.Contains(bodyLower, kw) {
// containsAnyKeyword 检查字符串是否包含任一关键词(不区分大小写)
func (s *ErrorPassthroughService) containsAnyKeyword(bodyLower string, keywords []string) bool {
for _, kw := range keywords {
if strings.Contains(bodyLower, strings.ToLower(kw)) {
return true return true
} }
} }

View File

@@ -145,32 +145,58 @@ func newTestService(rules []*model.ErrorPassthroughRule) *ErrorPassthroughServic
return svc return svc
} }
// newCachedRuleForTest 从 model.ErrorPassthroughRule 创建 cachedPassthroughRule测试用
func newCachedRuleForTest(rule *model.ErrorPassthroughRule) *cachedPassthroughRule {
cr := &cachedPassthroughRule{ErrorPassthroughRule: rule}
if len(rule.Keywords) > 0 {
cr.lowerKeywords = make([]string, len(rule.Keywords))
for j, kw := range rule.Keywords {
cr.lowerKeywords[j] = strings.ToLower(kw)
}
}
if len(rule.Platforms) > 0 {
cr.lowerPlatforms = make([]string, len(rule.Platforms))
for j, p := range rule.Platforms {
cr.lowerPlatforms[j] = strings.ToLower(p)
}
}
if len(rule.ErrorCodes) > 0 {
cr.errorCodeSet = make(map[int]struct{}, len(rule.ErrorCodes))
for _, code := range rule.ErrorCodes {
cr.errorCodeSet[code] = struct{}{}
}
}
return cr
}
// ============================================================================= // =============================================================================
// 测试 ruleMatches 核心匹配逻辑 // 测试 ruleMatchesOptimized 核心匹配逻辑
// ============================================================================= // =============================================================================
func TestRuleMatches_NoConditions(t *testing.T) { func TestRuleMatches_NoConditions(t *testing.T) {
// 没有配置任何条件时,不应该匹配 // 没有配置任何条件时,不应该匹配
svc := newTestService(nil) svc := newTestService(nil)
rule := &model.ErrorPassthroughRule{ rule := newCachedRuleForTest(&model.ErrorPassthroughRule{
Enabled: true, Enabled: true,
ErrorCodes: []int{}, ErrorCodes: []int{},
Keywords: []string{}, Keywords: []string{},
MatchMode: model.MatchModeAny, MatchMode: model.MatchModeAny,
} })
assert.False(t, svc.ruleMatches(rule, 422, "some error message"), var bodyLower string
var bodyLowerDone bool
assert.False(t, svc.ruleMatchesOptimized(rule, 422, []byte("some error message"), &bodyLower, &bodyLowerDone),
"没有配置条件时不应该匹配") "没有配置条件时不应该匹配")
} }
func TestRuleMatches_OnlyErrorCodes_AnyMode(t *testing.T) { func TestRuleMatches_OnlyErrorCodes_AnyMode(t *testing.T) {
svc := newTestService(nil) svc := newTestService(nil)
rule := &model.ErrorPassthroughRule{ rule := newCachedRuleForTest(&model.ErrorPassthroughRule{
Enabled: true, Enabled: true,
ErrorCodes: []int{422, 400}, ErrorCodes: []int{422, 400},
Keywords: []string{}, Keywords: []string{},
MatchMode: model.MatchModeAny, MatchMode: model.MatchModeAny,
} })
tests := []struct { tests := []struct {
name string name string
@@ -186,7 +212,9 @@ func TestRuleMatches_OnlyErrorCodes_AnyMode(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
result := svc.ruleMatches(rule, tt.statusCode, tt.body) var bodyLower string
var bodyLowerDone bool
result := svc.ruleMatchesOptimized(rule, tt.statusCode, []byte(tt.body), &bodyLower, &bodyLowerDone)
assert.Equal(t, tt.expected, result) assert.Equal(t, tt.expected, result)
}) })
} }
@@ -194,12 +222,12 @@ func TestRuleMatches_OnlyErrorCodes_AnyMode(t *testing.T) {
func TestRuleMatches_OnlyKeywords_AnyMode(t *testing.T) { func TestRuleMatches_OnlyKeywords_AnyMode(t *testing.T) {
svc := newTestService(nil) svc := newTestService(nil)
rule := &model.ErrorPassthroughRule{ rule := newCachedRuleForTest(&model.ErrorPassthroughRule{
Enabled: true, Enabled: true,
ErrorCodes: []int{}, ErrorCodes: []int{},
Keywords: []string{"context limit", "model not supported"}, Keywords: []string{"context limit", "model not supported"},
MatchMode: model.MatchModeAny, MatchMode: model.MatchModeAny,
} })
tests := []struct { tests := []struct {
name string name string
@@ -210,16 +238,14 @@ func TestRuleMatches_OnlyKeywords_AnyMode(t *testing.T) {
{"关键词匹配 context limit", 500, "error: context limit reached", true}, {"关键词匹配 context limit", 500, "error: context limit reached", true},
{"关键词匹配 model not supported", 400, "the model not supported here", true}, {"关键词匹配 model not supported", 400, "the model not supported here", true},
{"关键词不匹配", 422, "some other error", false}, {"关键词不匹配", 422, "some other error", false},
// 注意ruleMatches 接收的 body 参数应该是已经转换为小写的 {"关键词大小写 - 自动转换", 500, "Context Limit exceeded", true},
// 实际使用时MatchRule 会先将 body 转换为小写再传给 ruleMatches
{"关键词大小写 - 输入已小写", 500, "context limit exceeded", true},
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
// 模拟 MatchRule 的行为:先转换为小写 var bodyLower string
bodyLower := strings.ToLower(tt.body) var bodyLowerDone bool
result := svc.ruleMatches(rule, tt.statusCode, bodyLower) result := svc.ruleMatchesOptimized(rule, tt.statusCode, []byte(tt.body), &bodyLower, &bodyLowerDone)
assert.Equal(t, tt.expected, result) assert.Equal(t, tt.expected, result)
}) })
} }
@@ -228,12 +254,12 @@ func TestRuleMatches_OnlyKeywords_AnyMode(t *testing.T) {
func TestRuleMatches_BothConditions_AnyMode(t *testing.T) { func TestRuleMatches_BothConditions_AnyMode(t *testing.T) {
// any 模式:错误码 OR 关键词 // any 模式:错误码 OR 关键词
svc := newTestService(nil) svc := newTestService(nil)
rule := &model.ErrorPassthroughRule{ rule := newCachedRuleForTest(&model.ErrorPassthroughRule{
Enabled: true, Enabled: true,
ErrorCodes: []int{422, 400}, ErrorCodes: []int{422, 400},
Keywords: []string{"context limit"}, Keywords: []string{"context limit"},
MatchMode: model.MatchModeAny, MatchMode: model.MatchModeAny,
} })
tests := []struct { tests := []struct {
name string name string
@@ -274,7 +300,9 @@ func TestRuleMatches_BothConditions_AnyMode(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
result := svc.ruleMatches(rule, tt.statusCode, tt.body) var bodyLower string
var bodyLowerDone bool
result := svc.ruleMatchesOptimized(rule, tt.statusCode, []byte(tt.body), &bodyLower, &bodyLowerDone)
assert.Equal(t, tt.expected, result, tt.reason) assert.Equal(t, tt.expected, result, tt.reason)
}) })
} }
@@ -283,12 +311,12 @@ func TestRuleMatches_BothConditions_AnyMode(t *testing.T) {
func TestRuleMatches_BothConditions_AllMode(t *testing.T) { func TestRuleMatches_BothConditions_AllMode(t *testing.T) {
// all 模式:错误码 AND 关键词 // all 模式:错误码 AND 关键词
svc := newTestService(nil) svc := newTestService(nil)
rule := &model.ErrorPassthroughRule{ rule := newCachedRuleForTest(&model.ErrorPassthroughRule{
Enabled: true, Enabled: true,
ErrorCodes: []int{422, 400}, ErrorCodes: []int{422, 400},
Keywords: []string{"context limit"}, Keywords: []string{"context limit"},
MatchMode: model.MatchModeAll, MatchMode: model.MatchModeAll,
} })
tests := []struct { tests := []struct {
name string name string
@@ -329,14 +357,16 @@ func TestRuleMatches_BothConditions_AllMode(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
result := svc.ruleMatches(rule, tt.statusCode, tt.body) var bodyLower string
var bodyLowerDone bool
result := svc.ruleMatchesOptimized(rule, tt.statusCode, []byte(tt.body), &bodyLower, &bodyLowerDone)
assert.Equal(t, tt.expected, result, tt.reason) assert.Equal(t, tt.expected, result, tt.reason)
}) })
} }
} }
// ============================================================================= // =============================================================================
// 测试 platformMatches 平台匹配逻辑 // 测试 platformMatchesCached 平台匹配逻辑
// ============================================================================= // =============================================================================
func TestPlatformMatches(t *testing.T) { func TestPlatformMatches(t *testing.T) {
@@ -394,10 +424,10 @@ func TestPlatformMatches(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
rule := &model.ErrorPassthroughRule{ rule := newCachedRuleForTest(&model.ErrorPassthroughRule{
Platforms: tt.rulePlatforms, Platforms: tt.rulePlatforms,
} })
result := svc.platformMatches(rule, tt.requestPlatform) result := svc.platformMatchesCached(rule, strings.ToLower(tt.requestPlatform))
assert.Equal(t, tt.expected, result) assert.Equal(t, tt.expected, result)
}) })
} }

View File

@@ -107,6 +107,37 @@ func appendOpsUpstreamError(c *gin.Context, ev OpsUpstreamErrorEvent) {
evCopy := ev evCopy := ev
existing = append(existing, &evCopy) existing = append(existing, &evCopy)
c.Set(OpsUpstreamErrorsKey, existing) c.Set(OpsUpstreamErrorsKey, existing)
checkSkipMonitoringForUpstreamEvent(c, &evCopy)
}
// checkSkipMonitoringForUpstreamEvent checks whether the upstream error event
// matches a passthrough rule with skip_monitoring=true and, if so, sets the
// OpsSkipPassthroughKey on the context. This ensures intermediate retry /
// failover errors (which never go through the final applyErrorPassthroughRule
// path) can still suppress ops_error_logs recording.
func checkSkipMonitoringForUpstreamEvent(c *gin.Context, ev *OpsUpstreamErrorEvent) {
if ev.UpstreamStatusCode == 0 {
return
}
svc := getBoundErrorPassthroughService(c)
if svc == nil {
return
}
// Use the best available body representation for keyword matching.
// Even when body is empty, MatchRule can still match rules that only
// specify ErrorCodes (no Keywords), so we always call it.
body := ev.Detail
if body == "" {
body = ev.Message
}
rule := svc.MatchRule(ev.Platform, ev.UpstreamStatusCode, []byte(body))
if rule != nil && rule.SkipMonitoring {
c.Set(OpsSkipPassthroughKey, true)
}
} }
func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string { func marshalOpsUpstreamErrors(events []*OpsUpstreamErrorEvent) *string {