mirror of
https://gitee.com/wanwujie/sub2api
synced 2026-04-03 15:02:13 +08:00
refactor: extract failover error handling into shared HandleFailoverError
- Extract duplicated failover error handling from gateway_handler.go (Gemini-compat & Claude paths) and gemini_v1beta_handler.go into shared failover_loop.go - Introduce TempUnscheduler interface for testability (GatewayService implicitly satisfies it) - Add comprehensive unit tests for HandleFailoverError (32 test cases covering all paths) - Fix golangci-lint issues: errcheck in test type assertion, staticcheck QF1003 if/else→switch
This commit is contained in:
125
backend/internal/handler/failover_loop.go
Normal file
125
backend/internal/handler/failover_loop.go
Normal file
@@ -0,0 +1,125 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"sub2api/internal/service"
|
||||
)
|
||||
|
||||
// TempUnscheduler 用于 HandleFailoverError 中同账号重试耗尽后的临时封禁。
|
||||
// GatewayService 隐式实现此接口。
|
||||
type TempUnscheduler interface {
|
||||
TempUnscheduleRetryableError(ctx context.Context, accountID int64, failoverErr *service.UpstreamFailoverError)
|
||||
}
|
||||
|
||||
// FailoverAction 表示 failover 错误处理后的下一步动作
|
||||
type FailoverAction int
|
||||
|
||||
const (
|
||||
// FailoverRetry 同账号重试(调用方应 continue 重新进入循环,不更换账号)
|
||||
FailoverRetry FailoverAction = iota
|
||||
// FailoverSwitch 切换账号(调用方应 continue 重新选择账号)
|
||||
FailoverSwitch
|
||||
// FailoverExhausted 切换次数耗尽(调用方应返回错误响应)
|
||||
FailoverExhausted
|
||||
// FailoverCanceled context 已取消(调用方应直接 return)
|
||||
FailoverCanceled
|
||||
)
|
||||
|
||||
const (
|
||||
// maxSameAccountRetries 同账号重试次数上限(针对 RetryableOnSameAccount 错误)
|
||||
maxSameAccountRetries = 2
|
||||
// sameAccountRetryDelay 同账号重试间隔
|
||||
sameAccountRetryDelay = 500 * time.Millisecond
|
||||
)
|
||||
|
||||
// FailoverState 跨循环迭代共享的 failover 状态
|
||||
type FailoverState struct {
|
||||
SwitchCount int
|
||||
MaxSwitches int
|
||||
FailedAccountIDs map[int64]struct{}
|
||||
SameAccountRetryCount map[int64]int
|
||||
LastFailoverErr *service.UpstreamFailoverError
|
||||
ForceCacheBilling bool
|
||||
hasBoundSession bool
|
||||
}
|
||||
|
||||
// NewFailoverState 创建 failover 状态
|
||||
func NewFailoverState(maxSwitches int, hasBoundSession bool) *FailoverState {
|
||||
return &FailoverState{
|
||||
MaxSwitches: maxSwitches,
|
||||
FailedAccountIDs: make(map[int64]struct{}),
|
||||
SameAccountRetryCount: make(map[int64]int),
|
||||
hasBoundSession: hasBoundSession,
|
||||
}
|
||||
}
|
||||
|
||||
// HandleFailoverError 处理 UpstreamFailoverError,返回下一步动作。
|
||||
// 包含:缓存计费判断、同账号重试、临时封禁、切换计数、Antigravity 延时。
|
||||
func (s *FailoverState) HandleFailoverError(
|
||||
ctx context.Context,
|
||||
gatewayService TempUnscheduler,
|
||||
accountID int64,
|
||||
platform string,
|
||||
failoverErr *service.UpstreamFailoverError,
|
||||
) FailoverAction {
|
||||
s.LastFailoverErr = failoverErr
|
||||
|
||||
// 缓存计费判断
|
||||
if needForceCacheBilling(s.hasBoundSession, failoverErr) {
|
||||
s.ForceCacheBilling = true
|
||||
}
|
||||
|
||||
// 同账号重试:对 RetryableOnSameAccount 的临时性错误,先在同一账号上重试
|
||||
if failoverErr.RetryableOnSameAccount && s.SameAccountRetryCount[accountID] < maxSameAccountRetries {
|
||||
s.SameAccountRetryCount[accountID]++
|
||||
log.Printf("Account %d: retryable error %d, same-account retry %d/%d",
|
||||
accountID, failoverErr.StatusCode, s.SameAccountRetryCount[accountID], maxSameAccountRetries)
|
||||
if !sleepWithContext(ctx, sameAccountRetryDelay) {
|
||||
return FailoverCanceled
|
||||
}
|
||||
return FailoverRetry
|
||||
}
|
||||
|
||||
// 同账号重试用尽,执行临时封禁
|
||||
if failoverErr.RetryableOnSameAccount {
|
||||
gatewayService.TempUnscheduleRetryableError(ctx, accountID, failoverErr)
|
||||
}
|
||||
|
||||
// 加入失败列表
|
||||
s.FailedAccountIDs[accountID] = struct{}{}
|
||||
|
||||
// 检查是否耗尽
|
||||
if s.SwitchCount >= s.MaxSwitches {
|
||||
return FailoverExhausted
|
||||
}
|
||||
|
||||
// 递增切换计数
|
||||
s.SwitchCount++
|
||||
log.Printf("Account %d: upstream error %d, switching account %d/%d",
|
||||
accountID, failoverErr.StatusCode, s.SwitchCount, s.MaxSwitches)
|
||||
|
||||
// Antigravity 平台换号线性递增延时
|
||||
if platform == service.PlatformAntigravity {
|
||||
if !sleepFailoverDelay(ctx, s.SwitchCount) {
|
||||
return FailoverCanceled
|
||||
}
|
||||
}
|
||||
|
||||
return FailoverSwitch
|
||||
}
|
||||
|
||||
// sleepWithContext 等待指定时长,返回 false 表示 context 已取消。
|
||||
func sleepWithContext(ctx context.Context, d time.Duration) bool {
|
||||
if d <= 0 {
|
||||
return true
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return false
|
||||
case <-time.After(d):
|
||||
return true
|
||||
}
|
||||
}
|
||||
656
backend/internal/handler/failover_loop_test.go
Normal file
656
backend/internal/handler/failover_loop_test.go
Normal file
@@ -0,0 +1,656 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"sub2api/internal/service"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mock
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// mockTempUnscheduler 记录 TempUnscheduleRetryableError 的调用信息。
|
||||
type mockTempUnscheduler struct {
|
||||
calls []tempUnscheduleCall
|
||||
}
|
||||
|
||||
type tempUnscheduleCall struct {
|
||||
accountID int64
|
||||
failoverErr *service.UpstreamFailoverError
|
||||
}
|
||||
|
||||
func (m *mockTempUnscheduler) TempUnscheduleRetryableError(_ context.Context, accountID int64, failoverErr *service.UpstreamFailoverError) {
|
||||
m.calls = append(m.calls, tempUnscheduleCall{accountID: accountID, failoverErr: failoverErr})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func newTestFailoverErr(statusCode int, retryable, forceBilling bool) *service.UpstreamFailoverError {
|
||||
return &service.UpstreamFailoverError{
|
||||
StatusCode: statusCode,
|
||||
RetryableOnSameAccount: retryable,
|
||||
ForceCacheBilling: forceBilling,
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// NewFailoverState 测试
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestNewFailoverState(t *testing.T) {
|
||||
t.Run("初始化字段正确", func(t *testing.T) {
|
||||
fs := NewFailoverState(5, true)
|
||||
require.Equal(t, 5, fs.MaxSwitches)
|
||||
require.Equal(t, 0, fs.SwitchCount)
|
||||
require.NotNil(t, fs.FailedAccountIDs)
|
||||
require.Empty(t, fs.FailedAccountIDs)
|
||||
require.NotNil(t, fs.SameAccountRetryCount)
|
||||
require.Empty(t, fs.SameAccountRetryCount)
|
||||
require.Nil(t, fs.LastFailoverErr)
|
||||
require.False(t, fs.ForceCacheBilling)
|
||||
require.True(t, fs.hasBoundSession)
|
||||
})
|
||||
|
||||
t.Run("无绑定会话", func(t *testing.T) {
|
||||
fs := NewFailoverState(3, false)
|
||||
require.Equal(t, 3, fs.MaxSwitches)
|
||||
require.False(t, fs.hasBoundSession)
|
||||
})
|
||||
|
||||
t.Run("零最大切换次数", func(t *testing.T) {
|
||||
fs := NewFailoverState(0, false)
|
||||
require.Equal(t, 0, fs.MaxSwitches)
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// sleepWithContext 测试
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestSleepWithContext(t *testing.T) {
|
||||
t.Run("零时长立即返回true", func(t *testing.T) {
|
||||
start := time.Now()
|
||||
ok := sleepWithContext(context.Background(), 0)
|
||||
require.True(t, ok)
|
||||
require.Less(t, time.Since(start), 50*time.Millisecond)
|
||||
})
|
||||
|
||||
t.Run("负时长立即返回true", func(t *testing.T) {
|
||||
start := time.Now()
|
||||
ok := sleepWithContext(context.Background(), -1*time.Second)
|
||||
require.True(t, ok)
|
||||
require.Less(t, time.Since(start), 50*time.Millisecond)
|
||||
})
|
||||
|
||||
t.Run("正常等待后返回true", func(t *testing.T) {
|
||||
start := time.Now()
|
||||
ok := sleepWithContext(context.Background(), 50*time.Millisecond)
|
||||
elapsed := time.Since(start)
|
||||
require.True(t, ok)
|
||||
require.GreaterOrEqual(t, elapsed, 40*time.Millisecond)
|
||||
require.Less(t, elapsed, 500*time.Millisecond)
|
||||
})
|
||||
|
||||
t.Run("已取消context立即返回false", func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
start := time.Now()
|
||||
ok := sleepWithContext(ctx, 5*time.Second)
|
||||
require.False(t, ok)
|
||||
require.Less(t, time.Since(start), 50*time.Millisecond)
|
||||
})
|
||||
|
||||
t.Run("等待期间context取消返回false", func(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
go func() {
|
||||
time.Sleep(30 * time.Millisecond)
|
||||
cancel()
|
||||
}()
|
||||
|
||||
start := time.Now()
|
||||
ok := sleepWithContext(ctx, 5*time.Second)
|
||||
elapsed := time.Since(start)
|
||||
require.False(t, ok)
|
||||
require.Less(t, elapsed, 500*time.Millisecond)
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HandleFailoverError — 基本切换流程
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestHandleFailoverError_BasicSwitch(t *testing.T) {
|
||||
t.Run("非重试错误_非Antigravity_直接切换", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(500, false, false)
|
||||
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Equal(t, 1, fs.SwitchCount)
|
||||
require.Contains(t, fs.FailedAccountIDs, int64(100))
|
||||
require.Equal(t, err, fs.LastFailoverErr)
|
||||
require.False(t, fs.ForceCacheBilling)
|
||||
require.Empty(t, mock.calls, "不应调用 TempUnschedule")
|
||||
})
|
||||
|
||||
t.Run("非重试错误_Antigravity_第一次切换无延迟", func(t *testing.T) {
|
||||
// switchCount 从 0→1 时,sleepFailoverDelay(ctx, 1) 的延时 = (1-1)*1s = 0
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(500, false, false)
|
||||
|
||||
start := time.Now()
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, service.PlatformAntigravity, err)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Equal(t, 1, fs.SwitchCount)
|
||||
require.Less(t, elapsed, 200*time.Millisecond, "第一次切换延迟应为 0")
|
||||
})
|
||||
|
||||
t.Run("非重试错误_Antigravity_第二次切换有1秒延迟", func(t *testing.T) {
|
||||
// switchCount 从 1→2 时,sleepFailoverDelay(ctx, 2) 的延时 = (2-1)*1s = 1s
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
fs.SwitchCount = 1 // 模拟已切换一次
|
||||
|
||||
err := newTestFailoverErr(500, false, false)
|
||||
start := time.Now()
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 200, service.PlatformAntigravity, err)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Equal(t, 2, fs.SwitchCount)
|
||||
require.GreaterOrEqual(t, elapsed, 800*time.Millisecond, "第二次切换延迟应约 1s")
|
||||
require.Less(t, elapsed, 3*time.Second)
|
||||
})
|
||||
|
||||
t.Run("连续切换直到耗尽", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(2, false)
|
||||
|
||||
// 第一次切换:0→1
|
||||
err1 := newTestFailoverErr(500, false, false)
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", err1)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Equal(t, 1, fs.SwitchCount)
|
||||
|
||||
// 第二次切换:1→2
|
||||
err2 := newTestFailoverErr(502, false, false)
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 200, "openai", err2)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Equal(t, 2, fs.SwitchCount)
|
||||
|
||||
// 第三次已耗尽:SwitchCount(2) >= MaxSwitches(2)
|
||||
err3 := newTestFailoverErr(503, false, false)
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 300, "openai", err3)
|
||||
require.Equal(t, FailoverExhausted, action)
|
||||
require.Equal(t, 2, fs.SwitchCount, "耗尽时不应继续递增")
|
||||
|
||||
// 验证失败账号列表
|
||||
require.Len(t, fs.FailedAccountIDs, 3)
|
||||
require.Contains(t, fs.FailedAccountIDs, int64(100))
|
||||
require.Contains(t, fs.FailedAccountIDs, int64(200))
|
||||
require.Contains(t, fs.FailedAccountIDs, int64(300))
|
||||
|
||||
// LastFailoverErr 应为最后一次的错误
|
||||
require.Equal(t, err3, fs.LastFailoverErr)
|
||||
})
|
||||
|
||||
t.Run("MaxSwitches为0时首次即耗尽", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(0, false)
|
||||
err := newTestFailoverErr(500, false, false)
|
||||
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Equal(t, FailoverExhausted, action)
|
||||
require.Equal(t, 0, fs.SwitchCount)
|
||||
require.Contains(t, fs.FailedAccountIDs, int64(100))
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HandleFailoverError — 缓存计费 (ForceCacheBilling)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestHandleFailoverError_CacheBilling(t *testing.T) {
|
||||
t.Run("hasBoundSession为true时设置ForceCacheBilling", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, true) // hasBoundSession=true
|
||||
err := newTestFailoverErr(500, false, false)
|
||||
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.True(t, fs.ForceCacheBilling)
|
||||
})
|
||||
|
||||
t.Run("failoverErr.ForceCacheBilling为true时设置", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(500, false, true) // ForceCacheBilling=true
|
||||
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.True(t, fs.ForceCacheBilling)
|
||||
})
|
||||
|
||||
t.Run("两者均为false时不设置", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(500, false, false)
|
||||
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.False(t, fs.ForceCacheBilling)
|
||||
})
|
||||
|
||||
t.Run("一旦设置不会被后续错误重置", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
|
||||
// 第一次:ForceCacheBilling=true → 设置
|
||||
err1 := newTestFailoverErr(500, false, true)
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err1)
|
||||
require.True(t, fs.ForceCacheBilling)
|
||||
|
||||
// 第二次:ForceCacheBilling=false → 仍然保持 true
|
||||
err2 := newTestFailoverErr(502, false, false)
|
||||
fs.HandleFailoverError(context.Background(), mock, 200, "openai", err2)
|
||||
require.True(t, fs.ForceCacheBilling, "ForceCacheBilling 一旦设置不应被重置")
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HandleFailoverError — 同账号重试 (RetryableOnSameAccount)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestHandleFailoverError_SameAccountRetry(t *testing.T) {
|
||||
t.Run("第一次重试返回FailoverRetry", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(400, true, false)
|
||||
|
||||
start := time.Now()
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
require.Equal(t, 1, fs.SameAccountRetryCount[100])
|
||||
require.Equal(t, 0, fs.SwitchCount, "同账号重试不应增加切换计数")
|
||||
require.NotContains(t, fs.FailedAccountIDs, int64(100), "同账号重试不应加入失败列表")
|
||||
require.Empty(t, mock.calls, "同账号重试期间不应调用 TempUnschedule")
|
||||
// 验证等待了 sameAccountRetryDelay (500ms)
|
||||
require.GreaterOrEqual(t, elapsed, 400*time.Millisecond)
|
||||
require.Less(t, elapsed, 2*time.Second)
|
||||
})
|
||||
|
||||
t.Run("第二次重试仍返回FailoverRetry", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(400, true, false)
|
||||
|
||||
// 第一次
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
require.Equal(t, 1, fs.SameAccountRetryCount[100])
|
||||
|
||||
// 第二次
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
require.Equal(t, 2, fs.SameAccountRetryCount[100])
|
||||
|
||||
require.Empty(t, mock.calls, "两次重试期间均不应调用 TempUnschedule")
|
||||
})
|
||||
|
||||
t.Run("第三次重试耗尽_触发TempUnschedule并切换", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(400, true, false)
|
||||
|
||||
// 第一次、第二次重试
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Equal(t, 2, fs.SameAccountRetryCount[100])
|
||||
|
||||
// 第三次:重试已达到 maxSameAccountRetries(2),应切换账号
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Equal(t, 1, fs.SwitchCount)
|
||||
require.Contains(t, fs.FailedAccountIDs, int64(100))
|
||||
|
||||
// 验证 TempUnschedule 被调用
|
||||
require.Len(t, mock.calls, 1)
|
||||
require.Equal(t, int64(100), mock.calls[0].accountID)
|
||||
require.Equal(t, err, mock.calls[0].failoverErr)
|
||||
})
|
||||
|
||||
t.Run("不同账号独立跟踪重试次数", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(5, false)
|
||||
err := newTestFailoverErr(400, true, false)
|
||||
|
||||
// 账号 100 第一次重试
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
require.Equal(t, 1, fs.SameAccountRetryCount[100])
|
||||
|
||||
// 账号 200 第一次重试(独立计数)
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 200, "openai", err)
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
require.Equal(t, 1, fs.SameAccountRetryCount[200])
|
||||
require.Equal(t, 1, fs.SameAccountRetryCount[100], "账号 100 的计数不应受影响")
|
||||
})
|
||||
|
||||
t.Run("重试耗尽后再次遇到同账号_直接切换", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(5, false)
|
||||
err := newTestFailoverErr(400, true, false)
|
||||
|
||||
// 耗尽账号 100 的重试
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err) // retry 1
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err) // retry 2
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", err) // exhausted → switch
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
|
||||
// 再次遇到账号 100,计数仍为 2,条件不满足 → 直接切换
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Len(t, mock.calls, 2, "第二次耗尽也应调用 TempUnschedule")
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HandleFailoverError — TempUnschedule 调用验证
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestHandleFailoverError_TempUnschedule(t *testing.T) {
|
||||
t.Run("非重试错误不调用TempUnschedule", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(500, false, false) // RetryableOnSameAccount=false
|
||||
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Empty(t, mock.calls)
|
||||
})
|
||||
|
||||
t.Run("重试错误耗尽后调用TempUnschedule_传入正确参数", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(502, true, false)
|
||||
|
||||
// 耗尽重试
|
||||
fs.HandleFailoverError(context.Background(), mock, 42, "openai", err)
|
||||
fs.HandleFailoverError(context.Background(), mock, 42, "openai", err)
|
||||
fs.HandleFailoverError(context.Background(), mock, 42, "openai", err)
|
||||
|
||||
require.Len(t, mock.calls, 1)
|
||||
require.Equal(t, int64(42), mock.calls[0].accountID)
|
||||
require.Equal(t, 502, mock.calls[0].failoverErr.StatusCode)
|
||||
require.True(t, mock.calls[0].failoverErr.RetryableOnSameAccount)
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HandleFailoverError — Context 取消
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestHandleFailoverError_ContextCanceled(t *testing.T) {
|
||||
t.Run("同账号重试sleep期间context取消", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(400, true, false)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel() // 立即取消
|
||||
|
||||
start := time.Now()
|
||||
action := fs.HandleFailoverError(ctx, mock, 100, "openai", err)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
require.Equal(t, FailoverCanceled, action)
|
||||
require.Less(t, elapsed, 100*time.Millisecond, "应立即返回")
|
||||
// 重试计数仍应递增
|
||||
require.Equal(t, 1, fs.SameAccountRetryCount[100])
|
||||
})
|
||||
|
||||
t.Run("Antigravity延迟期间context取消", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
fs.SwitchCount = 1 // 下一次 switchCount=2 → delay = 1s
|
||||
err := newTestFailoverErr(500, false, false)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel() // 立即取消
|
||||
|
||||
start := time.Now()
|
||||
action := fs.HandleFailoverError(ctx, mock, 100, service.PlatformAntigravity, err)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
require.Equal(t, FailoverCanceled, action)
|
||||
require.Less(t, elapsed, 100*time.Millisecond, "应立即返回而非等待 1s")
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HandleFailoverError — FailedAccountIDs 跟踪
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestHandleFailoverError_FailedAccountIDs(t *testing.T) {
|
||||
t.Run("切换时添加到失败列表", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", newTestFailoverErr(500, false, false))
|
||||
require.Contains(t, fs.FailedAccountIDs, int64(100))
|
||||
|
||||
fs.HandleFailoverError(context.Background(), mock, 200, "openai", newTestFailoverErr(502, false, false))
|
||||
require.Contains(t, fs.FailedAccountIDs, int64(200))
|
||||
require.Len(t, fs.FailedAccountIDs, 2)
|
||||
})
|
||||
|
||||
t.Run("耗尽时也添加到失败列表", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(0, false)
|
||||
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", newTestFailoverErr(500, false, false))
|
||||
require.Equal(t, FailoverExhausted, action)
|
||||
require.Contains(t, fs.FailedAccountIDs, int64(100))
|
||||
})
|
||||
|
||||
t.Run("同账号重试期间不添加到失败列表", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", newTestFailoverErr(400, true, false))
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
require.NotContains(t, fs.FailedAccountIDs, int64(100))
|
||||
})
|
||||
|
||||
t.Run("同一账号多次切换不重复添加", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(5, false)
|
||||
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", newTestFailoverErr(500, false, false))
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", newTestFailoverErr(500, false, false))
|
||||
require.Len(t, fs.FailedAccountIDs, 1, "map 天然去重")
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HandleFailoverError — LastFailoverErr 更新
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestHandleFailoverError_LastFailoverErr(t *testing.T) {
|
||||
t.Run("每次调用都更新LastFailoverErr", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
|
||||
err1 := newTestFailoverErr(500, false, false)
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err1)
|
||||
require.Equal(t, err1, fs.LastFailoverErr)
|
||||
|
||||
err2 := newTestFailoverErr(502, false, false)
|
||||
fs.HandleFailoverError(context.Background(), mock, 200, "openai", err2)
|
||||
require.Equal(t, err2, fs.LastFailoverErr)
|
||||
})
|
||||
|
||||
t.Run("同账号重试时也更新LastFailoverErr", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
|
||||
err := newTestFailoverErr(400, true, false)
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Equal(t, err, fs.LastFailoverErr)
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HandleFailoverError — 综合集成场景
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestHandleFailoverError_IntegrationScenario(t *testing.T) {
|
||||
t.Run("模拟完整failover流程_多账号混合重试与切换", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, true) // hasBoundSession=true
|
||||
|
||||
// 1. 账号 100 遇到可重试错误,同账号重试 2 次
|
||||
retryErr := newTestFailoverErr(400, true, false)
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", retryErr)
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
require.True(t, fs.ForceCacheBilling, "hasBoundSession=true 应设置 ForceCacheBilling")
|
||||
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 100, "openai", retryErr)
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
|
||||
// 2. 账号 100 重试耗尽 → TempUnschedule + 切换
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 100, "openai", retryErr)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Equal(t, 1, fs.SwitchCount)
|
||||
require.Len(t, mock.calls, 1)
|
||||
|
||||
// 3. 账号 200 遇到不可重试错误 → 直接切换
|
||||
switchErr := newTestFailoverErr(500, false, false)
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 200, "openai", switchErr)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Equal(t, 2, fs.SwitchCount)
|
||||
|
||||
// 4. 账号 300 遇到不可重试错误 → 再切换
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 300, "openai", switchErr)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Equal(t, 3, fs.SwitchCount)
|
||||
|
||||
// 5. 账号 400 → 已耗尽 (SwitchCount=3 >= MaxSwitches=3)
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 400, "openai", switchErr)
|
||||
require.Equal(t, FailoverExhausted, action)
|
||||
|
||||
// 最终状态验证
|
||||
require.Equal(t, 3, fs.SwitchCount, "耗尽时不再递增")
|
||||
require.Len(t, fs.FailedAccountIDs, 4, "4个不同账号都在失败列表中")
|
||||
require.True(t, fs.ForceCacheBilling)
|
||||
require.Len(t, mock.calls, 1, "只有账号 100 触发了 TempUnschedule")
|
||||
})
|
||||
|
||||
t.Run("模拟Antigravity平台完整流程", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(2, false)
|
||||
|
||||
err := newTestFailoverErr(500, false, false)
|
||||
|
||||
// 第一次切换:delay = 0s
|
||||
start := time.Now()
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, service.PlatformAntigravity, err)
|
||||
elapsed := time.Since(start)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Less(t, elapsed, 200*time.Millisecond, "第一次切换延迟为 0")
|
||||
|
||||
// 第二次切换:delay = 1s
|
||||
start = time.Now()
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 200, service.PlatformAntigravity, err)
|
||||
elapsed = time.Since(start)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.GreaterOrEqual(t, elapsed, 800*time.Millisecond, "第二次切换延迟约 1s")
|
||||
|
||||
// 第三次:耗尽(无延迟,因为在检查延迟之前就返回了)
|
||||
start = time.Now()
|
||||
action = fs.HandleFailoverError(context.Background(), mock, 300, service.PlatformAntigravity, err)
|
||||
elapsed = time.Since(start)
|
||||
require.Equal(t, FailoverExhausted, action)
|
||||
require.Less(t, elapsed, 200*time.Millisecond, "耗尽时不应有延迟")
|
||||
})
|
||||
|
||||
t.Run("ForceCacheBilling通过错误标志设置", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false) // hasBoundSession=false
|
||||
|
||||
// 第一次:ForceCacheBilling=false
|
||||
err1 := newTestFailoverErr(500, false, false)
|
||||
fs.HandleFailoverError(context.Background(), mock, 100, "openai", err1)
|
||||
require.False(t, fs.ForceCacheBilling)
|
||||
|
||||
// 第二次:ForceCacheBilling=true(Antigravity 粘性会话切换)
|
||||
err2 := newTestFailoverErr(500, false, true)
|
||||
fs.HandleFailoverError(context.Background(), mock, 200, "openai", err2)
|
||||
require.True(t, fs.ForceCacheBilling, "错误标志应触发 ForceCacheBilling")
|
||||
|
||||
// 第三次:ForceCacheBilling=false,但状态仍保持 true
|
||||
err3 := newTestFailoverErr(500, false, false)
|
||||
fs.HandleFailoverError(context.Background(), mock, 300, "openai", err3)
|
||||
require.True(t, fs.ForceCacheBilling, "不应重置")
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HandleFailoverError — 边界条件
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestHandleFailoverError_EdgeCases(t *testing.T) {
|
||||
t.Run("StatusCode为0的错误也能正常处理", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(0, false, false)
|
||||
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "openai", err)
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
})
|
||||
|
||||
t.Run("AccountID为0也能正常跟踪", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(500, true, false)
|
||||
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 0, "openai", err)
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
require.Equal(t, 1, fs.SameAccountRetryCount[0])
|
||||
})
|
||||
|
||||
t.Run("负AccountID也能正常跟踪", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
err := newTestFailoverErr(500, true, false)
|
||||
|
||||
action := fs.HandleFailoverError(context.Background(), mock, -1, "openai", err)
|
||||
require.Equal(t, FailoverRetry, action)
|
||||
require.Equal(t, 1, fs.SameAccountRetryCount[-1])
|
||||
})
|
||||
|
||||
t.Run("空平台名称不触发Antigravity延迟", func(t *testing.T) {
|
||||
mock := &mockTempUnscheduler{}
|
||||
fs := NewFailoverState(3, false)
|
||||
fs.SwitchCount = 1
|
||||
err := newTestFailoverErr(500, false, false)
|
||||
|
||||
start := time.Now()
|
||||
action := fs.HandleFailoverError(context.Background(), mock, 100, "", err)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
require.Equal(t, FailoverSwitch, action)
|
||||
require.Less(t, elapsed, 200*time.Millisecond, "空平台不应触发 Antigravity 延迟")
|
||||
})
|
||||
}
|
||||
@@ -232,12 +232,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
||||
hasBoundSession := sessionKey != "" && sessionBoundAccountID > 0
|
||||
|
||||
if platform == service.PlatformGemini {
|
||||
maxAccountSwitches := h.maxAccountSwitchesGemini
|
||||
switchCount := 0
|
||||
failedAccountIDs := make(map[int64]struct{})
|
||||
sameAccountRetryCount := make(map[int64]int) // 同账号重试计数
|
||||
var lastFailoverErr *service.UpstreamFailoverError
|
||||
var forceCacheBilling bool // 粘性会话切换时的缓存计费标记
|
||||
fs := NewFailoverState(h.maxAccountSwitchesGemini, hasBoundSession)
|
||||
|
||||
// 单账号分组提前设置 SingleAccountRetry 标记,让 Service 层首次 503 就不设模型限流标记。
|
||||
// 避免单账号分组收到 503 (MODEL_CAPACITY_EXHAUSTED) 时设 29s 限流,导致后续请求连续快速失败。
|
||||
@@ -247,27 +242,27 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
||||
}
|
||||
|
||||
for {
|
||||
selection, err := h.gatewayService.SelectAccountWithLoadAwareness(c.Request.Context(), apiKey.GroupID, sessionKey, reqModel, failedAccountIDs, "") // Gemini 不使用会话限制
|
||||
selection, err := h.gatewayService.SelectAccountWithLoadAwareness(c.Request.Context(), apiKey.GroupID, sessionKey, reqModel, fs.FailedAccountIDs, "") // Gemini 不使用会话限制
|
||||
if err != nil {
|
||||
if len(failedAccountIDs) == 0 {
|
||||
if len(fs.FailedAccountIDs) == 0 {
|
||||
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error(), streamStarted)
|
||||
return
|
||||
}
|
||||
// Antigravity 单账号退避重试:分组内没有其他可用账号时,
|
||||
// 对 503 错误不直接返回,而是清除排除列表、等待退避后重试同一个账号。
|
||||
// 谷歌上游 503 (MODEL_CAPACITY_EXHAUSTED) 通常是暂时性的,等几秒就能恢复。
|
||||
if lastFailoverErr != nil && lastFailoverErr.StatusCode == http.StatusServiceUnavailable && switchCount <= maxAccountSwitches {
|
||||
if sleepAntigravitySingleAccountBackoff(c.Request.Context(), switchCount) {
|
||||
log.Printf("Antigravity single-account 503 retry: clearing failed accounts, retry %d/%d", switchCount, maxAccountSwitches)
|
||||
failedAccountIDs = make(map[int64]struct{})
|
||||
if fs.LastFailoverErr != nil && fs.LastFailoverErr.StatusCode == http.StatusServiceUnavailable && fs.SwitchCount <= fs.MaxSwitches {
|
||||
if sleepAntigravitySingleAccountBackoff(c.Request.Context(), fs.SwitchCount) {
|
||||
log.Printf("Antigravity single-account 503 retry: clearing failed accounts, retry %d/%d", fs.SwitchCount, fs.MaxSwitches)
|
||||
fs.FailedAccountIDs = make(map[int64]struct{})
|
||||
// 设置 context 标记,让 Service 层预检查等待限流过期而非直接切换
|
||||
ctx := context.WithValue(c.Request.Context(), ctxkey.SingleAccountRetry, true)
|
||||
c.Request = c.Request.WithContext(ctx)
|
||||
continue
|
||||
}
|
||||
}
|
||||
if lastFailoverErr != nil {
|
||||
h.handleFailoverExhausted(c, lastFailoverErr, service.PlatformGemini, streamStarted)
|
||||
if fs.LastFailoverErr != nil {
|
||||
h.handleFailoverExhausted(c, fs.LastFailoverErr, service.PlatformGemini, streamStarted)
|
||||
} else {
|
||||
h.handleFailoverExhaustedSimple(c, 502, streamStarted)
|
||||
}
|
||||
@@ -346,8 +341,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
||||
// 转发请求 - 根据账号平台分流
|
||||
var result *service.ForwardResult
|
||||
requestCtx := c.Request.Context()
|
||||
if switchCount > 0 {
|
||||
requestCtx = context.WithValue(requestCtx, ctxkey.AccountSwitchCount, switchCount)
|
||||
if fs.SwitchCount > 0 {
|
||||
requestCtx = context.WithValue(requestCtx, ctxkey.AccountSwitchCount, fs.SwitchCount)
|
||||
}
|
||||
if account.Platform == service.PlatformAntigravity {
|
||||
result, err = h.antigravityGatewayService.ForwardGemini(requestCtx, c, account, reqModel, "generateContent", reqStream, body, hasBoundSession)
|
||||
@@ -360,40 +355,16 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
||||
if err != nil {
|
||||
var failoverErr *service.UpstreamFailoverError
|
||||
if errors.As(err, &failoverErr) {
|
||||
lastFailoverErr = failoverErr
|
||||
if needForceCacheBilling(hasBoundSession, failoverErr) {
|
||||
forceCacheBilling = true
|
||||
}
|
||||
|
||||
// 同账号重试:对 RetryableOnSameAccount 的临时性错误,先在同一账号上重试
|
||||
if failoverErr.RetryableOnSameAccount && sameAccountRetryCount[account.ID] < maxSameAccountRetries {
|
||||
sameAccountRetryCount[account.ID]++
|
||||
log.Printf("Account %d: retryable error %d, same-account retry %d/%d",
|
||||
account.ID, failoverErr.StatusCode, sameAccountRetryCount[account.ID], maxSameAccountRetries)
|
||||
if !sleepSameAccountRetryDelay(c.Request.Context()) {
|
||||
return
|
||||
}
|
||||
action := fs.HandleFailoverError(c.Request.Context(), h.gatewayService, account.ID, account.Platform, failoverErr)
|
||||
switch action {
|
||||
case FailoverRetry, FailoverSwitch:
|
||||
continue
|
||||
}
|
||||
|
||||
// 同账号重试用尽,执行临时封禁并切换账号
|
||||
if failoverErr.RetryableOnSameAccount {
|
||||
h.gatewayService.TempUnscheduleRetryableError(c.Request.Context(), account.ID, failoverErr)
|
||||
}
|
||||
|
||||
failedAccountIDs[account.ID] = struct{}{}
|
||||
if switchCount >= maxAccountSwitches {
|
||||
h.handleFailoverExhausted(c, failoverErr, service.PlatformGemini, streamStarted)
|
||||
case FailoverExhausted:
|
||||
h.handleFailoverExhausted(c, fs.LastFailoverErr, service.PlatformGemini, streamStarted)
|
||||
return
|
||||
case FailoverCanceled:
|
||||
return
|
||||
}
|
||||
switchCount++
|
||||
log.Printf("Account %d: upstream error %d, switching account %d/%d", account.ID, failoverErr.StatusCode, switchCount, maxAccountSwitches)
|
||||
if account.Platform == service.PlatformAntigravity {
|
||||
if !sleepFailoverDelay(c.Request.Context(), switchCount) {
|
||||
return
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
// 错误响应已在Forward中处理,这里只记录日志
|
||||
log.Printf("Forward request failed: %v", err)
|
||||
@@ -421,7 +392,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
||||
}); err != nil {
|
||||
log.Printf("Record usage failed: %v", err)
|
||||
}
|
||||
}(result, account, userAgent, clientIP, forceCacheBilling)
|
||||
}(result, account, userAgent, clientIP, fs.ForceCacheBilling)
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -442,37 +413,32 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
||||
}
|
||||
|
||||
for {
|
||||
maxAccountSwitches := h.maxAccountSwitches
|
||||
switchCount := 0
|
||||
failedAccountIDs := make(map[int64]struct{})
|
||||
sameAccountRetryCount := make(map[int64]int) // 同账号重试计数
|
||||
var lastFailoverErr *service.UpstreamFailoverError
|
||||
fs := NewFailoverState(h.maxAccountSwitches, hasBoundSession)
|
||||
retryWithFallback := false
|
||||
var forceCacheBilling bool // 粘性会话切换时的缓存计费标记
|
||||
|
||||
for {
|
||||
// 选择支持该模型的账号
|
||||
selection, err := h.gatewayService.SelectAccountWithLoadAwareness(c.Request.Context(), currentAPIKey.GroupID, sessionKey, reqModel, failedAccountIDs, parsedReq.MetadataUserID)
|
||||
selection, err := h.gatewayService.SelectAccountWithLoadAwareness(c.Request.Context(), currentAPIKey.GroupID, sessionKey, reqModel, fs.FailedAccountIDs, parsedReq.MetadataUserID)
|
||||
if err != nil {
|
||||
if len(failedAccountIDs) == 0 {
|
||||
if len(fs.FailedAccountIDs) == 0 {
|
||||
h.handleStreamingAwareError(c, http.StatusServiceUnavailable, "api_error", "No available accounts: "+err.Error(), streamStarted)
|
||||
return
|
||||
}
|
||||
// Antigravity 单账号退避重试:分组内没有其他可用账号时,
|
||||
// 对 503 错误不直接返回,而是清除排除列表、等待退避后重试同一个账号。
|
||||
// 谷歌上游 503 (MODEL_CAPACITY_EXHAUSTED) 通常是暂时性的,等几秒就能恢复。
|
||||
if lastFailoverErr != nil && lastFailoverErr.StatusCode == http.StatusServiceUnavailable && switchCount <= maxAccountSwitches {
|
||||
if sleepAntigravitySingleAccountBackoff(c.Request.Context(), switchCount) {
|
||||
log.Printf("Antigravity single-account 503 retry: clearing failed accounts, retry %d/%d", switchCount, maxAccountSwitches)
|
||||
failedAccountIDs = make(map[int64]struct{})
|
||||
if fs.LastFailoverErr != nil && fs.LastFailoverErr.StatusCode == http.StatusServiceUnavailable && fs.SwitchCount <= fs.MaxSwitches {
|
||||
if sleepAntigravitySingleAccountBackoff(c.Request.Context(), fs.SwitchCount) {
|
||||
log.Printf("Antigravity single-account 503 retry: clearing failed accounts, retry %d/%d", fs.SwitchCount, fs.MaxSwitches)
|
||||
fs.FailedAccountIDs = make(map[int64]struct{})
|
||||
// 设置 context 标记,让 Service 层预检查等待限流过期而非直接切换
|
||||
ctx := context.WithValue(c.Request.Context(), ctxkey.SingleAccountRetry, true)
|
||||
c.Request = c.Request.WithContext(ctx)
|
||||
continue
|
||||
}
|
||||
}
|
||||
if lastFailoverErr != nil {
|
||||
h.handleFailoverExhausted(c, lastFailoverErr, platform, streamStarted)
|
||||
if fs.LastFailoverErr != nil {
|
||||
h.handleFailoverExhausted(c, fs.LastFailoverErr, platform, streamStarted)
|
||||
} else {
|
||||
h.handleFailoverExhaustedSimple(c, 502, streamStarted)
|
||||
}
|
||||
@@ -549,8 +515,8 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
||||
// 转发请求 - 根据账号平台分流
|
||||
var result *service.ForwardResult
|
||||
requestCtx := c.Request.Context()
|
||||
if switchCount > 0 {
|
||||
requestCtx = context.WithValue(requestCtx, ctxkey.AccountSwitchCount, switchCount)
|
||||
if fs.SwitchCount > 0 {
|
||||
requestCtx = context.WithValue(requestCtx, ctxkey.AccountSwitchCount, fs.SwitchCount)
|
||||
}
|
||||
if account.Platform == service.PlatformAntigravity && account.Type != service.AccountTypeAPIKey {
|
||||
result, err = h.antigravityGatewayService.Forward(requestCtx, c, account, body, hasBoundSession)
|
||||
@@ -598,40 +564,16 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
||||
}
|
||||
var failoverErr *service.UpstreamFailoverError
|
||||
if errors.As(err, &failoverErr) {
|
||||
lastFailoverErr = failoverErr
|
||||
if needForceCacheBilling(hasBoundSession, failoverErr) {
|
||||
forceCacheBilling = true
|
||||
}
|
||||
|
||||
// 同账号重试:对 RetryableOnSameAccount 的临时性错误,先在同一账号上重试
|
||||
if failoverErr.RetryableOnSameAccount && sameAccountRetryCount[account.ID] < maxSameAccountRetries {
|
||||
sameAccountRetryCount[account.ID]++
|
||||
log.Printf("Account %d: retryable error %d, same-account retry %d/%d",
|
||||
account.ID, failoverErr.StatusCode, sameAccountRetryCount[account.ID], maxSameAccountRetries)
|
||||
if !sleepSameAccountRetryDelay(c.Request.Context()) {
|
||||
return
|
||||
}
|
||||
action := fs.HandleFailoverError(c.Request.Context(), h.gatewayService, account.ID, account.Platform, failoverErr)
|
||||
switch action {
|
||||
case FailoverRetry, FailoverSwitch:
|
||||
continue
|
||||
}
|
||||
|
||||
// 同账号重试用尽,执行临时封禁并切换账号
|
||||
if failoverErr.RetryableOnSameAccount {
|
||||
h.gatewayService.TempUnscheduleRetryableError(c.Request.Context(), account.ID, failoverErr)
|
||||
}
|
||||
|
||||
failedAccountIDs[account.ID] = struct{}{}
|
||||
if switchCount >= maxAccountSwitches {
|
||||
h.handleFailoverExhausted(c, failoverErr, account.Platform, streamStarted)
|
||||
case FailoverExhausted:
|
||||
h.handleFailoverExhausted(c, fs.LastFailoverErr, account.Platform, streamStarted)
|
||||
return
|
||||
case FailoverCanceled:
|
||||
return
|
||||
}
|
||||
switchCount++
|
||||
log.Printf("Account %d: upstream error %d, switching account %d/%d", account.ID, failoverErr.StatusCode, switchCount, maxAccountSwitches)
|
||||
if account.Platform == service.PlatformAntigravity {
|
||||
if !sleepFailoverDelay(c.Request.Context(), switchCount) {
|
||||
return
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
// 错误响应已在Forward中处理,这里只记录日志
|
||||
log.Printf("Account %d: Forward request failed: %v", account.ID, err)
|
||||
@@ -659,7 +601,7 @@ func (h *GatewayHandler) Messages(c *gin.Context) {
|
||||
}); err != nil {
|
||||
log.Printf("Record usage failed: %v", err)
|
||||
}
|
||||
}(result, account, userAgent, clientIP, forceCacheBilling)
|
||||
}(result, account, userAgent, clientIP, fs.ForceCacheBilling)
|
||||
return
|
||||
}
|
||||
if !retryWithFallback {
|
||||
@@ -899,23 +841,6 @@ func needForceCacheBilling(hasBoundSession bool, failoverErr *service.UpstreamFa
|
||||
return hasBoundSession || (failoverErr != nil && failoverErr.ForceCacheBilling)
|
||||
}
|
||||
|
||||
const (
|
||||
// maxSameAccountRetries 同账号重试次数上限(针对 RetryableOnSameAccount 错误)
|
||||
maxSameAccountRetries = 2
|
||||
// sameAccountRetryDelay 同账号重试间隔
|
||||
sameAccountRetryDelay = 500 * time.Millisecond
|
||||
)
|
||||
|
||||
// sleepSameAccountRetryDelay 同账号重试固定延时,返回 false 表示 context 已取消。
|
||||
func sleepSameAccountRetryDelay(ctx context.Context) bool {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return false
|
||||
case <-time.After(sameAccountRetryDelay):
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// sleepFailoverDelay 账号切换线性递增延时:第1次0s、第2次1s、第3次2s…
|
||||
// 返回 false 表示 context 已取消。
|
||||
func sleepFailoverDelay(ctx context.Context, switchCount int) bool {
|
||||
|
||||
@@ -321,11 +321,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
||||
hasBoundSession := sessionKey != "" && sessionBoundAccountID > 0
|
||||
cleanedForUnknownBinding := false
|
||||
|
||||
maxAccountSwitches := h.maxAccountSwitchesGemini
|
||||
switchCount := 0
|
||||
failedAccountIDs := make(map[int64]struct{})
|
||||
var lastFailoverErr *service.UpstreamFailoverError
|
||||
var forceCacheBilling bool // 粘性会话切换时的缓存计费标记
|
||||
fs := NewFailoverState(h.maxAccountSwitchesGemini, hasBoundSession)
|
||||
|
||||
// 单账号分组提前设置 SingleAccountRetry 标记,让 Service 层首次 503 就不设模型限流标记。
|
||||
// 避免单账号分组收到 503 (MODEL_CAPACITY_EXHAUSTED) 时设 29s 限流,导致后续请求连续快速失败。
|
||||
@@ -335,26 +331,26 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
||||
}
|
||||
|
||||
for {
|
||||
selection, err := h.gatewayService.SelectAccountWithLoadAwareness(c.Request.Context(), apiKey.GroupID, sessionKey, modelName, failedAccountIDs, "") // Gemini 不使用会话限制
|
||||
selection, err := h.gatewayService.SelectAccountWithLoadAwareness(c.Request.Context(), apiKey.GroupID, sessionKey, modelName, fs.FailedAccountIDs, "") // Gemini 不使用会话限制
|
||||
if err != nil {
|
||||
if len(failedAccountIDs) == 0 {
|
||||
if len(fs.FailedAccountIDs) == 0 {
|
||||
googleError(c, http.StatusServiceUnavailable, "No available Gemini accounts: "+err.Error())
|
||||
return
|
||||
}
|
||||
// Antigravity 单账号退避重试:分组内没有其他可用账号时,
|
||||
// 对 503 错误不直接返回,而是清除排除列表、等待退避后重试同一个账号。
|
||||
// 谷歌上游 503 (MODEL_CAPACITY_EXHAUSTED) 通常是暂时性的,等几秒就能恢复。
|
||||
if lastFailoverErr != nil && lastFailoverErr.StatusCode == http.StatusServiceUnavailable && switchCount <= maxAccountSwitches {
|
||||
if sleepAntigravitySingleAccountBackoff(c.Request.Context(), switchCount) {
|
||||
log.Printf("Antigravity single-account 503 retry: clearing failed accounts, retry %d/%d", switchCount, maxAccountSwitches)
|
||||
failedAccountIDs = make(map[int64]struct{})
|
||||
if fs.LastFailoverErr != nil && fs.LastFailoverErr.StatusCode == http.StatusServiceUnavailable && fs.SwitchCount <= fs.MaxSwitches {
|
||||
if sleepAntigravitySingleAccountBackoff(c.Request.Context(), fs.SwitchCount) {
|
||||
log.Printf("Antigravity single-account 503 retry: clearing failed accounts, retry %d/%d", fs.SwitchCount, fs.MaxSwitches)
|
||||
fs.FailedAccountIDs = make(map[int64]struct{})
|
||||
// 设置 context 标记,让 Service 层预检查等待限流过期而非直接切换
|
||||
ctx := context.WithValue(c.Request.Context(), ctxkey.SingleAccountRetry, true)
|
||||
c.Request = c.Request.WithContext(ctx)
|
||||
continue
|
||||
}
|
||||
}
|
||||
h.handleGeminiFailoverExhausted(c, lastFailoverErr)
|
||||
h.handleGeminiFailoverExhausted(c, fs.LastFailoverErr)
|
||||
return
|
||||
}
|
||||
account := selection.Account
|
||||
@@ -429,8 +425,8 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
||||
// 5) forward (根据平台分流)
|
||||
var result *service.ForwardResult
|
||||
requestCtx := c.Request.Context()
|
||||
if switchCount > 0 {
|
||||
requestCtx = context.WithValue(requestCtx, ctxkey.AccountSwitchCount, switchCount)
|
||||
if fs.SwitchCount > 0 {
|
||||
requestCtx = context.WithValue(requestCtx, ctxkey.AccountSwitchCount, fs.SwitchCount)
|
||||
}
|
||||
if account.Platform == service.PlatformAntigravity && account.Type != service.AccountTypeAPIKey {
|
||||
result, err = h.antigravityGatewayService.ForwardGemini(requestCtx, c, account, modelName, action, stream, body, hasBoundSession)
|
||||
@@ -443,24 +439,16 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
||||
if err != nil {
|
||||
var failoverErr *service.UpstreamFailoverError
|
||||
if errors.As(err, &failoverErr) {
|
||||
failedAccountIDs[account.ID] = struct{}{}
|
||||
if needForceCacheBilling(hasBoundSession, failoverErr) {
|
||||
forceCacheBilling = true
|
||||
}
|
||||
if switchCount >= maxAccountSwitches {
|
||||
lastFailoverErr = failoverErr
|
||||
h.handleGeminiFailoverExhausted(c, lastFailoverErr)
|
||||
action := fs.HandleFailoverError(c.Request.Context(), h.gatewayService, account.ID, account.Platform, failoverErr)
|
||||
switch action {
|
||||
case FailoverRetry, FailoverSwitch:
|
||||
continue
|
||||
case FailoverExhausted:
|
||||
h.handleGeminiFailoverExhausted(c, fs.LastFailoverErr)
|
||||
return
|
||||
case FailoverCanceled:
|
||||
return
|
||||
}
|
||||
lastFailoverErr = failoverErr
|
||||
switchCount++
|
||||
log.Printf("Gemini account %d: upstream error %d, switching account %d/%d", account.ID, failoverErr.StatusCode, switchCount, maxAccountSwitches)
|
||||
if account.Platform == service.PlatformAntigravity {
|
||||
if !sleepFailoverDelay(c.Request.Context(), switchCount) {
|
||||
return
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
// ForwardNative already wrote the response
|
||||
log.Printf("Gemini native forward failed: %v", err)
|
||||
@@ -506,7 +494,7 @@ func (h *GatewayHandler) GeminiV1BetaModels(c *gin.Context) {
|
||||
}); err != nil {
|
||||
log.Printf("Record usage failed: %v", err)
|
||||
}
|
||||
}(result, account, userAgent, clientIP, forceCacheBilling)
|
||||
}(result, account, userAgent, clientIP, fs.ForceCacheBilling)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
@@ -219,7 +219,9 @@ func TestApplyErrorPassthroughRule_SkipMonitoringSetsContextKey(t *testing.T) {
|
||||
assert.True(t, matched)
|
||||
v, exists := c.Get(OpsSkipPassthroughKey)
|
||||
assert.True(t, exists, "OpsSkipPassthroughKey should be set when skip_monitoring=true")
|
||||
assert.True(t, v.(bool))
|
||||
boolVal, ok := v.(bool)
|
||||
assert.True(t, ok, "value should be a bool")
|
||||
assert.True(t, boolVal)
|
||||
}
|
||||
|
||||
func TestApplyErrorPassthroughRule_NoSkipMonitoringDoesNotSetContextKey(t *testing.T) {
|
||||
|
||||
@@ -385,9 +385,10 @@ func (s *GatewayService) TempUnscheduleRetryableError(ctx context.Context, accou
|
||||
return
|
||||
}
|
||||
// 根据状态码选择封禁策略
|
||||
if failoverErr.StatusCode == http.StatusBadRequest {
|
||||
switch failoverErr.StatusCode {
|
||||
case http.StatusBadRequest:
|
||||
tempUnscheduleGoogleConfigError(ctx, s.accountRepo, accountID, "[handler]")
|
||||
} else if failoverErr.StatusCode == http.StatusBadGateway {
|
||||
case http.StatusBadGateway:
|
||||
tempUnscheduleEmptyResponse(ctx, s.accountRepo, accountID, "[handler]")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user