fix(scheduler): resolve SetSnapshot race conditions and remove usage throttle

Backend: Fix three race conditions in SetSnapshot that caused account
scheduling anomalies and broken sticky sessions:
- Use Lua CAS script for atomic version activation, preventing version
  rollback when concurrent goroutines write snapshots simultaneously
- Add UnlockBucket to release rebuild lock immediately after completion
  instead of waiting 30s TTL expiry
- Replace immediate DEL of old snapshots with 60s EXPIRE grace period,
  preventing readers from hitting empty ZRANGE during version switches

Frontend: Remove serial queue throttle (1-2s delay per request) from
usage loading since backend now uses passive sampling. All usage
requests execute immediately in parallel.
This commit is contained in:
shaw
2026-04-29 22:48:39 +08:00
parent 40feb86ba4
commit 8bf2a7b88a
7 changed files with 91 additions and 95 deletions

View File

@@ -24,6 +24,49 @@ const (
defaultSchedulerSnapshotMGetChunkSize = 128
defaultSchedulerSnapshotWriteChunkSize = 256
// snapshotGraceTTLSeconds is the grace period (in seconds) before an old snapshot expires.
// Replaces an immediate DEL, giving readers still on the old version enough time to finish their ZRANGE.
snapshotGraceTTLSeconds = 60
)
var (
// activateSnapshotScript atomically switches the active snapshot version
// using a compare-and-swap: the active pointer only moves forward, i.e.
// the switch happens solely when the new version number >= the currently
// active one, so concurrent writers cannot roll the version back.
// The superseded snapshot is given an EXPIRE grace period instead of an
// immediate DEL, so readers still iterating the old version do not race
// against its deletion.
//
// KEYS[1] = activeKey (sched:active:{bucket})
// KEYS[2] = readyKey (sched:ready:{bucket})
// KEYS[3] = bucketSetKey (sched:buckets)
// KEYS[4] = snapshotKey (the freshly written snapshot key; deleted if stale)
// ARGV[1] = new version number, as a string
// ARGV[2] = bucket string (for SADD into the bucket set)
// ARGV[3] = snapshot key prefix (used to reconstruct the old snapshot key)
// ARGV[4] = grace-period TTL in seconds
//
// Returns 1 = activated, 0 = version too old, not activated (the new
// snapshot data is cleaned up via DEL on KEYS[4] in that case).
//
// NOTE(review): the old snapshot key (ARGV[3] .. currentActive) is built
// inside the script instead of being declared in KEYS. That works on a
// single Redis instance but violates the key-declaration contract needed
// for Redis Cluster slot routing — confirm cluster mode is not a target.
activateSnapshotScript = redis.NewScript(`
local currentActive = redis.call('GET', KEYS[1])
local newVersion = tonumber(ARGV[1])
if currentActive ~= false then
local curVersion = tonumber(currentActive)
if curVersion and newVersion < curVersion then
redis.call('DEL', KEYS[4])
return 0
end
end
redis.call('SET', KEYS[1], ARGV[1])
redis.call('SET', KEYS[2], '1')
redis.call('SADD', KEYS[3], ARGV[2])
if currentActive ~= false and currentActive ~= ARGV[1] then
redis.call('EXPIRE', ARGV[3] .. currentActive, tonumber(ARGV[4]))
end
return 1
`)
)
type schedulerCache struct {
@@ -108,9 +151,9 @@ func (c *schedulerCache) GetSnapshot(ctx context.Context, bucket service.Schedul
}
func (c *schedulerCache) SetSnapshot(ctx context.Context, bucket service.SchedulerBucket, accounts []service.Account) error {
activeKey := schedulerBucketKey(schedulerActivePrefix, bucket)
oldActive, _ := c.rdb.Get(ctx, activeKey).Result()
// Phase 1: 分配新版本号并写入快照数据。
// INCR 保证每个调用方获得唯一递增版本号。
// 写入的 snapshotKey 是新的版本化 keyreader 尚不知晓,因此无竞态。
versionKey := schedulerBucketKey(schedulerVersionPrefix, bucket)
version, err := c.rdb.Incr(ctx, versionKey).Result()
if err != nil {
@@ -124,7 +167,6 @@ func (c *schedulerCache) SetSnapshot(ctx context.Context, bucket service.Schedul
return err
}
pipe := c.rdb.Pipeline()
if len(accounts) > 0 {
// 使用序号作为 score保持数据库返回的排序语义。
members := make([]redis.Z, 0, len(accounts))
@@ -134,6 +176,7 @@ func (c *schedulerCache) SetSnapshot(ctx context.Context, bucket service.Schedul
Member: strconv.FormatInt(account.ID, 10),
})
}
pipe := c.rdb.Pipeline()
for start := 0; start < len(members); start += c.writeChunkSize {
end := start + c.writeChunkSize
if end > len(members) {
@@ -141,18 +184,25 @@ func (c *schedulerCache) SetSnapshot(ctx context.Context, bucket service.Schedul
}
pipe.ZAdd(ctx, snapshotKey, members[start:end]...)
}
} else {
pipe.Del(ctx, snapshotKey)
}
pipe.Set(ctx, activeKey, versionStr, 0)
pipe.Set(ctx, schedulerBucketKey(schedulerReadyPrefix, bucket), "1", 0)
pipe.SAdd(ctx, schedulerBucketSetKey, bucket.String())
if _, err := pipe.Exec(ctx); err != nil {
return err
if _, err := pipe.Exec(ctx); err != nil {
return err
}
}
if oldActive != "" && oldActive != versionStr {
_ = c.rdb.Del(ctx, schedulerSnapshotKey(bucket, oldActive)).Err()
// Phase 2: 原子 CAS 激活版本。
// Lua 脚本保证:仅当新版本 >= 当前激活版本时才切换 active 指针,
// 防止并发写入导致版本回滚。
// 旧快照使用 EXPIRE 宽限期而非立即 DEL避免 reader 竞态。
activeKey := schedulerBucketKey(schedulerActivePrefix, bucket)
readyKey := schedulerBucketKey(schedulerReadyPrefix, bucket)
snapshotKeyPrefix := fmt.Sprintf("%s%d:%s:%s:v", schedulerSnapshotPrefix, bucket.GroupID, bucket.Platform, bucket.Mode)
keys := []string{activeKey, readyKey, schedulerBucketSetKey, snapshotKey}
args := []any{versionStr, bucket.String(), snapshotKeyPrefix, snapshotGraceTTLSeconds}
_, err = activateSnapshotScript.Run(ctx, c.rdb, keys, args...).Result()
if err != nil {
return err
}
return nil
@@ -232,6 +282,11 @@ func (c *schedulerCache) TryLockBucket(ctx context.Context, bucket service.Sched
return c.rdb.SetNX(ctx, key, time.Now().UnixNano(), ttl).Result()
}
// UnlockBucket releases the rebuild lock for the given bucket by deleting
// its lock key, so the lock is freed immediately rather than lingering
// until its TTL (set in TryLockBucket) expires.
func (c *schedulerCache) UnlockBucket(ctx context.Context, bucket service.SchedulerBucket) error {
	return c.rdb.Del(ctx, schedulerBucketKey(schedulerLockPrefix, bucket)).Err()
}
func (c *schedulerCache) ListBuckets(ctx context.Context) ([]service.SchedulerBucket, error) {
raw, err := c.rdb.SMembers(ctx, schedulerBucketSetKey).Result()
if err != nil {