Files
sub2api/backend/internal/service/ops_settings.go

569 lines
18 KiB
Go
Raw Normal View History

package service
import (
"context"
"encoding/json"
"errors"
"strings"
"time"
)
const (
opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)
// =========================
// Email notification config
// =========================
func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
defaultCfg := defaultOpsEmailNotificationConfig()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
// Initialize defaults on first read (best-effort).
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsEmailNotificationConfig{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
// Corrupted JSON should not break ops UI; fall back to defaults.
return defaultCfg, nil
}
normalizeOpsEmailNotificationConfig(cfg)
return cfg, nil
}
func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if req == nil {
return nil, errors.New("invalid request")
}
cfg, err := s.GetEmailNotificationConfig(ctx)
if err != nil {
return nil, err
}
if req.Alert != nil {
cfg.Alert.Enabled = req.Alert.Enabled
if req.Alert.Recipients != nil {
cfg.Alert.Recipients = req.Alert.Recipients
}
cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
}
if req.Report != nil {
cfg.Report.Enabled = req.Report.Enabled
if req.Report.Recipients != nil {
cfg.Report.Recipients = req.Report.Recipients
}
cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
}
if err := validateOpsEmailNotificationConfig(cfg); err != nil {
return nil, err
}
normalizeOpsEmailNotificationConfig(cfg)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
return nil, err
}
return cfg, nil
}
func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
return &OpsEmailNotificationConfig{
Alert: OpsEmailAlertConfig{
Enabled: true,
Recipients: []string{},
MinSeverity: "",
RateLimitPerHour: 0,
BatchingWindowSeconds: 0,
IncludeResolvedAlerts: false,
},
Report: OpsEmailReportConfig{
Enabled: false,
Recipients: []string{},
DailySummaryEnabled: false,
DailySummarySchedule: "0 9 * * *",
WeeklySummaryEnabled: false,
WeeklySummarySchedule: "0 9 * * 1",
ErrorDigestEnabled: false,
ErrorDigestSchedule: "0 9 * * *",
ErrorDigestMinCount: 10,
AccountHealthEnabled: false,
AccountHealthSchedule: "0 9 * * *",
AccountHealthErrorRateThreshold: 10.0,
},
}
}
func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
if cfg == nil {
return
}
if cfg.Alert.Recipients == nil {
cfg.Alert.Recipients = []string{}
}
if cfg.Report.Recipients == nil {
cfg.Report.Recipients = []string{}
}
cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)
// Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
if cfg.Report.DailySummarySchedule == "" {
cfg.Report.DailySummarySchedule = "0 9 * * *"
}
if cfg.Report.WeeklySummarySchedule == "" {
cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
}
if cfg.Report.ErrorDigestSchedule == "" {
cfg.Report.ErrorDigestSchedule = "0 9 * * *"
}
if cfg.Report.AccountHealthSchedule == "" {
cfg.Report.AccountHealthSchedule = "0 9 * * *"
}
}
func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
if cfg == nil {
return errors.New("invalid config")
}
if cfg.Alert.RateLimitPerHour < 0 {
return errors.New("alert.rate_limit_per_hour must be >= 0")
}
if cfg.Alert.BatchingWindowSeconds < 0 {
return errors.New("alert.batching_window_seconds must be >= 0")
}
switch strings.TrimSpace(cfg.Alert.MinSeverity) {
case "", "critical", "warning", "info":
default:
return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
}
if cfg.Report.ErrorDigestMinCount < 0 {
return errors.New("report.error_digest_min_count must be >= 0")
}
if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
}
return nil
}
// =========================
// Alert runtime settings
// =========================
func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
return &OpsAlertRuntimeSettings{
EvaluationIntervalSeconds: 60,
DistributedLock: OpsDistributedLockSettings{
Enabled: true,
Key: opsAlertEvaluatorLeaderLockKeyDefault,
TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
},
Silencing: OpsAlertSilencingSettings{
Enabled: false,
GlobalUntilRFC3339: "",
GlobalReason: "",
Entries: []OpsAlertSilenceEntry{},
},
}
}
func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
if s == nil {
return
}
s.Key = strings.TrimSpace(s.Key)
if s.Key == "" {
s.Key = defaultKey
}
if s.TTLSeconds <= 0 {
s.TTLSeconds = defaultTTLSeconds
}
}
func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
if s == nil {
return
}
s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
s.GlobalReason = strings.TrimSpace(s.GlobalReason)
if s.Entries == nil {
s.Entries = []OpsAlertSilenceEntry{}
}
for i := range s.Entries {
s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
}
}
func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
if strings.TrimSpace(s.Key) == "" {
return errors.New("distributed_lock.key is required")
}
if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
}
return nil
}
func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
parse := func(raw string) error {
if strings.TrimSpace(raw) == "" {
return nil
}
if _, err := time.Parse(time.RFC3339, raw); err != nil {
return errors.New("silencing time must be RFC3339")
}
return nil
}
if err := parse(s.GlobalUntilRFC3339); err != nil {
return err
}
for _, entry := range s.Entries {
if strings.TrimSpace(entry.UntilRFC3339) == "" {
return errors.New("silencing.entries.until_rfc3339 is required")
}
if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
}
}
return nil
}
func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
defaultCfg := defaultOpsAlertRuntimeSettings()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsAlertRuntimeSettings{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
return defaultCfg, nil
}
if cfg.EvaluationIntervalSeconds <= 0 {
cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
}
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
return cfg, nil
}
func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if cfg == nil {
return nil, errors.New("invalid config")
}
if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
}
if cfg.DistributedLock.Enabled {
if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
return nil, err
}
}
if cfg.Silencing.Enabled {
if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
return nil, err
}
}
defaultCfg := defaultOpsAlertRuntimeSettings()
normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
normalizeOpsAlertSilencingSettings(&cfg.Silencing)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
return nil, err
}
// Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
updated := &OpsAlertRuntimeSettings{}
_ = json.Unmarshal(raw, updated)
return updated, nil
}
// =========================
// Advanced settings
// =========================
func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
return &OpsAdvancedSettings{
DataRetention: OpsDataRetentionSettings{
CleanupEnabled: false,
CleanupSchedule: "0 2 * * *",
ErrorLogRetentionDays: 30,
MinuteMetricsRetentionDays: 30,
HourlyMetricsRetentionDays: 30,
},
Aggregation: OpsAggregationSettings{
AggregationEnabled: false,
},
IgnoreCountTokensErrors: true, // count_tokens 404 是预期行为,默认忽略
IgnoreContextCanceled: true, // Default to true - client disconnects are not errors
IgnoreNoAvailableAccounts: false, // Default to false - this is a real routing issue
IgnoreInsufficientBalanceErrors: false, // 默认不忽略,余额不足可能需要关注
DisplayOpenAITokenStats: false,
DisplayAlertEvents: true,
AutoRefreshEnabled: false,
AutoRefreshIntervalSec: 30,
}
}
func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
if cfg == nil {
return
}
cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule)
if cfg.DataRetention.CleanupSchedule == "" {
cfg.DataRetention.CleanupSchedule = "0 2 * * *"
}
feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup Background / 背景 The ops cleanup task currently rejects retention days < 1 in both validate and normalize, so operators who want minimal-history setups (e.g. high churn deployments that prefer near-realtime cleanup) cannot express that intent through the UI. The only options are 1+ days, which keeps at least 24h of history regardless of cron frequency. ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数, 让希望尽量不留历史的运维场景(高吞吐部署 + 想用近实时清理)无法通过 UI 表达。最低只能配 1,等于不管 cron 多频繁,至少都会保留 24 小时的历史。 Purpose / 目的 Let admins set retention days to 0, meaning "every scheduled cleanup run wipes the corresponding table(s) entirely". Combined with a more frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup. 允许管理员把保留天数设为 0,语义为"每次定时清理时把对应表全部清空"。 搭配更频繁的 cron(比如每小时整点)即可获得近似滚动清理的效果。 Changes / 改动内容 Backend - service/ops_settings.go: validate accepts [0, 365]; normalize only refills default 30 when value is < 0 (negative is treated as legacy bad data, 0 is honoured) - service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)` returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and short-circuits to a new `truncateOpsTable` helper that uses `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps the existing batched DELETE path unchanged. Empty tables skip TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely - Extract `isMissingRelationError` helper to dedupe the "table not yet created" tolerance shared by both delete and truncate paths - Add unit tests for `opsCleanupPlan` (three branches) and `isMissingRelationError` 后端 - service/ops_settings.go: validate 接受 [0, 365];normalize 仅在 < 0 时回填默认 30(负数视为脏数据,0 被尊重) - service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回 `(cutoff, truncate, ok)`。days==0 → truncate=true,走新增 `truncateOpsTable`(TRUNCATE TABLE,O(1),无 WAL、无 VACUUM 压力); days>0 仍走原批量 DELETE 路径,行为完全不变。空表跳过 TRUNCATE, 避免无意义的 ACCESS EXCLUSIVE 锁 - 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的 "表不存在"宽容判断 - 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试 Frontend - OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0 - i18n (zh/en): hint mentions "0 = wipe all on every cleanup", validation message updated to 0-365 range 前端 - OpsSettingsDialog.vue: 校验放宽到 [0, 365],input min 改 0 - i18n(zh/en):hint 补"0 = 每次清理时清空所有",错误提示改 0-365 Trade-offs / 取舍 - TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only have the cleanup task as a writer, so the lock is invisible to other workloads - Empty-table guard avoids the lock when there is nothing to clean - Negative values are still treated as legacy bad data and replaced with default 30 to preserve compatibility
2026-04-29 15:01:02 +08:00
// 保留天数0 表示每次定时清理全部(清空所有),> 0 表示按天数保留;
// 仅在拿到非法的负数时回填默认值,避免覆盖用户主动设的 0。
if cfg.DataRetention.ErrorLogRetentionDays < 0 {
cfg.DataRetention.ErrorLogRetentionDays = 30
}
feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup Background / 背景 The ops cleanup task currently rejects retention days < 1 in both validate and normalize, so operators who want minimal-history setups (e.g. high churn deployments that prefer near-realtime cleanup) cannot express that intent through the UI. The only options are 1+ days, which keeps at least 24h of history regardless of cron frequency. ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数, 让希望尽量不留历史的运维场景(高吞吐部署 + 想用近实时清理)无法通过 UI 表达。最低只能配 1,等于不管 cron 多频繁,至少都会保留 24 小时的历史。 Purpose / 目的 Let admins set retention days to 0, meaning "every scheduled cleanup run wipes the corresponding table(s) entirely". Combined with a more frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup. 允许管理员把保留天数设为 0,语义为"每次定时清理时把对应表全部清空"。 搭配更频繁的 cron(比如每小时整点)即可获得近似滚动清理的效果。 Changes / 改动内容 Backend - service/ops_settings.go: validate accepts [0, 365]; normalize only refills default 30 when value is < 0 (negative is treated as legacy bad data, 0 is honoured) - service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)` returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and short-circuits to a new `truncateOpsTable` helper that uses `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps the existing batched DELETE path unchanged. Empty tables skip TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely - Extract `isMissingRelationError` helper to dedupe the "table not yet created" tolerance shared by both delete and truncate paths - Add unit tests for `opsCleanupPlan` (three branches) and `isMissingRelationError` 后端 - service/ops_settings.go: validate 接受 [0, 365];normalize 仅在 < 0 时回填默认 30(负数视为脏数据,0 被尊重) - service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回 `(cutoff, truncate, ok)`。days==0 → truncate=true,走新增 `truncateOpsTable`(TRUNCATE TABLE,O(1),无 WAL、无 VACUUM 压力); days>0 仍走原批量 DELETE 路径,行为完全不变。空表跳过 TRUNCATE, 避免无意义的 ACCESS EXCLUSIVE 锁 - 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的 "表不存在"宽容判断 - 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试 Frontend - OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0 - i18n (zh/en): hint mentions "0 = wipe all on every cleanup", validation message updated to 0-365 range 前端 - OpsSettingsDialog.vue: 校验放宽到 [0, 365],input min 改 0 - i18n(zh/en):hint 补"0 = 每次清理时清空所有",错误提示改 0-365 Trade-offs / 取舍 - TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only have the cleanup task as a writer, so the lock is invisible to other workloads - Empty-table guard avoids the lock when there is nothing to clean - Negative values are still treated as legacy bad data and replaced with default 30 to preserve compatibility
2026-04-29 15:01:02 +08:00
if cfg.DataRetention.MinuteMetricsRetentionDays < 0 {
cfg.DataRetention.MinuteMetricsRetentionDays = 30
}
feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup Background / 背景 The ops cleanup task currently rejects retention days < 1 in both validate and normalize, so operators who want minimal-history setups (e.g. high churn deployments that prefer near-realtime cleanup) cannot express that intent through the UI. The only options are 1+ days, which keeps at least 24h of history regardless of cron frequency. ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数, 让希望尽量不留历史的运维场景(高吞吐部署 + 想用近实时清理)无法通过 UI 表达。最低只能配 1,等于不管 cron 多频繁,至少都会保留 24 小时的历史。 Purpose / 目的 Let admins set retention days to 0, meaning "every scheduled cleanup run wipes the corresponding table(s) entirely". Combined with a more frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup. 允许管理员把保留天数设为 0,语义为"每次定时清理时把对应表全部清空"。 搭配更频繁的 cron(比如每小时整点)即可获得近似滚动清理的效果。 Changes / 改动内容 Backend - service/ops_settings.go: validate accepts [0, 365]; normalize only refills default 30 when value is < 0 (negative is treated as legacy bad data, 0 is honoured) - service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)` returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and short-circuits to a new `truncateOpsTable` helper that uses `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps the existing batched DELETE path unchanged. Empty tables skip TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely - Extract `isMissingRelationError` helper to dedupe the "table not yet created" tolerance shared by both delete and truncate paths - Add unit tests for `opsCleanupPlan` (three branches) and `isMissingRelationError` 后端 - service/ops_settings.go: validate 接受 [0, 365];normalize 仅在 < 0 时回填默认 30(负数视为脏数据,0 被尊重) - service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回 `(cutoff, truncate, ok)`。days==0 → truncate=true,走新增 `truncateOpsTable`(TRUNCATE TABLE,O(1),无 WAL、无 VACUUM 压力); days>0 仍走原批量 DELETE 路径,行为完全不变。空表跳过 TRUNCATE, 避免无意义的 ACCESS EXCLUSIVE 锁 - 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的 "表不存在"宽容判断 - 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试 Frontend - OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0 - i18n (zh/en): hint mentions "0 = wipe all on every cleanup", validation message updated to 0-365 range 前端 - OpsSettingsDialog.vue: 校验放宽到 [0, 365],input min 改 0 - i18n(zh/en):hint 补"0 = 每次清理时清空所有",错误提示改 0-365 Trade-offs / 取舍 - TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only have the cleanup task as a writer, so the lock is invisible to other workloads - Empty-table guard avoids the lock when there is nothing to clean - Negative values are still treated as legacy bad data and replaced with default 30 to preserve compatibility
2026-04-29 15:01:02 +08:00
if cfg.DataRetention.HourlyMetricsRetentionDays < 0 {
cfg.DataRetention.HourlyMetricsRetentionDays = 30
}
// Normalize auto refresh interval (default 30 seconds)
if cfg.AutoRefreshIntervalSec <= 0 {
cfg.AutoRefreshIntervalSec = 30
}
}
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
if cfg == nil {
return errors.New("invalid config")
}
feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup Background / 背景 The ops cleanup task currently rejects retention days < 1 in both validate and normalize, so operators who want minimal-history setups (e.g. high churn deployments that prefer near-realtime cleanup) cannot express that intent through the UI. The only options are 1+ days, which keeps at least 24h of history regardless of cron frequency. ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数, 让希望尽量不留历史的运维场景(高吞吐部署 + 想用近实时清理)无法通过 UI 表达。最低只能配 1,等于不管 cron 多频繁,至少都会保留 24 小时的历史。 Purpose / 目的 Let admins set retention days to 0, meaning "every scheduled cleanup run wipes the corresponding table(s) entirely". Combined with a more frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup. 允许管理员把保留天数设为 0,语义为"每次定时清理时把对应表全部清空"。 搭配更频繁的 cron(比如每小时整点)即可获得近似滚动清理的效果。 Changes / 改动内容 Backend - service/ops_settings.go: validate accepts [0, 365]; normalize only refills default 30 when value is < 0 (negative is treated as legacy bad data, 0 is honoured) - service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)` returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and short-circuits to a new `truncateOpsTable` helper that uses `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps the existing batched DELETE path unchanged. Empty tables skip TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely - Extract `isMissingRelationError` helper to dedupe the "table not yet created" tolerance shared by both delete and truncate paths - Add unit tests for `opsCleanupPlan` (three branches) and `isMissingRelationError` 后端 - service/ops_settings.go: validate 接受 [0, 365];normalize 仅在 < 0 时回填默认 30(负数视为脏数据,0 被尊重) - service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回 `(cutoff, truncate, ok)`。days==0 → truncate=true,走新增 `truncateOpsTable`(TRUNCATE TABLE,O(1),无 WAL、无 VACUUM 压力); days>0 仍走原批量 DELETE 路径,行为完全不变。空表跳过 TRUNCATE, 避免无意义的 ACCESS EXCLUSIVE 锁 - 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的 "表不存在"宽容判断 - 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试 Frontend - OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0 - i18n (zh/en): hint mentions "0 = wipe all on every cleanup", validation message updated to 0-365 range 前端 - OpsSettingsDialog.vue: 校验放宽到 [0, 365],input min 改 0 - i18n(zh/en):hint 补"0 = 每次清理时清空所有",错误提示改 0-365 Trade-offs / 取舍 - TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only have the cleanup task as a writer, so the lock is invisible to other workloads - Empty-table guard avoids the lock when there is nothing to clean - Negative values are still treated as legacy bad data and replaced with default 30 to preserve compatibility
2026-04-29 15:01:02 +08:00
// 保留天数0 表示每次清理全部1-365 表示按天数保留。
if cfg.DataRetention.ErrorLogRetentionDays < 0 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
return errors.New("error_log_retention_days must be between 0 and 365")
}
feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup Background / 背景 The ops cleanup task currently rejects retention days < 1 in both validate and normalize, so operators who want minimal-history setups (e.g. high churn deployments that prefer near-realtime cleanup) cannot express that intent through the UI. The only options are 1+ days, which keeps at least 24h of history regardless of cron frequency. ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数, 让希望尽量不留历史的运维场景(高吞吐部署 + 想用近实时清理)无法通过 UI 表达。最低只能配 1,等于不管 cron 多频繁,至少都会保留 24 小时的历史。 Purpose / 目的 Let admins set retention days to 0, meaning "every scheduled cleanup run wipes the corresponding table(s) entirely". Combined with a more frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup. 允许管理员把保留天数设为 0,语义为"每次定时清理时把对应表全部清空"。 搭配更频繁的 cron(比如每小时整点)即可获得近似滚动清理的效果。 Changes / 改动内容 Backend - service/ops_settings.go: validate accepts [0, 365]; normalize only refills default 30 when value is < 0 (negative is treated as legacy bad data, 0 is honoured) - service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)` returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and short-circuits to a new `truncateOpsTable` helper that uses `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps the existing batched DELETE path unchanged. Empty tables skip TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely - Extract `isMissingRelationError` helper to dedupe the "table not yet created" tolerance shared by both delete and truncate paths - Add unit tests for `opsCleanupPlan` (three branches) and `isMissingRelationError` 后端 - service/ops_settings.go: validate 接受 [0, 365];normalize 仅在 < 0 时回填默认 30(负数视为脏数据,0 被尊重) - service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回 `(cutoff, truncate, ok)`。days==0 → truncate=true,走新增 `truncateOpsTable`(TRUNCATE TABLE,O(1),无 WAL、无 VACUUM 压力); days>0 仍走原批量 DELETE 路径,行为完全不变。空表跳过 TRUNCATE, 避免无意义的 ACCESS EXCLUSIVE 锁 - 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的 "表不存在"宽容判断 - 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试 Frontend - OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0 - i18n (zh/en): hint mentions "0 = wipe all on every cleanup", validation message updated to 0-365 range 前端 - OpsSettingsDialog.vue: 校验放宽到 [0, 365],input min 改 0 - i18n(zh/en):hint 补"0 = 每次清理时清空所有",错误提示改 0-365 Trade-offs / 取舍 - TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only have the cleanup task as a writer, so the lock is invisible to other workloads - Empty-table guard avoids the lock when there is nothing to clean - Negative values are still treated as legacy bad data and replaced with default 30 to preserve compatibility
2026-04-29 15:01:02 +08:00
if cfg.DataRetention.MinuteMetricsRetentionDays < 0 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
return errors.New("minute_metrics_retention_days must be between 0 and 365")
}
feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup Background / 背景 The ops cleanup task currently rejects retention days < 1 in both validate and normalize, so operators who want minimal-history setups (e.g. high churn deployments that prefer near-realtime cleanup) cannot express that intent through the UI. The only options are 1+ days, which keeps at least 24h of history regardless of cron frequency. ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数, 让希望尽量不留历史的运维场景(高吞吐部署 + 想用近实时清理)无法通过 UI 表达。最低只能配 1,等于不管 cron 多频繁,至少都会保留 24 小时的历史。 Purpose / 目的 Let admins set retention days to 0, meaning "every scheduled cleanup run wipes the corresponding table(s) entirely". Combined with a more frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup. 允许管理员把保留天数设为 0,语义为"每次定时清理时把对应表全部清空"。 搭配更频繁的 cron(比如每小时整点)即可获得近似滚动清理的效果。 Changes / 改动内容 Backend - service/ops_settings.go: validate accepts [0, 365]; normalize only refills default 30 when value is < 0 (negative is treated as legacy bad data, 0 is honoured) - service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)` returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and short-circuits to a new `truncateOpsTable` helper that uses `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps the existing batched DELETE path unchanged. Empty tables skip TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely - Extract `isMissingRelationError` helper to dedupe the "table not yet created" tolerance shared by both delete and truncate paths - Add unit tests for `opsCleanupPlan` (three branches) and `isMissingRelationError` 后端 - service/ops_settings.go: validate 接受 [0, 365];normalize 仅在 < 0 时回填默认 30(负数视为脏数据,0 被尊重) - service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回 `(cutoff, truncate, ok)`。days==0 → truncate=true,走新增 `truncateOpsTable`(TRUNCATE TABLE,O(1),无 WAL、无 VACUUM 压力); days>0 仍走原批量 DELETE 路径,行为完全不变。空表跳过 TRUNCATE, 避免无意义的 ACCESS EXCLUSIVE 锁 - 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的 "表不存在"宽容判断 - 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试 Frontend - OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0 - i18n (zh/en): hint mentions "0 = wipe all on every cleanup", validation message updated to 0-365 range 前端 - OpsSettingsDialog.vue: 校验放宽到 [0, 365],input min 改 0 - i18n(zh/en):hint 补"0 = 每次清理时清空所有",错误提示改 0-365 Trade-offs / 取舍 - TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only have the cleanup task as a writer, so the lock is invisible to other workloads - Empty-table guard avoids the lock when there is nothing to clean - Negative values are still treated as legacy bad data and replaced with default 30 to preserve compatibility
2026-04-29 15:01:02 +08:00
if cfg.DataRetention.HourlyMetricsRetentionDays < 0 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
return errors.New("hourly_metrics_retention_days must be between 0 and 365")
}
if cfg.AutoRefreshIntervalSec < 15 || cfg.AutoRefreshIntervalSec > 300 {
return errors.New("auto_refresh_interval_seconds must be between 15 and 300")
}
return nil
}
func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
defaultCfg := defaultOpsAdvancedSettings()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b))
}
return defaultCfg, nil
}
return nil, err
}
2026-03-13 17:18:04 +08:00
cfg := defaultOpsAdvancedSettings()
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
return defaultCfg, nil
}
normalizeOpsAdvancedSettings(cfg)
return cfg, nil
}
func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if cfg == nil {
return nil, errors.New("invalid config")
}
if err := validateOpsAdvancedSettings(cfg); err != nil {
return nil, err
}
normalizeOpsAdvancedSettings(cfg)
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil {
return nil, err
}
updated := &OpsAdvancedSettings{}
_ = json.Unmarshal(raw, updated)
return updated, nil
}
// =========================
// Metric thresholds
// =========================
const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
func defaultOpsMetricThresholds() *OpsMetricThresholds {
slaMin := 99.5
ttftMax := 500.0
reqErrMax := 5.0
upstreamErrMax := 5.0
return &OpsMetricThresholds{
SLAPercentMin: &slaMin,
TTFTp99MsMax: &ttftMax,
RequestErrorRatePercentMax: &reqErrMax,
UpstreamErrorRatePercentMax: &upstreamErrMax,
}
}
func (s *OpsService) GetMetricThresholds(ctx context.Context) (*OpsMetricThresholds, error) {
defaultCfg := defaultOpsMetricThresholds()
if s == nil || s.settingRepo == nil {
return defaultCfg, nil
}
if ctx == nil {
ctx = context.Background()
}
raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMetricThresholds)
if err != nil {
if errors.Is(err, ErrSettingNotFound) {
if b, mErr := json.Marshal(defaultCfg); mErr == nil {
_ = s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(b))
}
return defaultCfg, nil
}
return nil, err
}
cfg := &OpsMetricThresholds{}
if err := json.Unmarshal([]byte(raw), cfg); err != nil {
return defaultCfg, nil
}
return cfg, nil
}
func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricThresholds) (*OpsMetricThresholds, error) {
if s == nil || s.settingRepo == nil {
return nil, errors.New("setting repository not initialized")
}
if ctx == nil {
ctx = context.Background()
}
if cfg == nil {
return nil, errors.New("invalid config")
}
// Validate thresholds
if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
return nil, errors.New("sla_percent_min must be between 0 and 100")
}
if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
return nil, errors.New("ttft_p99_ms_max must be >= 0")
}
if cfg.RequestErrorRatePercentMax != nil && (*cfg.RequestErrorRatePercentMax < 0 || *cfg.RequestErrorRatePercentMax > 100) {
return nil, errors.New("request_error_rate_percent_max must be between 0 and 100")
}
if cfg.UpstreamErrorRatePercentMax != nil && (*cfg.UpstreamErrorRatePercentMax < 0 || *cfg.UpstreamErrorRatePercentMax > 100) {
return nil, errors.New("upstream_error_rate_percent_max must be between 0 and 100")
}
raw, err := json.Marshal(cfg)
if err != nil {
return nil, err
}
if err := s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(raw)); err != nil {
return nil, err
}
updated := &OpsMetricThresholds{}
_ = json.Unmarshal(raw, updated)
return updated, nil
}