diff --git a/backend/internal/service/ops_cleanup_service.go b/backend/internal/service/ops_cleanup_service.go
index 08a10a02..44ec1ad1 100644
--- a/backend/internal/service/ops_cleanup_service.go
+++ b/backend/internal/service/ops_cleanup_service.go
@@ -184,6 +184,25 @@ func (c opsCleanupDeletedCounts) String() string {
     )
 }
 
+// opsCleanupPlan translates a retention-days setting into a concrete cleanup action.
+//   - days < 0  → skip this cleanup target (ok=false), leaving legacy data untouched
+//   - days == 0 → TRUNCATE TABLE (O(1) full wipe), truncate=true
+//   - days > 0  → batched DELETE of rows older than now-N days, cutoff = now - N days
+//
+// Why days==0 uses TRUNCATE rather than a "cutoff = now+24h + DELETE":
+//   - the cost drops from O(N) to O(1); tables with millions of rows clear in milliseconds
+//   - no per-row WAL writes and no follow-up VACUUM pressure
+//   - only the cleanup job itself writes to these ops tables, so TRUNCATE's ACCESS EXCLUSIVE lock is negligible
+func opsCleanupPlan(now time.Time, days int) (cutoff time.Time, truncate, ok bool) {
+    if days < 0 {
+        return time.Time{}, false, false
+    }
+    if days == 0 {
+        return time.Time{}, true, true
+    }
+    return now.AddDate(0, 0, -days), false, true
+}
+
 func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) {
     out := opsCleanupDeletedCounts{}
     if s == nil || s.db == nil || s.cfg == nil {
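Note (illustrative, not part of the patch): a minimal worked example of the three retention regimes, using only the opsCleanupPlan signature introduced above. Assumes it sits inside a function with "time" imported.

    now := time.Date(2026, 4, 29, 12, 0, 0, 0, time.UTC)

    _, _, ok := opsCleanupPlan(now, -1)      // ok == false: the caller skips this table entirely
    _, truncate, _ := opsCleanupPlan(now, 0) // truncate == true: the caller issues TRUNCATE TABLE
    cutoff, _, _ := opsCleanupPlan(now, 7)   // cutoff == 2026-04-22T12:00:00Z: batched DELETE below it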
@@ -194,34 +213,42 @@ func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDelet
 
     now := time.Now().UTC()
 
-    // Error-like tables: error logs / retry attempts / alert events.
-    if days := s.cfg.Ops.Cleanup.ErrorLogRetentionDays; days > 0 {
-        cutoff := now.AddDate(0, 0, -days)
-        n, err := deleteOldRowsByID(ctx, s.db, "ops_error_logs", "created_at", cutoff, batchSize, false)
+    // runOne folds the "truncate? cutoff? batched delete?" decision into one place, so the three
+    // cleanup groups (error-log-like / minute metrics / hourly+daily pre-aggregates) only supply table and column names.
+    runOne := func(truncate bool, cutoff time.Time, table, timeCol string, castDate bool) (int64, error) {
+        if truncate {
+            return truncateOpsTable(ctx, s.db, table)
+        }
+        return deleteOldRowsByID(ctx, s.db, table, timeCol, cutoff, batchSize, castDate)
+    }
+
+    // Error-like tables: error logs / retry attempts / alert events / system logs / cleanup audits.
+    if cutoff, truncate, ok := opsCleanupPlan(now, s.cfg.Ops.Cleanup.ErrorLogRetentionDays); ok {
+        n, err := runOne(truncate, cutoff, "ops_error_logs", "created_at", false)
         if err != nil {
             return out, err
         }
         out.errorLogs = n
 
-        n, err = deleteOldRowsByID(ctx, s.db, "ops_retry_attempts", "created_at", cutoff, batchSize, false)
+        n, err = runOne(truncate, cutoff, "ops_retry_attempts", "created_at", false)
         if err != nil {
             return out, err
         }
         out.retryAttempts = n
 
-        n, err = deleteOldRowsByID(ctx, s.db, "ops_alert_events", "created_at", cutoff, batchSize, false)
+        n, err = runOne(truncate, cutoff, "ops_alert_events", "created_at", false)
         if err != nil {
             return out, err
         }
         out.alertEvents = n
 
-        n, err = deleteOldRowsByID(ctx, s.db, "ops_system_logs", "created_at", cutoff, batchSize, false)
+        n, err = runOne(truncate, cutoff, "ops_system_logs", "created_at", false)
        if err != nil {
             return out, err
         }
         out.systemLogs = n
 
-        n, err = deleteOldRowsByID(ctx, s.db, "ops_system_log_cleanup_audits", "created_at", cutoff, batchSize, false)
+        n, err = runOne(truncate, cutoff, "ops_system_log_cleanup_audits", "created_at", false)
         if err != nil {
             return out, err
         }
@@ -229,9 +256,8 @@ func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDelet
     }
 
     // Minute-level metrics snapshots.
-    if days := s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays; days > 0 {
-        cutoff := now.AddDate(0, 0, -days)
-        n, err := deleteOldRowsByID(ctx, s.db, "ops_system_metrics", "created_at", cutoff, batchSize, false)
+    if cutoff, truncate, ok := opsCleanupPlan(now, s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays); ok {
+        n, err := runOne(truncate, cutoff, "ops_system_metrics", "created_at", false)
         if err != nil {
             return out, err
         }
@@ -239,15 +265,14 @@ func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDelet
     }
 
     // Pre-aggregation tables (hourly/daily).
-    if days := s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays; days > 0 {
-        cutoff := now.AddDate(0, 0, -days)
-        n, err := deleteOldRowsByID(ctx, s.db, "ops_metrics_hourly", "bucket_start", cutoff, batchSize, false)
+    if cutoff, truncate, ok := opsCleanupPlan(now, s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays); ok {
+        n, err := runOne(truncate, cutoff, "ops_metrics_hourly", "bucket_start", false)
         if err != nil {
             return out, err
         }
         out.hourlyPreagg = n
 
-        n, err = deleteOldRowsByID(ctx, s.db, "ops_metrics_daily", "bucket_date", cutoff, batchSize, true)
+        n, err = runOne(truncate, cutoff, "ops_metrics_daily", "bucket_date", true)
         if err != nil {
             return out, err
         }
@@ -303,7 +328,7 @@ WHERE id IN (SELECT id FROM batch)
     res, err := db.ExecContext(ctx, q, cutoff, batchSize)
     if err != nil {
         // If ops tables aren't present yet (partial deployments), treat as no-op.
-        if strings.Contains(strings.ToLower(err.Error()), "does not exist") && strings.Contains(strings.ToLower(err.Error()), "relation") {
+        if isMissingRelationError(err) {
             return total, nil
         }
         return total, err
@@ -320,6 +345,46 @@ WHERE id IN (SELECT id FROM batch)
     return total, nil
 }
+// truncateOpsTable empties the given table via TRUNCATE TABLE, first running SELECT COUNT(*) so the pre-wipe row count can be reported in the heartbeat.
+//
+// Differences from deleteOldRowsByID:
+//   - no WHERE clause is possible; this is only for the days==0 "wipe everything" semantics
+//   - O(1): releases the table's physical storage pages in milliseconds, with no per-row WAL writes and no VACUUM pressure
+//   - takes an ACCESS EXCLUSIVE lock, but only the cleanup job writes to these ops tables, so the momentary lock is negligible
+//
+// A missing table (partial deployments) silently returns 0, consistent with deleteOldRowsByID.
+func truncateOpsTable(ctx context.Context, db *sql.DB, table string) (int64, error) {
+    if db == nil {
+        return 0, nil
+    }
+    var count int64
+    if err := db.QueryRowContext(ctx, fmt.Sprintf("SELECT COUNT(*) FROM %s", table)).Scan(&count); err != nil {
+        if isMissingRelationError(err) {
+            return 0, nil
+        }
+        return 0, fmt.Errorf("count %s: %w", table, err)
+    }
+    if count == 0 {
+        return 0, nil
+    }
+    if _, err := db.ExecContext(ctx, fmt.Sprintf("TRUNCATE TABLE %s", table)); err != nil {
+        if isMissingRelationError(err) {
+            return 0, nil
+        }
+        return 0, fmt.Errorf("truncate %s: %w", table, err)
+    }
+    return count, nil
+}
+
+// isMissingRelationError reports whether a Postgres error means "relation does not exist", letting the cleanup job silently skip tables in partial deployments.
+func isMissingRelationError(err error) bool {
+    if err == nil {
+        return false
+    }
+    s := strings.ToLower(err.Error())
+    return strings.Contains(s, "does not exist") && strings.Contains(s, "relation")
+}
+
 
 func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
     if s == nil {
         return nil, false
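Note (illustrative, not part of the patch): truncateOpsTable counts and truncates in two separate statements, so rows written between the COUNT and the TRUNCATE would be wiped without being reported. That is acceptable here because the cleanup job is the only writer. If an exact count ever mattered, a hypothetical variant like the one below (same imports, Postgres semantics) would close the gap by taking the lock before counting:

    func truncateOpsTableExact(ctx context.Context, db *sql.DB, table string) (int64, error) {
        tx, err := db.BeginTx(ctx, nil)
        if err != nil {
            return 0, err
        }
        defer tx.Rollback() // no-op once Commit succeeds

        // Take the same lock TRUNCATE needs *before* counting, so no concurrent
        // writer can insert rows between the two statements.
        if _, err := tx.ExecContext(ctx, fmt.Sprintf("LOCK TABLE %s IN ACCESS EXCLUSIVE MODE", table)); err != nil {
            return 0, err
        }
        var n int64
        if err := tx.QueryRowContext(ctx, fmt.Sprintf("SELECT COUNT(*) FROM %s", table)).Scan(&n); err != nil {
            return 0, err
        }
        if _, err := tx.ExecContext(ctx, fmt.Sprintf("TRUNCATE TABLE %s", table)); err != nil {
            return 0, err
        }
        return n, tx.Commit()
    }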
diff --git a/backend/internal/service/ops_cleanup_service_test.go b/backend/internal/service/ops_cleanup_service_test.go
new file mode 100644
index 00000000..86657d27
--- /dev/null
+++ b/backend/internal/service/ops_cleanup_service_test.go
@@ -0,0 +1,64 @@
+package service
+
+import (
+    "testing"
+    "time"
+)
+
+func TestOpsCleanupPlan(t *testing.T) {
+    now := time.Date(2026, 4, 29, 12, 0, 0, 0, time.UTC)
+
+    cases := []struct {
+        name         string
+        days         int
+        wantOK       bool
+        wantTruncate bool
+        wantCutoff   time.Time
+    }{
+        {name: "negative skips", days: -1, wantOK: false},
+        {name: "zero truncates", days: 0, wantOK: true, wantTruncate: true},
+        {name: "positive yields past cutoff", days: 7, wantOK: true, wantCutoff: now.AddDate(0, 0, -7)},
+    }
+
+    for _, tc := range cases {
+        t.Run(tc.name, func(t *testing.T) {
+            cutoff, truncate, ok := opsCleanupPlan(now, tc.days)
+            if ok != tc.wantOK {
+                t.Fatalf("ok = %v, want %v", ok, tc.wantOK)
+            }
+            if !ok {
+                return
+            }
+            if truncate != tc.wantTruncate {
+                t.Fatalf("truncate = %v, want %v", truncate, tc.wantTruncate)
+            }
+            if !tc.wantTruncate && !cutoff.Equal(tc.wantCutoff) {
+                t.Fatalf("cutoff = %v, want %v", cutoff, tc.wantCutoff)
+            }
+        })
+    }
+}
+
+func TestIsMissingRelationError(t *testing.T) {
+    cases := []struct {
+        name string
+        err  error
+        want bool
+    }{
+        {name: "nil is not missing", err: nil, want: false},
+        {name: "match relation does not exist", err: fakeErr(`pq: relation "ops_error_logs" does not exist`), want: true},
+        {name: "match case-insensitive", err: fakeErr(`ERROR: Relation "x" Does Not Exist`), want: true},
+        {name: "non-matching error", err: fakeErr("connection refused"), want: false},
+    }
+    for _, tc := range cases {
+        t.Run(tc.name, func(t *testing.T) {
+            if got := isMissingRelationError(tc.err); got != tc.want {
+                t.Fatalf("got %v, want %v", got, tc.want)
+            }
+        })
+    }
+}
+
+type fakeErr string
+
+func (e fakeErr) Error() string { return string(e) }
diff --git a/backend/internal/service/ops_settings.go b/backend/internal/service/ops_settings.go
index 5871166c..ecc3a94b 100644
--- a/backend/internal/service/ops_settings.go
+++ b/backend/internal/service/ops_settings.go
@@ -387,13 +387,15 @@ func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
     if cfg.DataRetention.CleanupSchedule == "" {
         cfg.DataRetention.CleanupSchedule = "0 2 * * *"
     }
-    if cfg.DataRetention.ErrorLogRetentionDays <= 0 {
+    // Retention days: 0 means every scheduled run wipes everything; > 0 keeps that many days.
+    // Only backfill the default for invalid negatives, so a deliberately chosen 0 is not overwritten.
+    if cfg.DataRetention.ErrorLogRetentionDays < 0 {
         cfg.DataRetention.ErrorLogRetentionDays = 30
     }
-    if cfg.DataRetention.MinuteMetricsRetentionDays <= 0 {
+    if cfg.DataRetention.MinuteMetricsRetentionDays < 0 {
         cfg.DataRetention.MinuteMetricsRetentionDays = 30
     }
-    if cfg.DataRetention.HourlyMetricsRetentionDays <= 0 {
+    if cfg.DataRetention.HourlyMetricsRetentionDays < 0 {
         cfg.DataRetention.HourlyMetricsRetentionDays = 30
     }
     // Normalize auto refresh interval (default 30 seconds)
@@ -406,14 +408,15 @@ func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
     if cfg == nil {
         return errors.New("invalid config")
     }
-    if cfg.DataRetention.ErrorLogRetentionDays < 1 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
-        return errors.New("error_log_retention_days must be between 1 and 365")
+    // Retention days: 0 wipes everything on each run, 1-365 keeps that many days.
+    if cfg.DataRetention.ErrorLogRetentionDays < 0 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
+        return errors.New("error_log_retention_days must be between 0 and 365")
     }
-    if cfg.DataRetention.MinuteMetricsRetentionDays < 1 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
-        return errors.New("minute_metrics_retention_days must be between 1 and 365")
+    if cfg.DataRetention.MinuteMetricsRetentionDays < 0 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
+        return errors.New("minute_metrics_retention_days must be between 0 and 365")
     }
-    if cfg.DataRetention.HourlyMetricsRetentionDays < 1 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
-        return errors.New("hourly_metrics_retention_days must be between 1 and 365")
+    if cfg.DataRetention.HourlyMetricsRetentionDays < 0 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
+        return errors.New("hourly_metrics_retention_days must be between 0 and 365")
     }
     if cfg.AutoRefreshIntervalSec < 15 || cfg.AutoRefreshIntervalSec > 300 {
         return errors.New("auto_refresh_interval_seconds must be between 15 and 300")
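Note (illustrative, not part of the patch): the normalize/validate split above means an explicit 0 survives normalization while negatives are treated as "unset". A quick sketch using the names from this patch:

    cfg := &OpsAdvancedSettings{}
    cfg.DataRetention.ErrorLogRetentionDays = 0       // deliberate "wipe everything each run"
    cfg.DataRetention.MinuteMetricsRetentionDays = -5 // invalid, treated as unset
    normalizeOpsAdvancedSettings(cfg)
    // ErrorLogRetentionDays is still 0 (preserved),
    // MinuteMetricsRetentionDays is backfilled to the default 30,
    // and validateOpsAdvancedSettings now accepts 0..365 for all three fields.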
diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts
index c66ca55b..270cd660 100644
--- a/frontend/src/i18n/locales/en.ts
+++ b/frontend/src/i18n/locales/en.ts
@@ -4648,7 +4648,7 @@ export default {
       errorLogRetentionDays: 'Error Log Retention Days',
       minuteMetricsRetentionDays: 'Minute Metrics Retention Days',
       hourlyMetricsRetentionDays: 'Hourly Metrics Retention Days',
-      retentionDaysHint: 'Recommended 7-90 days, longer periods will consume more storage',
+      retentionDaysHint: 'Recommended 7-90 days; longer periods consume more storage. Set to 0 to wipe all history on every scheduled cleanup',
       aggregation: 'Pre-aggregation Tasks',
       enableAggregation: 'Enable Pre-aggregation',
       aggregationHint: 'Pre-aggregation improves query performance for long time windows',
@@ -4678,7 +4678,7 @@ export default {
       autoRefreshCountdown: 'Auto refresh: {seconds}s',
       validation: {
         title: 'Please fix the following issues',
-        retentionDaysRange: 'Retention days must be between 1-365 days',
+        retentionDaysRange: 'Retention days must be between 0 and 365 (0 = wipe all on every cleanup)',
         slaMinPercentRange: 'SLA minimum percentage must be between 0 and 100',
         ttftP99MaxRange: 'TTFT P99 maximum must be a number ≥ 0',
         requestErrorRateMaxRange: 'Request error rate maximum must be between 0 and 100',
diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts
index 77d1c93c..fdfc9e41 100644
--- a/frontend/src/i18n/locales/zh.ts
+++ b/frontend/src/i18n/locales/zh.ts
@@ -4810,7 +4810,7 @@ export default {
      errorLogRetentionDays: '错误日志保留天数',
       minuteMetricsRetentionDays: '分钟指标保留天数',
       hourlyMetricsRetentionDays: '小时指标保留天数',
-      retentionDaysHint: '建议保留7-90天,过长会占用存储空间',
+      retentionDaysHint: '建议保留 7-90 天,过长会占用存储空间;填 0 表示每次定时清理时清空所有历史',
       aggregation: '预聚合任务',
       enableAggregation: '启用预聚合任务',
       aggregationHint: '预聚合可提升长时间窗口查询性能',
@@ -4841,7 +4841,7 @@ export default {
       autoRefreshCountdown: '自动刷新:{seconds}s',
       validation: {
         title: '请先修正以下问题',
-        retentionDaysRange: '保留天数必须在1-365天之间',
+        retentionDaysRange: '保留天数必须在 0-365 天之间(0 = 每次清理时清空所有)',
         slaMinPercentRange: 'SLA最低百分比必须在0-100之间',
         ttftP99MaxRange: 'TTFT P99最大值必须大于等于0',
         requestErrorRateMaxRange: '请求错误率最大值必须在0-100之间',
diff --git a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue
index 542f111d..5dba5b1d 100644
--- a/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue
+++ b/frontend/src/views/admin/ops/components/OpsSettingsDialog.vue
@@ -136,13 +136,13 @@ const validation = computed(() => {
   // Validate advanced settings
   if (advancedSettings.value) {
     const { error_log_retention_days, minute_metrics_retention_days, hourly_metrics_retention_days } = advancedSettings.value.data_retention
-    if (error_log_retention_days < 1 || error_log_retention_days > 365) {
+    if (error_log_retention_days < 0 || error_log_retention_days > 365) {
       errors.push(t('admin.ops.settings.validation.retentionDaysRange'))
     }
-    if (minute_metrics_retention_days < 1 || minute_metrics_retention_days > 365) {
+    if (minute_metrics_retention_days < 0 || minute_metrics_retention_days > 365) {
       errors.push(t('admin.ops.settings.validation.retentionDaysRange'))
     }
-    if (hourly_metrics_retention_days < 1 || hourly_metrics_retention_days > 365) {
+    if (hourly_metrics_retention_days < 0 || hourly_metrics_retention_days > 365) {
       errors.push(t('admin.ops.settings.validation.retentionDaysRange'))
     }
   }
@@ -431,7 +431,7 @@ async function saveAllSettings() {
@@ -441,7 +441,7 @@ async function saveAllSettings() {
@@ -451,7 +451,7 @@ async function saveAllSettings() {
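Note (illustrative, not part of the patch): the 0-365 range is now enforced in three places — backend validation, the Vue dialog, and the i18n copy. A hypothetical guard test along these lines would catch the ranges drifting apart again; field names are taken from this patch, and it assumes any other validated sections pass with the defaults normalizeOpsAdvancedSettings fills in:

    func TestRetentionBoundsMatchFrontend(t *testing.T) {
        cfg := &OpsAdvancedSettings{}
        normalizeOpsAdvancedSettings(cfg) // fill schedule / refresh-interval defaults
        for _, days := range []int{0, 365} {
            cfg.DataRetention.ErrorLogRetentionDays = days
            cfg.DataRetention.MinuteMetricsRetentionDays = days
            cfg.DataRetention.HourlyMetricsRetentionDays = days
            if err := validateOpsAdvancedSettings(cfg); err != nil {
                t.Fatalf("days=%d rejected by backend but allowed in the dialog: %v", days, err)
            }
        }
        cfg.DataRetention.ErrorLogRetentionDays = 366
        if err := validateOpsAdvancedSettings(cfg); err == nil {
            t.Fatal("days=366 should be rejected, matching the frontend check")
        }
    }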