package service

import (
	"context"
	"database/sql"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/Wei-Shaw/sub2api/internal/config"
	"github.com/Wei-Shaw/sub2api/internal/pkg/logger"

	"github.com/google/uuid"
	"github.com/redis/go-redis/v9"
	"github.com/robfig/cron/v3"
)

const (
	opsCleanupJobName = "ops_cleanup"

	opsCleanupLeaderLockKeyDefault = "ops:cleanup:leader"
	opsCleanupLeaderLockTTLDefault = 30 * time.Minute
)
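
// opsCleanupCronParser accepts the classic 5-field crontab layout, e.g. "0 2 * * *"
// (02:00 every day) or "0 * * * *" (top of every hour).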
var opsCleanupCronParser = cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
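
// opsCleanupReleaseScript implements the standard compare-and-delete unlock: the key
// is deleted only if it still holds this instance's ID, so a node whose lock has
// already expired (and been re-acquired elsewhere) cannot release another node's lock.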
var opsCleanupReleaseScript = redis.NewScript(`
if redis.call("GET", KEYS[1]) == ARGV[1] then
	return redis.call("DEL", KEYS[1])
end
return 0
`)

// OpsCleanupService periodically deletes old ops data to prevent unbounded DB growth.
//
// - Scheduling: 5-field cron spec (minute hour dom month dow).
// - Multi-instance: best-effort Redis leader lock so only one node runs cleanup.
// - Safety: deletes in batches to avoid long transactions.
//
// As a rider, runCleanupOnce calls ChannelMonitorService.RunDailyMaintenance at the
// end, sharing the same cron schedule + leader lock + heartbeat rather than
// introducing a second scheduler.
type OpsCleanupService struct {
	opsRepo           OpsRepository
	db                *sql.DB
	redisClient       *redis.Client
	cfg               *config.Config
	channelMonitorSvc *ChannelMonitorService

	instanceID string

	cron *cron.Cron

	startOnce sync.Once
	stopOnce  sync.Once

	warnNoRedisOnce sync.Once
}

func NewOpsCleanupService(
	opsRepo OpsRepository,
	db *sql.DB,
	redisClient *redis.Client,
	cfg *config.Config,
	channelMonitorSvc *ChannelMonitorService,
) *OpsCleanupService {
	return &OpsCleanupService{
		opsRepo:           opsRepo,
		db:                db,
		redisClient:       redisClient,
		cfg:               cfg,
		channelMonitorSvc: channelMonitorSvc,
		instanceID:        uuid.NewString(),
	}
}
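
// Typical wiring (a sketch only; the real call site lives in the app bootstrap and
// the variable names below are placeholders):
//
//	svc := NewOpsCleanupService(opsRepo, db, redisClient, cfg, channelMonitorSvc)
//	svc.Start()
//	defer svc.Stop()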

// Start registers the cleanup job on a 5-field cron schedule and begins execution.
// It is a no-op when ops or cleanup is disabled, or when required deps are missing.
func (s *OpsCleanupService) Start() {
	if s == nil {
		return
	}
	if s.cfg != nil && !s.cfg.Ops.Enabled {
		return
	}
	if s.cfg != nil && !s.cfg.Ops.Cleanup.Enabled {
		logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] not started (disabled)")
		return
	}
	if s.opsRepo == nil || s.db == nil {
		logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] not started (missing deps)")
		return
	}

	s.startOnce.Do(func() {
		schedule := "0 2 * * *" // default: daily at 02:00 in the configured timezone
		if s.cfg != nil && strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule) != "" {
			schedule = strings.TrimSpace(s.cfg.Ops.Cleanup.Schedule)
		}

		loc := time.Local
		if s.cfg != nil && strings.TrimSpace(s.cfg.Timezone) != "" {
			if parsed, err := time.LoadLocation(strings.TrimSpace(s.cfg.Timezone)); err == nil && parsed != nil {
				loc = parsed
			}
		}

		c := cron.New(cron.WithParser(opsCleanupCronParser), cron.WithLocation(loc))
		_, err := c.AddFunc(schedule, func() { s.runScheduled() })
		if err != nil {
			logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] not started (invalid schedule=%q): %v", schedule, err)
			return
		}
		s.cron = c
		s.cron.Start()
		logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] started (schedule=%q tz=%s)", schedule, loc.String())
	})
}

// Stop halts the cron scheduler and waits briefly for any in-flight job to finish.
func (s *OpsCleanupService) Stop() {
	if s == nil {
		return
	}
	s.stopOnce.Do(func() {
		if s.cron != nil {
			ctx := s.cron.Stop()
			select {
			case <-ctx.Done():
			case <-time.After(3 * time.Second):
				logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] cron stop timed out")
			}
		}
	})
}

// runScheduled is the cron entrypoint: acquire leadership, run one cleanup pass,
// and record a heartbeat with the outcome.
func (s *OpsCleanupService) runScheduled() {
	if s == nil || s.db == nil || s.opsRepo == nil {
		return
	}

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
	defer cancel()

	release, ok := s.tryAcquireLeaderLock(ctx)
	if !ok {
		return
	}
	if release != nil {
		defer release()
	}

	startedAt := time.Now().UTC()
	runAt := startedAt

	counts, err := s.runCleanupOnce(ctx)
	if err != nil {
		s.recordHeartbeatError(runAt, time.Since(startedAt), err)
		logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] cleanup failed: %v", err)
		return
	}
	s.recordHeartbeatSuccess(runAt, time.Since(startedAt), counts)
	logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] cleanup complete: %s", counts)
}

type opsCleanupDeletedCounts struct {
	errorLogs     int64
	retryAttempts int64
	alertEvents   int64
	systemLogs    int64
	logAudits     int64
	systemMetrics int64
	hourlyPreagg  int64
	dailyPreagg   int64
}

func (c opsCleanupDeletedCounts) String() string {
	return fmt.Sprintf(
		"error_logs=%d retry_attempts=%d alert_events=%d system_logs=%d log_audits=%d system_metrics=%d hourly_preagg=%d daily_preagg=%d",
		c.errorLogs,
		c.retryAttempts,
		c.alertEvents,
		c.systemLogs,
		c.logAudits,
		c.systemMetrics,
		c.hourlyPreagg,
		c.dailyPreagg,
	)
}

// opsCleanupPlan translates a "retention days" setting into a concrete cleanup action:
//   - days < 0  → skip this cleanup item (ok=false), tolerated for legacy bad data
//   - days == 0 → TRUNCATE TABLE (O(1) full wipe), truncate=true
//   - days > 0  → batched DELETE of rows older than now-N days, cutoff = now - N days
//
// days==0 takes the TRUNCATE path instead of a "now+24h cutoff + DELETE" because:
//   - the cost drops from O(N) to O(1); million-row tables finish in milliseconds
//   - there are no per-row WAL writes and no follow-up VACUUM pressure
//   - these ops tables are written only by the cleanup task itself, so the
//     ACCESS EXCLUSIVE lock taken by TRUNCATE has negligible impact
func opsCleanupPlan(now time.Time, days int) (cutoff time.Time, truncate, ok bool) {
	if days < 0 {
		return time.Time{}, false, false
	}
	if days == 0 {
		return time.Time{}, true, true
	}
	return now.AddDate(0, 0, -days), false, true
}
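
// For illustration only (hypothetical values, not taken from the unit tests):
//
//	now := time.Date(2026, 1, 10, 2, 0, 0, 0, time.UTC)
//	opsCleanupPlan(now, -1) // → ok=false: this cleanup item is skipped
//	opsCleanupPlan(now, 0)  // → truncate=true, ok=true: TRUNCATE path
//	opsCleanupPlan(now, 7)  // → cutoff=2026-01-03T02:00:00Z: batched DELETE path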

// runCleanupOnce executes a single cleanup pass across all ops tables and returns
// per-table deletion counts for the heartbeat.
func (s *OpsCleanupService) runCleanupOnce(ctx context.Context) (opsCleanupDeletedCounts, error) {
	out := opsCleanupDeletedCounts{}
	if s == nil || s.db == nil || s.cfg == nil {
		return out, nil
	}

	batchSize := 5000

	now := time.Now().UTC()

	// runOne folds "truncate? cutoff? batched delete?" into one place, so the three
	// cleanup groups below (error-log-like tables / minute metrics / hourly+daily
	// pre-aggregation) only need to supply a table name and time column.
	runOne := func(truncate bool, cutoff time.Time, table, timeCol string, castDate bool) (int64, error) {
		if truncate {
			return truncateOpsTable(ctx, s.db, table)
		}
		return deleteOldRowsByID(ctx, s.db, table, timeCol, cutoff, batchSize, castDate)
	}

	// Error-like tables: error logs / retry attempts / alert events / system logs / cleanup audits.
	if cutoff, truncate, ok := opsCleanupPlan(now, s.cfg.Ops.Cleanup.ErrorLogRetentionDays); ok {
		n, err := runOne(truncate, cutoff, "ops_error_logs", "created_at", false)
		if err != nil {
			return out, err
		}
		out.errorLogs = n

		n, err = runOne(truncate, cutoff, "ops_retry_attempts", "created_at", false)
		if err != nil {
			return out, err
		}
		out.retryAttempts = n

		n, err = runOne(truncate, cutoff, "ops_alert_events", "created_at", false)
		if err != nil {
			return out, err
		}
		out.alertEvents = n

		n, err = runOne(truncate, cutoff, "ops_system_logs", "created_at", false)
		if err != nil {
			return out, err
		}
		out.systemLogs = n

		n, err = runOne(truncate, cutoff, "ops_system_log_cleanup_audits", "created_at", false)
		if err != nil {
			return out, err
		}
		out.logAudits = n
	}

	// Minute-level metrics snapshots.
	if cutoff, truncate, ok := opsCleanupPlan(now, s.cfg.Ops.Cleanup.MinuteMetricsRetentionDays); ok {
		n, err := runOne(truncate, cutoff, "ops_system_metrics", "created_at", false)
		if err != nil {
			return out, err
		}
		out.systemMetrics = n
	}

	// Pre-aggregation tables (hourly/daily).
	if cutoff, truncate, ok := opsCleanupPlan(now, s.cfg.Ops.Cleanup.HourlyMetricsRetentionDays); ok {
		n, err := runOne(truncate, cutoff, "ops_metrics_hourly", "bucket_start", false)
		if err != nil {
			return out, err
		}
		out.hourlyPreagg = n

		n, err = runOne(truncate, cutoff, "ops_metrics_daily", "bucket_date", true)
		if err != nil {
			return out, err
		}
		out.dailyPreagg = n
	}

	// Channel monitor daily maintenance (aggregate yesterday's detail rows + soft-delete
	// expired detail/aggregate rows). Failures are only logged and do not affect the ops
	// cleanup success status (matching the style of the other ops steps); the maintenance
	// already logs each step's error to slog, so the heartbeat result does not itemize it.
	if s.channelMonitorSvc != nil {
		if err := s.channelMonitorSvc.RunDailyMaintenance(ctx); err != nil {
			logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] channel monitor maintenance failed: %v", err)
		}
	}

	return out, nil
}

// deleteOldRowsByID removes rows older than cutoff from table in id-ordered batches,
// returning the total number of rows deleted. Batching keeps each DELETE transaction
// short; castCutoffToDate compares the cutoff against a date column (used by the
// daily pre-aggregation table).
func deleteOldRowsByID(
	ctx context.Context,
	db *sql.DB,
	table string,
	timeColumn string,
	cutoff time.Time,
	batchSize int,
	castCutoffToDate bool,
) (int64, error) {
	if db == nil {
		return 0, nil
	}
	if batchSize <= 0 {
		batchSize = 5000
	}

	where := fmt.Sprintf("%s < $1", timeColumn)
	if castCutoffToDate {
		where = fmt.Sprintf("%s < $1::date", timeColumn)
	}

	q := fmt.Sprintf(`
		WITH batch AS (
			SELECT id FROM %s
			WHERE %s
			ORDER BY id
			LIMIT $2
		)
		DELETE FROM %s
		WHERE id IN (SELECT id FROM batch)
	`, table, where, table)

	var total int64
	for {
		res, err := db.ExecContext(ctx, q, cutoff, batchSize)
		if err != nil {
			// If ops tables aren't present yet (partial deployments), treat as no-op.
			if isMissingRelationError(err) {
				return total, nil
			}
			return total, err
		}
		affected, err := res.RowsAffected()
		if err != nil {
			return total, err
		}
		total += affected
		if affected == 0 {
			break
		}
	}
	return total, nil
}
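
// As an example, deleteOldRowsByID(ctx, db, "ops_error_logs", "created_at", cutoff,
// 5000, false) repeatedly executes (SQL shown for illustration):
//
//	WITH batch AS (
//	    SELECT id FROM ops_error_logs
//	    WHERE created_at < $1
//	    ORDER BY id
//	    LIMIT $2
//	)
//	DELETE FROM ops_error_logs
//	WHERE id IN (SELECT id FROM batch)
//
// until a batch affects zero rows, keeping every transaction bounded by batchSize.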

// truncateOpsTable wipes the given table with TRUNCATE TABLE, first running a
// SELECT COUNT(*) so the pre-wipe row count can be reported in the heartbeat.
//
// Differences from deleteOldRowsByID:
//   - no WHERE condition; only used for the days==0 "wipe everything" semantics
//   - O(1): releases the table's physical storage pages in milliseconds, with no
//     per-row WAL writes and no follow-up VACUUM pressure
//   - requires an ACCESS EXCLUSIVE lock, but the ops tables are written only by
//     the cleanup task itself, so the momentary lock is negligible
//
// A missing table (partial deployments) silently returns 0, consistent with
// deleteOldRowsByID.
func truncateOpsTable(ctx context.Context, db *sql.DB, table string) (int64, error) {
	if db == nil {
		return 0, nil
	}
	var count int64
	if err := db.QueryRowContext(ctx, fmt.Sprintf("SELECT COUNT(*) FROM %s", table)).Scan(&count); err != nil {
		if isMissingRelationError(err) {
			return 0, nil
		}
		return 0, fmt.Errorf("count %s: %w", table, err)
	}
	if count == 0 {
		// Empty-table guard: skip TRUNCATE (and its ACCESS EXCLUSIVE lock) when
		// there is nothing to clean.
		return 0, nil
	}
	if _, err := db.ExecContext(ctx, fmt.Sprintf("TRUNCATE TABLE %s", table)); err != nil {
		if isMissingRelationError(err) {
			return 0, nil
		}
		return 0, fmt.Errorf("truncate %s: %w", table, err)
	}
	return count, nil
}

// isMissingRelationError reports whether a Postgres error means the relation does
// not exist, letting the cleanup task silently skip tables in partial deployments.
func isMissingRelationError(err error) bool {
	if err == nil {
		return false
	}
	s := strings.ToLower(err.Error())
	return strings.Contains(s, "does not exist") && strings.Contains(s, "relation")
}
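
// Matches, for example, the stock Postgres message (shown for illustration):
//
//	ERROR: relation "ops_error_logs" does not exist (SQLSTATE 42P01)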

// tryAcquireLeaderLock elects a single cleanup runner. It returns a release func
// (which may be nil) and whether this instance may proceed.
func (s *OpsCleanupService) tryAcquireLeaderLock(ctx context.Context) (func(), bool) {
	if s == nil {
		return nil, false
	}
	// In simple run mode, assume single instance.
	if s.cfg != nil && s.cfg.RunMode == config.RunModeSimple {
		return nil, true
	}

	key := opsCleanupLeaderLockKeyDefault
	ttl := opsCleanupLeaderLockTTLDefault

	// Prefer Redis leader lock when available, but avoid stampeding the DB when Redis is flaky by
	// falling back to a DB advisory lock.
	if s.redisClient != nil {
		ok, err := s.redisClient.SetNX(ctx, key, s.instanceID, ttl).Result()
		if err == nil {
			if !ok {
				return nil, false
			}
			return func() {
				_, _ = opsCleanupReleaseScript.Run(ctx, s.redisClient, []string{key}, s.instanceID).Result()
			}, true
		}
		// Redis error: fall back to DB advisory lock.
		s.warnNoRedisOnce.Do(func() {
			logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] leader lock SetNX failed; falling back to DB advisory lock: %v", err)
		})
	} else {
		s.warnNoRedisOnce.Do(func() {
			logger.LegacyPrintf("service.ops_cleanup", "[OpsCleanup] redis not configured; using DB advisory lock")
		})
	}

	release, ok := tryAcquireDBAdvisoryLock(ctx, s.db, hashAdvisoryLockID(key))
	if !ok {
		return nil, false
	}
	return release, true
}
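
// The Redis branch above is the usual SET..NX-with-TTL election: exactly one
// instance's SetNX succeeds per TTL window, every other node sees ok=false and skips
// the run, and the Lua release script ensures only the current owner deletes the key.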

func (s *OpsCleanupService) recordHeartbeatSuccess(runAt time.Time, duration time.Duration, counts opsCleanupDeletedCounts) {
	if s == nil || s.opsRepo == nil {
		return
	}
	now := time.Now().UTC()
	durMs := duration.Milliseconds()
	result := truncateString(counts.String(), 2048)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsCleanupJobName,
		LastRunAt:      &runAt,
		LastSuccessAt:  &now,
		LastDurationMs: &durMs,
		LastResult:     &result,
	})
}

func (s *OpsCleanupService) recordHeartbeatError(runAt time.Time, duration time.Duration, err error) {
	if s == nil || s.opsRepo == nil || err == nil {
		return
	}
	now := time.Now().UTC()
	durMs := duration.Milliseconds()
	msg := truncateString(err.Error(), 2048)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = s.opsRepo.UpsertJobHeartbeat(ctx, &OpsUpsertJobHeartbeatInput{
		JobName:        opsCleanupJobName,
		LastRunAt:      &runAt,
		LastErrorAt:    &now,
		LastError:      &msg,
		LastDurationMs: &durMs,
	})
}