backend/internal/service/ops_settings.go

package service

import (
	"context"
	"encoding/json"
	"errors"
	"strings"
	"time"
)

const (
	// opsAlertEvaluatorLeaderLockKeyDefault is the distributed-lock key used by
	// the alert runtime settings when no key is configured.
	opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
	// opsAlertEvaluatorLeaderLockTTLDefault is the default lock TTL; the default
	// alert runtime settings store it as a whole number of seconds.
	opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)

// =========================
// Email notification config
// =========================

// GetEmailNotificationConfig loads the ops email notification configuration
// from the setting repository.
//
// The built-in defaults are returned with a nil error when the service or its
// repository is nil, when no value has been stored yet, or when the stored
// value cannot be decoded as JSON. When the setting is missing, the defaults
// are additionally written back on a best-effort basis (a failed write is
// ignored). Any other repository error is returned unchanged.
func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
	fallback := defaultOpsEmailNotificationConfig()
	if s == nil || s.settingRepo == nil {
		return fallback, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	stored, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
	switch {
	case err == nil:
		// A value exists; decode it below.
	case errors.Is(err, ErrSettingNotFound):
		// First read: seed the repository with the defaults. Persisting is
		// best-effort, so both marshal and write failures are tolerated.
		encoded, marshalErr := json.Marshal(fallback)
		if marshalErr == nil {
			_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(encoded))
		}
		return fallback, nil
	default:
		return nil, err
	}

	var decoded OpsEmailNotificationConfig
	if json.Unmarshal([]byte(stored), &decoded) != nil {
		// Corrupted JSON must not break the ops UI; serve the defaults instead.
		return fallback, nil
	}
	normalizeOpsEmailNotificationConfig(&decoded)
	return &decoded, nil
}

// UpdateEmailNotificationConfig merges req into the currently stored email
// notification configuration, validates and normalizes the result, persists it
// as JSON and returns it.
//
// Only the sections present in req (Alert and/or Report) are applied. Within a
// section every field is overwritten, except Recipients, which is kept when the
// request carries a nil slice. An error is returned when the service or its
// repository is nil, when req is nil, when the current configuration cannot be
// loaded, when validation fails, or when encoding/persisting fails.
func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
	switch {
	case s == nil || s.settingRepo == nil:
		return nil, errors.New("setting repository not initialized")
	case req == nil:
		return nil, errors.New("invalid request")
	}
	if ctx == nil {
		ctx = context.Background()
	}

	current, err := s.GetEmailNotificationConfig(ctx)
	if err != nil {
		return nil, err
	}

	if in := req.Alert; in != nil {
		out := &current.Alert
		out.Enabled = in.Enabled
		out.IncludeResolvedAlerts = in.IncludeResolvedAlerts
		out.RateLimitPerHour = in.RateLimitPerHour
		out.BatchingWindowSeconds = in.BatchingWindowSeconds
		out.MinSeverity = strings.TrimSpace(in.MinSeverity)
		// A nil recipient list means "leave the stored recipients alone".
		if in.Recipients != nil {
			out.Recipients = in.Recipients
		}
	}

	if in := req.Report; in != nil {
		out := &current.Report
		out.Enabled = in.Enabled
		// A nil recipient list means "leave the stored recipients alone".
		if in.Recipients != nil {
			out.Recipients = in.Recipients
		}

		out.DailySummaryEnabled = in.DailySummaryEnabled
		out.WeeklySummaryEnabled = in.WeeklySummaryEnabled
		out.ErrorDigestEnabled = in.ErrorDigestEnabled
		out.AccountHealthEnabled = in.AccountHealthEnabled

		out.DailySummarySchedule = strings.TrimSpace(in.DailySummarySchedule)
		out.WeeklySummarySchedule = strings.TrimSpace(in.WeeklySummarySchedule)
		out.ErrorDigestSchedule = strings.TrimSpace(in.ErrorDigestSchedule)
		out.AccountHealthSchedule = strings.TrimSpace(in.AccountHealthSchedule)

		out.ErrorDigestMinCount = in.ErrorDigestMinCount
		out.AccountHealthErrorRateThreshold = in.AccountHealthErrorRateThreshold
	}

	// Validate the merged result first, then normalize (e.g. refill empty
	// schedules) before it is written.
	if err := validateOpsEmailNotificationConfig(current); err != nil {
		return nil, err
	}
	normalizeOpsEmailNotificationConfig(current)

	encoded, err := json.Marshal(current)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(encoded)); err != nil {
		return nil, err
	}
	return current, nil
}

// defaultOpsEmailNotificationConfig returns a freshly allocated configuration
// holding the built-in email notification defaults.
//
// Alert emails are enabled with an empty (non-nil) recipient list, an empty
// minimum severity, and zero rate-limit / batching values. Reports are
// disabled; every report type is individually disabled too but already carries
// its default schedule and thresholds.
func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
	cfg := new(OpsEmailNotificationConfig)

	// Fields not listed here keep their zero value ("" / 0 / false).
	cfg.Alert = OpsEmailAlertConfig{
		Enabled:    true,
		Recipients: []string{},
	}

	cfg.Report = OpsEmailReportConfig{
		Recipients:                      []string{},
		DailySummarySchedule:            "0 9 * * *",
		WeeklySummarySchedule:           "0 9 * * 1",
		ErrorDigestSchedule:             "0 9 * * *",
		ErrorDigestMinCount:             10,
		AccountHealthSchedule:           "0 9 * * *",
		AccountHealthErrorRateThreshold: 10.0,
	}

	return cfg
}

// normalizeOpsEmailNotificationConfig canonicalizes cfg in place. A nil cfg is
// a no-op.
//
// It replaces nil recipient slices with empty ones, trims surrounding
// whitespace from the minimum severity and from every report schedule, and
// substitutes the default cron expression for any schedule that is empty after
// trimming.
func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
	if cfg == nil {
		return
	}

	alert, report := &cfg.Alert, &cfg.Report

	if alert.Recipients == nil {
		alert.Recipients = []string{}
	}
	if report.Recipients == nil {
		report.Recipients = []string{}
	}
	alert.MinSeverity = strings.TrimSpace(alert.MinSeverity)

	// Each schedule is trimmed; an empty one falls back to its default so cron
	// logic is not broken when clients send empty strings.
	schedules := []struct {
		value    *string
		fallback string
	}{
		{&report.DailySummarySchedule, "0 9 * * *"},
		{&report.WeeklySummarySchedule, "0 9 * * 1"},
		{&report.ErrorDigestSchedule, "0 9 * * *"},
		{&report.AccountHealthSchedule, "0 9 * * *"},
	}
	for _, sched := range schedules {
		trimmed := strings.TrimSpace(*sched.value)
		if trimmed == "" {
			trimmed = sched.fallback
		}
		*sched.value = trimmed
	}
}

// validateOpsEmailNotificationConfig reports the first rule that cfg violates,
// or nil when cfg is acceptable.
//
// Rules, checked in this order: cfg must be non-nil; the alert rate limit and
// batching window must be >= 0; the trimmed minimum severity must be empty or
// one of "critical", "warning", "info"; the error digest minimum count must be
// >= 0; the account health error-rate threshold must lie within [0, 100].
func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
	if cfg == nil {
		return errors.New("invalid config")
	}

	severity := strings.TrimSpace(cfg.Alert.MinSeverity)
	severityAllowed := severity == "" || severity == "critical" || severity == "warning" || severity == "info"
	threshold := cfg.Report.AccountHealthErrorRateThreshold

	// Cases are evaluated top to bottom, so the first violated rule wins.
	switch {
	case cfg.Alert.RateLimitPerHour < 0:
		return errors.New("alert.rate_limit_per_hour must be >= 0")
	case cfg.Alert.BatchingWindowSeconds < 0:
		return errors.New("alert.batching_window_seconds must be >= 0")
	case !severityAllowed:
		return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
	case cfg.Report.ErrorDigestMinCount < 0:
		return errors.New("report.error_digest_min_count must be >= 0")
	case threshold < 0 || threshold > 100:
		return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
	}
	return nil
}

// =========================
// Alert runtime settings
// =========================

// defaultOpsAlertRuntimeSettings returns a freshly allocated settings value
// holding the built-in alert runtime defaults: a 60-second evaluation interval,
// an enabled distributed lock using the default leader key and TTL, and
// silencing disabled with no global window and an empty (non-nil) entry list.
func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
	settings := &OpsAlertRuntimeSettings{EvaluationIntervalSeconds: 60}

	settings.DistributedLock = OpsDistributedLockSettings{
		Enabled:    true,
		Key:        opsAlertEvaluatorLeaderLockKeyDefault,
		TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
	}

	// Enabled, GlobalUntilRFC3339 and GlobalReason keep their zero values.
	settings.Silencing = OpsAlertSilencingSettings{
		Entries: []OpsAlertSilenceEntry{},
	}

	return settings
}

// normalizeOpsDistributedLockSettings canonicalizes s in place; a nil s is a
// no-op. The key is trimmed and replaced by defaultKey when it ends up empty,
// and a non-positive TTL is replaced by defaultTTLSeconds.
func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
	if s == nil {
		return
	}

	if key := strings.TrimSpace(s.Key); key != "" {
		s.Key = key
	} else {
		s.Key = defaultKey
	}

	if s.TTLSeconds < 1 {
		s.TTLSeconds = defaultTTLSeconds
	}
}

// normalizeOpsAlertSilencingSettings canonicalizes s in place; a nil s is a
// no-op. Surrounding whitespace is stripped from the global "until" timestamp
// and reason and from each entry's "until" timestamp and reason, and a nil
// entry slice is replaced with an empty one.
func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
	if s == nil {
		return
	}

	s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
	s.GlobalReason = strings.TrimSpace(s.GlobalReason)

	if s.Entries == nil {
		// Nothing to trim; just make the slice non-nil.
		s.Entries = []OpsAlertSilenceEntry{}
		return
	}
	for idx := range s.Entries {
		entry := &s.Entries[idx]
		entry.UntilRFC3339 = strings.TrimSpace(entry.UntilRFC3339)
		entry.Reason = strings.TrimSpace(entry.Reason)
	}
}

// validateOpsDistributedLockSettings checks that s has a key that is non-empty
// after trimming and a TTL between 1 second and 24 hours (86400 seconds)
// inclusive. The key is checked first; nil is returned when both rules hold.
func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
	const maxTTLSeconds = int(24 * time.Hour / time.Second)

	switch {
	case strings.TrimSpace(s.Key) == "":
		return errors.New("distributed_lock.key is required")
	case s.TTLSeconds < 1, s.TTLSeconds > maxTTLSeconds:
		return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
	}
	return nil
}

// validateOpsAlertSilencingSettings checks the timestamps carried by s and
// returns the first violation found, or nil.
//
// The global "until" timestamp is optional: it may be empty, otherwise it must
// be RFC3339. Every entry must carry a non-empty RFC3339 "until" timestamp.
//
// Timestamps are trimmed before they are parsed. The emptiness test already
// ignored surrounding whitespace, and normalization trims these fields before
// they are stored, so a whitespace-padded but otherwise valid timestamp is
// accepted instead of being rejected by time.Parse.
func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
	isRFC3339 := func(value string) bool {
		_, err := time.Parse(time.RFC3339, value)
		return err == nil
	}

	if global := strings.TrimSpace(s.GlobalUntilRFC3339); global != "" && !isRFC3339(global) {
		return errors.New("silencing time must be RFC3339")
	}

	for _, entry := range s.Entries {
		until := strings.TrimSpace(entry.UntilRFC3339)
		if until == "" {
			return errors.New("silencing.entries.until_rfc3339 is required")
		}
		if !isRFC3339(until) {
			return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
		}
	}
	return nil
}

// GetOpsAlertRuntimeSettings loads the alert runtime settings from the setting
// repository.
//
// The built-in defaults are returned with a nil error when the service or its
// repository is nil, when no value is stored yet (in which case the defaults
// are also written back, ignoring any write failure), or when the stored value
// is not valid JSON. Other repository errors are returned unchanged. A decoded
// value is normalized before it is returned: a non-positive evaluation interval
// is replaced by the default, and the lock and silencing sections go through
// their normalizers.
func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
	fallback := defaultOpsAlertRuntimeSettings()
	if s == nil || s.settingRepo == nil {
		return fallback, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	stored, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
	if err != nil {
		if !errors.Is(err, ErrSettingNotFound) {
			return nil, err
		}
		// Nothing stored yet: seed the repository with the defaults on a
		// best-effort basis and serve them.
		if encoded, marshalErr := json.Marshal(fallback); marshalErr == nil {
			_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(encoded))
		}
		return fallback, nil
	}

	var settings OpsAlertRuntimeSettings
	if decodeErr := json.Unmarshal([]byte(stored), &settings); decodeErr != nil {
		return fallback, nil
	}

	if settings.EvaluationIntervalSeconds <= 0 {
		settings.EvaluationIntervalSeconds = fallback.EvaluationIntervalSeconds
	}
	normalizeOpsDistributedLockSettings(
		&settings.DistributedLock,
		opsAlertEvaluatorLeaderLockKeyDefault,
		fallback.DistributedLock.TTLSeconds,
	)
	normalizeOpsAlertSilencingSettings(&settings.Silencing)

	return &settings, nil
}

// UpdateOpsAlertRuntimeSettings validates cfg, normalizes it in place, stores
// it as JSON in the setting repository and returns a separately decoded copy.
//
// An error is returned when the service or its repository is nil, when cfg is
// nil, when the evaluation interval is outside 1..86400 seconds, when an
// enabled distributed lock or enabled silencing section fails validation, or
// when encoding/persisting fails. Disabled sections are not validated, but they
// are still normalized.
func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
	const maxIntervalSeconds = int(24 * time.Hour / time.Second)

	switch {
	case s == nil || s.settingRepo == nil:
		return nil, errors.New("setting repository not initialized")
	case cfg == nil:
		return nil, errors.New("invalid config")
	case cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > maxIntervalSeconds:
		return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
	}
	if ctx == nil {
		ctx = context.Background()
	}

	if lock := cfg.DistributedLock; lock.Enabled {
		if err := validateOpsDistributedLockSettings(lock); err != nil {
			return nil, err
		}
	}
	if silencing := cfg.Silencing; silencing.Enabled {
		if err := validateOpsAlertSilencingSettings(silencing); err != nil {
			return nil, err
		}
	}

	fallbackTTL := defaultOpsAlertRuntimeSettings().DistributedLock.TTLSeconds
	normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, fallbackTTL)
	normalizeOpsAlertSilencingSettings(&cfg.Silencing)

	encoded, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(encoded)); err != nil {
		return nil, err
	}

	// Hand back a fresh copy so callers do not hold pointers into slices that
	// may be mutated later. The decode error is deliberately ignored: the bytes
	// were produced by json.Marshal of the same type just above.
	fresh := new(OpsAlertRuntimeSettings)
	_ = json.Unmarshal(encoded, fresh)
	return fresh, nil
}

// =========================
// Advanced settings
// =========================

// defaultOpsAdvancedSettings returns a freshly allocated settings value holding
// the built-in advanced ops defaults.
//
// Fields that are not assigned below keep their zero value (false): data
// cleanup, aggregation, ignoring "no available accounts" errors, ignoring
// insufficient-balance errors, displaying OpenAI token stats, and auto refresh.
func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
	settings := new(OpsAdvancedSettings)

	// Cleanup itself stays disabled, but the schedule and the retention
	// windows are pre-filled.
	settings.DataRetention = OpsDataRetentionSettings{
		CleanupSchedule:            "0 2 * * *",
		ErrorLogRetentionDays:      30,
		MinuteMetricsRetentionDays: 30,
		HourlyMetricsRetentionDays: 30,
	}

	// count_tokens 404s are expected behavior, so they are ignored by default.
	settings.IgnoreCountTokensErrors = true
	// Client disconnects are not errors, so canceled contexts are ignored.
	settings.IgnoreContextCanceled = true
	// IgnoreNoAvailableAccounts stays false: that is a real routing issue.
	// IgnoreInsufficientBalanceErrors stays false: an insufficient balance may
	// need attention.

	settings.DisplayAlertEvents = true
	settings.AutoRefreshIntervalSec = 30

	return settings
}

// normalizeOpsAdvancedSettings canonicalizes cfg in place; a nil cfg is a
// no-op. It trims the cleanup schedule (falling back to "0 2 * * *" when
// empty), replaces negative retention windows with 30 days, and replaces a
// non-positive auto refresh interval with 30 seconds.
func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
	if cfg == nil {
		return
	}

	retention := &cfg.DataRetention

	if schedule := strings.TrimSpace(retention.CleanupSchedule); schedule != "" {
		retention.CleanupSchedule = schedule
	} else {
		retention.CleanupSchedule = "0 2 * * *"
	}

	// Retention days: 0 means every scheduled cleanup removes everything,
	// while a value > 0 keeps that many days. Only an illegal negative value
	// is replaced by the default, so an explicit 0 chosen by the user is never
	// overwritten.
	if retention.ErrorLogRetentionDays < 0 {
		retention.ErrorLogRetentionDays = 30
	}
	if retention.MinuteMetricsRetentionDays < 0 {
		retention.MinuteMetricsRetentionDays = 30
	}
	if retention.HourlyMetricsRetentionDays < 0 {
		retention.HourlyMetricsRetentionDays = 30
	}

	// Auto refresh interval defaults to 30 seconds when unset or invalid.
	if cfg.AutoRefreshIntervalSec <= 0 {
		cfg.AutoRefreshIntervalSec = 30
	}
}

// validateOpsAdvancedSettings reports the first rule that cfg violates, or nil
// when cfg is acceptable.
//
// Rules, checked in this order: cfg must be non-nil; the error-log,
// minute-metrics and hourly-metrics retention windows must each lie within
// 0..365 days; the auto refresh interval must lie within 15..300 seconds.
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
	if cfg == nil {
		return errors.New("invalid config")
	}

	retention := cfg.DataRetention
	refresh := cfg.AutoRefreshIntervalSec

	// Retention days: 0 means each cleanup removes everything, 1-365 keeps
	// that many days. Cases are evaluated top to bottom.
	switch {
	case retention.ErrorLogRetentionDays < 0 || retention.ErrorLogRetentionDays > 365:
		return errors.New("error_log_retention_days must be between 0 and 365")
	case retention.MinuteMetricsRetentionDays < 0 || retention.MinuteMetricsRetentionDays > 365:
		return errors.New("minute_metrics_retention_days must be between 0 and 365")
	case retention.HourlyMetricsRetentionDays < 0 || retention.HourlyMetricsRetentionDays > 365:
		return errors.New("hourly_metrics_retention_days must be between 0 and 365")
	case refresh < 15 || refresh > 300:
		return errors.New("auto_refresh_interval_seconds must be between 15 and 300")
	}
	return nil
}

// GetOpsAdvancedSettings loads the advanced ops settings from the setting
// repository.
//
// The built-in defaults are returned with a nil error when the service or its
// repository is nil, when no value is stored yet (in which case the defaults
// are also written back, ignoring any write failure), or when the stored value
// is not valid JSON. Other repository errors are returned unchanged. The stored
// JSON is decoded on top of a second copy of the defaults, so fields absent
// from the stored value keep their default, and the result is normalized.
func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
	fallback := defaultOpsAdvancedSettings()
	if s == nil || s.settingRepo == nil {
		return fallback, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	stored, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
	if err != nil {
		if !errors.Is(err, ErrSettingNotFound) {
			return nil, err
		}
		// Nothing stored yet: seed the repository (best-effort) and serve the
		// defaults.
		if encoded, marshalErr := json.Marshal(fallback); marshalErr == nil {
			_ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(encoded))
		}
		return fallback, nil
	}

	// Decode into a separate default instance: a failed decode may leave it
	// partially overwritten, and in that case the untouched fallback is served.
	settings := defaultOpsAdvancedSettings()
	if json.Unmarshal([]byte(stored), settings) != nil {
		return fallback, nil
	}

	normalizeOpsAdvancedSettings(settings)
	return settings, nil
}

// UpdateOpsAdvancedSettings validates cfg, normalizes it in place, stores it as
// JSON in the setting repository and returns a separately decoded copy.
//
// An error is returned when the service or its repository is nil, when cfg is
// nil, when validation fails, or when encoding/persisting fails.
func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
	switch {
	case s == nil || s.settingRepo == nil:
		return nil, errors.New("setting repository not initialized")
	case cfg == nil:
		return nil, errors.New("invalid config")
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// Validation runs on the caller's values; normalization follows.
	if err := validateOpsAdvancedSettings(cfg); err != nil {
		return nil, err
	}
	normalizeOpsAdvancedSettings(cfg)

	encoded, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(encoded)); err != nil {
		return nil, err
	}

	// Return a copy decoded from the persisted bytes. The decode error is
	// deliberately ignored: the bytes were produced by json.Marshal of the
	// same type just above.
	fresh := new(OpsAdvancedSettings)
	_ = json.Unmarshal(encoded, fresh)
	return fresh, nil
}

// =========================
// Metric thresholds
// =========================

// SettingKeyOpsMetricThresholds is the setting repository key under which the
// JSON-encoded ops metric thresholds are stored.
const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"

// defaultOpsMetricThresholds returns a freshly allocated threshold set holding
// the built-in defaults: SLA minimum 99.5, TTFT p99 maximum 500, and request /
// upstream error-rate maximums of 5 each. Every field points at its own value.
func defaultOpsMetricThresholds() *OpsMetricThresholds {
	// ptr gives each threshold its own addressable copy.
	ptr := func(v float64) *float64 { return &v }

	thresholds := new(OpsMetricThresholds)
	thresholds.SLAPercentMin = ptr(99.5)
	thresholds.TTFTp99MsMax = ptr(500.0)
	thresholds.RequestErrorRatePercentMax = ptr(5.0)
	thresholds.UpstreamErrorRatePercentMax = ptr(5.0)
	return thresholds
}

// GetMetricThresholds loads the ops metric thresholds from the setting
// repository.
//
// The built-in defaults are returned with a nil error when the service or its
// repository is nil, when no value is stored yet (in which case the defaults
// are also written back, ignoring any write failure), or when the stored value
// is not valid JSON. Other repository errors are returned unchanged. A decoded
// value is returned as stored; thresholds absent from it stay nil.
func (s *OpsService) GetMetricThresholds(ctx context.Context) (*OpsMetricThresholds, error) {
	fallback := defaultOpsMetricThresholds()
	if s == nil || s.settingRepo == nil {
		return fallback, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	stored, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMetricThresholds)
	switch {
	case err == nil:
		// A value exists; decode it below.
	case errors.Is(err, ErrSettingNotFound):
		// Nothing stored yet: seed the repository (best-effort) and serve the
		// defaults.
		if encoded, marshalErr := json.Marshal(fallback); marshalErr == nil {
			_ = s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(encoded))
		}
		return fallback, nil
	default:
		return nil, err
	}

	var thresholds OpsMetricThresholds
	if json.Unmarshal([]byte(stored), &thresholds) != nil {
		return fallback, nil
	}
	return &thresholds, nil
}

// UpdateMetricThresholds validates cfg, stores it as JSON in the setting
// repository and returns a separately decoded copy.
//
// Nil thresholds are not validated. When set, the SLA minimum and both
// error-rate maximums must lie within [0, 100] and the TTFT p99 maximum must be
// >= 0. An error is returned when the service or its repository is nil, when
// cfg is nil, when a threshold is out of range, or when encoding/persisting
// fails.
func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricThresholds) (*OpsMetricThresholds, error) {
	switch {
	case s == nil || s.settingRepo == nil:
		return nil, errors.New("setting repository not initialized")
	case cfg == nil:
		return nil, errors.New("invalid config")
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// percentOutOfRange reports whether a set threshold falls outside [0, 100].
	percentOutOfRange := func(v *float64) bool {
		return v != nil && (*v < 0 || *v > 100)
	}

	// Cases are evaluated top to bottom, so the first violated rule wins.
	switch {
	case percentOutOfRange(cfg.SLAPercentMin):
		return nil, errors.New("sla_percent_min must be between 0 and 100")
	case cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0:
		return nil, errors.New("ttft_p99_ms_max must be >= 0")
	case percentOutOfRange(cfg.RequestErrorRatePercentMax):
		return nil, errors.New("request_error_rate_percent_max must be between 0 and 100")
	case percentOutOfRange(cfg.UpstreamErrorRatePercentMax):
		return nil, errors.New("upstream_error_rate_percent_max must be between 0 and 100")
	}

	encoded, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(encoded)); err != nil {
		return nil, err
	}

	// Return a copy decoded from the persisted bytes. The decode error is
	// deliberately ignored: the bytes were produced by json.Marshal of the
	// same type just above.
	fresh := new(OpsMetricThresholds)
	_ = json.Unmarshal(encoded, fresh)
	return fresh, nil
}
-												feat(service): 实现运维监控业务逻辑层

- 新增 ops 主服务（ops_service.go）和端口定义（ops_port.go）
- 实现账号可用性检查服务（ops_account_availability.go）
- 实现数据聚合服务（ops_aggregation_service.go）
- 实现告警评估服务（ops_alert_evaluator_service.go）
- 实现告警管理服务（ops_alerts.go）
- 实现数据清理服务（ops_cleanup_service.go）
- 实现并发控制服务（ops_concurrency.go）
- 实现仪表板服务（ops_dashboard.go）
- 实现错误处理服务（ops_errors.go）
- 实现直方图服务（ops_histograms.go）
- 实现指标采集服务（ops_metrics_collector.go）
- 实现查询模式服务（ops_query_mode.go）
- 实现实时监控服务（ops_realtime.go）
- 实现请求详情服务（ops_request_details.go）
- 实现重试机制服务（ops_retry.go）
- 实现配置管理服务（ops_settings.go）
- 实现趋势分析服务（ops_trends.go）
- 实现窗口统计服务（ops_window_stats.go）
- 添加 ops 相关领域常量
- 注册 service 依赖注入

											
										
										
											2026-01-09 20:53:44 +08:00
+								package service
 								import (
 									"context"
 									"encoding/json"
 									"errors"
 									"strings"
 									"time"
 								)
 								const (
 									opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
 									opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
 								)
 								// =========================
 								// Email notification config
 								// =========================
 								func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
 									defaultCfg := defaultOpsEmailNotificationConfig()
 									if s == nil || s.settingRepo == nil {
 										return defaultCfg, nil
 									}
 									if ctx == nil {
 										ctx = context.Background()
 									}
 									raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
 									if err != nil {
 										if errors.Is(err, ErrSettingNotFound) {
 											// Initialize defaults on first read (best-effort).
 											if b, mErr := json.Marshal(defaultCfg); mErr == nil {
 												_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
 											}
 											return defaultCfg, nil
 										}
 										return nil, err
 									}
 									cfg := &OpsEmailNotificationConfig{}
 									if err := json.Unmarshal([]byte(raw), cfg); err != nil {
 										// Corrupted JSON should not break ops UI; fall back to defaults.
 										return defaultCfg, nil
 									}
 									normalizeOpsEmailNotificationConfig(cfg)
 									return cfg, nil
 								}
 								func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
 									if s == nil || s.settingRepo == nil {
 										return nil, errors.New("setting repository not initialized")
 									}
 									if ctx == nil {
 										ctx = context.Background()
 									}
 									if req == nil {
 										return nil, errors.New("invalid request")
 									}
 									cfg, err := s.GetEmailNotificationConfig(ctx)
 									if err != nil {
 										return nil, err
 									}
 									if req.Alert != nil {
 										cfg.Alert.Enabled = req.Alert.Enabled
 										if req.Alert.Recipients != nil {
 											cfg.Alert.Recipients = req.Alert.Recipients
 										}
 										cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
 										cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
 										cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
 										cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
 									}
 									if req.Report != nil {
 										cfg.Report.Enabled = req.Report.Enabled
 										if req.Report.Recipients != nil {
 											cfg.Report.Recipients = req.Report.Recipients
 										}
 										cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
 										cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
 										cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
 										cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
 										cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
 										cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
 										cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
 										cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
 										cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
 										cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
 									}
 									if err := validateOpsEmailNotificationConfig(cfg); err != nil {
 										return nil, err
 									}
 									normalizeOpsEmailNotificationConfig(cfg)
 									raw, err := json.Marshal(cfg)
 									if err != nil {
 										return nil, err
 									}
 									if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
 										return nil, err
 									}
 									return cfg, nil
 								}
 								func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
 									return &OpsEmailNotificationConfig{
 										Alert: OpsEmailAlertConfig{
 											Enabled:               true,
 											Recipients:            []string{},
 											MinSeverity:           "",
 											RateLimitPerHour:      0,
 											BatchingWindowSeconds: 0,
 											IncludeResolvedAlerts: false,
 										},
 										Report: OpsEmailReportConfig{
 											Enabled:                         false,
 											Recipients:                      []string{},
 											DailySummaryEnabled:             false,
 											DailySummarySchedule:            "0 9 * * *",
 											WeeklySummaryEnabled:            false,
 											WeeklySummarySchedule:           "0 9 * * 1",
 											ErrorDigestEnabled:              false,
 											ErrorDigestSchedule:             "0 9 * * *",
 											ErrorDigestMinCount:             10,
 											AccountHealthEnabled:            false,
 											AccountHealthSchedule:           "0 9 * * *",
 											AccountHealthErrorRateThreshold: 10.0,
 										},
 									}
 								}
 								func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
 									if cfg == nil {
 										return
 									}
 									if cfg.Alert.Recipients == nil {
 										cfg.Alert.Recipients = []string{}
 									}
 									if cfg.Report.Recipients == nil {
 										cfg.Report.Recipients = []string{}
 									}
 									cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
 									cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
 									cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
 									cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
 									cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)
 									// Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
 									if cfg.Report.DailySummarySchedule == "" {
 										cfg.Report.DailySummarySchedule = "0 9 * * *"
 									}
 									if cfg.Report.WeeklySummarySchedule == "" {
 										cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
 									}
 									if cfg.Report.ErrorDigestSchedule == "" {
 										cfg.Report.ErrorDigestSchedule = "0 9 * * *"
 									}
 									if cfg.Report.AccountHealthSchedule == "" {
 										cfg.Report.AccountHealthSchedule = "0 9 * * *"
 									}
 								}
 								func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
 									if cfg == nil {
 										return errors.New("invalid config")
 									}
 									if cfg.Alert.RateLimitPerHour < 0 {
 										return errors.New("alert.rate_limit_per_hour must be >= 0")
 									}
 									if cfg.Alert.BatchingWindowSeconds < 0 {
 										return errors.New("alert.batching_window_seconds must be >= 0")
 									}
 									switch strings.TrimSpace(cfg.Alert.MinSeverity) {
 									case "", "critical", "warning", "info":
 									default:
 										return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
 									}
 									if cfg.Report.ErrorDigestMinCount < 0 {
 										return errors.New("report.error_digest_min_count must be >= 0")
 									}
 									if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
 										return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
 									}
 									return nil
 								}
 								// =========================
 								// Alert runtime settings
 								// =========================
 								func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
 									return &OpsAlertRuntimeSettings{
 										EvaluationIntervalSeconds: 60,
 										DistributedLock: OpsDistributedLockSettings{
 											Enabled:    true,
 											Key:        opsAlertEvaluatorLeaderLockKeyDefault,
 											TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
 										},
 										Silencing: OpsAlertSilencingSettings{
 											Enabled:            false,
 											GlobalUntilRFC3339: "",
 											GlobalReason:       "",
 											Entries:            []OpsAlertSilenceEntry{},
 										},
 									}
 								}
 								func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
 									if s == nil {
 										return
 									}
 									s.Key = strings.TrimSpace(s.Key)
 									if s.Key == "" {
 										s.Key = defaultKey
 									}
 									if s.TTLSeconds <= 0 {
 										s.TTLSeconds = defaultTTLSeconds
 									}
 								}
 								func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
 									if s == nil {
 										return
 									}
 									s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
 									s.GlobalReason = strings.TrimSpace(s.GlobalReason)
 									if s.Entries == nil {
 										s.Entries = []OpsAlertSilenceEntry{}
 									}
 									for i := range s.Entries {
 										s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
 										s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
 									}
 								}
 								func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
 									if strings.TrimSpace(s.Key) == "" {
 										return errors.New("distributed_lock.key is required")
 									}
 									if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
 										return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
 									}
 									return nil
 								}
 								func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
 									parse := func(raw string) error {
 										if strings.TrimSpace(raw) == "" {
 											return nil
 										}
 										if _, err := time.Parse(time.RFC3339, raw); err != nil {
 											return errors.New("silencing time must be RFC3339")
 										}
 										return nil
 									}
 									if err := parse(s.GlobalUntilRFC3339); err != nil {
 										return err
 									}
 									for _, entry := range s.Entries {
 										if strings.TrimSpace(entry.UntilRFC3339) == "" {
 											return errors.New("silencing.entries.until_rfc3339 is required")
 										}
 										if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
 											return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
 										}
 									}
 									return nil
 								}
 								func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
 									defaultCfg := defaultOpsAlertRuntimeSettings()
 									if s == nil || s.settingRepo == nil {
 										return defaultCfg, nil
 									}
 									if ctx == nil {
 										ctx = context.Background()
 									}
 									raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
 									if err != nil {
 										if errors.Is(err, ErrSettingNotFound) {
 											if b, mErr := json.Marshal(defaultCfg); mErr == nil {
 												_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
 											}
 											return defaultCfg, nil
 										}
 										return nil, err
 									}
 									cfg := &OpsAlertRuntimeSettings{}
 									if err := json.Unmarshal([]byte(raw), cfg); err != nil {
 										return defaultCfg, nil
 									}
 									if cfg.EvaluationIntervalSeconds <= 0 {
 										cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
 									}
 									normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
 									normalizeOpsAlertSilencingSettings(&cfg.Silencing)
 									return cfg, nil
 								}
 								// UpdateOpsAlertRuntimeSettings validates cfg, runs the distributed-lock and
 								// silencing normalizers on it (they receive pointers into cfg), stores the
 								// result as JSON under SettingKeyOpsAlertRuntimeSettings, and returns a value
 								// decoded from the stored bytes.
 								//
 								// An error is returned when the service or its setting repository is nil, when
 								// cfg is nil, when evaluation_interval_seconds is outside [1, 86400], when an
 								// enabled distributed-lock or silencing section fails validation, or when
 								// marshalling or persisting fails. A nil ctx is replaced by
 								// context.Background().
 								func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
 									switch {
 									case s == nil || s.settingRepo == nil:
 										return nil, errors.New("setting repository not initialized")
 									case cfg == nil:
 										return nil, errors.New("invalid config")
 									}
 									if ctx == nil {
 										ctx = context.Background()
 									}
 									// The evaluation interval may be at most one day, expressed in seconds.
 									maxIntervalSeconds := int((24 * time.Hour).Seconds())
 									if interval := cfg.EvaluationIntervalSeconds; interval < 1 || interval > maxIntervalSeconds {
 										return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
 									}
 									// Sections are only validated when they are switched on.
 									if cfg.DistributedLock.Enabled {
 										lockErr := validateOpsDistributedLockSettings(cfg.DistributedLock)
 										if lockErr != nil {
 											return nil, lockErr
 										}
 									}
 									if cfg.Silencing.Enabled {
 										silenceErr := validateOpsAlertSilencingSettings(cfg.Silencing)
 										if silenceErr != nil {
 											return nil, silenceErr
 										}
 									}
 									lockTTLSeconds := defaultOpsAlertRuntimeSettings().DistributedLock.TTLSeconds
 									normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, lockTTLSeconds)
 									normalizeOpsAlertSilencingSettings(&cfg.Silencing)
 									encoded, err := json.Marshal(cfg)
 									if err != nil {
 										return nil, err
 									}
 									if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(encoded)); err != nil {
 										return nil, err
 									}
 									// Hand back a value decoded from the stored bytes rather than cfg itself, so
 									// the caller does not share slices with the value that was passed in. The
 									// decode error is deliberately ignored: encoded was produced by Marshal above.
 									stored := new(OpsAlertRuntimeSettings)
 									_ = json.Unmarshal(encoded, stored)
 									return stored, nil
 								}
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+								// =========================
 								// Advanced settings
 								// =========================
 								func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
 									return &OpsAdvancedSettings{
 										DataRetention: OpsDataRetentionSettings{
-												fix(ci): 修复最后一批CI错误

- 修复 ops_repo_trends.go 中剩余3处 Rows.Close 未检查错误
- 修复 ops_settings.go, ops_settings_models.go, ops_trends.go 的格式化问题

											
										
										
											2026-01-12 00:02:19 +08:00
+											CleanupEnabled:             false,
 											CleanupSchedule:            "0 2 * * *",
 											ErrorLogRetentionDays:      30,
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+											MinuteMetricsRetentionDays: 30,
 											HourlyMetricsRetentionDays: 30,
 										},
 										Aggregation: OpsAggregationSettings{
 											AggregationEnabled: false,
 										},
-												feat(ops): add ignore insufficient balance errors toggle and extract error constants

- Add 5th error filter switch IgnoreInsufficientBalanceErrors to suppress
  upstream insufficient balance / insufficient_quota errors from ops log
- Extract hardcoded error strings into package-level constants for
  shouldSkipOpsErrorLog, normalizeOpsErrorType, classifyOpsPhase, and
  classifyOpsIsBusinessLimited
- Define ErrNoAvailableAccounts sentinel error and replace all
  errors.New("no available accounts") call sites
- Update tests to use require.ErrorIs with the sentinel error

											
										
										
											2026-03-15 17:25:35 +08:00
+										IgnoreCountTokensErrors:         true,  // count_tokens 404 是预期行为，默认忽略
 										IgnoreContextCanceled:           true,  // Default to true - client disconnects are not errors
 										IgnoreNoAvailableAccounts:       false, // Default to false - this is a real routing issue
 										IgnoreInsufficientBalanceErrors: false, // 默认不忽略，余额不足可能需要关注
 										DisplayOpenAITokenStats:         false,
 										DisplayAlertEvents:              true,
 										AutoRefreshEnabled:              false,
 										AutoRefreshIntervalSec:          30,
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+									}
 								}
 								func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
 									if cfg == nil {
 										return
 									}
 									cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule)
 									if cfg.DataRetention.CleanupSchedule == "" {
 										cfg.DataRetention.CleanupSchedule = "0 2 * * *"
 									}
-												feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup

Background / 背景

The ops cleanup task currently rejects retention days < 1 in both validate
and normalize, so operators who want minimal-history setups (e.g. high
churn deployments that prefer near-realtime cleanup) cannot express that
intent through the UI. The only options are 1+ days, which keeps at least
24h of history regardless of cron frequency.

ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数，
让希望尽量不留历史的运维场景（高吞吐部署 + 想用近实时清理）无法通过 UI
表达。最低只能配 1，等于不管 cron 多频繁，至少都会保留 24 小时的历史。

Purpose / 目的

Let admins set retention days to 0, meaning "every scheduled cleanup
run wipes the corresponding table(s) entirely". Combined with a more
frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup.

允许管理员把保留天数设为 0，语义为"每次定时清理时把对应表全部清空"。
搭配更频繁的 cron（比如每小时整点）即可获得近似滚动清理的效果。

Changes / 改动内容

Backend

- service/ops_settings.go: validate accepts [0, 365]; normalize only
  refills default 30 when value is < 0 (negative is treated as legacy
  bad data, 0 is honoured)
- service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)`
  returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and
  short-circuits to a new `truncateOpsTable` helper that uses
  `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps
  the existing batched DELETE path unchanged. Empty tables skip
  TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely
- Extract `isMissingRelationError` helper to dedupe the "table not
  yet created" tolerance shared by both delete and truncate paths
- Add unit tests for `opsCleanupPlan` (three branches) and
  `isMissingRelationError`

后端

- service/ops_settings.go: validate 接受 [0, 365]；normalize 仅在 < 0
  时回填默认 30（负数视为脏数据，0 被尊重）
- service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回
  `(cutoff, truncate, ok)`。days==0 → truncate=true，走新增
  `truncateOpsTable`（TRUNCATE TABLE，O(1)，无 WAL、无 VACUUM 压力）；
  days>0 仍走原批量 DELETE 路径，行为完全不变。空表跳过 TRUNCATE，
  避免无意义的 ACCESS EXCLUSIVE 锁
- 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的
  "表不存在"宽容判断
- 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试

Frontend

- OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0
- i18n (zh/en): hint mentions "0 = wipe all on every cleanup",
  validation message updated to 0-365 range

前端

- OpsSettingsDialog.vue: 校验放宽到 [0, 365]，input min 改 0
- i18n（zh/en）：hint 补"0 = 每次清理时清空所有"，错误提示改 0-365

Trade-offs / 取舍

- TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only
  have the cleanup task as a writer, so the lock is invisible to other
  workloads
- Empty-table guard avoids the lock when there is nothing to clean
- Negative values are still treated as legacy bad data and replaced
  with default 30 to preserve compatibility

											
										
										
											2026-04-29 15:01:02 +08:00
+									// 保留天数：0 表示每次定时清理全部（清空所有），> 0 表示按天数保留；
 									// 仅在拿到非法的负数时回填默认值，避免覆盖用户主动设的 0。
 									if cfg.DataRetention.ErrorLogRetentionDays < 0 {
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+										cfg.DataRetention.ErrorLogRetentionDays = 30
 									}
-												feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup

Background / 背景

The ops cleanup task currently rejects retention days < 1 in both validate
and normalize, so operators who want minimal-history setups (e.g. high
churn deployments that prefer near-realtime cleanup) cannot express that
intent through the UI. The only options are 1+ days, which keeps at least
24h of history regardless of cron frequency.

ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数，
让希望尽量不留历史的运维场景（高吞吐部署 + 想用近实时清理）无法通过 UI
表达。最低只能配 1，等于不管 cron 多频繁，至少都会保留 24 小时的历史。

Purpose / 目的

Let admins set retention days to 0, meaning "every scheduled cleanup
run wipes the corresponding table(s) entirely". Combined with a more
frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup.

允许管理员把保留天数设为 0，语义为"每次定时清理时把对应表全部清空"。
搭配更频繁的 cron（比如每小时整点）即可获得近似滚动清理的效果。

Changes / 改动内容

Backend

- service/ops_settings.go: validate accepts [0, 365]; normalize only
  refills default 30 when value is < 0 (negative is treated as legacy
  bad data, 0 is honoured)
- service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)`
  returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and
  short-circuits to a new `truncateOpsTable` helper that uses
  `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps
  the existing batched DELETE path unchanged. Empty tables skip
  TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely
- Extract `isMissingRelationError` helper to dedupe the "table not
  yet created" tolerance shared by both delete and truncate paths
- Add unit tests for `opsCleanupPlan` (three branches) and
  `isMissingRelationError`

后端

- service/ops_settings.go: validate 接受 [0, 365]；normalize 仅在 < 0
  时回填默认 30（负数视为脏数据，0 被尊重）
- service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回
  `(cutoff, truncate, ok)`。days==0 → truncate=true，走新增
  `truncateOpsTable`（TRUNCATE TABLE，O(1)，无 WAL、无 VACUUM 压力）；
  days>0 仍走原批量 DELETE 路径，行为完全不变。空表跳过 TRUNCATE，
  避免无意义的 ACCESS EXCLUSIVE 锁
- 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的
  "表不存在"宽容判断
- 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试

Frontend

- OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0
- i18n (zh/en): hint mentions "0 = wipe all on every cleanup",
  validation message updated to 0-365 range

前端

- OpsSettingsDialog.vue: 校验放宽到 [0, 365]，input min 改 0
- i18n（zh/en）：hint 补"0 = 每次清理时清空所有"，错误提示改 0-365

Trade-offs / 取舍

- TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only
  have the cleanup task as a writer, so the lock is invisible to other
  workloads
- Empty-table guard avoids the lock when there is nothing to clean
- Negative values are still treated as legacy bad data and replaced
  with default 30 to preserve compatibility

											
										
										
											2026-04-29 15:01:02 +08:00
+									if cfg.DataRetention.MinuteMetricsRetentionDays < 0 {
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+										cfg.DataRetention.MinuteMetricsRetentionDays = 30
 									}
-												feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup

Background / 背景

The ops cleanup task currently rejects retention days < 1 in both validate
and normalize, so operators who want minimal-history setups (e.g. high
churn deployments that prefer near-realtime cleanup) cannot express that
intent through the UI. The only options are 1+ days, which keeps at least
24h of history regardless of cron frequency.

ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数，
让希望尽量不留历史的运维场景（高吞吐部署 + 想用近实时清理）无法通过 UI
表达。最低只能配 1，等于不管 cron 多频繁，至少都会保留 24 小时的历史。

Purpose / 目的

Let admins set retention days to 0, meaning "every scheduled cleanup
run wipes the corresponding table(s) entirely". Combined with a more
frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup.

允许管理员把保留天数设为 0，语义为"每次定时清理时把对应表全部清空"。
搭配更频繁的 cron（比如每小时整点）即可获得近似滚动清理的效果。

Changes / 改动内容

Backend

- service/ops_settings.go: validate accepts [0, 365]; normalize only
  refills default 30 when value is < 0 (negative is treated as legacy
  bad data, 0 is honoured)
- service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)`
  returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and
  short-circuits to a new `truncateOpsTable` helper that uses
  `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps
  the existing batched DELETE path unchanged. Empty tables skip
  TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely
- Extract `isMissingRelationError` helper to dedupe the "table not
  yet created" tolerance shared by both delete and truncate paths
- Add unit tests for `opsCleanupPlan` (three branches) and
  `isMissingRelationError`

后端

- service/ops_settings.go: validate 接受 [0, 365]；normalize 仅在 < 0
  时回填默认 30（负数视为脏数据，0 被尊重）
- service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回
  `(cutoff, truncate, ok)`。days==0 → truncate=true，走新增
  `truncateOpsTable`（TRUNCATE TABLE，O(1)，无 WAL、无 VACUUM 压力）；
  days>0 仍走原批量 DELETE 路径，行为完全不变。空表跳过 TRUNCATE，
  避免无意义的 ACCESS EXCLUSIVE 锁
- 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的
  "表不存在"宽容判断
- 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试

Frontend

- OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0
- i18n (zh/en): hint mentions "0 = wipe all on every cleanup",
  validation message updated to 0-365 range

前端

- OpsSettingsDialog.vue: 校验放宽到 [0, 365]，input min 改 0
- i18n（zh/en）：hint 补"0 = 每次清理时清空所有"，错误提示改 0-365

Trade-offs / 取舍

- TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only
  have the cleanup task as a writer, so the lock is invisible to other
  workloads
- Empty-table guard avoids the lock when there is nothing to clean
- Negative values are still treated as legacy bad data and replaced
  with default 30 to preserve compatibility

											
										
										
											2026-04-29 15:01:02 +08:00
+									if cfg.DataRetention.HourlyMetricsRetentionDays < 0 {
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+										cfg.DataRetention.HourlyMetricsRetentionDays = 30
 									}
-												feat(ops): 添加 count_tokens 错误过滤功能

功能特性：
- 自动识别并标记 count_tokens 请求的错误
- 支持配置是否在统计中忽略 count_tokens 错误
- 错误数据完整保留，仅在统计时动态过滤

技术实现：
- ops_error_logger.go: 自动标记 count_tokens 请求
- ops_repo.go: INSERT 语句添加 is_count_tokens 字段
- ops_repo_dashboard.go: buildErrorWhere 核心过滤函数
- ops_repo_preagg.go: 预聚合统计中添加过滤
- ops_repo_trends.go: 趋势统计查询添加过滤（2 处）
- ops_settings_models.go: 添加 ignore_count_tokens_errors 配置
- ops_settings.go: 配置验证和默认值设置
- ops_port.go: 错误日志模型添加 IsCountTokens 字段

业务价值：
- count_tokens 是探测性请求，其错误不影响真实业务 SLA
- 用户可根据需求灵活控制是否计入统计
- 提升错误率、告警等运维指标的准确性

影响范围：
- Dashboard 概览统计
- 错误趋势图表
- 告警规则评估
- 预聚合指标（hourly/daily）
- 健康分数计算

											
										
										
											2026-01-12 16:50:41 +08:00
+									// Normalize auto refresh interval (default 30 seconds)
 									if cfg.AutoRefreshIntervalSec <= 0 {
 										cfg.AutoRefreshIntervalSec = 30
 									}
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+								}
 								func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
 									if cfg == nil {
 										return errors.New("invalid config")
 									}
-												feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup

Background / 背景

The ops cleanup task currently rejects retention days < 1 in both validate
and normalize, so operators who want minimal-history setups (e.g. high
churn deployments that prefer near-realtime cleanup) cannot express that
intent through the UI. The only options are 1+ days, which keeps at least
24h of history regardless of cron frequency.

ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数，
让希望尽量不留历史的运维场景（高吞吐部署 + 想用近实时清理）无法通过 UI
表达。最低只能配 1，等于不管 cron 多频繁，至少都会保留 24 小时的历史。

Purpose / 目的

Let admins set retention days to 0, meaning "every scheduled cleanup
run wipes the corresponding table(s) entirely". Combined with a more
frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup.

允许管理员把保留天数设为 0，语义为"每次定时清理时把对应表全部清空"。
搭配更频繁的 cron（比如每小时整点）即可获得近似滚动清理的效果。

Changes / 改动内容

Backend

- service/ops_settings.go: validate accepts [0, 365]; normalize only
  refills default 30 when value is < 0 (negative is treated as legacy
  bad data, 0 is honoured)
- service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)`
  returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and
  short-circuits to a new `truncateOpsTable` helper that uses
  `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps
  the existing batched DELETE path unchanged. Empty tables skip
  TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely
- Extract `isMissingRelationError` helper to dedupe the "table not
  yet created" tolerance shared by both delete and truncate paths
- Add unit tests for `opsCleanupPlan` (three branches) and
  `isMissingRelationError`

后端

- service/ops_settings.go: validate 接受 [0, 365]；normalize 仅在 < 0
  时回填默认 30（负数视为脏数据，0 被尊重）
- service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回
  `(cutoff, truncate, ok)`。days==0 → truncate=true，走新增
  `truncateOpsTable`（TRUNCATE TABLE，O(1)，无 WAL、无 VACUUM 压力）；
  days>0 仍走原批量 DELETE 路径，行为完全不变。空表跳过 TRUNCATE，
  避免无意义的 ACCESS EXCLUSIVE 锁
- 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的
  "表不存在"宽容判断
- 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试

Frontend

- OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0
- i18n (zh/en): hint mentions "0 = wipe all on every cleanup",
  validation message updated to 0-365 range

前端

- OpsSettingsDialog.vue: 校验放宽到 [0, 365]，input min 改 0
- i18n（zh/en）：hint 补"0 = 每次清理时清空所有"，错误提示改 0-365

Trade-offs / 取舍

- TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only
  have the cleanup task as a writer, so the lock is invisible to other
  workloads
- Empty-table guard avoids the lock when there is nothing to clean
- Negative values are still treated as legacy bad data and replaced
  with default 30 to preserve compatibility

											
										
										
											2026-04-29 15:01:02 +08:00
+									// 保留天数：0 表示每次清理全部，1-365 表示按天数保留。
 									if cfg.DataRetention.ErrorLogRetentionDays < 0 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
 										return errors.New("error_log_retention_days must be between 0 and 365")
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+									}
-												feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup

Background / 背景

The ops cleanup task currently rejects retention days < 1 in both validate
and normalize, so operators who want minimal-history setups (e.g. high
churn deployments that prefer near-realtime cleanup) cannot express that
intent through the UI. The only options are 1+ days, which keeps at least
24h of history regardless of cron frequency.

ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数，
让希望尽量不留历史的运维场景（高吞吐部署 + 想用近实时清理）无法通过 UI
表达。最低只能配 1，等于不管 cron 多频繁，至少都会保留 24 小时的历史。

Purpose / 目的

Let admins set retention days to 0, meaning "every scheduled cleanup
run wipes the corresponding table(s) entirely". Combined with a more
frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup.

允许管理员把保留天数设为 0，语义为"每次定时清理时把对应表全部清空"。
搭配更频繁的 cron（比如每小时整点）即可获得近似滚动清理的效果。

Changes / 改动内容

Backend

- service/ops_settings.go: validate accepts [0, 365]; normalize only
  refills default 30 when value is < 0 (negative is treated as legacy
  bad data, 0 is honoured)
- service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)`
  returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and
  short-circuits to a new `truncateOpsTable` helper that uses
  `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps
  the existing batched DELETE path unchanged. Empty tables skip
  TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely
- Extract `isMissingRelationError` helper to dedupe the "table not
  yet created" tolerance shared by both delete and truncate paths
- Add unit tests for `opsCleanupPlan` (three branches) and
  `isMissingRelationError`

后端

- service/ops_settings.go: validate 接受 [0, 365]；normalize 仅在 < 0
  时回填默认 30（负数视为脏数据，0 被尊重）
- service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回
  `(cutoff, truncate, ok)`。days==0 → truncate=true，走新增
  `truncateOpsTable`（TRUNCATE TABLE，O(1)，无 WAL、无 VACUUM 压力）；
  days>0 仍走原批量 DELETE 路径，行为完全不变。空表跳过 TRUNCATE，
  避免无意义的 ACCESS EXCLUSIVE 锁
- 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的
  "表不存在"宽容判断
- 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试

Frontend

- OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0
- i18n (zh/en): hint mentions "0 = wipe all on every cleanup",
  validation message updated to 0-365 range

前端

- OpsSettingsDialog.vue: 校验放宽到 [0, 365]，input min 改 0
- i18n（zh/en）：hint 补"0 = 每次清理时清空所有"，错误提示改 0-365

Trade-offs / 取舍

- TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only
  have the cleanup task as a writer, so the lock is invisible to other
  workloads
- Empty-table guard avoids the lock when there is nothing to clean
- Negative values are still treated as legacy bad data and replaced
  with default 30 to preserve compatibility

											
										
										
											2026-04-29 15:01:02 +08:00
+									if cfg.DataRetention.MinuteMetricsRetentionDays < 0 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
 										return errors.New("minute_metrics_retention_days must be between 0 and 365")
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+									}
-												feat(ops): allow retention days = 0 to wipe table on each scheduled cleanup

Background / 背景

The ops cleanup task currently rejects retention days < 1 in both validate
and normalize, so operators who want minimal-history setups (e.g. high
churn deployments that prefer near-realtime cleanup) cannot express that
intent through the UI. The only options are 1+ days, which keeps at least
24h of history regardless of cron frequency.

ops 清理任务目前在 validate 和 normalize 两处都拒绝小于 1 的保留天数，
让希望尽量不留历史的运维场景（高吞吐部署 + 想用近实时清理）无法通过 UI
表达。最低只能配 1，等于不管 cron 多频繁，至少都会保留 24 小时的历史。

Purpose / 目的

Let admins set retention days to 0, meaning "every scheduled cleanup
run wipes the corresponding table(s) entirely". Combined with a more
frequent cron (e.g. `0 * * * *`) this yields effectively rolling cleanup.

允许管理员把保留天数设为 0，语义为"每次定时清理时把对应表全部清空"。
搭配更频繁的 cron（比如每小时整点）即可获得近似滚动清理的效果。

Changes / 改动内容

Backend

- service/ops_settings.go: validate accepts [0, 365]; normalize only
  refills default 30 when value is < 0 (negative is treated as legacy
  bad data, 0 is honoured)
- service/ops_cleanup_service.go: introduce `opsCleanupPlan(now, days)`
  returning `(cutoff, truncate, ok)`. days==0 returns truncate=true and
  short-circuits to a new `truncateOpsTable` helper that uses
  `TRUNCATE TABLE` (O(1), no WAL, no VACUUM pressure). days>0 keeps
  the existing batched DELETE path unchanged. Empty tables skip
  TRUNCATE to avoid the ACCESS EXCLUSIVE lock entirely
- Extract `isMissingRelationError` helper to dedupe the "table not
  yet created" tolerance shared by both delete and truncate paths
- Add unit tests for `opsCleanupPlan` (three branches) and
  `isMissingRelationError`

后端

- service/ops_settings.go: validate 接受 [0, 365]；normalize 仅在 < 0
  时回填默认 30（负数视为脏数据，0 被尊重）
- service/ops_cleanup_service.go: 抽 `opsCleanupPlan(now, days)` 返回
  `(cutoff, truncate, ok)`。days==0 → truncate=true，走新增
  `truncateOpsTable`（TRUNCATE TABLE，O(1)，无 WAL、无 VACUUM 压力）；
  days>0 仍走原批量 DELETE 路径，行为完全不变。空表跳过 TRUNCATE，
  避免无意义的 ACCESS EXCLUSIVE 锁
- 抽 `isMissingRelationError` helper 复用 delete / truncate 两处的
  "表不存在"宽容判断
- 补 `opsCleanupPlan` 三分支 + `isMissingRelationError` 单元测试

Frontend

- OpsSettingsDialog.vue: validation accepts [0, 365]; input min=0
- i18n (zh/en): hint mentions "0 = wipe all on every cleanup",
  validation message updated to 0-365 range

前端

- OpsSettingsDialog.vue: 校验放宽到 [0, 365]，input min 改 0
- i18n（zh/en）：hint 补"0 = 每次清理时清空所有"，错误提示改 0-365

Trade-offs / 取舍

- TRUNCATE requires ACCESS EXCLUSIVE lock briefly, but ops tables only
  have the cleanup task as a writer, so the lock is invisible to other
  workloads
- Empty-table guard avoids the lock when there is nothing to clean
- Negative values are still treated as legacy bad data and replaced
  with default 30 to preserve compatibility

											
										
										
											2026-04-29 15:01:02 +08:00
+									if cfg.DataRetention.HourlyMetricsRetentionDays < 0 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
 										return errors.New("hourly_metrics_retention_days must be between 0 and 365")
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+									}
-												feat(ops): 添加 count_tokens 错误过滤功能

功能特性：
- 自动识别并标记 count_tokens 请求的错误
- 支持配置是否在统计中忽略 count_tokens 错误
- 错误数据完整保留，仅在统计时动态过滤

技术实现：
- ops_error_logger.go: 自动标记 count_tokens 请求
- ops_repo.go: INSERT 语句添加 is_count_tokens 字段
- ops_repo_dashboard.go: buildErrorWhere 核心过滤函数
- ops_repo_preagg.go: 预聚合统计中添加过滤
- ops_repo_trends.go: 趋势统计查询添加过滤（2 处）
- ops_settings_models.go: 添加 ignore_count_tokens_errors 配置
- ops_settings.go: 配置验证和默认值设置
- ops_port.go: 错误日志模型添加 IsCountTokens 字段

业务价值：
- count_tokens 是探测性请求，其错误不影响真实业务 SLA
- 用户可根据需求灵活控制是否计入统计
- 提升错误率、告警等运维指标的准确性

影响范围：
- Dashboard 概览统计
- 错误趋势图表
- 告警规则评估
- 预聚合指标（hourly/daily）
- 健康分数计算

											
										
										
											2026-01-12 16:50:41 +08:00
+									if cfg.AutoRefreshIntervalSec < 15 || cfg.AutoRefreshIntervalSec > 300 {
 										return errors.New("auto_refresh_interval_seconds must be between 15 and 300")
 									}
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+									return nil
 								}
 								func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
 									defaultCfg := defaultOpsAdvancedSettings()
 									if s == nil || s.settingRepo == nil {
 										return defaultCfg, nil
 									}
 									if ctx == nil {
 										ctx = context.Background()
 									}
 									raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
 									if err != nil {
 										if errors.Is(err, ErrSettingNotFound) {
 											if b, mErr := json.Marshal(defaultCfg); mErr == nil {
 												_ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b))
 											}
 											return defaultCfg, nil
 										}
 										return nil, err
 									}
-												feat(ops): allow hiding alert events

											
										
										
											2026-03-13 17:18:04 +08:00
+									cfg := defaultOpsAdvancedSettings()
-												feat(ops): 添加高级设置API支持

- 新增OpsAdvancedSettings数据模型
- 支持数据保留策略配置（错误日志、分钟级指标、小时级指标）
- 支持数据聚合开关配置
- 添加GET/PUT /admin/ops/advanced-settings接口
- 添加配置校验和默认值处理

相关文件：
- backend/internal/service/ops_settings_models.go
- backend/internal/service/ops_settings.go
- backend/internal/handler/admin/ops_settings_handler.go
- backend/internal/server/routes/admin.go
- backend/internal/service/domain_constants.go

											
										
										
											2026-01-11 19:51:18 +08:00
+									if err := json.Unmarshal([]byte(raw), cfg); err != nil {
 										return defaultCfg, nil
 									}
 									normalizeOpsAdvancedSettings(cfg)
 									return cfg, nil
 								}
 								// UpdateOpsAdvancedSettings validates cfg, normalizes it in place, stores it
 								// as JSON under SettingKeyOpsAdvancedSettings, and returns a value decoded
 								// from the stored bytes.
 								//
 								// An error is returned when the service or its setting repository is nil,
 								// when cfg is nil or fails validateOpsAdvancedSettings, or when marshalling
 								// or persisting fails. A nil ctx is replaced by context.Background().
 								func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
 									switch {
 									case s == nil || s.settingRepo == nil:
 										return nil, errors.New("setting repository not initialized")
 									case cfg == nil:
 										return nil, errors.New("invalid config")
 									}
 									if ctx == nil {
 										ctx = context.Background()
 									}
 									// Validation runs before normalization, so out-of-range input is rejected
 									// rather than silently replaced by a default.
 									validationErr := validateOpsAdvancedSettings(cfg)
 									if validationErr != nil {
 										return nil, validationErr
 									}
 									normalizeOpsAdvancedSettings(cfg)
 									encoded, err := json.Marshal(cfg)
 									if err != nil {
 										return nil, err
 									}
 									if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(encoded)); err != nil {
 										return nil, err
 									}
 									// Return a separate value decoded from what was stored; the decode error is
 									// deliberately ignored because encoded was produced by Marshal above.
 									stored := new(OpsAdvancedSettings)
 									_ = json.Unmarshal(encoded, stored)
 									return stored, nil
 								}
-												feat(ops): 后端添加指标阈值管理API

- 新增GetMetricThresholds和UpdateMetricThresholds接口
- 支持配置SLA、延迟P99、TTFT P99、请求错误率、上游错误率阈值
- 添加参数验证逻辑
- 提供默认阈值配置

											
										
										
											2026-01-12 11:42:56 +08:00
 								// =========================
 								// Metric thresholds
 								// =========================
 								// SettingKeyOpsMetricThresholds is the settings key under which the ops metric
 								// thresholds are stored as a JSON string.
 								const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
 								// defaultOpsMetricThresholds returns a newly allocated OpsMetricThresholds with
 								// the built-in limits: SLA at least 99.5%, TTFT p99 at most 500 ms, and request
 								// and upstream error rates at most 5% each. Every call allocates its own
 								// backing values, so callers may modify the result freely.
 								func defaultOpsMetricThresholds() *OpsMetricThresholds {
 									var (
 										minSLAPercent           = 99.5
 										maxTTFTp99Ms            = 500.0
 										maxRequestErrPercent    = 5.0
 										maxUpstreamErrorPercent = 5.0
 									)
 									defaults := new(OpsMetricThresholds)
 									defaults.SLAPercentMin = &minSLAPercent
 									defaults.TTFTp99MsMax = &maxTTFTp99Ms
 									defaults.RequestErrorRatePercentMax = &maxRequestErrPercent
 									defaults.UpstreamErrorRatePercentMax = &maxUpstreamErrorPercent
 									return defaults
 								}
 								// GetMetricThresholds loads the ops metric thresholds from the setting
 								// repository.
 								//
 								// The defaults from defaultOpsMetricThresholds are returned when the
 								// service or its setting repository is nil, when no value has been stored
 								// yet (the defaults are then also written back, best-effort), or when the
 								// stored value is not valid JSON for OpsMetricThresholds. Any other
 								// repository error is returned unchanged. A nil ctx is replaced with
 								// context.Background().
 								func (s *OpsService) GetMetricThresholds(ctx context.Context) (*OpsMetricThresholds, error) {
 									fallback := defaultOpsMetricThresholds()
 									if s == nil || s.settingRepo == nil {
 										return fallback, nil
 									}
 									if ctx == nil {
 										ctx = context.Background()
 									}

 									stored, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMetricThresholds)
 									switch {
 									case err == nil:
 										// A value exists; it is decoded below.
 									case errors.Is(err, ErrSettingNotFound):
 										// Nothing stored yet: seed the repository with the defaults.
 										// Encoding or write failures are ignored on purpose.
 										if seed, encErr := json.Marshal(fallback); encErr == nil {
 											_ = s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(seed))
 										}
 										return fallback, nil
 									default:
 										return nil, err
 									}

 									var thresholds OpsMetricThresholds
 									if decErr := json.Unmarshal([]byte(stored), &thresholds); decErr != nil {
 										// Undecodable stored data yields the defaults rather than an error.
 										return fallback, nil
 									}
 									return &thresholds, nil
 								}
 								// UpdateMetricThresholds validates the given thresholds, stores them as
 								// JSON under SettingKeyOpsMetricThresholds and returns a copy decoded from
 								// the stored bytes.
 								//
 								// Nil threshold fields are not validated. When set, SLAPercentMin,
 								// RequestErrorRatePercentMax and UpstreamErrorRatePercentMax are rejected
 								// if below 0 or above 100, and TTFTp99MsMax is rejected if below 0. An
 								// error is also returned when the service or its setting repository is
 								// nil, when cfg is nil, or when encoding or storing fails. A nil ctx is
 								// replaced with context.Background().
 								func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricThresholds) (*OpsMetricThresholds, error) {
 									if s == nil || s.settingRepo == nil {
 										return nil, errors.New("setting repository not initialized")
 									}
 									if cfg == nil {
 										return nil, errors.New("invalid config")
 									}
 									if ctx == nil {
 										ctx = context.Background()
 									}

 									// outsidePercentRange reports whether p is set and lies below 0 or
 									// above 100.
 									outsidePercentRange := func(p *float64) bool {
 										return p != nil && (*p < 0 || *p > 100)
 									}

 									// Cases are checked top to bottom, so the first failing field decides
 									// which error is reported.
 									switch {
 									case outsidePercentRange(cfg.SLAPercentMin):
 										return nil, errors.New("sla_percent_min must be between 0 and 100")
 									case cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0:
 										return nil, errors.New("ttft_p99_ms_max must be >= 0")
 									case outsidePercentRange(cfg.RequestErrorRatePercentMax):
 										return nil, errors.New("request_error_rate_percent_max must be between 0 and 100")
 									case outsidePercentRange(cfg.UpstreamErrorRatePercentMax):
 										return nil, errors.New("upstream_error_rate_percent_max must be between 0 and 100")
 									}

 									encoded, err := json.Marshal(cfg)
 									if err != nil {
 										return nil, err
 									}
 									if setErr := s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(encoded)); setErr != nil {
 										return nil, setErr
 									}

 									// Decode the bytes that were just stored so the result does not alias
 									// cfg. A decode error is intentionally ignored: the thresholds are
 									// already saved at this point.
 									var stored OpsMetricThresholds
 									_ = json.Unmarshal(encoded, &stored)
 									return &stored, nil
 								}