package service

import (
	"context"
	"encoding/json"
	"errors"
	"strings"
	"time"
)
const (
	opsAlertEvaluatorLeaderLockKeyDefault = "ops:alert:evaluator:leader"
	opsAlertEvaluatorLeaderLockTTLDefault = 30 * time.Second
)
// =========================
// Email notification config
// =========================
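
// GetEmailNotificationConfig loads the email notification config from the setting
// repository. A missing setting is seeded with defaults (best-effort), and unparseable
// JSON falls back to defaults instead of failing.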
func (s *OpsService) GetEmailNotificationConfig(ctx context.Context) (*OpsEmailNotificationConfig, error) {
	defaultCfg := defaultOpsEmailNotificationConfig()
	if s == nil || s.settingRepo == nil {
		return defaultCfg, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsEmailNotificationConfig)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			// Initialize defaults on first read (best-effort).
			if b, mErr := json.Marshal(defaultCfg); mErr == nil {
				_ = s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(b))
			}
			return defaultCfg, nil
		}
		return nil, err
	}

	cfg := &OpsEmailNotificationConfig{}
	if err := json.Unmarshal([]byte(raw), cfg); err != nil {
		// Corrupted JSON should not break ops UI; fall back to defaults.
		return defaultCfg, nil
	}
	normalizeOpsEmailNotificationConfig(cfg)
	return cfg, nil
}
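
// UpdateEmailNotificationConfig merges the fields supplied in req into the current
// config, validates and normalizes the result, and persists it as JSON.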
func (s *OpsService) UpdateEmailNotificationConfig(ctx context.Context, req *OpsEmailNotificationConfigUpdateRequest) (*OpsEmailNotificationConfig, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if req == nil {
		return nil, errors.New("invalid request")
	}

	cfg, err := s.GetEmailNotificationConfig(ctx)
	if err != nil {
		return nil, err
	}

	if req.Alert != nil {
		cfg.Alert.Enabled = req.Alert.Enabled
		if req.Alert.Recipients != nil {
			cfg.Alert.Recipients = req.Alert.Recipients
		}
		cfg.Alert.MinSeverity = strings.TrimSpace(req.Alert.MinSeverity)
		cfg.Alert.RateLimitPerHour = req.Alert.RateLimitPerHour
		cfg.Alert.BatchingWindowSeconds = req.Alert.BatchingWindowSeconds
		cfg.Alert.IncludeResolvedAlerts = req.Alert.IncludeResolvedAlerts
	}

	if req.Report != nil {
		cfg.Report.Enabled = req.Report.Enabled
		if req.Report.Recipients != nil {
			cfg.Report.Recipients = req.Report.Recipients
		}
		cfg.Report.DailySummaryEnabled = req.Report.DailySummaryEnabled
		cfg.Report.DailySummarySchedule = strings.TrimSpace(req.Report.DailySummarySchedule)
		cfg.Report.WeeklySummaryEnabled = req.Report.WeeklySummaryEnabled
		cfg.Report.WeeklySummarySchedule = strings.TrimSpace(req.Report.WeeklySummarySchedule)
		cfg.Report.ErrorDigestEnabled = req.Report.ErrorDigestEnabled
		cfg.Report.ErrorDigestSchedule = strings.TrimSpace(req.Report.ErrorDigestSchedule)
		cfg.Report.ErrorDigestMinCount = req.Report.ErrorDigestMinCount
		cfg.Report.AccountHealthEnabled = req.Report.AccountHealthEnabled
		cfg.Report.AccountHealthSchedule = strings.TrimSpace(req.Report.AccountHealthSchedule)
		cfg.Report.AccountHealthErrorRateThreshold = req.Report.AccountHealthErrorRateThreshold
	}

	if err := validateOpsEmailNotificationConfig(cfg); err != nil {
		return nil, err
	}

	normalizeOpsEmailNotificationConfig(cfg)
	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsEmailNotificationConfig, string(raw)); err != nil {
		return nil, err
	}
	return cfg, nil
}
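
// defaultOpsEmailNotificationConfig returns the built-in defaults: alert mail enabled with
// no recipients, and all scheduled reports disabled with 09:00 cron schedules.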
func defaultOpsEmailNotificationConfig() *OpsEmailNotificationConfig {
	return &OpsEmailNotificationConfig{
		Alert: OpsEmailAlertConfig{
			Enabled:               true,
			Recipients:            []string{},
			MinSeverity:           "",
			RateLimitPerHour:      0,
			BatchingWindowSeconds: 0,
			IncludeResolvedAlerts: false,
		},
		Report: OpsEmailReportConfig{
			Enabled:                         false,
			Recipients:                      []string{},
			DailySummaryEnabled:             false,
			DailySummarySchedule:            "0 9 * * *",
			WeeklySummaryEnabled:            false,
			WeeklySummarySchedule:           "0 9 * * 1",
			ErrorDigestEnabled:              false,
			ErrorDigestSchedule:             "0 9 * * *",
			ErrorDigestMinCount:             10,
			AccountHealthEnabled:            false,
			AccountHealthSchedule:           "0 9 * * *",
			AccountHealthErrorRateThreshold: 10.0,
		},
	}
}
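
// normalizeOpsEmailNotificationConfig replaces nil recipient slices with empty ones,
// trims whitespace, and fills empty report schedules with their defaults.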
func normalizeOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) {
	if cfg == nil {
		return
	}
	if cfg.Alert.Recipients == nil {
		cfg.Alert.Recipients = []string{}
	}
	if cfg.Report.Recipients == nil {
		cfg.Report.Recipients = []string{}
	}

	cfg.Alert.MinSeverity = strings.TrimSpace(cfg.Alert.MinSeverity)
	cfg.Report.DailySummarySchedule = strings.TrimSpace(cfg.Report.DailySummarySchedule)
	cfg.Report.WeeklySummarySchedule = strings.TrimSpace(cfg.Report.WeeklySummarySchedule)
	cfg.Report.ErrorDigestSchedule = strings.TrimSpace(cfg.Report.ErrorDigestSchedule)
	cfg.Report.AccountHealthSchedule = strings.TrimSpace(cfg.Report.AccountHealthSchedule)

	// Fill missing schedules with defaults to avoid breaking cron logic if clients send empty strings.
	if cfg.Report.DailySummarySchedule == "" {
		cfg.Report.DailySummarySchedule = "0 9 * * *"
	}
	if cfg.Report.WeeklySummarySchedule == "" {
		cfg.Report.WeeklySummarySchedule = "0 9 * * 1"
	}
	if cfg.Report.ErrorDigestSchedule == "" {
		cfg.Report.ErrorDigestSchedule = "0 9 * * *"
	}
	if cfg.Report.AccountHealthSchedule == "" {
		cfg.Report.AccountHealthSchedule = "0 9 * * *"
	}
}
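
// validateOpsEmailNotificationConfig rejects negative rate limits, batching windows and
// digest counts, unknown severities, and health thresholds outside [0, 100].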
func validateOpsEmailNotificationConfig(cfg *OpsEmailNotificationConfig) error {
	if cfg == nil {
		return errors.New("invalid config")
	}

	if cfg.Alert.RateLimitPerHour < 0 {
		return errors.New("alert.rate_limit_per_hour must be >= 0")
	}
	if cfg.Alert.BatchingWindowSeconds < 0 {
		return errors.New("alert.batching_window_seconds must be >= 0")
	}
	switch strings.TrimSpace(cfg.Alert.MinSeverity) {
	case "", "critical", "warning", "info":
	default:
		return errors.New("alert.min_severity must be one of: critical, warning, info, or empty")
	}

	if cfg.Report.ErrorDigestMinCount < 0 {
		return errors.New("report.error_digest_min_count must be >= 0")
	}
	if cfg.Report.AccountHealthErrorRateThreshold < 0 || cfg.Report.AccountHealthErrorRateThreshold > 100 {
		return errors.New("report.account_health_error_rate_threshold must be between 0 and 100")
	}
	return nil
}
// =========================
// Alert runtime settings
// =========================
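
// defaultOpsAlertRuntimeSettings returns the built-in runtime defaults: a 60-second
// evaluation interval, the distributed leader lock enabled, and silencing disabled.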
func defaultOpsAlertRuntimeSettings() *OpsAlertRuntimeSettings {
	return &OpsAlertRuntimeSettings{
		EvaluationIntervalSeconds: 60,
		DistributedLock: OpsDistributedLockSettings{
			Enabled:    true,
			Key:        opsAlertEvaluatorLeaderLockKeyDefault,
			TTLSeconds: int(opsAlertEvaluatorLeaderLockTTLDefault.Seconds()),
		},
		Silencing: OpsAlertSilencingSettings{
			Enabled:            false,
			GlobalUntilRFC3339: "",
			GlobalReason:       "",
			Entries:            []OpsAlertSilenceEntry{},
		},
	}
}
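
// normalizeOpsDistributedLockSettings trims the lock key and falls back to the supplied
// defaults when the key is empty or the TTL is not positive.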
func normalizeOpsDistributedLockSettings(s *OpsDistributedLockSettings, defaultKey string, defaultTTLSeconds int) {
	if s == nil {
		return
	}
	s.Key = strings.TrimSpace(s.Key)
	if s.Key == "" {
		s.Key = defaultKey
	}
	if s.TTLSeconds <= 0 {
		s.TTLSeconds = defaultTTLSeconds
	}
}
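
// normalizeOpsAlertSilencingSettings trims timestamps and reasons and replaces a nil
// entries slice with an empty one.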
func normalizeOpsAlertSilencingSettings(s *OpsAlertSilencingSettings) {
	if s == nil {
		return
	}
	s.GlobalUntilRFC3339 = strings.TrimSpace(s.GlobalUntilRFC3339)
	s.GlobalReason = strings.TrimSpace(s.GlobalReason)
	if s.Entries == nil {
		s.Entries = []OpsAlertSilenceEntry{}
	}
	for i := range s.Entries {
		s.Entries[i].UntilRFC3339 = strings.TrimSpace(s.Entries[i].UntilRFC3339)
		s.Entries[i].Reason = strings.TrimSpace(s.Entries[i].Reason)
	}
}
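
// validateOpsDistributedLockSettings requires a non-empty lock key and a TTL between
// 1 second and 24 hours.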
func validateOpsDistributedLockSettings(s OpsDistributedLockSettings) error {
	if strings.TrimSpace(s.Key) == "" {
		return errors.New("distributed_lock.key is required")
	}
	if s.TTLSeconds <= 0 || s.TTLSeconds > int((24*time.Hour).Seconds()) {
		return errors.New("distributed_lock.ttl_seconds must be between 1 and 86400")
	}
	return nil
}
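
// validateOpsAlertSilencingSettings requires RFC3339 timestamps: the global "until" value
// may be empty, but every silence entry must carry a parseable until_rfc3339.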
func validateOpsAlertSilencingSettings(s OpsAlertSilencingSettings) error {
	parse := func(raw string) error {
		if strings.TrimSpace(raw) == "" {
			return nil
		}
		if _, err := time.Parse(time.RFC3339, raw); err != nil {
			return errors.New("silencing time must be RFC3339")
		}
		return nil
	}

	if err := parse(s.GlobalUntilRFC3339); err != nil {
		return err
	}
	for _, entry := range s.Entries {
		if strings.TrimSpace(entry.UntilRFC3339) == "" {
			return errors.New("silencing.entries.until_rfc3339 is required")
		}
		if _, err := time.Parse(time.RFC3339, entry.UntilRFC3339); err != nil {
			return errors.New("silencing.entries.until_rfc3339 must be RFC3339")
		}
	}
	return nil
}
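
// GetOpsAlertRuntimeSettings loads the alert evaluator runtime settings, seeding defaults
// when the setting is missing and normalizing whatever was stored.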
func (s *OpsService) GetOpsAlertRuntimeSettings(ctx context.Context) (*OpsAlertRuntimeSettings, error) {
	defaultCfg := defaultOpsAlertRuntimeSettings()
	if s == nil || s.settingRepo == nil {
		return defaultCfg, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAlertRuntimeSettings)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			if b, mErr := json.Marshal(defaultCfg); mErr == nil {
				_ = s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(b))
			}
			return defaultCfg, nil
		}
		return nil, err
	}

	cfg := &OpsAlertRuntimeSettings{}
	if err := json.Unmarshal([]byte(raw), cfg); err != nil {
		return defaultCfg, nil
	}

	if cfg.EvaluationIntervalSeconds <= 0 {
		cfg.EvaluationIntervalSeconds = defaultCfg.EvaluationIntervalSeconds
	}
	normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
	normalizeOpsAlertSilencingSettings(&cfg.Silencing)

	return cfg, nil
}
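
// UpdateOpsAlertRuntimeSettings validates, normalizes, and persists the runtime settings,
// returning a detached copy decoded from the stored JSON.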
func (s *OpsService) UpdateOpsAlertRuntimeSettings(ctx context.Context, cfg *OpsAlertRuntimeSettings) (*OpsAlertRuntimeSettings, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if cfg == nil {
		return nil, errors.New("invalid config")
	}

	if cfg.EvaluationIntervalSeconds < 1 || cfg.EvaluationIntervalSeconds > int((24*time.Hour).Seconds()) {
		return nil, errors.New("evaluation_interval_seconds must be between 1 and 86400")
	}
	if cfg.DistributedLock.Enabled {
		if err := validateOpsDistributedLockSettings(cfg.DistributedLock); err != nil {
			return nil, err
		}
	}
	if cfg.Silencing.Enabled {
		if err := validateOpsAlertSilencingSettings(cfg.Silencing); err != nil {
			return nil, err
		}
	}

	defaultCfg := defaultOpsAlertRuntimeSettings()
	normalizeOpsDistributedLockSettings(&cfg.DistributedLock, opsAlertEvaluatorLeaderLockKeyDefault, defaultCfg.DistributedLock.TTLSeconds)
	normalizeOpsAlertSilencingSettings(&cfg.Silencing)

	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsAlertRuntimeSettings, string(raw)); err != nil {
		return nil, err
	}

	// Return a fresh copy (avoid callers holding pointers into internal slices that may be mutated).
	updated := &OpsAlertRuntimeSettings{}
	_ = json.Unmarshal(raw, updated)
	return updated, nil
}
// =========================
// Advanced settings
// =========================
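
// defaultOpsAdvancedSettings returns the built-in advanced defaults: cleanup and
// aggregation disabled, 30-day retention windows, and a 30-second auto refresh interval.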
func defaultOpsAdvancedSettings() *OpsAdvancedSettings {
	return &OpsAdvancedSettings{
		DataRetention: OpsDataRetentionSettings{
			CleanupEnabled:             false,
			CleanupSchedule:            "0 2 * * *",
			ErrorLogRetentionDays:      30,
			MinuteMetricsRetentionDays: 30,
			HourlyMetricsRetentionDays: 30,
		},
		Aggregation: OpsAggregationSettings{
			AggregationEnabled: false,
		},
		IgnoreCountTokensErrors:         true,  // count_tokens 404s are expected behavior; ignore by default
		IgnoreContextCanceled:           true,  // Default to true - client disconnects are not errors
		IgnoreNoAvailableAccounts:       false, // Default to false - this is a real routing issue
		IgnoreInsufficientBalanceErrors: false, // Not ignored by default - insufficient balance may need attention
		DisplayOpenAITokenStats:         false,
		DisplayAlertEvents:              true,
		AutoRefreshEnabled:              false,
		AutoRefreshIntervalSec:          30,
	}
}
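
// normalizeOpsAdvancedSettings backfills defaults for empty cleanup schedules, negative
// retention days, and non-positive auto refresh intervals; an explicit retention of 0
// (wipe on every cleanup) is preserved.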
func normalizeOpsAdvancedSettings(cfg *OpsAdvancedSettings) {
	if cfg == nil {
		return
	}
	cfg.DataRetention.CleanupSchedule = strings.TrimSpace(cfg.DataRetention.CleanupSchedule)
	if cfg.DataRetention.CleanupSchedule == "" {
		cfg.DataRetention.CleanupSchedule = "0 2 * * *"
	}
	// Retention days: 0 means every scheduled cleanup wipes the corresponding table(s);
	// > 0 keeps that many days of history. Only refill the default when the stored value
	// is an invalid negative number, so a deliberately configured 0 is preserved.
	if cfg.DataRetention.ErrorLogRetentionDays < 0 {
		cfg.DataRetention.ErrorLogRetentionDays = 30
	}
	if cfg.DataRetention.MinuteMetricsRetentionDays < 0 {
		cfg.DataRetention.MinuteMetricsRetentionDays = 30
	}
	if cfg.DataRetention.HourlyMetricsRetentionDays < 0 {
		cfg.DataRetention.HourlyMetricsRetentionDays = 30
	}
	// Normalize auto refresh interval (default 30 seconds).
	if cfg.AutoRefreshIntervalSec <= 0 {
		cfg.AutoRefreshIntervalSec = 30
	}
}
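
// validateOpsAdvancedSettings checks that retention days fall within [0, 365] (0 meaning
// wipe everything on each cleanup) and that the auto refresh interval is within [15, 300] seconds.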
func validateOpsAdvancedSettings(cfg *OpsAdvancedSettings) error {
	if cfg == nil {
		return errors.New("invalid config")
	}
	// Retention days: 0 means wipe everything on each cleanup; 1-365 means keep that many days.
	if cfg.DataRetention.ErrorLogRetentionDays < 0 || cfg.DataRetention.ErrorLogRetentionDays > 365 {
		return errors.New("error_log_retention_days must be between 0 and 365")
	}
	if cfg.DataRetention.MinuteMetricsRetentionDays < 0 || cfg.DataRetention.MinuteMetricsRetentionDays > 365 {
		return errors.New("minute_metrics_retention_days must be between 0 and 365")
	}
	if cfg.DataRetention.HourlyMetricsRetentionDays < 0 || cfg.DataRetention.HourlyMetricsRetentionDays > 365 {
		return errors.New("hourly_metrics_retention_days must be between 0 and 365")
	}
	if cfg.AutoRefreshIntervalSec < 15 || cfg.AutoRefreshIntervalSec > 300 {
		return errors.New("auto_refresh_interval_seconds must be between 15 and 300")
	}
	return nil
}
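
// GetOpsAdvancedSettings loads the advanced settings, unmarshalling over a defaults value
// so fields absent from the stored JSON keep their defaults, then normalizes the result.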
func (s *OpsService) GetOpsAdvancedSettings(ctx context.Context) (*OpsAdvancedSettings, error) {
	defaultCfg := defaultOpsAdvancedSettings()
	if s == nil || s.settingRepo == nil {
		return defaultCfg, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsAdvancedSettings)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			if b, mErr := json.Marshal(defaultCfg); mErr == nil {
				_ = s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(b))
			}
			return defaultCfg, nil
		}
		return nil, err
	}

	cfg := defaultOpsAdvancedSettings()
	if err := json.Unmarshal([]byte(raw), cfg); err != nil {
		return defaultCfg, nil
	}

	normalizeOpsAdvancedSettings(cfg)
	return cfg, nil
}
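
// UpdateOpsAdvancedSettings validates, normalizes, and persists the advanced settings,
// returning a detached copy decoded from the stored JSON.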
func (s *OpsService) UpdateOpsAdvancedSettings(ctx context.Context, cfg *OpsAdvancedSettings) (*OpsAdvancedSettings, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if cfg == nil {
		return nil, errors.New("invalid config")
	}

	if err := validateOpsAdvancedSettings(cfg); err != nil {
		return nil, err
	}

	normalizeOpsAdvancedSettings(cfg)
	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsAdvancedSettings, string(raw)); err != nil {
		return nil, err
	}

	updated := &OpsAdvancedSettings{}
	_ = json.Unmarshal(raw, updated)
	return updated, nil
}
// =========================
// Metric thresholds
// =========================
const SettingKeyOpsMetricThresholds = "ops_metric_thresholds"
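
// defaultOpsMetricThresholds returns the default thresholds: 99.5% minimum SLA, 500ms
// maximum TTFT p99, and 5% maximum request and upstream error rates.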
func defaultOpsMetricThresholds() *OpsMetricThresholds {
	slaMin := 99.5
	ttftMax := 500.0
	reqErrMax := 5.0
	upstreamErrMax := 5.0
	return &OpsMetricThresholds{
		SLAPercentMin:               &slaMin,
		TTFTp99MsMax:                &ttftMax,
		RequestErrorRatePercentMax:  &reqErrMax,
		UpstreamErrorRatePercentMax: &upstreamErrMax,
	}
}
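
// GetMetricThresholds loads the metric thresholds, seeding and returning defaults when the
// setting is missing or the stored JSON cannot be parsed.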
func (s *OpsService) GetMetricThresholds(ctx context.Context) (*OpsMetricThresholds, error) {
	defaultCfg := defaultOpsMetricThresholds()
	if s == nil || s.settingRepo == nil {
		return defaultCfg, nil
	}
	if ctx == nil {
		ctx = context.Background()
	}

	raw, err := s.settingRepo.GetValue(ctx, SettingKeyOpsMetricThresholds)
	if err != nil {
		if errors.Is(err, ErrSettingNotFound) {
			if b, mErr := json.Marshal(defaultCfg); mErr == nil {
				_ = s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(b))
			}
			return defaultCfg, nil
		}
		return nil, err
	}

	cfg := &OpsMetricThresholds{}
	if err := json.Unmarshal([]byte(raw), cfg); err != nil {
		return defaultCfg, nil
	}

	return cfg, nil
}
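
// UpdateMetricThresholds validates each threshold that is set (percentages within [0, 100],
// TTFT >= 0), persists the config as JSON, and returns a detached copy.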
func (s *OpsService) UpdateMetricThresholds(ctx context.Context, cfg *OpsMetricThresholds) (*OpsMetricThresholds, error) {
	if s == nil || s.settingRepo == nil {
		return nil, errors.New("setting repository not initialized")
	}
	if ctx == nil {
		ctx = context.Background()
	}
	if cfg == nil {
		return nil, errors.New("invalid config")
	}

	// Validate thresholds.
	if cfg.SLAPercentMin != nil && (*cfg.SLAPercentMin < 0 || *cfg.SLAPercentMin > 100) {
		return nil, errors.New("sla_percent_min must be between 0 and 100")
	}
	if cfg.TTFTp99MsMax != nil && *cfg.TTFTp99MsMax < 0 {
		return nil, errors.New("ttft_p99_ms_max must be >= 0")
	}
	if cfg.RequestErrorRatePercentMax != nil && (*cfg.RequestErrorRatePercentMax < 0 || *cfg.RequestErrorRatePercentMax > 100) {
		return nil, errors.New("request_error_rate_percent_max must be between 0 and 100")
	}
	if cfg.UpstreamErrorRatePercentMax != nil && (*cfg.UpstreamErrorRatePercentMax < 0 || *cfg.UpstreamErrorRatePercentMax > 100) {
		return nil, errors.New("upstream_error_rate_percent_max must be between 0 and 100")
	}

	raw, err := json.Marshal(cfg)
	if err != nil {
		return nil, err
	}
	if err := s.settingRepo.Set(ctx, SettingKeyOpsMetricThresholds, string(raw)); err != nil {
		return nil, err
	}

	updated := &OpsMetricThresholds{}
	_ = json.Unmarshal(raw, updated)
	return updated, nil
}