backend/internal/service/ops_alert_service_integration_test.go

//go:build integration

package service

import (
	"context"
	"database/sql"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

// TestOpsAlertService_StartedViaWireProviders_RunsIndependentTicker guards the
// DI startup contract for OpsAlertService.
//
// Background:
//   - OpsMetricsCollector used to call alertService.Start()/Evaluate() itself.
//   - Those calls were removed, so OpsAlertService now has to be started by DI
//     (ProvideOpsAlertService in wire.go) and drive its own evaluation ticker.
//
// Assertions:
//  1. Building the services through the Wire provider functions
//     (ProvideOpsAlertService + ProvideOpsMetricsCollector) starts
//     OpsAlertService without any explicit Start call.
//  2. The evaluation loop keeps ticking after OpsMetricsCollector is stopped,
//     i.e. the alert evaluator does not depend on the collector.
//  3. The evaluation path reaches the alert logic (CreateAlertEvent is called).
func TestOpsAlertService_StartedViaWireProviders_RunsIndependentTicker(t *testing.T) {
	// Polling budget shared by both Eventually checks below.
	const (
		waitFor = 1 * time.Second
		tick    = 5 * time.Millisecond
	)

	// Shorten the evaluation interval for the duration of the test and restore
	// it afterwards. Cleanups run last-in-first-out, so the service registered
	// below is stopped before the interval is put back.
	prevInterval := opsAlertEvalInterval
	opsAlertEvalInterval = 25 * time.Millisecond
	t.Cleanup(func() { opsAlertEvalInterval = prevInterval })

	repo := newFakeOpsRepository()
	opsService := NewOpsService(repo, nil)

	// Production DI path: the provider is expected to start the service.
	alertService := ProvideOpsAlertService(opsService, nil, nil)
	t.Cleanup(alertService.Stop)

	// Build the collector through its provider (wire.go) and stop it right away
	// so every tick observed from here on happens without the collector.
	collector := ProvideOpsMetricsCollector(opsService, NewConcurrencyService(nil))
	collector.Stop()

	// evaluatedAtLeast builds a poll condition on the ListAlertRules call count.
	evaluatedAtLeast := func(n int64) func() bool {
		return func() bool { return repo.listRulesCalls.Load() >= n }
	}

	// At least one evaluation must happen (run() calls evaluateOnce immediately).
	require.Eventually(t, evaluatedAtLeast(1), waitFor, tick)

	// Two further evaluations prove the loop is still ticking with the
	// collector stopped.
	baseline := repo.listRulesCalls.Load()
	require.Eventually(t, evaluatedAtLeast(baseline+2), waitFor, tick)

	// Finally, the evaluation logic must create an alert event at least once.
	select {
	case <-repo.eventCreatedCh:
		// An event was created.
	case <-time.After(2 * time.Second):
		t.Fatalf("expected OpsAlertService to create an alert event, but none was created (ListAlertRules calls=%d)", repo.listRulesCalls.Load())
	}
}

// newFakeOpsRepository returns a fakeOpsRepository whose eventCreatedCh is an
// open, unbuffered channel; all other fields keep their zero values.
func newFakeOpsRepository() *fakeOpsRepository {
	repo := new(fakeOpsRepository)
	repo.eventCreatedCh = make(chan struct{})
	return repo
}

// fakeOpsRepository is a lightweight in-memory stub of OpsRepository for integration tests.
// It avoids real DB/Redis usage and provides deterministic responses fast.
type fakeOpsRepository struct {
	// listRulesCalls counts ListAlertRules invocations; the test polls it to
	// observe evaluation ticks.
	listRulesCalls atomic.Int64

	// mu is held by the alert-event methods while they read or write
	// activeEvent, latestEvent and nextEventID.
	mu             sync.Mutex
	// activeEvent is set by CreateAlertEvent when the created event has firing status.
	activeEvent    *OpsAlertEvent
	// latestEvent is the copy of the event most recently passed to CreateAlertEvent.
	latestEvent    *OpsAlertEvent
	// nextEventID is incremented by CreateAlertEvent to assign event IDs.
	nextEventID    int64
	// eventCreatedCh is closed on the first CreateAlertEvent call with a non-nil event.
	eventCreatedCh chan struct{}
	// eventOnce ensures eventCreatedCh is closed at most once.
	eventOnce      sync.Once
}

// CreateErrorLog is a no-op stub: it ignores log and always returns nil.
func (r *fakeOpsRepository) CreateErrorLog(ctx context.Context, log *OpsErrorLog) error {
	return nil
}

// ListErrorLogsLegacy is a stub: it ignores filters and always returns a nil
// slice and a nil error.
func (r *fakeOpsRepository) ListErrorLogsLegacy(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, error) {
	return nil, nil
}

// ListErrorLogs is a stub: it ignores filter and always returns a nil slice,
// a zero int64 and a nil error.
func (r *fakeOpsRepository) ListErrorLogs(ctx context.Context, filter *ErrorLogFilter) ([]*ErrorLog, int64, error) {
	return nil, 0, nil
}

// GetLatestSystemMetric always returns sql.ErrNoRows together with a
// placeholder OpsMetrics whose WindowMinutes is 1.
//
// NOTE(review): returning a non-nil value alongside a non-nil error is
// unusual; presumably this simulates an empty metrics table. Confirm that
// callers do not rely on the returned value when err != nil.
func (r *fakeOpsRepository) GetLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) {
	return &OpsMetrics{WindowMinutes: 1}, sql.ErrNoRows
}

// CreateSystemMetric is a no-op stub: it ignores metric and always returns nil.
func (r *fakeOpsRepository) CreateSystemMetric(ctx context.Context, metric *OpsMetrics) error {
	return nil
}

// GetWindowStats is a stub: it ignores the time range and always returns a
// pointer to a zero-valued OpsWindowStats with a nil error.
func (r *fakeOpsRepository) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (*OpsWindowStats, error) {
	return new(OpsWindowStats), nil
}

// GetProviderStats is a stub: it ignores the time range and always returns a
// nil slice and a nil error.
func (r *fakeOpsRepository) GetProviderStats(ctx context.Context, startTime, endTime time.Time) ([]*ProviderStats, error) {
	return nil, nil
}

// GetLatencyHistogram is a stub: it ignores the time range and always returns
// a nil slice and a nil error.
func (r *fakeOpsRepository) GetLatencyHistogram(ctx context.Context, startTime, endTime time.Time) ([]*LatencyHistogramItem, error) {
	return nil, nil
}

// GetErrorDistribution is a stub: it ignores the time range and always returns
// a nil slice and a nil error.
func (r *fakeOpsRepository) GetErrorDistribution(ctx context.Context, startTime, endTime time.Time) ([]*ErrorDistributionItem, error) {
	return nil, nil
}

// ListRecentSystemMetrics fabricates limit metric samples (one sample when
// limit is not positive). Every sample echoes windowMinutes, reports a CPU
// usage of 99 percent, and is stamped one opsMetricsInterval earlier than the
// previous one, starting at the current time. The error is always nil.
func (r *fakeOpsRepository) ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]OpsMetrics, error) {
	count := limit
	if count <= 0 {
		count = 1
	}

	base := time.Now()
	samples := make([]OpsMetrics, count)
	for idx := range samples {
		// Index 0 is stamped with base; each later index is one interval older.
		age := time.Duration(idx) * opsMetricsInterval
		samples[idx] = OpsMetrics{
			WindowMinutes:   windowMinutes,
			CPUUsagePercent: 99,
			UpdatedAt:       base.Add(-age),
		}
	}
	return samples, nil
}

// ListSystemMetricsRange is a stub: it ignores all arguments and always
// returns a nil slice and a nil error.
func (r *fakeOpsRepository) ListSystemMetricsRange(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error) {
	return nil, nil
}

// ListAlertRules increments listRulesCalls on every invocation. The first four
// calls return no rules; from the fifth call onward a single enabled CPU rule
// is returned. The error is always nil.
func (r *fakeOpsRepository) ListAlertRules(ctx context.Context) ([]OpsAlertRule, error) {
	// Hold the rule back for the first few calls so the test has time to stop
	// OpsMetricsCollector and then watch the evaluator tick on its own.
	if r.listRulesCalls.Add(1) < 5 {
		return nil, nil
	}

	cpuRule := OpsAlertRule{
		ID:               1,
		Name:             "cpu too high (test)",
		Enabled:          true,
		MetricType:       OpsMetricCPUUsagePercent,
		Operator:         ">",
		Threshold:        0,
		WindowMinutes:    1,
		SustainedMinutes: 1,
		Severity:         "P1",
		NotifyEmail:      false,
		NotifyWebhook:    false,
		CooldownMinutes:  0,
	}
	return []OpsAlertRule{cpuRule}, nil
}

// GetActiveAlertEvent returns a copy of the stored active event when it exists,
// belongs to ruleID and has status OpsAlertStatusFiring; otherwise it returns
// nil. The error is always nil.
func (r *fakeOpsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
	r.mu.Lock()
	defer r.mu.Unlock()

	current := r.activeEvent
	if current == nil || current.RuleID != ruleID || current.Status != OpsAlertStatusFiring {
		return nil, nil
	}

	// Return a copy so the caller cannot mutate the stored event.
	snapshot := *current
	return &snapshot, nil
}

// GetLatestAlertEvent returns a copy of the most recently created event when
// it belongs to ruleID, and nil otherwise. The error is always nil.
func (r *fakeOpsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (*OpsAlertEvent, error) {
	r.mu.Lock()
	defer r.mu.Unlock()

	if latest := r.latestEvent; latest != nil && latest.RuleID == ruleID {
		// Return a copy so the caller cannot mutate the stored event.
		snapshot := *latest
		return &snapshot, nil
	}
	return nil, nil
}

// CreateAlertEvent assigns the next sequential ID to event (writing it back
// into the caller's struct), stores a copy as the latest event and, when the
// status is OpsAlertStatusFiring, also as the active event. The first
// successful call closes eventCreatedCh. A nil event is ignored. The error is
// always nil.
func (r *fakeOpsRepository) CreateAlertEvent(ctx context.Context, event *OpsAlertEvent) error {
	if event == nil {
		return nil
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	r.nextEventID++
	event.ID = r.nextEventID

	// Keep a private copy; latestEvent and activeEvent share it when firing.
	stored := new(OpsAlertEvent)
	*stored = *event
	r.latestEvent = stored
	if stored.Status == OpsAlertStatusFiring {
		r.activeEvent = stored
	}

	// Signal waiters exactly once, no matter how many events follow.
	r.eventOnce.Do(func() {
		close(r.eventCreatedCh)
	})
	return nil
}

// UpdateAlertEventStatus sets Status and ResolvedAt on the stored active and
// latest events whose ID equals eventID. Events with a different ID, and nil
// slots, are left alone. The error is always nil.
func (r *fakeOpsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt *time.Time) error {
	r.mu.Lock()
	defer r.mu.Unlock()

	// Visit the active slot first, then the latest slot.
	for _, stored := range []*OpsAlertEvent{r.activeEvent, r.latestEvent} {
		if stored == nil || stored.ID != eventID {
			continue
		}
		stored.Status = status
		stored.ResolvedAt = resolvedAt
	}
	return nil
}

// UpdateAlertEventNotifications records the email/webhook delivery flags on
// the stored active and latest events whose ID equals eventID. The error is
// always nil.
func (r *fakeOpsRepository) UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error {
	r.mu.Lock()
	defer r.mu.Unlock()

	// mark writes the flags when the slot holds the requested event.
	mark := func(stored *OpsAlertEvent) {
		if stored == nil || stored.ID != eventID {
			return
		}
		stored.EmailSent = emailSent
		stored.WebhookSent = webhookSent
	}
	mark(r.activeEvent)
	mark(r.latestEvent)
	return nil
}

// CountActiveAlerts reports how many alert events are currently firing. The
// fake tracks at most one active event, so the result is 0 or 1. The error is
// always nil.
//
// An event only counts while its status is OpsAlertStatusFiring.
// UpdateAlertEventStatus changes the stored event's status in place and never
// clears activeEvent, so a nil check alone would keep counting an event that
// has already been resolved. Filtering on the status keeps this method
// consistent with GetActiveAlertEvent.
func (r *fakeOpsRepository) CountActiveAlerts(ctx context.Context) (int, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.activeEvent == nil || r.activeEvent.Status != OpsAlertStatusFiring {
		return 0, nil
	}
	return 1, nil
}

// GetOverviewStats is a stub: it ignores the time range and always returns a
// pointer to a zero-valued OverviewStats with a nil error.
func (r *fakeOpsRepository) GetOverviewStats(ctx context.Context, startTime, endTime time.Time) (*OverviewStats, error) {
	return new(OverviewStats), nil
}

// GetCachedLatestSystemMetric is a stub: it always returns a nil metric and a
// nil error.
func (r *fakeOpsRepository) GetCachedLatestSystemMetric(ctx context.Context) (*OpsMetrics, error) {
	return nil, nil
}

// SetCachedLatestSystemMetric is a no-op stub: it ignores metric and always
// returns nil.
func (r *fakeOpsRepository) SetCachedLatestSystemMetric(ctx context.Context, metric *OpsMetrics) error {
	return nil
}

// GetCachedDashboardOverview is a stub: it ignores timeRange and always
// returns nil data and a nil error.
func (r *fakeOpsRepository) GetCachedDashboardOverview(ctx context.Context, timeRange string) (*DashboardOverviewData, error) {
	return nil, nil
}

// SetCachedDashboardOverview is a no-op stub: it ignores timeRange, data and
// ttl, and always returns nil.
func (r *fakeOpsRepository) SetCachedDashboardOverview(ctx context.Context, timeRange string, data *DashboardOverviewData, ttl time.Duration) error {
	return nil
}

// PingRedis is a no-op stub: it performs no network call and always returns nil.
func (r *fakeOpsRepository) PingRedis(ctx context.Context) error {
	return nil
}
运维监控系统安全加固和功能优化 (#21) * fix(ops): 修复运维监控系统的关键安全和稳定性问题 ## 修复内容 ### P0 严重问题 1. DNS Rebinding防护 (ops_alert_service.go) - 实现IP钉住机制防止验证后的DNS rebinding攻击 - 自定义Transport.DialContext强制只允许拨号到验证过的公网IP - 扩展IP黑名单，包括云metadata地址(169.254.169.254) - 添加完整的单元测试覆盖 2. OpsAlertService生命周期管理 (wire.go) - 在ProvideOpsMetricsCollector中添加opsAlertService.Start()调用 - 确保stopCtx正确初始化，避免nil指针问题 - 实现防御式启动，保证服务启动顺序 3. 数据库查询排序 (ops_repo.go) - 在ListRecentSystemMetrics中添加显式ORDER BY updated_at DESC, id DESC - 在GetLatestSystemMetric中添加排序保证 - 避免数据库返回顺序不确定导致告警误判 ### P1 重要问题 4. 并发安全 (ops_metrics_collector.go) - 为lastGCPauseTotal字段添加sync.Mutex保护 - 防止数据竞争 5. Goroutine泄漏 (ops_error_logger.go) - 实现worker pool模式限制并发goroutine数量 - 使用256容量缓冲队列和10个固定worker - 非阻塞投递，队列满时丢弃任务 6. 生命周期控制 (ops_alert_service.go) - 添加Start/Stop方法实现优雅关闭 - 使用context控制goroutine生命周期 - 实现WaitGroup等待后台任务完成 7. Webhook URL验证 (ops_alert_service.go) - 防止SSRF攻击：验证scheme、禁止内网IP - DNS解析验证，拒绝解析到私有IP的域名 - 添加8个单元测试覆盖各种攻击场景 8. 资源泄漏 (ops_repo.go) - 修复多处defer rows.Close()问题 - 简化冗余的defer func()包装 9. HTTP超时控制 (ops_alert_service.go) - 创建带10秒超时的http.Client - 添加buildWebhookHTTPClient辅助函数 - 防止HTTP请求无限期挂起 10. 数据库查询优化 (ops_repo.go) - 将GetWindowStats的4次独立查询合并为1次CTE查询 - 减少网络往返和表扫描次数 - 显著提升性能 11. 重试机制 (ops_alert_service.go) - 实现邮件发送重试：最多3次，指数退避(1s/2s/4s) - 添加webhook备用通道 - 实现完整的错误处理和日志记录 12. 
魔法数字 (ops_repo.go, ops_metrics_collector.go) - 提取硬编码数字为有意义的常量 - 提高代码可读性和可维护性 ## 测试验证 - ✅ go test ./internal/service -tags opsalert_unit 通过 - ✅ 所有webhook验证测试通过 - ✅ 重试机制测试通过 ## 影响范围 - 运维监控系统安全性显著提升 - 系统稳定性和性能优化 - 无破坏性变更，向后兼容 * feat(ops): 运维监控系统V2 - 完整实现 ## 核心功能 - 运维监控仪表盘V2（实时监控、历史趋势、告警管理） - WebSocket实时QPS/TPS监控（30s心跳，自动重连） - 系统指标采集（CPU、内存、延迟、错误率等） - 多维度统计分析（按provider、model、user等维度） - 告警规则管理（阈值配置、通知渠道） - 错误日志追踪（详细错误信息、堆栈跟踪） ## 数据库Schema (Migration 025) ### 扩展现有表 - ops_system_metrics: 新增RED指标、错误分类、延迟指标、资源指标、业务指标 - ops_alert_rules: 新增JSONB字段（dimension_filters, notify_channels, notify_config） ### 新增表 - ops_dimension_stats: 多维度统计数据 - ops_data_retention_config: 数据保留策略配置 ### 新增视图和函数 - ops_latest_metrics: 最新1分钟窗口指标（已修复字段名和window过滤） - ops_active_alerts: 当前活跃告警（已修复字段名和状态值） - calculate_health_score: 健康分数计算函数 ## 一致性修复（98/100分） ### P0级别（阻塞Migration） - ✅ 修复ops_latest_metrics视图字段名（latency_p99→p99_latency_ms, cpu_usage→cpu_usage_percent） - ✅ 修复ops_active_alerts视图字段名（metric→metric_type, triggered_at→fired_at, trigger_value→metric_value, threshold→threshold_value） - ✅ 统一告警历史表名（删除ops_alert_history，使用ops_alert_events） - ✅ 统一API参数限制（ListMetricsHistory和ListErrorLogs的limit改为5000） ### P1级别（功能完整性） - ✅ 修复ops_latest_metrics视图未过滤window_minutes（添加WHERE m.window_minutes = 1） - ✅ 修复数据回填UPDATE逻辑（QPS计算改为request_count/(window_minutes60.0)） - ✅ 添加ops_alert_rules JSONB字段后端支持（Go结构体+序列化） ### P2级别（优化） - ✅ 前端WebSocket自动重连（指数退避1s→2s→4s→8s→16s，最大5次） - ✅ 后端WebSocket心跳检测（30s ping，60s pong超时） ## 技术实现 ### 后端 (Go) - Handler层: ops_handler.go（REST API）, ops_ws_handler.go（WebSocket） - Service层: ops_service.go（核心逻辑）, ops_cache.go（缓存）, ops_alerts.go（告警） - Repository层: ops_repo.go（数据访问）, ops.go（模型定义） - 路由: admin.go（新增ops相关路由） - 依赖注入: wire_gen.go（自动生成） ### 前端 (Vue3 + TypeScript) - 组件: OpsDashboardV2.vue（仪表盘主组件） - API: ops.ts（REST API + WebSocket封装） - 路由: index.ts（新增/admin/ops路由） - 国际化: en.ts, zh.ts（中英文支持） ## 测试验证 - ✅ 所有Go测试通过 - ✅ Migration可正常执行 - ✅ WebSocket连接稳定 - ✅ 前后端数据结构对齐 refactor: 代码清理和测试优化 ## 测试文件优化 - 
简化integration test fixtures和断言 - 优化test helper函数 - 统一测试数据格式 ## 代码清理 - 移除未使用的代码和注释 - 简化concurrency_cache实现 - 优化middleware错误处理 ## 小修复 - 修复gateway_handler和openai_gateway_handler的小问题 - 统一代码风格和格式变更统计: 27个文件，292行新增，322行删除（净减少30行） * fix(ops): 运维监控系统安全加固和功能优化 ## 安全增强 - feat(security): WebSocket日志脱敏机制，防止token/api_key泄露 - feat(security): X-Forwarded-Host白名单验证，防止CSRF绕过 - feat(security): Origin策略配置化，支持strict/permissive模式 - feat(auth): WebSocket认证支持query参数传递token ## 配置优化 - feat(config): 支持环境变量配置代理信任和Origin策略 - OPS_WS_TRUST_PROXY - OPS_WS_TRUSTED_PROXIES - OPS_WS_ORIGIN_POLICY - fix(ops): 错误日志查询限流从5000降至500，优化内存使用 ## 架构改进 - refactor(ops): 告警服务解耦，独立运行评估定时器 - refactor(ops): OpsDashboard统一版本，移除V2分离 ## 测试和文档 - test(ops): 添加WebSocket安全验证单元测试（8个测试用例） - test(ops): 添加告警服务集成测试 - docs(api): 更新API文档，标注限流变更 - docs: 添加CHANGELOG记录breaking changes ## 修复文件 Backend: - backend/internal/server/middleware/logger.go - backend/internal/handler/admin/ops_handler.go - backend/internal/handler/admin/ops_ws_handler.go - backend/internal/server/middleware/admin_auth.go - backend/internal/service/ops_alert_service.go - backend/internal/service/ops_metrics_collector.go - backend/internal/service/wire.go Frontend: - frontend/src/views/admin/ops/OpsDashboard.vue - frontend/src/router/index.ts - frontend/src/api/admin/ops.ts Tests: - backend/internal/handler/admin/ops_ws_handler_test.go (新增) - backend/internal/service/ops_alert_service_integration_test.go (新增) Docs: - CHANGELOG.md (新增) - docs/API-运维监控中心2.0.md (更新) * fix(migrations): 修复calculate_health_score函数类型匹配问题在ops_latest_metrics视图中添加显式类型转换，确保参数类型与函数签名匹配 * fix(lint): 修复golangci-lint检查发现的所有问题 - 将Redis依赖从service层移到repository层 - 添加错误检查（WebSocket连接和读取超时） - 运行gofmt格式化代码 - 添加nil指针检查 - 删除未使用的alertService字段修复问题： - depguard: 3个（service层不应直接import redis） - errcheck: 3个（未检查错误返回值） - gofmt: 2个（代码格式问题） - staticcheck: 4个（nil指针解引用） - unused: 1个（未使用字段）代码统计： - 修改文件：11个 - 删除代码：490行 - 新增代码：105行 - 净减少：385行 2026-01-02 20:01:12 +08:00			`//go:build integration`

			`package service`

			`import (`
			`"context"`
			`"database/sql"`
			`"sync"`
			`"sync/atomic"`
			`"testing"`
			`"time"`

			`"github.com/stretchr/testify/require"`
			`)`

			`// This integration test protects the DI startup contract for OpsAlertService.`
			`//`
			`// Background:`
			`// - OpsMetricsCollector previously called alertService.Start()/Evaluate() directly.`
			`// - Those direct calls were removed, so OpsAlertService must now start via DI`
			`// (ProvideOpsAlertService in wire.go) and run its own evaluation ticker.`
			`//`
			`// What we validate here:`
			`// 1. When we construct via the Wire provider functions (ProvideOpsAlertService +`
			`// ProvideOpsMetricsCollector), OpsAlertService starts automatically.`
			`// 2. Its evaluation loop continues to tick even if OpsMetricsCollector is stopped,`
			`// proving the alert evaluator is independent.`
			`// 3. The evaluation path can trigger alert logic (CreateAlertEvent called).`
			`func TestOpsAlertService_StartedViaWireProviders_RunsIndependentTicker(t *testing.T) {`
			`oldInterval := opsAlertEvalInterval`
			`opsAlertEvalInterval = 25 * time.Millisecond`
			`t.Cleanup(func() { opsAlertEvalInterval = oldInterval })`

			`repo := newFakeOpsRepository()`
			`opsService := NewOpsService(repo, nil)`

			`// Start via the Wire provider function (the production DI path).`
			`alertService := ProvideOpsAlertService(opsService, nil, nil)`
			`t.Cleanup(alertService.Stop)`

			`// Construct via ProvideOpsMetricsCollector (wire.go). Stop immediately to ensure`
			`// the alert ticker keeps running without the metrics collector.`
			`collector := ProvideOpsMetricsCollector(opsService, NewConcurrencyService(nil))`
			`collector.Stop()`

			`// Wait for at least one evaluation (run() calls evaluateOnce immediately).`
			`require.Eventually(t, func() bool {`
			`return repo.listRulesCalls.Load() >= 1`
			`}, 1time.Second, 5time.Millisecond)`

			`// Confirm the evaluation loop keeps ticking after the metrics collector is stopped.`
			`callsAfterCollectorStop := repo.listRulesCalls.Load()`
			`require.Eventually(t, func() bool {`
			`return repo.listRulesCalls.Load() >= callsAfterCollectorStop+2`
			`}, 1time.Second, 5time.Millisecond)`

			`// Confirm the evaluation logic actually fires an alert event at least once.`
			`select {`
			`case <-repo.eventCreatedCh:`
			`// ok`
			`case <-time.After(2 * time.Second):`
			`t.Fatalf("expected OpsAlertService to create an alert event, but none was created (ListAlertRules calls=%d)", repo.listRulesCalls.Load())`
			`}`
			`}`

			`func newFakeOpsRepository() *fakeOpsRepository {`
			`return &fakeOpsRepository{`
			`eventCreatedCh: make(chan struct{}),`
			`}`
			`}`

			`// fakeOpsRepository is a lightweight in-memory stub of OpsRepository for integration tests.`
			`// It avoids real DB/Redis usage and provides deterministic responses fast.`
			`type fakeOpsRepository struct {`
			`listRulesCalls atomic.Int64`

			`mu sync.Mutex`
			`activeEvent *OpsAlertEvent`
			`latestEvent *OpsAlertEvent`
			`nextEventID int64`
			`eventCreatedCh chan struct{}`
			`eventOnce sync.Once`
			`}`

			`func (r fakeOpsRepository) CreateErrorLog(ctx context.Context, log OpsErrorLog) error {`
			`return nil`
			`}`

			`func (r *fakeOpsRepository) ListErrorLogsLegacy(ctx context.Context, filters OpsErrorLogFilters) ([]OpsErrorLog, error) {`
			`return nil, nil`
			`}`

			`func (r fakeOpsRepository) ListErrorLogs(ctx context.Context, filter ErrorLogFilter) ([]*ErrorLog, int64, error) {`
			`return nil, 0, nil`
			`}`

			`func (r fakeOpsRepository) GetLatestSystemMetric(ctx context.Context) (OpsMetrics, error) {`
			`return &OpsMetrics{WindowMinutes: 1}, sql.ErrNoRows`
			`}`

			`func (r fakeOpsRepository) CreateSystemMetric(ctx context.Context, metric OpsMetrics) error {`
			`return nil`
			`}`

			`func (r fakeOpsRepository) GetWindowStats(ctx context.Context, startTime, endTime time.Time) (OpsWindowStats, error) {`
			`return &OpsWindowStats{}, nil`
			`}`

			`func (r fakeOpsRepository) GetProviderStats(ctx context.Context, startTime, endTime time.Time) ([]ProviderStats, error) {`
			`return nil, nil`
			`}`

			`func (r fakeOpsRepository) GetLatencyHistogram(ctx context.Context, startTime, endTime time.Time) ([]LatencyHistogramItem, error) {`
			`return nil, nil`
			`}`

			`func (r fakeOpsRepository) GetErrorDistribution(ctx context.Context, startTime, endTime time.Time) ([]ErrorDistributionItem, error) {`
			`return nil, nil`
			`}`

			`func (r *fakeOpsRepository) ListRecentSystemMetrics(ctx context.Context, windowMinutes, limit int) ([]OpsMetrics, error) {`
			`if limit <= 0 {`
			`limit = 1`
			`}`
			`now := time.Now()`
			`metrics := make([]OpsMetrics, 0, limit)`
			`for i := 0; i < limit; i++ {`
			`metrics = append(metrics, OpsMetrics{`
			`WindowMinutes: windowMinutes,`
			`CPUUsagePercent: 99,`
			`UpdatedAt: now.Add(-time.Duration(i) * opsMetricsInterval),`
			`})`
			`}`
			`return metrics, nil`
			`}`

			`func (r *fakeOpsRepository) ListSystemMetricsRange(ctx context.Context, windowMinutes int, startTime, endTime time.Time, limit int) ([]OpsMetrics, error) {`
			`return nil, nil`
			`}`

			`func (r *fakeOpsRepository) ListAlertRules(ctx context.Context) ([]OpsAlertRule, error) {`
			`call := r.listRulesCalls.Add(1)`
			`// Delay enabling rules slightly so the test can stop OpsMetricsCollector first,`
			`// then observe the alert evaluator ticking independently.`
			`if call < 5 {`
			`return nil, nil`
			`}`
			`return []OpsAlertRule{`
			`{`
			`ID: 1,`
			`Name: "cpu too high (test)",`
			`Enabled: true,`
			`MetricType: OpsMetricCPUUsagePercent,`
			`Operator: ">",`
			`Threshold: 0,`
			`WindowMinutes: 1,`
			`SustainedMinutes: 1,`
			`Severity: "P1",`
			`NotifyEmail: false,`
			`NotifyWebhook: false,`
			`CooldownMinutes: 0,`
			`},`
			`}, nil`
			`}`

			`func (r fakeOpsRepository) GetActiveAlertEvent(ctx context.Context, ruleID int64) (OpsAlertEvent, error) {`
			`r.mu.Lock()`
			`defer r.mu.Unlock()`
			`if r.activeEvent == nil {`
			`return nil, nil`
			`}`
			`if r.activeEvent.RuleID != ruleID {`
			`return nil, nil`
			`}`
			`if r.activeEvent.Status != OpsAlertStatusFiring {`
			`return nil, nil`
			`}`
			`clone := *r.activeEvent`
			`return &clone, nil`
			`}`

			`func (r fakeOpsRepository) GetLatestAlertEvent(ctx context.Context, ruleID int64) (OpsAlertEvent, error) {`
			`r.mu.Lock()`
			`defer r.mu.Unlock()`
			`if r.latestEvent == nil \|\| r.latestEvent.RuleID != ruleID {`
			`return nil, nil`
			`}`
			`clone := *r.latestEvent`
			`return &clone, nil`
			`}`

			`func (r fakeOpsRepository) CreateAlertEvent(ctx context.Context, event OpsAlertEvent) error {`
			`if event == nil {`
			`return nil`
			`}`
			`r.mu.Lock()`
			`defer r.mu.Unlock()`

			`r.nextEventID++`
			`event.ID = r.nextEventID`

			`clone := *event`
			`r.latestEvent = &clone`
			`if clone.Status == OpsAlertStatusFiring {`
			`r.activeEvent = &clone`
			`}`

			`r.eventOnce.Do(func() { close(r.eventCreatedCh) })`
			`return nil`
			`}`

			`func (r fakeOpsRepository) UpdateAlertEventStatus(ctx context.Context, eventID int64, status string, resolvedAt time.Time) error {`
			`r.mu.Lock()`
			`defer r.mu.Unlock()`
			`if r.activeEvent != nil && r.activeEvent.ID == eventID {`
			`r.activeEvent.Status = status`
			`r.activeEvent.ResolvedAt = resolvedAt`
			`}`
			`if r.latestEvent != nil && r.latestEvent.ID == eventID {`
			`r.latestEvent.Status = status`
			`r.latestEvent.ResolvedAt = resolvedAt`
			`}`
			`return nil`
			`}`

			`func (r *fakeOpsRepository) UpdateAlertEventNotifications(ctx context.Context, eventID int64, emailSent, webhookSent bool) error {`
			`r.mu.Lock()`
			`defer r.mu.Unlock()`
			`if r.activeEvent != nil && r.activeEvent.ID == eventID {`
			`r.activeEvent.EmailSent = emailSent`
			`r.activeEvent.WebhookSent = webhookSent`
			`}`
			`if r.latestEvent != nil && r.latestEvent.ID == eventID {`
			`r.latestEvent.EmailSent = emailSent`
			`r.latestEvent.WebhookSent = webhookSent`
			`}`
			`return nil`
			`}`

			`func (r *fakeOpsRepository) CountActiveAlerts(ctx context.Context) (int, error) {`
			`r.mu.Lock()`
			`defer r.mu.Unlock()`
			`if r.activeEvent == nil {`
			`return 0, nil`
			`}`
			`return 1, nil`
			`}`

			`func (r fakeOpsRepository) GetOverviewStats(ctx context.Context, startTime, endTime time.Time) (OverviewStats, error) {`
			`return &OverviewStats{}, nil`
			`}`

			`func (r fakeOpsRepository) GetCachedLatestSystemMetric(ctx context.Context) (OpsMetrics, error) {`
			`return nil, nil`
			`}`

			`func (r fakeOpsRepository) SetCachedLatestSystemMetric(ctx context.Context, metric OpsMetrics) error {`
			`return nil`
			`}`

			`func (r fakeOpsRepository) GetCachedDashboardOverview(ctx context.Context, timeRange string) (DashboardOverviewData, error) {`
			`return nil, nil`
			`}`

			`func (r fakeOpsRepository) SetCachedDashboardOverview(ctx context.Context, timeRange string, data DashboardOverviewData, ttl time.Duration) error {`
			`return nil`
			`}`

			`func (r *fakeOpsRepository) PingRedis(ctx context.Context) error {`
			`return nil`
			`}`