deer-flow/tests/unit/eval/test_metrics.py

feat(eval): add report quality evaluation module and UI integration (#776)

* feat(eval): add report quality evaluation module

  Addresses issue #773 - how to evaluate generated report quality objectively.

  This module provides two evaluation approaches:

  1. Automated metrics (no LLM required):
     - Citation count and source diversity
     - Word count compliance per report style
     - Section structure validation
     - Image inclusion tracking
  2. LLM-as-Judge evaluation:
     - Factual accuracy scoring
     - Completeness assessment
     - Coherence evaluation
     - Relevance and citation quality checks

  The combined evaluator provides a final score (1-10) and letter grade
  (A+ to F).

  Files added:
  - src/eval/__init__.py
  - src/eval/metrics.py
  - src/eval/llm_judge.py
  - src/eval/evaluator.py
  - tests/unit/eval/test_metrics.py
  - tests/unit/eval/test_evaluator.py

* feat(eval): integrate report evaluation with web UI

  Backend:
  - Add EvaluateReportRequest/Response models in src/server/eval_request.py
  - Add /api/report/evaluate endpoint to src/server/app.py

  Frontend:
  - Add evaluateReport API function in web/src/core/api/evaluate.ts
  - Create EvaluationDialog component with grade badge, metrics display,
    and optional LLM deep evaluation
  - Add evaluation button (graduation cap icon) to the research-block.tsx
    toolbar
  - Add i18n translations for English and Chinese

  The evaluation UI allows users to:
  1. View a quick metrics-only evaluation (instant)
  2. Optionally run a deep LLM-based evaluation for detailed analysis
  3. See the grade (A+ to F), score (1-10), and metric breakdown

* feat(eval): improve evaluation reliability and add LLM judge tests

  - Extract a MAX_REPORT_LENGTH constant in llm_judge.py for maintainability
  - Add comprehensive unit tests for the LLMJudge class (parse_response,
    calculate_weighted_score, evaluate with a mocked LLM)
  - Pass the reportStyle prop to EvaluationDialog for accurate evaluation
    criteria
  - Add a researchQueries store map to reliably associate queries with
    research
  - Add a getResearchQuery helper to retrieve a query by researchId
  - Remove unused imports in test_metrics.py

* fix(eval): use resolveServiceURL for the evaluate API endpoint

  The evaluateReport function was using the relative URL
  '/api/report/evaluate', which sent requests to the Next.js server instead
  of the FastAPI backend. Changed to use resolveServiceURL(), consistent
  with the other API functions.

* fix: improve type accuracy and React hooks in evaluation components

  - Fix the get_word_count_target return type from Optional[Dict] to Dict,
    since it always returns a value via the default fallback
  - Fix a useEffect dependency issue in EvaluationDialog using useRef to
    prevent unwanted re-evaluations
  - Add an aria-label to GradeBadge for screen-reader accessibility

2025-12-25 21:55:48 +08:00

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Unit tests for report evaluation metrics."""

from src.eval.metrics import (
    compute_metrics,
    count_citations,
    count_images,
    count_words,
    detect_sections,
    extract_domains,
    get_word_count_target,
)


class TestCountWords:
    """Tests for word counting function."""

    def test_english_words(self):
        text = "This is a simple test sentence."
        assert count_words(text) == 6

    def test_chinese_characters(self):
        text = "这是一个测试"
        assert count_words(text) == 6

    def test_mixed_content(self):
        text = "Hello 你好 World 世界"
        assert count_words(text) == 4 + 2  # 4 Chinese characters + 2 English words

    def test_empty_string(self):
        assert count_words("") == 0


class TestCountCitations:
    """Tests for citation counting function."""

    def test_markdown_citations(self):
        text = """
Check out [Google](https://google.com) and [GitHub](https://github.com).
"""
        assert count_citations(text) == 2

    def test_no_citations(self):
        text = "This is plain text without any links."
        assert count_citations(text) == 0

    def test_invalid_urls(self):
        text = "[Link](not-a-url) [Another](ftp://ftp.example.com)"
        assert count_citations(text) == 0

    def test_complex_urls(self):
        text = "[Article](https://example.com/path/to/article?id=123&ref=test)"
        assert count_citations(text) == 1
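

# A hypothetical sketch of citation counting consistent with the tests above
# (reusing the `re` import from the sketch higher up): only markdown links
# whose target starts with http:// or https:// count, so bare paths and
# ftp:// links are ignored, and image tags are excluded via the (?<!!)
# lookbehind. The real implementation in src/eval/metrics.py may differ.
def _sketch_count_citations(text: str) -> int:
    """Count markdown links with an http(s) URL target, excluding images."""
    return len(re.findall(r"(?<!!)\[[^\]]*\]\(https?://[^)\s]+\)", text))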


class TestExtractDomains:
    """Tests for domain extraction function."""

    def test_extract_multiple_domains(self):
        text = """
https://google.com/search
https://www.github.com/user/repo
https://docs.python.org/3/
"""
        domains = extract_domains(text)
        assert len(domains) == 3
        assert "google.com" in domains
        assert "github.com" in domains
        assert "docs.python.org" in domains

    def test_deduplicate_domains(self):
        text = """
https://example.com/page1
https://example.com/page2
https://www.example.com/page3
"""
        domains = extract_domains(text)
        assert len(domains) == 1
        assert "example.com" in domains

    def test_no_urls(self):
        text = "Plain text without URLs"
        assert extract_domains(text) == []
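

# A minimal sketch of domain extraction matching these tests (hypothetical):
# pull http(s) URLs out of the text, take each URL's host, strip a leading
# "www." so www.example.com and example.com collapse to one source, and
# de-duplicate while preserving order.
from urllib.parse import urlparse


def _sketch_extract_domains(text: str) -> list[str]:
    """Return unique domains (without 'www.') for every http(s) URL in text."""
    domains: list[str] = []
    for url in re.findall(r"https?://[^\s)]+", text):
        host = urlparse(url).netloc.removeprefix("www.")
        if host and host not in domains:
            domains.append(host)
    return domains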


class TestCountImages:
    """Tests for image counting function."""

    def test_markdown_images(self):
        text = """
![Alt text](https://example.com/image1.png)
![](https://example.com/image2.jpg)
"""
        assert count_images(text) == 2

    def test_no_images(self):
        text = "Text without images [link](url)"
        assert count_images(text) == 0
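

# A sketch of image counting (hypothetical): a markdown image is a link
# preceded by "!", with any (possibly empty) alt text, which is why a plain
# [link](url) above does not count.
def _sketch_count_images(text: str) -> int:
    """Count markdown image tags of the form ![alt](target)."""
    return len(re.findall(r"!\[[^\]]*\]\([^)\s]+\)", text))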


class TestDetectSections:
    """Tests for section detection function."""

    def test_detect_title(self):
        text = "# My Report Title\n\nSome content here."
        sections = detect_sections(text)
        assert sections.get("title") is True

    def test_detect_key_points(self):
        text = "## Key Points\n- Point 1\n- Point 2"
        sections = detect_sections(text)
        assert sections.get("key_points") is True

    def test_detect_chinese_sections(self):
        text = """# 报告标题
## 要点
- 要点1
## 概述
这是概述内容
"""
        sections = detect_sections(text)
        assert sections.get("title") is True
        assert sections.get("key_points") is True
        assert sections.get("overview") is True

    def test_detect_citations_section(self):
        text = """
## Key Citations
- [Source 1](https://example.com)
"""
        sections = detect_sections(text)
        assert sections.get("key_citations") is True


class TestComputeMetrics:
    """Tests for the main compute_metrics function."""

    def test_complete_report(self):
        report = """
# Research Report Title
## Key Points
- Point 1
- Point 2
- Point 3
## Overview
This is an overview of the research topic.
## Detailed Analysis
Here is the detailed analysis with [source](https://example.com).
![Figure 1](https://example.com/image.png)
## Key Citations
- [Source 1](https://example.com)
- [Source 2](https://another.com)
"""
        metrics = compute_metrics(report)
        assert metrics.has_title is True
        assert metrics.has_key_points is True
        assert metrics.has_overview is True
        assert metrics.has_citations_section is True
        assert metrics.citation_count >= 2
        assert metrics.image_count == 1
        assert metrics.unique_sources >= 1
        assert metrics.section_coverage_score > 0.5

    def test_minimal_report(self):
        report = "Just some text without structure."
        metrics = compute_metrics(report)
        assert metrics.has_title is False
        assert metrics.citation_count == 0
        assert metrics.section_coverage_score < 0.5

    def test_metrics_to_dict(self):
        report = "# Title\n\nSome content"
        metrics = compute_metrics(report)
        result = metrics.to_dict()
        assert isinstance(result, dict)
        assert "word_count" in result
        assert "citation_count" in result
        assert "section_coverage_score" in result


class TestGetWordCountTarget:
    """Tests for word count target function."""

    def test_strategic_investment_target(self):
        target = get_word_count_target("strategic_investment")
        assert target["min"] == 10000
        assert target["max"] == 15000

    def test_news_target(self):
        target = get_word_count_target("news")
        assert target["min"] == 800
        assert target["max"] == 2000

    def test_default_target(self):
        target = get_word_count_target("unknown_style")
        assert target["min"] == 1000
        assert target["max"] == 5000