# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Unit tests for the combined report evaluator."""

import json
from unittest.mock import AsyncMock, MagicMock

import pytest

from src.eval.evaluator import CombinedEvaluation, ReportEvaluator, score_to_grade
from src.eval.llm_judge import (
    EVALUATION_CRITERIA,
    MAX_REPORT_LENGTH,
    EvaluationResult,
    LLMJudge,
)
from src.eval.metrics import ReportMetrics


class TestScoreToGrade:
    """Tests for score to grade conversion."""

    def test_excellent_scores(self):
        assert score_to_grade(9.5) == "A+"
        assert score_to_grade(9.0) == "A+"
        assert score_to_grade(8.7) == "A"
        assert score_to_grade(8.5) == "A"
        assert score_to_grade(8.2) == "A-"

    def test_good_scores(self):
        assert score_to_grade(7.8) == "B+"
        assert score_to_grade(7.5) == "B+"
        assert score_to_grade(7.2) == "B"
        assert score_to_grade(7.0) == "B"
        assert score_to_grade(6.7) == "B-"

    def test_average_scores(self):
        assert score_to_grade(6.2) == "C+"
        assert score_to_grade(5.8) == "C"
        assert score_to_grade(5.5) == "C"
        assert score_to_grade(5.2) == "C-"

    def test_poor_scores(self):
        assert score_to_grade(4.5) == "D"
        assert score_to_grade(4.0) == "D"
        assert score_to_grade(3.0) == "F"
        assert score_to_grade(1.0) == "F"
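
    # Added sketch: the cases above pin down eleven (score, grade) pairs but not
    # the exact cutoffs, so this test only checks the ordering those pairs imply:
    # a higher score never maps to a lower grade. Derived purely from the
    # assertions in this class, not from the implementation.
    def test_grades_are_monotonic(self):
        order = ["F", "D", "C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+"]
        scores = [1.0, 4.0, 5.2, 5.5, 6.2, 6.7, 7.0, 7.5, 8.2, 8.5, 9.0]
        ranks = [order.index(score_to_grade(s)) for s in scores]
        assert ranks == sorted(ranks)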


class TestReportEvaluator:
    """Tests for ReportEvaluator class."""

    @pytest.fixture
    def evaluator(self):
        """Create evaluator without LLM for metrics-only tests."""
        return ReportEvaluator(use_llm=False)

    @pytest.fixture
    def sample_report(self):
        """Sample report for testing."""
        return """
# Comprehensive Research Report

## Key Points
- Important finding number one with significant implications
- Critical discovery that changes our understanding
- Key insight that provides actionable recommendations
- Notable observation from the research data

## Overview
This report presents a comprehensive analysis of the research topic.
The findings are based on extensive data collection and analysis.

## Detailed Analysis

### Section 1: Background
The background of this research involves multiple factors.
[Source 1](https://example.com/source1) provides foundational context.

### Section 2: Methodology
Our methodology follows established research practices.
[Source 2](https://research.org/methods) outlines the approach.

### Section 3: Findings
The key findings include several important discoveries.
![Chart](https://example.com/chart.png)

[Source 3](https://academic.edu/paper) supports these conclusions.

## Key Citations
- [Example Source](https://example.com/source1)
- [Research Methods](https://research.org/methods)
- [Academic Paper](https://academic.edu/paper)
- [Additional Reference](https://reference.com/doc)
"""

    def test_evaluate_metrics_only(self, evaluator, sample_report):
        """Test metrics-only evaluation."""
        result = evaluator.evaluate_metrics_only(sample_report)

        assert "metrics" in result
        assert "score" in result
        assert "grade" in result
        assert result["score"] > 0
        assert result["grade"] in ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D", "F"]

    def test_evaluate_metrics_only_structure(self, evaluator, sample_report):
        """Test that metrics contain expected fields."""
        result = evaluator.evaluate_metrics_only(sample_report)
        metrics = result["metrics"]

        assert "word_count" in metrics
        assert "citation_count" in metrics
        assert "unique_sources" in metrics
        assert "image_count" in metrics
        assert "section_coverage_score" in metrics

    def test_evaluate_minimal_report(self, evaluator):
        """Test evaluation of minimal report."""
        minimal_report = "Just some text."
        result = evaluator.evaluate_metrics_only(minimal_report)

        assert result["score"] < 5.0
        assert result["grade"] in ["D", "F"]
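
    # Added sketch: score_to_grade is exported from the same module as
    # ReportEvaluator, which strongly suggests the evaluator derives "grade"
    # from "score" with it. That link is an assumption, so treat this as a
    # consistency check to adjust or drop if the evaluator grades differently.
    def test_grade_matches_score(self, evaluator, sample_report):
        result = evaluator.evaluate_metrics_only(sample_report)
        assert result["grade"] == score_to_grade(result["score"])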

    def test_metrics_score_calculation(self, evaluator):
        """Test that metrics score is calculated correctly."""
        good_report = """
# Title

## Key Points
- Point 1
- Point 2

## Overview
Overview content here.

## Detailed Analysis
Analysis with [cite](https://a.com) and [cite2](https://b.com)
and [cite3](https://c.com) and more [refs](https://d.com).

![Chart](https://img.com/1.png)

## Key Citations
- [A](https://a.com)
- [B](https://b.com)
"""
        result = evaluator.evaluate_metrics_only(good_report)
        assert result["score"] > 5.0

    def test_combined_evaluation_to_dict(self):
        """Test CombinedEvaluation to_dict method."""
        metrics = ReportMetrics(
            word_count=1000,
            citation_count=5,
            unique_sources=3,
        )
        evaluation = CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=None,
            final_score=7.5,
            grade="B+",
            summary="Test summary",
        )

        result = evaluation.to_dict()
        assert result["final_score"] == 7.5
        assert result["grade"] == "B+"
        assert result["metrics"]["word_count"] == 1000


class TestReportEvaluatorIntegration:
    """Integration tests for evaluator (may require LLM)."""

    @pytest.mark.asyncio
    async def test_full_evaluation_without_llm(self):
        """Test full evaluation with LLM disabled."""
        evaluator = ReportEvaluator(use_llm=False)

        report = """
# Test Report

## Key Points
- Key point 1

## Overview
Test overview.

## Key Citations
- [Test](https://test.com)
"""

        result = await evaluator.evaluate(report, "test query")

        assert isinstance(result, CombinedEvaluation)
        assert result.final_score > 0
        assert result.grade is not None
        assert result.summary is not None
        assert result.llm_evaluation is None
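
    # Added sketch: test_combined_evaluation_to_dict builds CombinedEvaluation
    # from a ReportMetrics instance, so a metrics-only run should presumably
    # carry one too. A consistency check, not documented behavior.
    @pytest.mark.asyncio
    async def test_metrics_attached_without_llm(self):
        evaluator = ReportEvaluator(use_llm=False)
        result = await evaluator.evaluate("# Title\n\n## Overview\nText.", "test query")
        assert isinstance(result.metrics, ReportMetrics)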


class TestLLMJudgeParseResponse:
    """Tests for LLMJudge._parse_response method."""

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    @pytest.fixture
    def valid_response_data(self):
        """Valid evaluation response data."""
        return {
            "scores": {
                "factual_accuracy": 8,
                "completeness": 7,
                "coherence": 9,
                "relevance": 8,
                "citation_quality": 6,
                "writing_quality": 8,
            },
            "overall_score": 8,
            "strengths": ["Well researched", "Clear structure"],
            "weaknesses": ["Could use more citations"],
            "suggestions": ["Add more sources"],
        }

    def test_parse_valid_json(self, judge, valid_response_data):
        """Test parsing valid JSON response."""
        response = json.dumps(valid_response_data)
        result = judge._parse_response(response)

        assert result["scores"]["factual_accuracy"] == 8
        assert result["overall_score"] == 8
        assert "Well researched" in result["strengths"]

    def test_parse_json_in_markdown_block(self, judge, valid_response_data):
        """Test parsing JSON wrapped in markdown code block."""
        response = f"```json\n{json.dumps(valid_response_data)}\n```"
        result = judge._parse_response(response)

        assert result["scores"]["coherence"] == 9
        assert result["overall_score"] == 8

    def test_parse_json_in_generic_code_block(self, judge, valid_response_data):
        """Test parsing JSON in generic code block."""
        response = f"```\n{json.dumps(valid_response_data)}\n```"
        result = judge._parse_response(response)

        assert result["scores"]["relevance"] == 8

    def test_parse_malformed_json_returns_defaults(self, judge):
        """Test that malformed JSON returns default scores."""
        response = "This is not valid JSON at all"
        result = judge._parse_response(response)

        assert result["scores"]["factual_accuracy"] == 5
        assert result["scores"]["completeness"] == 5
        assert result["overall_score"] == 5
        assert "Unable to parse evaluation" in result["strengths"]
        assert "Evaluation parsing failed" in result["weaknesses"]

    def test_parse_incomplete_json(self, judge):
        """Test parsing incomplete JSON."""
        response = '{"scores": {"factual_accuracy": 8}'  # Missing closing brace
        result = judge._parse_response(response)

        # Should return defaults due to parse failure
        assert result["overall_score"] == 5

    def test_parse_json_with_extra_text(self, judge, valid_response_data):
        """Test parsing JSON with surrounding text."""
        response = f"Here is my evaluation:\n```json\n{json.dumps(valid_response_data)}\n```\nHope this helps!"
        result = judge._parse_response(response)

        assert result["scores"]["factual_accuracy"] == 8


class TestLLMJudgeCalculateWeightedScore:
    """Tests for LLMJudge._calculate_weighted_score method."""

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    def test_calculate_with_all_scores(self, judge):
        """Test weighted score calculation with all criteria."""
        scores = {
            "factual_accuracy": 10,  # weight 0.25
            "completeness": 10,  # weight 0.20
            "coherence": 10,  # weight 0.20
            "relevance": 10,  # weight 0.15
            "citation_quality": 10,  # weight 0.10
            "writing_quality": 10,  # weight 0.10
        }
        result = judge._calculate_weighted_score(scores)
        assert result == 10.0

    def test_calculate_with_varied_scores(self, judge):
        """Test weighted score with varied scores."""
        scores = {
            "factual_accuracy": 8,  # 8 * 0.25 = 2.0
            "completeness": 6,  # 6 * 0.20 = 1.2
            "coherence": 7,  # 7 * 0.20 = 1.4
            "relevance": 9,  # 9 * 0.15 = 1.35
            "citation_quality": 5,  # 5 * 0.10 = 0.5
            "writing_quality": 8,  # 8 * 0.10 = 0.8
        }
        # Total: 7.25
        result = judge._calculate_weighted_score(scores)
        assert result == 7.25

    def test_calculate_with_partial_scores(self, judge):
        """Test weighted score with only some criteria."""
        scores = {
            "factual_accuracy": 8,  # weight 0.25
            "completeness": 6,  # weight 0.20
        }
        # (8 * 0.25 + 6 * 0.20) / (0.25 + 0.20) = 3.2 / 0.45 = 7.11
        result = judge._calculate_weighted_score(scores)
        assert abs(result - 7.11) < 0.01

    def test_calculate_with_unknown_criteria(self, judge):
        """Test that unknown criteria are ignored."""
        scores = {
            "factual_accuracy": 10,
            "unknown_criterion": 1,  # Should be ignored
        }
        result = judge._calculate_weighted_score(scores)
        assert result == 10.0

    def test_calculate_with_empty_scores(self, judge):
        """Test with empty scores dict."""
        result = judge._calculate_weighted_score({})
        assert result == 0.0

    def test_weights_sum_to_one(self):
        """Verify that all criteria weights sum to 1.0."""
        total_weight = sum(c["weight"] for c in EVALUATION_CRITERIA.values())
        assert abs(total_weight - 1.0) < 0.001
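
    # Added sketch: cross-checks the hand-annotated weights in the varied-scores
    # case against EVALUATION_CRITERIA itself, so the inline weight comments
    # cannot silently drift from the real table. Assumes only the normalization
    # behavior already exercised by test_calculate_with_partial_scores
    # (weighted sum divided by the sum of the weights present).
    def test_varied_scores_match_criteria_table(self, judge):
        scores = {
            "factual_accuracy": 8,
            "completeness": 6,
            "coherence": 7,
            "relevance": 9,
            "citation_quality": 5,
            "writing_quality": 8,
        }
        expected = sum(
            value * EVALUATION_CRITERIA[name]["weight"] for name, value in scores.items()
        ) / sum(EVALUATION_CRITERIA[name]["weight"] for name in scores)
        result = judge._calculate_weighted_score(scores)
        assert abs(result - expected) < 0.01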


class TestLLMJudgeEvaluate:
    """Tests for LLMJudge.evaluate method with mocked LLM."""

    @pytest.fixture
    def valid_llm_response(self):
        """Create a valid LLM response."""
        return json.dumps(
            {
                "scores": {
                    "factual_accuracy": 8,
                    "completeness": 7,
                    "coherence": 9,
                    "relevance": 8,
                    "citation_quality": 7,
                    "writing_quality": 8,
                },
                "overall_score": 8,
                "strengths": ["Comprehensive coverage", "Well structured"],
                "weaknesses": ["Some claims need more support"],
                "suggestions": ["Add more academic sources"],
            }
        )

    @pytest.mark.asyncio
    async def test_successful_evaluation(self, valid_llm_response):
        """Test successful LLM evaluation."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = valid_llm_response
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")

        assert isinstance(result, EvaluationResult)
        assert result.scores["factual_accuracy"] == 8
        assert result.overall_score == 8
        assert result.weighted_score > 0
        assert "Comprehensive coverage" in result.strengths
        assert result.raw_response == valid_llm_response

    @pytest.mark.asyncio
    async def test_evaluation_with_llm_failure(self):
        """Test that LLM failures are handled gracefully."""
        mock_llm = AsyncMock()
        mock_llm.ainvoke.side_effect = Exception("LLM service unavailable")

        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")

        assert isinstance(result, EvaluationResult)
        assert result.overall_score == 0
        assert result.weighted_score == 0
        assert all(score == 0 for score in result.scores.values())
        assert any("failed" in w.lower() for w in result.weaknesses)

    @pytest.mark.asyncio
    async def test_evaluation_with_malformed_response(self):
        """Test handling of malformed LLM response."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = "I cannot evaluate this report properly."
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")

        # Should return default scores
        assert result.scores["factual_accuracy"] == 5
        assert result.overall_score == 5

    @pytest.mark.asyncio
    async def test_evaluation_passes_report_style(self):
        """Test that report_style is passed to LLM."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = json.dumps(
            {
                "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()},
                "overall_score": 7,
                "strengths": [],
                "weaknesses": [],
                "suggestions": [],
            }
        )
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        await judge.evaluate("Test report", "Test query", report_style="academic")

        # Verify the prompt contains the report style
        call_args = mock_llm.ainvoke.call_args
        messages = call_args[0][0]
        user_message_content = messages[1].content
        assert "academic" in user_message_content

    @pytest.mark.asyncio
    async def test_evaluation_truncates_long_reports(self):
        """Test that very long reports are truncated."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = json.dumps(
            {
                "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()},
                "overall_score": 7,
                "strengths": [],
                "weaknesses": [],
                "suggestions": [],
            }
        )
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        long_report = "x" * (MAX_REPORT_LENGTH + 5000)
        await judge.evaluate(long_report, "Test query")

        call_args = mock_llm.ainvoke.call_args
        messages = call_args[0][0]
        user_message_content = messages[1].content
        # The report content in the message should be truncated to MAX_REPORT_LENGTH.
        # The length bound alone is weak (it would also pass without truncation if
        # the prompt overhead were under 500 chars), so additionally require that
        # the full, untruncated report never appears in the prompt.
        assert len(user_message_content) < len(long_report) + 500
        assert long_report not in user_message_content


class TestEvaluationResult:
    """Tests for EvaluationResult dataclass."""

    def test_to_dict(self):
        """Test EvaluationResult.to_dict method."""
        result = EvaluationResult(
            scores={"factual_accuracy": 8, "completeness": 7},
            overall_score=7.5,
            weighted_score=7.6,
            strengths=["Good research"],
            weaknesses=["Needs more detail"],
            suggestions=["Expand section 2"],
            raw_response="test response",
        )

        d = result.to_dict()
        assert d["scores"]["factual_accuracy"] == 8
        assert d["overall_score"] == 7.5
        assert d["weighted_score"] == 7.6
        assert "Good research" in d["strengths"]
        # raw_response should not be in dict
        assert "raw_response" not in d
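
    # Added sketch: per the test above, to_dict drops raw_response and holds
    # only dicts, numbers, and lists of strings, so it should round-trip through
    # json without custom encoders. Assumes to_dict is intended for serialization.
    def test_to_dict_is_json_serializable(self):
        result = EvaluationResult(
            scores={"factual_accuracy": 8},
            overall_score=7.5,
            weighted_score=7.6,
            strengths=["Good research"],
            weaknesses=[],
            suggestions=[],
            raw_response="ignored",
        )
        round_tripped = json.loads(json.dumps(result.to_dict()))
        assert round_tripped["overall_score"] == 7.5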