# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Unit tests for the combined report evaluator."""

import json
from unittest.mock import AsyncMock, MagicMock

import pytest

from src.eval.evaluator import CombinedEvaluation, ReportEvaluator, score_to_grade
from src.eval.llm_judge import (
    EVALUATION_CRITERIA,
    MAX_REPORT_LENGTH,
    EvaluationResult,
    LLMJudge,
)
from src.eval.metrics import ReportMetrics


class TestScoreToGrade:
    """Tests for score to grade conversion."""

    def test_excellent_scores(self):
        assert score_to_grade(9.5) == "A+"
        assert score_to_grade(9.0) == "A+"
        assert score_to_grade(8.7) == "A"
        assert score_to_grade(8.5) == "A"
        assert score_to_grade(8.2) == "A-"

    def test_good_scores(self):
        assert score_to_grade(7.8) == "B+"
        assert score_to_grade(7.5) == "B+"
        assert score_to_grade(7.2) == "B"
        assert score_to_grade(7.0) == "B"
        assert score_to_grade(6.7) == "B-"

    def test_average_scores(self):
        assert score_to_grade(6.2) == "C+"
        assert score_to_grade(5.8) == "C"
        assert score_to_grade(5.5) == "C"
        assert score_to_grade(5.2) == "C-"

    def test_poor_scores(self):
        assert score_to_grade(4.5) == "D"
        assert score_to_grade(4.0) == "D"
        assert score_to_grade(3.0) == "F"
        assert score_to_grade(1.0) == "F"
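
    # Added sketch: the cases above pin down eleven (score, grade) pairs but not
    # the exact cutoffs, so this test only checks the ordering those pairs imply:
    # a higher score never maps to a lower grade. Derived purely from the
    # assertions in this class, not from the implementation.
    def test_grades_are_monotonic(self):
        order = ["F", "D", "C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+"]
        scores = [1.0, 4.0, 5.2, 5.5, 6.2, 6.7, 7.0, 7.5, 8.2, 8.5, 9.0]
        ranks = [order.index(score_to_grade(s)) for s in scores]
        assert ranks == sorted(ranks)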


class TestReportEvaluator:
    """Tests for ReportEvaluator class."""

    @pytest.fixture
    def evaluator(self):
        """Create evaluator without LLM for metrics-only tests."""
        return ReportEvaluator(use_llm=False)

    @pytest.fixture
    def sample_report(self):
        """Sample report for testing."""
        return """
# Comprehensive Research Report

## Key Points
- Important finding number one with significant implications
- Critical discovery that changes our understanding
- Key insight that provides actionable recommendations
- Notable observation from the research data

## Overview
This report presents a comprehensive analysis of the research topic.
The findings are based on extensive data collection and analysis.

## Detailed Analysis

### Section 1: Background
The background of this research involves multiple factors.
[Source 1](https://example.com/source1) provides foundational context.

### Section 2: Methodology
Our methodology follows established research practices.
[Source 2](https://research.org/methods) outlines the approach.

### Section 3: Findings
The key findings include several important discoveries.
![Chart](https://example.com/chart.png)

[Source 3](https://academic.edu/paper) supports these conclusions.

## Key Citations
- [Example Source](https://example.com/source1)
- [Research Methods](https://research.org/methods)
- [Academic Paper](https://academic.edu/paper)
- [Additional Reference](https://reference.com/doc)
"""

    def test_evaluate_metrics_only(self, evaluator, sample_report):
        """Test metrics-only evaluation."""
        result = evaluator.evaluate_metrics_only(sample_report)

        assert "metrics" in result
        assert "score" in result
        assert "grade" in result
        assert result["score"] > 0
        assert result["grade"] in ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D", "F"]

    def test_evaluate_metrics_only_structure(self, evaluator, sample_report):
        """Test that metrics contain expected fields."""
        result = evaluator.evaluate_metrics_only(sample_report)
        metrics = result["metrics"]

        assert "word_count" in metrics
        assert "citation_count" in metrics
        assert "unique_sources" in metrics
        assert "image_count" in metrics
        assert "section_coverage_score" in metrics

    def test_evaluate_minimal_report(self, evaluator):
        """Test evaluation of minimal report."""
        minimal_report = "Just some text."
        result = evaluator.evaluate_metrics_only(minimal_report)

        assert result["score"] < 5.0
        assert result["grade"] in ["D", "F"]
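
    # Added sketch: score_to_grade is exported from the same module as
    # ReportEvaluator, which strongly suggests the evaluator derives "grade"
    # from "score" with it. That link is an assumption, so treat this as a
    # consistency check to adjust or drop if the evaluator grades differently.
    def test_grade_matches_score(self, evaluator, sample_report):
        result = evaluator.evaluate_metrics_only(sample_report)
        assert result["grade"] == score_to_grade(result["score"])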

    def test_metrics_score_calculation(self, evaluator):
        """Test that metrics score is calculated correctly."""
        good_report = """
# Title

## Key Points
- Point 1
- Point 2

## Overview
Overview content here.

## Detailed Analysis
Analysis with [cite](https://a.com) and [cite2](https://b.com)
and [cite3](https://c.com) and more [refs](https://d.com).

![Chart](https://img.com/1.png)

## Key Citations
- [A](https://a.com)
- [B](https://b.com)
"""
        result = evaluator.evaluate_metrics_only(good_report)
        assert result["score"] > 5.0

    def test_combined_evaluation_to_dict(self):
        """Test CombinedEvaluation to_dict method."""
        metrics = ReportMetrics(
            word_count=1000,
            citation_count=5,
            unique_sources=3,
        )
        evaluation = CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=None,
            final_score=7.5,
            grade="B+",
            summary="Test summary",
        )

        result = evaluation.to_dict()
        assert result["final_score"] == 7.5
        assert result["grade"] == "B+"
        assert result["metrics"]["word_count"] == 1000


class TestReportEvaluatorIntegration:
    """Integration tests for evaluator (may require LLM)."""

    @pytest.mark.asyncio
    async def test_full_evaluation_without_llm(self):
        """Test full evaluation with LLM disabled."""
        evaluator = ReportEvaluator(use_llm=False)

        report = """
# Test Report

## Key Points
- Key point 1

## Overview
Test overview.

## Key Citations
- [Test](https://test.com)
"""

        result = await evaluator.evaluate(report, "test query")

        assert isinstance(result, CombinedEvaluation)
        assert result.final_score > 0
        assert result.grade is not None
        assert result.summary is not None
        assert result.llm_evaluation is None
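
    # Added sketch: test_combined_evaluation_to_dict builds CombinedEvaluation
    # from a ReportMetrics instance, so a metrics-only run should presumably
    # carry one too. A consistency check, not documented behavior.
    @pytest.mark.asyncio
    async def test_metrics_attached_without_llm(self):
        evaluator = ReportEvaluator(use_llm=False)
        result = await evaluator.evaluate("# Title\n\n## Overview\nText.", "test query")
        assert isinstance(result.metrics, ReportMetrics)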


class TestLLMJudgeParseResponse:
    """Tests for LLMJudge._parse_response method."""

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    @pytest.fixture
    def valid_response_data(self):
        """Valid evaluation response data."""
        return {
            "scores": {
                "factual_accuracy": 8,
                "completeness": 7,
                "coherence": 9,
                "relevance": 8,
                "citation_quality": 6,
                "writing_quality": 8,
            },
            "overall_score": 8,
            "strengths": ["Well researched", "Clear structure"],
            "weaknesses": ["Could use more citations"],
            "suggestions": ["Add more sources"],
        }

    def test_parse_valid_json(self, judge, valid_response_data):
        """Test parsing valid JSON response."""
        response = json.dumps(valid_response_data)
        result = judge._parse_response(response)

        assert result["scores"]["factual_accuracy"] == 8
        assert result["overall_score"] == 8
        assert "Well researched" in result["strengths"]

    def test_parse_json_in_markdown_block(self, judge, valid_response_data):
        """Test parsing JSON wrapped in markdown code block."""
        response = f"```json\n{json.dumps(valid_response_data)}\n```"
        result = judge._parse_response(response)

        assert result["scores"]["coherence"] == 9
        assert result["overall_score"] == 8

    def test_parse_json_in_generic_code_block(self, judge, valid_response_data):
        """Test parsing JSON in generic code block."""
        response = f"```\n{json.dumps(valid_response_data)}\n```"
        result = judge._parse_response(response)

        assert result["scores"]["relevance"] == 8

    def test_parse_malformed_json_returns_defaults(self, judge):
        """Test that malformed JSON returns default scores."""
        response = "This is not valid JSON at all"
        result = judge._parse_response(response)

        assert result["scores"]["factual_accuracy"] == 5
        assert result["scores"]["completeness"] == 5
        assert result["overall_score"] == 5
        assert "Unable to parse evaluation" in result["strengths"]
        assert "Evaluation parsing failed" in result["weaknesses"]

    def test_parse_incomplete_json(self, judge):
        """Test parsing incomplete JSON."""
        response = '{"scores": {"factual_accuracy": 8}'  # Missing closing brace
        result = judge._parse_response(response)

        # Should return defaults due to parse failure
        assert result["overall_score"] == 5

    def test_parse_json_with_extra_text(self, judge, valid_response_data):
        """Test parsing JSON with surrounding text."""
        response = f"Here is my evaluation:\n```json\n{json.dumps(valid_response_data)}\n```\nHope this helps!"
        result = judge._parse_response(response)

        assert result["scores"]["factual_accuracy"] == 8


class TestLLMJudgeCalculateWeightedScore:
    """Tests for LLMJudge._calculate_weighted_score method."""

    @pytest.fixture
    def judge(self):
        """Create LLMJudge with mock LLM."""
        return LLMJudge(llm=MagicMock())

    def test_calculate_with_all_scores(self, judge):
        """Test weighted score calculation with all criteria."""
        scores = {
            "factual_accuracy": 10,  # weight 0.25
            "completeness": 10,  # weight 0.20
            "coherence": 10,  # weight 0.20
            "relevance": 10,  # weight 0.15
            "citation_quality": 10,  # weight 0.10
            "writing_quality": 10,  # weight 0.10
        }
        result = judge._calculate_weighted_score(scores)
        assert result == 10.0

    def test_calculate_with_varied_scores(self, judge):
        """Test weighted score with varied scores."""
        scores = {
            "factual_accuracy": 8,  # 8 * 0.25 = 2.0
            "completeness": 6,  # 6 * 0.20 = 1.2
            "coherence": 7,  # 7 * 0.20 = 1.4
            "relevance": 9,  # 9 * 0.15 = 1.35
            "citation_quality": 5,  # 5 * 0.10 = 0.5
            "writing_quality": 8,  # 8 * 0.10 = 0.8
        }
        # Total: 7.25
        result = judge._calculate_weighted_score(scores)
        assert result == 7.25

    def test_calculate_with_partial_scores(self, judge):
        """Test weighted score with only some criteria."""
        scores = {
            "factual_accuracy": 8,  # weight 0.25
            "completeness": 6,  # weight 0.20
        }
        # (8 * 0.25 + 6 * 0.20) / (0.25 + 0.20) = 3.2 / 0.45 = 7.11
        result = judge._calculate_weighted_score(scores)
        assert abs(result - 7.11) < 0.01

    def test_calculate_with_unknown_criteria(self, judge):
        """Test that unknown criteria are ignored."""
        scores = {
            "factual_accuracy": 10,
            "unknown_criterion": 1,  # Should be ignored
        }
        result = judge._calculate_weighted_score(scores)
        assert result == 10.0

    def test_calculate_with_empty_scores(self, judge):
        """Test with empty scores dict."""
        result = judge._calculate_weighted_score({})
        assert result == 0.0

    def test_weights_sum_to_one(self):
        """Verify that all criteria weights sum to 1.0."""
        total_weight = sum(c["weight"] for c in EVALUATION_CRITERIA.values())
        assert abs(total_weight - 1.0) < 0.001
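
    # Added sketch: cross-checks the hand-annotated weights in the varied-scores
    # case against EVALUATION_CRITERIA itself, so the inline weight comments
    # cannot silently drift from the real table. Assumes only the normalization
    # behavior already exercised by test_calculate_with_partial_scores
    # (weighted sum divided by the sum of the weights present).
    def test_varied_scores_match_criteria_table(self, judge):
        scores = {
            "factual_accuracy": 8,
            "completeness": 6,
            "coherence": 7,
            "relevance": 9,
            "citation_quality": 5,
            "writing_quality": 8,
        }
        expected = sum(
            value * EVALUATION_CRITERIA[name]["weight"] for name, value in scores.items()
        ) / sum(EVALUATION_CRITERIA[name]["weight"] for name in scores)
        result = judge._calculate_weighted_score(scores)
        assert abs(result - expected) < 0.01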


class TestLLMJudgeEvaluate:
    """Tests for LLMJudge.evaluate method with mocked LLM."""

    @pytest.fixture
    def valid_llm_response(self):
        """Create a valid LLM response."""
        return json.dumps(
            {
                "scores": {
                    "factual_accuracy": 8,
                    "completeness": 7,
                    "coherence": 9,
                    "relevance": 8,
                    "citation_quality": 7,
                    "writing_quality": 8,
                },
                "overall_score": 8,
                "strengths": ["Comprehensive coverage", "Well structured"],
                "weaknesses": ["Some claims need more support"],
                "suggestions": ["Add more academic sources"],
            }
        )

    @pytest.mark.asyncio
    async def test_successful_evaluation(self, valid_llm_response):
        """Test successful LLM evaluation."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = valid_llm_response
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")

        assert isinstance(result, EvaluationResult)
        assert result.scores["factual_accuracy"] == 8
        assert result.overall_score == 8
        assert result.weighted_score > 0
        assert "Comprehensive coverage" in result.strengths
        assert result.raw_response == valid_llm_response

    @pytest.mark.asyncio
    async def test_evaluation_with_llm_failure(self):
        """Test that LLM failures are handled gracefully."""
        mock_llm = AsyncMock()
        mock_llm.ainvoke.side_effect = Exception("LLM service unavailable")

        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")

        assert isinstance(result, EvaluationResult)
        assert result.overall_score == 0
        assert result.weighted_score == 0
        assert all(score == 0 for score in result.scores.values())
        assert any("failed" in w.lower() for w in result.weaknesses)

    @pytest.mark.asyncio
    async def test_evaluation_with_malformed_response(self):
        """Test handling of malformed LLM response."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = "I cannot evaluate this report properly."
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        result = await judge.evaluate("Test report", "Test query")

        # Should return default scores
        assert result.scores["factual_accuracy"] == 5
        assert result.overall_score == 5

    @pytest.mark.asyncio
    async def test_evaluation_passes_report_style(self):
        """Test that report_style is passed to LLM."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = json.dumps(
            {
                "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()},
                "overall_score": 7,
                "strengths": [],
                "weaknesses": [],
                "suggestions": [],
            }
        )
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        await judge.evaluate("Test report", "Test query", report_style="academic")

        # Verify the prompt contains the report style
        call_args = mock_llm.ainvoke.call_args
        messages = call_args[0][0]
        user_message_content = messages[1].content
        assert "academic" in user_message_content

    @pytest.mark.asyncio
    async def test_evaluation_truncates_long_reports(self):
        """Test that very long reports are truncated."""
        mock_llm = AsyncMock()
        mock_response = MagicMock()
        mock_response.content = json.dumps(
            {
                "scores": {k: 7 for k in EVALUATION_CRITERIA.keys()},
                "overall_score": 7,
                "strengths": [],
                "weaknesses": [],
                "suggestions": [],
            }
        )
        mock_llm.ainvoke.return_value = mock_response

        judge = LLMJudge(llm=mock_llm)
        long_report = "x" * (MAX_REPORT_LENGTH + 5000)
        await judge.evaluate(long_report, "Test query")

        call_args = mock_llm.ainvoke.call_args
        messages = call_args[0][0]
        user_message_content = messages[1].content
        # The report content in the message should be truncated to MAX_REPORT_LENGTH.
        # The length bound alone is weak (it would also pass without truncation if
        # the prompt overhead were under 500 chars), so additionally require that
        # the full, untruncated report never appears in the prompt.
        assert len(user_message_content) < len(long_report) + 500
        assert long_report not in user_message_content


class TestEvaluationResult:
    """Tests for EvaluationResult dataclass."""

    def test_to_dict(self):
        """Test EvaluationResult.to_dict method."""
        result = EvaluationResult(
            scores={"factual_accuracy": 8, "completeness": 7},
            overall_score=7.5,
            weighted_score=7.6,
            strengths=["Good research"],
            weaknesses=["Needs more detail"],
            suggestions=["Expand section 2"],
            raw_response="test response",
        )

        d = result.to_dict()
        assert d["scores"]["factual_accuracy"] == 8
        assert d["overall_score"] == 7.5
        assert d["weighted_score"] == 7.6
        assert "Good research" in d["strengths"]
        # raw_response should not be in dict
        assert "raw_response" not in d
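
    # Added sketch: per the test above, to_dict drops raw_response and holds
    # only dicts, numbers, and lists of strings, so it should round-trip through
    # json without custom encoders. Assumes to_dict is intended for serialization.
    def test_to_dict_is_json_serializable(self):
        result = EvaluationResult(
            scores={"factual_accuracy": 8},
            overall_score=7.5,
            weighted_score=7.6,
            strengths=["Good research"],
            weaknesses=[],
            suggestions=[],
            raw_response="ignored",
        )
        round_tripped = json.loads(json.dumps(result.to_dict()))
        assert round_tripped["overall_score"] == 7.5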