mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-12 10:04:45 +08:00
feat(eval): add report quality evaluation module and UI integration (#776)
* feat(eval): add report quality evaluation module Addresses issue #773 - How to evaluate generated report quality objectively. This module provides two evaluation approaches: 1. Automated metrics (no LLM required): - Citation count and source diversity - Word count compliance per report style - Section structure validation - Image inclusion tracking 2. LLM-as-Judge evaluation: - Factual accuracy scoring - Completeness assessment - Coherence evaluation - Relevance and citation quality checks The combined evaluator provides a final score (1-10) and letter grade (A+ to F). Files added: - src/eval/__init__.py - src/eval/metrics.py - src/eval/llm_judge.py - src/eval/evaluator.py - tests/unit/eval/test_metrics.py - tests/unit/eval/test_evaluator.py * feat(eval): integrate report evaluation with web UI This commit adds the web UI integration for the evaluation module: Backend: - Add EvaluateReportRequest/Response models in src/server/eval_request.py - Add /api/report/evaluate endpoint to src/server/app.py Frontend: - Add evaluateReport API function in web/src/core/api/evaluate.ts - Create EvaluationDialog component with grade badge, metrics display, and optional LLM deep evaluation - Add evaluation button (graduation cap icon) to research-block.tsx toolbar - Add i18n translations for English and Chinese The evaluation UI allows users to: 1. View quick metrics-only evaluation (instant) 2. Optionally run deep LLM-based evaluation for detailed analysis 3. 
See grade (A+ to F), score (1-10), and metric breakdown * feat(eval): improve evaluation reliability and add LLM judge tests - Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability - Add comprehensive unit tests for LLMJudge class (parse_response, calculate_weighted_score, evaluate with mocked LLM) - Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria - Add researchQueries store map to reliably associate queries with research - Add getResearchQuery helper to retrieve query by researchId - Remove unused imports in test_metrics.py * fix(eval): use resolveServiceURL for evaluate API endpoint The evaluateReport function was using a relative URL '/api/report/evaluate' which sent requests to the Next.js server instead of the FastAPI backend. Changed to use resolveServiceURL() consistent with other API functions. * fix: improve type accuracy and React hooks in evaluation components - Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback - Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations - Add aria-label to GradeBadge for screen reader accessibility
This commit is contained in:
21
src/eval/__init__.py
Normal file
21
src/eval/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Report Quality Evaluation Module for DeerFlow.
|
||||
|
||||
This module provides objective methods to evaluate generated report quality,
|
||||
including automated metrics and LLM-based evaluation.
|
||||
"""
|
||||
|
||||
from .evaluator import ReportEvaluator
|
||||
from .metrics import ReportMetrics, compute_metrics
|
||||
from .llm_judge import LLMJudge, evaluate_with_llm
|
||||
|
||||
# Public API of the eval package: the combined evaluator, the automated
# metrics helpers, and the LLM-as-Judge entry points.
__all__ = [
    "ReportEvaluator",
    "ReportMetrics",
    "compute_metrics",
    "LLMJudge",
    "evaluate_with_llm",
]
|
||||
249
src/eval/evaluator.py
Normal file
249
src/eval/evaluator.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from .llm_judge import EvaluationResult, LLMJudge
|
||||
from .metrics import ReportMetrics, compute_metrics, get_word_count_target
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class CombinedEvaluation:
    """Combined evaluation results from metrics and LLM judge.

    Bundles the deterministic metrics, the optional LLM verdict, and the
    blended final score/grade plus a human-readable summary.
    """

    metrics: ReportMetrics
    llm_evaluation: Optional[EvaluationResult]
    final_score: float
    grade: str
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the evaluation to a plain dictionary.

        Nested results are serialized via their own ``to_dict``; the LLM
        part is ``None`` when no LLM evaluation was performed.
        """
        llm_part = self.llm_evaluation.to_dict() if self.llm_evaluation else None
        return {
            "metrics": self.metrics.to_dict(),
            "llm_evaluation": llm_part,
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }
|
||||
|
||||
|
||||
def score_to_grade(score: float) -> str:
    """Convert a numeric score to a letter grade.

    Each band is an inclusive lower bound checked from best to worst;
    anything below 4.0 is an F.
    """
    bands = (
        (9.0, "A+"),
        (8.5, "A"),
        (8.0, "A-"),
        (7.5, "B+"),
        (7.0, "B"),
        (6.5, "B-"),
        (6.0, "C+"),
        (5.5, "C"),
        (5.0, "C-"),
        (4.0, "D"),
    )
    for threshold, letter in bands:
        if score >= threshold:
            return letter
    return "F"
|
||||
|
||||
|
||||
class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    This evaluator provides comprehensive report quality assessment by:
    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both for a final score and grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        # The judge is only constructed when LLM evaluation is enabled;
        # otherwise all scoring comes from the automated metrics.
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.

        Scoring breakdown:
        - Section coverage: 30%
        - Citation quality: 25%
        - Word count compliance: 20%
        - Source diversity: 15%
        - Image inclusion: 10%
        """
        score = 0.0

        # Coverage is a 0-1 ratio; scale to the 0-10 range.
        section_score = metrics.section_coverage_score * 10
        score += section_score * 0.30

        # Citation credit saturates at 10 citations.
        citation_score = min(metrics.citation_count / 10, 1.0) * 10
        score += citation_score * 0.25

        # NOTE: get_word_count_target always returns a dict (it falls back
        # to the "default" range), so this branch always contributes.
        target = get_word_count_target(report_style)
        if target:
            if target["min"] <= metrics.word_count <= target["max"]:
                word_score = 10.0
            elif metrics.word_count < target["min"]:
                # Under-length reports score proportionally, capped at 8.
                word_score = (metrics.word_count / target["min"]) * 8
            else:
                # Over-length reports lose 5 points per 100% overshoot,
                # floored at 5.
                excess_ratio = metrics.word_count / target["max"]
                word_score = max(10 - (excess_ratio - 1) * 5, 5)
            score += word_score * 0.20

        # Source diversity saturates at 5 unique domains.
        diversity_score = min(metrics.unique_sources / 5, 1.0) * 10
        score += diversity_score * 0.15

        # Image credit saturates at 3 images.
        image_score = min(metrics.image_count / 3, 1.0) * 10
        score += image_score * 0.10

        return round(score, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable evaluation summary.

        Produces markdown: a headline grade line, the automated metrics,
        and — when an LLM evaluation is present — per-criterion scores
        plus up to three strengths and three weaknesses.
        """
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]

        lines.append("**Automated Metrics:**")
        lines.append(f"- Word Count: {metrics.word_count}")
        lines.append(f"- Citations: {metrics.citation_count}")
        lines.append(f"- Unique Sources: {metrics.unique_sources}")
        lines.append(f"- Images: {metrics.image_count}")
        lines.append(
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%"
        )

        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")

        if llm_eval:
            lines.append("")
            lines.append("**LLM Evaluation:**")
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")

            if llm_eval.strengths:
                lines.append("")
                lines.append("**Strengths:**")
                # Cap at the top three to keep the summary compact.
                for strength in llm_eval.strengths[:3]:
                    lines.append(f"- {strength}")

            if llm_eval.weaknesses:
                lines.append("")
                lines.append("**Areas for Improvement:**")
                for weakness in llm_eval.weaknesses[:3]:
                    lines.append(f"- {weakness}")

        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)

        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                # LLM evaluation is best-effort: degrade gracefully to a
                # metrics-only result rather than failing the evaluation.
                logger.warning(f"LLM evaluation failed, using metrics only: {e}")

        # overall_score == 0 is LLMJudge's failure sentinel, so only blend
        # in the LLM result when it produced a real score. The LLM verdict
        # dominates the blend (60/40).
        if llm_eval and llm_eval.overall_score > 0:
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score

        final_score = round(final_score, 2)
        grade = score_to_grade(final_score)

        summary = self._generate_summary(metrics, llm_eval, final_score, grade)

        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate.

        NOTE(review): uses asyncio.run, so this must not be called from a
        thread that is already running an event loop.
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).

        Args:
            report: The report text to evaluate
            report_style: The style of report

        Returns:
            Dictionary with "metrics", "score", and "grade" keys
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        grade = score_to_grade(metrics_score)

        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": grade,
        }
|
||||
282
src/eval/llm_judge.py
Normal file
282
src/eval/llm_judge.py
Normal file
@@ -0,0 +1,282 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
LLM-as-Judge evaluation for report quality.
|
||||
|
||||
Uses an LLM to evaluate reports on multiple quality dimensions,
|
||||
providing more nuanced assessment than automated metrics alone.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Maximum characters of report content to send to the LLM for evaluation.
# This limit prevents exceeding LLM context windows and controls token usage.
MAX_REPORT_LENGTH = 15000

# Criteria the judge scores, each with a relative weight used by
# LLMJudge._calculate_weighted_score. The weights sum to 1.0.
EVALUATION_CRITERIA = {
    "factual_accuracy": {
        "description": "Are claims supported by cited sources? Is information accurate and verifiable?",
        "weight": 0.25,
    },
    "completeness": {
        "description": "Does the report comprehensively cover all aspects of the topic?",
        "weight": 0.20,
    },
    "coherence": {
        "description": "Is the report logically structured, well-organized, and easy to follow?",
        "weight": 0.20,
    },
    "relevance": {
        "description": "Does the content directly address the research question without unnecessary tangents?",
        "weight": 0.15,
    },
    "citation_quality": {
        "description": "Are sources credible, diverse, and properly cited?",
        "weight": 0.10,
    },
    "writing_quality": {
        "description": "Is the writing clear, professional, and appropriate for the target audience?",
        "weight": 0.10,
    },
}
|
||||
|
||||
JUDGE_SYSTEM_PROMPT = """You are an expert report quality evaluator. Your task is to objectively assess the quality of research reports.
|
||||
|
||||
Evaluate the report on the following criteria, scoring each from 1-10:
|
||||
|
||||
1. **Factual Accuracy** (1-10): Are claims supported by cited sources? Is information accurate?
|
||||
2. **Completeness** (1-10): Does the report cover all aspects of the topic comprehensively?
|
||||
3. **Coherence** (1-10): Is the report logically structured and easy to follow?
|
||||
4. **Relevance** (1-10): Does content directly address the research question?
|
||||
5. **Citation Quality** (1-10): Are sources credible, diverse, and properly cited?
|
||||
6. **Writing Quality** (1-10): Is the writing clear and appropriate for the audience?
|
||||
|
||||
Respond ONLY with a valid JSON object in this exact format:
|
||||
{
|
||||
"scores": {
|
||||
"factual_accuracy": <1-10>,
|
||||
"completeness": <1-10>,
|
||||
"coherence": <1-10>,
|
||||
"relevance": <1-10>,
|
||||
"citation_quality": <1-10>,
|
||||
"writing_quality": <1-10>
|
||||
},
|
||||
"overall_score": <1-10>,
|
||||
"strengths": ["strength1", "strength2"],
|
||||
"weaknesses": ["weakness1", "weakness2"],
|
||||
"suggestions": ["suggestion1", "suggestion2"]
|
||||
}
|
||||
|
||||
Be objective and thorough in your evaluation."""
|
||||
|
||||
|
||||
@dataclass
class EvaluationResult:
    """Container for LLM evaluation results.

    ``scores`` maps criterion name to a 1-10 integer; ``weighted_score``
    is the criteria-weighted average. ``raw_response`` keeps the original
    LLM output for debugging and is excluded from serialization.
    """

    scores: Dict[str, int]
    overall_score: float
    weighted_score: float
    strengths: List[str]
    weaknesses: List[str]
    suggestions: List[str]
    raw_response: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the public fields (raw_response deliberately omitted)."""
        public_fields = (
            "scores",
            "overall_score",
            "weighted_score",
            "strengths",
            "weaknesses",
            "suggestions",
        )
        return {name: getattr(self, name) for name in public_fields}
|
||||
|
||||
|
||||
class LLMJudge:
    """LLM-based report quality evaluator.

    Sends the report and query to a judge LLM with a strict-JSON system
    prompt, parses the verdict, and returns an EvaluationResult. All
    failure modes degrade to well-formed fallback results rather than
    raising to the caller.
    """

    def __init__(self, llm: Any = None):
        """
        Initialize the LLM Judge.

        Args:
            llm: LangChain-compatible LLM instance. If None, will be created on demand.
        """
        self._llm = llm

    def _get_llm(self):
        """Get or create the LLM instance."""
        if self._llm is None:
            # Imported lazily so the module can be used with an injected
            # judge LLM without pulling in the project LLM stack.
            from src.llms.llm import get_llm_by_type

            self._llm = get_llm_by_type("basic")
        return self._llm

    def _calculate_weighted_score(self, scores: Dict[str, int]) -> float:
        """Calculate weighted average score based on criteria weights.

        Criteria not present in EVALUATION_CRITERIA are ignored, and the
        result is normalized by the sum of the weights actually seen, so a
        partial score dict still yields a value on the 1-10 scale.
        Returns 0.0 when no known criteria are present.
        """
        total_weight = 0
        weighted_sum = 0

        for criterion, score in scores.items():
            if criterion in EVALUATION_CRITERIA:
                weight = EVALUATION_CRITERIA[criterion]["weight"]
                weighted_sum += score * weight
                total_weight += weight

        if total_weight > 0:
            return round(weighted_sum / total_weight, 2)
        return 0.0

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse LLM response into structured format.

        Accepts raw JSON as well as JSON wrapped in ``` or ```json code
        fences. On any parse failure a neutral all-5 fallback verdict is
        returned so callers always get a well-formed dict.
        """
        try:
            json_match = response
            if "```json" in response:
                json_match = response.split("```json")[1].split("```")[0]
            elif "```" in response:
                json_match = response.split("```")[1].split("```")[0]

            return json.loads(json_match.strip())
        except (json.JSONDecodeError, IndexError) as e:
            logger.warning(f"Failed to parse LLM response: {e}")
            # Neutral fallback: mid-scale scores plus explicit markers that
            # parsing failed, so the UI can surface the problem.
            return {
                "scores": {
                    "factual_accuracy": 5,
                    "completeness": 5,
                    "coherence": 5,
                    "relevance": 5,
                    "citation_quality": 5,
                    "writing_quality": 5,
                },
                "overall_score": 5,
                "strengths": ["Unable to parse evaluation"],
                "weaknesses": ["Evaluation parsing failed"],
                "suggestions": ["Please re-run evaluation"],
            }

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Evaluate a report using LLM-as-Judge.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback. On LLM failure,
            returns a sentinel result with all scores set to 0 (callers
            such as ReportEvaluator.evaluate check overall_score > 0).
        """
        llm = self._get_llm()

        # Report is truncated to MAX_REPORT_LENGTH characters to stay
        # within the LLM context window.
        user_prompt = f"""Please evaluate the following research report.

**Original Research Query:** {query}

**Report Style:** {report_style}

**Report to Evaluate:**
{report[:MAX_REPORT_LENGTH]}

Provide your evaluation in the specified JSON format."""

        messages = [
            SystemMessage(content=JUDGE_SYSTEM_PROMPT),
            HumanMessage(content=user_prompt),
        ]

        try:
            response = await llm.ainvoke(messages)
            # Some LLM wrappers return message objects, others raw strings.
            response_text = (
                response.content if hasattr(response, "content") else str(response)
            )

            parsed = self._parse_response(response_text)

            scores = parsed.get("scores", {})
            weighted_score = self._calculate_weighted_score(scores)

            return EvaluationResult(
                scores=scores,
                overall_score=parsed.get("overall_score", 5),
                weighted_score=weighted_score,
                strengths=parsed.get("strengths", []),
                weaknesses=parsed.get("weaknesses", []),
                suggestions=parsed.get("suggestions", []),
                raw_response=response_text,
            )

        except Exception as e:
            logger.error(f"LLM evaluation failed: {e}")
            # Zero scores are the failure sentinel; the error is surfaced
            # in the weaknesses list.
            return EvaluationResult(
                scores={
                    "factual_accuracy": 0,
                    "completeness": 0,
                    "coherence": 0,
                    "relevance": 0,
                    "citation_quality": 0,
                    "writing_quality": 0,
                },
                overall_score=0,
                weighted_score=0,
                strengths=[],
                weaknesses=[f"Evaluation failed: {str(e)}"],
                suggestions=["Please retry evaluation"],
            )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> EvaluationResult:
        """
        Synchronous version of evaluate.

        NOTE(review): uses asyncio.run, so this must not be called from a
        thread that is already running an event loop.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report for context

        Returns:
            EvaluationResult with scores and feedback
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))
|
||||
|
||||
|
||||
async def evaluate_with_llm(
    report: str,
    query: str,
    report_style: str = "default",
    llm: Any = None,
) -> EvaluationResult:
    """
    Convenience function to evaluate a report with LLM.

    Constructs a one-off LLMJudge and delegates to it.

    Args:
        report: The report text to evaluate
        query: The original research query
        report_style: The style of report for context
        llm: Optional LLM instance to use

    Returns:
        EvaluationResult with scores and feedback
    """
    return await LLMJudge(llm=llm).evaluate(report, query, report_style)
|
||||
229
src/eval/metrics.py
Normal file
229
src/eval/metrics.py
Normal file
@@ -0,0 +1,229 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Automated metrics for report quality evaluation.
|
||||
|
||||
These metrics can be computed without LLM calls, providing fast and
|
||||
deterministic quality assessment.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
@dataclass
class ReportMetrics:
    """Container for computed report metrics.

    All fields default to empty/zero so a blank instance represents a
    report with nothing detected.
    """

    word_count: int = 0
    citation_count: int = 0
    unique_sources: int = 0
    image_count: int = 0
    section_count: int = 0
    sections_found: List[str] = field(default_factory=list)
    sections_missing: List[str] = field(default_factory=list)
    section_coverage_score: float = 0.0
    has_title: bool = False
    has_key_points: bool = False
    has_overview: bool = False
    has_citations_section: bool = False

    def to_dict(self) -> Dict:
        """Serialize every metric field to a plain dictionary."""
        keys = (
            "word_count",
            "citation_count",
            "unique_sources",
            "image_count",
            "section_count",
            "sections_found",
            "sections_missing",
            "section_coverage_score",
            "has_title",
            "has_key_points",
            "has_overview",
            "has_citations_section",
        )
        return {name: getattr(self, name) for name in keys}
|
||||
|
||||
|
||||
# Required sections for different report styles. Every style includes the
# baseline title/key_points/overview/key_citations; longer-form styles add
# style-specific sections. Keys must match SECTION_PATTERNS entries.
REPORT_STYLE_SECTIONS = {
    "default": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "academic": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "literature_review",
        "methodology",
        "key_citations",
    ],
    "news": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    "popular_science": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "key_citations",
    ],
    # Short-form style: no detailed analysis section required.
    "social_media": [
        "title",
        "key_points",
        "overview",
        "key_citations",
    ],
    # Longest style, with several finance-specific sections.
    "strategic_investment": [
        "title",
        "key_points",
        "overview",
        "detailed_analysis",
        "executive_summary",
        "market_analysis",
        "technology_analysis",
        "investment_recommendations",
        "key_citations",
    ],
}
|
||||
|
||||
# Section name patterns for detection (supports both English and Chinese).
# "title" is a markdown-H1 pattern matched case-sensitively with MULTILINE;
# all other patterns are matched case-insensitively against the lowered
# report text (see detect_sections).
SECTION_PATTERNS = {
    "title": r"^#\s+.+",
    "key_points": r"(?:key\s*points|要点|关键发现|核心观点)",
    "overview": r"(?:overview|概述|简介|背景)",
    "detailed_analysis": r"(?:detailed\s*analysis|详细分析|深度分析|分析)",
    "key_citations": r"(?:key\s*citations|references|参考文献|引用|来源)",
    "literature_review": r"(?:literature\s*review|文献综述|研究回顾)",
    "methodology": r"(?:methodology|方法论|研究方法)",
    "executive_summary": r"(?:executive\s*summary|执行摘要|投资建议)",
    "market_analysis": r"(?:market\s*analysis|市场分析|产业分析)",
    "technology_analysis": r"(?:technology|技术.*(?:分析|解析|深度))",
    "investment_recommendations": r"(?:investment.*recommend|投资建议|投资评级)",
}
|
||||
|
||||
|
||||
def count_words(text: str) -> int:
    """Count words, handling both English and Chinese text.

    Latin-letter runs count as one word each; every CJK character in the
    U+4E00-U+9FFF block counts as one word on its own.
    """
    latin_words = re.findall(r"\b[a-zA-Z]+\b", text)
    cjk_chars = re.findall(r"[\u4e00-\u9fff]", text)
    return len(latin_words) + len(cjk_chars)
|
||||
|
||||
|
||||
def count_citations(text: str) -> int:
    """Count markdown-style citations [text](url) with http(s) targets.

    Image embeds (``![alt](url)``) are excluded via the negative
    lookbehind: they are tracked separately by count_images, and counting
    them here would double-count every image as a citation.
    """
    pattern = r"(?<!!)\[.+?\]\(https?://[^\s\)]+\)"
    return len(re.findall(pattern, text))
|
||||
|
||||
|
||||
def extract_domains(text: str) -> List[str]:
    """Extract unique domains from http(s) URLs in the text.

    Domains are lowercased and a leading "www." prefix is stripped. The
    returned list order is unspecified (set-backed).
    """
    url_pattern = r"https?://([^\s\)\]]+)"
    urls = re.findall(url_pattern, text)
    domains = set()
    for url in urls:
        try:
            parsed = urlparse(f"http://{url}")
            domain = parsed.netloc or url.split("/")[0]
            # removeprefix (rather than str.replace) so only a *leading*
            # "www." is stripped; replace("www.", "") corrupted domains
            # like "wwww.example.com" or "api.www.example.net".
            domain = domain.lower().removeprefix("www.")
            if domain:
                domains.add(domain)
        except Exception:
            # Skip malformed URL fragments instead of failing the metric.
            continue
    return list(domains)
|
||||
|
||||
|
||||
def count_images(text: str) -> int:
    """Count markdown image embeds of the form ``![alt](src)``."""
    image_pattern = re.compile(r"!\[.*?\]\(.+?\)")
    return len(image_pattern.findall(text))
|
||||
|
||||
|
||||
def detect_sections(text: str, report_style: str = "default") -> Dict[str, bool]:
    """Return a mapping of required section name -> present-in-text.

    Unknown styles fall back to the "default" section list. The title is
    matched case-sensitively as a markdown H1; every other section is
    matched case-insensitively against its SECTION_PATTERNS entry (or a
    pattern derived from the section name when no entry exists).
    """
    required = REPORT_STYLE_SECTIONS.get(
        report_style, REPORT_STYLE_SECTIONS["default"]
    )
    lowered = text.lower()
    found: Dict[str, bool] = {}

    for name in required:
        pattern = SECTION_PATTERNS.get(name, name.replace("_", r"\s*"))
        if name == "title":
            match = re.search(pattern, text, re.MULTILINE)
        else:
            match = re.search(pattern, lowered, re.IGNORECASE | re.MULTILINE)
        found[name] = match is not None

    return found
|
||||
|
||||
|
||||
def compute_metrics(
    report: str, report_style: str = "default", target_word_count: Optional[int] = None
) -> ReportMetrics:
    """
    Compute automated metrics for a report.

    Args:
        report: The report text in markdown format
        report_style: The style of report (academic, news, etc.)
        target_word_count: Optional target word count for compliance check
            (currently accepted for interface stability but not used here;
            word-count targets come from get_word_count_target)

    Returns:
        ReportMetrics object with computed values
    """
    detected = detect_sections(report, report_style)
    found = [name for name, present in detected.items() if present]
    missing = [name for name, present in detected.items() if not present]

    # Coverage is the fraction of required sections actually present.
    coverage = len(found) / len(detected) if detected else 0.0

    return ReportMetrics(
        word_count=count_words(report),
        citation_count=count_citations(report),
        unique_sources=len(extract_domains(report)),
        image_count=count_images(report),
        section_count=len(found),
        sections_found=found,
        sections_missing=missing,
        section_coverage_score=coverage,
        has_title=detected.get("title", False),
        has_key_points=detected.get("key_points", False),
        has_overview=detected.get("overview", False),
        has_citations_section=detected.get("key_citations", False),
    )
|
||||
|
||||
|
||||
def get_word_count_target(report_style: str) -> Dict[str, int]:
    """Return the {"min", "max"} word-count range for a report style.

    Unknown styles fall back to the "default" range, so this always
    returns a dict.
    """
    default_range = {"min": 1000, "max": 5000}
    targets = {
        "strategic_investment": {"min": 10000, "max": 15000},
        "academic": {"min": 3000, "max": 8000},
        "news": {"min": 800, "max": 2000},
        "popular_science": {"min": 1500, "max": 4000},
        "social_media": {"min": 500, "max": 1500},
        "default": default_range,
    }
    return targets.get(report_style, default_range)
|
||||
@@ -35,6 +35,7 @@ from src.podcast.graph.builder import build_graph as build_podcast_graph
|
||||
from src.ppt.graph.builder import build_graph as build_ppt_graph
|
||||
from src.prompt_enhancer.graph.builder import build_graph as build_prompt_enhancer_graph
|
||||
from src.prose.graph.builder import build_graph as build_prose_graph
|
||||
from src.eval import ReportEvaluator
|
||||
from src.rag.builder import build_retriever
|
||||
from src.rag.milvus import load_examples as load_milvus_examples
|
||||
from src.rag.qdrant import load_examples as load_qdrant_examples
|
||||
@@ -47,6 +48,7 @@ from src.server.chat_request import (
|
||||
GenerateProseRequest,
|
||||
TTSRequest,
|
||||
)
|
||||
from src.server.eval_request import EvaluateReportRequest, EvaluateReportResponse
|
||||
from src.server.config_request import ConfigResponse
|
||||
from src.server.mcp_request import MCPServerMetadataRequest, MCPServerMetadataResponse
|
||||
from src.server.mcp_utils import load_mcp_tools
|
||||
@@ -946,6 +948,39 @@ async def generate_prose(request: GenerateProseRequest):
|
||||
raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)
|
||||
|
||||
|
||||
@app.post("/api/report/evaluate", response_model=EvaluateReportResponse)
|
||||
async def evaluate_report(request: EvaluateReportRequest):
|
||||
"""Evaluate report quality using automated metrics and optionally LLM-as-Judge."""
|
||||
try:
|
||||
evaluator = ReportEvaluator(use_llm=request.use_llm)
|
||||
|
||||
if request.use_llm:
|
||||
result = await evaluator.evaluate(
|
||||
request.content, request.query, request.report_style or "default"
|
||||
)
|
||||
return EvaluateReportResponse(
|
||||
metrics=result.metrics.to_dict(),
|
||||
score=result.final_score,
|
||||
grade=result.grade,
|
||||
llm_evaluation=result.llm_evaluation.to_dict()
|
||||
if result.llm_evaluation
|
||||
else None,
|
||||
summary=result.summary,
|
||||
)
|
||||
else:
|
||||
result = evaluator.evaluate_metrics_only(
|
||||
request.content, request.report_style or "default"
|
||||
)
|
||||
return EvaluateReportResponse(
|
||||
metrics=result["metrics"],
|
||||
score=result["score"],
|
||||
grade=result["grade"],
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception(f"Error occurred during report evaluation: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=INTERNAL_SERVER_ERROR_DETAIL)
|
||||
|
||||
|
||||
@app.post("/api/prompt/enhance")
|
||||
async def enhance_prompt(request: EnhancePromptRequest):
|
||||
try:
|
||||
|
||||
71
src/server/eval_request.py
Normal file
71
src/server/eval_request.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""Request models for report evaluation endpoint."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class EvaluateReportRequest(BaseModel):
    """Request model for report evaluation."""

    content: str = Field(description="Report markdown content to evaluate")
    query: str = Field(description="Original research query")
    report_style: Optional[str] = Field(
        default="default", description="Report style (academic, news, etc.)"
    )
    # False = fast, metrics-only evaluation; True additionally runs the
    # slower LLM-as-Judge pass.
    use_llm: bool = Field(
        default=False,
        description="Whether to use LLM for deep evaluation (slower but more detailed)",
    )
|
||||
|
||||
|
||||
class EvaluationMetrics(BaseModel):
    """Automated metrics result.

    Field-for-field mirror of src.eval.metrics.ReportMetrics.to_dict().
    """

    word_count: int
    citation_count: int
    unique_sources: int
    image_count: int
    section_count: int
    # Fraction (0.0-1.0) of required sections detected in the report.
    section_coverage_score: float
    sections_found: list[str]
    sections_missing: list[str]
    has_title: bool
    has_key_points: bool
    has_overview: bool
    has_citations_section: bool
|
||||
|
||||
|
||||
class LLMEvaluationScores(BaseModel):
    """LLM evaluation scores.

    Each criterion is scored 1-10 by the judge; 0 (the default) is the
    sentinel used when the LLM evaluation failed.
    """

    factual_accuracy: int = 0
    completeness: int = 0
    coherence: int = 0
    relevance: int = 0
    citation_quality: int = 0
    writing_quality: int = 0
|
||||
|
||||
|
||||
class LLMEvaluation(BaseModel):
    """LLM evaluation result.

    Mirrors src.eval.llm_judge.EvaluationResult.to_dict().
    """

    scores: LLMEvaluationScores
    overall_score: float
    # Criteria-weighted average of the per-criterion scores.
    weighted_score: float
    strengths: list[str]
    weaknesses: list[str]
    suggestions: list[str]
|
||||
|
||||
|
||||
class EvaluateReportResponse(BaseModel):
    """Response model for report evaluation."""

    metrics: EvaluationMetrics
    # Final 0-10 score (blended when LLM evaluation ran, metrics-only otherwise).
    score: float
    # Letter grade, "A+" through "F".
    grade: str
    # Only populated when the request set use_llm=True and the LLM pass succeeded.
    llm_evaluation: Optional[LLMEvaluation] = None
    # Markdown summary; only produced by the full (LLM) evaluation path.
    summary: Optional[str] = None
|
||||
Reference in New Issue
Block a user